mirror of
https://github.com/fleetdm/fleet
synced 2026-05-24 09:28:54 +00:00
<!-- Add the related story/sub-task/bug number, like Resolves #123, or remove if NA --> **Related issue:** Resolves #40540 Almost done with slog migration. # Checklist for submitter - [ ] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. - Changes present in previous PR ## Testing - [x] Added/updated automated tests - [x] QA'd all new/changed functionality manually <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Chores** * Updated internal logging infrastructure to use Go's standard logging library, modernizing the logging system while maintaining existing functionality and error handling behavior. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
150 lines
4.4 KiB
Go
150 lines
4.4 KiB
Go
package mysql
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/VividCortex/mysqlerr"
|
|
"github.com/cenkalti/backoff/v4"
|
|
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
|
|
"github.com/go-sql-driver/mysql"
|
|
"github.com/jmoiron/sqlx"
|
|
)
|
|
|
|
// State shared by SetFatalErrorHandler and TriggerFatalError.
var (
	// fatalErrorMu guards fatalErrorOnce and fatalErrorHandler. It is taken
	// for writing when a handler is registered and for reading when a fatal
	// error is triggered.
	fatalErrorMu sync.RWMutex
	// fatalErrorOnce limits the registered handler to a single invocation.
	// SetFatalErrorHandler replaces it with a fresh sync.Once.
	fatalErrorOnce sync.Once
	// fatalErrorHandler is the callback run by TriggerFatalError. While it is
	// nil, TriggerFatalError panics instead.
	fatalErrorHandler func(context.Context, error)
)
|
|
|
|
// SetFatalErrorHandler registers a function that will be called (at most once)
|
|
// when a fatal database error is detected, such as the primary becoming
|
|
// read-only during an Aurora failover. The handler should trigger a graceful
|
|
// process shutdown.
|
|
//
|
|
// If no handler is set, the default behavior is to panic.
|
|
func SetFatalErrorHandler(fn func(context.Context, error)) {
|
|
fatalErrorMu.Lock()
|
|
defer fatalErrorMu.Unlock()
|
|
fatalErrorHandler = fn
|
|
fatalErrorOnce = sync.Once{} // reset so handler fires on next fatal error
|
|
}
|
|
|
|
// TriggerFatalError calls the registered fatal error handler exactly once.
|
|
// If no handler is registered, it panics (legacy behavior).
|
|
func TriggerFatalError(ctx context.Context, err error) {
|
|
fatalErrorMu.RLock()
|
|
defer fatalErrorMu.RUnlock()
|
|
|
|
if fatalErrorHandler == nil {
|
|
panic(fmt.Sprintf("database is read-only, possible failover detected: %v", err))
|
|
}
|
|
|
|
fatalErrorOnce.Do(func() {
|
|
fatalErrorHandler(ctx, err)
|
|
})
|
|
}
|
|
|
|
// DoRetryErr is a sentinel error that RetryableError always treats as
// retryable (matched with errors.Is, so it may be wrapped). A transaction
// function run by WithRetryTxx can return it to request another attempt.
var DoRetryErr = errors.New("fleet datastore retry")
|
|
|
|
// TxFn is a unit of work executed inside a transaction. WithRetryTxx calls it
// with the open transaction; returning a non-nil error causes the transaction
// to be rolled back instead of committed.
type TxFn func(tx sqlx.ExtContext) error
|
|
|
|
// ReadTxFn is the read-only variant of TxFn; its tx argument exposes only the read methods.
|
|
// Unlike TxFn, the callback receives a DBReadTx rather than a sqlx.ExtContext.
type ReadTxFn func(tx DBReadTx) error
|
|
|
|
// WithRetryTxx provides a common way to commit/rollback a txFn wrapped in a retry with exponential backoff
|
|
func WithRetryTxx(ctx context.Context, db *sqlx.DB, fn TxFn, logger *slog.Logger) error {
|
|
operation := func() error {
|
|
tx, err := db.BeginTxx(ctx, nil)
|
|
if err != nil {
|
|
return ctxerr.Wrap(ctx, err, "create transaction")
|
|
}
|
|
|
|
defer func() {
|
|
if p := recover(); p != nil {
|
|
if err := tx.Rollback(); err != nil {
|
|
logger.ErrorContext(ctx, "error encountered during transaction panic rollback", "err", err)
|
|
}
|
|
panic(p)
|
|
}
|
|
}()
|
|
|
|
if err := fn(tx); err != nil {
|
|
rbErr := tx.Rollback()
|
|
if rbErr != nil && rbErr != sql.ErrTxDone {
|
|
// Consider rollback errors to be non-retryable
|
|
return backoff.Permanent(ctxerr.Wrapf(ctx, err, "got err '%s' rolling back after err", rbErr.Error()))
|
|
}
|
|
|
|
// Read-only errors indicate a DB failover occurred (primary demoted to reader).
|
|
// Trigger graceful shutdown so the orchestrator restarts and reconnects to the new primary.
|
|
if IsReadOnlyError(err) {
|
|
TriggerFatalError(ctx, err)
|
|
return backoff.Permanent(err)
|
|
}
|
|
|
|
if retryableError(err) {
|
|
return err
|
|
}
|
|
|
|
// Consider any other errors to be non-retryable
|
|
return backoff.Permanent(err)
|
|
}
|
|
|
|
if err := tx.Commit(); err != nil {
|
|
err = ctxerr.Wrap(ctx, err, "commit transaction")
|
|
|
|
if IsReadOnlyError(err) {
|
|
TriggerFatalError(ctx, err)
|
|
return backoff.Permanent(err)
|
|
}
|
|
|
|
if retryableError(err) {
|
|
return err
|
|
}
|
|
|
|
return backoff.Permanent(err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
expBo := backoff.NewExponentialBackOff()
|
|
// MySQL innodb_lock_wait_timeout default is 50 seconds, so transaction can be waiting for a lock for several seconds.
|
|
// Setting a higher MaxElapsedTime to increase probability that transaction will be retried.
|
|
// This will reduce the number of retryable 'Deadlock found' errors. However, with a loaded DB, we will still see
|
|
// 'Context cancelled' errors when the server drops long-lasting connections.
|
|
expBo.MaxElapsedTime = 1 * time.Minute
|
|
bo := backoff.WithMaxRetries(expBo, 5)
|
|
return backoff.Retry(operation, bo)
|
|
}
|
|
|
|
// RetryableError determines whether a MySQL error can be retried. By default
|
|
// errors are considered non-retryable. Only errors that we know have a
|
|
// possibility of succeeding on a retry should return true in this function.
|
|
func RetryableError(err error) bool {
|
|
base := ctxerr.Cause(err)
|
|
if b, ok := base.(*mysql.MySQLError); ok {
|
|
switch b.Number {
|
|
// Consider lock related errors to be retryable
|
|
case mysqlerr.ER_LOCK_DEADLOCK, mysqlerr.ER_LOCK_WAIT_TIMEOUT:
|
|
return true
|
|
}
|
|
}
|
|
if errors.Is(err, DoRetryErr) {
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// retryableError is the internal (non-exported) version that calls RetryableError.
|
|
// Kept for backwards compatibility with existing callers in this package.
|
|
func retryableError(err error) bool {
|
|
return RetryableError(err)
|
|
}
|