fleet/server/platform/mysql/retry.go
Victor Lyuboslavsky bf9180e6e3
slog migration: initLogger + serve.go + cron + schedule (#40699)
<!-- Add the related story/sub-task/bug number, like Resolves #123, or
remove if NA -->
**Related issue:** Resolves #40540 

Almost done with slog migration.

# Checklist for submitter

- [ ] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
  - Changes present in previous PR

## Testing

- [x] Added/updated automated tests
- [x] QA'd all new/changed functionality manually


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **Chores**
* Updated internal logging infrastructure to use Go's standard logging
library, modernizing the logging system while maintaining existing
functionality and error handling behavior.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-02-27 14:29:27 -06:00

150 lines
4.4 KiB
Go

package mysql
import (
"context"
"database/sql"
"errors"
"fmt"
"log/slog"
"sync"
"time"
"github.com/VividCortex/mysqlerr"
"github.com/cenkalti/backoff/v4"
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
"github.com/go-sql-driver/mysql"
"github.com/jmoiron/sqlx"
)
// Package-level state shared by SetFatalErrorHandler and TriggerFatalError.
var (
// fatalErrorMu guards fatalErrorHandler and fatalErrorOnce:
// SetFatalErrorHandler takes the write lock, TriggerFatalError the read lock.
fatalErrorMu sync.RWMutex
// fatalErrorHandler is the callback invoked by TriggerFatalError. While it is
// nil, TriggerFatalError panics instead.
fatalErrorHandler func(context.Context, error)
// fatalErrorOnce limits the handler to one invocation per registration;
// SetFatalErrorHandler replaces it with a fresh sync.Once.
fatalErrorOnce sync.Once
)
// SetFatalErrorHandler installs fn as the callback that TriggerFatalError
// invokes when a fatal database error is detected, such as the primary
// becoming read-only during an Aurora failover. The handler is expected to
// start a graceful process shutdown.
//
// Every call re-arms the trigger, so the newly installed handler runs (at most
// once) on the next fatal error even if an earlier handler already fired.
//
// While no handler is installed (or after passing nil), TriggerFatalError
// panics instead.
func SetFatalErrorHandler(fn func(context.Context, error)) {
	fatalErrorMu.Lock()
	// Swap in a fresh Once first so the new handler gets its own single shot.
	fatalErrorOnce = sync.Once{}
	fatalErrorHandler = fn
	fatalErrorMu.Unlock()
}
// TriggerFatalError reports a fatal database error to the handler registered
// with SetFatalErrorHandler. The handler runs at most once per registration;
// later calls are no-ops until SetFatalErrorHandler is called again.
//
// If no handler is registered, it panics with a message that includes err
// (legacy behavior).
//
// The read lock is held for the duration of the handler call.
func TriggerFatalError(ctx context.Context, err error) {
	fatalErrorMu.RLock()
	// Deferred so the lock is also released on the panic path below.
	defer fatalErrorMu.RUnlock()

	handler := fatalErrorHandler
	if handler == nil {
		panic(fmt.Sprintf("database is read-only, possible failover detected: %v", err))
	}

	fatalErrorOnce.Do(func() { handler(ctx, err) })
}
// DoRetryErr is a sentinel error that RetryableError always treats as
// retryable (matched with errors.Is), so a transaction function can return or
// wrap it to ask WithRetryTxx for another attempt.
var DoRetryErr = errors.New("fleet datastore retry")
// TxFn is the signature of a function executed inside a transaction by
// WithRetryTxx; tx is the open transaction.
type TxFn func(tx sqlx.ExtContext) error
// ReadTxFn is the read-only variant of TxFn, with tx only exposing the read methods.
type ReadTxFn func(tx DBReadTx) error
// WithRetryTxx provides a common way to commit/rollback a txFn wrapped in a
// retry with exponential backoff.
//
// Each attempt begins a transaction on db, runs fn with it and commits. The
// outcome of an attempt is handled as follows:
//
//   - fn fails and the rollback itself fails (with anything other than
//     sql.ErrTxDone): the original error, annotated with the rollback error,
//     is returned without retrying.
//   - fn or the commit fails with a read-only error (IsReadOnlyError):
//     TriggerFatalError is called and the error is returned without retrying.
//   - fn or the commit fails with a retryable error (see RetryableError): the
//     attempt is retried after a backoff delay.
//   - any other fn or commit error is returned without retrying.
//   - a failure to begin the transaction is returned as an ordinary error to
//     the backoff loop, which may retry it.
//
// If fn panics, the transaction is rolled back (a rollback failure is logged
// with logger) and the panic is re-raised.
//
// Retries are limited to 5 and to one minute of total elapsed time. It
// returns nil once an attempt commits successfully.
func WithRetryTxx(ctx context.Context, db *sqlx.DB, fn TxFn, logger *slog.Logger) error {
	// classify maps a failure from fn or from the commit to what the backoff
	// loop should do with it: a plain error is retried, a Permanent one is not.
	classify := func(err error) error {
		// Read-only errors indicate a DB failover occurred (primary demoted to reader).
		// Trigger graceful shutdown so the orchestrator restarts and reconnects to the new primary.
		if IsReadOnlyError(err) {
			TriggerFatalError(ctx, err)
			return backoff.Permanent(err)
		}
		if retryableError(err) {
			return err
		}
		// Consider any other errors to be non-retryable
		return backoff.Permanent(err)
	}

	operation := func() error {
		tx, err := db.BeginTxx(ctx, nil)
		if err != nil {
			return ctxerr.Wrap(ctx, err, "create transaction")
		}

		// Make sure a panic inside fn does not leave the transaction open.
		defer func() {
			if p := recover(); p != nil {
				if err := tx.Rollback(); err != nil {
					logger.ErrorContext(ctx, "error encountered during transaction panic rollback", "err", err)
				}
				panic(p)
			}
		}()

		if err := fn(tx); err != nil {
			// sql.ErrTxDone only means the transaction was already finished,
			// so it is not treated as a rollback failure.
			if rbErr := tx.Rollback(); rbErr != nil && !errors.Is(rbErr, sql.ErrTxDone) {
				// Consider rollback errors to be non-retryable
				return backoff.Permanent(ctxerr.Wrapf(ctx, err, "got err '%s' rolling back after err", rbErr.Error()))
			}
			return classify(err)
		}

		if err := tx.Commit(); err != nil {
			return classify(ctxerr.Wrap(ctx, err, "commit transaction"))
		}

		return nil
	}

	expBo := backoff.NewExponentialBackOff()
	// MySQL innodb_lock_wait_timeout default is 50 seconds, so transaction can be waiting for a lock for several seconds.
	// Setting a higher MaxElapsedTime to increase probability that transaction will be retried.
	// This will reduce the number of retryable 'Deadlock found' errors. However, with a loaded DB, we will still see
	// 'Context cancelled' errors when the server drops long-lasting connections.
	expBo.MaxElapsedTime = 1 * time.Minute
	bo := backoff.WithMaxRetries(expBo, 5)
	return backoff.Retry(operation, bo)
}
// RetryableError reports whether err is worth retrying. Errors are
// non-retryable by default; only failures known to have a chance of
// succeeding on a later attempt yield true:
//
//   - a MySQL lock deadlock or lock wait timeout, identified from the root
//     cause of err;
//   - any error matching the DoRetryErr sentinel.
func RetryableError(err error) bool {
	if mysqlErr, isMySQL := ctxerr.Cause(err).(*mysql.MySQLError); isMySQL {
		// Consider lock related errors to be retryable
		if mysqlErr.Number == mysqlerr.ER_LOCK_DEADLOCK || mysqlErr.Number == mysqlerr.ER_LOCK_WAIT_TIMEOUT {
			return true
		}
	}

	return errors.Is(err, DoRetryErr)
}
// retryableError is the unexported alias of RetryableError. It only forwards
// the call and is kept so existing callers in this package need no changes.
func retryableError(err error) bool {
	retry := RetryableError(err)
	return retry
}