fleet/server/service/schedule/schedule.go

497 lines
15 KiB
Go

// Package schedule runs lists of jobs periodically.
//
// Type Schedule groups a set of Jobs to run at specific intervals.
// Jobs are executed serially in the order they were added to the Schedule.
package schedule
import (
"context"
"fmt"
"sync"
"time"
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
"github.com/fleetdm/fleet/v4/server/fleet"
"github.com/getsentry/sentry-go"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
)
// ReloadInterval reloads and returns a new interval.
//
// A Schedule configured with WithConfigReloadInterval calls it periodically;
// when it returns an error the Schedule keeps its current interval.
type ReloadInterval func(ctx context.Context) (time.Duration, error)
// Schedule runs a list of jobs serially at a given schedule.
//
// Each job is executed one after the other in the order they were added.
// If one of the jobs fails, an error is logged and the scheduler
// continues with the next.
type Schedule struct {
// ctx is passed to every job, lock and stats call; the goroutines launched
// by Start exit once it is done.
ctx context.Context
// name identifies the schedule in logs and in the stats store, and is the
// lock name unless altLockName is set.
name string
// instanceID identifies this instance to the Locker and the stats store.
instanceID string
// logger defaults to a no-op logger; see WithLogger.
logger log.Logger
schedIntervalMu sync.Mutex // protects schedInterval.
// schedInterval is the time between runs. It is stored in whole seconds and
// is never less than one second.
schedInterval time.Duration
// done is closed once the goroutines launched by Start have exited.
done chan struct{}
// configReloadInterval is how often configReloadIntervalFn is called
// (one hour unless overridden).
configReloadInterval time.Duration
// configReloadIntervalFn, when non-nil, is polled for a new schedInterval.
configReloadIntervalFn ReloadInterval
// locker is used to acquire a lock before each run and release it afterwards.
locker Locker
// altLockName, when non-empty, replaces name as the lock identifier.
altLockName string
// jobs are run in slice order on every scheduled run.
jobs []Job
// statsStore records the start and completion of each run.
statsStore CronStatsStore
}
// JobFn is the function executed when a Job runs. It receives the Schedule's
// context; a returned error is logged and reported, and does not prevent the
// remaining jobs from running.
type JobFn func(context.Context) error
// Job represents a job that can be added to a Schedule.
type Job struct {
// ID is the unique identifier for the job. It labels the job's log entries.
ID string
// Fn is the job itself.
Fn JobFn
}
// Locker allows a Schedule to acquire a lock before running jobs.
type Locker interface {
// Lock attempts to take the lock named scheduleName on behalf of
// scheduleInstanceID. The Schedule treats a true result as holding the lock
// and a false result with a nil error as not being the lock leader. It passes
// its current schedule interval as expiration (presumably the lock's
// time-to-live; confirm against the implementation) and calls Lock again
// while jobs are running to extend a lock it already holds.
Lock(ctx context.Context, scheduleName string, scheduleInstanceID string, expiration time.Duration) (bool, error)
// Unlock releases the lock named scheduleName for scheduleInstanceID.
Unlock(ctx context.Context, scheduleName string, scheduleInstanceID string) error
}
// CronStatsStore allows a Schedule to store and retrieve statistics pertaining to the Schedule.
type CronStatsStore interface {
// GetLatestCronStats returns the most recent cron stats for the named cron schedule. If no rows
// are found, it returns an empty CronStats struct.
GetLatestCronStats(ctx context.Context, name string) (fleet.CronStats, error)
// InsertCronStats inserts cron stats for the named cron schedule. The Schedule passes the
// returned int to UpdateCronStats as the id of the record to update.
InsertCronStats(ctx context.Context, statsType fleet.CronStatsType, name string, instance string, status fleet.CronStatsStatus) (int, error)
// UpdateCronStats updates the status of the identified cron stats record.
UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus) error
}
// Option allows configuring a Schedule. Options are passed to New, which
// applies them in order to the Schedule it is building.
type Option func(*Schedule)
// WithLogger sets a logger for the Schedule.
//
// Every entry written through it carries a "schedule" key holding the
// Schedule's name. The name is read when the option is applied.
func WithLogger(l log.Logger) Option {
	apply := func(sch *Schedule) {
		tagged := log.With(l, "schedule", sch.name)
		sch.logger = tagged
	}
	return apply
}
// WithConfigReloadInterval configures interval reloading for a running
// schedule: every interval, fn is called to obtain the schedule interval to
// use from then on.
//
// If this option is not set, the schedule performs no interval reloading.
func WithConfigReloadInterval(interval time.Duration, fn ReloadInterval) Option {
	return Option(func(sch *Schedule) {
		sch.configReloadIntervalFn, sch.configReloadInterval = fn, interval
	})
}
// WithAltLockID makes the Schedule acquire its lock under name instead of
// under its own name.
//
// If this option is not set (or name is empty), the Schedule's name is used
// for acquiring the lock.
func WithAltLockID(name string) Option {
	var opt Option = func(sch *Schedule) { sch.altLockName = name }
	return opt
}
// WithJob appends a job with the given id and function to the Schedule.
//
// Jobs run in the order in which their options were applied.
func WithJob(id string, fn JobFn) Option {
	job := Job{ID: id, Fn: fn}
	return func(sch *Schedule) {
		sch.jobs = append(sch.jobs, job)
	}
}
// New creates and returns a Schedule.
// Jobs are added with the WithJob Option.
//
// The jobs are executed serially, in order, at the provided interval. The
// interval is stored in whole seconds with a minimum of one second.
//
// The provided locker is used to acquire/release a lock before running the
// jobs; the name and instanceID of the Schedule are used as the locking
// identifier. statsStore records each run.
//
// Defaults (a no-op logger, hourly config reload checks) are set first and the
// options are then applied in order, so an option can override a default.
func New(
	ctx context.Context,
	name string,
	instanceID string,
	interval time.Duration,
	locker Locker,
	statsStore CronStatsStore,
	opts ...Option,
) *Schedule {
	sch := new(Schedule)

	// caller-supplied identity and collaborators
	sch.ctx = ctx
	sch.name = name
	sch.instanceID = instanceID
	sch.locker = locker
	sch.statsStore = statsStore

	// defaults
	sch.logger = log.NewNopLogger()
	sch.done = make(chan struct{})
	sch.schedInterval = truncateSecondsWithFloor(interval)
	sch.configReloadInterval = 1 * time.Hour // by default we will check for updated config once per hour

	for i := range opts {
		opts[i](sch)
	}
	return sch
}
// Start starts running the added jobs.
//
// All jobs must be added before calling Start.
//
// Start returns immediately after launching its goroutines:
//   - a scheduling loop that waits on a ticker, decides from the latest cron
//     stats and the current interval whether a run is due, takes the lock,
//     runs every job in order and records the run in the stats store;
//   - when a reload function was configured, a loop that calls it every
//     configReloadInterval and re-arms the ticker when the interval changes;
//   - a watcher that closes the Done channel once the other goroutines exit.
//
// The goroutines exit when the Schedule's context is done.
func (s *Schedule) Start() {
	var intervalStartedAt time.Time // start time of the most recent run of the scheduled jobs
	var m sync.Mutex                // protects intervalStartedAt

	getIntervalStartedAt := func() (start time.Time) {
		m.Lock()
		defer m.Unlock()
		return intervalStartedAt
	}

	// the stored start time is always truncated to whole seconds
	setIntervalStartedAt := func(start time.Time) {
		m.Lock()
		defer m.Unlock()
		intervalStartedAt = start.Truncate(time.Second)
	}

	// seed the local start time from the most recent run recorded in the stats
	// store; a lookup error is reported but does not prevent the schedule from starting
	stats, err := s.getStats()
	if err != nil {
		level.Error(s.logger).Log("err", "start schedule", "details", err)
		sentry.CaptureException(err)
		ctxerr.Handle(s.ctx, err)
	}
	setIntervalStartedAt(stats.CreatedAt)

	// the first tick arrives after 10 seconds, or after one interval if that is shorter
	initialWait := 10 * time.Second
	if schedInterval := s.getSchedInterval(); schedInterval < initialWait {
		initialWait = schedInterval
	}
	schedTicker := time.NewTicker(initialWait)

	var g sync.WaitGroup
	g.Add(+1)
	go func() {
		defer func() {
			s.releaseLock()
			g.Done()
		}()

		for {
			level.Debug(s.logger).Log("waiting", "wait for next tick")

			select {
			case <-s.ctx.Done():
				schedTicker.Stop()
				return

			case <-schedTicker.C:
				level.Debug(s.logger).Log("waiting", "done")

				schedInterval := s.getSchedInterval()

				stats, err := s.getStats()
				if err != nil {
					level.Error(s.logger).Log("err", "get cron stats", "details", err)
					sentry.CaptureException(err)
					ctxerr.Handle(s.ctx, err)
					// skip ahead to the next interval
					schedTicker.Reset(schedInterval)
					continue
				}

				// the latest recorded run has not been marked completed yet
				if stats.Status == fleet.CronStatsStatusPending {
					// skip ahead to the next interval
					schedTicker.Reset(schedInterval)
					continue
				}

				prevStart := getIntervalStartedAt()
				if stats.CreatedAt.After(prevStart) {
					// if there's a diff between the datastore and our local value, we use the
					// more recent timestamp and update our local value accordingly
					setIntervalStartedAt(stats.CreatedAt)
					prevStart = getIntervalStartedAt()
				}

				remainingInterval := getRemainingInterval(prevStart, schedInterval)
				if time.Since(prevStart) < schedInterval {
					// wait for the remaining interval plus a small buffer
					schedTicker.Reset(remainingInterval + 100*time.Millisecond)
					continue
				}

				prevFinish := stats.UpdatedAt.Truncate(time.Second)
				prevRuntime := prevFinish.Sub(prevStart)
				if prevRuntime > schedInterval {
					// if the previous run took longer than the schedule interval, we wait until the start of the next full interval
					newStart := prevStart.Add(time.Since(prevStart).Truncate(schedInterval)) // advances start time by the number of full intervals elapsed
					setIntervalStartedAt(newStart)
					schedTicker.Reset(getRemainingInterval(newStart, schedInterval))
					continue
				}

				ok, cancelHold := s.holdLock()
				if !ok {
					// failed to get a lock so skip ahead to the next interval
					schedTicker.Reset(schedInterval)
					continue
				}

				newStart := time.Now()
				setIntervalStartedAt(newStart)

				// record the run as pending; an insert error is reported but the jobs still run
				level.Info(s.logger).Log("status", "pending")
				statsID, err := s.insertStats(fleet.CronStatsTypeScheduled, fleet.CronStatsStatusPending)
				if err != nil {
					level.Error(s.logger).Log("err", fmt.Sprintf("insert cron stats %s", s.name), "details", err)
					sentry.CaptureException(err)
					ctxerr.Handle(s.ctx, err)
				}

				// jobs run one after the other; a failing (or panicking) job is reported
				// and the loop moves on to the next one
				for _, job := range s.jobs {
					level.Debug(s.logger).Log("msg", "starting", "jobID", job.ID)
					if err := runJob(s.ctx, job.Fn); err != nil {
						level.Error(s.logger).Log("err", "running job", "details", err, "jobID", job.ID)
						sentry.CaptureException(err)
						ctxerr.Handle(s.ctx, err)
					}
				}

				level.Info(s.logger).Log("status", "completed")
				if err := s.updateStats(statsID, fleet.CronStatsStatusCompleted); err != nil {
					level.Error(s.logger).Log("err", fmt.Sprintf("update cron stats %s", s.name), "details", err)
					sentry.CaptureException(err)
					ctxerr.Handle(s.ctx, err)
				}

				// we need to re-synchronize this schedule instance so that the next scheduled run
				// starts at the beginning of the next full interval
				//
				// for example, if the interval is 1hr and the schedule takes 0.2 hrs to run
				// then we wait 0.8 hrs until the next time we run the schedule, or if the
				// schedule takes 1.5 hrs to run then we wait 0.5 hrs (skipping the scheduled
				// tick that would have overlapped with the 1.5hrs running time)
				schedInterval = s.getSchedInterval()
				if time.Since(newStart) > schedInterval {
					level.Info(s.logger).Log("msg", fmt.Sprintf("total runtime (%v) exceeded schedule interval (%v)", time.Since(newStart), schedInterval))
					newStart = newStart.Add(time.Since(newStart).Truncate(schedInterval)) // advances start time by the number of full intervals elapsed
				}
				remainingInterval = getRemainingInterval(newStart, schedInterval)
				clearTickerChannel(schedTicker) // in case another tick arrived during this run
				schedTicker.Reset(remainingInterval)

				// stop extending the lock; the hold goroutine releases it
				cancelHold()
			}
		}
	}()

	if s.configReloadIntervalFn != nil {
		// WithConfigReloadInterval option applies so we periodically check for config updates and
		// reset the schedInterval for the previous loop
		g.Add(+1)
		go func() {
			defer g.Done()

			configTicker := time.NewTicker(s.configReloadInterval)
			for {
				select {
				case <-s.ctx.Done():
					configTicker.Stop()
					return

				case <-configTicker.C:
					prevInterval := s.getSchedInterval()
					newInterval, err := s.configReloadIntervalFn(s.ctx)
					if err != nil {
						level.Error(s.logger).Log("err", "schedule interval config reload failed", "details", err)
						sentry.CaptureException(err)
						continue
					}

					// validate the raw value before normalizing it: truncateSecondsWithFloor
					// never returns less than one second, so checking afterwards would turn a
					// zero or negative interval into a one second schedule instead of rejecting it
					if newInterval <= 0 {
						level.Debug(s.logger).Log("msg", "config reload interval method returned invalid interval")
						continue
					}
					newInterval = truncateSecondsWithFloor(newInterval)
					if prevInterval == newInterval {
						continue
					}
					s.setSchedInterval(newInterval)

					// re-arm the schedule ticker: wait out what is left of the new interval
					// since the last run started, or tick almost immediately if it is overdue
					intervalStartedAt := getIntervalStartedAt()
					newWait := 10 * time.Millisecond
					if time.Since(intervalStartedAt) < newInterval {
						newWait = newInterval - time.Since(intervalStartedAt)
					}

					clearTickerChannel(schedTicker)
					schedTicker.Reset(newWait)

					level.Debug(s.logger).Log("msg", fmt.Sprintf("new schedule interval %v", newInterval))
					level.Debug(s.logger).Log("msg", fmt.Sprintf("time until next schedule tick %v", newWait))
				}
			}
		}()
	}

	go func() {
		g.Wait()
		level.Debug(s.logger).Log("msg", "close schedule")
		close(s.done) // communicates that the scheduler has finished running its goroutines
		schedTicker.Stop()
	}()
}
// runJob executes the job function with panic recovery.
//
// It returns whatever fn returns. If fn panics, the panic is recovered and
// returned as an error instead: a panic value that is itself an error is
// returned as is, so callers can still match it with errors.Is / errors.As;
// any other value is formatted with %v into a new error.
func runJob(ctx context.Context, fn JobFn) (err error) {
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		if cause, ok := r.(error); ok {
			// keep the original error (and its chain) rather than flattening it to a string
			err = cause
			return
		}
		err = fmt.Errorf("%v", r)
	}()

	return fn(ctx)
}
// Done returns a receive-only channel that is closed once the scheduler's
// context is done and the goroutines launched by Start have finished.
func (s *Schedule) Done() <-chan struct{} {
	var finished <-chan struct{} = s.done
	return finished
}
// getSchedInterval returns the current schedule interval. It is safe to call
// concurrently with setSchedInterval.
func (s *Schedule) getSchedInterval() time.Duration {
	s.schedIntervalMu.Lock()
	current := s.schedInterval
	s.schedIntervalMu.Unlock()
	return current
}
// setSchedInterval sets the schedule interval after truncating the duration to
// whole seconds and applying a one second floor (e.g., 600ms becomes 1s,
// 1300ms becomes 1s, 2500ms becomes 2s).
func (s *Schedule) setSchedInterval(interval time.Duration) {
	normalized := truncateSecondsWithFloor(interval)

	s.schedIntervalMu.Lock()
	s.schedInterval = normalized
	s.schedIntervalMu.Unlock()
}
// acquireLock asks the locker for the schedule's lock, using the current
// schedule interval as the lock expiration. It reports whether the lock was
// obtained; a locker error is logged and reported, and counts as not obtained.
func (s *Schedule) acquireLock() bool {
	lockName, expiration := s.getLockName(), s.getSchedInterval()

	locked, err := s.locker.Lock(s.ctx, lockName, s.instanceID, expiration)
	switch {
	case err != nil:
		level.Error(s.logger).Log("msg", "lock failed", "err", err)
		sentry.CaptureException(err)
		return false
	case !locked:
		level.Debug(s.logger).Log("msg", "not the lock leader, skipping")
		return false
	default:
		return true
	}
}
// releaseLock asks the locker to release the schedule's lock for this
// instance. A failure is logged and reported; nothing is returned to the caller.
func (s *Schedule) releaseLock() {
	if err := s.locker.Unlock(s.ctx, s.getLockName(), s.instanceID); err != nil {
		level.Error(s.logger).Log("msg", "unlock failed", "err", err)
		sentry.CaptureException(err)
	}
}
// holdLock attempts to acquire the schedule lock. If the lock cannot be
// acquired it returns `false, nil`.
//
// On success it returns `true` and a context.CancelFunc, and starts a
// goroutine that re-acquires (extends) the lock each time 80% of the current
// schedule interval has elapsed. The goroutine ends, releasing the lock, when
// the returned function is called or when the Schedule's context is done.
func (s *Schedule) holdLock() (bool, context.CancelFunc) {
	if !s.acquireLock() {
		return false, nil
	}

	// evaluated on every cycle so that a changed schedule interval is picked up
	renewAfter := func() time.Duration {
		return s.getSchedInterval() * 8 / 10 // hold timer is 80% of schedule interval
	}

	holdCtx, release := context.WithCancel(s.ctx)
	go func() {
		renew := time.NewTimer(renewAfter())
		for {
			select {
			case <-renew.C:
				s.acquireLock()
				renew.Reset(renewAfter())
			case <-holdCtx.Done():
				// drain the timer if it fired before it could be stopped
				if stopped := renew.Stop(); !stopped {
					<-renew.C
				}
				s.releaseLock()
				return
			}
		}
	}()

	return true, release
}
// getStats returns the latest cron stats the stats store holds for this
// schedule's name.
func (s *Schedule) getStats() (fleet.CronStats, error) {
	latest, err := s.statsStore.GetLatestCronStats(s.ctx, s.name)
	return latest, err
}
// insertStats records a new cron stats entry of the given type and status for
// this schedule name and instance, and returns the value the store reports for
// it (used by the caller as the record id).
func (s *Schedule) insertStats(statsType fleet.CronStatsType, status fleet.CronStatsStatus) (int, error) {
	id, err := s.statsStore.InsertCronStats(s.ctx, statsType, s.name, s.instanceID, status)
	return id, err
}
// updateStats sets the status of the cron stats record identified by id.
func (s *Schedule) updateStats(id int, status fleet.CronStatsStatus) error {
	err := s.statsStore.UpdateCronStats(s.ctx, id, status)
	return err
}
// getLockName returns the identifier used when locking: the alternative lock
// name if one was configured, otherwise the schedule's name.
func (s *Schedule) getLockName() string {
	if s.altLockName != "" {
		return s.altLockName
	}
	return s.name
}
// getRemainingInterval returns the interval minus the remainder of dividing
// the time since start by the interval, i.e. the time left until the next
// whole multiple of interval counted from start. A zero interval yields zero.
func getRemainingInterval(start time.Time, interval time.Duration) time.Duration {
	if interval == 0 {
		return 0
	}
	elapsed := time.Since(start)
	return interval - elapsed%interval
}
// clearTickerChannel discards a tick that is already waiting on the ticker's
// channel, if there is one. It never blocks.
func clearTickerChannel(ticker *time.Ticker) {
	select {
	default:
		// nothing pending
	case <-ticker.C:
		// pending tick discarded
	}
}
// truncateSecondsWithFloor returns the duration truncated to whole seconds,
// with a floor of one second (e.g., 600ms becomes 1s, 1300ms becomes 1s,
// 2500ms becomes 2s; zero and negative durations become 1s).
func truncateSecondsWithFloor(d time.Duration) time.Duration {
	const floor = 1 * time.Second

	if truncated := d.Truncate(time.Second); truncated > floor {
		return truncated
	}
	return floor
}