fleet/server/service/schedule/schedule.go
Victor Lyuboslavsky bf9180e6e3
slog migration: initLogger + serve.go + cron + schedule (#40699)
<!-- Add the related story/sub-task/bug number, like Resolves #123, or
remove if NA -->
**Related issue:** Resolves #40540 

Almost done with slog migration.

# Checklist for submitter

- [ ] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
  - Changes present in previous PR

## Testing

- [x] Added/updated automated tests
- [x] QA'd all new/changed functionality manually


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **Chores**
* Updated internal logging infrastructure to use Go's standard logging
library, modernizing the logging system while maintaining existing
functionality and error handling behavior.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-02-27 14:29:27 -06:00

851 lines
29 KiB
Go

// Package schedule allows periodic run of a list of jobs.
//
// Type Schedule allows grouping a set of Jobs to run at specific intervals.
// Each Job is executed serially in the order they were added to the Schedule.
package schedule
import (
"context"
"fmt"
"log/slog"
"os"
"runtime/debug"
"sync"
"time"
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
"github.com/fleetdm/fleet/v4/server/fleet"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// ReloadInterval reloads and returns a new interval.
//
// A running Schedule configured with WithConfigReloadInterval calls it
// periodically. When it returns an error, the error is logged and the schedule
// keeps its current interval.
type ReloadInterval func(ctx context.Context) (time.Duration, error)
// Schedule runs a list of jobs serially at a given schedule.
//
// Each job is executed one after the other in the order they were added.
// If one of the jobs fails, an error is logged and the scheduler
// continues with the next.
type Schedule struct {
// ctx is the context given to New; the goroutines launched by Start exit when it is done.
ctx context.Context
// name identifies the schedule in logs, span names and stats records, and is
// the lock name unless altLockName is set.
name string
// instanceID identifies this instance when locking and when recording stats.
instanceID string
// logger receives all log output of the schedule.
logger *slog.Logger
defaultPrevRunCreatedAt time.Time // default timestamp of previous run for the schedule if none exists, time.Now if not set
mu sync.Mutex // protects schedInterval and intervalStartedAt
// schedInterval is the interval between scheduled runs. It is always stored
// after passing through truncateSecondsWithFloor.
schedInterval time.Duration
intervalStartedAt time.Time // start time of the most recent run of the scheduled jobs
trigger chan int // 0 for in-process trigger, >0 for claimed stats ID from DB poll
// done is closed once every goroutine launched by Start has returned.
done chan struct{}
// configReloadInterval is how often configReloadIntervalFn is called.
configReloadInterval time.Duration
// configReloadIntervalFn, when non-nil, is polled for an updated schedInterval.
configReloadIntervalFn ReloadInterval
// locker is used to acquire/release the schedule lock around each run.
locker Locker
// altLockName, when non-empty, replaces name as the lock identifier.
altLockName string
// jobs are run serially, in slice order, on every run.
jobs []Job
// errors holds the errors of the most recent run keyed by job ID; it is reset
// at the start of every run.
errors fleet.CronScheduleErrors
// statsStore persists and retrieves the run records of this schedule.
statsStore CronStatsStore
// triggerPollInterval, when > 0, makes Start poll statsStore for queued triggers.
triggerPollInterval time.Duration
// runOnce makes Start a no-op when the latest scheduled run already completed.
runOnce bool
}
// JobFn is the signature of a Job.
//
// A non-nil returned error is recorded against the job's ID and logged; it does
// not prevent the remaining jobs of the run from executing.
type JobFn func(context.Context) error
// Job represents a job that can be added to a Schedule.
type Job struct {
// ID is the unique identifier for the job. It is attached to the job's log
// lines and is the key under which the job's error is recorded.
ID string
// Fn is the job itself.
Fn JobFn
}
// Locker allows a Schedule to acquire a lock before running jobs.
type Locker interface {
// Lock attempts to take the lock named scheduleName on behalf of
// scheduleInstanceID. The Schedule passes its current interval as expiration
// and treats a false result as "another instance holds the lock".
Lock(ctx context.Context, scheduleName string, scheduleInstanceID string, expiration time.Duration) (bool, error)
// Unlock releases the lock named scheduleName on behalf of scheduleInstanceID.
Unlock(ctx context.Context, scheduleName string, scheduleInstanceID string) error
}
// CronStatsStore allows a Schedule to store and retrieve statistics pertaining to the Schedule.
type CronStatsStore interface {
// GetLatestCronStats returns a slice of no more than two cron stats records, where index 0 (if
// present) is the most recently created scheduled run, and index 1 (if present) represents a
// triggered run that is currently pending.
GetLatestCronStats(ctx context.Context, name string) ([]fleet.CronStats, error)
// InsertCronStats inserts cron stats for the named cron schedule. The returned
// int is the value the Schedule later passes as id to UpdateCronStats.
InsertCronStats(ctx context.Context, statsType fleet.CronStatsType, name string, instance string, status fleet.CronStatsStatus) (int, error)
// UpdateCronStats updates the status of the identified cron stats record. The
// Schedule passes the errors collected during the run as cronErrors.
UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus, cronErrors *fleet.CronScheduleErrors) error
// ClaimCronStats transitions a queued cron stats record to the given status
// and updates the instance to the worker that is claiming it.
ClaimCronStats(ctx context.Context, id int, instance string, status fleet.CronStatsStatus) error
}
// Option allows configuring a Schedule.
//
// Options are applied by New, in the order given, after the Schedule's
// required fields have been populated.
type Option func(*Schedule)
// WithLogger sets a logger for the Schedule.
//
// The logger is tagged with the schedule's name under the "schedule" key. A nil
// logger is ignored, leaving the Schedule's current logger in place.
func WithLogger(l *slog.Logger) Option {
	return func(s *Schedule) {
		if l == nil {
			// Calling With on a nil *slog.Logger panics; keep the current logger.
			return
		}
		s.logger = l.With("schedule", s.name)
	}
}
// WithConfigReloadInterval registers fn as the provider of the schedule
// interval and sets how often a running schedule calls it.
//
// Without this option the schedule never reloads its interval.
func WithConfigReloadInterval(interval time.Duration, fn ReloadInterval) Option {
	configure := func(sched *Schedule) {
		sched.configReloadIntervalFn = fn
		sched.configReloadInterval = interval
	}
	return configure
}
// WithAltLockID makes the Schedule acquire its lock under name instead of
// under the Schedule's own name.
//
// An empty name leaves the Schedule's name as the lock identifier.
func WithAltLockID(name string) Option {
	return func(sched *Schedule) { sched.altLockName = name }
}
// WithJob appends a job with the given id and function to the Schedule.
//
// Jobs run in the order in which they were added.
func WithJob(id string, fn JobFn) Option {
	job := Job{ID: id, Fn: fn}
	return func(sched *Schedule) {
		sched.jobs = append(sched.jobs, job)
	}
}
// WithRunOnce marks the Schedule as run-once: when once is true, Start does
// nothing if the latest scheduled run is already recorded as completed.
func WithRunOnce(once bool) Option {
	return func(sched *Schedule) { sched.runOnce = once }
}
// WithDefaultPrevRunCreatedAt sets the timestamp to assume for the previous
// run when the schedule has never run. When unset, the current time is used.
//
// This controls how soon the first run happens after Fleet starts: e.g. with a
// 1h interval and no recorded run, the default is a first run 1h after start.
func WithDefaultPrevRunCreatedAt(tm time.Time) Option {
	return func(sched *Schedule) { sched.defaultPrevRunCreatedAt = tm }
}
// WithTriggerPollInterval turns on polling for externally-queued trigger
// requests. With a positive interval, the running schedule checks the stats
// store every interval for a triggered record in "queued" status and, if it
// finds one, signals a triggered run. This is what allows a trigger received by
// one server to be executed by the server that actually runs the schedule.
func WithTriggerPollInterval(interval time.Duration) Option {
	return func(sched *Schedule) { sched.triggerPollInterval = interval }
}
// New builds a Schedule named name for the instance instanceID.
//
// Jobs are supplied through the WithJob option and run serially, in order,
// every interval (floored to whole seconds, minimum one second).
//
// locker guards each run; the schedule name (or the alternative lock ID) and
// instanceID identify the lock holder. statsStore records the runs. Options are
// applied in the order given. Unless a logger is supplied, log output is
// discarded; the config reload interval defaults to one hour.
func New(
	ctx context.Context,
	name string,
	instanceID string,
	interval time.Duration,
	locker Locker,
	statsStore CronStatsStore,
	opts ...Option,
) *Schedule {
	sch := new(Schedule)
	sch.ctx = ctx
	sch.name = name
	sch.instanceID = instanceID
	sch.logger = slog.New(slog.DiscardHandler)
	sch.trigger = make(chan int)
	sch.done = make(chan struct{})
	// by default we will check for updated config once per hour
	sch.configReloadInterval = 1 * time.Hour
	sch.schedInterval = truncateSecondsWithFloor(interval)
	sch.locker = locker
	sch.statsStore = statsStore

	for i := range opts {
		opts[i](sch)
	}

	// An option may have cleared the logger; fall back to discarding output.
	if sch.logger == nil {
		sch.logger = slog.New(slog.DiscardHandler)
	}
	sch.logger = sch.logger.With("instanceID", instanceID)
	sch.errors = make(fleet.CronScheduleErrors)

	return sch
}
// Start starts running the added jobs.
//
// All jobs must be added before calling Start.
//
// Start returns immediately. It launches background goroutines that all exit
// when the Schedule's context is done:
//   - the main loop, which reacts to trigger signals and to schedule ticks;
//   - a config reload loop, only when a reload function was configured;
//   - a queued-trigger poll loop, only when a trigger poll interval was configured.
//
// Once all of them have returned, the Done channel is closed.
//
// If the schedule is configured to run once and its latest scheduled run is
// already completed, Start returns without launching anything.
func (s *Schedule) Start() {
	prevScheduledRun, _, err := s.GetLatestStats(s.ctx)
	if err != nil {
		s.logger.ErrorContext(s.ctx, "start schedule", "err", err)
		ctxerr.Handle(s.ctx, err)
	}

	// if there is no previous run, set the start time to the specified default
	// time, falling back to current time.
	startedAt := prevScheduledRun.CreatedAt
	if startedAt.IsZero() {
		startedAt = s.defaultPrevRunCreatedAt
		if startedAt.IsZero() {
			startedAt = time.Now()
		}
	} else if s.runOnce && prevScheduledRun.Status == fleet.CronStatsStatusCompleted {
		// If job is set to run once, and it already ran, then nothing to do
		return
	}
	s.setIntervalStartedAt(startedAt)

	// The first tick comes quickly (at most 10s) so that the tick handler can
	// compute the real remaining wait from the stored stats.
	initialWait := 10 * time.Second
	if schedInterval := s.getSchedInterval(); schedInterval < initialWait {
		initialWait = schedInterval
	}
	schedTicker := time.NewTicker(initialWait)

	var g sync.WaitGroup
	g.Add(1)
	go func() {
		defer func() {
			s.releaseLock(s.ctx)
			g.Done()
		}()

		for {
			// intervalStartedAt is guarded by s.mu, so read it through its getter.
			s.logger.DebugContext(s.ctx, fmt.Sprintf("%v remaining until next tick", s.getRemainingInterval(s.getIntervalStartedAt())))

			select {
			case <-s.ctx.Done():
				schedTicker.Stop()
				return

			case claimedStatsID := <-s.trigger:
				// Create a root span for the entire triggered execution
				ctx, span := startRootSpan(s.ctx, "cron.triggered."+s.name,
					attribute.String("cron.name", s.name),
					attribute.String("cron.instance", s.instanceID),
					attribute.String("cron.type", "triggered"),
				)
				s.logger.DebugContext(ctx, "done, trigger received")

				ok, cancelHold := s.holdLock(ctx)
				if !ok {
					s.logger.DebugContext(ctx, "unable to acquire lock")
					span.End()
					continue
				}

				// If this is a DB-polled trigger, claim the queued record now
				// that we hold the lock and are ready to run. This updates the
				// instance to the actual worker instance ID.
				if claimedStatsID > 0 {
					if err := s.statsStore.ClaimCronStats(ctx, claimedStatsID, s.instanceID, fleet.CronStatsStatusPending); err != nil {
						s.logger.ErrorContext(ctx, "claiming queued trigger", "err", err)
						ctxerr.Handle(ctx, err)
						// there is an issue with this stats record; fall through to create a new stats record
						claimedStatsID = 0
					}
				}

				s.runWithStats(ctx, fleet.CronStatsTypeTriggered, claimedStatsID)

				prevScheduledRun, _, err := s.GetLatestStats(ctx)
				if err != nil {
					s.logger.ErrorContext(ctx, "trigger get cron stats", "err", err)
					ctxerr.Handle(ctx, err)
				}

				clearScheduleChannels(s.trigger, schedTicker.C) // in case another signal arrived during this run

				intervalStartedAt := s.getIntervalStartedAt()
				if prevScheduledRun.CreatedAt.After(intervalStartedAt) {
					// if there's a diff between the datastore and our local value, we use the
					// more recent timestamp and update our local value accordingly
					s.setIntervalStartedAt(prevScheduledRun.CreatedAt)
					intervalStartedAt = s.getIntervalStartedAt()
				}

				// if the triggered run spanned the schedule interval, we need to wait until the start of the next full interval
				schedInterval := s.getSchedInterval()
				// we use a 2s tolerance here (same as the scheduled tick branch) because MySQL timestamps are truncated to 1s
				if time.Since(intervalStartedAt) > schedInterval+2*time.Second {
					newStart := intervalStartedAt.Add(time.Since(intervalStartedAt).Truncate(schedInterval)) // advances start time by the number of full intervals elapsed
					s.setIntervalStartedAt(newStart)
					schedTicker.Reset(s.getRemainingInterval(newStart))
					s.logger.DebugContext(ctx, fmt.Sprintf("triggered run spanned schedule interval, new wait %v", s.getRemainingInterval(newStart)))
				}

				cancelHold()
				span.End()

			case <-schedTicker.C:
				// Create a root span for the entire scheduled tick processing
				ctx, span := startRootSpan(s.ctx, "cron.scheduled_tick."+s.name,
					attribute.String("cron.name", s.name),
					attribute.String("cron.instance", s.instanceID),
					attribute.String("cron.type", "scheduled_tick"),
				)
				s.logger.DebugContext(ctx, "done, tick received")

				schedInterval := s.getSchedInterval()

				prevScheduledRun, prevTriggeredRun, err := s.GetLatestStats(ctx)
				if err != nil {
					s.logger.ErrorContext(ctx, "get cron stats", "err", err)
					ctxerr.Handle(ctx, err)
					// skip ahead to the next interval
					schedTicker.Reset(schedInterval)
					span.End()
					continue
				}

				if prevScheduledRun.Status == fleet.CronStatsStatusPending || prevTriggeredRun.Status == fleet.CronStatsStatusPending {
					// skip ahead to the next interval
					s.logger.InfoContext(ctx, fmt.Sprintf("pending job might still be running, wait %v", schedInterval))
					schedTicker.Reset(schedInterval)
					span.End()
					continue
				}

				intervalStartedAt := s.getIntervalStartedAt()
				if prevScheduledRun.CreatedAt.After(intervalStartedAt) {
					// if there's a diff between the datastore and our local value, we use the
					// more recent timestamp and update our local value accordingly
					s.setIntervalStartedAt(prevScheduledRun.CreatedAt)
					intervalStartedAt = s.getIntervalStartedAt()
				}

				if time.Since(intervalStartedAt) < schedInterval {
					// wait for the remaining interval plus a small buffer
					newWait := s.getRemainingInterval(intervalStartedAt) + 100*time.Millisecond
					s.logger.InfoContext(ctx, fmt.Sprintf("wait remaining interval %v", newWait))
					schedTicker.Reset(newWait)
					span.End()
					continue
				}

				// if the previous run took longer than the schedule interval, we wait until the start of the next full interval
				if time.Since(intervalStartedAt) > schedInterval+2*time.Second { // we use a 2s tolerance here because MySQL timestamps are truncated to 1s
					newStart := intervalStartedAt.Add(time.Since(intervalStartedAt).Truncate(schedInterval)) // advances start time by the number of full intervals elapsed
					s.setIntervalStartedAt(newStart)
					schedTicker.Reset(s.getRemainingInterval(newStart))
					s.logger.DebugContext(ctx, fmt.Sprintf("prior run spanned schedule interval, new wait %v", s.getRemainingInterval(newStart)))
					span.End()
					continue
				}

				ok, cancelHold := s.holdLock(ctx)
				if !ok {
					s.logger.DebugContext(ctx, "unable to acquire lock")
					schedTicker.Reset(schedInterval)
					span.End()
					continue
				}

				newStart := time.Now()
				s.setIntervalStartedAt(newStart)

				s.runWithStats(ctx, fleet.CronStatsTypeScheduled, 0)

				// we need to re-synchronize this schedule instance so that the next scheduled run
				// starts at the beginning of the next full interval
				//
				// for example, if the interval is 1hr and the schedule takes 0.2 hrs to run
				// then we wait 0.8 hrs until the next time we run the schedule, or if the
				// the schedule takes 1.5 hrs to run then we wait 0.5 hrs (skipping the scheduled
				// tick that would have overlapped with the 1.5hrs running time)
				schedInterval = s.getSchedInterval()
				if time.Since(newStart) > schedInterval {
					s.logger.InfoContext(ctx, fmt.Sprintf("total runtime (%v) exceeded schedule interval (%v)", time.Since(newStart), schedInterval))
					newStart = newStart.Add(time.Since(newStart).Truncate(schedInterval)) // advances start time by the number of full intervals elapsed
					s.setIntervalStartedAt(newStart)
				}
				clearScheduleChannels(s.trigger, schedTicker.C) // in case another signal arrived during this run
				schedTicker.Reset(s.getRemainingInterval(newStart))

				cancelHold()
				span.End()
			}
		}
	}()

	if s.configReloadIntervalFn != nil {
		// WithConfigReloadInterval option applies so we periodically check for config updates and
		// reset the schedInterval for the previous loop
		g.Add(1)
		go func() {
			defer g.Done()

			configTicker := time.NewTicker(s.configReloadInterval)
			for {
				select {
				case <-s.ctx.Done():
					configTicker.Stop()
					return

				case <-configTicker.C:
					prevInterval := s.getSchedInterval()
					newInterval, err := s.configReloadIntervalFn(s.ctx)
					if err != nil {
						s.logger.ErrorContext(s.ctx, "schedule interval config reload failed", "err", err)
						ctxerr.Handle(s.ctx, err)
						continue
					}

					newInterval = truncateSecondsWithFloor(newInterval)
					if newInterval <= 0 {
						s.logger.DebugContext(s.ctx, "config reload interval method returned invalid interval")
						continue
					}
					if prevInterval == newInterval {
						continue
					}
					s.setSchedInterval(newInterval)

					// If the new interval has already elapsed since the current
					// interval started, tick almost immediately; otherwise wait
					// out what remains of the new interval.
					intervalStartedAt := s.getIntervalStartedAt()
					newWait := 10 * time.Millisecond
					if time.Since(intervalStartedAt) < newInterval {
						newWait = s.getRemainingInterval(intervalStartedAt)
					}

					clearScheduleChannels(s.trigger, schedTicker.C)
					schedTicker.Reset(newWait)

					s.logger.DebugContext(s.ctx, fmt.Sprintf("new schedule interval %v", newInterval))
					s.logger.DebugContext(s.ctx, fmt.Sprintf("time until next schedule tick %v", newWait))
				}
			}
		}()
	}

	if s.triggerPollInterval > 0 {
		g.Go(func() {
			pollTicker := time.NewTicker(s.triggerPollInterval)
			for {
				select {
				case <-s.ctx.Done():
					pollTicker.Stop()
					return
				case <-pollTicker.C:
					s.pollForQueuedTrigger()
				}
			}
		})
	}

	go func() {
		g.Wait()
		s.logger.DebugContext(s.ctx, "close schedule")
		close(s.done) // communicates that the scheduler has finished running its goroutines
		schedTicker.Stop()
	}()
}
// Trigger asks the schedule to perform an ad-hoc run of all jobs, unless a run
// is already pending (or a triggered run is already queued). In that case the
// stats of that run are returned and nothing is signalled.
//
// didTrigger reports whether the signal was actually delivered to the trigger
// channel. When the channel is not ready to receive, the call still succeeds
// with a nil error: from the caller's perspective a blocked or unavailable
// channel is not treated as a failure.
func (s *Schedule) Trigger(ctx context.Context) (stats *fleet.CronStats, didTrigger bool, err error) {
	scheduledRun, triggeredRun, err := s.GetLatestStats(ctx)
	if err != nil {
		return nil, false, err
	}
	if scheduledRun.Status == fleet.CronStatsStatusPending {
		return &scheduledRun, false, nil
	}
	if st := triggeredRun.Status; st == fleet.CronStatsStatusPending || st == fleet.CronStatsStatusQueued {
		return &triggeredRun, false, nil
	}

	// Non-blocking send: the main loop only receives when it is idle.
	select {
	case s.trigger <- 0:
		return nil, true, nil
	default:
		s.logger.DebugContext(ctx, "trigger channel not available")
		return nil, false, nil
	}
}
// Name returns the name of the schedule, as given to New.
func (s *Schedule) Name() string {
return s.name
}
// runWithStats runs every job of the schedule and records the run.
//
// When existingStatsID is > 0 that record (claimed by the trigger handler
// before this call) is reused; otherwise a new record of the given statsType
// is inserted with "pending" status. After the jobs have run, the record is
// updated to "completed". Failures to insert or update the record are logged
// and do not stop the run.
func (s *Schedule) runWithStats(ctx context.Context, statsType fleet.CronStatsType, existingStatsID int) {
	recordID := existingStatsID
	if existingStatsID == 0 {
		id, insertErr := s.insertStats(ctx, statsType, fleet.CronStatsStatusPending)
		if insertErr != nil {
			s.logger.ErrorContext(ctx, fmt.Sprintf("insert cron stats %s", s.name), "err", insertErr)
			ctxerr.Handle(ctx, insertErr)
		}
		recordID = id
		s.logger.InfoContext(ctx, "pending")
	}

	s.runAllJobs(ctx)

	updateErr := s.updateStats(ctx, recordID, fleet.CronStatsStatusCompleted)
	if updateErr != nil {
		s.logger.ErrorContext(ctx, fmt.Sprintf("update cron stats %s", s.name), "err", updateErr)
		ctxerr.Handle(ctx, updateErr)
	}
	s.logger.InfoContext(ctx, "completed")
}
// runAllJobs executes the schedule's jobs one after the other, passing ctx to
// each. The schedule's error map is reset first; a failing job has its error
// stored under its ID and logged, and the remaining jobs still run.
func (s *Schedule) runAllJobs(ctx context.Context) {
	s.errors = make(fleet.CronScheduleErrors)

	for i := range s.jobs {
		current := s.jobs[i]
		s.logger.DebugContext(ctx, "starting", "jobID", current.ID)

		jobErr := runJob(ctx, current.Fn)
		if jobErr == nil {
			continue
		}
		s.errors[current.ID] = jobErr
		s.logger.ErrorContext(ctx, "running job", "err", jobErr, "jobID", current.ID)
		ctxerr.Handle(ctx, jobErr)
	}
}
// pollForQueuedTrigger looks up the latest triggered stats record and, when it
// is in "queued" status, offers its ID to the trigger handler.
//
// The offer never blocks: if the handler is busy the record stays queued and a
// later poll retries. The handler, not this method, claims the record.
func (s *Schedule) pollForQueuedTrigger() {
	ctx, span := startRootSpan(s.ctx, "cron.trigger_poll."+s.name,
		attribute.String("cron.name", s.name),
		attribute.String("cron.instance", s.instanceID),
		attribute.String("cron.type", "trigger_poll"),
	)
	defer span.End()

	_, queued, err := s.GetLatestStats(ctx)
	if err != nil {
		s.logger.ErrorContext(ctx, "trigger poll get cron stats", "err", err)
		ctxerr.Handle(ctx, err)
		return
	}
	if queued.Status != fleet.CronStatsStatusQueued {
		return
	}

	select {
	case s.trigger <- queued.ID:
		s.logger.InfoContext(ctx, "picked up queued trigger", "stats_id", queued.ID)
	default:
		// handler busy; leave the record queued for the next poll
	}
}
// runJob calls fn with ctx and returns its error.
//
// A panic raised by fn is recovered and returned as an error containing the
// panic value followed by a stack trace, unless the TEST_CRON_NO_RECOVER
// environment variable is "1", in which case the panic propagates.
func runJob(ctx context.Context, fn JobFn) (err error) {
	defer func() {
		if os.Getenv("TEST_CRON_NO_RECOVER") == "1" { // for detecting panics in tests
			return
		}
		if recovered := recover(); recovered != nil {
			err = fmt.Errorf("%v\n%s", recovered, string(debug.Stack()))
		}
	}()

	return fn(ctx)
}
// Done returns a channel that will be closed when the scheduler's context is done
// and it has finished running its goroutines.
//
// NOTE(review): the channel is closed by a goroutine launched at the end of
// Start; when Start returns early (run-once schedule whose latest run already
// completed) nothing appears to close it — confirm callers do not wait on it
// in that case.
func (s *Schedule) Done() <-chan struct{} {
return s.done
}
// getSchedInterval returns the current schedule interval, read under s.mu.
func (s *Schedule) getSchedInterval() time.Duration {
	s.mu.Lock()
	interval := s.schedInterval
	s.mu.Unlock()
	return interval
}
// setSchedInterval stores the schedule interval under s.mu, after passing it
// through truncateSecondsWithFloor.
func (s *Schedule) setSchedInterval(interval time.Duration) {
	floored := truncateSecondsWithFloor(interval)

	s.mu.Lock()
	s.schedInterval = floored
	s.mu.Unlock()
}
// getIntervalStartedAt returns, under s.mu, the recorded start time of the
// current schedule interval.
func (s *Schedule) getIntervalStartedAt() time.Time {
	s.mu.Lock()
	startedAt := s.intervalStartedAt
	s.mu.Unlock()
	return startedAt
}
// setIntervalStartedAt records, under s.mu, the start time of the current
// schedule interval, truncated to a whole second.
func (s *Schedule) setIntervalStartedAt(start time.Time) {
	truncated := start.Truncate(time.Second)

	s.mu.Lock()
	s.intervalStartedAt = truncated
	s.mu.Unlock()
}
// getRemainingInterval returns how long remains until the next interval
// boundary counted from start: the interval minus the remainder of dividing
// the time since start by the interval. It returns 0 for a zero interval.
func (s *Schedule) getRemainingInterval(start time.Time) time.Duration {
	interval := s.getSchedInterval()
	if interval == 0 {
		// avoid a modulo by zero
		return 0
	}
	elapsedInCycle := time.Since(start) % interval
	return interval - elapsedInCycle
}
// acquireLock tries to take the schedule lock for this instance, using the
// current schedule interval as the lock expiration. It reports whether the
// lock is held; a locker error is logged and reported as not held.
func (s *Schedule) acquireLock(ctx context.Context) bool {
	acquired, err := s.locker.Lock(ctx, s.getLockName(), s.instanceID, s.getSchedInterval())
	switch {
	case err != nil:
		s.logger.ErrorContext(ctx, "lock failed", "err", err)
		ctxerr.Handle(ctx, err)
		return false
	case !acquired:
		s.logger.DebugContext(ctx, "not the lock leader, skipping")
		return false
	default:
		return true
	}
}
// releaseLock releases the schedule lock held by this instance. A failure is
// logged and otherwise ignored.
func (s *Schedule) releaseLock(ctx context.Context) {
	if err := s.locker.Unlock(ctx, s.getLockName(), s.instanceID); err != nil {
		s.logger.ErrorContext(ctx, "unlock failed", "err", err)
		ctxerr.Handle(ctx, err)
	}
}
// holdLock tries to take the schedule lock and keep it.
//
// On success it returns true and a context.CancelFunc. A background goroutine
// re-acquires the lock every 80% of the schedule interval until the returned
// function is called (or ctx is done), at which point it releases the lock and
// exits. When the lock cannot be taken, holdLock returns false and a nil
// CancelFunc.
func (s *Schedule) holdLock(ctx context.Context) (bool, context.CancelFunc) {
	if !s.acquireLock(ctx) {
		return false, nil
	}

	holdCtx, stopHolding := context.WithCancel(ctx)
	// hold timer is 80% of schedule interval
	renewAfter := func() time.Duration { return s.getSchedInterval() * 8 / 10 }

	go func() {
		renewTimer := time.NewTimer(renewAfter())
		for {
			select {
			case <-holdCtx.Done():
				if !renewTimer.Stop() {
					<-renewTimer.C
				}
				// holdCtx is already cancelled here, so release with the parent ctx.
				s.releaseLock(ctx)
				return
			case <-renewTimer.C:
				s.acquireLock(ctx)
				renewTimer.Reset(renewAfter())
			}
		}
	}()

	return true, stopHolding
}
// GetLatestStats fetches the latest stats records of the schedule from the
// stats store and returns them as (scheduled, triggered). Either value is the
// zero fleet.CronStats when the store returned no record of that type.
//
// It returns an error when the store fails or returns more than two records.
// Records of an unexpected type are logged and skipped.
func (s *Schedule) GetLatestStats(ctx context.Context) (fleet.CronStats, fleet.CronStats, error) {
	// startSpan creates a child span when ctx carries a parent and a root span
	// otherwise; with OTEL disabled it yields a no-op span.
	ctx, span := startSpan(ctx, "cron.get_latest_stats", attribute.String("cron.name", s.name))
	defer span.End()

	records, err := s.statsStore.GetLatestCronStats(ctx, s.name)
	if err != nil {
		return fleet.CronStats{}, fleet.CronStats{}, err
	}
	if n := len(records); n > 2 {
		return fleet.CronStats{}, fleet.CronStats{}, fmt.Errorf("get latest stats expected length to be no more than two but got length: %d", n)
	}

	var scheduled, triggered fleet.CronStats
	for i := range records {
		switch rec := records[i]; rec.StatsType {
		case fleet.CronStatsTypeScheduled:
			scheduled = rec
		case fleet.CronStatsTypeTriggered:
			triggered = rec
		default:
			s.logger.ErrorContext(ctx, fmt.Sprintf("get latest stats unexpected type: %s", rec.StatsType))
		}
	}

	return scheduled, triggered, nil
}
// insertStats inserts a stats record of the given type and status for this
// schedule and instance, returning whatever the stats store returns.
func (s *Schedule) insertStats(ctx context.Context, statsType fleet.CronStatsType, status fleet.CronStatsStatus) (int, error) {
return s.statsStore.InsertCronStats(ctx, statsType, s.name, s.instanceID, status)
}
// updateStats sets the status of the stats record identified by id and attaches
// the schedule's current job errors to it.
func (s *Schedule) updateStats(ctx context.Context, id int, status fleet.CronStatsStatus) error {
return s.statsStore.UpdateCronStats(ctx, id, status, &s.errors)
}
// getLockName returns the identifier used for locking: the alternative lock
// name when one was configured, otherwise the schedule's name.
func (s *Schedule) getLockName() string {
	if alt := s.altLockName; alt != "" {
		return alt
	}
	return s.name
}
// clearScheduleChannels drains the trigger and ticker channels without ever
// blocking: it keeps receiving from either channel until neither has a value
// ready. It is meant for signals that arrived while a run was in progress,
// which are expected to be dropped.
func clearScheduleChannels(trigger chan int, ticker <-chan time.Time) {
	for drained := false; !drained; {
		select {
		case <-trigger:
			// discard a pending trigger signal
		case <-ticker:
			// discard a pending tick
		default:
			drained = true
		}
	}
}
// truncateSecondsWithFloor truncates d to whole seconds and applies a one
// second floor (e.g., 600ms becomes 1s, 1000ms stays 1s, 1300ms becomes 1s,
// 2700ms becomes 2s; zero and negative durations become 1s).
func truncateSecondsWithFloor(d time.Duration) time.Duration {
	const floor = time.Second
	if whole := d.Truncate(time.Second); whole > floor {
		return whole
	}
	return floor
}
// startRootSpan starts a new root span named name with the given attributes.
//
// Cron jobs run in background goroutines that have no parent HTTP context, so
// the span is explicitly detached from any span already carried by ctx.
//
// When OpenTelemetry is not configured at the application level this is a
// no-op:
//  1. With tracing disabled (config.Logging.TracingEnabled is false),
//     otel.SetTracerProvider() is never called in /cmd/fleet/serve.go, so the
//     global tracer provider remains unset.
//  2. otel.Tracer() then returns a no-op tracer.
//  3. Its Start() returns a no-op span: minimal performance impact (it
//     essentially returns immediately) while context propagation still works.
func startRootSpan(ctx context.Context, name string, attrs ...attribute.KeyValue) (context.Context, trace.Span) {
	tracer := otel.Tracer("github.com/fleetdm/fleet/v4/server/service/schedule")
	opts := []trace.SpanStartOption{
		trace.WithNewRoot(),
		trace.WithSpanKind(trace.SpanKindInternal),
		trace.WithAttributes(attrs...),
	}
	return tracer.Start(ctx, name, opts...)
}
// startSpan starts a span named name with the given attributes, as a child of
// the span carried by ctx, if any.
// If OpenTelemetry is not configured at the application level, this will be a no-op.
func startSpan(ctx context.Context, name string, attrs ...attribute.KeyValue) (context.Context, trace.Span) {
	tracer := otel.Tracer("github.com/fleetdm/fleet/v4/server/service/schedule")
	opts := []trace.SpanStartOption{
		trace.WithSpanKind(trace.SpanKindInternal),
		trace.WithAttributes(attrs...),
	}
	return tracer.Start(ctx, name, opts...)
}
// RemoteTriggerSchedule implements fleet.CronSchedule for schedules that run on
// a remote server. Instead of running jobs locally, Trigger() inserts a "queued"
// record in the database that the remote server's poll goroutine picks up.
// This is registered on servers where the actual schedule is disabled (e.g.,
// when FLEET_VULNERABILITIES_DISABLE_SCHEDULE=true).
type RemoteTriggerSchedule struct {
// name is the name of the schedule whose runs are triggered remotely.
name string
// statsStore is where pending/queued runs are looked up and queued records inserted.
statsStore CronStatsStore
}
// NewRemoteTriggerSchedule returns a RemoteTriggerSchedule for the schedule
// called name, backed by statsStore for its database operations.
func NewRemoteTriggerSchedule(name string, statsStore CronStatsStore) *RemoteTriggerSchedule {
	rts := new(RemoteTriggerSchedule)
	rts.name = name
	rts.statsStore = statsStore
	return rts
}
// Trigger queues a run for the remote server by inserting a triggered stats
// record in "queued" status, and reports true when it did so.
//
// When the latest stats already contain a pending run (scheduled or
// triggered) or a queued triggered run, nothing is inserted and that record is
// returned with false.
func (r *RemoteTriggerSchedule) Trigger(ctx context.Context) (*fleet.CronStats, bool, error) {
	ctx, span := startSpan(ctx, "cron.remote_trigger", attribute.String("cron.name", r.name))
	defer span.End()

	// NOTE: The read-then-insert below is not atomic, so concurrent trigger
	// requests could race and insert duplicate queued rows. This is acceptable
	// because triggering is a low-frequency manual admin operation, and the
	// worst-case outcome is the schedule running twice.
	latestStats, err := r.statsStore.GetLatestCronStats(ctx, r.name)
	if err != nil {
		return nil, false, err
	}

	for i := range latestStats {
		rec := latestStats[i]
		// A scheduled or triggered run is already in progress.
		inProgress := rec.Status == fleet.CronStatsStatusPending
		// A triggered run is already queued and waiting to be picked up.
		alreadyQueued := rec.StatsType == fleet.CronStatsTypeTriggered && rec.Status == fleet.CronStatsStatusQueued
		if inProgress || alreadyQueued {
			return &rec, false, nil
		}
	}

	if _, err := r.statsStore.InsertCronStats(ctx, fleet.CronStatsTypeTriggered, r.name, "trigger-api", fleet.CronStatsStatusQueued); err != nil {
		return nil, false, err
	}
	return nil, true, nil
}
// Name returns the schedule name given to NewRemoteTriggerSchedule.
func (r *RemoteTriggerSchedule) Name() string {
return r.name
}
// Start is a no-op since the actual schedule runs on a remote server. It
// launches no goroutines and performs no database access.
func (r *RemoteTriggerSchedule) Start() {}