From 1a1d7bae78de8f7c804d85ef6839134029e8dca9 Mon Sep 17 00:00:00 2001 From: Scott Gress Date: Mon, 3 Mar 2025 16:41:48 -0600 Subject: [PATCH] Clear cron schedule errors before each run (#26775) For #26657 This PR fixes an issue that causes cron monitoring alerts to be sent repeatedly after the first instance; that is, if a cron job fails once then the monitor reports the failure every time it runs until the server is restarted. This was due to the errors being held in the Schedule object which persists for the lifetime of the process, rather than being recreated for each run. The solution is to clear the errors from the Schedule object before each run. Added a test that fails on main and passes on this branch. --- server/service/schedule/schedule.go | 2 ++ server/service/schedule/schedule_test.go | 34 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/server/service/schedule/schedule.go b/server/service/schedule/schedule.go index 7cf3606c88..5c28a07728 100644 --- a/server/service/schedule/schedule.go +++ b/server/service/schedule/schedule.go @@ -461,6 +461,8 @@ func (s *Schedule) runWithStats(statsType fleet.CronStatsType) { // runAllJobs runs all jobs in the schedule. func (s *Schedule) runAllJobs() { + // Clear errors from the schedule before each run. + s.errors = make(fleet.CronScheduleErrors) for _, job := range s.jobs { level.Debug(s.logger).Log("msg", "starting", "jobID", job.ID) if err := runJob(s.ctx, job.Fn); err != nil { diff --git a/server/service/schedule/schedule_test.go b/server/service/schedule/schedule_test.go index 2e74bf0cba..bd2a07a449 100644 --- a/server/service/schedule/schedule_test.go +++ b/server/service/schedule/schedule_test.go @@ -275,6 +275,40 @@ func TestMultipleJobsInOrder(t *testing.T) { require.Contains(t, test_job_4_err.Error(), "oh no\n") } +func TestClearScheduleErrors(t *testing.T) { + os.Setenv("TEST_CRON_NO_RECOVER", "0") + defer os.Unsetenv("TEST_CRON_NO_RECOVER") + + ctx := context.Background() + errored := false + + s := New(ctx, "test_schedule", "test_instance", 1000*time.Millisecond, NopLocker{}, SetUpMockStatsStore("test_schedule", fleet.CronStats{ + ID: 1, + StatsType: fleet.CronStatsTypeScheduled, + Name: "test_schedule_clear_errors", + Instance: "test_instance", + CreatedAt: time.Now().Truncate(1 * time.Second), + UpdatedAt: time.Now().Truncate(1 * time.Second), + Status: fleet.CronStatsStatusCompleted, + }), + WithJob("test_job_1", func(ctx context.Context) error { + if !errored { + errored = true + return errors.New("oh well") + } + return nil + }), + ) + + // First run should return 1 error. + s.runAllJobs() + require.Equal(t, 1, len(s.errors)) + + // Second run should return no errors. + s.runAllJobs() + require.Equal(t, 0, len(s.errors)) +} + func TestConfigReloadCheck(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) initialSchedInterval := 1 * time.Millisecond