Clear cron schedule errors before each run (#26775)

For #26657. This PR fixes an issue that causes cron monitoring alerts to be sent repeatedly after the first failure: if a cron job fails once, the monitor reports that failure every time the schedule runs until the server is restarted. This happened because the errors are held in the Schedule object, which persists for the lifetime of the process rather than being recreated for each run. The solution is to clear the errors from the Schedule object before each run. Added a test that fails on main and passes on this branch.
2026-05-23 08:58:41 +00:00 · 2025-03-03 16:41:48 -06:00 · 2025-03-03 16:41:48 -06:00 · 1a1d7bae78
commit 1a1d7bae78
parent e552cd3c8e
2 changed files with 36 additions and 0 deletions
--- a/server/service/schedule/schedule.go
+++ b/server/service/schedule/schedule.go
@ -461,6 +461,8 @@ func (s *Schedule) runWithStats(statsType fleet.CronStatsType) {

 // runAllJobs runs all jobs in the schedule.
 func (s *Schedule) runAllJobs() {
+	// Clear errors from the schedule before each run.
+	s.errors = make(fleet.CronScheduleErrors)
 	for _, job := range s.jobs {
 		level.Debug(s.logger).Log("msg", "starting", "jobID", job.ID)
 		if err := runJob(s.ctx, job.Fn); err != nil {
--- a/server/service/schedule/schedule_test.go
+++ b/server/service/schedule/schedule_test.go
@ -275,6 +275,40 @@ func TestMultipleJobsInOrder(t *testing.T) {
 	require.Contains(t, test_job_4_err.Error(), "oh no\n")
 }

+// TestClearScheduleErrors checks that runAllJobs starts every run with an empty s.errors, so a failure recorded by one run is gone after a later clean run.
+func TestClearScheduleErrors(t *testing.T) {
+	t.Setenv("TEST_CRON_NO_RECOVER", "0") // t.Setenv restores the previous value when the test finishes.
+
+	ctx := context.Background()
+	errored := false
+
+	s := New(ctx, "test_schedule", "test_instance", 1000*time.Millisecond, NopLocker{}, SetUpMockStatsStore("test_schedule", fleet.CronStats{
+		ID:        1,
+		StatsType: fleet.CronStatsTypeScheduled,
+		Name:      "test_schedule_clear_errors",
+		Instance:  "test_instance",
+		CreatedAt: time.Now().Truncate(1 * time.Second),
+		UpdatedAt: time.Now().Truncate(1 * time.Second),
+		Status:    fleet.CronStatsStatusCompleted,
+	}),
+		WithJob("test_job_1", func(ctx context.Context) error {
+			if !errored {
+				errored = true
+				return errors.New("oh well")
+			}
+			return nil
+		}),
+	)
+
+	// First run: the job fails on its first invocation, so exactly one error is recorded.
+	s.runAllJobs()
+	require.Len(t, s.errors, 1)
+
+	// Second run: the job now succeeds, and the error from the first run must not carry over.
+	s.runAllJobs()
+	require.Empty(t, s.errors)
+}
+
 func TestConfigReloadCheck(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	initialSchedInterval := 1 * time.Millisecond