Clear cron schedule errors before each run (#26775)

For #26657

This PR fixes an issue that causes cron monitoring alerts to be sent
repeatedly after the first instance; that is, if a cron job fails once,
the monitor reports the failure every time it runs until the server
is restarted. This was because the errors were held in the Schedule
object, which persists for the lifetime of the process, rather than being
recreated for each run. The solution is to clear the errors from the
Schedule object before each run.

Added a test that fails on main and passes on this branch.
This commit is contained in:
Scott Gress 2025-03-03 16:41:48 -06:00 committed by GitHub
parent e552cd3c8e
commit 1a1d7bae78
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 36 additions and 0 deletions

View file

@@ -461,6 +461,8 @@ func (s *Schedule) runWithStats(statsType fleet.CronStatsType) {
// runAllJobs runs all jobs in the schedule.
func (s *Schedule) runAllJobs() {
// Clear errors from the schedule before each run.
s.errors = make(fleet.CronScheduleErrors)
for _, job := range s.jobs {
level.Debug(s.logger).Log("msg", "starting", "jobID", job.ID)
if err := runJob(s.ctx, job.Fn); err != nil {

View file

@@ -275,6 +275,40 @@ func TestMultipleJobsInOrder(t *testing.T) {
require.Contains(t, test_job_4_err.Error(), "oh no\n")
}
func TestClearScheduleErrors(t *testing.T) {
os.Setenv("TEST_CRON_NO_RECOVER", "0")
defer os.Unsetenv("TEST_CRON_NO_RECOVER")
ctx := context.Background()
errored := false
s := New(ctx, "test_schedule", "test_instance", 1000*time.Millisecond, NopLocker{}, SetUpMockStatsStore("test_schedule", fleet.CronStats{
ID: 1,
StatsType: fleet.CronStatsTypeScheduled,
Name: "test_schedule_clear_errors",
Instance: "test_instance",
CreatedAt: time.Now().Truncate(1 * time.Second),
UpdatedAt: time.Now().Truncate(1 * time.Second),
Status: fleet.CronStatsStatusCompleted,
}),
WithJob("test_job_1", func(ctx context.Context) error {
if !errored {
errored = true
return errors.New("oh well")
}
return nil
}),
)
// First run should return 1 error.
s.runAllJobs()
require.Equal(t, 1, len(s.errors))
// Second run should return no errors.
s.runAllJobs()
require.Equal(t, 0, len(s.errors))
}
func TestConfigReloadCheck(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
initialSchedInterval := 1 * time.Millisecond