fleet/server/service/schedule/testing_utils.go

249 lines
5.4 KiB
Go
Raw Normal View History

package schedule
import (
"context"
"errors"
"sync"
"testing"
"time"
"github.com/fleetdm/fleet/v4/server/fleet"
)
2022-11-28 19:28:06 +00:00
// NopLocker is a stateless locker whose operations always succeed. Every
// argument passed to its methods is ignored.
type NopLocker struct{}

// Lock always reports the lock as acquired (true) with a nil error.
func (NopLocker) Lock(_ context.Context, _, _ string, _ time.Duration) (bool, error) {
	return true, nil
}

// Unlock does nothing and always returns a nil error.
func (NopLocker) Unlock(_ context.Context, _, _ string) error {
	return nil
}
// NopStatsStore is a stateless cron stats store stub: the methods defined on
// it in this file store nothing and always return zero/empty results with a
// nil error.
type NopStatsStore struct{}
2022-12-16 18:00:42 +00:00
func (NopStatsStore) GetLatestCronStats(ctx context.Context, name string) ([]fleet.CronStats, error) {
return []fleet.CronStats{}, nil
2022-11-28 19:28:06 +00:00
}
// InsertCronStats records nothing. It always returns ID 0 and a nil error.
func (NopStatsStore) InsertCronStats(
	ctx context.Context,
	statsType fleet.CronStatsType,
	name, instance string,
	status fleet.CronStatsStatus,
) (int, error) {
	return 0, nil
}
Monitor and alert on errors in cron jobs (#24347) for #19930 # Checklist for submitter - [X] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. - [X] Input data is properly validated, `SELECT *` is avoided, SQL injection is prevented (using placeholders for values in statements) - [X] Added/updated tests - [X] If database migrations are included, checked table schema to confirm autoupdate - [X] Manual QA for all new/changed functionality # Details This PR adds a new feature to the existing monitoring add-on. The add-on will now send an SNS alert whenever a scheduled job like "vulnerabilities" or "apple_mdm_apns_pusher" exits early due to errors. The alert contains the job type and the set of errors (there can be multiple, since jobs can have multiple sub-jobs). By default the SNS topic for this new alert is the same as the one for the existing cron system alerts, but it can be configured to use a separate topic (e.g. dogfood instance will post to a separate slack channel). The actual changes are: **On the server side:** - Add errors field to cron_stats table (json DEFAULT NULL) - Added errors var to `Schedule` struct to collect errors from jobs - In `RunAllJobs`, collect err from job into new errors var - Update `Schedule.updateStats`and `CronStats.UpdateCronStats`to accept errors argument - If provided, update errors field of cron_stats table **On the monitor side:** - Add new SQL query to look for all completed schedules since last run with non-null errors - send SNS with job ID, name, errors # Testing New automated testing was added for the functional code that gathers and stores errors from cron runs in the database. To test the actual Lambda, I added a row in my `cron_stats` table with errors, then compiled and ran the Lambda executable locally, pointing it to my local mysql and localstack instances: ``` 2024/12/03 14:43:54 main.go:258: Lambda execution environment not found. 
Falling back to local execution. 2024/12/03 14:43:54 main.go:133: Connected to database! 2024/12/03 14:43:54 main.go:161: Row vulnerabilities last updated at 2024-11-27 03:30:03 +0000 UTC 2024/12/03 14:43:54 main.go:163: *** 1h hasn't updated in more than vulnerabilities, alerting! (status completed) 2024/12/03 14:43:54 main.go:70: Sending SNS Message 2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev Message: Fleet cron 'vulnerabilities' hasn't updated in more than 1h. Last status was 'completed' at 2024-11-27 03:30:03 +0000 UTC.' to 'arn:aws:sns:us-east-1:000000000000:topic1' 2024/12/03 14:43:54 main.go:82: { MessageId: "260864ff-4cc9-4951-acea-cef883b2de5f" } 2024/12/03 14:43:54 main.go:198: *** mdm_apple_profile_manager job had errors, alerting! (errors {"something": "wrong"}) 2024/12/03 14:43:54 main.go:70: Sending SNS Message 2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev Message: Fleet cron 'mdm_apple_profile_manager' (last updated 2024-12-03 20:34:14 +0000 UTC) raised errors during its run: {"something": "wrong"}.' to 'arn:aws:sns:us-east-1:000000000000:topic1' 2024/12/03 14:43:54 main.go:82: { MessageId: "5cd085ef-89f6-42c1-8470-d80a22b295f8"
2024-12-19 21:55:29 +00:00
func (NopStatsStore) UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus, cronErrors *fleet.CronScheduleErrors) error {
2022-11-28 19:28:06 +00:00
return nil
}
Trigger vuln processing when it runs on a separate server (#39612) <!-- Add the related story/sub-task/bug number, like Resolves #123, or remove if NA --> **Related issue:** Resolves #35239 Docs PR: #39770 ## Remote trigger approach When FLEET_VULNERABILITIES_DISABLE_SCHEDULE=true, the main Fleet server registers a RemoteTriggerSchedule instead of the real vulnerability schedule. When a user runs fleetctl trigger --name=vulnerabilities: 1. Main server: RemoteTriggerSchedule.Trigger() inserts a cron_stats record with status=queued. 2. Worker server: The vulnerability schedule runs with WithTriggerPollInterval(60s), which starts a poll goroutine that checks the DB every 60s for queued records. 3. Pickup: When the poll goroutine finds a queued record, it sends the stats ID on the trigger channel (non-blocking). 4. Execution: The trigger handler acquires the lock, claims the record via ClaimCronStats (updating status to pending and instance to the actual worker ID), runs all jobs, and marks it completed. Key details: - The trigger channel carries an int: 0 for in-process triggers, >0 for DB-polled stats IDs. This lets runWithStats reuse the existing record instead of inserting a new one. - Both Schedule.Trigger() and RemoteTriggerSchedule.Trigger() treat pending and queued as conflicts to prevent duplicate runs. - Queued records expire after 2 hours via CleanupCronStats, same as pending records. - The poll goroutine only signals; it doesn't modify DB state. The handler claims when ready. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Added support for remote trigger execution in vulnerability scheduling workflows. * Implemented periodic polling mechanism to detect and process externally triggered vulnerability scans. * **Bug Fixes** * Enhanced trigger status tracking to properly handle queued scan jobs. 
* **Improvements** * Strengthened scheduling system with improved timeout and cancellation management capabilities. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-02-17 15:18:03 +00:00
// ClaimCronStats claims nothing. All arguments are ignored and the returned
// error is always nil.
func (NopStatsStore) ClaimCronStats(
	ctx context.Context,
	id int,
	instance string,
	status fleet.CronStatsStatus,
) error {
	return nil
}
// SetupMockLocker builds a MockLock for the lock called name whose initial
// holder is owner and whose initial expiration is expiresAt. The notification
// channels start out nil; call AddChannels to create them.
func SetupMockLocker(name string, owner string, expiresAt time.Time) *MockLock {
	lock := new(MockLock)
	lock.name = name
	lock.owner = owner
	lock.expiresAt = expiresAt
	return lock
}

// MockLock is an in-memory mock of a single named lock. It tracks one owner
// and an expiration time, counts successful Lock and Unlock calls, and can
// optionally signal each of them on a channel.
type MockLock struct {
	// mu is held by every method for as long as it reads or writes the
	// fields below.
	mu sync.Mutex

	name      string    // the only lock name Lock and Unlock accept
	owner     string    // owner recorded by the constructor or the last successful Lock
	expiresAt time.Time // once the current time is past this, any owner may acquire the lock

	Locked      chan struct{} // when non-nil, receives one value per successful Lock
	LockCount   int           // number of successful Lock calls
	Unlocked    chan struct{} // when non-nil, receives one value per successful Unlock
	UnlockCount int           // number of successful Unlock calls
}

// Lock tries to acquire the lock for owner.
//
// It returns an error if name is not the name the mock was created with.
// Otherwise the lock is granted when owner already holds it or when the
// current expiration has passed; in that case the owner is recorded, the
// expiration becomes now+expiration, LockCount is incremented and, if Locked
// is non-nil, a value is sent on it. When the lock is held by a different
// owner and has not expired, Lock returns (false, nil). ctx is not used.
func (l *MockLock) Lock(ctx context.Context, name string, owner string, expiration time.Duration) (bool, error) {
	l.mu.Lock()
	defer l.mu.Unlock()

	if l.name != name {
		return false, errors.New("name doesn't match")
	}

	now := time.Now()
	heldByOther := l.owner != owner && !now.After(l.expiresAt)
	if heldByOther {
		return false, nil
	}

	l.owner, l.expiresAt = owner, now.Add(expiration)
	l.LockCount++
	if l.Locked != nil {
		// The send happens while mu is held. Channels made by AddChannels are
		// unbuffered, so this waits until a receiver takes the value.
		l.Locked <- struct{}{}
	}
	return true, nil
}

// Unlock releases the lock held by owner.
//
// It returns an error if name or owner does not match the mock's current
// state. On success UnlockCount is incremented, a value is sent on Unlocked
// when that channel is non-nil, and the expiration is reset to the current
// time. ctx is not used.
func (l *MockLock) Unlock(ctx context.Context, name string, owner string) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	switch {
	case l.name != name:
		return errors.New("name doesn't match")
	case l.owner != owner:
		return errors.New("owner doesn't match")
	}

	l.UnlockCount++
	if l.Unlocked != nil {
		// Sent while mu is held and before the expiration is reset below.
		l.Unlocked <- struct{}{}
	}
	l.expiresAt = time.Now()
	return nil
}

// GetLockCount returns LockCount, read while holding the mutex.
func (l *MockLock) GetLockCount() int {
	l.mu.Lock()
	count := l.LockCount
	l.mu.Unlock()
	return count
}

// GetExpiration returns the current expiration time, read while holding the
// mutex.
func (l *MockLock) GetExpiration() time.Time {
	l.mu.Lock()
	expiry := l.expiresAt
	l.mu.Unlock()
	return expiry
}

// AddChannels creates a fresh unbuffered channel for each requested name:
// "locked" sets Locked and "unlocked" sets Unlocked. Any other name fails the
// test immediately through t. The returned error is always nil.
func (l *MockLock) AddChannels(t *testing.T, chanNames ...string) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	for i := range chanNames {
		switch which := chanNames[i]; which {
		case "locked":
			l.Locked = make(chan struct{})
		case "unlocked":
			l.Unlocked = make(chan struct{})
		default:
			t.Fatalf("unrecognized channel name")
		}
	}
	return nil
}
// MockStatsStore is an in-memory mock of a cron stats store. Records live in
// a map keyed by their ID, and calls to some methods can optionally be
// signalled on channels.
type MockStatsStore struct {
	// Mutex is embedded; each method of the mock locks it for the duration
	// of the call.
	sync.Mutex

	// stats holds the stored records, keyed by CronStats.ID.
	stats map[int]fleet.CronStats

	// When non-nil, each of these channels receives one value every time the
	// correspondingly named method (GetLatestCronStats, InsertCronStats,
	// UpdateCronStats) is called.
	GetStatsCalled, InsertStatsCalled, UpdateStatsCalled chan struct{}
}
2022-12-16 18:00:42 +00:00
func (m *MockStatsStore) GetLatestCronStats(ctx context.Context, name string) ([]fleet.CronStats, error) {
m.Lock()
defer m.Unlock()
if m.GetStatsCalled != nil {
m.GetStatsCalled <- struct{}{}
}
2022-12-16 18:00:42 +00:00
latest := make(map[fleet.CronStatsType]fleet.CronStats)
for _, s := range m.stats {
if s.Name != name {
continue
2022-12-16 18:00:42 +00:00
}
curr := latest[s.StatsType]
if s.CreatedAt.Before(curr.CreatedAt) {
continue
}
2022-12-16 18:00:42 +00:00
latest[s.StatsType] = s
}
res := []fleet.CronStats{}
if s, ok := latest[fleet.CronStatsTypeScheduled]; ok {
res = append(res, s)
}
if s, ok := latest[fleet.CronStatsTypeTriggered]; ok {
res = append(res, s)
}
return res, nil
}
func (m *MockStatsStore) InsertCronStats(ctx context.Context, statsType fleet.CronStatsType, name string, instance string, status fleet.CronStatsStatus) (int, error) {
m.Lock()
defer m.Unlock()
if m.InsertStatsCalled != nil {
m.InsertStatsCalled <- struct{}{}
}
id := len(m.stats) + 1
2022-12-16 18:00:42 +00:00
m.stats[id] = fleet.CronStats{ID: id, StatsType: statsType, Name: name, Instance: instance, Status: status, CreatedAt: time.Now().Truncate(1 * time.Second), UpdatedAt: time.Now().Truncate(time.Second)}
return id, nil
}
Monitor and alert on errors in cron jobs (#24347) for #19930 # Checklist for submitter - [X] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. - [X] Input data is properly validated, `SELECT *` is avoided, SQL injection is prevented (using placeholders for values in statements) - [X] Added/updated tests - [X] If database migrations are included, checked table schema to confirm autoupdate - [X] Manual QA for all new/changed functionality # Details This PR adds a new feature to the existing monitoring add-on. The add-on will now send an SNS alert whenever a scheduled job like "vulnerabilities" or "apple_mdm_apns_pusher" exits early due to errors. The alert contains the job type and the set of errors (there can be multiple, since jobs can have multiple sub-jobs). By default the SNS topic for this new alert is the same as the one for the existing cron system alerts, but it can be configured to use a separate topic (e.g. dogfood instance will post to a separate slack channel). The actual changes are: **On the server side:** - Add errors field to cron_stats table (json DEFAULT NULL) - Added errors var to `Schedule` struct to collect errors from jobs - In `RunAllJobs`, collect err from job into new errors var - Update `Schedule.updateStats`and `CronStats.UpdateCronStats`to accept errors argument - If provided, update errors field of cron_stats table **On the monitor side:** - Add new SQL query to look for all completed schedules since last run with non-null errors - send SNS with job ID, name, errors # Testing New automated testing was added for the functional code that gathers and stores errors from cron runs in the database. To test the actual Lambda, I added a row in my `cron_stats` table with errors, then compiled and ran the Lambda executable locally, pointing it to my local mysql and localstack instances: ``` 2024/12/03 14:43:54 main.go:258: Lambda execution environment not found. 
Falling back to local execution. 2024/12/03 14:43:54 main.go:133: Connected to database! 2024/12/03 14:43:54 main.go:161: Row vulnerabilities last updated at 2024-11-27 03:30:03 +0000 UTC 2024/12/03 14:43:54 main.go:163: *** 1h hasn't updated in more than vulnerabilities, alerting! (status completed) 2024/12/03 14:43:54 main.go:70: Sending SNS Message 2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev Message: Fleet cron 'vulnerabilities' hasn't updated in more than 1h. Last status was 'completed' at 2024-11-27 03:30:03 +0000 UTC.' to 'arn:aws:sns:us-east-1:000000000000:topic1' 2024/12/03 14:43:54 main.go:82: { MessageId: "260864ff-4cc9-4951-acea-cef883b2de5f" } 2024/12/03 14:43:54 main.go:198: *** mdm_apple_profile_manager job had errors, alerting! (errors {"something": "wrong"}) 2024/12/03 14:43:54 main.go:70: Sending SNS Message 2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev Message: Fleet cron 'mdm_apple_profile_manager' (last updated 2024-12-03 20:34:14 +0000 UTC) raised errors during its run: {"something": "wrong"}.' to 'arn:aws:sns:us-east-1:000000000000:topic1' 2024/12/03 14:43:54 main.go:82: { MessageId: "5cd085ef-89f6-42c1-8470-d80a22b295f8"
2024-12-19 21:55:29 +00:00
func (m *MockStatsStore) UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus, cronErrors *fleet.CronScheduleErrors) error {
m.Lock()
defer m.Unlock()
if m.UpdateStatsCalled != nil {
m.UpdateStatsCalled <- struct{}{}
}
s, ok := m.stats[id]
if !ok {
return errors.New("update failed, id not found")
}
s.Status = status
2022-12-16 18:00:42 +00:00
s.UpdatedAt = time.Now().Truncate(1 * time.Second)
m.stats[id] = s
return nil
}
Trigger vuln processing when it runs on a separate server (#39612) <!-- Add the related story/sub-task/bug number, like Resolves #123, or remove if NA --> **Related issue:** Resolves #35239 Docs PR: #39770 ## Remote trigger approach When FLEET_VULNERABILITIES_DISABLE_SCHEDULE=true, the main Fleet server registers a RemoteTriggerSchedule instead of the real vulnerability schedule. When a user runs fleetctl trigger --name=vulnerabilities: 1. Main server: RemoteTriggerSchedule.Trigger() inserts a cron_stats record with status=queued. 2. Worker server: The vulnerability schedule runs with WithTriggerPollInterval(60s), which starts a poll goroutine that checks the DB every 60s for queued records. 3. Pickup: When the poll goroutine finds a queued record, it sends the stats ID on the trigger channel (non-blocking). 4. Execution: The trigger handler acquires the lock, claims the record via ClaimCronStats (updating status to pending and instance to the actual worker ID), runs all jobs, and marks it completed. Key details: - The trigger channel carries an int: 0 for in-process triggers, >0 for DB-polled stats IDs. This lets runWithStats reuse the existing record instead of inserting a new one. - Both Schedule.Trigger() and RemoteTriggerSchedule.Trigger() treat pending and queued as conflicts to prevent duplicate runs. - Queued records expire after 2 hours via CleanupCronStats, same as pending records. - The poll goroutine only signals; it doesn't modify DB state. The handler claims when ready. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Added support for remote trigger execution in vulnerability scheduling workflows. * Implemented periodic polling mechanism to detect and process externally triggered vulnerability scans. * **Bug Fixes** * Enhanced trigger status tracking to properly handle queued scan jobs. 
* **Improvements** * Strengthened scheduling system with improved timeout and cancellation management capabilities. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-02-17 15:18:03 +00:00
// ClaimCronStats overwrites the status and instance of the record with the
// given id and bumps its UpdatedAt to the current time truncated to whole
// seconds. It returns an error when no record with that id exists. ctx is not
// used.
func (m *MockStatsStore) ClaimCronStats(ctx context.Context, id int, instance string, status fleet.CronStatsStatus) error {
	m.Lock()
	defer m.Unlock()

	rec, found := m.stats[id]
	if !found {
		return errors.New("claim failed, id not found")
	}

	rec.Status, rec.Instance = status, instance
	rec.UpdatedAt = time.Now().Truncate(time.Second)
	m.stats[id] = rec
	return nil
}
// AddChannels creates a fresh unbuffered channel for each requested name.
// Accepted names are "GetStatsCalled", "InsertStatsCalled" and
// "UpdateStatsCalled", each setting the field of the same name. Any other
// name fails the test immediately through t. The returned error is always
// nil.
func (m *MockStatsStore) AddChannels(t *testing.T, chanNames ...string) error {
	m.Lock()
	defer m.Unlock()

	for i := range chanNames {
		switch which := chanNames[i]; which {
		case "GetStatsCalled":
			m.GetStatsCalled = make(chan struct{})
		case "InsertStatsCalled":
			m.InsertStatsCalled = make(chan struct{})
		case "UpdateStatsCalled":
			m.UpdateStatsCalled = make(chan struct{})
		default:
			t.Fatalf("unrecognized channel name")
		}
	}
	return nil
}
// SetUpMockStatsStore returns a MockStatsStore pre-populated with
// initialStats, each stored under its own ID (a later entry replaces an
// earlier one with the same ID). The notification channels are left nil. The
// name argument is currently unused.
func SetUpMockStatsStore(name string, initialStats ...fleet.CronStats) *MockStatsStore {
	seeded := make(map[int]fleet.CronStats, len(initialStats))
	for i := range initialStats {
		seeded[initialStats[i].ID] = initialStats[i]
	}
	return &MockStatsStore{stats: seeded}
}