2022-11-16 21:14:38 +00:00
|
|
|
package schedule
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"errors"
|
|
|
|
|
"sync"
|
|
|
|
|
"testing"
|
|
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
"github.com/fleetdm/fleet/v4/server/fleet"
|
|
|
|
|
)
|
|
|
|
|
|
2022-11-28 19:28:06 +00:00
|
|
|
// NopLocker is a locker that performs no locking: every acquisition succeeds
// and every release is a no-op. All method arguments are ignored.
type NopLocker struct{}

// Lock always reports that the lock was acquired and never returns an error.
func (NopLocker) Lock(context.Context, string, string, time.Duration) (bool, error) {
	const acquired = true
	return acquired, nil
}

// Unlock does nothing and always returns a nil error.
func (NopLocker) Unlock(context.Context, string, string) error {
	var noErr error
	return noErr
}
|
|
|
|
|
|
|
|
|
|
type NopStatsStore struct{}
|
|
|
|
|
|
2022-12-16 18:00:42 +00:00
|
|
|
func (NopStatsStore) GetLatestCronStats(ctx context.Context, name string) ([]fleet.CronStats, error) {
|
|
|
|
|
return []fleet.CronStats{}, nil
|
2022-11-28 19:28:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (NopStatsStore) InsertCronStats(ctx context.Context, statsType fleet.CronStatsType, name string, instance string, status fleet.CronStatsStatus) (int, error) {
|
|
|
|
|
return 0, nil
|
|
|
|
|
}
|
|
|
|
|
|
Monitor and alert on errors in cron jobs (#24347)
for #19930
# Checklist for submitter
- [X] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
- [X] Input data is properly validated, `SELECT *` is avoided, SQL
injection is prevented (using placeholders for values in statements)
- [X] Added/updated tests
- [X] If database migrations are included, checked table schema to
confirm autoupdate
- [X] Manual QA for all new/changed functionality
# Details
This PR adds a new feature to the existing monitoring add-on. The add-on
will now send an SNS alert whenever a scheduled job like
"vulnerabilities" or "apple_mdm_apns_pusher" exits early due to errors.
The alert contains the job type and the set of errors (there can be
multiple, since jobs can have multiple sub-jobs). By default the SNS
topic for this new alert is the same as the one for the existing cron
system alerts, but it can be configured to use a separate topic (e.g.
dogfood instance will post to a separate slack channel).
The actual changes are:
**On the server side:**
- Add errors field to cron_stats table (json DEFAULT NULL)
- Added errors var to `Schedule` struct to collect errors from jobs
- In `RunAllJobs`, collect err from job into new errors var
- Update `Schedule.updateStats` and `CronStats.UpdateCronStats` to accept
errors argument
- If provided, update errors field of cron_stats table
**On the monitor side:**
- Add new SQL query to look for all completed schedules since last run
with non-null errors
- send SNS with job ID, name, errors
# Testing
New automated testing was added for the functional code that gathers and
stores errors from cron runs in the database. To test the actual Lambda,
I added a row in my `cron_stats` table with errors, then compiled and
ran the Lambda executable locally, pointing it to my local mysql and
localstack instances:
```
2024/12/03 14:43:54 main.go:258: Lambda execution environment not found. Falling back to local execution.
2024/12/03 14:43:54 main.go:133: Connected to database!
2024/12/03 14:43:54 main.go:161: Row vulnerabilities last updated at 2024-11-27 03:30:03 +0000 UTC
2024/12/03 14:43:54 main.go:163: *** 1h hasn't updated in more than vulnerabilities, alerting! (status completed)
2024/12/03 14:43:54 main.go:70: Sending SNS Message
2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev
Message: Fleet cron 'vulnerabilities' hasn't updated in more than 1h. Last status was 'completed' at 2024-11-27 03:30:03 +0000 UTC.' to 'arn:aws:sns:us-east-1:000000000000:topic1'
2024/12/03 14:43:54 main.go:82: {
MessageId: "260864ff-4cc9-4951-acea-cef883b2de5f"
}
2024/12/03 14:43:54 main.go:198: *** mdm_apple_profile_manager job had errors, alerting! (errors {"something": "wrong"})
2024/12/03 14:43:54 main.go:70: Sending SNS Message
2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev
Message: Fleet cron 'mdm_apple_profile_manager' (last updated 2024-12-03 20:34:14 +0000 UTC) raised errors during its run:
{"something": "wrong"}.' to 'arn:aws:sns:us-east-1:000000000000:topic1'
2024/12/03 14:43:54 main.go:82: {
MessageId: "5cd085ef-89f6-42c1-8470-d80a22b295f8"
2024-12-19 21:55:29 +00:00
|
|
|
func (NopStatsStore) UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus, cronErrors *fleet.CronScheduleErrors) error {
|
2022-11-28 19:28:06 +00:00
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
Trigger vuln processing when it runs on a separate server (#39612)
<!-- Add the related story/sub-task/bug number, like Resolves #123, or
remove if NA -->
**Related issue:** Resolves #35239
Docs PR: #39770
## Remote trigger approach
When FLEET_VULNERABILITIES_DISABLE_SCHEDULE=true, the main Fleet server
registers a RemoteTriggerSchedule instead of the real vulnerability
schedule. When a user runs fleetctl trigger --name=vulnerabilities:
1. Main server: RemoteTriggerSchedule.Trigger() inserts a cron_stats
record with status=queued.
2. Worker server: The vulnerability schedule runs with
WithTriggerPollInterval(60s), which starts a poll goroutine that checks
the DB every 60s for queued records.
3. Pickup: When the poll goroutine finds a queued record, it sends the
stats ID on the trigger channel (non-blocking).
4. Execution: The trigger handler acquires the lock, claims the record
via ClaimCronStats (updating status to pending and instance to the
actual worker ID), runs all jobs, and marks it completed.
Key details:
- The trigger channel carries an int: 0 for in-process triggers, >0 for
DB-polled stats IDs. This lets runWithStats reuse the existing record
instead of inserting a new one.
- Both Schedule.Trigger() and RemoteTriggerSchedule.Trigger() treat
pending and queued as conflicts to prevent duplicate runs.
- Queued records expire after 2 hours via CleanupCronStats, same as
pending records.
- The poll goroutine only signals; it doesn't modify DB state. The
handler claims when ready.
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
* **New Features**
* Added support for remote trigger execution in vulnerability scheduling
workflows.
* Implemented periodic polling mechanism to detect and process
externally triggered vulnerability scans.
* **Bug Fixes**
* Enhanced trigger status tracking to properly handle queued scan jobs.
* **Improvements**
* Strengthened scheduling system with improved timeout and cancellation
management capabilities.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-02-17 15:18:03 +00:00
|
|
|
func (NopStatsStore) ClaimCronStats(ctx context.Context, id int, instance string, status fleet.CronStatsStatus) error {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-16 21:14:38 +00:00
|
|
|
// SetupMockLocker returns a new MockLock for the lock called name, initially
// held by owner until expiresAt. The notification channels are left nil and
// both counters start at zero.
func SetupMockLocker(name string, owner string, expiresAt time.Time) *MockLock {
	ml := new(MockLock)
	ml.name = name
	ml.owner = owner
	ml.expiresAt = expiresAt
	return ml
}

// MockLock is an in-memory lock for tests. It tracks a single named lock, its
// current owner and expiration, and counts successful lock/unlock calls.
type MockLock struct {
	// mu is taken by every method before touching the fields below.
	mu sync.Mutex

	// name is the only lock name accepted; owner and expiresAt describe the
	// current holder.
	name, owner string
	expiresAt   time.Time

	// Locked, when non-nil, is sent a value on every successful Lock.
	// LockCount is the number of successful Lock calls.
	Locked    chan struct{}
	LockCount int

	// Unlocked, when non-nil, is sent a value on every successful Unlock.
	// UnlockCount is the number of successful Unlock calls.
	Unlocked    chan struct{}
	UnlockCount int
}
|
|
|
|
|
|
|
|
|
|
func (ml *MockLock) Lock(ctx context.Context, name string, owner string, expiration time.Duration) (bool, error) {
|
|
|
|
|
ml.mu.Lock()
|
|
|
|
|
defer ml.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
if name != ml.name {
|
|
|
|
|
return false, errors.New("name doesn't match")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
now := time.Now()
|
|
|
|
|
if ml.owner == owner || now.After(ml.expiresAt) {
|
|
|
|
|
ml.owner = owner
|
|
|
|
|
ml.expiresAt = now.Add(expiration)
|
2024-10-18 17:38:26 +00:00
|
|
|
ml.LockCount++
|
2022-11-16 21:14:38 +00:00
|
|
|
if ml.Locked != nil {
|
|
|
|
|
ml.Locked <- struct{}{}
|
|
|
|
|
}
|
|
|
|
|
return true, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (ml *MockLock) Unlock(ctx context.Context, name string, owner string) error {
|
|
|
|
|
ml.mu.Lock()
|
|
|
|
|
defer ml.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
if name != ml.name {
|
|
|
|
|
return errors.New("name doesn't match")
|
|
|
|
|
}
|
|
|
|
|
if owner != ml.owner {
|
|
|
|
|
return errors.New("owner doesn't match")
|
|
|
|
|
}
|
2024-10-18 17:38:26 +00:00
|
|
|
ml.UnlockCount++
|
2022-11-16 21:14:38 +00:00
|
|
|
if ml.Unlocked != nil {
|
|
|
|
|
ml.Unlocked <- struct{}{}
|
|
|
|
|
}
|
2023-03-03 18:14:10 +00:00
|
|
|
ml.expiresAt = time.Now()
|
2022-11-16 21:14:38 +00:00
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (ml *MockLock) GetLockCount() int {
|
|
|
|
|
ml.mu.Lock()
|
|
|
|
|
defer ml.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
return ml.LockCount
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-06 15:24:40 +00:00
|
|
|
func (ml *MockLock) GetExpiration() time.Time {
|
|
|
|
|
ml.mu.Lock()
|
|
|
|
|
defer ml.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
return ml.expiresAt
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-16 21:14:38 +00:00
|
|
|
func (ml *MockLock) AddChannels(t *testing.T, chanNames ...string) error {
|
|
|
|
|
ml.mu.Lock()
|
|
|
|
|
defer ml.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
for _, n := range chanNames {
|
|
|
|
|
switch n {
|
|
|
|
|
case "locked":
|
|
|
|
|
ml.Locked = make(chan struct{})
|
|
|
|
|
case "unlocked":
|
|
|
|
|
ml.Unlocked = make(chan struct{})
|
|
|
|
|
default:
|
|
|
|
|
t.Errorf("unrecognized channel name")
|
|
|
|
|
t.FailNow()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
type MockStatsStore struct {
|
|
|
|
|
sync.Mutex
|
|
|
|
|
stats map[int]fleet.CronStats
|
|
|
|
|
|
|
|
|
|
GetStatsCalled chan struct{}
|
|
|
|
|
InsertStatsCalled chan struct{}
|
|
|
|
|
UpdateStatsCalled chan struct{}
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-16 18:00:42 +00:00
|
|
|
func (m *MockStatsStore) GetLatestCronStats(ctx context.Context, name string) ([]fleet.CronStats, error) {
|
2022-11-16 21:14:38 +00:00
|
|
|
m.Lock()
|
|
|
|
|
defer m.Unlock()
|
|
|
|
|
|
|
|
|
|
if m.GetStatsCalled != nil {
|
|
|
|
|
m.GetStatsCalled <- struct{}{}
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-16 18:00:42 +00:00
|
|
|
latest := make(map[fleet.CronStatsType]fleet.CronStats)
|
|
|
|
|
for _, s := range m.stats {
|
|
|
|
|
if s.Name != name {
|
2022-11-16 21:14:38 +00:00
|
|
|
continue
|
2022-12-16 18:00:42 +00:00
|
|
|
}
|
|
|
|
|
curr := latest[s.StatsType]
|
|
|
|
|
if s.CreatedAt.Before(curr.CreatedAt) {
|
2022-11-16 21:14:38 +00:00
|
|
|
continue
|
|
|
|
|
}
|
2022-12-16 18:00:42 +00:00
|
|
|
latest[s.StatsType] = s
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
res := []fleet.CronStats{}
|
|
|
|
|
if s, ok := latest[fleet.CronStatsTypeScheduled]; ok {
|
|
|
|
|
res = append(res, s)
|
|
|
|
|
}
|
|
|
|
|
if s, ok := latest[fleet.CronStatsTypeTriggered]; ok {
|
|
|
|
|
res = append(res, s)
|
2022-11-16 21:14:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return res, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (m *MockStatsStore) InsertCronStats(ctx context.Context, statsType fleet.CronStatsType, name string, instance string, status fleet.CronStatsStatus) (int, error) {
|
|
|
|
|
m.Lock()
|
|
|
|
|
defer m.Unlock()
|
|
|
|
|
|
|
|
|
|
if m.InsertStatsCalled != nil {
|
|
|
|
|
m.InsertStatsCalled <- struct{}{}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
id := len(m.stats) + 1
|
2022-12-16 18:00:42 +00:00
|
|
|
m.stats[id] = fleet.CronStats{ID: id, StatsType: statsType, Name: name, Instance: instance, Status: status, CreatedAt: time.Now().Truncate(1 * time.Second), UpdatedAt: time.Now().Truncate(time.Second)}
|
2022-11-16 21:14:38 +00:00
|
|
|
|
|
|
|
|
return id, nil
|
|
|
|
|
}
|
|
|
|
|
|
Monitor and alert on errors in cron jobs (#24347)
for #19930
# Checklist for submitter
- [X] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
- [X] Input data is properly validated, `SELECT *` is avoided, SQL
injection is prevented (using placeholders for values in statements)
- [X] Added/updated tests
- [X] If database migrations are included, checked table schema to
confirm autoupdate
- [X] Manual QA for all new/changed functionality
# Details
This PR adds a new feature to the existing monitoring add-on. The add-on
will now send an SNS alert whenever a scheduled job like
"vulnerabilities" or "apple_mdm_apns_pusher" exits early due to errors.
The alert contains the job type and the set of errors (there can be
multiple, since jobs can have multiple sub-jobs). By default the SNS
topic for this new alert is the same as the one for the existing cron
system alerts, but it can be configured to use a separate topic (e.g.
dogfood instance will post to a separate slack channel).
The actual changes are:
**On the server side:**
- Add errors field to cron_stats table (json DEFAULT NULL)
- Added errors var to `Schedule` struct to collect errors from jobs
- In `RunAllJobs`, collect err from job into new errors var
- Update `Schedule.updateStats` and `CronStats.UpdateCronStats` to accept
errors argument
- If provided, update errors field of cron_stats table
**On the monitor side:**
- Add new SQL query to look for all completed schedules since last run
with non-null errors
- send SNS with job ID, name, errors
# Testing
New automated testing was added for the functional code that gathers and
stores errors from cron runs in the database. To test the actual Lambda,
I added a row in my `cron_stats` table with errors, then compiled and
ran the Lambda executable locally, pointing it to my local mysql and
localstack instances:
```
2024/12/03 14:43:54 main.go:258: Lambda execution environment not found. Falling back to local execution.
2024/12/03 14:43:54 main.go:133: Connected to database!
2024/12/03 14:43:54 main.go:161: Row vulnerabilities last updated at 2024-11-27 03:30:03 +0000 UTC
2024/12/03 14:43:54 main.go:163: *** 1h hasn't updated in more than vulnerabilities, alerting! (status completed)
2024/12/03 14:43:54 main.go:70: Sending SNS Message
2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev
Message: Fleet cron 'vulnerabilities' hasn't updated in more than 1h. Last status was 'completed' at 2024-11-27 03:30:03 +0000 UTC.' to 'arn:aws:sns:us-east-1:000000000000:topic1'
2024/12/03 14:43:54 main.go:82: {
MessageId: "260864ff-4cc9-4951-acea-cef883b2de5f"
}
2024/12/03 14:43:54 main.go:198: *** mdm_apple_profile_manager job had errors, alerting! (errors {"something": "wrong"})
2024/12/03 14:43:54 main.go:70: Sending SNS Message
2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev
Message: Fleet cron 'mdm_apple_profile_manager' (last updated 2024-12-03 20:34:14 +0000 UTC) raised errors during its run:
{"something": "wrong"}.' to 'arn:aws:sns:us-east-1:000000000000:topic1'
2024/12/03 14:43:54 main.go:82: {
MessageId: "5cd085ef-89f6-42c1-8470-d80a22b295f8"
2024-12-19 21:55:29 +00:00
|
|
|
func (m *MockStatsStore) UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus, cronErrors *fleet.CronScheduleErrors) error {
|
2022-11-16 21:14:38 +00:00
|
|
|
m.Lock()
|
|
|
|
|
defer m.Unlock()
|
|
|
|
|
|
|
|
|
|
if m.UpdateStatsCalled != nil {
|
|
|
|
|
m.UpdateStatsCalled <- struct{}{}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
s, ok := m.stats[id]
|
|
|
|
|
if !ok {
|
|
|
|
|
return errors.New("update failed, id not found")
|
|
|
|
|
}
|
|
|
|
|
s.Status = status
|
2022-12-16 18:00:42 +00:00
|
|
|
s.UpdatedAt = time.Now().Truncate(1 * time.Second)
|
2022-11-16 21:14:38 +00:00
|
|
|
m.stats[id] = s
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
Trigger vuln processing when it runs on a separate server (#39612)
<!-- Add the related story/sub-task/bug number, like Resolves #123, or
remove if NA -->
**Related issue:** Resolves #35239
Docs PR: #39770
## Remote trigger approach
When FLEET_VULNERABILITIES_DISABLE_SCHEDULE=true, the main Fleet server
registers a RemoteTriggerSchedule instead of the real vulnerability
schedule. When a user runs fleetctl trigger --name=vulnerabilities:
1. Main server: RemoteTriggerSchedule.Trigger() inserts a cron_stats
record with status=queued.
2. Worker server: The vulnerability schedule runs with
WithTriggerPollInterval(60s), which starts a poll goroutine that checks
the DB every 60s for queued records.
3. Pickup: When the poll goroutine finds a queued record, it sends the
stats ID on the trigger channel (non-blocking).
4. Execution: The trigger handler acquires the lock, claims the record
via ClaimCronStats (updating status to pending and instance to the
actual worker ID), runs all jobs, and marks it completed.
Key details:
- The trigger channel carries an int: 0 for in-process triggers, >0 for
DB-polled stats IDs. This lets runWithStats reuse the existing record
instead of inserting a new one.
- Both Schedule.Trigger() and RemoteTriggerSchedule.Trigger() treat
pending and queued as conflicts to prevent duplicate runs.
- Queued records expire after 2 hours via CleanupCronStats, same as
pending records.
- The poll goroutine only signals; it doesn't modify DB state. The
handler claims when ready.
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
* **New Features**
* Added support for remote trigger execution in vulnerability scheduling
workflows.
* Implemented periodic polling mechanism to detect and process
externally triggered vulnerability scans.
* **Bug Fixes**
* Enhanced trigger status tracking to properly handle queued scan jobs.
* **Improvements**
* Strengthened scheduling system with improved timeout and cancellation
management capabilities.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-02-17 15:18:03 +00:00
|
|
|
func (m *MockStatsStore) ClaimCronStats(ctx context.Context, id int, instance string, status fleet.CronStatsStatus) error {
|
|
|
|
|
m.Lock()
|
|
|
|
|
defer m.Unlock()
|
|
|
|
|
|
|
|
|
|
s, ok := m.stats[id]
|
|
|
|
|
if !ok {
|
|
|
|
|
return errors.New("claim failed, id not found")
|
|
|
|
|
}
|
|
|
|
|
s.Status = status
|
|
|
|
|
s.Instance = instance
|
|
|
|
|
s.UpdatedAt = time.Now().Truncate(1 * time.Second)
|
|
|
|
|
m.stats[id] = s
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-16 21:14:38 +00:00
|
|
|
func (m *MockStatsStore) AddChannels(t *testing.T, chanNames ...string) error {
|
|
|
|
|
m.Lock()
|
|
|
|
|
defer m.Unlock()
|
|
|
|
|
for _, n := range chanNames {
|
|
|
|
|
switch n {
|
|
|
|
|
case "GetStatsCalled":
|
|
|
|
|
m.GetStatsCalled = make(chan struct{})
|
|
|
|
|
case "InsertStatsCalled":
|
|
|
|
|
m.InsertStatsCalled = make(chan struct{})
|
|
|
|
|
case "UpdateStatsCalled":
|
|
|
|
|
m.UpdateStatsCalled = make(chan struct{})
|
|
|
|
|
default:
|
|
|
|
|
t.Errorf("unrecognized channel name")
|
|
|
|
|
t.FailNow()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func SetUpMockStatsStore(name string, initialStats ...fleet.CronStats) *MockStatsStore {
|
|
|
|
|
stats := make(map[int]fleet.CronStats)
|
|
|
|
|
for _, s := range initialStats {
|
|
|
|
|
stats[s.ID] = s
|
|
|
|
|
}
|
|
|
|
|
store := MockStatsStore{stats: stats}
|
|
|
|
|
|
|
|
|
|
return &store
|
|
|
|
|
}
|