2022-11-16 21:14:38 +00:00
|
|
|
package schedule
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"errors"
|
|
|
|
|
"sync"
|
|
|
|
|
"testing"
|
|
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
"github.com/fleetdm/fleet/v4/server/fleet"
|
|
|
|
|
)
|
|
|
|
|
|
2022-11-28 19:28:06 +00:00
|
|
|
// NopLocker is a lock implementation that does no locking at all: acquiring
// always succeeds and releasing is a no-op. It carries no state.
type NopLocker struct{}

// Lock ignores every argument and always reports the lock as acquired with a
// nil error.
func (NopLocker) Lock(_ context.Context, _, _ string, _ time.Duration) (bool, error) {
	return true, nil
}

// Unlock ignores every argument and always returns nil.
func (NopLocker) Unlock(_ context.Context, _, _ string) error {
	return nil
}
|
|
|
|
|
|
|
|
|
|
// NopStatsStore is a stateless cron stats store: its methods record nothing
// and return empty or zero results with a nil error.
type NopStatsStore struct{}
|
|
|
|
|
|
2022-12-16 18:00:42 +00:00
|
|
|
func (NopStatsStore) GetLatestCronStats(ctx context.Context, name string) ([]fleet.CronStats, error) {
|
|
|
|
|
return []fleet.CronStats{}, nil
|
2022-11-28 19:28:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (NopStatsStore) InsertCronStats(ctx context.Context, statsType fleet.CronStatsType, name string, instance string, status fleet.CronStatsStatus) (int, error) {
|
|
|
|
|
return 0, nil
|
|
|
|
|
}
|
|
|
|
|
|
Monitor and alert on errors in cron jobs (#24347)
for #19930
# Checklist for submitter
- [X] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
- [X] Input data is properly validated, `SELECT *` is avoided, SQL
injection is prevented (using placeholders for values in statements)
- [X] Added/updated tests
- [X] If database migrations are included, checked table schema to
confirm autoupdate
- [X] Manual QA for all new/changed functionality
# Details
This PR adds a new feature to the existing monitoring add-on. The add-on
will now send an SNS alert whenever a scheduled job like
"vulnerabilities" or "apple_mdm_apns_pusher" exits early due to errors.
The alert contains the job type and the set of errors (there can be
multiple, since jobs can have multiple sub-jobs). By default the SNS
topic for this new alert is the same as the one for the existing cron
system alerts, but it can be configured to use a separate topic (e.g.
dogfood instance will post to a separate slack channel).
The actual changes are:
**On the server side:**
- Add errors field to cron_stats table (json DEFAULT NULL)
- Added errors var to `Schedule` struct to collect errors from jobs
- In `RunAllJobs`, collect err from job into new errors var
- Update `Schedule.updateStats` and `CronStats.UpdateCronStats` to accept
errors argument
- If provided, update errors field of cron_stats table
**On the monitor side:**
- Add new SQL query to look for all completed schedules since last run
with non-null errors
- send SNS with job ID, name, errors
# Testing
New automated testing was added for the functional code that gathers and
stores errors from cron runs in the database. To test the actual Lambda,
I added a row in my `cron_stats` table with errors, then compiled and
ran the Lambda executable locally, pointing it to my local mysql and
localstack instances:
```
2024/12/03 14:43:54 main.go:258: Lambda execution environment not found. Falling back to local execution.
2024/12/03 14:43:54 main.go:133: Connected to database!
2024/12/03 14:43:54 main.go:161: Row vulnerabilities last updated at 2024-11-27 03:30:03 +0000 UTC
2024/12/03 14:43:54 main.go:163: *** 1h hasn't updated in more than vulnerabilities, alerting! (status completed)
2024/12/03 14:43:54 main.go:70: Sending SNS Message
2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev
Message: Fleet cron 'vulnerabilities' hasn't updated in more than 1h. Last status was 'completed' at 2024-11-27 03:30:03 +0000 UTC.' to 'arn:aws:sns:us-east-1:000000000000:topic1'
2024/12/03 14:43:54 main.go:82: {
MessageId: "260864ff-4cc9-4951-acea-cef883b2de5f"
}
2024/12/03 14:43:54 main.go:198: *** mdm_apple_profile_manager job had errors, alerting! (errors {"something": "wrong"})
2024/12/03 14:43:54 main.go:70: Sending SNS Message
2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev
Message: Fleet cron 'mdm_apple_profile_manager' (last updated 2024-12-03 20:34:14 +0000 UTC) raised errors during its run:
{"something": "wrong"}.' to 'arn:aws:sns:us-east-1:000000000000:topic1'
2024/12/03 14:43:54 main.go:82: {
MessageId: "5cd085ef-89f6-42c1-8470-d80a22b295f8"
2024-12-19 21:55:29 +00:00
|
|
|
func (NopStatsStore) UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus, cronErrors *fleet.CronScheduleErrors) error {
|
2022-11-28 19:28:06 +00:00
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-16 21:14:38 +00:00
|
|
|
func SetupMockLocker(name string, owner string, expiresAt time.Time) *MockLock {
|
|
|
|
|
return &MockLock{name: name, owner: owner, expiresAt: expiresAt}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// MockLock is an in-memory lock for a single named resource, intended for
// tests. It tracks the current owner and expiry, counts successful Lock and
// Unlock calls, and can optionally signal each of them on a channel.
type MockLock struct {
	// mu is held by every method for the duration of the call.
	mu sync.Mutex

	// name is the only lock name this mock accepts; owner and expiresAt
	// describe who currently holds it and until when.
	name, owner string
	expiresAt   time.Time

	// Locked, when non-nil, receives one value per successful Lock call.
	// LockCount is the number of successful Lock calls.
	Locked    chan struct{}
	LockCount int

	// Unlocked, when non-nil, receives one value per successful Unlock call.
	// UnlockCount is the number of successful Unlock calls.
	Unlocked    chan struct{}
	UnlockCount int
}

// Lock tries to take the lock for owner for the given expiration.
//
// It returns an error if name is not the name this mock was created with.
// Otherwise the lock is granted when owner already holds it or the previous
// hold has expired: the owner and expiry are updated, LockCount is
// incremented, a value is sent on Locked (if set) and (true, nil) is
// returned. If another owner still holds an unexpired lock it returns
// (false, nil).
//
// The send on Locked happens while the internal mutex is held and the channel
// created by AddChannels is unbuffered, so the call blocks until a receiver
// takes the value.
func (ml *MockLock) Lock(ctx context.Context, name string, owner string, expiration time.Duration) (bool, error) {
	ml.mu.Lock()
	defer ml.mu.Unlock()

	if ml.name != name {
		return false, errors.New("name doesn't match")
	}

	now := time.Now()
	heldByOther := ml.owner != owner
	if heldByOther && !now.After(ml.expiresAt) {
		// Somebody else owns the lock and it has not expired yet.
		return false, nil
	}

	ml.owner = owner
	ml.expiresAt = now.Add(expiration)
	ml.LockCount++
	if ch := ml.Locked; ch != nil {
		ch <- struct{}{}
	}
	return true, nil
}

// Unlock releases the lock held by owner.
//
// It returns an error if name or owner does not match the mock's current
// name or owner. On success it increments UnlockCount, sends a value on
// Unlocked (if set) and then moves the expiry to the current time so that a
// later Lock call by any owner can succeed.
//
// As with Lock, the channel send happens while the internal mutex is held.
func (ml *MockLock) Unlock(ctx context.Context, name string, owner string) error {
	ml.mu.Lock()
	defer ml.mu.Unlock()

	switch {
	case ml.name != name:
		return errors.New("name doesn't match")
	case ml.owner != owner:
		return errors.New("owner doesn't match")
	}

	ml.UnlockCount++
	if ch := ml.Unlocked; ch != nil {
		ch <- struct{}{}
	}
	ml.expiresAt = time.Now()
	return nil
}

// GetLockCount returns the number of successful Lock calls, read under the
// mutex.
func (ml *MockLock) GetLockCount() int {
	ml.mu.Lock()
	count := ml.LockCount
	ml.mu.Unlock()
	return count
}

// GetExpiration returns the current expiry of the lock, read under the mutex.
func (ml *MockLock) GetExpiration() time.Time {
	ml.mu.Lock()
	expiry := ml.expiresAt
	ml.mu.Unlock()
	return expiry
}

// AddChannels creates a fresh unbuffered channel for each requested name:
// "locked" sets Locked and "unlocked" sets Unlocked. Any other name fails the
// test t immediately. The returned error is always nil.
func (ml *MockLock) AddChannels(t *testing.T, chanNames ...string) error {
	ml.mu.Lock()
	defer ml.mu.Unlock()

	for _, chanName := range chanNames {
		switch chanName {
		case "locked":
			ml.Locked = make(chan struct{})
		case "unlocked":
			ml.Unlocked = make(chan struct{})
		default:
			// Logs the message and stops the test, same as Errorf + FailNow.
			t.Fatalf("unrecognized channel name")
		}
	}
	return nil
}
|
|
|
|
|
|
|
|
|
|
type MockStatsStore struct {
|
|
|
|
|
sync.Mutex
|
|
|
|
|
stats map[int]fleet.CronStats
|
|
|
|
|
|
|
|
|
|
GetStatsCalled chan struct{}
|
|
|
|
|
InsertStatsCalled chan struct{}
|
|
|
|
|
UpdateStatsCalled chan struct{}
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-16 18:00:42 +00:00
|
|
|
func (m *MockStatsStore) GetLatestCronStats(ctx context.Context, name string) ([]fleet.CronStats, error) {
|
2022-11-16 21:14:38 +00:00
|
|
|
m.Lock()
|
|
|
|
|
defer m.Unlock()
|
|
|
|
|
|
|
|
|
|
if m.GetStatsCalled != nil {
|
|
|
|
|
m.GetStatsCalled <- struct{}{}
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-16 18:00:42 +00:00
|
|
|
latest := make(map[fleet.CronStatsType]fleet.CronStats)
|
|
|
|
|
for _, s := range m.stats {
|
|
|
|
|
if s.Name != name {
|
2022-11-16 21:14:38 +00:00
|
|
|
continue
|
2022-12-16 18:00:42 +00:00
|
|
|
}
|
|
|
|
|
curr := latest[s.StatsType]
|
|
|
|
|
if s.CreatedAt.Before(curr.CreatedAt) {
|
2022-11-16 21:14:38 +00:00
|
|
|
continue
|
|
|
|
|
}
|
2022-12-16 18:00:42 +00:00
|
|
|
latest[s.StatsType] = s
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
res := []fleet.CronStats{}
|
|
|
|
|
if s, ok := latest[fleet.CronStatsTypeScheduled]; ok {
|
|
|
|
|
res = append(res, s)
|
|
|
|
|
}
|
|
|
|
|
if s, ok := latest[fleet.CronStatsTypeTriggered]; ok {
|
|
|
|
|
res = append(res, s)
|
2022-11-16 21:14:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return res, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (m *MockStatsStore) InsertCronStats(ctx context.Context, statsType fleet.CronStatsType, name string, instance string, status fleet.CronStatsStatus) (int, error) {
|
|
|
|
|
m.Lock()
|
|
|
|
|
defer m.Unlock()
|
|
|
|
|
|
|
|
|
|
if m.InsertStatsCalled != nil {
|
|
|
|
|
m.InsertStatsCalled <- struct{}{}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
id := len(m.stats) + 1
|
2022-12-16 18:00:42 +00:00
|
|
|
m.stats[id] = fleet.CronStats{ID: id, StatsType: statsType, Name: name, Instance: instance, Status: status, CreatedAt: time.Now().Truncate(1 * time.Second), UpdatedAt: time.Now().Truncate(time.Second)}
|
2022-11-16 21:14:38 +00:00
|
|
|
|
|
|
|
|
return id, nil
|
|
|
|
|
}
|
|
|
|
|
|
Monitor and alert on errors in cron jobs (#24347)
for #19930
# Checklist for submitter
- [X] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
- [X] Input data is properly validated, `SELECT *` is avoided, SQL
injection is prevented (using placeholders for values in statements)
- [X] Added/updated tests
- [X] If database migrations are included, checked table schema to
confirm autoupdate
- [X] Manual QA for all new/changed functionality
# Details
This PR adds a new feature to the existing monitoring add-on. The add-on
will now send an SNS alert whenever a scheduled job like
"vulnerabilities" or "apple_mdm_apns_pusher" exits early due to errors.
The alert contains the job type and the set of errors (there can be
multiple, since jobs can have multiple sub-jobs). By default the SNS
topic for this new alert is the same as the one for the existing cron
system alerts, but it can be configured to use a separate topic (e.g.
dogfood instance will post to a separate slack channel).
The actual changes are:
**On the server side:**
- Add errors field to cron_stats table (json DEFAULT NULL)
- Added errors var to `Schedule` struct to collect errors from jobs
- In `RunAllJobs`, collect err from job into new errors var
- Update `Schedule.updateStats` and `CronStats.UpdateCronStats` to accept
errors argument
- If provided, update errors field of cron_stats table
**On the monitor side:**
- Add new SQL query to look for all completed schedules since last run
with non-null errors
- send SNS with job ID, name, errors
# Testing
New automated testing was added for the functional code that gathers and
stores errors from cron runs in the database. To test the actual Lambda,
I added a row in my `cron_stats` table with errors, then compiled and
ran the Lambda executable locally, pointing it to my local mysql and
localstack instances:
```
2024/12/03 14:43:54 main.go:258: Lambda execution environment not found. Falling back to local execution.
2024/12/03 14:43:54 main.go:133: Connected to database!
2024/12/03 14:43:54 main.go:161: Row vulnerabilities last updated at 2024-11-27 03:30:03 +0000 UTC
2024/12/03 14:43:54 main.go:163: *** 1h hasn't updated in more than vulnerabilities, alerting! (status completed)
2024/12/03 14:43:54 main.go:70: Sending SNS Message
2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev
Message: Fleet cron 'vulnerabilities' hasn't updated in more than 1h. Last status was 'completed' at 2024-11-27 03:30:03 +0000 UTC.' to 'arn:aws:sns:us-east-1:000000000000:topic1'
2024/12/03 14:43:54 main.go:82: {
MessageId: "260864ff-4cc9-4951-acea-cef883b2de5f"
}
2024/12/03 14:43:54 main.go:198: *** mdm_apple_profile_manager job had errors, alerting! (errors {"something": "wrong"})
2024/12/03 14:43:54 main.go:70: Sending SNS Message
2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev
Message: Fleet cron 'mdm_apple_profile_manager' (last updated 2024-12-03 20:34:14 +0000 UTC) raised errors during its run:
{"something": "wrong"}.' to 'arn:aws:sns:us-east-1:000000000000:topic1'
2024/12/03 14:43:54 main.go:82: {
MessageId: "5cd085ef-89f6-42c1-8470-d80a22b295f8"
2024-12-19 21:55:29 +00:00
|
|
|
func (m *MockStatsStore) UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus, cronErrors *fleet.CronScheduleErrors) error {
|
2022-11-16 21:14:38 +00:00
|
|
|
m.Lock()
|
|
|
|
|
defer m.Unlock()
|
|
|
|
|
|
|
|
|
|
if m.UpdateStatsCalled != nil {
|
|
|
|
|
m.UpdateStatsCalled <- struct{}{}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
s, ok := m.stats[id]
|
|
|
|
|
if !ok {
|
|
|
|
|
return errors.New("update failed, id not found")
|
|
|
|
|
}
|
|
|
|
|
s.Status = status
|
2022-12-16 18:00:42 +00:00
|
|
|
s.UpdatedAt = time.Now().Truncate(1 * time.Second)
|
2022-11-16 21:14:38 +00:00
|
|
|
m.stats[id] = s
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (m *MockStatsStore) AddChannels(t *testing.T, chanNames ...string) error {
|
|
|
|
|
m.Lock()
|
|
|
|
|
defer m.Unlock()
|
|
|
|
|
for _, n := range chanNames {
|
|
|
|
|
switch n {
|
|
|
|
|
case "GetStatsCalled":
|
|
|
|
|
m.GetStatsCalled = make(chan struct{})
|
|
|
|
|
case "InsertStatsCalled":
|
|
|
|
|
m.InsertStatsCalled = make(chan struct{})
|
|
|
|
|
case "UpdateStatsCalled":
|
|
|
|
|
m.UpdateStatsCalled = make(chan struct{})
|
|
|
|
|
default:
|
|
|
|
|
t.Errorf("unrecognized channel name")
|
|
|
|
|
t.FailNow()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func SetUpMockStatsStore(name string, initialStats ...fleet.CronStats) *MockStatsStore {
|
|
|
|
|
stats := make(map[int]fleet.CronStats)
|
|
|
|
|
for _, s := range initialStats {
|
|
|
|
|
stats[s.ID] = s
|
|
|
|
|
}
|
|
|
|
|
store := MockStatsStore{stats: stats}
|
|
|
|
|
|
|
|
|
|
return &store
|
|
|
|
|
}
|