mirror of
https://github.com/fleetdm/fleet
synced 2026-05-21 07:58:31 +00:00
for #19930 # Checklist for submitter - [X] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. - [X] Input data is properly validated, `SELECT *` is avoided, SQL injection is prevented (using placeholders for values in statements) - [X] Added/updated tests - [X] If database migrations are included, checked table schema to confirm autoupdate - [X] Manual QA for all new/changed functionality # Details This PR adds a new feature to the existing monitoring add-on. The add-on will now send an SNS alert whenever a scheduled job like "vulnerabilities" or "apple_mdm_apns_pusher" exits early due to errors. The alert contains the job type and the set of errors (there can be multiple, since jobs can have multiple sub-jobs). By default the SNS topic for this new alert is the same as the one for the existing cron system alerts, but it can be configured to use a separate topic (e.g. the dogfood instance will post to a separate Slack channel). The actual changes are: **On the server side:** - Add errors field to cron_stats table (json DEFAULT NULL) - Add errors var to `Schedule` struct to collect errors from jobs - In `RunAllJobs`, collect err from job into new errors var - Update `Schedule.updateStats` and `CronStats.UpdateCronStats` to accept errors argument - If provided, update errors field of cron_stats table **On the monitor side:** - Add new SQL query to look for all completed schedules since last run with non-null errors - Send SNS with job ID, name, errors # Testing New automated testing was added for the functional code that gathers and stores errors from cron runs in the database. To test the actual Lambda, I added a row in my `cron_stats` table with errors, then compiled and ran the Lambda executable locally, pointing it to my local MySQL and LocalStack instances: ``` 2024/12/03 14:43:54 main.go:258: Lambda execution environment not found. Falling back to local execution. 
2024/12/03 14:43:54 main.go:133: Connected to database! 2024/12/03 14:43:54 main.go:161: Row vulnerabilities last updated at 2024-11-27 03:30:03 +0000 UTC 2024/12/03 14:43:54 main.go:163: *** 1h hasn't updated in more than vulnerabilities, alerting! (status completed) 2024/12/03 14:43:54 main.go:70: Sending SNS Message 2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev Message: Fleet cron 'vulnerabilities' hasn't updated in more than 1h. Last status was 'completed' at 2024-11-27 03:30:03 +0000 UTC.' to 'arn:aws:sns:us-east-1:000000000000:topic1' 2024/12/03 14:43:54 main.go:82: { MessageId: "260864ff-4cc9-4951-acea-cef883b2de5f" } 2024/12/03 14:43:54 main.go:198: *** mdm_apple_profile_manager job had errors, alerting! (errors {"something": "wrong"}) 2024/12/03 14:43:54 main.go:70: Sending SNS Message 2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev Message: Fleet cron 'mdm_apple_profile_manager' (last updated 2024-12-03 20:34:14 +0000 UTC) raised errors during its run: {"something": "wrong"}.' to 'arn:aws:sns:us-east-1:000000000000:topic1' 2024/12/03 14:43:54 main.go:82: { MessageId: "5cd085ef-89f6-42c1-8470-d80a22b295f8"
311 lines
11 KiB
Go
311 lines
11 KiB
Go
package mysql
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"errors"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/fleetdm/fleet/v4/server/fleet"
|
|
"github.com/jmoiron/sqlx"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
type testCronStats struct {
|
|
fleet.CronStats
|
|
// Errors is a JSON string containing any errors encountered during the run.
|
|
Errors sql.NullString `db:"errors"`
|
|
}
|
|
|
|
func TestInsertUpdateCronStats(t *testing.T) {
|
|
const (
|
|
scheduleName = "test_sched"
|
|
instanceID = "test_instance"
|
|
)
|
|
ctx := context.Background()
|
|
ds := CreateMySQLDS(t)
|
|
|
|
id, err := ds.InsertCronStats(ctx, fleet.CronStatsTypeScheduled, scheduleName, instanceID, fleet.CronStatsStatusPending)
|
|
require.NoError(t, err)
|
|
|
|
res, err := ds.GetLatestCronStats(ctx, scheduleName)
|
|
require.NoError(t, err)
|
|
require.Len(t, res, 1)
|
|
require.Equal(t, id, res[0].ID)
|
|
require.Equal(t, fleet.CronStatsTypeScheduled, res[0].StatsType)
|
|
require.Equal(t, fleet.CronStatsStatusPending, res[0].Status)
|
|
|
|
err = ds.UpdateCronStats(ctx, id, fleet.CronStatsStatusCompleted, &fleet.CronScheduleErrors{
|
|
"some_job": errors.New("some error"),
|
|
"some_other_job": errors.New("some other error"),
|
|
})
|
|
require.NoError(t, err)
|
|
|
|
res, err = ds.GetLatestCronStats(ctx, scheduleName)
|
|
require.NoError(t, err)
|
|
require.Len(t, res, 1)
|
|
require.Equal(t, id, res[0].ID)
|
|
require.Equal(t, fleet.CronStatsTypeScheduled, res[0].StatsType)
|
|
require.Equal(t, fleet.CronStatsStatusCompleted, res[0].Status)
|
|
|
|
var stats []testCronStats
|
|
err = sqlx.SelectContext(ctx, ds.reader(ctx), &stats, `SELECT * FROM cron_stats ORDER BY id`)
|
|
require.NoError(t, err)
|
|
// Make sure we got valid JSON back.
|
|
var actualMap map[string]string
|
|
err = json.Unmarshal([]byte(stats[0].Errors.String), &actualMap)
|
|
require.NoError(t, err)
|
|
|
|
// Compare the error JSON with the expected object.
|
|
expectedJSON := `{"some_job": "some error", "some_other_job": "some other error"}`
|
|
var expectedMap map[string]string
|
|
err = json.Unmarshal([]byte(expectedJSON), &expectedMap)
|
|
require.NoError(t, err)
|
|
require.Equal(t, actualMap, expectedMap)
|
|
}
|
|
|
|
func TestGetLatestCronStats(t *testing.T) {
|
|
const (
|
|
scheduleName = "test_sched"
|
|
instanceID = "test_instance"
|
|
)
|
|
ctx := context.Background()
|
|
ds := CreateMySQLDS(t)
|
|
|
|
insertTestCS := func(name string, statsType fleet.CronStatsType, status fleet.CronStatsStatus, createdAt time.Time) {
|
|
stmt := `INSERT INTO cron_stats (stats_type, name, instance, status, created_at) VALUES (?, ?, ?, ?, ?)`
|
|
_, err := ds.writer(ctx).ExecContext(ctx, stmt, statsType, name, instanceID, status, createdAt)
|
|
require.NoError(t, err)
|
|
}
|
|
|
|
then := time.Now().UTC().Truncate(time.Second).Add(-24 * time.Hour)
|
|
|
|
// insert two "scheduled" stats
|
|
insertTestCS(scheduleName, fleet.CronStatsTypeScheduled, fleet.CronStatsStatusPending, then.Add(2*time.Minute))
|
|
insertTestCS(scheduleName, fleet.CronStatsTypeScheduled, fleet.CronStatsStatusCompleted, then.Add(1*time.Minute))
|
|
|
|
// most recent record is returned for "scheduled" stats type
|
|
res, err := ds.GetLatestCronStats(ctx, scheduleName)
|
|
require.NoError(t, err)
|
|
require.Len(t, res, 1)
|
|
require.Equal(t, fleet.CronStatsTypeScheduled, res[0].StatsType)
|
|
require.Equal(t, fleet.CronStatsStatusPending, res[0].Status)
|
|
require.Equal(t, then.Add(2*time.Minute), res[0].CreatedAt)
|
|
|
|
// insert two "triggered" stats
|
|
insertTestCS(scheduleName, fleet.CronStatsTypeTriggered, fleet.CronStatsStatusCompleted, then.Add(2*time.Hour))
|
|
insertTestCS(scheduleName, fleet.CronStatsTypeTriggered, fleet.CronStatsStatusCompleted, then.Add(1*time.Hour))
|
|
|
|
// most recent record is returned for both "scheduled" stats type and "triggered" stats type
|
|
res, err = ds.GetLatestCronStats(ctx, scheduleName)
|
|
require.NoError(t, err)
|
|
require.Len(t, res, 2)
|
|
require.Equal(t, fleet.CronStatsTypeScheduled, res[0].StatsType)
|
|
require.Equal(t, fleet.CronStatsStatusPending, res[0].Status)
|
|
require.Equal(t, then.Add(2*time.Minute), res[0].CreatedAt)
|
|
require.Equal(t, fleet.CronStatsTypeTriggered, res[1].StatsType)
|
|
require.Equal(t, fleet.CronStatsStatusCompleted, res[1].Status)
|
|
require.Equal(t, then.Add(2*time.Hour), res[1].CreatedAt)
|
|
|
|
// insert some other stats that shouldn't be returned
|
|
insertTestCS(scheduleName, fleet.CronStatsTypeScheduled, fleet.CronStatsStatusExpired, then.Add(3*time.Hour)) // expired status shouldn't be returned
|
|
insertTestCS(scheduleName, fleet.CronStatsTypeTriggered, fleet.CronStatsStatusExpired, then.Add(3*time.Hour)) // expired status shouldn't be returned
|
|
insertTestCS(scheduleName, fleet.CronStatsTypeScheduled, fleet.CronStatsStatusCanceled, then.Add(4*time.Hour)) // canceled status shouldn't be returned
|
|
insertTestCS(scheduleName, fleet.CronStatsTypeTriggered, fleet.CronStatsStatusCanceled, then.Add(4*time.Hour)) // canceled status shouldn't be returned
|
|
insertTestCS("schedule_1337", fleet.CronStatsTypeTriggered, fleet.CronStatsStatusPending, then.Add(5*time.Hour)) // different name shouldn't be returned
|
|
|
|
// most recent record is returned for both "scheduled" stats type and "triggered" stats type
|
|
res, err = ds.GetLatestCronStats(ctx, scheduleName)
|
|
require.NoError(t, err)
|
|
require.Len(t, res, 2)
|
|
require.Equal(t, fleet.CronStatsTypeScheduled, res[0].StatsType)
|
|
require.Equal(t, fleet.CronStatsStatusPending, res[0].Status)
|
|
require.Equal(t, then.Add(2*time.Minute), res[0].CreatedAt)
|
|
require.Equal(t, fleet.CronStatsTypeTriggered, res[1].StatsType)
|
|
require.Equal(t, fleet.CronStatsStatusCompleted, res[1].Status)
|
|
require.Equal(t, then.Add(2*time.Hour), res[1].CreatedAt)
|
|
}
|
|
|
|
func TestCleanupCronStats(t *testing.T) {
|
|
ctx := context.Background()
|
|
ds := CreateMySQLDS(t)
|
|
now := time.Now().UTC().Truncate(time.Second)
|
|
twoDaysAgo := now.Add(-2 * 24 * time.Hour)
|
|
name := "test_sched"
|
|
instance := "test_instance"
|
|
|
|
cases := []struct {
|
|
createdAt time.Time
|
|
status fleet.CronStatsStatus
|
|
shouldCleanupMaxPending bool
|
|
shouldCleanupMaxAge bool
|
|
}{
|
|
{
|
|
createdAt: now,
|
|
status: fleet.CronStatsStatusCompleted,
|
|
shouldCleanupMaxPending: false,
|
|
shouldCleanupMaxAge: false,
|
|
},
|
|
{
|
|
createdAt: now,
|
|
status: fleet.CronStatsStatusPending,
|
|
shouldCleanupMaxPending: false,
|
|
shouldCleanupMaxAge: false,
|
|
},
|
|
{
|
|
createdAt: now.Add(-1 * time.Hour),
|
|
status: fleet.CronStatsStatusPending,
|
|
shouldCleanupMaxPending: false,
|
|
shouldCleanupMaxAge: false,
|
|
},
|
|
{
|
|
createdAt: now.Add(-2 * time.Hour),
|
|
status: fleet.CronStatsStatusExpired,
|
|
shouldCleanupMaxPending: false,
|
|
shouldCleanupMaxAge: false,
|
|
},
|
|
{
|
|
createdAt: now.Add(-3 * time.Hour),
|
|
status: fleet.CronStatsStatusPending,
|
|
shouldCleanupMaxPending: true,
|
|
shouldCleanupMaxAge: false,
|
|
},
|
|
{
|
|
createdAt: now.Add(-3 * time.Hour),
|
|
status: fleet.CronStatsStatusCompleted,
|
|
shouldCleanupMaxPending: false,
|
|
shouldCleanupMaxAge: false,
|
|
},
|
|
{
|
|
createdAt: twoDaysAgo.Add(1 * time.Hour),
|
|
status: fleet.CronStatsStatusCompleted,
|
|
shouldCleanupMaxPending: false,
|
|
shouldCleanupMaxAge: false,
|
|
},
|
|
{
|
|
createdAt: twoDaysAgo.Add(-1 * time.Hour),
|
|
status: fleet.CronStatsStatusCompleted,
|
|
shouldCleanupMaxPending: false,
|
|
shouldCleanupMaxAge: true,
|
|
},
|
|
}
|
|
|
|
for _, c := range cases {
|
|
stmt := `INSERT INTO cron_stats (stats_type, name, instance, status, created_at) VALUES (?, ?, ?, ?, ?)`
|
|
_, err := ds.writer(ctx).ExecContext(ctx, stmt, fleet.CronStatsTypeScheduled, name, instance, c.status, c.createdAt)
|
|
require.NoError(t, err)
|
|
}
|
|
|
|
var stats []testCronStats
|
|
err := sqlx.SelectContext(ctx, ds.reader(ctx), &stats, `SELECT * FROM cron_stats ORDER BY id`)
|
|
require.NoError(t, err)
|
|
require.Len(t, stats, len(cases))
|
|
for i, s := range stats {
|
|
require.Equal(t, cases[i].createdAt, s.CreatedAt)
|
|
require.Equal(t, cases[i].status, s.Status)
|
|
}
|
|
|
|
err = ds.CleanupCronStats(ctx)
|
|
require.NoError(t, err)
|
|
|
|
stats = []testCronStats{}
|
|
err = sqlx.SelectContext(ctx, ds.reader(ctx), &stats, `SELECT * FROM cron_stats ORDER BY id`)
|
|
require.NoError(t, err)
|
|
require.Len(t, stats, len(cases)-1) // case[7] was deleted because it exceeded max age
|
|
for i, c := range cases {
|
|
if i >= len(stats) {
|
|
require.True(t, c.shouldCleanupMaxAge)
|
|
break
|
|
}
|
|
if c.shouldCleanupMaxPending {
|
|
require.Equal(t, fleet.CronStatsStatusExpired, stats[i].Status)
|
|
} else {
|
|
require.Equal(t, c.status, stats[i].Status)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestUpdateAllCronStatsForInstance(t *testing.T) {
|
|
ctx := context.Background()
|
|
ds := CreateMySQLDS(t)
|
|
|
|
cases := []struct {
|
|
instance string
|
|
schedName string
|
|
status fleet.CronStatsStatus
|
|
shouldUpdate bool
|
|
}{
|
|
{
|
|
instance: "inst1",
|
|
schedName: "sched1",
|
|
status: fleet.CronStatsStatusCompleted,
|
|
shouldUpdate: false,
|
|
},
|
|
{
|
|
instance: "inst1",
|
|
schedName: "sched1",
|
|
status: fleet.CronStatsStatusPending,
|
|
shouldUpdate: true,
|
|
},
|
|
{
|
|
instance: "inst1",
|
|
schedName: "sched2",
|
|
status: fleet.CronStatsStatusExpired,
|
|
shouldUpdate: false,
|
|
},
|
|
{
|
|
instance: "inst1",
|
|
schedName: "sched2",
|
|
status: fleet.CronStatsStatusPending,
|
|
shouldUpdate: true,
|
|
},
|
|
{
|
|
instance: "inst2",
|
|
schedName: "sched1",
|
|
status: fleet.CronStatsStatusPending,
|
|
shouldUpdate: false,
|
|
},
|
|
{
|
|
instance: "inst2",
|
|
schedName: "sched2",
|
|
status: fleet.CronStatsStatusPending,
|
|
shouldUpdate: false,
|
|
},
|
|
}
|
|
|
|
for _, c := range cases {
|
|
stmt := `INSERT INTO cron_stats (stats_type, name, instance, status) VALUES (?, ?, ?, ?)`
|
|
_, err := ds.writer(ctx).ExecContext(ctx, stmt, fleet.CronStatsTypeScheduled, c.schedName, c.instance, c.status)
|
|
require.NoError(t, err)
|
|
}
|
|
|
|
var stats []testCronStats
|
|
err := sqlx.SelectContext(ctx, ds.reader(ctx), &stats, `SELECT * FROM cron_stats ORDER BY id`)
|
|
require.NoError(t, err)
|
|
require.Len(t, stats, len(cases))
|
|
for i, s := range stats {
|
|
require.Equal(t, cases[i].schedName, s.Name)
|
|
require.Equal(t, cases[i].instance, s.Instance)
|
|
require.Equal(t, cases[i].status, s.Status)
|
|
}
|
|
|
|
err = ds.UpdateAllCronStatsForInstance(ctx, "inst1", fleet.CronStatsStatusPending, fleet.CronStatsStatusCanceled)
|
|
require.NoError(t, err)
|
|
|
|
stats = []testCronStats{}
|
|
err = sqlx.SelectContext(ctx, ds.reader(ctx), &stats, `SELECT * FROM cron_stats ORDER BY id`)
|
|
require.NoError(t, err)
|
|
require.Len(t, stats, len(cases))
|
|
for i, c := range cases {
|
|
s := stats[i]
|
|
require.Equal(t, c.instance, s.Instance)
|
|
require.Equal(t, c.schedName, s.Name)
|
|
if c.shouldUpdate {
|
|
require.Equal(t, fleet.CronStatsStatusCanceled, s.Status)
|
|
} else {
|
|
require.Equal(t, c.status, s.Status)
|
|
}
|
|
}
|
|
}
|