mirror of
https://github.com/fleetdm/fleet
synced 2026-05-22 16:39:01 +00:00
for #19930 # Checklist for submitter - [X] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. - [X] Input data is properly validated, `SELECT *` is avoided, SQL injection is prevented (using placeholders for values in statements) - [X] Added/updated tests - [X] If database migrations are included, checked table schema to confirm autoupdate - [X] Manual QA for all new/changed functionality # Details This PR adds a new feature to the existing monitoring add-on. The add-on will now send an SNS alert whenever a scheduled job like "vulnerabilities" or "apple_mdm_apns_pusher" exits early due to errors. The alert contains the job type and the set of errors (there can be multiple, since jobs can have multiple sub-jobs). By default the SNS topic for this new alert is the same as the one for the existing cron system alerts, but it can be configured to use a separate topic (e.g. dogfood instance will post to a separate slack channel). The actual changes are: **On the server side:** - Add errors field to cron_stats table (json DEFAULT NULL) - Added errors var to `Schedule` struct to collect errors from jobs - In `RunAllJobs`, collect err from job into new errors var - Update `Schedule.updateStats` and `CronStats.UpdateCronStats` to accept errors argument - If provided, update errors field of cron_stats table **On the monitor side:** - Add new SQL query to look for all completed schedules since last run with non-null errors - Send SNS with job ID, name, errors # Testing New automated testing was added for the functional code that gathers and stores errors from cron runs in the database. To test the actual Lambda, I added a row in my `cron_stats` table with errors, then compiled and ran the Lambda executable locally, pointing it to my local mysql and localstack instances: ``` 2024/12/03 14:43:54 main.go:258: Lambda execution environment not found. Falling back to local execution. 
2024/12/03 14:43:54 main.go:133: Connected to database! 2024/12/03 14:43:54 main.go:161: Row vulnerabilities last updated at 2024-11-27 03:30:03 +0000 UTC 2024/12/03 14:43:54 main.go:163: *** 1h hasn't updated in more than vulnerabilities, alerting! (status completed) 2024/12/03 14:43:54 main.go:70: Sending SNS Message 2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev Message: Fleet cron 'vulnerabilities' hasn't updated in more than 1h. Last status was 'completed' at 2024-11-27 03:30:03 +0000 UTC.' to 'arn:aws:sns:us-east-1:000000000000:topic1' 2024/12/03 14:43:54 main.go:82: { MessageId: "260864ff-4cc9-4951-acea-cef883b2de5f" } 2024/12/03 14:43:54 main.go:198: *** mdm_apple_profile_manager job had errors, alerting! (errors {"something": "wrong"}) 2024/12/03 14:43:54 main.go:70: Sending SNS Message 2024/12/03 14:43:54 main.go:74: Sending 'Environment: dev Message: Fleet cron 'mdm_apple_profile_manager' (last updated 2024-12-03 20:34:14 +0000 UTC) raised errors during its run: {"something": "wrong"}.' to 'arn:aws:sns:us-east-1:000000000000:topic1' 2024/12/03 14:43:54 main.go:82: { MessageId: "5cd085ef-89f6-42c1-8470-d80a22b295f8"
115 lines
3.7 KiB
Go
115 lines
3.7 KiB
Go
package mysql
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"encoding/json"
|
|
|
|
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
|
|
"github.com/fleetdm/fleet/v4/server/fleet"
|
|
"github.com/jmoiron/sqlx"
|
|
)
|
|
|
|
// GetLatestCronStats returns a slice of no more than two cron stats records, where index 0 (if
|
|
// present) is the most recently created scheduled run, and index 1 (if present) represents a
|
|
// triggered run that is currently pending.
|
|
func (ds *Datastore) GetLatestCronStats(ctx context.Context, name string) ([]fleet.CronStats, error) {
|
|
stmt := `
|
|
(
|
|
SELECT
|
|
id, name, instance, stats_type, status, created_at, updated_at
|
|
FROM
|
|
cron_stats
|
|
WHERE
|
|
name = ?
|
|
AND stats_type = 'scheduled'
|
|
AND (status = 'pending' OR status = 'completed')
|
|
ORDER BY
|
|
created_at DESC
|
|
LIMIT 1)
|
|
UNION
|
|
(
|
|
SELECT
|
|
id, name, instance, stats_type, status, created_at, updated_at
|
|
FROM
|
|
cron_stats
|
|
WHERE
|
|
name = ?
|
|
AND stats_type = 'triggered'
|
|
AND (status = 'pending' OR status = 'completed')
|
|
ORDER BY
|
|
created_at DESC
|
|
LIMIT 1)`
|
|
|
|
var res []fleet.CronStats
|
|
err := sqlx.SelectContext(ctx, ds.reader(ctx), &res, stmt, name, name)
|
|
if err != nil {
|
|
return []fleet.CronStats{}, ctxerr.Wrap(ctx, err, "select cron stats")
|
|
}
|
|
|
|
return res, nil
|
|
}
|
|
|
|
func (ds *Datastore) InsertCronStats(ctx context.Context, statsType fleet.CronStatsType, name string, instance string, status fleet.CronStatsStatus) (int, error) {
|
|
stmt := `INSERT INTO cron_stats (stats_type, name, instance, status) VALUES (?, ?, ?, ?)`
|
|
|
|
res, err := ds.writer(ctx).ExecContext(ctx, stmt, statsType, name, instance, status)
|
|
if err != nil {
|
|
return 0, ctxerr.Wrap(ctx, err, "insert cron stats")
|
|
}
|
|
id, err := res.LastInsertId()
|
|
if err != nil {
|
|
return 0, ctxerr.Wrap(ctx, err, "insert cron stats last insert id")
|
|
}
|
|
|
|
return int(id), nil
|
|
}
|
|
|
|
func (ds *Datastore) UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus, cronErrors *fleet.CronScheduleErrors) error {
|
|
stmt := `UPDATE cron_stats SET status = ?, errors = ? WHERE id = ?`
|
|
|
|
errorsJSON := sql.NullString{}
|
|
if len(*cronErrors) > 0 {
|
|
b, err := json.Marshal(cronErrors)
|
|
if err == nil {
|
|
errorsJSON.String = string(b)
|
|
errorsJSON.Valid = true
|
|
}
|
|
}
|
|
|
|
if _, err := ds.writer(ctx).ExecContext(ctx, stmt, status, errorsJSON, id); err != nil {
|
|
return ctxerr.Wrap(ctx, err, "update cron stats")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (ds *Datastore) UpdateAllCronStatsForInstance(ctx context.Context, instance string, fromStatus fleet.CronStatsStatus, toStatus fleet.CronStatsStatus) error {
|
|
stmt := `UPDATE cron_stats SET status = ? WHERE instance = ? AND status = ?`
|
|
|
|
if _, err := ds.writer(ctx).ExecContext(ctx, stmt, toStatus, instance, fromStatus); err != nil {
|
|
return ctxerr.Wrap(ctx, err, "update all cron stats for instance")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (ds *Datastore) CleanupCronStats(ctx context.Context) error {
|
|
return ds.withRetryTxx(ctx, func(tx sqlx.ExtContext) error {
|
|
// Delete cron_stats entries that are older than two days.
|
|
deleteStmt := `DELETE FROM cron_stats WHERE created_at < DATE_SUB(NOW(), INTERVAL 2 DAY)`
|
|
if _, err := tx.ExecContext(ctx, deleteStmt); err != nil {
|
|
return ctxerr.Wrap(ctx, err, "deleting old cron stats")
|
|
}
|
|
// Delete cron_stats entries that have been in pending state for more than two hours.
|
|
//
|
|
// NOTE(lucas): We don't know of any job that is taking longer than two hours. This value might need changing
|
|
// if that is not true anymore in the future.
|
|
updateStmt := `UPDATE cron_stats SET status = ? WHERE created_at < DATE_SUB(NOW(), INTERVAL 2 HOUR) AND status = ?`
|
|
if _, err := tx.ExecContext(ctx, updateStmt, fleet.CronStatsStatusExpired, fleet.CronStatsStatusPending); err != nil {
|
|
return ctxerr.Wrap(ctx, err, "updating expired cron stats")
|
|
}
|
|
|
|
return nil
|
|
})
|
|
}
|