fleet/server/datastore/mysql/cron_stats.go
Victor Lyuboslavsky 6fc7132350
Trigger vuln processing when it runs on a separate server (#39612)
<!-- Add the related story/sub-task/bug number, like Resolves #123, or
remove if NA -->
**Related issue:** Resolves #35239

Docs PR: #39770

## Remote trigger approach
When FLEET_VULNERABILITIES_DISABLE_SCHEDULE=true, the main Fleet server
registers a RemoteTriggerSchedule instead of the real vulnerability
schedule. When a user runs fleetctl trigger --name=vulnerabilities:
1. Main server: RemoteTriggerSchedule.Trigger() inserts a cron_stats
record with status=queued.
2. Worker server: The vulnerability schedule runs with
WithTriggerPollInterval(60s), which starts a poll goroutine that checks
the DB every 60s for queued records.
3. Pickup: When the poll goroutine finds a queued record, it sends the
stats ID on the trigger channel (non-blocking).
4. Execution: The trigger handler acquires the lock, claims the record
via ClaimCronStats (updating status to pending and instance to the
actual worker ID), runs all jobs, and marks it completed.

Key details:
- The trigger channel carries an int: 0 for in-process triggers, >0 for
DB-polled stats IDs. This lets runWithStats reuse the existing record
instead of inserting a new one.
- Both Schedule.Trigger() and RemoteTriggerSchedule.Trigger() treat
pending and queued as conflicts to prevent duplicate runs.
- Queued records expire after 2 hours via CleanupCronStats, same as
pending records.
- The poll goroutine only signals; it doesn't modify DB state. The
handler claims when ready.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Added support for remote trigger execution in vulnerability scheduling
workflows.
* Implemented periodic polling mechanism to detect and process
externally triggered vulnerability scans.

* **Bug Fixes**
* Enhanced trigger status tracking to properly handle queued scan jobs.

* **Improvements**
* Strengthened scheduling system with improved timeout and cancellation
management capabilities.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-02-17 09:18:03 -06:00

140 lines
4.5 KiB
Go

package mysql
import (
"context"
"database/sql"
"encoding/json"
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
"github.com/fleetdm/fleet/v4/server/fleet"
"github.com/jmoiron/sqlx"
)
// GetLatestCronStats returns at most two cron stats records for the schedule with the given name:
// the most recently created 'scheduled' run whose status is pending or completed, and the most
// recently created 'triggered' run whose status is pending, completed, or queued.
//
// NOTE(review): callers appear to expect the scheduled record at index 0 and the triggered record
// at index 1, but the UNION has no outer ORDER BY, so that ordering is presumably incidental —
// confirm before relying on index positions.
//
// On query failure it returns an empty (non-nil) slice together with the wrapped error.
func (ds *Datastore) GetLatestCronStats(ctx context.Context, name string) ([]fleet.CronStats, error) {
	// Each branch of the UNION is limited to one row; name is bound once per branch.
	const latestStmt = `
(
SELECT
id, name, instance, stats_type, status, created_at, updated_at
FROM
cron_stats
WHERE
name = ?
AND stats_type = 'scheduled'
AND (status = 'pending' OR status = 'completed')
ORDER BY
created_at DESC
LIMIT 1)
UNION
(
SELECT
id, name, instance, stats_type, status, created_at, updated_at
FROM
cron_stats
WHERE
name = ?
AND stats_type = 'triggered'
AND (status = 'pending' OR status = 'completed' OR status = 'queued')
ORDER BY
created_at DESC
LIMIT 1)`

	var stats []fleet.CronStats
	if err := sqlx.SelectContext(ctx, ds.reader(ctx), &stats, latestStmt, name, name); err != nil {
		return []fleet.CronStats{}, ctxerr.Wrap(ctx, err, "select cron stats")
	}
	return stats, nil
}
// InsertCronStats inserts a new cron_stats row with the given stats type, schedule name, instance
// identifier, and status, and returns the auto-generated id of the new row.
func (ds *Datastore) InsertCronStats(ctx context.Context, statsType fleet.CronStatsType, name string, instance string, status fleet.CronStatsStatus) (int, error) {
	const insertStmt = `INSERT INTO cron_stats (stats_type, name, instance, status) VALUES (?, ?, ?, ?)`

	result, err := ds.writer(ctx).ExecContext(ctx, insertStmt, statsType, name, instance, status)
	if err != nil {
		return 0, ctxerr.Wrap(ctx, err, "insert cron stats")
	}

	newID, err := result.LastInsertId()
	if err != nil {
		return 0, ctxerr.Wrap(ctx, err, "insert cron stats last insert id")
	}

	return int(newID), nil
}
// UpdateCronStats sets the status and errors columns of the cron_stats row with the given id.
//
// cronErrors is serialized to JSON and stored in the errors column when it is non-nil and
// non-empty; otherwise the column is written as SQL NULL. A nil cronErrors is treated the same
// as an empty one rather than being dereferenced.
func (ds *Datastore) UpdateCronStats(ctx context.Context, id int, status fleet.CronStatsStatus, cronErrors *fleet.CronScheduleErrors) error {
	stmt := `UPDATE cron_stats SET status = ?, errors = ? WHERE id = ?`

	// The zero value has Valid == false, which the driver writes as NULL.
	errorsJSON := sql.NullString{}
	if cronErrors != nil && len(*cronErrors) > 0 {
		// Best effort: if the errors cannot be marshaled, the status update still proceeds and
		// the errors column is stored as NULL instead of failing the whole update.
		if b, err := json.Marshal(cronErrors); err == nil {
			errorsJSON = sql.NullString{String: string(b), Valid: true}
		}
	}

	if _, err := ds.writer(ctx).ExecContext(ctx, stmt, status, errorsJSON, id); err != nil {
		return ctxerr.Wrap(ctx, err, "update cron stats")
	}
	return nil
}
// ClaimCronStats overwrites the status and instance columns of the cron_stats row with the given
// id. The update is unconditional: it does not inspect the row's current status or instance.
func (ds *Datastore) ClaimCronStats(ctx context.Context, id int, instance string, status fleet.CronStatsStatus) error {
	const claimStmt = `UPDATE cron_stats SET status = ?, instance = ? WHERE id = ?`

	_, err := ds.writer(ctx).ExecContext(ctx, claimStmt, status, instance, id)
	if err != nil {
		return ctxerr.Wrap(ctx, err, "claim cron stats")
	}
	return nil
}
// UpdateAllCronStatsForInstance moves every cron_stats row that belongs to the given instance and
// currently has status fromStatus to status toStatus.
func (ds *Datastore) UpdateAllCronStatsForInstance(ctx context.Context, instance string, fromStatus fleet.CronStatsStatus, toStatus fleet.CronStatsStatus) error {
	const transitionStmt = `UPDATE cron_stats SET status = ? WHERE instance = ? AND status = ?`

	// Bind order follows the placeholders: new status, instance, then the status being replaced.
	_, err := ds.writer(ctx).ExecContext(ctx, transitionStmt, toStatus, instance, fromStatus)
	if err != nil {
		return ctxerr.Wrap(ctx, err, "update all cron stats for instance")
	}
	return nil
}
// CleanupCronStats prunes and expires cron_stats rows inside a single retried transaction:
// first it deletes rows created more than two days ago, then it marks stale pending/queued rows
// as expired.
func (ds *Datastore) CleanupCronStats(ctx context.Context) error {
	// Removes cron_stats entries that are older than two days.
	const purgeStmt = `DELETE FROM cron_stats WHERE created_at < DATE_SUB(NOW(), INTERVAL 2 DAY)`

	// Expires cron_stats entries that are either:
	//   1. pending or queued for more than 2 hours with no unexpired lock of the same name
	//      (the instance likely crashed), or
	//   2. pending or queued for more than 12 hours regardless of lock state (hard cap for
	//      hung jobs).
	//
	// NOTE: The lock check assumes locks.name matches cron_stats.name. Schedules using
	// WithAltLockID (e.g., "leader", "worker") store locks under a different name, so
	// the NOT EXISTS check won't find their lock and they fall back to the 2-hour timeout.
	const expireStmt = `
UPDATE cron_stats cs
SET cs.status = ?
WHERE cs.status IN (?, ?)
AND (
(cs.created_at < DATE_SUB(NOW(), INTERVAL 2 HOUR)
AND NOT EXISTS (
SELECT 1 FROM locks l
WHERE l.name = cs.name
AND l.expires_at >= CURRENT_TIMESTAMP
))
OR cs.created_at < DATE_SUB(NOW(), INTERVAL 12 HOUR)
)`

	cleanup := func(tx sqlx.ExtContext) error {
		_, err := tx.ExecContext(ctx, purgeStmt)
		if err != nil {
			return ctxerr.Wrap(ctx, err, "deleting old cron stats")
		}

		// Bind order: the target status first, then the two statuses eligible for expiry.
		_, err = tx.ExecContext(
			ctx,
			expireStmt,
			fleet.CronStatsStatusExpired,
			fleet.CronStatsStatusPending,
			fleet.CronStatsStatusQueued,
		)
		if err != nil {
			return ctxerr.Wrap(ctx, err, "updating expired cron stats")
		}
		return nil
	}

	return ds.withRetryTxx(ctx, cleanup)
}