mirror of
https://github.com/fleetdm/fleet
synced 2026-05-24 09:28:54 +00:00
<!-- Add the related story/sub-task/bug number, like Resolves #123, or remove if NA --> **Related issue:** Resolves #35603 # Details This PR aims to optimize the system for recording scheduled query results in the database. Previously, each time a result set was received from a host, the Fleet server would count all of the current result rows in the db for that query before deciding whether to save more. This count becomes more expensive as the DB size grows, until it becomes the "long" pole in the recording process. With this PR, the system changes in the following ways: * When result rows are received from the host, no count is immediately taken. Instead, a Redis key is checked which holds a current approximate count of rows in the table. If the count is over the configured row limit, no rows are saved. Otherwise, rows are saved and the count is adjusted accordingly (it can go down, e.g. if a host previously returned 5 rows for a query and now returns 3). Keep in mind that we only store one set of results per host for a scheduled query; when a host reports results for a query, we delete that host's previous results and write the new ones if there's room. * As an additional failsafe against runaway queries, if a result set contains more than 1000 rows, it is rejected. * Once a minute, a cron job runs which deletes all rows over the limit for each query and resets the counter for all queries to the actual # of rows in the table. The end result is: * No more expensive counts on every distributed write request for scheduled queries * Results for a single query can burst to over the limit for a short time, but will get cleaned up after a minute * Because of concurrency and race issues where multiple hosts might get the same count from Redis before inserting rows, the actual # of results in the db can burst higher than the limit. 
In testing w/ osquery-perf with 1000 hosts started simultaneously, sending 500 rows at a time, a 50,000 row limit and a query running every 10 seconds, I saw the table get up to 60,000 rows at times before being cleaned up. This is a very bad case; in the real world we'd have a lot more jitter in the reporting, and queries would not typically return this many rows. # Checklist for submitter If some of the following don't apply, delete the relevant line. - [X] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. See [Changes files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files) for more information. - [X] Input data is properly validated, `SELECT *` is avoided, SQL injection is prevented (using placeholders for values in statements) ## Testing - [X] Added/updated automated tests Added a new test to verify that results are still discarded if table size is > limit, updated existing tests. - [X] Where appropriate, [automated tests simulate multiple hosts and test for host isolation](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/reference/patterns-backend.md#unit-testing) (updates to one hosts's records do not affect another) - [X] QA'd all new/changed functionality manually Ran osquery-perf with 1000 hosts and a 50,000 row limit per query, using queries that returned 1, 500 and 1000 rows at a time. Verified that the limits were respected (subject to the amount of flex discussed above). I'm doing some A/B tests now using local MySQL metrics and will report back. 
<!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Automated periodic cleanup of excess query results to retain recent data and free storage * Redis-backed query result counting to track per-query result volumes * **Performance Improvements** * Optimized recording of scheduled query results for reduced overhead * Cleanup runs in configurable batches to lower database contention and balance storage use <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai -->
254 lines
8.8 KiB
Go
254 lines
8.8 KiB
Go
package mysql
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
|
|
"github.com/fleetdm/fleet/v4/server/fleet"
|
|
"github.com/jmoiron/sqlx"
|
|
)
|
|
|
|
// OverwriteQueryResultRows overwrites the query result rows for a given query and host.
|
|
// It deletes existing rows for the host/query and inserts the new rows.
|
|
// If the incoming result set has more than the row limit, it bails early without storing anything.
|
|
// Excess rows across all hosts are cleaned up by a separate cron job.
|
|
func (ds *Datastore) OverwriteQueryResultRows(ctx context.Context, rows []*fleet.ScheduledQueryResultRow, maxQueryReportRows int) (rowsAdded int, err error) {
|
|
if len(rows) == 0 {
|
|
return 0, nil
|
|
}
|
|
|
|
// Bail early if the incoming result set is too large (more than the row limit from a single host)
|
|
if len(rows) > 1000 {
|
|
return 0, nil
|
|
}
|
|
|
|
err = ds.withRetryTxx(ctx, func(tx sqlx.ExtContext) error {
|
|
// Since we assume all rows have the same queryID, take it from the first row
|
|
queryID := rows[0].QueryID
|
|
hostID := rows[0].HostID
|
|
|
|
// Delete rows based on the specific queryID and hostID
|
|
deleteStmt := `DELETE FROM query_results WHERE host_id = ? AND query_id = ?`
|
|
result, err := tx.ExecContext(ctx, deleteStmt, hostID, queryID)
|
|
if err != nil {
|
|
return ctxerr.Wrap(ctx, err, "deleting query results for host")
|
|
}
|
|
deletedRows, err := result.RowsAffected()
|
|
if err != nil {
|
|
return ctxerr.Wrap(ctx, err, "getting rows affected for delete")
|
|
}
|
|
|
|
// Insert the new rows
|
|
valueStrings := make([]string, 0, len(rows))
|
|
valueArgs := make([]interface{}, 0, len(rows)*4)
|
|
for _, row := range rows {
|
|
valueStrings = append(valueStrings, "(?, ?, ?, ?)")
|
|
valueArgs = append(valueArgs, queryID, hostID, row.LastFetched, row.Data)
|
|
}
|
|
|
|
//nolint:gosec // SQL query is constructed using constant strings
|
|
insertStmt := `
|
|
INSERT IGNORE INTO query_results (query_id, host_id, last_fetched, data) VALUES
|
|
` + strings.Join(valueStrings, ",")
|
|
|
|
result, err = tx.ExecContext(ctx, insertStmt, valueArgs...)
|
|
if err != nil {
|
|
return ctxerr.Wrap(ctx, err, "inserting new rows")
|
|
}
|
|
insertedRows, err := result.RowsAffected()
|
|
if err != nil {
|
|
return ctxerr.Wrap(ctx, err, "getting rows affected for insert")
|
|
}
|
|
|
|
rowsAdded = int(insertedRows - deletedRows)
|
|
return nil
|
|
})
|
|
|
|
return rowsAdded, ctxerr.Wrap(ctx, err, "overwriting query result rows")
|
|
}
|
|
|
|
// TODO(lucas): Any chance we can store hostname in the query_results table?
|
|
// (to avoid having to left join hosts).
|
|
// QueryResultRows returns the query result rows for a given query
|
|
func (ds *Datastore) QueryResultRows(ctx context.Context, queryID uint, filter fleet.TeamFilter) ([]*fleet.ScheduledQueryResultRow, error) {
|
|
selectStmt := fmt.Sprintf(`
|
|
SELECT qr.query_id, qr.host_id, qr.last_fetched, qr.data,
|
|
h.hostname, h.computer_name, h.hardware_model, h.hardware_serial
|
|
FROM query_results qr
|
|
LEFT JOIN hosts h ON (qr.host_id=h.id)
|
|
WHERE query_id = ? AND data IS NOT NULL AND %s
|
|
`, ds.whereFilterHostsByTeams(filter, "h"))
|
|
|
|
results := []*fleet.ScheduledQueryResultRow{}
|
|
err := sqlx.SelectContext(ctx, ds.reader(ctx), &results, selectStmt, queryID)
|
|
if err != nil {
|
|
return nil, ctxerr.Wrap(ctx, err, "selecting query result rows")
|
|
}
|
|
|
|
return results, nil
|
|
}
|
|
|
|
// ResultCountForQuery counts the query report rows for a given query
|
|
// excluding rows with null data
|
|
func (ds *Datastore) ResultCountForQuery(ctx context.Context, queryID uint) (int, error) {
|
|
var count int
|
|
err := sqlx.GetContext(ctx, ds.reader(ctx), &count, `SELECT COUNT(*) FROM query_results WHERE query_id = ? AND data IS NOT NULL`, queryID)
|
|
if err != nil {
|
|
return 0, ctxerr.Wrap(ctx, err, "counting query results for query")
|
|
}
|
|
|
|
return count, nil
|
|
}
|
|
|
|
// ResultCountForQueryAndHost counts the query report rows for a given query and host
|
|
// excluding rows with null data
|
|
func (ds *Datastore) ResultCountForQueryAndHost(ctx context.Context, queryID, hostID uint) (int, error) {
|
|
var count int
|
|
err := sqlx.GetContext(ctx, ds.reader(ctx), &count, `SELECT COUNT(*) FROM query_results WHERE query_id = ? AND host_id = ? AND data IS NOT NULL`, queryID, hostID)
|
|
if err != nil {
|
|
return 0, ctxerr.Wrap(ctx, err, "counting query results for query and host")
|
|
}
|
|
|
|
return count, nil
|
|
}
|
|
|
|
// QueryResultRowsForHost returns the query result rows for a given query and host
|
|
// including rows with null data
|
|
func (ds *Datastore) QueryResultRowsForHost(ctx context.Context, queryID, hostID uint) ([]*fleet.ScheduledQueryResultRow, error) {
|
|
selectStmt := `
|
|
SELECT query_id, host_id, last_fetched, data FROM query_results
|
|
WHERE query_id = ? AND host_id = ?
|
|
`
|
|
results := []*fleet.ScheduledQueryResultRow{}
|
|
err := sqlx.SelectContext(ctx, ds.reader(ctx), &results, selectStmt, queryID, hostID)
|
|
if err != nil {
|
|
return nil, ctxerr.Wrap(ctx, err, "selecting query result rows for host")
|
|
}
|
|
|
|
return results, nil
|
|
}
|
|
|
|
func (ds *Datastore) CleanupDiscardedQueryResults(ctx context.Context) error {
|
|
deleteStmt := `
|
|
DELETE FROM query_results
|
|
WHERE query_id IN
|
|
(SELECT id FROM queries WHERE discard_data = true)
|
|
`
|
|
_, err := ds.writer(ctx).ExecContext(ctx, deleteStmt)
|
|
if err != nil {
|
|
return ctxerr.Wrap(ctx, err, "cleaning up discarded query results")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// CleanupExcessQueryResultRows deletes query result rows that exceed the maximum
|
|
// allowed per query. It keeps the most recent rows (by id, which correlates with insert order) up to the limit.
|
|
// Deletes are batched to avoid large binlogs and long lock times.
|
|
// This runs as a cron job to ensure the query_results table doesn't grow unbounded.
|
|
// Returns a map of query IDs to their current row count after cleanup (for syncing Redis counters).
|
|
func (ds *Datastore) CleanupExcessQueryResultRows(ctx context.Context, maxQueryReportRows int, opts ...fleet.CleanupExcessQueryResultRowsOptions) (map[uint]int, error) {
|
|
batchSize := 500
|
|
// Allow overriding the batch size mainly for tests.
|
|
if len(opts) > 0 && opts[0].BatchSize > 0 {
|
|
batchSize = opts[0].BatchSize
|
|
}
|
|
|
|
// Get all distinct query_ids that have results and are scheduled queries with discard_data = false
|
|
var queryIDs []uint
|
|
selectStmt := `
|
|
SELECT id
|
|
FROM queries
|
|
WHERE discard_data = false AND logging_type = 'snapshot'
|
|
`
|
|
if err := sqlx.SelectContext(ctx, ds.reader(ctx), &queryIDs, selectStmt); err != nil {
|
|
return nil, ctxerr.Wrap(ctx, err, "selecting query IDs for cleanup")
|
|
}
|
|
|
|
// Nothing to do, bail early.
|
|
if len(queryIDs) == 0 {
|
|
return map[uint]int{}, nil
|
|
}
|
|
|
|
// Get the cutoff IDs for each query in one query.
|
|
// Cutoff is the ID of the Nth most recent row,
|
|
// where N is the maxQueryReportRows.
|
|
type cutoffRow struct {
|
|
QueryID uint `db:"query_id"`
|
|
CutoffID uint `db:"cutoff_id"`
|
|
}
|
|
var queryCutoffs []cutoffRow
|
|
cutoffStmt := `
|
|
SELECT query_id, id as cutoff_id FROM (
|
|
SELECT query_id, id,
|
|
ROW_NUMBER() OVER (PARTITION BY query_id ORDER BY id DESC) as rn
|
|
FROM query_results
|
|
WHERE query_id IN (?) AND data IS NOT NULL
|
|
) cutoff
|
|
WHERE rn = ?
|
|
`
|
|
query, args, err := sqlx.In(cutoffStmt, queryIDs, maxQueryReportRows)
|
|
if err != nil {
|
|
return nil, ctxerr.Wrap(ctx, err, "building cutoff query")
|
|
}
|
|
if err := sqlx.SelectContext(ctx, ds.reader(ctx), &queryCutoffs, ds.reader(ctx).Rebind(query), args...); err != nil {
|
|
return nil, ctxerr.Wrap(ctx, err, "selecting cutoffs")
|
|
}
|
|
|
|
// Delete excess rows from each query, in batches.
|
|
if len(queryCutoffs) > 0 {
|
|
for _, c := range queryCutoffs {
|
|
deleteStmt := `
|
|
DELETE FROM query_results
|
|
WHERE query_id = ? AND id < ? AND data IS NOT NULL
|
|
LIMIT ?
|
|
`
|
|
for {
|
|
result, err := ds.writer(ctx).ExecContext(ctx, deleteStmt, c.QueryID, c.CutoffID, batchSize)
|
|
if err != nil {
|
|
return nil, ctxerr.Wrapf(ctx, err, "cleaning up query %d", c.QueryID)
|
|
}
|
|
rowsAffected, _ := result.RowsAffected()
|
|
if rowsAffected == 0 {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Count the results for each query.
|
|
// This will be used to sync Redis counters.
|
|
type countRow struct {
|
|
QueryID uint `db:"query_id"`
|
|
Count int `db:"count"`
|
|
}
|
|
var counts []countRow
|
|
countStmt := `
|
|
SELECT query_id, COUNT(*) as count
|
|
FROM query_results
|
|
WHERE query_id IN (?) AND data IS NOT NULL
|
|
GROUP BY query_id
|
|
`
|
|
query, args, err = sqlx.In(countStmt, queryIDs)
|
|
if err != nil {
|
|
return nil, ctxerr.Wrap(ctx, err, "building count query")
|
|
}
|
|
if err := sqlx.SelectContext(ctx, ds.reader(ctx), &counts, ds.reader(ctx).Rebind(query), args...); err != nil {
|
|
return nil, ctxerr.Wrap(ctx, err, "selecting counts")
|
|
}
|
|
|
|
queryCounts := make(map[uint]int)
|
|
for _, c := range counts {
|
|
queryCounts[c.QueryID] = c.Count
|
|
}
|
|
|
|
// Include queries with 0 results
|
|
for _, qid := range queryIDs {
|
|
if _, ok := queryCounts[qid]; !ok {
|
|
queryCounts[qid] = 0
|
|
}
|
|
}
|
|
|
|
return queryCounts, nil
|
|
}
|