mirror of
https://github.com/fleetdm/fleet
synced 2026-05-24 09:28:54 +00:00
<!-- Add the related story/sub-task/bug number, like Resolves #123, or remove if NA --> **Related issue:** Resolves #44609 # Details This PR optimizes the historical data collection system in two ways: 1. Adds an additional index on the `host_scd_data` table allowing more efficient lookups of rows by their `valid_to`, to optimize both closing out open rows and deleting old rows 2. Implements batching in the job that deletes old rows, so that it no longer blocks writes if the collection job happens to run at the same time as the cleanup job # Checklist for submitter If some of the following don't apply, delete the relevant line. - [ ] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. See [Changes files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files) for more information. n/a, unreleased - [X] Input data is properly validated, `SELECT *` is avoided, SQL injection is prevented (using placeholders for values in statements), JS inline code is prevented especially for url redirects, and untrusted data interpolated into shell scripts/commands is validated against shell metacharacters. 
- [ ] Timeouts are implemented and retries are limited to avoid infinite loops ## Testing - [ ] Added/updated automated tests - [X] Where appropriate, [automated tests simulate multiple hosts and test for host isolation](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/reference/patterns-backend.md#unit-testing) (updates to one hosts's records do not affect another) - [X] QA'd all new/changed functionality manually SQL explains -- before: ``` +----+-------------+---------------+------------+------+---------------+------+---------+------+--------+----------+-------------+ | id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra | +----+-------------+---------------+------------+------+---------------+------+---------+------+--------+----------+-------------+ | 1 | DELETE | host_scd_data | NULL | ALL | NULL | NULL | NULL | NULL | 144320 | 100.00 | Using where | +----+-------------+---------------+------------+------+---------------+------+---------+------+--------+----------+-------------+ +----+-------------+---------------+------------+-------+--------------------------------------+--------------------+---------+-------------+------+----------+-------------+ | id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra | +----+-------------+---------------+------------+-------+--------------------------------------+--------------------+---------+-------------+------+----------+-------------+ | 1 | UPDATE | host_scd_data | NULL | range | uniq_entity_bucket,idx_dataset_range | uniq_entity_bucket | 604 | const,const | 3030 | 100.00 | Using where | +----+-------------+---------------+------------+-------+--------------------------------------+--------------------+---------+-------------+------+----------+-------------+ ``` Using a test set of data (~144k "open" rows), UPDATES happened at 9 ops per second. 
after: ``` +----+-------------+---------------+------------+-------+----------------------+----------------------+---------+-------+-------+----------+-------------+ | id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra | +----+-------------+---------------+------------+-------+----------------------+----------------------+---------+-------+-------+----------+-------------+ | 1 | DELETE | host_scd_data | NULL | range | idx_valid_to_dataset | idx_valid_to_dataset | 5 | const | 55749 | 100.00 | Using where | +----+-------------+---------------+------------+-------+----------------------+----------------------+---------+-------+-------+----------+-------------+ +----+-------------+---------------+------------+-------+-----------------------------------------------------------+----------------------+---------+-------------------+------+----------+------------------------------+ | id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra | +----+-------------+---------------+------------+-------+-----------------------------------------------------------+----------------------+---------+-------------------+------+----------+------------------------------+ | 1 | UPDATE | host_scd_data | NULL | range | uniq_entity_bucket,idx_dataset_range,idx_valid_to_dataset | idx_valid_to_dataset | 609 | const,const,const | 4 | 100.00 | Using where; Using temporary | +----+-------------+---------------+------------+-------+-----------------------------------------------------------+----------------------+---------+-------------------+------+----------+------------------------------+ ``` Using the same test set of data, UPDATES happened at 4,910 ops per second. For unreleased bug fixes in a release candidate, one of: - [X] Confirmed that the fix is not expected to adversely impact load test results this should significantly improve results! 
- [ ] Alerted the release DRI if additional load testing is needed ## Database migrations - [X] Checked schema for all modified table for columns that will auto-update timestamps during migration. - [ ] Confirmed that updating the timestamps is acceptable, and will not cause unwanted side effects. - [ ] Ensured the correct collation is explicitly set for character columns (`COLLATE utf8mb4_unicode_ci`). <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Chores** * Cleanup now runs in controlled, ordered batches, removing only closed/historical records while respecting cancellation; error reporting for cleanup was strengthened. * Added a new composite index on historical data to improve cleanup and query performance. * **Tests** * Added tests and test helpers validating batched cleanup behavior, preservation of open records, multi-batch operation, and cancellation handling. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
191 lines
8.3 KiB
Go
191 lines
8.3 KiB
Go
package mysql
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/fleetdm/fleet/v4/server/chart"
|
|
"github.com/fleetdm/fleet/v4/server/chart/api"
|
|
"github.com/fleetdm/fleet/v4/server/chart/internal/testutils"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
func TestAggregateBucketAccumulate(t *testing.T) {
|
|
bucketStart := time.Date(2026, 4, 21, 0, 0, 0, 0, time.UTC)
|
|
bucketEnd := bucketStart.Add(24 * time.Hour)
|
|
|
|
// Three accumulate rows within the bucket, each observed during a different
|
|
// hour. Accumulate semantics = union of all overlapping rows.
|
|
rows := []scdRow{
|
|
{EntityID: "", HostBitmap: chart.HostIDsToBlob([]uint{1, 2}), ValidFrom: bucketStart.Add(2 * time.Hour), ValidTo: bucketStart.Add(3 * time.Hour)},
|
|
{EntityID: "", HostBitmap: chart.HostIDsToBlob([]uint{3}), ValidFrom: bucketStart.Add(10 * time.Hour), ValidTo: bucketStart.Add(11 * time.Hour)},
|
|
{EntityID: "", HostBitmap: chart.HostIDsToBlob([]uint{2, 4}), ValidFrom: bucketStart.Add(15 * time.Hour), ValidTo: bucketStart.Add(16 * time.Hour)},
|
|
}
|
|
|
|
got := aggregateBucket(rows, bucketStart, bucketEnd, api.SampleStrategyAccumulate)
|
|
assert.Equal(t, 4, chart.BlobPopcount(got), "union of {1,2}, {3}, {2,4} = {1,2,3,4}")
|
|
}
|
|
|
|
func TestAggregateBucketAccumulateMultiEntity(t *testing.T) {
|
|
bucketStart := time.Date(2026, 4, 21, 14, 0, 0, 0, time.UTC)
|
|
bucketEnd := bucketStart.Add(time.Hour)
|
|
|
|
// Future-style multi-entity accumulate dataset (e.g. software usage):
|
|
// entity = software name; bitmap = hosts that used that software this hour.
|
|
// Bucket value = distinct hosts using any tracked software during the hour.
|
|
rows := []scdRow{
|
|
{EntityID: "slack", HostBitmap: chart.HostIDsToBlob([]uint{1, 2}), ValidFrom: bucketStart, ValidTo: bucketEnd},
|
|
{EntityID: "zoom", HostBitmap: chart.HostIDsToBlob([]uint{2, 3}), ValidFrom: bucketStart, ValidTo: bucketEnd},
|
|
{EntityID: "chrome", HostBitmap: chart.HostIDsToBlob([]uint{4}), ValidFrom: bucketStart, ValidTo: bucketEnd},
|
|
}
|
|
|
|
got := aggregateBucket(rows, bucketStart, bucketEnd, api.SampleStrategyAccumulate)
|
|
assert.Equal(t, 4, chart.BlobPopcount(got), "union across entities = {1,2,3,4}")
|
|
}
|
|
|
|
func TestAggregateBucketSnapshotEndOfBucket(t *testing.T) {
|
|
bucketStart := time.Date(2026, 4, 21, 0, 0, 0, 0, time.UTC)
|
|
bucketEnd := bucketStart.Add(24 * time.Hour)
|
|
|
|
// One entity "cve-A" changed state mid-bucket: affected hosts were {1,2,3}
|
|
// from hr 0 to hr 14, then {1,2} from hr 14 onward (H3 patched).
|
|
// End-of-bucket semantics should return only the *latest* state, not the OR.
|
|
rows := []scdRow{
|
|
{EntityID: "cve-A", HostBitmap: chart.HostIDsToBlob([]uint{1, 2, 3}), ValidFrom: bucketStart, ValidTo: bucketStart.Add(14 * time.Hour)},
|
|
{EntityID: "cve-A", HostBitmap: chart.HostIDsToBlob([]uint{1, 2}), ValidFrom: bucketStart.Add(14 * time.Hour), ValidTo: time.Date(9999, 12, 31, 0, 0, 0, 0, time.UTC)},
|
|
}
|
|
|
|
got := aggregateBucket(rows, bucketStart, bucketEnd, api.SampleStrategySnapshot)
|
|
assert.Equal(t, 2, chart.BlobPopcount(got), "end-of-bucket state is {1,2}, not union {1,2,3}")
|
|
}
|
|
|
|
func TestAggregateBucketSnapshotMultipleEntities(t *testing.T) {
|
|
bucketStart := time.Date(2026, 4, 21, 0, 0, 0, 0, time.UTC)
|
|
bucketEnd := bucketStart.Add(24 * time.Hour)
|
|
|
|
sentinel := time.Date(9999, 12, 31, 0, 0, 0, 0, time.UTC)
|
|
|
|
// Two entities, each with an end-of-bucket state; snapshot returns OR across
|
|
// entities of each's latest row.
|
|
rows := []scdRow{
|
|
// cve-A: latest state {1,2}
|
|
{EntityID: "cve-A", HostBitmap: chart.HostIDsToBlob([]uint{1, 2, 3}), ValidFrom: bucketStart, ValidTo: bucketStart.Add(14 * time.Hour)},
|
|
{EntityID: "cve-A", HostBitmap: chart.HostIDsToBlob([]uint{1, 2}), ValidFrom: bucketStart.Add(14 * time.Hour), ValidTo: sentinel},
|
|
// cve-B: latest state {3,4}
|
|
{EntityID: "cve-B", HostBitmap: chart.HostIDsToBlob([]uint{3, 4}), ValidFrom: bucketStart.Add(5 * time.Hour), ValidTo: sentinel},
|
|
}
|
|
|
|
got := aggregateBucket(rows, bucketStart, bucketEnd, api.SampleStrategySnapshot)
|
|
assert.Equal(t, 4, chart.BlobPopcount(got), "union of cve-A end-state {1,2} and cve-B end-state {3,4}")
|
|
}
|
|
|
|
func TestAggregateBucketSnapshotEntityDisappears(t *testing.T) {
|
|
bucketStart := time.Date(2026, 4, 21, 0, 0, 0, 0, time.UTC)
|
|
bucketEnd := bucketStart.Add(24 * time.Hour)
|
|
|
|
// Entity was active early in bucket but its row was closed mid-bucket with
|
|
// no replacement (entity disappeared — e.g., last affected host patched).
|
|
// End-of-bucket semantics exclude it: no row is active at bucketEnd.
|
|
rows := []scdRow{
|
|
{EntityID: "cve-A", HostBitmap: chart.HostIDsToBlob([]uint{1, 2, 3}), ValidFrom: bucketStart, ValidTo: bucketStart.Add(14 * time.Hour)},
|
|
}
|
|
|
|
got := aggregateBucket(rows, bucketStart, bucketEnd, api.SampleStrategySnapshot)
|
|
assert.Equal(t, 0, chart.BlobPopcount(got), "entity closed mid-bucket is absent at bucketEnd")
|
|
}
|
|
|
|
func TestAggregateBucketSnapshotRowClosedExactlyAtBucketEnd(t *testing.T) {
|
|
bucketStart := time.Date(2026, 4, 21, 0, 0, 0, 0, time.UTC)
|
|
bucketEnd := bucketStart.Add(24 * time.Hour)
|
|
|
|
// Row's valid_to == bucketEnd. The row represents state up to (but not
|
|
// including) bucketEnd — i.e., the state just before the bucket ends.
|
|
// That's exactly what end-of-bucket semantics should pick.
|
|
rows := []scdRow{
|
|
{EntityID: "cve-A", HostBitmap: chart.HostIDsToBlob([]uint{1, 2}), ValidFrom: bucketStart, ValidTo: bucketEnd},
|
|
}
|
|
|
|
got := aggregateBucket(rows, bucketStart, bucketEnd, api.SampleStrategySnapshot)
|
|
assert.Equal(t, 2, chart.BlobPopcount(got), "row whose valid_to equals bucketEnd covers bucketEnd-ε")
|
|
}
|
|
|
|
func TestCleanupSCDData(t *testing.T) {
|
|
tdb := testutils.SetupTestDB(t, "chart_mysql")
|
|
ds := NewDatastore(tdb.Conns(), tdb.Logger)
|
|
|
|
cases := []struct {
|
|
name string
|
|
fn func(t *testing.T, tdb *testutils.TestDB, ds *Datastore)
|
|
}{
|
|
{"PreservesOpenAndRecent", testCleanupPreservesOpenAndRecent},
|
|
{"MultipleBatches", testCleanupMultipleBatches},
|
|
{"HonorsCtxCancellation", testCleanupHonorsCtxCancellation},
|
|
}
|
|
for _, c := range cases {
|
|
t.Run(c.name, func(t *testing.T) {
|
|
defer tdb.TruncateTables(t)
|
|
c.fn(t, tdb, ds)
|
|
})
|
|
}
|
|
}
|
|
|
|
func testCleanupPreservesOpenAndRecent(t *testing.T, tdb *testutils.TestDB, ds *Datastore) {
|
|
ctx := t.Context()
|
|
now := time.Now().UTC()
|
|
|
|
// Old closed row — should be deleted (valid_to is 40 days ago, retention 30).
|
|
tdb.InsertSCDRow(t, "cve", "old", now.AddDate(0, 0, -45), now.AddDate(0, 0, -40))
|
|
// Recent closed row — within retention window, should be preserved.
|
|
tdb.InsertSCDRow(t, "cve", "recent", now.AddDate(0, 0, -10), now.AddDate(0, 0, -5))
|
|
// Open row (sentinel valid_to) — must always be preserved.
|
|
tdb.InsertSCDRow(t, "cve", "open", now.AddDate(0, 0, -45), scdOpenSentinel)
|
|
|
|
require.NoError(t, ds.CleanupSCDData(ctx, 30))
|
|
|
|
assert.Equal(t, 2, tdb.CountSCDRows(t), "only the old closed row should be deleted")
|
|
|
|
var entities []string
|
|
require.NoError(t, tdb.DB.SelectContext(ctx, &entities, `SELECT entity_id FROM host_scd_data ORDER BY entity_id`))
|
|
assert.Equal(t, []string{"open", "recent"}, entities)
|
|
}
|
|
|
|
func testCleanupMultipleBatches(t *testing.T, tdb *testutils.TestDB, ds *Datastore) {
|
|
ctx := t.Context()
|
|
now := time.Now().UTC()
|
|
|
|
// Shrink batch size so we can prove the loop iterates without inserting
|
|
// thousands of rows.
|
|
prev := scdCleanupBatch
|
|
scdCleanupBatch = 3
|
|
t.Cleanup(func() { scdCleanupBatch = prev })
|
|
|
|
// Insert 10 expired closed rows — that's 4 iterations at batch size 3
|
|
// (3 + 3 + 3 + 1, where the final partial batch terminates the loop).
|
|
for i := range 10 {
|
|
validFrom := now.AddDate(0, 0, -45).Add(time.Duration(i) * time.Minute)
|
|
validTo := now.AddDate(0, 0, -40).Add(time.Duration(i) * time.Minute)
|
|
tdb.InsertSCDRow(t, "cve", fmt.Sprintf("e%d", i), validFrom, validTo)
|
|
}
|
|
|
|
require.NoError(t, ds.CleanupSCDData(ctx, 30))
|
|
|
|
assert.Equal(t, 0, tdb.CountSCDRows(t), "all expired rows should be drained across batches")
|
|
}
|
|
|
|
func testCleanupHonorsCtxCancellation(t *testing.T, tdb *testutils.TestDB, ds *Datastore) {
|
|
now := time.Now().UTC()
|
|
|
|
// Insert a single expired row so a non-canceled call would have something
|
|
// to delete — confirms that nothing was removed because of cancellation.
|
|
tdb.InsertSCDRow(t, "cve", "old", now.AddDate(0, 0, -45), now.AddDate(0, 0, -40))
|
|
|
|
ctx, cancel := context.WithCancel(t.Context())
|
|
cancel()
|
|
|
|
err := ds.CleanupSCDData(ctx, 30)
|
|
require.ErrorIs(t, err, context.Canceled)
|
|
assert.Equal(t, 1, tdb.CountSCDRows(t), "no rows should be deleted when ctx was canceled before the first batch")
|
|
}
|