From ee3bfb759d488af99e97d7622e4e172cdc197c11 Mon Sep 17 00:00:00 2001 From: Jordan Montgomery Date: Thu, 2 Apr 2026 06:16:55 -0400 Subject: [PATCH] #34950 Cleanup nano refetch commands in the background (#42472) **Related issue:** Resolves #34950 I changed from the original spec of 100 old commands to 3 due to load test results. Admittedly my load test meant a very large number of hosts all checked in and triggered deletion at once but at 100 per host and per command the load was too high. 3 still results in cleanup over time and doesn't seem to cause load issues. # Checklist for submitter If some of the following don't apply, delete the relevant line. - [x] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. See [Changes files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files) for more information. - [x] Input data is properly validated, `SELECT *` is avoided, SQL injection is prevented (using placeholders for values in statements), JS inline code is prevented especially for url redirects, and untrusted data interpolated into shell scripts/commands is validated against shell metacharacters. 
- [x] If paths of existing endpoints are modified without backwards compatibility, checked the frontend/CLI for any necessary changes ## Testing - [x] Added/updated automated tests - [x] Where appropriate, [automated tests simulate multiple hosts and test for host isolation](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/reference/patterns-backend.md#unit-testing) (updates to one host's records do not affect another) - [x] QA'd all new/changed functionality manually --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- changes/34950-nano-tables-cleanup | 1 + cmd/fleet/cron.go | 3 + server/datastore/mysql/apple_mdm.go | 120 ++++++++++++++ server/datastore/mysql/apple_mdm_test.go | 198 +++++++++++++++++++++++ server/fleet/datastore.go | 10 ++ server/mock/datastore_mock.go | 24 +++ server/service/apple_mdm.go | 15 ++ server/service/apple_mdm_test.go | 6 + 8 files changed, 377 insertions(+) create mode 100644 changes/34950-nano-tables-cleanup diff --git a/changes/34950-nano-tables-cleanup b/changes/34950-nano-tables-cleanup new file mode 100644 index 0000000000..313da97dce --- /dev/null +++ b/changes/34950-nano-tables-cleanup @@ -0,0 +1 @@ +* Updated iOS/iPadOS refetch logic to slowly clear out old/stale results diff --git a/cmd/fleet/cron.go b/cmd/fleet/cron.go index 006b3b0e52..b0160f433b 100644 --- a/cmd/fleet/cron.go +++ b/cmd/fleet/cron.go @@ -1205,6 +1205,9 @@ func newCleanupsAndAggregationSchedule( } return nil }), + schedule.WithJob("cleanup_orphaned_nano_refetch_commands", func(ctx context.Context) error { + return ds.CleanupOrphanedNanoRefetchCommands(ctx) + }), ) return s, nil diff --git a/server/datastore/mysql/apple_mdm.go b/server/datastore/mysql/apple_mdm.go index 16f52edfd6..ca843afdb5 100644 --- a/server/datastore/mysql/apple_mdm.go +++ b/server/datastore/mysql/apple_mdm.go @@ -6843,6 +6843,126 @@ WHERE ( return nil } +func (ds *Datastore) CleanupStaleNanoRefetchCommands(ctx context.Context, enrollmentID
string, commandUUIDPrefix string, currentCommandUUID string) error { + // Step 1: Get up to 3 old command UUIDs from nano_enrollment_queue for this + // enrollment. The PK is (id, command_uuid) so filtering by id first is efficient, + // and the LIKE prefix on command_uuid narrows within that enrollment's entries. + // 3 may seem like too few but in load testing because of how often this runs it + // was found that larger numbers can cause too much contention + const selectOldCmds = ` + SELECT command_uuid FROM nano_enrollment_queue + WHERE id = ? + AND command_uuid LIKE ? + AND command_uuid != ? + AND created_at < NOW() - INTERVAL 30 DAY + LIMIT 3` + + var oldCmdUUIDs []string + if err := sqlx.SelectContext(ctx, ds.reader(ctx), &oldCmdUUIDs, selectOldCmds, enrollmentID, commandUUIDPrefix+"%", currentCommandUUID); err != nil { + return ctxerr.Wrap(ctx, err, "select old nano refetch commands") + } + if len(oldCmdUUIDs) == 0 { + return nil + } + + // Step 2: From ncr, find which of those command UUIDs have been acknowledged or + // errored for this enrollment. The PK (id, command_uuid) makes this efficient + // since we provide both id and the command_uuid IN list. + selectAckQuery, args, err := sqlx.In(` + SELECT command_uuid FROM nano_command_results + WHERE id = ? AND command_uuid IN (?) AND status IN ('Acknowledged', 'Error')`, + enrollmentID, oldCmdUUIDs) + if err != nil { + return ctxerr.Wrap(ctx, err, "build IN query for nano command results") + } + + var ackCmdUUIDs []string + if err := sqlx.SelectContext(ctx, ds.reader(ctx), &ackCmdUUIDs, selectAckQuery, args...); err != nil { + return ctxerr.Wrap(ctx, err, "select acknowledged nano command results") + } + if len(ackCmdUUIDs) == 0 { + return nil + } + + // Step 3: Delete from neq and ncr in the same transaction, scoped to this enrollment. 
We may not + // need to delete in a transaction here but it's the easiest way to ensure we cleanup ncr and further + // we absolutely must ensure we don't delete from ncr before deleting from neq to avoid command resends + return ds.withRetryTxx(ctx, func(tx sqlx.ExtContext) error { + deleteNEQ, delArgs, err := sqlx.In( + `DELETE FROM nano_enrollment_queue WHERE id = ? AND command_uuid IN (?)`, + enrollmentID, ackCmdUUIDs) + if err != nil { + return ctxerr.Wrap(ctx, err, "build delete neq query") + } + if _, err := tx.ExecContext(ctx, deleteNEQ, delArgs...); err != nil { + return ctxerr.Wrap(ctx, err, "delete stale nano enrollment queue entries") + } + + deleteNCR, delArgs, err := sqlx.In( + `DELETE FROM nano_command_results WHERE id = ? AND command_uuid IN (?)`, + enrollmentID, ackCmdUUIDs) + if err != nil { + return ctxerr.Wrap(ctx, err, "build delete ncr query") + } + if _, err := tx.ExecContext(ctx, deleteNCR, delArgs...); err != nil { + return ctxerr.Wrap(ctx, err, "delete stale nano command results entries") + } + + return nil + }) +} + +func (ds *Datastore) CleanupOrphanedNanoRefetchCommands(ctx context.Context) error { + // Find up to 100 old REFETCH- commands. Note I am doing this as two queries since nano_commands + // can be large and I want to make sure in the case of a large number of active commands there's + // not going to be a full table scan or something happening. 
This is a best effort deletion so it + // is OK if our sample deletes nothing + const selectStmt = ` + SELECT command_uuid FROM nano_commands nc + WHERE nc.command_uuid LIKE 'REFETCH-%' + AND nc.created_at < NOW() - INTERVAL 30 DAY + LIMIT 100` + + var cmdUUIDs []string + if err := sqlx.SelectContext(ctx, ds.reader(ctx), &cmdUUIDs, selectStmt); err != nil { + return ctxerr.Wrap(ctx, err, "get mdm apple refetch commands") + } + if len(cmdUUIDs) == 0 { + return nil + } + + // Delete those that don't have a corresponding entry in nano_enrollment_queue + selectOrphanedCommandsStmt := ` + SELECT command_uuid FROM nano_commands nc + WHERE nc.command_uuid IN (?) AND NOT EXISTS ( + SELECT 1 FROM nano_enrollment_queue neq + WHERE neq.command_uuid = nc.command_uuid AND neq.active = 1 + LIMIT 1 + )` + selectOrphanedCommandsStmt, args, err := sqlx.In(selectOrphanedCommandsStmt, cmdUUIDs) + if err != nil { + return ctxerr.Wrap(ctx, err, "build IN query for orphaned refetch commands") + } + if err := sqlx.SelectContext(ctx, ds.reader(ctx), &cmdUUIDs, selectOrphanedCommandsStmt, args...); err != nil { + return ctxerr.Wrap(ctx, err, "get orphaned refetch commands") + } + if len(cmdUUIDs) == 0 { + return nil + } + + deleteOrphanedCommandsStmt := ` + DELETE FROM nano_commands + WHERE command_uuid IN (?)` + deleteOrphanedCommandsStmt, args, err = sqlx.In(deleteOrphanedCommandsStmt, cmdUUIDs) + if err != nil { + return ctxerr.Wrap(ctx, err, "build IN query for deleting orphaned refetch commands") + } + if _, err := ds.writer(ctx).ExecContext(ctx, deleteOrphanedCommandsStmt, args...); err != nil { + return ctxerr.Wrap(ctx, err, "delete orphaned refetch commands") + } + return nil +} + func (ds *Datastore) GetMDMAppleOSUpdatesSettingsByHostSerial(ctx context.Context, serial string) (string, *fleet.AppleOSUpdateSettings, error) { stmt := ` SELECT diff --git a/server/datastore/mysql/apple_mdm_test.go b/server/datastore/mysql/apple_mdm_test.go index 739fe567a0..c71c731bee 100644 --- 
a/server/datastore/mysql/apple_mdm_test.go +++ b/server/datastore/mysql/apple_mdm_test.go @@ -123,6 +123,8 @@ func TestMDMApple(t *testing.T) { {"GetHostRecoveryLockPasswordStatus", testGetHostRecoveryLockPasswordStatus}, {"ClaimHostsForRecoveryLockClear", testClaimHostsForRecoveryLockClear}, {"RecoveryLockRotation", testRecoveryLockRotation}, + {"CleanupStaleNanoRefetchCommands", testCleanupStaleNanoRefetchCommands}, + {"CleanupOrphanedNanoRefetchCommands", testCleanupOrphanedNanoRefetchCommands}, {"RecoveryLockAutoRotation", testRecoveryLockAutoRotation}, } @@ -11392,6 +11394,202 @@ func testRecoveryLockRotation(t *testing.T, ds *Datastore) { }) } +func testCleanupStaleNanoRefetchCommands(t *testing.T, ds *Datastore) { + ctx := t.Context() + + // Create a host and enroll it in nano MDM. + host, err := ds.NewHost(ctx, &fleet.Host{ + Hostname: "test-cleanup-host", + OsqueryHostID: ptr.String("cleanup-osquery-id"), + NodeKey: ptr.String("cleanup-node-key"), + UUID: "cleanup-test-uuid", + Platform: "ios", + DetailUpdatedAt: time.Now(), + LabelUpdatedAt: time.Now(), + PolicyUpdatedAt: time.Now(), + SeenTime: time.Now(), + }) + require.NoError(t, err) + nanoEnroll(t, ds, host, false) + + enrollmentID := host.UUID + + // Helper to insert a nano command with a specific created_at. + insertNanoCmd := func(cmdUUID, reqType string, createdAt time.Time) { + _, err := ds.writer(ctx).ExecContext(ctx, + `INSERT INTO nano_commands (command_uuid, request_type, command, created_at) VALUES (?, ?, '