fleet/server/datastore/mysql/microsoft_mdm_eager_test.go
Victor Lyuboslavsky 9628f49cb8
Improved the performance of Windows MDM profile reconciliation (#44075)
<!-- Add the related story/sub-task/bug number, like Resolves #123, or
remove if NA -->
**Related issue:** Resolves #44052 

Improve performance by reducing the time for the synchronous API call to
update profiles or switch teams. And spreading out the application of
profiles by processing 2000 hosts every 30 seconds.

1. **Windows profile reconciliation is no longer synchronous to
bulk-set.**
Apple, Android, and Apple-declaration paths still write their pending
state inside the bulk-set transaction. The Windows path commits the
transactional inputs and lets the existing `mdm_windows_profile_manager`
cron pick the work up on its next tick. The visible effect is that
`host_mdm_windows_profiles` is no longer guaranteed to be populated by
the time bulk-set returns; it converges within one cron interval.

2. **The Windows reconciler now processes hosts in bounded batches, with
a persisted cursor.**
Previous behavior was "scan the universe of pending Windows hosts on
every tick." New behavior is a host-window query bounded by batch size
and a `host_uuid` cursor, advanced after the batch commits successfully
and persisted across ticks. A failed tick leaves the cursor untouched so
the same window is retried.

3. **Two replication races are now explicitly handled.**
- Admin-delete vs reconcile: the existence check the reconciler uses to
avoid touching a just-deleted profile reads from the primary, not a
replica.
- Insert lag in the reconciler's own listings: hosts that appear in the
cursor query but are not yet visible in the scoped listings advance the
cursor instead of jamming the loop.

4. **`updates.WindowsConfigProfile` from `BulkSetPendingMDMHostProfiles`
is now always false in production.**
The only consumer ORs it with the transactional signal from
`BatchSetMDMProfiles`, which is the accurate source. The bulk-set call
no longer attempts to compute or return that activity signal itself.

5. **Tests opt in to the old synchronous behavior via a named hook.**
Default test behavior matches production (deferred). Legacy tests whose
assertions require Windows rows immediately after bulk-set call an
explicit enable-hook and rely on `t.Cleanup` to restore.

# Checklist for submitter

If some of the following don't apply, delete the relevant line.

- [x] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
See [Changes
files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files)
for more information.

## Testing

- [x] Added/updated automated tests
- [x] QA'd all new/changed functionality manually


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Windows MDM profile reconciliation batching improvements enable large
team transfers and bulk profile change operations to complete faster,
with profile updates rolling out in the background without blocking host
check-ins or other MDM activity.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-04-28 15:37:43 -05:00

253 lines
8.6 KiB
Go

package mysql
import (
"cmp"
"context"
"fmt"
"slices"
"strings"
"testing"
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
"github.com/fleetdm/fleet/v4/server/fleet"
common_mysql "github.com/fleetdm/fleet/v4/server/platform/mysql"
"github.com/jmoiron/sqlx"
)
// Test-only Windows profile eager-reconciliation helpers. Production
// reconciles via the mdm_windows_profile_manager cron; tests that assert
// on host_mdm_windows_profiles immediately after BulkSetPendingMDMHostProfiles
// must opt in by calling ds.EnableTestWindowsEagerHook(t).
// EnableTestWindowsEagerHook makes BulkSetPendingMDMHostProfiles reconcile
// Windows rows synchronously for the rest of the test. The prior hook is
// restored via t.Cleanup so sibling tests on a shared datastore (e.g.
// TestMDMShared) aren't affected.
func (ds *Datastore) EnableTestWindowsEagerHook(tb testing.TB) {
tb.Helper()
prev := ds.testWindowsEagerHook
ds.testWindowsEagerHook = ds.bulkSetPendingMDMWindowsHostProfilesForTests
tb.Cleanup(func() { ds.testWindowsEagerHook = prev })
}
// bulkSetPendingMDMWindowsHostProfilesForTests reconciles Windows profile
// state for the given hosts by marking obsolete profiles for removal and
// upserting desired profiles. Each batch is committed in its own
// transaction so InnoDB row locks on host_mdm_windows_profiles are held
// only for the duration of one batch.
func (ds *Datastore) bulkSetPendingMDMWindowsHostProfilesForTests(
ctx context.Context,
hostUUIDs []string,
onlyProfileUUIDs []string,
) (updatedDB bool, err error) {
if len(hostUUIDs) == 0 {
return false, nil
}
// The pre-scan listings are read-only and run outside any transaction;
// they use the writer (same node that handled the prior tx) so they
// observe their own committed writes without replica lag.
profilesToInstall, err := ds.listMDMWindowsProfilesToInstallDB(ctx, ds.writer(ctx), hostUUIDs, onlyProfileUUIDs)
if err != nil {
return false, ctxerr.Wrap(ctx, err, "list profiles to install")
}
profilesToRemove, err := ds.listMDMWindowsProfilesToRemoveDB(ctx, ds.writer(ctx), hostUUIDs, onlyProfileUUIDs)
if err != nil {
return false, ctxerr.Wrap(ctx, err, "list profiles to remove")
}
if len(profilesToInstall) == 0 && len(profilesToRemove) == 0 {
return false, nil
}
// Sort by (HostUUID, ProfileUUID) to match the host_mdm_windows_profiles
// PRIMARY KEY (host_uuid, profile_uuid) so concurrent callers acquire
// InnoDB row locks in a consistent order, reducing deadlock risk.
cmpByHostThenProfile := func(a, b *fleet.MDMWindowsProfilePayload) int {
if c := cmp.Compare(a.HostUUID, b.HostUUID); c != 0 {
return c
}
return cmp.Compare(a.ProfileUUID, b.ProfileUUID)
}
slices.SortFunc(profilesToRemove, cmpByHostThenProfile)
slices.SortFunc(profilesToInstall, cmpByHostThenProfile)
const defaultBatchSize = 2000
removeBatchSize := defaultBatchSize
if ds.testDeleteMDMProfilesBatchSize > 0 {
removeBatchSize = ds.testDeleteMDMProfilesBatchSize
}
installBatchSize := defaultBatchSize
if ds.testUpsertMDMDesiredProfilesBatchSize > 0 {
installBatchSize = ds.testUpsertMDMDesiredProfilesBatchSize
}
if len(profilesToRemove) > 0 {
// Mark profiles for removal instead of deleting them. The reconciler
// will pick these up (status=NULL, operation_type='remove') and
// generate <Delete> SyncML commands.
//
// Tuple order `(host_uuid, profile_uuid)` matches the PK so MySQL can
// perform direct PK point lookups for each pair.
err := common_mysql.BatchProcessSimple(profilesToRemove, removeBatchSize, func(batch []*fleet.MDMWindowsProfilePayload) error {
return ds.withRetryTxx(ctx, func(tx sqlx.ExtContext) error {
var sb strings.Builder
sb.WriteString(`UPDATE host_mdm_windows_profiles
SET operation_type = ?, status = NULL, command_uuid = '', detail = ''
WHERE (host_uuid, profile_uuid) IN (`)
args := make([]any, 0, 1+len(batch)*2)
args = append(args, fleet.MDMOperationTypeRemove)
for j, p := range batch {
if j > 0 {
sb.WriteByte(',')
}
sb.WriteString("(?,?)")
args = append(args, p.HostUUID, p.ProfileUUID)
}
sb.WriteByte(')')
// Use RowsAffected to keep Apple-parity: idempotent re-applies
// (rows already at op=remove,status=NULL) flip nothing, so
// updatedDB must stay false in that case. MySQL's
// CLIENT_FOUND_ROWS is not enabled for our connections, so
// RowsAffected reflects rows actually changed.
res, err := tx.ExecContext(ctx, sb.String(), args...)
if err != nil {
return ctxerr.Wrap(ctx, err, "marking profiles for removal")
}
if rows, _ := res.RowsAffected(); rows > 0 {
updatedDB = true
}
return nil
})
})
if err != nil {
return updatedDB, err
}
}
if len(profilesToInstall) == 0 {
return updatedDB, nil
}
err = common_mysql.BatchProcessSimple(profilesToInstall, installBatchSize, func(batch []*fleet.MDMWindowsProfilePayload) error {
return ds.withRetryTxx(ctx, func(tx sqlx.ExtContext) error {
didUpdate, execErr := executeWindowsProfileUpsertBatchForTests(ctx, tx, batch)
if execErr != nil {
return execErr
}
if didUpdate {
updatedDB = true
}
return nil
})
})
if err != nil {
return updatedDB, err
}
return updatedDB, nil
}
// executeWindowsProfileUpsertBatchForTests performs the pre-read
// skip-if-unchanged check and the ON DUPLICATE KEY UPDATE upsert for a
// single batch of Windows profile installs. Returns true if the batch
// resulted in a DB write.
func executeWindowsProfileUpsertBatchForTests(
ctx context.Context,
tx sqlx.ExtContext,
batch []*fleet.MDMWindowsProfilePayload,
) (bool, error) {
profilesToInsert := make(map[string]*fleet.MDMWindowsProfilePayload, len(batch))
var psb strings.Builder
pargs := make([]any, 0, len(batch)*6)
for _, p := range batch {
// Build the desired post-upsert payload used for the skip-if-unchanged
// comparison below. The payload returned by
// listMDMWindowsProfilesToInstallDB only populates a subset of fields,
// so we construct a full payload here matching the values the ON
// DUPLICATE KEY UPDATE clause will set.
desired := &fleet.MDMWindowsProfilePayload{
ProfileUUID: p.ProfileUUID,
ProfileName: p.ProfileName,
HostUUID: p.HostUUID,
Status: nil,
OperationType: fleet.MDMOperationTypeInstall,
Detail: p.Detail,
CommandUUID: p.CommandUUID,
Retries: p.Retries,
Checksum: p.Checksum,
SecretsUpdatedAt: p.SecretsUpdatedAt,
}
profilesToInsert[fmt.Sprintf("%s\n%s", p.ProfileUUID, p.HostUUID)] = desired
pargs = append(pargs, p.ProfileUUID, p.HostUUID, p.ProfileName,
fleet.MDMOperationTypeInstall, p.Checksum, p.SecretsUpdatedAt)
psb.WriteString("(?, ?, ?, ?, NULL, '', ?, ?),")
}
// Tuple order `(host_uuid, profile_uuid)` matches the PK for direct point
// lookups.
selectStmt := fmt.Sprintf(`
SELECT
profile_uuid,
host_uuid,
status,
checksum,
secrets_updated_at,
COALESCE(operation_type, '') AS operation_type,
COALESCE(detail, '') AS detail,
COALESCE(command_uuid, '') AS command_uuid,
COALESCE(profile_name, '') AS profile_name
FROM host_mdm_windows_profiles WHERE (host_uuid, profile_uuid) IN (%s)`,
strings.TrimSuffix(strings.Repeat("(?,?),", len(batch)), ","))
selectArgs := make([]any, 0, 2*len(batch))
for _, p := range batch {
selectArgs = append(selectArgs, p.HostUUID, p.ProfileUUID)
}
var existingProfiles []fleet.MDMWindowsProfilePayload
if err := sqlx.SelectContext(ctx, tx, &existingProfiles, selectStmt, selectArgs...); err != nil {
return false, ctxerr.Wrap(ctx, err, "bulk set pending profile status select existing")
}
var updateNeeded bool
if len(existingProfiles) == len(profilesToInsert) {
for _, exist := range existingProfiles {
insert, ok := profilesToInsert[fmt.Sprintf("%s\n%s", exist.ProfileUUID, exist.HostUUID)]
if !ok || !exist.Equal(*insert) {
updateNeeded = true
break
}
}
} else {
updateNeeded = true
}
if !updateNeeded {
return false, nil
}
baseStmt := fmt.Sprintf(`
INSERT INTO host_mdm_windows_profiles (
profile_uuid,
host_uuid,
profile_name,
operation_type,
status,
command_uuid,
checksum,
secrets_updated_at
)
VALUES %s
ON DUPLICATE KEY UPDATE
profile_name = VALUES(profile_name),
operation_type = VALUES(operation_type),
status = NULL,
command_uuid = VALUES(command_uuid),
detail = '',
checksum = VALUES(checksum),
secrets_updated_at = VALUES(secrets_updated_at)
`, strings.TrimSuffix(psb.String(), ","))
if _, err := tx.ExecContext(ctx, baseStmt, pargs...); err != nil {
return false, ctxerr.Wrap(ctx, err, "bulk set pending profile status execute batch")
}
return true, nil
}