fleet/changes/34116-cert-source-deadlocks
Victor Lyuboslavsky 9cc7a02209
Fixed MySQL deadlocks when multiple hosts are updating their certificates in host vitals at the same time. (#34119)
<!-- Add the related story/sub-task/bug number, like Resolves #123, or
remove if NA -->
**Related issue:** Resolves #34116

# Checklist for submitter

If some of the following don't apply, delete the relevant line.

- [x] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
See [Changes
files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files)
for more information.

## Testing

- [ ] Added/updated automated tests (see below)

- [x] QA'd all new/changed functionality manually

Local test that reproduces the issue and verifies fix. Not checked in.

```go
// testAggressiveDeadlockReproduction creates an aggressive test scenario to reproduce
// deadlocks that occur when multiple hosts update certificate sources concurrently.
//
// This test bypasses the retry logic to actually detect deadlocks and prove the fix works.
//
// Results without fix: ~1100 deadlocks (22% rate) across 50 iterations
// Results with fix: ~160 deadlocks (3% rate) - 86% improvement
func testAggressiveDeadlockReproduction(t *testing.T, ds *Datastore) {
	ctx := context.Background()

	// Create hosts and certificates
	numHosts := 30
	hosts := make([]*fleet.Host, numHosts)
	for i := 0; i < numHosts; i++ {
		host, err := ds.NewHost(ctx, &fleet.Host{
			DetailUpdatedAt: time.Now(),
			LabelUpdatedAt:  time.Now(),
			PolicyUpdatedAt: time.Now(),
			SeenTime:        time.Now(),
			OsqueryHostID:   ptr.String(fmt.Sprintf("deadlock-test-host-%d", i)),
			NodeKey:         ptr.String(fmt.Sprintf("deadlock-test-host-%d-key", i)),
			UUID:            fmt.Sprintf("deadlock-test-host-%d-uuid", i),
			Hostname:        fmt.Sprintf("deadlock-host-%d", i),
		})
		require.NoError(t, err)
		hosts[i] = host
	}

	// Create certificates for each host
	certsPerHost := 15
	allCertIDs := make([][]uint, numHosts)

	for i := 0; i < numHosts; i++ {
		certs := make([]*fleet.HostCertificateRecord, certsPerHost)
		for j := 0; j < certsPerHost; j++ {
			certTemplate := x509.Certificate{
				Subject: pkix.Name{
					Country:            []string{"US"},
					CommonName:         fmt.Sprintf("host%d-cert%d.test.com", i, j),
					Organization:       []string{"Test Org"},
					OrganizationalUnit: []string{"Engineering"},
				},
				Issuer: pkix.Name{
					Country:      []string{"US"},
					CommonName:   "issuer.test.com",
					Organization: []string{"Test Issuer"},
				},
				SerialNumber:          big.NewInt(int64(i*1000 + j)),
				KeyUsage:              x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
				ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
				SignatureAlgorithm:    x509.SHA256WithRSA,
				NotBefore:             time.Now().Add(-time.Hour).Truncate(time.Second).UTC(),
				NotAfter:              time.Now().Add(24 * time.Hour).Truncate(time.Second).UTC(),
				BasicConstraintsValid: true,
			}

			cert := generateTestHostCertificateRecord(t, hosts[i].ID, &certTemplate)
			cert.Source = fleet.SystemHostCertificate
			certs[j] = cert
		}

		// Insert certificates
		err := ds.UpdateHostCertificates(ctx, hosts[i].ID, hosts[i].UUID, certs)
		require.NoError(t, err)

		// Load certificate IDs
		loadedCerts, _, err := ds.ListHostCertificates(ctx, hosts[i].ID, fleet.ListOptions{})
		require.NoError(t, err)
		require.Len(t, loadedCerts, certsPerHost)

		certIDs := make([]uint, certsPerHost)
		for j, cert := range loadedCerts {
			certIDs[j] = cert.ID
		}
		allCertIDs[i] = certIDs
	}

	// Run aggressive deadlock test
	totalDeadlocks := 0
	iterations := 50

	for iter := 0; iter < iterations; iter++ {
		t.Logf("Iteration %d/%d", iter+1, iterations)

		type result struct {
			txIdx int
			err   error
		}

		numTransactions := 100 // Many concurrent transactions
		resultsCh := make(chan result, numTransactions)

		// Launch concurrent transactions
		for txIdx := 0; txIdx < numTransactions; txIdx++ {
			go func(idx int) {
				hostIdx := idx % numHosts
				certIDs := allCertIDs[hostIdx]

				// Build UNSORTED source records to trigger deadlocks
				// Reverse order for even transactions to create lock conflicts
				toReplace := make([]*fleet.HostCertificateRecord, len(certIDs))
				for i := range certIDs {
					actualIdx := i
					if idx%2 == 0 {
						actualIdx = len(certIDs) - 1 - i
					}

					toReplace[i] = &fleet.HostCertificateRecord{
						ID:       certIDs[actualIdx],
						Source:   fleet.UserHostCertificate,
						Username: fmt.Sprintf("user%d", idx),
					}
				}

				// Call replaceHostCertsSourcesDB directly (no retry)
				err := ds.withTx(ctx, func(tx sqlx.ExtContext) error {
					return replaceHostCertsSourcesDB(ctx, tx, toReplace)
				})

				resultsCh <- result{txIdx: idx, err: err}
			}(txIdx)
		}

		// Collect results
		iterDeadlocks := 0
		for i := 0; i < numTransactions; i++ {
			res := <-resultsCh
			if res.err != nil {
				if strings.Contains(res.err.Error(), "Deadlock") || strings.Contains(res.err.Error(), "deadlock") {
					iterDeadlocks++
				} else {
					require.NoError(t, res.err, "Transaction %d unexpected error", res.txIdx)
				}
			}
		}

		if iterDeadlocks > 0 {
			t.Logf("  Deadlocks in iteration %d: %d/%d transactions", iter+1, iterDeadlocks, numTransactions)
			totalDeadlocks += iterDeadlocks
		}
	}

	// Report results
	deadlockRate := float64(totalDeadlocks) / float64(iterations*100) * 100
	t.Logf("\n=== DEADLOCK TEST RESULTS ===")
	t.Logf("Total deadlocks: %d across %d iterations", totalDeadlocks, iterations)
	t.Logf("Deadlock rate: %.1f%%", deadlockRate)
	t.Logf("\nExpected without fix: ~1100 deadlocks (22%% rate)")
	t.Logf("Expected with fix: ~160 deadlocks (3%% rate)")

	if totalDeadlocks > 500 {
		t.Fatalf("High deadlock count suggests fix is not applied or not working")
	}
}
```

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- Bug Fixes
- Resolved rare database deadlocks when multiple hosts update
certificates simultaneously, improving reliability of host vitals
updates.
- Reduced unnecessary delete operations during certificate updates to
lower lock contention and improve stability under load.
- Standardized processing of certificate sources to ensure consistent
behavior across concurrent updates.
- Overall improvements result in smoother certificate synchronization
without user-facing changes.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-10-13 09:48:32 -05:00

1 line
107 B
Text

Fixed MySQL deadlocks when multiple hosts are updating their certificates in host vitals at the same time.