mirror of
https://github.com/fleetdm/fleet
synced 2026-05-18 06:28:40 +00:00
<!-- Add the related story/sub-task/bug number, like Resolves #123, or remove if NA -->
**Related issue:** Resolves #34116

# Checklist for submitter

If some of the following don't apply, delete the relevant line.

- [x] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`.
  See [Changes files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files) for more information.

## Testing

- [ ] Added/updated automated tests (see below)
- [x] QA'd all new/changed functionality manually

Local test that reproduces the issue and verifies fix. Not checked in.

```go
// testAggressiveDeadlockReproduction creates an aggressive test scenario to reproduce
// deadlocks that occur when multiple hosts update certificate sources concurrently.
//
// This test bypasses the retry logic to actually detect deadlocks and prove the fix works.
//
// Results without fix: ~1100 deadlocks (22% rate) across 50 iterations
// Results with fix: ~160 deadlocks (3% rate) - 86% improvement
func testAggressiveDeadlockReproduction(t *testing.T, ds *Datastore) {
	ctx := context.Background()

	// Create hosts and certificates
	numHosts := 30
	hosts := make([]*fleet.Host, numHosts)
	for i := 0; i < numHosts; i++ {
		host, err := ds.NewHost(ctx, &fleet.Host{
			DetailUpdatedAt: time.Now(),
			LabelUpdatedAt:  time.Now(),
			PolicyUpdatedAt: time.Now(),
			SeenTime:        time.Now(),
			OsqueryHostID:   ptr.String(fmt.Sprintf("deadlock-test-host-%d", i)),
			NodeKey:         ptr.String(fmt.Sprintf("deadlock-test-host-%d-key", i)),
			UUID:            fmt.Sprintf("deadlock-test-host-%d-uuid", i),
			Hostname:        fmt.Sprintf("deadlock-host-%d", i),
		})
		require.NoError(t, err)
		hosts[i] = host
	}

	// Create certificates for each host
	certsPerHost := 15
	allCertIDs := make([][]uint, numHosts)
	for i := 0; i < numHosts; i++ {
		certs := make([]*fleet.HostCertificateRecord, certsPerHost)
		for j := 0; j < certsPerHost; j++ {
			certTemplate := x509.Certificate{
				Subject: pkix.Name{
					Country:            []string{"US"},
					CommonName:         fmt.Sprintf("host%d-cert%d.test.com", i, j),
					Organization:       []string{"Test Org"},
					OrganizationalUnit: []string{"Engineering"},
				},
				Issuer: pkix.Name{
					Country:      []string{"US"},
					CommonName:   "issuer.test.com",
					Organization: []string{"Test Issuer"},
				},
				SerialNumber:          big.NewInt(int64(i*1000 + j)),
				KeyUsage:              x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
				ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
				SignatureAlgorithm:    x509.SHA256WithRSA,
				NotBefore:             time.Now().Add(-time.Hour).Truncate(time.Second).UTC(),
				NotAfter:              time.Now().Add(24 * time.Hour).Truncate(time.Second).UTC(),
				BasicConstraintsValid: true,
			}
			cert := generateTestHostCertificateRecord(t, hosts[i].ID, &certTemplate)
			cert.Source = fleet.SystemHostCertificate
			certs[j] = cert
		}

		// Insert certificates
		err := ds.UpdateHostCertificates(ctx, hosts[i].ID, hosts[i].UUID, certs)
		require.NoError(t, err)

		// Load certificate IDs
		loadedCerts, _, err := ds.ListHostCertificates(ctx, hosts[i].ID, fleet.ListOptions{})
		require.NoError(t, err)
		require.Len(t, loadedCerts, certsPerHost)
		certIDs := make([]uint, certsPerHost)
		for j, cert := range loadedCerts {
			certIDs[j] = cert.ID
		}
		allCertIDs[i] = certIDs
	}

	// Run aggressive deadlock test
	totalDeadlocks := 0
	iterations := 50
	for iter := 0; iter < iterations; iter++ {
		t.Logf("Iteration %d/%d", iter+1, iterations)

		type result struct {
			txIdx int
			err   error
		}

		numTransactions := 100 // Many concurrent transactions
		resultsCh := make(chan result, numTransactions)

		// Launch concurrent transactions
		for txIdx := 0; txIdx < numTransactions; txIdx++ {
			go func(idx int) {
				hostIdx := idx % numHosts
				certIDs := allCertIDs[hostIdx]

				// Build UNSORTED source records to trigger deadlocks
				// Reverse order for even transactions to create lock conflicts
				toReplace := make([]*fleet.HostCertificateRecord, len(certIDs))
				for i := range certIDs {
					actualIdx := i
					if idx%2 == 0 {
						actualIdx = len(certIDs) - 1 - i
					}
					toReplace[i] = &fleet.HostCertificateRecord{
						ID:       certIDs[actualIdx],
						Source:   fleet.UserHostCertificate,
						Username: fmt.Sprintf("user%d", idx),
					}
				}

				// Call replaceHostCertsSourcesDB directly (no retry)
				err := ds.withTx(ctx, func(tx sqlx.ExtContext) error {
					return replaceHostCertsSourcesDB(ctx, tx, toReplace)
				})
				resultsCh <- result{txIdx: idx, err: err}
			}(txIdx)
		}

		// Collect results
		iterDeadlocks := 0
		for i := 0; i < numTransactions; i++ {
			res := <-resultsCh
			if res.err != nil {
				if strings.Contains(res.err.Error(), "Deadlock") || strings.Contains(res.err.Error(), "deadlock") {
					iterDeadlocks++
				} else {
					require.NoError(t, res.err, "Transaction %d unexpected error", res.txIdx)
				}
			}
		}

		if iterDeadlocks > 0 {
			t.Logf(" Deadlocks in iteration %d: %d/%d transactions", iter+1, iterDeadlocks, numTransactions)
			totalDeadlocks += iterDeadlocks
		}
	}

	// Report results
	deadlockRate := float64(totalDeadlocks) / float64(iterations*100) * 100
	t.Logf("\n=== DEADLOCK TEST RESULTS ===")
	t.Logf("Total deadlocks: %d across %d iterations", totalDeadlocks, iterations)
	t.Logf("Deadlock rate: %.1f%%", deadlockRate)
	t.Logf("\nExpected without fix: ~1100 deadlocks (22%% rate)")
	t.Logf("Expected with fix: ~160 deadlocks (3%% rate)")

	if totalDeadlocks > 500 {
		t.Fatalf("High deadlock count suggests fix is not applied or not working")
	}
}
```

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- Bug Fixes
  - Resolved rare database deadlocks when multiple hosts update certificates simultaneously, improving reliability of host vitals updates.
  - Reduced unnecessary delete operations during certificate updates to lower lock contention and improve stability under load.
  - Standardized processing of certificate sources to ensure consistent behavior across concurrent updates.
  - Overall improvements result in smoother certificate synchronization without user-facing changes.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 line
107 B
Text
1 line
107 B
Text
Fixed MySQL deadlocks when multiple hosts are updating their certificates in host vitals at the same time.
|