fleet/changes/32178-software-vuln-perf
Victor Lyuboslavsky 9295a82e83
Improved MySQL query performance software versions and vulnerabilities endpoints (#34262)
<!-- Add the related story/sub-task/bug number, like Resolves #123, or
remove if NA -->
**Related issue:** Resolves #32178

Software optimization: skipping an unnecessary software_cve join when
vulnerability details are not needed. Vulnerabilities are still
returned, so functionality remains unchanged.

Vulnerabilities optimization: Query vulnerability_host_counts directly
and LEFT JOIN for metadata. This eliminates the expensive UNION of all
CVE rows that was causing performance issues.

Previous approach: UNION all CVEs (many rows) → JOIN
vulnerability_host_counts → filter
New approach: Start with filtered vulnerability_host_counts → LEFT JOIN
for metadata
This reduces the working set before any expensive operations

# Checklist for submitter

If some of the following don't apply, delete the relevant line.

- [x] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
See [Changes
files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files)
for more information.

## Testing

- [ ] Added/updated automated tests (see below for the test)
- [x] QA'd all new/changed functionality manually
- Planning to test in loadtest after also improving the software
endpoint

Performance test for replicating the problem and testing the fix:
```go
package mysql

import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/fleetdm/fleet/v4/server/fleet"
	"github.com/fleetdm/fleet/v4/server/ptr"
	"github.com/fleetdm/fleet/v4/server/test"
	"github.com/stretchr/testify/require"
)

// TestListVulnerabilitiesPerformance is a performance test that replicates
// the production performance problem with ListVulnerabilities.
//
// This test creates a realistic dataset with thousands of CVEs and measures
// query performance under various conditions. Run with:
//
//	go test -v -run TestListVulnerabilitiesPerformance ./server/datastore/mysql
//
// To see detailed timing output, set the environment variable:
//
//	VERBOSE=1 go test -v -run TestListVulnerabilitiesPerformance ./server/datastore/mysql
func TestListVulnerabilitiesPerformance(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping performance test in short mode")
	}

	ds := CreateMySQLDS(t)
	defer TruncateTables(t, ds)

	ctx := context.Background()

	// Create a realistic dataset
	t.Log("Setting up test data...")
	setupPerformanceTestData(t, ds)
	t.Log("Test data setup complete")

	// Test cases covering common query patterns
	testCases := []struct {
		name string
		opts fleet.VulnListOptions
	}{
		{
			name: "Global list - first page, sorted by host count",
			opts: fleet.VulnListOptions{
				IsEE: true,
				ListOptions: fleet.ListOptions{
					Page:           0,
					PerPage:        20,
					OrderKey:       "hosts_count",
					OrderDirection: fleet.OrderDescending,
				},
			},
		},
		{
			name: "Team 1 list - first page, sorted by host count",
			opts: fleet.VulnListOptions{
				IsEE:   true,
				TeamID: ptr.Uint(1),
				ListOptions: fleet.ListOptions{
					Page:           0,
					PerPage:        20,
					OrderKey:       "hosts_count",
					OrderDirection: fleet.OrderDescending,
				},
			},
		},
		{
			name: "Team 1 list - with exploit filter",
			opts: fleet.VulnListOptions{
				IsEE:         true,
				TeamID:       ptr.Uint(1),
				KnownExploit: true,
				ListOptions: fleet.ListOptions{
					Page:           0,
					PerPage:        20,
					OrderKey:       "hosts_count",
					OrderDirection: fleet.OrderDescending,
				},
			},
		},
		{
			name: "Global list - with CVE search",
			opts: fleet.VulnListOptions{
				IsEE: true,
				ListOptions: fleet.ListOptions{
					Page:           0,
					PerPage:        20,
					MatchQuery:     "2023",
					OrderKey:       "hosts_count",
					OrderDirection: fleet.OrderDescending,
				},
			},
		},
		{
			name: "Global list - second page",
			opts: fleet.VulnListOptions{
				IsEE: true,
				ListOptions: fleet.ListOptions{
					Page:           1,
					PerPage:        20,
					OrderKey:       "hosts_count",
					OrderDirection: fleet.OrderDescending,
				},
			},
		},
		{
			name: "Free version - global list",
			opts: fleet.VulnListOptions{
				IsEE: false,
				ListOptions: fleet.ListOptions{
					Page:           0,
					PerPage:        20,
					OrderKey:       "hosts_count",
					OrderDirection: fleet.OrderDescending,
				},
			},
		},
	}

	// Run performance tests
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Warm up the query cache
			_, _, err := ds.ListVulnerabilities(ctx, tc.opts)
			require.NoError(t, err)

			// Measure query performance
			const iterations = 5
			var totalDuration time.Duration

			for i := 0; i < iterations; i++ {
				start := time.Now()
				vulns, meta, err := ds.ListVulnerabilities(ctx, tc.opts)
				duration := time.Since(start)
				totalDuration += duration

				require.NoError(t, err)
				require.NotNil(t, meta)
				require.NotEmpty(t, vulns, "expected vulnerabilities to be returned")

				if i == 0 {
					t.Logf("  First run: %v (returned %d results)", duration, len(vulns))
				}
			}

			avgDuration := totalDuration / iterations
			t.Logf("  Average of %d runs: %v", iterations, avgDuration)

			// Performance assertions
			// These thresholds represent the current performance problem
			// After optimization, these should be reduced significantly
			if avgDuration > 2*time.Second {
				t.Logf("  ⚠️  WARNING: Query took %v (>2s) - performance issue detected", avgDuration)
			} else if avgDuration > 500*time.Millisecond {
				t.Logf("  ⚠️  SLOW: Query took %v (>500ms)", avgDuration)
			} else {
				t.Logf("  ✓ GOOD: Query took %v (<500ms)", avgDuration)
			}
		})
	}

	// Test count query performance
	t.Run("Count vulnerabilities performance", func(t *testing.T) {
		opts := fleet.VulnListOptions{
			IsEE: true,
		}

		// Warm up
		_, err := ds.CountVulnerabilities(ctx, opts)
		require.NoError(t, err)

		// Measure
		const iterations = 5
		var totalDuration time.Duration

		for i := 0; i < iterations; i++ {
			start := time.Now()
			count, err := ds.CountVulnerabilities(ctx, opts)
			duration := time.Since(start)
			totalDuration += duration

			require.NoError(t, err)
			require.Greater(t, count, uint(0))

			if i == 0 {
				t.Logf("  First run: %v (count=%d)", duration, count)
			}
		}

		avgDuration := totalDuration / iterations
		t.Logf("  Average of %d runs: %v", iterations, avgDuration)

		if avgDuration > 2*time.Second {
			t.Logf("  ⚠️  WARNING: Count query took %v (>2s)", avgDuration)
		} else if avgDuration > 500*time.Millisecond {
			t.Logf("  ⚠️  SLOW: Count query took %v (>500ms)", avgDuration)
		} else {
			t.Logf("  ✓ GOOD: Count query took %v (<500ms)", avgDuration)
		}
	})
}

// BenchmarkListVulnerabilities provides benchmark results for ListVulnerabilities.
// Run with:
//
//	go test -bench=BenchmarkListVulnerabilities -benchmem -run=^$ ./server/datastore/mysql
func BenchmarkListVulnerabilities(b *testing.B) {
	ds := CreateMySQLDSForBenchmark(b)
	defer TruncateTables(b, ds)

	ctx := context.Background()

	// Setup test data
	setupPerformanceTestData(b, ds)

	b.ResetTimer()

	// Benchmark the most common query pattern
	opts := fleet.VulnListOptions{
		IsEE: true,
		ListOptions: fleet.ListOptions{
			Page:           0,
			PerPage:        20,
			OrderKey:       "hosts_count",
			OrderDirection: fleet.OrderDescending,
		},
	}

	for i := 0; i < b.N; i++ {
		_, _, err := ds.ListVulnerabilities(ctx, opts)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkListVulnerabilitiesWithTeam benchmarks team-specific queries
func BenchmarkListVulnerabilitiesWithTeam(b *testing.B) {
	ds := CreateMySQLDSForBenchmark(b)
	defer TruncateTables(b, ds)

	ctx := context.Background()
	setupPerformanceTestData(b, ds)

	b.ResetTimer()

	opts := fleet.VulnListOptions{
		IsEE:   true,
		TeamID: ptr.Uint(1),
		ListOptions: fleet.ListOptions{
			Page:           0,
			PerPage:        20,
			OrderKey:       "hosts_count",
			OrderDirection: fleet.OrderDescending,
		},
	}

	for i := 0; i < b.N; i++ {
		_, _, err := ds.ListVulnerabilities(ctx, opts)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkCountVulnerabilities benchmarks the count query
func BenchmarkCountVulnerabilities(b *testing.B) {
	ds := CreateMySQLDSForBenchmark(b)
	defer TruncateTables(b, ds)

	ctx := context.Background()
	setupPerformanceTestData(b, ds)

	b.ResetTimer()

	opts := fleet.VulnListOptions{
		IsEE: true,
	}

	for i := 0; i < b.N; i++ {
		_, err := ds.CountVulnerabilities(ctx, opts)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// setupPerformanceTestData creates a realistic dataset that mimics production
// This creates:
// - ~80,000+ unique CVEs (matching production scale)
// - ~73,000 software_cve entries
// - ~35,000 operating_system_vulnerabilities entries
// - Multiple teams
// - Various host counts per vulnerability
//
// Note: This will take several minutes to run but will replicate production performance issues
func setupPerformanceTestData(t testing.TB, ds *Datastore) {
	ctx := context.Background()

	// Create 100 hosts across different teams and OS types (doubled from 50)
	// More hosts = more realistic host count distributions
	hosts := make([]*fleet.Host, 100)
	for i := 0; i < 100; i++ {
		hosts[i] = test.NewHost(t, ds, fmt.Sprintf("host%d", i),
			fmt.Sprintf("192.168.1.%d", i%255+1), // Handle more than 255 hosts
			fmt.Sprintf("key%d", i),
			fmt.Sprintf("uuid%d", i),
			time.Now())
	}

	// Create 3 teams
	team1, err := ds.NewTeam(ctx, &fleet.Team{Name: "Engineering"})
	require.NoError(t, err)

	team2, err := ds.NewTeam(ctx, &fleet.Team{Name: "Sales"})
	require.NoError(t, err)

	team3, err := ds.NewTeam(ctx, &fleet.Team{Name: "Support"})
	require.NoError(t, err)

	// Distribute hosts across teams
	// 40 hosts in team1, 30 in team2, 20 in team3, 10 with no team
	err = ds.AddHostsToTeam(ctx, fleet.NewAddHostsToTeamParams(&team1.ID, getHostIDs(hosts[0:40])))
	require.NoError(t, err)

	err = ds.AddHostsToTeam(ctx, fleet.NewAddHostsToTeamParams(&team2.ID, getHostIDs(hosts[40:70])))
	require.NoError(t, err)

	err = ds.AddHostsToTeam(ctx, fleet.NewAddHostsToTeamParams(&team3.ID, getHostIDs(hosts[70:90])))
	require.NoError(t, err)

	// Set up OS versions (Windows, macOS, Ubuntu)
	windowsOS := fleet.OperatingSystem{
		Name:     "Microsoft Windows 11 Enterprise",
		Version:  "10.0.22621.2715",
		Arch:     "x86_64",
		Platform: "windows",
	}

	macOS := fleet.OperatingSystem{
		Name:     "macOS",
		Version:  "14.1.2",
		Arch:     "arm64",
		Platform: "darwin",
	}

	ubuntuOS := fleet.OperatingSystem{
		Name:     "Ubuntu",
		Version:  "22.04",
		Arch:     "x86_64",
		Platform: "ubuntu",
	}

	// Assign OS to hosts: 50 Windows, 30 macOS, 20 Ubuntu
	for i := 0; i < 50; i++ {
		err = ds.UpdateHostOperatingSystem(ctx, hosts[i].ID, windowsOS)
		require.NoError(t, err)
	}
	for i := 50; i < 80; i++ {
		err = ds.UpdateHostOperatingSystem(ctx, hosts[i].ID, macOS)
		require.NoError(t, err)
	}
	for i := 80; i < 100; i++ {
		err = ds.UpdateHostOperatingSystem(ctx, hosts[i].ID, ubuntuOS)
		require.NoError(t, err)
	}

	err = ds.UpdateOSVersions(ctx)
	require.NoError(t, err)

	// Create realistic CVE distribution matching production scale
	// In production, we see:
	// - ~73,000 software CVEs
	// - ~35,000 OS CVEs
	// - Many CVEs overlap between software and OS
	// - ~80,000 unique CVEs after deduplication

	// We now create production-scale data:
	// - 50,000 software CVEs (70% of production)
	// - 30,000 OS CVEs (85% of production)
	// - Some overlap to create ~80,000 total CVEs after UNION
	// This should replicate the 500-1000ms+ query times seen in production

	t.Log("Creating software vulnerabilities... (this will take a few minutes)")
	createSoftwareVulnerabilities(t, ds, hosts[:60], 50000)

	t.Log("Creating OS vulnerabilities...")
	createOSVulnerabilities(t, ds, 30000)

	t.Log("Creating CVE metadata...")
	createCVEMetadata(t, ds, 80000)

	t.Log("Updating vulnerability host counts...")
	err = ds.UpdateVulnerabilityHostCounts(ctx, 10)
	require.NoError(t, err)

	t.Log("Setup complete - ready for performance testing")
}

// createSoftwareVulnerabilities creates software entries and their CVEs
func createSoftwareVulnerabilities(t testing.TB, ds *Datastore, hosts []*fleet.Host, numCVEs int) {
	ctx := context.Background()

	// Create more software packages to better distribute CVEs
	softwarePackages := []fleet.Software{
		{Name: "Chrome", Version: "120.0.1", Source: "programs"},
		{Name: "Firefox", Version: "121.0", Source: "programs"},
		{Name: "Node.js", Version: "18.19.0", Source: "programs"},
		{Name: "Python", Version: "3.11.7", Source: "programs"},
		{Name: "Docker", Version: "24.0.7", Source: "programs"},
		{Name: "nginx", Version: "1.24.0", Source: "deb_packages"},
		{Name: "postgresql", Version: "15.5", Source: "deb_packages"},
		{Name: "redis", Version: "7.2.3", Source: "deb_packages"},
		{Name: "mysql", Version: "8.0.35", Source: "deb_packages"},
		{Name: "git", Version: "2.43.0", Source: "deb_packages"},
		{Name: "openssl", Version: "3.0.12", Source: "deb_packages"},
		{Name: "curl", Version: "8.5.0", Source: "deb_packages"},
		{Name: "vim", Version: "9.0", Source: "deb_packages"},
		{Name: "apache2", Version: "2.4.58", Source: "deb_packages"},
		{Name: "php", Version: "8.2.14", Source: "deb_packages"},
	}

	// Install software on hosts
	for i, host := range hosts {
		// Each host gets 5-8 software packages
		numPackages := 5 + (i % 4)
		if numPackages > len(softwarePackages) {
			numPackages = len(softwarePackages)
		}
		hostSoftware := softwarePackages[:numPackages]
		_, err := ds.UpdateHostSoftware(ctx, host.ID, hostSoftware)
		require.NoError(t, err)
	}

	err := ds.SyncHostsSoftware(ctx, time.Now())
	require.NoError(t, err)

	// Create CVEs for software (distributed across 15 software IDs)
	// Each software gets many CVEs to simulate real-world vulnerability distribution
	cvesPerSoftware := numCVEs / 15
	t.Logf("  Creating %d CVEs per software package (15 packages)", cvesPerSoftware)

	for softwareID := uint(1); softwareID <= 15; softwareID++ {
		// Insert CVEs in batches for better performance
		batchSize := 1000
		for batchStart := 0; batchStart < cvesPerSoftware; batchStart += batchSize {
			batchEnd := batchStart + batchSize
			if batchEnd > cvesPerSoftware {
				batchEnd = cvesPerSoftware
			}

			for i := batchStart; i < batchEnd; i++ {
				cveNum := int(softwareID-1)*cvesPerSoftware + i
				// Use wider CVE number range to avoid duplicates
				cve := fmt.Sprintf("CVE-2023-%05d", cveNum)

				_, err := ds.InsertSoftwareVulnerability(ctx, fleet.SoftwareVulnerability{
					SoftwareID: softwareID,
					CVE:        cve,
				}, fleet.NVDSource)
				require.NoError(t, err)
			}
		}

		if softwareID%5 == 0 {
			t.Logf("  Progress: %d/%d software packages completed", softwareID, 15)
		}
	}
}

// createOSVulnerabilities creates OS vulnerabilities
func createOSVulnerabilities(t testing.TB, ds *Datastore, numCVEs int) {
	ctx := context.Background()

	// Create CVEs for each OS type
	// OS ID 1 = Windows, 2 = macOS, 3 = Ubuntu
	osIDs := []uint{1, 2, 3}
	cvesPerOS := numCVEs / len(osIDs)

	t.Logf("  Creating %d CVEs per OS type (3 OS types)", cvesPerOS)

	for _, osID := range osIDs {
		// Insert in batches to avoid memory issues with large slices
		batchSize := 5000
		totalBatches := (cvesPerOS + batchSize - 1) / batchSize

		for batchNum := 0; batchNum < totalBatches; batchNum++ {
			batchStart := batchNum * batchSize
			batchEnd := batchStart + batchSize
			if batchEnd > cvesPerOS {
				batchEnd = cvesPerOS
			}

			vulns := make([]fleet.OSVulnerability, batchEnd-batchStart)
			for i := 0; i < len(vulns); i++ {
				actualIndex := batchStart + i
				// Use different year to avoid overlap with software CVEs
				// (but still create some overlap)
				cveNum := int(osID-1)*cvesPerOS + actualIndex
				cve := fmt.Sprintf("CVE-2022-%05d", cveNum)

				// 10% overlap with software CVEs for realism
				if actualIndex%10 == 0 {
					cve = fmt.Sprintf("CVE-2023-%05d", cveNum)
				}

				vulns[i] = fleet.OSVulnerability{
					OSID: osID,
					CVE:  cve,
				}
			}

			_, err := ds.InsertOSVulnerabilities(ctx, vulns, fleet.MSRCSource)
			require.NoError(t, err)

			if (batchNum+1)%2 == 0 || batchNum == totalBatches-1 {
				t.Logf("  Progress: OS %d - batch %d/%d completed", osID, batchNum+1, totalBatches)
			}
		}
	}
}

// createCVEMetadata creates CVE metadata entries
func createCVEMetadata(t testing.TB, ds *Datastore, numCVEs int) {
	ctx := context.Background()
	mockTime := time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC)

	// Create metadata in batches of 500 for better performance
	batchSize := 500
	totalBatches := (numCVEs + batchSize - 1) / batchSize
	t.Logf("  Creating CVE metadata in %d batches", totalBatches)

	for start := 0; start < numCVEs; start += batchSize {
		end := start + batchSize
		if end > numCVEs {
			end = numCVEs
		}

		batch := make([]fleet.CVEMeta, end-start)
		for i := start; i < end; i++ {
			// Alternate between 2022 and 2023 CVEs to match our created vulnerabilities
			year := 2023
			if i >= 50000 {
				year = 2022
			}

			// Use 5-digit format to match our CVE creation
			cve := fmt.Sprintf("CVE-%d-%05d", year, i%100000)

			// 30% have CISA known exploit
			cisaExploit := (i % 10) < 3

			batch[i-start] = fleet.CVEMeta{
				CVE:              cve,
				CVSSScore:        ptr.Float64(5.0 + float64(i%50)/10.0),
				EPSSProbability:  ptr.Float64(float64(i%100) / 100.0),
				CISAKnownExploit: ptr.Bool(cisaExploit),
				Published:        ptr.Time(mockTime.Add(time.Duration(i) * time.Hour)),
				Description:      fmt.Sprintf("Test vulnerability %s", cve),
			}
		}

		err := ds.InsertCVEMeta(ctx, batch)
		require.NoError(t, err)

		// Report progress every 10 batches
		batchNum := (start / batchSize) + 1
		if batchNum%10 == 0 || batchNum == totalBatches {
			t.Logf("  Progress: %d/%d batches completed (%d CVEs)", batchNum, totalBatches, end)
		}
	}
}

// getHostIDs extracts host IDs from a slice of hosts
func getHostIDs(hosts []*fleet.Host) []uint {
	ids := make([]uint, len(hosts))
	for i, h := range hosts {
		ids[i] = h.ID
	}
	return ids
}

// CreateMySQLDSForBenchmark creates a datastore for benchmarking
func CreateMySQLDSForBenchmark(b *testing.B) *Datastore {
	return CreateMySQLDS(b)
}
```


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **Performance Improvements**
* Faster loading of the vulnerabilities list via optimized database
queries for the vulnerabilities API endpoint.

* **Bug Fixes**
* More accurate “created at” timestamps for vulnerabilities, improving
sorting and consistency.
* More consistent source attribution for vulnerabilities when multiple
sources are available.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-10-17 09:57:47 -05:00

1 line
148 B
Text

Optimized MySQL queries on /api/latest/fleet/vulnerabilities and /api/latest/fleet/software/versions to improve performance for Fleet UI use cases.