feat(test-utils): add CPU performance integration test harness (#24951)

This commit is contained in:
Sri Pasumarthi 2026-04-08 14:50:29 -07:00 committed by GitHub
parent 15f7b24312
commit c7b920717f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 1081 additions and 13 deletions

33
.github/workflows/perf-nightly.yml vendored Normal file
View file

@ -0,0 +1,33 @@
name: 'Performance Tests: Nightly'
on:
schedule:
- cron: '0 3 * * *' # Runs at 3 AM every day
workflow_dispatch: # Allow manual trigger
permissions:
contents: 'read'
jobs:
perf-test:
name: 'Run Performance Usage Tests'
runs-on: 'gemini-cli-ubuntu-16-core'
if: "github.repository == 'google-gemini/gemini-cli'"
steps:
- name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
- name: 'Set up Node.js'
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
with:
node-version-file: '.nvmrc'
cache: 'npm'
- name: 'Install dependencies'
run: 'npm ci'
- name: 'Build project'
run: 'npm run build'
- name: 'Run Performance Tests'
run: 'npm run test:perf'

1
.gitignore vendored
View file

@ -48,6 +48,7 @@ packages/cli/src/generated/
packages/core/src/generated/ packages/core/src/generated/
packages/devtools/src/_client-assets.ts packages/devtools/src/_client-assets.ts
.integration-tests/ .integration-tests/
.perf-tests/
packages/vscode-ide-companion/*.vsix packages/vscode-ide-companion/*.vsix
packages/cli/download-ripgrep*/ packages/cli/download-ripgrep*/

View file

@ -44,8 +44,13 @@ powerful tool for developers.
- **Test Commands:** - **Test Commands:**
- **Unit (All):** `npm run test` - **Unit (All):** `npm run test`
- **Integration (E2E):** `npm run test:e2e` - **Integration (E2E):** `npm run test:e2e`
- > **NOTE**: Please run the memory and perf tests locally **only if** you are
> implementing changes related to those test areas. Otherwise skip these
> tests locally and rely on CI to run them on nightly builds.
- **Memory (Nightly):** `npm run test:memory` (Runs memory regression tests - **Memory (Nightly):** `npm run test:memory` (Runs memory regression tests
against baselines. Excluded from `preflight`, run nightly.) against baselines. Excluded from `preflight`, run nightly.)
- **Performance (Nightly):** `npm run test:perf` (Runs CPU performance
regression tests against baselines. Excluded from `preflight`, run nightly.)
- **Workspace-Specific:** `npm test -w <pkg> -- <path>` (Note: `<path>` must - **Workspace-Specific:** `npm test -w <pkg> -- <path>` (Note: `<path>` must
be relative to the workspace root, e.g., be relative to the workspace root, e.g.,
`-w @google/gemini-cli-core -- src/routing/modelRouterService.test.ts`) `-w @google/gemini-cli-core -- src/routing/modelRouterService.test.ts`)

View file

@ -157,6 +157,48 @@ The harness (`MemoryTestHarness` in `packages/test-utils`):
- Compares against baselines with a 10% tolerance. - Compares against baselines with a 10% tolerance.
- Can analyze sustained leaks across 3 snapshots using `analyzeSnapshots()`. - Can analyze sustained leaks across 3 snapshots using `analyzeSnapshots()`.
## Performance regression tests
Performance regression tests are designed to detect wall-clock time, CPU usage,
and event loop delay regressions across key CLI scenarios. They are located in
the `perf-tests` directory.
These tests are distinct from standard integration tests because they measure
performance metrics and compare them against committed baselines.
### Running performance tests
Performance tests are not run as part of the default `npm run test` or
`npm run test:e2e` commands. They are run nightly in CI but can be run manually:
```bash
npm run test:perf
```
### Updating baselines
If you intentionally change behavior that affects performance, you may need to
update the baselines. Set the `UPDATE_PERF_BASELINES` environment variable to
`true`:
```bash
UPDATE_PERF_BASELINES=true npm run test:perf
```
This will run the tests multiple times (with warmup), apply IQR outlier
filtering, and overwrite `perf-tests/baselines.json`. You should review the
changes and commit the updated baseline file.
### How it works
The harness (`PerfTestHarness` in `packages/test-utils`):
- Measures wall-clock time using `performance.now()`.
- Measures CPU usage using `process.cpuUsage()`.
- Monitors event loop delay using `perf_hooks.monitorEventLoopDelay()`.
- Applies IQR (Interquartile Range) filtering to remove outlier samples.
- Compares against baselines with a 15% tolerance.
## Diagnostics ## Diagnostics
The integration test runner provides several options for diagnostics to help The integration test runner provides several options for diagnostics to help

View file

@ -14,6 +14,7 @@ import { join, dirname, extname } from 'node:path';
import { fileURLToPath } from 'node:url'; import { fileURLToPath } from 'node:url';
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js'; import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
import { disableMouseTracking } from '@google/gemini-cli-core'; import { disableMouseTracking } from '@google/gemini-cli-core';
import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js';
import { createServer, type Server } from 'node:http'; import { createServer, type Server } from 'node:http';
const __dirname = dirname(fileURLToPath(import.meta.url)); const __dirname = dirname(fileURLToPath(import.meta.url));
@ -88,15 +89,8 @@ export async function setup() {
runDir = join(integrationTestsDir, `${Date.now()}`); runDir = join(integrationTestsDir, `${Date.now()}`);
await mkdir(runDir, { recursive: true }); await mkdir(runDir, { recursive: true });
// Set the home directory to the test run directory to avoid conflicts // Isolate environment variables
// with the user's local config. isolateTestEnv(runDir);
process.env['HOME'] = runDir;
if (process.platform === 'win32') {
process.env['USERPROFILE'] = runDir;
}
// We also need to set the config dir explicitly, since the code might
// construct the path before the HOME env var is set.
process.env['GEMINI_CONFIG_DIR'] = join(runDir, '.gemini');
// Download ripgrep to avoid race conditions in parallel tests // Download ripgrep to avoid race conditions in parallel tests
const available = await canUseRipgrep(); const available = await canUseRipgrep();
@ -127,10 +121,6 @@ export async function setup() {
} }
process.env['INTEGRATION_TEST_FILE_DIR'] = runDir; process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
// Force file storage to avoid keychain prompts/hangs in CI, especially on macOS
process.env['GEMINI_FORCE_FILE_STORAGE'] = 'true';
process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
if (process.env['KEEP_OUTPUT']) { if (process.env['KEEP_OUTPUT']) {
console.log(`Keeping output for test run in: ${runDir}`); console.log(`Keeping output for test run in: ${runDir}`);

1
package-lock.json generated
View file

@ -36,6 +36,7 @@
"@types/ws": "^8.18.1", "@types/ws": "^8.18.1",
"@vitest/coverage-v8": "^3.1.1", "@vitest/coverage-v8": "^3.1.1",
"@vitest/eslint-plugin": "^1.3.4", "@vitest/eslint-plugin": "^1.3.4",
"asciichart": "^1.5.25",
"cross-env": "^7.0.3", "cross-env": "^7.0.3",
"depcheck": "^1.4.7", "depcheck": "^1.4.7",
"domexception": "^4.0.0", "domexception": "^4.0.0",

View file

@ -53,6 +53,8 @@
"test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests", "test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests",
"test:memory": "vitest run --root ./memory-tests", "test:memory": "vitest run --root ./memory-tests",
"test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests", "test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests",
"test:perf": "vitest run --root ./perf-tests",
"test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests",
"test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests", "test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests",
"test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests", "test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests",
"lint": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" eslint . --cache --max-warnings 0", "lint": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" eslint . --cache --max-warnings 0",
@ -105,6 +107,7 @@
"@types/ws": "^8.18.1", "@types/ws": "^8.18.1",
"@vitest/coverage-v8": "^3.1.1", "@vitest/coverage-v8": "^3.1.1",
"@vitest/eslint-plugin": "^1.3.4", "@vitest/eslint-plugin": "^1.3.4",
"asciichart": "^1.5.25",
"cross-env": "^7.0.3", "cross-env": "^7.0.3",
"depcheck": "^1.4.7", "depcheck": "^1.4.7",
"domexception": "^4.0.0", "domexception": "^4.0.0",

View file

@ -0,0 +1,35 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { join } from 'node:path';
/**
* Isolate the test environment by setting environment variables
* to point to a temporary run directory.
*
* @param runDir - The temporary directory for this test run.
*/
export function isolateTestEnv(runDir: string): void {
  // Redirect the OS home directory into the per-run scratch directory so
  // tests never touch the developer's real configuration.
  process.env['HOME'] = runDir;
  if (process.platform === 'win32') {
    process.env['USERPROFILE'] = runDir;
  }

  // The config dir is also set explicitly because some code paths may have
  // constructed the path before HOME was overridden above.
  process.env['GEMINI_CONFIG_DIR'] = join(runDir, '.gemini');

  // Force file storage to avoid keychain prompts/hangs in CI, especially on macOS
  process.env['GEMINI_FORCE_FILE_STORAGE'] = 'true';

  // Mark as integration test
  process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';

  // Isolate telemetry log
  process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
}

View file

@ -8,6 +8,8 @@ export * from './file-system-test-helpers.js';
export * from './fixtures/agents.js'; export * from './fixtures/agents.js';
export * from './memory-baselines.js'; export * from './memory-baselines.js';
export * from './memory-test-harness.js'; export * from './memory-test-harness.js';
export * from './perf-test-harness.js';
export * from './mock-utils.js'; export * from './mock-utils.js';
export * from './test-mcp-server.js'; export * from './test-mcp-server.js';
export * from './test-rig.js'; export * from './test-rig.js';
export * from './env-setup.js';

View file

@ -0,0 +1,546 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { performance } from 'node:perf_hooks';
import { setTimeout as sleep } from 'node:timers/promises';
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
/** Configuration for asciichart plot function (the subset this module uses). */
interface PlotConfig {
  /** Chart height in rows. */
  height?: number;
  /** Formatter applied to axis labels. */
  format?: (x: number) => string;
}

/** Type for the asciichart plot function. */
type PlotFn = (series: number[], config?: PlotConfig) => string;

/**
 * Baseline entry for a single performance test scenario.
 */
export interface PerfBaseline {
  // Median wall-clock duration for the scenario, in milliseconds.
  wallClockMs: number;
  // Median total (user + system) CPU time, in microseconds.
  cpuTotalUs: number;
  // Median p99 event loop delay, in milliseconds.
  eventLoopDelayP99Ms: number;
  // ISO-8601 timestamp of when this baseline entry was recorded.
  timestamp: string;
}

/**
 * Top-level structure of the perf baselines JSON file.
 */
export interface PerfBaselineFile {
  // Schema version of the baselines file.
  version: number;
  // ISO-8601 timestamp of the last write.
  updatedAt: string;
  // Baselines keyed by scenario name.
  scenarios: Record<string, PerfBaseline>;
}

/**
 * A single performance snapshot at a point in time.
 */
export interface PerfSnapshot {
  // Epoch milliseconds when the snapshot was taken.
  timestamp: number;
  // Human-readable label (e.g. "sample-0", "warmup-0", "median").
  label: string;
  // Measured wall-clock duration, in milliseconds.
  wallClockMs: number;
  // User-mode CPU time delta, in microseconds.
  cpuUserUs: number;
  // Kernel-mode CPU time delta, in microseconds.
  cpuSystemUs: number;
  // user + system CPU time, in microseconds.
  cpuTotalUs: number;
  // Event loop delay percentiles/max in milliseconds; zero when the
  // snapshot was taken without event loop monitoring.
  eventLoopDelayP50Ms: number;
  eventLoopDelayP95Ms: number;
  eventLoopDelayP99Ms: number;
  eventLoopDelayMaxMs: number;
}

/**
 * Result from running a performance test scenario.
 */
export interface PerfTestResult {
  // Scenario name; matches the key in the baselines file.
  scenarioName: string;
  // All non-warmup samples, in run order.
  samples: PerfSnapshot[];
  // Samples remaining after IQR outlier filtering.
  filteredSamples: PerfSnapshot[];
  // Median (by wall-clock time) of the filtered samples.
  median: PerfSnapshot;
  // Matching baseline, if one exists.
  baseline: PerfBaseline | undefined;
  // True when no baseline exists or the wall-clock delta is within tolerance.
  withinTolerance: boolean;
  // Percent wall-clock change vs. baseline (0 when no baseline).
  deltaPercent: number;
  // Percent CPU-time change vs. baseline (0 when no baseline).
  cpuDeltaPercent: number;
}

/**
 * Options for the PerfTestHarness.
 */
export interface PerfTestHarnessOptions {
  /** Path to the baselines JSON file */
  baselinesPath: string;
  /** Default tolerance percentage (0-100). Default: 15 */
  defaultTolerancePercent?: number;
  /** Default CPU tolerance percentage (0-100). Optional */
  defaultCpuTolerancePercent?: number;
  /** Number of samples per scenario. Default: 5 */
  sampleCount?: number;
  /** Number of warmup runs to discard. Default: 1 */
  warmupCount?: number;
  /** Pause in ms between samples. Default: 100 */
  samplePauseMs?: number;
}

/**
 * Active timer state tracked internally by startTimer()/stopTimer().
 */
interface ActiveTimer {
  // Caller-supplied timer label.
  label: string;
  // performance.now() value at start.
  startTime: number;
  // process.cpuUsage() reading at start; used as the delta origin.
  startCpuUsage: NodeJS.CpuUsage;
}
/**
* PerfTestHarness provides infrastructure for running CPU performance tests.
*
* It handles:
* - High-resolution wall-clock timing via performance.now()
* - CPU usage measurement via process.cpuUsage()
* - Event loop delay monitoring via perf_hooks.monitorEventLoopDelay()
* - IQR outlier filtering for noise reduction
* - Warmup runs to avoid JIT compilation noise
* - Comparing against baselines with configurable tolerance
* - Generating ASCII chart reports
*/
/**
 * PerfTestHarness provides infrastructure for running CPU performance tests.
 *
 * It handles:
 * - High-resolution wall-clock timing via performance.now()
 * - CPU usage measurement via process.cpuUsage()
 * - Event loop delay monitoring via perf_hooks.monitorEventLoopDelay()
 * - IQR outlier filtering for noise reduction
 * - Warmup runs to avoid JIT compilation noise
 * - Comparing against baselines with configurable tolerance
 * - Generating ASCII chart reports
 */
export class PerfTestHarness {
  private baselines: PerfBaselineFile;
  private readonly baselinesPath: string;
  private readonly defaultTolerancePercent: number;
  private readonly defaultCpuTolerancePercent?: number;
  private readonly sampleCount: number;
  private readonly warmupCount: number;
  private readonly samplePauseMs: number;
  // Accumulates every scenario result so generateReport() can summarize them.
  private allResults: PerfTestResult[] = [];
  // In-flight timers keyed by label; startTimer() inserts, stopTimer() removes.
  private activeTimers: Map<string, ActiveTimer> = new Map();

  constructor(options: PerfTestHarnessOptions) {
    this.baselinesPath = options.baselinesPath;
    this.defaultTolerancePercent = options.defaultTolerancePercent ?? 15;
    this.defaultCpuTolerancePercent = options.defaultCpuTolerancePercent;
    this.sampleCount = options.sampleCount ?? 5;
    this.warmupCount = options.warmupCount ?? 1;
    this.samplePauseMs = options.samplePauseMs ?? 100;
    this.baselines = loadPerfBaselines(this.baselinesPath);
  }

  /**
   * Start a high-resolution timer with CPU tracking.
   *
   * @param label - Key used to pair this start with a later stopTimer(label).
   */
  startTimer(label: string): void {
    this.activeTimers.set(label, {
      label,
      startTime: performance.now(),
      startCpuUsage: process.cpuUsage(),
    });
  }

  /**
   * Stop a timer and return the snapshot.
   *
   * Event loop delay fields are zeroed here; use measureWithEventLoop()
   * when those metrics are needed.
   *
   * @throws Error if no timer with the given label is active.
   */
  stopTimer(label: string): PerfSnapshot {
    const timer = this.activeTimers.get(label);
    if (!timer) {
      throw new Error(`No active timer found for label "${label}"`);
    }
    const wallClockMs = performance.now() - timer.startTime;
    // process.cpuUsage(prev) returns the delta since the `prev` reading.
    const cpuDelta = process.cpuUsage(timer.startCpuUsage);
    this.activeTimers.delete(label);
    return {
      timestamp: Date.now(),
      label,
      wallClockMs,
      cpuUserUs: cpuDelta.user,
      cpuSystemUs: cpuDelta.system,
      cpuTotalUs: cpuDelta.user + cpuDelta.system,
      eventLoopDelayP50Ms: 0,
      eventLoopDelayP95Ms: 0,
      eventLoopDelayP99Ms: 0,
      eventLoopDelayMaxMs: 0,
    };
  }

  /**
   * Measure a function's wall-clock time and CPU usage.
   * Returns the snapshot with timing data.
   */
  async measure(label: string, fn: () => Promise<void>): Promise<PerfSnapshot> {
    this.startTimer(label);
    await fn();
    return this.stopTimer(label);
  }

  /**
   * Measure a function with event loop delay monitoring.
   * Uses perf_hooks.monitorEventLoopDelay() for histogram data.
   */
  async measureWithEventLoop(
    label: string,
    fn: () => Promise<void>,
  ): Promise<PerfSnapshot> {
    // monitorEventLoopDelay is available in Node.js 12+
    const { monitorEventLoopDelay } = await import('node:perf_hooks');
    const histogram = monitorEventLoopDelay({ resolution: 10 });
    histogram.enable();
    this.startTimer(label);
    await fn();
    const snapshot = this.stopTimer(label);
    histogram.disable();
    // Convert from nanoseconds to milliseconds
    snapshot.eventLoopDelayP50Ms = histogram.percentile(50) / 1e6;
    snapshot.eventLoopDelayP95Ms = histogram.percentile(95) / 1e6;
    snapshot.eventLoopDelayP99Ms = histogram.percentile(99) / 1e6;
    snapshot.eventLoopDelayMaxMs = histogram.max / 1e6;
    return snapshot;
  }

  /**
   * Run a scenario multiple times with warmup, outlier filtering, and baseline comparison.
   *
   * @param name - Scenario name (must match baseline key)
   * @param fn - Async function that executes one sample of the scenario.
   *             Must return a PerfSnapshot with measured values.
   * @param tolerancePercent - Override default tolerance for this scenario
   */
  async runScenario(
    name: string,
    fn: () => Promise<PerfSnapshot>,
    tolerancePercent?: number,
  ): Promise<PerfTestResult> {
    const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
    const totalRuns = this.warmupCount + this.sampleCount;
    const allSnapshots: PerfSnapshot[] = [];
    for (let i = 0; i < totalRuns; i++) {
      const isWarmup = i < this.warmupCount;
      const snapshot = await fn();
      snapshot.label = isWarmup
        ? `warmup-${i}`
        : `sample-${i - this.warmupCount}`;
      // Warmup runs are labeled but discarded to mitigate JIT noise.
      if (!isWarmup) {
        allSnapshots.push(snapshot);
      }
      // Brief pause between samples
      await sleep(this.samplePauseMs);
    }

    // Apply IQR outlier filtering on wall-clock time
    const filteredSnapshots = this.filterOutliers(allSnapshots, 'wallClockMs');

    // Get median of filtered samples
    const median = this.getMedianSnapshot(filteredSnapshots);
    median.label = 'median';

    // Get baseline
    const baseline = this.baselines.scenarios[name];

    // Determine if within tolerance. Note: only the wall-clock delta gates
    // withinTolerance here; CPU tolerance is enforced separately by
    // assertWithinBaseline() when a CPU tolerance is configured.
    let deltaPercent = 0;
    let cpuDeltaPercent = 0;
    let withinTolerance = true;
    if (baseline) {
      deltaPercent =
        ((median.wallClockMs - baseline.wallClockMs) / baseline.wallClockMs) *
        100;
      cpuDeltaPercent =
        ((median.cpuTotalUs - baseline.cpuTotalUs) / baseline.cpuTotalUs) * 100;
      withinTolerance = deltaPercent <= tolerance;
    }

    const result: PerfTestResult = {
      scenarioName: name,
      samples: allSnapshots,
      filteredSamples: filteredSnapshots,
      median,
      baseline,
      withinTolerance,
      deltaPercent,
      cpuDeltaPercent,
    };
    this.allResults.push(result);
    return result;
  }

  /**
   * Assert that a scenario result is within the baseline tolerance.
   *
   * Missing baselines only warn (so new scenarios do not fail CI before a
   * baseline is committed); exceeding the wall-clock or CPU tolerance throws.
   *
   * @throws Error on a wall-clock or CPU regression beyond tolerance.
   */
  assertWithinBaseline(
    result: PerfTestResult,
    tolerancePercent?: number,
    cpuTolerancePercent?: number,
  ): void {
    const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
    const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent;
    if (!result.baseline) {
      console.warn(
        `⚠ No baseline found for "${result.scenarioName}". ` +
          `Run with UPDATE_PERF_BASELINES=true to create one. ` +
          `Measured: ${result.median.wallClockMs.toFixed(1)} ms wall-clock.`,
      );
      return;
    }
    const deltaPercent =
      ((result.median.wallClockMs - result.baseline.wallClockMs) /
        result.baseline.wallClockMs) *
      100;
    if (deltaPercent > tolerance) {
      throw new Error(
        `Performance regression detected for "${result.scenarioName}"!\n` +
          ` Measured: ${result.median.wallClockMs.toFixed(1)} ms wall-clock\n` +
          ` Baseline: ${result.baseline.wallClockMs.toFixed(1)} ms wall-clock\n` +
          ` Delta: ${deltaPercent.toFixed(1)}% (tolerance: ${tolerance}%)\n` +
          ` CPU total: ${formatUs(result.median.cpuTotalUs)}\n` +
          ` EL p99: ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms\n` +
          ` Samples: ${result.samples.length} (${result.filteredSamples.length} after IQR filter)`,
      );
    }
    if (cpuTolerance !== undefined && result.cpuDeltaPercent > cpuTolerance) {
      throw new Error(
        `CPU usage regression detected for "${result.scenarioName}"!\n` +
          ` Measured: ${formatUs(result.median.cpuTotalUs)}\n` +
          ` Baseline: ${formatUs(result.baseline.cpuTotalUs)}\n` +
          ` Delta: ${result.cpuDeltaPercent.toFixed(1)}% (tolerance: ${cpuTolerance}%)\n` +
          ` Wall-clock: ${result.median.wallClockMs.toFixed(1)} ms\n` +
          ` EL p99: ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms`,
      );
    }
  }

  /**
   * Update the baseline for a scenario with the current measured values.
   */
  updateScenarioBaseline(result: PerfTestResult): void {
    updatePerfBaseline(this.baselinesPath, result.scenarioName, {
      wallClockMs: result.median.wallClockMs,
      cpuTotalUs: result.median.cpuTotalUs,
      eventLoopDelayP99Ms: result.median.eventLoopDelayP99Ms,
    });
    // Reload baselines after update
    this.baselines = loadPerfBaselines(this.baselinesPath);
    console.log(
      `Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms`,
    );
  }

  /**
   * Generate an ASCII report with summary table and charts.
   *
   * @param results - Results to report on; defaults to every scenario run
   *                  through this harness instance.
   * @returns The report text (also written to the console).
   */
  async generateReport(results?: PerfTestResult[]): Promise<string> {
    const resultsToReport = results ?? this.allResults;
    const lines: string[] = [];
    lines.push('');
    lines.push('═══════════════════════════════════════════════════');
    lines.push(' PERFORMANCE TEST REPORT');
    lines.push('═══════════════════════════════════════════════════');
    lines.push('');
    for (const result of resultsToReport) {
      const measured = `${result.median.wallClockMs.toFixed(1)} ms`;
      const baseline = result.baseline
        ? `${result.baseline.wallClockMs.toFixed(1)} ms`
        : 'N/A';
      const delta = result.baseline
        ? `${result.deltaPercent >= 0 ? '+' : ''}${result.deltaPercent.toFixed(1)}%`
        : 'N/A';
      const status = !result.baseline
        ? 'NEW'
        : result.withinTolerance
          ? '✅'
          : '❌';
      lines.push(
        `${result.scenarioName}: ${measured} (Baseline: ${baseline}, Delta: ${delta}) ${status}`,
      );
      // Show CPU breakdown
      const cpuMs = `${(result.median.cpuTotalUs / 1000).toFixed(1)} ms`;
      lines.push(
        ` CPU: ${cpuMs} (user: ${formatUs(result.median.cpuUserUs)}, system: ${formatUs(result.median.cpuSystemUs)})`,
      );
      if (result.median.eventLoopDelayP99Ms > 0) {
        lines.push(
          ` Event loop: p50=${result.median.eventLoopDelayP50Ms.toFixed(1)}ms p95=${result.median.eventLoopDelayP95Ms.toFixed(1)}ms p99=${result.median.eventLoopDelayP99Ms.toFixed(1)}ms max=${result.median.eventLoopDelayMaxMs.toFixed(1)}ms`,
        );
      }
      // BUG FIX: the total and filtered sample counts were previously
      // concatenated with no separator ("Samples: 53 after IQR filter");
      // format now matches assertWithinBaseline's "N (M after IQR filter)".
      lines.push(
        ` Samples: ${result.samples.length} (${result.filteredSamples.length} after IQR filter)`,
      );
    }
    lines.push('');
    // Generate ASCII chart for wall-clock per scenario
    try {
      // @ts-expect-error - asciichart may not have types
      const asciichart = (await import('asciichart')) as {
        default?: { plot?: PlotFn };
        plot?: PlotFn;
      };
      const plot: PlotFn | undefined =
        asciichart.default?.plot ?? asciichart.plot;
      for (const result of resultsToReport) {
        if (result.filteredSamples.length > 2) {
          lines.push(`📈 Wall-clock trend: ${result.scenarioName}`);
          lines.push('─'.repeat(60));
          const wallClockData = result.filteredSamples.map(
            (s) => s.wallClockMs,
          );
          if (plot) {
            const chart = plot(wallClockData, {
              height: 8,
              format: (x: number) => `${x.toFixed(0)} ms`.padStart(10),
            });
            lines.push(chart);
          }
          const labels = result.filteredSamples.map((s) => s.label);
          lines.push(' ' + labels.join(' → '));
          lines.push('');
        }
      }
    } catch {
      lines.push(
        '(asciichart not available — install with: npm install --save-dev asciichart)',
      );
      lines.push('');
    }
    lines.push('═══════════════════════════════════════════════════');
    lines.push('');
    const report = lines.join('\n');
    console.log(report);
    return report;
  }

  /**
   * Filter outliers using the Interquartile Range (IQR) method.
   * Removes samples where the given metric falls outside Q1 - 1.5*IQR or Q3 + 1.5*IQR.
   */
  private filterOutliers(
    snapshots: PerfSnapshot[],
    metric: keyof PerfSnapshot,
  ): PerfSnapshot[] {
    if (snapshots.length < 4) {
      // Not enough data for meaningful IQR filtering
      return [...snapshots];
    }
    const sorted = [...snapshots].sort(
      (a, b) => (a[metric] as number) - (b[metric] as number),
    );
    const q1Idx = Math.floor(sorted.length * 0.25);
    const q3Idx = Math.floor(sorted.length * 0.75);
    const q1 = sorted[q1Idx]![metric] as number;
    const q3 = sorted[q3Idx]![metric] as number;
    const iqr = q3 - q1;
    const lowerBound = q1 - 1.5 * iqr;
    const upperBound = q3 + 1.5 * iqr;
    // Preserve original run order; only drop out-of-bounds samples.
    return snapshots.filter((s) => {
      const val = s[metric] as number;
      return val >= lowerBound && val <= upperBound;
    });
  }

  /**
   * Get the median snapshot by wall-clock time from a sorted list.
   *
   * @throws Error when the snapshot list is empty.
   */
  private getMedianSnapshot(snapshots: PerfSnapshot[]): PerfSnapshot {
    if (snapshots.length === 0) {
      throw new Error('Cannot compute median of empty snapshot list');
    }
    const sorted = [...snapshots].sort((a, b) => a.wallClockMs - b.wallClockMs);
    const medianIdx = Math.floor(sorted.length / 2);
    // Return a copy so callers can relabel without mutating the sample.
    return { ...sorted[medianIdx]! };
  }
}
// ─── Baseline management ─────────────────────────────────────────────
/**
* Load perf baselines from a JSON file.
*/
/**
 * Load perf baselines from a JSON file.
 *
 * Returns an empty version-1 structure when the file does not exist yet,
 * so first runs (and UPDATE_PERF_BASELINES runs) start from a clean slate.
 */
export function loadPerfBaselines(path: string): PerfBaselineFile {
  if (existsSync(path)) {
    const content = readFileSync(path, 'utf-8');
    return JSON.parse(content) as PerfBaselineFile;
  }
  return {
    version: 1,
    updatedAt: new Date().toISOString(),
    scenarios: {},
  };
}
/**
* Save perf baselines to a JSON file.
*/
/**
 * Save perf baselines to a JSON file.
 *
 * Note: intentionally stamps `updatedAt` on the caller's object before
 * serializing, so the file always records when it was last regenerated.
 */
export function savePerfBaselines(
  path: string,
  baselines: PerfBaselineFile,
): void {
  baselines.updatedAt = new Date().toISOString();
  const serialized = `${JSON.stringify(baselines, null, 2)}\n`;
  writeFileSync(path, serialized);
}
/**
* Update (or create) a single scenario baseline in the file.
*/
/**
 * Update (or create) a single scenario baseline in the file.
 *
 * Reads the current file, replaces the entry for `scenarioName` with the
 * measured values (stamped with the current time), and writes it back.
 */
export function updatePerfBaseline(
  path: string,
  scenarioName: string,
  measured: {
    wallClockMs: number;
    cpuTotalUs: number;
    eventLoopDelayP99Ms: number;
  },
): void {
  const file = loadPerfBaselines(path);
  const { wallClockMs, cpuTotalUs, eventLoopDelayP99Ms } = measured;
  file.scenarios[scenarioName] = {
    wallClockMs,
    cpuTotalUs,
    eventLoopDelayP99Ms,
    timestamp: new Date().toISOString(),
  };
  savePerfBaselines(path, file);
}
// ─── Helpers ─────────────────────────────────────────────────────────
/**
* Format microseconds as a human-readable string.
*/
/**
 * Format microseconds as a human-readable string.
 *
 * Values above 1s render in seconds, above 1ms in milliseconds,
 * otherwise raw microseconds.
 */
function formatUs(us: number): string {
  if (us > 1_000_000) {
    return `${(us / 1_000_000).toFixed(2)} s`;
  }
  return us > 1_000 ? `${(us / 1_000).toFixed(1)} ms` : `${us} μs`;
}

121
perf-tests/README.md Normal file
View file

@ -0,0 +1,121 @@
# CPU Performance Integration Test Harness
## Overview
This directory contains performance/CPU integration tests for the Gemini CLI.
These tests measure wall-clock time, CPU usage, and event loop responsiveness to
detect regressions across key scenarios.
CPU performance is inherently noisy, especially in CI. The harness addresses
this with:
- **IQR outlier filtering** — discards anomalous samples
- **Median sampling** — takes N runs, reports the median after filtering
- **Warmup runs** — discards the first run to mitigate JIT compilation noise
- **15% default tolerance** — won't panic at slight regressions
## Running
```bash
# Run tests (compare against committed baselines)
npm run test:perf
# Update baselines (after intentional changes)
npm run test:perf:update-baselines
# Verbose output
VERBOSE=true npm run test:perf
# Keep test artifacts for debugging
KEEP_OUTPUT=true npm run test:perf
```
## How It Works
### Measurement Primitives
The `PerfTestHarness` class (in `packages/test-utils`) provides:
- **`performance.now()`** — high-resolution wall-clock timing
- **`process.cpuUsage()`** — user + system CPU microseconds (delta between
start/stop)
- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram
(p50/p95/p99/max)
### Noise Reduction
1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts
2. **Multiple samples**: Each scenario runs N times (default 5)
3. **IQR filtering**: Samples outside Q1−1.5×IQR and Q3+1.5×IQR are discarded
4. **Median**: The median of remaining samples is used for comparison
### Baseline Management
Baselines are stored in `baselines.json` in this directory. Each scenario has:
```json
{
"cold-startup-time": {
"wallClockMs": 1234.5,
"cpuTotalUs": 567890,
"eventLoopDelayP99Ms": 12.3,
"timestamp": "2026-04-08T..."
}
}
```
Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance).
To recalibrate after intentional changes:
```bash
npm run test:perf:update-baselines
# then commit baselines.json
```
### Report Output
After all tests, the harness prints an ASCII summary:
```
═══════════════════════════════════════════════════
PERFORMANCE TEST REPORT
═══════════════════════════════════════════════════
cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅
idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅
skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅
```
## Architecture
```
perf-tests/
├── README.md ← you are here
├── baselines.json ← committed baseline values
├── globalSetup.ts ← test environment setup
├── perf-usage.test.ts ← test scenarios
├── perf.*.responses ← fake API responses per scenario
├── tsconfig.json ← TypeScript config
└── vitest.config.ts ← vitest config (serial, isolated)
packages/test-utils/src/
├── perf-test-harness.ts ← PerfTestHarness class
└── index.ts ← re-exports
```
## CI Integration
These tests are **excluded from `preflight`** and designed for nightly CI:
```yaml
- name: Performance regression tests
run: npm run test:perf
```
## Adding a New Scenario
1. Add a fake response file: `perf.<scenario-name>.responses`
2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()`
3. Run `npm run test:perf:update-baselines` to establish initial baseline
4. Commit the updated `baselines.json`

24
perf-tests/baselines.json Normal file
View file

@ -0,0 +1,24 @@
{
"version": 1,
"updatedAt": "2026-04-08T18:51:29.839Z",
"scenarios": {
"cold-startup-time": {
"wallClockMs": 1333.4230420000004,
"cpuTotalUs": 1711,
"eventLoopDelayP99Ms": 0,
"timestamp": "2026-04-08T18:50:58.124Z"
},
"idle-cpu-usage": {
"wallClockMs": 5001.926125,
"cpuTotalUs": 128518,
"eventLoopDelayP99Ms": 12.705791,
"timestamp": "2026-04-08T18:51:23.938Z"
},
"skill-loading-time": {
"wallClockMs": 1372.4463749999995,
"cpuTotalUs": 1550,
"eventLoopDelayP99Ms": 0,
"timestamp": "2026-04-08T18:51:29.839Z"
}
}
}

67
perf-tests/globalSetup.ts Normal file
View file

@ -0,0 +1,67 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { mkdir, readdir, rm } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const rootDir = join(__dirname, '..');
// Git-ignored scratch area; each perf run gets its own timestamped subdirectory.
const perfTestsDir = join(rootDir, '.perf-tests');
// Number of most-recent run directories preserved for debugging.
const KEEP_RUNS_COUNT = 5;
// Set by setup(); removed by teardown() unless KEEP_OUTPUT=true.
let runDir = '';
/**
 * Vitest global setup for the perf test suite: creates an isolated,
 * timestamped run directory, redirects environment variables into it,
 * ensures ripgrep is available, and prunes stale run directories.
 */
export async function setup() {
  runDir = join(perfTestsDir, `${Date.now()}`);
  await mkdir(runDir, { recursive: true });

  // Isolate environment variables
  isolateTestEnv(runDir);

  // Download ripgrep up front so parallel tests don't race on it.
  if (!(await canUseRipgrep())) {
    throw new Error('Failed to download ripgrep binary');
  }

  // Clean up old test runs, keeping the latest few for debugging.
  // Timestamped names sort lexicographically in creation order.
  try {
    const runs = await readdir(perfTestsDir);
    if (runs.length > KEEP_RUNS_COUNT) {
      const stale = runs.sort().slice(0, runs.length - KEEP_RUNS_COUNT);
      await Promise.all(
        stale.map((name) =>
          rm(join(perfTestsDir, name), { recursive: true, force: true }),
        ),
      );
    }
  } catch (e) {
    // Best-effort cleanup; a failure here must not abort the test run.
    console.error('Error cleaning up old perf test runs:', e);
  }

  process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
  process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';
  console.log(`\nPerf test output directory: ${runDir}`);
}
/**
 * Vitest global teardown: removes the run directory created by setup(),
 * unless KEEP_OUTPUT=true or setup() never ran.
 */
export async function teardown() {
  const keep = process.env['KEEP_OUTPUT'] === 'true';
  if (keep || !runDir) {
    return;
  }
  try {
    await rm(runDir, { recursive: true, force: true });
  } catch (e) {
    // Non-fatal: leftover scratch directories are cleaned by the next setup().
    console.warn('Failed to clean up perf test directory:', e);
  }
}

View file

@ -0,0 +1,153 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, beforeAll, afterAll } from 'vitest';
import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
const __dirname = dirname(fileURLToPath(import.meta.url));
// Committed baseline values that each scenario is compared against.
const BASELINES_PATH = join(__dirname, 'baselines.json');
// When true, measured values overwrite baselines.json instead of being asserted.
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
// Allowed wall-clock regression (percent) before a scenario fails.
const TOLERANCE_PERCENT = 15;
// Use fewer samples locally for faster iteration, more in CI
const SAMPLE_COUNT = process.env['CI'] ? 5 : 3;
// Warmup runs discarded before sampling to mitigate JIT noise.
const WARMUP_COUNT = 1;
describe('CPU Performance Tests', () => {
  let harness: PerfTestHarness;

  beforeAll(() => {
    harness = new PerfTestHarness({
      baselinesPath: BASELINES_PATH,
      defaultTolerancePercent: TOLERANCE_PERCENT,
      sampleCount: SAMPLE_COUNT,
      warmupCount: WARMUP_COUNT,
    });
  });

  afterAll(async () => {
    // Generate the summary report after all tests
    await harness.generateReport();
  });

  /**
   * Records the scenario result as the new baseline when
   * UPDATE_PERF_BASELINES=true; otherwise asserts the result stays within
   * tolerance of the committed baseline. Extracted so each scenario shares
   * one finalization path instead of repeating the branch.
   */
  const finalizeScenario = (
    result: Awaited<ReturnType<PerfTestHarness['runScenario']>>,
  ) => {
    if (UPDATE_BASELINES) {
      harness.updateScenarioBaseline(result);
    } else {
      harness.assertWithinBaseline(result);
    }
  };

  it('cold-startup-time: startup completes within baseline', async () => {
    const result = await harness.runScenario('cold-startup-time', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-cold-startup', {
          fakeResponsesPath: join(__dirname, 'perf.cold-startup.responses'),
        });
        return await harness.measure('cold-startup', async () => {
          await rig.run({
            args: ['hello'],
            timeout: 120000,
            env: { GEMINI_API_KEY: 'fake-perf-test-key' },
          });
        });
      } finally {
        await rig.cleanup();
      }
    });
    finalizeScenario(result);
  });

  it('idle-cpu-usage: CPU stays low when idle', async () => {
    const IDLE_OBSERVATION_MS = 5000;
    const result = await harness.runScenario('idle-cpu-usage', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-idle-cpu', {
          fakeResponsesPath: join(__dirname, 'perf.idle-cpu.responses'),
        });
        // First, run a prompt to get the CLI into idle state
        await rig.run({
          args: ['hello'],
          timeout: 120000,
          env: { GEMINI_API_KEY: 'fake-perf-test-key' },
        });
        // Now measure CPU during idle period in the test process
        return await harness.measureWithEventLoop('idle-cpu', async () => {
          // Simulate idle period — just wait
          const { setTimeout: sleep } = await import('node:timers/promises');
          await sleep(IDLE_OBSERVATION_MS);
        });
      } finally {
        await rig.cleanup();
      }
    });
    finalizeScenario(result);
  });

  it('skill-loading-time: startup with many skills within baseline', async () => {
    const SKILL_COUNT = 20;
    const result = await harness.runScenario('skill-loading-time', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-skill-loading', {
          fakeResponsesPath: join(__dirname, 'perf.skill-loading.responses'),
        });
        // Create many skill directories with SKILL.md files
        for (let i = 0; i < SKILL_COUNT; i++) {
          const skillDir = `.gemini/skills/perf-skill-${i}`;
          rig.mkdir(skillDir);
          rig.createFile(
            `${skillDir}/SKILL.md`,
            [
              '---',
              `name: perf-skill-${i}`,
              `description: Performance test skill number ${i}`,
              `activation: manual`,
              '---',
              '',
              `# Performance Test Skill ${i}`,
              '',
              `This is a test skill for measuring skill loading performance.`,
              `It contains some content to simulate real-world skill files.`,
              '',
              `## Usage`,
              '',
              `Use this skill by activating it with @perf-skill-${i}.`,
            ].join('\n'),
          );
        }
        return await harness.measure('skill-loading', async () => {
          await rig.run({
            args: ['hello'],
            timeout: 120000,
            env: { GEMINI_API_KEY: 'fake-perf-test-key' },
          });
        });
      } finally {
        await rig.cleanup();
      }
    });
    finalizeScenario(result);
  });
});

View file

@ -0,0 +1,2 @@
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help. What would you like to work on?"}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":12,"totalTokenCount":17,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

View file

@ -0,0 +1,2 @@
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":8,"totalTokenCount":13,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

View file

@ -0,0 +1,2 @@
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to assist you with your project."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":10,"totalTokenCount":15,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

12
perf-tests/tsconfig.json Normal file
View file

@ -0,0 +1,12 @@
{
"extends": "../tsconfig.json",
"compilerOptions": {
"noEmit": true,
"allowJs": true
},
"include": ["**/*.ts"],
"references": [
{ "path": "../packages/core" },
{ "path": "../packages/test-utils" }
]
}

View file

@ -0,0 +1,27 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { defineConfig } from 'vitest/config';
// 10 minutes: performance profiling needs time for multiple samples.
const PERF_TEST_TIMEOUT_MS = 600_000;

/**
 * Vitest configuration for the performance suite. Test files are forced into
 * a single serial fork so per-test CPU readings are not skewed by sibling
 * test processes competing for cores.
 */
export default defineConfig({
  test: {
    testTimeout: PERF_TEST_TIMEOUT_MS,
    globalSetup: './globalSetup.ts',
    reporters: ['default'],
    include: ['**/*.test.ts'],
    // No retries — noise is handled by IQR filtering and tolerance.
    retry: 0,
    // Must run serially to avoid CPU contention.
    fileParallelism: false,
    pool: 'forks',
    poolOptions: {
      forks: {
        // Single process for accurate per-test CPU readings.
        singleFork: true,
      },
    },
    env: {
      GEMINI_TEST_TYPE: 'perf',
    },
  },
});