mirror of
https://github.com/google-gemini/gemini-cli
synced 2026-04-21 13:37:17 +00:00
feat(test-utils): add CPU performance integration test harness (#24951)
This commit is contained in:
parent
15f7b24312
commit
c7b920717f
19 changed files with 1081 additions and 13 deletions
33
.github/workflows/perf-nightly.yml
vendored
Normal file
33
.github/workflows/perf-nightly.yml
vendored
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
name: 'Performance Tests: Nightly'
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 3 * * *' # Runs at 3 AM every day
|
||||||
|
workflow_dispatch: # Allow manual trigger
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: 'read'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
perf-test:
|
||||||
|
name: 'Run Performance Usage Tests'
|
||||||
|
runs-on: 'gemini-cli-ubuntu-16-core'
|
||||||
|
if: "github.repository == 'google-gemini/gemini-cli'"
|
||||||
|
steps:
|
||||||
|
- name: 'Checkout'
|
||||||
|
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
|
||||||
|
|
||||||
|
- name: 'Set up Node.js'
|
||||||
|
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version-file: '.nvmrc'
|
||||||
|
cache: 'npm'
|
||||||
|
|
||||||
|
- name: 'Install dependencies'
|
||||||
|
run: 'npm ci'
|
||||||
|
|
||||||
|
- name: 'Build project'
|
||||||
|
run: 'npm run build'
|
||||||
|
|
||||||
|
- name: 'Run Performance Tests'
|
||||||
|
run: 'npm run test:perf'
|
||||||
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -48,6 +48,7 @@ packages/cli/src/generated/
|
||||||
packages/core/src/generated/
|
packages/core/src/generated/
|
||||||
packages/devtools/src/_client-assets.ts
|
packages/devtools/src/_client-assets.ts
|
||||||
.integration-tests/
|
.integration-tests/
|
||||||
|
.perf-tests/
|
||||||
packages/vscode-ide-companion/*.vsix
|
packages/vscode-ide-companion/*.vsix
|
||||||
packages/cli/download-ripgrep*/
|
packages/cli/download-ripgrep*/
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -44,8 +44,13 @@ powerful tool for developers.
|
||||||
- **Test Commands:**
|
- **Test Commands:**
|
||||||
- **Unit (All):** `npm run test`
|
- **Unit (All):** `npm run test`
|
||||||
- **Integration (E2E):** `npm run test:e2e`
|
- **Integration (E2E):** `npm run test:e2e`
|
||||||
|
- > **NOTE**: Please run the memory and perf tests locally **only if** you are
|
||||||
|
> implementing changes related to those test areas. Otherwise skip these
|
||||||
|
> tests locally and rely on CI to run them on nightly builds.
|
||||||
- **Memory (Nightly):** `npm run test:memory` (Runs memory regression tests
|
- **Memory (Nightly):** `npm run test:memory` (Runs memory regression tests
|
||||||
against baselines. Excluded from `preflight`, run nightly.)
|
against baselines. Excluded from `preflight`, run nightly.)
|
||||||
|
- **Performance (Nightly):** `npm run test:perf` (Runs CPU performance
|
||||||
|
regression tests against baselines. Excluded from `preflight`, run nightly.)
|
||||||
- **Workspace-Specific:** `npm test -w <pkg> -- <path>` (Note: `<path>` must
|
- **Workspace-Specific:** `npm test -w <pkg> -- <path>` (Note: `<path>` must
|
||||||
be relative to the workspace root, e.g.,
|
be relative to the workspace root, e.g.,
|
||||||
`-w @google/gemini-cli-core -- src/routing/modelRouterService.test.ts`)
|
`-w @google/gemini-cli-core -- src/routing/modelRouterService.test.ts`)
|
||||||
|
|
|
||||||
|
|
@ -157,6 +157,48 @@ The harness (`MemoryTestHarness` in `packages/test-utils`):
|
||||||
- Compares against baselines with a 10% tolerance.
|
- Compares against baselines with a 10% tolerance.
|
||||||
- Can analyze sustained leaks across 3 snapshots using `analyzeSnapshots()`.
|
- Can analyze sustained leaks across 3 snapshots using `analyzeSnapshots()`.
|
||||||
|
|
||||||
|
## Performance regression tests
|
||||||
|
|
||||||
|
Performance regression tests are designed to detect wall-clock time, CPU usage,
|
||||||
|
and event loop delay regressions across key CLI scenarios. They are located in
|
||||||
|
the `perf-tests` directory.
|
||||||
|
|
||||||
|
These tests are distinct from standard integration tests because they measure
|
||||||
|
performance metrics and compare it against committed baselines.
|
||||||
|
|
||||||
|
### Running performance tests
|
||||||
|
|
||||||
|
Performance tests are not run as part of the default `npm run test` or
|
||||||
|
`npm run test:e2e` commands. They are run nightly in CI but can be run manually:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run test:perf
|
||||||
|
```
|
||||||
|
|
||||||
|
### Updating baselines
|
||||||
|
|
||||||
|
If you intentionally change behavior that affects performance, you may need to
|
||||||
|
update the baselines. Set the `UPDATE_PERF_BASELINES` environment variable to
|
||||||
|
`true`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
UPDATE_PERF_BASELINES=true npm run test:perf
|
||||||
|
```
|
||||||
|
|
||||||
|
This will run the tests multiple times (with warmup), apply IQR outlier
|
||||||
|
filtering, and overwrite `perf-tests/baselines.json`. You should review the
|
||||||
|
changes and commit the updated baseline file.
|
||||||
|
|
||||||
|
### How it works
|
||||||
|
|
||||||
|
The harness (`PerfTestHarness` in `packages/test-utils`):
|
||||||
|
|
||||||
|
- Measures wall-clock time using `performance.now()`.
|
||||||
|
- Measures CPU usage using `process.cpuUsage()`.
|
||||||
|
- Monitors event loop delay using `perf_hooks.monitorEventLoopDelay()`.
|
||||||
|
- Applies IQR (Interquartile Range) filtering to remove outlier samples.
|
||||||
|
- Compares against baselines with a 15% tolerance.
|
||||||
|
|
||||||
## Diagnostics
|
## Diagnostics
|
||||||
|
|
||||||
The integration test runner provides several options for diagnostics to help
|
The integration test runner provides several options for diagnostics to help
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ import { join, dirname, extname } from 'node:path';
|
||||||
import { fileURLToPath } from 'node:url';
|
import { fileURLToPath } from 'node:url';
|
||||||
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
|
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
|
||||||
import { disableMouseTracking } from '@google/gemini-cli-core';
|
import { disableMouseTracking } from '@google/gemini-cli-core';
|
||||||
|
import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js';
|
||||||
import { createServer, type Server } from 'node:http';
|
import { createServer, type Server } from 'node:http';
|
||||||
|
|
||||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
@ -88,15 +89,8 @@ export async function setup() {
|
||||||
runDir = join(integrationTestsDir, `${Date.now()}`);
|
runDir = join(integrationTestsDir, `${Date.now()}`);
|
||||||
await mkdir(runDir, { recursive: true });
|
await mkdir(runDir, { recursive: true });
|
||||||
|
|
||||||
// Set the home directory to the test run directory to avoid conflicts
|
// Isolate environment variables
|
||||||
// with the user's local config.
|
isolateTestEnv(runDir);
|
||||||
process.env['HOME'] = runDir;
|
|
||||||
if (process.platform === 'win32') {
|
|
||||||
process.env['USERPROFILE'] = runDir;
|
|
||||||
}
|
|
||||||
// We also need to set the config dir explicitly, since the code might
|
|
||||||
// construct the path before the HOME env var is set.
|
|
||||||
process.env['GEMINI_CONFIG_DIR'] = join(runDir, '.gemini');
|
|
||||||
|
|
||||||
// Download ripgrep to avoid race conditions in parallel tests
|
// Download ripgrep to avoid race conditions in parallel tests
|
||||||
const available = await canUseRipgrep();
|
const available = await canUseRipgrep();
|
||||||
|
|
@ -127,10 +121,6 @@ export async function setup() {
|
||||||
}
|
}
|
||||||
|
|
||||||
process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
|
process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
|
||||||
process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
|
|
||||||
// Force file storage to avoid keychain prompts/hangs in CI, especially on macOS
|
|
||||||
process.env['GEMINI_FORCE_FILE_STORAGE'] = 'true';
|
|
||||||
process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
|
|
||||||
|
|
||||||
if (process.env['KEEP_OUTPUT']) {
|
if (process.env['KEEP_OUTPUT']) {
|
||||||
console.log(`Keeping output for test run in: ${runDir}`);
|
console.log(`Keeping output for test run in: ${runDir}`);
|
||||||
|
|
|
||||||
1
package-lock.json
generated
1
package-lock.json
generated
|
|
@ -36,6 +36,7 @@
|
||||||
"@types/ws": "^8.18.1",
|
"@types/ws": "^8.18.1",
|
||||||
"@vitest/coverage-v8": "^3.1.1",
|
"@vitest/coverage-v8": "^3.1.1",
|
||||||
"@vitest/eslint-plugin": "^1.3.4",
|
"@vitest/eslint-plugin": "^1.3.4",
|
||||||
|
"asciichart": "^1.5.25",
|
||||||
"cross-env": "^7.0.3",
|
"cross-env": "^7.0.3",
|
||||||
"depcheck": "^1.4.7",
|
"depcheck": "^1.4.7",
|
||||||
"domexception": "^4.0.0",
|
"domexception": "^4.0.0",
|
||||||
|
|
|
||||||
|
|
@ -53,6 +53,8 @@
|
||||||
"test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests",
|
"test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests",
|
||||||
"test:memory": "vitest run --root ./memory-tests",
|
"test:memory": "vitest run --root ./memory-tests",
|
||||||
"test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests",
|
"test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests",
|
||||||
|
"test:perf": "vitest run --root ./perf-tests",
|
||||||
|
"test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests",
|
||||||
"test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests",
|
"test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests",
|
||||||
"test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests",
|
"test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests",
|
||||||
"lint": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" eslint . --cache --max-warnings 0",
|
"lint": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" eslint . --cache --max-warnings 0",
|
||||||
|
|
@ -105,6 +107,7 @@
|
||||||
"@types/ws": "^8.18.1",
|
"@types/ws": "^8.18.1",
|
||||||
"@vitest/coverage-v8": "^3.1.1",
|
"@vitest/coverage-v8": "^3.1.1",
|
||||||
"@vitest/eslint-plugin": "^1.3.4",
|
"@vitest/eslint-plugin": "^1.3.4",
|
||||||
|
"asciichart": "^1.5.25",
|
||||||
"cross-env": "^7.0.3",
|
"cross-env": "^7.0.3",
|
||||||
"depcheck": "^1.4.7",
|
"depcheck": "^1.4.7",
|
||||||
"domexception": "^4.0.0",
|
"domexception": "^4.0.0",
|
||||||
|
|
|
||||||
35
packages/test-utils/src/env-setup.ts
Normal file
35
packages/test-utils/src/env-setup.ts
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { join } from 'node:path';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Isolate the test environment by setting environment variables
|
||||||
|
* to point to a temporary run directory.
|
||||||
|
*
|
||||||
|
* @param runDir - The temporary directory for this test run.
|
||||||
|
*/
|
||||||
|
export function isolateTestEnv(runDir: string): void {
|
||||||
|
// Set the home directory to the test run directory to avoid conflicts
|
||||||
|
// with the user's local config.
|
||||||
|
process.env['HOME'] = runDir;
|
||||||
|
if (process.platform === 'win32') {
|
||||||
|
process.env['USERPROFILE'] = runDir;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We also need to set the config dir explicitly, since the code might
|
||||||
|
// construct the path before the HOME env var is set.
|
||||||
|
process.env['GEMINI_CONFIG_DIR'] = join(runDir, '.gemini');
|
||||||
|
|
||||||
|
// Force file storage to avoid keychain prompts/hangs in CI, especially on macOS
|
||||||
|
process.env['GEMINI_FORCE_FILE_STORAGE'] = 'true';
|
||||||
|
|
||||||
|
// Mark as integration test
|
||||||
|
process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
|
||||||
|
|
||||||
|
// Isolate telemetry log
|
||||||
|
process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
|
||||||
|
}
|
||||||
|
|
@ -8,6 +8,8 @@ export * from './file-system-test-helpers.js';
|
||||||
export * from './fixtures/agents.js';
|
export * from './fixtures/agents.js';
|
||||||
export * from './memory-baselines.js';
|
export * from './memory-baselines.js';
|
||||||
export * from './memory-test-harness.js';
|
export * from './memory-test-harness.js';
|
||||||
|
export * from './perf-test-harness.js';
|
||||||
export * from './mock-utils.js';
|
export * from './mock-utils.js';
|
||||||
export * from './test-mcp-server.js';
|
export * from './test-mcp-server.js';
|
||||||
export * from './test-rig.js';
|
export * from './test-rig.js';
|
||||||
|
export * from './env-setup.js';
|
||||||
|
|
|
||||||
546
packages/test-utils/src/perf-test-harness.ts
Normal file
546
packages/test-utils/src/perf-test-harness.ts
Normal file
|
|
@ -0,0 +1,546 @@
|
||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { performance } from 'node:perf_hooks';
|
||||||
|
import { setTimeout as sleep } from 'node:timers/promises';
|
||||||
|
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
|
||||||
|
|
||||||
|
/** Configuration for asciichart plot function. */
|
||||||
|
interface PlotConfig {
|
||||||
|
height?: number;
|
||||||
|
format?: (x: number) => string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Type for the asciichart plot function. */
|
||||||
|
type PlotFn = (series: number[], config?: PlotConfig) => string;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Baseline entry for a single performance test scenario.
|
||||||
|
*/
|
||||||
|
export interface PerfBaseline {
|
||||||
|
wallClockMs: number;
|
||||||
|
cpuTotalUs: number;
|
||||||
|
eventLoopDelayP99Ms: number;
|
||||||
|
timestamp: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Top-level structure of the perf baselines JSON file.
|
||||||
|
*/
|
||||||
|
export interface PerfBaselineFile {
|
||||||
|
version: number;
|
||||||
|
updatedAt: string;
|
||||||
|
scenarios: Record<string, PerfBaseline>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A single performance snapshot at a point in time.
|
||||||
|
*/
|
||||||
|
export interface PerfSnapshot {
|
||||||
|
timestamp: number;
|
||||||
|
label: string;
|
||||||
|
wallClockMs: number;
|
||||||
|
cpuUserUs: number;
|
||||||
|
cpuSystemUs: number;
|
||||||
|
cpuTotalUs: number;
|
||||||
|
eventLoopDelayP50Ms: number;
|
||||||
|
eventLoopDelayP95Ms: number;
|
||||||
|
eventLoopDelayP99Ms: number;
|
||||||
|
eventLoopDelayMaxMs: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Result from running a performance test scenario.
|
||||||
|
*/
|
||||||
|
export interface PerfTestResult {
|
||||||
|
scenarioName: string;
|
||||||
|
samples: PerfSnapshot[];
|
||||||
|
filteredSamples: PerfSnapshot[];
|
||||||
|
median: PerfSnapshot;
|
||||||
|
baseline: PerfBaseline | undefined;
|
||||||
|
withinTolerance: boolean;
|
||||||
|
deltaPercent: number;
|
||||||
|
cpuDeltaPercent: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Options for the PerfTestHarness.
|
||||||
|
*/
|
||||||
|
export interface PerfTestHarnessOptions {
|
||||||
|
/** Path to the baselines JSON file */
|
||||||
|
baselinesPath: string;
|
||||||
|
/** Default tolerance percentage (0-100). Default: 15 */
|
||||||
|
defaultTolerancePercent?: number;
|
||||||
|
/** Default CPU tolerance percentage (0-100). Optional */
|
||||||
|
defaultCpuTolerancePercent?: number;
|
||||||
|
/** Number of samples per scenario. Default: 5 */
|
||||||
|
sampleCount?: number;
|
||||||
|
/** Number of warmup runs to discard. Default: 1 */
|
||||||
|
warmupCount?: number;
|
||||||
|
/** Pause in ms between samples. Default: 100 */
|
||||||
|
samplePauseMs?: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Active timer state tracked internally.
|
||||||
|
*/
|
||||||
|
interface ActiveTimer {
|
||||||
|
label: string;
|
||||||
|
startTime: number;
|
||||||
|
startCpuUsage: NodeJS.CpuUsage;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PerfTestHarness provides infrastructure for running CPU performance tests.
|
||||||
|
*
|
||||||
|
* It handles:
|
||||||
|
* - High-resolution wall-clock timing via performance.now()
|
||||||
|
* - CPU usage measurement via process.cpuUsage()
|
||||||
|
* - Event loop delay monitoring via perf_hooks.monitorEventLoopDelay()
|
||||||
|
* - IQR outlier filtering for noise reduction
|
||||||
|
* - Warmup runs to avoid JIT compilation noise
|
||||||
|
* - Comparing against baselines with configurable tolerance
|
||||||
|
* - Generating ASCII chart reports
|
||||||
|
*/
|
||||||
|
export class PerfTestHarness {
|
||||||
|
private baselines: PerfBaselineFile;
|
||||||
|
private readonly baselinesPath: string;
|
||||||
|
private readonly defaultTolerancePercent: number;
|
||||||
|
private readonly defaultCpuTolerancePercent?: number;
|
||||||
|
private readonly sampleCount: number;
|
||||||
|
private readonly warmupCount: number;
|
||||||
|
private readonly samplePauseMs: number;
|
||||||
|
private allResults: PerfTestResult[] = [];
|
||||||
|
private activeTimers: Map<string, ActiveTimer> = new Map();
|
||||||
|
|
||||||
|
constructor(options: PerfTestHarnessOptions) {
|
||||||
|
this.baselinesPath = options.baselinesPath;
|
||||||
|
this.defaultTolerancePercent = options.defaultTolerancePercent ?? 15;
|
||||||
|
this.defaultCpuTolerancePercent = options.defaultCpuTolerancePercent;
|
||||||
|
this.sampleCount = options.sampleCount ?? 5;
|
||||||
|
this.warmupCount = options.warmupCount ?? 1;
|
||||||
|
this.samplePauseMs = options.samplePauseMs ?? 100;
|
||||||
|
this.baselines = loadPerfBaselines(this.baselinesPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Start a high-resolution timer with CPU tracking.
|
||||||
|
*/
|
||||||
|
startTimer(label: string): void {
|
||||||
|
this.activeTimers.set(label, {
|
||||||
|
label,
|
||||||
|
startTime: performance.now(),
|
||||||
|
startCpuUsage: process.cpuUsage(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stop a timer and return the snapshot.
|
||||||
|
*/
|
||||||
|
stopTimer(label: string): PerfSnapshot {
|
||||||
|
const timer = this.activeTimers.get(label);
|
||||||
|
if (!timer) {
|
||||||
|
throw new Error(`No active timer found for label "${label}"`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const wallClockMs = performance.now() - timer.startTime;
|
||||||
|
const cpuDelta = process.cpuUsage(timer.startCpuUsage);
|
||||||
|
this.activeTimers.delete(label);
|
||||||
|
|
||||||
|
return {
|
||||||
|
timestamp: Date.now(),
|
||||||
|
label,
|
||||||
|
wallClockMs,
|
||||||
|
cpuUserUs: cpuDelta.user,
|
||||||
|
cpuSystemUs: cpuDelta.system,
|
||||||
|
cpuTotalUs: cpuDelta.user + cpuDelta.system,
|
||||||
|
eventLoopDelayP50Ms: 0,
|
||||||
|
eventLoopDelayP95Ms: 0,
|
||||||
|
eventLoopDelayP99Ms: 0,
|
||||||
|
eventLoopDelayMaxMs: 0,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Measure a function's wall-clock time and CPU usage.
|
||||||
|
* Returns the snapshot with timing data.
|
||||||
|
*/
|
||||||
|
async measure(label: string, fn: () => Promise<void>): Promise<PerfSnapshot> {
|
||||||
|
this.startTimer(label);
|
||||||
|
await fn();
|
||||||
|
return this.stopTimer(label);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Measure a function with event loop delay monitoring.
|
||||||
|
* Uses perf_hooks.monitorEventLoopDelay() for histogram data.
|
||||||
|
*/
|
||||||
|
async measureWithEventLoop(
|
||||||
|
label: string,
|
||||||
|
fn: () => Promise<void>,
|
||||||
|
): Promise<PerfSnapshot> {
|
||||||
|
// monitorEventLoopDelay is available in Node.js 12+
|
||||||
|
const { monitorEventLoopDelay } = await import('node:perf_hooks');
|
||||||
|
const histogram = monitorEventLoopDelay({ resolution: 10 });
|
||||||
|
histogram.enable();
|
||||||
|
|
||||||
|
this.startTimer(label);
|
||||||
|
await fn();
|
||||||
|
const snapshot = this.stopTimer(label);
|
||||||
|
|
||||||
|
histogram.disable();
|
||||||
|
|
||||||
|
// Convert from nanoseconds to milliseconds
|
||||||
|
snapshot.eventLoopDelayP50Ms = histogram.percentile(50) / 1e6;
|
||||||
|
snapshot.eventLoopDelayP95Ms = histogram.percentile(95) / 1e6;
|
||||||
|
snapshot.eventLoopDelayP99Ms = histogram.percentile(99) / 1e6;
|
||||||
|
snapshot.eventLoopDelayMaxMs = histogram.max / 1e6;
|
||||||
|
|
||||||
|
return snapshot;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run a scenario multiple times with warmup, outlier filtering, and baseline comparison.
|
||||||
|
*
|
||||||
|
* @param name - Scenario name (must match baseline key)
|
||||||
|
* @param fn - Async function that executes one sample of the scenario.
|
||||||
|
* Must return a PerfSnapshot with measured values.
|
||||||
|
* @param tolerancePercent - Override default tolerance for this scenario
|
||||||
|
*/
|
||||||
|
async runScenario(
|
||||||
|
name: string,
|
||||||
|
fn: () => Promise<PerfSnapshot>,
|
||||||
|
tolerancePercent?: number,
|
||||||
|
): Promise<PerfTestResult> {
|
||||||
|
const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
|
||||||
|
const totalRuns = this.warmupCount + this.sampleCount;
|
||||||
|
const allSnapshots: PerfSnapshot[] = [];
|
||||||
|
|
||||||
|
for (let i = 0; i < totalRuns; i++) {
|
||||||
|
const isWarmup = i < this.warmupCount;
|
||||||
|
const snapshot = await fn();
|
||||||
|
snapshot.label = isWarmup
|
||||||
|
? `warmup-${i}`
|
||||||
|
: `sample-${i - this.warmupCount}`;
|
||||||
|
|
||||||
|
if (!isWarmup) {
|
||||||
|
allSnapshots.push(snapshot);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Brief pause between samples
|
||||||
|
await sleep(this.samplePauseMs);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply IQR outlier filtering on wall-clock time
|
||||||
|
const filteredSnapshots = this.filterOutliers(allSnapshots, 'wallClockMs');
|
||||||
|
|
||||||
|
// Get median of filtered samples
|
||||||
|
const median = this.getMedianSnapshot(filteredSnapshots);
|
||||||
|
median.label = 'median';
|
||||||
|
|
||||||
|
// Get baseline
|
||||||
|
const baseline = this.baselines.scenarios[name];
|
||||||
|
|
||||||
|
// Determine if within tolerance
|
||||||
|
let deltaPercent = 0;
|
||||||
|
let cpuDeltaPercent = 0;
|
||||||
|
let withinTolerance = true;
|
||||||
|
|
||||||
|
if (baseline) {
|
||||||
|
deltaPercent =
|
||||||
|
((median.wallClockMs - baseline.wallClockMs) / baseline.wallClockMs) *
|
||||||
|
100;
|
||||||
|
cpuDeltaPercent =
|
||||||
|
((median.cpuTotalUs - baseline.cpuTotalUs) / baseline.cpuTotalUs) * 100;
|
||||||
|
withinTolerance = deltaPercent <= tolerance;
|
||||||
|
}
|
||||||
|
|
||||||
|
const result: PerfTestResult = {
|
||||||
|
scenarioName: name,
|
||||||
|
samples: allSnapshots,
|
||||||
|
filteredSamples: filteredSnapshots,
|
||||||
|
median,
|
||||||
|
baseline,
|
||||||
|
withinTolerance,
|
||||||
|
deltaPercent,
|
||||||
|
cpuDeltaPercent,
|
||||||
|
};
|
||||||
|
|
||||||
|
this.allResults.push(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Assert that a scenario result is within the baseline tolerance.
|
||||||
|
*/
|
||||||
|
assertWithinBaseline(
|
||||||
|
result: PerfTestResult,
|
||||||
|
tolerancePercent?: number,
|
||||||
|
cpuTolerancePercent?: number,
|
||||||
|
): void {
|
||||||
|
const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
|
||||||
|
const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent;
|
||||||
|
|
||||||
|
if (!result.baseline) {
|
||||||
|
console.warn(
|
||||||
|
`⚠ No baseline found for "${result.scenarioName}". ` +
|
||||||
|
`Run with UPDATE_PERF_BASELINES=true to create one. ` +
|
||||||
|
`Measured: ${result.median.wallClockMs.toFixed(1)} ms wall-clock.`,
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const deltaPercent =
|
||||||
|
((result.median.wallClockMs - result.baseline.wallClockMs) /
|
||||||
|
result.baseline.wallClockMs) *
|
||||||
|
100;
|
||||||
|
|
||||||
|
if (deltaPercent > tolerance) {
|
||||||
|
throw new Error(
|
||||||
|
`Performance regression detected for "${result.scenarioName}"!\n` +
|
||||||
|
` Measured: ${result.median.wallClockMs.toFixed(1)} ms wall-clock\n` +
|
||||||
|
` Baseline: ${result.baseline.wallClockMs.toFixed(1)} ms wall-clock\n` +
|
||||||
|
` Delta: ${deltaPercent.toFixed(1)}% (tolerance: ${tolerance}%)\n` +
|
||||||
|
` CPU total: ${formatUs(result.median.cpuTotalUs)}\n` +
|
||||||
|
` EL p99: ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms\n` +
|
||||||
|
` Samples: ${result.samples.length} (${result.filteredSamples.length} after IQR filter)`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cpuTolerance !== undefined && result.cpuDeltaPercent > cpuTolerance) {
|
||||||
|
throw new Error(
|
||||||
|
`CPU usage regression detected for "${result.scenarioName}"!\n` +
|
||||||
|
` Measured: ${formatUs(result.median.cpuTotalUs)}\n` +
|
||||||
|
` Baseline: ${formatUs(result.baseline.cpuTotalUs)}\n` +
|
||||||
|
` Delta: ${result.cpuDeltaPercent.toFixed(1)}% (tolerance: ${cpuTolerance}%)\n` +
|
||||||
|
` Wall-clock: ${result.median.wallClockMs.toFixed(1)} ms\n` +
|
||||||
|
` EL p99: ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update the baseline for a scenario with the current measured values.
|
||||||
|
*/
|
||||||
|
updateScenarioBaseline(result: PerfTestResult): void {
|
||||||
|
updatePerfBaseline(this.baselinesPath, result.scenarioName, {
|
||||||
|
wallClockMs: result.median.wallClockMs,
|
||||||
|
cpuTotalUs: result.median.cpuTotalUs,
|
||||||
|
eventLoopDelayP99Ms: result.median.eventLoopDelayP99Ms,
|
||||||
|
});
|
||||||
|
// Reload baselines after update
|
||||||
|
this.baselines = loadPerfBaselines(this.baselinesPath);
|
||||||
|
console.log(
|
||||||
|
`Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate an ASCII report with summary table and charts.
|
||||||
|
*/
|
||||||
|
async generateReport(results?: PerfTestResult[]): Promise<string> {
|
||||||
|
const resultsToReport = results ?? this.allResults;
|
||||||
|
const lines: string[] = [];
|
||||||
|
|
||||||
|
lines.push('');
|
||||||
|
lines.push('═══════════════════════════════════════════════════');
|
||||||
|
lines.push(' PERFORMANCE TEST REPORT');
|
||||||
|
lines.push('═══════════════════════════════════════════════════');
|
||||||
|
lines.push('');
|
||||||
|
|
||||||
|
for (const result of resultsToReport) {
|
||||||
|
const measured = `${result.median.wallClockMs.toFixed(1)} ms`;
|
||||||
|
const baseline = result.baseline
|
||||||
|
? `${result.baseline.wallClockMs.toFixed(1)} ms`
|
||||||
|
: 'N/A';
|
||||||
|
const delta = result.baseline
|
||||||
|
? `${result.deltaPercent >= 0 ? '+' : ''}${result.deltaPercent.toFixed(1)}%`
|
||||||
|
: 'N/A';
|
||||||
|
const status = !result.baseline
|
||||||
|
? 'NEW'
|
||||||
|
: result.withinTolerance
|
||||||
|
? '✅'
|
||||||
|
: '❌';
|
||||||
|
|
||||||
|
lines.push(
|
||||||
|
`${result.scenarioName}: ${measured} (Baseline: ${baseline}, Delta: ${delta}) ${status}`,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Show CPU breakdown
|
||||||
|
const cpuMs = `${(result.median.cpuTotalUs / 1000).toFixed(1)} ms`;
|
||||||
|
lines.push(
|
||||||
|
` CPU: ${cpuMs} (user: ${formatUs(result.median.cpuUserUs)}, system: ${formatUs(result.median.cpuSystemUs)})`,
|
||||||
|
);
|
||||||
|
|
||||||
|
if (result.median.eventLoopDelayP99Ms > 0) {
|
||||||
|
lines.push(
|
||||||
|
` Event loop: p50=${result.median.eventLoopDelayP50Ms.toFixed(1)}ms p95=${result.median.eventLoopDelayP95Ms.toFixed(1)}ms p99=${result.median.eventLoopDelayP99Ms.toFixed(1)}ms max=${result.median.eventLoopDelayMaxMs.toFixed(1)}ms`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
lines.push(
|
||||||
|
` Samples: ${result.samples.length} → ${result.filteredSamples.length} after IQR filter`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
lines.push('');
|
||||||
|
|
||||||
|
// Generate ASCII chart for wall-clock per scenario
|
||||||
|
try {
|
||||||
|
// @ts-expect-error - asciichart may not have types
|
||||||
|
const asciichart = (await import('asciichart')) as {
|
||||||
|
default?: { plot?: PlotFn };
|
||||||
|
plot?: PlotFn;
|
||||||
|
};
|
||||||
|
const plot: PlotFn | undefined =
|
||||||
|
asciichart.default?.plot ?? asciichart.plot;
|
||||||
|
|
||||||
|
for (const result of resultsToReport) {
|
||||||
|
if (result.filteredSamples.length > 2) {
|
||||||
|
lines.push(`📈 Wall-clock trend: ${result.scenarioName}`);
|
||||||
|
lines.push('─'.repeat(60));
|
||||||
|
|
||||||
|
const wallClockData = result.filteredSamples.map(
|
||||||
|
(s) => s.wallClockMs,
|
||||||
|
);
|
||||||
|
|
||||||
|
if (plot) {
|
||||||
|
const chart = plot(wallClockData, {
|
||||||
|
height: 8,
|
||||||
|
format: (x: number) => `${x.toFixed(0)} ms`.padStart(10),
|
||||||
|
});
|
||||||
|
lines.push(chart);
|
||||||
|
}
|
||||||
|
|
||||||
|
const labels = result.filteredSamples.map((s) => s.label);
|
||||||
|
lines.push(' ' + labels.join(' → '));
|
||||||
|
lines.push('');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
lines.push(
|
||||||
|
'(asciichart not available — install with: npm install --save-dev asciichart)',
|
||||||
|
);
|
||||||
|
lines.push('');
|
||||||
|
}
|
||||||
|
|
||||||
|
lines.push('═══════════════════════════════════════════════════');
|
||||||
|
lines.push('');
|
||||||
|
|
||||||
|
const report = lines.join('\n');
|
||||||
|
console.log(report);
|
||||||
|
return report;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter outliers using the Interquartile Range (IQR) method.
|
||||||
|
* Removes samples where the given metric falls outside Q1 - 1.5*IQR or Q3 + 1.5*IQR.
|
||||||
|
*/
|
||||||
|
private filterOutliers(
|
||||||
|
snapshots: PerfSnapshot[],
|
||||||
|
metric: keyof PerfSnapshot,
|
||||||
|
): PerfSnapshot[] {
|
||||||
|
if (snapshots.length < 4) {
|
||||||
|
// Not enough data for meaningful IQR filtering
|
||||||
|
return [...snapshots];
|
||||||
|
}
|
||||||
|
|
||||||
|
const sorted = [...snapshots].sort(
|
||||||
|
(a, b) => (a[metric] as number) - (b[metric] as number),
|
||||||
|
);
|
||||||
|
const q1Idx = Math.floor(sorted.length * 0.25);
|
||||||
|
const q3Idx = Math.floor(sorted.length * 0.75);
|
||||||
|
|
||||||
|
const q1 = sorted[q1Idx]![metric] as number;
|
||||||
|
const q3 = sorted[q3Idx]![metric] as number;
|
||||||
|
const iqr = q3 - q1;
|
||||||
|
const lowerBound = q1 - 1.5 * iqr;
|
||||||
|
const upperBound = q3 + 1.5 * iqr;
|
||||||
|
|
||||||
|
return snapshots.filter((s) => {
|
||||||
|
const val = s[metric] as number;
|
||||||
|
return val >= lowerBound && val <= upperBound;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the median snapshot by wall-clock time from a sorted list.
|
||||||
|
*/
|
||||||
|
private getMedianSnapshot(snapshots: PerfSnapshot[]): PerfSnapshot {
|
||||||
|
if (snapshots.length === 0) {
|
||||||
|
throw new Error('Cannot compute median of empty snapshot list');
|
||||||
|
}
|
||||||
|
|
||||||
|
const sorted = [...snapshots].sort((a, b) => a.wallClockMs - b.wallClockMs);
|
||||||
|
const medianIdx = Math.floor(sorted.length / 2);
|
||||||
|
return { ...sorted[medianIdx]! };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Baseline management ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load perf baselines from a JSON file.
|
||||||
|
*/
|
||||||
|
export function loadPerfBaselines(path: string): PerfBaselineFile {
|
||||||
|
if (!existsSync(path)) {
|
||||||
|
return {
|
||||||
|
version: 1,
|
||||||
|
updatedAt: new Date().toISOString(),
|
||||||
|
scenarios: {},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
const content = readFileSync(path, 'utf-8');
|
||||||
|
return JSON.parse(content) as PerfBaselineFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Save perf baselines to a JSON file.
|
||||||
|
*/
|
||||||
|
export function savePerfBaselines(
|
||||||
|
path: string,
|
||||||
|
baselines: PerfBaselineFile,
|
||||||
|
): void {
|
||||||
|
baselines.updatedAt = new Date().toISOString();
|
||||||
|
writeFileSync(path, JSON.stringify(baselines, null, 2) + '\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update (or create) a single scenario baseline in the file.
|
||||||
|
*/
|
||||||
|
export function updatePerfBaseline(
|
||||||
|
path: string,
|
||||||
|
scenarioName: string,
|
||||||
|
measured: {
|
||||||
|
wallClockMs: number;
|
||||||
|
cpuTotalUs: number;
|
||||||
|
eventLoopDelayP99Ms: number;
|
||||||
|
},
|
||||||
|
): void {
|
||||||
|
const baselines = loadPerfBaselines(path);
|
||||||
|
baselines.scenarios[scenarioName] = {
|
||||||
|
wallClockMs: measured.wallClockMs,
|
||||||
|
cpuTotalUs: measured.cpuTotalUs,
|
||||||
|
eventLoopDelayP99Ms: measured.eventLoopDelayP99Ms,
|
||||||
|
timestamp: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
savePerfBaselines(path, baselines);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Helpers ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Format microseconds as a human-readable string.
|
||||||
|
*/
|
||||||
|
function formatUs(us: number): string {
|
||||||
|
if (us > 1_000_000) {
|
||||||
|
return `${(us / 1_000_000).toFixed(2)} s`;
|
||||||
|
}
|
||||||
|
if (us > 1_000) {
|
||||||
|
return `${(us / 1_000).toFixed(1)} ms`;
|
||||||
|
}
|
||||||
|
return `${us} μs`;
|
||||||
|
}
|
||||||
121
perf-tests/README.md
Normal file
121
perf-tests/README.md
Normal file
|
|
@ -0,0 +1,121 @@
|
||||||
|
# CPU Performance Integration Test Harness
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This directory contains performance/CPU integration tests for the Gemini CLI.
|
||||||
|
These tests measure wall-clock time, CPU usage, and event loop responsiveness to
|
||||||
|
detect regressions across key scenarios.
|
||||||
|
|
||||||
|
CPU performance is inherently noisy, especially in CI. The harness addresses
|
||||||
|
this with:
|
||||||
|
|
||||||
|
- **IQR outlier filtering** — discards anomalous samples
|
||||||
|
- **Median sampling** — takes N runs, reports the median after filtering
|
||||||
|
- **Warmup runs** — discards the first run to mitigate JIT compilation noise
|
||||||
|
- **15% default tolerance** — small fluctuations do not fail the build
|
||||||
|
|
||||||
|
## Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run tests (compare against committed baselines)
|
||||||
|
npm run test:perf
|
||||||
|
|
||||||
|
# Update baselines (after intentional changes)
|
||||||
|
npm run test:perf:update-baselines
|
||||||
|
|
||||||
|
# Verbose output
|
||||||
|
VERBOSE=true npm run test:perf
|
||||||
|
|
||||||
|
# Keep test artifacts for debugging
|
||||||
|
KEEP_OUTPUT=true npm run test:perf
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
### Measurement Primitives
|
||||||
|
|
||||||
|
The `PerfTestHarness` class (in `packages/test-utils`) provides:
|
||||||
|
|
||||||
|
- **`performance.now()`** — high-resolution wall-clock timing
|
||||||
|
- **`process.cpuUsage()`** — user + system CPU microseconds (delta between
|
||||||
|
start/stop)
|
||||||
|
- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram
|
||||||
|
(p50/p95/p99/max)
|
||||||
|
|
||||||
|
### Noise Reduction
|
||||||
|
|
||||||
|
1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts
|
||||||
|
2. **Multiple samples**: Each scenario runs N times (default 5)
|
||||||
|
3. **IQR filtering**: Samples outside Q1−1.5×IQR and Q3+1.5×IQR are discarded
|
||||||
|
4. **Median**: The median of remaining samples is used for comparison
|
||||||
|
|
||||||
|
### Baseline Management
|
||||||
|
|
||||||
|
Baselines are stored in `baselines.json` in this directory. Each scenario has:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"cold-startup-time": {
|
||||||
|
"wallClockMs": 1234.5,
|
||||||
|
"cpuTotalUs": 567890,
|
||||||
|
"eventLoopDelayP99Ms": 12.3,
|
||||||
|
"timestamp": "2026-04-08T..."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance).
|
||||||
|
|
||||||
|
To recalibrate after intentional changes:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run test:perf:update-baselines
|
||||||
|
# then commit baselines.json
|
||||||
|
```
|
||||||
|
|
||||||
|
### Report Output
|
||||||
|
|
||||||
|
After all tests, the harness prints an ASCII summary:
|
||||||
|
|
||||||
|
```
|
||||||
|
═══════════════════════════════════════════════════
|
||||||
|
PERFORMANCE TEST REPORT
|
||||||
|
═══════════════════════════════════════════════════
|
||||||
|
|
||||||
|
cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅
|
||||||
|
idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅
|
||||||
|
skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
perf-tests/
|
||||||
|
├── README.md ← you are here
|
||||||
|
├── baselines.json ← committed baseline values
|
||||||
|
├── globalSetup.ts ← test environment setup
|
||||||
|
├── perf-usage.test.ts ← test scenarios
|
||||||
|
├── perf.*.responses ← fake API responses per scenario
|
||||||
|
├── tsconfig.json ← TypeScript config
|
||||||
|
└── vitest.config.ts ← vitest config (serial, isolated)
|
||||||
|
|
||||||
|
packages/test-utils/src/
|
||||||
|
├── perf-test-harness.ts ← PerfTestHarness class
|
||||||
|
└── index.ts ← re-exports
|
||||||
|
```
|
||||||
|
|
||||||
|
## CI Integration
|
||||||
|
|
||||||
|
These tests are **excluded from `preflight`** and designed for nightly CI:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- name: Performance regression tests
|
||||||
|
run: npm run test:perf
|
||||||
|
```
|
||||||
|
|
||||||
|
## Adding a New Scenario
|
||||||
|
|
||||||
|
1. Add a fake response file: `perf.<scenario-name>.responses`
|
||||||
|
2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()`
|
||||||
|
3. Run `npm run test:perf:update-baselines` to establish initial baseline
|
||||||
|
4. Commit the updated `baselines.json`
|
||||||
24
perf-tests/baselines.json
Normal file
24
perf-tests/baselines.json
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
{
|
||||||
|
"version": 1,
|
||||||
|
"updatedAt": "2026-04-08T18:51:29.839Z",
|
||||||
|
"scenarios": {
|
||||||
|
"cold-startup-time": {
|
||||||
|
"wallClockMs": 1333.4230420000004,
|
||||||
|
"cpuTotalUs": 1711,
|
||||||
|
"eventLoopDelayP99Ms": 0,
|
||||||
|
"timestamp": "2026-04-08T18:50:58.124Z"
|
||||||
|
},
|
||||||
|
"idle-cpu-usage": {
|
||||||
|
"wallClockMs": 5001.926125,
|
||||||
|
"cpuTotalUs": 128518,
|
||||||
|
"eventLoopDelayP99Ms": 12.705791,
|
||||||
|
"timestamp": "2026-04-08T18:51:23.938Z"
|
||||||
|
},
|
||||||
|
"skill-loading-time": {
|
||||||
|
"wallClockMs": 1372.4463749999995,
|
||||||
|
"cpuTotalUs": 1550,
|
||||||
|
"eventLoopDelayP99Ms": 0,
|
||||||
|
"timestamp": "2026-04-08T18:51:29.839Z"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
67
perf-tests/globalSetup.ts
Normal file
67
perf-tests/globalSetup.ts
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { mkdir, readdir, rm } from 'node:fs/promises';
|
||||||
|
import { join, dirname } from 'node:path';
|
||||||
|
import { fileURLToPath } from 'node:url';
|
||||||
|
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
|
||||||
|
import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js';
|
||||||
|
|
||||||
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||||
|
const rootDir = join(__dirname, '..');
|
||||||
|
const perfTestsDir = join(rootDir, '.perf-tests');
|
||||||
|
const KEEP_RUNS_COUNT = 5;
|
||||||
|
let runDir = '';
|
||||||
|
|
||||||
|
export async function setup() {
|
||||||
|
runDir = join(perfTestsDir, `${Date.now()}`);
|
||||||
|
await mkdir(runDir, { recursive: true });
|
||||||
|
|
||||||
|
// Isolate environment variables
|
||||||
|
isolateTestEnv(runDir);
|
||||||
|
|
||||||
|
// Download ripgrep to avoid race conditions
|
||||||
|
const available = await canUseRipgrep();
|
||||||
|
if (!available) {
|
||||||
|
throw new Error('Failed to download ripgrep binary');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean up old test runs, keeping the latest few for debugging
|
||||||
|
try {
|
||||||
|
const testRuns = await readdir(perfTestsDir);
|
||||||
|
if (testRuns.length > KEEP_RUNS_COUNT) {
|
||||||
|
const oldRuns = testRuns
|
||||||
|
.sort()
|
||||||
|
.slice(0, testRuns.length - KEEP_RUNS_COUNT);
|
||||||
|
await Promise.all(
|
||||||
|
oldRuns.map((oldRun) =>
|
||||||
|
rm(join(perfTestsDir, oldRun), {
|
||||||
|
recursive: true,
|
||||||
|
force: true,
|
||||||
|
}),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Error cleaning up old perf test runs:', e);
|
||||||
|
}
|
||||||
|
|
||||||
|
process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
|
||||||
|
process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';
|
||||||
|
|
||||||
|
console.log(`\nPerf test output directory: ${runDir}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function teardown() {
|
||||||
|
// Cleanup unless KEEP_OUTPUT is set
|
||||||
|
if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
|
||||||
|
try {
|
||||||
|
await rm(runDir, { recursive: true, force: true });
|
||||||
|
} catch (e) {
|
||||||
|
console.warn('Failed to clean up perf test directory:', e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
153
perf-tests/perf-usage.test.ts
Normal file
153
perf-tests/perf-usage.test.ts
Normal file
|
|
@ -0,0 +1,153 @@
|
||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, beforeAll, afterAll } from 'vitest';
|
||||||
|
import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils';
|
||||||
|
import { join, dirname } from 'node:path';
|
||||||
|
import { fileURLToPath } from 'node:url';
|
||||||
|
|
||||||
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||||
|
const BASELINES_PATH = join(__dirname, 'baselines.json');
|
||||||
|
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
|
||||||
|
const TOLERANCE_PERCENT = 15;
|
||||||
|
|
||||||
|
// Use fewer samples locally for faster iteration, more in CI
|
||||||
|
const SAMPLE_COUNT = process.env['CI'] ? 5 : 3;
|
||||||
|
const WARMUP_COUNT = 1;
|
||||||
|
|
||||||
|
describe('CPU Performance Tests', () => {
|
||||||
|
let harness: PerfTestHarness;
|
||||||
|
|
||||||
|
beforeAll(() => {
|
||||||
|
harness = new PerfTestHarness({
|
||||||
|
baselinesPath: BASELINES_PATH,
|
||||||
|
defaultTolerancePercent: TOLERANCE_PERCENT,
|
||||||
|
sampleCount: SAMPLE_COUNT,
|
||||||
|
warmupCount: WARMUP_COUNT,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(async () => {
|
||||||
|
// Generate the summary report after all tests
|
||||||
|
await harness.generateReport();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('cold-startup-time: startup completes within baseline', async () => {
|
||||||
|
const result = await harness.runScenario('cold-startup-time', async () => {
|
||||||
|
const rig = new TestRig();
|
||||||
|
try {
|
||||||
|
rig.setup('perf-cold-startup', {
|
||||||
|
fakeResponsesPath: join(__dirname, 'perf.cold-startup.responses'),
|
||||||
|
});
|
||||||
|
|
||||||
|
return await harness.measure('cold-startup', async () => {
|
||||||
|
await rig.run({
|
||||||
|
args: ['hello'],
|
||||||
|
timeout: 120000,
|
||||||
|
env: { GEMINI_API_KEY: 'fake-perf-test-key' },
|
||||||
|
});
|
||||||
|
});
|
||||||
|
} finally {
|
||||||
|
await rig.cleanup();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (UPDATE_BASELINES) {
|
||||||
|
harness.updateScenarioBaseline(result);
|
||||||
|
} else {
|
||||||
|
harness.assertWithinBaseline(result);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('idle-cpu-usage: CPU stays low when idle', async () => {
|
||||||
|
const IDLE_OBSERVATION_MS = 5000;
|
||||||
|
|
||||||
|
const result = await harness.runScenario('idle-cpu-usage', async () => {
|
||||||
|
const rig = new TestRig();
|
||||||
|
try {
|
||||||
|
rig.setup('perf-idle-cpu', {
|
||||||
|
fakeResponsesPath: join(__dirname, 'perf.idle-cpu.responses'),
|
||||||
|
});
|
||||||
|
|
||||||
|
// First, run a prompt to get the CLI into idle state
|
||||||
|
await rig.run({
|
||||||
|
args: ['hello'],
|
||||||
|
timeout: 120000,
|
||||||
|
env: { GEMINI_API_KEY: 'fake-perf-test-key' },
|
||||||
|
});
|
||||||
|
|
||||||
|
// Now measure CPU during idle period in the test process
|
||||||
|
return await harness.measureWithEventLoop('idle-cpu', async () => {
|
||||||
|
// Simulate idle period — just wait
|
||||||
|
const { setTimeout: sleep } = await import('node:timers/promises');
|
||||||
|
await sleep(IDLE_OBSERVATION_MS);
|
||||||
|
});
|
||||||
|
} finally {
|
||||||
|
await rig.cleanup();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (UPDATE_BASELINES) {
|
||||||
|
harness.updateScenarioBaseline(result);
|
||||||
|
} else {
|
||||||
|
harness.assertWithinBaseline(result);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('skill-loading-time: startup with many skills within baseline', async () => {
|
||||||
|
const SKILL_COUNT = 20;
|
||||||
|
|
||||||
|
const result = await harness.runScenario('skill-loading-time', async () => {
|
||||||
|
const rig = new TestRig();
|
||||||
|
try {
|
||||||
|
rig.setup('perf-skill-loading', {
|
||||||
|
fakeResponsesPath: join(__dirname, 'perf.skill-loading.responses'),
|
||||||
|
});
|
||||||
|
|
||||||
|
// Create many skill directories with SKILL.md files
|
||||||
|
for (let i = 0; i < SKILL_COUNT; i++) {
|
||||||
|
const skillDir = `.gemini/skills/perf-skill-${i}`;
|
||||||
|
rig.mkdir(skillDir);
|
||||||
|
rig.createFile(
|
||||||
|
`${skillDir}/SKILL.md`,
|
||||||
|
[
|
||||||
|
'---',
|
||||||
|
`name: perf-skill-${i}`,
|
||||||
|
`description: Performance test skill number ${i}`,
|
||||||
|
`activation: manual`,
|
||||||
|
'---',
|
||||||
|
'',
|
||||||
|
`# Performance Test Skill ${i}`,
|
||||||
|
'',
|
||||||
|
`This is a test skill for measuring skill loading performance.`,
|
||||||
|
`It contains some content to simulate real-world skill files.`,
|
||||||
|
'',
|
||||||
|
`## Usage`,
|
||||||
|
'',
|
||||||
|
`Use this skill by activating it with @perf-skill-${i}.`,
|
||||||
|
].join('\n'),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return await harness.measure('skill-loading', async () => {
|
||||||
|
await rig.run({
|
||||||
|
args: ['hello'],
|
||||||
|
timeout: 120000,
|
||||||
|
env: { GEMINI_API_KEY: 'fake-perf-test-key' },
|
||||||
|
});
|
||||||
|
});
|
||||||
|
} finally {
|
||||||
|
await rig.cleanup();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (UPDATE_BASELINES) {
|
||||||
|
harness.updateScenarioBaseline(result);
|
||||||
|
} else {
|
||||||
|
harness.assertWithinBaseline(result);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
2
perf-tests/perf.cold-startup.responses
Normal file
2
perf-tests/perf.cold-startup.responses
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
|
||||||
|
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help. What would you like to work on?"}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":12,"totalTokenCount":17,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}
|
||||||
2
perf-tests/perf.idle-cpu.responses
Normal file
2
perf-tests/perf.idle-cpu.responses
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
|
||||||
|
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":8,"totalTokenCount":13,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}
|
||||||
2
perf-tests/perf.skill-loading.responses
Normal file
2
perf-tests/perf.skill-loading.responses
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
|
||||||
|
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to assist you with your project."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":10,"totalTokenCount":15,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}
|
||||||
12
perf-tests/tsconfig.json
Normal file
12
perf-tests/tsconfig.json
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
{
|
||||||
|
"extends": "../tsconfig.json",
|
||||||
|
"compilerOptions": {
|
||||||
|
"noEmit": true,
|
||||||
|
"allowJs": true
|
||||||
|
},
|
||||||
|
"include": ["**/*.ts"],
|
||||||
|
"references": [
|
||||||
|
{ "path": "../packages/core" },
|
||||||
|
{ "path": "../packages/test-utils" }
|
||||||
|
]
|
||||||
|
}
|
||||||
27
perf-tests/vitest.config.ts
Normal file
27
perf-tests/vitest.config.ts
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { defineConfig } from 'vitest/config';
|
||||||
|
|
||||||
|
// Vitest configuration for the CPU performance suite: long timeouts,
// strictly serial execution, and a single forked worker so per-test CPU
// and event-loop measurements are not skewed by sibling processes.
export default defineConfig({
  test: {
    testTimeout: 600000, // 10 minutes — performance profiling needs time for multiple samples
    globalSetup: './globalSetup.ts', // creates/prunes the isolated .perf-tests run directory
    reporters: ['default'],
    include: ['**/*.test.ts'],
    retry: 0, // No retries — noise is handled by IQR filtering and tolerance
    fileParallelism: false, // Must run serially to avoid CPU contention
    pool: 'forks',
    poolOptions: {
      forks: {
        singleFork: true, // Single process for accurate per-test CPU readings
      },
    },
    env: {
      GEMINI_TEST_TYPE: 'perf',
    },
  },
});
||||||
Loading…
Reference in a new issue