feat(test-utils): add CPU performance integration test harness (#24951)

This commit is contained in:
Sri Pasumarthi 2026-04-08 14:50:29 -07:00 committed by GitHub
parent 15f7b24312
commit c7b920717f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 1081 additions and 13 deletions

33
.github/workflows/perf-nightly.yml vendored Normal file
View file

@ -0,0 +1,33 @@
name: 'Performance Tests: Nightly'
on:
schedule:
- cron: '0 3 * * *' # Runs at 3 AM every day
workflow_dispatch: # Allow manual trigger
permissions:
contents: 'read'
jobs:
perf-test:
name: 'Run Performance Usage Tests'
runs-on: 'gemini-cli-ubuntu-16-core'
if: "github.repository == 'google-gemini/gemini-cli'"
steps:
- name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
- name: 'Set up Node.js'
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
with:
node-version-file: '.nvmrc'
cache: 'npm'
- name: 'Install dependencies'
run: 'npm ci'
- name: 'Build project'
run: 'npm run build'
- name: 'Run Performance Tests'
run: 'npm run test:perf'

1
.gitignore vendored
View file

@ -48,6 +48,7 @@ packages/cli/src/generated/
packages/core/src/generated/ packages/core/src/generated/
packages/devtools/src/_client-assets.ts packages/devtools/src/_client-assets.ts
.integration-tests/ .integration-tests/
.perf-tests/
packages/vscode-ide-companion/*.vsix packages/vscode-ide-companion/*.vsix
packages/cli/download-ripgrep*/ packages/cli/download-ripgrep*/

View file

@ -44,8 +44,13 @@ powerful tool for developers.
- **Test Commands:** - **Test Commands:**
- **Unit (All):** `npm run test` - **Unit (All):** `npm run test`
- **Integration (E2E):** `npm run test:e2e` - **Integration (E2E):** `npm run test:e2e`
- > **NOTE**: Please run the memory and perf tests locally **only if** you are
> implementing changes related to those test areas. Otherwise skip these
> tests locally and rely on CI to run them on nightly builds.
- **Memory (Nightly):** `npm run test:memory` (Runs memory regression tests - **Memory (Nightly):** `npm run test:memory` (Runs memory regression tests
against baselines. Excluded from `preflight`, run nightly.) against baselines. Excluded from `preflight`, run nightly.)
- **Performance (Nightly):** `npm run test:perf` (Runs CPU performance
regression tests against baselines. Excluded from `preflight`, run nightly.)
- **Workspace-Specific:** `npm test -w <pkg> -- <path>` (Note: `<path>` must - **Workspace-Specific:** `npm test -w <pkg> -- <path>` (Note: `<path>` must
be relative to the workspace root, e.g., be relative to the workspace root, e.g.,
`-w @google/gemini-cli-core -- src/routing/modelRouterService.test.ts`) `-w @google/gemini-cli-core -- src/routing/modelRouterService.test.ts`)

View file

@ -157,6 +157,48 @@ The harness (`MemoryTestHarness` in `packages/test-utils`):
- Compares against baselines with a 10% tolerance. - Compares against baselines with a 10% tolerance.
- Can analyze sustained leaks across 3 snapshots using `analyzeSnapshots()`. - Can analyze sustained leaks across 3 snapshots using `analyzeSnapshots()`.
## Performance regression tests
Performance regression tests are designed to detect wall-clock time, CPU usage,
and event loop delay regressions across key CLI scenarios. They are located in
the `perf-tests` directory.
These tests are distinct from standard integration tests because they measure
performance metrics and compare them against committed baselines.
### Running performance tests
Performance tests are not run as part of the default `npm run test` or
`npm run test:e2e` commands. They are run nightly in CI but can be run manually:
```bash
npm run test:perf
```
### Updating baselines
If you intentionally change behavior that affects performance, you may need to
update the baselines. Set the `UPDATE_PERF_BASELINES` environment variable to
`true`:
```bash
UPDATE_PERF_BASELINES=true npm run test:perf
```
This will run the tests multiple times (with warmup), apply IQR outlier
filtering, and overwrite `perf-tests/baselines.json`. You should review the
changes and commit the updated baseline file.
### How it works
The harness (`PerfTestHarness` in `packages/test-utils`):
- Measures wall-clock time using `performance.now()`.
- Measures CPU usage using `process.cpuUsage()`.
- Monitors event loop delay using `perf_hooks.monitorEventLoopDelay()`.
- Applies IQR (Interquartile Range) filtering to remove outlier samples.
- Compares against baselines with a 15% tolerance.
## Diagnostics ## Diagnostics
The integration test runner provides several options for diagnostics to help The integration test runner provides several options for diagnostics to help

View file

@ -14,6 +14,7 @@ import { join, dirname, extname } from 'node:path';
import { fileURLToPath } from 'node:url'; import { fileURLToPath } from 'node:url';
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js'; import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
import { disableMouseTracking } from '@google/gemini-cli-core'; import { disableMouseTracking } from '@google/gemini-cli-core';
import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js';
import { createServer, type Server } from 'node:http'; import { createServer, type Server } from 'node:http';
const __dirname = dirname(fileURLToPath(import.meta.url)); const __dirname = dirname(fileURLToPath(import.meta.url));
@ -88,15 +89,8 @@ export async function setup() {
runDir = join(integrationTestsDir, `${Date.now()}`); runDir = join(integrationTestsDir, `${Date.now()}`);
await mkdir(runDir, { recursive: true }); await mkdir(runDir, { recursive: true });
// Set the home directory to the test run directory to avoid conflicts // Isolate environment variables
// with the user's local config. isolateTestEnv(runDir);
process.env['HOME'] = runDir;
if (process.platform === 'win32') {
process.env['USERPROFILE'] = runDir;
}
// We also need to set the config dir explicitly, since the code might
// construct the path before the HOME env var is set.
process.env['GEMINI_CONFIG_DIR'] = join(runDir, '.gemini');
// Download ripgrep to avoid race conditions in parallel tests // Download ripgrep to avoid race conditions in parallel tests
const available = await canUseRipgrep(); const available = await canUseRipgrep();
@ -127,10 +121,6 @@ export async function setup() {
} }
process.env['INTEGRATION_TEST_FILE_DIR'] = runDir; process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
// Force file storage to avoid keychain prompts/hangs in CI, especially on macOS
process.env['GEMINI_FORCE_FILE_STORAGE'] = 'true';
process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
if (process.env['KEEP_OUTPUT']) { if (process.env['KEEP_OUTPUT']) {
console.log(`Keeping output for test run in: ${runDir}`); console.log(`Keeping output for test run in: ${runDir}`);

1
package-lock.json generated
View file

@ -36,6 +36,7 @@
"@types/ws": "^8.18.1", "@types/ws": "^8.18.1",
"@vitest/coverage-v8": "^3.1.1", "@vitest/coverage-v8": "^3.1.1",
"@vitest/eslint-plugin": "^1.3.4", "@vitest/eslint-plugin": "^1.3.4",
"asciichart": "^1.5.25",
"cross-env": "^7.0.3", "cross-env": "^7.0.3",
"depcheck": "^1.4.7", "depcheck": "^1.4.7",
"domexception": "^4.0.0", "domexception": "^4.0.0",

View file

@ -53,6 +53,8 @@
"test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests", "test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests",
"test:memory": "vitest run --root ./memory-tests", "test:memory": "vitest run --root ./memory-tests",
"test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests", "test:memory:update-baselines": "cross-env UPDATE_MEMORY_BASELINES=true vitest run --root ./memory-tests",
"test:perf": "vitest run --root ./perf-tests",
"test:perf:update-baselines": "cross-env UPDATE_PERF_BASELINES=true vitest run --root ./perf-tests",
"test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests", "test:integration:sandbox:docker": "cross-env GEMINI_SANDBOX=docker npm run build:sandbox && cross-env GEMINI_SANDBOX=docker vitest run --root ./integration-tests",
"test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests", "test:integration:sandbox:podman": "cross-env GEMINI_SANDBOX=podman vitest run --root ./integration-tests",
"lint": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" eslint . --cache --max-warnings 0", "lint": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" eslint . --cache --max-warnings 0",
@ -105,6 +107,7 @@
"@types/ws": "^8.18.1", "@types/ws": "^8.18.1",
"@vitest/coverage-v8": "^3.1.1", "@vitest/coverage-v8": "^3.1.1",
"@vitest/eslint-plugin": "^1.3.4", "@vitest/eslint-plugin": "^1.3.4",
"asciichart": "^1.5.25",
"cross-env": "^7.0.3", "cross-env": "^7.0.3",
"depcheck": "^1.4.7", "depcheck": "^1.4.7",
"domexception": "^4.0.0", "domexception": "^4.0.0",

View file

@ -0,0 +1,35 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { join } from 'node:path';
/**
* Isolate the test environment by setting environment variables
* to point to a temporary run directory.
*
* @param runDir - The temporary directory for this test run.
*/
export function isolateTestEnv(runDir: string): void {
  // Redirect the OS home directory into the per-run scratch directory so
  // tests never touch the developer's real configuration.
  process.env['HOME'] = runDir;
  if (process.platform === 'win32') {
    process.env['USERPROFILE'] = runDir;
  }

  // The config dir is also set explicitly because some code paths may have
  // constructed the path before HOME was overridden above.
  process.env['GEMINI_CONFIG_DIR'] = join(runDir, '.gemini');

  // Force file storage to avoid keychain prompts/hangs in CI, especially on macOS
  process.env['GEMINI_FORCE_FILE_STORAGE'] = 'true';

  // Mark as integration test
  process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';

  // Isolate telemetry log
  process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
}

View file

@ -8,6 +8,8 @@ export * from './file-system-test-helpers.js';
export * from './fixtures/agents.js'; export * from './fixtures/agents.js';
export * from './memory-baselines.js'; export * from './memory-baselines.js';
export * from './memory-test-harness.js'; export * from './memory-test-harness.js';
export * from './perf-test-harness.js';
export * from './mock-utils.js'; export * from './mock-utils.js';
export * from './test-mcp-server.js'; export * from './test-mcp-server.js';
export * from './test-rig.js'; export * from './test-rig.js';
export * from './env-setup.js';

View file

@ -0,0 +1,546 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { performance } from 'node:perf_hooks';
import { setTimeout as sleep } from 'node:timers/promises';
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
/** Configuration for asciichart plot function (the subset this module uses). */
interface PlotConfig {
  /** Chart height in rows. */
  height?: number;
  /** Formatter applied to axis labels. */
  format?: (x: number) => string;
}

/** Type for the asciichart plot function. */
type PlotFn = (series: number[], config?: PlotConfig) => string;

/**
 * Baseline entry for a single performance test scenario.
 */
export interface PerfBaseline {
  // Median wall-clock duration for the scenario, in milliseconds.
  wallClockMs: number;
  // Median total (user + system) CPU time, in microseconds.
  cpuTotalUs: number;
  // Median p99 event loop delay, in milliseconds.
  eventLoopDelayP99Ms: number;
  // ISO-8601 timestamp of when this baseline entry was recorded.
  timestamp: string;
}

/**
 * Top-level structure of the perf baselines JSON file.
 */
export interface PerfBaselineFile {
  // Schema version of the baselines file.
  version: number;
  // ISO-8601 timestamp of the last write.
  updatedAt: string;
  // Baselines keyed by scenario name.
  scenarios: Record<string, PerfBaseline>;
}

/**
 * A single performance snapshot at a point in time.
 */
export interface PerfSnapshot {
  // Epoch milliseconds when the snapshot was taken.
  timestamp: number;
  // Human-readable label (e.g. "sample-0", "warmup-0", "median").
  label: string;
  // Measured wall-clock duration, in milliseconds.
  wallClockMs: number;
  // User-mode CPU time delta, in microseconds.
  cpuUserUs: number;
  // Kernel-mode CPU time delta, in microseconds.
  cpuSystemUs: number;
  // user + system CPU time, in microseconds.
  cpuTotalUs: number;
  // Event loop delay percentiles/max in milliseconds; zero when the
  // snapshot was taken without event loop monitoring.
  eventLoopDelayP50Ms: number;
  eventLoopDelayP95Ms: number;
  eventLoopDelayP99Ms: number;
  eventLoopDelayMaxMs: number;
}

/**
 * Result from running a performance test scenario.
 */
export interface PerfTestResult {
  // Scenario name; matches the key in the baselines file.
  scenarioName: string;
  // All non-warmup samples, in run order.
  samples: PerfSnapshot[];
  // Samples remaining after IQR outlier filtering.
  filteredSamples: PerfSnapshot[];
  // Median (by wall-clock time) of the filtered samples.
  median: PerfSnapshot;
  // Matching baseline, if one exists.
  baseline: PerfBaseline | undefined;
  // True when no baseline exists or the wall-clock delta is within tolerance.
  withinTolerance: boolean;
  // Percent wall-clock change vs. baseline (0 when no baseline).
  deltaPercent: number;
  // Percent CPU-time change vs. baseline (0 when no baseline).
  cpuDeltaPercent: number;
}

/**
 * Options for the PerfTestHarness.
 */
export interface PerfTestHarnessOptions {
  /** Path to the baselines JSON file */
  baselinesPath: string;
  /** Default tolerance percentage (0-100). Default: 15 */
  defaultTolerancePercent?: number;
  /** Default CPU tolerance percentage (0-100). Optional */
  defaultCpuTolerancePercent?: number;
  /** Number of samples per scenario. Default: 5 */
  sampleCount?: number;
  /** Number of warmup runs to discard. Default: 1 */
  warmupCount?: number;
  /** Pause in ms between samples. Default: 100 */
  samplePauseMs?: number;
}

/**
 * Active timer state tracked internally by startTimer()/stopTimer().
 */
interface ActiveTimer {
  // Caller-supplied timer label.
  label: string;
  // performance.now() value at start.
  startTime: number;
  // process.cpuUsage() reading at start; used as the delta origin.
  startCpuUsage: NodeJS.CpuUsage;
}
/**
* PerfTestHarness provides infrastructure for running CPU performance tests.
*
* It handles:
* - High-resolution wall-clock timing via performance.now()
* - CPU usage measurement via process.cpuUsage()
* - Event loop delay monitoring via perf_hooks.monitorEventLoopDelay()
* - IQR outlier filtering for noise reduction
* - Warmup runs to avoid JIT compilation noise
* - Comparing against baselines with configurable tolerance
* - Generating ASCII chart reports
*/
/**
 * PerfTestHarness provides infrastructure for running CPU performance tests.
 *
 * It handles:
 * - High-resolution wall-clock timing via performance.now()
 * - CPU usage measurement via process.cpuUsage()
 * - Event loop delay monitoring via perf_hooks.monitorEventLoopDelay()
 * - IQR outlier filtering for noise reduction
 * - Warmup runs to avoid JIT compilation noise
 * - Comparing against baselines with configurable tolerance
 * - Generating ASCII chart reports
 */
export class PerfTestHarness {
  private baselines: PerfBaselineFile;
  private readonly baselinesPath: string;
  private readonly defaultTolerancePercent: number;
  private readonly defaultCpuTolerancePercent?: number;
  private readonly sampleCount: number;
  private readonly warmupCount: number;
  private readonly samplePauseMs: number;
  // Accumulates every scenario result so generateReport() can summarize them.
  private allResults: PerfTestResult[] = [];
  // In-flight timers keyed by label; startTimer() inserts, stopTimer() removes.
  private activeTimers: Map<string, ActiveTimer> = new Map();

  constructor(options: PerfTestHarnessOptions) {
    this.baselinesPath = options.baselinesPath;
    this.defaultTolerancePercent = options.defaultTolerancePercent ?? 15;
    this.defaultCpuTolerancePercent = options.defaultCpuTolerancePercent;
    this.sampleCount = options.sampleCount ?? 5;
    this.warmupCount = options.warmupCount ?? 1;
    this.samplePauseMs = options.samplePauseMs ?? 100;
    this.baselines = loadPerfBaselines(this.baselinesPath);
  }

  /**
   * Start a high-resolution timer with CPU tracking.
   *
   * @param label - Key used to pair this start with a later stopTimer(label).
   */
  startTimer(label: string): void {
    this.activeTimers.set(label, {
      label,
      startTime: performance.now(),
      startCpuUsage: process.cpuUsage(),
    });
  }

  /**
   * Stop a timer and return the snapshot.
   *
   * Event loop delay fields are zeroed here; use measureWithEventLoop()
   * when those metrics are needed.
   *
   * @throws Error if no timer with the given label is active.
   */
  stopTimer(label: string): PerfSnapshot {
    const timer = this.activeTimers.get(label);
    if (!timer) {
      throw new Error(`No active timer found for label "${label}"`);
    }
    const wallClockMs = performance.now() - timer.startTime;
    // process.cpuUsage(prev) returns the delta since the `prev` reading.
    const cpuDelta = process.cpuUsage(timer.startCpuUsage);
    this.activeTimers.delete(label);
    return {
      timestamp: Date.now(),
      label,
      wallClockMs,
      cpuUserUs: cpuDelta.user,
      cpuSystemUs: cpuDelta.system,
      cpuTotalUs: cpuDelta.user + cpuDelta.system,
      eventLoopDelayP50Ms: 0,
      eventLoopDelayP95Ms: 0,
      eventLoopDelayP99Ms: 0,
      eventLoopDelayMaxMs: 0,
    };
  }

  /**
   * Measure a function's wall-clock time and CPU usage.
   * Returns the snapshot with timing data.
   */
  async measure(label: string, fn: () => Promise<void>): Promise<PerfSnapshot> {
    this.startTimer(label);
    await fn();
    return this.stopTimer(label);
  }

  /**
   * Measure a function with event loop delay monitoring.
   * Uses perf_hooks.monitorEventLoopDelay() for histogram data.
   */
  async measureWithEventLoop(
    label: string,
    fn: () => Promise<void>,
  ): Promise<PerfSnapshot> {
    // monitorEventLoopDelay is available in Node.js 12+
    const { monitorEventLoopDelay } = await import('node:perf_hooks');
    const histogram = monitorEventLoopDelay({ resolution: 10 });
    histogram.enable();
    this.startTimer(label);
    await fn();
    const snapshot = this.stopTimer(label);
    histogram.disable();
    // Convert from nanoseconds to milliseconds
    snapshot.eventLoopDelayP50Ms = histogram.percentile(50) / 1e6;
    snapshot.eventLoopDelayP95Ms = histogram.percentile(95) / 1e6;
    snapshot.eventLoopDelayP99Ms = histogram.percentile(99) / 1e6;
    snapshot.eventLoopDelayMaxMs = histogram.max / 1e6;
    return snapshot;
  }

  /**
   * Run a scenario multiple times with warmup, outlier filtering, and baseline comparison.
   *
   * @param name - Scenario name (must match baseline key)
   * @param fn - Async function that executes one sample of the scenario.
   *             Must return a PerfSnapshot with measured values.
   * @param tolerancePercent - Override default tolerance for this scenario
   */
  async runScenario(
    name: string,
    fn: () => Promise<PerfSnapshot>,
    tolerancePercent?: number,
  ): Promise<PerfTestResult> {
    const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
    const totalRuns = this.warmupCount + this.sampleCount;
    const allSnapshots: PerfSnapshot[] = [];
    for (let i = 0; i < totalRuns; i++) {
      const isWarmup = i < this.warmupCount;
      const snapshot = await fn();
      snapshot.label = isWarmup
        ? `warmup-${i}`
        : `sample-${i - this.warmupCount}`;
      // Warmup runs are labeled but discarded to mitigate JIT noise.
      if (!isWarmup) {
        allSnapshots.push(snapshot);
      }
      // Brief pause between samples
      await sleep(this.samplePauseMs);
    }

    // Apply IQR outlier filtering on wall-clock time
    const filteredSnapshots = this.filterOutliers(allSnapshots, 'wallClockMs');

    // Get median of filtered samples
    const median = this.getMedianSnapshot(filteredSnapshots);
    median.label = 'median';

    // Get baseline
    const baseline = this.baselines.scenarios[name];

    // Determine if within tolerance. Note: only the wall-clock delta gates
    // withinTolerance here; CPU tolerance is enforced separately by
    // assertWithinBaseline() when a CPU tolerance is configured.
    let deltaPercent = 0;
    let cpuDeltaPercent = 0;
    let withinTolerance = true;
    if (baseline) {
      deltaPercent =
        ((median.wallClockMs - baseline.wallClockMs) / baseline.wallClockMs) *
        100;
      cpuDeltaPercent =
        ((median.cpuTotalUs - baseline.cpuTotalUs) / baseline.cpuTotalUs) * 100;
      withinTolerance = deltaPercent <= tolerance;
    }

    const result: PerfTestResult = {
      scenarioName: name,
      samples: allSnapshots,
      filteredSamples: filteredSnapshots,
      median,
      baseline,
      withinTolerance,
      deltaPercent,
      cpuDeltaPercent,
    };
    this.allResults.push(result);
    return result;
  }

  /**
   * Assert that a scenario result is within the baseline tolerance.
   *
   * Missing baselines only warn (so new scenarios do not fail CI before a
   * baseline is committed); exceeding the wall-clock or CPU tolerance throws.
   *
   * @throws Error on a wall-clock or CPU regression beyond tolerance.
   */
  assertWithinBaseline(
    result: PerfTestResult,
    tolerancePercent?: number,
    cpuTolerancePercent?: number,
  ): void {
    const tolerance = tolerancePercent ?? this.defaultTolerancePercent;
    const cpuTolerance = cpuTolerancePercent ?? this.defaultCpuTolerancePercent;
    if (!result.baseline) {
      console.warn(
        `⚠ No baseline found for "${result.scenarioName}". ` +
          `Run with UPDATE_PERF_BASELINES=true to create one. ` +
          `Measured: ${result.median.wallClockMs.toFixed(1)} ms wall-clock.`,
      );
      return;
    }
    const deltaPercent =
      ((result.median.wallClockMs - result.baseline.wallClockMs) /
        result.baseline.wallClockMs) *
      100;
    if (deltaPercent > tolerance) {
      throw new Error(
        `Performance regression detected for "${result.scenarioName}"!\n` +
          ` Measured: ${result.median.wallClockMs.toFixed(1)} ms wall-clock\n` +
          ` Baseline: ${result.baseline.wallClockMs.toFixed(1)} ms wall-clock\n` +
          ` Delta: ${deltaPercent.toFixed(1)}% (tolerance: ${tolerance}%)\n` +
          ` CPU total: ${formatUs(result.median.cpuTotalUs)}\n` +
          ` EL p99: ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms\n` +
          ` Samples: ${result.samples.length} (${result.filteredSamples.length} after IQR filter)`,
      );
    }
    if (cpuTolerance !== undefined && result.cpuDeltaPercent > cpuTolerance) {
      throw new Error(
        `CPU usage regression detected for "${result.scenarioName}"!\n` +
          ` Measured: ${formatUs(result.median.cpuTotalUs)}\n` +
          ` Baseline: ${formatUs(result.baseline.cpuTotalUs)}\n` +
          ` Delta: ${result.cpuDeltaPercent.toFixed(1)}% (tolerance: ${cpuTolerance}%)\n` +
          ` Wall-clock: ${result.median.wallClockMs.toFixed(1)} ms\n` +
          ` EL p99: ${result.median.eventLoopDelayP99Ms.toFixed(1)} ms`,
      );
    }
  }

  /**
   * Update the baseline for a scenario with the current measured values.
   */
  updateScenarioBaseline(result: PerfTestResult): void {
    updatePerfBaseline(this.baselinesPath, result.scenarioName, {
      wallClockMs: result.median.wallClockMs,
      cpuTotalUs: result.median.cpuTotalUs,
      eventLoopDelayP99Ms: result.median.eventLoopDelayP99Ms,
    });
    // Reload baselines after update
    this.baselines = loadPerfBaselines(this.baselinesPath);
    console.log(
      `Updated baseline for ${result.scenarioName}: ${result.median.wallClockMs.toFixed(1)} ms`,
    );
  }

  /**
   * Generate an ASCII report with summary table and charts.
   *
   * @param results - Results to report on; defaults to every scenario run
   *                  through this harness instance.
   * @returns The report text (also written to the console).
   */
  async generateReport(results?: PerfTestResult[]): Promise<string> {
    const resultsToReport = results ?? this.allResults;
    const lines: string[] = [];
    lines.push('');
    lines.push('═══════════════════════════════════════════════════');
    lines.push(' PERFORMANCE TEST REPORT');
    lines.push('═══════════════════════════════════════════════════');
    lines.push('');
    for (const result of resultsToReport) {
      const measured = `${result.median.wallClockMs.toFixed(1)} ms`;
      const baseline = result.baseline
        ? `${result.baseline.wallClockMs.toFixed(1)} ms`
        : 'N/A';
      const delta = result.baseline
        ? `${result.deltaPercent >= 0 ? '+' : ''}${result.deltaPercent.toFixed(1)}%`
        : 'N/A';
      const status = !result.baseline
        ? 'NEW'
        : result.withinTolerance
          ? '✅'
          : '❌';
      lines.push(
        `${result.scenarioName}: ${measured} (Baseline: ${baseline}, Delta: ${delta}) ${status}`,
      );
      // Show CPU breakdown
      const cpuMs = `${(result.median.cpuTotalUs / 1000).toFixed(1)} ms`;
      lines.push(
        ` CPU: ${cpuMs} (user: ${formatUs(result.median.cpuUserUs)}, system: ${formatUs(result.median.cpuSystemUs)})`,
      );
      if (result.median.eventLoopDelayP99Ms > 0) {
        lines.push(
          ` Event loop: p50=${result.median.eventLoopDelayP50Ms.toFixed(1)}ms p95=${result.median.eventLoopDelayP95Ms.toFixed(1)}ms p99=${result.median.eventLoopDelayP99Ms.toFixed(1)}ms max=${result.median.eventLoopDelayMaxMs.toFixed(1)}ms`,
        );
      }
      // BUG FIX: the total and filtered sample counts were previously
      // concatenated with no separator ("Samples: 53 after IQR filter");
      // format now matches assertWithinBaseline's "N (M after IQR filter)".
      lines.push(
        ` Samples: ${result.samples.length} (${result.filteredSamples.length} after IQR filter)`,
      );
    }
    lines.push('');
    // Generate ASCII chart for wall-clock per scenario
    try {
      // @ts-expect-error - asciichart may not have types
      const asciichart = (await import('asciichart')) as {
        default?: { plot?: PlotFn };
        plot?: PlotFn;
      };
      const plot: PlotFn | undefined =
        asciichart.default?.plot ?? asciichart.plot;
      for (const result of resultsToReport) {
        if (result.filteredSamples.length > 2) {
          lines.push(`📈 Wall-clock trend: ${result.scenarioName}`);
          lines.push('─'.repeat(60));
          const wallClockData = result.filteredSamples.map(
            (s) => s.wallClockMs,
          );
          if (plot) {
            const chart = plot(wallClockData, {
              height: 8,
              format: (x: number) => `${x.toFixed(0)} ms`.padStart(10),
            });
            lines.push(chart);
          }
          const labels = result.filteredSamples.map((s) => s.label);
          lines.push(' ' + labels.join(' → '));
          lines.push('');
        }
      }
    } catch {
      lines.push(
        '(asciichart not available — install with: npm install --save-dev asciichart)',
      );
      lines.push('');
    }
    lines.push('═══════════════════════════════════════════════════');
    lines.push('');
    const report = lines.join('\n');
    console.log(report);
    return report;
  }

  /**
   * Filter outliers using the Interquartile Range (IQR) method.
   * Removes samples where the given metric falls outside Q1 - 1.5*IQR or Q3 + 1.5*IQR.
   */
  private filterOutliers(
    snapshots: PerfSnapshot[],
    metric: keyof PerfSnapshot,
  ): PerfSnapshot[] {
    if (snapshots.length < 4) {
      // Not enough data for meaningful IQR filtering
      return [...snapshots];
    }
    const sorted = [...snapshots].sort(
      (a, b) => (a[metric] as number) - (b[metric] as number),
    );
    const q1Idx = Math.floor(sorted.length * 0.25);
    const q3Idx = Math.floor(sorted.length * 0.75);
    const q1 = sorted[q1Idx]![metric] as number;
    const q3 = sorted[q3Idx]![metric] as number;
    const iqr = q3 - q1;
    const lowerBound = q1 - 1.5 * iqr;
    const upperBound = q3 + 1.5 * iqr;
    // Preserve original run order; only drop out-of-bounds samples.
    return snapshots.filter((s) => {
      const val = s[metric] as number;
      return val >= lowerBound && val <= upperBound;
    });
  }

  /**
   * Get the median snapshot by wall-clock time from a sorted list.
   *
   * @throws Error when the snapshot list is empty.
   */
  private getMedianSnapshot(snapshots: PerfSnapshot[]): PerfSnapshot {
    if (snapshots.length === 0) {
      throw new Error('Cannot compute median of empty snapshot list');
    }
    const sorted = [...snapshots].sort((a, b) => a.wallClockMs - b.wallClockMs);
    const medianIdx = Math.floor(sorted.length / 2);
    // Return a copy so callers can relabel without mutating the sample.
    return { ...sorted[medianIdx]! };
  }
}
// ─── Baseline management ─────────────────────────────────────────────
/**
* Load perf baselines from a JSON file.
*/
/**
 * Load perf baselines from a JSON file.
 *
 * Returns an empty version-1 structure when the file does not exist yet,
 * so first runs (and UPDATE_PERF_BASELINES runs) start from a clean slate.
 */
export function loadPerfBaselines(path: string): PerfBaselineFile {
  if (existsSync(path)) {
    const content = readFileSync(path, 'utf-8');
    return JSON.parse(content) as PerfBaselineFile;
  }
  return {
    version: 1,
    updatedAt: new Date().toISOString(),
    scenarios: {},
  };
}
/**
* Save perf baselines to a JSON file.
*/
/**
 * Save perf baselines to a JSON file.
 *
 * Note: intentionally stamps `updatedAt` on the caller's object before
 * serializing, so the file always records when it was last regenerated.
 */
export function savePerfBaselines(
  path: string,
  baselines: PerfBaselineFile,
): void {
  baselines.updatedAt = new Date().toISOString();
  const serialized = `${JSON.stringify(baselines, null, 2)}\n`;
  writeFileSync(path, serialized);
}
/**
* Update (or create) a single scenario baseline in the file.
*/
/**
 * Update (or create) a single scenario baseline in the file.
 *
 * Reads the current file, replaces the entry for `scenarioName` with the
 * measured values (stamped with the current time), and writes it back.
 */
export function updatePerfBaseline(
  path: string,
  scenarioName: string,
  measured: {
    wallClockMs: number;
    cpuTotalUs: number;
    eventLoopDelayP99Ms: number;
  },
): void {
  const file = loadPerfBaselines(path);
  const { wallClockMs, cpuTotalUs, eventLoopDelayP99Ms } = measured;
  file.scenarios[scenarioName] = {
    wallClockMs,
    cpuTotalUs,
    eventLoopDelayP99Ms,
    timestamp: new Date().toISOString(),
  };
  savePerfBaselines(path, file);
}
// ─── Helpers ─────────────────────────────────────────────────────────
/**
* Format microseconds as a human-readable string.
*/
/**
 * Format microseconds as a human-readable string.
 *
 * Values above 1s render in seconds, above 1ms in milliseconds,
 * otherwise raw microseconds.
 */
function formatUs(us: number): string {
  if (us > 1_000_000) {
    return `${(us / 1_000_000).toFixed(2)} s`;
  }
  return us > 1_000 ? `${(us / 1_000).toFixed(1)} ms` : `${us} μs`;
}

121
perf-tests/README.md Normal file
View file

@ -0,0 +1,121 @@
# CPU Performance Integration Test Harness
## Overview
This directory contains performance/CPU integration tests for the Gemini CLI.
These tests measure wall-clock time, CPU usage, and event loop responsiveness to
detect regressions across key scenarios.
CPU performance is inherently noisy, especially in CI. The harness addresses
this with:
- **IQR outlier filtering** — discards anomalous samples
- **Median sampling** — takes N runs, reports the median after filtering
- **Warmup runs** — discards the first run to mitigate JIT compilation noise
- **15% default tolerance** — won't panic at slight regressions
## Running
```bash
# Run tests (compare against committed baselines)
npm run test:perf
# Update baselines (after intentional changes)
npm run test:perf:update-baselines
# Verbose output
VERBOSE=true npm run test:perf
# Keep test artifacts for debugging
KEEP_OUTPUT=true npm run test:perf
```
## How It Works
### Measurement Primitives
The `PerfTestHarness` class (in `packages/test-utils`) provides:
- **`performance.now()`** — high-resolution wall-clock timing
- **`process.cpuUsage()`** — user + system CPU microseconds (delta between
start/stop)
- **`perf_hooks.monitorEventLoopDelay()`** — event loop delay histogram
(p50/p95/p99/max)
### Noise Reduction
1. **Warmup**: First run is discarded to mitigate JIT compilation artifacts
2. **Multiple samples**: Each scenario runs N times (default 5)
3. **IQR filtering**: Samples outside Q1−1.5×IQR and Q3+1.5×IQR are discarded
4. **Median**: The median of remaining samples is used for comparison
### Baseline Management
Baselines are stored in `baselines.json` in this directory. Each scenario has:
```json
{
"cold-startup-time": {
"wallClockMs": 1234.5,
"cpuTotalUs": 567890,
"eventLoopDelayP99Ms": 12.3,
"timestamp": "2026-04-08T..."
}
}
```
Tests fail if the measured value exceeds `baseline × 1.15` (15% tolerance).
To recalibrate after intentional changes:
```bash
npm run test:perf:update-baselines
# then commit baselines.json
```
### Report Output
After all tests, the harness prints an ASCII summary:
```
═══════════════════════════════════════════════════
PERFORMANCE TEST REPORT
═══════════════════════════════════════════════════
cold-startup-time: 1234.5 ms (Baseline: 1200.0 ms, Delta: +2.9%) ✅
idle-cpu-usage: 2.1 % (Baseline: 2.0 %, Delta: +5.0%) ✅
skill-loading-time: 1567.8 ms (Baseline: 1500.0 ms, Delta: +4.5%) ✅
```
## Architecture
```
perf-tests/
├── README.md ← you are here
├── baselines.json ← committed baseline values
├── globalSetup.ts ← test environment setup
├── perf-usage.test.ts ← test scenarios
├── perf.*.responses ← fake API responses per scenario
├── tsconfig.json ← TypeScript config
└── vitest.config.ts ← vitest config (serial, isolated)
packages/test-utils/src/
├── perf-test-harness.ts ← PerfTestHarness class
└── index.ts ← re-exports
```
## CI Integration
These tests are **excluded from `preflight`** and designed for nightly CI:
```yaml
- name: Performance regression tests
run: npm run test:perf
```
## Adding a New Scenario
1. Add a fake response file: `perf.<scenario-name>.responses`
2. Add a test case in `perf-usage.test.ts` using `harness.runScenario()`
3. Run `npm run test:perf:update-baselines` to establish initial baseline
4. Commit the updated `baselines.json`

24
perf-tests/baselines.json Normal file
View file

@ -0,0 +1,24 @@
{
"version": 1,
"updatedAt": "2026-04-08T18:51:29.839Z",
"scenarios": {
"cold-startup-time": {
"wallClockMs": 1333.4230420000004,
"cpuTotalUs": 1711,
"eventLoopDelayP99Ms": 0,
"timestamp": "2026-04-08T18:50:58.124Z"
},
"idle-cpu-usage": {
"wallClockMs": 5001.926125,
"cpuTotalUs": 128518,
"eventLoopDelayP99Ms": 12.705791,
"timestamp": "2026-04-08T18:51:23.938Z"
},
"skill-loading-time": {
"wallClockMs": 1372.4463749999995,
"cpuTotalUs": 1550,
"eventLoopDelayP99Ms": 0,
"timestamp": "2026-04-08T18:51:29.839Z"
}
}
}

67
perf-tests/globalSetup.ts Normal file
View file

@ -0,0 +1,67 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { mkdir, readdir, rm } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { canUseRipgrep } from '../packages/core/src/tools/ripGrep.js';
import { isolateTestEnv } from '../packages/test-utils/src/env-setup.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const rootDir = join(__dirname, '..');
// Git-ignored scratch area; each perf run gets its own timestamped subdirectory.
const perfTestsDir = join(rootDir, '.perf-tests');
// Number of most-recent run directories preserved for debugging.
const KEEP_RUNS_COUNT = 5;
// Set by setup(); removed by teardown() unless KEEP_OUTPUT=true.
let runDir = '';
/**
 * Vitest global setup for the perf test suite: creates an isolated,
 * timestamped run directory, redirects environment variables into it,
 * ensures ripgrep is available, and prunes stale run directories.
 */
export async function setup() {
  runDir = join(perfTestsDir, `${Date.now()}`);
  await mkdir(runDir, { recursive: true });

  // Isolate environment variables
  isolateTestEnv(runDir);

  // Download ripgrep up front so parallel tests don't race on it.
  if (!(await canUseRipgrep())) {
    throw new Error('Failed to download ripgrep binary');
  }

  // Clean up old test runs, keeping the latest few for debugging.
  // Timestamped names sort lexicographically in creation order.
  try {
    const runs = await readdir(perfTestsDir);
    if (runs.length > KEEP_RUNS_COUNT) {
      const stale = runs.sort().slice(0, runs.length - KEEP_RUNS_COUNT);
      await Promise.all(
        stale.map((name) =>
          rm(join(perfTestsDir, name), { recursive: true, force: true }),
        ),
      );
    }
  } catch (e) {
    // Best-effort cleanup; a failure here must not abort the test run.
    console.error('Error cleaning up old perf test runs:', e);
  }

  process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
  process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';
  console.log(`\nPerf test output directory: ${runDir}`);
}
/**
 * Vitest global teardown: removes the run directory created by setup(),
 * unless KEEP_OUTPUT=true or setup() never ran.
 */
export async function teardown() {
  const keep = process.env['KEEP_OUTPUT'] === 'true';
  if (keep || !runDir) {
    return;
  }
  try {
    await rm(runDir, { recursive: true, force: true });
  } catch (e) {
    // Non-fatal: leftover scratch directories are cleaned by the next setup().
    console.warn('Failed to clean up perf test directory:', e);
  }
}

View file

@ -0,0 +1,153 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, beforeAll, afterAll } from 'vitest';
import { TestRig, PerfTestHarness } from '@google/gemini-cli-test-utils';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
const __dirname = dirname(fileURLToPath(import.meta.url));
// Committed baseline values that each scenario is compared against.
const BASELINES_PATH = join(__dirname, 'baselines.json');
// When true, measured values overwrite baselines.json instead of being asserted.
const UPDATE_BASELINES = process.env['UPDATE_PERF_BASELINES'] === 'true';
// Allowed wall-clock regression (percent) before a scenario fails.
const TOLERANCE_PERCENT = 15;
// Use fewer samples locally for faster iteration, more in CI
const SAMPLE_COUNT = process.env['CI'] ? 5 : 3;
// Warmup runs discarded before sampling to mitigate JIT noise.
const WARMUP_COUNT = 1;
describe('CPU Performance Tests', () => {
  let harness: PerfTestHarness;

  beforeAll(() => {
    harness = new PerfTestHarness({
      baselinesPath: BASELINES_PATH,
      defaultTolerancePercent: TOLERANCE_PERCENT,
      sampleCount: SAMPLE_COUNT,
      warmupCount: WARMUP_COUNT,
    });
  });

  afterAll(async () => {
    // Generate the summary report after all tests
    await harness.generateReport();
  });

  /**
   * Records the scenario result as the new baseline when
   * UPDATE_PERF_BASELINES=true; otherwise asserts the result stays within
   * tolerance of the committed baseline. Extracted so each scenario shares
   * one finalization path instead of repeating the branch.
   */
  const finalizeScenario = (
    result: Awaited<ReturnType<PerfTestHarness['runScenario']>>,
  ) => {
    if (UPDATE_BASELINES) {
      harness.updateScenarioBaseline(result);
    } else {
      harness.assertWithinBaseline(result);
    }
  };

  it('cold-startup-time: startup completes within baseline', async () => {
    const result = await harness.runScenario('cold-startup-time', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-cold-startup', {
          fakeResponsesPath: join(__dirname, 'perf.cold-startup.responses'),
        });
        return await harness.measure('cold-startup', async () => {
          await rig.run({
            args: ['hello'],
            timeout: 120000,
            env: { GEMINI_API_KEY: 'fake-perf-test-key' },
          });
        });
      } finally {
        await rig.cleanup();
      }
    });
    finalizeScenario(result);
  });

  it('idle-cpu-usage: CPU stays low when idle', async () => {
    const IDLE_OBSERVATION_MS = 5000;
    const result = await harness.runScenario('idle-cpu-usage', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-idle-cpu', {
          fakeResponsesPath: join(__dirname, 'perf.idle-cpu.responses'),
        });
        // First, run a prompt to get the CLI into idle state
        await rig.run({
          args: ['hello'],
          timeout: 120000,
          env: { GEMINI_API_KEY: 'fake-perf-test-key' },
        });
        // Now measure CPU during idle period in the test process
        return await harness.measureWithEventLoop('idle-cpu', async () => {
          // Simulate idle period — just wait
          const { setTimeout: sleep } = await import('node:timers/promises');
          await sleep(IDLE_OBSERVATION_MS);
        });
      } finally {
        await rig.cleanup();
      }
    });
    finalizeScenario(result);
  });

  it('skill-loading-time: startup with many skills within baseline', async () => {
    const SKILL_COUNT = 20;
    const result = await harness.runScenario('skill-loading-time', async () => {
      const rig = new TestRig();
      try {
        rig.setup('perf-skill-loading', {
          fakeResponsesPath: join(__dirname, 'perf.skill-loading.responses'),
        });
        // Create many skill directories with SKILL.md files
        for (let i = 0; i < SKILL_COUNT; i++) {
          const skillDir = `.gemini/skills/perf-skill-${i}`;
          rig.mkdir(skillDir);
          rig.createFile(
            `${skillDir}/SKILL.md`,
            [
              '---',
              `name: perf-skill-${i}`,
              `description: Performance test skill number ${i}`,
              `activation: manual`,
              '---',
              '',
              `# Performance Test Skill ${i}`,
              '',
              `This is a test skill for measuring skill loading performance.`,
              `It contains some content to simulate real-world skill files.`,
              '',
              `## Usage`,
              '',
              `Use this skill by activating it with @perf-skill-${i}.`,
            ].join('\n'),
          );
        }
        return await harness.measure('skill-loading', async () => {
          await rig.run({
            args: ['hello'],
            timeout: 120000,
            env: { GEMINI_API_KEY: 'fake-perf-test-key' },
          });
        });
      } finally {
        await rig.cleanup();
      }
    });
    finalizeScenario(result);
  });
});

View file

@ -0,0 +1,2 @@
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help. What would you like to work on?"}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":12,"totalTokenCount":17,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

View file

@ -0,0 +1,2 @@
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to help."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":8,"totalTokenCount":13,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

View file

@ -0,0 +1,2 @@
{"method":"generateContent","response":{"candidates":[{"content":{"parts":[{"text":"0"}],"role":"model"},"finishReason":"STOP","index":0}]}}
{"method":"generateContentStream","response":[{"candidates":[{"content":{"parts":[{"text":"Hello! I'm ready to assist you with your project."}],"role":"model"},"finishReason":"STOP","index":0}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":10,"totalTokenCount":15,"promptTokensDetails":[{"modality":"TEXT","tokenCount":5}]}}]}

12
perf-tests/tsconfig.json Normal file
View file

@ -0,0 +1,12 @@
{
"extends": "../tsconfig.json",
"compilerOptions": {
"noEmit": true,
"allowJs": true
},
"include": ["**/*.ts"],
"references": [
{ "path": "../packages/core" },
{ "path": "../packages/test-utils" }
]
}

View file

@ -0,0 +1,27 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { defineConfig } from 'vitest/config';
// 10 minutes: performance profiling needs time for multiple samples.
const PERF_TEST_TIMEOUT_MS = 600_000;

/**
 * Vitest configuration for the performance suite. Test files are forced into
 * a single serial fork so per-test CPU readings are not skewed by sibling
 * test processes competing for cores.
 */
export default defineConfig({
  test: {
    testTimeout: PERF_TEST_TIMEOUT_MS,
    globalSetup: './globalSetup.ts',
    reporters: ['default'],
    include: ['**/*.test.ts'],
    // No retries — noise is handled by IQR filtering and tolerance.
    retry: 0,
    // Must run serially to avoid CPU contention.
    fileParallelism: false,
    pool: 'forks',
    poolOptions: {
      forks: {
        // Single process for accurate per-test CPU readings.
        singleFork: true,
      },
    },
    env: {
      GEMINI_TEST_TYPE: 'perf',
    },
  },
});