mirror of
https://github.com/google-gemini/gemini-cli
synced 2026-04-21 13:37:17 +00:00
fix(evals): remove tool restrictions and add compile-time guards (#23312)
This commit is contained in:
parent
28935d1e6b
commit
d3766875f8
5 changed files with 43 additions and 68 deletions
|
|
@ -15,9 +15,26 @@ import fs from 'node:fs';
|
|||
import path from 'node:path';
|
||||
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
|
||||
|
||||
/**
|
||||
* Config overrides for evals, with tool-restriction fields explicitly
|
||||
* forbidden. Evals must test against the full, default tool set to ensure
|
||||
* realistic behavior.
|
||||
*
* Each forbidden field is declared as an optional `never`, so giving it a
* value (for example a list of tool names) is a type error. Every other key
* is accepted as `unknown` through the index signature.
*/
|
||||
interface EvalConfigOverrides {
|
||||
/** Restricting tools via excludeTools in evals is forbidden. */
|
||||
excludeTools?: never;
|
||||
/** Restricting tools via coreTools in evals is forbidden. */
|
||||
coreTools?: never;
|
||||
/** Restricting tools via allowedTools in evals is forbidden. */
|
||||
allowedTools?: never;
|
||||
/** Restricting tools via mainAgentTools in evals is forbidden. */
|
||||
mainAgentTools?: never;
|
||||
/** Any other override key is allowed; its value is typed as `unknown`. */
[key: string]: unknown;
|
||||
}
|
||||
|
||||
export interface AppEvalCase {
|
||||
name: string;
|
||||
configOverrides?: any;
|
||||
configOverrides?: EvalConfigOverrides;
|
||||
prompt: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@ describe('generalist_delegation', () => {
|
|||
experimental: {
|
||||
enableAgents: true,
|
||||
},
|
||||
excludeTools: ['run_shell_command'],
|
||||
},
|
||||
files: {
|
||||
'file1.ts': 'console.log("no semi")',
|
||||
|
|
@ -65,7 +64,6 @@ describe('generalist_delegation', () => {
|
|||
experimental: {
|
||||
enableAgents: true,
|
||||
},
|
||||
excludeTools: ['run_shell_command'],
|
||||
},
|
||||
files: {
|
||||
'src/a.ts': 'export const a = 1;',
|
||||
|
|
@ -106,7 +104,6 @@ describe('generalist_delegation', () => {
|
|||
experimental: {
|
||||
enableAgents: true,
|
||||
},
|
||||
excludeTools: ['run_shell_command'],
|
||||
},
|
||||
files: {
|
||||
'README.md': 'This is a proyect.',
|
||||
|
|
@ -141,7 +138,6 @@ describe('generalist_delegation', () => {
|
|||
experimental: {
|
||||
enableAgents: true,
|
||||
},
|
||||
excludeTools: ['run_shell_command'],
|
||||
},
|
||||
files: {
|
||||
'src/VERSION': '1.2.3',
|
||||
|
|
|
|||
|
|
@ -15,7 +15,6 @@ describe('Model Steering Behavioral Evals', () => {
|
|||
appEvalTest('USUALLY_PASSES', {
|
||||
name: 'Corrective Hint: Model switches task based on hint during tool turn',
|
||||
configOverrides: {
|
||||
excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
|
||||
modelSteering: true,
|
||||
},
|
||||
files: {
|
||||
|
|
@ -55,7 +54,6 @@ describe('Model Steering Behavioral Evals', () => {
|
|||
appEvalTest('USUALLY_PASSES', {
|
||||
name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
|
||||
configOverrides: {
|
||||
excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
|
||||
modelSteering: true,
|
||||
},
|
||||
files: {},
|
||||
|
|
|
|||
|
|
@ -16,9 +16,7 @@ describe('save_memory', () => {
|
|||
const rememberingFavoriteColor = "Agent remembers user's favorite color";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingFavoriteColor,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
|
||||
prompt: `remember that my favorite color is blue.
|
||||
|
||||
what is my favorite color? tell me that and surround it with $ symbol`,
|
||||
|
|
@ -38,9 +36,7 @@ describe('save_memory', () => {
|
|||
const rememberingCommandRestrictions = 'Agent remembers command restrictions';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: rememberingCommandRestrictions,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
|
||||
prompt: `I don't want you to ever run npm commands.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
|
|
@ -59,9 +55,7 @@ describe('save_memory', () => {
|
|||
const rememberingWorkflow = 'Agent remembers workflow preferences';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: rememberingWorkflow,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
|
||||
prompt: `I want you to always lint after building.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
|
|
@ -81,9 +75,7 @@ describe('save_memory', () => {
|
|||
'Agent ignores temporary conversation details';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: ignoringTemporaryInformation,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
|
||||
prompt: `I'm going to get a coffee.`,
|
||||
assert: async (rig, result) => {
|
||||
await rig.waitForTelemetryReady();
|
||||
|
|
@ -106,9 +98,7 @@ describe('save_memory', () => {
|
|||
const rememberingPetName = "Agent remembers user's pet's name";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingPetName,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
|
||||
prompt: `Please remember that my dog's name is Buddy.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
|
|
@ -127,9 +117,7 @@ describe('save_memory', () => {
|
|||
const rememberingCommandAlias = 'Agent remembers custom command aliases';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingCommandAlias,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
|
||||
prompt: `When I say 'start server', you should run 'npm run dev'.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
|
|
@ -149,18 +137,6 @@ describe('save_memory', () => {
|
|||
"Agent ignores workspace's database schema location";
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: ignoringDbSchemaLocation,
|
||||
params: {
|
||||
settings: {
|
||||
tools: {
|
||||
core: [
|
||||
'save_memory',
|
||||
'list_directory',
|
||||
'read_file',
|
||||
'run_shell_command',
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
|
||||
assert: async (rig, result) => {
|
||||
await rig.waitForTelemetryReady();
|
||||
|
|
@ -180,9 +156,7 @@ describe('save_memory', () => {
|
|||
"Agent remembers user's coding style preference";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingCodingStyle,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
|
||||
prompt: `I prefer to use tabs instead of spaces for indentation.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
|
|
@ -202,18 +176,6 @@ describe('save_memory', () => {
|
|||
'Agent ignores workspace build artifact location';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: ignoringBuildArtifactLocation,
|
||||
params: {
|
||||
settings: {
|
||||
tools: {
|
||||
core: [
|
||||
'save_memory',
|
||||
'list_directory',
|
||||
'read_file',
|
||||
'run_shell_command',
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
|
||||
assert: async (rig, result) => {
|
||||
await rig.waitForTelemetryReady();
|
||||
|
|
@ -232,18 +194,6 @@ describe('save_memory', () => {
|
|||
const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: ignoringMainEntryPoint,
|
||||
params: {
|
||||
settings: {
|
||||
tools: {
|
||||
core: [
|
||||
'save_memory',
|
||||
'list_directory',
|
||||
'read_file',
|
||||
'run_shell_command',
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
prompt: `The main entry point for this workspace is \`src/index.js\`.`,
|
||||
assert: async (rig, result) => {
|
||||
await rig.waitForTelemetryReady();
|
||||
|
|
@ -262,9 +212,7 @@ describe('save_memory', () => {
|
|||
const rememberingBirthday = "Agent remembers user's birthday";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingBirthday,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
|
||||
prompt: `My birthday is on June 15th.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
|
|
|
|||
|
|
@ -197,9 +197,25 @@ export function symlinkNodeModules(testDir: string) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Settings that are forbidden in evals. Evals should never restrict which
|
||||
* tools are available — they must test against the full, default tool set
|
||||
* to ensure realistic behavior.
|
||||
*
* Only `tools.core` is ruled out by this type: it is an optional `never`, so
* giving it a value is a type error. Other keys under `tools` are accepted as
* `unknown` through the index signature.
*/
|
||||
interface ForbiddenToolSettings {
|
||||
tools?: {
|
||||
/** Restricting core tools in evals is forbidden. */
|
||||
core?: never;
|
||||
/** Any other key under `tools` is allowed; its value is typed as `unknown`. */
[key: string]: unknown;
|
||||
};
|
||||
}
|
||||
|
||||
export interface EvalCase {
|
||||
name: string;
|
||||
params?: Record<string, any>;
|
||||
params?: {
|
||||
settings?: ForbiddenToolSettings & Record<string, unknown>;
|
||||
[key: string]: unknown;
|
||||
};
|
||||
prompt: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
|
|
|
|||
Loading…
Reference in a new issue