n8n/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/argument-parser.ts
oleg f880a74d99
refactor(ai-builder): Implement unified evaluations harness (#23955)
Signed-off-by: Oleg Ivaniv <me@olegivaniv.com>
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
2026-01-13 12:11:13 +00:00

453 lines
12 KiB
TypeScript

/* eslint-disable @typescript-eslint/naming-convention */
import { z } from 'zod';
import type { BuilderFeatureFlags } from '../../src/workflow-builder-agent.js';
import type { LangsmithExampleFilters } from '../harness/harness-types';
import { DEFAULTS } from '../support/constants';
/** Evaluation suites that can be selected on the command line (the `suite` option). */
export type EvaluationSuite = 'llm-judge' | 'pairwise' | 'programmatic' | 'similarity';
/** Backends that can be selected on the command line (the `backend` option). */
export type EvaluationBackend = 'local' | 'langsmith';
/**
 * Validated evaluation options, as returned by `parseEvaluationArgs`.
 *
 * Fields marked optional are `undefined` when the corresponding flag was not supplied.
 */
export interface EvaluationArgs {
/** Evaluation suite to run (`--suite`, alias `--mode`); the CLI schema defaults it to 'llm-judge'. */
suite: EvaluationSuite;
/** Evaluation backend (`--backend`); defaults to 'local', and `--langsmith` forces 'langsmith'. */
backend: EvaluationBackend;
/** Verbose logging (`--verbose`, alias `-v`). */
verbose: boolean;
/** Repeat each example N times (`--repetitions`); positive integer. */
repetitions: number;
/** Max parallel evaluations (`--concurrency`); positive integer. */
concurrency: number;
/** Timeout per evaluation in milliseconds (`--timeout-ms`); positive integer. */
timeoutMs: number;
/** Experiment name (`--name`). */
experimentName?: string;
/** Directory for artifacts (`--output-dir`). */
outputDir?: string;
/** LangSmith dataset name (`--dataset`). */
datasetName?: string;
/** Limit on the number of examples (`--max-examples`); positive integer. */
maxExamples?: number;
/** Example filters assembled from `--filter`, `--notion-id` and `--technique`; undefined when none apply. */
filters?: LangsmithExampleFilters;
/** ID of a specific default test case to run (`--test-case`). */
testCase?: string;
/** CSV file with prompts (`--prompts-csv`). */
promptsCsv?: string;
/** Single prompt to evaluate (`--prompt`). */
prompt?: string;
/** Requirements the workflow must satisfy (`--dos`). */
dos?: string;
/** Things the workflow must avoid (`--donts`). */
donts?: string;
/** Number of LLM judges (`--judges`); positive integer. */
numJudges: number;
/** Workflow generations per prompt (`--generations`); positive integer, at most 10 per the CLI schema. */
numGenerations: number;
/** Builder feature flags; undefined when no feature was enabled via flag or environment. */
featureFlags?: BuilderFeatureFlags;
}
/** How a flag consumes argv: 'boolean' flags are bare switches, 'string' flags take a value. */
type CliValueKind = 'boolean' | 'string';
/** Section of the help output under which a flag is listed. */
type FlagGroup = 'input' | 'eval' | 'pairwise' | 'langsmith' | 'output' | 'feature' | 'advanced';
/**
 * Zod schema applied to the raw values collected by `parseCli`.
 *
 * It supplies defaults for omitted options and validates the ones that were given.
 */
const cliSchema = z
.object({
suite: z.enum(['llm-judge', 'pairwise', 'programmatic', 'similarity']).default('llm-judge'),
backend: z.enum(['local', 'langsmith']).default('local'),
verbose: z.boolean().default(false),
// Numeric options are collected from argv as strings, hence `z.coerce.number()`.
repetitions: z.coerce.number().int().positive().default(DEFAULTS.REPETITIONS),
concurrency: z.coerce.number().int().positive().default(DEFAULTS.CONCURRENCY),
timeoutMs: z.coerce.number().int().positive().default(DEFAULTS.TIMEOUT_MS),
experimentName: z.string().min(1).optional(),
outputDir: z.string().min(1).optional(),
datasetName: z.string().min(1).optional(),
maxExamples: z.coerce.number().int().positive().optional(),
// `--filter` is repeatable; `parseCli` accumulates its values into an array.
filter: z.array(z.string().min(1)).default([]),
// `notionId` and `technique` are folded into the filters object by `parseFilters`.
notionId: z.string().min(1).optional(),
technique: z.string().min(1).optional(),
testCase: z.string().min(1).optional(),
promptsCsv: z.string().min(1).optional(),
prompt: z.string().min(1).optional(),
dos: z.string().min(1).optional(),
donts: z.string().min(1).optional(),
numJudges: z.coerce.number().int().positive().default(DEFAULTS.NUM_JUDGES),
numGenerations: z.coerce.number().int().positive().max(10).default(DEFAULTS.NUM_GENERATIONS),
// Shorthand switch; `parseEvaluationArgs` translates it into `backend: 'langsmith'`.
langsmith: z.boolean().optional(),
templateExamples: z.boolean().default(false),
})
// Reject any key that is not declared above.
.strict();
/** Any key accepted by `cliSchema`. */
type CliKey = keyof z.infer<typeof cliSchema>;
/** Describes one CLI flag: the schema key it fills, how it consumes argv, its help text and help section. */
type FlagDef = { key: CliKey; kind: CliValueKind; desc: string; group: FlagGroup };
/**
 * Canonical flag table, keyed by the flag as typed on the command line.
 *
 * `formatHelp` lists these entries per group in the order they are declared here,
 * and `FLAG_TO_KEY` (the parsing lookup) is built from this table.
 * Flags with `kind: 'string'` take a value, including the numeric ones,
 * which `cliSchema` coerces afterwards.
 */
const FLAG_DEFS: Record<string, FlagDef> = {
// Input sources
'--prompt': { key: 'prompt', kind: 'string', group: 'input', desc: 'Single prompt to evaluate' },
'--prompts-csv': {
key: 'promptsCsv',
kind: 'string',
group: 'input',
desc: 'CSV file with prompts',
},
'--test-case': {
key: 'testCase',
kind: 'string',
group: 'input',
desc: 'Run specific default test case by ID',
},
'--dataset': {
key: 'datasetName',
kind: 'string',
group: 'input',
desc: 'LangSmith dataset name',
},
// Evaluation options
'--suite': {
key: 'suite',
kind: 'string',
group: 'eval',
desc: 'Evaluation suite (llm-judge|pairwise|programmatic|similarity)',
},
'--backend': { key: 'backend', kind: 'string', group: 'eval', desc: 'Backend (local|langsmith)' },
'--max-examples': {
key: 'maxExamples',
kind: 'string',
group: 'eval',
desc: 'Limit number of examples',
},
'--repetitions': {
key: 'repetitions',
kind: 'string',
group: 'eval',
desc: 'Repeat each example N times',
},
'--concurrency': {
key: 'concurrency',
kind: 'string',
group: 'eval',
desc: 'Max parallel evaluations',
},
'--timeout-ms': {
key: 'timeoutMs',
kind: 'string',
group: 'eval',
desc: 'Timeout per evaluation (ms)',
},
// Pairwise options
'--dos': {
key: 'dos',
kind: 'string',
group: 'pairwise',
desc: 'Requirements the workflow must satisfy',
},
'--donts': {
key: 'donts',
kind: 'string',
group: 'pairwise',
desc: 'Things the workflow must avoid',
},
// LangSmith options
'--langsmith': {
key: 'langsmith',
kind: 'boolean',
group: 'langsmith',
desc: 'Shorthand for --backend langsmith',
},
'--name': { key: 'experimentName', kind: 'string', group: 'langsmith', desc: 'Experiment name' },
'--filter': {
key: 'filter',
kind: 'string',
group: 'langsmith',
desc: 'Filter examples (key:value, repeatable)',
},
'--notion-id': {
key: 'notionId',
kind: 'string',
group: 'langsmith',
desc: 'Filter by Notion ID',
},
'--technique': {
key: 'technique',
kind: 'string',
group: 'langsmith',
desc: 'Filter by technique',
},
// Output
'--output-dir': {
key: 'outputDir',
kind: 'string',
group: 'output',
desc: 'Directory for artifacts',
},
'--verbose': { key: 'verbose', kind: 'boolean', group: 'output', desc: 'Verbose logging' },
// Feature flags
'--template-examples': {
key: 'templateExamples',
kind: 'boolean',
group: 'feature',
desc: 'Enable template examples phase',
},
// Advanced
'--judges': { key: 'numJudges', kind: 'string', group: 'advanced', desc: 'Number of LLM judges' },
'--generations': {
key: 'numGenerations',
kind: 'string',
group: 'advanced',
desc: 'Workflow generations per prompt',
},
};
// Aliases: alternative spellings, each mapped to the canonical flag it stands for.
// They are not shown in help because `formatHelp` only iterates FLAG_DEFS.
const FLAG_ALIASES: Record<string, string> = {
'--mode': '--suite',
'-v': '--verbose',
};
// Parsing lookup: starts as a copy of the canonical flags, then every alias is
// registered under its own spelling, pointing at its target's definition.
const FLAG_TO_KEY: Record<string, FlagDef> = { ...FLAG_DEFS };
for (const [alias, target] of Object.entries(FLAG_ALIASES)) {
	FLAG_TO_KEY[alias] = FLAG_DEFS[target];
}
/**
 * Lists every recognised long (`--`-prefixed) flag, aliases included, sorted and
 * newline-separated, for use in the "unknown flag" error message.
 */
function formatValidFlags(): string {
	const longFlags: string[] = [];
	for (const name of Object.keys(FLAG_TO_KEY)) {
		// Short aliases such as `-v` are left out of the listing.
		if (name.startsWith('--')) longFlags.push(name);
	}
	longFlags.sort();
	return longFlags.join('\n ');
}
/** Section heading printed by `formatHelp` for each flag group. */
const GROUP_TITLES: Record<FlagGroup, string> = {
input: 'Input Sources',
eval: 'Evaluation Options',
pairwise: 'Pairwise Options',
langsmith: 'LangSmith Options',
output: 'Output',
feature: 'Feature Flags',
advanced: 'Advanced',
};
/**
 * Builds the full help text: a usage banner, one section per flag group
 * (in a fixed order, skipping groups with no flags) and a list of examples.
 *
 * @returns The help text as a single newline-joined string.
 */
function formatHelp(): string {
	const sectionOrder: FlagGroup[] = [
		'input',
		'eval',
		'pairwise',
		'langsmith',
		'output',
		'feature',
		'advanced',
	];
	const allFlags = Object.entries(FLAG_DEFS);

	const sections = sectionOrder.flatMap((group) => {
		const rows = allFlags
			.filter(([, def]) => def.group === group)
			.map(([flag, def]) => {
				// Value-taking flags get a `<value>` hint; the left column is padded to 28 characters.
				const left = def.kind === 'string' ? ` ${flag} <value>` : ` ${flag}`;
				return left.padEnd(28) + def.desc;
			});
		// A section is a heading, its rows and a trailing blank line; empty groups produce nothing.
		return rows.length === 0 ? [] : [`${GROUP_TITLES[group]}:`, ...rows, ''];
	});

	return [
		'Usage: pnpm eval [options]',
		'',
		'Evaluation harness for AI Workflow Builder.',
		'',
		...sections,
		'Examples:',
		' pnpm eval --verbose',
		' pnpm eval --prompt "Create a Slack notification workflow"',
		' pnpm eval --prompts-csv my-prompts.csv --max-examples 5',
		' pnpm eval:langsmith --dataset "workflow-builder-canvas-prompts" --name "test-run"',
	].join('\n');
}
/** Writes the help text produced by `formatHelp` to standard output. */
export function printHelp(): void {
	const helpText = formatHelp();
	console.log(helpText);
}
/**
 * Returns the token that follows position `i` in `argv`, i.e. the value of a
 * flag written as `--flag value`.
 *
 * @param argv - Full argument list.
 * @param i - Index of the flag token.
 * @param flag - Flag name, used only in the error message.
 * @throws Error when there is no token after the flag.
 */
function ensureValue(argv: string[], i: number, flag: string): string {
	const candidate = argv[i + 1];
	if (candidate !== undefined) return candidate;
	throw new Error(`Flag ${flag} requires a value`);
}
/**
 * Splits a `--flag=value` token at its first `=` into the flag and its inline value.
 *
 * Tokens that do not start with `--`, or that contain no `=`, are returned
 * unchanged as the flag with no inline value.
 */
function splitFlagToken(token: string): { flag: string; inlineValue?: string } {
	const eq = token.startsWith('--') ? token.indexOf('=') : -1;
	if (eq < 0) return { flag: token };
	return { flag: token.slice(0, eq), inlineValue: token.slice(eq + 1) };
}
/** Type guard: true when `value` is an array in which no element is a non-string. */
function isStringArray(value: unknown): value is string[] {
	if (!Array.isArray(value)) return false;
	return !value.some((item) => typeof item !== 'string');
}
/**
 * Scans `argv` and collects raw, not yet validated, option values keyed by schema key.
 *
 * - Tokens that do not start with `-` are skipped.
 * - Value-taking flags accept both `--flag value` and `--flag=value`.
 * - Boolean flags are switches: a bare `--flag` (or `--flag=true`) stores `true`,
 *   and `--flag=false` stores `false`. Any other inline value is rejected rather
 *   than being silently ignored.
 * - `--filter` is repeatable; its values accumulate into an array.
 *
 * @param argv - Command-line arguments, without the node/script prefix.
 * @returns The collected values and the set of schema keys that were explicitly given.
 * @throws Error on an unknown flag, a value-taking flag with no value,
 *   or a boolean flag with an inline value other than `true`/`false`.
 */
function parseCli(argv: string[]): {
	values: Partial<Record<CliKey, unknown>>;
	seenKeys: Set<CliKey>;
} {
	const values: Partial<Record<CliKey, unknown>> = {};
	const seenKeys = new Set<CliKey>();
	for (let i = 0; i < argv.length; i++) {
		const token = argv[i];
		if (!token.startsWith('-')) continue;
		const { flag, inlineValue } = splitFlagToken(token);
		const def = FLAG_TO_KEY[flag];
		if (!def) {
			throw new Error(`Unknown flag: ${flag}\n\nValid flags:\n ${formatValidFlags()}`);
		}
		seenKeys.add(def.key);
		if (def.kind === 'boolean') {
			// Honour an explicit inline value so that `--verbose=false` does not enable the flag.
			if (inlineValue === undefined || inlineValue === 'true') {
				values[def.key] = true;
			} else if (inlineValue === 'false') {
				values[def.key] = false;
			} else {
				throw new Error(
					`Flag ${flag} is a boolean switch; expected "true" or "false" but got "${inlineValue}"`,
				);
			}
			continue;
		}
		const value = inlineValue ?? ensureValue(argv, i, flag);
		// The value was taken from the next token, so step over it.
		if (inlineValue === undefined) i++;
		if (def.key === 'filter') {
			const existing = values.filter;
			values.filter = isStringArray(existing) ? [...existing, value] : [value];
			continue;
		}
		values[def.key] = value;
	}
	return { values, seenKeys };
}
/**
 * Resolves builder feature flags from the CLI switch and the environment.
 *
 * The template-examples feature is on when either `args.templateExamples` is set
 * or `EVAL_FEATURE_TEMPLATE_EXAMPLES` equals exactly `'true'`.
 *
 * @returns A flags object when the feature is enabled, otherwise `undefined`.
 */
function parseFeatureFlags(args: {
	templateExamples: boolean;
}): BuilderFeatureFlags | undefined {
	const enabledViaEnv = process.env.EVAL_FEATURE_TEMPLATE_EXAMPLES === 'true';
	if (enabledViaEnv || args.templateExamples) {
		return { templateExamples: true };
	}
	return undefined;
}
/** Filter field that each `--filter` key writes to. */
type FilterField = 'doSearch' | 'dontSearch' | 'technique' | 'notionId';

// A Map (rather than an object literal) so that keys such as `constructor` cannot
// match an inherited property.
const FILTER_FIELD_BY_KEY: ReadonlyMap<string, FilterField> = new Map<string, FilterField>([
	['do', 'doSearch'],
	['dont', 'dontSearch'],
	['technique', 'technique'],
	['id', 'notionId'],
]);

/**
 * Builds the example-filter object from `--filter key:value` entries plus the
 * dedicated `--notion-id` / `--technique` options.
 *
 * A value given through `--filter` wins over the dedicated option for the same field.
 *
 * @returns The filters, or `undefined` when no filter ends up set.
 * @throws Error when an entry is not `key:value`, has a blank value, or uses an unknown key.
 */
function parseFilters(args: {
	filter: string[];
	notionId?: string;
	technique?: string;
}): LangsmithExampleFilters | undefined {
	const filters: LangsmithExampleFilters = {};

	for (const entry of args.filter) {
		const parts = /^(\w+):(.+)$/.exec(entry);
		if (parts === null) {
			throw new Error('Invalid `--filter` format. Expected: --filter "key:value"');
		}
		const key = parts[1];
		const value = parts[2].trim();
		if (value === '') {
			throw new Error(`Invalid \`--filter\` value for "${key}": value cannot be empty`);
		}
		const field = FILTER_FIELD_BY_KEY.get(key);
		if (field === undefined) {
			throw new Error(`Unknown filter key "${key}". Expected one of: do, dont, technique, id`);
		}
		filters[field] = value;
	}

	// Dedicated options only fill fields that no `--filter` entry already set.
	if (!filters.notionId && args.notionId) filters.notionId = args.notionId;
	if (!filters.technique && args.technique) filters.technique = args.technique;

	for (const candidate of Object.values(filters)) {
		if (typeof candidate === 'string' && candidate.length > 0) return filters;
	}
	return undefined;
}
/**
 * Parses and validates the evaluation CLI arguments.
 *
 * Steps: handle `--help`/`-h` (prints help and exits the process with code 0),
 * collect raw values with `parseCli`, resolve the `--langsmith` shorthand,
 * validate through `cliSchema`, then derive feature flags and example filters.
 *
 * @param argv - Arguments to parse; defaults to `process.argv.slice(2)`.
 * @returns The validated evaluation options.
 * @throws Error when `--langsmith` is combined with an explicit, different `--backend`,
 *   when `do:`/`dont:` filters are used with a suite other than `pairwise`,
 *   or when `parseCli` / `parseFilters` reject the input. Validation failures
 *   raised by `cliSchema.parse` propagate unchanged.
 */
export function parseEvaluationArgs(argv: string[] = process.argv.slice(2)): EvaluationArgs {
	// Check for help flag before parsing
	if (argv.includes('--help') || argv.includes('-h')) {
		printHelp();
		process.exit(0);
	}

	const { values, seenKeys } = parseCli(argv);

	if (values.langsmith === true) {
		const backendWasExplicit = seenKeys.has('backend');
		if (backendWasExplicit && values.backend !== 'langsmith') {
			// Report the backend that was actually passed instead of assuming `local`.
			throw new Error(
				`Cannot combine \`--langsmith\` with \`--backend ${String(values.backend)}\``,
			);
		}
		values.backend = 'langsmith';
	}

	const parsed = cliSchema.parse(values);

	const featureFlags = parseFeatureFlags({
		templateExamples: parsed.templateExamples,
	});
	const filters = parseFilters({
		filter: parsed.filter,
		notionId: parsed.notionId,
		technique: parsed.technique,
	});

	if (parsed.suite !== 'pairwise' && (filters?.doSearch || filters?.dontSearch)) {
		throw new Error(
			'`--filter do:` and `--filter dont:` are only supported for `--suite pairwise`',
		);
	}

	// Fields are listed explicitly so parser-internal keys (filter, notionId, technique,
	// langsmith, templateExamples) are not part of the result.
	return {
		suite: parsed.suite,
		backend: parsed.backend,
		verbose: parsed.verbose,
		repetitions: parsed.repetitions,
		concurrency: parsed.concurrency,
		timeoutMs: parsed.timeoutMs,
		experimentName: parsed.experimentName,
		outputDir: parsed.outputDir,
		datasetName: parsed.datasetName,
		maxExamples: parsed.maxExamples,
		filters,
		testCase: parsed.testCase,
		promptsCsv: parsed.promptsCsv,
		prompt: parsed.prompt,
		dos: parsed.dos,
		donts: parsed.donts,
		numJudges: parsed.numJudges,
		numGenerations: parsed.numGenerations,
		featureFlags,
	};
}
/**
 * Returns the default experiment name for a suite: `DEFAULTS.EXPERIMENT_NAME` for
 * `pairwise`, and `DEFAULTS.LLM_JUDGE_EXPERIMENT_NAME` for every other suite.
 */
export function getDefaultExperimentName(suite: EvaluationSuite): string {
	if (suite === 'pairwise') {
		return DEFAULTS.EXPERIMENT_NAME;
	}
	return DEFAULTS.LLM_JUDGE_EXPERIMENT_NAME;
}
/**
 * Returns the default dataset name for a suite.
 *
 * `pairwise` uses `DEFAULTS.DATASET_NAME`. Every other suite uses the
 * `LANGSMITH_DATASET_NAME` environment variable, falling back to
 * `'workflow-builder-canvas-prompts'` when the variable is unset or blank.
 */
export function getDefaultDatasetName(suite: EvaluationSuite): string {
	if (suite === 'pairwise') return DEFAULTS.DATASET_NAME;
	const fromEnv = process.env.LANGSMITH_DATASET_NAME;
	// `??` alone would accept an empty string (e.g. a variable exported with no value)
	// as the dataset name, so blank values are treated the same as unset.
	if (fromEnv !== undefined && fromEnv.trim().length > 0) return fromEnv;
	return 'workflow-builder-canvas-prompts';
}