mirror of
https://github.com/n8n-io/n8n
synced 2026-04-21 15:47:20 +00:00
refactor(ai-builder): Implement unified evaluations harness (#23955)
Signed-off-by: Oleg Ivaniv <me@olegivaniv.com> Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
This commit is contained in:
parent
800075a693
commit
f880a74d99
111 changed files with 11135 additions and 4682 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -44,4 +44,6 @@ packages/cli/THIRD_PARTY_LICENSES.md
|
|||
.coverage
|
||||
.nyc_output
|
||||
packages/cli/src/commands/export/outputs
|
||||
*.bak
|
||||
*.bak
|
||||
.data
|
||||
.data/
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
evaluations/results/*
|
||||
evaluations/nodes.json
|
||||
evaluations/.data/*
|
||||
.prompts/
|
||||
categorization-summary*
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import { defineConfig, globalIgnores } from 'eslint/config';
|
|||
import { nodeConfig } from '@n8n/eslint-config/node';
|
||||
|
||||
export default defineConfig(
|
||||
globalIgnores(['jest.config*.js', 'evaluations/programmatic/python/.venv/**']),
|
||||
globalIgnores(['coverage/**', 'jest.config*.js', 'evaluations/programmatic/python/.venv/**']),
|
||||
nodeConfig,
|
||||
{
|
||||
rules: {
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -0,0 +1,70 @@
|
|||
import { parseEvaluationArgs } from '../cli/argument-parser';
|
||||
|
||||
describe('argument-parser', () => {
|
||||
it('parses numeric flags like --max-examples and --concurrency', () => {
|
||||
const args = parseEvaluationArgs([
|
||||
'--suite',
|
||||
'pairwise',
|
||||
'--backend',
|
||||
'langsmith',
|
||||
'--max-examples',
|
||||
'5',
|
||||
'--concurrency',
|
||||
'3',
|
||||
]);
|
||||
|
||||
expect(args.maxExamples).toBe(5);
|
||||
expect(args.concurrency).toBe(3);
|
||||
});
|
||||
|
||||
it('supports inline --max-examples= syntax', () => {
|
||||
const args = parseEvaluationArgs(['--max-examples=7']);
|
||||
expect(args.maxExamples).toBe(7);
|
||||
});
|
||||
|
||||
it('parses filters for pairwise suite', () => {
|
||||
const args = parseEvaluationArgs([
|
||||
'--suite',
|
||||
'pairwise',
|
||||
'--backend',
|
||||
'langsmith',
|
||||
'--filter',
|
||||
'do:Slack',
|
||||
'--filter',
|
||||
'technique:content_generation',
|
||||
]);
|
||||
|
||||
expect(args.filters).toEqual({
|
||||
doSearch: 'Slack',
|
||||
technique: 'content_generation',
|
||||
});
|
||||
});
|
||||
|
||||
it('accepts prompt values that start with "-"', () => {
|
||||
const args = parseEvaluationArgs(['--prompt', '-starts-with-dash']);
|
||||
expect(args.prompt).toBe('-starts-with-dash');
|
||||
});
|
||||
|
||||
it('rejects conflicting backend/local when --langsmith is set', () => {
|
||||
expect(() => parseEvaluationArgs(['--langsmith', '--backend', 'local'])).toThrow(
|
||||
'Cannot combine `--langsmith` with `--backend local`',
|
||||
);
|
||||
});
|
||||
|
||||
it('treats --langsmith as backend=langsmith', () => {
|
||||
const args = parseEvaluationArgs(['--langsmith']);
|
||||
expect(args.backend).toBe('langsmith');
|
||||
});
|
||||
|
||||
it('rejects do/dont filters for non-pairwise suite', () => {
|
||||
expect(() => parseEvaluationArgs(['--suite', 'llm-judge', '--filter', 'do:Slack'])).toThrow(
|
||||
'only supported for `--suite pairwise`',
|
||||
);
|
||||
});
|
||||
|
||||
it('rejects malformed filters', () => {
|
||||
expect(() =>
|
||||
parseEvaluationArgs(['--suite', 'pairwise', '--backend', 'langsmith', '--filter', 'nope']),
|
||||
).toThrow('Invalid `--filter` format');
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,465 @@
|
|||
/**
|
||||
* Tests for V2 CLI entry point.
|
||||
*
|
||||
* These tests mock all external dependencies and verify that
|
||||
* the CLI correctly orchestrates evaluation runs.
|
||||
*/
|
||||
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { mock } from 'jest-mock-extended';
|
||||
import type { Client } from 'langsmith/client';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
// Store mocks for dependencies
|
||||
const mockParseEvaluationArgs = jest.fn();
|
||||
const mockSetupTestEnvironment = jest.fn();
|
||||
const mockCreateAgent = jest.fn();
|
||||
const mockGenerateRunId = jest.fn();
|
||||
const mockIsWorkflowStateValues = jest.fn();
|
||||
const mockLoadTestCasesFromCsv = jest.fn();
|
||||
const mockConsumeGenerator = jest.fn();
|
||||
const mockGetChatPayload = jest.fn();
|
||||
const mockRunEvaluation = jest.fn();
|
||||
const mockCreateConsoleLifecycle = jest.fn();
|
||||
const mockCreateLLMJudgeEvaluator = jest.fn();
|
||||
const mockCreateProgrammaticEvaluator = jest.fn();
|
||||
const mockCreatePairwiseEvaluator = jest.fn();
|
||||
|
||||
// Mock all external modules
|
||||
jest.mock('../cli/argument-parser', () => ({
|
||||
parseEvaluationArgs: (): unknown => mockParseEvaluationArgs(),
|
||||
getDefaultDatasetName: (suite: unknown): unknown =>
|
||||
suite === 'pairwise' ? 'notion-pairwise-workflows' : 'workflow-builder-canvas-prompts',
|
||||
getDefaultExperimentName: (suite: unknown): unknown =>
|
||||
suite === 'pairwise' ? 'pairwise-evals' : 'workflow-builder-evaluation',
|
||||
}));
|
||||
|
||||
jest.mock('../support/environment', () => ({
|
||||
setupTestEnvironment: (): unknown => mockSetupTestEnvironment(),
|
||||
createAgent: (...args: unknown[]): unknown => mockCreateAgent(...args),
|
||||
}));
|
||||
|
||||
jest.mock('../langsmith/types', () => ({
|
||||
generateRunId: (): unknown => mockGenerateRunId(),
|
||||
isWorkflowStateValues: (...args: unknown[]): unknown => mockIsWorkflowStateValues(...args),
|
||||
}));
|
||||
|
||||
jest.mock('../cli/csv-prompt-loader', () => ({
|
||||
loadTestCasesFromCsv: (...args: unknown[]): unknown => mockLoadTestCasesFromCsv(...args),
|
||||
loadDefaultTestCases: () => [
|
||||
{ id: 'test-case-1', prompt: 'Create a workflow that sends a daily email summary' },
|
||||
],
|
||||
getDefaultTestCaseIds: () => ['test-case-1'],
|
||||
}));
|
||||
|
||||
jest.mock('../harness/evaluation-helpers', () => ({
|
||||
consumeGenerator: (...args: unknown[]): unknown => mockConsumeGenerator(...args),
|
||||
getChatPayload: (...args: unknown[]): unknown => mockGetChatPayload(...args),
|
||||
}));
|
||||
|
||||
jest.mock('../index', () => ({
|
||||
runEvaluation: (...args: unknown[]): unknown => mockRunEvaluation(...args),
|
||||
createConsoleLifecycle: (...args: unknown[]): unknown => mockCreateConsoleLifecycle(...args),
|
||||
createLLMJudgeEvaluator: (...args: unknown[]): unknown => mockCreateLLMJudgeEvaluator(...args),
|
||||
createProgrammaticEvaluator: (...args: unknown[]): unknown =>
|
||||
mockCreateProgrammaticEvaluator(...args),
|
||||
createPairwiseEvaluator: (...args: unknown[]): unknown => mockCreatePairwiseEvaluator(...args),
|
||||
}));
|
||||
|
||||
/** Helper to create a minimal valid workflow for tests */
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return { name, nodes: [], connections: {} };
|
||||
}
|
||||
|
||||
/** Helper to create default args */
|
||||
function createMockArgs(overrides: Record<string, unknown> = {}) {
|
||||
return {
|
||||
suite: 'llm-judge',
|
||||
backend: 'local',
|
||||
verbose: false,
|
||||
timeoutMs: 60_000,
|
||||
datasetName: undefined,
|
||||
prompt: undefined,
|
||||
testCase: undefined,
|
||||
promptsCsv: undefined,
|
||||
maxExamples: undefined,
|
||||
dos: undefined,
|
||||
donts: undefined,
|
||||
numJudges: 3,
|
||||
numGenerations: 1,
|
||||
experimentName: undefined,
|
||||
repetitions: 1,
|
||||
concurrency: 4,
|
||||
featureFlags: undefined,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
/** Helper to create mock environment */
|
||||
function createMockEnvironment() {
|
||||
return {
|
||||
parsedNodeTypes: [] as INodeTypeDescription[],
|
||||
llm: mock<BaseChatModel>(),
|
||||
lsClient: mock<Client>(),
|
||||
};
|
||||
}
|
||||
|
||||
/** Helper to create mock agent */
|
||||
function createMockAgentInstance(workflowJSON: SimpleWorkflow = createMockWorkflow()) {
|
||||
return {
|
||||
chat: jest.fn().mockReturnValue((async function* () {})()),
|
||||
getState: jest.fn().mockResolvedValue({
|
||||
values: {
|
||||
workflowJSON,
|
||||
messages: [],
|
||||
},
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
/** Helper to create mock run summary */
|
||||
function createMockSummary(overrides: Record<string, unknown> = {}) {
|
||||
return {
|
||||
totalExamples: 10,
|
||||
passed: 8,
|
||||
failed: 2,
|
||||
errors: 0,
|
||||
averageScore: 0.85,
|
||||
totalDurationMs: 5000,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe('CLI', () => {
|
||||
// Mock process.exit to prevent test termination
|
||||
let mockExit: jest.SpyInstance;
|
||||
const originalEnv = process.env;
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
mockExit = jest.spyOn(process, 'exit').mockImplementation((code) => {
|
||||
throw new Error(`process.exit(${code})`);
|
||||
});
|
||||
|
||||
// Reset environment
|
||||
process.env = { ...originalEnv };
|
||||
|
||||
// Setup default mocks
|
||||
mockParseEvaluationArgs.mockReturnValue(createMockArgs());
|
||||
mockSetupTestEnvironment.mockResolvedValue(createMockEnvironment());
|
||||
mockCreateAgent.mockReturnValue(createMockAgentInstance());
|
||||
mockGenerateRunId.mockReturnValue('test-run-id');
|
||||
mockIsWorkflowStateValues.mockReturnValue(true);
|
||||
mockConsumeGenerator.mockResolvedValue(undefined);
|
||||
mockGetChatPayload.mockReturnValue({});
|
||||
mockRunEvaluation.mockResolvedValue(createMockSummary());
|
||||
mockCreateConsoleLifecycle.mockReturnValue({});
|
||||
mockCreateLLMJudgeEvaluator.mockReturnValue({ name: 'llm-judge', evaluate: jest.fn() });
|
||||
mockCreateProgrammaticEvaluator.mockReturnValue({ name: 'programmatic', evaluate: jest.fn() });
|
||||
mockCreatePairwiseEvaluator.mockReturnValue({ name: 'pairwise', evaluate: jest.fn() });
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
mockExit.mockRestore();
|
||||
process.env = originalEnv;
|
||||
});
|
||||
|
||||
describe('runV2Evaluation()', () => {
|
||||
describe('loadTestCases', () => {
|
||||
it('should load test cases from CSV when promptsCsv is set', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(
|
||||
createMockArgs({ promptsCsv: '/path/to/prompts.csv' }),
|
||||
);
|
||||
mockLoadTestCasesFromCsv.mockReturnValue([
|
||||
{ prompt: 'CSV prompt 1', id: '1' },
|
||||
{ prompt: 'CSV prompt 2', id: '2' },
|
||||
]);
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockLoadTestCasesFromCsv).toHaveBeenCalledWith('/path/to/prompts.csv');
|
||||
expect(mockRunEvaluation).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
dataset: [
|
||||
{ prompt: 'CSV prompt 1', id: '1' },
|
||||
{ prompt: 'CSV prompt 2', id: '2' },
|
||||
],
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should create single test case when prompt is set', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(
|
||||
createMockArgs({
|
||||
prompt: 'Create a workflow',
|
||||
dos: 'Use Slack',
|
||||
donts: 'No HTTP',
|
||||
}),
|
||||
);
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockRunEvaluation).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
dataset: [
|
||||
{
|
||||
prompt: 'Create a workflow',
|
||||
context: { dos: 'Use Slack', donts: 'No HTTP' },
|
||||
},
|
||||
],
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should use default test case when no prompt source specified', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(createMockArgs());
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockRunEvaluation).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
dataset: [
|
||||
{ id: 'test-case-1', prompt: 'Create a workflow that sends a daily email summary' },
|
||||
],
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('mode selection', () => {
|
||||
it('should create LLM-judge + programmatic evaluators for llm-judge suite (local)', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(
|
||||
createMockArgs({ suite: 'llm-judge', backend: 'local' }),
|
||||
);
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockCreateLLMJudgeEvaluator).toHaveBeenCalled();
|
||||
expect(mockCreateProgrammaticEvaluator).toHaveBeenCalled();
|
||||
expect(mockCreatePairwiseEvaluator).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should create LLM-judge + programmatic evaluators for llm-judge suite (langsmith)', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(
|
||||
createMockArgs({ suite: 'llm-judge', backend: 'langsmith' }),
|
||||
);
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockCreateLLMJudgeEvaluator).toHaveBeenCalled();
|
||||
expect(mockCreateProgrammaticEvaluator).toHaveBeenCalled();
|
||||
expect(mockCreatePairwiseEvaluator).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should create pairwise + programmatic evaluators for pairwise suite (local)', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(
|
||||
createMockArgs({ suite: 'pairwise', backend: 'local', numJudges: 5 }),
|
||||
);
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockCreatePairwiseEvaluator).toHaveBeenCalled();
|
||||
// Verify numJudges was passed correctly
|
||||
const callArgs = mockCreatePairwiseEvaluator.mock.calls[0] as [
|
||||
unknown,
|
||||
{ numJudges: number; numGenerations: number },
|
||||
];
|
||||
expect(callArgs[1]).toEqual({ numJudges: 5, numGenerations: 1 });
|
||||
expect(mockCreateProgrammaticEvaluator).toHaveBeenCalled();
|
||||
expect(mockCreateLLMJudgeEvaluator).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should create pairwise + programmatic evaluators for pairwise suite (langsmith)', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(
|
||||
createMockArgs({ suite: 'pairwise', backend: 'langsmith' }),
|
||||
);
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockCreatePairwiseEvaluator).toHaveBeenCalled();
|
||||
expect(mockCreateProgrammaticEvaluator).toHaveBeenCalled();
|
||||
expect(mockCreateLLMJudgeEvaluator).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe('config building', () => {
|
||||
it('should use local mode for backend=local', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(createMockArgs({ backend: 'local' }));
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockRunEvaluation).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
mode: 'local',
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should use langsmith mode for backend=langsmith', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(createMockArgs({ backend: 'langsmith' }));
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockRunEvaluation).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
mode: 'langsmith',
|
||||
langsmithOptions: expect.objectContaining({
|
||||
experimentName: 'workflow-builder-evaluation',
|
||||
}),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should use datasetName from args for langsmith mode', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(
|
||||
createMockArgs({ backend: 'langsmith', datasetName: 'custom-dataset' }),
|
||||
);
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockRunEvaluation).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
dataset: 'custom-dataset',
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should fall back to default dataset name when env var not set', async () => {
|
||||
delete process.env.LANGSMITH_DATASET_NAME;
|
||||
mockParseEvaluationArgs.mockReturnValue(createMockArgs({ backend: 'langsmith' }));
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockRunEvaluation).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
dataset: 'workflow-builder-canvas-prompts',
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should include langsmithOptions with custom experiment name', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(
|
||||
createMockArgs({
|
||||
backend: 'langsmith',
|
||||
experimentName: 'my-experiment',
|
||||
repetitions: 3,
|
||||
concurrency: 8,
|
||||
}),
|
||||
);
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockRunEvaluation).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
langsmithOptions: expect.objectContaining({
|
||||
experimentName: 'my-experiment',
|
||||
repetitions: 3,
|
||||
concurrency: 8,
|
||||
}),
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('exit codes', () => {
|
||||
it('should exit with 0 when pass rate >= 70%', async () => {
|
||||
mockRunEvaluation.mockResolvedValue(createMockSummary({ totalExamples: 10, passed: 7 }));
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit(0)');
|
||||
});
|
||||
|
||||
it('should exit with 1 when pass rate < 70%', async () => {
|
||||
mockRunEvaluation.mockResolvedValue(createMockSummary({ totalExamples: 10, passed: 5 }));
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit(1)');
|
||||
});
|
||||
|
||||
it('should exit with 0 when pass rate is exactly 70%', async () => {
|
||||
mockRunEvaluation.mockResolvedValue(createMockSummary({ totalExamples: 10, passed: 7 }));
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit(0)');
|
||||
});
|
||||
|
||||
it('should exit with 1 when no examples', async () => {
|
||||
mockRunEvaluation.mockResolvedValue(createMockSummary({ totalExamples: 0, passed: 0 }));
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit(1)');
|
||||
});
|
||||
});
|
||||
|
||||
describe('workflow generator', () => {
|
||||
it('should create agent with correct config', async () => {
|
||||
const env = createMockEnvironment();
|
||||
mockSetupTestEnvironment.mockResolvedValue(env);
|
||||
mockParseEvaluationArgs.mockReturnValue(
|
||||
createMockArgs({ featureFlags: { testFlag: true } }),
|
||||
);
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
// Verify generateWorkflow was passed to config
|
||||
expect(mockRunEvaluation).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
generateWorkflow: expect.any(Function),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should setup test environment', async () => {
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockSetupTestEnvironment).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should create console lifecycle with verbose option', async () => {
|
||||
mockParseEvaluationArgs.mockReturnValue(createMockArgs({ verbose: true }));
|
||||
|
||||
const { runV2Evaluation } = await import('../cli');
|
||||
|
||||
await expect(runV2Evaluation()).rejects.toThrow('process.exit');
|
||||
|
||||
expect(mockCreateConsoleLifecycle).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ verbose: true }),
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,145 @@
|
|||
/**
|
||||
* Tests for CSV prompt loader.
|
||||
*/
|
||||
|
||||
import * as fs from 'node:fs';
|
||||
import * as os from 'node:os';
|
||||
import * as path from 'node:path';
|
||||
|
||||
import { loadTestCasesFromCsv } from '../cli/csv-prompt-loader';
|
||||
|
||||
describe('csv-prompt-loader', () => {
|
||||
const tempDirs: string[] = [];
|
||||
|
||||
function writeTempCsv(filename: string, content: string): string {
|
||||
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'eval-csv-'));
|
||||
tempDirs.push(dir);
|
||||
const csvPath = path.join(dir, filename);
|
||||
fs.writeFileSync(csvPath, content, 'utf8');
|
||||
return csvPath;
|
||||
}
|
||||
|
||||
afterEach(() => {
|
||||
while (tempDirs.length > 0) {
|
||||
const dir = tempDirs.pop();
|
||||
if (dir && fs.existsSync(dir)) {
|
||||
fs.rmSync(dir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it('should load dos/donts into context when present', () => {
|
||||
const csvPath = writeTempCsv(
|
||||
'pairwise.csv',
|
||||
'id,prompt,dos,donts\npw-1,"Create a workflow","Must use Notion","No HTTP Request"\n',
|
||||
);
|
||||
|
||||
const testCases = loadTestCasesFromCsv(csvPath);
|
||||
expect(testCases).toEqual([
|
||||
{
|
||||
id: 'pw-1',
|
||||
prompt: 'Create a workflow',
|
||||
context: { dos: 'Must use Notion', donts: 'No HTTP Request' },
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('should load prompts without context when dos/donts are absent', () => {
|
||||
const csvPath = writeTempCsv('llm.csv', 'id,prompt\nllm-1,"Create a workflow"\n');
|
||||
|
||||
const testCases = loadTestCasesFromCsv(csvPath);
|
||||
expect(testCases).toEqual([{ id: 'llm-1', prompt: 'Create a workflow' }]);
|
||||
});
|
||||
|
||||
it('should support header column re-ordering', () => {
|
||||
const csvPath = writeTempCsv(
|
||||
'reorder.csv',
|
||||
'prompt,id,donts,dos\n"Create a workflow","pw-1","No HTTP Request","Must use Notion"\n',
|
||||
);
|
||||
|
||||
const testCases = loadTestCasesFromCsv(csvPath);
|
||||
expect(testCases).toEqual([
|
||||
{
|
||||
id: 'pw-1',
|
||||
prompt: 'Create a workflow',
|
||||
context: { dos: 'Must use Notion', donts: 'No HTTP Request' },
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('should support do/dont header aliases', () => {
|
||||
const csvPath = writeTempCsv(
|
||||
'aliases.csv',
|
||||
'id,prompt,do,dont\npw-1,"Create a workflow","Must use Notion","No HTTP Request"\n',
|
||||
);
|
||||
|
||||
const testCases = loadTestCasesFromCsv(csvPath);
|
||||
expect(testCases).toEqual([
|
||||
{
|
||||
id: 'pw-1',
|
||||
prompt: 'Create a workflow',
|
||||
context: { dos: 'Must use Notion', donts: 'No HTTP Request' },
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('should not create context when dos/donts columns are present but empty', () => {
|
||||
const csvPath = writeTempCsv(
|
||||
'empty-context.csv',
|
||||
'id,prompt,dos,donts\npw-1,"Create a workflow",,\n',
|
||||
);
|
||||
|
||||
const testCases = loadTestCasesFromCsv(csvPath);
|
||||
expect(testCases).toEqual([{ id: 'pw-1', prompt: 'Create a workflow' }]);
|
||||
});
|
||||
|
||||
it('should allow headerless CSV (treat first column as prompt)', () => {
|
||||
const csvPath = writeTempCsv('no-header.csv', '"Create a workflow"\n"Second prompt"\n');
|
||||
|
||||
const testCases = loadTestCasesFromCsv(csvPath);
|
||||
expect(testCases).toEqual([
|
||||
{ id: 'csv-case-1', prompt: 'Create a workflow' },
|
||||
{ id: 'csv-case-2', prompt: 'Second prompt' },
|
||||
]);
|
||||
});
|
||||
|
||||
it('should ignore rows with empty prompts', () => {
|
||||
const csvPath = writeTempCsv('empty-rows.csv', 'id,prompt\nrow-1,\nrow-2,"Valid prompt"\n');
|
||||
|
||||
const testCases = loadTestCasesFromCsv(csvPath);
|
||||
expect(testCases).toEqual([{ id: 'row-2', prompt: 'Valid prompt' }]);
|
||||
});
|
||||
|
||||
it('should handle UTF-8 BOM', () => {
|
||||
const csvPath = writeTempCsv('bom.csv', '\ufeffid,prompt\nllm-1,"Create a workflow"\n');
|
||||
|
||||
const testCases = loadTestCasesFromCsv(csvPath);
|
||||
expect(testCases).toEqual([{ id: 'llm-1', prompt: 'Create a workflow' }]);
|
||||
});
|
||||
|
||||
it('should resolve relative paths from process.cwd()', () => {
|
||||
const csvPath = writeTempCsv('relative.csv', 'id,prompt\nllm-1,"Create a workflow"\n');
|
||||
const relativePath = path.relative(process.cwd(), csvPath);
|
||||
|
||||
const testCases = loadTestCasesFromCsv(relativePath);
|
||||
expect(testCases).toEqual([{ id: 'llm-1', prompt: 'Create a workflow' }]);
|
||||
});
|
||||
|
||||
it('should throw when file does not exist', () => {
|
||||
expect(() => loadTestCasesFromCsv('/definitely-not-a-real-path.csv')).toThrow(
|
||||
/CSV file not found/,
|
||||
);
|
||||
});
|
||||
|
||||
it('should throw when CSV is empty', () => {
|
||||
const csvPath = writeTempCsv('empty.csv', '');
|
||||
expect(() => loadTestCasesFromCsv(csvPath)).toThrow('The provided CSV file is empty');
|
||||
});
|
||||
|
||||
it('should throw when no valid prompts exist', () => {
|
||||
const csvPath = writeTempCsv('no-prompts.csv', 'id,prompt\nrow-1,\nrow-2," "\n');
|
||||
expect(() => loadTestCasesFromCsv(csvPath)).toThrow(
|
||||
'No valid prompts found in the provided CSV file',
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
/**
|
||||
* Tests for evaluation helper utilities.
|
||||
*/
|
||||
|
||||
import pLimit from 'p-limit';
|
||||
|
||||
import { withTimeout } from '../harness/evaluation-helpers';
|
||||
|
||||
describe('evaluation-helpers', () => {
|
||||
describe('withTimeout()', () => {
|
||||
it('should allow p-limit slot to be released when timeout triggers (best-effort)', async () => {
|
||||
jest.useFakeTimers();
|
||||
const limit = pLimit(1);
|
||||
const started: string[] = [];
|
||||
|
||||
const never = new Promise<void>(() => {
|
||||
// never resolves
|
||||
});
|
||||
|
||||
const p1 = limit(async () => {
|
||||
started.push('p1');
|
||||
await withTimeout({ promise: never, timeoutMs: 10, label: 'p1' });
|
||||
}).catch(() => {
|
||||
// expected timeout
|
||||
});
|
||||
|
||||
// Give p1 a chance to start.
|
||||
await Promise.resolve();
|
||||
|
||||
const p2 = limit(async () => {
|
||||
started.push('p2');
|
||||
});
|
||||
|
||||
jest.advanceTimersByTime(11);
|
||||
await Promise.resolve();
|
||||
await Promise.resolve();
|
||||
|
||||
await expect(p2).resolves.toBeUndefined();
|
||||
expect(started).toEqual(['p1', 'p2']);
|
||||
|
||||
await p1;
|
||||
jest.useRealTimers();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,192 @@
|
|||
/**
|
||||
* Tests for LLM-as-judge evaluator factory.
|
||||
*
|
||||
* These tests mock the underlying evaluateWorkflow function and verify
|
||||
* that the factory correctly wraps it and transforms the results.
|
||||
*/
|
||||
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { mock } from 'jest-mock-extended';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
// Store original module
|
||||
const mockEvaluateWorkflow = jest.fn();
|
||||
|
||||
// Mock the evaluateWorkflow function
|
||||
jest.mock('../../evaluators/llm-judge/workflow-evaluator', () => ({
|
||||
evaluateWorkflow: (...args: unknown[]): unknown => mockEvaluateWorkflow(...args),
|
||||
}));
|
||||
|
||||
/** Helper to create a minimal valid workflow for tests */
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return { name, nodes: [], connections: {} };
|
||||
}
|
||||
|
||||
/** Helper to create a mock evaluation result */
|
||||
function createMockEvalResult(overrides: Record<string, unknown> = {}) {
|
||||
return {
|
||||
functionality: { score: 0.9, violations: [] },
|
||||
connections: { score: 1.0, violations: [] },
|
||||
expressions: { score: 0.8, violations: [] },
|
||||
nodeConfiguration: { score: 0.85, violations: [] },
|
||||
efficiency: {
|
||||
score: 0.95,
|
||||
violations: [],
|
||||
redundancyScore: 1,
|
||||
pathOptimization: 0.9,
|
||||
nodeCountEfficiency: 0.95,
|
||||
},
|
||||
dataFlow: { score: 0.9, violations: [] },
|
||||
maintainability: {
|
||||
score: 0.88,
|
||||
violations: [],
|
||||
nodeNamingQuality: 0.9,
|
||||
workflowOrganization: 0.85,
|
||||
modularity: 0.9,
|
||||
},
|
||||
bestPractices: { score: 0.82, violations: [] },
|
||||
overallScore: 0.9,
|
||||
summary: 'Good workflow',
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe('LLM-Judge Evaluator', () => {
|
||||
let mockLlm: BaseChatModel;
|
||||
let mockNodeTypes: INodeTypeDescription[];
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
mockLlm = mock<BaseChatModel>();
|
||||
mockNodeTypes = [];
|
||||
});
|
||||
|
||||
describe('createLLMJudgeEvaluator()', () => {
|
||||
it('should create an evaluator with correct name', async () => {
|
||||
const { createLLMJudgeEvaluator } = await import('../../evaluators/llm-judge');
|
||||
const evaluator = createLLMJudgeEvaluator(mockLlm, mockNodeTypes);
|
||||
|
||||
expect(evaluator.name).toBe('llm-judge');
|
||||
});
|
||||
|
||||
it('should call evaluateWorkflow with workflow and prompt', async () => {
|
||||
mockEvaluateWorkflow.mockResolvedValue(createMockEvalResult());
|
||||
|
||||
const { createLLMJudgeEvaluator } = await import('../../evaluators/llm-judge');
|
||||
const evaluator = createLLMJudgeEvaluator(mockLlm, mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const context = { prompt: 'Create a test workflow' };
|
||||
|
||||
await evaluator.evaluate(workflow, context);
|
||||
|
||||
expect(mockEvaluateWorkflow).toHaveBeenCalledWith(mockLlm, {
|
||||
userPrompt: 'Create a test workflow',
|
||||
generatedWorkflow: workflow,
|
||||
});
|
||||
});
|
||||
|
||||
it('should return feedback array with all category scores', async () => {
|
||||
mockEvaluateWorkflow.mockResolvedValue(createMockEvalResult());
|
||||
|
||||
const { createLLMJudgeEvaluator } = await import('../../evaluators/llm-judge');
|
||||
const evaluator = createLLMJudgeEvaluator(mockLlm, mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test' });
|
||||
|
||||
// Should have feedback for each category
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'llm-judge', metric: 'functionality' }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'llm-judge', metric: 'connections' }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'llm-judge', metric: 'expressions' }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'llm-judge', metric: 'nodeConfiguration' }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'llm-judge', metric: 'efficiency' }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'llm-judge', metric: 'dataFlow' }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'llm-judge', metric: 'maintainability' }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'llm-judge', metric: 'bestPractices' }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'llm-judge', metric: 'overallScore' }),
|
||||
);
|
||||
});
|
||||
|
||||
it('should include violations in feedback comments', async () => {
|
||||
mockEvaluateWorkflow.mockResolvedValue(
|
||||
createMockEvalResult({
|
||||
functionality: {
|
||||
score: 0.5,
|
||||
violations: [
|
||||
{ type: 'critical', description: 'Missing HTTP node', pointsDeducted: 0.3 },
|
||||
{ type: 'major', description: 'Incorrect branching', pointsDeducted: 0.2 },
|
||||
],
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const { createLLMJudgeEvaluator } = await import('../../evaluators/llm-judge');
|
||||
const evaluator = createLLMJudgeEvaluator(mockLlm, mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test' });
|
||||
|
||||
const funcFeedback = feedback.find(
|
||||
(f) => f.evaluator === 'llm-judge' && f.metric === 'functionality',
|
||||
);
|
||||
expect(funcFeedback?.comment).toContain('Missing HTTP node');
|
||||
});
|
||||
|
||||
it('should include bestPractices violations in comments', async () => {
|
||||
mockEvaluateWorkflow.mockResolvedValue(
|
||||
createMockEvalResult({
|
||||
bestPractices: {
|
||||
score: 0.4,
|
||||
violations: [
|
||||
{ type: 'major', description: 'Missing rate limiting', pointsDeducted: 0.2 },
|
||||
],
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const { createLLMJudgeEvaluator } = await import('../../evaluators/llm-judge');
|
||||
const evaluator = createLLMJudgeEvaluator(mockLlm, mockNodeTypes);
|
||||
|
||||
const feedback = await evaluator.evaluate(createMockWorkflow(), { prompt: 'Test' });
|
||||
const bpFeedback = feedback.find(
|
||||
(f) => f.evaluator === 'llm-judge' && f.metric === 'bestPractices',
|
||||
);
|
||||
|
||||
expect(bpFeedback?.comment).toContain('Missing rate limiting');
|
||||
});
|
||||
|
||||
it('should handle evaluation errors gracefully', async () => {
|
||||
mockEvaluateWorkflow.mockRejectedValue(new Error('LLM API error'));
|
||||
|
||||
const { createLLMJudgeEvaluator } = await import('../../evaluators/llm-judge');
|
||||
const evaluator = createLLMJudgeEvaluator(mockLlm, mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
|
||||
// Should throw - let the runner handle errors
|
||||
await expect(evaluator.evaluate(workflow, { prompt: 'Test' })).rejects.toThrow(
|
||||
'LLM API error',
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,540 @@
|
|||
/**
|
||||
* Tests for pairwise evaluator factory.
|
||||
*
|
||||
* These tests mock the underlying runJudgePanel function and verify
|
||||
* that the factory correctly wraps it and transforms the results.
|
||||
*/
|
||||
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { mock } from 'jest-mock-extended';
|
||||
import pLimit from 'p-limit';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import { PAIRWISE_METRICS } from '../../evaluators/pairwise/metrics';
|
||||
|
||||
// Store mock for runJudgePanel
|
||||
const mockRunJudgePanel = jest.fn();
|
||||
|
||||
// Mock the judge panel module
|
||||
jest.mock('../../evaluators/pairwise/judge-panel', () => ({
|
||||
runJudgePanel: (...args: unknown[]): unknown => mockRunJudgePanel(...args),
|
||||
}));
|
||||
|
||||
/** Helper to create a minimal valid workflow for tests */
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return { name, nodes: [], connections: {} };
|
||||
}
|
||||
|
||||
/** Helper to create a mock judge panel result */
|
||||
function createMockPanelResult(
|
||||
overrides: Partial<{
|
||||
primaryPasses: number;
|
||||
majorityPass: boolean;
|
||||
avgDiagnosticScore: number;
|
||||
judgeResults: Array<{
|
||||
primaryPass: boolean;
|
||||
diagnosticScore: number;
|
||||
violations: Array<{ rule: string; justification: string }>;
|
||||
passes: Array<{ rule: string; justification: string }>;
|
||||
}>;
|
||||
}> = {},
|
||||
) {
|
||||
return {
|
||||
primaryPasses: 2,
|
||||
majorityPass: true,
|
||||
avgDiagnosticScore: 0.8,
|
||||
judgeResults: [
|
||||
{
|
||||
primaryPass: true,
|
||||
diagnosticScore: 0.9,
|
||||
violations: [],
|
||||
passes: [{ rule: 'Has trigger', justification: 'Gmail trigger exists' }],
|
||||
},
|
||||
{
|
||||
primaryPass: true,
|
||||
diagnosticScore: 0.8,
|
||||
violations: [],
|
||||
passes: [{ rule: 'Has trigger', justification: 'Trigger present' }],
|
||||
},
|
||||
{
|
||||
primaryPass: false,
|
||||
diagnosticScore: 0.7,
|
||||
violations: [{ rule: 'Missing action', justification: 'No Slack node found' }],
|
||||
passes: [],
|
||||
},
|
||||
],
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe('Pairwise Evaluator', () => {
|
||||
let mockLlm: BaseChatModel;
|
||||
type PairwiseFeedback = { evaluator: string; metric: string; score: number; comment?: string };
|
||||
const findFeedback = (feedback: PairwiseFeedback[], metric: string) =>
|
||||
feedback.find((f) => f.evaluator === 'pairwise' && f.metric === metric);
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
mockLlm = mock<BaseChatModel>();
|
||||
});
|
||||
|
||||
describe('createPairwiseEvaluator()', () => {
|
||||
it('should create an evaluator with correct name', async () => {
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
expect(evaluator.name).toBe('pairwise');
|
||||
});
|
||||
|
||||
it('should call runJudgePanel with workflow and criteria', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult());
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const context = { prompt: 'Test prompt', dos: 'Use Slack', donts: 'No HTTP requests' };
|
||||
|
||||
await evaluator.evaluate(workflow, context);
|
||||
|
||||
expect(mockRunJudgePanel).toHaveBeenCalledWith(
|
||||
mockLlm,
|
||||
workflow,
|
||||
{ dos: 'Use Slack', donts: 'No HTTP requests' },
|
||||
3, // default number of judges
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
|
||||
it('should use custom number of judges', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult());
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm, { numJudges: 5 });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
await evaluator.evaluate(workflow, { prompt: 'Test prompt' });
|
||||
|
||||
expect(mockRunJudgePanel).toHaveBeenCalledWith(
|
||||
mockLlm,
|
||||
workflow,
|
||||
expect.any(Object),
|
||||
5,
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
|
||||
it('should pass through empty criteria when context has no dos/donts', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult());
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
await evaluator.evaluate(workflow, { prompt: 'Test prompt' });
|
||||
|
||||
expect(mockRunJudgePanel).toHaveBeenCalledWith(
|
||||
mockLlm,
|
||||
workflow,
|
||||
{
|
||||
dos: undefined,
|
||||
donts: undefined,
|
||||
},
|
||||
3,
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
|
||||
it('should return feedback with majority pass result', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(
|
||||
createMockPanelResult({ majorityPass: true, primaryPasses: 2 }),
|
||||
);
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test prompt' });
|
||||
|
||||
const majorityFeedback = findFeedback(feedback, PAIRWISE_METRICS.PAIRWISE_PRIMARY);
|
||||
expect(majorityFeedback).toEqual({
|
||||
evaluator: 'pairwise',
|
||||
metric: PAIRWISE_METRICS.PAIRWISE_PRIMARY,
|
||||
score: 1,
|
||||
kind: 'score',
|
||||
comment: '2/3 judges passed',
|
||||
});
|
||||
});
|
||||
|
||||
it('should return feedback with diagnostic score', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult({ avgDiagnosticScore: 0.85 }));
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test prompt' });
|
||||
|
||||
const diagnosticFeedback = findFeedback(feedback, PAIRWISE_METRICS.PAIRWISE_DIAGNOSTIC);
|
||||
expect(diagnosticFeedback).toEqual({
|
||||
evaluator: 'pairwise',
|
||||
metric: PAIRWISE_METRICS.PAIRWISE_DIAGNOSTIC,
|
||||
score: 0.85,
|
||||
kind: 'metric',
|
||||
});
|
||||
});
|
||||
|
||||
it('should return feedback for each judge', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult());
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test prompt' });
|
||||
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'pairwise', metric: 'judge1', score: 1 }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'pairwise', metric: 'judge2', score: 1 }),
|
||||
);
|
||||
expect(feedback).toContainEqual(
|
||||
expect.objectContaining({ evaluator: 'pairwise', metric: 'judge3', score: 0 }),
|
||||
);
|
||||
});
|
||||
|
||||
it('should include violations in judge feedback comments', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(
|
||||
createMockPanelResult({
|
||||
judgeResults: [
|
||||
{
|
||||
primaryPass: false,
|
||||
diagnosticScore: 0.5,
|
||||
violations: [
|
||||
{ rule: 'Has Slack', justification: 'Missing Slack node for notifications' },
|
||||
{ rule: 'Has trigger', justification: 'No trigger node found' },
|
||||
],
|
||||
passes: [],
|
||||
},
|
||||
],
|
||||
}),
|
||||
);
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test prompt' });
|
||||
|
||||
const judgeFeedback = findFeedback(feedback, 'judge1');
|
||||
// Full violation output without truncation
|
||||
expect(judgeFeedback?.comment).toContain('[Has Slack] Missing Slack node for notifications');
|
||||
expect(judgeFeedback?.comment).toContain('[Has trigger] No trigger node found');
|
||||
});
|
||||
|
||||
it('should handle evaluation errors gracefully', async () => {
|
||||
mockRunJudgePanel.mockRejectedValue(new Error('Judge panel failed'));
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
|
||||
// Should throw - let the runner handle errors
|
||||
await expect(evaluator.evaluate(workflow, { prompt: 'Test prompt' })).rejects.toThrow(
|
||||
'Judge panel failed',
|
||||
);
|
||||
});
|
||||
|
||||
it('should accept criteria with only dos (no donts)', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult());
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const context = { prompt: 'Test prompt', dos: 'Use Slack node' };
|
||||
|
||||
await evaluator.evaluate(workflow, context);
|
||||
|
||||
expect(mockRunJudgePanel).toHaveBeenCalledWith(
|
||||
mockLlm,
|
||||
workflow,
|
||||
{ dos: 'Use Slack node', donts: undefined },
|
||||
3,
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
|
||||
it('should accept criteria with only donts (no dos)', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult());
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const context = { prompt: 'Test prompt', donts: 'Do not use HTTP Request node' };
|
||||
|
||||
await evaluator.evaluate(workflow, context);
|
||||
|
||||
expect(mockRunJudgePanel).toHaveBeenCalledWith(
|
||||
mockLlm,
|
||||
workflow,
|
||||
{ dos: undefined, donts: 'Do not use HTTP Request node' },
|
||||
3,
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('createPairwiseEvaluator() multi-gen', () => {
|
||||
it('should use single-gen behavior when numGenerations is 1', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult());
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm, { numGenerations: 1 });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test prompt' });
|
||||
|
||||
// Should have single-gen feedback keys
|
||||
expect(findFeedback(feedback, PAIRWISE_METRICS.PAIRWISE_PRIMARY)).toBeDefined();
|
||||
expect(findFeedback(feedback, PAIRWISE_METRICS.PAIRWISE_DIAGNOSTIC)).toBeDefined();
|
||||
// Should NOT have multi-gen keys
|
||||
expect(
|
||||
findFeedback(feedback, PAIRWISE_METRICS.PAIRWISE_GENERATION_CORRECTNESS),
|
||||
).toBeUndefined();
|
||||
});
|
||||
|
||||
it('should generate multiple workflows when numGenerations > 1', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult());
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm, { numJudges: 3, numGenerations: 3 });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const mockGenerateWorkflow = jest.fn().mockResolvedValue(createMockWorkflow());
|
||||
|
||||
await evaluator.evaluate(workflow, {
|
||||
dos: 'Test',
|
||||
donts: 'Test',
|
||||
prompt: 'Create a workflow',
|
||||
generateWorkflow: mockGenerateWorkflow,
|
||||
});
|
||||
|
||||
// Should have called generateWorkflow 3 times
|
||||
expect(mockGenerateWorkflow).toHaveBeenCalledTimes(3);
|
||||
// Should have called runJudgePanel 3 times (once per generation)
|
||||
expect(mockRunJudgePanel).toHaveBeenCalledTimes(3);
|
||||
});
|
||||
|
||||
it('should limit generation concurrency when llmCallLimiter is provided', async () => {
|
||||
mockRunJudgePanel.mockResolvedValue(createMockPanelResult());
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm, { numJudges: 3, numGenerations: 3 });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
|
||||
let active = 0;
|
||||
let maxActive = 0;
|
||||
const mockGenerateWorkflow = jest.fn().mockImplementation(async () => {
|
||||
active++;
|
||||
maxActive = Math.max(maxActive, active);
|
||||
await new Promise((r) => setTimeout(r, 25));
|
||||
active--;
|
||||
return createMockWorkflow();
|
||||
});
|
||||
|
||||
await evaluator.evaluate(workflow, {
|
||||
prompt: 'Create a workflow',
|
||||
generateWorkflow: mockGenerateWorkflow,
|
||||
llmCallLimiter: pLimit(1),
|
||||
});
|
||||
|
||||
expect(mockGenerateWorkflow).toHaveBeenCalledTimes(3);
|
||||
expect(maxActive).toBe(1);
|
||||
});
|
||||
|
||||
it('should aggregate results across generations', async () => {
|
||||
// First two generations pass, third fails
|
||||
mockRunJudgePanel
|
||||
.mockResolvedValueOnce(
|
||||
createMockPanelResult({ majorityPass: true, avgDiagnosticScore: 0.9 }),
|
||||
)
|
||||
.mockResolvedValueOnce(
|
||||
createMockPanelResult({ majorityPass: true, avgDiagnosticScore: 0.8 }),
|
||||
)
|
||||
.mockResolvedValueOnce(
|
||||
createMockPanelResult({ majorityPass: false, avgDiagnosticScore: 0.5 }),
|
||||
);
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm, { numJudges: 3, numGenerations: 3 });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const mockGenerateWorkflow = jest.fn().mockResolvedValue(createMockWorkflow());
|
||||
|
||||
const feedback = await evaluator.evaluate(workflow, {
|
||||
prompt: 'Create a workflow',
|
||||
generateWorkflow: mockGenerateWorkflow,
|
||||
});
|
||||
|
||||
// Should have multi-gen feedback
|
||||
const correctness = findFeedback(feedback, PAIRWISE_METRICS.PAIRWISE_GENERATION_CORRECTNESS);
|
||||
expect(correctness?.score).toBeCloseTo(2 / 3); // 2 of 3 passed
|
||||
expect(correctness?.comment).toBe('2/3 generations passed');
|
||||
|
||||
const diagnostic = findFeedback(feedback, PAIRWISE_METRICS.PAIRWISE_AGGREGATED_DIAGNOSTIC);
|
||||
expect(diagnostic?.score).toBeCloseTo((0.9 + 0.8 + 0.5) / 3);
|
||||
});
|
||||
|
||||
it('should return per-generation feedback', async () => {
|
||||
mockRunJudgePanel
|
||||
.mockResolvedValueOnce(
|
||||
createMockPanelResult({
|
||||
majorityPass: true,
|
||||
avgDiagnosticScore: 0.9,
|
||||
primaryPasses: 3,
|
||||
judgeResults: [
|
||||
{
|
||||
primaryPass: true,
|
||||
diagnosticScore: 0.9,
|
||||
violations: [],
|
||||
passes: [{ rule: 'Has trigger', justification: 'OK' }],
|
||||
},
|
||||
{
|
||||
primaryPass: true,
|
||||
diagnosticScore: 0.9,
|
||||
violations: [],
|
||||
passes: [{ rule: 'Has trigger', justification: 'OK' }],
|
||||
},
|
||||
{
|
||||
primaryPass: true,
|
||||
diagnosticScore: 0.9,
|
||||
violations: [],
|
||||
passes: [{ rule: 'Has trigger', justification: 'OK' }],
|
||||
},
|
||||
],
|
||||
}),
|
||||
)
|
||||
.mockResolvedValueOnce(
|
||||
createMockPanelResult({
|
||||
majorityPass: false,
|
||||
avgDiagnosticScore: 0.4,
|
||||
primaryPasses: 1,
|
||||
judgeResults: [
|
||||
{
|
||||
primaryPass: true,
|
||||
diagnosticScore: 0.5,
|
||||
violations: [],
|
||||
passes: [{ rule: 'Has trigger', justification: 'OK' }],
|
||||
},
|
||||
{
|
||||
primaryPass: false,
|
||||
diagnosticScore: 0.3,
|
||||
violations: [{ rule: 'Missing Notion', justification: 'No Notion node found' }],
|
||||
passes: [],
|
||||
},
|
||||
{
|
||||
primaryPass: false,
|
||||
diagnosticScore: 0.4,
|
||||
violations: [{ rule: 'No HTTP', justification: 'Contains HTTP Request node' }],
|
||||
passes: [],
|
||||
},
|
||||
],
|
||||
}),
|
||||
);
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm, { numJudges: 3, numGenerations: 2 });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const mockGenerateWorkflow = jest.fn().mockResolvedValue(createMockWorkflow());
|
||||
|
||||
const feedback = await evaluator.evaluate(workflow, {
|
||||
prompt: 'Create a workflow',
|
||||
generateWorkflow: mockGenerateWorkflow,
|
||||
});
|
||||
|
||||
// Gen 1 feedback
|
||||
const gen1Pass = findFeedback(feedback, 'gen1.majorityPass');
|
||||
expect(gen1Pass?.score).toBe(1);
|
||||
expect(gen1Pass?.comment).toBe('3/3 judges');
|
||||
|
||||
const gen1Diag = findFeedback(feedback, 'gen1.diagnosticScore');
|
||||
expect(gen1Diag?.score).toBe(0.9);
|
||||
|
||||
// Gen 2 feedback
|
||||
const gen2Pass = findFeedback(feedback, 'gen2.majorityPass');
|
||||
expect(gen2Pass?.score).toBe(0);
|
||||
expect(gen2Pass?.comment).toBe('1/3 judges');
|
||||
|
||||
const gen2Diag = findFeedback(feedback, 'gen2.diagnosticScore');
|
||||
expect(gen2Diag?.score).toBe(0.4);
|
||||
|
||||
const gen2Judge2 = findFeedback(feedback, 'gen2.judge2');
|
||||
expect(gen2Judge2?.score).toBe(0);
|
||||
expect(gen2Judge2?.comment).toContain('[Missing Notion] No Notion node found');
|
||||
|
||||
const gen2Judge3 = findFeedback(feedback, 'gen2.judge3');
|
||||
expect(gen2Judge3?.score).toBe(0);
|
||||
expect(gen2Judge3?.comment).toContain('[No HTTP] Contains HTTP Request node');
|
||||
});
|
||||
|
||||
it('should throw if generateWorkflow missing in context for multi-gen', async () => {
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm, { numGenerations: 3 });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
|
||||
await expect(
|
||||
evaluator.evaluate(workflow, {
|
||||
prompt: 'Create a workflow',
|
||||
// Missing generateWorkflow
|
||||
}),
|
||||
).rejects.toThrow('Multi-gen requires generateWorkflow and prompt in context');
|
||||
});
|
||||
|
||||
it('should throw if prompt missing in context for multi-gen', async () => {
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm, { numGenerations: 3 });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const mockGenerateWorkflow = jest.fn().mockResolvedValue(createMockWorkflow());
|
||||
|
||||
await expect(
|
||||
// @ts-expect-error Intentionally missing required prompt to verify runtime validation
|
||||
evaluator.evaluate(workflow, {
|
||||
generateWorkflow: mockGenerateWorkflow,
|
||||
// Missing prompt
|
||||
}),
|
||||
).rejects.toThrow('Multi-gen requires generateWorkflow and prompt in context');
|
||||
});
|
||||
|
||||
it('should run all generations in parallel', async () => {
|
||||
// Track call order with timestamps
|
||||
const callTimes: number[] = [];
|
||||
mockRunJudgePanel.mockImplementation(async () => {
|
||||
callTimes.push(Date.now());
|
||||
await new Promise((r) => setTimeout(r, 50)); // Simulate async work
|
||||
return createMockPanelResult();
|
||||
});
|
||||
|
||||
const { createPairwiseEvaluator } = await import('../../evaluators/pairwise');
|
||||
const evaluator = createPairwiseEvaluator(mockLlm, { numJudges: 3, numGenerations: 3 });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const mockGenerateWorkflow = jest.fn().mockResolvedValue(createMockWorkflow());
|
||||
|
||||
await evaluator.evaluate(workflow, {
|
||||
prompt: 'Create a workflow',
|
||||
generateWorkflow: mockGenerateWorkflow,
|
||||
});
|
||||
|
||||
expect(callTimes).toHaveLength(3);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,263 @@
|
|||
/**
|
||||
* Tests for programmatic evaluator factory.
|
||||
*
|
||||
* These tests mock the underlying programmaticEvaluation function and verify
|
||||
* that the factory correctly wraps it and transforms the results.
|
||||
*/
|
||||
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
// Store mock for programmaticEvaluation
|
||||
const mockProgrammaticEvaluation = jest.fn();
|
||||
|
||||
// Mock the programmatic evaluation module
|
||||
jest.mock('../../programmatic/programmatic-evaluation', () => ({
|
||||
programmaticEvaluation: (...args: unknown[]): unknown => mockProgrammaticEvaluation(...args),
|
||||
}));
|
||||
|
||||
/** Helper to create a minimal valid workflow for tests */
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return { name, nodes: [], connections: {} };
|
||||
}
|
||||
|
||||
/** Helper to create mock evaluation result */
|
||||
function createMockEvaluationResult(
|
||||
overrides: Partial<{
|
||||
overallScore: number;
|
||||
connections: { score: number; violations: Array<{ type: string; description: string }> };
|
||||
nodes: { score: number; violations: Array<{ type: string; description: string }> };
|
||||
trigger: { score: number; violations: Array<{ type: string; description: string }> };
|
||||
agentPrompt: { score: number; violations: Array<{ type: string; description: string }> };
|
||||
tools: { score: number; violations: Array<{ type: string; description: string }> };
|
||||
fromAi: { score: number; violations: Array<{ type: string; description: string }> };
|
||||
credentials: { score: number; violations: Array<{ type: string; description: string }> };
|
||||
similarity: { score: number; violations: Array<{ type: string; description: string }> } | null;
|
||||
}> = {},
|
||||
) {
|
||||
return {
|
||||
overallScore: 0.85,
|
||||
connections: { score: 1.0, violations: [] },
|
||||
nodes: { score: 1.0, violations: [] },
|
||||
trigger: { score: 1.0, violations: [] },
|
||||
agentPrompt: { score: 0.9, violations: [] },
|
||||
tools: { score: 1.0, violations: [] },
|
||||
fromAi: { score: 0.8, violations: [] },
|
||||
credentials: { score: 1.0, violations: [] },
|
||||
similarity: null,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe('Programmatic Evaluator', () => {
|
||||
const mockNodeTypes: INodeTypeDescription[] = [];
|
||||
type ProgrammaticFeedback = {
|
||||
evaluator: string;
|
||||
metric: string;
|
||||
score: number;
|
||||
comment?: string;
|
||||
};
|
||||
const findFeedback = (feedback: ProgrammaticFeedback[], metric: string) =>
|
||||
feedback.find((f) => f.evaluator === 'programmatic' && f.metric === metric);
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
});
|
||||
|
||||
describe('createProgrammaticEvaluator()', () => {
|
||||
it('should create an evaluator with correct name', async () => {
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
expect(evaluator.name).toBe('programmatic');
|
||||
});
|
||||
|
||||
it('should call programmaticEvaluation with workflow and context', async () => {
|
||||
mockProgrammaticEvaluation.mockResolvedValue(createMockEvaluationResult());
|
||||
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const context = { prompt: 'Create a test workflow' };
|
||||
|
||||
await evaluator.evaluate(workflow, context);
|
||||
|
||||
expect(mockProgrammaticEvaluation).toHaveBeenCalledWith(
|
||||
{
|
||||
userPrompt: 'Create a test workflow',
|
||||
generatedWorkflow: workflow,
|
||||
referenceWorkflows: undefined,
|
||||
},
|
||||
mockNodeTypes,
|
||||
);
|
||||
});
|
||||
|
||||
it('should pass reference workflows when provided in context', async () => {
|
||||
mockProgrammaticEvaluation.mockResolvedValue(createMockEvaluationResult());
|
||||
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflow = createMockWorkflow('Reference');
|
||||
const context = { prompt: 'Test prompt', referenceWorkflows: [referenceWorkflow] };
|
||||
|
||||
await evaluator.evaluate(workflow, context);
|
||||
|
||||
expect(mockProgrammaticEvaluation).toHaveBeenCalledWith(
|
||||
{
|
||||
userPrompt: 'Test prompt',
|
||||
generatedWorkflow: workflow,
|
||||
referenceWorkflows: [referenceWorkflow],
|
||||
},
|
||||
mockNodeTypes,
|
||||
);
|
||||
});
|
||||
|
||||
it('should return feedback with overall score', async () => {
|
||||
mockProgrammaticEvaluation.mockResolvedValue(
|
||||
createMockEvaluationResult({ overallScore: 0.92 }),
|
||||
);
|
||||
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test' });
|
||||
|
||||
const overallFeedback = findFeedback(feedback, 'overall');
|
||||
expect(overallFeedback).toEqual({
|
||||
evaluator: 'programmatic',
|
||||
metric: 'overall',
|
||||
score: 0.92,
|
||||
kind: 'score',
|
||||
});
|
||||
});
|
||||
|
||||
it('should return feedback for all check categories', async () => {
|
||||
mockProgrammaticEvaluation.mockResolvedValue(createMockEvaluationResult());
|
||||
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test' });
|
||||
|
||||
const metrics = feedback.filter((f) => f.evaluator === 'programmatic').map((f) => f.metric);
|
||||
expect(metrics).toContain('overall');
|
||||
expect(metrics).toContain('connections');
|
||||
expect(metrics).toContain('trigger');
|
||||
expect(metrics).toContain('agentPrompt');
|
||||
expect(metrics).toContain('tools');
|
||||
expect(metrics).toContain('fromAi');
|
||||
});
|
||||
|
||||
it('should include violations in feedback comments', async () => {
|
||||
mockProgrammaticEvaluation.mockResolvedValue(
|
||||
createMockEvaluationResult({
|
||||
connections: {
|
||||
score: 0.5,
|
||||
violations: [
|
||||
{ type: 'disconnected-node', description: 'Node A has no connections' },
|
||||
{ type: 'invalid-connection', description: 'Invalid edge between B and C' },
|
||||
],
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test' });
|
||||
|
||||
const connectionsFeedback = findFeedback(feedback, 'connections');
|
||||
expect(connectionsFeedback?.comment).toContain(
|
||||
'[disconnected-node] Node A has no connections',
|
||||
);
|
||||
expect(connectionsFeedback?.comment).toContain(
|
||||
'[invalid-connection] Invalid edge between B and C',
|
||||
);
|
||||
});
|
||||
|
||||
it('should not include comment when no violations', async () => {
|
||||
mockProgrammaticEvaluation.mockResolvedValue(
|
||||
createMockEvaluationResult({
|
||||
trigger: { score: 1.0, violations: [] },
|
||||
}),
|
||||
);
|
||||
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test' });
|
||||
|
||||
const triggerFeedback = findFeedback(feedback, 'trigger');
|
||||
expect(triggerFeedback?.comment).toBeUndefined();
|
||||
});
|
||||
|
||||
it('should include similarity feedback when reference workflows provided', async () => {
|
||||
mockProgrammaticEvaluation.mockResolvedValue(
|
||||
createMockEvaluationResult({
|
||||
similarity: {
|
||||
score: 0.75,
|
||||
violations: [{ type: 'node-mismatch', description: 'Missing expected node' }],
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflow = createMockWorkflow('Reference');
|
||||
const feedback = await evaluator.evaluate(workflow, {
|
||||
prompt: 'Test',
|
||||
referenceWorkflows: [referenceWorkflow],
|
||||
});
|
||||
|
||||
const similarityFeedback = findFeedback(feedback, 'similarity');
|
||||
expect(similarityFeedback).toEqual({
|
||||
evaluator: 'programmatic',
|
||||
metric: 'similarity',
|
||||
score: 0.75,
|
||||
kind: 'metric',
|
||||
comment: '[node-mismatch] Missing expected node',
|
||||
});
|
||||
});
|
||||
|
||||
it('should not include similarity feedback when result is null', async () => {
|
||||
mockProgrammaticEvaluation.mockResolvedValue(
|
||||
createMockEvaluationResult({
|
||||
similarity: null,
|
||||
}),
|
||||
);
|
||||
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test' });
|
||||
|
||||
const similarityFeedback = findFeedback(feedback, 'similarity');
|
||||
expect(similarityFeedback).toBeUndefined();
|
||||
});
|
||||
|
||||
it('should handle evaluation errors gracefully', async () => {
|
||||
mockProgrammaticEvaluation.mockRejectedValue(new Error('Evaluation failed'));
|
||||
|
||||
const { createProgrammaticEvaluator } = await import('../../evaluators/programmatic');
|
||||
const evaluator = createProgrammaticEvaluator(mockNodeTypes);
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
|
||||
// Should throw - let the runner handle errors
|
||||
await expect(evaluator.evaluate(workflow, { prompt: 'Test' })).rejects.toThrow(
|
||||
'Evaluation failed',
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,306 @@
|
|||
/**
|
||||
* Tests for similarity evaluator factory.
|
||||
*
|
||||
* These tests mock the underlying workflow similarity functions and verify
|
||||
* that the factory correctly wraps them and transforms the results.
|
||||
*/
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
// Store mocks for similarity functions
|
||||
const mockEvaluateWorkflowSimilarity = jest.fn();
|
||||
const mockEvaluateWorkflowSimilarityMultiple = jest.fn();
|
||||
|
||||
// Mock the workflow similarity module
|
||||
jest.mock('../../programmatic/evaluators/workflow-similarity', () => ({
|
||||
evaluateWorkflowSimilarity: (...args: unknown[]): unknown =>
|
||||
mockEvaluateWorkflowSimilarity(...args),
|
||||
evaluateWorkflowSimilarityMultiple: (...args: unknown[]): unknown =>
|
||||
mockEvaluateWorkflowSimilarityMultiple(...args),
|
||||
}));
|
||||
|
||||
/** Helper to create a minimal valid workflow for tests */
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return { name, nodes: [], connections: {} };
|
||||
}
|
||||
|
||||
/** Helper to create mock similarity result */
|
||||
function createMockSimilarityResult(
|
||||
overrides: Partial<{
|
||||
violations: Array<{ name: string; type: string; description: string; pointsDeducted: number }>;
|
||||
score: number;
|
||||
}> = {},
|
||||
) {
|
||||
return {
|
||||
violations: [],
|
||||
score: 0.85,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe('Similarity Evaluator', () => {
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
});
|
||||
type SimilarityFeedback = { evaluator: string; metric: string; score: number; comment?: string };
|
||||
const findFeedback = (feedback: SimilarityFeedback[], metric: string) =>
|
||||
feedback.find((f) => f.evaluator === 'similarity' && f.metric === metric);
|
||||
|
||||
describe('createSimilarityEvaluator()', () => {
|
||||
it('should create an evaluator with correct name', async () => {
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator();
|
||||
|
||||
expect(evaluator.name).toBe('similarity');
|
||||
});
|
||||
|
||||
it('should return error feedback when no reference workflow provided', async () => {
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator();
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const feedback = await evaluator.evaluate(workflow, { prompt: 'Test prompt' });
|
||||
|
||||
expect(mockEvaluateWorkflowSimilarity).not.toHaveBeenCalled();
|
||||
expect(mockEvaluateWorkflowSimilarityMultiple).not.toHaveBeenCalled();
|
||||
|
||||
expect(feedback).toContainEqual({
|
||||
evaluator: 'similarity',
|
||||
metric: 'error',
|
||||
score: 0,
|
||||
kind: 'score',
|
||||
comment: 'No reference workflow provided for comparison',
|
||||
});
|
||||
});
|
||||
|
||||
it('should call evaluateWorkflowSimilarity with single reference workflow', async () => {
|
||||
mockEvaluateWorkflowSimilarity.mockResolvedValue(createMockSimilarityResult());
|
||||
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator({ preset: 'strict' });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflow = createMockWorkflow('Reference');
|
||||
|
||||
await evaluator.evaluate(workflow, {
|
||||
prompt: 'Test prompt',
|
||||
referenceWorkflows: [referenceWorkflow],
|
||||
});
|
||||
|
||||
expect(mockEvaluateWorkflowSimilarity).toHaveBeenCalledWith(
|
||||
workflow,
|
||||
referenceWorkflow,
|
||||
'strict',
|
||||
undefined,
|
||||
);
|
||||
});
|
||||
|
||||
it('should call evaluateWorkflowSimilarityMultiple with multiple reference workflows', async () => {
|
||||
mockEvaluateWorkflowSimilarityMultiple.mockResolvedValue(createMockSimilarityResult());
|
||||
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator({ preset: 'lenient' });
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflows = [
|
||||
createMockWorkflow('Reference 1'),
|
||||
createMockWorkflow('Reference 2'),
|
||||
];
|
||||
|
||||
await evaluator.evaluate(workflow, { prompt: 'Test prompt', referenceWorkflows });
|
||||
|
||||
expect(mockEvaluateWorkflowSimilarityMultiple).toHaveBeenCalledWith(
|
||||
workflow,
|
||||
referenceWorkflows,
|
||||
'lenient',
|
||||
undefined,
|
||||
);
|
||||
});
|
||||
|
||||
it('should use default preset when not specified', async () => {
|
||||
mockEvaluateWorkflowSimilarity.mockResolvedValue(createMockSimilarityResult());
|
||||
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator();
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflow = createMockWorkflow('Reference');
|
||||
|
||||
await evaluator.evaluate(workflow, {
|
||||
prompt: 'Test prompt',
|
||||
referenceWorkflows: [referenceWorkflow],
|
||||
});
|
||||
|
||||
expect(mockEvaluateWorkflowSimilarity).toHaveBeenCalledWith(
|
||||
workflow,
|
||||
referenceWorkflow,
|
||||
'standard',
|
||||
undefined,
|
||||
);
|
||||
});
|
||||
|
||||
it('should return feedback with similarity score', async () => {
|
||||
mockEvaluateWorkflowSimilarity.mockResolvedValue(createMockSimilarityResult({ score: 0.92 }));
|
||||
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator();
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflow = createMockWorkflow('Reference');
|
||||
|
||||
const feedback = await evaluator.evaluate(workflow, {
|
||||
prompt: 'Test prompt',
|
||||
referenceWorkflows: [referenceWorkflow],
|
||||
});
|
||||
|
||||
const scoreFeedback = findFeedback(feedback, 'score');
|
||||
expect(scoreFeedback?.score).toBe(0.92);
|
||||
});
|
||||
|
||||
it('should include violations in score feedback comment', async () => {
|
||||
mockEvaluateWorkflowSimilarity.mockResolvedValue(
|
||||
createMockSimilarityResult({
|
||||
score: 0.7,
|
||||
violations: [
|
||||
{
|
||||
name: 'workflow-similarity-node-delete',
|
||||
type: 'major',
|
||||
description: 'Missing Slack node',
|
||||
pointsDeducted: 10,
|
||||
},
|
||||
],
|
||||
}),
|
||||
);
|
||||
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator();
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflow = createMockWorkflow('Reference');
|
||||
|
||||
const feedback = await evaluator.evaluate(workflow, {
|
||||
prompt: 'Test prompt',
|
||||
referenceWorkflows: [referenceWorkflow],
|
||||
});
|
||||
|
||||
const scoreFeedback = findFeedback(feedback, 'score');
|
||||
expect(scoreFeedback?.comment).toContain('[major] Missing Slack node');
|
||||
});
|
||||
|
||||
it('should return feedback for each violation type', async () => {
|
||||
mockEvaluateWorkflowSimilarity.mockResolvedValue(
|
||||
createMockSimilarityResult({
|
||||
violations: [
|
||||
{
|
||||
name: 'workflow-similarity-node-delete',
|
||||
type: 'major',
|
||||
description: 'Missing node A',
|
||||
pointsDeducted: 10,
|
||||
},
|
||||
{
|
||||
name: 'workflow-similarity-node-delete',
|
||||
type: 'major',
|
||||
description: 'Missing node B',
|
||||
pointsDeducted: 10,
|
||||
},
|
||||
{
|
||||
name: 'workflow-similarity-edge-insert',
|
||||
type: 'minor',
|
||||
description: 'Extra connection',
|
||||
pointsDeducted: 5,
|
||||
},
|
||||
],
|
||||
}),
|
||||
);
|
||||
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator();
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflow = createMockWorkflow('Reference');
|
||||
|
||||
const feedback = await evaluator.evaluate(workflow, {
|
||||
prompt: 'Test prompt',
|
||||
referenceWorkflows: [referenceWorkflow],
|
||||
});
|
||||
|
||||
const nodeDeleteFeedback = findFeedback(feedback, 'node-delete');
|
||||
expect(nodeDeleteFeedback).toBeDefined();
|
||||
expect(nodeDeleteFeedback?.comment).toContain('2 node-delete');
|
||||
|
||||
const edgeInsertFeedback = findFeedback(feedback, 'edge-insert');
|
||||
expect(edgeInsertFeedback).toBeDefined();
|
||||
expect(edgeInsertFeedback?.comment).toContain('1 edge-insert');
|
||||
});
|
||||
|
||||
it('should handle evaluation errors gracefully', async () => {
|
||||
mockEvaluateWorkflowSimilarity.mockRejectedValue(new Error('uvx command not found'));
|
||||
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator();
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflow = createMockWorkflow('Reference');
|
||||
|
||||
const feedback = await evaluator.evaluate(workflow, {
|
||||
prompt: 'Test prompt',
|
||||
referenceWorkflows: [referenceWorkflow],
|
||||
});
|
||||
|
||||
const errorFeedback = findFeedback(feedback, 'error');
|
||||
expect(errorFeedback).toEqual({
|
||||
evaluator: 'similarity',
|
||||
metric: 'error',
|
||||
score: 0,
|
||||
kind: 'score',
|
||||
comment: 'uvx command not found',
|
||||
});
|
||||
});
|
||||
|
||||
it('should pass custom config path when provided', async () => {
|
||||
mockEvaluateWorkflowSimilarity.mockResolvedValue(createMockSimilarityResult());
|
||||
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator({
|
||||
preset: 'standard',
|
||||
customConfigPath: '/path/to/config.json',
|
||||
});
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflow = createMockWorkflow('Reference');
|
||||
|
||||
await evaluator.evaluate(workflow, {
|
||||
prompt: 'Test prompt',
|
||||
referenceWorkflows: [referenceWorkflow],
|
||||
});
|
||||
|
||||
expect(mockEvaluateWorkflowSimilarity).toHaveBeenCalledWith(
|
||||
workflow,
|
||||
referenceWorkflow,
|
||||
'standard',
|
||||
'/path/to/config.json',
|
||||
);
|
||||
});
|
||||
|
||||
it('should use evaluateWorkflowSimilarityMultiple when referenceWorkflows has multiple items', async () => {
|
||||
mockEvaluateWorkflowSimilarityMultiple.mockResolvedValue(createMockSimilarityResult());
|
||||
|
||||
const { createSimilarityEvaluator } = await import('../../evaluators/similarity');
|
||||
const evaluator = createSimilarityEvaluator();
|
||||
|
||||
const workflow = createMockWorkflow();
|
||||
const referenceWorkflows = [
|
||||
createMockWorkflow('Reference 1'),
|
||||
createMockWorkflow('Reference 2'),
|
||||
];
|
||||
|
||||
await evaluator.evaluate(workflow, {
|
||||
prompt: 'Test prompt',
|
||||
referenceWorkflows,
|
||||
});
|
||||
|
||||
expect(mockEvaluateWorkflowSimilarityMultiple).toHaveBeenCalled();
|
||||
expect(mockEvaluateWorkflowSimilarity).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
import { langsmithMetricKey } from '../harness/feedback';
|
||||
import type { Feedback } from '../harness/harness-types';
|
||||
|
||||
describe('langsmithMetricKey()', () => {
|
||||
it('should keep llm-judge metrics unprefixed (root and sub-metrics)', () => {
|
||||
const root: Feedback = {
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'overallScore',
|
||||
score: 1,
|
||||
kind: 'score',
|
||||
};
|
||||
const sub: Feedback = {
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'maintainability.workflowOrganization',
|
||||
score: 1,
|
||||
kind: 'detail',
|
||||
};
|
||||
|
||||
expect(langsmithMetricKey(root)).toBe('overallScore');
|
||||
expect(langsmithMetricKey(sub)).toBe('maintainability.workflowOrganization');
|
||||
});
|
||||
|
||||
it('should prefix programmatic metrics with evaluator name', () => {
|
||||
const fb: Feedback = { evaluator: 'programmatic', metric: 'trigger', score: 1, kind: 'metric' };
|
||||
expect(langsmithMetricKey(fb)).toBe('programmatic.trigger');
|
||||
});
|
||||
|
||||
it('should keep pairwise v1 metrics unprefixed and namespace non-v1 details', () => {
|
||||
const v1: Feedback = {
|
||||
evaluator: 'pairwise',
|
||||
metric: 'pairwise_primary',
|
||||
score: 0,
|
||||
kind: 'score',
|
||||
};
|
||||
const detail: Feedback = { evaluator: 'pairwise', metric: 'judge1', score: 0, kind: 'detail' };
|
||||
|
||||
expect(langsmithMetricKey(v1)).toBe('pairwise_primary');
|
||||
expect(langsmithMetricKey(detail)).toBe('pairwise.judge1');
|
||||
});
|
||||
|
||||
it('should not produce collisions for the known evaluator contract', () => {
|
||||
const feedback: Feedback[] = [
|
||||
{ evaluator: 'llm-judge', metric: 'connections', score: 1, kind: 'metric' },
|
||||
{ evaluator: 'llm-judge', metric: 'overallScore', score: 1, kind: 'score' },
|
||||
{ evaluator: 'programmatic', metric: 'connections', score: 1, kind: 'metric' },
|
||||
{ evaluator: 'programmatic', metric: 'overall', score: 1, kind: 'score' },
|
||||
{ evaluator: 'pairwise', metric: 'pairwise_primary', score: 1, kind: 'score' },
|
||||
{ evaluator: 'pairwise', metric: 'pairwise_total_violations', score: 1, kind: 'detail' },
|
||||
{ evaluator: 'pairwise', metric: 'judge1', score: 0, kind: 'detail' },
|
||||
];
|
||||
|
||||
const keys = feedback.map(langsmithMetricKey);
|
||||
expect(new Set(keys).size).toBe(keys.length);
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,818 @@
|
|||
/**
|
||||
* Tests for default console lifecycle implementation.
|
||||
*/
|
||||
|
||||
import { mock } from 'jest-mock-extended';
|
||||
import type { Client } from 'langsmith/client';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import type {
|
||||
EvaluationLifecycle,
|
||||
RunConfig,
|
||||
ExampleResult,
|
||||
RunSummary,
|
||||
Feedback,
|
||||
} from '../harness/harness-types';
|
||||
import { createLogger } from '../harness/logger';
|
||||
|
||||
const mockLangsmithClient = () => mock<Client>();
|
||||
|
||||
// Mock console methods
|
||||
const mockConsole = {
|
||||
log: jest.fn(),
|
||||
warn: jest.fn(),
|
||||
error: jest.fn(),
|
||||
};
|
||||
|
||||
// Store original console
|
||||
const originalConsole = { ...console };
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
console.log = mockConsole.log;
|
||||
console.warn = mockConsole.warn;
|
||||
console.error = mockConsole.error;
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
console.log = originalConsole.log;
|
||||
console.warn = originalConsole.warn;
|
||||
console.error = originalConsole.error;
|
||||
});
|
||||
|
||||
/** Helper to create a minimal valid workflow for tests */
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return { name, nodes: [], connections: {} };
|
||||
}
|
||||
|
||||
describe('Console Lifecycle', () => {
|
||||
describe('createConsoleLifecycle()', () => {
|
||||
it('should create a lifecycle with all hooks', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });
|
||||
|
||||
expect(lifecycle.onStart).toBeDefined();
|
||||
expect(lifecycle.onExampleStart).toBeDefined();
|
||||
expect(lifecycle.onWorkflowGenerated).toBeDefined();
|
||||
expect(lifecycle.onEvaluatorComplete).toBeDefined();
|
||||
expect(lifecycle.onEvaluatorError).toBeDefined();
|
||||
expect(lifecycle.onExampleComplete).toBeDefined();
|
||||
expect(lifecycle.onEnd).toBeDefined();
|
||||
});
|
||||
|
||||
it('should log experiment info on start with test cases array', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [],
|
||||
logger: createLogger(false),
|
||||
};
|
||||
|
||||
lifecycle.onStart(config);
|
||||
|
||||
expect(mockConsole.log).toHaveBeenCalled();
|
||||
const logOutput = mockConsole.log.mock.calls.flat().join(' ');
|
||||
expect(logOutput).toContain('local');
|
||||
expect(logOutput).toContain('Test cases');
|
||||
});
|
||||
|
||||
it('should log dataset name for langsmith mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'my-dataset-name',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [{ name: 'test-eval', evaluate: jest.fn() }],
|
||||
langsmithClient: mockLangsmithClient(),
|
||||
langsmithOptions: {
|
||||
experimentName: 'test-experiment',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: createLogger(false),
|
||||
};
|
||||
|
||||
lifecycle.onStart(config);
|
||||
|
||||
expect(mockConsole.log).toHaveBeenCalled();
|
||||
const logOutput = mockConsole.log.mock.calls.flat().join(' ');
|
||||
expect(logOutput).toContain('langsmith');
|
||||
expect(logOutput).toContain('my-dataset-name');
|
||||
});
|
||||
|
||||
it('should not log summary in langsmith mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'my-dataset-name',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [{ name: 'test-eval', evaluate: jest.fn() }],
|
||||
langsmithClient: mockLangsmithClient(),
|
||||
langsmithOptions: {
|
||||
experimentName: 'test-experiment',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: createLogger(false),
|
||||
};
|
||||
|
||||
lifecycle.onStart(config);
|
||||
mockConsole.log.mockClear();
|
||||
|
||||
lifecycle.onEnd({
|
||||
totalExamples: 0,
|
||||
passed: 0,
|
||||
failed: 0,
|
||||
errors: 0,
|
||||
averageScore: 0,
|
||||
totalDurationMs: 0,
|
||||
});
|
||||
|
||||
expect(mockConsole.log).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should log example progress in verbose mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });
|
||||
|
||||
lifecycle.onExampleStart(1, 10, 'Test prompt that is quite long and should be truncated');
|
||||
|
||||
expect(mockConsole.log).toHaveBeenCalled();
|
||||
const logOutput = mockConsole.log.mock.calls.flat().join(' ');
|
||||
expect(logOutput).toContain('[ex 1/10]');
|
||||
});
|
||||
|
||||
it('should not log example progress in non-verbose mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });
|
||||
|
||||
lifecycle.onExampleStart(1, 10, 'Test prompt');
|
||||
|
||||
// Should not log in non-verbose mode
|
||||
expect(mockConsole.log).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should log workflow generation in verbose mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });
|
||||
|
||||
const workflow = createMockWorkflow('My Workflow');
|
||||
lifecycle.onWorkflowGenerated(workflow, 1500);
|
||||
|
||||
// workflow generation is reported as part of the example completion block
|
||||
expect(mockConsole.log).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should log evaluator completion in verbose mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });
|
||||
|
||||
lifecycle.onEvaluatorComplete('llm-judge', [
|
||||
{ evaluator: 'llm-judge', metric: 'func', score: 0.8, kind: 'metric' },
|
||||
{ evaluator: 'llm-judge', metric: 'conn', score: 0.9, kind: 'metric' },
|
||||
]);
|
||||
|
||||
// evaluator completion is reported as part of the example completion block
|
||||
expect(mockConsole.log).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should not log evaluator completion in non-verbose mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });
|
||||
|
||||
lifecycle.onEvaluatorComplete('llm-judge', [
|
||||
{ evaluator: 'llm-judge', metric: 'func', score: 0.8, kind: 'metric' },
|
||||
]);
|
||||
|
||||
expect(mockConsole.log).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should display critical metrics in verbose mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });
|
||||
|
||||
const result: ExampleResult = {
|
||||
index: 1,
|
||||
prompt: 'Test prompt that is quite long and should be truncated for display in logs',
|
||||
status: 'pass',
|
||||
score: 0.85,
|
||||
feedback: [
|
||||
{ evaluator: 'llm-judge', metric: 'functionality', score: 0.95, kind: 'metric' },
|
||||
{ evaluator: 'llm-judge', metric: 'connections', score: 0.8, kind: 'metric' },
|
||||
{ evaluator: 'llm-judge', metric: 'overallScore', score: 0.85, kind: 'score' },
|
||||
{ evaluator: 'other', metric: 'metric', score: 0.5, kind: 'detail' },
|
||||
],
|
||||
durationMs: 2000,
|
||||
generationDurationMs: 1500,
|
||||
evaluationDurationMs: 500,
|
||||
workflow: createMockWorkflow(),
|
||||
};
|
||||
|
||||
lifecycle.onExampleComplete(1, result);
|
||||
|
||||
expect(mockConsole.log).toHaveBeenCalled();
|
||||
const logOutput = mockConsole.log.mock.calls.flat().join(' ');
|
||||
expect(logOutput).toContain('prompt=');
|
||||
expect(logOutput).toContain('functionality');
|
||||
expect(logOutput).toContain('connections');
|
||||
expect(logOutput).toContain('overallScore');
|
||||
});
|
||||
|
||||
it('should display violations in verbose mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });
|
||||
|
||||
const result: ExampleResult = {
|
||||
index: 1,
|
||||
prompt: 'Test',
|
||||
status: 'fail',
|
||||
score: 0.3,
|
||||
feedback: [
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'functionality',
|
||||
score: 0.5,
|
||||
comment: '[critical] Missing trigger node',
|
||||
kind: 'metric',
|
||||
},
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'connections',
|
||||
score: 0.3,
|
||||
comment: '[major] Disconnected node found',
|
||||
kind: 'metric',
|
||||
},
|
||||
],
|
||||
durationMs: 1500,
|
||||
};
|
||||
|
||||
lifecycle.onExampleComplete(1, result);
|
||||
|
||||
expect(mockConsole.log).toHaveBeenCalled();
|
||||
const logOutput = mockConsole.log.mock.calls.flat().join(' ');
|
||||
expect(logOutput).toContain('issues');
|
||||
expect(logOutput).toContain('Missing trigger node');
|
||||
expect(logOutput).toContain('Disconnected node found');
|
||||
});
|
||||
|
||||
it('should display pairwise judge violations in verbose mode', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });
|
||||
|
||||
const result: ExampleResult = {
|
||||
index: 1,
|
||||
prompt: 'Test',
|
||||
status: 'fail',
|
||||
score: 0.5,
|
||||
feedback: [
|
||||
{
|
||||
evaluator: 'pairwise',
|
||||
metric: 'pairwise_generation_correctness',
|
||||
score: 0.5,
|
||||
comment: '1/2 generations passed',
|
||||
kind: 'score',
|
||||
},
|
||||
{
|
||||
evaluator: 'pairwise',
|
||||
metric: 'gen2.judge2',
|
||||
score: 0,
|
||||
comment: '[No HTTP] Contains HTTP Request node',
|
||||
kind: 'detail',
|
||||
},
|
||||
],
|
||||
durationMs: 1500,
|
||||
};
|
||||
|
||||
lifecycle.onExampleComplete(1, result);
|
||||
|
||||
expect(mockConsole.log).toHaveBeenCalled();
|
||||
const logOutput = mockConsole.log.mock.calls.flat().join(' ');
|
||||
expect(logOutput).toContain('issues');
|
||||
expect(logOutput).toContain('gen2.judge2');
|
||||
expect(logOutput).toContain('Contains HTTP Request node');
|
||||
});
|
||||
|
||||
it('should limit violations display to 5 and show count', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });
|
||||
|
||||
const result: ExampleResult = {
|
||||
index: 1,
|
||||
prompt: 'Test',
|
||||
status: 'fail',
|
||||
score: 0.5,
|
||||
feedback: [
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'v1',
|
||||
score: 0.5,
|
||||
comment: '[minor] Violation 1',
|
||||
kind: 'detail',
|
||||
},
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'v2',
|
||||
score: 0.5,
|
||||
comment: '[minor] Violation 2',
|
||||
kind: 'detail',
|
||||
},
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'v3',
|
||||
score: 0.5,
|
||||
comment: '[minor] Violation 3',
|
||||
kind: 'detail',
|
||||
},
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'v4',
|
||||
score: 0.5,
|
||||
comment: '[minor] Violation 4',
|
||||
kind: 'detail',
|
||||
},
|
||||
],
|
||||
durationMs: 1000,
|
||||
};
|
||||
|
||||
lifecycle.onExampleComplete(1, result);
|
||||
|
||||
expect(mockConsole.log).toHaveBeenCalled();
|
||||
const logOutput = mockConsole.log.mock.calls.flat().join(' ');
|
||||
expect(logOutput).toContain('and 1 more');
|
||||
});
|
||||
|
||||
it('should not display violations for error feedback', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });
|
||||
|
||||
const result: ExampleResult = {
|
||||
index: 1,
|
||||
prompt: 'Test',
|
||||
status: 'error',
|
||||
score: 0,
|
||||
feedback: [
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'error',
|
||||
score: 0,
|
||||
comment: 'Crashed',
|
||||
kind: 'score',
|
||||
},
|
||||
],
|
||||
durationMs: 500,
|
||||
error: 'Generation failed',
|
||||
};
|
||||
|
||||
lifecycle.onExampleComplete(1, result);
|
||||
|
||||
expect(mockConsole.log).toHaveBeenCalled();
|
||||
const logOutput = mockConsole.log.mock.calls.flat().join(' ');
|
||||
expect(logOutput).not.toContain('issues');
|
||||
});
|
||||
|
||||
it('should handle empty feedback array', async () => {
|
||||
const { createConsoleLifecycle } = await import('../harness/lifecycle');
|
||||
const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });
|
||||
|
||||
const result: ExampleResult = {
|
||||
index: 1,
|
||||
prompt: 'Test',
|
||||
status: 'fail',
|
||||
score: 0,
|
||||
feedback: [],
|
||||
durationMs: 500,
|
||||
};
|
||||
|
||||
lifecycle.onExampleComplete(1, result);
|
||||
|
||||
expect(mockConsole.log).toHaveBeenCalled();
|
||||
const logOutput = mockConsole.log.mock.calls.flat().join(' ');
|
||||
expect(logOutput).toContain('0%');
|
||||
});
|
||||
|
||||
// Evaluator errors reach console.error only when verbose is enabled, and the
// output includes the evaluator name plus the error message.
it('should log evaluator errors in verbose mode', async () => {
	const { createConsoleLifecycle } = await import('../harness/lifecycle');
	const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });

	lifecycle.onEvaluatorError('test-evaluator', new Error('Something went wrong'));

	expect(mockConsole.error).toHaveBeenCalled();
	const errorOutput = mockConsole.error.mock.calls.flat().join(' ');
	expect(errorOutput).toContain('test-evaluator');
	expect(errorOutput).toContain('Something went wrong');
});

it('should NOT log evaluator errors in non-verbose mode', async () => {
	const { createConsoleLifecycle } = await import('../harness/lifecycle');
	const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });

	lifecycle.onEvaluatorError('test-evaluator', new Error('Something went wrong'));

	expect(mockConsole.error).not.toHaveBeenCalled();
});

// Completion lines carry a PASS/FAIL label derived from the result status.
it('should log example completion with pass/fail status', async () => {
	const { createConsoleLifecycle } = await import('../harness/lifecycle');
	const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });

	const passResult: ExampleResult = {
		index: 1,
		prompt: 'Test',
		status: 'pass',
		score: 0.9,
		feedback: [{ evaluator: 'test-eval', metric: 'test', score: 0.9, kind: 'score' }],
		durationMs: 2000,
	};

	const failResult: ExampleResult = {
		index: 2,
		prompt: 'Test',
		status: 'fail',
		score: 0.3,
		feedback: [{ evaluator: 'test-eval', metric: 'test', score: 0.3, kind: 'score' }],
		durationMs: 1500,
	};

	lifecycle.onExampleComplete(1, passResult);
	lifecycle.onExampleComplete(2, failResult);

	const allOutput = mockConsole.log.mock.calls.flat().join(' ');
	expect(allOutput).toContain('PASS');
	expect(allOutput).toContain('FAIL');
});

it('should log example completion with error status', async () => {
	const { createConsoleLifecycle } = await import('../harness/lifecycle');
	const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });

	const errorResult: ExampleResult = {
		index: 1,
		prompt: 'Test',
		status: 'error',
		score: 0,
		feedback: [],
		durationMs: 500,
		error: 'Generation failed',
	};

	lifecycle.onExampleComplete(1, errorResult);

	const allOutput = mockConsole.log.mock.calls.flat().join(' ');
	expect(allOutput).toContain('ERROR');
});

it('should not log example completion in non-verbose mode', async () => {
	const { createConsoleLifecycle } = await import('../harness/lifecycle');
	const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });

	const result: ExampleResult = {
		index: 1,
		prompt: 'Test',
		status: 'pass',
		score: 0.9,
		feedback: [{ evaluator: 'test-eval', metric: 'test', score: 0.9, kind: 'score' }],
		durationMs: 2000,
	};

	lifecycle.onExampleComplete(1, result);

	expect(mockConsole.log).not.toHaveBeenCalled();
});

it('should not log workflow generation in non-verbose mode', async () => {
	const { createConsoleLifecycle } = await import('../harness/lifecycle');
	const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });

	const workflow = createMockWorkflow('My Workflow');
	lifecycle.onWorkflowGenerated(workflow, 1500);

	expect(mockConsole.log).not.toHaveBeenCalled();
});

it('should use different colors for different score ranges', async () => {
	const { createConsoleLifecycle } = await import('../harness/lifecycle');
	const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(true) });

	// Three feedback entries in distinct score bands (>=0.9, mid, low).
	const result: ExampleResult = {
		index: 1,
		prompt: 'Test',
		status: 'pass',
		score: 0.8,
		feedback: [
			{ evaluator: 'high', metric: 'test', score: 0.95, kind: 'score' },
			{ evaluator: 'medium', metric: 'test', score: 0.75, kind: 'score' },
			{ evaluator: 'low', metric: 'test', score: 0.5, kind: 'score' },
		],
		durationMs: 1000,
	};

	lifecycle.onExampleComplete(1, result);

	expect(mockConsole.log).toHaveBeenCalled();
	// The coloring is applied, tests verify that the function runs without error
	const logOutput = mockConsole.log.mock.calls.flat().join(' ');
	expect(logOutput).toContain('high');
	expect(logOutput).toContain('medium');
	expect(logOutput).toContain('low');
});

// The end-of-run summary is printed regardless of verbosity and includes
// the example counts and the average score formatted as a percentage.
it('should log summary with statistics on end', async () => {
	const { createConsoleLifecycle } = await import('../harness/lifecycle');
	const lifecycle = createConsoleLifecycle({ verbose: false, logger: createLogger(false) });

	const summary: RunSummary = {
		totalExamples: 10,
		passed: 7,
		failed: 2,
		errors: 1,
		averageScore: 0.85,
		totalDurationMs: 30000,
	};

	lifecycle.onEnd(summary);

	expect(mockConsole.log).toHaveBeenCalled();
	const logOutput = mockConsole.log.mock.calls.flat().join(' ');
	expect(logOutput).toContain('10');
	expect(logOutput).toContain('7');
	expect(logOutput).toContain('85%');
});

// Non-finite (NaN) feedback scores must be filtered out of console output.
it('should not print NaN when feedback contains non-finite scores', async () => {
	const { createConsoleLifecycle } = await import('../harness/lifecycle');
	const lifecycle = createConsoleLifecycle({ verbose: true, logger: createLogger(false) });

	const config: RunConfig = {
		mode: 'local',
		dataset: [{ prompt: 'Test' }],
		generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
		evaluators: [{ name: 'programmatic', evaluate: jest.fn() }],
		logger: createLogger(false),
	};

	lifecycle.onStart(config);

	const result: ExampleResult = {
		index: 1,
		prompt: 'Test',
		status: 'pass',
		score: 1,
		durationMs: 10,
		workflow: createMockWorkflow(),
		feedback: [
			{ evaluator: 'programmatic', metric: 'connections', score: 1, kind: 'metric' },
			{ evaluator: 'programmatic', metric: 'trigger', score: Number.NaN, kind: 'metric' },
		],
	};

	lifecycle.onExampleComplete(1, result);

	const logOutput = mockConsole.log.mock.calls.flat().join(' ');
	expect(logOutput).not.toContain('NaN');
});
|
||||
});
|
||||
|
||||
// createQuietLifecycle() returns a complete lifecycle whose hooks are all
// silent no-ops — useful when a caller needs a lifecycle but no output.
describe('createQuietLifecycle()', () => {
	it('should create lifecycle with empty hooks', async () => {
		const { createQuietLifecycle } = await import('../harness/lifecycle');
		const lifecycle = createQuietLifecycle();

		// Should have all hooks
		expect(lifecycle.onStart).toBeDefined();
		expect(lifecycle.onEnd).toBeDefined();

		// Hooks should be no-ops
		const config: RunConfig = {
			mode: 'local',
			dataset: [],
			generateWorkflow: jest.fn(),
			evaluators: [],
			logger: createLogger(false),
		};

		lifecycle.onStart(config);
		lifecycle.onExampleStart(1, 1, 'Test');
		lifecycle.onEnd({
			totalExamples: 1,
			passed: 1,
			failed: 0,
			errors: 0,
			averageScore: 1,
			totalDurationMs: 1000,
		});

		// Should not log anything
		expect(mockConsole.log).not.toHaveBeenCalled();
	});
});
|
||||
|
||||
// mergeLifecycles() fans each hook invocation out to every provided lifecycle,
// tolerating missing hooks and undefined lifecycle entries.
describe('mergeLifecycles()', () => {
	it('should merge multiple lifecycles into one', async () => {
		const { mergeLifecycles } = await import('../harness/lifecycle');

		const hook1 = jest.fn();
		const hook2 = jest.fn();

		const lifecycle1: Partial<EvaluationLifecycle> = {
			onStart: hook1,
		};

		const lifecycle2: Partial<EvaluationLifecycle> = {
			onStart: hook2,
		};

		const merged = mergeLifecycles(lifecycle1, lifecycle2);

		const config: RunConfig = {
			mode: 'local',
			dataset: [],
			generateWorkflow: jest.fn(),
			evaluators: [],
			logger: createLogger(false),
		};

		merged.onStart(config);

		// Both underlying hooks receive the same arguments.
		expect(hook1).toHaveBeenCalledWith(config);
		expect(hook2).toHaveBeenCalledWith(config);
	});

	it('should handle undefined hooks gracefully', async () => {
		const { mergeLifecycles } = await import('../harness/lifecycle');

		const hook = jest.fn();

		const lifecycle1: Partial<EvaluationLifecycle> = {
			onStart: hook,
		};

		const lifecycle2: Partial<EvaluationLifecycle> = {
			// No onStart
		};

		const merged = mergeLifecycles(lifecycle1, lifecycle2);

		const config: RunConfig = {
			mode: 'local',
			dataset: [],
			generateWorkflow: jest.fn(),
			evaluators: [],
			logger: createLogger(false),
		};

		merged.onStart(config);

		expect(hook).toHaveBeenCalledWith(config);
	});

	it('should handle undefined lifecycles in array', async () => {
		const { mergeLifecycles } = await import('../harness/lifecycle');

		const hook = jest.fn();

		const lifecycle1: Partial<EvaluationLifecycle> = {
			onStart: hook,
		};

		const merged = mergeLifecycles(lifecycle1, undefined, undefined);

		const config: RunConfig = {
			mode: 'local',
			dataset: [],
			generateWorkflow: jest.fn(),
			evaluators: [],
			logger: createLogger(false),
		};

		merged.onStart(config);

		expect(hook).toHaveBeenCalledWith(config);
	});

	it('should merge onExampleStart hooks', async () => {
		const { mergeLifecycles } = await import('../harness/lifecycle');

		const hook1 = jest.fn();
		const hook2 = jest.fn();

		const lifecycle1: Partial<EvaluationLifecycle> = { onExampleStart: hook1 };
		const lifecycle2: Partial<EvaluationLifecycle> = { onExampleStart: hook2 };

		const merged = mergeLifecycles(lifecycle1, lifecycle2);
		merged.onExampleStart(1, 10, 'Test prompt');

		expect(hook1).toHaveBeenCalledWith(1, 10, 'Test prompt');
		expect(hook2).toHaveBeenCalledWith(1, 10, 'Test prompt');
	});

	it('should merge onWorkflowGenerated hooks', async () => {
		const { mergeLifecycles } = await import('../harness/lifecycle');

		const hook1 = jest.fn();
		const hook2 = jest.fn();

		const lifecycle1: Partial<EvaluationLifecycle> = { onWorkflowGenerated: hook1 };
		const lifecycle2: Partial<EvaluationLifecycle> = { onWorkflowGenerated: hook2 };

		const merged = mergeLifecycles(lifecycle1, lifecycle2);
		const workflow = createMockWorkflow();
		merged.onWorkflowGenerated(workflow, 1000);

		expect(hook1).toHaveBeenCalledWith(workflow, 1000);
		expect(hook2).toHaveBeenCalledWith(workflow, 1000);
	});

	it('should merge onEvaluatorComplete hooks', async () => {
		const { mergeLifecycles } = await import('../harness/lifecycle');

		const hook1 = jest.fn();
		const hook2 = jest.fn();

		const lifecycle1: Partial<EvaluationLifecycle> = { onEvaluatorComplete: hook1 };
		const lifecycle2: Partial<EvaluationLifecycle> = { onEvaluatorComplete: hook2 };

		const merged = mergeLifecycles(lifecycle1, lifecycle2);
		const feedback: Feedback[] = [
			{ evaluator: 'test-eval', metric: 'test', score: 0.9, kind: 'score' },
		];
		merged.onEvaluatorComplete('test-eval', feedback);

		expect(hook1).toHaveBeenCalledWith('test-eval', feedback);
		expect(hook2).toHaveBeenCalledWith('test-eval', feedback);
	});

	it('should merge onEvaluatorError hooks', async () => {
		const { mergeLifecycles } = await import('../harness/lifecycle');

		const hook1 = jest.fn();
		const hook2 = jest.fn();

		const lifecycle1: Partial<EvaluationLifecycle> = { onEvaluatorError: hook1 };
		const lifecycle2: Partial<EvaluationLifecycle> = { onEvaluatorError: hook2 };

		const merged = mergeLifecycles(lifecycle1, lifecycle2);
		const error = new Error('Test error');
		merged.onEvaluatorError('test-eval', error);

		expect(hook1).toHaveBeenCalledWith('test-eval', error);
		expect(hook2).toHaveBeenCalledWith('test-eval', error);
	});

	it('should merge onExampleComplete hooks', async () => {
		const { mergeLifecycles } = await import('../harness/lifecycle');

		const hook1 = jest.fn();
		const hook2 = jest.fn();

		const lifecycle1: Partial<EvaluationLifecycle> = { onExampleComplete: hook1 };
		const lifecycle2: Partial<EvaluationLifecycle> = { onExampleComplete: hook2 };

		const merged = mergeLifecycles(lifecycle1, lifecycle2);
		const result: ExampleResult = {
			index: 1,
			prompt: 'Test',
			status: 'pass',
			score: 1,
			feedback: [],
			durationMs: 1000,
		};
		merged.onExampleComplete(1, result);

		expect(hook1).toHaveBeenCalledWith(1, result);
		expect(hook2).toHaveBeenCalledWith(1, result);
	});

	it('should merge onEnd hooks', async () => {
		const { mergeLifecycles } = await import('../harness/lifecycle');

		const hook1 = jest.fn();
		const hook2 = jest.fn();

		const lifecycle1: Partial<EvaluationLifecycle> = { onEnd: hook1 };
		const lifecycle2: Partial<EvaluationLifecycle> = { onEnd: hook2 };

		const merged = mergeLifecycles(lifecycle1, lifecycle2);
		const summary: RunSummary = {
			totalExamples: 10,
			passed: 8,
			failed: 1,
			errors: 1,
			averageScore: 0.85,
			totalDurationMs: 5000,
		};
		merged.onEnd(summary);

		expect(hook1).toHaveBeenCalledWith(summary);
		expect(hook2).toHaveBeenCalledWith(summary);
	});
});
|
||||
});
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
/**
|
||||
* Tests for multi-generation utilities.
|
||||
*
|
||||
* These utilities support aggregating results across multiple workflow
|
||||
* generations in pairwise evaluation.
|
||||
*/
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import {
|
||||
getMajorityThreshold,
|
||||
aggregateGenerations,
|
||||
type GenerationDetail,
|
||||
} from '../harness/multi-gen';
|
||||
|
||||
/** Helper to create a mock workflow */
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return { name, nodes: [], connections: {} };
|
||||
}
|
||||
|
||||
/** Helper to create a generation detail */
|
||||
function createGenerationDetail(overrides: Partial<GenerationDetail> = {}): GenerationDetail {
|
||||
return {
|
||||
workflow: createMockWorkflow(),
|
||||
majorityPass: true,
|
||||
diagnosticScore: 0.8,
|
||||
primaryPasses: 2,
|
||||
numJudges: 3,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe('Multi-Generation Utilities', () => {
	// Majority threshold: smallest number of primary-criteria passes that counts
	// as a majority; ties go to "pass" for even judge counts.
	describe('getMajorityThreshold()', () => {
		it('should throw for 0 judges', () => {
			expect(() => getMajorityThreshold(0)).toThrow('numJudges must be >= 1');
		});

		it('should return 1 for 1 judge', () => {
			expect(getMajorityThreshold(1)).toBe(1);
		});

		it('should return 2 for 3 judges', () => {
			expect(getMajorityThreshold(3)).toBe(2);
		});

		it('should return 3 for 5 judges', () => {
			expect(getMajorityThreshold(5)).toBe(3);
		});

		it('should return 1 for 2 judges (tie-goes-to-pass)', () => {
			expect(getMajorityThreshold(2)).toBe(1);
		});

		it('should return 2 for 4 judges (tie-goes-to-pass)', () => {
			expect(getMajorityThreshold(4)).toBe(2);
		});
	});

	// aggregateGenerations(): correctness = fraction of generations whose
	// majority vote passed; diagnostic score = mean across generations.
	describe('aggregateGenerations()', () => {
		it('should calculate generation correctness', () => {
			const details = [
				createGenerationDetail({ majorityPass: true }),
				createGenerationDetail({ majorityPass: true }),
				createGenerationDetail({ majorityPass: false }),
			];

			const result = aggregateGenerations(details);

			expect(result.generationCorrectness).toBeCloseTo(2 / 3);
			expect(result.passingGenerations).toBe(2);
			expect(result.totalGenerations).toBe(3);
		});

		it('should calculate aggregated diagnostic score', () => {
			const details = [
				createGenerationDetail({ diagnosticScore: 0.9 }),
				createGenerationDetail({ diagnosticScore: 0.8 }),
				createGenerationDetail({ diagnosticScore: 0.7 }),
			];

			const result = aggregateGenerations(details);

			expect(result.aggregatedDiagnosticScore).toBeCloseTo(0.8);
		});

		it('should count passing generations', () => {
			const details = [
				createGenerationDetail({ majorityPass: true }),
				createGenerationDetail({ majorityPass: false }),
				createGenerationDetail({ majorityPass: true }),
				createGenerationDetail({ majorityPass: true }),
				createGenerationDetail({ majorityPass: false }),
			];

			const result = aggregateGenerations(details);

			expect(result.passingGenerations).toBe(3);
			expect(result.totalGenerations).toBe(5);
		});

		it('should handle all passing', () => {
			const details = [
				createGenerationDetail({ majorityPass: true, diagnosticScore: 1.0 }),
				createGenerationDetail({ majorityPass: true, diagnosticScore: 1.0 }),
				createGenerationDetail({ majorityPass: true, diagnosticScore: 1.0 }),
			];

			const result = aggregateGenerations(details);

			expect(result.generationCorrectness).toBe(1);
			expect(result.aggregatedDiagnosticScore).toBe(1);
			expect(result.passingGenerations).toBe(3);
		});

		it('should handle all failing', () => {
			const details = [
				createGenerationDetail({ majorityPass: false, diagnosticScore: 0.3 }),
				createGenerationDetail({ majorityPass: false, diagnosticScore: 0.2 }),
				createGenerationDetail({ majorityPass: false, diagnosticScore: 0.1 }),
			];

			const result = aggregateGenerations(details);

			expect(result.generationCorrectness).toBe(0);
			expect(result.aggregatedDiagnosticScore).toBeCloseTo(0.2);
			expect(result.passingGenerations).toBe(0);
		});

		it('should handle single generation', () => {
			const details = [createGenerationDetail({ majorityPass: true, diagnosticScore: 0.95 })];

			const result = aggregateGenerations(details);

			expect(result.generationCorrectness).toBe(1);
			expect(result.aggregatedDiagnosticScore).toBe(0.95);
			expect(result.passingGenerations).toBe(1);
			expect(result.totalGenerations).toBe(1);
		});

		// Per-generation detail objects are passed through unchanged.
		it('should preserve generation details', () => {
			const workflow1 = createMockWorkflow('Workflow 1');
			const workflow2 = createMockWorkflow('Workflow 2');

			const details = [
				createGenerationDetail({ workflow: workflow1, primaryPasses: 3 }),
				createGenerationDetail({ workflow: workflow2, primaryPasses: 1 }),
			];

			const result = aggregateGenerations(details);

			expect(result.generationDetails).toHaveLength(2);
			expect(result.generationDetails[0].workflow.name).toBe('Workflow 1');
			expect(result.generationDetails[0].primaryPasses).toBe(3);
			expect(result.generationDetails[1].workflow.name).toBe('Workflow 2');
			expect(result.generationDetails[1].primaryPasses).toBe(1);
		});
	});
});
|
||||
|
|
@ -0,0 +1,320 @@
|
|||
/**
|
||||
* Tests for artifact saving functionality.
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import { jsonParse } from 'n8n-workflow';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import type { ExampleResult, RunSummary } from '../harness/harness-types';
|
||||
import { createLogger } from '../harness/logger';
|
||||
import { createArtifactSaver } from '../harness/output';
|
||||
|
||||
const silentLogger = createLogger(false);
|
||||
|
||||
function findExampleDir(baseDir: string, paddedIndex: string): string {
|
||||
const entries = fs.readdirSync(baseDir, { withFileTypes: true });
|
||||
const prefix = `example-${paddedIndex}-`;
|
||||
const match = entries.find((e) => e.isDirectory() && e.name.startsWith(prefix));
|
||||
if (!match) throw new Error(`Expected example dir starting with "${prefix}" in ${baseDir}`);
|
||||
return path.join(baseDir, match.name);
|
||||
}
|
||||
|
||||
/** Shape of the saved workflow.json artifact after JSON parsing. */
interface ParsedWorkflow {
	name: string;
	nodes: unknown[];
	connections: Record<string, unknown>;
}

/** Shape of the saved feedback.json artifact: per-example feedback grouped by evaluator. */
interface ParsedFeedback {
	index: number;
	status: string;
	score: number;
	evaluators: Array<{
		name: string;
		averageScore: number;
		feedback: Array<{ key: string; score: number }>;
	}>;
}

/** Shape of the saved summary.json artifact: run totals plus per-example entries. */
interface ParsedSummary {
	totalExamples: number;
	passed: number;
	failed: number;
	passRate: number;
	timestamp: string;
	evaluatorAverages: Record<string, number>;
	results: Array<{
		prompt: string;
	}>;
}
|
||||
|
||||
/** Helper to create a minimal valid workflow for tests */
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return {
|
||||
name,
|
||||
nodes: [
|
||||
{
|
||||
id: '1',
|
||||
type: 'n8n-nodes-base.start',
|
||||
name: 'Start',
|
||||
position: [0, 0],
|
||||
typeVersion: 1,
|
||||
parameters: {},
|
||||
},
|
||||
],
|
||||
connections: {},
|
||||
};
|
||||
}
|
||||
|
||||
/** Helper to create a mock example result */
|
||||
function createMockResult(overrides: Partial<ExampleResult> = {}): ExampleResult {
|
||||
return {
|
||||
index: 1,
|
||||
prompt: 'Create a test workflow',
|
||||
status: 'pass',
|
||||
score: 0.9,
|
||||
feedback: [
|
||||
{ evaluator: 'llm-judge', metric: 'functionality', score: 0.9, kind: 'metric' },
|
||||
{ evaluator: 'llm-judge', metric: 'connections', score: 0.8, kind: 'metric' },
|
||||
{ evaluator: 'programmatic', metric: 'overall', score: 1.0, kind: 'score' },
|
||||
],
|
||||
durationMs: 1500,
|
||||
workflow: createMockWorkflow(),
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
/** Helper to create a mock summary */
|
||||
function createMockSummary(): RunSummary {
|
||||
return {
|
||||
totalExamples: 3,
|
||||
passed: 2,
|
||||
failed: 1,
|
||||
errors: 0,
|
||||
averageScore: 0.85,
|
||||
totalDurationMs: 5000,
|
||||
};
|
||||
}
|
||||
|
||||
describe('Artifact Saver', () => {
	let tempDir: string;

	beforeEach(() => {
		// Create a unique temp directory for each test
		tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'v2-eval-test-'));
	});

	afterEach(() => {
		// Clean up temp directory
		if (tempDir && fs.existsSync(tempDir)) {
			fs.rmSync(tempDir, { recursive: true, force: true });
		}
	});

	describe('createArtifactSaver()', () => {
		it('should create output directory if it does not exist', () => {
			// Nested path — the saver must create intermediate directories too.
			const outputDir = path.join(tempDir, 'nested', 'output');
			createArtifactSaver({ outputDir, logger: silentLogger });

			expect(fs.existsSync(outputDir)).toBe(true);
		});

		it('should return an artifact saver with saveExample and saveSummary methods', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });

			expect(saver.saveExample).toBeDefined();
			expect(saver.saveSummary).toBeDefined();
		});
	});

	describe('saveExample()', () => {
		it('should save prompt to prompt.txt', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const result = createMockResult({ index: 1 });

			saver.saveExample(result);

			const exampleDir = findExampleDir(tempDir, '001');
			const promptPath = path.join(exampleDir, 'prompt.txt');
			expect(fs.existsSync(promptPath)).toBe(true);
			expect(fs.readFileSync(promptPath, 'utf-8')).toBe('Create a test workflow');
		});

		it('should save workflow to workflow.json in n8n-importable format', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const result = createMockResult();

			saver.saveExample(result);

			const exampleDir = findExampleDir(tempDir, '001');
			const workflowPath = path.join(exampleDir, 'workflow.json');
			expect(fs.existsSync(workflowPath)).toBe(true);

			const workflow = jsonParse<ParsedWorkflow>(fs.readFileSync(workflowPath, 'utf-8'));
			expect(workflow.name).toBe('Test Workflow');
			expect(workflow.nodes).toHaveLength(1);
			expect(workflow.connections).toEqual({});
		});

		it('should save feedback to feedback.json', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const result = createMockResult();

			saver.saveExample(result);

			const exampleDir = findExampleDir(tempDir, '001');
			const feedbackPath = path.join(exampleDir, 'feedback.json');
			expect(fs.existsSync(feedbackPath)).toBe(true);

			const feedback = jsonParse<ParsedFeedback>(fs.readFileSync(feedbackPath, 'utf-8'));
			expect(feedback.index).toBe(1);
			expect(feedback.status).toBe('pass');
			expect(feedback.evaluators).toHaveLength(2); // llm-judge and programmatic
		});

		it('should group feedback by evaluator', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const result = createMockResult();

			saver.saveExample(result);

			const exampleDir = findExampleDir(tempDir, '001');
			const feedbackPath = path.join(exampleDir, 'feedback.json');
			const feedback = jsonParse<ParsedFeedback>(fs.readFileSync(feedbackPath, 'utf-8'));

			// createMockResult() has 2 llm-judge entries and 1 programmatic entry.
			const llmJudge = feedback.evaluators.find((e) => e.name === 'llm-judge');
			expect(llmJudge?.feedback).toHaveLength(2);

			const programmatic = feedback.evaluators.find((e) => e.name === 'programmatic');
			expect(programmatic?.feedback).toHaveLength(1);
		});

		it('should ignore non-finite scores when computing evaluator averages', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const result = createMockResult({
				feedback: [
					{ evaluator: 'programmatic', metric: 'connections', score: 1, kind: 'metric' },
					{ evaluator: 'programmatic', metric: 'trigger', score: Number.NaN, kind: 'metric' },
				],
			});

			saver.saveExample(result);

			const exampleDir = findExampleDir(tempDir, '001');
			const feedbackPath = path.join(exampleDir, 'feedback.json');
			const feedback = jsonParse<ParsedFeedback>(fs.readFileSync(feedbackPath, 'utf-8'));

			// The NaN entry must be excluded, so the average is the single finite score.
			const programmatic = feedback.evaluators.find((e) => e.name === 'programmatic');
			expect(programmatic).toBeDefined();
			expect(programmatic?.averageScore).toBe(1);
		});

		it('should save error to error.txt when present', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const result = createMockResult({
				status: 'error',
				score: 0,
				error: 'Generation failed: timeout',
			});

			saver.saveExample(result);

			const exampleDir = findExampleDir(tempDir, '001');
			const errorPath = path.join(exampleDir, 'error.txt');
			expect(fs.existsSync(errorPath)).toBe(true);
			expect(fs.readFileSync(errorPath, 'utf-8')).toBe('Generation failed: timeout');
		});

		it('should not save workflow.json when workflow is undefined', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const result = createMockResult({ workflow: undefined });

			saver.saveExample(result);

			const exampleDir = findExampleDir(tempDir, '001');
			const workflowPath = path.join(exampleDir, 'workflow.json');
			expect(fs.existsSync(workflowPath)).toBe(false);
		});

		it('should pad example index in directory name', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });

			saver.saveExample(createMockResult({ index: 1 }));
			saver.saveExample(createMockResult({ index: 10 }));
			saver.saveExample(createMockResult({ index: 100 }));

			// Indices are zero-padded to 3 digits so directories sort lexically.
			expect(() => findExampleDir(tempDir, '001')).not.toThrow();
			expect(() => findExampleDir(tempDir, '010')).not.toThrow();
			expect(() => findExampleDir(tempDir, '100')).not.toThrow();
		});
	});

	describe('saveSummary()', () => {
		it('should save summary.json with correct structure', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const summary = createMockSummary();
			const results = [
				createMockResult({ index: 1, status: 'pass' }),
				createMockResult({ index: 2, status: 'pass' }),
				createMockResult({ index: 3, status: 'fail' }),
			];

			saver.saveSummary(summary, results);

			const summaryPath = path.join(tempDir, 'summary.json');
			expect(fs.existsSync(summaryPath)).toBe(true);

			const savedSummary = jsonParse<ParsedSummary>(fs.readFileSync(summaryPath, 'utf-8'));
			expect(savedSummary.totalExamples).toBe(3);
			expect(savedSummary.passed).toBe(2);
			expect(savedSummary.failed).toBe(1);
			expect(savedSummary.passRate).toBeCloseTo(2 / 3);
		});

		it('should include timestamp', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			saver.saveSummary(createMockSummary(), [createMockResult()]);

			const summaryPath = path.join(tempDir, 'summary.json');
			const savedSummary = jsonParse<ParsedSummary>(fs.readFileSync(summaryPath, 'utf-8'));

			// Timestamp must be present and parseable as a date.
			expect(savedSummary.timestamp).toBeDefined();
			expect(new Date(savedSummary.timestamp).getTime()).not.toBeNaN();
		});

		it('should calculate per-evaluator averages', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const results = [createMockResult(), createMockResult()];

			saver.saveSummary(createMockSummary(), results);

			const summaryPath = path.join(tempDir, 'summary.json');
			const savedSummary = jsonParse<ParsedSummary>(fs.readFileSync(summaryPath, 'utf-8'));

			// llm-judge: mean of 0.9 and 0.8; programmatic: single 1.0 score.
			expect(savedSummary.evaluatorAverages).toBeDefined();
			expect(savedSummary.evaluatorAverages['llm-judge']).toBeCloseTo(0.85)
			expect(savedSummary.evaluatorAverages['programmatic']).toBe(1.0);
		});

		it('should include truncated prompts in results', () => {
			const saver = createArtifactSaver({ outputDir: tempDir, logger: silentLogger });
			const longPrompt = 'A'.repeat(200);
			const results = [createMockResult({ prompt: longPrompt })];

			saver.saveSummary(createMockSummary(), results);

			const summaryPath = path.join(tempDir, 'summary.json');
			const savedSummary = jsonParse<ParsedSummary>(fs.readFileSync(summaryPath, 'utf-8'));

			expect(savedSummary.results[0].prompt.length).toBeLessThan(longPrompt.length);
			expect(savedSummary.results[0].prompt).toContain('...');
		});
	});
});
|
||||
|
|
@ -0,0 +1,308 @@
|
|||
/**
|
||||
* Tests for markdown report generation.
|
||||
*
|
||||
* These utilities generate human-readable markdown reports
|
||||
* from evaluation results.
|
||||
*/
|
||||
|
||||
import type { ExampleResult, RunSummary, Feedback } from '../harness/harness-types';
|
||||
import {
|
||||
extractViolationSeverity,
|
||||
calculateReportMetrics,
|
||||
generateMarkdownReport,
|
||||
} from '../support/report-generator';
|
||||
|
||||
/** Helper to create a feedback item */
|
||||
function createFeedback(
|
||||
evaluator: string,
|
||||
metric: string,
|
||||
score: number,
|
||||
kind: Feedback['kind'] = 'metric',
|
||||
comment?: string,
|
||||
): Feedback {
|
||||
return { evaluator, metric, score, kind, ...(comment ? { comment } : {}) };
|
||||
}
|
||||
|
||||
/** Helper to create an example result */
|
||||
function createExampleResult(overrides: Partial<ExampleResult> = {}): ExampleResult {
|
||||
return {
|
||||
index: 1,
|
||||
prompt: 'Test prompt',
|
||||
status: 'pass',
|
||||
score: 0,
|
||||
feedback: [],
|
||||
durationMs: 1000,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
/** Helper to create a run summary */
|
||||
function createRunSummary(overrides: Partial<RunSummary> = {}): RunSummary {
|
||||
return {
|
||||
totalExamples: 10,
|
||||
passed: 8,
|
||||
failed: 1,
|
||||
errors: 1,
|
||||
averageScore: 0.75,
|
||||
totalDurationMs: 10000,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe('Report Generator', () => {
|
||||
describe('extractViolationSeverity()', () => {
|
||||
it('should extract critical severity', () => {
|
||||
expect(extractViolationSeverity('[CRITICAL] Missing trigger')).toBe('critical');
|
||||
});
|
||||
|
||||
it('should extract major severity', () => {
|
||||
expect(extractViolationSeverity('[MAJOR] Bad configuration')).toBe('major');
|
||||
});
|
||||
|
||||
it('should extract minor severity', () => {
|
||||
expect(extractViolationSeverity('[MINOR] Style issue')).toBe('minor');
|
||||
});
|
||||
|
||||
it('should return null for no violation marker', () => {
|
||||
expect(extractViolationSeverity('Just a comment')).toBeNull();
|
||||
});
|
||||
|
||||
it('should return null for undefined comment', () => {
|
||||
expect(extractViolationSeverity(undefined)).toBeNull();
|
||||
});
|
||||
|
||||
it('should be case-insensitive', () => {
|
||||
expect(extractViolationSeverity('[critical] lowercase')).toBe('critical');
|
||||
expect(extractViolationSeverity('[Critical] mixed')).toBe('critical');
|
||||
});
|
||||
});
|
||||
|
||||
describe('calculateReportMetrics()', () => {
|
||||
it('should calculate evaluator averages from feedback keys', () => {
|
||||
const results: ExampleResult[] = [
|
||||
createExampleResult({
|
||||
feedback: [
|
||||
createFeedback('llm-judge', 'functionality', 0.8),
|
||||
createFeedback('llm-judge', 'connections', 0.6),
|
||||
createFeedback('programmatic', 'trigger', 1.0),
|
||||
],
|
||||
}),
|
||||
];
|
||||
|
||||
const metrics = calculateReportMetrics(results);
|
||||
|
||||
expect(metrics.evaluatorAverages['llm-judge']).toBeCloseTo(0.7);
|
||||
expect(metrics.evaluatorAverages['programmatic']).toBeCloseTo(1.0);
|
||||
});
|
||||
|
||||
it('should ignore non-finite scores when computing evaluator averages', () => {
|
||||
const results: ExampleResult[] = [
|
||||
createExampleResult({
|
||||
feedback: [
|
||||
createFeedback('programmatic', 'connections', 1),
|
||||
createFeedback('programmatic', 'trigger', Number.NaN),
|
||||
],
|
||||
}),
|
||||
];
|
||||
|
||||
const metrics = calculateReportMetrics(results);
|
||||
|
||||
expect(metrics.evaluatorAverages['programmatic']).toBe(1);
|
||||
});
|
||||
|
||||
it('should count violations by severity from comments', () => {
|
||||
const results: ExampleResult[] = [
|
||||
createExampleResult({
|
||||
feedback: [
|
||||
createFeedback('a', 'b', 0, 'detail', '[CRITICAL] Missing node'),
|
||||
createFeedback('a', 'c', 0, 'detail', '[MAJOR] Bad config'),
|
||||
createFeedback('a', 'd', 0, 'detail', '[MINOR] Style issue'),
|
||||
createFeedback('a', 'e', 0, 'detail', '[CRITICAL] Another critical'),
|
||||
],
|
||||
}),
|
||||
];
|
||||
|
||||
const metrics = calculateReportMetrics(results);
|
||||
|
||||
expect(metrics.violationCounts.critical).toBe(2);
|
||||
expect(metrics.violationCounts.major).toBe(1);
|
||||
expect(metrics.violationCounts.minor).toBe(1);
|
||||
});
|
||||
|
||||
it('should handle empty results', () => {
|
||||
const metrics = calculateReportMetrics([]);
|
||||
|
||||
expect(metrics.evaluatorAverages).toEqual({});
|
||||
expect(metrics.violationCounts).toEqual({ critical: 0, major: 0, minor: 0 });
|
||||
});
|
||||
|
||||
it('should aggregate across multiple results', () => {
|
||||
const results: ExampleResult[] = [
|
||||
createExampleResult({
|
||||
feedback: [createFeedback('llm-judge', 'a', 0.8)],
|
||||
}),
|
||||
createExampleResult({
|
||||
feedback: [createFeedback('llm-judge', 'a', 0.6)],
|
||||
}),
|
||||
];
|
||||
|
||||
const metrics = calculateReportMetrics(results);
|
||||
|
||||
expect(metrics.evaluatorAverages['llm-judge']).toBeCloseTo(0.7);
|
||||
});
|
||||
|
||||
it('should handle results with errors', () => {
|
||||
const results: ExampleResult[] = [
|
||||
createExampleResult({
|
||||
status: 'error',
|
||||
feedback: [],
|
||||
error: 'Something went wrong',
|
||||
}),
|
||||
createExampleResult({
|
||||
feedback: [createFeedback('llm-judge', 'a', 0.8)],
|
||||
}),
|
||||
];
|
||||
|
||||
const metrics = calculateReportMetrics(results);
|
||||
|
||||
// Should still calculate from successful results
|
||||
expect(metrics.evaluatorAverages['llm-judge']).toBeCloseTo(0.8);
|
||||
});
|
||||
});
|
||||
|
||||
describe('generateMarkdownReport()', () => {
|
||||
it('should include summary section', () => {
|
||||
const results: ExampleResult[] = [];
|
||||
const summary = createRunSummary({
|
||||
totalExamples: 10,
|
||||
passed: 8,
|
||||
failed: 1,
|
||||
errors: 1,
|
||||
averageScore: 0.75,
|
||||
});
|
||||
|
||||
const report = generateMarkdownReport(results, summary);
|
||||
|
||||
expect(report).toContain('# AI Workflow Builder Evaluation Report');
|
||||
expect(report).toContain('## Summary');
|
||||
expect(report).toContain('Total Tests: 10');
|
||||
expect(report).toContain('Passed: 8');
|
||||
expect(report).toContain('Failed: 1');
|
||||
expect(report).toContain('Errors: 1');
|
||||
expect(report).toContain('75.0%');
|
||||
});
|
||||
|
||||
it('should include evaluator averages', () => {
|
||||
const results: ExampleResult[] = [
|
||||
createExampleResult({
|
||||
feedback: [
|
||||
createFeedback('llm-judge', 'a', 0.8),
|
||||
createFeedback('programmatic', 'b', 0.6),
|
||||
],
|
||||
}),
|
||||
];
|
||||
const summary = createRunSummary();
|
||||
|
||||
const report = generateMarkdownReport(results, summary);
|
||||
|
||||
expect(report).toContain('## Evaluator Averages');
|
||||
expect(report).toContain('llm-judge');
|
||||
expect(report).toContain('programmatic');
|
||||
});
|
||||
|
||||
it('should include violation summary', () => {
|
||||
const results: ExampleResult[] = [
|
||||
createExampleResult({
|
||||
feedback: [
|
||||
createFeedback('a', 'b', 0, 'detail', '[CRITICAL] Issue 1'),
|
||||
createFeedback('a', 'c', 0, 'detail', '[MAJOR] Issue 2'),
|
||||
],
|
||||
}),
|
||||
];
|
||||
const summary = createRunSummary();
|
||||
|
||||
const report = generateMarkdownReport(results, summary);
|
||||
|
||||
expect(report).toContain('## Violations Summary');
|
||||
expect(report).toContain('Critical: 1');
|
||||
expect(report).toContain('Major: 1');
|
||||
});
|
||||
|
||||
it('should include detailed results when option enabled', () => {
|
||||
const results: ExampleResult[] = [
|
||||
createExampleResult({
|
||||
index: 1,
|
||||
prompt: 'Create a workflow that sends emails',
|
||||
status: 'pass',
|
||||
durationMs: 1500,
|
||||
feedback: [createFeedback('llm-judge', 'a', 0.9, 'metric', 'Good job')],
|
||||
}),
|
||||
];
|
||||
const summary = createRunSummary();
|
||||
|
||||
const report = generateMarkdownReport(results, summary, { includeDetails: true });
|
||||
|
||||
expect(report).toContain('## Detailed Results');
|
||||
expect(report).toContain('### Test 1');
|
||||
expect(report).toContain('Create a workflow');
|
||||
expect(report).toContain('pass');
|
||||
expect(report).toContain('1500ms');
|
||||
});
|
||||
|
||||
it('should not include details when option disabled', () => {
|
||||
const results: ExampleResult[] = [createExampleResult({ prompt: 'Test prompt here' })];
|
||||
const summary = createRunSummary();
|
||||
|
||||
const report = generateMarkdownReport(results, summary, { includeDetails: false });
|
||||
|
||||
expect(report).not.toContain('## Detailed Results');
|
||||
expect(report).not.toContain('Test prompt here');
|
||||
});
|
||||
|
||||
it('should truncate long prompts in details', () => {
|
||||
const longPrompt = 'A'.repeat(200);
|
||||
const results: ExampleResult[] = [createExampleResult({ prompt: longPrompt })];
|
||||
const summary = createRunSummary();
|
||||
|
||||
const report = generateMarkdownReport(results, summary, { includeDetails: true });
|
||||
|
||||
expect(report).toContain('...');
|
||||
expect(report).not.toContain(longPrompt);
|
||||
});
|
||||
|
||||
it('should handle empty results gracefully', () => {
|
||||
const summary = createRunSummary({ totalExamples: 0, passed: 0, failed: 0, errors: 0 });
|
||||
|
||||
const report = generateMarkdownReport([], summary);
|
||||
|
||||
expect(report).toContain('# AI Workflow Builder Evaluation Report');
|
||||
expect(report).toContain('Total Tests: 0');
|
||||
});
|
||||
|
||||
it('should include feedback details in test results', () => {
|
||||
const results: ExampleResult[] = [
|
||||
createExampleResult({
|
||||
feedback: [
|
||||
createFeedback('llm-judge', 'functionality', 0.9, 'metric', 'Great functionality'),
|
||||
createFeedback('programmatic', 'trigger', 1.0),
|
||||
],
|
||||
}),
|
||||
];
|
||||
const summary = createRunSummary();
|
||||
|
||||
const report = generateMarkdownReport(results, summary, { includeDetails: true });
|
||||
|
||||
expect(report).toContain('llm-judge.functionality');
|
||||
expect(report).toContain('90.0%');
|
||||
expect(report).toContain('Great functionality');
|
||||
});
|
||||
|
||||
it('should format pass rate as percentage', () => {
|
||||
const summary = createRunSummary({ totalExamples: 10, passed: 8 });
|
||||
|
||||
const report = generateMarkdownReport([], summary);
|
||||
|
||||
expect(report).toContain('80.0%'); // Pass rate
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,678 @@
|
|||
/**
|
||||
* Tests for LangSmith mode runner.
|
||||
*
|
||||
* These tests mock the LangSmith evaluate() function to verify:
|
||||
* - Target function does all work (generation + evaluation)
|
||||
* - Evaluator just extracts pre-computed feedback
|
||||
* - Dataset context extraction is respected
|
||||
* - Filters trigger dataset example preloading
|
||||
*/
|
||||
|
||||
import { mock } from 'jest-mock-extended';
|
||||
import type { Client } from 'langsmith/client';
|
||||
import { evaluate as langsmithEvaluate } from 'langsmith/evaluation';
|
||||
import type { Dataset, Example } from 'langsmith/schemas';
|
||||
import * as fs from 'node:fs';
|
||||
import * as os from 'node:os';
|
||||
import * as path from 'node:path';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import type { Evaluator, Feedback, RunConfig } from '../harness/harness-types';
|
||||
import { createLogger } from '../harness/logger';
|
||||
|
||||
const silentLogger = createLogger(false);
|
||||
|
||||
jest.mock('langsmith/evaluation', () => ({
|
||||
evaluate: jest.fn(),
|
||||
}));
|
||||
|
||||
jest.mock('langsmith/traceable', () => ({
|
||||
traceable: jest.fn(
|
||||
<T extends (...args: unknown[]) => unknown>(fn: T, _options: unknown): T => fn,
|
||||
),
|
||||
}));
|
||||
|
||||
// Mock core/environment module (dynamically imported in runner.ts) — NOTE(review): no such mock follows; this comment looks stale, confirm whether it should be removed.
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return { name, nodes: [], connections: {} };
|
||||
}
|
||||
|
||||
function createMockEvaluator(
|
||||
name: string,
|
||||
feedback: Feedback[] = [{ evaluator: name, metric: 'score', score: 1, kind: 'score' }],
|
||||
): Evaluator {
|
||||
return {
|
||||
name,
|
||||
evaluate: jest.fn().mockResolvedValue(feedback),
|
||||
};
|
||||
}
|
||||
|
||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||
return typeof value === 'object' && value !== null && !Array.isArray(value);
|
||||
}
|
||||
|
||||
function isCallable(value: unknown): value is (...args: unknown[]) => unknown {
|
||||
return typeof value === 'function';
|
||||
}
|
||||
|
||||
type LangsmithTargetOutput = {
|
||||
workflow: SimpleWorkflow;
|
||||
prompt: string;
|
||||
feedback: Feedback[];
|
||||
};
|
||||
|
||||
function isSimpleWorkflow(value: unknown): value is SimpleWorkflow {
|
||||
return isRecord(value) && Array.isArray(value.nodes) && isRecord(value.connections);
|
||||
}
|
||||
|
||||
function isFeedback(value: unknown): value is Feedback {
|
||||
return (
|
||||
isRecord(value) &&
|
||||
typeof value.evaluator === 'string' &&
|
||||
typeof value.metric === 'string' &&
|
||||
typeof value.score === 'number' &&
|
||||
(value.kind === 'score' || value.kind === 'metric' || value.kind === 'detail')
|
||||
);
|
||||
}
|
||||
|
||||
function isLangsmithTargetOutput(value: unknown): value is LangsmithTargetOutput {
|
||||
return (
|
||||
isRecord(value) &&
|
||||
isSimpleWorkflow(value.workflow) &&
|
||||
typeof value.prompt === 'string' &&
|
||||
Array.isArray(value.feedback) &&
|
||||
value.feedback.every(isFeedback)
|
||||
);
|
||||
}
|
||||
|
||||
async function callLangsmithTarget(target: unknown, inputs: unknown): Promise<unknown> {
|
||||
if (isCallable(target)) return await target(inputs);
|
||||
if (isRecord(target) && isCallable(target.invoke)) return await target.invoke(inputs);
|
||||
throw new Error('Expected LangSmith target to be callable');
|
||||
}
|
||||
|
||||
function createMockLangsmithClient() {
|
||||
const lsClient = mock<Client>();
|
||||
lsClient.readDataset.mockResolvedValue(mock<Dataset>({ id: 'test-dataset-id' }));
|
||||
lsClient.listExamples.mockReturnValue((async function* () {})());
|
||||
lsClient.awaitPendingTraceBatches.mockResolvedValue(undefined);
|
||||
return lsClient;
|
||||
}
|
||||
|
||||
describe('Runner - LangSmith Mode', () => {
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
});
|
||||
|
||||
describe('runEvaluation() with LangSmith', () => {
|
||||
it('should call langsmith evaluate() with correct options', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'my-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [createMockEvaluator('test')],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test-experiment',
|
||||
repetitions: 2,
|
||||
concurrency: 4,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [_target, options] = mockEvaluate.mock.calls[0];
|
||||
expect(options).toEqual(
|
||||
expect.objectContaining({
|
||||
data: 'my-dataset',
|
||||
experimentPrefix: 'test-experiment',
|
||||
numRepetitions: 2,
|
||||
maxConcurrency: 4,
|
||||
client: lsClient,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should create target function that generates workflow and runs evaluators', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const workflow = createMockWorkflow('Generated');
|
||||
const generateWorkflow = jest.fn().mockResolvedValue(workflow);
|
||||
const evaluator = createMockEvaluator('test', [
|
||||
{ evaluator: 'test', metric: 'score', score: 0.9, kind: 'score' },
|
||||
]);
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow,
|
||||
evaluators: [evaluator],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [target] = mockEvaluate.mock.calls[0];
|
||||
|
||||
const result = await callLangsmithTarget(target, { prompt: 'Create a workflow' });
|
||||
expect(isLangsmithTargetOutput(result)).toBe(true);
|
||||
if (!isLangsmithTargetOutput(result)) throw new Error('Expected LangSmith target output');
|
||||
|
||||
expect(generateWorkflow).toHaveBeenCalledWith('Create a workflow');
|
||||
expect(evaluator.evaluate).toHaveBeenCalledWith(
|
||||
workflow,
|
||||
expect.objectContaining({ prompt: 'Create a workflow' }),
|
||||
);
|
||||
expect(result).toEqual({
|
||||
workflow,
|
||||
prompt: 'Create a workflow',
|
||||
feedback: [{ evaluator: 'test', metric: 'score', score: 0.9, kind: 'score' }],
|
||||
});
|
||||
});
|
||||
|
||||
it('should write artifacts when outputDir is provided', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'v2-evals-langsmith-out-'));
|
||||
try {
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
outputDir: tempDir,
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow('Generated')),
|
||||
evaluators: [createMockEvaluator('test')],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [target] = mockEvaluate.mock.calls[0];
|
||||
|
||||
await callLangsmithTarget(target, { prompt: 'Create a workflow' });
|
||||
|
||||
const entries = fs.readdirSync(tempDir, { withFileTypes: true });
|
||||
const exampleDir = entries.find(
|
||||
(e) => e.isDirectory() && e.name.startsWith('example-001-'),
|
||||
)?.name;
|
||||
expect(exampleDir).toBeDefined();
|
||||
|
||||
expect(fs.existsSync(path.join(tempDir, exampleDir!, 'prompt.txt'))).toBe(true);
|
||||
expect(fs.existsSync(path.join(tempDir, exampleDir!, 'workflow.json'))).toBe(true);
|
||||
expect(fs.existsSync(path.join(tempDir, exampleDir!, 'feedback.json'))).toBe(true);
|
||||
} finally {
|
||||
fs.rmSync(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it('should aggregate feedback from multiple evaluators in target', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const evaluator1 = createMockEvaluator('e1', [
|
||||
{ evaluator: 'e1', metric: 'score', score: 0.8, kind: 'score' },
|
||||
]);
|
||||
const evaluator2 = createMockEvaluator('e2', [
|
||||
{ evaluator: 'e2', metric: 'a', score: 0.9, kind: 'metric' },
|
||||
{ evaluator: 'e2', metric: 'b', score: 1.0, kind: 'metric' },
|
||||
]);
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [evaluator1, evaluator2],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [target] = mockEvaluate.mock.calls[0];
|
||||
const result = await callLangsmithTarget(target, { prompt: 'Test' });
|
||||
expect(isLangsmithTargetOutput(result)).toBe(true);
|
||||
if (!isLangsmithTargetOutput(result)) throw new Error('Expected LangSmith target output');
|
||||
|
||||
expect(result.feedback).toHaveLength(3);
|
||||
expect(result.feedback).toContainEqual({
|
||||
evaluator: 'e1',
|
||||
metric: 'score',
|
||||
score: 0.8,
|
||||
kind: 'score',
|
||||
});
|
||||
expect(result.feedback).toContainEqual({
|
||||
evaluator: 'e2',
|
||||
metric: 'a',
|
||||
score: 0.9,
|
||||
kind: 'metric',
|
||||
});
|
||||
expect(result.feedback).toContainEqual({
|
||||
evaluator: 'e2',
|
||||
metric: 'b',
|
||||
score: 1.0,
|
||||
kind: 'metric',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle evaluator errors gracefully in target', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const goodEvaluator = createMockEvaluator('good', [
|
||||
{ evaluator: 'good', metric: 'score', score: 1, kind: 'score' },
|
||||
]);
|
||||
const badEvaluator: Evaluator = {
|
||||
name: 'bad',
|
||||
evaluate: jest.fn().mockRejectedValue(new Error('Evaluator crashed')),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [goodEvaluator, badEvaluator],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [target] = mockEvaluate.mock.calls[0];
|
||||
|
||||
const result = await callLangsmithTarget(target, { prompt: 'Test' });
|
||||
expect(isLangsmithTargetOutput(result)).toBe(true);
|
||||
if (!isLangsmithTargetOutput(result)) throw new Error('Expected LangSmith target output');
|
||||
|
||||
expect(result.feedback).toContainEqual({
|
||||
evaluator: 'good',
|
||||
metric: 'score',
|
||||
score: 1,
|
||||
kind: 'score',
|
||||
});
|
||||
expect(result.feedback).toContainEqual({
|
||||
evaluator: 'bad',
|
||||
metric: 'error',
|
||||
score: 0,
|
||||
kind: 'score',
|
||||
comment: 'Evaluator crashed',
|
||||
});
|
||||
});
|
||||
|
||||
it('should create evaluator that extracts pre-computed feedback', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [createMockEvaluator('test')],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [_target, options] = mockEvaluate.mock.calls[0];
|
||||
|
||||
expect(Array.isArray(options.evaluators)).toBe(true);
|
||||
if (!Array.isArray(options.evaluators))
|
||||
throw new Error('Expected LangSmith evaluators array');
|
||||
expect(options.evaluators).toHaveLength(1);
|
||||
|
||||
const evaluatorFn = options.evaluators[0];
|
||||
expect(isCallable(evaluatorFn)).toBe(true);
|
||||
if (!isCallable(evaluatorFn)) throw new Error('Expected evaluator function');
|
||||
|
||||
const extracted = await evaluatorFn({
|
||||
outputs: {
|
||||
feedback: [
|
||||
{ evaluator: 'test', metric: 'score', score: 0.9, kind: 'score' },
|
||||
{ evaluator: 'other', metric: 'trigger', score: 0.8, kind: 'metric' },
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
expect(extracted).toEqual([
|
||||
{ key: 'test.score', score: 0.9 },
|
||||
{ key: 'other.trigger', score: 0.8 },
|
||||
]);
|
||||
});
|
||||
|
||||
it('should keep programmatic prefixes but not llm-judge metric prefixes', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [createMockEvaluator('test')],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [_target, options] = mockEvaluate.mock.calls[0];
|
||||
expect(Array.isArray(options.evaluators)).toBe(true);
|
||||
if (!Array.isArray(options.evaluators))
|
||||
throw new Error('Expected LangSmith evaluators array');
|
||||
|
||||
const evaluatorFn = options.evaluators[0];
|
||||
expect(isCallable(evaluatorFn)).toBe(true);
|
||||
if (!isCallable(evaluatorFn)) throw new Error('Expected evaluator function');
|
||||
|
||||
const extracted = await evaluatorFn({
|
||||
outputs: {
|
||||
feedback: [
|
||||
{ evaluator: 'llm-judge', metric: 'functionality', score: 0.9, kind: 'metric' },
|
||||
{ evaluator: 'programmatic', metric: 'trigger', score: 0.8, kind: 'metric' },
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'maintainability.nodeNamingQuality',
|
||||
score: 0.7,
|
||||
kind: 'detail',
|
||||
},
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
expect(extracted).toEqual([
|
||||
{ key: 'functionality', score: 0.9 },
|
||||
{ key: 'programmatic.trigger', score: 0.8 },
|
||||
{ key: 'maintainability.nodeNamingQuality', score: 0.7 },
|
||||
]);
|
||||
});
|
||||
|
||||
it('should handle missing feedback in outputs', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [_target, options] = mockEvaluate.mock.calls[0];
|
||||
|
||||
expect(Array.isArray(options.evaluators)).toBe(true);
|
||||
if (!Array.isArray(options.evaluators))
|
||||
throw new Error('Expected LangSmith evaluators array');
|
||||
const evaluatorFn = options.evaluators[0];
|
||||
expect(isCallable(evaluatorFn)).toBe(true);
|
||||
if (!isCallable(evaluatorFn)) throw new Error('Expected evaluator function');
|
||||
|
||||
const extracted = await evaluatorFn({ outputs: {} });
|
||||
expect(extracted).toEqual([
|
||||
{
|
||||
key: 'evaluationError',
|
||||
score: 0,
|
||||
comment: 'No feedback found in target output',
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('should pass dataset-level context to evaluators', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const evaluateContextual: Evaluator['evaluate'] = async (_workflow, ctx) => [
|
||||
{ evaluator: 'contextual', metric: 'score', score: ctx.dos ? 1 : 0, kind: 'score' },
|
||||
];
|
||||
|
||||
const evaluator: Evaluator = {
|
||||
name: 'contextual',
|
||||
evaluate: jest.fn(evaluateContextual),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [evaluator],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [target] = mockEvaluate.mock.calls[0];
|
||||
|
||||
const result = await callLangsmithTarget(target, {
|
||||
prompt: 'Test',
|
||||
evals: { dos: 'Use Slack', donts: 'No HTTP' },
|
||||
});
|
||||
expect(isLangsmithTargetOutput(result)).toBe(true);
|
||||
if (!isLangsmithTargetOutput(result)) throw new Error('Expected LangSmith target output');
|
||||
|
||||
expect(evaluator.evaluate).toHaveBeenCalledWith(
|
||||
expect.anything(),
|
||||
expect.objectContaining({ dos: 'Use Slack', donts: 'No HTTP' }),
|
||||
);
|
||||
expect(result.feedback).toContainEqual({
|
||||
evaluator: 'contextual',
|
||||
metric: 'score',
|
||||
score: 1,
|
||||
kind: 'score',
|
||||
});
|
||||
});
|
||||
|
||||
it('should ignore invalid referenceWorkflow in dataset context', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
const lsClient = createMockLangsmithClient();
|
||||
|
||||
const evaluate = jest.fn<
|
||||
ReturnType<Evaluator['evaluate']>,
|
||||
Parameters<Evaluator['evaluate']>
|
||||
>(async (_workflow, ctx) => [
|
||||
{
|
||||
evaluator: 'ref-check',
|
||||
metric: 'hasRef',
|
||||
score: ctx.referenceWorkflows && ctx.referenceWorkflows.length > 0 ? 1 : 0,
|
||||
kind: 'score',
|
||||
},
|
||||
]);
|
||||
|
||||
const evaluator: Evaluator = {
|
||||
name: 'ref-check',
|
||||
evaluate,
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [evaluator],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [target] = mockEvaluate.mock.calls[0];
|
||||
|
||||
const result = await callLangsmithTarget(target, {
|
||||
prompt: 'Test',
|
||||
evals: {
|
||||
referenceWorkflow: { nodes: [{}], connections: {} },
|
||||
},
|
||||
});
|
||||
expect(isLangsmithTargetOutput(result)).toBe(true);
|
||||
if (!isLangsmithTargetOutput(result)) throw new Error('Expected LangSmith target output');
|
||||
|
||||
const ctx = evaluate.mock.calls[0]?.[1];
|
||||
expect(ctx?.referenceWorkflows).toBeUndefined();
|
||||
|
||||
expect(result.feedback).toContainEqual({
|
||||
evaluator: 'ref-check',
|
||||
metric: 'hasRef',
|
||||
score: 0,
|
||||
kind: 'score',
|
||||
});
|
||||
});
|
||||
|
||||
it('should pre-load and filter examples when filters are provided', async () => {
|
||||
const mockEvaluate = jest.mocked(langsmithEvaluate);
|
||||
|
||||
const examples: Example[] = [
|
||||
mock<Example>({
|
||||
id: 'e1',
|
||||
inputs: { prompt: 'One', evals: { dos: 'Use Slack', donts: 'No HTTP' } },
|
||||
metadata: { notion_id: 'n1', categories: ['data_transformation'] },
|
||||
}),
|
||||
mock<Example>({
|
||||
id: 'e2',
|
||||
inputs: { prompt: 'Two', evals: { dos: 'Use Gmail', donts: 'No Slack' } },
|
||||
metadata: { notion_id: 'n2', categories: ['other'] },
|
||||
}),
|
||||
];
|
||||
|
||||
const lsClient = createMockLangsmithClient();
|
||||
lsClient.listExamples.mockReturnValue(
|
||||
(async function* () {
|
||||
for (const ex of examples) yield ex;
|
||||
})(),
|
||||
);
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [createMockEvaluator('test')],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
filters: { notionId: 'n1', technique: 'data_transformation', doSearch: 'slack' },
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(mockEvaluate).toHaveBeenCalledTimes(1);
|
||||
const [_target, options] = mockEvaluate.mock.calls[0];
|
||||
const data: unknown = options.data;
|
||||
expect(Array.isArray(data)).toBe(true);
|
||||
if (!Array.isArray(data)) throw new Error('Expected `evaluate()` to receive example array');
|
||||
|
||||
const ids = data
|
||||
.filter((e): e is { id: string } => isRecord(e) && typeof e.id === 'string')
|
||||
.map((e) => e.id);
|
||||
expect(ids).toEqual(['e1']);
|
||||
});
|
||||
|
||||
it('should throw when filters match no examples', async () => {
|
||||
const lsClient = createMockLangsmithClient();
|
||||
lsClient.listExamples.mockReturnValue(
|
||||
(async function* () {
|
||||
yield mock<Example>({
|
||||
id: 'e1',
|
||||
inputs: { prompt: 'One', evals: { dos: 'Use Slack', donts: 'No HTTP' } },
|
||||
metadata: { notion_id: 'n1', categories: ['data_transformation'] },
|
||||
});
|
||||
})(),
|
||||
);
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'langsmith',
|
||||
dataset: 'test-dataset',
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [createMockEvaluator('test')],
|
||||
langsmithClient: lsClient,
|
||||
langsmithOptions: {
|
||||
experimentName: 'test',
|
||||
repetitions: 1,
|
||||
concurrency: 1,
|
||||
filters: { notionId: 'does-not-exist' },
|
||||
},
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await expect(runEvaluation(config)).rejects.toThrow('No examples matched filters');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,437 @@
|
|||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import type {
|
||||
Evaluator,
|
||||
TestCase,
|
||||
Feedback,
|
||||
RunConfig,
|
||||
EvaluationLifecycle,
|
||||
ExampleResult,
|
||||
} from '../harness/harness-types';
|
||||
import { createLogger } from '../harness/logger';
|
||||
|
||||
// Shared quiet logger (verbose = false) so test runs produce no console noise.
const silentLogger = createLogger(false);
|
||||
|
||||
/** Helper to create a minimal valid workflow for tests */
|
||||
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
|
||||
return { name, nodes: [], connections: {} };
|
||||
}
|
||||
|
||||
/** Helper to create a simple evaluator */
|
||||
function createMockEvaluator(
|
||||
name: string,
|
||||
feedback: Feedback[] = [{ evaluator: name, metric: 'score', score: 1, kind: 'score' }],
|
||||
): Evaluator {
|
||||
return {
|
||||
name,
|
||||
evaluate: jest.fn().mockResolvedValue(feedback),
|
||||
};
|
||||
}
|
||||
|
||||
/** Helper to create a failing evaluator */
|
||||
function createFailingEvaluator(name: string, error: Error): Evaluator {
|
||||
return {
|
||||
name,
|
||||
evaluate: jest.fn().mockRejectedValue(error),
|
||||
};
|
||||
}
|
||||
|
||||
describe('Runner - Local Mode', () => {
|
||||
describe('runEvaluation()', () => {
|
||||
it('should process all test cases sequentially', async () => {
|
||||
const testCases: TestCase[] = [
|
||||
{ prompt: 'Create workflow A' },
|
||||
{ prompt: 'Create workflow B' },
|
||||
{ prompt: 'Create workflow C' },
|
||||
];
|
||||
|
||||
const generateWorkflow = jest.fn().mockResolvedValue(createMockWorkflow());
|
||||
const evaluator = createMockEvaluator('test');
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: testCases,
|
||||
generateWorkflow,
|
||||
evaluators: [evaluator],
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
// Import dynamically to avoid circular deps in test setup
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
const summary = await runEvaluation(config);
|
||||
|
||||
expect(generateWorkflow).toHaveBeenCalledTimes(3);
|
||||
expect(evaluator.evaluate).toHaveBeenCalledTimes(3);
|
||||
expect(summary.totalExamples).toBe(3);
|
||||
});
|
||||
|
||||
it('should run all evaluators in parallel for each example', async () => {
|
||||
const evaluator1 = createMockEvaluator('eval1', [
|
||||
{ evaluator: 'eval1', metric: 'score', score: 0.8, kind: 'score' },
|
||||
]);
|
||||
const evaluator2 = createMockEvaluator('eval2', [
|
||||
{ evaluator: 'eval2', metric: 'score', score: 0.9, kind: 'score' },
|
||||
]);
|
||||
const evaluator3 = createMockEvaluator('eval3', [
|
||||
{ evaluator: 'eval3', metric: 'score', score: 1.0, kind: 'score' },
|
||||
]);
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [evaluator1, evaluator2, evaluator3],
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
const summary = await runEvaluation(config);
|
||||
|
||||
// All evaluators should be called
|
||||
expect(evaluator1.evaluate).toHaveBeenCalledTimes(1);
|
||||
expect(evaluator2.evaluate).toHaveBeenCalledTimes(1);
|
||||
expect(evaluator3.evaluate).toHaveBeenCalledTimes(1);
|
||||
|
||||
// Average score should be (0.8 + 0.9 + 1.0) / 3 = 0.9
|
||||
expect(summary.averageScore).toBeCloseTo(0.9, 2);
|
||||
});
|
||||
|
||||
it('should skip and continue when evaluator throws error', async () => {
|
||||
const goodEvaluator = createMockEvaluator('good', [
|
||||
{ evaluator: 'good', metric: 'score', score: 1.0, kind: 'score' },
|
||||
]);
|
||||
const badEvaluator = createFailingEvaluator('bad', new Error('Evaluator crashed'));
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [goodEvaluator, badEvaluator],
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
const summary = await runEvaluation(config);
|
||||
|
||||
// Should complete despite error
|
||||
expect(summary.totalExamples).toBe(1);
|
||||
// Good evaluator should still run
|
||||
expect(goodEvaluator.evaluate).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should skip and continue when workflow generation fails', async () => {
|
||||
const generateWorkflow = jest
|
||||
.fn()
|
||||
.mockResolvedValueOnce(createMockWorkflow())
|
||||
.mockRejectedValueOnce(new Error('Generation failed'))
|
||||
.mockResolvedValueOnce(createMockWorkflow());
|
||||
|
||||
const evaluator = createMockEvaluator('test');
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test 1' }, { prompt: 'Test 2' }, { prompt: 'Test 3' }],
|
||||
generateWorkflow,
|
||||
evaluators: [evaluator],
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
const summary = await runEvaluation(config);
|
||||
|
||||
expect(summary.totalExamples).toBe(3);
|
||||
expect(summary.errors).toBe(1);
|
||||
expect(summary.passed + summary.failed).toBe(2);
|
||||
});
|
||||
|
||||
it('should pass context from test case to evaluators', async () => {
|
||||
const evaluate: Evaluator['evaluate'] = async (_workflow, ctx) => {
|
||||
expect(ctx.dos).toBe('Use Slack');
|
||||
expect(ctx.donts).toBe('No HTTP');
|
||||
return [{ evaluator: 'contextual', metric: 'score', score: 1, kind: 'score' }];
|
||||
};
|
||||
|
||||
const evaluator: Evaluator = {
|
||||
name: 'contextual',
|
||||
evaluate: jest.fn(evaluate),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [
|
||||
{
|
||||
prompt: 'Test',
|
||||
context: { dos: 'Use Slack', donts: 'No HTTP' },
|
||||
},
|
||||
],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [evaluator],
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(evaluator.evaluate).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should merge global context with test case context', async () => {
|
||||
const evaluate: Evaluator['evaluate'] = async (_workflow, ctx) => {
|
||||
expect(ctx.dos).toBe('Use Slack');
|
||||
expect(ctx.donts).toBe('No HTTP');
|
||||
return [{ evaluator: 'merged', metric: 'score', score: 1, kind: 'score' }];
|
||||
};
|
||||
|
||||
const evaluator: Evaluator = {
|
||||
name: 'merged',
|
||||
evaluate: jest.fn(evaluate),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test', context: { donts: 'No HTTP' } }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [evaluator],
|
||||
context: { dos: 'Use Slack' },
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(evaluator.evaluate).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should calculate pass/fail status based on threshold', async () => {
|
||||
const highScoreEvaluator = createMockEvaluator('high', [
|
||||
{ evaluator: 'high', metric: 'score', score: 0.9, kind: 'score' },
|
||||
]);
|
||||
const lowScoreEvaluator = createMockEvaluator('low', [
|
||||
{ evaluator: 'low', metric: 'score', score: 0.3, kind: 'score' },
|
||||
]);
|
||||
|
||||
// High score should pass (>= 0.7 threshold)
|
||||
const config1: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [highScoreEvaluator],
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
const summary1 = await runEvaluation(config1);
|
||||
expect(summary1.passed).toBe(1);
|
||||
|
||||
// Low score should fail
|
||||
const config2: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [lowScoreEvaluator],
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const summary2 = await runEvaluation(config2);
|
||||
expect(summary2.failed).toBe(1);
|
||||
});
|
||||
|
||||
it('should aggregate feedback from all evaluators', async () => {
|
||||
const evaluator1 = createMockEvaluator('e1', [
|
||||
{ evaluator: 'e1', metric: 'func', score: 0.8, kind: 'metric' },
|
||||
{ evaluator: 'e1', metric: 'conn', score: 0.9, kind: 'metric' },
|
||||
]);
|
||||
const evaluator2 = createMockEvaluator('e2', [
|
||||
{ evaluator: 'e2', metric: 'overall', score: 0.85, kind: 'score' },
|
||||
]);
|
||||
|
||||
const collected: ExampleResult[] = [];
|
||||
const lifecycle: Partial<EvaluationLifecycle> = {
|
||||
onExampleComplete: (_index, result) => collected.push(result),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [evaluator1, evaluator2],
|
||||
lifecycle,
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(collected).toHaveLength(1);
|
||||
expect(collected[0].feedback).toHaveLength(3); // 2 from e1 + 1 from e2
|
||||
});
|
||||
});
|
||||
|
||||
describe('Lifecycle Hooks', () => {
|
||||
it('should call onStart at beginning of run', async () => {
|
||||
const lifecycle: Partial<EvaluationLifecycle> = {
|
||||
onStart: jest.fn(),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [],
|
||||
lifecycle,
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(lifecycle.onStart).toHaveBeenCalledWith(config);
|
||||
});
|
||||
|
||||
it('should call onExampleStart before each example', async () => {
|
||||
const lifecycle: Partial<EvaluationLifecycle> = {
|
||||
onExampleStart: jest.fn(),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test 1' }, { prompt: 'Test 2' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [],
|
||||
lifecycle,
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(lifecycle.onExampleStart).toHaveBeenCalledTimes(2);
|
||||
expect(lifecycle.onExampleStart).toHaveBeenNthCalledWith(1, 1, 2, 'Test 1');
|
||||
expect(lifecycle.onExampleStart).toHaveBeenNthCalledWith(2, 2, 2, 'Test 2');
|
||||
});
|
||||
|
||||
it('should call onWorkflowGenerated after generation', async () => {
|
||||
const workflow = createMockWorkflow('Generated');
|
||||
const lifecycle: Partial<EvaluationLifecycle> = {
|
||||
onWorkflowGenerated: jest.fn(),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(workflow),
|
||||
evaluators: [],
|
||||
lifecycle,
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(lifecycle.onWorkflowGenerated).toHaveBeenCalledWith(
|
||||
workflow,
|
||||
expect.any(Number), // durationMs
|
||||
);
|
||||
});
|
||||
|
||||
it('should call onEvaluatorComplete after each evaluator', async () => {
|
||||
const lifecycle: Partial<EvaluationLifecycle> = {
|
||||
onEvaluatorComplete: jest.fn(),
|
||||
};
|
||||
|
||||
const feedback1: Feedback[] = [
|
||||
{ evaluator: 'eval1', metric: 'score', score: 0.8, kind: 'score' },
|
||||
];
|
||||
const feedback2: Feedback[] = [
|
||||
{ evaluator: 'eval2', metric: 'score', score: 0.9, kind: 'score' },
|
||||
];
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [
|
||||
createMockEvaluator('eval1', feedback1),
|
||||
createMockEvaluator('eval2', feedback2),
|
||||
],
|
||||
lifecycle,
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(lifecycle.onEvaluatorComplete).toHaveBeenCalledTimes(2);
|
||||
expect(lifecycle.onEvaluatorComplete).toHaveBeenCalledWith('eval1', feedback1);
|
||||
expect(lifecycle.onEvaluatorComplete).toHaveBeenCalledWith('eval2', feedback2);
|
||||
});
|
||||
|
||||
it('should call onEvaluatorError when evaluator fails', async () => {
|
||||
const error = new Error('Evaluator crashed');
|
||||
const lifecycle: Partial<EvaluationLifecycle> = {
|
||||
onEvaluatorError: jest.fn(),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [createFailingEvaluator('failing', error)],
|
||||
lifecycle,
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(lifecycle.onEvaluatorError).toHaveBeenCalledWith('failing', error);
|
||||
});
|
||||
|
||||
it('should call onExampleComplete after each example', async () => {
|
||||
const lifecycle: Partial<EvaluationLifecycle> = {
|
||||
onExampleComplete: jest.fn(),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [createMockEvaluator('test')],
|
||||
lifecycle,
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
await runEvaluation(config);
|
||||
|
||||
expect(lifecycle.onExampleComplete).toHaveBeenCalledWith(
|
||||
1,
|
||||
expect.objectContaining({
|
||||
index: 1,
|
||||
prompt: 'Test',
|
||||
status: 'pass',
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should call onEnd with summary at end of run', async () => {
|
||||
const lifecycle: Partial<EvaluationLifecycle> = {
|
||||
onEnd: jest.fn(),
|
||||
};
|
||||
|
||||
const config: RunConfig = {
|
||||
mode: 'local',
|
||||
dataset: [{ prompt: 'Test' }],
|
||||
generateWorkflow: jest.fn().mockResolvedValue(createMockWorkflow()),
|
||||
evaluators: [createMockEvaluator('test')],
|
||||
lifecycle,
|
||||
logger: silentLogger,
|
||||
};
|
||||
|
||||
const { runEvaluation } = await import('../harness/runner');
|
||||
const summary = await runEvaluation(config);
|
||||
|
||||
expect(lifecycle.onEnd).toHaveBeenCalledWith(summary);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,330 @@
|
|||
/**
|
||||
* Tests for score calculation utilities.
|
||||
*
|
||||
* These utilities calculate weighted scores and aggregate feedback
|
||||
* from multiple evaluators.
|
||||
*/
|
||||
|
||||
import type { Feedback } from '../harness/harness-types';
|
||||
import {
|
||||
parseFeedbackKey,
|
||||
extractCategory,
|
||||
groupByEvaluator,
|
||||
calculateWeightedScore,
|
||||
aggregateScores,
|
||||
DEFAULT_EVALUATOR_WEIGHTS,
|
||||
} from '../harness/score-calculator';
|
||||
|
||||
/** Helper to create feedback items */
|
||||
function createFeedback(
|
||||
evaluator: string,
|
||||
metric: string,
|
||||
score: number,
|
||||
kind: Feedback['kind'] = 'metric',
|
||||
comment?: string,
|
||||
): Feedback {
|
||||
return { evaluator, metric, score, kind, ...(comment ? { comment } : {}) };
|
||||
}
|
||||
|
||||
describe('Score Calculator', () => {
|
||||
describe('parseFeedbackKey()', () => {
|
||||
it('should parse two-part key', () => {
|
||||
const result = parseFeedbackKey('llm-judge.functionality');
|
||||
expect(result).toEqual({
|
||||
evaluator: 'llm-judge',
|
||||
category: 'functionality',
|
||||
subcategory: undefined,
|
||||
});
|
||||
});
|
||||
|
||||
it('should parse three-part key', () => {
|
||||
const result = parseFeedbackKey('pairwise.gen1.majorityPass');
|
||||
expect(result).toEqual({
|
||||
evaluator: 'pairwise',
|
||||
category: 'gen1',
|
||||
subcategory: 'majorityPass',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle single-part key', () => {
|
||||
const result = parseFeedbackKey('overall');
|
||||
expect(result).toEqual({
|
||||
evaluator: 'overall',
|
||||
category: '',
|
||||
subcategory: undefined,
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle keys with more than three parts', () => {
|
||||
const result = parseFeedbackKey('a.b.c.d.e');
|
||||
expect(result).toEqual({
|
||||
evaluator: 'a',
|
||||
category: 'b',
|
||||
subcategory: 'c',
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('extractCategory()', () => {
|
||||
it('should extract category from llm-judge key', () => {
|
||||
expect(extractCategory('llm-judge.functionality')).toBe('functionality');
|
||||
});
|
||||
|
||||
it('should extract category from programmatic key', () => {
|
||||
expect(extractCategory('programmatic.trigger')).toBe('trigger');
|
||||
});
|
||||
|
||||
it('should extract category from pairwise key', () => {
|
||||
expect(extractCategory('pairwise.majorityPass')).toBe('majorityPass');
|
||||
});
|
||||
|
||||
it('should return empty string for single-part key', () => {
|
||||
expect(extractCategory('overall')).toBe('');
|
||||
});
|
||||
|
||||
it('should extract first category from multi-part key', () => {
|
||||
expect(extractCategory('pairwise.gen1.diagnosticScore')).toBe('gen1');
|
||||
});
|
||||
});
|
||||
|
||||
describe('groupByEvaluator()', () => {
|
||||
it('should group feedback by evaluator prefix', () => {
|
||||
const feedback: Feedback[] = [
|
||||
createFeedback('llm-judge', 'functionality', 0.8),
|
||||
createFeedback('llm-judge', 'connections', 0.9),
|
||||
createFeedback('programmatic', 'trigger', 1.0),
|
||||
];
|
||||
|
||||
const grouped = groupByEvaluator(feedback);
|
||||
|
||||
expect(Object.keys(grouped)).toHaveLength(2);
|
||||
expect(grouped['llm-judge']).toHaveLength(2);
|
||||
expect(grouped['programmatic']).toHaveLength(1);
|
||||
});
|
||||
|
||||
it('should handle mixed evaluators', () => {
|
||||
const feedback: Feedback[] = [
|
||||
createFeedback('llm-judge', 'a', 0.5),
|
||||
createFeedback('programmatic', 'b', 0.6),
|
||||
createFeedback('pairwise', 'c', 0.7),
|
||||
createFeedback('similarity', 'd', 0.8),
|
||||
];
|
||||
|
||||
const grouped = groupByEvaluator(feedback);
|
||||
|
||||
expect(Object.keys(grouped)).toHaveLength(4);
|
||||
expect(grouped['llm-judge']).toHaveLength(1);
|
||||
expect(grouped['programmatic']).toHaveLength(1);
|
||||
expect(grouped['pairwise']).toHaveLength(1);
|
||||
expect(grouped['similarity']).toHaveLength(1);
|
||||
});
|
||||
|
||||
it('should handle empty array', () => {
|
||||
const grouped = groupByEvaluator([]);
|
||||
expect(grouped).toEqual({});
|
||||
});
|
||||
|
||||
it('should preserve feedback properties', () => {
|
||||
const feedback: Feedback[] = [
|
||||
createFeedback('llm-judge', 'test', 0.75, 'metric', 'Test comment'),
|
||||
];
|
||||
|
||||
const grouped = groupByEvaluator(feedback);
|
||||
|
||||
expect(grouped['llm-judge'][0]).toEqual({
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'test',
|
||||
score: 0.75,
|
||||
kind: 'metric',
|
||||
comment: 'Test comment',
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('calculateWeightedScore()', () => {
|
||||
it('should use default weights', () => {
|
||||
const feedback: Feedback[] = [
|
||||
createFeedback('llm-judge', 'a', 1.0),
|
||||
createFeedback('programmatic', 'b', 0.5),
|
||||
createFeedback('pairwise', 'c', 0.5),
|
||||
createFeedback('similarity', 'd', 0.5),
|
||||
];
|
||||
|
||||
const score = calculateWeightedScore(feedback);
|
||||
|
||||
// llm-judge: 1.0 * 0.35 = 0.35
|
||||
// programmatic: 0.5 * 0.25 = 0.125
|
||||
// pairwise: 0.5 * 0.25 = 0.125
|
||||
// similarity: 0.5 * 0.15 = 0.075
|
||||
// Total: 0.675 / 1.0 = 0.675
|
||||
expect(score).toBeCloseTo(0.675);
|
||||
});
|
||||
|
||||
it('should use custom weights', () => {
|
||||
const feedback: Feedback[] = [
|
||||
createFeedback('llm-judge', 'a', 1.0),
|
||||
createFeedback('programmatic', 'b', 0.0),
|
||||
];
|
||||
|
||||
const score = calculateWeightedScore(feedback, {
|
||||
'llm-judge': 0.8,
|
||||
programmatic: 0.2,
|
||||
});
|
||||
|
||||
// llm-judge: 1.0 * 0.8 = 0.8
|
||||
// programmatic: 0.0 * 0.2 = 0.0
|
||||
// Total: 0.8 / 1.0 = 0.8
|
||||
expect(score).toBeCloseTo(0.8);
|
||||
});
|
||||
|
||||
it('should handle missing evaluators with default weight', () => {
|
||||
const feedback: Feedback[] = [createFeedback('unknown-evaluator', 'a', 0.5)];
|
||||
|
||||
const score = calculateWeightedScore(feedback);
|
||||
|
||||
// unknown-evaluator gets default weight of 0.1
|
||||
expect(score).toBeCloseTo(0.5);
|
||||
});
|
||||
|
||||
it('should return 0 for empty feedback', () => {
|
||||
const score = calculateWeightedScore([]);
|
||||
expect(score).toBe(0);
|
||||
});
|
||||
|
||||
it('should average multiple items from same evaluator', () => {
|
||||
const feedback: Feedback[] = [
|
||||
{ ...createFeedback('llm-judge', 'a', 1.0), kind: 'metric' },
|
||||
{ ...createFeedback('llm-judge', 'b', 0.5), kind: 'metric' },
|
||||
{ ...createFeedback('llm-judge', 'c', 0.5), kind: 'metric' },
|
||||
];
|
||||
|
||||
const score = calculateWeightedScore(feedback, { 'llm-judge': 1.0 });
|
||||
|
||||
// avg(1.0, 0.5, 0.5) = 0.666...
|
||||
expect(score).toBeCloseTo(0.666, 2);
|
||||
});
|
||||
|
||||
it('should ignore detail items when score items exist', () => {
|
||||
const feedback: Feedback[] = [
|
||||
{ evaluator: 'llm-judge', metric: 'overallScore', score: 0.8, kind: 'score' },
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'efficiency.nodeCountEfficiency',
|
||||
score: 0.0,
|
||||
kind: 'detail',
|
||||
},
|
||||
{
|
||||
evaluator: 'llm-judge',
|
||||
metric: 'efficiency.pathOptimization',
|
||||
score: 0.0,
|
||||
kind: 'detail',
|
||||
},
|
||||
];
|
||||
|
||||
expect(calculateWeightedScore(feedback)).toBeCloseTo(0.8, 5);
|
||||
});
|
||||
|
||||
it('should be invariant to extra detail keys', () => {
|
||||
const base: Feedback[] = [
|
||||
{ evaluator: 'pairwise', metric: 'pairwise_primary', score: 1, kind: 'score' },
|
||||
];
|
||||
const withDetails: Feedback[] = [
|
||||
...base,
|
||||
{ evaluator: 'pairwise', metric: 'judge1', score: 0, kind: 'detail' },
|
||||
{ evaluator: 'pairwise', metric: 'judge2', score: 0, kind: 'detail' },
|
||||
];
|
||||
|
||||
expect(calculateWeightedScore(base)).toBeCloseTo(calculateWeightedScore(withDetails), 10);
|
||||
});
|
||||
|
||||
it('should normalize weights', () => {
|
||||
const feedback: Feedback[] = [createFeedback('a', 'x', 1.0), createFeedback('b', 'x', 0.0)];
|
||||
|
||||
// Weights don't sum to 1.0
|
||||
const score = calculateWeightedScore(feedback, {
|
||||
a: 0.5,
|
||||
b: 0.5,
|
||||
});
|
||||
|
||||
// a: 1.0 * 0.5 = 0.5
|
||||
// b: 0.0 * 0.5 = 0.0
|
||||
// Total: 0.5 / 1.0 = 0.5
|
||||
expect(score).toBeCloseTo(0.5);
|
||||
});
|
||||
});
|
||||
|
||||
describe('aggregateScores()', () => {
|
||||
it('should calculate overall score', () => {
|
||||
const feedback: Feedback[] = [
|
||||
createFeedback('llm-judge', 'a', 0.8),
|
||||
createFeedback('programmatic', 'b', 0.6),
|
||||
];
|
||||
|
||||
const result = aggregateScores(feedback);
|
||||
|
||||
// llm-judge: 0.8 * 0.4 = 0.32
|
||||
// programmatic: 0.6 * 0.3 = 0.18
|
||||
// Total weight: 0.7, Total: 0.5 / 0.7 = 0.714...
|
||||
expect(result.overall).toBeCloseTo(0.714, 2);
|
||||
});
|
||||
|
||||
it('should calculate by-evaluator averages', () => {
|
||||
const feedback: Feedback[] = [
|
||||
createFeedback('llm-judge', 'a', 0.8),
|
||||
createFeedback('llm-judge', 'b', 0.6),
|
||||
createFeedback('programmatic', 'c', 1.0),
|
||||
];
|
||||
|
||||
const result = aggregateScores(feedback);
|
||||
|
||||
expect(result.byEvaluator['llm-judge']).toBeCloseTo(0.7); // (0.8 + 0.6) / 2
|
||||
expect(result.byEvaluator['programmatic']).toBeCloseTo(1.0);
|
||||
});
|
||||
|
||||
it('should calculate by-category averages', () => {
|
||||
const feedback: Feedback[] = [
|
||||
createFeedback('llm-judge', 'functionality', 0.8),
|
||||
createFeedback('llm-judge', 'connections', 0.6),
|
||||
createFeedback('programmatic', 'trigger', 1.0),
|
||||
];
|
||||
|
||||
const result = aggregateScores(feedback);
|
||||
|
||||
expect(result.byCategory['functionality']).toBeCloseTo(0.8);
|
||||
expect(result.byCategory['connections']).toBeCloseTo(0.6);
|
||||
expect(result.byCategory['trigger']).toBeCloseTo(1.0);
|
||||
});
|
||||
|
||||
it('should average same categories from different evaluators', () => {
|
||||
const feedback: Feedback[] = [
|
||||
createFeedback('llm-judge', 'functionality', 0.8),
|
||||
createFeedback('programmatic', 'functionality', 0.6),
|
||||
];
|
||||
|
||||
const result = aggregateScores(feedback);
|
||||
|
||||
expect(result.byCategory['functionality']).toBeCloseTo(0.7); // (0.8 + 0.6) / 2
|
||||
});
|
||||
|
||||
it('should handle empty feedback', () => {
|
||||
const result = aggregateScores([]);
|
||||
|
||||
expect(result.overall).toBe(0);
|
||||
expect(result.byEvaluator).toEqual({});
|
||||
expect(result.byCategory).toEqual({});
|
||||
});
|
||||
});
|
||||
|
||||
describe('DEFAULT_EVALUATOR_WEIGHTS', () => {
|
||||
it('should have weights for standard evaluators', () => {
|
||||
expect(DEFAULT_EVALUATOR_WEIGHTS['llm-judge']).toBeDefined();
|
||||
expect(DEFAULT_EVALUATOR_WEIGHTS['programmatic']).toBeDefined();
|
||||
expect(DEFAULT_EVALUATOR_WEIGHTS['pairwise']).toBeDefined();
|
||||
});
|
||||
|
||||
it('should have weights that sum to approximately 1', () => {
|
||||
const sum = Object.values(DEFAULT_EVALUATOR_WEIGHTS).reduce((a, b) => a + b, 0);
|
||||
expect(sum).toBeCloseTo(1.0);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,273 @@
|
|||
/**
|
||||
* Tests for test case generation.
|
||||
*
|
||||
* These utilities generate test cases for workflow evaluation,
|
||||
* either via LLM or from CSV fixtures.
|
||||
*/
|
||||
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { mock } from 'jest-mock-extended';
|
||||
|
||||
import { loadDefaultTestCases } from '../cli/csv-prompt-loader';
|
||||
import { createTestCaseGenerator, type GeneratedTestCase } from '../support/test-case-generator';
|
||||
|
||||
/** Type guard for message objects with content */
|
||||
function isMessageWithContent(msg: unknown): msg is { content: unknown } {
|
||||
return msg !== null && typeof msg === 'object' && 'content' in msg;
|
||||
}
|
||||
|
||||
/** Type guard for objects with _getType method */
|
||||
function hasGetTypeMethod(msg: unknown): msg is { _getType: () => string } {
|
||||
if (msg === null || typeof msg !== 'object') return false;
|
||||
if (!('_getType' in msg)) return false;
|
||||
const obj = msg as { _getType: unknown };
|
||||
return typeof obj._getType === 'function';
|
||||
}
|
||||
|
||||
/** Helper to extract messages from mock invoke calls */
|
||||
function getMessagesFromMockCall(mockInvoke: jest.Mock): { system: string; human: string } {
|
||||
const calls = mockInvoke.mock.calls;
|
||||
if (calls.length === 0) throw new Error('No calls recorded');
|
||||
|
||||
const firstCall = calls[0];
|
||||
if (!Array.isArray(firstCall) || firstCall.length === 0) {
|
||||
throw new Error('First call has no arguments');
|
||||
}
|
||||
|
||||
const messages = firstCall[0];
|
||||
if (!Array.isArray(messages) || messages.length < 2) {
|
||||
throw new Error('Messages array invalid');
|
||||
}
|
||||
|
||||
const systemMsg = messages[0];
|
||||
const humanMsg = messages[1];
|
||||
|
||||
// Type-safe content extraction
|
||||
const getContent = (msg: unknown): string => {
|
||||
if (isMessageWithContent(msg)) {
|
||||
const content = msg.content;
|
||||
if (typeof content === 'string') return content;
|
||||
}
|
||||
return '';
|
||||
};
|
||||
|
||||
return {
|
||||
system: getContent(systemMsg),
|
||||
human: getContent(humanMsg),
|
||||
};
|
||||
}
|
||||
|
||||
describe('Test Case Generator', () => {
|
||||
describe('createTestCaseGenerator()', () => {
|
||||
let mockLlm: BaseChatModel;
|
||||
let mockInvoke: jest.Mock;
|
||||
|
||||
beforeEach(() => {
|
||||
mockInvoke = jest.fn().mockResolvedValue({ testCases: [] });
|
||||
mockLlm = mock<BaseChatModel>();
|
||||
(mockLlm as unknown as { withStructuredOutput: jest.Mock }).withStructuredOutput = jest
|
||||
.fn()
|
||||
.mockReturnValue({ invoke: mockInvoke });
|
||||
});
|
||||
|
||||
it('should return generator with generate method', () => {
|
||||
const generator = createTestCaseGenerator(mockLlm);
|
||||
|
||||
expect(generator).toHaveProperty('generate');
|
||||
expect(typeof generator.generate).toBe('function');
|
||||
});
|
||||
|
||||
it('should call LLM with structured output', async () => {
|
||||
const generator = createTestCaseGenerator(mockLlm);
|
||||
await generator.generate();
|
||||
|
||||
expect(
|
||||
(mockLlm as unknown as { withStructuredOutput: jest.Mock }).withStructuredOutput,
|
||||
).toHaveBeenCalled();
|
||||
expect(mockInvoke).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should include count in generated prompt', async () => {
|
||||
const generator = createTestCaseGenerator(mockLlm, { count: 20 });
|
||||
await generator.generate();
|
||||
|
||||
const { human } = getMessagesFromMockCall(mockInvoke);
|
||||
expect(human).toContain('20');
|
||||
});
|
||||
|
||||
it('should use default count of 10', async () => {
|
||||
const generator = createTestCaseGenerator(mockLlm);
|
||||
await generator.generate();
|
||||
|
||||
const { human } = getMessagesFromMockCall(mockInvoke);
|
||||
expect(human).toContain('10');
|
||||
});
|
||||
|
||||
it('should include custom focus in generated prompt', async () => {
|
||||
const generator = createTestCaseGenerator(mockLlm, {
|
||||
focus: 'API integrations only',
|
||||
});
|
||||
await generator.generate();
|
||||
|
||||
const { human } = getMessagesFromMockCall(mockInvoke);
|
||||
expect(human).toContain('API integrations only');
|
||||
});
|
||||
|
||||
it('should return properly typed test cases', async () => {
|
||||
const mockTestCases: GeneratedTestCase[] = [
|
||||
{
|
||||
id: 'test_001',
|
||||
name: 'Email Automation',
|
||||
summary: 'Sends automated emails',
|
||||
prompt: 'Create a workflow that sends emails',
|
||||
},
|
||||
{
|
||||
id: 'test_002',
|
||||
name: 'Data Processing',
|
||||
summary: 'Processes CSV data',
|
||||
prompt: 'Create a workflow that processes CSV files',
|
||||
},
|
||||
];
|
||||
mockInvoke.mockResolvedValue({ testCases: mockTestCases });
|
||||
|
||||
const generator = createTestCaseGenerator(mockLlm);
|
||||
const result = await generator.generate();
|
||||
|
||||
expect(result).toHaveLength(2);
|
||||
expect(result[0]).toEqual({
|
||||
id: 'test_001',
|
||||
name: 'Email Automation',
|
||||
summary: 'Sends automated emails',
|
||||
prompt: 'Create a workflow that sends emails',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle LLM errors gracefully', async () => {
|
||||
mockInvoke.mockRejectedValue(new Error('LLM error'));
|
||||
|
||||
const generator = createTestCaseGenerator(mockLlm);
|
||||
|
||||
await expect(generator.generate()).rejects.toThrow('LLM error');
|
||||
});
|
||||
|
||||
it('should use complexity option in focus', async () => {
|
||||
const generator = createTestCaseGenerator(mockLlm, { complexity: 'complex' });
|
||||
await generator.generate();
|
||||
|
||||
const { human } = getMessagesFromMockCall(mockInvoke);
|
||||
expect(human.toLowerCase()).toContain('complex');
|
||||
});
|
||||
|
||||
it('should use simple complexity focus', async () => {
|
||||
const generator = createTestCaseGenerator(mockLlm, { complexity: 'simple' });
|
||||
await generator.generate();
|
||||
|
||||
const { human } = getMessagesFromMockCall(mockInvoke);
|
||||
expect(human.toLowerCase()).toContain('simple');
|
||||
});
|
||||
|
||||
it('should include system prompt in messages', async () => {
|
||||
const generator = createTestCaseGenerator(mockLlm);
|
||||
await generator.generate();
|
||||
|
||||
const calls = mockInvoke.mock.calls;
|
||||
expect(calls.length).toBeGreaterThan(0);
|
||||
|
||||
// Extract messages from mock calls with proper type narrowing
|
||||
const firstCall = calls[0];
|
||||
if (!Array.isArray(firstCall) || firstCall.length === 0) {
|
||||
throw new Error('Expected firstCall to be a non-empty array');
|
||||
}
|
||||
const firstArg = firstCall[0];
|
||||
if (!Array.isArray(firstArg)) {
|
||||
throw new Error('Expected first argument to be an array');
|
||||
}
|
||||
const messages = firstArg;
|
||||
expect(messages).toHaveLength(2);
|
||||
|
||||
// Verify message types using type guard
|
||||
const systemMsg = messages[0];
|
||||
const humanMsg = messages[1];
|
||||
expect(hasGetTypeMethod(systemMsg)).toBe(true);
|
||||
expect(hasGetTypeMethod(humanMsg)).toBe(true);
|
||||
if (hasGetTypeMethod(systemMsg)) {
|
||||
expect(systemMsg._getType()).toBe('system');
|
||||
}
|
||||
if (hasGetTypeMethod(humanMsg)) {
|
||||
expect(humanMsg._getType()).toBe('human');
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe('loadDefaultTestCases', () => {
|
||||
it('should have at least 5 test cases', () => {
|
||||
const defaultCases = loadDefaultTestCases();
|
||||
expect(defaultCases.length).toBeGreaterThanOrEqual(5);
|
||||
});
|
||||
|
||||
it('should have required properties on each test case', () => {
|
||||
const defaultCases = loadDefaultTestCases();
|
||||
for (const testCase of defaultCases) {
|
||||
expect(testCase).toHaveProperty('id');
|
||||
expect(testCase).toHaveProperty('prompt');
|
||||
expect(typeof testCase.id).toBe('string');
|
||||
expect(typeof testCase.prompt).toBe('string');
|
||||
expect(testCase.id!.length).toBeGreaterThan(0);
|
||||
expect(testCase.prompt.length).toBeGreaterThan(0);
|
||||
}
|
||||
});
|
||||
|
||||
it('should have unique IDs', () => {
|
||||
const defaultCases = loadDefaultTestCases();
|
||||
const ids = defaultCases.map((tc) => tc.id);
|
||||
const uniqueIds = new Set(ids);
|
||||
expect(uniqueIds.size).toBe(ids.length);
|
||||
});
|
||||
|
||||
it('should cover different workflow types', () => {
|
||||
const defaultCases = loadDefaultTestCases();
|
||||
const prompts = defaultCases.map((tc) => tc.prompt.toLowerCase());
|
||||
|
||||
// Check for variety in test cases
|
||||
const hasEmail = prompts.some((p) => p.includes('email'));
|
||||
const hasApi = prompts.some((p) => p.includes('api') || p.includes('webhook'));
|
||||
const hasData = prompts.some((p) => p.includes('data') || p.includes('process'));
|
||||
|
||||
expect(hasEmail || hasApi || hasData).toBe(true);
|
||||
});
|
||||
|
||||
it('should have meaningful prompts', () => {
|
||||
const defaultCases = loadDefaultTestCases();
|
||||
for (const testCase of defaultCases) {
|
||||
// Prompts should be descriptive enough
|
||||
expect(testCase.prompt.length).toBeGreaterThan(20);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe('generated test cases', () => {
|
||||
it('should be compatible with v2 TestCase format', async () => {
|
||||
const mockTestCases: GeneratedTestCase[] = [
|
||||
{
|
||||
id: 'gen_001',
|
||||
name: 'Generated Test',
|
||||
summary: 'A generated test case',
|
||||
prompt: 'Create a workflow',
|
||||
},
|
||||
];
|
||||
|
||||
const mockInvoke = jest.fn().mockResolvedValue({ testCases: mockTestCases });
|
||||
const mockLlm = mock<BaseChatModel>();
|
||||
(mockLlm as unknown as { withStructuredOutput: jest.Mock }).withStructuredOutput = jest
|
||||
.fn()
|
||||
.mockReturnValue({ invoke: mockInvoke });
|
||||
|
||||
const generator = createTestCaseGenerator(mockLlm);
|
||||
const generated = await generator.generate();
|
||||
|
||||
// Generated test cases should have id and prompt (compatible with v2 TestCase)
|
||||
expect(generated[0].id).toBe('gen_001');
|
||||
expect(generated[0].prompt).toBe('Create a workflow');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
/**
|
||||
* Tests for LangSmith trace filters.
|
||||
*/
|
||||
|
||||
import type { KVMap } from 'langsmith/schemas';
|
||||
|
||||
import type { EvalLogger } from '../harness/logger';
|
||||
import { createTraceFilters } from '../langsmith/trace-filters';
|
||||
|
||||
describe('trace-filters', () => {
|
||||
it('should not trim messages (keeps array) while still filtering other large state fields', () => {
|
||||
const logs: string[] = [];
|
||||
const logger: EvalLogger = {
|
||||
isVerbose: true,
|
||||
info: (m) => logs.push(m),
|
||||
verbose: (m) => logs.push(m),
|
||||
success: (m) => logs.push(m),
|
||||
warn: (m) => logs.push(m),
|
||||
error: (m) => logs.push(m),
|
||||
dim: (m) => logs.push(m),
|
||||
};
|
||||
|
||||
const { filterInputs } = createTraceFilters(logger);
|
||||
|
||||
const msg = { type: 'ai', content: 'hello' };
|
||||
const input: KVMap = {
|
||||
cachedTemplates: [
|
||||
{
|
||||
templateId: 't1',
|
||||
name: 'Template',
|
||||
// Extra properties that should be filtered out
|
||||
workflow: { nodes: [], connections: {} },
|
||||
description: 'A long description that should be removed',
|
||||
},
|
||||
],
|
||||
messages: [msg],
|
||||
};
|
||||
|
||||
const filtered = filterInputs({ ...input });
|
||||
|
||||
expect(Array.isArray(filtered.messages)).toBe(true);
|
||||
expect(filtered.messages).toEqual([msg]);
|
||||
// Verify that cachedTemplates was summarized - only templateId and name are preserved
|
||||
expect(filtered.cachedTemplates).toEqual([{ templateId: 't1', name: 'Template' }]);
|
||||
expect(logs.join('\n')).toContain('LangSmith trace filtering: ACTIVE');
|
||||
});
|
||||
});
|
||||
|
|
@ -1,170 +0,0 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { SystemMessage } from '@langchain/core/messages';
|
||||
import { ChatPromptTemplate, HumanMessagePromptTemplate } from '@langchain/core/prompts';
|
||||
import { readFileSync } from 'fs';
|
||||
import { jsonParse, OperationalError } from 'n8n-workflow';
|
||||
import { join } from 'path';
|
||||
import { z } from 'zod';
|
||||
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow';
|
||||
import type { TestCase } from '../types/evaluation';
|
||||
|
||||
// Helper to load reference workflows
|
||||
function loadReferenceWorkflow(filename: string): SimpleWorkflow {
|
||||
const path = join(__dirname, '..', 'reference-workflows', filename);
|
||||
return jsonParse<SimpleWorkflow>(readFileSync(path, 'utf-8'));
|
||||
}
|
||||
|
||||
const testCasesSchema = z.object({
|
||||
testCases: z.array(
|
||||
z.object({
|
||||
id: z.string(),
|
||||
name: z.string(),
|
||||
summary: z.string(),
|
||||
prompt: z.string(),
|
||||
}),
|
||||
),
|
||||
});
|
||||
|
||||
const systemPrompt = `You are an expert at generating diverse test cases for an n8n workflow builder AI. Create test cases that cover various real-world scenarios and complexity levels.
|
||||
|
||||
## Test Case Requirements:
|
||||
|
||||
1. **Simple Test Cases**: Single operation workflows
|
||||
- API calls
|
||||
- Data transformations
|
||||
- File operations
|
||||
- Basic integrations
|
||||
|
||||
2. **Medium Test Cases**: Multi-step workflows with logic
|
||||
- Conditional logic (IF nodes)
|
||||
- Data filtering and transformation
|
||||
- Multiple API integrations
|
||||
- Error handling
|
||||
|
||||
3. **Complex Test Cases**: Advanced workflows
|
||||
- Parallel execution branches
|
||||
- Complex error handling and retry logic
|
||||
- Multiple integrations with data synchronization
|
||||
- Webhooks and event-driven flows
|
||||
|
||||
## Guidelines:
|
||||
- Create realistic business scenarios
|
||||
- Include specific requirements that can be evaluated
|
||||
- Vary the domains (e-commerce, HR, marketing, DevOps, etc.)
|
||||
- Include both common and edge-case scenarios
|
||||
- Make prompts clear and unambiguous
|
||||
- Specify expected node types when possible
|
||||
|
||||
## Output Format:
|
||||
Each test case should have:
|
||||
- Unique ID (e.g., "test_001")
|
||||
- Descriptive name
|
||||
- Brief description
|
||||
- Clear prompt that a user would give
|
||||
- Expected node types (array of node names)
|
||||
- Complexity level
|
||||
- Relevant tags`;
|
||||
|
||||
const humanTemplate = `Generate {count} diverse test cases for workflow generation evaluation.
|
||||
|
||||
Focus on:
|
||||
{focus}
|
||||
|
||||
Ensure a good mix of complexity levels and use cases.`;
|
||||
|
||||
export function createTestCaseGeneratorChain(llm: BaseChatModel) {
|
||||
if (!llm.bindTools) {
|
||||
throw new OperationalError("LLM doesn't support binding tools");
|
||||
}
|
||||
|
||||
const prompt = ChatPromptTemplate.fromMessages([
|
||||
new SystemMessage(systemPrompt),
|
||||
HumanMessagePromptTemplate.fromTemplate(humanTemplate),
|
||||
]);
|
||||
|
||||
const llmWithStructuredOutput = llm.withStructuredOutput(testCasesSchema);
|
||||
return prompt.pipe(llmWithStructuredOutput);
|
||||
}
|
||||
|
||||
export async function generateTestCases(
|
||||
llm: BaseChatModel,
|
||||
count: number = 10,
|
||||
focus: string = 'balanced mix of API integrations, data processing, and automation scenarios',
|
||||
): Promise<TestCase[]> {
|
||||
const chain = createTestCaseGeneratorChain(llm);
|
||||
|
||||
const result = (await chain.invoke({
|
||||
count,
|
||||
focus,
|
||||
})) as z.infer<typeof testCasesSchema>;
|
||||
|
||||
return result.testCases;
|
||||
}
|
||||
|
||||
export const basicTestCases: TestCase[] = [
|
||||
{
|
||||
id: 'multi-agent-research',
|
||||
name: 'Multi-agent research workflow',
|
||||
prompt:
|
||||
'Create a multi-agent AI workflow using `gpt-4.1-mini` where several agents work together to research a topic, fact-check the findings, and write a report that\'s sent as an HTML email. One agent should gather recent, credible information about the topic. Another agent should verify the facts and only mark something as "verified" if it appears in at least two independent sources. A third agent should combine the verified information into a clear, well-written report under 1,000 words. A final agent should edit and format the report to make it look clean and professional in the body of the email. Use Gmail to send the report.',
|
||||
},
|
||||
{
|
||||
id: 'email-summary',
|
||||
name: 'Summarize emails with AI',
|
||||
prompt:
|
||||
'Create an automation that runs on Monday mornings. It reads my Gmail inbox from the weekend, analyzes them with `gpt-4.1-mini` to find action items and priorities, and emails me a structured email using Gmail.',
|
||||
referenceWorkflow: loadReferenceWorkflow('email-summary.json'),
|
||||
},
|
||||
{
|
||||
id: 'ai-news-digest',
|
||||
name: 'Daily AI news digest',
|
||||
prompt:
|
||||
'Build an automation that runs every night 8pm. Use the NewsAPI "/everything" endpoint to search for AI-related news from the day. Pick the top 5 articles and use OpenAI `gpt-4.1-mini` to summarize each in two sentences. Generate an image using OpenAI based on the top article\'s summary. Send a structured Telegram message.',
|
||||
},
|
||||
{
|
||||
id: 'daily-weather-report',
|
||||
name: 'Daily weather report',
|
||||
prompt:
|
||||
'Create an automation that checks the weather for my location every morning at 5 a.m using OpenWeather. Send me a short weather report by email using Gmail. Use OpenAI `gpt-4.1-mini` to write a short, fun formatted email body by adding personality when describing the weather and how the day might feel. Include all details relevant to decide on my plans and clothes for the day.',
|
||||
referenceWorkflow: loadReferenceWorkflow('daily-weather-report.json'),
|
||||
},
|
||||
{
|
||||
id: 'invoice-pipeline',
|
||||
name: 'Invoice processing pipeline',
|
||||
prompt:
|
||||
'Create an invoice processing workflow using an n8n Form. When a user submits an invoice file (PDF or image) with their email address, use OpenAI `gpt-4.1-mini` to extract invoice data. Then, validate the date format is correct, the currency is valid, and the total amount is greater than zero. If validation fails, email the user a clear error message that explains which check failed from my Gmail. If the data passes validation, store the structured result in a datatable plus email the user. Every Monday morning, generate a weekly spending report using `gpt-4.1-mini` based on stored invoices and send a clean email using Gmail.',
|
||||
},
|
||||
{
|
||||
id: 'rag-assistant',
|
||||
name: 'RAG knowledge assistant',
|
||||
prompt:
|
||||
'Build an automation that creates a document-to-chat RAG pipeline. The workflow starts with an n8n Form where a user uploads one or more files (PDF, CSV, or JSON). Each upload should trigger a process that reads the file, splits it into chunks, and generates embeddings using OpenAI `gpt-4.1-mini` model, saved in one Pinecone table. Add a second part of the workflow for querying: use a Chat Message Trigger to act as a chatbot interface. When a user sends a question, retrieve the top 5 most relevant chunks from Pinecone, pass them into `gpt-4.1-mini` as context, and have it answer naturally using only the retrieved information. If a question can\'t be answered confidently, the bot should respond with: "I couldn\'t find that in the uploaded documents." Log each chat interaction in a Data Table with the user query, matched file(s), and timestamp. Send a daily summary email through Gmail showing total questions asked, top files referenced, and any failed lookups.',
|
||||
},
|
||||
{
|
||||
id: 'lead-qualification',
|
||||
name: 'Lead qualification and call scheduling',
|
||||
prompt:
|
||||
'Create an n8n form with a lead generation form I can embed on my website homepage. Build an automation that processes form submissions, uses AI to qualify the lead, sends data to an n8n data table. For high-score leads, it should also email them to offer to schedule a 15-min call in a free slot in my calendar.',
|
||||
},
|
||||
{
|
||||
id: 'youtube-auto-chapters',
|
||||
name: 'YouTube video chapters',
|
||||
prompt:
|
||||
"Build an n8n workflow that automatically generates YouTube chapter timestamps from video captions. Use the n8n chat trigger for me to enter the URL of the YouTube video. Use the YouTube Get a video node to get the video title, description, and existing metadata. Use the YouTube Captions API to download the transcript for the given video ID. Send the transcript to AI agent using Anthropic's Claude model. Prompt the model to identify topic shifts and return structured output in timestamp - chapter format. Append the generated chapter list to the existing video description. Use the YouTube Update a video node to update the video description. Respond back with the updates using the respond to chat node.",
|
||||
},
|
||||
{
|
||||
id: 'google-sheets-processing',
|
||||
name: 'Process large Google Sheets data',
|
||||
prompt:
|
||||
'Create a workflow that reads all rows from a Google Sheets document with thousands of customer records. For each row, call an external API to get additional customer data, process the response, and update the row with the enriched information. Handle rate limiting and errors gracefully.',
|
||||
referenceWorkflow: loadReferenceWorkflow('google-sheets-processing.json'),
|
||||
},
|
||||
{
|
||||
id: 'extract-from-file',
|
||||
name: 'Extract data from uploaded files',
|
||||
prompt:
|
||||
'Build a workflow that accepts file uploads through an n8n form. When users upload PDF documents, CSV files, or Excel spreadsheets, automatically extract the text content and data from these files. Transform the extracted data into a structured format and save it to a database or send it via email as a summary.',
|
||||
referenceWorkflow: loadReferenceWorkflow('extract-from-file.json'),
|
||||
},
|
||||
];
|
||||
|
|
@ -0,0 +1,453 @@
|
|||
/* eslint-disable @typescript-eslint/naming-convention */
|
||||
import { z } from 'zod';
|
||||
|
||||
import type { BuilderFeatureFlags } from '../../src/workflow-builder-agent.js';
|
||||
import type { LangsmithExampleFilters } from '../harness/harness-types';
|
||||
import { DEFAULTS } from '../support/constants';
|
||||
|
||||
export type EvaluationSuite = 'llm-judge' | 'pairwise' | 'programmatic' | 'similarity';
|
||||
export type EvaluationBackend = 'local' | 'langsmith';
|
||||
|
||||
export interface EvaluationArgs {
|
||||
suite: EvaluationSuite;
|
||||
backend: EvaluationBackend;
|
||||
|
||||
verbose: boolean;
|
||||
repetitions: number;
|
||||
concurrency: number;
|
||||
timeoutMs: number;
|
||||
experimentName?: string;
|
||||
outputDir?: string;
|
||||
datasetName?: string;
|
||||
maxExamples?: number;
|
||||
filters?: LangsmithExampleFilters;
|
||||
|
||||
testCase?: string;
|
||||
promptsCsv?: string;
|
||||
|
||||
prompt?: string;
|
||||
dos?: string;
|
||||
donts?: string;
|
||||
|
||||
numJudges: number;
|
||||
numGenerations: number;
|
||||
|
||||
featureFlags?: BuilderFeatureFlags;
|
||||
}
|
||||
|
||||
type CliValueKind = 'boolean' | 'string';
|
||||
type FlagGroup = 'input' | 'eval' | 'pairwise' | 'langsmith' | 'output' | 'feature' | 'advanced';
|
||||
|
||||
const cliSchema = z
|
||||
.object({
|
||||
suite: z.enum(['llm-judge', 'pairwise', 'programmatic', 'similarity']).default('llm-judge'),
|
||||
backend: z.enum(['local', 'langsmith']).default('local'),
|
||||
|
||||
verbose: z.boolean().default(false),
|
||||
repetitions: z.coerce.number().int().positive().default(DEFAULTS.REPETITIONS),
|
||||
concurrency: z.coerce.number().int().positive().default(DEFAULTS.CONCURRENCY),
|
||||
timeoutMs: z.coerce.number().int().positive().default(DEFAULTS.TIMEOUT_MS),
|
||||
experimentName: z.string().min(1).optional(),
|
||||
outputDir: z.string().min(1).optional(),
|
||||
datasetName: z.string().min(1).optional(),
|
||||
maxExamples: z.coerce.number().int().positive().optional(),
|
||||
filter: z.array(z.string().min(1)).default([]),
|
||||
notionId: z.string().min(1).optional(),
|
||||
technique: z.string().min(1).optional(),
|
||||
|
||||
testCase: z.string().min(1).optional(),
|
||||
promptsCsv: z.string().min(1).optional(),
|
||||
|
||||
prompt: z.string().min(1).optional(),
|
||||
dos: z.string().min(1).optional(),
|
||||
donts: z.string().min(1).optional(),
|
||||
|
||||
numJudges: z.coerce.number().int().positive().default(DEFAULTS.NUM_JUDGES),
|
||||
numGenerations: z.coerce.number().int().positive().max(10).default(DEFAULTS.NUM_GENERATIONS),
|
||||
|
||||
langsmith: z.boolean().optional(),
|
||||
templateExamples: z.boolean().default(false),
|
||||
})
|
||||
.strict();
|
||||
|
||||
type CliKey = keyof z.infer<typeof cliSchema>;
|
||||
|
||||
type FlagDef = { key: CliKey; kind: CliValueKind; desc: string; group: FlagGroup };
|
||||
|
||||
const FLAG_DEFS: Record<string, FlagDef> = {
|
||||
// Input sources
|
||||
'--prompt': { key: 'prompt', kind: 'string', group: 'input', desc: 'Single prompt to evaluate' },
|
||||
'--prompts-csv': {
|
||||
key: 'promptsCsv',
|
||||
kind: 'string',
|
||||
group: 'input',
|
||||
desc: 'CSV file with prompts',
|
||||
},
|
||||
'--test-case': {
|
||||
key: 'testCase',
|
||||
kind: 'string',
|
||||
group: 'input',
|
||||
desc: 'Run specific default test case by ID',
|
||||
},
|
||||
'--dataset': {
|
||||
key: 'datasetName',
|
||||
kind: 'string',
|
||||
group: 'input',
|
||||
desc: 'LangSmith dataset name',
|
||||
},
|
||||
|
||||
// Evaluation options
|
||||
'--suite': {
|
||||
key: 'suite',
|
||||
kind: 'string',
|
||||
group: 'eval',
|
||||
desc: 'Evaluation suite (llm-judge|pairwise|programmatic|similarity)',
|
||||
},
|
||||
'--backend': { key: 'backend', kind: 'string', group: 'eval', desc: 'Backend (local|langsmith)' },
|
||||
'--max-examples': {
|
||||
key: 'maxExamples',
|
||||
kind: 'string',
|
||||
group: 'eval',
|
||||
desc: 'Limit number of examples',
|
||||
},
|
||||
'--repetitions': {
|
||||
key: 'repetitions',
|
||||
kind: 'string',
|
||||
group: 'eval',
|
||||
desc: 'Repeat each example N times',
|
||||
},
|
||||
'--concurrency': {
|
||||
key: 'concurrency',
|
||||
kind: 'string',
|
||||
group: 'eval',
|
||||
desc: 'Max parallel evaluations',
|
||||
},
|
||||
'--timeout-ms': {
|
||||
key: 'timeoutMs',
|
||||
kind: 'string',
|
||||
group: 'eval',
|
||||
desc: 'Timeout per evaluation (ms)',
|
||||
},
|
||||
|
||||
// Pairwise options
|
||||
'--dos': {
|
||||
key: 'dos',
|
||||
kind: 'string',
|
||||
group: 'pairwise',
|
||||
desc: 'Requirements the workflow must satisfy',
|
||||
},
|
||||
'--donts': {
|
||||
key: 'donts',
|
||||
kind: 'string',
|
||||
group: 'pairwise',
|
||||
desc: 'Things the workflow must avoid',
|
||||
},
|
||||
|
||||
// LangSmith options
|
||||
'--langsmith': {
|
||||
key: 'langsmith',
|
||||
kind: 'boolean',
|
||||
group: 'langsmith',
|
||||
desc: 'Shorthand for --backend langsmith',
|
||||
},
|
||||
'--name': { key: 'experimentName', kind: 'string', group: 'langsmith', desc: 'Experiment name' },
|
||||
'--filter': {
|
||||
key: 'filter',
|
||||
kind: 'string',
|
||||
group: 'langsmith',
|
||||
desc: 'Filter examples (key:value, repeatable)',
|
||||
},
|
||||
'--notion-id': {
|
||||
key: 'notionId',
|
||||
kind: 'string',
|
||||
group: 'langsmith',
|
||||
desc: 'Filter by Notion ID',
|
||||
},
|
||||
'--technique': {
|
||||
key: 'technique',
|
||||
kind: 'string',
|
||||
group: 'langsmith',
|
||||
desc: 'Filter by technique',
|
||||
},
|
||||
|
||||
// Output
|
||||
'--output-dir': {
|
||||
key: 'outputDir',
|
||||
kind: 'string',
|
||||
group: 'output',
|
||||
desc: 'Directory for artifacts',
|
||||
},
|
||||
'--verbose': { key: 'verbose', kind: 'boolean', group: 'output', desc: 'Verbose logging' },
|
||||
|
||||
// Feature flags
|
||||
'--template-examples': {
|
||||
key: 'templateExamples',
|
||||
kind: 'boolean',
|
||||
group: 'feature',
|
||||
desc: 'Enable template examples phase',
|
||||
},
|
||||
|
||||
// Advanced
|
||||
'--judges': { key: 'numJudges', kind: 'string', group: 'advanced', desc: 'Number of LLM judges' },
|
||||
'--generations': {
|
||||
key: 'numGenerations',
|
||||
kind: 'string',
|
||||
group: 'advanced',
|
||||
desc: 'Workflow generations per prompt',
|
||||
},
|
||||
};
|
||||
|
||||
// Aliases (not shown in help)
|
||||
const FLAG_ALIASES: Record<string, string> = {
|
||||
'--mode': '--suite',
|
||||
'-v': '--verbose',
|
||||
};
|
||||
|
||||
// Combined lookup for parsing
|
||||
const FLAG_TO_KEY: Record<string, FlagDef> = {
|
||||
...FLAG_DEFS,
|
||||
...Object.fromEntries(
|
||||
Object.entries(FLAG_ALIASES).map(([alias, target]) => [alias, FLAG_DEFS[target]]),
|
||||
),
|
||||
};
|
||||
|
||||
function formatValidFlags(): string {
|
||||
return Object.keys(FLAG_TO_KEY)
|
||||
.filter((f) => f.startsWith('--'))
|
||||
.sort()
|
||||
.join('\n ');
|
||||
}
|
||||
|
||||
const GROUP_TITLES: Record<FlagGroup, string> = {
|
||||
input: 'Input Sources',
|
||||
eval: 'Evaluation Options',
|
||||
pairwise: 'Pairwise Options',
|
||||
langsmith: 'LangSmith Options',
|
||||
output: 'Output',
|
||||
feature: 'Feature Flags',
|
||||
advanced: 'Advanced',
|
||||
};
|
||||
|
||||
function formatHelp(): string {
|
||||
const lines: string[] = [
|
||||
'Usage: pnpm eval [options]',
|
||||
'',
|
||||
'Evaluation harness for AI Workflow Builder.',
|
||||
'',
|
||||
];
|
||||
|
||||
const groups: FlagGroup[] = [
|
||||
'input',
|
||||
'eval',
|
||||
'pairwise',
|
||||
'langsmith',
|
||||
'output',
|
||||
'feature',
|
||||
'advanced',
|
||||
];
|
||||
|
||||
for (const group of groups) {
|
||||
const flags = Object.entries(FLAG_DEFS).filter(([, def]) => def.group === group);
|
||||
if (flags.length === 0) continue;
|
||||
|
||||
lines.push(`${GROUP_TITLES[group]}:`);
|
||||
for (const [flag, def] of flags) {
|
||||
const valueHint = def.kind === 'string' ? ' <value>' : '';
|
||||
const padded = ` ${flag}${valueHint}`.padEnd(28);
|
||||
lines.push(`${padded}${def.desc}`);
|
||||
}
|
||||
lines.push('');
|
||||
}
|
||||
|
||||
lines.push('Examples:');
|
||||
lines.push(' pnpm eval --verbose');
|
||||
lines.push(' pnpm eval --prompt "Create a Slack notification workflow"');
|
||||
lines.push(' pnpm eval --prompts-csv my-prompts.csv --max-examples 5');
|
||||
lines.push(' pnpm eval:langsmith --dataset "workflow-builder-canvas-prompts" --name "test-run"');
|
||||
|
||||
return lines.join('\n');
|
||||
}
|
||||
|
||||
export function printHelp(): void {
|
||||
console.log(formatHelp());
|
||||
}
|
||||
|
||||
function ensureValue(argv: string[], i: number, flag: string): string {
|
||||
const value = argv[i + 1];
|
||||
if (value === undefined) throw new Error(`Flag ${flag} requires a value`);
|
||||
return value;
|
||||
}
|
||||
|
||||
function splitFlagToken(token: string): { flag: string; inlineValue?: string } {
|
||||
if (!token.startsWith('--')) return { flag: token };
|
||||
const equalsIndex = token.indexOf('=');
|
||||
if (equalsIndex === -1) return { flag: token };
|
||||
return { flag: token.slice(0, equalsIndex), inlineValue: token.slice(equalsIndex + 1) };
|
||||
}
|
||||
|
||||
function isStringArray(value: unknown): value is string[] {
|
||||
return Array.isArray(value) && value.every((v): v is string => typeof v === 'string');
|
||||
}
|
||||
|
||||
function parseCli(argv: string[]): {
|
||||
values: Partial<Record<CliKey, unknown>>;
|
||||
seenKeys: Set<CliKey>;
|
||||
} {
|
||||
const values: Partial<Record<CliKey, unknown>> = {};
|
||||
const seenKeys = new Set<CliKey>();
|
||||
|
||||
for (let i = 0; i < argv.length; i++) {
|
||||
const token = argv[i];
|
||||
if (!token.startsWith('-')) continue;
|
||||
|
||||
const { flag, inlineValue } = splitFlagToken(token);
|
||||
const def = FLAG_TO_KEY[flag];
|
||||
|
||||
if (!def) {
|
||||
throw new Error(`Unknown flag: ${flag}\n\nValid flags:\n ${formatValidFlags()}`);
|
||||
}
|
||||
|
||||
seenKeys.add(def.key);
|
||||
|
||||
if (def.kind === 'boolean') {
|
||||
values[def.key] = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
const value = inlineValue ?? ensureValue(argv, i, flag);
|
||||
if (inlineValue === undefined) i++;
|
||||
|
||||
if (def.key === 'filter') {
|
||||
const existing = values.filter;
|
||||
values.filter = isStringArray(existing) ? [...existing, value] : [value];
|
||||
continue;
|
||||
}
|
||||
|
||||
values[def.key] = value;
|
||||
}
|
||||
|
||||
return { values, seenKeys };
|
||||
}
|
||||
|
||||
function parseFeatureFlags(args: {
|
||||
templateExamples: boolean;
|
||||
}): BuilderFeatureFlags | undefined {
|
||||
const templateExamplesFromEnv = process.env.EVAL_FEATURE_TEMPLATE_EXAMPLES === 'true';
|
||||
const templateExamples = templateExamplesFromEnv || args.templateExamples;
|
||||
|
||||
if (!templateExamples) return undefined;
|
||||
|
||||
return {
|
||||
templateExamples: templateExamples || undefined,
|
||||
};
|
||||
}
|
||||
|
||||
function parseFilters(args: {
|
||||
filter: string[];
|
||||
notionId?: string;
|
||||
technique?: string;
|
||||
}): LangsmithExampleFilters | undefined {
|
||||
const filters: LangsmithExampleFilters = {};
|
||||
|
||||
for (const raw of args.filter) {
|
||||
const match = raw.match(/^(\w+):(.+)$/);
|
||||
if (!match) {
|
||||
throw new Error('Invalid `--filter` format. Expected: --filter "key:value"');
|
||||
}
|
||||
|
||||
const [, key, valueRaw] = match;
|
||||
const value = valueRaw.trim();
|
||||
if (value.length === 0) {
|
||||
throw new Error(`Invalid \`--filter\` value for "${key}": value cannot be empty`);
|
||||
}
|
||||
switch (key) {
|
||||
case 'do':
|
||||
filters.doSearch = value;
|
||||
break;
|
||||
case 'dont':
|
||||
filters.dontSearch = value;
|
||||
break;
|
||||
case 'technique':
|
||||
filters.technique = value;
|
||||
break;
|
||||
case 'id':
|
||||
filters.notionId = value;
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unknown filter key "${key}". Expected one of: do, dont, technique, id`);
|
||||
}
|
||||
}
|
||||
|
||||
if (args.notionId && !filters.notionId) filters.notionId = args.notionId;
|
||||
if (args.technique && !filters.technique) filters.technique = args.technique;
|
||||
|
||||
const hasAny = Object.values(filters).some((v) => typeof v === 'string' && v.length > 0);
|
||||
return hasAny ? filters : undefined;
|
||||
}
|
||||
|
||||
export function parseEvaluationArgs(argv: string[] = process.argv.slice(2)): EvaluationArgs {
|
||||
// Check for help flag before parsing
|
||||
if (argv.includes('--help') || argv.includes('-h')) {
|
||||
printHelp();
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const { values, seenKeys } = parseCli(argv);
|
||||
|
||||
if (values.langsmith === true) {
|
||||
const backendWasExplicit = seenKeys.has('backend');
|
||||
if (backendWasExplicit && values.backend !== 'langsmith') {
|
||||
throw new Error('Cannot combine `--langsmith` with `--backend local`');
|
||||
}
|
||||
values.backend = 'langsmith';
|
||||
}
|
||||
|
||||
const parsed = cliSchema.parse(values);
|
||||
|
||||
const featureFlags = parseFeatureFlags({
|
||||
templateExamples: parsed.templateExamples,
|
||||
});
|
||||
|
||||
const filters = parseFilters({
|
||||
filter: parsed.filter,
|
||||
notionId: parsed.notionId,
|
||||
technique: parsed.technique,
|
||||
});
|
||||
|
||||
if (parsed.suite !== 'pairwise' && (filters?.doSearch || filters?.dontSearch)) {
|
||||
throw new Error(
|
||||
'`--filter do:` and `--filter dont:` are only supported for `--suite pairwise`',
|
||||
);
|
||||
}
|
||||
|
||||
return {
|
||||
suite: parsed.suite,
|
||||
backend: parsed.backend,
|
||||
verbose: parsed.verbose,
|
||||
repetitions: parsed.repetitions,
|
||||
concurrency: parsed.concurrency,
|
||||
timeoutMs: parsed.timeoutMs,
|
||||
experimentName: parsed.experimentName,
|
||||
outputDir: parsed.outputDir,
|
||||
datasetName: parsed.datasetName,
|
||||
maxExamples: parsed.maxExamples,
|
||||
filters,
|
||||
testCase: parsed.testCase,
|
||||
promptsCsv: parsed.promptsCsv,
|
||||
prompt: parsed.prompt,
|
||||
dos: parsed.dos,
|
||||
donts: parsed.donts,
|
||||
numJudges: parsed.numJudges,
|
||||
numGenerations: parsed.numGenerations,
|
||||
featureFlags,
|
||||
};
|
||||
}
|
||||
|
||||
export function getDefaultExperimentName(suite: EvaluationSuite): string {
|
||||
return suite === 'pairwise' ? DEFAULTS.EXPERIMENT_NAME : DEFAULTS.LLM_JUDGE_EXPERIMENT_NAME;
|
||||
}
|
||||
|
||||
export function getDefaultDatasetName(suite: EvaluationSuite): string {
|
||||
if (suite === 'pairwise') return DEFAULTS.DATASET_NAME;
|
||||
return process.env.LANGSMITH_DATASET_NAME ?? 'workflow-builder-canvas-prompts';
|
||||
}
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
import { parse } from 'csv-parse/sync';
|
||||
import { existsSync, readFileSync } from 'node:fs';
|
||||
import { join, isAbsolute, resolve } from 'node:path';
|
||||
|
||||
import type { TestCase } from '../harness/harness-types.js';
|
||||
|
||||
/** Path to the default prompts CSV fixture */
|
||||
const DEFAULT_PROMPTS_PATH = join(__dirname, '..', 'fixtures', 'default-prompts.csv');
|
||||
|
||||
type ParsedCsvRow = string[];
|
||||
|
||||
function isHeaderRow(row: ParsedCsvRow) {
|
||||
return row.some((cell) => cell.trim().toLowerCase() === 'prompt');
|
||||
}
|
||||
|
||||
function detectColumnIndex(header: ParsedCsvRow, name: string) {
|
||||
const normalized = name.toLowerCase();
|
||||
const index = header.findIndex((cell) => cell.trim().toLowerCase() === normalized);
|
||||
return index >= 0 ? index : undefined;
|
||||
}
|
||||
|
||||
function sanitizeValue(value: string | undefined) {
|
||||
return value?.trim() ?? '';
|
||||
}
|
||||
|
||||
function parseCsv(content: string): ParsedCsvRow[] {
|
||||
try {
|
||||
const rows = parse(content.replace(/^\ufeff/, ''), {
|
||||
columns: false,
|
||||
skip_empty_lines: true,
|
||||
trim: true,
|
||||
relax_column_count: true,
|
||||
}) as ParsedCsvRow[];
|
||||
|
||||
return rows.map((row) => row.map((cell) => cell ?? ''));
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : 'Unknown parsing error';
|
||||
throw new Error(`Failed to parse CSV file: ${message}`);
|
||||
}
|
||||
}
|
||||
|
||||
export function loadTestCasesFromCsv(csvPath: string): TestCase[] {
|
||||
const resolvedPath = isAbsolute(csvPath) ? csvPath : resolve(process.cwd(), csvPath);
|
||||
|
||||
if (!existsSync(resolvedPath)) {
|
||||
throw new Error(`CSV file not found at ${resolvedPath}`);
|
||||
}
|
||||
|
||||
const rows = parseCsv(readFileSync(resolvedPath, 'utf8'));
|
||||
|
||||
if (rows.length === 0) {
|
||||
throw new Error('The provided CSV file is empty');
|
||||
}
|
||||
|
||||
const hasHeader = isHeaderRow(rows[0]);
|
||||
const header = hasHeader ? rows[0] : undefined;
|
||||
const dataRows = hasHeader ? rows.slice(1) : rows;
|
||||
|
||||
if (dataRows.length === 0) {
|
||||
throw new Error('No prompt rows found in the provided CSV file');
|
||||
}
|
||||
|
||||
// Find column index by name(s), returns undefined if no header
|
||||
const findColumn = (...names: string[]): number | undefined => {
|
||||
if (!header) return undefined;
|
||||
for (const name of names) {
|
||||
const idx = detectColumnIndex(header, name);
|
||||
if (idx !== undefined) return idx;
|
||||
}
|
||||
return undefined;
|
||||
};
|
||||
|
||||
const promptIdx = findColumn('prompt') ?? 0;
|
||||
const idIdx = findColumn('id');
|
||||
const dosIdx = findColumn('dos', 'do');
|
||||
const dontsIdx = findColumn('donts', 'dont');
|
||||
|
||||
const getCell = (row: ParsedCsvRow, idx: number | undefined): string =>
|
||||
idx !== undefined ? sanitizeValue(row[idx]) : '';
|
||||
|
||||
const testCases: TestCase[] = [];
|
||||
|
||||
for (let i = 0; i < dataRows.length; i++) {
|
||||
const row = dataRows[i];
|
||||
const prompt = getCell(row, promptIdx);
|
||||
|
||||
if (!prompt) continue;
|
||||
|
||||
const dos = getCell(row, dosIdx);
|
||||
const donts = getCell(row, dontsIdx);
|
||||
|
||||
const testCase: TestCase = {
|
||||
id: getCell(row, idIdx) || `csv-case-${i + 1}`,
|
||||
prompt,
|
||||
};
|
||||
|
||||
if (dos || donts) {
|
||||
testCase.context = {};
|
||||
if (dos) testCase.context.dos = dos;
|
||||
if (donts) testCase.context.donts = donts;
|
||||
}
|
||||
|
||||
testCases.push(testCase);
|
||||
}
|
||||
|
||||
if (testCases.length === 0) {
|
||||
throw new Error('No valid prompts found in the provided CSV file');
|
||||
}
|
||||
|
||||
return testCases;
|
||||
}
|
||||
|
||||
/** Cached default test cases */
|
||||
let cachedDefaultTestCases: TestCase[] | null = null;
|
||||
|
||||
/**
|
||||
* Load the default test cases from the bundled CSV fixture.
|
||||
* Results are cached after first load.
|
||||
*/
|
||||
export function loadDefaultTestCases(): TestCase[] {
|
||||
cachedDefaultTestCases ??= loadTestCasesFromCsv(DEFAULT_PROMPTS_PATH);
|
||||
return cachedDefaultTestCases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get available test case IDs from the default fixture.
|
||||
*/
|
||||
export function getDefaultTestCaseIds(): string[] {
|
||||
return loadDefaultTestCases()
|
||||
.map((tc) => tc.id)
|
||||
.filter((id): id is string => id !== undefined);
|
||||
}
|
||||
|
|
@ -1,119 +0,0 @@
|
|||
import cliProgress from 'cli-progress';
|
||||
import pc from 'picocolors';
|
||||
|
||||
import type { TestCase } from '../types/evaluation.js';
|
||||
import type { TestResult } from '../types/test-result.js';
|
||||
import {
|
||||
calculateTestMetrics,
|
||||
calculateCategoryAverages,
|
||||
countViolationsByType,
|
||||
calculateProgrammaticAverages,
|
||||
countProgrammaticViolationsByType,
|
||||
} from '../utils/evaluation-calculator.js';
|
||||
import {
|
||||
displayTestResults,
|
||||
displaySummaryTable,
|
||||
displayViolationsDetail,
|
||||
displayCacheStatistics,
|
||||
} from '../utils/evaluation-reporter.js';
|
||||
|
||||
/**
|
||||
* Creates a progress bar for test execution
|
||||
* @param total - Total number of tests
|
||||
* @returns Progress bar instance
|
||||
*/
|
||||
export function createProgressBar(total: number): cliProgress.SingleBar {
|
||||
const progressBar = new cliProgress.SingleBar(
|
||||
{
|
||||
format: 'Progress |{bar}| {percentage}% | {value}/{total} Tests | {status}',
|
||||
barCompleteChar: '█',
|
||||
barIncompleteChar: '░',
|
||||
hideCursor: true,
|
||||
},
|
||||
cliProgress.Presets.shades_classic,
|
||||
);
|
||||
progressBar.start(total, 0, { status: 'Starting...' });
|
||||
return progressBar;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates progress bar with current status
|
||||
* @param progressBar - Progress bar instance
|
||||
* @param completed - Number of completed tests
|
||||
* @param total - Total number of tests
|
||||
* @param status - Optional status message
|
||||
*/
|
||||
export function updateProgress(
|
||||
progressBar: cliProgress.SingleBar,
|
||||
completed: number,
|
||||
total: number,
|
||||
status?: string,
|
||||
): void {
|
||||
progressBar.update(completed, {
|
||||
status: status ?? `${completed}/${total} completed`,
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Displays evaluation results in the console
|
||||
* @param testCases - Array of test cases
|
||||
* @param results - Array of test results
|
||||
* @param totalTime - Total execution time in milliseconds
|
||||
*/
|
||||
export function displayResults(
|
||||
testCases: TestCase[],
|
||||
results: TestResult[],
|
||||
totalTime: number,
|
||||
): void {
|
||||
// Display test results
|
||||
displayTestResults(testCases, results);
|
||||
|
||||
console.log();
|
||||
console.log(pc.green(`✓ All tests completed in ${(totalTime / 1000).toFixed(1)}s`));
|
||||
|
||||
// Calculate metrics
|
||||
const metrics = calculateTestMetrics(results);
|
||||
const categoryAverages = calculateCategoryAverages(results);
|
||||
const violationCounts = countViolationsByType(results);
|
||||
const programmaticAverages = calculateProgrammaticAverages(results);
|
||||
const programmaticViolationCounts = countProgrammaticViolationsByType(results);
|
||||
|
||||
const combinedMetrics = {
|
||||
...metrics,
|
||||
categoryAverages,
|
||||
violationCounts,
|
||||
programmaticAverages,
|
||||
programmaticViolationCounts,
|
||||
};
|
||||
|
||||
// Display summary
|
||||
displaySummaryTable(combinedMetrics);
|
||||
|
||||
// Display cache statistics
|
||||
displayCacheStatistics(results);
|
||||
|
||||
// Display violations if any exist (from either LLM or programmatic evaluation)
|
||||
const hasLLMViolations =
|
||||
violationCounts.critical > 0 || violationCounts.major > 0 || violationCounts.minor > 0;
|
||||
const hasProgViolations =
|
||||
programmaticViolationCounts.critical > 0 ||
|
||||
programmaticViolationCounts.major > 0 ||
|
||||
programmaticViolationCounts.minor > 0;
|
||||
|
||||
if (hasLLMViolations || hasProgViolations) {
|
||||
displayViolationsDetail(results);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Displays error message and exits
|
||||
* @param message - Error message
|
||||
* @param error - Optional error object
|
||||
*/
|
||||
export function displayError(message: string, error?: unknown): void {
|
||||
console.error(pc.red(`✗ ${message}`));
|
||||
if (error) {
|
||||
console.error(error);
|
||||
}
|
||||
process.exit(1);
|
||||
}
|
||||
243
packages/@n8n/ai-workflow-builder.ee/evaluations/cli/index.ts
Normal file
243
packages/@n8n/ai-workflow-builder.ee/evaluations/cli/index.ts
Normal file
|
|
@ -0,0 +1,243 @@
|
|||
/**
|
||||
* V2 CLI Entry Point
|
||||
*
|
||||
* Demonstrates how to use the v2 evaluation harness.
|
||||
* Can be run directly or used as a reference for custom setups.
|
||||
*/
|
||||
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
import pLimit from 'p-limit';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
import type { BuilderFeatureFlags } from '@/workflow-builder-agent';
|
||||
|
||||
import {
|
||||
getDefaultDatasetName,
|
||||
getDefaultExperimentName,
|
||||
parseEvaluationArgs,
|
||||
} from './argument-parser';
|
||||
import {
|
||||
runEvaluation,
|
||||
createConsoleLifecycle,
|
||||
createLLMJudgeEvaluator,
|
||||
createProgrammaticEvaluator,
|
||||
createPairwiseEvaluator,
|
||||
createSimilarityEvaluator,
|
||||
type RunConfig,
|
||||
type TestCase,
|
||||
type Evaluator,
|
||||
type EvaluationContext,
|
||||
} from '../index';
|
||||
import {
|
||||
loadTestCasesFromCsv,
|
||||
loadDefaultTestCases,
|
||||
getDefaultTestCaseIds,
|
||||
} from './csv-prompt-loader';
|
||||
import { consumeGenerator, getChatPayload } from '../harness/evaluation-helpers';
|
||||
import { createLogger } from '../harness/logger';
|
||||
import { generateRunId, isWorkflowStateValues } from '../langsmith/types';
|
||||
import { EVAL_TYPES, EVAL_USERS } from '../support/constants';
|
||||
import { setupTestEnvironment, createAgent } from '../support/environment';
|
||||
|
||||
/**
|
||||
* Create a workflow generator function.
|
||||
* NOTE: Don't pass a tracer - LangSmith tracing is handled via traceable() in the runner.
|
||||
*/
|
||||
function createWorkflowGenerator(
|
||||
parsedNodeTypes: INodeTypeDescription[],
|
||||
llm: BaseChatModel,
|
||||
featureFlags?: BuilderFeatureFlags,
|
||||
): (prompt: string) => Promise<SimpleWorkflow> {
|
||||
return async (prompt: string): Promise<SimpleWorkflow> => {
|
||||
const runId = generateRunId();
|
||||
|
||||
const agent = createAgent({
|
||||
parsedNodeTypes,
|
||||
llm,
|
||||
featureFlags,
|
||||
});
|
||||
|
||||
await consumeGenerator(
|
||||
agent.chat(
|
||||
getChatPayload({
|
||||
evalType: EVAL_TYPES.LANGSMITH,
|
||||
message: prompt,
|
||||
workflowId: runId,
|
||||
featureFlags,
|
||||
}),
|
||||
EVAL_USERS.LANGSMITH,
|
||||
),
|
||||
);
|
||||
|
||||
const state = await agent.getState(runId, EVAL_USERS.LANGSMITH);
|
||||
|
||||
if (!state.values || !isWorkflowStateValues(state.values)) {
|
||||
throw new Error('Invalid workflow state: workflow or messages missing');
|
||||
}
|
||||
|
||||
return state.values.workflowJSON;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Load test cases from various sources.
|
||||
*/
|
||||
function loadTestCases(args: ReturnType<typeof parseEvaluationArgs>): TestCase[] {
|
||||
// From CSV file
|
||||
if (args.promptsCsv) {
|
||||
const testCases = loadTestCasesFromCsv(args.promptsCsv);
|
||||
return args.maxExamples ? testCases.slice(0, args.maxExamples) : testCases;
|
||||
}
|
||||
|
||||
// Predefined test case by id
|
||||
if (args.testCase) {
|
||||
const defaultCases = loadDefaultTestCases();
|
||||
const match = defaultCases.find((tc) => tc.id === args.testCase);
|
||||
if (!match) {
|
||||
const options = getDefaultTestCaseIds().join(', ');
|
||||
throw new Error(`Unknown --test-case "${args.testCase}". Available: ${options}`);
|
||||
}
|
||||
|
||||
const testCases: TestCase[] = [
|
||||
{
|
||||
prompt: match.prompt,
|
||||
id: match.id,
|
||||
context: { dos: args.dos, donts: args.donts },
|
||||
},
|
||||
];
|
||||
|
||||
return args.maxExamples ? testCases.slice(0, args.maxExamples) : testCases;
|
||||
}
|
||||
|
||||
// Single prompt from CLI
|
||||
if (args.prompt) {
|
||||
const testCases: TestCase[] = [
|
||||
{
|
||||
prompt: args.prompt,
|
||||
context: {
|
||||
dos: args.dos,
|
||||
donts: args.donts,
|
||||
},
|
||||
},
|
||||
];
|
||||
return args.maxExamples ? testCases.slice(0, args.maxExamples) : testCases;
|
||||
}
|
||||
|
||||
// Default: use bundled test cases
|
||||
const defaultCases = loadDefaultTestCases();
|
||||
return args.maxExamples ? defaultCases.slice(0, args.maxExamples) : defaultCases;
|
||||
}
|
||||
|
||||
/**
 * Main entry point for v2 evaluation CLI.
 *
 * Parses CLI arguments, prepares the test environment, selects the evaluator
 * set for the requested suite, assembles a local or LangSmith run config,
 * executes the evaluation, and exits non-zero when fewer than 70% of the
 * examples pass.
 *
 * @throws When LangSmith mode is combined with local-only input flags, or
 *   when the LangSmith client is not configured for the langsmith backend.
 */
export async function runV2Evaluation(): Promise<void> {
	const args = parseEvaluationArgs();

	// Local-only input sources are incompatible with the LangSmith backend,
	// which always pulls its examples from a dataset.
	if (args.backend === 'langsmith' && (args.prompt || args.promptsCsv || args.testCase)) {
		throw new Error(
			'LangSmith mode requires `--dataset` and does not support `--prompt`, `--prompts-csv`, or `--test-case`',
		);
	}

	// Setup environment
	const logger = createLogger(args.verbose);
	const lifecycle = createConsoleLifecycle({ verbose: args.verbose, logger });
	const env = await setupTestEnvironment(logger);

	// Validate LangSmith client early if langsmith backend is requested,
	// so we fail fast before any workflow generation happens.
	if (args.backend === 'langsmith' && !env.lsClient) {
		throw new Error('LangSmith client not initialized - check LANGSMITH_API_KEY');
	}

	// Create workflow generator (tracing handled via traceable() in runner)
	const generateWorkflow = createWorkflowGenerator(env.parsedNodeTypes, env.llm, args.featureFlags);

	// Create evaluators based on mode; the programmatic evaluator also runs
	// alongside the llm-judge and pairwise suites.
	const evaluators: Array<Evaluator<EvaluationContext>> = [];

	switch (args.suite) {
		case 'llm-judge':
			evaluators.push(createLLMJudgeEvaluator(env.llm, env.parsedNodeTypes));
			evaluators.push(createProgrammaticEvaluator(env.parsedNodeTypes));
			break;
		case 'pairwise':
			evaluators.push(
				createPairwiseEvaluator(env.llm, {
					numJudges: args.numJudges,
					numGenerations: args.numGenerations,
				}),
			);
			evaluators.push(createProgrammaticEvaluator(env.parsedNodeTypes));
			break;
		case 'programmatic':
			evaluators.push(createProgrammaticEvaluator(env.parsedNodeTypes));
			break;
		case 'similarity':
			evaluators.push(createSimilarityEvaluator());
			break;
	}

	// Build context - include generateWorkflow for multi-gen pairwise, where
	// the evaluator itself needs to produce additional candidate workflows.
	const isMultiGen = args.suite === 'pairwise' && args.numGenerations > 1;
	const llmCallLimiter = pLimit(args.concurrency);

	const baseConfig = {
		generateWorkflow,
		evaluators,
		lifecycle,
		logger,
		outputDir: args.outputDir,
		timeoutMs: args.timeoutMs,
		context: isMultiGen ? { generateWorkflow, llmCallLimiter } : { llmCallLimiter },
	};

	const config: RunConfig =
		args.backend === 'langsmith'
			? {
					...baseConfig,
					mode: 'langsmith',
					dataset: args.datasetName ?? getDefaultDatasetName(args.suite),
					// Safe: validated non-null above for the langsmith backend.
					langsmithClient: env.lsClient!,
					langsmithOptions: {
						experimentName: args.experimentName ?? getDefaultExperimentName(args.suite),
						repetitions: args.repetitions,
						concurrency: args.concurrency,
						maxExamples: args.maxExamples,
						filters: args.filters,
						experimentMetadata:
							args.suite === 'pairwise'
								? {
										numJudges: args.numJudges,
										numGenerations: args.numGenerations,
										scoringMethod:
											args.numGenerations > 1 ? 'hierarchical-multi-generation' : 'hierarchical',
									}
								: undefined,
					},
				}
			: {
					...baseConfig,
					mode: 'local',
					dataset: loadTestCases(args),
				};

	// Run evaluation
	const summary = await runEvaluation(config);

	// Exit with appropriate code:
	// a run passes overall when at least 70% of examples passed.
	const passRate = summary.totalExamples > 0 ? summary.passed / summary.totalExamples : 0;
	process.exit(passRate >= 0.7 ? 0 : 1);
}
|
||||
|
||||
// Run if called directly
|
||||
if (require.main === module) {
|
||||
runV2Evaluation().catch((error) => {
|
||||
const logger = createLogger(true);
|
||||
const message = error instanceof Error ? (error.stack ?? error.message) : String(error);
|
||||
logger.error(`Evaluation failed: ${message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
|
|
@ -1,173 +0,0 @@
|
|||
import pLimit from 'p-limit';
|
||||
import pc from 'picocolors';
|
||||
|
||||
import { createProgressBar, updateProgress, displayResults, displayError } from './display.js';
|
||||
import type { BuilderFeatureFlags } from '../../src/workflow-builder-agent.js';
|
||||
import { basicTestCases, generateTestCases } from '../chains/test-case-generator.js';
|
||||
import {
|
||||
setupTestEnvironment,
|
||||
createAgent,
|
||||
getConcurrencyLimit,
|
||||
shouldGenerateTestCases,
|
||||
howManyTestCasesToGenerate,
|
||||
} from '../core/environment.js';
|
||||
import { runSingleTest, initializeTestTracking } from '../core/test-runner.js';
|
||||
import type { TestCase } from '../types/evaluation.js';
|
||||
import type { TestResult } from '../types/test-result.js';
|
||||
import {
|
||||
calculateTestMetrics,
|
||||
calculateCategoryAverages,
|
||||
countViolationsByType,
|
||||
} from '../utils/evaluation-calculator.js';
|
||||
import { formatHeader, saveEvaluationResults } from '../utils/evaluation-helpers.js';
|
||||
import { generateMarkdownReport } from '../utils/evaluation-reporter.js';
|
||||
|
||||
/** Options accepted by runCliEvaluation. */
type CliEvaluationOptions = {
	testCaseFilter?: string; // Optional test case ID to run only a specific test
	testCases?: TestCase[]; // Optional array of test cases to run (if not provided, uses defaults and generation)
	repetitions?: number; // Number of times to run each test (e.g. for cache warming analysis)
	featureFlags?: BuilderFeatureFlags; // Optional feature flags to pass to the agent (e.g. templateExamples)
};
|
||||
|
||||
/**
 * Main CLI evaluation runner that executes all test cases in parallel.
 * Supports concurrency control via the EVALUATION_CONCURRENCY environment
 * variable (read through getConcurrencyLimit).
 *
 * Each repetition re-runs the full test set (useful for cache-warming
 * analysis); the LAST repetition's results are used for display and the
 * saved report. Failures are reported via displayError, which exits the
 * process.
 *
 * @param options - Optional test selection, repetitions, and feature flags
 */
export async function runCliEvaluation(options: CliEvaluationOptions = {}): Promise<void> {
	const { repetitions = 1, testCaseFilter, featureFlags } = options;

	console.log(formatHeader('AI Workflow Builder Full Evaluation', 70));
	if (repetitions > 1) {
		console.log(pc.yellow(`➔ Each test will be run ${repetitions} times for cache analysis`));
	}
	if (featureFlags) {
		// Announce only the flags that are actually switched on.
		const enabledFlags = Object.entries(featureFlags)
			.filter(([, v]) => v === true)
			.map(([k]) => k);
		if (enabledFlags.length > 0) {
			console.log(pc.green(`➔ Feature flags enabled: ${enabledFlags.join(', ')}`));
		}
	}
	console.log();
	try {
		// Setup test environment
		const { parsedNodeTypes, llm, tracer } = await setupTestEnvironment();

		// Determine test cases to run: caller-provided cases take precedence
		// over the built-in basic set.
		const providedTestCases =
			options.testCases && options.testCases.length > 0 ? options.testCases : undefined;

		let testCases: TestCase[] = providedTestCases ?? basicTestCases;

		if (providedTestCases) {
			console.log(pc.blue(`➔ Loaded ${providedTestCases.length} test cases from CSV`));
		}

		// Filter to single test case if specified; an unknown id aborts the run.
		if (testCaseFilter) {
			const filteredCase = testCases.find((tc) => tc.id === testCaseFilter);
			if (filteredCase) {
				testCases = [filteredCase];
				console.log(pc.blue(`➔ Running single test case: ${filteredCase.name}`));
			} else {
				console.log(pc.red(`❌ Test case '${testCaseFilter}' not found`));
				console.log(pc.dim(`Available test cases: ${testCases.map((tc) => tc.id).join(', ')}`));
				return;
			}
		} else {
			// Optionally generate additional test cases via the LLM
			// (only when no explicit cases were provided).
			if (!providedTestCases && shouldGenerateTestCases()) {
				console.log(pc.blue('➔ Generating additional test cases...'));
				const generatedCases = await generateTestCases(llm, howManyTestCasesToGenerate());
				testCases = [...testCases, ...generatedCases];
			}
		}

		// Get concurrency from environment
		const concurrency = getConcurrencyLimit();
		console.log(pc.dim(`Running ${testCases.length} test cases with concurrency=${concurrency}`));
		console.log();

		const startTime = Date.now();
		const allRepetitionResults: TestResult[][] = [];

		// Run tests for each repetition
		for (let rep = 0; rep < repetitions; rep++) {
			if (repetitions > 1) {
				console.log(pc.cyan(`\n═══ Repetition ${rep + 1}/${repetitions} ═══\n`));
			}

			// Create progress bar for this repetition
			const progressBar = createProgressBar(testCases.length);

			// Create concurrency limiter
			const limit = pLimit(concurrency);

			// Track progress
			let completed = 0;
			const testResults = initializeTestTracking(testCases);

			// Run all test cases in parallel with concurrency limit
			const promises = testCases.map(
				async (testCase) =>
					await limit(async () => {
						updateProgress(progressBar, completed, testCases.length, `Running: ${testCase.name}`);

						// Create a dedicated agent for this test to avoid state conflicts
						const testAgent = createAgent({ parsedNodeTypes, llm, tracer });
						const result = await runSingleTest(testAgent, llm, testCase, parsedNodeTypes, {
							featureFlags,
						});

						testResults[testCase.id] = result.error ? 'fail' : 'pass';
						completed++;
						updateProgress(progressBar, completed, testCases.length);
						return result;
					}),
			);

			const results = await Promise.all(promises);
			progressBar.stop();
			allRepetitionResults.push(results);

			// Show brief stats for this repetition if running multiple times
			if (repetitions > 1) {
				const repStats = results.map((r) => r.cacheStats).filter((s) => s !== undefined);
				if (repStats.length > 0) {
					const avgHitRate = repStats.reduce((sum, s) => sum + s.cacheHitRate, 0) / repStats.length;
					console.log(
						pc.dim(`\n Repetition ${rep + 1} cache hit rate: ${(avgHitRate * 100).toFixed(1)}%`),
					);
				}
			}
		}

		const totalTime = Date.now() - startTime;

		// Use last repetition results for display (most representative)
		const results = allRepetitionResults[allRepetitionResults.length - 1];

		// Display results
		displayResults(testCases, results, totalTime);

		// Calculate metrics for report
		const metrics = calculateTestMetrics(results);
		const categoryAverages = calculateCategoryAverages(results);
		const violationCounts = countViolationsByType(results);

		const combinedMetrics = {
			...metrics,
			categoryAverages,
			violationCounts,
		};

		// Generate and save results
		const report = generateMarkdownReport(results, combinedMetrics);
		const { reportPath, resultsPath } = saveEvaluationResults(results, report);

		console.log(`\nReport saved to: ${reportPath}`);
		console.log(`Detailed results saved to: ${resultsPath}`);
	} catch (error) {
		// displayError logs and terminates the process with exit code 1.
		displayError('Evaluation failed', error);
	}
}
|
||||
|
|
@ -1,166 +0,0 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
import type { BuilderFeatureFlags, WorkflowBuilderAgent } from '../../src/workflow-builder-agent';
|
||||
import { evaluateWorkflow } from '../chains/workflow-evaluator';
|
||||
import { programmaticEvaluation } from '../programmatic/programmatic-evaluation';
|
||||
import type { EvaluationInput, TestCase } from '../types/evaluation';
|
||||
import { isWorkflowStateValues, safeExtractUsage } from '../types/langsmith';
|
||||
import type { TestResult } from '../types/test-result';
|
||||
import { calculateCacheStats } from '../utils/cache-analyzer';
|
||||
import { consumeGenerator, getChatPayload } from '../utils/evaluation-helpers';
|
||||
|
||||
/**
|
||||
* Creates an error result for a failed test
|
||||
* @param testCase - The test case that failed
|
||||
* @param error - The error that occurred
|
||||
* @returns TestResult with error information
|
||||
*/
|
||||
export function createErrorResult(testCase: TestCase, error: unknown): TestResult {
|
||||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||||
|
||||
return {
|
||||
testCase,
|
||||
generatedWorkflow: { nodes: [], connections: {}, name: 'Generated Workflow' },
|
||||
evaluationResult: {
|
||||
overallScore: 0,
|
||||
functionality: { score: 0, violations: [] },
|
||||
connections: { score: 0, violations: [] },
|
||||
expressions: { score: 0, violations: [] },
|
||||
nodeConfiguration: { score: 0, violations: [] },
|
||||
efficiency: {
|
||||
score: 0,
|
||||
violations: [],
|
||||
redundancyScore: 0,
|
||||
pathOptimization: 0,
|
||||
nodeCountEfficiency: 0,
|
||||
},
|
||||
dataFlow: {
|
||||
score: 0,
|
||||
violations: [],
|
||||
},
|
||||
maintainability: {
|
||||
score: 0,
|
||||
violations: [],
|
||||
nodeNamingQuality: 0,
|
||||
workflowOrganization: 0,
|
||||
modularity: 0,
|
||||
},
|
||||
bestPractices: {
|
||||
score: 0,
|
||||
violations: [],
|
||||
techniques: [],
|
||||
},
|
||||
structuralSimilarity: { score: 0, violations: [], applicable: false },
|
||||
summary: `Evaluation failed: ${errorMessage}`,
|
||||
},
|
||||
programmaticEvaluationResult: {
|
||||
overallScore: 0,
|
||||
connections: { violations: [], score: 0 },
|
||||
nodes: { violations: [], score: 0 },
|
||||
trigger: { violations: [], score: 0 },
|
||||
agentPrompt: { violations: [], score: 0 },
|
||||
tools: { violations: [], score: 0 },
|
||||
fromAi: { violations: [], score: 0 },
|
||||
credentials: { violations: [], score: 0 },
|
||||
similarity: null,
|
||||
},
|
||||
generationTime: 0,
|
||||
error: errorMessage,
|
||||
};
|
||||
}
|
||||
|
||||
/** Inputs describing a single evaluation test run. */
export interface RunSingleTestOptions {
	// Workflow builder agent that generates the workflow.
	agent: WorkflowBuilderAgent;
	// Language model used by the LLM-based evaluator.
	llm: BaseChatModel;
	// Test case to execute.
	testCase: TestCase;
	// Known node type descriptions, passed to the programmatic evaluation.
	nodeTypes: INodeTypeDescription[];
	// Session user id; runSingleTest falls back to 'test-user' when omitted.
	userId?: string;
	// Feature flags forwarded to the agent (e.g. templateExamples).
	featureFlags?: BuilderFeatureFlags;
}
|
||||
|
||||
/**
 * Runs a single test case: generates a workflow via the agent, then scores
 * it with both the LLM judge and the programmatic evaluator.
 *
 * Never throws — any failure (generation, state retrieval, or evaluation)
 * is converted into an error result via createErrorResult.
 *
 * @param agent - The workflow builder agent to use
 * @param llm - Language model for evaluation
 * @param testCase - Test case to execute
 * @param nodeTypes - Array of node type descriptions
 * @param opts - Optional userId (session user, defaults to 'test-user') and
 *   featureFlags forwarded to the agent
 * @returns Test result with generated workflow and evaluation
 */
export async function runSingleTest(
	agent: WorkflowBuilderAgent,
	llm: BaseChatModel,
	testCase: TestCase,
	nodeTypes: INodeTypeDescription[],
	opts?: { userId?: string; featureFlags?: BuilderFeatureFlags },
): Promise<TestResult> {
	const userId = opts?.userId ?? 'test-user';
	try {
		// Generate workflow: drain the chat generator and time the generation.
		const startTime = Date.now();
		await consumeGenerator(
			agent.chat(
				getChatPayload({
					evalType: 'single-eval',
					message: testCase.prompt,
					workflowId: testCase.id,
					featureFlags: opts?.featureFlags,
				}),
				userId,
			),
		);
		const generationTime = Date.now() - startTime;

		// Get generated workflow with validation — the result lives in agent
		// state keyed by (workflowId, userId), not in the chat stream.
		const state = await agent.getState(testCase.id, userId);

		// Validate workflow state
		if (!state.values || !isWorkflowStateValues(state.values)) {
			throw new Error('Invalid workflow state: missing or malformed workflow');
		}

		const generatedWorkflow = state.values.workflowJSON;

		// Extract cache statistics from messages
		const usage = safeExtractUsage(state.values.messages);
		const cacheStats = calculateCacheStats(usage);

		// Evaluate with both evaluators against the same input.
		const evaluationInput: EvaluationInput = {
			userPrompt: testCase.prompt,
			generatedWorkflow,
			referenceWorkflow: testCase.referenceWorkflow,
			referenceWorkflows: testCase.referenceWorkflows,
		};

		const evaluationResult = await evaluateWorkflow(llm, evaluationInput);
		const programmaticEvaluationResult = await programmaticEvaluation(evaluationInput, nodeTypes);

		return {
			testCase,
			generatedWorkflow,
			evaluationResult,
			programmaticEvaluationResult,
			generationTime,
			cacheStats,
		};
	} catch (error) {
		// Convert any failure into a zero-score result instead of propagating.
		return createErrorResult(testCase, error);
	}
}
|
||||
|
||||
/**
|
||||
* Initialize test tracking map
|
||||
* @param testCases - Array of test cases
|
||||
* @returns Map of test ID to status
|
||||
*/
|
||||
export function initializeTestTracking(
|
||||
testCases: TestCase[],
|
||||
): Record<string, 'pending' | 'pass' | 'fail'> {
|
||||
const tracking: Record<string, 'pending' | 'pass' | 'fail'> = {};
|
||||
for (const testCase of testCases) {
|
||||
tracking[testCase.id] = 'pending';
|
||||
}
|
||||
return tracking;
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
/**
|
||||
* Evaluator factories for the v2 evaluation harness.
|
||||
*
|
||||
* Each factory creates an Evaluator that wraps existing evaluation logic.
|
||||
* All evaluators are independent and can run in parallel.
|
||||
*/
|
||||
|
||||
export { createLLMJudgeEvaluator } from './llm-judge';
|
||||
export { createProgrammaticEvaluator } from './programmatic';
|
||||
export {
|
||||
createPairwiseEvaluator,
|
||||
type PairwiseEvaluatorOptions,
|
||||
} from './pairwise';
|
||||
export {
|
||||
createSimilarityEvaluator,
|
||||
type SimilarityEvaluatorOptions,
|
||||
} from './similarity';
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import { z } from 'zod';
|
||||
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow';
|
||||
import type { SimpleWorkflow } from '../../../src/types/workflow';
|
||||
|
||||
// Violation schema
|
||||
const violationSchema = z.object({
|
||||
|
|
@ -81,7 +81,6 @@ export const testCaseSchema = z.object({
|
|||
id: z.string(),
|
||||
name: z.string(),
|
||||
prompt: z.string(),
|
||||
referenceWorkflow: z.custom<SimpleWorkflow>().optional(),
|
||||
referenceWorkflows: z.array(z.custom<SimpleWorkflow>()).optional(),
|
||||
});
|
||||
|
||||
|
|
@ -91,7 +90,6 @@ export type TestCase = z.infer<typeof testCaseSchema>;
|
|||
export const evaluationInputSchema = z.object({
|
||||
userPrompt: z.string(),
|
||||
generatedWorkflow: z.custom<SimpleWorkflow>(),
|
||||
referenceWorkflow: z.custom<SimpleWorkflow>().optional(),
|
||||
referenceWorkflows: z.array(z.custom<SimpleWorkflow>()).optional(),
|
||||
preset: z.enum(['strict', 'standard', 'lenient']).optional(),
|
||||
});
|
||||
|
|
@ -6,7 +6,7 @@ import { RunnableSequence } from '@langchain/core/runnables';
|
|||
import { OperationalError } from 'n8n-workflow';
|
||||
import type { z } from 'zod';
|
||||
|
||||
import type { EvaluationInput } from '../../types/evaluation';
|
||||
import type { EvaluationInput } from '../evaluation';
|
||||
|
||||
type EvaluatorChainInput = {
|
||||
userPrompt: string;
|
||||
|
|
@ -39,9 +39,10 @@ export async function invokeEvaluatorChain<TResult>(
|
|||
input: EvaluationInput,
|
||||
config?: RunnableConfig,
|
||||
): Promise<TResult> {
|
||||
const referenceSection = input.referenceWorkflow
|
||||
? `<reference_workflow>\n${JSON.stringify(input.referenceWorkflow, null, 2)}\n</reference_workflow>`
|
||||
: '';
|
||||
const referenceSection =
|
||||
input.referenceWorkflows && input.referenceWorkflows.length > 0
|
||||
? `<reference_workflows>\n${JSON.stringify(input.referenceWorkflows, null, 2)}\n</reference_workflows>`
|
||||
: '';
|
||||
|
||||
const result = await chain.invoke(
|
||||
{
|
||||
|
|
@ -5,7 +5,7 @@ import { promptCategorizationChain } from '@/chains/prompt-categorization';
|
|||
import { documentation } from '@/tools/best-practices';
|
||||
|
||||
import { createEvaluatorChain } from './base';
|
||||
import type { EvaluationInput } from '../../types/evaluation';
|
||||
import type { EvaluationInput } from '../evaluation';
|
||||
|
||||
// Schema for best practices evaluation result
|
||||
const bestPracticesResultSchema = z.object({
|
||||
|
|
@ -226,9 +226,10 @@ export async function evaluateBestPractices(
|
|||
);
|
||||
|
||||
// Prepare the reference section
|
||||
const referenceSection = input.referenceWorkflow
|
||||
? `<reference_workflow>\n${JSON.stringify(input.referenceWorkflow, null, 2)}\n</reference_workflow>`
|
||||
: '';
|
||||
const referenceSection =
|
||||
input.referenceWorkflows && input.referenceWorkflows.length > 0
|
||||
? `<reference_workflows>\n${JSON.stringify(input.referenceWorkflows, null, 2)}\n</reference_workflows>`
|
||||
: '';
|
||||
|
||||
// Invoke the evaluator chain with best practices
|
||||
const chain = createBestPracticesEvaluatorChain(llm);
|
||||
|
|
@ -2,7 +2,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
|
|||
import { z } from 'zod';
|
||||
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from './base';
|
||||
import type { EvaluationInput } from '../../types/evaluation';
|
||||
import type { EvaluationInput } from '../evaluation';
|
||||
|
||||
// Schema for connections evaluation result
|
||||
const connectionsResultSchema = z.object({
|
||||
|
|
@ -2,7 +2,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
|
|||
import { z } from 'zod';
|
||||
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from './base';
|
||||
import type { EvaluationInput } from '../../types/evaluation';
|
||||
import type { EvaluationInput } from '../evaluation';
|
||||
|
||||
// Schema for data flow evaluation result
|
||||
const dataFlowResultSchema = z.object({
|
||||
|
|
@ -2,7 +2,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
|
|||
import { z } from 'zod';
|
||||
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from './base';
|
||||
import type { EvaluationInput } from '../../types/evaluation';
|
||||
import type { EvaluationInput } from '../evaluation';
|
||||
|
||||
// Schema for efficiency evaluation result
|
||||
const efficiencyResultSchema = z.object({
|
||||
|
|
@ -2,7 +2,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
|
|||
import { z } from 'zod';
|
||||
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from './base';
|
||||
import type { EvaluationInput } from '../../types/evaluation';
|
||||
import type { EvaluationInput } from '../evaluation';
|
||||
|
||||
// Schema for expressions evaluation result
|
||||
const expressionsResultSchema = z.object({
|
||||
|
|
@ -2,7 +2,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
|
|||
import { z } from 'zod';
|
||||
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from './base';
|
||||
import type { EvaluationInput } from '../../types/evaluation';
|
||||
import type { EvaluationInput } from '../evaluation';
|
||||
|
||||
// Schema for functionality evaluation result
|
||||
const functionalityResultSchema = z.object({
|
||||
|
|
@ -2,7 +2,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
|
|||
import { z } from 'zod';
|
||||
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from './base';
|
||||
import type { EvaluationInput } from '../../types/evaluation';
|
||||
import type { EvaluationInput } from '../evaluation';
|
||||
|
||||
// Schema for maintainability evaluation result
|
||||
const maintainabilityResultSchema = z.object({
|
||||
|
|
@ -2,7 +2,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
|
|||
import { z } from 'zod';
|
||||
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from './base';
|
||||
import type { EvaluationInput } from '../../types/evaluation';
|
||||
import type { EvaluationInput } from '../evaluation';
|
||||
|
||||
// Schema for node configuration evaluation result
|
||||
const nodeConfigurationResultSchema = z.object({
|
||||
|
|
@ -0,0 +1,145 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import type { EvaluationInput } from './evaluation';
|
||||
import { evaluateWorkflow } from './workflow-evaluator';
|
||||
import { runWithOptionalLimiter, withTimeout } from '../../harness/evaluation-helpers';
|
||||
import type { EvaluationContext, Evaluator, Feedback } from '../../harness/harness-types';
|
||||
|
||||
const EVALUATOR_NAME = 'llm-judge';
|
||||
|
||||
/**
|
||||
* Violation type from evaluation results.
|
||||
*/
|
||||
interface Violation {
|
||||
type: string;
|
||||
description: string;
|
||||
pointsDeducted: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Format violations as a comment string.
|
||||
*/
|
||||
function formatViolations(violations: Violation[]): string {
|
||||
if (!violations || violations.length === 0) return '';
|
||||
return violations.map((v) => `[${v.type}] ${v.description}`).join('; ');
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an LLM-as-judge evaluator that uses the existing evaluateWorkflow chain.
|
||||
*
|
||||
* @param llm - The LLM to use for evaluation
|
||||
* @param _nodeTypes - Node type descriptions (unused but kept for interface compatibility)
|
||||
* @returns An evaluator that produces feedback from LLM evaluation
|
||||
*/
|
||||
export function createLLMJudgeEvaluator(
|
||||
llm: BaseChatModel,
|
||||
_nodeTypes: INodeTypeDescription[],
|
||||
): Evaluator<EvaluationContext> {
|
||||
const fb = (
|
||||
metric: string,
|
||||
score: number,
|
||||
kind: Feedback['kind'],
|
||||
comment?: string,
|
||||
): Feedback => ({
|
||||
evaluator: EVALUATOR_NAME,
|
||||
metric,
|
||||
score,
|
||||
kind,
|
||||
...(comment ? { comment } : {}),
|
||||
});
|
||||
|
||||
return {
|
||||
name: EVALUATOR_NAME,
|
||||
|
||||
async evaluate(workflow: SimpleWorkflow, ctx: EvaluationContext): Promise<Feedback[]> {
|
||||
const input: EvaluationInput = {
|
||||
userPrompt: ctx.prompt,
|
||||
generatedWorkflow: workflow,
|
||||
};
|
||||
|
||||
const result = await runWithOptionalLimiter(async () => {
|
||||
return await withTimeout({
|
||||
promise: evaluateWorkflow(llm, input),
|
||||
timeoutMs: ctx.timeoutMs,
|
||||
label: 'llm-judge:evaluateWorkflow',
|
||||
});
|
||||
}, ctx.llmCallLimiter);
|
||||
|
||||
return [
|
||||
// Core category scores
|
||||
fb(
|
||||
'functionality',
|
||||
result.functionality.score,
|
||||
'metric',
|
||||
formatViolations(result.functionality.violations),
|
||||
),
|
||||
fb(
|
||||
'connections',
|
||||
result.connections.score,
|
||||
'metric',
|
||||
formatViolations(result.connections.violations),
|
||||
),
|
||||
fb(
|
||||
'expressions',
|
||||
result.expressions.score,
|
||||
'metric',
|
||||
formatViolations(result.expressions.violations),
|
||||
),
|
||||
fb(
|
||||
'nodeConfiguration',
|
||||
result.nodeConfiguration.score,
|
||||
'metric',
|
||||
formatViolations(result.nodeConfiguration.violations),
|
||||
),
|
||||
|
||||
// Efficiency with sub-metrics
|
||||
fb(
|
||||
'efficiency',
|
||||
result.efficiency.score,
|
||||
'metric',
|
||||
formatViolations(result.efficiency.violations),
|
||||
),
|
||||
fb('efficiency.redundancyScore', result.efficiency.redundancyScore, 'detail'),
|
||||
fb('efficiency.pathOptimization', result.efficiency.pathOptimization, 'detail'),
|
||||
fb('efficiency.nodeCountEfficiency', result.efficiency.nodeCountEfficiency, 'detail'),
|
||||
|
||||
// Data flow
|
||||
fb(
|
||||
'dataFlow',
|
||||
result.dataFlow.score,
|
||||
'metric',
|
||||
formatViolations(result.dataFlow.violations),
|
||||
),
|
||||
|
||||
// Maintainability with sub-metrics
|
||||
fb(
|
||||
'maintainability',
|
||||
result.maintainability.score,
|
||||
'metric',
|
||||
formatViolations(result.maintainability.violations),
|
||||
),
|
||||
fb('maintainability.nodeNamingQuality', result.maintainability.nodeNamingQuality, 'detail'),
|
||||
fb(
|
||||
'maintainability.workflowOrganization',
|
||||
result.maintainability.workflowOrganization,
|
||||
'detail',
|
||||
),
|
||||
fb('maintainability.modularity', result.maintainability.modularity, 'detail'),
|
||||
|
||||
// Best practices adherence
|
||||
fb(
|
||||
'bestPractices',
|
||||
result.bestPractices.score,
|
||||
'metric',
|
||||
formatViolations(result.bestPractices.violations),
|
||||
),
|
||||
|
||||
// Overall score
|
||||
fb('overallScore', result.overallScore, 'score', result.summary),
|
||||
];
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
@ -0,0 +1,309 @@
|
|||
import type { EvaluationResult, CategoryScore } from './evaluation';
|
||||
import {
|
||||
calculateWeightedScore,
|
||||
generateEvaluationSummary,
|
||||
identifyCriticalIssues,
|
||||
LLM_JUDGE_CATEGORY_WEIGHTS,
|
||||
TOTAL_WEIGHT_WITHOUT_STRUCTURAL,
|
||||
TOTAL_WEIGHT_WITH_STRUCTURAL,
|
||||
} from './workflow-evaluator';
|
||||
|
||||
/**
|
||||
* Creates a minimal category score for testing.
|
||||
*/
|
||||
function createCategoryScore(
|
||||
score: number,
|
||||
violations: CategoryScore['violations'] = [],
|
||||
): CategoryScore {
|
||||
return { score, violations };
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a complete evaluation result with all scores set to the same value.
|
||||
*/
|
||||
function createUniformResult(score: number): EvaluationResult {
|
||||
return {
|
||||
overallScore: 0,
|
||||
functionality: createCategoryScore(score),
|
||||
connections: createCategoryScore(score),
|
||||
expressions: createCategoryScore(score),
|
||||
nodeConfiguration: createCategoryScore(score),
|
||||
efficiency: {
|
||||
...createCategoryScore(score),
|
||||
redundancyScore: score,
|
||||
pathOptimization: score,
|
||||
nodeCountEfficiency: score,
|
||||
},
|
||||
dataFlow: createCategoryScore(score),
|
||||
maintainability: {
|
||||
...createCategoryScore(score),
|
||||
nodeNamingQuality: score,
|
||||
workflowOrganization: score,
|
||||
modularity: score,
|
||||
},
|
||||
bestPractices: createCategoryScore(score),
|
||||
structuralSimilarity: {
|
||||
score: 0,
|
||||
violations: [],
|
||||
applicable: false,
|
||||
},
|
||||
summary: '',
|
||||
};
|
||||
}
|
||||
|
||||
describe('workflow-evaluator', () => {
|
||||
describe('calculateWeightedScore', () => {
|
||||
it('should return 1.0 when all scores are perfect', () => {
|
||||
const result = createUniformResult(1.0);
|
||||
expect(calculateWeightedScore(result)).toBeCloseTo(1.0, 5);
|
||||
});
|
||||
|
||||
it('should return 0 when all scores are zero', () => {
|
||||
const result = createUniformResult(0);
|
||||
expect(calculateWeightedScore(result)).toBe(0);
|
||||
});
|
||||
|
||||
it('should return 0.5 when all scores are 0.5', () => {
|
||||
const result = createUniformResult(0.5);
|
||||
expect(calculateWeightedScore(result)).toBeCloseTo(0.5, 5);
|
||||
});
|
||||
|
||||
it('should weight functionality at 25%', () => {
|
||||
const result = createUniformResult(0);
|
||||
result.functionality.score = 1.0;
|
||||
const expected = LLM_JUDGE_CATEGORY_WEIGHTS.functionality / TOTAL_WEIGHT_WITHOUT_STRUCTURAL;
|
||||
expect(calculateWeightedScore(result)).toBeCloseTo(expected, 5);
|
||||
});
|
||||
|
||||
it('should weight connections at 15%', () => {
|
||||
const result = createUniformResult(0);
|
||||
result.connections.score = 1.0;
|
||||
const expected = LLM_JUDGE_CATEGORY_WEIGHTS.connections / TOTAL_WEIGHT_WITHOUT_STRUCTURAL;
|
||||
expect(calculateWeightedScore(result)).toBeCloseTo(expected, 5);
|
||||
});
|
||||
|
||||
it('should include structural similarity when applicable', () => {
|
||||
const result = createUniformResult(1.0);
|
||||
result.structuralSimilarity = {
|
||||
score: 0,
|
||||
violations: [],
|
||||
applicable: true,
|
||||
};
|
||||
// With structural similarity at 0, weighted sum = TOTAL_WEIGHT_WITHOUT_STRUCTURAL
|
||||
const expected = TOTAL_WEIGHT_WITHOUT_STRUCTURAL / TOTAL_WEIGHT_WITH_STRUCTURAL;
|
||||
expect(calculateWeightedScore(result)).toBeCloseTo(expected, 5);
|
||||
});
|
||||
|
||||
it('should not include structural similarity when not applicable', () => {
|
||||
const result = createUniformResult(1.0);
|
||||
result.structuralSimilarity = {
|
||||
score: 0.5,
|
||||
violations: [],
|
||||
applicable: false,
|
||||
};
|
||||
// Should still be 1.0 since structural similarity is not counted
|
||||
expect(calculateWeightedScore(result)).toBeCloseTo(1.0, 5);
|
||||
});
|
||||
|
||||
it('should handle mixed scores correctly', () => {
|
||||
const result = createUniformResult(0);
|
||||
result.functionality.score = 1.0;
|
||||
result.connections.score = 0.8;
|
||||
result.expressions.score = 0.6;
|
||||
result.nodeConfiguration.score = 0.4;
|
||||
result.efficiency.score = 0.2;
|
||||
result.dataFlow.score = 0.0;
|
||||
result.maintainability.score = 1.0;
|
||||
result.bestPractices.score = 0.5;
|
||||
|
||||
const w = LLM_JUDGE_CATEGORY_WEIGHTS;
|
||||
const weightedSum =
|
||||
1.0 * w.functionality +
|
||||
0.8 * w.connections +
|
||||
0.6 * w.expressions +
|
||||
0.4 * w.nodeConfiguration +
|
||||
0.2 * w.efficiency +
|
||||
0.0 * w.dataFlow +
|
||||
1.0 * w.maintainability +
|
||||
0.5 * w.bestPractices;
|
||||
const expected = weightedSum / TOTAL_WEIGHT_WITHOUT_STRUCTURAL;
|
||||
|
||||
expect(calculateWeightedScore(result)).toBeCloseTo(expected, 5);
|
||||
});
|
||||
});
|
||||
|
||||
describe('generateEvaluationSummary', () => {
|
||||
it('should list strengths for scores >= 0.8', () => {
|
||||
const result = createUniformResult(0.9);
|
||||
const summary = generateEvaluationSummary(result);
|
||||
|
||||
expect(summary).toContain('strong functional implementation');
|
||||
expect(summary).toContain('well-connected nodes');
|
||||
expect(summary).toContain('correct expression syntax');
|
||||
expect(summary).toContain('well-configured nodes');
|
||||
expect(summary).toContain('proper data flow');
|
||||
expect(summary).toContain('efficient design');
|
||||
expect(summary).toContain('maintainable structure');
|
||||
expect(summary).toContain('follows best practices');
|
||||
});
|
||||
|
||||
it('should list weaknesses for scores < 0.5', () => {
|
||||
const result = createUniformResult(0.3);
|
||||
const summary = generateEvaluationSummary(result);
|
||||
|
||||
expect(summary).toContain('functional gaps');
|
||||
expect(summary).toContain('connection issues');
|
||||
expect(summary).toContain('expression errors');
|
||||
expect(summary).toContain('node configuration issues');
|
||||
expect(summary).toContain('data flow problems');
|
||||
expect(summary).toContain('inefficiencies');
|
||||
expect(summary).toContain('poor maintainability');
|
||||
expect(summary).toContain('deviates from best practices');
|
||||
});
|
||||
|
||||
it('should not list scores between 0.5 and 0.8 as strengths or weaknesses', () => {
|
||||
const result = createUniformResult(0.65);
|
||||
const summary = generateEvaluationSummary(result);
|
||||
|
||||
// Should return the default message since no strengths or weaknesses
|
||||
expect(summary).toBe(
|
||||
'The workflow shows adequate implementation across all evaluated metrics.',
|
||||
);
|
||||
});
|
||||
|
||||
it('should handle mixed scores', () => {
|
||||
const result = createUniformResult(0.65);
|
||||
result.functionality.score = 0.9; // strength
|
||||
result.connections.score = 0.3; // weakness
|
||||
|
||||
const summary = generateEvaluationSummary(result);
|
||||
|
||||
expect(summary).toContain('strong functional implementation');
|
||||
expect(summary).toContain('connection issues');
|
||||
expect(summary).not.toContain('adequate implementation');
|
||||
});
|
||||
|
||||
it('should format summary with proper grammar', () => {
|
||||
const result = createUniformResult(0.65);
|
||||
result.functionality.score = 0.9;
|
||||
result.connections.score = 0.9;
|
||||
|
||||
const summary = generateEvaluationSummary(result);
|
||||
|
||||
expect(summary).toMatch(/^The workflow demonstrates .+\.$/);
|
||||
expect(summary).toContain(', '); // Multiple strengths should be comma-separated
|
||||
});
|
||||
|
||||
it('should include "Key areas for improvement" for weaknesses', () => {
|
||||
const result = createUniformResult(0.65);
|
||||
result.functionality.score = 0.3;
|
||||
|
||||
const summary = generateEvaluationSummary(result);
|
||||
|
||||
expect(summary).toContain('Key areas for improvement include');
|
||||
});
|
||||
});
|
||||
|
||||
describe('identifyCriticalIssues', () => {
|
||||
it('should return undefined when no critical violations exist', () => {
|
||||
const result = createUniformResult(0.5);
|
||||
result.functionality.violations = [
|
||||
{ type: 'major', description: 'Some major issue', pointsDeducted: 20 },
|
||||
{ type: 'minor', description: 'Some minor issue', pointsDeducted: 5 },
|
||||
];
|
||||
|
||||
expect(identifyCriticalIssues(result)).toBeUndefined();
|
||||
});
|
||||
|
||||
it('should extract critical violations from all categories', () => {
|
||||
const result = createUniformResult(0.5);
|
||||
result.functionality.violations = [
|
||||
{ type: 'critical', description: 'Missing trigger', pointsDeducted: 50 },
|
||||
];
|
||||
result.connections.violations = [
|
||||
{ type: 'critical', description: 'Disconnected node', pointsDeducted: 40 },
|
||||
];
|
||||
|
||||
const issues = identifyCriticalIssues(result);
|
||||
|
||||
expect(issues).toHaveLength(2);
|
||||
expect(issues).toContain('[functionality] Missing trigger');
|
||||
expect(issues).toContain('[connections] Disconnected node');
|
||||
});
|
||||
|
||||
it('should only include critical violations, not major or minor', () => {
|
||||
const result = createUniformResult(0.5);
|
||||
result.functionality.violations = [
|
||||
{ type: 'critical', description: 'Critical issue', pointsDeducted: 50 },
|
||||
{ type: 'major', description: 'Major issue', pointsDeducted: 20 },
|
||||
{ type: 'minor', description: 'Minor issue', pointsDeducted: 5 },
|
||||
];
|
||||
|
||||
const issues = identifyCriticalIssues(result);
|
||||
|
||||
expect(issues).toHaveLength(1);
|
||||
expect(issues).toContain('[functionality] Critical issue');
|
||||
});
|
||||
|
||||
it('should handle multiple critical violations in same category', () => {
|
||||
const result = createUniformResult(0.5);
|
||||
result.functionality.violations = [
|
||||
{ type: 'critical', description: 'First critical', pointsDeducted: 50 },
|
||||
{ type: 'critical', description: 'Second critical', pointsDeducted: 40 },
|
||||
];
|
||||
|
||||
const issues = identifyCriticalIssues(result);
|
||||
|
||||
expect(issues).toHaveLength(2);
|
||||
expect(issues).toContain('[functionality] First critical');
|
||||
expect(issues).toContain('[functionality] Second critical');
|
||||
});
|
||||
|
||||
it('should check all eight evaluation categories', () => {
|
||||
const result = createUniformResult(0.5);
|
||||
|
||||
// Add a critical violation to each category
|
||||
result.functionality.violations = [
|
||||
{ type: 'critical', description: 'func issue', pointsDeducted: 50 },
|
||||
];
|
||||
result.connections.violations = [
|
||||
{ type: 'critical', description: 'conn issue', pointsDeducted: 50 },
|
||||
];
|
||||
result.expressions.violations = [
|
||||
{ type: 'critical', description: 'expr issue', pointsDeducted: 50 },
|
||||
];
|
||||
result.nodeConfiguration.violations = [
|
||||
{ type: 'critical', description: 'config issue', pointsDeducted: 50 },
|
||||
];
|
||||
result.efficiency.violations = [
|
||||
{ type: 'critical', description: 'eff issue', pointsDeducted: 50 },
|
||||
];
|
||||
result.dataFlow.violations = [
|
||||
{ type: 'critical', description: 'flow issue', pointsDeducted: 50 },
|
||||
];
|
||||
result.maintainability.violations = [
|
||||
{ type: 'critical', description: 'maint issue', pointsDeducted: 50 },
|
||||
];
|
||||
result.bestPractices.violations = [
|
||||
{ type: 'critical', description: 'bp issue', pointsDeducted: 50 },
|
||||
];
|
||||
|
||||
const issues = identifyCriticalIssues(result);
|
||||
|
||||
expect(issues).toHaveLength(8);
|
||||
expect(issues).toContain('[functionality] func issue');
|
||||
expect(issues).toContain('[connections] conn issue');
|
||||
expect(issues).toContain('[expressions] expr issue');
|
||||
expect(issues).toContain('[nodeConfiguration] config issue');
|
||||
expect(issues).toContain('[efficiency] eff issue');
|
||||
expect(issues).toContain('[dataFlow] flow issue');
|
||||
expect(issues).toContain('[maintainability] maint issue');
|
||||
expect(issues).toContain('[bestPractices] bp issue');
|
||||
});
|
||||
|
||||
it('should return undefined for empty violations arrays', () => {
|
||||
const result = createUniformResult(1.0);
|
||||
expect(identifyCriticalIssues(result)).toBeUndefined();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
|
||||
import type { EvaluationInput, EvaluationResult } from './evaluation';
|
||||
import {
|
||||
evaluateFunctionality,
|
||||
evaluateConnections,
|
||||
|
|
@ -10,7 +11,49 @@ import {
|
|||
evaluateMaintainability,
|
||||
evaluateBestPractices,
|
||||
} from './evaluators';
|
||||
import type { EvaluationInput, EvaluationResult } from '../types/evaluation';
|
||||
|
||||
/**
|
||||
* Weights for each LLM-judge evaluation category used in overall score calculation.
|
||||
*
|
||||
* This is evaluator-internal weighting, and is independent from the harness-level
|
||||
* cross-evaluator weighting in `evaluations/score-calculator.ts`.
|
||||
* Exported for use in tests.
|
||||
*/
|
||||
export const LLM_JUDGE_CATEGORY_WEIGHTS = {
|
||||
functionality: 0.25,
|
||||
connections: 0.15,
|
||||
expressions: 0.15,
|
||||
nodeConfiguration: 0.15,
|
||||
efficiency: 0.1,
|
||||
dataFlow: 0.1,
|
||||
maintainability: 0.05,
|
||||
bestPractices: 0.1,
|
||||
structuralSimilarity: 0.05,
|
||||
} as const;
|
||||
|
||||
/**
|
||||
* @deprecated Use `LLM_JUDGE_CATEGORY_WEIGHTS` (kept for backwards compatibility within the package).
|
||||
*/
|
||||
export const EVALUATION_WEIGHTS = LLM_JUDGE_CATEGORY_WEIGHTS;
|
||||
|
||||
/**
|
||||
* Total weight when structural similarity is not applicable.
|
||||
*/
|
||||
export const TOTAL_WEIGHT_WITHOUT_STRUCTURAL =
|
||||
LLM_JUDGE_CATEGORY_WEIGHTS.functionality +
|
||||
LLM_JUDGE_CATEGORY_WEIGHTS.connections +
|
||||
LLM_JUDGE_CATEGORY_WEIGHTS.expressions +
|
||||
LLM_JUDGE_CATEGORY_WEIGHTS.nodeConfiguration +
|
||||
LLM_JUDGE_CATEGORY_WEIGHTS.efficiency +
|
||||
LLM_JUDGE_CATEGORY_WEIGHTS.dataFlow +
|
||||
LLM_JUDGE_CATEGORY_WEIGHTS.maintainability +
|
||||
LLM_JUDGE_CATEGORY_WEIGHTS.bestPractices;
|
||||
|
||||
/**
|
||||
* Total weight when structural similarity is applicable.
|
||||
*/
|
||||
export const TOTAL_WEIGHT_WITH_STRUCTURAL =
|
||||
TOTAL_WEIGHT_WITHOUT_STRUCTURAL + LLM_JUDGE_CATEGORY_WEIGHTS.structuralSimilarity;
|
||||
|
||||
/**
|
||||
* Calculate weighted score for the overall evaluation
|
||||
|
|
@ -18,45 +61,25 @@ import type { EvaluationInput, EvaluationResult } from '../types/evaluation';
|
|||
* @returns Weighted overall score
|
||||
*/
|
||||
export function calculateWeightedScore(result: EvaluationResult): number {
|
||||
// Define weights for each category
|
||||
const weights = {
|
||||
functionality: 0.25,
|
||||
connections: 0.15,
|
||||
expressions: 0.15,
|
||||
nodeConfiguration: 0.15,
|
||||
efficiency: 0.1,
|
||||
dataFlow: 0.1,
|
||||
maintainability: 0.05,
|
||||
bestPractices: 0.1,
|
||||
const w = LLM_JUDGE_CATEGORY_WEIGHTS;
|
||||
|
||||
// Structural similarity (5% if applicable)
|
||||
structuralSimilarity: 0.05,
|
||||
};
|
||||
// Calculate weighted sum for all categories
|
||||
const weightedSum =
|
||||
result.functionality.score * w.functionality +
|
||||
result.connections.score * w.connections +
|
||||
result.expressions.score * w.expressions +
|
||||
result.nodeConfiguration.score * w.nodeConfiguration +
|
||||
result.efficiency.score * w.efficiency +
|
||||
result.dataFlow.score * w.dataFlow +
|
||||
result.maintainability.score * w.maintainability +
|
||||
result.bestPractices.score * w.bestPractices +
|
||||
(result.structuralSimilarity?.applicable
|
||||
? result.structuralSimilarity.score * w.structuralSimilarity
|
||||
: 0);
|
||||
|
||||
let totalWeight = 0;
|
||||
let weightedSum = 0;
|
||||
|
||||
// Add scores for core categories (always present)
|
||||
weightedSum += result.functionality.score * weights.functionality;
|
||||
weightedSum += result.connections.score * weights.connections;
|
||||
weightedSum += result.expressions.score * weights.expressions;
|
||||
weightedSum += result.nodeConfiguration.score * weights.nodeConfiguration;
|
||||
totalWeight +=
|
||||
weights.functionality + weights.connections + weights.expressions + weights.nodeConfiguration;
|
||||
|
||||
// Add scores for new metrics (now required)
|
||||
weightedSum += result.efficiency.score * weights.efficiency;
|
||||
weightedSum += result.dataFlow.score * weights.dataFlow;
|
||||
weightedSum += result.maintainability.score * weights.maintainability;
|
||||
weightedSum += result.bestPractices.score * weights.bestPractices;
|
||||
totalWeight +=
|
||||
weights.efficiency + weights.dataFlow + weights.maintainability + weights.bestPractices;
|
||||
|
||||
// Add structural similarity only if applicable
|
||||
if (result.structuralSimilarity?.applicable) {
|
||||
weightedSum += result.structuralSimilarity.score * weights.structuralSimilarity;
|
||||
totalWeight += weights.structuralSimilarity;
|
||||
}
|
||||
const totalWeight = result.structuralSimilarity?.applicable
|
||||
? TOTAL_WEIGHT_WITH_STRUCTURAL
|
||||
: TOTAL_WEIGHT_WITHOUT_STRUCTURAL;
|
||||
|
||||
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
||||
}
|
||||
|
|
@ -66,7 +89,7 @@ export function calculateWeightedScore(result: EvaluationResult): number {
|
|||
* @param result - Complete evaluation result
|
||||
* @returns Summary string describing strengths and weaknesses
|
||||
*/
|
||||
function generateEvaluationSummary(result: EvaluationResult): string {
|
||||
export function generateEvaluationSummary(result: EvaluationResult): string {
|
||||
const strengths: string[] = [];
|
||||
const weaknesses: string[] = [];
|
||||
|
||||
|
|
@ -116,7 +139,7 @@ function generateEvaluationSummary(result: EvaluationResult): string {
|
|||
* @param result - Complete evaluation result
|
||||
* @returns Array of critical issues if any
|
||||
*/
|
||||
function identifyCriticalIssues(result: EvaluationResult): string[] | undefined {
|
||||
export function identifyCriticalIssues(result: EvaluationResult): string[] | undefined {
|
||||
const criticalIssues: string[] = [];
|
||||
|
||||
// Check all categories for critical violations
|
||||
|
|
@ -0,0 +1,270 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import { runJudgePanel, type EvalCriteria } from './judge-panel';
|
||||
import { PAIRWISE_METRICS } from './metrics';
|
||||
import { runWithOptionalLimiter, withTimeout } from '../../harness/evaluation-helpers';
|
||||
import type { EvaluationContext, Evaluator, Feedback } from '../../harness/harness-types';
|
||||
import { aggregateGenerations, type GenerationDetail } from '../../harness/multi-gen';
|
||||
|
||||
type MultiGenContext = EvaluationContext & {
|
||||
generateWorkflow: (prompt: string) => Promise<SimpleWorkflow>;
|
||||
};
|
||||
|
||||
function assertMultiGenContext(ctx: EvaluationContext): asserts ctx is MultiGenContext {
|
||||
if (!ctx.generateWorkflow || !ctx.prompt) {
|
||||
throw new Error('Multi-gen requires generateWorkflow and prompt in context');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Options for creating a pairwise evaluator.
|
||||
*/
|
||||
export interface PairwiseEvaluatorOptions {
|
||||
/** Number of judges to run (default: 3) */
|
||||
numJudges?: number;
|
||||
/** Number of workflow generations (default: 1, no multi-gen) */
|
||||
numGenerations?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluate a single workflow with a judge panel.
|
||||
* Returns feedback with majority pass, diagnostic score, and per-judge results.
|
||||
*/
|
||||
async function evaluateSingleGeneration(
|
||||
llm: BaseChatModel,
|
||||
workflow: SimpleWorkflow,
|
||||
ctx: EvaluationContext,
|
||||
numJudges: number,
|
||||
): Promise<Feedback[]> {
|
||||
const evalCriteria: EvalCriteria = {
|
||||
dos: ctx?.dos,
|
||||
donts: ctx?.donts,
|
||||
};
|
||||
|
||||
const result = await runJudgePanel(llm, workflow, evalCriteria, numJudges, {
|
||||
llmCallLimiter: ctx.llmCallLimiter,
|
||||
timeoutMs: ctx.timeoutMs,
|
||||
});
|
||||
|
||||
const feedback: Feedback[] = [];
|
||||
|
||||
const totalViolations = result.judgeResults.reduce((sum, r) => sum + r.violations.length, 0);
|
||||
const totalPasses = result.judgeResults.reduce((sum, r) => sum + r.passes.length, 0);
|
||||
|
||||
// v1-compatible single-generation metrics
|
||||
feedback.push({
|
||||
evaluator: 'pairwise',
|
||||
metric: PAIRWISE_METRICS.PAIRWISE_PRIMARY,
|
||||
score: result.majorityPass ? 1 : 0,
|
||||
kind: 'score',
|
||||
comment: `${result.primaryPasses}/${numJudges} judges passed`,
|
||||
});
|
||||
feedback.push({
|
||||
evaluator: 'pairwise',
|
||||
metric: PAIRWISE_METRICS.PAIRWISE_DIAGNOSTIC,
|
||||
score: result.avgDiagnosticScore,
|
||||
kind: 'metric',
|
||||
});
|
||||
feedback.push({
|
||||
evaluator: 'pairwise',
|
||||
metric: PAIRWISE_METRICS.PAIRWISE_JUDGES_PASSED,
|
||||
score: result.primaryPasses,
|
||||
kind: 'detail',
|
||||
});
|
||||
feedback.push({
|
||||
evaluator: 'pairwise',
|
||||
metric: PAIRWISE_METRICS.PAIRWISE_TOTAL_PASSES,
|
||||
score: totalPasses,
|
||||
kind: 'detail',
|
||||
});
|
||||
feedback.push({
|
||||
evaluator: 'pairwise',
|
||||
metric: PAIRWISE_METRICS.PAIRWISE_TOTAL_VIOLATIONS,
|
||||
score: totalViolations,
|
||||
kind: 'detail',
|
||||
});
|
||||
|
||||
// Individual judge results
|
||||
for (let i = 0; i < result.judgeResults.length; i++) {
|
||||
const judge = result.judgeResults[i];
|
||||
// Build comment from violations
|
||||
const violationSummary =
|
||||
judge.violations.length > 0
|
||||
? judge.violations.map((v) => `[${v.rule}] ${v.justification}`).join('; ')
|
||||
: undefined;
|
||||
feedback.push({
|
||||
evaluator: 'pairwise',
|
||||
metric: `judge${i + 1}`,
|
||||
score: judge.primaryPass ? 1 : 0,
|
||||
kind: 'detail',
|
||||
comment: violationSummary,
|
||||
});
|
||||
}
|
||||
|
||||
return feedback;
|
||||
}
|
||||
|
||||
/**
 * Evaluate multiple workflow generations with a judge panel each.
 * Returns aggregated feedback with generation correctness and per-generation details.
 */
async function evaluateMultiGeneration(
	llm: BaseChatModel,
	ctx: MultiGenContext,
	numJudges: number,
	numGenerations: number,
): Promise<Feedback[]> {
	const evalCriteria: EvalCriteria = {
		dos: ctx?.dos,
		donts: ctx?.donts,
	};

	// Generate all workflows and evaluate in parallel
	const generationRuns = await Promise.all(
		Array.from({ length: numGenerations }, async (_, i) => {
			// Generation goes through the shared limiter and (best-effort) timeout
			// so overall LLM concurrency stays bounded across the harness.
			const workflow = await runWithOptionalLimiter(async () => {
				return await withTimeout({
					promise: ctx.generateWorkflow(ctx.prompt),
					timeoutMs: ctx.timeoutMs,
					label: 'pairwise:workflow_generation',
				});
			}, ctx.llmCallLimiter);
			// The panel tags results with the 1-based generation index for tracing.
			const result = await runJudgePanel(llm, workflow, evalCriteria, numJudges, {
				generationIndex: i + 1,
				llmCallLimiter: ctx.llmCallLimiter,
				timeoutMs: ctx.timeoutMs,
			});
			return { workflow, result };
		}),
	);

	// Flatten each run into the shape expected by aggregateGenerations.
	const generationDetails: GenerationDetail[] = generationRuns.map(({ workflow, result }) => ({
		workflow,
		majorityPass: result.majorityPass,
		diagnosticScore: result.avgDiagnosticScore,
		primaryPasses: result.primaryPasses,
		numJudges,
	}));

	// Aggregate results
	const aggregation = aggregateGenerations(generationDetails);

	// Build feedback
	const feedback: Feedback[] = [
		{
			evaluator: 'pairwise',
			metric: PAIRWISE_METRICS.PAIRWISE_GENERATION_CORRECTNESS,
			score: aggregation.generationCorrectness,
			kind: 'score',
			comment: `${aggregation.passingGenerations}/${aggregation.totalGenerations} generations passed`,
		},
		{
			evaluator: 'pairwise',
			metric: PAIRWISE_METRICS.PAIRWISE_AGGREGATED_DIAGNOSTIC,
			score: aggregation.aggregatedDiagnosticScore,
			kind: 'metric',
		},
		{
			evaluator: 'pairwise',
			metric: PAIRWISE_METRICS.PAIRWISE_GENERATIONS_PASSED,
			score: aggregation.passingGenerations,
			kind: 'detail',
		},
		{
			evaluator: 'pairwise',
			metric: PAIRWISE_METRICS.PAIRWISE_TOTAL_JUDGE_CALLS,
			score: aggregation.totalGenerations * numJudges,
			kind: 'detail',
		},
	];

	// Per-judge details (one set per generation)
	generationRuns.forEach(({ result }, genIndex) => {
		for (let i = 0; i < result.judgeResults.length; i++) {
			const judge = result.judgeResults[i];
			const violationSummary =
				judge.violations.length > 0
					? judge.violations.map((v) => `[${v.rule}] ${v.justification}`).join('; ')
					: undefined;

			feedback.push({
				evaluator: 'pairwise',
				metric: `gen${genIndex + 1}.judge${i + 1}`,
				score: judge.primaryPass ? 1 : 0,
				kind: 'detail',
				comment: violationSummary,
			});
		}
	});

	// Per-generation details
	aggregation.generationDetails.forEach((gen, i) => {
		feedback.push({
			evaluator: 'pairwise',
			metric: `gen${i + 1}.majorityPass`,
			score: gen.majorityPass ? 1 : 0,
			kind: 'detail',
			comment: `${gen.primaryPasses}/${gen.numJudges} judges`,
		});
		feedback.push({
			evaluator: 'pairwise',
			metric: `gen${i + 1}.diagnosticScore`,
			score: gen.diagnosticScore,
			kind: 'detail',
		});
	});

	return feedback;
}
|
||||
|
||||
/**
|
||||
* Create a pairwise evaluator that uses a panel of judges.
|
||||
* Each judge evaluates the workflow against dos/donts criteria.
|
||||
*
|
||||
* For single generation (default): evaluates the provided workflow.
|
||||
* For multi-generation: generates N workflows and aggregates results.
|
||||
*
|
||||
* @param llm - Language model for evaluation
|
||||
* @param options - Configuration options
|
||||
* @returns An evaluator that produces feedback from pairwise evaluation
|
||||
*
|
||||
* @example Single generation
|
||||
* ```typescript
|
||||
* const evaluator = createPairwiseEvaluator(llm, { numJudges: 3 });
|
||||
* const feedback = await evaluator.evaluate(workflow, { dos, donts });
|
||||
* ```
|
||||
*
|
||||
* @example Multi-generation
|
||||
* ```typescript
|
||||
* const evaluator = createPairwiseEvaluator(llm, { numJudges: 3, numGenerations: 5 });
|
||||
* const feedback = await evaluator.evaluate(workflow, {
|
||||
* dos, donts,
|
||||
* prompt: "Create a workflow...",
|
||||
* generateWorkflow: async (p) => agent.generate(p),
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
export function createPairwiseEvaluator(
|
||||
llm: BaseChatModel,
|
||||
options?: PairwiseEvaluatorOptions,
|
||||
): Evaluator<EvaluationContext> {
|
||||
const numJudges = options?.numJudges ?? 3;
|
||||
const numGenerations = options?.numGenerations ?? 1;
|
||||
|
||||
return {
|
||||
name: 'pairwise',
|
||||
|
||||
async evaluate(workflow: SimpleWorkflow, ctx: EvaluationContext): Promise<Feedback[]> {
|
||||
// Single generation (existing behavior)
|
||||
if (numGenerations === 1) {
|
||||
return await evaluateSingleGeneration(llm, workflow, ctx, numJudges);
|
||||
}
|
||||
|
||||
// Multi-generation - validate required context
|
||||
assertMultiGenContext(ctx);
|
||||
return await evaluateMultiGeneration(llm, ctx, numJudges, numGenerations);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
@ -1,20 +1,19 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { mock } from 'jest-mock-extended';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import { evaluateWorkflowPairwise, type PairwiseEvaluationInput } from './judge-chain';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow';
|
||||
import * as baseEvaluator from '../chains/evaluators/base';
|
||||
import * as baseEvaluator from '../llm-judge/evaluators/base';
|
||||
|
||||
// Mock the base evaluator module
|
||||
jest.mock('../chains/evaluators/base', () => ({
|
||||
jest.mock('../llm-judge/evaluators/base', () => ({
|
||||
createEvaluatorChain: jest.fn(),
|
||||
invokeEvaluatorChain: jest.fn(),
|
||||
}));
|
||||
|
||||
describe('evaluateWorkflowPairwise', () => {
|
||||
const mockLlm = {
|
||||
bindTools: jest.fn(),
|
||||
withStructuredOutput: jest.fn(),
|
||||
} as unknown as BaseChatModel;
|
||||
const mockLlm = mock<BaseChatModel>();
|
||||
|
||||
const mockWorkflow: SimpleWorkflow = {
|
||||
nodes: [],
|
||||
|
|
@ -43,7 +42,7 @@ describe('evaluateWorkflowPairwise', () => {
|
|||
],
|
||||
};
|
||||
|
||||
(baseEvaluator.invokeEvaluatorChain as jest.Mock).mockResolvedValue(mockResult);
|
||||
jest.mocked(baseEvaluator.invokeEvaluatorChain).mockResolvedValue(mockResult);
|
||||
|
||||
const result = await evaluateWorkflowPairwise(mockLlm, input);
|
||||
|
||||
|
|
@ -74,7 +73,7 @@ describe('evaluateWorkflowPairwise', () => {
|
|||
passes: [{ rule: 'Do this', justification: 'Done' }],
|
||||
};
|
||||
|
||||
(baseEvaluator.invokeEvaluatorChain as jest.Mock).mockResolvedValue(mockResult);
|
||||
jest.mocked(baseEvaluator.invokeEvaluatorChain).mockResolvedValue(mockResult);
|
||||
|
||||
const result = await evaluateWorkflowPairwise(mockLlm, input);
|
||||
|
||||
|
|
@ -88,7 +87,7 @@ describe('evaluateWorkflowPairwise', () => {
|
|||
passes: [],
|
||||
};
|
||||
|
||||
(baseEvaluator.invokeEvaluatorChain as jest.Mock).mockResolvedValue(mockResult);
|
||||
jest.mocked(baseEvaluator.invokeEvaluatorChain).mockResolvedValue(mockResult);
|
||||
|
||||
const result = await evaluateWorkflowPairwise(mockLlm, input);
|
||||
|
||||
|
|
@ -2,10 +2,10 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
|
|||
import type { RunnableConfig } from '@langchain/core/runnables';
|
||||
import { z } from 'zod';
|
||||
|
||||
import type { EvalCriteria } from './types';
|
||||
import { prompt } from '../../src/prompts/builder';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow';
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from '../chains/evaluators/base';
|
||||
import type { EvalCriteria } from './judge-panel';
|
||||
import { prompt } from '../../../src/prompts/builder';
|
||||
import type { SimpleWorkflow } from '../../../src/types/workflow';
|
||||
import { createEvaluatorChain, invokeEvaluatorChain } from '../llm-judge/evaluators/base';
|
||||
|
||||
export interface PairwiseEvaluationInput {
|
||||
evalCriteria: EvalCriteria;
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { mock } from 'jest-mock-extended';
import pLimit from 'p-limit';

import type { SimpleWorkflow } from '@/types/workflow';

import { runJudgePanel } from './judge-panel';

// Captured in a variable so the hoisted jest.mock factory below can delegate
// to it and the test can assert on call counts / swap implementations.
const mockEvaluateWorkflowPairwise = jest.fn();

jest.mock('./judge-chain', () => ({
	evaluateWorkflowPairwise: (...args: unknown[]): unknown => mockEvaluateWorkflowPairwise(...args),
}));

// Minimal empty workflow fixture; the limiter behavior under test never inspects nodes.
function createMockWorkflow(name = 'Test Workflow'): SimpleWorkflow {
	return { name, nodes: [], connections: {} };
}

describe('runJudgePanel()', () => {
	beforeEach(() => {
		jest.clearAllMocks();
	});

	it('should respect llmCallLimiter concurrency', async () => {
		// Track how many judge calls run simultaneously to verify the limiter cap.
		let active = 0;
		let maxActive = 0;

		mockEvaluateWorkflowPairwise.mockImplementation(async () => {
			active++;
			maxActive = Math.max(maxActive, active);
			// Small delay so overlapping calls are observable.
			await new Promise((r) => setTimeout(r, 20));
			active--;
			return { violations: [], passes: [], primaryPass: true, diagnosticScore: 1 };
		});

		const llm = mock<BaseChatModel>();
		const workflow = createMockWorkflow();

		// 5 judges with a concurrency cap of 2: all must run, never more than 2 at once.
		await runJudgePanel(llm, workflow, { dos: 'Do X', donts: 'Do not Y' }, 5, {
			llmCallLimiter: pLimit(2),
		});

		expect(maxActive).toBeLessThanOrEqual(2);
		expect(mockEvaluateWorkflowPairwise).toHaveBeenCalledTimes(5);
	});
});
|
||||
|
|
@ -1,18 +1,34 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
|
||||
import { evaluateWorkflowPairwise, type PairwiseEvaluationResult } from './judge-chain';
|
||||
import type { EvalCriteria } from './types';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow';
|
||||
import type { SimpleWorkflow } from '../../../src/types/workflow';
|
||||
import { runWithOptionalLimiter, withTimeout } from '../../harness/evaluation-helpers';
|
||||
import type { EvaluationContext } from '../../harness/harness-types';
|
||||
|
||||
// ============================================================================
|
||||
// Types
|
||||
// ============================================================================
|
||||
|
||||
/** Evaluation criteria - at least one of dos or donts should be provided */
|
||||
export interface EvalCriteria {
|
||||
dos?: string;
|
||||
donts?: string;
|
||||
}
|
||||
|
||||
export interface JudgePanelTiming {
|
||||
/** Total time for all judges in milliseconds */
|
||||
totalMs: number;
|
||||
/** Time per judge in milliseconds */
|
||||
perJudgeMs: number[];
|
||||
}
|
||||
|
||||
export interface JudgePanelResult {
|
||||
judgeResults: PairwiseEvaluationResult[];
|
||||
primaryPasses: number;
|
||||
majorityPass: boolean;
|
||||
avgDiagnosticScore: number;
|
||||
/** Timing information (only populated when timing is tracked) */
|
||||
timing?: JudgePanelTiming;
|
||||
}
|
||||
|
||||
export interface GenerationResult extends JudgePanelResult {
|
||||
|
|
@ -57,6 +73,10 @@ export interface JudgePanelOptions {
|
|||
generationIndex?: number;
|
||||
/** Experiment name for metadata */
|
||||
experimentName?: string;
|
||||
/** Optional limiter for LLM calls (shared across harness) */
|
||||
llmCallLimiter?: EvaluationContext['llmCallLimiter'];
|
||||
/** Optional timeout for each judge call */
|
||||
timeoutMs?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -77,26 +97,48 @@ export async function runJudgePanel(
|
|||
numJudges: number,
|
||||
options?: JudgePanelOptions,
|
||||
): Promise<JudgePanelResult> {
|
||||
const { generationIndex, experimentName } = options ?? {};
|
||||
const { generationIndex, experimentName, llmCallLimiter, timeoutMs } = options ?? {};
|
||||
const panelStartTime = Date.now();
|
||||
|
||||
// Run all judges in parallel
|
||||
// Run all judges in parallel, tracking timing for each
|
||||
const judgeTimings: number[] = [];
|
||||
const judgeResults = await Promise.all(
|
||||
Array.from({ length: numJudges }, async (_, judgeIndex) => {
|
||||
return await evaluateWorkflowPairwise(
|
||||
llm,
|
||||
{ workflowJSON: workflow, evalCriteria },
|
||||
{
|
||||
runName: `judge_${judgeIndex + 1}`,
|
||||
metadata: {
|
||||
...(experimentName && { experiment_name: experimentName }),
|
||||
...(generationIndex && { evaluating_generation: `generation_${generationIndex}` }),
|
||||
},
|
||||
},
|
||||
);
|
||||
const runJudge = async (): Promise<PairwiseEvaluationResult> => {
|
||||
const judgeStartTime = Date.now();
|
||||
const result = await withTimeout({
|
||||
promise: evaluateWorkflowPairwise(
|
||||
llm,
|
||||
{ workflowJSON: workflow, evalCriteria },
|
||||
{
|
||||
runName: `judge_${judgeIndex + 1}`,
|
||||
metadata: {
|
||||
...(experimentName && { experiment_name: experimentName }),
|
||||
...(generationIndex && { evaluating_generation: `generation_${generationIndex}` }),
|
||||
},
|
||||
},
|
||||
),
|
||||
timeoutMs,
|
||||
label: `pairwise:judge${judgeIndex + 1}`,
|
||||
});
|
||||
judgeTimings[judgeIndex] = Date.now() - judgeStartTime;
|
||||
return result;
|
||||
};
|
||||
|
||||
return await runWithOptionalLimiter(runJudge, llmCallLimiter);
|
||||
}),
|
||||
);
|
||||
|
||||
return aggregateJudgeResults(judgeResults, numJudges);
|
||||
const totalMs = Date.now() - panelStartTime;
|
||||
const aggregated = aggregateJudgeResults(judgeResults, numJudges);
|
||||
|
||||
return {
|
||||
...aggregated,
|
||||
timing: {
|
||||
totalMs,
|
||||
perJudgeMs: judgeTimings,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
/**
 * Stable metric names emitted by the pairwise evaluator.
 * Values use the historical `pairwise_*` naming so runs stay comparable over time.
 */
export const PAIRWISE_METRICS = {
	// Single generation metrics
	PAIRWISE_DIAGNOSTIC: 'pairwise_diagnostic',
	PAIRWISE_JUDGES_PASSED: 'pairwise_judges_passed',
	PAIRWISE_PRIMARY: 'pairwise_primary',
	PAIRWISE_TOTAL_PASSES: 'pairwise_total_passes',
	PAIRWISE_TOTAL_VIOLATIONS: 'pairwise_total_violations',

	// Multi-generation metrics
	PAIRWISE_AGGREGATED_DIAGNOSTIC: 'pairwise_aggregated_diagnostic',
	PAIRWISE_GENERATION_CORRECTNESS: 'pairwise_generation_correctness',
	PAIRWISE_GENERATIONS_PASSED: 'pairwise_generations_passed',
	PAIRWISE_TOTAL_JUDGE_CALLS: 'pairwise_total_judge_calls',
} as const;
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import type { EvaluationContext, Evaluator, Feedback } from '../../harness/harness-types';
|
||||
import { programmaticEvaluation } from '../../programmatic/programmatic-evaluation';
|
||||
|
||||
/**
|
||||
* Format violations as a comment string.
|
||||
*/
|
||||
function formatViolations(
|
||||
violations: Array<{ type: string; description: string }>,
|
||||
): string | undefined {
|
||||
if (!violations || violations.length === 0) return undefined;
|
||||
return violations.map((v) => `[${v.type}] ${v.description}`).join('; ');
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a programmatic evaluator that runs rule-based checks.
|
||||
* This doesn't require an LLM - it uses static analysis.
|
||||
*
|
||||
* @param nodeTypes - Node type descriptions for validation
|
||||
* @returns An evaluator that produces feedback from programmatic checks
|
||||
*/
|
||||
export function createProgrammaticEvaluator(
|
||||
nodeTypes: INodeTypeDescription[],
|
||||
): Evaluator<EvaluationContext> {
|
||||
const fb = (
|
||||
metric: string,
|
||||
score: number,
|
||||
kind: Feedback['kind'],
|
||||
comment?: string,
|
||||
): Feedback => ({
|
||||
evaluator: 'programmatic',
|
||||
metric,
|
||||
score,
|
||||
kind,
|
||||
...(comment ? { comment } : {}),
|
||||
});
|
||||
|
||||
return {
|
||||
name: 'programmatic',
|
||||
|
||||
async evaluate(workflow: SimpleWorkflow, ctx: EvaluationContext): Promise<Feedback[]> {
|
||||
const result = await programmaticEvaluation(
|
||||
{
|
||||
userPrompt: ctx.prompt,
|
||||
generatedWorkflow: workflow,
|
||||
referenceWorkflows: ctx.referenceWorkflows,
|
||||
},
|
||||
nodeTypes,
|
||||
);
|
||||
|
||||
const feedback: Feedback[] = [
|
||||
// Overall programmatic score (scoring)
|
||||
fb('overall', result.overallScore, 'score'),
|
||||
// Stable category metrics (dashboard)
|
||||
fb(
|
||||
'connections',
|
||||
result.connections.score,
|
||||
'metric',
|
||||
formatViolations(result.connections.violations),
|
||||
),
|
||||
fb('nodes', result.nodes.score, 'metric', formatViolations(result.nodes.violations)),
|
||||
fb('trigger', result.trigger.score, 'metric', formatViolations(result.trigger.violations)),
|
||||
fb(
|
||||
'agentPrompt',
|
||||
result.agentPrompt.score,
|
||||
'metric',
|
||||
formatViolations(result.agentPrompt.violations),
|
||||
),
|
||||
fb('tools', result.tools.score, 'metric', formatViolations(result.tools.violations)),
|
||||
fb('fromAi', result.fromAi.score, 'metric', formatViolations(result.fromAi.violations)),
|
||||
fb(
|
||||
'credentials',
|
||||
result.credentials.score,
|
||||
'metric',
|
||||
formatViolations(result.credentials.violations),
|
||||
),
|
||||
];
|
||||
|
||||
// Similarity check (if reference workflow provided)
|
||||
if (result.similarity !== null && result.similarity !== undefined) {
|
||||
feedback.push(
|
||||
fb(
|
||||
'similarity',
|
||||
result.similarity.score,
|
||||
'metric',
|
||||
formatViolations(result.similarity.violations),
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
return feedback;
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
@ -0,0 +1,147 @@
|
|||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
import type { EvaluationContext, Evaluator, Feedback } from '../../harness/harness-types';
|
||||
import {
|
||||
evaluateWorkflowSimilarity,
|
||||
evaluateWorkflowSimilarityMultiple,
|
||||
} from '../../programmatic/evaluators/workflow-similarity';
|
||||
|
||||
/**
 * Options for creating a similarity evaluator.
 */
export interface SimilarityEvaluatorOptions {
	/** Comparison preset: 'strict' | 'standard' | 'lenient' (default: 'standard') */
	// NOTE(review): preset is forwarded verbatim to the similarity comparison;
	// presumably it tunes how heavily graph edits are penalized — confirm against
	// the workflow-similarity evaluator's configuration.
	preset?: 'strict' | 'standard' | 'lenient';
	/** Optional path to custom configuration file */
	customConfigPath?: string;
}
|
||||
|
||||
/**
|
||||
* Format violations as a comment string.
|
||||
*/
|
||||
function formatViolations(
|
||||
violations: Array<{ name: string; type: string; description: string; pointsDeducted: number }>,
|
||||
): string | undefined {
|
||||
if (!violations || violations.length === 0) return undefined;
|
||||
return violations.map((v) => `[${v.type}] ${v.description}`).join('; ');
|
||||
}
|
||||
|
||||
/**
 * Create a similarity evaluator that compares workflows using graph edit distance.
 *
 * This evaluator uses a Python script to calculate similarity between the generated
 * workflow and reference workflow(s). It requires `uvx` to be installed.
 *
 * @param options - Configuration options
 * @returns An evaluator that produces feedback from similarity comparison
 *
 * @example
 * ```typescript
 * const evaluator = createSimilarityEvaluator({ preset: 'standard' });
 *
 * // With single reference workflow
 * const feedback = await evaluator.evaluate(workflow, {
 *   referenceWorkflows: [referenceWorkflow]
 * });
 *
 * // With multiple reference workflows (best match wins)
 * const feedback = await evaluator.evaluate(workflow, {
 *   referenceWorkflows: [ref1, ref2, ref3]
 * });
 * ```
 */
export function createSimilarityEvaluator(
	options?: SimilarityEvaluatorOptions,
): Evaluator<EvaluationContext> {
	const preset = options?.preset ?? 'standard';
	const customConfigPath = options?.customConfigPath;

	return {
		name: 'similarity',

		async evaluate(workflow: SimpleWorkflow, ctx: EvaluationContext): Promise<Feedback[]> {
			const feedback: Feedback[] = [];

			const referenceWorkflows = ctx.referenceWorkflows;

			// No reference workflows provided - treat as configuration error
			if (!referenceWorkflows?.length) {
				feedback.push({
					evaluator: 'similarity',
					metric: 'error',
					score: 0,
					kind: 'score',
					comment: 'No reference workflow provided for comparison',
				});
				return feedback;
			}

			try {
				// Common result shape shared by single- and multi-reference comparisons.
				let result: {
					violations: Array<{
						name: string;
						type: string;
						description: string;
						pointsDeducted: number;
					}>;
					score: number;
				};

				if (referenceWorkflows.length === 1) {
					result = await evaluateWorkflowSimilarity(
						workflow,
						referenceWorkflows[0],
						preset,
						customConfigPath,
					);
				} else {
					// Multiple references: the best-matching reference wins (see example above).
					result = await evaluateWorkflowSimilarityMultiple(
						workflow,
						referenceWorkflows,
						preset,
						customConfigPath,
					);
				}

				// Overall similarity score
				feedback.push({
					evaluator: 'similarity',
					metric: 'score',
					score: result.score,
					kind: 'score',
					comment: formatViolations(result.violations),
				});

				// Count violations by type
				// (violation names look like `workflow-similarity-<type>`; strip the prefix)
				const violationsByType: Record<string, number> = {};
				for (const v of result.violations) {
					const type = v.name.replace('workflow-similarity-', '');
					violationsByType[type] = (violationsByType[type] || 0) + 1;
				}

				// Add individual violation counts as feedback
				for (const [type, count] of Object.entries(violationsByType)) {
					feedback.push({
						evaluator: 'similarity',
						metric: type,
						score: Math.max(0, 1 - count * 0.1), // Penalty per violation
						kind: 'detail',
						comment: `${count} ${type} edit(s)`,
					});
				}
			} catch (error) {
				// Return error feedback instead of throwing, so one failed comparison
				// does not abort the whole evaluation run.
				const errorMessage = error instanceof Error ? error.message : String(error);
				feedback.push({
					evaluator: 'similarity',
					metric: 'error',
					score: 0,
					kind: 'score',
					comment: errorMessage,
				});
			}

			return feedback;
		},
	};
}
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
id,prompt
|
||||
email-summary,"Create a workflow that runs every Monday morning to analyze weekend emails and create a summary report."
|
||||
ai-news-digest,"Build a nightly workflow that fetches news from NewsAPI, uses OpenAI to summarize articles, and generates images for the top stories."
|
||||
daily-weather-report,"Create a workflow that sends a personalized morning weather report email using OpenWeather API data."
|
||||
invoice-pipeline,"Build an invoice processing workflow that extracts data from PDF/image invoices, validates the information, and generates weekly reports."
|
||||
lead-qualification,"Create a workflow that receives web form submissions, uses AI to qualify leads, and schedules calendar appointments for qualified prospects."
|
||||
slack-notification,"Build a workflow that monitors a database for new entries and sends Slack notifications with formatted summaries."
|
||||
github-issue-triage,"Create a workflow that automatically labels and assigns GitHub issues based on their content using AI classification."
|
||||
csv-data-processing,"Build a workflow that processes large CSV files, transforms the data, and uploads results to Google Sheets with rate limiting."
|
||||
webhook-order-processing,"Create a webhook-triggered workflow that processes e-commerce orders, updates inventory, and sends confirmation emails."
|
||||
content-moderation,"Build a workflow that reviews user-submitted content for moderation, flags inappropriate items, and notifies moderators."
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
import { v4 as uuid } from 'uuid';
|
||||
|
||||
import type { LlmCallLimiter } from './harness-types';
|
||||
import type { BuilderFeatureFlags, ChatPayload } from '../../src/workflow-builder-agent';
|
||||
import { DEFAULTS } from '../support/constants';
|
||||
|
||||
export async function consumeGenerator<T>(gen: AsyncGenerator<T>) {
|
||||
for await (const _ of gen) {
|
||||
/* consume all */
|
||||
}
|
||||
}
|
||||
|
||||
export async function runWithOptionalLimiter<T>(
|
||||
fn: () => Promise<T>,
|
||||
limiter?: LlmCallLimiter,
|
||||
): Promise<T> {
|
||||
return limiter ? await limiter(fn) : await fn();
|
||||
}
|
||||
|
||||
export async function withTimeout<T>(args: {
|
||||
promise: Promise<T>;
|
||||
timeoutMs?: number;
|
||||
label: string;
|
||||
}): Promise<T> {
|
||||
// NOTE:
|
||||
// - This is a best-effort timeout. It does NOT cancel/abort the underlying work.
|
||||
// - If the underlying work supports cancellation (e.g. AbortSignal), plumb that through instead.
|
||||
// - When combined with `p-limit`, prefer applying the timeout *inside* the limited function so the
|
||||
// limiter slot is released when the timeout triggers.
|
||||
const { promise, timeoutMs, label } = args;
|
||||
if (typeof timeoutMs !== 'number') return await promise;
|
||||
if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) {
|
||||
throw new Error(`Invalid timeoutMs (${String(timeoutMs)}) for ${label}`);
|
||||
}
|
||||
|
||||
let timer: NodeJS.Timeout | undefined;
|
||||
try {
|
||||
const timeout = new Promise<never>((_resolve, reject) => {
|
||||
timer = setTimeout(
|
||||
() => reject(new Error(`Timed out after ${timeoutMs}ms in ${label}`)),
|
||||
timeoutMs,
|
||||
);
|
||||
});
|
||||
return await Promise.race([promise, timeout]);
|
||||
} finally {
|
||||
if (timer) clearTimeout(timer);
|
||||
}
|
||||
}
|
||||
|
||||
export interface GetChatPayloadOptions {
|
||||
evalType: string;
|
||||
message: string;
|
||||
workflowId: string;
|
||||
featureFlags?: BuilderFeatureFlags;
|
||||
}
|
||||
|
||||
export function getChatPayload(options: GetChatPayloadOptions): ChatPayload {
|
||||
const { evalType, message, workflowId, featureFlags } = options;
|
||||
|
||||
return {
|
||||
id: `${evalType}-${uuid()}`,
|
||||
featureFlags: featureFlags ?? DEFAULTS.FEATURE_FLAGS,
|
||||
message,
|
||||
workflowContext: {
|
||||
currentWorkflow: { id: workflowId, nodes: [], connections: {} },
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
import type { Feedback } from './harness-types';
|
||||
|
||||
export interface LangsmithEvaluationResultLike {
|
||||
key: string;
|
||||
score: number;
|
||||
comment?: string;
|
||||
}
|
||||
|
||||
export function feedbackKey(feedback: Feedback): string {
|
||||
return `${feedback.evaluator}.${feedback.metric}`;
|
||||
}
|
||||
|
||||
function isPairwiseV1Metric(metric: string): boolean {
|
||||
return metric.startsWith('pairwise_');
|
||||
}
|
||||
|
||||
/**
|
||||
* Metric key mapping for LangSmith.
|
||||
*
|
||||
* Goal: keep keys comparable with historical runs.
|
||||
* - Programmatic: keep evaluator prefix (e.g. `programmatic.trigger`)
|
||||
* - LLM-judge: keep metrics unprefixed (e.g. `overallScore`, `connections`, `maintainability.nodeNamingQuality`)
|
||||
* - Pairwise: keep v1 metrics unprefixed (e.g. `pairwise_primary`), but namespace non-v1 details.
|
||||
*/
|
||||
export function langsmithMetricKey(feedback: Feedback): string {
|
||||
if (feedback.evaluator === 'pairwise') {
|
||||
return isPairwiseV1Metric(feedback.metric) ? feedback.metric : feedbackKey(feedback);
|
||||
}
|
||||
|
||||
if (feedback.evaluator === 'programmatic') {
|
||||
return feedbackKey(feedback);
|
||||
}
|
||||
|
||||
if (feedback.evaluator === 'llm-judge') {
|
||||
return feedback.metric;
|
||||
}
|
||||
|
||||
// Default: prefix unknown evaluators to avoid collisions with unprefixed `llm-judge` metrics.
|
||||
return feedbackKey(feedback);
|
||||
}
|
||||
|
||||
export function toLangsmithEvaluationResult(feedback: Feedback): LangsmithEvaluationResultLike {
|
||||
return {
|
||||
key: langsmithMetricKey(feedback),
|
||||
score: feedback.score,
|
||||
...(feedback.comment ? { comment: feedback.comment } : {}),
|
||||
};
|
||||
}
|
||||
|
|
@ -0,0 +1,199 @@
|
|||
import type { Client as LangsmithClient } from 'langsmith/client';
|
||||
import type pLimit from 'p-limit';
|
||||
|
||||
import type { EvalLogger } from './logger.js';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow.js';
|
||||
|
||||
export type LlmCallLimiter = ReturnType<typeof pLimit>;
|
||||
|
||||
/**
|
||||
* Shared context passed to all evaluators.
|
||||
*
|
||||
* Keep this as the single "base" context so callers (CLI/runner) never need casts.
|
||||
* Evaluators should validate required fields at runtime when optional fields are needed.
|
||||
*/
|
||||
export interface EvaluationContext {
|
||||
/** The original user prompt for this example */
|
||||
prompt: string;
|
||||
/** Pairwise criteria: required behaviors */
|
||||
dos?: string;
|
||||
/** Pairwise criteria: forbidden behaviors */
|
||||
donts?: string;
|
||||
/** Optional reference workflows for similarity-based checks (best match wins) */
|
||||
referenceWorkflows?: SimpleWorkflow[];
|
||||
/**
|
||||
* Optional generator for multi-generation evaluations.
|
||||
* When present, pairwise evaluator can generate multiple workflows from the same prompt.
|
||||
*/
|
||||
generateWorkflow?: (prompt: string) => Promise<SimpleWorkflow>;
|
||||
/**
|
||||
* Optional limiter for LLM-bound work (generation + evaluators).
|
||||
* When provided, treat it as the global knob for overall parallel LLM calls.
|
||||
*/
|
||||
llmCallLimiter?: LlmCallLimiter;
|
||||
/**
|
||||
* Optional timeout used for LLM-bound work (generation + evaluators).
|
||||
* Note: timeouts are best-effort unless underlying calls support cancellation (AbortSignal).
|
||||
*/
|
||||
timeoutMs?: number;
|
||||
}
|
||||
|
||||
/** Context attached to an individual test case (prompt is provided separately). */
|
||||
export type TestCaseContext = Omit<Partial<EvaluationContext>, 'prompt'>;
|
||||
|
||||
/** Global context attached to a run (prompt is provided per test case). */
|
||||
export type GlobalRunContext = Omit<Partial<EvaluationContext>, 'prompt'>;
|
||||
|
||||
/**
|
||||
* What evaluators return - a single piece of feedback.
|
||||
*/
|
||||
export interface Feedback {
|
||||
/** Evaluator name emitting this feedback (e.g. `llm-judge`, `programmatic`) */
|
||||
evaluator: string;
|
||||
/** Metric name within the evaluator (e.g. `functionality`, `efficiency.nodeCountEfficiency`) */
|
||||
metric: string;
|
||||
score: number;
|
||||
comment?: string;
|
||||
/**
|
||||
* Classification of this feedback item.
|
||||
*
|
||||
* - `score`: the single score used for overall scoring for this evaluator
|
||||
* - `metric`: stable category-level metrics (useful for dashboards)
|
||||
* - `detail`: unstable/verbose metrics that should not affect scoring
|
||||
*/
|
||||
kind: 'score' | 'metric' | 'detail';
|
||||
}
|
||||
|
||||
/**
|
||||
* An evaluator that can assess a generated workflow.
|
||||
* Optionally typed with context for evaluator-specific data.
|
||||
*/
|
||||
export interface Evaluator<TContext = EvaluationContext> {
|
||||
name: string;
|
||||
evaluate(workflow: SimpleWorkflow, ctx: TContext): Promise<Feedback[]>;
|
||||
}
|
||||
|
||||
/**
|
||||
* A single test case for evaluation.
|
||||
*/
|
||||
export interface TestCase {
|
||||
prompt: string;
|
||||
id?: string;
|
||||
/** Context passed to evaluators (e.g., pairwise dos/donts) */
|
||||
context?: TestCaseContext;
|
||||
/** Reference workflows for similarity comparison (best match wins) */
|
||||
referenceWorkflows?: SimpleWorkflow[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Configuration for an evaluation run.
|
||||
*/
|
||||
export interface RunConfigBase {
|
||||
/** Function to generate workflow from prompt */
|
||||
generateWorkflow: (prompt: string) => Promise<SimpleWorkflow>;
|
||||
/** Evaluators to run on each generated workflow */
|
||||
evaluators: Array<Evaluator<EvaluationContext>>;
|
||||
/** Global context available to all evaluators */
|
||||
context?: GlobalRunContext;
|
||||
/** Directory for JSON output files */
|
||||
outputDir?: string;
|
||||
/** Threshold for pass/fail classification of an example score (0-1). */
|
||||
passThreshold?: number;
|
||||
/** Timeout for generation/evaluator operations (ms). */
|
||||
timeoutMs?: number;
|
||||
/** Lifecycle hooks for logging and monitoring */
|
||||
lifecycle?: Partial<EvaluationLifecycle>;
|
||||
/** Logger for all output (use `createQuietLifecycle()` to suppress output in tests) */
|
||||
logger: EvalLogger;
|
||||
}
|
||||
|
||||
export interface LocalRunConfig extends RunConfigBase {
|
||||
mode: 'local';
|
||||
/** Local mode requires an in-memory dataset */
|
||||
dataset: TestCase[];
|
||||
langsmithOptions?: never;
|
||||
}
|
||||
|
||||
export interface LangsmithRunConfig extends RunConfigBase {
|
||||
mode: 'langsmith';
|
||||
/** LangSmith mode uses a remote dataset name */
|
||||
dataset: string;
|
||||
langsmithOptions: LangsmithOptions;
|
||||
/** LangSmith client used by both evaluate() and traceable() */
|
||||
langsmithClient: LangsmithClient;
|
||||
}
|
||||
|
||||
export type RunConfig = LocalRunConfig | LangsmithRunConfig;
|
||||
|
||||
/**
|
||||
* LangSmith-specific configuration.
|
||||
*/
|
||||
export interface LangsmithOptions {
|
||||
experimentName: string;
|
||||
repetitions: number;
|
||||
concurrency: number;
|
||||
/** Maximum number of examples to evaluate from the dataset */
|
||||
maxExamples?: number;
|
||||
/** Optional dataset filtering (requires pre-loading examples). */
|
||||
filters?: LangsmithExampleFilters;
|
||||
/** Enable trace filtering to reduce payload sizes (default: true) */
|
||||
enableTraceFiltering?: boolean;
|
||||
/** Arbitrary metadata passed to LangSmith experiment (e.g., numJudges, scoringMethod) */
|
||||
experimentMetadata?: Record<string, unknown>;
|
||||
}
|
||||
|
||||
export interface LangsmithExampleFilters {
|
||||
/** Filter by `example.metadata.notion_id`. */
|
||||
notionId?: string;
|
||||
/** Filter by `example.metadata.categories` (contains). */
|
||||
technique?: string;
|
||||
/** Filter by `example.inputs.evals.dos` (substring match, case-insensitive). */
|
||||
doSearch?: string;
|
||||
/** Filter by `example.inputs.evals.donts` (substring match, case-insensitive). */
|
||||
dontSearch?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Result of evaluating a single example.
|
||||
*/
|
||||
export interface ExampleResult {
|
||||
index: number;
|
||||
prompt: string;
|
||||
status: 'pass' | 'fail' | 'error';
|
||||
/** Example-level score (0-1). In v2 this should be scoring-strategy aware (not key-count dependent). */
|
||||
score: number;
|
||||
feedback: Feedback[];
|
||||
durationMs: number;
|
||||
/** Time spent generating the workflow, when known. */
|
||||
generationDurationMs?: number;
|
||||
/** Time spent running evaluators, when known. */
|
||||
evaluationDurationMs?: number;
|
||||
workflow?: SimpleWorkflow;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Summary of an entire evaluation run.
|
||||
*/
|
||||
export interface RunSummary {
|
||||
totalExamples: number;
|
||||
passed: number;
|
||||
failed: number;
|
||||
errors: number;
|
||||
averageScore: number;
|
||||
totalDurationMs: number;
|
||||
evaluatorAverages?: Record<string, number>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lifecycle hooks for centralized logging and monitoring.
|
||||
*/
|
||||
export interface EvaluationLifecycle {
|
||||
onStart(config: RunConfig): void;
|
||||
onExampleStart(index: number, total: number, prompt: string): void;
|
||||
onWorkflowGenerated(workflow: SimpleWorkflow, durationMs: number): void;
|
||||
onEvaluatorComplete(name: string, feedback: Feedback[]): void;
|
||||
onEvaluatorError(name: string, error: Error): void;
|
||||
onExampleComplete(index: number, result: ExampleResult): void;
|
||||
onEnd(summary: RunSummary): void;
|
||||
}
|
||||
|
|
@ -0,0 +1,442 @@
|
|||
import pc from 'picocolors';
|
||||
|
||||
import type {
|
||||
EvaluationLifecycle,
|
||||
RunConfig,
|
||||
Feedback,
|
||||
ExampleResult,
|
||||
RunSummary,
|
||||
} from './harness-types.js';
|
||||
import type { EvalLogger } from './logger.js';
|
||||
import { groupByEvaluator, selectScoringItems, calculateFiniteAverage } from './score-calculator';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow.js';
|
||||
|
||||
/**
|
||||
* Truncate a string for display.
|
||||
*/
|
||||
function truncate(str: string, maxLen = 50): string {
|
||||
const cleaned = str.replace(/\s+/g, ' ').trim();
|
||||
return cleaned.length > maxLen ? cleaned.slice(0, maxLen) + '...' : cleaned;
|
||||
}
|
||||
|
||||
function truncateForSingleLine(str: string, maxLen: number): string {
|
||||
return truncate(str.replace(/\n/g, ' '), maxLen);
|
||||
}
|
||||
|
||||
function exampleLabel(mode: RunConfig['mode'] | undefined): 'call' | 'ex' {
|
||||
return mode === 'langsmith' ? 'call' : 'ex';
|
||||
}
|
||||
|
||||
/**
|
||||
* Format a score as percentage.
|
||||
*/
|
||||
function formatScore(score: number): string {
|
||||
if (!Number.isFinite(score)) return 'N/A';
|
||||
return `${(score * 100).toFixed(0)}%`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Format duration in seconds.
|
||||
*/
|
||||
function formatDuration(ms: number): string {
|
||||
return `${(ms / 1000).toFixed(1)}s`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Critical metrics to always show in verbose mode.
|
||||
*/
|
||||
const CRITICAL_METRICS = [
|
||||
'functionality',
|
||||
'connections',
|
||||
'expressions',
|
||||
'nodeConfiguration',
|
||||
'overallScore',
|
||||
'overall', // programmatic uses 'overall' not 'overallScore'
|
||||
'trigger',
|
||||
];
|
||||
|
||||
const DISPLAY_METRICS_BY_EVALUATOR: Record<string, string[]> = {
|
||||
'llm-judge': ['functionality', 'connections', 'expressions', 'nodeConfiguration', 'overallScore'],
|
||||
programmatic: ['overall', 'connections', 'trigger'],
|
||||
pairwise: [
|
||||
'pairwise_primary',
|
||||
'pairwise_diagnostic',
|
||||
'pairwise_judges_passed',
|
||||
'pairwise_total_passes',
|
||||
'pairwise_total_violations',
|
||||
'pairwise_generation_correctness',
|
||||
'pairwise_aggregated_diagnostic',
|
||||
'pairwise_generations_passed',
|
||||
'pairwise_total_judge_calls',
|
||||
],
|
||||
};
|
||||
|
||||
const PAIRWISE_COUNT_METRICS = new Set([
|
||||
'pairwise_judges_passed',
|
||||
'pairwise_total_passes',
|
||||
'pairwise_total_violations',
|
||||
'pairwise_generations_passed',
|
||||
'pairwise_total_judge_calls',
|
||||
]);
|
||||
|
||||
function formatMetricValue(evaluator: string, metric: string, score: number): string {
|
||||
if (evaluator === 'pairwise' && PAIRWISE_COUNT_METRICS.has(metric)) {
|
||||
if (!Number.isFinite(score)) return 'N/A';
|
||||
return Number.isInteger(score) ? String(score) : score.toFixed(0);
|
||||
}
|
||||
return formatScore(score);
|
||||
}
|
||||
|
||||
function hasSeverityMarker(comment: string): boolean {
|
||||
const lower = comment.toLowerCase();
|
||||
return lower.includes('[critical]') || lower.includes('[major]') || lower.includes('[minor]');
|
||||
}
|
||||
|
||||
function extractIssuesForLogs(evaluator: string, feedback: Feedback[]): Feedback[] {
|
||||
const withComments = feedback.filter(
|
||||
(f) => typeof f.comment === 'string' && f.comment.trim().length > 0 && f.metric !== 'error',
|
||||
);
|
||||
|
||||
if (evaluator === 'llm-judge') {
|
||||
return withComments.filter((f) => (f.comment ? hasSeverityMarker(f.comment) : false));
|
||||
}
|
||||
|
||||
if (evaluator === 'pairwise') {
|
||||
const isJudgeMetric = (metric: string) =>
|
||||
/^judge\d+$/u.test(metric) || /^gen\d+\.judge\d+$/u.test(metric);
|
||||
|
||||
return withComments.filter((f) => {
|
||||
if (isJudgeMetric(f.metric)) return true;
|
||||
|
||||
// Only show high-level status summaries when not fully passing.
|
||||
if (f.metric === 'pairwise_primary' && f.score < 1) return true;
|
||||
if (f.metric === 'pairwise_generation_correctness' && f.score < 1) return true;
|
||||
|
||||
return false;
|
||||
});
|
||||
}
|
||||
|
||||
return withComments;
|
||||
}
|
||||
|
||||
function formatExampleHeaderLines(args: {
|
||||
mode: RunConfig['mode'] | undefined;
|
||||
index: number;
|
||||
status: string;
|
||||
score: number;
|
||||
prompt: string;
|
||||
durationMs: number;
|
||||
generationDurationMs?: number;
|
||||
evaluationDurationMs?: number;
|
||||
nodeCount: number;
|
||||
}): string[] {
|
||||
const {
|
||||
mode,
|
||||
index,
|
||||
status,
|
||||
score,
|
||||
prompt,
|
||||
durationMs,
|
||||
generationDurationMs,
|
||||
evaluationDurationMs,
|
||||
nodeCount,
|
||||
} = args;
|
||||
|
||||
const promptSnippet = truncateForSingleLine(prompt, 80);
|
||||
const genStr =
|
||||
typeof generationDurationMs === 'number' ? formatDuration(generationDurationMs) : '?';
|
||||
const evalStr =
|
||||
typeof evaluationDurationMs === 'number' ? formatDuration(evaluationDurationMs) : '?';
|
||||
|
||||
return [
|
||||
`${pc.dim(`[${exampleLabel(mode)} ${index}]`)} ${status} ${formatScore(score)} ${pc.dim(
|
||||
`prompt="${promptSnippet}"`,
|
||||
)}`,
|
||||
pc.dim(
|
||||
` gen=${genStr} eval=${evalStr} total=${formatDuration(durationMs)} nodes=${nodeCount}`,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
function splitEvaluatorFeedback(feedback: Feedback[]): {
|
||||
errors: Feedback[];
|
||||
nonErrorFeedback: Feedback[];
|
||||
} {
|
||||
return {
|
||||
errors: feedback.filter((f) => f.metric === 'error'),
|
||||
nonErrorFeedback: feedback.filter((f) => f.metric !== 'error'),
|
||||
};
|
||||
}
|
||||
|
||||
function formatEvaluatorLines(args: {
|
||||
evaluatorName: string;
|
||||
feedback: Feedback[];
|
||||
}): string[] {
|
||||
const { evaluatorName, feedback } = args;
|
||||
|
||||
const { errors, nonErrorFeedback } = splitEvaluatorFeedback(feedback);
|
||||
|
||||
const scoringItems = selectScoringItems(feedback);
|
||||
const avgScore = calculateFiniteAverage(scoringItems);
|
||||
|
||||
const colorFn = scoreColor(avgScore);
|
||||
|
||||
const lines: string[] = [];
|
||||
lines.push(
|
||||
pc.dim(` ${evaluatorName}: `) +
|
||||
colorFn(formatScore(avgScore)) +
|
||||
pc.dim(
|
||||
errors.length > 0
|
||||
? ` (metrics=${nonErrorFeedback.length}, errors=${errors.length})`
|
||||
: ` (metrics=${feedback.length})`,
|
||||
),
|
||||
);
|
||||
|
||||
const displayMetrics = DISPLAY_METRICS_BY_EVALUATOR[evaluatorName] ?? CRITICAL_METRICS;
|
||||
const picked = nonErrorFeedback.filter((f) => displayMetrics.includes(f.metric));
|
||||
if (picked.length > 0) {
|
||||
const metricsLine = picked
|
||||
.map((f) => {
|
||||
const color = scoreColor(f.score);
|
||||
return `${f.metric}: ${color(formatMetricValue(evaluatorName, f.metric, f.score))}`;
|
||||
})
|
||||
.join(pc.dim(' | '));
|
||||
lines.push(pc.dim(' ') + metricsLine);
|
||||
}
|
||||
|
||||
if (errors.length > 0) {
|
||||
const topErrors = errors.slice(0, 2);
|
||||
lines.push(pc.dim(` errors(top=${topErrors.length}):`));
|
||||
for (const errorItem of topErrors) {
|
||||
const comment = truncateForSingleLine(errorItem.comment ?? '', 240);
|
||||
lines.push(pc.dim(' - ') + pc.red(comment));
|
||||
}
|
||||
if (errors.length > topErrors.length) {
|
||||
lines.push(pc.dim(` ... and ${errors.length - topErrors.length} more`));
|
||||
}
|
||||
}
|
||||
|
||||
const issues = extractIssuesForLogs(evaluatorName, feedback);
|
||||
if (issues.length > 0) {
|
||||
const top = issues.slice(0, 3);
|
||||
lines.push(pc.dim(` issues(top=${top.length}):`));
|
||||
for (const issue of top) {
|
||||
const comment = truncateForSingleLine(issue.comment ?? '', 320);
|
||||
lines.push(pc.dim(` - [${issue.metric}] `) + pc.red(comment));
|
||||
}
|
||||
if (issues.length > top.length) {
|
||||
lines.push(pc.dim(` ... and ${issues.length - top.length} more`));
|
||||
}
|
||||
}
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get color based on score.
|
||||
*/
|
||||
function scoreColor(score: number): (s: string) => string {
|
||||
if (score >= 0.9) return pc.green;
|
||||
if (score >= 0.7) return pc.yellow;
|
||||
return pc.red;
|
||||
}
|
||||
|
||||
function formatExampleStatus(status: ExampleResult['status']): string {
|
||||
switch (status) {
|
||||
case 'pass':
|
||||
return pc.green('PASS');
|
||||
case 'fail':
|
||||
return pc.yellow('FAIL');
|
||||
case 'error':
|
||||
return pc.red('ERROR');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Options for creating a console lifecycle.
|
||||
*/
|
||||
export interface ConsoleLifecycleOptions {
|
||||
verbose: boolean;
|
||||
logger: EvalLogger;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a lifecycle that logs to console.
|
||||
* Verbose mode shows detailed progress, non-verbose shows summary only.
|
||||
*/
|
||||
export function createConsoleLifecycle(options: ConsoleLifecycleOptions): EvaluationLifecycle {
|
||||
const { verbose, logger } = options;
|
||||
let runMode: RunConfig['mode'] | undefined;
|
||||
let evaluatorOrder: string[] = [];
|
||||
|
||||
return {
|
||||
onStart(config: RunConfig): void {
|
||||
runMode = config.mode;
|
||||
evaluatorOrder = config.evaluators.map((e) => e.name);
|
||||
|
||||
logger.info(`\nStarting evaluation in ${pc.cyan(config.mode)} mode`);
|
||||
|
||||
if (typeof config.dataset === 'string') {
|
||||
logger.info(`Dataset: ${pc.dim(config.dataset)}`);
|
||||
} else {
|
||||
logger.info(`Test cases: ${pc.dim(String(config.dataset.length))}`);
|
||||
}
|
||||
|
||||
logger.info(
|
||||
`Evaluators: ${pc.dim(config.evaluators.map((e) => e.name).join(', ') || 'none')}`,
|
||||
);
|
||||
logger.info('');
|
||||
},
|
||||
|
||||
onExampleStart(index: number, total: number, prompt: string): void {
|
||||
if (!verbose) return;
|
||||
|
||||
const totalStr = total > 0 ? String(total) : '?';
|
||||
const prefix = pc.dim(`[${exampleLabel(runMode)} ${index}/${totalStr}]`);
|
||||
const status = pc.yellow('START');
|
||||
const promptStr = pc.dim(`prompt="${truncateForSingleLine(prompt, 80)}"`);
|
||||
logger.info(`${prefix} ${status} ${promptStr}`);
|
||||
},
|
||||
|
||||
onWorkflowGenerated: () => {},
|
||||
|
||||
onEvaluatorComplete: () => {},
|
||||
|
||||
onEvaluatorError(name: string, error: Error): void {
|
||||
if (!verbose) return;
|
||||
logger.error(` ERROR in ${name}: ${error.message}`);
|
||||
},
|
||||
|
||||
onExampleComplete(index: number, result: ExampleResult): void {
|
||||
if (!verbose) return;
|
||||
|
||||
const status = formatExampleStatus(result.status);
|
||||
|
||||
const nodeCount = result.workflow?.nodes?.length ?? 0;
|
||||
const lines: string[] = formatExampleHeaderLines({
|
||||
mode: runMode,
|
||||
index,
|
||||
status,
|
||||
score: result.score,
|
||||
prompt: result.prompt,
|
||||
durationMs: result.durationMs,
|
||||
generationDurationMs: result.generationDurationMs,
|
||||
evaluationDurationMs: result.evaluationDurationMs,
|
||||
nodeCount,
|
||||
});
|
||||
|
||||
if (result.error) {
|
||||
lines.push(pc.red(` error: ${result.error}`));
|
||||
logger.info(lines.join('\n'));
|
||||
return;
|
||||
}
|
||||
|
||||
const grouped = groupByEvaluator(result.feedback);
|
||||
const orderedEvaluators = [
|
||||
...evaluatorOrder.filter((name) => name in grouped),
|
||||
...Object.keys(grouped).filter((name) => !evaluatorOrder.includes(name)),
|
||||
];
|
||||
|
||||
for (const evaluatorName of orderedEvaluators) {
|
||||
const feedback = grouped[evaluatorName] ?? [];
|
||||
lines.push(...formatEvaluatorLines({ evaluatorName, feedback }));
|
||||
}
|
||||
|
||||
logger.info(lines.join('\n'));
|
||||
},
|
||||
|
||||
onEnd(summary: RunSummary): void {
|
||||
if (runMode === 'langsmith') {
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info('\n' + pc.bold('═══════════════════ SUMMARY ═══════════════════'));
|
||||
logger.info(
|
||||
` Total: ${summary.totalExamples} | ` +
|
||||
`Pass: ${pc.green(String(summary.passed))} | ` +
|
||||
`Fail: ${pc.yellow(String(summary.failed))} | ` +
|
||||
`Error: ${pc.red(String(summary.errors))}`,
|
||||
);
|
||||
const passRate = summary.totalExamples > 0 ? summary.passed / summary.totalExamples : 0;
|
||||
logger.info(` Pass rate: ${formatScore(passRate)}`);
|
||||
logger.info(` Average score: ${formatScore(summary.averageScore)}`);
|
||||
logger.info(` Total time: ${formatDuration(summary.totalDurationMs)}`);
|
||||
logger.info(pc.bold('═══════════════════════════════════════════════\n'));
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a quiet lifecycle that does nothing.
|
||||
* Useful for testing or when no output is desired.
|
||||
*/
|
||||
export function createQuietLifecycle(): EvaluationLifecycle {
|
||||
return {
|
||||
onStart: () => {},
|
||||
onExampleStart: () => {},
|
||||
onWorkflowGenerated: () => {},
|
||||
onEvaluatorComplete: () => {},
|
||||
onEvaluatorError: () => {},
|
||||
onExampleComplete: () => {},
|
||||
onEnd: () => {},
|
||||
};
|
||||
}
|
||||
|
||||
/** Type predicate for filtering undefined values */
|
||||
function isDefined<T>(value: T | undefined): value is T {
|
||||
return value !== undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge multiple partial lifecycles into a single complete lifecycle.
|
||||
* All hooks will be called in order.
|
||||
*/
|
||||
export function mergeLifecycles(
|
||||
...lifecycles: Array<Partial<EvaluationLifecycle> | undefined>
|
||||
): EvaluationLifecycle {
|
||||
const validLifecycles = lifecycles.filter(isDefined);
|
||||
|
||||
return {
|
||||
onStart(config: RunConfig): void {
|
||||
for (const lc of validLifecycles) {
|
||||
lc.onStart?.(config);
|
||||
}
|
||||
},
|
||||
|
||||
onExampleStart(index: number, total: number, prompt: string): void {
|
||||
for (const lc of validLifecycles) {
|
||||
lc.onExampleStart?.(index, total, prompt);
|
||||
}
|
||||
},
|
||||
|
||||
onWorkflowGenerated(workflow: SimpleWorkflow, durationMs: number): void {
|
||||
for (const lc of validLifecycles) {
|
||||
lc.onWorkflowGenerated?.(workflow, durationMs);
|
||||
}
|
||||
},
|
||||
|
||||
onEvaluatorComplete(name: string, feedback: Feedback[]): void {
|
||||
for (const lc of validLifecycles) {
|
||||
lc.onEvaluatorComplete?.(name, feedback);
|
||||
}
|
||||
},
|
||||
|
||||
onEvaluatorError(name: string, error: Error): void {
|
||||
for (const lc of validLifecycles) {
|
||||
lc.onEvaluatorError?.(name, error);
|
||||
}
|
||||
},
|
||||
|
||||
onExampleComplete(index: number, result: ExampleResult): void {
|
||||
for (const lc of validLifecycles) {
|
||||
lc.onExampleComplete?.(index, result);
|
||||
}
|
||||
},
|
||||
|
||||
onEnd(summary: RunSummary): void {
|
||||
for (const lc of validLifecycles) {
|
||||
lc.onEnd?.(summary);
|
||||
}
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
@ -29,13 +29,14 @@ export interface EvalLogger {
|
|||
export function createLogger(verbose: boolean = false): EvalLogger {
|
||||
return {
|
||||
isVerbose: verbose,
|
||||
info: (message: string) => console.log(pc.blue(message)),
|
||||
// Keep info plain so lifecycle can apply its own formatting without double-coloring.
|
||||
info: (message: string) => console.log(message),
|
||||
verbose: (message: string) => {
|
||||
if (verbose) console.log(pc.dim(message));
|
||||
},
|
||||
success: (message: string) => console.log(pc.green(message)),
|
||||
warn: (message: string) => console.log(pc.yellow(message)),
|
||||
error: (message: string) => console.log(pc.red(message)),
|
||||
warn: (message: string) => console.warn(pc.yellow(message)),
|
||||
error: (message: string) => console.error(pc.red(message)),
|
||||
dim: (message: string) => console.log(pc.dim(message)),
|
||||
};
|
||||
}
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
/**
|
||||
* Multi-generation utilities for pairwise evaluation.
|
||||
*
|
||||
* These utilities support generating multiple workflows from a single prompt
|
||||
* and aggregating evaluation results across all generations.
|
||||
*/
|
||||
|
||||
import type { SimpleWorkflow } from '@/types/workflow';
|
||||
|
||||
/**
|
||||
* Details for a single generation in multi-gen evaluation.
|
||||
*/
|
||||
export interface GenerationDetail {
|
||||
/** The generated workflow */
|
||||
workflow: SimpleWorkflow;
|
||||
/** Whether majority of judges passed this generation */
|
||||
majorityPass: boolean;
|
||||
/** Average diagnostic score across judges */
|
||||
diagnosticScore: number;
|
||||
/** Number of judges that passed */
|
||||
primaryPasses: number;
|
||||
/** Total number of judges */
|
||||
numJudges: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregated result across multiple generations.
|
||||
*/
|
||||
export interface MultiGenerationResult {
|
||||
/** Ratio of passing generations (0-1) */
|
||||
generationCorrectness: number;
|
||||
/** Average diagnostic score across all generations */
|
||||
aggregatedDiagnosticScore: number;
|
||||
/** Count of generations that passed */
|
||||
passingGenerations: number;
|
||||
/** Total number of generations */
|
||||
totalGenerations: number;
|
||||
/** Detailed results for each generation */
|
||||
generationDetails: GenerationDetail[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the majority threshold for a given number of judges.
|
||||
*
|
||||
* @param numJudges - Number of judges in the panel
|
||||
* @returns Minimum number of passes needed for majority
|
||||
*
|
||||
* @example
|
||||
* getMajorityThreshold(3) // returns 2
|
||||
* getMajorityThreshold(5) // returns 3
|
||||
*/
|
||||
export function getMajorityThreshold(numJudges: number): number {
|
||||
if (!Number.isFinite(numJudges) || numJudges < 1) {
|
||||
throw new Error(`numJudges must be >= 1 (received ${String(numJudges)})`);
|
||||
}
|
||||
return Math.ceil(numJudges / 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregate results across multiple workflow generations.
|
||||
*
|
||||
* Calculates:
|
||||
* - Generation correctness: passing generations / total generations
|
||||
* - Aggregated diagnostic: average diagnostic score across all generations
|
||||
*
|
||||
* @param details - Array of generation details to aggregate
|
||||
* @returns Aggregated multi-generation result
|
||||
*
|
||||
* @example
|
||||
* ```typescript
|
||||
* const result = aggregateGenerations([
|
||||
* { majorityPass: true, diagnosticScore: 0.9, ... },
|
||||
* { majorityPass: false, diagnosticScore: 0.6, ... },
|
||||
* { majorityPass: true, diagnosticScore: 0.85, ... },
|
||||
* ]);
|
||||
* // result.generationCorrectness = 0.67 (2/3)
|
||||
* // result.aggregatedDiagnosticScore = 0.78
|
||||
* ```
|
||||
*/
|
||||
export function aggregateGenerations(details: GenerationDetail[]): MultiGenerationResult {
|
||||
const totalGenerations = details.length;
|
||||
const passingGenerations = details.filter((d) => d.majorityPass).length;
|
||||
const generationCorrectness = totalGenerations > 0 ? passingGenerations / totalGenerations : 0;
|
||||
|
||||
const aggregatedDiagnosticScore =
|
||||
totalGenerations > 0
|
||||
? details.reduce((sum, d) => sum + d.diagnosticScore, 0) / totalGenerations
|
||||
: 0;
|
||||
|
||||
return {
|
||||
generationCorrectness,
|
||||
aggregatedDiagnosticScore,
|
||||
passingGenerations,
|
||||
totalGenerations,
|
||||
generationDetails: details,
|
||||
};
|
||||
}
|
||||
|
|
@ -0,0 +1,214 @@
|
|||
/**
|
||||
* Artifact saving for v2 evaluation harness.
|
||||
*
|
||||
* Saves evaluation results to disk in JSON format for later analysis.
|
||||
*/
|
||||
|
||||
import { createHash } from 'crypto';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
import { feedbackKey } from './feedback';
|
||||
import type { ExampleResult, Feedback, RunSummary } from './harness-types.js';
|
||||
import type { EvalLogger } from './logger.js';
|
||||
import { selectScoringItems, calculateFiniteAverage } from './score-calculator';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow.js';
|
||||
|
||||
/**
|
||||
* Interface for saving evaluation artifacts to disk.
|
||||
*/
|
||||
export interface ArtifactSaver {
|
||||
/** Save a single example result */
|
||||
saveExample(result: ExampleResult): void;
|
||||
/** Save the final summary */
|
||||
saveSummary(summary: RunSummary, results: ExampleResult[]): void;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options for creating an artifact saver.
|
||||
*/
|
||||
export interface ArtifactSaverOptions {
|
||||
/** Directory to save artifacts to */
|
||||
outputDir: string;
|
||||
/** Logger for optional save logs */
|
||||
logger: EvalLogger;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an artifact saver for persisting evaluation results to disk.
|
||||
*
|
||||
* Directory structure:
|
||||
* ```
|
||||
* outputDir/
|
||||
* ├── example-001/
|
||||
* │ ├── prompt.txt
|
||||
* │ ├── workflow.json
|
||||
* │ └── feedback.json
|
||||
* ├── example-002/
|
||||
* │ └── ...
|
||||
* └── summary.json
|
||||
* ```
|
||||
*
|
||||
* @param options - Configuration options
|
||||
* @returns ArtifactSaver instance or null if outputDir is not provided
|
||||
*/
|
||||
export function createArtifactSaver(options: ArtifactSaverOptions): ArtifactSaver {
|
||||
const { outputDir, logger } = options;
|
||||
|
||||
// Create output directory if it doesn't exist
|
||||
fs.mkdirSync(outputDir, { recursive: true });
|
||||
|
||||
return {
|
||||
saveExample(result: ExampleResult): void {
|
||||
const exampleDir = path.join(outputDir, getExampleDirName(result));
|
||||
fs.mkdirSync(exampleDir, { recursive: true });
|
||||
|
||||
// Save prompt
|
||||
fs.writeFileSync(path.join(exampleDir, 'prompt.txt'), result.prompt, 'utf-8');
|
||||
|
||||
// Save workflow if available
|
||||
if (result.workflow) {
|
||||
const workflowForExport = formatWorkflowForExport(result.workflow);
|
||||
fs.writeFileSync(
|
||||
path.join(exampleDir, 'workflow.json'),
|
||||
JSON.stringify(workflowForExport, null, 2),
|
||||
'utf-8',
|
||||
);
|
||||
}
|
||||
|
||||
// Save feedback
|
||||
const feedbackOutput = formatFeedbackForExport(result);
|
||||
fs.writeFileSync(
|
||||
path.join(exampleDir, 'feedback.json'),
|
||||
JSON.stringify(feedbackOutput, null, 2),
|
||||
'utf-8',
|
||||
);
|
||||
|
||||
// Save error if present
|
||||
if (result.error) {
|
||||
fs.writeFileSync(path.join(exampleDir, 'error.txt'), result.error, 'utf-8');
|
||||
}
|
||||
|
||||
logger.verbose(`Saved example ${result.index} to ${exampleDir}`);
|
||||
},
|
||||
|
||||
saveSummary(summary: RunSummary, results: ExampleResult[]): void {
|
||||
const summaryOutput = formatSummaryForExport(summary, results);
|
||||
fs.writeFileSync(
|
||||
path.join(outputDir, 'summary.json'),
|
||||
JSON.stringify(summaryOutput, null, 2),
|
||||
'utf-8',
|
||||
);
|
||||
|
||||
logger.verbose(`Saved summary to ${path.join(outputDir, 'summary.json')}`);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
function getExampleDirName(result: ExampleResult): string {
|
||||
const index = String(result.index).padStart(3, '0');
|
||||
const id = shortId(`${result.prompt}\n${result.index}`);
|
||||
return `example-${index}-${id}`;
|
||||
}
|
||||
|
||||
function shortId(input: string): string {
|
||||
// Small deterministic id to avoid collisions when example folders are written concurrently
|
||||
// and to keep folder names stable across reruns with the same prompts.
|
||||
return createHash('md5').update(input).digest('hex').slice(0, 8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Format a workflow for export (n8n-importable format).
|
||||
*/
|
||||
function formatWorkflowForExport(workflow: SimpleWorkflow): object {
|
||||
return {
|
||||
name: workflow.name ?? 'Generated Workflow',
|
||||
nodes: workflow.nodes ?? [],
|
||||
connections: workflow.connections ?? {},
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Format feedback for export.
|
||||
*/
|
||||
function formatFeedbackForExport(result: ExampleResult): object {
|
||||
// Group feedback by evaluator
|
||||
const byEvaluator: Record<string, Feedback[]> = {};
|
||||
for (const fb of result.feedback) {
|
||||
const evaluator = fb.evaluator;
|
||||
if (!byEvaluator[evaluator]) {
|
||||
byEvaluator[evaluator] = [];
|
||||
}
|
||||
byEvaluator[evaluator].push(fb);
|
||||
}
|
||||
|
||||
return {
|
||||
index: result.index,
|
||||
status: result.status,
|
||||
durationMs: result.durationMs,
|
||||
score: result.score,
|
||||
evaluators: Object.entries(byEvaluator).map(([name, items]) => ({
|
||||
name,
|
||||
feedback: items.map((f) => ({
|
||||
key: feedbackKey(f),
|
||||
metric: f.metric,
|
||||
score: f.score,
|
||||
kind: f.kind,
|
||||
...(f.comment ? { comment: f.comment } : {}),
|
||||
})),
|
||||
averageScore: calculateFiniteAverage(selectScoringItems(items)),
|
||||
})),
|
||||
allFeedback: result.feedback,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Format summary for export.
|
||||
*/
|
||||
function formatSummaryForExport(summary: RunSummary, results: ExampleResult[]): object {
|
||||
const resultsSorted = [...results].sort((a, b) => a.index - b.index);
|
||||
|
||||
// Calculate per-evaluator statistics
|
||||
const evaluatorStats: Record<string, { scores: number[] }> = {};
|
||||
for (const result of resultsSorted) {
|
||||
const byEvaluator: Record<string, Feedback[]> = {};
|
||||
for (const fb of result.feedback) {
|
||||
const evaluator = fb.evaluator;
|
||||
if (!byEvaluator[evaluator]) byEvaluator[evaluator] = [];
|
||||
byEvaluator[evaluator].push(fb);
|
||||
}
|
||||
for (const [evaluator, items] of Object.entries(byEvaluator)) {
|
||||
if (!evaluatorStats[evaluator]) {
|
||||
evaluatorStats[evaluator] = { scores: [] };
|
||||
}
|
||||
const scoringItems = selectScoringItems(items);
|
||||
const avg = calculateFiniteAverage(scoringItems);
|
||||
evaluatorStats[evaluator].scores.push(avg);
|
||||
}
|
||||
}
|
||||
|
||||
const evaluatorAverages: Record<string, number> = {};
|
||||
for (const [name, stats] of Object.entries(evaluatorStats)) {
|
||||
evaluatorAverages[name] = stats.scores.reduce((a, b) => a + b, 0) / stats.scores.length;
|
||||
}
|
||||
|
||||
return {
|
||||
timestamp: new Date().toISOString(),
|
||||
totalExamples: summary.totalExamples,
|
||||
passed: summary.passed,
|
||||
failed: summary.failed,
|
||||
errors: summary.errors,
|
||||
passRate: summary.totalExamples > 0 ? summary.passed / summary.totalExamples : 0,
|
||||
averageScore: summary.averageScore,
|
||||
totalDurationMs: summary.totalDurationMs,
|
||||
evaluatorAverages,
|
||||
results: resultsSorted.map((r) => ({
|
||||
index: r.index,
|
||||
prompt: r.prompt.slice(0, 100) + (r.prompt.length > 100 ? '...' : ''),
|
||||
status: r.status,
|
||||
score: r.score,
|
||||
durationMs: r.durationMs,
|
||||
...(r.error ? { error: r.error } : {}),
|
||||
})),
|
||||
};
|
||||
}
|
||||
|
|
@ -0,0 +1,985 @@
|
|||
import type { BaseMessage } from '@langchain/core/messages';
|
||||
import { evaluate } from 'langsmith/evaluation';
|
||||
import type { Run, Example } from 'langsmith/schemas';
|
||||
import { traceable } from 'langsmith/traceable';
|
||||
import pLimit from 'p-limit';
|
||||
|
||||
import { runWithOptionalLimiter, withTimeout } from './evaluation-helpers';
|
||||
import { toLangsmithEvaluationResult } from './feedback';
|
||||
import type {
|
||||
Evaluator,
|
||||
TestCase,
|
||||
EvaluationContext,
|
||||
GlobalRunContext,
|
||||
TestCaseContext,
|
||||
Feedback,
|
||||
RunConfig,
|
||||
LocalRunConfig,
|
||||
LangsmithRunConfig,
|
||||
ExampleResult,
|
||||
RunSummary,
|
||||
EvaluationLifecycle,
|
||||
LangsmithExampleFilters,
|
||||
} from './harness-types.js';
|
||||
import type { EvalLogger } from './logger';
|
||||
import { createArtifactSaver, type ArtifactSaver } from './output';
|
||||
import { calculateWeightedScore } from './score-calculator';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow.js';
|
||||
import { extractMessageContent } from '../langsmith/types';
|
||||
|
||||
// Minimum weighted example score required for a 'pass' verdict (used by both run modes).
const DEFAULT_PASS_THRESHOLD = 0.7;
|
||||
|
||||
/**
|
||||
* Run evaluators in parallel for a single workflow.
|
||||
* Handles errors gracefully - skip and continue.
|
||||
*/
|
||||
async function evaluateWithPlugins(
|
||||
workflow: SimpleWorkflow,
|
||||
evaluators: Array<Evaluator<EvaluationContext>>,
|
||||
context: EvaluationContext,
|
||||
timeoutMs: number | undefined,
|
||||
lifecycle?: Partial<EvaluationLifecycle>,
|
||||
): Promise<Feedback[]> {
|
||||
const results = await Promise.all(
|
||||
evaluators.map(async (evaluator): Promise<Feedback[]> => {
|
||||
try {
|
||||
const feedback = await withTimeout({
|
||||
promise: evaluator.evaluate(workflow, context),
|
||||
timeoutMs,
|
||||
label: `evaluator:${evaluator.name}`,
|
||||
});
|
||||
lifecycle?.onEvaluatorComplete?.(evaluator.name, feedback);
|
||||
return feedback;
|
||||
} catch (error) {
|
||||
const evaluatorError = error instanceof Error ? error : new Error(String(error));
|
||||
lifecycle?.onEvaluatorError?.(evaluator.name, evaluatorError);
|
||||
const errorFeedback: Feedback = {
|
||||
evaluator: evaluator.name,
|
||||
metric: 'error',
|
||||
score: 0,
|
||||
kind: 'score',
|
||||
comment: evaluatorError.message,
|
||||
};
|
||||
return [errorFeedback];
|
||||
}
|
||||
}),
|
||||
);
|
||||
|
||||
return results.flat();
|
||||
}
|
||||
|
||||
/**
 * Calculate an example's overall score from its feedback using
 * evaluator-weighted scoring. Delegates to calculateWeightedScore so local
 * and LangSmith runs score identically.
 */
function calculateExampleScore(feedback: Feedback[]): number {
	return calculateWeightedScore(feedback);
}
|
||||
|
||||
/**
|
||||
* Determine pass/fail status based on average score.
|
||||
*/
|
||||
function determineStatus(args: { score: number; passThreshold: number }): 'pass' | 'fail' {
|
||||
const { score, passThreshold } = args;
|
||||
return score >= passThreshold ? 'pass' : 'fail';
|
||||
}
|
||||
|
||||
function hasErrorFeedback(feedback: Feedback[]): boolean {
|
||||
return feedback.some((f) => f.metric === 'error');
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a typed evaluation context for evaluators.
|
||||
*/
|
||||
function buildContext(args: {
|
||||
prompt: string;
|
||||
globalContext?: GlobalRunContext;
|
||||
testCaseContext?: TestCaseContext;
|
||||
referenceWorkflows?: SimpleWorkflow[];
|
||||
}): EvaluationContext {
|
||||
const { prompt, globalContext, testCaseContext, referenceWorkflows } = args;
|
||||
|
||||
return {
|
||||
prompt,
|
||||
...(globalContext ?? {}),
|
||||
...(testCaseContext ?? {}),
|
||||
...(referenceWorkflows?.length ? { referenceWorkflows } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
function isUnknownRecord(value: unknown): value is Record<string, unknown> {
|
||||
return typeof value === 'object' && value !== null && !Array.isArray(value);
|
||||
}
|
||||
|
||||
/** Narrow an unknown value to an array (type-predicate wrapper over Array.isArray). */
function isUnknownArray(value: unknown): value is unknown[] {
	return Array.isArray(value);
}
|
||||
|
||||
function asRecord(value: unknown): Record<string, unknown> {
|
||||
return isUnknownRecord(value) ? value : {};
|
||||
}
|
||||
|
||||
function isNumberArray2(value: unknown): value is [number, number] {
|
||||
return (
|
||||
Array.isArray(value) &&
|
||||
value.length === 2 &&
|
||||
typeof value[0] === 'number' &&
|
||||
Number.isFinite(value[0]) &&
|
||||
typeof value[1] === 'number' &&
|
||||
Number.isFinite(value[1])
|
||||
);
|
||||
}
|
||||
|
||||
function isNodeLike(value: unknown): boolean {
|
||||
if (!isUnknownRecord(value)) return false;
|
||||
const name = value.name;
|
||||
const type = value.type;
|
||||
const typeVersion = value.typeVersion;
|
||||
const position = value.position;
|
||||
return (
|
||||
typeof name === 'string' &&
|
||||
name.length > 0 &&
|
||||
typeof type === 'string' &&
|
||||
type.length > 0 &&
|
||||
typeof typeVersion === 'number' &&
|
||||
Number.isFinite(typeVersion) &&
|
||||
isNumberArray2(position)
|
||||
);
|
||||
}
|
||||
|
||||
function isConnectionsLike(value: unknown): boolean {
|
||||
if (!isUnknownRecord(value)) return false;
|
||||
for (const nodeConnections of Object.values(value)) {
|
||||
if (!isUnknownRecord(nodeConnections)) return false;
|
||||
for (const connectionTypeValue of Object.values(nodeConnections)) {
|
||||
if (!Array.isArray(connectionTypeValue)) return false;
|
||||
for (const output of connectionTypeValue) {
|
||||
if (!Array.isArray(output)) return false;
|
||||
for (const connection of output) {
|
||||
if (!isUnknownRecord(connection)) return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
function isSimpleWorkflow(value: unknown): value is SimpleWorkflow {
|
||||
if (!isUnknownRecord(value)) return false;
|
||||
if (!Array.isArray(value.nodes)) return false;
|
||||
if (!isConnectionsLike(value.connections)) return false;
|
||||
return value.nodes.every(isNodeLike);
|
||||
}
|
||||
|
||||
function getNotionIdFromMetadata(metadata: unknown): string | undefined {
|
||||
const record = asRecord(metadata);
|
||||
return typeof record.notion_id === 'string' ? record.notion_id : undefined;
|
||||
}
|
||||
|
||||
function getCategoriesFromMetadata(metadata: unknown): string[] | undefined {
|
||||
const record = asRecord(metadata);
|
||||
const categories = record.categories;
|
||||
if (!Array.isArray(categories)) return undefined;
|
||||
const strings = categories.filter((c): c is string => typeof c === 'string');
|
||||
return strings.length > 0 ? strings : undefined;
|
||||
}
|
||||
|
||||
function getEvalsFromExampleInputs(exampleInputs: unknown): { dos?: string; donts?: string } {
|
||||
const inputs = asRecord(exampleInputs);
|
||||
const evals = asRecord(inputs.evals);
|
||||
const result: { dos?: string; donts?: string } = {};
|
||||
if (typeof evals.dos === 'string') result.dos = evals.dos;
|
||||
if (typeof evals.donts === 'string') result.donts = evals.donts;
|
||||
return result;
|
||||
}
|
||||
|
||||
function isFeedback(value: unknown): value is Feedback {
|
||||
const kinds = new Set(['score', 'metric', 'detail'] as const);
|
||||
return (
|
||||
isUnknownRecord(value) &&
|
||||
typeof value.evaluator === 'string' &&
|
||||
typeof value.metric === 'string' &&
|
||||
typeof value.score === 'number' &&
|
||||
typeof value.kind === 'string' &&
|
||||
kinds.has(value.kind as 'score' | 'metric' | 'detail')
|
||||
);
|
||||
}
|
||||
|
||||
function exampleMatchesFilters(example: Example, filters: LangsmithExampleFilters): boolean {
|
||||
if (filters.notionId) {
|
||||
if (getNotionIdFromMetadata(example.metadata) !== filters.notionId) return false;
|
||||
}
|
||||
|
||||
if (filters.technique) {
|
||||
const categories = getCategoriesFromMetadata(example.metadata) ?? [];
|
||||
if (!categories.includes(filters.technique)) return false;
|
||||
}
|
||||
|
||||
if (filters.doSearch || filters.dontSearch) {
|
||||
const { dos, donts } = getEvalsFromExampleInputs(example.inputs);
|
||||
if (filters.doSearch) {
|
||||
const haystack = (dos ?? '').toLowerCase();
|
||||
if (!haystack.includes(filters.doSearch.toLowerCase())) return false;
|
||||
}
|
||||
if (filters.dontSearch) {
|
||||
const haystack = (donts ?? '').toLowerCase();
|
||||
if (!haystack.includes(filters.dontSearch.toLowerCase())) return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async function loadExamplesFromDataset(params: {
|
||||
lsClient: {
|
||||
readDataset: (args: { datasetName: string }) => Promise<{ id: string }>;
|
||||
listExamples: (args: { datasetId: string; limit?: number }) => AsyncIterable<Example>;
|
||||
};
|
||||
datasetName: string;
|
||||
maxExamples?: number;
|
||||
filters?: LangsmithExampleFilters;
|
||||
}): Promise<Example[]> {
|
||||
const { lsClient, datasetName, maxExamples, filters } = params;
|
||||
|
||||
const datasetInfo = await lsClient.readDataset({ datasetName });
|
||||
const matches: Example[] = [];
|
||||
|
||||
let scanned = 0;
|
||||
const listArgs: { datasetId: string; limit?: number } = { datasetId: datasetInfo.id };
|
||||
if (!filters && maxExamples) listArgs.limit = maxExamples;
|
||||
|
||||
for await (const example of lsClient.listExamples(listArgs)) {
|
||||
scanned++;
|
||||
if (filters && !exampleMatchesFilters(example, filters)) continue;
|
||||
matches.push(example);
|
||||
if (maxExamples && matches.length >= maxExamples) break;
|
||||
}
|
||||
|
||||
if (filters && matches.length === 0) {
|
||||
const filterSummary = [
|
||||
filters.notionId ? `id:${filters.notionId}` : undefined,
|
||||
filters.technique ? `technique:${filters.technique}` : undefined,
|
||||
filters.doSearch ? `do:${filters.doSearch}` : undefined,
|
||||
filters.dontSearch ? `dont:${filters.dontSearch}` : undefined,
|
||||
]
|
||||
.filter((v): v is string => v !== undefined)
|
||||
.join(', ');
|
||||
|
||||
throw new Error(
|
||||
`No examples matched filters (${filterSummary}) in dataset "${datasetName}" (scanned ${scanned})`,
|
||||
);
|
||||
}
|
||||
|
||||
if (!filters && maxExamples && matches.length === 0) {
|
||||
throw new Error(`No examples found in dataset "${datasetName}"`);
|
||||
}
|
||||
|
||||
return matches;
|
||||
}
|
||||
|
||||
async function resolveLangsmithData(params: {
|
||||
dataset: string;
|
||||
langsmithOptions: LangsmithRunConfig['langsmithOptions'];
|
||||
lsClient: {
|
||||
readDataset: (args: { datasetName: string }) => Promise<{ id: string }>;
|
||||
listExamples: (args: { datasetId: string; limit?: number }) => AsyncIterable<Example>;
|
||||
};
|
||||
logger: EvalLogger;
|
||||
}): Promise<string | Example[]> {
|
||||
const { dataset, langsmithOptions, lsClient, logger } = params;
|
||||
|
||||
const datasetName = dataset;
|
||||
const maxExamples = langsmithOptions.maxExamples;
|
||||
const filters = langsmithOptions.filters;
|
||||
|
||||
const shouldLoadExamples =
|
||||
(typeof maxExamples === 'number' && maxExamples > 0) || filters !== undefined;
|
||||
|
||||
if (!shouldLoadExamples) return datasetName;
|
||||
|
||||
logger.info(
|
||||
filters
|
||||
? `Loading examples from dataset "${datasetName}" with filters...`
|
||||
: `Loading up to ${maxExamples} examples from dataset "${datasetName}"...`,
|
||||
);
|
||||
|
||||
try {
|
||||
return await loadExamplesFromDataset({
|
||||
lsClient,
|
||||
datasetName,
|
||||
maxExamples,
|
||||
filters,
|
||||
});
|
||||
} catch (error) {
|
||||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||||
if (
|
||||
errorMessage.startsWith('No examples matched filters') ||
|
||||
errorMessage.startsWith('No examples found in dataset')
|
||||
) {
|
||||
throw error instanceof Error ? error : new Error(errorMessage);
|
||||
}
|
||||
throw new Error(`Dataset "${datasetName}" not found: ${errorMessage}`);
|
||||
}
|
||||
}
|
||||
|
||||
function extractContextFromLangsmithInputs(inputs: unknown): TestCaseContext {
|
||||
const record = asRecord(inputs);
|
||||
const context: TestCaseContext = {};
|
||||
|
||||
if (typeof record.dos === 'string') context.dos = record.dos;
|
||||
if (typeof record.donts === 'string') context.donts = record.donts;
|
||||
|
||||
// Support both legacy referenceWorkflow (single) and referenceWorkflows (array) from dataset
|
||||
if (
|
||||
Array.isArray(record.referenceWorkflows) &&
|
||||
record.referenceWorkflows.every((wf) => isSimpleWorkflow(wf))
|
||||
) {
|
||||
context.referenceWorkflows = record.referenceWorkflows;
|
||||
} else if (isSimpleWorkflow(record.referenceWorkflow)) {
|
||||
// Convert legacy single reference to array
|
||||
context.referenceWorkflows = [record.referenceWorkflow];
|
||||
}
|
||||
|
||||
return context;
|
||||
}
|
||||
|
||||
/**
 * Run one local test case end-to-end: generate a workflow, score it with all
 * evaluators, and report the result through the lifecycle hooks and the
 * optional artifact saver.
 *
 * Never rejects: any generation/evaluation failure is converted into an
 * 'error'-status result carrying a single zero-score runner feedback item.
 */
async function runLocalExample(args: {
	index: number;
	total: number;
	testCase: TestCase;
	generateWorkflow: (prompt: string) => Promise<SimpleWorkflow>;
	evaluators: Array<Evaluator<EvaluationContext>>;
	globalContext?: GlobalRunContext;
	passThreshold: number;
	timeoutMs: number | undefined;
	lifecycle?: Partial<EvaluationLifecycle>;
	artifactSaver?: ArtifactSaver | null;
}): Promise<ExampleResult> {
	const {
		index,
		total,
		testCase,
		generateWorkflow,
		evaluators,
		globalContext,
		passThreshold,
		timeoutMs,
		lifecycle,
		artifactSaver,
	} = args;

	const startTime = Date.now();
	lifecycle?.onExampleStart?.(index, total, testCase.prompt);

	try {
		// Generate the workflow, respecting the optional concurrency limiter
		// and the per-operation timeout.
		const genStartTime = Date.now();
		const workflow = await runWithOptionalLimiter(async () => {
			return await withTimeout({
				promise: generateWorkflow(testCase.prompt),
				timeoutMs,
				label: 'workflow_generation',
			});
		}, globalContext?.llmCallLimiter);
		const genDurationMs = Date.now() - genStartTime;
		lifecycle?.onWorkflowGenerated?.(workflow, genDurationMs);

		// Merge global and test-case context for the evaluators.
		const context = buildContext({
			prompt: testCase.prompt,
			globalContext: {
				...(globalContext ?? {}),
				timeoutMs,
			},
			testCaseContext: testCase.context,
			referenceWorkflows: testCase.referenceWorkflows,
		});

		// Run all evaluators in parallel; individual failures become feedback.
		const evalStartTime = Date.now();
		const feedback = await evaluateWithPlugins(workflow, evaluators, context, timeoutMs, lifecycle);
		const evalDurationMs = Date.now() - evalStartTime;

		// Score the example; any error feedback forces 'error' status.
		const score = calculateExampleScore(feedback);
		const status = hasErrorFeedback(feedback) ? 'error' : determineStatus({ score, passThreshold });
		const durationMs = Date.now() - startTime;

		const result: ExampleResult = {
			index,
			prompt: testCase.prompt,
			status,
			score,
			feedback,
			durationMs,
			generationDurationMs: genDurationMs,
			evaluationDurationMs: evalDurationMs,
			workflow,
		};

		artifactSaver?.saveExample(result);
		lifecycle?.onExampleComplete?.(index, result);
		return result;
	} catch (error) {
		// Convert the failure into an 'error' result so the run can continue.
		const durationMs = Date.now() - startTime;
		const errorMessage = error instanceof Error ? error.message : String(error);
		const result: ExampleResult = {
			index,
			prompt: testCase.prompt,
			status: 'error',
			score: 0,
			feedback: [
				{
					evaluator: 'runner',
					metric: 'error',
					score: 0,
					kind: 'score',
					comment: errorMessage,
				},
			],
			durationMs,
			error: errorMessage,
		};

		artifactSaver?.saveExample(result);
		lifecycle?.onExampleComplete?.(index, result);
		return result;
	}
}
|
||||
|
||||
/**
|
||||
* Run evaluation in local mode.
|
||||
*/
|
||||
function createArtifactSaverIfRequested(args: {
|
||||
outputDir?: string;
|
||||
logger: EvalLogger;
|
||||
}): ArtifactSaver | null {
|
||||
const { outputDir, logger } = args;
|
||||
if (!outputDir) return null;
|
||||
return createArtifactSaver({ outputDir, logger });
|
||||
}
|
||||
|
||||
async function runLocalDataset(params: {
|
||||
testCases: TestCase[];
|
||||
generateWorkflow: (prompt: string) => Promise<SimpleWorkflow>;
|
||||
evaluators: Array<Evaluator<EvaluationContext>>;
|
||||
globalContext?: GlobalRunContext;
|
||||
passThreshold: number;
|
||||
timeoutMs: number | undefined;
|
||||
lifecycle?: Partial<EvaluationLifecycle>;
|
||||
artifactSaver: ArtifactSaver | null;
|
||||
}): Promise<ExampleResult[]> {
|
||||
const {
|
||||
testCases,
|
||||
generateWorkflow,
|
||||
evaluators,
|
||||
globalContext,
|
||||
passThreshold,
|
||||
timeoutMs,
|
||||
lifecycle,
|
||||
artifactSaver,
|
||||
} = params;
|
||||
|
||||
const results: ExampleResult[] = [];
|
||||
for (let i = 0; i < testCases.length; i++) {
|
||||
const testCase = testCases[i];
|
||||
const index = i + 1;
|
||||
const result = await runLocalExample({
|
||||
index,
|
||||
total: testCases.length,
|
||||
testCase,
|
||||
generateWorkflow,
|
||||
evaluators,
|
||||
globalContext,
|
||||
passThreshold,
|
||||
timeoutMs,
|
||||
lifecycle,
|
||||
artifactSaver,
|
||||
});
|
||||
results.push(result);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
function buildRunSummary(results: ExampleResult[]): RunSummary {
|
||||
const passed = results.filter((r) => r.status === 'pass').length;
|
||||
const failed = results.filter((r) => r.status === 'fail').length;
|
||||
const errors = results.filter((r) => r.status === 'error').length;
|
||||
|
||||
const averageScore =
|
||||
results.length > 0 ? results.reduce((sum, r) => sum + r.score, 0) / results.length : 0;
|
||||
const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
|
||||
|
||||
return {
|
||||
totalExamples: results.length,
|
||||
passed,
|
||||
failed,
|
||||
errors,
|
||||
averageScore,
|
||||
totalDurationMs,
|
||||
};
|
||||
}
|
||||
|
||||
async function runLocal(config: LocalRunConfig): Promise<RunSummary> {
|
||||
const {
|
||||
dataset,
|
||||
generateWorkflow,
|
||||
evaluators,
|
||||
context: globalContext,
|
||||
passThreshold = DEFAULT_PASS_THRESHOLD,
|
||||
timeoutMs,
|
||||
lifecycle,
|
||||
outputDir,
|
||||
logger,
|
||||
} = config;
|
||||
|
||||
const testCases: TestCase[] = dataset;
|
||||
if (testCases.length === 0) {
|
||||
logger.warn('No test cases provided');
|
||||
}
|
||||
|
||||
const effectiveGlobalContext: GlobalRunContext = {
|
||||
...(globalContext ?? {}),
|
||||
llmCallLimiter: globalContext?.llmCallLimiter ?? pLimit(4),
|
||||
timeoutMs,
|
||||
};
|
||||
|
||||
// Create artifact saver if outputDir is provided
|
||||
const artifactSaver = createArtifactSaverIfRequested({ outputDir, logger });
|
||||
|
||||
lifecycle?.onStart?.(config);
|
||||
|
||||
const results = await runLocalDataset({
|
||||
testCases,
|
||||
generateWorkflow,
|
||||
evaluators,
|
||||
globalContext: effectiveGlobalContext,
|
||||
passThreshold,
|
||||
timeoutMs,
|
||||
lifecycle,
|
||||
artifactSaver,
|
||||
});
|
||||
const summary = buildRunSummary(results);
|
||||
|
||||
// Save summary to disk if outputDir is provided
|
||||
artifactSaver?.saveSummary(summary, results);
|
||||
|
||||
lifecycle?.onEnd?.(summary);
|
||||
|
||||
return summary;
|
||||
}
|
||||
|
||||
/**
 * Output from LangSmith target function.
 * Returned per example so the feedback extractor can read `feedback` back out
 * of the run outputs.
 */
interface LangsmithTargetOutput {
	// The generated workflow (or an empty placeholder workflow on error).
	workflow: SimpleWorkflow;
	// The prompt the workflow was generated from.
	prompt: string;
	// All evaluator feedback collected for this example.
	feedback: Feedback[];
}
|
||||
|
||||
/**
 * Input from LangSmith dataset.
 * Supports both direct prompt string and messages array format.
 */
interface LangsmithDatasetInput {
	// Direct prompt text (preferred when present).
	prompt?: string;
	// Alternative format: chat messages; the first message's content is used.
	messages?: BaseMessage[];
	// Per-example evaluation context (e.g. dos/donts) stored under `evals`.
	evals?: Record<string, unknown>;
	// Datasets may carry arbitrary extra fields.
	[key: string]: unknown;
}
|
||||
|
||||
/**
|
||||
* Extract prompt from dataset input.
|
||||
* Supports both direct prompt and messages array format.
|
||||
*/
|
||||
function extractPrompt(inputs: LangsmithDatasetInput): string {
|
||||
// Direct prompt string
|
||||
if (inputs.prompt && typeof inputs.prompt === 'string') {
|
||||
return inputs.prompt;
|
||||
}
|
||||
|
||||
// Messages array format
|
||||
if (inputs.messages && Array.isArray(inputs.messages) && inputs.messages.length > 0) {
|
||||
return extractMessageContent(inputs.messages[0]);
|
||||
}
|
||||
|
||||
throw new Error('No prompt found in inputs - expected "prompt" string or "messages" array');
|
||||
}
|
||||
|
||||
function createLangsmithFeedbackExtractor(): (
|
||||
rootRun: Run,
|
||||
_example?: Example,
|
||||
) => Promise<Array<{ key: string; score: number; comment?: string }>> {
|
||||
return async (rootRun: Run, _example?: Example) => {
|
||||
const outputs = rootRun.outputs;
|
||||
const feedback =
|
||||
isUnknownRecord(outputs) &&
|
||||
isUnknownArray(outputs.feedback) &&
|
||||
outputs.feedback.every(isFeedback)
|
||||
? outputs.feedback
|
||||
: undefined;
|
||||
|
||||
if (!feedback) {
|
||||
return [
|
||||
{
|
||||
key: 'evaluationError',
|
||||
score: 0,
|
||||
comment: 'No feedback found in target output',
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
return feedback.map((fb) => toLangsmithEvaluationResult(fb));
|
||||
};
|
||||
}
|
||||
|
||||
function applyRepetitions(data: string | Example[], repetitions: number): string | Example[] {
|
||||
if (!Array.isArray(data) || repetitions <= 1) return data;
|
||||
return Array.from({ length: repetitions }, () => data).flat();
|
||||
}
|
||||
|
||||
function computeFilterMetadata(filters?: LangsmithExampleFilters): {
|
||||
runType: string;
|
||||
filterValue?: string;
|
||||
} {
|
||||
if (!filters) return { runType: 'full' };
|
||||
|
||||
const parts: string[] = [];
|
||||
const values: string[] = [];
|
||||
|
||||
if (filters.notionId) {
|
||||
parts.push('id');
|
||||
values.push(`id:${filters.notionId}`);
|
||||
}
|
||||
if (filters.technique) {
|
||||
parts.push('category');
|
||||
values.push(`category:${filters.technique}`);
|
||||
}
|
||||
if (filters.doSearch) {
|
||||
parts.push('do');
|
||||
values.push(`do:${filters.doSearch}`);
|
||||
}
|
||||
if (filters.dontSearch) {
|
||||
parts.push('dont');
|
||||
values.push(`dont:${filters.dontSearch}`);
|
||||
}
|
||||
|
||||
if (parts.length === 0) return { runType: 'full' };
|
||||
|
||||
return {
|
||||
runType: `by-${parts.join('-and-')}`,
|
||||
filterValue: values.join(' '),
|
||||
};
|
||||
}
|
||||
|
||||
function logLangsmithInputsSummary(logger: EvalLogger, effectiveData: string | Example[]): void {
|
||||
if (!Array.isArray(effectiveData)) {
|
||||
logger.verbose('Data source: dataset (streaming)');
|
||||
return;
|
||||
}
|
||||
|
||||
logger.verbose(`Data source: preloaded examples (${effectiveData.length})`);
|
||||
logger.verbose(
|
||||
`Example IDs in data: ${effectiveData
|
||||
.slice(0, 20)
|
||||
.map((e) => e.id)
|
||||
.join(', ')}`,
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Invoke LangSmith's evaluate() over the prepared data and flush pending trace
 * batches afterwards so no telemetry is lost on process exit.
 *
 * Logs start/end timings and how often the target was actually called.
 */
async function runLangsmithEvaluateAndFlush(params: {
	target: (inputs: LangsmithDatasetInput) => Promise<LangsmithTargetOutput>;
	effectiveData: string | Example[];
	feedbackExtractor: ReturnType<typeof createLangsmithFeedbackExtractor>;
	langsmithOptions: LangsmithRunConfig['langsmithOptions'];
	lsClient: LangsmithRunConfig['langsmithClient'];
	logger: EvalLogger;
	targetCallCount: () => number;
}): Promise<void> {
	const {
		target,
		effectiveData,
		feedbackExtractor,
		langsmithOptions,
		lsClient,
		logger,
		targetCallCount,
	} = params;

	// An array means examples were pre-loaded; a string means SDK-side streaming.
	const exampleCount = Array.isArray(effectiveData) ? effectiveData.length : 'dataset';

	logger.info(
		`Starting LangSmith evaluate() with ${exampleCount} examples, ${langsmithOptions.repetitions} repetitions, concurrency ${langsmithOptions.concurrency}...`,
	);

	// Tag the experiment with how the example set was selected.
	const { runType, filterValue } = computeFilterMetadata(langsmithOptions.filters);

	const evalStartTime = Date.now();
	await evaluate(target, {
		data: effectiveData,
		evaluators: [feedbackExtractor],
		experimentPrefix: langsmithOptions.experimentName,
		// Repetitions are applied explicitly when pre-loading examples to keep behavior consistent.
		// When streaming from a dataset name, the SDK may support repetitions internally.
		...(!Array.isArray(effectiveData) &&
			langsmithOptions.repetitions > 1 && { numRepetitions: langsmithOptions.repetitions }),
		maxConcurrency: langsmithOptions.concurrency,
		client: lsClient,
		metadata: {
			repetitions: langsmithOptions.repetitions,
			concurrency: langsmithOptions.concurrency,
			runType,
			...(filterValue && { filterValue }),
			...langsmithOptions.experimentMetadata,
		},
	});
	logger.info(
		`Evaluation completed in ${((Date.now() - evalStartTime) / 1000).toFixed(1)}s (target called ${targetCallCount()} times)`,
	);

	// Flush pending traces to ensure all data is sent to LangSmith
	logger.verbose('Flushing pending trace batches...');
	const flushStartTime = Date.now();
	await lsClient.awaitPendingTraceBatches();
	logger.verbose(`Flush completed in ${((Date.now() - flushStartTime) / 1000).toFixed(1)}s`);
}
|
||||
|
||||
/**
 * Run evaluation in LangSmith mode.
 *
 * Generation + evaluation both happen inside the target function handed to
 * evaluate(), so LangSmith records one run per example; the feedback extractor
 * then lifts evaluator feedback out of the run outputs. Local statistics are
 * accumulated in parallel so a summary (and optional on-disk artifacts) can be
 * produced without reading results back from LangSmith.
 */
async function runLangsmith(config: LangsmithRunConfig): Promise<RunSummary> {
	const {
		dataset,
		generateWorkflow,
		evaluators,
		context: globalContext,
		outputDir,
		passThreshold = DEFAULT_PASS_THRESHOLD,
		timeoutMs,
		langsmithOptions,
		langsmithClient: lsClient,
		lifecycle,
		logger,
	} = config;

	// Enable tracing (required in langsmith 0.4.x)
	process.env.LANGSMITH_TRACING = 'true';

	lifecycle?.onStart?.(config);

	// LLM-call concurrency defaults to the run's configured concurrency.
	const effectiveGlobalContext: GlobalRunContext = {
		...(globalContext ?? {}),
		llmCallLimiter: globalContext?.llmCallLimiter ?? pLimit(langsmithOptions.concurrency),
		timeoutMs,
	};

	// Results are captured in memory only when artifacts will be written.
	const artifactSaver = createArtifactSaverIfRequested({ outputDir, logger });
	const capturedResults: ExampleResult[] | null = artifactSaver ? [] : null;

	// Create target function that does ALL work (generation + evaluation)
	// NOTE: Do NOT wrap target with traceable() - evaluate() handles that automatically
	// and applies critical options (on_end callback, reference_example_id, client).
	// Only wrap inner operations with traceable() for child traces.
	let targetCallCount = 0;
	let totalExamples = 0;
	// Mutable run-level counters updated by the target closure.
	const stats = {
		total: 0,
		passed: 0,
		failed: 0,
		errors: 0,
		scoreSum: 0,
		durationSumMs: 0,
	};
	const target = async (inputs: LangsmithDatasetInput): Promise<LangsmithTargetOutput> => {
		targetCallCount++;
		const index = targetCallCount;
		// Extract prompt from inputs (supports both direct prompt and messages array)
		const prompt = extractPrompt(inputs);
		const { evals: datasetContext, ...rest } = inputs;

		lifecycle?.onExampleStart?.(index, totalExamples, prompt);
		const startTime = Date.now();
		const genStart = Date.now();

		try {
			const limiter = effectiveGlobalContext.llmCallLimiter;

			// Generate workflow - wrapped in traceable for proper child trace visibility
			const traceableGenerate = traceable(
				async () =>
					await runWithOptionalLimiter(async () => {
						return await withTimeout({
							promise: generateWorkflow(prompt),
							timeoutMs,
							label: 'workflow_generation',
						});
					}, limiter),
				{
					name: 'workflow_generation',
					run_type: 'chain',
					client: lsClient,
				},
			);
			const workflow = await traceableGenerate();
			const genDurationMs = Date.now() - genStart;
			lifecycle?.onWorkflowGenerated?.(workflow, genDurationMs);

			// Merge dataset-level `evals` context with any remaining input fields.
			const extracted = extractContextFromLangsmithInputs({
				...asRecord(datasetContext),
				...asRecord(rest),
			});
			const context = buildContext({
				prompt,
				globalContext: effectiveGlobalContext,
				testCaseContext: extracted,
			});

			// Run all evaluators in parallel
			const evalStart = Date.now();
			const feedback = await evaluateWithPlugins(
				workflow,
				evaluators,
				context,
				timeoutMs,
				lifecycle,
			);
			const evalDurationMs = Date.now() - evalStart;
			const totalDurationMs = Date.now() - startTime;

			// Any error feedback forces 'error' status regardless of score.
			const score = calculateExampleScore(feedback);
			const status = hasErrorFeedback(feedback)
				? 'error'
				: determineStatus({ score, passThreshold });

			stats.total++;
			stats.scoreSum += score;
			stats.durationSumMs += totalDurationMs;

			if (status === 'pass') stats.passed++;
			else if (status === 'fail') stats.failed++;
			else stats.errors++;

			const result: ExampleResult = {
				index,
				prompt,
				status,
				score,
				feedback,
				durationMs: totalDurationMs,
				generationDurationMs: genDurationMs,
				evaluationDurationMs: evalDurationMs,
				workflow,
			};

			artifactSaver?.saveExample(result);
			capturedResults?.push(result);
			lifecycle?.onExampleComplete?.(index, result);

			// The returned outputs are what the feedback extractor reads.
			return {
				workflow,
				prompt,
				feedback,
			};
		} catch (error) {
			// Convert failures into an 'error' result with a placeholder workflow
			// so evaluate() still receives well-formed outputs for this example.
			const errorMessage = error instanceof Error ? error.message : String(error);
			const workflow: SimpleWorkflow = { name: 'Evaluation Error', nodes: [], connections: {} };
			const feedback: Feedback[] = [
				{
					evaluator: 'runner',
					metric: 'error',
					score: 0,
					kind: 'score',
					comment: errorMessage,
				},
			];

			const totalDurationMs = Date.now() - startTime;
			const genDurationMs = Date.now() - genStart;
			stats.total++;
			stats.errors++;
			stats.durationSumMs += totalDurationMs;
			const result: ExampleResult = {
				index,
				prompt,
				status: 'error',
				score: 0,
				feedback,
				durationMs: totalDurationMs,
				generationDurationMs: genDurationMs,
				workflow,
				error: errorMessage,
			};

			artifactSaver?.saveExample(result);
			capturedResults?.push(result);
			lifecycle?.onExampleComplete?.(index, result);

			return { workflow, prompt, feedback };
		}
	};

	const feedbackExtractor = createLangsmithFeedbackExtractor();

	// Load examples if maxExamples is set
	if (typeof dataset !== 'string') {
		throw new Error('LangSmith mode requires dataset to be a dataset name string');
	}

	let data = await resolveLangsmithData({ dataset, langsmithOptions, lsClient, logger });
	// Defensive: if maxExamples/filters were requested but we still got a dataset name,
	// fall back to preloading so we can honor limits instead of streaming everything.
	if (
		typeof data === 'string' &&
		((langsmithOptions.maxExamples ?? 0) > 0 || langsmithOptions.filters !== undefined)
	) {
		data = await loadExamplesFromDataset({
			lsClient,
			datasetName: data,
			maxExamples: langsmithOptions.maxExamples,
			filters: langsmithOptions.filters,
		});
	}

	const effectiveData = applyRepetitions(data, langsmithOptions.repetitions);

	// 0 when streaming: the total example count is unknown up front.
	totalExamples = Array.isArray(effectiveData) ? effectiveData.length : 0;

	logLangsmithInputsSummary(logger, effectiveData);
	await runLangsmithEvaluateAndFlush({
		target,
		effectiveData,
		feedbackExtractor,
		langsmithOptions,
		lsClient,
		logger,
		targetCallCount: () => targetCallCount,
	});

	// Return placeholder summary - LangSmith handles actual results
	const summary: RunSummary = {
		totalExamples: stats.total,
		passed: stats.passed,
		failed: stats.failed,
		errors: stats.errors,
		averageScore: stats.total > 0 ? stats.scoreSum / stats.total : 0,
		totalDurationMs: stats.durationSumMs,
	};

	if (artifactSaver && capturedResults) {
		artifactSaver.saveSummary(summary, capturedResults);
	}

	lifecycle?.onEnd?.(summary);

	return summary;
}
|
||||
|
||||
/**
|
||||
* Main entry point for running evaluations.
|
||||
*/
|
||||
export async function runEvaluation(config: RunConfig): Promise<RunSummary> {
|
||||
if (config.mode === 'langsmith') {
|
||||
return await runLangsmith(config);
|
||||
}
|
||||
return await runLocal(config);
|
||||
}
|
||||
|
|
@ -0,0 +1,228 @@
|
|||
/**
|
||||
* Score Calculation Utilities
|
||||
*
|
||||
* Provides functions for calculating weighted scores and aggregating
|
||||
* feedback from multiple evaluators.
|
||||
*/
|
||||
|
||||
import type { Feedback } from './harness-types';
|
||||
|
||||
/**
 * Weights for each evaluator type.
 *
 * Keys are evaluator name prefixes (e.g. 'llm-judge'); values are the
 * relative weight applied to that evaluator's average score when computing
 * the weighted overall score.
 */
export interface ScoreWeights {
	[evaluatorPrefix: string]: number;
}
|
||||
|
||||
/**
 * Result of score aggregation.
 *
 * Produced by `aggregateScores` from a flat list of feedback items.
 */
export interface AggregatedScore {
	/** Weighted overall score (0-1) */
	overall: number;
	/** Average score per evaluator */
	byEvaluator: Record<string, number>;
	/** Average score per category */
	byCategory: Record<string, number>;
}
|
||||
|
||||
/**
 * Parsed feedback key structure.
 *
 * A feedback key is dot-separated: `evaluator.category[.subcategory]`
 * (see `parseFeedbackKey`).
 */
export interface FeedbackKeyParts {
	// First segment, e.g. 'llm-judge'.
	evaluator: string;
	// Second segment; empty string when the key has no category part.
	category: string;
	// Optional third segment, e.g. 'majorityPass'.
	subcategory?: string;
}
|
||||
|
||||
/**
 * Default weights for standard evaluators (cross-evaluator weighting).
 *
 * This is the *harness-level* weighting between evaluators like `llm-judge`,
 * `programmatic`, and `pairwise`. It is independent from any evaluator-internal
 * weighting (e.g. LLM judge category weights).
 * Weights should sum to approximately 1.0.
 */
export const DEFAULT_EVALUATOR_WEIGHTS: ScoreWeights = {
	'llm-judge': 0.35,
	programmatic: 0.25,
	pairwise: 0.25,
	similarity: 0.15,
};

/**
 * @deprecated Use `DEFAULT_EVALUATOR_WEIGHTS` (kept for backwards compatibility within the package).
 */
export const DEFAULT_WEIGHTS: ScoreWeights = DEFAULT_EVALUATOR_WEIGHTS;

/** Default weight for unknown evaluators */
// Applied by calculateWeightedScore when an evaluator has no entry in the weights map.
const UNKNOWN_EVALUATOR_WEIGHT = 0.1;
|
||||
|
||||
/**
|
||||
* Parse a feedback key into its component parts.
|
||||
*
|
||||
* @example
|
||||
* parseFeedbackKey('llm-judge.functionality')
|
||||
* // => { evaluator: 'llm-judge', category: 'functionality' }
|
||||
*
|
||||
* parseFeedbackKey('pairwise.gen1.majorityPass')
|
||||
* // => { evaluator: 'pairwise', category: 'gen1', subcategory: 'majorityPass' }
|
||||
*/
|
||||
export function parseFeedbackKey(key: string): FeedbackKeyParts {
|
||||
const parts = key.split('.');
|
||||
return {
|
||||
evaluator: parts[0],
|
||||
category: parts[1] ?? '',
|
||||
subcategory: parts[2],
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the category from a feedback key.
|
||||
*
|
||||
* @example
|
||||
* extractCategory('llm-judge.functionality') // => 'functionality'
|
||||
* extractCategory('programmatic.trigger') // => 'trigger'
|
||||
*/
|
||||
export function extractCategory(key: string): string {
|
||||
return parseFeedbackKey(key).category;
|
||||
}
|
||||
|
||||
/**
|
||||
* Group feedback items by their evaluator prefix.
|
||||
*
|
||||
* @example
|
||||
* groupByEvaluator([
|
||||
* { evaluator: 'llm-judge', metric: 'a', score: 0.8 },
|
||||
* { evaluator: 'programmatic', metric: 'b', score: 0.6 },
|
||||
* ])
|
||||
* // => { 'llm-judge': [...], 'programmatic': [...] }
|
||||
*/
|
||||
export function groupByEvaluator(feedback: Feedback[]): Record<string, Feedback[]> {
|
||||
const grouped: Record<string, Feedback[]> = {};
|
||||
|
||||
for (const item of feedback) {
|
||||
const evaluator = item.evaluator;
|
||||
if (!grouped[evaluator]) {
|
||||
grouped[evaluator] = [];
|
||||
}
|
||||
grouped[evaluator].push(item);
|
||||
}
|
||||
|
||||
return grouped;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate average score for an array of feedback items.
|
||||
*/
|
||||
export function calculateFiniteAverage(items: Feedback[]): number {
|
||||
if (items.length === 0) return 0;
|
||||
const finiteScores = items.map((f) => f.score).filter((s) => Number.isFinite(s));
|
||||
if (finiteScores.length === 0) return 0;
|
||||
const total = finiteScores.reduce((sum, s) => sum + s, 0);
|
||||
return total / finiteScores.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pick which feedback items should be used for evaluator-level scoring.
|
||||
*
|
||||
* Order of preference:
|
||||
* - `kind: 'score'` (single authoritative score)
|
||||
* - `kind: 'metric'` (stable category metrics)
|
||||
* - any non-`detail` items
|
||||
* - otherwise, all items
|
||||
*/
|
||||
export function selectScoringItems(items: Feedback[]): Feedback[] {
|
||||
const scoreItems = items.filter((i) => i.kind === 'score');
|
||||
if (scoreItems.length > 0) return scoreItems;
|
||||
|
||||
const metricItems = items.filter((i) => i.kind === 'metric');
|
||||
if (metricItems.length > 0) return metricItems;
|
||||
|
||||
const nonDetailItems = items.filter((i) => i.kind !== 'detail');
|
||||
if (nonDetailItems.length > 0) return nonDetailItems;
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate weighted overall score from feedback.
|
||||
*
|
||||
* Each evaluator's average score is weighted according to the weights map.
|
||||
* Unknown evaluators receive the default weight.
|
||||
*
|
||||
* @param feedback - Array of feedback items
|
||||
* @param weights - Weight per evaluator (defaults to DEFAULT_WEIGHTS)
|
||||
* @returns Weighted average score (0-1)
|
||||
*/
|
||||
export function calculateWeightedScore(
|
||||
feedback: Feedback[],
|
||||
weights: ScoreWeights = DEFAULT_EVALUATOR_WEIGHTS,
|
||||
): number {
|
||||
if (feedback.length === 0) return 0;
|
||||
|
||||
const byEvaluator = groupByEvaluator(feedback);
|
||||
|
||||
let totalWeight = 0;
|
||||
let weightedSum = 0;
|
||||
|
||||
for (const [evaluator, items] of Object.entries(byEvaluator)) {
|
||||
const avgScore = calculateFiniteAverage(selectScoringItems(items));
|
||||
const weight = weights[evaluator] ?? UNKNOWN_EVALUATOR_WEIGHT;
|
||||
weightedSum += avgScore * weight;
|
||||
totalWeight += weight;
|
||||
}
|
||||
|
||||
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregate scores by evaluator and category.
|
||||
*
|
||||
* @param feedback - Array of feedback items
|
||||
* @returns Aggregated scores with overall, by-evaluator, and by-category breakdowns
|
||||
*/
|
||||
export function aggregateScores(feedback: Feedback[]): AggregatedScore {
|
||||
if (feedback.length === 0) {
|
||||
return {
|
||||
overall: 0,
|
||||
byEvaluator: {},
|
||||
byCategory: {},
|
||||
};
|
||||
}
|
||||
|
||||
// Calculate overall weighted score
|
||||
const overall = calculateWeightedScore(feedback);
|
||||
|
||||
// Calculate by-evaluator averages
|
||||
const byEvaluator: Record<string, number> = {};
|
||||
const grouped = groupByEvaluator(feedback);
|
||||
for (const [evaluator, items] of Object.entries(grouped)) {
|
||||
byEvaluator[evaluator] = calculateFiniteAverage(selectScoringItems(items));
|
||||
}
|
||||
|
||||
// Calculate by-category averages
|
||||
const byCategory: Record<string, number> = {};
|
||||
const categoryGroups: Record<string, Feedback[]> = {};
|
||||
|
||||
for (const item of feedback) {
|
||||
if (item.kind === 'detail') continue;
|
||||
const category = item.metric.split('.')[0] ?? '';
|
||||
if (category) {
|
||||
if (!categoryGroups[category]) {
|
||||
categoryGroups[category] = [];
|
||||
}
|
||||
categoryGroups[category].push(item);
|
||||
}
|
||||
}
|
||||
|
||||
for (const [category, items] of Object.entries(categoryGroups)) {
|
||||
byCategory[category] = calculateFiniteAverage(items);
|
||||
}
|
||||
|
||||
return {
|
||||
overall,
|
||||
byEvaluator,
|
||||
byCategory,
|
||||
};
|
||||
}
|
||||
|
|
@ -1,262 +1,105 @@
|
|||
import type { BuilderFeatureFlags } from '@/workflow-builder-agent';
|
||||
|
||||
import { runCliEvaluation } from './cli/runner.js';
|
||||
import { runLangsmithEvaluation } from './langsmith/runner.js';
|
||||
import { runLocalPairwiseEvaluation, runPairwiseLangsmithEvaluation } from './pairwise/runner.js';
|
||||
import { loadTestCasesFromCsv } from './utils/csv-prompt-loader.js';
|
||||
|
||||
// Re-export for external use if needed
|
||||
export { runCliEvaluation } from './cli/runner.js';
|
||||
export { runLangsmithEvaluation } from './langsmith/runner.js';
|
||||
export {
|
||||
runLocalPairwiseEvaluation,
|
||||
runPairwiseLangsmithEvaluation,
|
||||
} from './pairwise/runner.js';
|
||||
export { runSingleTest } from './core/test-runner.js';
|
||||
export { setupTestEnvironment, createAgent } from './core/environment.js';
|
||||
|
||||
/** All valid CLI flags */
const VALID_FLAGS = [
	'--test-case',
	'--prompts-csv',
	'--repetitions',
	'--notion-id', // Backwards-compatible alias for --filter id:
	'--technique', // Backwards-compatible alias for --filter technique:
	'--filter', // Unified filter flag with key:value syntax
	'--judges',
	'--generations',
	'--concurrency',
	'--max-examples',
	'--verbose',
	'-v', // Short alias for --verbose
	'--name',
	'--output-dir',
	'--prompt',
	'--dos',
	'--donts',
	'--template-examples',
] as const;
|
||||
|
||||
/** Validate that all provided CLI flags are recognized */
|
||||
function validateCliArgs(): void {
|
||||
const args = process.argv.slice(2); // Skip node and script path
|
||||
|
||||
for (const arg of args) {
|
||||
// Skip values (non-flag arguments)
|
||||
if (!arg.startsWith('-')) continue;
|
||||
|
||||
// Handle --flag=value format
|
||||
const flagName = arg.includes('=') ? arg.split('=')[0] : arg;
|
||||
|
||||
if (!VALID_FLAGS.includes(flagName as (typeof VALID_FLAGS)[number])) {
|
||||
const validFlagsList = VALID_FLAGS.filter((f) => f.startsWith('--')).join('\n ');
|
||||
throw new Error(`Unknown flag: ${flagName}\n\nValid flags:\n ${validFlagsList}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Parse an integer flag with default value */
|
||||
function getIntFlag(flag: string, defaultValue: number, max?: number): number {
|
||||
const arg = getFlagValue(flag);
|
||||
if (!arg) return defaultValue;
|
||||
const parsed = parseInt(arg, 10);
|
||||
if (Number.isNaN(parsed) || parsed < 1) return defaultValue;
|
||||
return max ? Math.min(parsed, max) : parsed;
|
||||
}
|
||||
|
||||
/** Filter criteria parsed from --filter flags and their legacy aliases (see parseFilterFlags). */
interface FilterOptions {
	// Set by --filter "do:<value>".
	doSearch?: string;
	// Set by --filter "dont:<value>".
	dontSearch?: string;
	// Set by --filter "technique:<value>" or the --technique alias.
	technique?: string;
	// Set by --filter "id:<value>" or the --notion-id alias.
	notionId?: string;
}
|
||||
|
||||
/**
|
||||
* Parse --filter flags with key:value syntax.
|
||||
* Supports multiple --filter flags that are applied progressively.
|
||||
* Also handles backwards-compatible --technique and --notion-id aliases.
|
||||
* V2 Evaluation Harness
|
||||
*
|
||||
* @example
|
||||
* --filter "do:structured output" --filter "technique:data_transformation"
|
||||
* --filter "id:abc123" --filter "dont:hardcoded"
|
||||
* A factory-based, testable evaluation system for AI workflow generation.
|
||||
*
|
||||
* Key features:
|
||||
* - Factory pattern for evaluator creation
|
||||
* - Parallel evaluator execution
|
||||
* - Both local and LangSmith modes
|
||||
* - Centralized lifecycle hooks for logging
|
||||
* - Pre-computed feedback pattern for LangSmith compatibility
|
||||
*/
|
||||
function parseFilterFlags(): FilterOptions {
|
||||
const filters: FilterOptions = {};
|
||||
|
||||
// Extract values following --filter flags
|
||||
const filterValues = process.argv
|
||||
.map((arg, i, arr) => (arg === '--filter' ? arr[i + 1] : null))
|
||||
.filter((v): v is string => v !== null);
|
||||
// Core runner
|
||||
export { runEvaluation } from './harness/runner';
|
||||
|
||||
// Parse each filter (format: key:value)
|
||||
const filterPattern = /^(\w+):(.+)$/;
|
||||
// Types
|
||||
export type {
|
||||
Feedback,
|
||||
EvaluationContext,
|
||||
TestCaseContext,
|
||||
GlobalRunContext,
|
||||
Evaluator,
|
||||
TestCase,
|
||||
RunConfig,
|
||||
ExampleResult,
|
||||
RunSummary,
|
||||
EvaluationLifecycle,
|
||||
LangsmithOptions,
|
||||
} from './harness/harness-types';
|
||||
|
||||
for (const value of filterValues) {
|
||||
const match = value.match(filterPattern);
|
||||
if (!match) {
|
||||
throw new Error('Invalid --filter format. Expected: --filter "key:value"');
|
||||
}
|
||||
// Lifecycle
|
||||
export {
|
||||
createConsoleLifecycle,
|
||||
createQuietLifecycle,
|
||||
mergeLifecycles,
|
||||
type ConsoleLifecycleOptions,
|
||||
} from './harness/lifecycle';
|
||||
|
||||
const [, key, filterValue] = match;
|
||||
switch (key) {
|
||||
case 'do':
|
||||
filters.doSearch = filterValue;
|
||||
break;
|
||||
case 'dont':
|
||||
filters.dontSearch = filterValue;
|
||||
break;
|
||||
case 'technique':
|
||||
filters.technique = filterValue;
|
||||
break;
|
||||
case 'id':
|
||||
filters.notionId = filterValue;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Evaluator factories
|
||||
export {
|
||||
createLLMJudgeEvaluator,
|
||||
createProgrammaticEvaluator,
|
||||
createPairwiseEvaluator,
|
||||
createSimilarityEvaluator,
|
||||
type PairwiseEvaluatorOptions,
|
||||
type SimilarityEvaluatorOptions,
|
||||
} from './evaluators';
|
||||
|
||||
// Backwards-compatible aliases (--technique and --notion-id)
|
||||
// These are overridden by --filter if both are specified
|
||||
const techniqueAlias = getFlagValue('--technique');
|
||||
if (techniqueAlias && !filters.technique) {
|
||||
filters.technique = techniqueAlias;
|
||||
}
|
||||
// Output
|
||||
export {
|
||||
createArtifactSaver,
|
||||
type ArtifactSaver,
|
||||
type ArtifactSaverOptions,
|
||||
} from './harness/output';
|
||||
|
||||
const notionIdAlias = getFlagValue('--notion-id');
|
||||
if (notionIdAlias && !filters.notionId) {
|
||||
filters.notionId = notionIdAlias;
|
||||
}
|
||||
// Multi-generation utilities
|
||||
export {
|
||||
getMajorityThreshold,
|
||||
aggregateGenerations,
|
||||
type GenerationDetail,
|
||||
type MultiGenerationResult,
|
||||
} from './harness/multi-gen';
|
||||
|
||||
return filters;
|
||||
}
|
||||
// Trace filtering (re-exported from v1 for convenience)
|
||||
export {
|
||||
createTraceFilters,
|
||||
isMinimalTracingEnabled,
|
||||
type TraceFilters,
|
||||
} from './langsmith/trace-filters';
|
||||
|
||||
/** Parse all CLI arguments */
|
||||
function parseCliArgs() {
|
||||
validateCliArgs();
|
||||
// Score calculation utilities
|
||||
export {
|
||||
parseFeedbackKey,
|
||||
extractCategory,
|
||||
groupByEvaluator,
|
||||
calculateWeightedScore,
|
||||
aggregateScores,
|
||||
DEFAULT_EVALUATOR_WEIGHTS,
|
||||
DEFAULT_WEIGHTS,
|
||||
type ScoreWeights,
|
||||
type AggregatedScore,
|
||||
type FeedbackKeyParts,
|
||||
} from './harness/score-calculator';
|
||||
|
||||
return {
|
||||
testCaseId: process.argv.includes('--test-case')
|
||||
? process.argv[process.argv.indexOf('--test-case') + 1]
|
||||
: undefined,
|
||||
promptsCsvPath: getFlagValue('--prompts-csv') ?? process.env.PROMPTS_CSV_FILE,
|
||||
repetitions: getIntFlag('--repetitions', 1),
|
||||
filters: parseFilterFlags(),
|
||||
numJudges: getIntFlag('--judges', 3),
|
||||
numGenerations: getIntFlag('--generations', 1, 10),
|
||||
concurrency: getIntFlag('--concurrency', 5),
|
||||
// Use 0 as sentinel for "no limit", convert to undefined for cleaner API
|
||||
maxExamples: getIntFlag('--max-examples', 0) || undefined,
|
||||
verbose: process.argv.includes('--verbose') || process.argv.includes('-v'),
|
||||
experimentName: getFlagValue('--name'),
|
||||
outputDir: getFlagValue('--output-dir'),
|
||||
prompt: getFlagValue('--prompt'),
|
||||
dos: getFlagValue('--dos'),
|
||||
donts: getFlagValue('--donts'),
|
||||
};
|
||||
}
|
||||
// Report generation
|
||||
export {
|
||||
extractViolationSeverity,
|
||||
calculateReportMetrics,
|
||||
generateMarkdownReport,
|
||||
type ViolationSeverity,
|
||||
type ReportOptions,
|
||||
type ReportMetrics,
|
||||
} from './support/report-generator';
|
||||
|
||||
/**
|
||||
* Main entry point for evaluation
|
||||
* Determines which evaluation mode to run based on environment variables
|
||||
*/
|
||||
async function main(): Promise<void> {
|
||||
const useLangsmith = process.env.USE_LANGSMITH_EVAL === 'true';
|
||||
const usePairwiseEval = process.env.USE_PAIRWISE_EVAL === 'true';
|
||||
const args = parseCliArgs();
|
||||
// Test case generation
|
||||
export {
|
||||
createTestCaseGenerator,
|
||||
type TestCaseGeneratorOptions,
|
||||
type GeneratedTestCase,
|
||||
type TestCaseGenerator,
|
||||
} from './support/test-case-generator';
|
||||
|
||||
if (args.promptsCsvPath && (useLangsmith || usePairwiseEval)) {
|
||||
console.warn('CSV-driven evaluations are only supported in CLI mode. Ignoring --prompts-csv.');
|
||||
}
|
||||
|
||||
// Parse feature flags from environment variables or CLI arguments
|
||||
const featureFlags = parseFeatureFlags();
|
||||
|
||||
if (usePairwiseEval) {
|
||||
if (args.prompt) {
|
||||
// Local mode - run single evaluation without LangSmith
|
||||
await runLocalPairwiseEvaluation({
|
||||
prompt: args.prompt,
|
||||
criteria: { dos: args.dos ?? '', donts: args.donts ?? '' },
|
||||
numJudges: args.numJudges,
|
||||
numGenerations: args.numGenerations,
|
||||
verbose: args.verbose,
|
||||
outputDir: args.outputDir,
|
||||
featureFlags,
|
||||
});
|
||||
} else {
|
||||
// LangSmith mode
|
||||
await runPairwiseLangsmithEvaluation({
|
||||
repetitions: args.repetitions,
|
||||
...args.filters,
|
||||
numJudges: args.numJudges,
|
||||
numGenerations: args.numGenerations,
|
||||
verbose: args.verbose,
|
||||
experimentName: args.experimentName,
|
||||
concurrency: args.concurrency,
|
||||
maxExamples: args.maxExamples,
|
||||
featureFlags,
|
||||
});
|
||||
}
|
||||
} else if (useLangsmith) {
|
||||
await runLangsmithEvaluation(args.repetitions, featureFlags);
|
||||
} else {
|
||||
const csvTestCases = args.promptsCsvPath
|
||||
? loadTestCasesFromCsv(args.promptsCsvPath)
|
||||
: undefined;
|
||||
await runCliEvaluation({
|
||||
testCases: csvTestCases,
|
||||
testCaseFilter: args.testCaseId,
|
||||
repetitions: args.repetitions,
|
||||
featureFlags,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function getFlagValue(flag: string): string | undefined {
|
||||
const exactMatchIndex = process.argv.findIndex((arg) => arg === flag);
|
||||
if (exactMatchIndex !== -1) {
|
||||
const value = process.argv[exactMatchIndex + 1];
|
||||
if (!value || value.startsWith('--')) {
|
||||
throw new Error(`Flag ${flag} requires a value`);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
const withValue = process.argv.find((arg) => arg.startsWith(`${flag}=`));
|
||||
if (withValue) {
|
||||
const value = withValue.slice(flag.length + 1);
|
||||
if (!value) {
|
||||
throw new Error(`Flag ${flag} requires a value`);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse feature flags from environment variables or CLI arguments.
|
||||
* Environment variables:
|
||||
* - EVAL_FEATURE_TEMPLATE_EXAMPLES=true - Enable template examples feature
|
||||
* CLI arguments:
|
||||
* - --template-examples - Enable template examples feature
|
||||
*/
|
||||
function parseFeatureFlags(): BuilderFeatureFlags | undefined {
|
||||
const templateExamplesFromEnv = process.env.EVAL_FEATURE_TEMPLATE_EXAMPLES === 'true';
|
||||
const templateExamplesFromCli = process.argv.includes('--template-examples');
|
||||
|
||||
const templateExamples = templateExamplesFromEnv || templateExamplesFromCli;
|
||||
|
||||
// Only return feature flags object if at least one flag is set
|
||||
if (templateExamples) {
|
||||
return {
|
||||
templateExamples: templateExamples || undefined,
|
||||
};
|
||||
}
|
||||
|
||||
return undefined;
|
||||
}
|
||||
|
||||
// Run if called directly (i.e. `node <script>`), not when imported as a module.
if (require.main === module) {
	main().catch(console.error);
}
|
||||
// CSV loader utilities
|
||||
export { loadDefaultTestCases, getDefaultTestCaseIds } from './cli/csv-prompt-loader';
|
||||
|
|
|
|||
|
|
@ -1,288 +0,0 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import type { EvaluationResult as LangsmithEvaluationResult } from 'langsmith/evaluation';
|
||||
import type { Run, Example } from 'langsmith/schemas';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow.js';
|
||||
import { evaluateWorkflow } from '../chains/workflow-evaluator.js';
|
||||
import { programmaticEvaluation } from '../programmatic/programmatic-evaluation';
|
||||
import type { EvaluationInput, CategoryScore } from '../types/evaluation.js';
|
||||
import {
|
||||
isSimpleWorkflow,
|
||||
isValidPrompt,
|
||||
formatViolations,
|
||||
type UsageMetadata,
|
||||
} from '../types/langsmith.js';
|
||||
|
||||
// Helper to validate run outputs
|
||||
function validateRunOutputs(outputs: unknown): {
|
||||
workflow?: SimpleWorkflow;
|
||||
prompt?: string;
|
||||
referenceWorkflow?: SimpleWorkflow;
|
||||
usage?: Partial<UsageMetadata>;
|
||||
error?: string;
|
||||
} {
|
||||
if (!outputs || typeof outputs !== 'object') {
|
||||
return { error: 'No outputs found in run' };
|
||||
}
|
||||
|
||||
const runOutputs = outputs as Record<string, unknown>;
|
||||
|
||||
if (!isSimpleWorkflow(runOutputs.workflow)) {
|
||||
return { error: 'Invalid or missing workflow in outputs' };
|
||||
}
|
||||
|
||||
if (!isValidPrompt(runOutputs.prompt)) {
|
||||
return { error: 'Invalid or missing prompt in outputs' };
|
||||
}
|
||||
|
||||
// Extract usage metadata if available
|
||||
const usage = extractUsageMetadata(runOutputs.usage);
|
||||
|
||||
// Extract reference workflow if available
|
||||
let referenceWorkflow: SimpleWorkflow | undefined;
|
||||
if (runOutputs.referenceOutputs && typeof runOutputs.referenceOutputs === 'object') {
|
||||
const refOutputs = runOutputs.referenceOutputs as Record<string, unknown>;
|
||||
if (isSimpleWorkflow(refOutputs.workflowJSON)) {
|
||||
referenceWorkflow = refOutputs.workflowJSON;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
workflow: runOutputs.workflow,
|
||||
prompt: runOutputs.prompt,
|
||||
referenceWorkflow,
|
||||
usage,
|
||||
};
|
||||
}
|
||||
|
||||
// Helper to extract usage metadata
|
||||
function extractUsageMetadata(usage: unknown): Partial<UsageMetadata> {
|
||||
if (!usage || typeof usage !== 'object') return {};
|
||||
|
||||
const rawUsage = usage as Record<string, unknown>;
|
||||
const usageFieldMap: Record<string, keyof UsageMetadata> = {
|
||||
input_tokens: 'input_tokens',
|
||||
output_tokens: 'output_tokens',
|
||||
cache_creation_input_tokens: 'cache_creation_input_tokens',
|
||||
cache_read_input_tokens: 'cache_read_input_tokens',
|
||||
};
|
||||
|
||||
const result: Partial<UsageMetadata> = {};
|
||||
for (const [sourceKey, targetKey] of Object.entries(usageFieldMap)) {
|
||||
const value = rawUsage[sourceKey];
|
||||
if (typeof value === 'number') {
|
||||
result[targetKey] = value;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Helper to convert category scores to Langsmith results
|
||||
function categoryToResult(key: string, category: CategoryScore): LangsmithEvaluationResult {
|
||||
return {
|
||||
key,
|
||||
score: category.score,
|
||||
comment: formatViolations(category.violations),
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Creates a Langsmith evaluator function that uses the LLM-based workflow evaluator and programmatic evaluation.
 * @param llm - Language model to use for evaluation
 * @param parsedNodeTypes - Node types for programmatic evaluation
 * @returns Evaluator function compatible with Langsmith
 */
export function createLangsmithEvaluator(
	llm: BaseChatModel,
	parsedNodeTypes: INodeTypeDescription[],
): (rootRun: Run, example?: Example) => Promise<LangsmithEvaluationResult[]> {
	// eslint-disable-next-line complexity
	return async (rootRun: Run, example?: Example): Promise<LangsmithEvaluationResult[]> => {
		// Validate and extract outputs
		const validation = validateRunOutputs(rootRun.outputs);
		if (validation.error) {
			// Validation failures become a single zero-score result instead of throwing.
			return [
				{
					key: 'evaluationError',
					score: 0,
					comment: validation.error,
				},
			];
		}

		let referenceWorkflow: SimpleWorkflow | SimpleWorkflow[] | undefined = undefined;
		let referenceWorkflows: SimpleWorkflow[] | undefined = undefined;
		let preset: 'strict' | 'standard' | 'lenient' | undefined = undefined;
		// Extract reference workflow and preset from example outputs if available
		if (example?.outputs) {
			const exampleOutputs = example.outputs as Record<string, unknown>;
			// workflowJSON may be a single workflow or an array of candidate workflows.
			if (Array.isArray(exampleOutputs.workflowJSON)) {
				referenceWorkflows = [];
				for (const workflow of exampleOutputs.workflowJSON) {
					if (isSimpleWorkflow(workflow)) {
						referenceWorkflows.push(workflow);
					}
				}
			}
			if (isSimpleWorkflow(exampleOutputs.workflowJSON)) {
				referenceWorkflow = exampleOutputs.workflowJSON;
			}
			// Extract preset if available
			if (
				typeof exampleOutputs.preset === 'string' &&
				['strict', 'standard', 'lenient'].includes(exampleOutputs.preset)
			) {
				preset = exampleOutputs.preset as 'strict' | 'standard' | 'lenient';
			}
		}

		// Non-null assertions are safe here: validateRunOutputs returned no error.
		const evaluationInput: EvaluationInput = {
			userPrompt: validation.prompt!,
			generatedWorkflow: validation.workflow!,
			referenceWorkflow,
			referenceWorkflows,
			preset,
		};

		try {
			// Run LLM-based evaluation
			const evaluationResult = await evaluateWorkflow(llm, evaluationInput);

			// Run programmatic evaluation
			const programmaticResult = await programmaticEvaluation(evaluationInput, parsedNodeTypes);

			const results: LangsmithEvaluationResult[] = [];

			// Add core category scores
			const categories = [
				{ key: 'functionality', score: evaluationResult.functionality },
				{ key: 'connections', score: evaluationResult.connections },
				{ key: 'expressions', score: evaluationResult.expressions },
				{ key: 'nodeConfiguration', score: evaluationResult.nodeConfiguration },
			];

			for (const { key, score } of categories) {
				results.push(categoryToResult(key, score));
			}

			results.push(categoryToResult('efficiency', evaluationResult.efficiency));
			// Add sub-metrics
			results.push({
				key: 'efficiency.redundancyScore',
				score: evaluationResult.efficiency.redundancyScore,
			});
			results.push({
				key: 'efficiency.pathOptimization',
				score: evaluationResult.efficiency.pathOptimization,
			});
			results.push({
				key: 'efficiency.nodeCountEfficiency',
				score: evaluationResult.efficiency.nodeCountEfficiency,
			});

			results.push(categoryToResult('dataFlow', evaluationResult.dataFlow));

			results.push(categoryToResult('maintainability', evaluationResult.maintainability));
			results.push({
				key: 'maintainability.nodeNamingQuality',
				score: evaluationResult.maintainability.nodeNamingQuality,
			});
			results.push({
				key: 'maintainability.workflowOrganization',
				score: evaluationResult.maintainability.workflowOrganization,
			});
			results.push({
				key: 'maintainability.modularity',
				score: evaluationResult.maintainability.modularity,
			});

			// Add usage metadata if available
			const usageMetrics = [
				{ key: 'inputTokens', value: validation.usage?.input_tokens },
				{ key: 'outputTokens', value: validation.usage?.output_tokens },
				{ key: 'cacheCreationInputTokens', value: validation.usage?.cache_creation_input_tokens },
				{ key: 'cacheReadInputTokens', value: validation.usage?.cache_read_input_tokens },
			];

			for (const metric of usageMetrics) {
				if (metric.value !== undefined) {
					// Langsmith has a limitation on large scores (>99999) so we track in thousands
					results.push({ key: metric.key, score: metric.value / 1000 });
				}
			}

			// Add total prompt tokens for clarity (sum of all input token types)
			const totalPromptTokens =
				(validation.usage?.input_tokens ?? 0) +
				(validation.usage?.cache_creation_input_tokens ?? 0) +
				(validation.usage?.cache_read_input_tokens ?? 0);

			if (totalPromptTokens > 0) {
				results.push({
					key: 'totalPromptTokens',
					score: totalPromptTokens / 1000,
					comment: 'Total prompt size (fresh + cached + cache creation)',
				});
			}

			// Calculate and add cache hit rate if cache data is available
			if (validation.usage?.cache_read_input_tokens !== undefined) {
				const inputTokens = validation.usage.input_tokens ?? 0;
				const cacheCreationTokens = validation.usage.cache_creation_input_tokens ?? 0;
				const cacheReadTokens = validation.usage.cache_read_input_tokens ?? 0;

				const totalInputTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
				const cacheHitRate = totalInputTokens > 0 ? cacheReadTokens / totalInputTokens : 0;

				// Store as percentage (0-1 scale)
				results.push({
					key: 'cacheHitRate',
					score: cacheHitRate,
					comment: `${(cacheHitRate * 100).toFixed(1)}% of input tokens served from cache`,
				});
			}

			// Add structural similarity if applicable
			if (validation.referenceWorkflow && evaluationResult.structuralSimilarity.applicable) {
				results.push(
					categoryToResult('structuralSimilarity', evaluationResult.structuralSimilarity),
				);
			}

			// Add overall score
			results.push({
				key: 'overallScore',
				score: evaluationResult.overallScore,
				comment: evaluationResult.summary,
			});

			// Add programmatic evaluation scores
			results.push({
				key: 'programmatic.overall',
				score: programmaticResult.overallScore,
			});
			results.push(categoryToResult('programmatic.connections', programmaticResult.connections));
			results.push(categoryToResult('programmatic.trigger', programmaticResult.trigger));
			results.push(categoryToResult('programmatic.agentPrompt', programmaticResult.agentPrompt));
			results.push(categoryToResult('programmatic.tools', programmaticResult.tools));
			results.push(categoryToResult('programmatic.fromAi', programmaticResult.fromAi));

			// Add workflow similarity if available
			if (programmaticResult.similarity !== null && programmaticResult.similarity !== undefined) {
				results.push(categoryToResult('programmatic.similarity', programmaticResult.similarity));
			}

			return results;
		} catch (error) {
			// Evaluation errors are reported as a zero-score result so the run still records feedback.
			const errorMessage = error instanceof Error ? error.message : String(error);
			return [
				{
					key: 'evaluationError',
					score: 0,
					comment: `Evaluation failed: ${errorMessage}`,
				},
			];
		}
	};
}
|
||||
|
|
@ -1,199 +0,0 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { evaluate } from 'langsmith/evaluation';
|
||||
import { getLangchainCallbacks } from 'langsmith/langchain';
|
||||
import { traceable } from 'langsmith/traceable';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
import pc from 'picocolors';
|
||||
|
||||
import { createLangsmithEvaluator } from './evaluator';
|
||||
import type { BuilderFeatureFlags } from '../../src/workflow-builder-agent';
|
||||
import type { WorkflowState } from '../../src/workflow-state';
|
||||
import { EVAL_TYPES, EVAL_USERS, TRACEABLE_NAMES } from '../constants';
|
||||
import { setupTestEnvironment, createAgent } from '../core/environment';
|
||||
import {
|
||||
generateRunId,
|
||||
safeExtractUsage,
|
||||
isWorkflowStateValues,
|
||||
extractMessageContent,
|
||||
} from '../types/langsmith';
|
||||
import { consumeGenerator, formatHeader, getChatPayload } from '../utils/evaluation-helpers';
|
||||
|
||||
/**
|
||||
* Creates a workflow generation function for Langsmith evaluation
|
||||
* Uses traceable wrapper for proper LangSmith context propagation
|
||||
* @param parsedNodeTypes - Node types
|
||||
* @param llm - Language model
|
||||
* @param featureFlags - Optional feature flags to pass to the agent
|
||||
* @returns Function that generates workflows from inputs
|
||||
*/
|
||||
function createWorkflowGenerator(
	parsedNodeTypes: INodeTypeDescription[],
	llm: BaseChatModel,
	featureFlags?: BuilderFeatureFlags,
) {
	// Wrap the inner function with traceable for proper LangSmith context propagation
	const generateWorkflow = traceable(
		async (inputs: typeof WorkflowState.State) => {
			// Generate a unique ID for this evaluation run
			const runId = generateRunId();

			// Validate inputs: every dataset example must carry at least one message
			if (!inputs.messages || !Array.isArray(inputs.messages) || inputs.messages.length === 0) {
				throw new Error('No messages provided in inputs');
			}

			// Extract first message content safely (only the first message is used as the prompt)
			const firstMessage = inputs.messages[0];
			const messageContent = extractMessageContent(firstMessage);

			// Get LangChain callbacks linked to current traceable context.
			// This is the official bridge between LangSmith's traceable and LangChain callbacks.
			const callbacks = await getLangchainCallbacks();

			// Create agent for this run (no tracer - callbacks passed at invocation)
			const agent = createAgent({ parsedNodeTypes, llm, featureFlags });
			// Drain the chat stream to completion; only the final state is inspected below
			await consumeGenerator(
				agent.chat(
					getChatPayload({
						evalType: EVAL_TYPES.LANGSMITH,
						message: messageContent,
						workflowId: runId,
						featureFlags,
					}),
					EVAL_USERS.LANGSMITH,
					undefined, // abortSignal
					callbacks, // externalCallbacks for LangSmith tracing
				),
			);

			// Get generated workflow with validation
			const state = await agent.getState(runId, EVAL_USERS.LANGSMITH);

			// Validate state
			if (!state.values) {
				throw new Error('No values in agent state');
			}

			if (!isWorkflowStateValues(state.values)) {
				throw new Error('Invalid workflow state: workflow or messages missing');
			}

			const generatedWorkflow = state.values.workflowJSON;
			const messages = state.values.messages;

			// Extract usage metadata safely (summed token counts across all agent messages)
			const usage = safeExtractUsage(messages);

			return {
				workflow: generatedWorkflow,
				prompt: messageContent,
				usage,
			};
		},
		{ name: TRACEABLE_NAMES.WORKFLOW_GENERATION, run_type: 'chain' },
	);

	return generateWorkflow;
}
|
||||
|
||||
/**
|
||||
* Runs evaluation using Langsmith
|
||||
* @param repetitions - Number of times to run each example (default: 1)
|
||||
* @param featureFlags - Optional feature flags to pass to the agent
|
||||
*/
|
||||
export async function runLangsmithEvaluation(
	repetitions: number = 1,
	featureFlags?: BuilderFeatureFlags,
): Promise<void> {
	console.log(formatHeader('AI Workflow Builder Langsmith Evaluation', 70));
	if (repetitions > 1) {
		console.log(pc.yellow(`➔ Each example will be run ${repetitions} times`));
	}
	// Announce which feature flags are active so runs are distinguishable in logs
	if (featureFlags) {
		const enabledFlags = Object.entries(featureFlags)
			.filter(([, v]) => v === true)
			.map(([k]) => k);
		if (enabledFlags.length > 0) {
			console.log(pc.green(`➔ Feature flags enabled: ${enabledFlags.join(', ')}`));
		}
	}
	console.log();

	try {
		// Check for Langsmith API key
		if (!process.env.LANGSMITH_API_KEY) {
			throw new Error('LANGSMITH_API_KEY environment variable not set');
		}

		// Setup test environment
		const { parsedNodeTypes, llm, lsClient } = await setupTestEnvironment();
		// Note: Don't use the tracer from setupTestEnvironment() here.
		// LangSmith's evaluate() manages its own tracing context - passing a separate
		// tracer would create disconnected runs in a different project.

		if (!lsClient) {
			throw new Error('Langsmith client not initialized');
		}

		// Get dataset name from env or use default
		const datasetName = process.env.LANGSMITH_DATASET_NAME ?? 'workflow-builder-canvas-prompts';
		console.log(pc.blue(`➔ Using dataset: ${datasetName}`));

		// Verify dataset exists
		try {
			await lsClient.readDataset({ datasetName });
		} catch {
			// List available datasets for helpful error message
			const availableDatasets: string[] = [];
			for await (const dataset of lsClient.listDatasets()) {
				availableDatasets.push(`${dataset.name} (${dataset.id})`);
			}

			throw new Error(
				`Dataset "${datasetName}" not found. Available datasets: ${availableDatasets.join(', ') || 'none'}. ` +
					'Set LANGSMITH_DATASET_NAME environment variable to use a different dataset.',
			);
		}

		console.log();
		const startTime = Date.now();

		// Create workflow generation function
		// Uses traceable wrapper internally for proper LangSmith context propagation
		const generateWorkflow = createWorkflowGenerator(parsedNodeTypes, llm, featureFlags);

		// Create evaluator with both LLM-based and programmatic evaluation
		const evaluator = createLangsmithEvaluator(llm, parsedNodeTypes);

		// Run Langsmith evaluation
		const results = await evaluate(generateWorkflow, {
			data: datasetName,
			evaluators: [evaluator],
			maxConcurrency: 7,
			experimentPrefix: 'workflow-builder-evaluation',
			numRepetitions: repetitions,
			metadata: {
				evaluationType: 'llm-based',
				modelName: process.env.LLM_MODEL ?? 'default',
			},
		});

		const totalTime = Date.now() - startTime;
		console.log(pc.green(`✓ Evaluation completed in ${(totalTime / 1000).toFixed(1)}s`));

		// Display results information
		// NOTE(review): this prints a date-suffixed name, but LangSmith's evaluate()
		// presumably appends its own suffix to experimentPrefix - confirm the logged
		// name actually matches the experiment created in the dashboard.
		console.log('\nView detailed results in Langsmith dashboard');
		console.log(
			`Experiment name: workflow-builder-evaluation-${new Date().toISOString().split('T')[0]}`,
		);

		// Log summary of results if available
		if (results) {
			console.log(pc.dim('Evaluation run completed successfully'));
			console.log(pc.dim(`Dataset: ${datasetName}`));
		}
	} catch (error) {
		// CLI entry point: report and exit non-zero so CI marks the run as failed
		console.error(pc.red('✗ Langsmith evaluation failed:'), error);
		process.exit(1);
	}
}
|
||||
|
|
@ -0,0 +1,258 @@
|
|||
import type { KVMap } from 'langsmith/schemas';
|
||||
|
||||
import { isSimpleWorkflow } from './types';
|
||||
import type { EvalLogger } from '../harness/logger.js';
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Type guards
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Type guard: check if value is a non-null object (Record).
|
||||
*/
|
||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||
return typeof value === 'object' && value !== null;
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Constants
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Large state fields that should be filtered from traces.
|
||||
* These contribute most to payload bloat.
|
||||
*/
|
||||
const LARGE_STATE_FIELDS = ['cachedTemplates', 'parsedNodeTypes'] as const;

/**
 * Keys that indicate a LangChain serializable object.
 * These objects are passed through unchanged - copying them causes size inflation.
 */
const LANGCHAIN_SERIALIZABLE_KEYS = ['lc_serializable', 'lc_kwargs', 'lc_namespace'] as const;

/**
 * Large context fields within workflowContext that should be filtered
 * (replaced by short placeholder summaries in traces).
 */
const LARGE_CONTEXT_FIELDS = ['executionData', 'executionSchema', 'expressionValues'] as const;

/**
 * Node-count threshold above which a workflow is summarized
 * instead of being included in full in the trace.
 */
const WORKFLOW_SUMMARY_THRESHOLD = 20;
|
||||
|
||||
/**
|
||||
* Check if an object is a LangChain serializable object.
|
||||
* These objects should not be filtered as copying them causes size inflation.
|
||||
*/
|
||||
function isLangChainSerializable(obj: KVMap): boolean {
|
||||
return LANGCHAIN_SERIALIZABLE_KEYS.some((key) => key in obj);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if an object has any fields worth filtering.
|
||||
*/
|
||||
function hasFilterableFields(obj: KVMap): boolean {
|
||||
return (
|
||||
LARGE_STATE_FIELDS.some((field) => field in obj) ||
|
||||
'workflowContext' in obj ||
|
||||
'workflowJSON' in obj ||
|
||||
'workflow' in obj ||
|
||||
'input' in obj // LangChain model inputs can be large
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Summarize a workflow for minimal trace output.
|
||||
* Preserves node counts and names without full definitions.
|
||||
*/
|
||||
function summarizeWorkflow(workflow: unknown): Record<string, unknown> {
|
||||
if (!isSimpleWorkflow(workflow)) {
|
||||
return { unknown: true };
|
||||
}
|
||||
|
||||
return {
|
||||
nodeCount: workflow.nodes.length,
|
||||
nodeNames: workflow.nodes.map((n) => n.name).filter(Boolean),
|
||||
connectionCount: Object.keys(workflow.connections).length,
|
||||
name: workflow.name,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Summarize cached templates - just IDs and names, not full workflows.
|
||||
*/
|
||||
function summarizeCachedTemplates(templates: unknown[]): Array<Record<string, unknown>> {
|
||||
return templates.map((t) => {
|
||||
if (!isRecord(t)) return { unknown: true };
|
||||
return {
|
||||
templateId: t.templateId,
|
||||
name: t.name,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter large state fields in-place (mutates the object).
|
||||
* Shared logic for both input and output filtering.
|
||||
*/
|
||||
function filterLargeStateFields(obj: KVMap): void {
|
||||
for (const field of LARGE_STATE_FIELDS) {
|
||||
if (field in obj) {
|
||||
if (field === 'cachedTemplates' && Array.isArray(obj[field])) {
|
||||
obj[field] = summarizeCachedTemplates(obj[field] as unknown[]);
|
||||
} else if (field === 'parsedNodeTypes' && Array.isArray(obj[field])) {
|
||||
obj[field] = `[${(obj[field] as unknown[]).length} node types]`;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Summarize a large context field to a placeholder string.
|
||||
*/
|
||||
function summarizeContextField(key: string, value: unknown): string {
|
||||
switch (key) {
|
||||
case 'executionData':
|
||||
return '[execution data omitted]';
|
||||
case 'executionSchema':
|
||||
return `[${Array.isArray(value) ? value.length : 0} schemas]`;
|
||||
case 'expressionValues':
|
||||
return `[${typeof value === 'object' && value ? Object.keys(value).length : 0} expressions]`;
|
||||
default:
|
||||
return '[omitted]';
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter workflowContext object, summarizing large fields.
|
||||
*/
|
||||
function filterWorkflowContext(ctx: Record<string, unknown>): Record<string, unknown> {
|
||||
const filtered: Record<string, unknown> = {};
|
||||
|
||||
for (const [key, value] of Object.entries(ctx)) {
|
||||
if ((LARGE_CONTEXT_FIELDS as readonly string[]).includes(key)) {
|
||||
filtered[key] = summarizeContextField(key, value);
|
||||
} else if (key === 'currentWorkflow' && value) {
|
||||
filtered[key] = summarizeWorkflow(value);
|
||||
} else {
|
||||
filtered[key] = value;
|
||||
}
|
||||
}
|
||||
|
||||
return filtered;
|
||||
}
|
||||
|
||||
/**
|
||||
* Summarize a workflow field if it exceeds the node threshold.
|
||||
*/
|
||||
function summarizeLargeWorkflow(workflow: unknown): unknown {
|
||||
if (!isSimpleWorkflow(workflow)) {
|
||||
return workflow;
|
||||
}
|
||||
if (workflow.nodes.length > WORKFLOW_SUMMARY_THRESHOLD) {
|
||||
return summarizeWorkflow(workflow);
|
||||
}
|
||||
return workflow;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if minimal tracing is enabled.
|
||||
* Default: true (enabled by default for evaluations)
|
||||
* Set LANGSMITH_MINIMAL_TRACING=false to disable.
|
||||
*/
|
||||
export function isMinimalTracingEnabled(): boolean {
|
||||
const envValue = process.env.LANGSMITH_MINIMAL_TRACING;
|
||||
// Default to true if not set, only disable if explicitly set to 'false'
|
||||
return envValue !== 'false';
|
||||
}
|
||||
|
||||
/**
 * Trace filter functions used by LangSmith client configuration.
 * Both functions receive a run payload and return a (possibly) reduced copy.
 */
export interface TraceFilters {
	/** Filter function for hideInputs - applied to run inputs before upload. */
	filterInputs: (inputs: KVMap) => KVMap;
	/** Filter function for hideOutputs - applied to run outputs before upload. */
	filterOutputs: (outputs: KVMap) => KVMap;
}
|
||||
|
||||
/**
|
||||
* Creates trace filter functions.
|
||||
* @param logger - Optional logger for output (uses console.log if not provided)
|
||||
*/
|
||||
export function createTraceFilters(logger?: EvalLogger): TraceFilters {
|
||||
let hasLoggedFilteringActive = false;
|
||||
|
||||
const filterInputs = (inputs: KVMap): KVMap => {
|
||||
// Log once per client to confirm filtering is active
|
||||
if (!hasLoggedFilteringActive) {
|
||||
hasLoggedFilteringActive = true;
|
||||
const log = logger?.info ?? console.log;
|
||||
log('➔ LangSmith trace filtering: ACTIVE (set LANGSMITH_MINIMAL_TRACING=false to disable)');
|
||||
}
|
||||
|
||||
// Skip LangChain serializable objects - copying them causes size inflation
|
||||
if (isLangChainSerializable(inputs)) {
|
||||
return inputs;
|
||||
}
|
||||
|
||||
// Skip if no filterable fields - avoid unnecessary copy overhead
|
||||
if (!hasFilterableFields(inputs)) {
|
||||
return inputs;
|
||||
}
|
||||
|
||||
const filtered = { ...inputs };
|
||||
|
||||
// Handle large top-level fields
|
||||
filterLargeStateFields(filtered);
|
||||
|
||||
// Handle workflowContext if present
|
||||
if (isRecord(filtered.workflowContext)) {
|
||||
filtered.workflowContext = filterWorkflowContext(filtered.workflowContext);
|
||||
}
|
||||
|
||||
// Handle workflowJSON if present at top level
|
||||
if (filtered.workflowJSON && typeof filtered.workflowJSON === 'object') {
|
||||
filtered.workflowJSON = summarizeLargeWorkflow(filtered.workflowJSON);
|
||||
}
|
||||
|
||||
// Handle large 'input' field (LangChain model inputs with system prompts)
|
||||
if (filtered.input && typeof filtered.input === 'string' && filtered.input.length > 1000) {
|
||||
filtered.input = `[input truncated: ${filtered.input.length} chars]`;
|
||||
}
|
||||
|
||||
return filtered;
|
||||
};
|
||||
|
||||
const filterOutputs = (outputs: KVMap): KVMap => {
|
||||
// Skip LangChain serializable objects - copying them causes size inflation
|
||||
if (isLangChainSerializable(outputs)) {
|
||||
return outputs;
|
||||
}
|
||||
|
||||
// Check if there are any filterable fields in outputs
|
||||
const hasFilterableOutputFields =
|
||||
'workflow' in outputs || LARGE_STATE_FIELDS.some((field) => field in outputs);
|
||||
|
||||
// Skip if no filterable fields
|
||||
if (!hasFilterableOutputFields) {
|
||||
return outputs;
|
||||
}
|
||||
|
||||
const filtered = { ...outputs };
|
||||
|
||||
// Handle large state fields in outputs
|
||||
filterLargeStateFields(filtered);
|
||||
|
||||
// Summarize workflow outputs if present and large
|
||||
if (filtered.workflow && typeof filtered.workflow === 'object') {
|
||||
filtered.workflow = summarizeLargeWorkflow(filtered.workflow);
|
||||
}
|
||||
|
||||
// Keep feedback array as-is - it's essential for evaluation results
|
||||
|
||||
return filtered;
|
||||
};
|
||||
|
||||
return { filterInputs, filterOutputs };
|
||||
}
|
||||
|
|
@ -3,10 +3,6 @@ import type { BaseMessage } from '@langchain/core/messages';
|
|||
import { cleanContextTags } from '@/utils/stream-processor';
|
||||
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow';
|
||||
import type { AIMessageWithUsageMetadata } from '../../src/utils/token-usage';
|
||||
|
||||
// Define strict interfaces
|
||||
export type UsageMetadata = AIMessageWithUsageMetadata['response_metadata']['usage'];
|
||||
|
||||
export interface WorkflowOutput {
|
||||
workflow?: unknown;
|
||||
|
|
@ -20,37 +16,6 @@ export interface WorkflowStateValues {
|
|||
[key: string]: unknown;
|
||||
}
|
||||
|
||||
// Type guards - no coercion, just validation
|
||||
export function isMessageWithMetadata(message: BaseMessage): message is AIMessageWithUsageMetadata {
|
||||
return (
|
||||
message.response_metadata !== undefined &&
|
||||
message.response_metadata !== null &&
|
||||
typeof message.response_metadata === 'object'
|
||||
);
|
||||
}
|
||||
|
||||
export function hasUsageMetadata(metadata: { usage?: unknown }): metadata is {
|
||||
usage: Partial<UsageMetadata>;
|
||||
} {
|
||||
if (!metadata.usage || typeof metadata.usage !== 'object') {
|
||||
return false;
|
||||
}
|
||||
|
||||
const usage = metadata.usage as Record<string, unknown>;
|
||||
|
||||
// Validate each field is either undefined or a number
|
||||
const validFields = [
|
||||
'input_tokens',
|
||||
'output_tokens',
|
||||
'cache_read_input_tokens',
|
||||
'cache_creation_input_tokens',
|
||||
];
|
||||
|
||||
return validFields.every(
|
||||
(field) => usage[field] === undefined || typeof usage[field] === 'number',
|
||||
);
|
||||
}
|
||||
|
||||
export function isValidPrompt(value: unknown): value is string {
|
||||
return typeof value === 'string' && value.length > 0;
|
||||
}
|
||||
|
|
@ -71,32 +36,6 @@ export function isWorkflowStateValues(values: unknown): values is WorkflowStateV
|
|||
return Array.isArray(values.messages) && isSimpleWorkflow(values.workflowJSON);
|
||||
}
|
||||
|
||||
// Safe extraction without coercion
|
||||
export function safeExtractUsage(messages: BaseMessage[]): UsageMetadata {
|
||||
const defaultUsage: UsageMetadata = {
|
||||
input_tokens: 0,
|
||||
output_tokens: 0,
|
||||
cache_read_input_tokens: 0,
|
||||
cache_creation_input_tokens: 0,
|
||||
};
|
||||
|
||||
return messages.reduce((acc, message) => {
|
||||
if (!isMessageWithMetadata(message)) return acc;
|
||||
if (!hasUsageMetadata(message.response_metadata)) return acc;
|
||||
|
||||
const usage = message.response_metadata.usage;
|
||||
|
||||
return {
|
||||
input_tokens: acc.input_tokens + (usage.input_tokens ?? 0),
|
||||
output_tokens: acc.output_tokens + (usage.output_tokens ?? 0),
|
||||
cache_read_input_tokens:
|
||||
(acc?.cache_read_input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0),
|
||||
cache_creation_input_tokens:
|
||||
(acc?.cache_creation_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0),
|
||||
};
|
||||
}, defaultUsage);
|
||||
}
|
||||
|
||||
// Helper to format violations for display
|
||||
export function formatViolations(violations: Array<{ type: string; description: string }>): string {
|
||||
if (violations.length === 0) {
|
||||
|
|
@ -1,119 +0,0 @@
|
|||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import type { EvaluationResult as LangsmithEvaluationResult } from 'langsmith/evaluation';
|
||||
import { traceable } from 'langsmith/traceable';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
import { runJudgePanel, aggregateGenerations, type GenerationResult } from './judge-panel';
|
||||
import { buildSingleGenerationResults, buildMultiGenerationResults } from './metrics-builder';
|
||||
import type { PairwiseDatasetInput, PairwiseTargetOutput } from './types';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow';
|
||||
import type { BuilderFeatureFlags } from '../../src/workflow-builder-agent';
|
||||
import { EVAL_TYPES, EVAL_USERS, TRACEABLE_NAMES } from '../constants';
|
||||
import { createAgent } from '../core/environment';
|
||||
import { generateRunId, isWorkflowStateValues } from '../types/langsmith';
|
||||
import { consumeGenerator, getChatPayload } from '../utils/evaluation-helpers';
|
||||
|
||||
// ============================================================================
|
||||
// Target Factory
|
||||
// ============================================================================
|
||||
|
||||
/** Options for {@link createPairwiseTarget}. */
export interface CreatePairwiseTargetOptions {
	/** Node type descriptions available to the builder agent. */
	parsedNodeTypes: INodeTypeDescription[];
	/** Chat model used for both workflow generation and judging. */
	llm: BaseChatModel;
	/** Number of judges on each judge panel. */
	numJudges: number;
	/** Number of workflows generated per dataset example. */
	numGenerations: number;
	/** Optional feature flags forwarded to the agent. */
	featureFlags?: BuilderFeatureFlags;
	/** Optional experiment name recorded in trace metadata. */
	experimentName?: string;
}
|
||||
|
||||
/**
|
||||
* Creates a target function that does ALL the work:
|
||||
* - Generates all workflows (each wrapped in traceable)
|
||||
* - Runs judge panels
|
||||
* - Returns pre-computed feedback
|
||||
*
|
||||
* The evaluator then just extracts the pre-computed feedback.
|
||||
* This avoids 403 errors from nested traceable in evaluator context.
|
||||
*/
|
||||
export function createPairwiseTarget(options: CreatePairwiseTargetOptions) {
	const { parsedNodeTypes, llm, numJudges, numGenerations, featureFlags, experimentName } = options;

	return traceable(
		async (inputs: PairwiseDatasetInput): Promise<PairwiseTargetOutput> => {
			const { prompt, evals: evalCriteria } = inputs;

			// Generate ALL workflows and run judges in parallel
			const generationResults: GenerationResult[] = await Promise.all(
				Array.from({ length: numGenerations }, async (_, i) => {
					// 1-based index used in trace names and panel metadata
					const generationIndex = i + 1;
					// Wrap each generation in traceable for proper visibility
					const generate = traceable(
						async () => await generateWorkflow(parsedNodeTypes, llm, prompt, featureFlags),
						{
							name: `generation_${generationIndex}`,
							run_type: 'chain',
							metadata: {
								...(experimentName && { experiment_name: experimentName }),
							},
						},
					);
					const workflow = await generate();
					// Judge this generation immediately so panels run concurrently too
					const panelResult = await runJudgePanel(llm, workflow, evalCriteria, numJudges, {
						generationIndex,
						experimentName,
					});
					return { workflow, ...panelResult };
				}),
			);

			// Single generation: report its panel result directly
			if (numGenerations === 1) {
				const singleGenFeedback = buildSingleGenerationResults(generationResults[0], numJudges);
				return { prompt, evals: evalCriteria, feedback: singleGenFeedback };
			}

			// Multiple generations: aggregate panel results across generations
			const aggregation = aggregateGenerations(generationResults);
			const multiGenFeedback: LangsmithEvaluationResult[] = buildMultiGenerationResults(
				aggregation,
				numJudges,
			);

			return { prompt, evals: evalCriteria, feedback: multiGenFeedback };
		},
		{ name: TRACEABLE_NAMES.PAIRWISE_EVALUATION, run_type: 'chain' },
	);
}
|
||||
|
||||
/**
|
||||
* Generate a single workflow.
|
||||
* Used for local evaluation and regeneration in multi-generation mode.
|
||||
*/
|
||||
export async function generateWorkflow(
	parsedNodeTypes: INodeTypeDescription[],
	llm: BaseChatModel,
	prompt: string,
	featureFlags?: BuilderFeatureFlags,
): Promise<SimpleWorkflow> {
	// Unique ID doubles as the workflow/thread ID for this run
	const runId = generateRunId();

	const agent = createAgent({ parsedNodeTypes, llm, featureFlags });

	// Drain the chat stream; only the final agent state matters here
	await consumeGenerator(
		agent.chat(
			getChatPayload({
				evalType: EVAL_TYPES.PAIRWISE_LOCAL,
				message: prompt,
				workflowId: runId,
				featureFlags,
			}),
			EVAL_USERS.PAIRWISE_LOCAL,
		),
	);

	const state = await agent.getState(runId, EVAL_USERS.PAIRWISE_LOCAL);

	if (!state.values || !isWorkflowStateValues(state.values)) {
		throw new Error('Invalid workflow state: workflow or messages missing');
	}

	return state.values.workflowJSON;
}
|
||||
|
|
@ -1,176 +0,0 @@
|
|||
import type { EvaluationResult as LangsmithEvaluationResult } from 'langsmith/evaluation';
|
||||
import type { Run } from 'langsmith/schemas';
|
||||
|
||||
import type { JudgePanelResult, MultiGenerationAggregation } from './judge-panel';
|
||||
import { isPairwiseTargetOutput } from './types';
|
||||
import { METRIC_KEYS } from '../constants';
|
||||
|
||||
// ============================================================================
|
||||
// Result Builders
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* Build LangSmith-compatible evaluation results from judge panel output.
|
||||
*/
|
||||
export function buildSingleGenerationResults(
|
||||
result: JudgePanelResult,
|
||||
numJudges: number,
|
||||
): LangsmithEvaluationResult[] {
|
||||
const { judgeResults, primaryPasses, majorityPass, avgDiagnosticScore } = result;
|
||||
|
||||
const allViolations = judgeResults.flatMap((r, i) =>
|
||||
r.violations.map((v) => `[Judge ${i + 1}] ${v.rule}: ${v.justification}`),
|
||||
);
|
||||
const allPasses = judgeResults.flatMap((r, i) =>
|
||||
r.passes.map((p) => `[Judge ${i + 1}] ${p.rule}`),
|
||||
);
|
||||
|
||||
const comment = [
|
||||
`Majority vote: ${primaryPasses}/${numJudges} judges passed`,
|
||||
allViolations.length > 0 ? `\nViolations:\n${allViolations.join('\n')}` : '',
|
||||
allPasses.length > 0 ? `\nPasses:\n${allPasses.join('\n')}` : '',
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join('');
|
||||
|
||||
return [
|
||||
{
|
||||
key: METRIC_KEYS.PAIRWISE_DIAGNOSTIC,
|
||||
score: avgDiagnosticScore,
|
||||
comment: `Average diagnostic score across ${numJudges} judges`,
|
||||
},
|
||||
{
|
||||
key: METRIC_KEYS.PAIRWISE_JUDGES_PASSED,
|
||||
score: primaryPasses,
|
||||
comment: `${primaryPasses} of ${numJudges} judges returned primaryPass=true`,
|
||||
},
|
||||
{ key: METRIC_KEYS.PAIRWISE_PRIMARY, score: majorityPass ? 1 : 0, comment },
|
||||
{
|
||||
key: METRIC_KEYS.PAIRWISE_TOTAL_PASSES,
|
||||
score: judgeResults.reduce((sum, r) => sum + r.passes.length, 0),
|
||||
},
|
||||
{
|
||||
key: METRIC_KEYS.PAIRWISE_TOTAL_VIOLATIONS,
|
||||
score: judgeResults.reduce((sum, r) => sum + r.violations.length, 0),
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* Build LangSmith-compatible evaluation results for multi-generation aggregation.
|
||||
*/
|
||||
export function buildMultiGenerationResults(
	aggregation: MultiGenerationAggregation,
	numJudges: number,
): LangsmithEvaluationResult[] {
	const { generationCorrectness, aggregatedDiagnosticScore, passingGenerations, totalGenerations } =
		aggregation;

	// Build detailed comment with per-generation breakdown
	const genBreakdown = aggregation.generationDetails
		.map(
			(g, i) =>
				`Gen ${i + 1}: ${g.majorityPass ? 'PASS' : 'FAIL'} (${g.primaryPasses}/${numJudges} judges, ${(g.avgDiagnosticScore * 100).toFixed(0)}%)`,
		)
		.join('\n');

	const comment = [
		`Generation Correctness: ${passingGenerations}/${totalGenerations} generations passed`,
		`\nPer-generation breakdown:\n${genBreakdown}`,
	].join('');

	// Use first generation for backward-compatible metrics
	// (dashboards that predate multi-generation mode read these keys)
	const firstGen = aggregation.generationDetails[0];

	// Aggregate counts across all generations
	const totalJudgesPassed = aggregation.generationDetails.reduce(
		(sum, g) => sum + g.primaryPasses,
		0,
	);
	const totalViolations = aggregation.generationDetails.reduce(
		(sum, g) => sum + g.judgeResults.reduce((jSum, r) => jSum + r.violations.length, 0),
		0,
	);
	const totalPasses = aggregation.generationDetails.reduce(
		(sum, g) => sum + g.judgeResults.reduce((jSum, r) => jSum + r.passes.length, 0),
		0,
	);

	return [
		{
			key: METRIC_KEYS.PAIRWISE_AGGREGATED_DIAGNOSTIC,
			score: aggregatedDiagnosticScore,
			comment: `Average diagnostic score across ${totalGenerations} generations`,
		},
		{
			key: METRIC_KEYS.PAIRWISE_DIAGNOSTIC,
			score: firstGen.avgDiagnosticScore,
			comment: 'First generation diagnostic score',
		},
		{
			key: METRIC_KEYS.PAIRWISE_GENERATION_CORRECTNESS,
			score: generationCorrectness,
			comment: `${passingGenerations} of ${totalGenerations} generations passed majority vote`,
		},
		{
			key: METRIC_KEYS.PAIRWISE_GENERATIONS_PASSED,
			score: passingGenerations,
			comment,
		},
		{
			key: METRIC_KEYS.PAIRWISE_JUDGES_PASSED,
			score: totalJudgesPassed,
			comment: `${totalJudgesPassed} of ${totalGenerations * numJudges} total judge calls passed`,
		},
		{
			key: METRIC_KEYS.PAIRWISE_PRIMARY,
			score: firstGen.majorityPass ? 1 : 0,
			comment: `First generation: ${firstGen.primaryPasses}/${numJudges} judges passed`,
		},
		{
			key: METRIC_KEYS.PAIRWISE_TOTAL_JUDGE_CALLS,
			score: totalGenerations * numJudges,
			comment: `${totalGenerations} generations x ${numJudges} judges`,
		},
		{
			key: METRIC_KEYS.PAIRWISE_TOTAL_PASSES,
			score: totalPasses,
			comment: `Total criteria passes across all ${totalGenerations} generations`,
		},
		{
			key: METRIC_KEYS.PAIRWISE_TOTAL_VIOLATIONS,
			score: totalViolations,
			comment: `Total violations across all ${totalGenerations} generations`,
		},
	];
}
|
||||
|
||||
// ============================================================================
|
||||
// LangSmith Evaluator
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* LangSmith evaluator that extracts pre-computed metrics from target output.
|
||||
*
|
||||
* All the work (generation + judging) happens in the target function.
|
||||
* This evaluator just returns the pre-computed metrics from target output.
|
||||
* This avoids 403 errors from nested traceable calls in evaluator context.
|
||||
*/
|
||||
export async function pairwiseLangsmithEvaluator(
|
||||
rootRun: Run,
|
||||
): Promise<LangsmithEvaluationResult[]> {
|
||||
const outputs = rootRun.outputs;
|
||||
|
||||
if (!isPairwiseTargetOutput(outputs)) {
|
||||
return [
|
||||
{
|
||||
key: METRIC_KEYS.PAIRWISE_PRIMARY,
|
||||
score: 0,
|
||||
comment: 'Invalid output - missing feedback from target',
|
||||
},
|
||||
{ key: METRIC_KEYS.PAIRWISE_DIAGNOSTIC, score: 0 },
|
||||
];
|
||||
}
|
||||
|
||||
return outputs.feedback;
|
||||
}
|
||||
|
|
@ -1,562 +0,0 @@
|
|||
import { evaluate } from 'langsmith/evaluation';
|
||||
import pc from 'picocolors';
|
||||
|
||||
import { createPairwiseTarget, generateWorkflow } from './generator';
|
||||
import { aggregateGenerations, runJudgePanel, type GenerationResult } from './judge-panel';
|
||||
import { pairwiseLangsmithEvaluator } from './metrics-builder';
|
||||
import { isPairwiseExample, type PairwiseExample } from './types';
|
||||
import type { BuilderFeatureFlags } from '../../src/workflow-builder-agent';
|
||||
import { DEFAULTS } from '../constants';
|
||||
import { setupTestEnvironment } from '../core/environment';
|
||||
import { createArtifactSaver } from '../utils/artifact-saver';
|
||||
import { formatHeader } from '../utils/evaluation-helpers';
|
||||
import { createLogger, type EvalLogger } from '../utils/logger';
|
||||
|
||||
// ============================================================================
|
||||
// Helpers
|
||||
// ============================================================================
|
||||
|
||||
/** Extract notion_id from metadata if present */
|
||||
function getNotionId(metadata: unknown): string | undefined {
|
||||
if (typeof metadata === 'object' && metadata !== null && 'notion_id' in metadata) {
|
||||
const id = (metadata as { notion_id: unknown }).notion_id;
|
||||
return typeof id === 'string' ? id : undefined;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/** Extract categories from metadata if present */
|
||||
function getCategories(metadata: unknown): string[] | undefined {
|
||||
if (typeof metadata === 'object' && metadata !== null && 'categories' in metadata) {
|
||||
const categories = (metadata as { categories: unknown }).categories;
|
||||
return Array.isArray(categories)
|
||||
? categories.filter((c): c is string => typeof c === 'string')
|
||||
: undefined;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/** Filter examples by a search string in a specific eval field (do or don't) */
|
||||
function filterByEvalField(
|
||||
examples: PairwiseExample[],
|
||||
field: 'dos' | 'donts',
|
||||
search: string,
|
||||
log: EvalLogger,
|
||||
): PairwiseExample[] {
|
||||
const searchLower = search.toLowerCase();
|
||||
const fieldLabel = field === 'dos' ? 'do' : "don't";
|
||||
|
||||
log.warn(`🔍 Filtering by ${fieldLabel} containing: "${search}"`);
|
||||
const filtered = examples.filter((e) => {
|
||||
const fieldValue = e.inputs.evals[field];
|
||||
return fieldValue?.toLowerCase().includes(searchLower) ?? false;
|
||||
});
|
||||
|
||||
if (filtered.length === 0) {
|
||||
throw new Error(`No examples found with ${fieldLabel} containing: "${search}"`);
|
||||
}
|
||||
|
||||
log.success(`✅ Found ${filtered.length} example(s) matching "${search}" in ${fieldLabel}`);
|
||||
return filtered;
|
||||
}
|
||||
|
||||
/** Filter examples by notion_id */
|
||||
function filterByNotionId(
|
||||
examples: PairwiseExample[],
|
||||
notionId: string,
|
||||
log: EvalLogger,
|
||||
): PairwiseExample[] {
|
||||
log.warn(`🔍 Filtering by notion_id: ${notionId}`);
|
||||
const filtered = examples.filter((e) => getNotionId(e.metadata) === notionId);
|
||||
|
||||
if (filtered.length === 0) {
|
||||
const availableIds = examples.map((e) => getNotionId(e.metadata)).filter(Boolean);
|
||||
throw new Error(
|
||||
`No example found with notion_id: ${notionId}. Available: ${availableIds.join(', ')}`,
|
||||
);
|
||||
}
|
||||
|
||||
log.success(`✅ Found ${filtered.length} example(s) with notion_id "${notionId}"`);
|
||||
return filtered;
|
||||
}
|
||||
|
||||
/** Filter examples by technique/category */
|
||||
function filterByTechnique(
|
||||
examples: PairwiseExample[],
|
||||
technique: string,
|
||||
log: EvalLogger,
|
||||
): PairwiseExample[] {
|
||||
log.warn(`🔍 Filtering by technique: ${technique}`);
|
||||
const filtered = examples.filter((e) => {
|
||||
const categories = getCategories(e.metadata);
|
||||
return categories?.includes(technique);
|
||||
});
|
||||
|
||||
if (filtered.length === 0) {
|
||||
const availableTechniques = new Set<string>();
|
||||
for (const example of examples) {
|
||||
const categories = getCategories(example.metadata);
|
||||
if (categories) {
|
||||
for (const category of categories) {
|
||||
availableTechniques.add(category);
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new Error(
|
||||
`No examples found with technique: ${technique}. Available techniques: ${Array.from(availableTechniques).sort().join(', ')}`,
|
||||
);
|
||||
}
|
||||
|
||||
log.success(`✅ Found ${filtered.length} example(s) with technique "${technique}"`);
|
||||
return filtered;
|
||||
}
|
||||
|
||||
/** Filter examples by all provided criteria progressively */
|
||||
function filterExamples(
|
||||
allExamples: PairwiseExample[],
|
||||
notionId: string | undefined,
|
||||
technique: string | undefined,
|
||||
doSearch: string | undefined,
|
||||
dontSearch: string | undefined,
|
||||
maxExamples: number | undefined,
|
||||
log: EvalLogger,
|
||||
): PairwiseExample[] {
|
||||
let filtered = allExamples;
|
||||
|
||||
if (notionId) {
|
||||
filtered = filterByNotionId(filtered, notionId, log);
|
||||
}
|
||||
|
||||
if (technique) {
|
||||
filtered = filterByTechnique(filtered, technique, log);
|
||||
}
|
||||
|
||||
if (doSearch) {
|
||||
filtered = filterByEvalField(filtered, 'dos', doSearch, log);
|
||||
}
|
||||
|
||||
if (dontSearch) {
|
||||
filtered = filterByEvalField(filtered, 'donts', dontSearch, log);
|
||||
}
|
||||
|
||||
if (maxExamples && maxExamples > 0) {
|
||||
log.warn(`➔ Limiting to ${maxExamples} example(s)`);
|
||||
filtered = filtered.slice(0, maxExamples);
|
||||
}
|
||||
|
||||
return filtered;
|
||||
}
|
||||
|
||||
/** Log enabled feature flags */
|
||||
function logFeatureFlags(log: EvalLogger, featureFlags?: BuilderFeatureFlags): void {
|
||||
if (!featureFlags) return;
|
||||
const enabledFlags = Object.entries(featureFlags)
|
||||
.filter(([, v]) => v === true)
|
||||
.map(([k]) => k);
|
||||
if (enabledFlags.length > 0) {
|
||||
log.success(`➔ Feature flags enabled: ${enabledFlags.join(', ')}`);
|
||||
}
|
||||
}
|
||||
|
||||
/** Log configuration for pairwise evaluation */
|
||||
function logPairwiseConfig(
|
||||
log: EvalLogger,
|
||||
options: {
|
||||
experimentName: string;
|
||||
numGenerations: number;
|
||||
numJudges: number;
|
||||
repetitions: number;
|
||||
concurrency: number;
|
||||
},
|
||||
): void {
|
||||
const { experimentName, numGenerations, numJudges, repetitions, concurrency } = options;
|
||||
log.info(`➔ Experiment: ${experimentName}`);
|
||||
log.info(
|
||||
`➔ Config: ${numGenerations} gen(s) × ${numJudges} judges × ${repetitions} reps (concurrency: ${concurrency})${log.isVerbose ? ' (verbose)' : ''}`,
|
||||
);
|
||||
if (numGenerations > 1) {
|
||||
log.verbose(' Generation Correctness: (# passing gens) / total gens');
|
||||
log.verbose(' Aggregated Diagnostic: average across all generations');
|
||||
} else {
|
||||
log.verbose(' Primary: ALL criteria must pass → majority vote');
|
||||
log.verbose(' Secondary: Average diagnostic score');
|
||||
}
|
||||
}
|
||||
|
||||
/** Validate common pairwise evaluation inputs */
|
||||
function validatePairwiseInputs(numJudges: number, numGenerations: number): void {
|
||||
if (numJudges < 1) {
|
||||
throw new Error('numJudges must be at least 1');
|
||||
}
|
||||
if (numGenerations < 1) {
|
||||
throw new Error('numGenerations must be at least 1');
|
||||
}
|
||||
}
|
||||
|
||||
/** Determine run type and filter value for metadata */
|
||||
function determineRunType(options: {
|
||||
notionId?: string;
|
||||
technique?: string;
|
||||
doSearch?: string;
|
||||
dontSearch?: string;
|
||||
}): { runType: string; filterValue: string | undefined } {
|
||||
const { notionId, technique, doSearch, dontSearch } = options;
|
||||
|
||||
const filters: string[] = [];
|
||||
const values: string[] = [];
|
||||
|
||||
if (notionId) {
|
||||
filters.push('id');
|
||||
values.push(`id:${notionId}`);
|
||||
}
|
||||
if (technique) {
|
||||
filters.push('category');
|
||||
values.push(`category:${technique}`);
|
||||
}
|
||||
if (doSearch) {
|
||||
filters.push('do');
|
||||
values.push(`do:${doSearch}`);
|
||||
}
|
||||
if (dontSearch) {
|
||||
filters.push('dont');
|
||||
values.push(`dont:${dontSearch}`);
|
||||
}
|
||||
|
||||
if (filters.length === 0) {
|
||||
return { runType: 'full', filterValue: undefined };
|
||||
}
|
||||
|
||||
return {
|
||||
runType: `by-${filters.join('-and-')}`,
|
||||
filterValue: values.join(' '),
|
||||
};
|
||||
}
|
||||
|
||||
/** Display results for local pairwise evaluation */
|
||||
function displayLocalResults(
|
||||
log: EvalLogger,
|
||||
options: {
|
||||
generationResults: GenerationResult[];
|
||||
numJudges: number;
|
||||
numGenerations: number;
|
||||
totalTime: number;
|
||||
verbose: boolean;
|
||||
},
|
||||
): void {
|
||||
const { generationResults, numJudges, numGenerations, totalTime, verbose } = options;
|
||||
|
||||
// Defensive check - should never happen due to validation, but prevents runtime errors
|
||||
if (generationResults.length === 0) {
|
||||
log.error('No generation results to display');
|
||||
return;
|
||||
}
|
||||
|
||||
const aggregation = aggregateGenerations(generationResults);
|
||||
|
||||
// Display aggregated result
|
||||
if (numGenerations > 1) {
|
||||
log.info(
|
||||
`\n📊 Generation Correctness: ${aggregation.passingGenerations}/${aggregation.totalGenerations} → ` +
|
||||
`${aggregation.generationCorrectness >= 0.5 ? pc.green(aggregation.generationCorrectness.toFixed(2)) : pc.red(aggregation.generationCorrectness.toFixed(2))}`,
|
||||
);
|
||||
log.info(
|
||||
` Aggregated Diagnostic: ${(aggregation.aggregatedDiagnosticScore * 100).toFixed(0)}%`,
|
||||
);
|
||||
} else {
|
||||
// Single generation - show original format
|
||||
const firstGen = generationResults[0];
|
||||
log.info(
|
||||
`\n📊 Result: ${firstGen.primaryPasses}/${numJudges} judges → ` +
|
||||
`${firstGen.majorityPass ? pc.green('PASS') : pc.red('FAIL')} ` +
|
||||
`(${(firstGen.avgDiagnosticScore * 100).toFixed(0)}%)`,
|
||||
);
|
||||
}
|
||||
log.dim(` Timing: ${totalTime.toFixed(1)}s total`);
|
||||
|
||||
// Per-generation breakdown (verbose or multi-gen)
|
||||
if (verbose && numGenerations > 1) {
|
||||
log.info(pc.dim('\nPer-generation breakdown:'));
|
||||
generationResults.forEach((g, i) => {
|
||||
log.info(
|
||||
pc.dim(
|
||||
` Gen ${i + 1}: ${g.majorityPass ? 'PASS' : 'FAIL'} ` +
|
||||
`(${g.primaryPasses}/${numJudges} judges, ${(g.avgDiagnosticScore * 100).toFixed(0)}%)`,
|
||||
),
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
// Show violations if any (from first generation for simplicity)
|
||||
const allViolations = generationResults[0].judgeResults.flatMap((r, i) =>
|
||||
r.violations.map((v) => ({ judge: i + 1, rule: v.rule, justification: v.justification })),
|
||||
);
|
||||
if (allViolations.length > 0) {
|
||||
log.info(pc.yellow('\nViolations (Gen 1):'));
|
||||
for (const v of allViolations) {
|
||||
log.info(pc.dim(` [Judge ${v.judge}] ${v.rule}: ${v.justification}`));
|
||||
}
|
||||
}
|
||||
|
||||
// Show workflow summary
|
||||
if (verbose && generationResults[0].workflow.nodes) {
|
||||
log.info(pc.dim('\nWorkflow nodes (Gen 1):'));
|
||||
for (const node of generationResults[0].workflow.nodes) {
|
||||
log.info(pc.dim(` - ${node.name} (${node.type})`));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Public API - LangSmith Evaluation
|
||||
// ============================================================================
|
||||
|
||||
export interface PairwiseEvaluationOptions {
|
||||
repetitions?: number;
|
||||
notionId?: string;
|
||||
technique?: string;
|
||||
/** Case-insensitive search string to filter examples by dos content */
|
||||
doSearch?: string;
|
||||
/** Case-insensitive search string to filter examples by donts content */
|
||||
dontSearch?: string;
|
||||
numJudges?: number;
|
||||
numGenerations?: number;
|
||||
verbose?: boolean;
|
||||
experimentName?: string;
|
||||
concurrency?: number;
|
||||
maxExamples?: number;
|
||||
featureFlags?: BuilderFeatureFlags;
|
||||
}
|
||||
|
||||
/**
|
||||
* Runs pairwise evaluation using LangSmith.
|
||||
* Generates workflows from dataset prompts and evaluates them against do/don't criteria.
|
||||
*/
|
||||
export async function runPairwiseLangsmithEvaluation(
|
||||
options: PairwiseEvaluationOptions = {},
|
||||
): Promise<void> {
|
||||
const {
|
||||
repetitions = DEFAULTS.REPETITIONS,
|
||||
notionId,
|
||||
technique,
|
||||
doSearch,
|
||||
dontSearch,
|
||||
numJudges = DEFAULTS.NUM_JUDGES,
|
||||
numGenerations = DEFAULTS.NUM_GENERATIONS,
|
||||
verbose = false,
|
||||
experimentName = DEFAULTS.EXPERIMENT_NAME,
|
||||
concurrency = DEFAULTS.CONCURRENCY,
|
||||
maxExamples,
|
||||
featureFlags,
|
||||
} = options;
|
||||
const log = createLogger(verbose);
|
||||
|
||||
console.log(formatHeader('AI Workflow Builder Pairwise Evaluation', 70));
|
||||
logPairwiseConfig(log, { experimentName, numGenerations, numJudges, repetitions, concurrency });
|
||||
|
||||
logFeatureFlags(log, featureFlags);
|
||||
|
||||
try {
|
||||
validatePairwiseInputs(numJudges, numGenerations);
|
||||
|
||||
if (!process.env.LANGSMITH_API_KEY) {
|
||||
throw new Error('LANGSMITH_API_KEY environment variable not set');
|
||||
}
|
||||
|
||||
// Ensure LANGSMITH_TRACING is enabled
|
||||
if (!process.env.LANGSMITH_TRACING) {
|
||||
process.env.LANGSMITH_TRACING = 'true';
|
||||
log.verbose('➔ Enabled LANGSMITH_TRACING=true');
|
||||
}
|
||||
|
||||
const { parsedNodeTypes, llm, lsClient } = await setupTestEnvironment();
|
||||
|
||||
if (!lsClient) {
|
||||
throw new Error('Langsmith client not initialized');
|
||||
}
|
||||
|
||||
const datasetName = process.env.LANGSMITH_DATASET_NAME ?? DEFAULTS.DATASET_NAME;
|
||||
log.info(`➔ Dataset: ${datasetName}`);
|
||||
|
||||
// Verify dataset exists
|
||||
let datasetId: string;
|
||||
try {
|
||||
const dataset = await lsClient.readDataset({ datasetName });
|
||||
datasetId = dataset.id;
|
||||
} catch {
|
||||
throw new Error(`Dataset "${datasetName}" not found`);
|
||||
}
|
||||
|
||||
// Fetch and filter examples
|
||||
const allExamples: PairwiseExample[] = [];
|
||||
log.verbose('➔ Fetching examples from dataset...');
|
||||
for await (const example of lsClient.listExamples({ datasetId })) {
|
||||
if (isPairwiseExample(example)) {
|
||||
allExamples.push(example);
|
||||
} else {
|
||||
log.verbose(`⚠️ Skipping invalid example: ${example.id}`);
|
||||
}
|
||||
}
|
||||
log.verbose(`📊 Total examples in dataset: ${allExamples.length}`);
|
||||
|
||||
const data = filterExamples(
|
||||
allExamples,
|
||||
notionId,
|
||||
technique,
|
||||
doSearch,
|
||||
dontSearch,
|
||||
maxExamples,
|
||||
log,
|
||||
);
|
||||
log.info(`➔ Running ${data.length} example(s) × ${repetitions} rep(s)`);
|
||||
|
||||
// Create target (does all work) and evaluator (extracts pre-computed metrics)
|
||||
const target = createPairwiseTarget({
|
||||
parsedNodeTypes,
|
||||
llm,
|
||||
numJudges,
|
||||
numGenerations,
|
||||
featureFlags,
|
||||
experimentName,
|
||||
});
|
||||
const evaluator = pairwiseLangsmithEvaluator;
|
||||
|
||||
const evalStartTime = Date.now();
|
||||
|
||||
// Determine run type for metadata
|
||||
const { runType, filterValue } = determineRunType({
|
||||
notionId,
|
||||
technique,
|
||||
doSearch,
|
||||
dontSearch,
|
||||
});
|
||||
|
||||
// Run evaluation using LangSmith's built-in features
|
||||
await evaluate(target, {
|
||||
data,
|
||||
evaluators: [evaluator],
|
||||
maxConcurrency: concurrency,
|
||||
experimentPrefix: experimentName,
|
||||
numRepetitions: repetitions,
|
||||
metadata: {
|
||||
numJudges,
|
||||
numGenerations,
|
||||
repetitions,
|
||||
concurrency,
|
||||
scoringMethod: numGenerations > 1 ? 'hierarchical-multi-generation' : 'hierarchical',
|
||||
runType,
|
||||
...(filterValue && { filterValue }),
|
||||
},
|
||||
});
|
||||
|
||||
const totalEvalTime = Date.now() - evalStartTime;
|
||||
|
||||
log.success('\n✓ Pairwise evaluation completed');
|
||||
log.dim(` Total time: ${(totalEvalTime / 1000).toFixed(1)}s`);
|
||||
log.dim(' View results in LangSmith dashboard');
|
||||
} catch (error) {
|
||||
log.error(
|
||||
`✗ Pairwise evaluation failed: ${error instanceof Error ? error.message : String(error)}`,
|
||||
);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Public API - Local Evaluation
|
||||
// ============================================================================
|
||||
|
||||
export interface LocalPairwiseOptions {
|
||||
prompt: string;
|
||||
criteria: { dos: string; donts: string };
|
||||
numJudges?: number;
|
||||
numGenerations?: number;
|
||||
verbose?: boolean;
|
||||
outputDir?: string;
|
||||
featureFlags?: BuilderFeatureFlags;
|
||||
}
|
||||
|
||||
/**
|
||||
* Runs a single pairwise evaluation locally without LangSmith.
|
||||
* Useful for testing prompts and criteria before running full dataset evaluation.
|
||||
*/
|
||||
export async function runLocalPairwiseEvaluation(options: LocalPairwiseOptions): Promise<void> {
|
||||
const {
|
||||
prompt,
|
||||
criteria,
|
||||
numJudges = DEFAULTS.NUM_JUDGES,
|
||||
numGenerations = DEFAULTS.NUM_GENERATIONS,
|
||||
verbose = false,
|
||||
outputDir,
|
||||
featureFlags,
|
||||
} = options;
|
||||
const log = createLogger(verbose);
|
||||
|
||||
console.log(formatHeader('Local Pairwise Evaluation', 50));
|
||||
log.info(`➔ Generations: ${numGenerations}, Judges: ${numJudges}`);
|
||||
if (outputDir) {
|
||||
log.info(`➔ Output directory: ${outputDir}`);
|
||||
}
|
||||
log.verbose(`➔ Prompt: ${prompt.slice(0, 80)}${prompt.length > 80 ? '...' : ''}`);
|
||||
log.verbose(`➔ Dos: ${criteria.dos.slice(0, 60)}${criteria.dos.length > 60 ? '...' : ''}`);
|
||||
if (criteria.donts) {
|
||||
log.verbose(
|
||||
`➔ Donts: ${criteria.donts.slice(0, 60)}${criteria.donts.length > 60 ? '...' : ''}`,
|
||||
);
|
||||
}
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
try {
|
||||
validatePairwiseInputs(numJudges, numGenerations);
|
||||
|
||||
const { parsedNodeTypes, llm } = await setupTestEnvironment();
|
||||
|
||||
// Create artifact saver if output directory is configured
|
||||
const artifactSaver = createArtifactSaver(outputDir, log);
|
||||
const promptId = 'local';
|
||||
|
||||
// Save prompt artifacts
|
||||
artifactSaver?.savePrompt(promptId, prompt, criteria);
|
||||
|
||||
log.info(`➔ Running ${numGenerations} generation(s)...`);
|
||||
|
||||
// Run all generations in parallel
|
||||
const generationResults: GenerationResult[] = await Promise.all(
|
||||
Array.from({ length: numGenerations }, async (_, genIndex) => {
|
||||
const genStartTime = Date.now();
|
||||
|
||||
// Generate workflow
|
||||
const workflow = await generateWorkflow(parsedNodeTypes, llm, prompt, featureFlags);
|
||||
const genTime = (Date.now() - genStartTime) / 1000;
|
||||
|
||||
log.verbose(
|
||||
` Gen ${genIndex + 1}: Workflow done (${workflow?.nodes?.length ?? 0} nodes) [${genTime.toFixed(1)}s]`,
|
||||
);
|
||||
|
||||
// Run judge panel
|
||||
const panelResult = await runJudgePanel(llm, workflow, criteria, numJudges);
|
||||
|
||||
log.verbose(
|
||||
` Gen ${genIndex + 1}: ${panelResult.majorityPass ? '✓ PASS' : '✗ FAIL'} (${panelResult.primaryPasses}/${numJudges} judges, ${(panelResult.avgDiagnosticScore * 100).toFixed(0)}%)`,
|
||||
);
|
||||
|
||||
return { workflow, ...panelResult };
|
||||
}),
|
||||
);
|
||||
|
||||
// Save generation artifacts
|
||||
if (artifactSaver) {
|
||||
for (let i = 0; i < generationResults.length; i++) {
|
||||
artifactSaver.saveGeneration(promptId, i, generationResults[i]);
|
||||
}
|
||||
}
|
||||
|
||||
const totalTime = (Date.now() - startTime) / 1000;
|
||||
displayLocalResults(log, { generationResults, numJudges, numGenerations, totalTime, verbose });
|
||||
} catch (error) {
|
||||
log.error(
|
||||
`✗ Local evaluation failed: ${error instanceof Error ? error.message : String(error)}`,
|
||||
);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
import type { EvaluationResult as LangsmithEvaluationResult } from 'langsmith/evaluation';
|
||||
import type { Example } from 'langsmith/schemas';
|
||||
|
||||
// ============================================================================
|
||||
// Evaluation Criteria
|
||||
// ============================================================================
|
||||
|
||||
/** Evaluation criteria requiring at least one of dos or donts */
|
||||
export type EvalCriteria = { dos: string; donts?: string } | { dos?: string; donts: string };
|
||||
|
||||
// ============================================================================
|
||||
// Dataset Input/Output Types
|
||||
// ============================================================================
|
||||
|
||||
export interface PairwiseDatasetInput {
|
||||
evals: EvalCriteria;
|
||||
prompt: string;
|
||||
}
|
||||
|
||||
/** LangSmith Example with typed inputs for pairwise evaluation */
|
||||
export interface PairwiseExample extends Omit<Example, 'inputs'> {
|
||||
inputs: PairwiseDatasetInput;
|
||||
}
|
||||
|
||||
export interface PairwiseTargetOutput {
|
||||
prompt: string;
|
||||
evals: EvalCriteria;
|
||||
/** Pre-computed feedback results */
|
||||
feedback: LangsmithEvaluationResult[];
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Type Guards
|
||||
// ============================================================================
|
||||
|
||||
export function isPairwiseTargetOutput(outputs: unknown): outputs is PairwiseTargetOutput {
|
||||
if (!outputs || typeof outputs !== 'object') return false;
|
||||
const obj = outputs as Record<string, unknown>;
|
||||
return (
|
||||
typeof obj.prompt === 'string' &&
|
||||
Array.isArray(obj.feedback) &&
|
||||
obj.evals !== undefined &&
|
||||
typeof obj.evals === 'object'
|
||||
);
|
||||
}
|
||||
|
||||
export function isPairwiseExample(example: Example): example is PairwiseExample {
|
||||
const inputs = example.inputs as Record<string, unknown> | undefined;
|
||||
if (!inputs || typeof inputs !== 'object') return false;
|
||||
|
||||
const evals = inputs.evals as Record<string, unknown> | undefined;
|
||||
if (!evals || typeof evals !== 'object') return false;
|
||||
|
||||
return (
|
||||
typeof inputs.prompt === 'string' &&
|
||||
(typeof evals.dos === 'string' || typeof evals.donts === 'string')
|
||||
);
|
||||
}
|
||||
|
|
@ -2,7 +2,7 @@ import type { SimpleWorkflow } from '@/types';
|
|||
import { validateAgentPrompt } from '@/validation/checks';
|
||||
import type { SingleEvaluatorResult } from '@/validation/types';
|
||||
|
||||
import { calcSingleEvaluatorScore } from '../../utils/score';
|
||||
import { calcSingleEvaluatorScore } from '../score';
|
||||
|
||||
export function evaluateAgentPrompt(workflow: SimpleWorkflow): SingleEvaluatorResult {
|
||||
const violations = validateAgentPrompt(workflow);
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import type { SimpleWorkflow } from '@/types';
|
|||
import { validateConnections } from '@/validation/checks';
|
||||
import type { SingleEvaluatorResult } from '@/validation/types';
|
||||
|
||||
import { calcSingleEvaluatorScore } from '../../utils/score';
|
||||
import { calcSingleEvaluatorScore } from '../score';
|
||||
|
||||
export function evaluateConnections(
|
||||
workflow: SimpleWorkflow,
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import type { SimpleWorkflow } from '@/types';
|
|||
import { validateCredentials } from '@/validation/checks';
|
||||
import type { SingleEvaluatorResult } from '@/validation/types';
|
||||
|
||||
import { calcSingleEvaluatorScore } from '../../utils/score';
|
||||
import { calcSingleEvaluatorScore } from '../score';
|
||||
|
||||
export function evaluateCredentials(workflow: SimpleWorkflow): SingleEvaluatorResult {
|
||||
const violations = validateCredentials(workflow);
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import type { SimpleWorkflow } from '@/types';
|
|||
import { validateFromAi } from '@/validation/checks';
|
||||
import type { SingleEvaluatorResult } from '@/validation/types';
|
||||
|
||||
import { calcSingleEvaluatorScore } from '../../utils/score';
|
||||
import { calcSingleEvaluatorScore } from '../score';
|
||||
|
||||
export function evaluateFromAi(
|
||||
workflow: SimpleWorkflow,
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import type { SimpleWorkflow } from '@/types';
|
|||
import { validateNodes } from '@/validation/checks';
|
||||
import type { SingleEvaluatorResult } from '@/validation/types';
|
||||
|
||||
import { calcSingleEvaluatorScore } from '../../utils/score';
|
||||
import { calcSingleEvaluatorScore } from '../score';
|
||||
|
||||
export function evaluateNodes(
|
||||
workflow: SimpleWorkflow,
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import type { SimpleWorkflow } from '@/types';
|
|||
import { validateTools } from '@/validation/checks';
|
||||
import type { SingleEvaluatorResult } from '@/validation/types';
|
||||
|
||||
import { calcSingleEvaluatorScore } from '../../utils/score';
|
||||
import { calcSingleEvaluatorScore } from '../score';
|
||||
|
||||
export function evaluateTools(
|
||||
workflow: SimpleWorkflow,
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import type { SimpleWorkflow } from '@/types';
|
|||
import { validateTrigger } from '@/validation/checks';
|
||||
import type { SingleEvaluatorResult } from '@/validation/types';
|
||||
|
||||
import { calcSingleEvaluatorScore } from '../../utils/score';
|
||||
import { calcSingleEvaluatorScore } from '../score';
|
||||
|
||||
export function evaluateTrigger(
|
||||
workflow: SimpleWorkflow,
|
||||
|
|
|
|||
|
|
@ -15,13 +15,13 @@ import {
|
|||
evaluateWorkflowSimilarity,
|
||||
evaluateWorkflowSimilarityMultiple,
|
||||
} from './evaluators/workflow-similarity';
|
||||
import { calculateOverallScore } from '../utils/score';
|
||||
import { calculateOverallScore } from './score';
|
||||
|
||||
export async function programmaticEvaluation(
|
||||
input: ProgrammaticEvaluationInput,
|
||||
nodeTypes: INodeTypeDescription[],
|
||||
) {
|
||||
const { generatedWorkflow, referenceWorkflow, referenceWorkflows, preset = 'standard' } = input;
|
||||
const { generatedWorkflow, referenceWorkflows, preset = 'standard' } = input;
|
||||
|
||||
const connectionsEvaluationResult = evaluateConnections(generatedWorkflow, nodeTypes);
|
||||
const nodesEvaluationResult = evaluateNodes(generatedWorkflow, nodeTypes);
|
||||
|
|
@ -31,41 +31,26 @@ export async function programmaticEvaluation(
|
|||
const fromAiEvaluationResult = evaluateFromAi(generatedWorkflow, nodeTypes);
|
||||
const credentialsEvaluationResult = evaluateCredentials(generatedWorkflow);
|
||||
|
||||
// Workflow similarity evaluation (supports both single and multiple reference workflows)
|
||||
// Workflow similarity evaluation
|
||||
let similarityEvaluationResult = null;
|
||||
|
||||
// Prioritize referenceWorkflows (multiple) over referenceWorkflow (single)
|
||||
if (referenceWorkflows && referenceWorkflows.length > 0) {
|
||||
try {
|
||||
similarityEvaluationResult = await evaluateWorkflowSimilarityMultiple(
|
||||
generatedWorkflow,
|
||||
referenceWorkflows,
|
||||
preset,
|
||||
);
|
||||
if (referenceWorkflows.length === 1) {
|
||||
similarityEvaluationResult = await evaluateWorkflowSimilarity(
|
||||
generatedWorkflow,
|
||||
referenceWorkflows[0],
|
||||
preset,
|
||||
);
|
||||
} else {
|
||||
similarityEvaluationResult = await evaluateWorkflowSimilarityMultiple(
|
||||
generatedWorkflow,
|
||||
referenceWorkflows,
|
||||
preset,
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn('Multiple workflow similarity evaluation failed:', error);
|
||||
// Fallback to neutral result if similarity check fails
|
||||
const violation: ProgrammaticViolation = {
|
||||
name: 'workflow-similarity-evaluation-failed',
|
||||
type: 'critical',
|
||||
description: `Similarity evaluation failed: ${(error as Error).message}`,
|
||||
pointsDeducted: 0,
|
||||
};
|
||||
similarityEvaluationResult = {
|
||||
violations: [violation],
|
||||
score: 0,
|
||||
};
|
||||
}
|
||||
} else if (referenceWorkflow) {
|
||||
try {
|
||||
similarityEvaluationResult = await evaluateWorkflowSimilarity(
|
||||
generatedWorkflow,
|
||||
referenceWorkflow,
|
||||
preset,
|
||||
);
|
||||
} catch (error) {
|
||||
console.warn('Workflow similarity evaluation failed:', error);
|
||||
// Fallback to neutral result if similarity check fails
|
||||
// Fallback to neutral result if similarity check fails - error captured in violation
|
||||
const violation: ProgrammaticViolation = {
|
||||
name: 'workflow-similarity-evaluation-failed',
|
||||
type: 'critical',
|
||||
|
|
|
|||
|
|
@ -1,27 +0,0 @@
|
|||
/**
|
||||
* Example prompts for categorization evaluation
|
||||
*
|
||||
* Use this format to create custom prompt sets for evaluation.
|
||||
* You can export prompts from JSONL files using scripts/extract-user-prompts.js
|
||||
* and use them directly.
|
||||
*
|
||||
* To run with custom prompts:
|
||||
* 1. Create a file with an array of prompt strings
|
||||
* 2. Import and pass to runCategorizationEvaluation()
|
||||
*/
|
||||
export const examplePrompts = [
|
||||
'Create a workflow that monitors my website every 5 minutes and sends me a Slack notification if it goes down',
|
||||
'Build a chatbot that can answer customer questions about our product catalog using information from our knowledge base',
|
||||
'Set up a form to collect user feedback, analyze sentiment with AI, and store the results in Airtable',
|
||||
'Extract data from PDF invoices uploaded via form and update our accounting spreadsheet',
|
||||
'Scrape competitor pricing daily and generate a weekly summary report with price changes',
|
||||
];
|
||||
|
||||
/**
|
||||
* Example usage:
|
||||
*
|
||||
* import { runCategorizationEvaluation } from './categorize-prompt-evaluation';
|
||||
* import { examplePrompts } from './prompts-example';
|
||||
*
|
||||
* runCategorizationEvaluation(examplePrompts).catch(console.error);
|
||||
*/
|
||||
|
|
@ -18,28 +18,6 @@ export const TRACEABLE_NAMES = {
|
|||
WORKFLOW_GENERATION: 'workflow_generation',
|
||||
} as const;
|
||||
|
||||
// ============================================================================
|
||||
// LangSmith Metric Keys
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* Metric keys for LangSmith evaluation results.
|
||||
*/
|
||||
export const METRIC_KEYS = {
|
||||
// Single generation metrics
|
||||
PAIRWISE_DIAGNOSTIC: 'pairwise_diagnostic',
|
||||
PAIRWISE_JUDGES_PASSED: 'pairwise_judges_passed',
|
||||
PAIRWISE_PRIMARY: 'pairwise_primary',
|
||||
PAIRWISE_TOTAL_PASSES: 'pairwise_total_passes',
|
||||
PAIRWISE_TOTAL_VIOLATIONS: 'pairwise_total_violations',
|
||||
|
||||
// Multi-generation metrics
|
||||
PAIRWISE_AGGREGATED_DIAGNOSTIC: 'pairwise_aggregated_diagnostic',
|
||||
PAIRWISE_GENERATION_CORRECTNESS: 'pairwise_generation_correctness',
|
||||
PAIRWISE_GENERATIONS_PASSED: 'pairwise_generations_passed',
|
||||
PAIRWISE_TOTAL_JUDGE_CALLS: 'pairwise_total_judge_calls',
|
||||
} as const;
|
||||
|
||||
// ============================================================================
|
||||
// Default Values
|
||||
// ============================================================================
|
||||
|
|
@ -48,8 +26,11 @@ export const DEFAULTS = {
|
|||
NUM_JUDGES: 3,
|
||||
NUM_GENERATIONS: 1,
|
||||
EXPERIMENT_NAME: 'pairwise-evals',
|
||||
LLM_JUDGE_EXPERIMENT_NAME: 'workflow-builder-evaluation',
|
||||
CONCURRENCY: 5,
|
||||
REPETITIONS: 1,
|
||||
/** Per-operation timeout (generation / evaluator) */
|
||||
TIMEOUT_MS: 20 * 60 * 1000,
|
||||
DATASET_NAME: 'notion-pairwise-workflows',
|
||||
FEATURE_FLAGS: {
|
||||
templateExamples: false,
|
||||
|
|
@ -4,16 +4,30 @@ import { MemorySaver } from '@langchain/langgraph';
|
|||
import { Client } from 'langsmith/client';
|
||||
import type { INodeTypeDescription } from 'n8n-workflow';
|
||||
|
||||
import { loadNodesFromFile } from './load-nodes.js';
|
||||
import { anthropicClaudeSonnet45 } from '../../src/llm-config.js';
|
||||
import type { BuilderFeatureFlags } from '../../src/workflow-builder-agent.js';
|
||||
import { WorkflowBuilderAgent } from '../../src/workflow-builder-agent.js';
|
||||
import { loadNodesFromFile } from '../load-nodes.js';
|
||||
import type { EvalLogger } from '../harness/logger.js';
|
||||
import {
|
||||
createTraceFilters,
|
||||
isMinimalTracingEnabled,
|
||||
type TraceFilters,
|
||||
} from '../langsmith/trace-filters.js';
|
||||
|
||||
/** Maximum batch size in bytes for trace uploads (2MB - reduced to avoid 403 errors) */
|
||||
const TRACE_BATCH_SIZE_LIMIT = 2_000_000;
|
||||
|
||||
/** Number of concurrent trace batch uploads */
|
||||
const TRACE_BATCH_CONCURRENCY = 1;
|
||||
|
||||
export interface TestEnvironment {
|
||||
parsedNodeTypes: INodeTypeDescription[];
|
||||
llm: BaseChatModel;
|
||||
tracer?: LangChainTracer;
|
||||
lsClient?: Client;
|
||||
/** Trace filtering utilities (only present when minimal tracing is enabled) */
|
||||
traceFilters?: TraceFilters;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -42,29 +56,65 @@ export function createTracer(client: Client, projectName: string): LangChainTrac
|
|||
}
|
||||
|
||||
/**
|
||||
* Creates a Langsmith client if API key is available
|
||||
* @returns Langsmith client or undefined
|
||||
* Result of creating a LangSmith client with optional filtering.
|
||||
*/
|
||||
export function createLangsmithClient(): Client | undefined {
|
||||
export interface LangsmithClientResult {
|
||||
client: Client;
|
||||
/** Trace filters (only present when minimal tracing is enabled) */
|
||||
traceFilters?: TraceFilters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Langsmith client if API key is available.
|
||||
* By default, minimal tracing is enabled to reduce payload sizes and avoid 403 errors.
|
||||
* Set LANGSMITH_MINIMAL_TRACING=false to disable filtering and get full traces.
|
||||
* @param logger - Optional logger for trace filter output
|
||||
* @returns LangSmith client with optional trace filters, or undefined if no API key
|
||||
*/
|
||||
export function createLangsmithClient(logger?: EvalLogger): LangsmithClientResult | undefined {
|
||||
const apiKey = process.env.LANGSMITH_API_KEY;
|
||||
if (!apiKey) {
|
||||
return undefined;
|
||||
}
|
||||
return new Client({ apiKey });
|
||||
|
||||
const minimalTracing = isMinimalTracingEnabled();
|
||||
|
||||
if (!minimalTracing) {
|
||||
return { client: new Client({ apiKey }) };
|
||||
}
|
||||
|
||||
// Create closure-scoped filters for this client instance
|
||||
const traceFilters = createTraceFilters(logger);
|
||||
|
||||
const client = new Client({
|
||||
apiKey,
|
||||
// Filter large fields from traces to avoid 403 payload errors
|
||||
hideInputs: traceFilters.filterInputs,
|
||||
hideOutputs: traceFilters.filterOutputs,
|
||||
// Reduce batch size and concurrency for high-volume scenarios
|
||||
batchSizeBytesLimit: TRACE_BATCH_SIZE_LIMIT,
|
||||
batchSizeLimit: 10, // Limit runs per batch (default 100) to avoid 403 multipart errors
|
||||
traceBatchConcurrency: TRACE_BATCH_CONCURRENCY,
|
||||
});
|
||||
|
||||
return { client, traceFilters };
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets up the test environment with LLM, nodes, and tracing
|
||||
* @param logger - Optional logger for trace filter output
|
||||
* @returns Test environment configuration
|
||||
*/
|
||||
export async function setupTestEnvironment(): Promise<TestEnvironment> {
|
||||
export async function setupTestEnvironment(logger?: EvalLogger): Promise<TestEnvironment> {
|
||||
const parsedNodeTypes = loadNodesFromFile();
|
||||
const llm = await setupLLM();
|
||||
const lsClient = createLangsmithClient();
|
||||
const lsClientResult = createLangsmithClient(logger);
|
||||
|
||||
const lsClient = lsClientResult?.client;
|
||||
const traceFilters = lsClientResult?.traceFilters;
|
||||
const tracer = lsClient ? createTracer(lsClient, 'workflow-builder-evaluation') : undefined;
|
||||
|
||||
return { parsedNodeTypes, llm, tracer, lsClient };
|
||||
return { parsedNodeTypes, llm, tracer, lsClient, traceFilters };
|
||||
}
|
||||
|
||||
export interface CreateAgentOptions {
|
||||
|
|
@ -58,12 +58,14 @@ function filterNodeTypes(
|
|||
}
|
||||
|
||||
export function loadNodesFromFile(): INodeTypeDescription[] {
|
||||
const nodesPath = join(__dirname, 'nodes.json');
|
||||
const preferredPath = join(__dirname, '..', '.data', 'nodes.json');
|
||||
const legacyPath = join(__dirname, '..', 'nodes.json');
|
||||
const nodesPath = existsSync(preferredPath) ? preferredPath : legacyPath;
|
||||
|
||||
if (!existsSync(nodesPath)) {
|
||||
throw new Error(
|
||||
`nodes.json not found at ${nodesPath}. ` +
|
||||
'Run n8n and export node definitions to evaluations/nodes.json',
|
||||
'Run n8n and export node definitions to evaluations/.data/nodes.json',
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,208 @@
|
|||
/**
|
||||
* Markdown Report Generator
|
||||
*
|
||||
* Generates human-readable markdown reports from evaluation results.
|
||||
*/
|
||||
|
||||
import { feedbackKey } from '../harness/feedback';
|
||||
import type { ExampleResult, RunSummary } from '../harness/harness-types';
|
||||
import {
|
||||
groupByEvaluator,
|
||||
selectScoringItems,
|
||||
calculateFiniteAverage,
|
||||
} from '../harness/score-calculator';
|
||||
|
||||
/**
|
||||
* Violation severity levels.
|
||||
*/
|
||||
export type ViolationSeverity = 'critical' | 'major' | 'minor';
|
||||
|
||||
/**
|
||||
* Options for report generation.
|
||||
*/
|
||||
export interface ReportOptions {
|
||||
/** Include detailed per-test results (default: false) */
|
||||
includeDetails?: boolean;
|
||||
/** Include violation breakdown (default: true) */
|
||||
includeViolations?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Metrics calculated from evaluation results.
|
||||
*/
|
||||
export interface ReportMetrics {
|
||||
/** Average score per evaluator */
|
||||
evaluatorAverages: Record<string, number>;
|
||||
/** Count of violations by severity */
|
||||
violationCounts: { critical: number; major: number; minor: number };
|
||||
}
|
||||
|
||||
/** Maximum prompt length before truncation */
|
||||
const MAX_PROMPT_LENGTH = 80;
|
||||
|
||||
/**
|
||||
* Extract violation severity from a feedback comment.
|
||||
*
|
||||
* Looks for markers like [CRITICAL], [MAJOR], [MINOR] in the comment.
|
||||
*
|
||||
* @param comment - The feedback comment to parse
|
||||
* @returns The violation severity or null if not found
|
||||
*/
|
||||
export function extractViolationSeverity(comment?: string): ViolationSeverity | null {
|
||||
if (!comment) return null;
|
||||
|
||||
const lowerComment = comment.toLowerCase();
|
||||
|
||||
if (lowerComment.includes('[critical]')) return 'critical';
|
||||
if (lowerComment.includes('[major]')) return 'major';
|
||||
if (lowerComment.includes('[minor]')) return 'minor';
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Format a number as a percentage string.
|
||||
*/
|
||||
function formatPercentage(value: number, decimals = 1): string {
|
||||
if (!Number.isFinite(value)) return 'N/A';
|
||||
return `${(value * 100).toFixed(decimals)}%`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate a string to a maximum length with ellipsis.
|
||||
*/
|
||||
function truncate(str: string, maxLength: number): string {
|
||||
if (str.length <= maxLength) return str;
|
||||
return str.slice(0, maxLength - 3) + '...';
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate report metrics from evaluation results.
|
||||
*
|
||||
* @param results - Array of example results
|
||||
* @returns Calculated metrics including evaluator averages and violation counts
|
||||
*/
|
||||
export function calculateReportMetrics(results: ExampleResult[]): ReportMetrics {
|
||||
const okResults = results.filter((r) => r.status !== 'error');
|
||||
|
||||
// Calculate evaluator averages (per-example, then average; avoids key-count skew)
|
||||
const evaluatorScores: Record<string, number[]> = {};
|
||||
for (const result of okResults) {
|
||||
const grouped = groupByEvaluator(result.feedback);
|
||||
for (const [evaluator, items] of Object.entries(grouped)) {
|
||||
if (!evaluatorScores[evaluator]) evaluatorScores[evaluator] = [];
|
||||
evaluatorScores[evaluator].push(calculateFiniteAverage(selectScoringItems(items)));
|
||||
}
|
||||
}
|
||||
|
||||
const evaluatorAverages: Record<string, number> = {};
|
||||
for (const [evaluator, scores] of Object.entries(evaluatorScores)) {
|
||||
evaluatorAverages[evaluator] = scores.reduce((sum, s) => sum + s, 0) / scores.length;
|
||||
}
|
||||
|
||||
// Count violations by severity
|
||||
const violationCounts = { critical: 0, major: 0, minor: 0 };
|
||||
for (const result of okResults) {
|
||||
for (const feedback of result.feedback) {
|
||||
const severity = extractViolationSeverity(feedback.comment);
|
||||
if (severity) {
|
||||
violationCounts[severity]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
evaluatorAverages,
|
||||
violationCounts,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a markdown report from evaluation results.
|
||||
*
|
||||
* @param results - Array of example results
|
||||
* @param summary - Run summary with totals
|
||||
* @param options - Report generation options
|
||||
* @returns Formatted markdown string
|
||||
*/
|
||||
export function generateMarkdownReport(
|
||||
results: ExampleResult[],
|
||||
summary: RunSummary,
|
||||
options: ReportOptions = {},
|
||||
): string {
|
||||
const { includeDetails = false, includeViolations = true } = options;
|
||||
|
||||
const metrics = calculateReportMetrics(results);
|
||||
const passRate = summary.totalExamples > 0 ? summary.passed / summary.totalExamples : 0;
|
||||
|
||||
let report = `# AI Workflow Builder Evaluation Report
|
||||
|
||||
## Summary
|
||||
- Total Tests: ${summary.totalExamples}
|
||||
- Passed: ${summary.passed} (${formatPercentage(passRate)})
|
||||
- Failed: ${summary.failed}
|
||||
- Errors: ${summary.errors}
|
||||
- Average Score: ${formatPercentage(summary.averageScore)}
|
||||
- Total Duration: ${(summary.totalDurationMs / 1000).toFixed(1)}s
|
||||
|
||||
`;
|
||||
|
||||
// Evaluator Averages
|
||||
if (Object.keys(metrics.evaluatorAverages).length > 0) {
|
||||
report += `## Evaluator Averages
|
||||
`;
|
||||
for (const [evaluator, avg] of Object.entries(metrics.evaluatorAverages)) {
|
||||
report += `- ${evaluator}: ${formatPercentage(avg)}
|
||||
`;
|
||||
}
|
||||
report += '\n';
|
||||
}
|
||||
|
||||
// Violations Summary
|
||||
if (includeViolations) {
|
||||
const { critical, major, minor } = metrics.violationCounts;
|
||||
report += `## Violations Summary
|
||||
- Critical: ${critical}
|
||||
- Major: ${major}
|
||||
- Minor: ${minor}
|
||||
|
||||
`;
|
||||
}
|
||||
|
||||
// Detailed Results
|
||||
if (includeDetails && results.length > 0) {
|
||||
report += `## Detailed Results
|
||||
|
||||
`;
|
||||
for (const result of results) {
|
||||
const promptPreview = truncate(result.prompt, MAX_PROMPT_LENGTH);
|
||||
const resultScore = result.score;
|
||||
|
||||
report += `### Test ${result.index}: ${promptPreview}
|
||||
- **Status**: ${result.status}
|
||||
- **Score**: ${formatPercentage(resultScore)}
|
||||
- **Duration**: ${result.durationMs}ms
|
||||
`;
|
||||
|
||||
if (result.error) {
|
||||
report += `- **Error**: ${result.error}
|
||||
`;
|
||||
}
|
||||
|
||||
if (result.feedback.length > 0) {
|
||||
report += `- **Feedback**:
|
||||
`;
|
||||
for (const fb of result.feedback) {
|
||||
const scoreStr = formatPercentage(fb.score);
|
||||
const commentStr = fb.comment ? ` - ${fb.comment}` : '';
|
||||
report += ` - [${feedbackKey(fb)}] ${scoreStr}${commentStr}
|
||||
`;
|
||||
}
|
||||
}
|
||||
|
||||
report += '\n';
|
||||
}
|
||||
}
|
||||
|
||||
return report;
|
||||
}
|
||||
|
|
@ -0,0 +1,179 @@
|
|||
/**
|
||||
* Test Case Generator
|
||||
*
|
||||
* Generates test cases for workflow evaluation using LLM with structured output.
|
||||
* For default test cases, see fixtures/default-prompts.csv.
|
||||
*/
|
||||
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { HumanMessage, SystemMessage } from '@langchain/core/messages';
|
||||
import { z } from 'zod';
|
||||
|
||||
/**
|
||||
* Options for test case generation.
|
||||
*/
|
||||
export interface TestCaseGeneratorOptions {
|
||||
/** Number of test cases to generate (default: 10) */
|
||||
count?: number;
|
||||
/** Focus areas for generation */
|
||||
focus?: string;
|
||||
/** Complexity distribution */
|
||||
complexity?: 'balanced' | 'simple' | 'complex';
|
||||
}
|
||||
|
||||
/**
|
||||
* A test case generated by the LLM.
|
||||
*/
|
||||
export interface GeneratedTestCase {
|
||||
id: string;
|
||||
name: string;
|
||||
summary: string;
|
||||
prompt: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test case generator interface.
|
||||
*/
|
||||
export interface TestCaseGenerator {
|
||||
/** Generate test cases */
|
||||
generate(): Promise<GeneratedTestCase[]>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Zod schema for structured output.
|
||||
*/
|
||||
const generatedTestCasesSchema = z.object({
|
||||
testCases: z.array(
|
||||
z.object({
|
||||
id: z.string().describe('Unique identifier (e.g., "test_001")'),
|
||||
name: z.string().describe('Short descriptive title'),
|
||||
summary: z.string().describe('Brief description of what the workflow does'),
|
||||
prompt: z.string().describe('User-facing prompt for workflow generation'),
|
||||
}),
|
||||
),
|
||||
});
|
||||
|
||||
/** Inferred type from the Zod schema */
|
||||
type GeneratedTestCasesOutput = z.infer<typeof generatedTestCasesSchema>;
|
||||
|
||||
/** Parse and validate LLM output using the Zod schema */
|
||||
function parseTestCasesOutput(value: unknown): GeneratedTestCasesOutput {
|
||||
const parsed = generatedTestCasesSchema.safeParse(value);
|
||||
if (!parsed.success) {
|
||||
throw new Error(`Invalid LLM output: ${parsed.error.message}`);
|
||||
}
|
||||
return parsed.data;
|
||||
}
|
||||
|
||||
/**
|
||||
* System prompt for test case generation.
|
||||
*/
|
||||
const systemPrompt = `You are an expert at generating diverse test cases for an n8n workflow builder AI. Create test cases that cover various real-world scenarios and complexity levels.
|
||||
|
||||
## Test Case Requirements:
|
||||
|
||||
1. **Simple Test Cases**: Single operation workflows
|
||||
- API calls
|
||||
- Data transformations
|
||||
- File operations
|
||||
- Basic integrations
|
||||
|
||||
2. **Medium Test Cases**: Multi-step workflows with logic
|
||||
- Conditional logic (IF nodes)
|
||||
- Data filtering and transformation
|
||||
- Multiple API integrations
|
||||
- Error handling
|
||||
|
||||
3. **Complex Test Cases**: Advanced workflows
|
||||
- Parallel execution branches
|
||||
- Complex error handling and retry logic
|
||||
- Multiple integrations with data synchronization
|
||||
- Webhooks and event-driven flows
|
||||
|
||||
## Guidelines:
|
||||
- Create realistic business scenarios
|
||||
- Include specific requirements that can be evaluated
|
||||
- Vary the domains (e-commerce, HR, marketing, DevOps, etc.)
|
||||
- Include both common and edge-case scenarios
|
||||
- Make prompts clear and unambiguous
|
||||
- Specify expected node types when possible
|
||||
|
||||
## Output Format:
|
||||
Each test case should have:
|
||||
- Unique ID (e.g., "test_001")
|
||||
- Descriptive name
|
||||
- Brief description
|
||||
- Clear prompt that a user would give`;
|
||||
|
||||
/**
|
||||
* Get default focus based on complexity option.
|
||||
*/
|
||||
function getFocus(options?: TestCaseGeneratorOptions): string {
|
||||
const complexity = options?.complexity ?? 'balanced';
|
||||
|
||||
if (options?.focus) {
|
||||
return options.focus;
|
||||
}
|
||||
|
||||
switch (complexity) {
|
||||
case 'simple':
|
||||
return 'simple, single-operation workflows like basic API calls, data transformations, and file operations';
|
||||
case 'complex':
|
||||
return 'complex, multi-step workflows with parallel execution, error handling, and multiple integrations';
|
||||
case 'balanced':
|
||||
default:
|
||||
return 'balanced mix of API integrations, data processing, and automation scenarios';
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the human message content.
|
||||
*/
|
||||
function buildHumanMessage(count: number, focus: string): string {
|
||||
return `Generate ${count} diverse test cases for workflow generation evaluation.
|
||||
|
||||
Focus on:
|
||||
${focus}
|
||||
|
||||
Ensure a good mix of complexity levels and use cases.`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a test case generator that uses LLM to generate test cases.
|
||||
*
|
||||
* @param llm - Language model to use for generation
|
||||
* @param options - Generation options
|
||||
* @returns A test case generator
|
||||
*
|
||||
* @example
|
||||
* ```typescript
|
||||
* const generator = createTestCaseGenerator(llm, { count: 20 });
|
||||
* const testCases = await generator.generate();
|
||||
* ```
|
||||
*/
|
||||
export function createTestCaseGenerator(
|
||||
llm: BaseChatModel,
|
||||
options?: TestCaseGeneratorOptions,
|
||||
): TestCaseGenerator {
|
||||
const count = options?.count ?? 10;
|
||||
const focus = getFocus(options);
|
||||
|
||||
// Create LLM with structured output
|
||||
const llmWithStructuredOutput = llm.withStructuredOutput(generatedTestCasesSchema);
|
||||
|
||||
return {
|
||||
async generate(): Promise<GeneratedTestCase[]> {
|
||||
const humanMessage = buildHumanMessage(count, focus);
|
||||
|
||||
const rawResult = await llmWithStructuredOutput.invoke([
|
||||
new SystemMessage(systemPrompt),
|
||||
new HumanMessage(humanMessage),
|
||||
]);
|
||||
|
||||
// Validate and parse the LLM output using Zod
|
||||
const result = parseTestCasesOutput(rawResult);
|
||||
|
||||
return result.testCases;
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
@ -1,55 +0,0 @@
|
|||
import { z } from 'zod';
|
||||
|
||||
import type { PromptCategorization, WorkflowTechniqueType } from '../../src/types/categorization';
|
||||
|
||||
/**
|
||||
* Test case for categorization evaluation
|
||||
*/
|
||||
export const categorizationTestCaseSchema = z.object({
|
||||
id: z.string(),
|
||||
prompt: z.string(),
|
||||
});
|
||||
|
||||
export type CategorizationTestCase = z.infer<typeof categorizationTestCaseSchema>;
|
||||
|
||||
/**
|
||||
* Result of a single categorization test
|
||||
*/
|
||||
export interface CategorizationTestResult {
|
||||
testCase: CategorizationTestCase;
|
||||
categorization: PromptCategorization;
|
||||
techniqueDescriptions: Record<WorkflowTechniqueType, string>;
|
||||
executionTime: number;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Technique frequency statistics
|
||||
*/
|
||||
export interface TechniqueFrequency {
|
||||
technique: WorkflowTechniqueType;
|
||||
description: string;
|
||||
count: number;
|
||||
percentage: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Summary of categorization evaluation results
|
||||
*/
|
||||
export interface CategorizationEvaluationSummary {
|
||||
totalPrompts: number;
|
||||
successfulCategorizations: number;
|
||||
failedCategorizations: number;
|
||||
averageConfidence: number;
|
||||
averageExecutionTime: number;
|
||||
techniqueFrequencies: TechniqueFrequency[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Complete categorization evaluation output
|
||||
*/
|
||||
export interface CategorizationEvaluationOutput {
|
||||
timestamp: string;
|
||||
summary: CategorizationEvaluationSummary;
|
||||
results: CategorizationTestResult[];
|
||||
}
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
import type { ProgrammaticEvaluationResult } from '@/validation/types';
|
||||
|
||||
import type { TestCase, EvaluationResult } from './evaluation';
|
||||
import type { SimpleWorkflow } from '../../src/types/workflow.js';
|
||||
|
||||
export type {
|
||||
ProgrammaticEvaluationResult,
|
||||
SingleEvaluatorResult,
|
||||
} from '@/validation/types';
|
||||
|
||||
/**
|
||||
* Cache statistics for prompt caching analysis
|
||||
*/
|
||||
export interface CacheStatistics {
|
||||
inputTokens: number;
|
||||
outputTokens: number;
|
||||
cacheCreationTokens: number;
|
||||
cacheReadTokens: number;
|
||||
cacheHitRate: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cache statistics for a single message/API call
|
||||
*/
|
||||
export interface MessageCacheStats {
|
||||
messageIndex: number;
|
||||
timestamp: string;
|
||||
messageType: 'user' | 'assistant' | 'tool_call' | 'tool_response';
|
||||
role?: string;
|
||||
toolName?: string;
|
||||
inputTokens: number;
|
||||
outputTokens: number;
|
||||
cacheCreationTokens: number;
|
||||
cacheReadTokens: number;
|
||||
cacheHitRate: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Result of running a single test case
|
||||
*/
|
||||
export interface TestResult {
|
||||
testCase: TestCase;
|
||||
generatedWorkflow: SimpleWorkflow;
|
||||
evaluationResult: EvaluationResult;
|
||||
programmaticEvaluationResult: ProgrammaticEvaluationResult;
|
||||
generationTime: number;
|
||||
cacheStats?: CacheStatistics;
|
||||
error?: string;
|
||||
}
|
||||
|
|
@ -1,105 +0,0 @@
|
|||
import type { UsageMetadata } from '../../types/langsmith';
|
||||
import type { CacheStatistics } from '../../types/test-result';
|
||||
import { calculateCacheStats, aggregateCacheStats, formatCacheStats } from '../cache-analyzer';
|
||||
|
||||
describe('cache-analyzer', () => {
|
||||
describe('calculateCacheStats', () => {
|
||||
it('should calculate cache statistics correctly', () => {
|
||||
const usage: Partial<UsageMetadata> = {
|
||||
input_tokens: 1000,
|
||||
output_tokens: 500,
|
||||
cache_creation_input_tokens: 2000,
|
||||
cache_read_input_tokens: 3000,
|
||||
};
|
||||
|
||||
const stats = calculateCacheStats(usage);
|
||||
|
||||
expect(stats.inputTokens).toBe(1000);
|
||||
expect(stats.outputTokens).toBe(500);
|
||||
expect(stats.cacheCreationTokens).toBe(2000);
|
||||
expect(stats.cacheReadTokens).toBe(3000);
|
||||
expect(stats.cacheHitRate).toBeCloseTo(0.5, 2); // 3000 / (1000 + 2000 + 3000)
|
||||
});
|
||||
|
||||
it('should handle zero tokens', () => {
|
||||
const usage: Partial<UsageMetadata> = {
|
||||
input_tokens: 0,
|
||||
output_tokens: 0,
|
||||
};
|
||||
|
||||
const stats = calculateCacheStats(usage);
|
||||
|
||||
expect(stats.cacheHitRate).toBe(0);
|
||||
});
|
||||
|
||||
it('should handle undefined cache tokens', () => {
|
||||
const usage: Partial<UsageMetadata> = {
|
||||
input_tokens: 1000,
|
||||
output_tokens: 500,
|
||||
};
|
||||
|
||||
const stats = calculateCacheStats(usage);
|
||||
|
||||
expect(stats.cacheCreationTokens).toBe(0);
|
||||
expect(stats.cacheReadTokens).toBe(0);
|
||||
expect(stats.cacheHitRate).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('aggregateCacheStats', () => {
|
||||
it('should aggregate multiple cache statistics', () => {
|
||||
const stats: CacheStatistics[] = [
|
||||
{
|
||||
inputTokens: 1000,
|
||||
outputTokens: 500,
|
||||
cacheCreationTokens: 2000,
|
||||
cacheReadTokens: 3000,
|
||||
cacheHitRate: 0.5,
|
||||
},
|
||||
{
|
||||
inputTokens: 1500,
|
||||
outputTokens: 750,
|
||||
cacheCreationTokens: 2500,
|
||||
cacheReadTokens: 3500,
|
||||
cacheHitRate: 0.6,
|
||||
},
|
||||
];
|
||||
|
||||
const aggregate = aggregateCacheStats(stats);
|
||||
|
||||
expect(aggregate.inputTokens).toBe(2500);
|
||||
expect(aggregate.outputTokens).toBe(1250);
|
||||
expect(aggregate.cacheCreationTokens).toBe(4500);
|
||||
expect(aggregate.cacheReadTokens).toBe(6500);
|
||||
// Cache hit rate recalculated: 6500 / (2500 + 4500 + 6500)
|
||||
expect(aggregate.cacheHitRate).toBeCloseTo(0.4815, 3);
|
||||
});
|
||||
|
||||
it('should handle empty array', () => {
|
||||
const aggregate = aggregateCacheStats([]);
|
||||
|
||||
expect(aggregate.inputTokens).toBe(0);
|
||||
expect(aggregate.cacheHitRate).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('formatCacheStats', () => {
|
||||
it('should format statistics for display', () => {
|
||||
const stats: CacheStatistics = {
|
||||
inputTokens: 12345,
|
||||
outputTokens: 6789,
|
||||
cacheCreationTokens: 15000,
|
||||
cacheReadTokens: 30000,
|
||||
cacheHitRate: 0.6677,
|
||||
};
|
||||
|
||||
const formatted = formatCacheStats(stats);
|
||||
|
||||
expect(formatted.inputTokens).toBe('12,345');
|
||||
expect(formatted.outputTokens).toBe('6,789');
|
||||
expect(formatted.cacheCreationTokens).toBe('15,000');
|
||||
expect(formatted.cacheReadTokens).toBe('30,000');
|
||||
expect(formatted.cacheHitRate).toBe('66.77%');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -1,118 +0,0 @@
|
|||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
import type { EvalLogger } from './logger';
|
||||
import type { GenerationResult, MultiGenerationAggregation } from '../pairwise/judge-panel';
|
||||
|
||||
export interface ArtifactSaver {
|
||||
savePrompt(promptId: string, prompt: string, criteria: { dos: string; donts: string }): void;
|
||||
saveGeneration(promptId: string, genIndex: number, result: GenerationResult): void;
|
||||
saveSummary(results: Array<{ promptId: string; aggregation: MultiGenerationAggregation }>): void;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Implementation
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* Creates an artifact saver for persisting evaluation results to disk.
|
||||
* @param outputDir - Directory to save artifacts to
|
||||
* @param log - Logger instance for verbose output
|
||||
* @returns ArtifactSaver instance or null if outputDir is not provided
|
||||
*/
|
||||
export function createArtifactSaver(
|
||||
outputDir: string | undefined,
|
||||
log: EvalLogger,
|
||||
): ArtifactSaver | null {
|
||||
if (!outputDir) return null;
|
||||
|
||||
// Create output directory if it doesn't exist
|
||||
fs.mkdirSync(outputDir, { recursive: true });
|
||||
|
||||
return {
|
||||
savePrompt(promptId: string, prompt: string, criteria: { dos: string; donts: string }): void {
|
||||
const promptDir = path.join(outputDir, `prompt-${promptId}`);
|
||||
fs.mkdirSync(promptDir, { recursive: true });
|
||||
|
||||
// Save prompt text
|
||||
fs.writeFileSync(path.join(promptDir, 'prompt.txt'), prompt, 'utf-8');
|
||||
|
||||
// Save criteria
|
||||
fs.writeFileSync(
|
||||
path.join(promptDir, 'criteria.json'),
|
||||
JSON.stringify(criteria, null, 2),
|
||||
'utf-8',
|
||||
);
|
||||
|
||||
log.verbose(` 📁 Saved prompt artifacts to ${promptDir}`);
|
||||
},
|
||||
|
||||
saveGeneration(promptId: string, genIndex: number, result: GenerationResult): void {
|
||||
const genDir = path.join(outputDir, `prompt-${promptId}`, `gen-${genIndex + 1}`);
|
||||
fs.mkdirSync(genDir, { recursive: true });
|
||||
|
||||
// Save workflow as importable n8n JSON
|
||||
const workflowForExport = {
|
||||
name: result.workflow.name ?? `Generated Workflow - Gen ${genIndex + 1}`,
|
||||
nodes: result.workflow.nodes ?? [],
|
||||
connections: result.workflow.connections ?? {},
|
||||
};
|
||||
fs.writeFileSync(
|
||||
path.join(genDir, 'workflow.json'),
|
||||
JSON.stringify(workflowForExport, null, 2),
|
||||
'utf-8',
|
||||
);
|
||||
|
||||
// Save evaluation results
|
||||
const evalResult = {
|
||||
generationIndex: genIndex + 1,
|
||||
majorityPass: result.majorityPass,
|
||||
primaryPasses: result.primaryPasses,
|
||||
numJudges: result.judgeResults.length,
|
||||
diagnosticScore: result.avgDiagnosticScore,
|
||||
judges: result.judgeResults.map((jr, i) => ({
|
||||
judgeIndex: i + 1,
|
||||
primaryPass: jr.primaryPass,
|
||||
diagnosticScore: jr.diagnosticScore,
|
||||
violations: jr.violations,
|
||||
passes: jr.passes,
|
||||
})),
|
||||
};
|
||||
fs.writeFileSync(
|
||||
path.join(genDir, 'evaluation.json'),
|
||||
JSON.stringify(evalResult, null, 2),
|
||||
'utf-8',
|
||||
);
|
||||
|
||||
log.verbose(` 📁 Saved gen-${genIndex + 1} artifacts to ${genDir}`);
|
||||
},
|
||||
|
||||
saveSummary(
|
||||
results: Array<{ promptId: string; aggregation: MultiGenerationAggregation }>,
|
||||
): void {
|
||||
const summary = {
|
||||
timestamp: new Date().toISOString(),
|
||||
totalPrompts: results.length,
|
||||
results: results.map((r) => ({
|
||||
promptId: r.promptId,
|
||||
generationCorrectness: r.aggregation.generationCorrectness,
|
||||
aggregatedDiagnosticScore: r.aggregation.aggregatedDiagnosticScore,
|
||||
passingGenerations: r.aggregation.passingGenerations,
|
||||
totalGenerations: r.aggregation.totalGenerations,
|
||||
})),
|
||||
averageGenerationCorrectness:
|
||||
results.reduce((sum, r) => sum + r.aggregation.generationCorrectness, 0) / results.length,
|
||||
averageDiagnosticScore:
|
||||
results.reduce((sum, r) => sum + r.aggregation.aggregatedDiagnosticScore, 0) /
|
||||
results.length,
|
||||
};
|
||||
fs.writeFileSync(
|
||||
path.join(outputDir, 'summary.json'),
|
||||
JSON.stringify(summary, null, 2),
|
||||
'utf-8',
|
||||
);
|
||||
|
||||
log.info(`📁 Saved summary to ${path.join(outputDir, 'summary.json')}`);
|
||||
},
|
||||
};
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue