feat(ai-builder): Workflow evaluation framework with LLM mock execution (#27818)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Arvin A <51036481+DeveloperTheExplorer@users.noreply.github.com>
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
This commit is contained in:
José Braulio González Valido 2026-04-07 14:31:16 +01:00 committed by GitHub
parent 7ed34d7f85
commit 2383749980
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
49 changed files with 7986 additions and 10 deletions

View file

@ -283,6 +283,7 @@ export {
gatewayConfirmationRequiredPayloadSchema,
GATEWAY_CONFIRMATION_REQUIRED_PREFIX,
InstanceAiSendMessageRequest,
InstanceAiEvalExecutionRequest,
instanceAiGatewayKeySchema,
InstanceAiGatewayEventsQuery,
InstanceAiEventsQuery,
@ -355,6 +356,11 @@ export type {
GatewayConfirmationRequiredPayload,
ToolCategory,
InstanceAiWorkflowSetupNode,
InstanceAiEvalNodeExecutionMode,
InstanceAiEvalInterceptedRequest,
InstanceAiEvalNodeResult,
InstanceAiEvalMockHints,
InstanceAiEvalExecutionResult,
} from './schemas/instance-ai.schema';
export {

View file

@ -911,3 +911,50 @@ export function getRenderHint(toolName: string): InstanceAiToolCallState['render
if (RESEARCH_RENDER_HINT_TOOLS.has(toolName)) return 'researcher';
return 'default';
}
// ---------------------------------------------------------------------------
// Eval mock execution — request/response types for LLM-based workflow evaluation
// ---------------------------------------------------------------------------

/** How a node was executed during an eval run: HTTP-mocked, pin-data-injected, or real. */
export type InstanceAiEvalNodeExecutionMode = 'mocked' | 'pinned' | 'real';

/** One HTTP request intercepted from a node during mocked execution. */
export interface InstanceAiEvalInterceptedRequest {
	/** Target URL the node attempted to call */
	url: string;
	/** HTTP method of the intercepted request */
	method: string;
	/** Type identifier of the node that issued the request */
	nodeType: string;
	/** The request body sent by the node (if any) */
	requestBody?: unknown;
	/** The mock response body returned by the LLM handler for this request */
	mockResponse?: unknown;
}

/** Per-node outcome of an eval execution, including the intercepted HTTP traffic. */
export interface InstanceAiEvalNodeResult {
	/** The node's output data (shape depends on the node — not normalized here) */
	output: unknown;
	/** Every HTTP request intercepted while this node ran */
	interceptedRequests: InstanceAiEvalInterceptedRequest[];
	/** Whether the node was mocked, pinned, or executed for real */
	executionMode: InstanceAiEvalNodeExecutionMode;
	/** Missing required parameters detected before execution (empty = fully configured) */
	configIssues?: Record<string, string[]>;
	/** Epoch ms when the node started executing — used to sort the execution trace chronologically */
	startTime?: number;
}

/** Phase-1 mock-data plan: consistent hints used to steer per-request mock generation. */
export interface InstanceAiEvalMockHints {
	/** Scenario-wide context shared across all mock responses */
	globalContext: string;
	/** Data injected at the trigger node */
	triggerContent: Record<string, unknown>;
	/** Per-node hint strings, keyed by node name */
	nodeHints: Record<string, string>;
	/** Non-fatal issues encountered while generating hints */
	warnings: string[];
	/** Pin data generated for nodes that bypass the HTTP mock layer (AI roots, protocol nodes) */
	bypassPinData: Record<string, Array<{ json: Record<string, unknown> }>>;
}

/** Full result of an eval execution: per-node results plus the hints that drove it. */
export interface InstanceAiEvalExecutionResult {
	executionId: string;
	/** True when the workflow ran to completion without errors */
	success: boolean;
	/** Results keyed by node name */
	nodeResults: Record<string, InstanceAiEvalNodeResult>;
	/** Execution-level error messages (empty on success) */
	errors: string[];
	hints: InstanceAiEvalMockHints;
}

/** Request body for triggering an eval execution; scenario hints steer mock generation. */
export class InstanceAiEvalExecutionRequest extends Z.class({
	scenarioHints: z.string().max(2000).optional(),
}) {}

View file

@ -0,0 +1,148 @@
# Workflow evaluation framework
Tests whether workflows built by Instance AI actually work by executing them with LLM-generated mock HTTP responses.
## Quick start
```bash
# From packages/@n8n/instance-ai/
# Run all test cases
dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai workflows --verbose
# Run a single test case
dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai workflows --filter contact-form --verbose
```
The n8n server must be running with `N8N_ENABLED_MODULES=instance-ai`.
### Environment variables
Set these in `.env.local`:
| Variable | Required | Description |
|----------|----------|-------------|
| `N8N_INSTANCE_AI_MODEL_API_KEY` | Yes | Anthropic API key — shared with the Instance AI agent and used for Phase 1 hints, Phase 2 mock generation, and verification |
| `N8N_EVAL_EMAIL` | Yes | n8n login email for the eval runner |
| `N8N_EVAL_PASSWORD` | Yes | n8n login password |
| `CONTEXT7_API_KEY` | No | Context7 API key for higher rate limits on API doc lookups. Free tier is 1,000 req/month |
## How it works
Each test run:
1. **Build** — sends the test case prompt to Instance AI, which builds a workflow
2. **Phase 1** — analyzes the workflow and generates consistent mock data hints (one Sonnet call per scenario)
3. **Phase 2** — executes the workflow with all HTTP requests intercepted. Each request goes to an LLM that generates a realistic API response using the node's configuration and API documentation from Context7
4. **Verify** — an LLM evaluates whether the scenario's success criteria were met, categorizes any failure as `builder_issue`, `mock_issue`, `legitimate_failure`, or `verification_gap`
### What gets mocked
- **Mocked nodes** — any node that makes HTTP requests (Gmail, Slack, Google Sheets, HTTP Request, etc.). The request is intercepted before it leaves the process. An LLM generates the response.
- **Pinned nodes** — trigger/start nodes get LLM-generated input data injected as pin data
- **Real nodes** — logic nodes (Code, Set, Merge, Filter, Sort, IF, Switch) execute their actual code on the mocked/pinned data
No real credentials or API connections are needed.
## Adding test cases
Test cases live in `evaluations/data/workflows/*.json`:
```json
{
"prompt": "Create a workflow that...",
"complexity": "medium",
"tags": ["build", "webhook", "gmail"],
"triggerType": "webhook",
"scenarios": [
{
"name": "happy-path",
"description": "Normal operation",
"dataSetup": "The webhook receives a submission from Jane (jane@example.com)...",
"successCriteria": "The workflow executes without errors. An email is sent to jane@example.com..."
}
]
}
```
### Writing good test cases
**Prompt tips:**
- Be specific about node configuration — include document IDs, sheet names, channel names, chat IDs. The agent won't ask for these in eval mode (no multi-turn support yet).
- Say "Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later."
- If a built-in node doesn't expose the fields you need (e.g., Linear node doesn't query `creator.email`), tell the agent to use an HTTP Request node with a custom API call instead.
**Scenario tips:**
- Don't specify exact counts that depend on mock data (e.g., "exactly 7 posts remain"). The LLM generates data non-deterministically. Instead say "some posts are filtered out — fewer remain than the original 10."
- The `dataSetup` field steers the mock data generation. Describe what each service should return, not the exact JSON.
- For error scenarios, describe the error condition: "The Telegram node returns an error indicating the chat was not found."
- The `successCriteria` is what the verification LLM checks. Be specific about what constitutes success — "None of the titles in the Slack message should contain the word 'qui'."
**Scenarios to include:**
- `happy-path` — everything works as expected
- Edge cases — empty data, missing fields, single vs multiple items
- Error scenarios only if the workflow is expected to handle them gracefully. Most agent-built workflows don't include error handling, so testing "the workflow crashes on invalid input" is a legitimate finding, not a test case failure.
## Understanding the report
Each run generates a timestamped HTML report in `.data/` plus a stable `workflow-eval-report.html`.
### Failure categories
When a scenario fails, the verifier categorizes the root cause:
- **builder_issue** (amber) — the agent misconfigured a node, chose the wrong node type, or the workflow structure doesn't match what was asked. Examples: Switch node missing required `conditions.options`, Linear node not querying `creator.email`, missing error handling.
- **mock_issue** (red) — the LLM mock returned incorrect data. Examples: `_evalMockError` (JSON parse failure), wrong response shape for the endpoint, identical responses for repeated calls.
- **legitimate_failure** — the workflow genuinely doesn't meet the success criteria. Neither builder nor mock is at fault.
- **verification_gap** — not enough information to determine the cause.
### Report sections
- **Dashboard** — pass rate, counts at a glance
- **Scenario indicators** — inline pass/fail on the collapsed test case card
- **Built workflow** — node list with execution modes and config issues
- **Agent output** — raw workflow JSON for cross-run comparison
- **Execution trace** — per-node detail with request/response pairs for mocked nodes
- **Mock data plan** — Phase 1 hints (global context, trigger content, per-node hints)
- **Diagnosis** — verifier reasoning with failure category and root cause
## Known limitations
- **LangChain/AI nodes** — use their own SDKs, not intercepted by the HTTP mock layer. These nodes will fail with credential errors. Use pin data for these (tracked in AI-2297).
- **GraphQL APIs** — response shape depends on the query, not just the endpoint. The mock handles this when the request body (containing the query) is passed to the LLM, but quality depends on the LLM knowing the API schema.
- **Context7 quota** — free tier is 1,000 requests/month, 60/hour. A full suite run uses ~100 requests. Set `CONTEXT7_API_KEY` for sustained use. When quota is exceeded, a warning is logged and the LLM falls back to its training data.
- **Non-determinism** — the agent builds different workflows each run. Some configurations work, some don't. Contact Form is stable at 5/5. Other test cases vary based on how the agent configures nodes.
- **Switch/IF nodes** — the agent sometimes builds these without the required `conditions.options` block, causing a `caseSensitive` runtime crash. This is a known agent builder issue.
## Architecture
```
evaluations/
├── cli/ # CLI entry point and args parsing
├── clients/ # n8n REST + SSE clients
├── checklist/ # Verification (programmatic + LLM)
├── credentials/ # Test credential seeding
├── data/
│ ├── prompts.ts # Original prompt-based eval prompts
│ └── workflows/ # Workflow test case JSON files
├── harness/ # Runner orchestration
├── outcome/ # Outcome extraction (original flow)
├── execution/ # Post-build execution (original flow)
├── report/ # HTML report generators
└── system-prompts/ # LLM prompts (builder-* for original flow, mock-* for mock execution)
packages/cli/src/modules/instance-ai/eval/
├── execution.service.ts # Phase 1 + Phase 2 orchestration
├── workflow-analysis.ts # Hint generation (Phase 1)
├── mock-handler.ts # Per-request mock generation (Phase 2)
├── api-docs.ts # Context7 API doc fetcher
├── node-config.ts # Node config serializer
├── pin-data-generator.ts # LLM pin data for bypass nodes (Phase 1.5)
packages/core/src/execution-engine/
├── eval-mock-helpers.ts # HTTP interception utilities
```
Two evaluation approaches coexist:
- **Original** (`pnpm eval:instance-ai`) — prompt-based builder evaluation using checklists
- **Workflow test cases** (`pnpm eval:instance-ai workflows`) — mock execution evaluation

View file

@ -0,0 +1,341 @@
import { extractOutcomeFromEvents, buildMetrics } from '../outcome/event-parser';
import type { CapturedEvent } from '../types';
// ---------------------------------------------------------------------------
// extractOutcomeFromEvents
// ---------------------------------------------------------------------------
describe('extractOutcomeFromEvents', () => {
	// Shorthand builders for the tool-call/tool-result envelopes used throughout
	// this suite — the payload shapes are exactly what the parser consumes.
	const toolCall = (timestamp: number, payload: Record<string, unknown>): CapturedEvent => ({
		timestamp,
		type: 'tool-call',
		data: { type: 'tool-call', payload },
	});
	const toolResult = (timestamp: number, payload: Record<string, unknown>): CapturedEvent => ({
		timestamp,
		type: 'tool-result',
		data: { type: 'tool-result', payload },
	});

	it('returns empty outcome for no events', () => {
		const outcome = extractOutcomeFromEvents([]);
		expect(outcome.workflowIds).toEqual([]);
		expect(outcome.executionIds).toEqual([]);
		expect(outcome.dataTableIds).toEqual([]);
		expect(outcome.finalText).toBe('');
		expect(outcome.toolCalls).toEqual([]);
		expect(outcome.agentActivities).toEqual([]);
	});

	it('collects text from text-delta events', () => {
		const outcome = extractOutcomeFromEvents([
			{ timestamp: 1000, type: 'text-delta', data: { type: 'text-delta', text: 'Hello ' } },
			{ timestamp: 1001, type: 'text-delta', data: { type: 'text-delta', text: 'World' } },
		]);
		expect(outcome.finalText).toBe('Hello World');
	});

	it('extracts text from payload field', () => {
		const outcome = extractOutcomeFromEvents([
			{
				timestamp: 1000,
				type: 'text-delta',
				data: { type: 'text-delta', payload: { text: 'nested text' } },
			},
		]);
		expect(outcome.finalText).toBe('nested text');
	});

	it('tracks tool calls with duration', () => {
		const outcome = extractOutcomeFromEvents([
			toolCall(1000, { toolCallId: 'tc-1', toolName: 'build-workflow', args: { name: 'Test' } }),
			toolResult(1500, {
				toolCallId: 'tc-1',
				toolName: 'build-workflow',
				result: { workflowId: 'wf-123' },
			}),
		]);
		expect(outcome.toolCalls).toHaveLength(1);
		expect(outcome.toolCalls[0].toolName).toBe('build-workflow');
		expect(outcome.toolCalls[0].durationMs).toBe(500);
		expect(outcome.workflowIds).toContain('wf-123');
	});

	it('extracts workflow IDs from known tool results', () => {
		const outcome = extractOutcomeFromEvents([
			toolCall(1000, { toolCallId: 'tc-1', toolName: 'submit-workflow', args: {} }),
			toolResult(1100, { toolCallId: 'tc-1', result: { id: 'wf-456' } }),
		]);
		expect(outcome.workflowIds).toContain('wf-456');
	});

	it('extracts execution IDs from run-workflow results', () => {
		const outcome = extractOutcomeFromEvents([
			toolCall(1000, { toolCallId: 'tc-1', toolName: 'run-workflow', args: {} }),
			toolResult(1100, {
				toolCallId: 'tc-1',
				toolName: 'run-workflow',
				result: { executionId: 'exec-789' },
			}),
		]);
		expect(outcome.executionIds).toContain('exec-789');
	});

	it('extracts data table IDs from create-data-table results', () => {
		const outcome = extractOutcomeFromEvents([
			toolCall(1000, { toolCallId: 'tc-1', toolName: 'create-data-table', args: {} }),
			toolResult(1100, {
				toolCallId: 'tc-1',
				toolName: 'create-data-table',
				result: { dataTableId: 'dt-001' },
			}),
		]);
		expect(outcome.dataTableIds).toContain('dt-001');
	});

	it('captures tool errors', () => {
		const outcome = extractOutcomeFromEvents([
			toolCall(1000, { toolCallId: 'tc-err', toolName: 'build-workflow', args: {} }),
			{
				timestamp: 1200,
				type: 'tool-error',
				data: {
					type: 'tool-error',
					payload: { toolCallId: 'tc-err', error: 'Something went wrong' },
				},
			},
		]);
		expect(outcome.toolCalls).toHaveLength(1);
		expect(outcome.toolCalls[0].error).toBe('Something went wrong');
		expect(outcome.toolCalls[0].durationMs).toBe(200);
	});

	it('tracks agent activities', () => {
		const outcome = extractOutcomeFromEvents([
			{
				timestamp: 1000,
				type: 'agent-spawned',
				data: {
					type: 'agent-spawned',
					agentId: 'agent-1',
					payload: { agentId: 'agent-1', role: 'builder', parentId: 'root' },
				},
			},
			{
				timestamp: 2000,
				type: 'agent-completed',
				data: {
					type: 'agent-completed',
					agentId: 'agent-1',
					payload: { agentId: 'agent-1', status: 'completed', result: 'Done' },
				},
			},
		]);
		expect(outcome.agentActivities).toHaveLength(1);
		expect(outcome.agentActivities[0].role).toBe('builder');
		expect(outcome.agentActivities[0].status).toBe('completed');
	});

	it('deduplicates resource IDs', () => {
		// Two different tool calls report the same workflow ID — it must appear once.
		const outcome = extractOutcomeFromEvents([
			toolCall(1000, { toolCallId: 'tc-1', toolName: 'build-workflow', args: {} }),
			toolResult(1100, { toolCallId: 'tc-1', result: { workflowId: 'wf-1' } }),
			toolCall(1200, { toolCallId: 'tc-2', toolName: 'patch-workflow', args: {} }),
			toolResult(1300, { toolCallId: 'tc-2', result: { workflowId: 'wf-1' } }),
		]);
		expect(outcome.workflowIds).toEqual(['wf-1']);
	});
});
// ---------------------------------------------------------------------------
// buildMetrics
// ---------------------------------------------------------------------------
describe('buildMetrics', () => {
	const startTime = 1000;

	// Minimal event carrying only its type tag — enough for counting metrics.
	const bare = (timestamp: number, type: CapturedEvent['type']): CapturedEvent => ({
		timestamp,
		type,
		data: { type },
	});

	it('returns zero metrics for no events', () => {
		const m = buildMetrics([], startTime);
		expect(m.totalTimeMs).toBe(0);
		expect(m.timeToFirstTextMs).toBe(0);
		expect(m.timeToRunFinishMs).toBe(0);
		expect(m.totalToolCalls).toBe(0);
		expect(m.subAgentsSpawned).toBe(0);
		expect(m.confirmationRequests).toBe(0);
	});

	it('computes time to first text', () => {
		const m = buildMetrics(
			[
				bare(1500, 'tool-call'),
				{ timestamp: 2000, type: 'text-delta', data: { type: 'text-delta', text: 'hi' } },
				{ timestamp: 2500, type: 'text-delta', data: { type: 'text-delta', text: ' there' } },
			],
			startTime,
		);
		expect(m.timeToFirstTextMs).toBe(1000); // 2000 - 1000
	});

	it('counts tool calls', () => {
		const m = buildMetrics(
			[bare(1100, 'tool-call'), bare(1200, 'tool-call'), bare(1300, 'tool-call')],
			startTime,
		);
		expect(m.totalToolCalls).toBe(3);
	});

	it('counts sub-agents spawned', () => {
		const m = buildMetrics(
			[
				{
					timestamp: 1100,
					type: 'agent-spawned',
					data: { type: 'agent-spawned', agentId: 'a1', payload: { agentId: 'a1', role: 'builder' } },
				},
				{
					timestamp: 1200,
					type: 'agent-spawned',
					data: {
						type: 'agent-spawned',
						agentId: 'a2',
						payload: { agentId: 'a2', role: 'researcher' },
					},
				},
			],
			startTime,
		);
		expect(m.subAgentsSpawned).toBe(2);
	});

	it('counts confirmation requests', () => {
		const m = buildMetrics([bare(1100, 'confirmation-request')], startTime);
		expect(m.confirmationRequests).toBe(1);
	});

	it('captures time to run finish', () => {
		const m = buildMetrics([bare(1100, 'tool-call'), bare(3000, 'run-finish')], startTime);
		expect(m.timeToRunFinishMs).toBe(2000); // 3000 - 1000
	});

	it('computes total time from last event', () => {
		const m = buildMetrics([bare(1100, 'tool-call'), bare(5000, 'run-finish')], startTime);
		expect(m.totalTimeMs).toBe(4000); // 5000 - 1000
	});
});

View file

@ -0,0 +1,99 @@
import { createEvalAgent, extractText } from '../../src/utils/eval-agents';
import type { WorkflowResponse } from '../clients/n8n-client';
import { MOCK_EXECUTION_VERIFY_PROMPT } from '../system-prompts/mock-execution-verify';
import type { ChecklistItem, ChecklistResult } from '../types';
// ---------------------------------------------------------------------------
// JSON parsing helpers
// ---------------------------------------------------------------------------
/**
 * Best-effort extraction of a JSON array from an LLM response.
 *
 * Resolution order:
 *   1. If the text contains a fenced code block, parse the fence contents.
 *   2. Parse the (trimmed) text directly.
 *   3. Fall back to the first `[` … last `]` span — this covers arrays
 *      embedded in prose AND arrays wrapped inside a non-array JSON value
 *      (e.g. `{"results": [...]}`), which the previous version missed
 *      because the fallback only ran when JSON.parse *threw*.
 *
 * Returns [] (after logging a warning) when no array can be recovered —
 * downstream this surfaces as "No verification result".
 */
function parseJsonArray(text: string): unknown[] {
	// Try fenced code block first
	const fenceMatch = text.match(/```(?:json)?\s*\n?([\s\S]*?)```/);
	const jsonStr = fenceMatch ? fenceMatch[1].trim() : text.trim();

	try {
		const parsed: unknown = JSON.parse(jsonStr);
		if (Array.isArray(parsed)) return parsed;
		// Parsed fine but not an array — fall through to the substring fallback.
	} catch {
		// Not valid JSON as-is — fall through to the substring fallback.
	}

	// Fallback: extract the outermost bracketed span from anywhere in the text.
	const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
	if (arrayMatch) {
		try {
			const parsed: unknown = JSON.parse(arrayMatch[0]);
			if (Array.isArray(parsed)) return parsed;
		} catch {
			// fall through to the warning below
		}
	}

	// Log failure for debugging — this causes "No verification result"
	console.warn(
		`[verifier] Failed to parse JSON array from LLM response (${text.length} chars). First 500 chars: ${text.slice(0, 500)}`,
	);
	return [];
}
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
 * Verify LLM-strategy checklist items against a verification artifact.
 *
 * Items whose strategy is not 'llm' are ignored. A single verifier-agent call
 * evaluates all LLM items at once; entries in the response that lack a valid
 * numeric id, a boolean pass, or reference an unknown id are dropped.
 * Results are returned sorted by id for deterministic output.
 */
export async function verifyChecklist(
	checklist: ChecklistItem[],
	verificationArtifact: string,
	_workflowJsons: WorkflowResponse[],
): Promise<ChecklistResult[]> {
	const itemsForLlm = checklist.filter((item) => item.strategy === 'llm');
	if (itemsForLlm.length === 0) return [];

	const userMessage = `## Checklist
${JSON.stringify(itemsForLlm, null, 2)}
## Verification Artifact
${verificationArtifact}
Verify each checklist item against the artifact above.`;

	const verifier = createEvalAgent('eval-checklist-verifier', {
		instructions: MOCK_EXECUTION_VERIFY_PROMPT,
		cache: true,
	});
	const response = await verifier.generate(userMessage, {
		providerOptions: { anthropic: { maxTokens: 16_384 } },
	});

	const rawEntries = parseJsonArray(extractText(response));
	const knownIds = new Set(itemsForLlm.map((item) => item.id));

	const verified: ChecklistResult[] = [];
	for (const candidate of rawEntries) {
		const entry = candidate as Record<string, unknown>;
		// Skip malformed entries and ids the LLM invented.
		if (typeof entry.id !== 'number' || typeof entry.pass !== 'boolean') continue;
		if (!knownIds.has(entry.id)) continue;
		verified.push({
			id: entry.id,
			pass: entry.pass,
			reasoning: typeof entry.reasoning === 'string' ? entry.reasoning : '',
			strategy: 'llm',
			failureCategory:
				typeof entry.failureCategory === 'string' ? entry.failureCategory : undefined,
			rootCause: typeof entry.rootCause === 'string' ? entry.rootCause : undefined,
		});
	}

	// Sort results by id for deterministic output
	return verified.sort((a, b) => a.id - b.id);
}

View file

@ -0,0 +1,136 @@
// ---------------------------------------------------------------------------
// CLI argument parser for the instance-ai workflow evaluator
//
// Uses manual parsing (no external CLI lib) to keep dependencies minimal.
// Validates and normalizes arguments into a typed CliArgs interface.
// ---------------------------------------------------------------------------
import { z } from 'zod';
// ---------------------------------------------------------------------------
// Public types
// ---------------------------------------------------------------------------
/** Normalized, validated CLI arguments for the workflow evaluator. */
export interface CliArgs {
	/** Per-run timeout in milliseconds (default 600_000 = 10 minutes) */
	timeoutMs: number;
	/** Base URL of the running n8n instance (default http://localhost:5678) */
	baseUrl: string;
	/** Login email; optional — consumers may fall back to env vars (not enforced here) */
	email?: string;
	/** Login password; optional, same caveat as email */
	password?: string;
	/** Enable verbose logging */
	verbose: boolean;
	/** Filter workflow test cases by filename substring (e.g. "contact-form") */
	filter?: string;
}
// ---------------------------------------------------------------------------
// Zod schema for validation
// ---------------------------------------------------------------------------
// NOTE: the defaults here duplicate those in parseRawArgs (which always sets
// timeoutMs/baseUrl/verbose); the schema's real job is enforcing that
// timeoutMs is a positive integer and baseUrl is a well-formed URL.
const cliArgsSchema = z.object({
	timeoutMs: z.number().int().positive().default(600_000),
	baseUrl: z.string().url().default('http://localhost:5678'),
	email: z.string().optional(),
	password: z.string().optional(),
	verbose: z.boolean().default(false),
	filter: z.string().optional(),
});
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
 * Parse and validate process arguments into a typed CliArgs.
 * Raw parsing supplies defaults; the zod schema then enforces value constraints.
 */
export function parseCliArgs(argv: string[]): CliArgs {
	const validated = cliArgsSchema.parse(parseRawArgs(argv));
	const { timeoutMs, baseUrl, email, password, verbose, filter } = validated;
	return { timeoutMs, baseUrl, email, password, verbose, filter };
}
// ---------------------------------------------------------------------------
// Raw argument parsing
// ---------------------------------------------------------------------------
/** Intermediate shape produced by raw flag parsing, before schema validation. */
interface RawArgs {
	timeoutMs: number;
	baseUrl: string;
	email?: string;
	password?: string;
	verbose: boolean;
	filter?: string;
}

/**
 * Walk argv, consuming known flags and their values.
 * Unknown flags are deliberately ignored so the CLI tolerates extra arguments.
 */
function parseRawArgs(argv: string[]): RawArgs {
	const parsed: RawArgs = {
		timeoutMs: 600_000,
		baseUrl: 'http://localhost:5678',
		verbose: false,
	};
	let i = 0;
	while (i < argv.length) {
		const flag = argv[i];
		if (flag === '--verbose') {
			parsed.verbose = true;
		} else if (flag === '--timeout-ms') {
			parsed.timeoutMs = parseIntArg(argv, i, '--timeout-ms');
			i += 1; // skip the consumed value
		} else if (flag === '--base-url') {
			parsed.baseUrl = nextArg(argv, i, '--base-url');
			i += 1;
		} else if (flag === '--email') {
			parsed.email = nextArg(argv, i, '--email');
			i += 1;
		} else if (flag === '--password') {
			parsed.password = nextArg(argv, i, '--password');
			i += 1;
		} else if (flag === '--filter') {
			parsed.filter = nextArg(argv, i, '--filter');
			i += 1;
		}
		// anything else: ignore unknown flag
		i += 1;
	}
	return parsed;
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Return the value following a flag, rejecting a missing value or a value
 * that is itself another `--` flag.
 * @throws Error when no usable value follows the flag.
 */
function nextArg(argv: string[], currentIndex: number, flagName: string): string {
	const value = argv[currentIndex + 1];
	if (value === undefined || value.startsWith('--')) {
		throw new Error(`Missing value for ${flagName}`);
	}
	return value;
}

/**
 * Return the integer value following a flag.
 *
 * Fix: the previous version used bare parseInt, which silently accepts
 * trailing garbage ("600abc" → 600) and truncates decimals ("12.9" → 12).
 * A strict digits-only check now rejects those inputs instead.
 * @throws Error when the value is missing or not a plain (optionally signed) integer.
 */
function parseIntArg(argv: string[], currentIndex: number, flagName: string): number {
	const raw = nextArg(argv, currentIndex, flagName);
	if (!/^-?\d+$/.test(raw)) {
		throw new Error(`Invalid integer for ${flagName}: ${raw}`);
	}
	return parseInt(raw, 10);
}

View file

@ -0,0 +1,96 @@
#!/usr/bin/env node
import { parseCliArgs } from './args';
import { N8nClient } from '../clients/n8n-client';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
import { loadWorkflowTestCases } from '../data/workflows';
import { createLogger } from '../harness/logger';
import { runWorkflowTestCase, runWithConcurrency } from '../harness/runner';
import { snapshotWorkflowIds } from '../outcome/workflow-discovery';
import { writeWorkflowReport } from '../report/workflow-report';
/**
 * Entry point for the workflow-evaluation CLI.
 *
 * Flow: parse args → load test cases → authenticate → seed test credentials →
 * run test cases with bounded concurrency → always clean up credentials →
 * write the HTML report and print a per-scenario console summary.
 */
async function main(): Promise<void> {
	const args = parseCliArgs(process.argv.slice(2));
	const testCases = loadWorkflowTestCases(args.filter);
	if (testCases.length === 0) {
		console.log('No workflow test cases found in evaluations/data/workflows/');
		return;
	}
	const totalScenarios = testCases.reduce((sum, tc) => sum + tc.scenarios.length, 0);
	console.log(
		`Running ${String(testCases.length)} workflow test case(s) with ${String(totalScenarios)} scenario(s)\n`,
	);
	const logger = createLogger(args.verbose);
	// Setup: authenticate, seed credentials, snapshot workflows
	const client = new N8nClient(args.baseUrl);
	logger.info(`Authenticating with ${args.baseUrl}...`);
	await client.login(args.email, args.password);
	logger.success('Authenticated');
	logger.info('Seeding credentials...');
	const seedResult = await seedCredentials(client);
	logger.info(`Seeded ${String(seedResult.credentialIds.length)} credential(s)`);
	// Snapshot pre-existing workflow IDs so runners can tell which workflows
	// this run created; claimedWorkflowIds is shared mutable state across
	// concurrent test cases so two cases never claim the same workflow.
	const preRunWorkflowIds = await snapshotWorkflowIds(client);
	const claimedWorkflowIds = new Set<string>();
	// Run test cases with bounded concurrency.
	// Each test case builds a workflow (uses n8n's agent) then runs scenarios
	// (uses our Anthropic key for Phase 1 + Phase 2 mock generation).
	// NOTE(review): the cap below is 4, so runs are NOT fully parallel —
	// presumably to bound load on the single n8n instance rather than
	// Anthropic rate limits; confirm the intended value before raising.
	const MAX_CONCURRENT_TEST_CASES = 4;
	let results;
	try {
		results = await runWithConcurrency(
			testCases,
			async (testCase) =>
				await runWorkflowTestCase({
					client,
					testCase,
					timeoutMs: args.timeoutMs,
					seededCredentialTypes: seedResult.seededTypes,
					preRunWorkflowIds,
					claimedWorkflowIds,
					logger,
				}),
			MAX_CONCURRENT_TEST_CASES,
		);
	} finally {
		// Cleanup credentials even if test execution fails; errors during
		// cleanup are intentionally swallowed (best-effort).
		await cleanupCredentials(client, seedResult.credentialIds).catch(() => {});
	}
	// Generate HTML report
	const reportPath = writeWorkflowReport(results);
	console.log(`Report: ${reportPath}`);
	// Print summary: one section per test case, one line per scenario.
	console.log('\n=== Workflow Test Case Results ===\n');
	for (const r of results) {
		const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
		console.log(`${r.testCase.prompt.slice(0, 70)}...`);
		console.log(`  Workflow: ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
		if (r.buildError) {
			console.log(`  Error: ${r.buildError.slice(0, 200)}`);
		}
		for (const sr of r.scenarioResults) {
			// ✓ / ✗ written as unicode escapes to keep the file ASCII-safe
			const icon = sr.success ? '\u2713' : '\u2717';
			console.log(
				`  ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'} (${String(sr.score * 100)}%)`,
			);
			if (!sr.success) {
				console.log(`    ${sr.reasoning.slice(0, 120)}`);
			}
		}
		console.log('');
	}
}
main().catch((error) => {
	console.error('Fatal error:', error);
	process.exit(1);
});

View file

@ -0,0 +1,437 @@
// ---------------------------------------------------------------------------
// HTTP client for n8n REST + instance-ai APIs
//
// Used by the evaluation runner to interact with a running n8n instance:
// authenticate, send chat messages, confirm actions, and query the REST API
// for post-run verification.
// ---------------------------------------------------------------------------
import type { InstanceAiRichMessagesResponse, InstanceAiEvalExecutionResult } from '@n8n/api-types';
// ---------------------------------------------------------------------------
// Response shapes from the n8n REST API (wrapped in { data: ... })
// ---------------------------------------------------------------------------
/** A node as returned by the n8n REST API — the fields eval code reads. */
export interface WorkflowNodeResponse {
	name: string;
	type: string;
	parameters?: Record<string, unknown>;
	disabled?: boolean;
	credentials?: Record<string, unknown>;
}
/** A workflow as returned by GET /rest/workflows/:id. */
export interface WorkflowResponse {
	id: string;
	name: string;
	active: boolean;
	nodes: WorkflowNodeResponse[];
	connections: Record<string, unknown>;
	pinData?: Record<string, unknown>;
}
/** Summary entry as returned by GET /rest/workflows (list endpoint). */
interface WorkflowListItem {
	id: string;
	name: string;
	active: boolean;
	nodes: WorkflowNodeResponse[];
}
/** Summary entry from execution list responses. */
interface ExecutionListItem {
	id: string;
	workflowId: string;
	status: string;
}
/** Full execution record including serialized run data. */
export interface ExecutionDetail {
	id: string;
	workflowId: string;
	status: string;
	/** Flatted-serialized execution data (contains error details, run data per node) */
	data: string;
}
// -- Thread types ------------------------------------------------------------
/** Status of a chat thread: active run, suspension, and background agent tasks. */
interface ThreadStatus {
	hasActiveRun: boolean;
	isSuspended: boolean;
	backgroundTasks: Array<{
		taskId: string;
		role: string;
		agentId: string;
		status: 'running' | 'completed' | 'failed' | 'cancelled';
		// presumably epoch ms — TODO confirm against server serialization
		startedAt: number;
		runId?: string;
		messageGroupId?: string;
	}>;
}
// ---------------------------------------------------------------------------
// Client
// ---------------------------------------------------------------------------
export class N8nClient {
	// Session cookie captured during login — set by this.fetch (not shown here)
	// from the login response; presumably the `n8n-auth` cookie. TODO confirm.
	private sessionCookie?: string;

	constructor(private readonly baseUrl: string) {}
// -- Auth ----------------------------------------------------------------
/**
* Authenticate with the n8n instance via POST /rest/login.
* Captures the `n8n-auth` cookie for subsequent requests.
*/
async login(email?: string, password?: string): Promise<void> {
const loginEmail = email ?? process.env.N8N_EVAL_EMAIL ?? 'admin@n8n.io';
const loginPassword = password ?? process.env.N8N_EVAL_PASSWORD ?? 'password';
await this.fetch('/rest/login', {
method: 'POST',
body: { emailOrLdapLoginId: loginEmail, password: loginPassword },
});
if (!this.sessionCookie) {
throw new Error('Failed to authenticate with n8n — no session cookie received');
}
}
// -- Instance-AI endpoints -----------------------------------------------
/**
* Send a chat message to the instance-ai agent.
* POST /rest/instance-ai/chat/:threadId body: { message }
*/
async sendMessage(threadId: string, message: string): Promise<{ runId: string }> {
const result = await this.fetch(`/rest/instance-ai/chat/${threadId}`, {
method: 'POST',
body: { message },
});
return result as { runId: string };
}
/**
* Confirm or reject an action requested by the agent.
* POST /rest/instance-ai/confirm/:requestId
* body: { approved, mockCredentials?, credentialId?, ... }
*/
async confirmAction(
requestId: string,
approved: boolean,
options?: { mockCredentials?: boolean },
): Promise<void> {
await this.fetch(`/rest/instance-ai/confirm/${requestId}`, {
method: 'POST',
body: { approved, ...options },
});
}
/**
* Cancel the active run for a thread.
* POST /rest/instance-ai/chat/:threadId/cancel
*/
async cancelRun(threadId: string): Promise<void> {
await this.fetch(`/rest/instance-ai/chat/${threadId}/cancel`, {
method: 'POST',
});
}
/**
* Get the current status of a thread (active run, suspended, background tasks).
* GET /rest/instance-ai/threads/:threadId/status
*/
async getThreadStatus(threadId: string): Promise<ThreadStatus> {
return (await this.fetch(`/rest/instance-ai/threads/${threadId}/status`)) as ThreadStatus;
}
/**
* Get rich messages for a thread (structured agent trees with tool call results).
* GET /rest/instance-ai/threads/:threadId/messages
*/
async getThreadMessages(threadId: string): Promise<InstanceAiRichMessagesResponse> {
const result = (await this.fetch(`/rest/instance-ai/threads/${threadId}/messages`)) as {
data: InstanceAiRichMessagesResponse;
};
return result.data;
}
// -- REST API (verification helpers) -------------------------------------
/**
* List all workflows visible to the authenticated user.
* GET /rest/workflows
*/
async listWorkflows(): Promise<WorkflowListItem[]> {
const result = (await this.fetch('/rest/workflows')) as { data: WorkflowListItem[] };
return result.data;
}
/**
* Get a single workflow by ID.
* GET /rest/workflows/:id
*/
async getWorkflow(id: string): Promise<WorkflowResponse> {
const result = (await this.fetch(`/rest/workflows/${id}`)) as {
data: WorkflowResponse;
};
return result.data;
}
/**
* List executions, optionally filtered by workflow ID.
* GET /rest/executions?workflowId=:id
*/
async listExecutions(workflowId?: string): Promise<ExecutionListItem[]> {
const query = workflowId ? `?workflowId=${workflowId}` : '';
const result = (await this.fetch(`/rest/executions${query}`)) as {
data: ExecutionListItem[] | { results: ExecutionListItem[]; count: number };
};
// The API may return either a direct array or { results: [...], count }
return Array.isArray(result.data) ? result.data : result.data.results;
}
/**
* Execute a workflow manually.
* POST /rest/workflows/:id/run body: { triggerToStartFrom?: { name } }
*/
async executeWorkflow(
workflowId: string,
triggerNodeName?: string,
): Promise<{ executionId: string }> {
const body: Record<string, unknown> = {};
if (triggerNodeName) {
body.triggerToStartFrom = { name: triggerNodeName };
}
const result = (await this.fetch(`/rest/workflows/${workflowId}/run`, {
method: 'POST',
body,
})) as { data: { executionId: string } };
return { executionId: result.data.executionId };
}
/**
* Get a single execution by ID.
* GET /rest/executions/:id
*/
async getExecution(executionId: string): Promise<ExecutionDetail> {
const result = (await this.fetch(`/rest/executions/${executionId}`)) as {
data: ExecutionDetail;
};
return result.data;
}
/**
* Update a workflow (partial update).
* PATCH /rest/workflows/:id -- used to set/restore pin data for execution eval.
*/
async updateWorkflow(id: string, updates: Record<string, unknown>): Promise<WorkflowResponse> {
const result = (await this.fetch(`/rest/workflows/${id}`, {
method: 'PATCH',
body: updates,
})) as { data: WorkflowResponse };
return result.data;
}
/**
* Activate a workflow.
* PATCH /rest/workflows/:id body: { active: true }
*/
async activateWorkflow(id: string): Promise<void> {
await this.fetch(`/rest/workflows/${id}`, {
method: 'PATCH',
body: { active: true },
});
}
/**
* Deactivate a workflow.
* PATCH /rest/workflows/:id body: { active: false }
*/
async deactivateWorkflow(id: string): Promise<void> {
await this.fetch(`/rest/workflows/${id}`, {
method: 'PATCH',
body: { active: false },
});
}
/**
* Call a live webhook endpoint.
* Sends an HTTP request to ${baseUrl}/webhook/${path} and returns the
* status code and parsed response body. The workflow must be active.
*/
async callWebhook(
path: string,
method: string,
body?: Record<string, unknown>,
): Promise<{ status: number; data: unknown }> {
const url = `${this.baseUrl}/webhook/${path}`;
const headers: Record<string, string> = { 'Content-Type': 'application/json' };
if (this.sessionCookie) {
headers.cookie = this.sessionCookie;
}
const res = await fetch(url, {
method: method.toUpperCase(),
headers,
body: body ? JSON.stringify(body) : undefined,
});
let data: unknown;
const contentType = res.headers.get('content-type') ?? '';
if (contentType.includes('application/json')) {
data = await res.json();
} else {
data = await res.text();
}
return { status: res.status, data };
}
/**
* Delete a workflow by ID.
* DELETE /rest/workflows/:id
*/
async deleteWorkflow(id: string): Promise<void> {
await this.fetch(`/rest/workflows/${id}`, { method: 'DELETE' });
}
/**
* Create a credential.
* POST /rest/credentials body: { name, type, data }
*/
async createCredential(
name: string,
type: string,
data: Record<string, unknown>,
): Promise<{ id: string }> {
const result = (await this.fetch('/rest/credentials', {
method: 'POST',
body: { name, type, data },
})) as { data: { id: string } };
return { id: result.data.id };
}
/**
* Delete a credential by ID.
* DELETE /rest/credentials/:id
*/
async deleteCredential(id: string): Promise<void> {
await this.fetch(`/rest/credentials/${id}`, { method: 'DELETE' });
}
// -- Data tables ---------------------------------------------------------
/**
* Get the personal project ID for the authenticated user.
* GET /rest/me user.personalProjectId (or similar)
*/
async getPersonalProjectId(): Promise<string> {
const result = (await this.fetch('/rest/me')) as {
data: { personalProjectId?: string; defaultPersonalProjectId?: string };
};
const projectId = result.data.personalProjectId ?? result.data.defaultPersonalProjectId ?? '';
if (!projectId) {
throw new Error('Could not determine personal project ID');
}
return projectId;
}
/**
* List data tables in a project.
* GET /rest/projects/:projectId/data-tables
*/
async listDataTables(projectId: string): Promise<Array<{ id: string; name: string }>> {
const result = (await this.fetch(`/rest/projects/${projectId}/data-tables`)) as {
data: Array<{ id: string; name: string }>;
};
return Array.isArray(result.data) ? result.data : [];
}
/**
* Delete a data table by ID.
* DELETE /rest/projects/:projectId/data-tables/:dataTableId
*/
async deleteDataTable(projectId: string, dataTableId: string): Promise<void> {
await this.fetch(`/rest/projects/${projectId}/data-tables/${dataTableId}`, {
method: 'DELETE',
});
}
// -- Eval mock execution -------------------------------------------------
/**
* Execute a workflow with LLM-based HTTP mocking.
* The server handles hint generation and mock execution in a single synchronous call.
*/
async executeWithLlmMock(
workflowId: string,
scenarioHints?: string,
timeoutMs: number = 120_000,
): Promise<InstanceAiEvalExecutionResult> {
const result = (await this.fetch(`/rest/instance-ai/eval/execute-with-llm-mock/${workflowId}`, {
method: 'POST',
body: scenarioHints ? { scenarioHints } : {},
timeoutMs,
})) as { data: InstanceAiEvalExecutionResult };
return result.data;
}
// -- SSE helpers ---------------------------------------------------------
/**
* Build the SSE events URL for a given thread.
* Used by the SSE client to open a streaming connection.
*/
getEventsUrl(threadId: string): string {
return `${this.baseUrl}/rest/instance-ai/events/${threadId}`;
}
/**
* Expose the session cookie so the SSE client can authenticate.
*/
get cookie(): string {
if (!this.sessionCookie) {
throw new Error('Not authenticated — call login() first');
}
return this.sessionCookie;
}
// -- Internal fetch ------------------------------------------------------
private async fetch(
path: string,
options: { method?: string; body?: unknown; timeoutMs?: number } = {},
): Promise<unknown> {
const headers: Record<string, string> = { 'Content-Type': 'application/json' };
if (this.sessionCookie) {
headers.cookie = this.sessionCookie;
}
const method = options.method ?? 'GET';
const res = await fetch(`${this.baseUrl}${path}`, {
method,
headers,
body: options.body ? JSON.stringify(options.body) : undefined,
...(options.timeoutMs ? { signal: AbortSignal.timeout(options.timeoutMs) } : {}),
});
if (!res.ok) {
const text = await res.text();
throw new Error(`n8n API ${method} ${path} failed (${res.status}): ${text}`);
}
// Capture auth cookie from login response
const setCookie = res.headers.get('set-cookie');
if (setCookie) {
const match = setCookie.match(/n8n-auth=[^;]+/);
if (match) {
this.sessionCookie = match[0];
}
}
return await res.json();
}
}

View file

@ -0,0 +1,178 @@
// ---------------------------------------------------------------------------
// Lightweight SSE stream consumer
//
// Node.js has no built-in EventSource. This module implements a minimal
// SSE parser using native fetch() with streaming response body. It handles
// partial chunks, comment lines, and multi-field events per the SSE spec.
//
// Reference: https://html.spec.whatwg.org/multipage/server-sent-events.html
// ---------------------------------------------------------------------------
/** One parsed server-sent event, assembled from `id:`/`event:`/`data:` fields. */
export interface SseEvent {
	/** Value of the `id:` field, if the event carried one. */
	id?: string;
	/** Value of the `event:` field; undefined for the default message type. */
	type?: string;
	/** All `data:` lines of the event, joined with '\n'. */
	data: string;
}
/**
 * Open an SSE connection and invoke `handler` for each received event.
 *
 * The function resolves when the stream ends (server closes connection or
 * the provided AbortSignal fires). It rejects if the initial fetch fails or
 * the response is not a valid SSE stream.
 *
 * Per the SSE spec, lines may be terminated by \n, \r\n, or \r, and a \r\n
 * pair may be split across network chunks — a lone trailing \r is therefore
 * held back until the next chunk (or end of stream) before being consumed.
 */
export async function consumeSseStream(
	url: string,
	cookie: string,
	handler: (event: SseEvent) => void,
	signal: AbortSignal,
): Promise<void> {
	const res = await fetch(url, {
		headers: {
			cookie,
			Accept: 'text/event-stream',
		},
		signal,
	});
	if (!res.ok) {
		const text = await res.text();
		throw new Error(`SSE connection failed (${res.status}): ${text}`);
	}
	if (!res.body) {
		throw new Error('SSE response has no body');
	}
	const reader = res.body.pipeThrough(new TextDecoderStream()).getReader();
	// Accumulator for partial lines (SSE data can be split across chunks)
	let buffer = '';
	// Current event being assembled
	let eventId: string | undefined;
	let eventType: string | undefined;
	let dataLines: string[] = [];
	// True once reader.read() reports done — lets the final pass consume a
	// held-back trailing '\r' as a real terminator.
	let streamEnded = false;
	try {
		while (!streamEnded) {
			const { value, done } = await reader.read();
			if (done) {
				streamEnded = true;
			} else {
				buffer += value;
			}
			// Process all complete lines in the buffer.
			// Lines are terminated by \n, \r\n, or \r.
			let lineEnd: number;
			while ((lineEnd = findLineEnd(buffer)) !== -1) {
				// BUGFIX: a lone '\r' at the very end of the buffer may be the
				// first half of a '\r\n' pair split across chunks. Consuming it
				// now would misread the '\n' of the next chunk as an extra
				// empty line (premature event dispatch). Wait for more data.
				if (!streamEnded && buffer[lineEnd] === '\r' && lineEnd === buffer.length - 1) {
					break;
				}
				const line = buffer.slice(0, lineEnd);
				buffer = buffer.slice(lineEnd + getLineTerminatorLength(buffer, lineEnd));
				if (line === '') {
					// Empty line = end of event. Dispatch if we have data.
					if (dataLines.length > 0) {
						handler({
							id: eventId,
							type: eventType,
							data: dataLines.join('\n'),
						});
					}
					// Reset for next event
					eventId = undefined;
					eventType = undefined;
					dataLines = [];
					continue;
				}
				// Comment lines start with ':'
				if (line.startsWith(':')) {
					continue;
				}
				// Parse "field: value" or "field:value" or "field"
				const colonIndex = line.indexOf(':');
				let field: string;
				let fieldValue: string;
				if (colonIndex === -1) {
					// Field with no value
					field = line;
					fieldValue = '';
				} else {
					field = line.slice(0, colonIndex);
					// Per spec: if there's a space after the colon, skip it
					const valueStart = colonIndex + 1;
					fieldValue =
						valueStart < line.length && line[valueStart] === ' '
							? line.slice(valueStart + 1)
							: line.slice(valueStart);
				}
				switch (field) {
					case 'id':
						eventId = fieldValue;
						break;
					case 'event':
						eventType = fieldValue;
						break;
					case 'data':
						dataLines.push(fieldValue);
						break;
					case 'retry':
						// Ignored -- reconnection is handled externally
						break;
					default:
						// Unknown fields are ignored per spec
						break;
				}
			}
		}
	} catch (error: unknown) {
		// AbortError is expected when the signal fires -- swallow it
		if (isAbortError(error)) {
			return;
		}
		throw error;
	} finally {
		// Release the reader so the underlying stream can be reclaimed.
		reader.releaseLock();
	}
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Locate the first SSE line terminator (\n, \r\n, or \r) in the buffer.
 * Returns the index of the terminator's first character, or -1 when the
 * buffer holds no complete line yet.
 */
function findLineEnd(buffer: string): number {
	const lf = buffer.indexOf('\n');
	const cr = buffer.indexOf('\r');
	if (lf === -1) return cr;
	if (cr === -1) return lf;
	return Math.min(lf, cr);
}
/**
 * Number of characters the line terminator at `position` occupies:
 * 2 for a \r\n pair, otherwise 1 (\n or bare \r).
 */
function getLineTerminatorLength(buffer: string, position: number): number {
	return buffer.startsWith('\r\n', position) ? 2 : 1;
}
/**
 * Type guard for AbortError (thrown when fetch is aborted via AbortSignal).
 * Accepts both DOMException (browser/undici) and plain Error (Node.js fetch),
 * matching on the 'AbortError' name in either case.
 */
function isAbortError(error: unknown): boolean {
	if (!(error instanceof DOMException) && !(error instanceof Error)) {
		return false;
	}
	return error.name === 'AbortError';
}

View file

@ -0,0 +1,160 @@
// ---------------------------------------------------------------------------
// Credential seeding and cleanup for evaluation runs
//
// External service credentials (Slack, GitHub, etc.) require real tokens
// via environment variables. If the env var is not set, the credential
// is skipped. Generic HTTP credentials use placeholder values and are
// always seeded.
//
// POST /rest/credentials takes raw values -- n8n encrypts them server-side.
// ---------------------------------------------------------------------------
import type { N8nClient } from '../clients/n8n-client';
// ---------------------------------------------------------------------------
// Config types
// ---------------------------------------------------------------------------
/** Env-var-backed credential definition; seeded only when `envVar` is set. */
interface CredentialConfig {
	/** Environment variable that holds the real token/key. */
	envVar: string;
	/** n8n credential type name (e.g. 'slackApi'). */
	type: string;
	/** Display name given to the seeded credential. */
	name: string;
	/** Builds the credential data payload from the env-var value. */
	dataBuilder: (token: string) => Record<string, unknown>;
}
/** Credential seeded unconditionally with fixed placeholder data. */
interface GenericCredentialConfig {
	type: string;
	name: string;
	data: Record<string, unknown>;
}
// ---------------------------------------------------------------------------
// Credential definitions
// ---------------------------------------------------------------------------
/** External-service credentials that require a real token via env var. */
const CREDENTIAL_CONFIGS: CredentialConfig[] = [
	{
		envVar: 'EVAL_SLACK_ACCESS_TOKEN',
		type: 'slackApi',
		name: '[eval] Slack',
		dataBuilder: (token) => ({ accessToken: token }),
	},
	{
		envVar: 'EVAL_NOTION_API_KEY',
		type: 'notionApi',
		name: '[eval] Notion',
		dataBuilder: (key) => ({ apiKey: key }),
	},
	{
		envVar: 'EVAL_GITHUB_ACCESS_TOKEN',
		type: 'githubApi',
		name: '[eval] GitHub',
		dataBuilder: (token) => ({ accessToken: token }),
	},
	{
		envVar: 'EVAL_GMAIL_ACCESS_TOKEN',
		type: 'gmailOAuth2Api',
		name: '[eval] Gmail',
		dataBuilder: (token) => ({ oauthTokenData: { access_token: token } }),
	},
	{
		envVar: 'EVAL_TEAMS_ACCESS_TOKEN',
		type: 'microsoftTeamsOAuth2Api',
		name: '[eval] Teams',
		dataBuilder: (token) => ({
			scope: 'openid',
			oauthTokenData: { access_token: token },
		}),
	},
];
/** Generic HTTP credentials seeded with placeholder values, always available. */
const GENERIC_CREDENTIALS: GenericCredentialConfig[] = [
	{
		type: 'httpHeaderAuth',
		name: '[eval] HTTP Header',
		data: { name: 'Authorization', value: 'Bearer eval-placeholder' },
	},
	{
		type: 'httpBasicAuth',
		name: '[eval] HTTP Basic',
		data: { user: 'eval-user', password: 'eval-pass' },
	},
];
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/** Outcome of a seeding pass. */
export interface SeedResult {
	/** IDs of all credentials created (used for later cleanup). */
	credentialIds: string[];
	/** Credential types that were successfully seeded. */
	seededTypes: string[];
}
/**
 * Seed credentials into the n8n instance for evaluation runs.
 *
 * For env-var-based configs, the credential is skipped if the env var is not
 * set. Generic credentials (HTTP Header, HTTP Basic) are always seeded.
 *
 * When `requiredTypes` is provided, only credentials matching those types are
 * seeded. When omitted, all available credentials are seeded.
 *
 * Individual creation failures are non-fatal (the credential type may not
 * exist on this n8n version) but are logged so they are not silently lost.
 */
export async function seedCredentials(
	client: N8nClient,
	requiredTypes?: string[],
): Promise<SeedResult> {
	const credentialIds: string[] = [];
	const seededTypes: string[] = [];
	const typeFilter = requiredTypes ? new Set(requiredTypes) : undefined;
	// Seed env-var-based credentials
	for (const config of CREDENTIAL_CONFIGS) {
		if (typeFilter && !typeFilter.has(config.type)) continue;
		const token = process.env[config.envVar];
		if (!token) {
			console.log(`  Skipping ${config.name}: ${config.envVar} not set`);
			continue;
		}
		try {
			const data = config.dataBuilder(token);
			const { id } = await client.createCredential(config.name, config.type, data);
			credentialIds.push(id);
			seededTypes.push(config.type);
		} catch (error: unknown) {
			// Non-fatal — credential type may not exist on this n8n version,
			// but surface the reason instead of failing silently.
			const reason = error instanceof Error ? error.message : String(error);
			console.log(`  Skipping ${config.name}: creation failed (${reason})`);
		}
	}
	// Always seed generic credentials
	for (const generic of GENERIC_CREDENTIALS) {
		if (typeFilter && !typeFilter.has(generic.type)) continue;
		try {
			const { id } = await client.createCredential(generic.name, generic.type, generic.data);
			credentialIds.push(id);
			seededTypes.push(generic.type);
		} catch (error: unknown) {
			// Non-fatal, but log the reason for diagnosability.
			const reason = error instanceof Error ? error.message : String(error);
			console.log(`  Skipping ${generic.name}: creation failed (${reason})`);
		}
	}
	return { credentialIds, seededTypes };
}
/**
 * Best-effort cleanup of seeded credentials after an evaluation run.
 * Credentials are deleted one at a time; any individual deletion failure is
 * ignored so a single stale credential cannot break teardown.
 */
export async function cleanupCredentials(
	client: N8nClient,
	credentialIds: string[],
): Promise<void> {
	for (const credentialId of credentialIds) {
		// Swallow per-item failures — cleanup is best-effort by design.
		await client.deleteCredential(credentialId).catch(() => undefined);
	}
}

View file

@ -0,0 +1,40 @@
{
"prompt": "Create a workflow that handles contact form submissions via a webhook. It should send an auto-reply email to the person who submitted the form, notify my team on Telegram, and log each submission to Google Sheets. For the Google Sheets node use documentId: '1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgVE2upms' and sheet name: 'Contact Submissions'. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "medium",
"tags": ["build", "webhook", "gmail", "telegram", "google-sheets", "multi-action"],
"triggerType": "webhook",
"scenarios": [
{
"name": "happy-path",
"description": "A valid contact form submission triggers all 3 actions",
"dataSetup": "The webhook receives a contact form submission from Jane Smith (jane@example.com) with the message 'I am interested in your enterprise plan, can you send me more details?'. The Gmail, Telegram, and Google Sheets nodes return success responses.",
"successCriteria": "The workflow executes without errors. The webhook returns a response acknowledging the submission. An auto-reply email is sent to jane@example.com. A Telegram notification is sent containing the submission details (name, email, message). The submission is logged to Google Sheets with the form fields. All 3 actions receive the data from the webhook trigger."
},
{
"name": "missing-fields",
"description": "The form submission is missing the name field",
"dataSetup": "The webhook receives a submission from anonymous@example.com with the message 'Quick question about pricing' but no name is provided. The Gmail, Telegram, and Google Sheets nodes return success responses.",
"successCriteria": "The workflow handles the missing name field without crashing. The email is still sent to the provided address. The Telegram alert and Google Sheets log contain whatever data was available. The workflow does not fail silently."
},
{
"name": "partial-action-failure",
"description": "One of the 3 actions fails, the others should still complete if possible",
"dataSetup": "The webhook receives a submission from Tom Lee (tom@example.com) with the message 'Hello!'. The Gmail node returns a success response. The Telegram node returns an error indicating the chat was not found. The Google Sheets node returns a success response.",
"successCriteria": "The workflow handles the Telegram failure gracefully without crashing. At minimum, the actions before the failure complete successfully. Ideally, independent actions still execute despite the failure — but the key requirement is that the workflow does not lose all data or fail silently due to one action erroring.",
"requires": "mock-server"
},
{
"name": "empty-message",
"description": "The form submission has an empty message body",
"dataSetup": "The webhook receives a submission from Alex Kim (alex@example.com) with an empty message. All service nodes return success responses.",
"successCriteria": "The workflow handles the empty message without errors. The auto-reply is still sent. The Telegram notification and Google Sheets log reflect the empty message field without failing."
},
{
"name": "invalid-email",
"description": "The submitted email address is malformed",
"dataSetup": "The webhook receives a submission from 'John Doe' with the email 'not-an-email' and the message 'Please contact me'. The Gmail node returns an error indicating an invalid recipient. The Telegram and Google Sheets nodes return success responses.",
"successCriteria": "The workflow handles the invalid email gracefully. The Telegram notification and Google Sheets logging should still work with the available data. The email failure is handled without crashing the entire workflow.",
"requires": "mock-server"
}
]
}

View file

@ -0,0 +1,39 @@
{
"prompt": "Get all the Linear issues created in the last 2 weeks. Filter them for issues created for a different team than the one the creator is in. I have this team mapping to use: Alice (alice@company.com) belongs to Frontend, Bob (bob@company.com) to Backend, Carol (carol@company.com) to Backend. Store this mapping in the workflow and use it to filter cross-team issues. Then calculate the number of tickets per creator and post the list (ordered descending) to a Slack channel called #cross-team-reports. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "complex",
"tags": ["build", "linear", "slack", "schedule", "data-processing"],
"triggerType": "schedule",
"scenarios": [
{
"name": "happy-path",
"description": "Multiple issues from different creators, some cross-team, some same-team",
"dataSetup": "The data source returns 8 issues total. Within the last 2 weeks: Alice (alice@company.com) created 3 issues for team 'Backend' and 1 for team 'Frontend', Bob (bob@company.com) created 1 issue for team 'Frontend', Carol (carol@company.com) created 1 issue for team 'Backend'. Outside the 2-week window (3 weeks ago): Alice created 1 issue for team 'Backend', Bob created 1 issue for team 'Frontend'. The people/teams data maps Alice to team 'Frontend', Bob to team 'Backend', and Carol to team 'Backend'. The Slack post node returns a success response.",
"successCriteria": "The workflow executes without errors. Only the 6 issues from the last 2 weeks are processed \u2014 the 2 older issues are excluded. The cross-team filter correctly identifies Alice's 3 Backend issues and Bob's 1 Frontend issue as cross-team. Carol's issue is filtered out (same team). The count per creator shows Alice: 3, Bob: 1. The list is sorted descending by count. The result is posted to Slack."
},
{
"name": "multi-team-creator",
"description": "A creator belongs to multiple teams, issues for any of their teams are not cross-team",
"dataSetup": "The data source returns 4 issues from the last 2 weeks. Alice (alice@company.com) created 2 issues for team 'Backend' and 1 for team 'AI'. Bob (bob@company.com) created 1 issue for team 'Frontend'. The people/teams data maps Alice to teams ['Frontend', 'AI'] and Bob to team 'Backend'. So Alice's 2 Backend issues are cross-team (not in her teams), but her AI issue is not cross-team. Bob's Frontend issue is cross-team. The Slack post node returns a success response.",
"successCriteria": "The workflow correctly handles multi-team membership. Alice's AI issue is excluded from the cross-team report (AI is one of her teams). Alice has more cross-team issues than Bob. The report is sorted descending by count, with Alice listed first."
},
{
"name": "no-cross-team-issues",
"description": "All issues are created for the creator's own team",
"dataSetup": "The data source returns 4 issues from the last 2 weeks: Alice (alice@company.com) created 2 issues for team 'Frontend', Bob (bob@company.com) created 2 issues for team 'Backend'. The people/teams data maps Alice to 'Frontend' and Bob to 'Backend'. All issues match the creator's team. The Slack post node returns a success response.",
"successCriteria": "The workflow executes without errors. The cross-team filter removes all issues. The workflow handles the empty result gracefully \u2014 either posting a 'no cross-team issues' message or completing without error."
},
{
"name": "unknown-creator",
"description": "An issue creator is not in the people/teams list",
"dataSetup": "The data source returns 4 issues from the last 2 weeks. Two are by Alice (alice@company.com, mapped to team 'Frontend'), two are by Dave (dave@company.com) who is not in the people/teams data at all. The Slack post node returns a success response.",
"successCriteria": "The workflow handles the unknown creator without crashing. Dave's issues are either excluded from the cross-team report or handled with a sensible default. Alice's cross-team issues are still correctly processed."
},
{
"name": "api-error",
"description": "Linear API returns an authentication error",
"dataSetup": "The Linear/data source node returns an authentication error. The Slack post node returns a success response.",
"successCriteria": "The workflow handles the API error gracefully. It should not crash silently or post empty/misleading data to Slack. The error is either reported or the workflow stops cleanly.",
"requires": "mock-server"
}
]
}

View file

@ -0,0 +1,40 @@
{
"prompt": "Every day, get the posts made in the past day on 3 different Slack channels (#general, #engineering, #product), summarize them using AI, and post the summary on #daily-digest. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "medium",
"tags": ["build", "slack", "ai", "schedule"],
"triggerType": "schedule",
"scenarios": [
{
"name": "happy-path",
"description": "All 3 channels return messages, AI summarizes, posts to channel",
"dataSetup": "Each Slack channel history node should return 2-3 messages. Channel 1: messages about sprint planning for next week and a production deployment completed successfully. Channel 2: a design review request for the new dashboard and positive feedback on the mockups. Channel 3: a critical bug report about login failures and a customer escalation from Acme Corp. The AI/LLM node should return a summary organized by topic that references these specific discussions. The final Slack post-message node returns a success response.",
"successCriteria": "The workflow executes without errors. The summary mentions sprint planning, the deployment, the design review, and the bug report. The summary reaches the final posting node."
},
{
"name": "empty-channel",
"description": "One channel returns no messages, others return normally",
"dataSetup": "Two Slack channel history nodes return 2-3 messages each — one about a team offsite being planned, another about API performance improvements. The third channel returns an empty array. The AI/LLM node should return a summary covering only the available content. The Slack post-message node returns a success response.",
"successCriteria": "The workflow handles the empty channel without errors. The merge step completes with partial data. A summary is generated covering the team offsite and API performance topics. The workflow completes end-to-end."
},
{
"name": "high-volume",
"description": "Channels return many messages, testing merge and aggregation",
"dataSetup": "Each Slack channel history node returns 8-10 messages covering diverse topics: hiring updates, infrastructure changes, feature launches, customer feedback, and team announcements. The AI/LLM node should return a comprehensive summary. The Slack post-message node returns a success response.",
"successCriteria": "The workflow handles the volume without errors. All messages are merged/aggregated correctly. The summary covers the major topics and is posted successfully. No data is lost in the merge step."
},
{
"name": "channel-not-found",
"description": "One Slack channel does not exist or has been deleted",
"dataSetup": "Two Slack channel history nodes return 2-3 messages each with normal content. The third returns an error indicating the channel was not found. The AI/LLM node should return a summary if it receives any input. The Slack post-message node returns a success response.",
"successCriteria": "The workflow handles the channel error gracefully — either by skipping the failed channel and summarizing the rest, or by surfacing a clear error. It should not silently drop all data due to one channel failing.",
"requires": "mock-server"
},
{
"name": "insufficient-permissions",
"description": "Slack credentials lack permission to read one channel",
"dataSetup": "Two Slack channel history nodes return 2-3 messages each. The third returns an error indicating the bot is not a member of that channel. The AI/LLM node should return a summary if it receives any input. The Slack post-message node returns a success response.",
"successCriteria": "The workflow handles the permission error gracefully. It should either continue with available data or clearly report the access issue. It should not crash or lose data from the accessible channels.",
"requires": "mock-server"
}
]
}

View file

@ -0,0 +1,20 @@
{
"prompt": "Create a form that collects: name, email, company, and interest level (dropdown: starter, professional, enterprise). When submitted, create a new contact in HubSpot with firstname, lastname (split from name), email, company, and a custom property 'interest_level'. Then send a confirmation email via SendGrid to the submitted email address — the subject should be 'Thanks for reaching out, {name}!' and the body should mention their company. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "medium",
"tags": ["build", "form-trigger", "hubspot", "sendgrid", "crm"],
"triggerType": "form",
"scenarios": [
{
"name": "happy-path",
"description": "A complete form submission creates a HubSpot contact and sends confirmation",
"dataSetup": "The form receives: name='Sarah Chen', email='sarah@techstartup.io', company='TechStartup Inc', interest_level='enterprise'. The HubSpot node returns a success response with a contact ID. The SendGrid node returns a success response.",
"successCriteria": "The workflow executes without errors. A HubSpot contact is created with firstname='Sarah', lastname='Chen', email='sarah@techstartup.io', company='TechStartup Inc', interest_level='enterprise'. A confirmation email is sent via SendGrid to sarah@techstartup.io mentioning TechStartup Inc."
},
{
"name": "single-name",
"description": "The submitted name has no space (single name, no last name)",
"dataSetup": "The form receives: name='Ravi', email='ravi@example.com', company='Freelance', interest_level='starter'. The HubSpot and SendGrid nodes return success responses.",
"successCriteria": "The workflow handles a single name without crashing. The HubSpot contact is created with the name handled gracefully (either firstname='Ravi' with empty lastname, or the full name in firstname). The confirmation email is sent."
}
]
}

View file

@ -0,0 +1,20 @@
{
"prompt": "Every day, fetch all open GitHub issues from repository 'acme-corp/backend' that have the label 'bug'. For each issue, create a page in a Notion database (database ID: 'a1b2c3d4e5f6789012345678abcdef01') with properties: Name (issue title), URL (issue html_url), Created (issue created_at date), Assignee (assignee login name or 'Unassigned'), and Status set to 'Open'. Use the HTTP Request node to call the GitHub API directly (GET https://api.github.com/repos/acme-corp/backend/issues?labels=bug&state=open) with a Bearer token authorization header. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "complex",
"tags": ["build", "schedule", "http-request", "notion", "github-api", "data-sync"],
"triggerType": "schedule",
"scenarios": [
{
"name": "happy-path",
"description": "GitHub returns issues, each is synced to Notion",
"dataSetup": "The GitHub API returns 3 open bug issues. Issue 1: title='Login timeout on mobile', created_at='2026-03-15T10:00:00Z', assignee.login='alice', html_url='https://github.com/acme-corp/backend/issues/142'. Issue 2: title='API rate limit not enforced', created_at='2026-03-20T14:30:00Z', assignee=null, html_url='https://github.com/acme-corp/backend/issues/155'. Issue 3: title='Memory leak in worker pool', created_at='2026-03-22T09:00:00Z', assignee.login='bob', html_url='https://github.com/acme-corp/backend/issues/158'. Each Notion create-page call returns a success response.",
"successCriteria": "The workflow executes without errors. All 3 issues are fetched from GitHub. 3 pages are created in Notion with the correct titles, URLs, dates, and assignees. The unassigned issue (Issue 2) has 'Unassigned' as the assignee value."
},
{
"name": "no-bugs",
"description": "GitHub returns an empty array — no open bugs",
"dataSetup": "The GitHub API returns an empty array []. No Notion calls should be made.",
"successCriteria": "The workflow handles the empty result without errors. No Notion pages are created. The workflow completes cleanly."
}
]
}

View file

@ -0,0 +1,24 @@
import { readFileSync, readdirSync } from 'fs';
import { join } from 'path';
import type { WorkflowTestCase } from '../../types';
/**
 * Read a workflow test case JSON file from disk and parse it.
 * Throws a descriptive error (including the file path) when the file
 * does not contain valid JSON.
 */
function parseTestCaseFile(filePath: string): WorkflowTestCase {
  const raw = readFileSync(filePath, 'utf-8');
  try {
    return JSON.parse(raw) as WorkflowTestCase;
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to parse test case ${filePath}: ${detail}`);
  }
}
/**
 * Load every `.json` workflow test case that sits next to this module.
 *
 * @param filter - Optional case-insensitive substring; when given, only
 *   files whose names contain it are loaded.
 * @returns Parsed test cases in deterministic (lexicographic) filename order.
 * @throws When a matched file contains invalid JSON (see parseTestCaseFile).
 */
export function loadWorkflowTestCases(filter?: string): WorkflowTestCase[] {
  const dir = __dirname;
  // Sort for deterministic ordering — readdirSync's order is platform-dependent,
  // which would otherwise make eval runs non-reproducible across machines.
  let files = readdirSync(dir)
    .filter((f) => f.endsWith('.json'))
    .sort();
  if (filter) {
    const needle = filter.toLowerCase();
    files = files.filter((f) => f.toLowerCase().includes(needle));
  }
  return files.map((f) => parseTestCaseFile(join(dir, f)));
}

View file

@ -0,0 +1,26 @@
{
"prompt": "Create a workflow that receives webhook notifications with a JSON body containing 'level' (high, medium, or low), 'title', and 'message'. Route them based on level: high priority goes to Microsoft Teams (use team ID '9b4c3a2f-1d8e-4f5b-a6c7-8e9f0b1d2c3a' and channel ID '19:a1b2c3d4e5f6@thread.tacv2'), medium goes to Slack channel #notifications, and low goes to Gmail (send to alerts@ourcompany.com). Each notification should include the title and message from the payload. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "medium",
"tags": ["build", "webhook", "switch", "microsoft-teams", "slack", "gmail", "routing"],
"triggerType": "webhook",
"scenarios": [
{
"name": "high-priority",
"description": "A high-priority notification is routed to Microsoft Teams",
"dataSetup": "The webhook receives { \"level\": \"high\", \"title\": \"Server Down\", \"message\": \"Production database is unreachable\" }. The Microsoft Teams node returns a success response.",
"successCriteria": "The workflow routes the high-priority notification to Microsoft Teams. The Teams message contains 'Server Down' and the database message. No other channels receive the message."
},
{
"name": "medium-priority",
"description": "A medium-priority notification is routed to Slack",
"dataSetup": "The webhook receives { \"level\": \"medium\", \"title\": \"Deployment Complete\", \"message\": \"Version 2.5.0 deployed to staging\" }. The Slack node returns a success response.",
"successCriteria": "The workflow routes the medium-priority notification to Slack #notifications. The Slack message contains the deployment details."
},
{
"name": "low-priority",
"description": "A low-priority notification is routed to email",
"dataSetup": "The webhook receives { \"level\": \"low\", \"title\": \"Weekly Report Ready\", \"message\": \"The weekly analytics report has been generated\" }. The Gmail node returns a success response.",
"successCriteria": "The workflow routes the low-priority notification to Gmail. The email is sent to alerts@ourcompany.com with the report notification."
}
]
}

View file

@ -0,0 +1,26 @@
{
"prompt": "Fetch the latest 10 posts from the JSONPlaceholder API (GET https://jsonplaceholder.typicode.com/posts with query parameter _limit=10). Filter out any posts where the title contains the word 'qui'. Then post a summary message to a Slack channel called #api-digest that says how many posts remain and lists their titles. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "medium",
"tags": ["build", "http-request", "slack", "data-transformation", "schedule"],
"triggerType": "schedule",
"scenarios": [
{
"name": "happy-path",
"description": "API returns 10 posts, some get filtered, summary posted to Slack",
"dataSetup": "The HTTP Request node returns 10 JSON objects with fields: id, userId, title, body. Some of them should have 'qui' in the title. The Slack node returns a success response.",
"successCriteria": "The workflow executes without errors. The HTTP Request fetches data successfully. Posts containing 'qui' in the title are filtered out — fewer posts remain than the original 10. The Slack message is posted to #api-digest with the count and titles of the remaining posts. None of the titles in the Slack message should contain the word 'qui'."
},
{
"name": "empty-response",
"description": "The API returns an empty array",
"dataSetup": "The HTTP Request node returns an empty array []. The Slack node returns a success response.",
"successCriteria": "The workflow handles an empty API response without crashing. The Slack message is either posted with a 'no posts' message or the workflow completes gracefully without posting."
},
{
"name": "all-filtered",
"description": "Every post contains 'qui' in the title and all are filtered out",
"dataSetup": "The HTTP Request node returns 3 posts, all with 'qui' in their titles. The Slack node returns a success response.",
"successCriteria": "The workflow handles the case where all items are filtered out. It should not crash or send an empty message to Slack."
}
]
}

View file

@ -0,0 +1,28 @@
{
"prompt": "Every hour, check the current weather for London, New York, and Tokyo using the OpenWeatherMap API (GET https://api.openweathermap.org/data/2.5/weather?q={city}&units=metric&appid=YOUR_API_KEY). Use 3 separate HTTP Request nodes, one per city. If any city has a temperature above 30°C, send a Telegram alert to chat ID -1001234567890 listing the hot cities. Log all readings to an Airtable base (base ID: 'appABC123def456', table ID: 'tblWeather789xyz') with columns: city, temperature, humidity, timestamp. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "complex",
"tags": [
"build",
"schedule",
"http-request",
"telegram",
"airtable",
"conditional",
"multi-request"
],
"triggerType": "schedule",
"scenarios": [
{
"name": "happy-path",
"description": "All 3 cities return weather data, one is above 30°C",
"dataSetup": "London returns: temp=18.5, humidity=72. New York returns: temp=32.1, humidity=65. Tokyo returns: temp=28.3, humidity=80. The Telegram node returns a success response. The Airtable node returns success for each row created.",
"successCriteria": "The workflow executes without errors. All 3 weather API calls succeed. The Telegram alert is sent mentioning New York (32.1°C). All 3 readings are logged to Airtable with city, temperature, humidity, and timestamp."
},
{
"name": "no-alerts",
"description": "All cities are below 30°C — no alert needed",
"dataSetup": "London returns: temp=15.2, humidity=68. New York returns: temp=22.7, humidity=55. Tokyo returns: temp=19.8, humidity=75. The Airtable node returns success for each row created.",
"successCriteria": "The workflow executes without errors. No Telegram alert is sent (all temps below 30°C). All 3 readings are still logged to Airtable."
}
]
}

View file

@ -0,0 +1,44 @@
// ---------------------------------------------------------------------------
// Simple evaluation logger with timestamp prefixes and verbosity control
// ---------------------------------------------------------------------------
/**
 * Minimal console logger used by the evaluation CLI.
 *
 * Every method writes a single timestamped line; `verbose` output is
 * suppressed unless the logger was created in verbose mode (see
 * `createLogger`).
 */
export interface EvalLogger {
  /** Informational progress message ([INFO] tag). */
  info(msg: string): void;
  /** Detail message, emitted only when `isVerbose` is true. */
  verbose(msg: string): void;
  /** Positive-outcome message ([OK] tag). */
  success(msg: string): void;
  /** Non-fatal problem worth surfacing ([WARN] tag). */
  warn(msg: string): void;
  /** Error message — the default implementation writes this to stderr. */
  error(msg: string): void;
  /** Whether verbose() calls produce output. */
  isVerbose: boolean;
}
/** Produce an ISO-8601 timestamp used to prefix every log line. */
function timestamp(): string {
  return new Date().toISOString();
}

/**
 * Build an `EvalLogger` whose methods prefix each line with an ISO
 * timestamp and a level tag. Verbose messages are emitted only when the
 * logger was created with `verbose = true`; errors go to stderr.
 */
export function createLogger(verbose: boolean): EvalLogger {
  const write = (tag: string, msg: string): void => {
    console.log(`${timestamp()} [${tag}] ${msg}`);
  };
  return {
    isVerbose: verbose,
    info: (msg: string): void => write('INFO', msg),
    verbose: (msg: string): void => {
      if (verbose) {
        write('VERBOSE', msg);
      }
    },
    success: (msg: string): void => write('OK', msg),
    warn: (msg: string): void => write('WARN', msg),
    error: (msg: string): void => {
      console.error(`${timestamp()} [ERROR] ${msg}`);
    },
  };
}

View file

@ -0,0 +1,723 @@
// ---------------------------------------------------------------------------
// Workflow test case evaluation orchestrator
//
// Manages the full lifecycle of a workflow test case evaluation:
// authentication, SSE capture, workflow build, scenario execution with
// LLM-mocked HTTP, checklist verification, and result aggregation.
// ---------------------------------------------------------------------------
import type { InstanceAiEvalExecutionResult } from '@n8n/api-types';
import crypto from 'node:crypto';
import { verifyChecklist } from '../checklist/verifier';
import type { N8nClient, WorkflowResponse } from '../clients/n8n-client';
import { consumeSseStream } from '../clients/sse-client';
import { extractOutcomeFromEvents } from '../outcome/event-parser';
import { buildAgentOutcome, extractWorkflowIdsFromMessages } from '../outcome/workflow-discovery';
import type {
ChecklistItem,
CapturedEvent,
ScenarioResult,
TestScenario,
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
import { type EvalLogger } from './logger';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Overall per-test-case timeout used when the caller passes no positive value. */
const DEFAULT_TIMEOUT_MS = 600_000;
/** Grace period after opening the SSE stream before sending the prompt. */
const SSE_SETTLE_DELAY_MS = 200;
/** Polling interval while waiting for a run-finish event. */
const POLL_INTERVAL_MS = 500;
/** Polling interval while waiting for background (sub-agent) tasks. */
const BACKGROUND_TASK_POLL_INTERVAL_MS = 2_000;
/** Stop retrying auto-approval of a confirmation request after this many failures. */
const MAX_CONFIRMATION_RETRIES = 5;
/** Max concurrent scenario executions per test case (99 — effectively unbounded). */
const MAX_CONCURRENT_SCENARIOS = 99;
// ---------------------------------------------------------------------------
// Workflow test case runner — build once, run scenarios against it
// ---------------------------------------------------------------------------
/** Cap on the build-phase wait when running scenarios (see Math.min with timeoutMs). */
const SCENARIO_BG_TASK_TIMEOUT_MS = 240_000;
/** Inputs for a single workflow test case run. */
interface WorkflowTestCaseConfig {
  /** Authenticated n8n API client used for chat, SSE, and cleanup calls. */
  client: N8nClient;
  /** The test case whose prompt is built once and scenarios executed against it. */
  testCase: WorkflowTestCase;
  /** Overall timeout in ms; values <= 0 fall back to DEFAULT_TIMEOUT_MS. */
  timeoutMs: number;
  // NOTE(review): not read anywhere in this file — presumably consumed by the
  // caller or used for reporting; confirm before removing.
  seededCredentialTypes: string[];
  /** Workflow IDs existing before the run — passed to buildAgentOutcome, presumably to exclude pre-existing workflows from attribution. */
  preRunWorkflowIds: Set<string>;
  /** Workflow IDs claimed by other runs — passed to buildAgentOutcome; presumably avoids double-attribution across concurrent test cases. */
  claimedWorkflowIds: Set<string>;
  /** Destination for progress/diagnostic output. */
  logger: EvalLogger;
}
/**
 * Run one workflow test case end-to-end.
 *
 * Lifecycle: open an SSE capture for a fresh thread, send the build prompt,
 * wait for all agent activity (including sub-agents) to settle, discover the
 * built workflow, run every scenario against it with LLM-mocked HTTP, then
 * best-effort delete the workflows and data tables created during the run.
 *
 * @param config - Client, test case, timeout, and bookkeeping sets.
 * @returns Aggregated result; `workflowBuildSuccess` is false and
 *   `buildError` is set when no workflow was produced or the build threw.
 */
export async function runWorkflowTestCase(
  config: WorkflowTestCaseConfig,
): Promise<WorkflowTestCaseResult> {
  const { client, testCase, logger } = config;
  // Fresh thread per test case so runs never share conversation state.
  const threadId = `eval-${crypto.randomUUID()}`;
  const startTime = Date.now();
  const timeoutMs = config.timeoutMs > 0 ? config.timeoutMs : DEFAULT_TIMEOUT_MS;
  const result: WorkflowTestCaseResult = {
    testCase,
    workflowBuildSuccess: false,
    scenarioResults: [],
  };
  const abortController = new AbortController();
  const events: CapturedEvent[] = [];
  const approvedRequests = new Set<string>();
  try {
    // 1. Send prompt to Instance AI and wait for workflow to be built (ONCE)
    logger.info(` Building workflow: "${truncate(testCase.prompt, 60)}"`);
    const ssePromise = startSseConnection(client, threadId, events, abortController.signal).catch(
      () => {
        // SSE errors are non-fatal — workflow discovery falls back to event-based approach
      },
    );
    // Give the SSE stream a moment to attach so early events are not missed.
    await delay(SSE_SETTLE_DELAY_MS);
    await client.sendMessage(threadId, testCase.prompt);
    // Wait with shorter timeout for scenario mode
    await waitForAllActivity({
      client,
      threadId,
      events,
      approvedRequests,
      startTime,
      timeoutMs: Math.min(timeoutMs, SCENARIO_BG_TASK_TIMEOUT_MS),
      logger,
    });
    // Stop the SSE capture before reading results.
    abortController.abort();
    await ssePromise.catch(() => {});
    // 2. Capture the built workflow
    let threadMessages;
    try {
      threadMessages = await client.getThreadMessages(threadId);
    } catch {
      logger.verbose(`[${threadId}] Thread messages unavailable — using SSE events only`);
      threadMessages = { messages: [] as never[] };
    }
    const messageWorkflowIds = extractWorkflowIdsFromMessages(threadMessages.messages);
    const eventOutcome = extractOutcomeFromEvents(events);
    const outcome = await buildAgentOutcome(
      client,
      eventOutcome,
      config.preRunWorkflowIds,
      config.claimedWorkflowIds,
    );
    // When thread messages name specific workflows, trust them and drop any
    // discovered workflows they do not mention.
    if (messageWorkflowIds.length > 0) {
      const messageWfSet = new Set(messageWorkflowIds);
      outcome.workflowsCreated = outcome.workflowsCreated.filter((wf) => messageWfSet.has(wf.id));
      outcome.workflowJsons = outcome.workflowJsons.filter(
        (wf) => typeof wf.id === 'string' && messageWfSet.has(wf.id),
      );
    }
    if (outcome.workflowsCreated.length === 0) {
      // Extract error information from SSE events and thread messages
      const toolErrors = events
        .filter((e) => e.type === 'tool-error')
        .map((e) => {
          const payload =
            typeof e.data.payload === 'object' && e.data.payload !== null
              ? (e.data.payload as Record<string, unknown>)
              : e.data;
          const toolError = payload.error ?? payload.message;
          return typeof toolError === 'string' ? toolError : 'unknown tool error';
        });
      // Reassemble the agent's streamed text as a fallback error description.
      const agentText = events
        .filter((e) => e.type === 'text-delta')
        .map((e) => {
          const text =
            typeof e.data.text === 'string'
              ? e.data.text
              : typeof e.data.payload === 'object' &&
                  e.data.payload !== null &&
                  'text' in (e.data.payload as Record<string, unknown>)
                ? String((e.data.payload as Record<string, unknown>).text)
                : '';
          return text;
        })
        .join('');
      const buildError =
        toolErrors.length > 0
          ? `Tool errors: ${toolErrors.join('; ')}`
          : agentText.length > 0
            ? `Agent response: ${agentText.slice(0, 500)}`
            : 'No workflow produced — no error details captured';
      result.buildError = buildError;
      logger.warn(` No workflow created for: "${truncate(testCase.prompt, 60)}"`);
      logger.warn(` ${buildError.slice(0, 200)}`);
      return result;
    }
    result.workflowBuildSuccess = true;
    result.workflowId = outcome.workflowsCreated[0].id;
    result.workflowJson = outcome.workflowJsons[0];
    logger.info(
      ` Workflow built: ${outcome.workflowsCreated[0].name} (${String(outcome.workflowsCreated[0].nodeCount)} nodes)`,
    );
    // 3. Run scenarios with bounded concurrency to avoid API rate limits
    const workflowId = outcome.workflowsCreated[0].id;
    for (const scenario of testCase.scenarios) {
      logger.info(` Scenario: ${scenario.name}`);
    }
    result.scenarioResults = await runWithConcurrency(
      testCase.scenarios,
      async (scenario) => {
        try {
          return await runScenario(client, scenario, workflowId, outcome.workflowJsons, logger);
        } catch (error: unknown) {
          // A scenario failure never aborts the other scenarios.
          const errorMessage = error instanceof Error ? error.message : String(error);
          logger.error(` ERROR [${scenario.name}]: ${errorMessage}`);
          return {
            scenario,
            success: false,
            score: 0,
            reasoning: `Error: ${errorMessage}`,
          } satisfies ScenarioResult;
        }
      },
      MAX_CONCURRENT_SCENARIOS,
    );
    // 4. Cleanup — delete workflows created during build
    for (const wf of outcome.workflowsCreated) {
      try {
        await client.deleteWorkflow(wf.id);
      } catch {
        // Best-effort cleanup
      }
    }
    // Clean up data tables created during this run
    if (outcome.dataTablesCreated.length > 0) {
      try {
        const projectId = await client.getPersonalProjectId();
        for (const dtId of outcome.dataTablesCreated) {
          try {
            await client.deleteDataTable(projectId, dtId);
          } catch {
            // Best-effort cleanup
          }
        }
        logger.verbose(` Cleaned up ${String(outcome.dataTablesCreated.length)} data table(s)`);
      } catch {
        // Non-fatal — project ID lookup may fail
      }
    }
  } catch (error: unknown) {
    // Any unexpected failure is recorded as a build error; the SSE stream is
    // aborted so the capture task does not linger.
    abortController.abort();
    const errorMessage = error instanceof Error ? error.message : String(error);
    result.buildError = errorMessage;
    logger.error(` Build failed: ${errorMessage}`);
  }
  return result;
}
// ---------------------------------------------------------------------------
// Scenario execution
// ---------------------------------------------------------------------------
/**
 * Execute one scenario against the built workflow (HTTP intercepted and
 * mocked by the LLM handler), then grade the outcome against the
 * scenario's success criteria via the checklist verifier.
 */
async function runScenario(
  client: N8nClient,
  scenario: TestScenario,
  workflowId: string,
  workflowJsons: WorkflowResponse[],
  logger: EvalLogger,
): Promise<ScenarioResult> {
  const evalResult = await client.executeWithLlmMock(workflowId, scenario.dataSetup);
  const nodeCount = Object.keys(evalResult.nodeResults).length;
  const statusLabel = evalResult.success ? 'success' : 'failed';
  logger.verbose(
    ` [${scenario.name}] Execution ${evalResult.executionId}: ${statusLabel}` +
      ` (${nodeCount} nodes, ${evalResult.errors.length} errors)`,
  );
  // The scenario's success criteria becomes a single LLM-verified checklist item.
  const scenarioChecklist: ChecklistItem[] = [
    {
      id: 1,
      description: scenario.successCriteria,
      category: 'execution',
      strategy: 'llm',
    },
  ];
  const verificationArtifact = buildVerificationArtifact(scenario, evalResult, workflowJsons);
  const verificationResults = await verifyChecklist(
    scenarioChecklist,
    verificationArtifact,
    workflowJsons,
  );
  const passed = verificationResults.length > 0 && verificationResults[0].pass;
  const verdict = verificationResults[0];
  const reasoning = verdict?.reasoning ?? 'No verification result';
  const failureCategory = verdict?.failureCategory;
  const rootCause = verdict?.rootCause;
  const categoryLabel = failureCategory ? ` [${failureCategory}]` : '';
  logger.info(
    ` [${scenario.name}] ${passed ? 'PASS' : 'FAIL'}${categoryLabel}: ${reasoning.slice(0, 100)}`,
  );
  return {
    scenario,
    success: passed,
    evalResult,
    score: passed ? 1 : 0,
    reasoning,
    failureCategory,
    rootCause,
  };
}
// ---------------------------------------------------------------------------
// Verification artifact builder
// ---------------------------------------------------------------------------
/**
 * Build a rich markdown verification artifact from the execution result.
 *
 * Sections: scenario context, automated pre-analysis flags (framework /
 * builder / mock issues), an execution summary, the full workflow
 * structure, and a per-node execution trace — everything the LLM
 * verifier needs to diagnose root causes.
 *
 * @param scenario - The scenario that was executed.
 * @param evalResult - Per-node results, errors, and mock hints.
 * @param workflowJsons - Workflow JSON(s); the first is used for node configs.
 * @returns The artifact as a single markdown string.
 */
function buildVerificationArtifact(
  scenario: TestScenario,
  evalResult: InstanceAiEvalExecutionResult,
  workflowJsons: WorkflowResponse[],
): string {
  const sections: string[] = [];
  // --- Scenario context ---
  // Fix: the description used to be concatenated directly onto the name
  // (`**Name:** ${name}${description}`) with no separator or label.
  sections.push(
    '## Scenario',
    '',
    `**Name:** ${scenario.name}`,
    `**Description:** ${scenario.description}`,
    `**Data setup:** ${scenario.dataSetup}`,
    '',
  );
  // --- Pre-analysis: flag known issues programmatically ---
  const preAnalysis: string[] = [];
  // Flag Phase 1 failures — these cause empty trigger data and cascade failures
  if (evalResult.hints.warnings.length > 0) {
    for (const warning of evalResult.hints.warnings) {
      preAnalysis.push(`⚠ FRAMEWORK ISSUE: ${warning}`);
    }
  }
  if (Object.keys(evalResult.hints.triggerContent).length === 0) {
    preAnalysis.push(
      '⚠ FRAMEWORK ISSUE: Trigger content is empty — the start node received no input data. All downstream failures are likely caused by this, not by the workflow builder.',
    );
  }
  for (const [nodeName, nr] of Object.entries(evalResult.nodeResults)) {
    if (nr.configIssues && Object.keys(nr.configIssues).length > 0) {
      preAnalysis.push(
        `⚠ BUILDER ISSUE: "${nodeName}" has missing config: ${Object.values(nr.configIssues).flat().join('; ')}`,
      );
    }
    for (const req of nr.interceptedRequests) {
      // Mock-generation failures are marked with a sentinel key in the response.
      if (
        typeof req.mockResponse === 'object' &&
        req.mockResponse !== null &&
        '_evalMockError' in (req.mockResponse as Record<string, unknown>)
      ) {
        const msg = (req.mockResponse as Record<string, unknown>).message;
        const msgStr = typeof msg === 'string' ? msg : 'unknown';
        preAnalysis.push(
          `⚠ MOCK ISSUE: "${nodeName}" ${req.method} ${req.url} → mock generation failed: ${msgStr}`,
        );
      }
    }
  }
  if (preAnalysis.length > 0) {
    sections.push('## Pre-analysis (automated flags)', '', ...preAnalysis, '');
  }
  // --- Execution summary ---
  const mockedNodes: string[] = [];
  const pinnedNodes: string[] = [];
  const realNodes: string[] = [];
  for (const [nodeName, nr] of Object.entries(evalResult.nodeResults)) {
    if (nr.executionMode === 'mocked') mockedNodes.push(nodeName);
    else if (nr.executionMode === 'pinned') pinnedNodes.push(nodeName);
    else realNodes.push(nodeName);
  }
  sections.push(
    '## Execution summary',
    '',
    `**Status:** ${evalResult.success ? 'success' : 'failed'}`,
    `**Mocked nodes** (HTTP intercepted, responses generated by LLM): ${mockedNodes.join(', ') || 'none'}`,
    `**Pinned nodes** (trigger data provided, not executed): ${pinnedNodes.join(', ') || 'none'}`,
    `**Real nodes** (executed with actual logic on mock/pinned data): ${realNodes.join(', ') || 'none'}`,
    '',
  );
  if (evalResult.errors.length > 0) {
    sections.push('## Errors', '', ...evalResult.errors.map((e) => `- ${e}`), '');
  }
  // --- Build a node config lookup from workflow JSON ---
  const nodeConfigs = new Map<string, Record<string, unknown>>();
  const wf = workflowJsons[0];
  if (wf) {
    for (const node of wf.nodes) {
      if (node.name && node.parameters) {
        nodeConfigs.set(node.name, { type: node.type, parameters: node.parameters });
      }
    }
  }
  // --- Workflow structure: ALL nodes and connections ---
  const executedNodeNames = new Set(Object.keys(evalResult.nodeResults));
  if (wf) {
    sections.push('## Workflow structure (all nodes)', '');
    for (const node of wf.nodes) {
      const ran = node.name ? executedNodeNames.has(node.name) : false;
      const status = ran ? 'EXECUTED' : 'DID NOT RUN';
      sections.push(`- **${node.name ?? '(unnamed)'}** (${node.type}) — ${status}`);
    }
    sections.push('');
    sections.push('**Connections:**');
    sections.push('```json', JSON.stringify(wf.connections, null, 2), '```');
    sections.push('');
  }
  // --- Execution trace: per-node detail (sorted by execution order) ---
  sections.push('## Execution trace', '');
  const sortedNodeResults = Object.entries(evalResult.nodeResults).sort(
    ([, a], [, b]) => (a.startTime ?? 0) - (b.startTime ?? 0),
  );
  for (const [nodeName, nr] of sortedNodeResults) {
    sections.push(`### ${nodeName} [${nr.executionMode}]`);
    // Node configuration (from workflow JSON)
    const nodeConfig = nodeConfigs.get(nodeName);
    if (nodeConfig) {
      sections.push('**Node config:**');
      sections.push('```json', JSON.stringify(nodeConfig, null, 2), '```');
    }
    // Config issues
    if (nr.configIssues && Object.keys(nr.configIssues).length > 0) {
      sections.push(`**Config issues:** ${Object.values(nr.configIssues).flat().join('; ')}`);
    }
    // Intercepted requests + mock responses (for mocked nodes)
    // Fix: explicit nullish checks — a falsy-but-present body (0, '', false)
    // was previously omitted from the trace.
    for (const req of nr.interceptedRequests) {
      sections.push(`**Request:** ${req.method} ${req.url}`);
      if (req.requestBody !== undefined && req.requestBody !== null) {
        sections.push('```json', JSON.stringify(req.requestBody, null, 2), '```');
      }
      if (req.mockResponse !== undefined && req.mockResponse !== null) {
        sections.push('**Mock response:**');
        sections.push('```json', JSON.stringify(req.mockResponse, null, 2), '```');
      }
    }
    // Node output
    if (nr.output !== null && nr.output !== undefined) {
      sections.push('**Output:**');
      sections.push('```json', JSON.stringify(nr.output, null, 2), '```');
    } else {
      sections.push('**Output:** none');
    }
    sections.push('');
  }
  return sections.join('\n');
}
// ---------------------------------------------------------------------------
// SSE connection
// ---------------------------------------------------------------------------
/**
 * Open the thread's SSE stream and append every parseable event to
 * `events` until `signal` aborts. Malformed payloads are silently dropped.
 */
async function startSseConnection(
  client: N8nClient,
  threadId: string,
  events: CapturedEvent[],
  signal: AbortSignal,
): Promise<void> {
  return await consumeSseStream(
    client.getEventsUrl(threadId),
    client.cookie,
    (sseEvent) => {
      let parsed: Record<string, unknown>;
      try {
        parsed = JSON.parse(sseEvent.data) as Record<string, unknown>;
      } catch {
        // Ignore malformed events
        return;
      }
      const eventType = typeof parsed.type === 'string' ? parsed.type : 'unknown';
      events.push({ timestamp: Date.now(), type: eventType, data: parsed });
    },
    signal,
  );
}
// ---------------------------------------------------------------------------
// Wait for all activity: run-finish -> background tasks -> possible new run
// ---------------------------------------------------------------------------
/** Shared state for the wait/poll helpers below. */
interface WaitConfig {
  /** Client used to poll thread status, approve confirmations, and cancel runs. */
  client: N8nClient;
  /** Thread being observed. */
  threadId: string;
  /** Live SSE event buffer — appended to concurrently by the SSE capture. */
  events: CapturedEvent[];
  /** Confirmation request IDs already approved (avoids double approval). */
  approvedRequests: Set<string>;
  /** Epoch ms when the run began — timeouts are measured from here. */
  startTime: number;
  /** Overall time budget in ms, relative to startTime. */
  timeoutMs: number;
  /** Destination for progress/diagnostic output. */
  logger: EvalLogger;
}
/**
 * Wait until the thread is fully quiescent: the current run finishes, any
 * spawned background (sub-agent) tasks complete, and the main agent does
 * not start another run afterwards.
 *
 * @throws When config.timeoutMs elapses before activity settles.
 */
async function waitForAllActivity(config: WaitConfig): Promise<void> {
  let runFinishCount = 0;
  while (true) {
    // Block until at least one more run-finish than previously observed.
    await waitForRunFinish(config, runFinishCount);
    runFinishCount = countEvents(config.events, 'run-finish');
    config.logger.verbose(
      `[${config.threadId}] Run #${String(runFinishCount)} finished -- time: ${String(Date.now() - config.startTime)}ms`,
    );
    // Wait for background tasks (sub-agents) to complete
    const remainingMs = Math.max(0, config.timeoutMs - (Date.now() - config.startTime));
    await waitForBackgroundTasks(config, remainingMs);
    // Check if the main agent started a new run after background tasks completed
    await delay(SSE_SETTLE_DELAY_MS);
    const newRunStarts = countEvents(config.events, 'run-start');
    const currentRunFinishes = countEvents(config.events, 'run-finish');
    if (newRunStarts <= currentRunFinishes) {
      // Every started run has finished — the thread is idle.
      break;
    }
    config.logger.verbose(
      `[${config.threadId}] Main agent resumed (run-start #${String(newRunStarts)}) -- waiting for completion`,
    );
    if (Date.now() - config.startTime > config.timeoutMs) {
      throw new Error(`Run timed out after ${String(config.timeoutMs)}ms`);
    }
  }
}
/**
 * Poll until strictly more than `expectedFinishCount` run-finish events
 * have been captured, auto-approving confirmation requests between polls.
 * Cancels the run (best-effort) and throws once the overall timeout is hit.
 */
async function waitForRunFinish(config: WaitConfig, expectedFinishCount: number): Promise<void> {
  for (;;) {
    if (countEvents(config.events, 'run-finish') > expectedFinishCount) {
      return;
    }
    const elapsed = Date.now() - config.startTime;
    if (elapsed > config.timeoutMs) {
      await config.client.cancelRun(config.threadId).catch(() => {});
      throw new Error(`Run timed out after ${String(config.timeoutMs)}ms`);
    }
    await processConfirmationRequests(config);
    await delay(POLL_INTERVAL_MS);
  }
}
/**
 * Wait (up to `timeoutMs`) for all spawned sub-agent background tasks to
 * complete. Combines two signals: the REST thread-status endpoint and
 * SSE agent-spawned/agent-completed pairing. Returns immediately when no
 * sub-agent was ever spawned; a timeout here is non-fatal (logged only).
 */
async function waitForBackgroundTasks(config: WaitConfig, timeoutMs: number): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  const hasSpawnedAgents = config.events.some((e) => e.type === 'agent-spawned');
  if (!hasSpawnedAgents) {
    config.logger.verbose('No sub-agents spawned -- skipping background task wait');
    return;
  }
  config.logger.verbose('Sub-agent(s) detected -- waiting for background tasks...');
  while (Date.now() < deadline) {
    await processConfirmationRequests(config);
    // Check REST API for background task status
    const status = await config.client.getThreadStatus(config.threadId);
    const tasks = status.backgroundTasks ?? [];
    const restRunning = tasks.filter((t) => t.status === 'running');
    // Check SSE events for unmatched agent-spawned / agent-completed
    const ssePending = getPendingAgentIds(config.events);
    if (restRunning.length === 0 && ssePending.length === 0) {
      config.logger.verbose('All background tasks completed');
      // Brief settle delay so trailing events land before the caller re-checks.
      await delay(1000);
      return;
    }
    config.logger.verbose(
      `Waiting for ${String(restRunning.length)} REST task(s), ${String(ssePending.length)} SSE agent(s)`,
    );
    await delay(BACKGROUND_TASK_POLL_INTERVAL_MS);
  }
  config.logger.verbose(
    `Background task wait timed out after ${String(timeoutMs)}ms -- continuing`,
  );
}
// ---------------------------------------------------------------------------
// Confirmation auto-approval
// ---------------------------------------------------------------------------
// Retry bookkeeping for confirmation approvals.
// NOTE(review): this map is module-level, so retry counts persist across test
// cases within one process — confirm request IDs cannot collide across threads.
const confirmationRetries = new Map<string, number>();
/**
 * Auto-approve every captured confirmation-request event that has not yet
 * been approved, giving up per request after MAX_CONFIRMATION_RETRIES
 * failed attempts. Approval failures are logged, never thrown.
 */
async function processConfirmationRequests(config: WaitConfig): Promise<void> {
  const confirmationEvents = config.events.filter((e) => e.type === 'confirmation-request');
  for (const event of confirmationEvents) {
    const requestId = extractConfirmationRequestId(event);
    if (!requestId || config.approvedRequests.has(requestId)) {
      continue;
    }
    const retryCount = confirmationRetries.get(requestId) ?? 0;
    if (retryCount >= MAX_CONFIRMATION_RETRIES) {
      // Exhausted — leave this request unapproved.
      continue;
    }
    if (retryCount === 0) {
      config.logger.verbose(`[auto-approve] Approving confirmation: ${requestId}`);
    }
    try {
      // Always offer mock credentials — the eval runner doesn't have real
      // credentials for most services, so tell Instance AI to use mock data
      await config.client.confirmAction(requestId, true, { mockCredentials: true });
      config.approvedRequests.add(requestId);
      confirmationRetries.delete(requestId);
    } catch (error: unknown) {
      confirmationRetries.set(requestId, retryCount + 1);
      const msg = error instanceof Error ? error.message : String(error);
      config.logger.verbose(
        `[auto-approve] Failed to approve ${requestId} (attempt ${String(retryCount + 1)}/${String(MAX_CONFIRMATION_RETRIES)}): ${msg}`,
      );
    }
  }
}
// ---------------------------------------------------------------------------
// Event helpers
// ---------------------------------------------------------------------------
/** Number of captured events whose `type` matches exactly. */
function countEvents(events: CapturedEvent[], type: string): number {
  let total = 0;
  for (const event of events) {
    if (event.type === type) {
      total += 1;
    }
  }
  return total;
}
/**
 * Agent IDs with an `agent-spawned` event but no matching
 * `agent-completed` event — i.e. sub-agents presumed still running.
 */
function getPendingAgentIds(events: CapturedEvent[]): string[] {
  const spawned = new Set<string>();
  const completed = new Set<string>();
  for (const event of events) {
    const agentId = extractAgentId(event);
    if (!agentId) continue;
    if (event.type === 'agent-spawned') {
      spawned.add(agentId);
    } else if (event.type === 'agent-completed') {
      completed.add(agentId);
    }
  }
  const pending: string[] = [];
  for (const id of spawned) {
    if (!completed.has(id)) {
      pending.push(id);
    }
  }
  return pending;
}
/**
 * Pull the confirmation request ID from an SSE event, checking the
 * nested `payload` object first and the top-level data second.
 */
function extractConfirmationRequestId(event: CapturedEvent): string | undefined {
  const payload = getNestedRecord(event.data, 'payload');
  const nestedId = payload?.requestId;
  if (typeof nestedId === 'string') {
    return nestedId;
  }
  const topLevelId = event.data.requestId;
  return typeof topLevelId === 'string' ? topLevelId : undefined;
}
/**
 * Pull the agent ID from an SSE event, checking the top-level data first
 * and the nested `payload` object second.
 */
function extractAgentId(event: CapturedEvent): string | undefined {
  const direct = event.data.agentId;
  if (typeof direct === 'string') {
    return direct;
  }
  const nested = getNestedRecord(event.data, 'payload')?.agentId;
  return typeof nested === 'string' ? nested : undefined;
}
/**
 * Read `obj[key]` and return it only when it is a plain (non-null,
 * non-array) object; otherwise return `undefined`.
 */
function getNestedRecord(
  obj: Record<string, unknown>,
  key: string,
): Record<string, unknown> | undefined {
  const candidate = obj[key];
  const isPlainObject =
    typeof candidate === 'object' && candidate !== null && !Array.isArray(candidate);
  return isPlainObject ? (candidate as Record<string, unknown>) : undefined;
}
// ---------------------------------------------------------------------------
// Concurrency control
// ---------------------------------------------------------------------------
/**
* Run tasks with bounded concurrency. Like Promise.all but limits how many
* tasks execute simultaneously to avoid API rate limits.
*/
/**
 * Execute `fn` over `items` with at most `limit` tasks in flight at once.
 * Results are returned in input order, like `Promise.all`, but bounded to
 * avoid API rate limits.
 */
export async function runWithConcurrency<T, R>(
  items: T[],
  fn: (item: T) => Promise<R>,
  limit: number,
): Promise<R[]> {
  const results = new Array<R>(items.length);
  let cursor = 0;
  const lane = async (): Promise<void> => {
    for (;;) {
      if (cursor >= items.length) {
        return;
      }
      const index = cursor;
      cursor += 1;
      results[index] = await fn(items[index]);
    }
  };
  const laneCount = Math.min(limit, items.length);
  await Promise.all(Array.from({ length: laneCount }, async () => await lane()));
  return results;
}
// ---------------------------------------------------------------------------
// Utility helpers
// ---------------------------------------------------------------------------
/** Resolve after approximately `ms` milliseconds. */
async function delay(ms: number): Promise<void> {
  await new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/** Cap `text` at `maxLength` characters, appending '...' when it was cut. */
function truncate(text: string, maxLength: number): string {
	return text.length > maxLength ? `${text.slice(0, maxLength)}...` : text;
}

View file

@ -0,0 +1,379 @@
// ---------------------------------------------------------------------------
// Event parsing: extract outcome and metrics from captured SSE events
// ---------------------------------------------------------------------------
import type {
AgentActivity,
CapturedEvent,
CapturedToolCall,
EventOutcome,
InstanceAiMetrics,
} from '../types';
// ---------------------------------------------------------------------------
// Tool names whose results contain resource IDs we need to track
// ---------------------------------------------------------------------------
// Builder tools whose results are scanned for workflow IDs (see
// extractResourceIds below).
const WORKFLOW_TOOLS = new Set([
	'build-workflow',
	'submit-workflow',
	'patch-workflow',
	'build-workflow-with-agent',
]);
// Tool whose result is scanned for an execution ID.
const EXECUTION_TOOL = 'run-workflow';
// Tool whose result is scanned for a data table ID.
const DATA_TABLE_TOOL = 'create-data-table';
// ---------------------------------------------------------------------------
// Type guards for event payloads
// ---------------------------------------------------------------------------
// Narrow to a plain object — rejects null, arrays, and primitives.
function isRecord(value: unknown): value is Record<string, unknown> {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === 'object';
}
// Read a string property; any non-string value yields undefined.
function getString(obj: Record<string, unknown>, key: string): string | undefined {
	const candidate = obj[key];
	if (typeof candidate !== 'string') return undefined;
	return candidate;
}
// Read a nested plain-object property; null, arrays, and primitives yield undefined.
function getRecord(obj: Record<string, unknown>, key: string): Record<string, unknown> | undefined {
	const candidate = obj[key];
	if (typeof candidate === 'object' && candidate !== null && !Array.isArray(candidate)) {
		return candidate as Record<string, unknown>;
	}
	return undefined;
}
// ---------------------------------------------------------------------------
// extractOutcomeFromEvents
// ---------------------------------------------------------------------------
/**
 * Walk the captured SSE events once and derive the run's outcome: resource
 * IDs (workflows, executions, data tables), concatenated assistant text,
 * every tool call (with duration and result or error), and per-agent
 * activity.
 *
 * Event fields may live at the top level of `event.data` or nested under
 * `event.data.payload`; each case below handles both shapes.
 */
export function extractOutcomeFromEvents(events: CapturedEvent[]): EventOutcome {
	const workflowIds: string[] = [];
	const executionIds: string[] = [];
	const dataTableIds: string[] = [];
	const textChunks: string[] = [];
	const toolCalls: CapturedToolCall[] = [];
	const agentActivities: AgentActivity[] = [];
	// Track in-progress tool calls by toolCallId for duration calculation
	const toolCallStarts = new Map<
		string,
		{ timestamp: number; toolName: string; args: Record<string, unknown> }
	>();
	// Track agent activities by agentId
	const agentMap = new Map<string, AgentActivity>();
	for (const event of events) {
		const { type, data } = event;
		switch (type) {
			case 'text-delta': {
				const text = getString(data, 'text') ?? getString(getRecord(data, 'payload') ?? {}, 'text');
				if (text) {
					textChunks.push(text);
				}
				break;
			}
			case 'tool-call': {
				const payload = getRecord(data, 'payload') ?? data;
				const toolName = getString(payload, 'toolName') ?? '';
				const toolCallId = getString(payload, 'toolCallId') ?? getString(data, 'toolCallId') ?? '';
				const argsRaw = getRecord(payload, 'args');
				// When toolCallId is missing a synthetic key is stored; note that
				// tool-result lookups use the (empty) toolCallId, so such calls
				// will not be matched for duration.
				toolCallStarts.set(toolCallId || `${event.timestamp}-${toolName}`, {
					timestamp: event.timestamp,
					toolName,
					args: argsRaw ?? {},
				});
				break;
			}
			case 'tool-result': {
				const payload = getRecord(data, 'payload') ?? data;
				const toolCallId = getString(payload, 'toolCallId') ?? getString(data, 'toolCallId') ?? '';
				const startEntry = toolCallStarts.get(toolCallId);
				// tool-result events may not include toolName; fall back to the
				// name captured from the corresponding tool-call event.
				const toolName = getString(payload, 'toolName') ?? startEntry?.toolName ?? '';
				const result = payload.result ?? data.result;
				// Duration is 0 when no matching tool-call event was seen.
				const durationMs = startEntry ? event.timestamp - startEntry.timestamp : 0;
				const args = startEntry?.args ?? {};
				const toolCall: CapturedToolCall = {
					toolCallId: toolCallId || `auto-${event.timestamp}`,
					toolName,
					args,
					result,
					durationMs,
				};
				toolCalls.push(toolCall);
				// Extract resource IDs from tool results
				extractResourceIds(toolName, result, workflowIds, executionIds, dataTableIds);
				break;
			}
			case 'tool-error': {
				const payload = getRecord(data, 'payload') ?? data;
				const toolCallId = getString(payload, 'toolCallId') ?? getString(data, 'toolCallId') ?? '';
				const errorMsg = getString(payload, 'error') ?? getString(data, 'error') ?? 'Unknown error';
				const startEntry = toolCallStarts.get(toolCallId);
				const toolName = getString(payload, 'toolName') ?? startEntry?.toolName ?? '';
				const durationMs = startEntry ? event.timestamp - startEntry.timestamp : 0;
				const args = startEntry?.args ?? {};
				// Errored calls carry `error` instead of `result`.
				toolCalls.push({
					toolCallId: toolCallId || `auto-${event.timestamp}`,
					toolName,
					args,
					error: errorMsg,
					durationMs,
				});
				break;
			}
			case 'agent-spawned': {
				const payload = getRecord(data, 'payload') ?? data;
				const agentId = getString(data, 'agentId') ?? getString(payload, 'agentId') ?? '';
				const role = getString(payload, 'role') ?? '';
				const parentId = getString(payload, 'parentId');
				const toolsRaw = payload.tools;
				const tools = Array.isArray(toolsRaw)
					? (toolsRaw as unknown[]).filter((t): t is string => typeof t === 'string')
					: [];
				const activity: AgentActivity = {
					agentId,
					role,
					parentId,
					toolCalls: [],
					textContent: '',
					reasoning: '',
					status: 'running',
				};
				agentMap.set(agentId, activity);
				// Store tools info in reasoning for visibility
				if (tools.length > 0) {
					activity.reasoning = `Tools: ${tools.join(', ')}`;
				}
				break;
			}
			case 'agent-completed': {
				const payload = getRecord(data, 'payload') ?? data;
				const agentId = getString(data, 'agentId') ?? getString(payload, 'agentId') ?? '';
				const status = getString(payload, 'status') ?? 'completed';
				const resultText = getString(payload, 'result');
				const activity = agentMap.get(agentId);
				if (activity) {
					activity.status = status;
					if (resultText) {
						activity.textContent = resultText;
					}
				}
				break;
			}
			default:
				// Other event types (run-start, run-finish, confirmation-request, etc.)
				// are not directly needed for outcome extraction
				break;
		}
	}
	// Assign tool calls to their respective agents
	// NOTE(review): this is a linear scan per tool call (O(n^2) over events);
	// presumably acceptable for eval-sized streams — confirm if streams grow.
	for (const tc of toolCalls) {
		// Find the matching event to get agentId
		const matchingEvent = events.find(
			(e) =>
				(e.type === 'tool-result' || e.type === 'tool-error') &&
				(getString(getRecord(e.data, 'payload') ?? e.data, 'toolCallId') === tc.toolCallId ||
					getString(e.data, 'toolCallId') === tc.toolCallId),
		);
		if (matchingEvent) {
			const agentId = getString(matchingEvent.data, 'agentId') ?? '';
			const activity = agentMap.get(agentId);
			if (activity) {
				activity.toolCalls.push(tc);
			}
		}
	}
	// Convert agent map to array
	for (const activity of agentMap.values()) {
		agentActivities.push(activity);
	}
	return {
		workflowIds: dedupe(workflowIds),
		executionIds: dedupe(executionIds),
		dataTableIds: dedupe(dataTableIds),
		finalText: textChunks.join(''),
		toolCalls,
		agentActivities,
	};
}
// ---------------------------------------------------------------------------
// buildMetrics
// ---------------------------------------------------------------------------
/**
 * Compute latency and activity metrics from the captured event stream.
 *
 * @param events - Captured events in order; timestamps are absolute.
 * @param startTime - Run start timestamp; `timeTo*` values are relative to it.
 * @returns Aggregated metrics, including the raw events for downstream use.
 */
export function buildMetrics(events: CapturedEvent[], startTime: number): InstanceAiMetrics {
	let timeToFirstTextMs = 0;
	let timeToRunFinishMs = 0;
	let totalToolCalls = 0;
	let subAgentsSpawned = 0;
	let confirmationRequests = 0;
	const agentActivities: AgentActivity[] = [];
	const agentMap = new Map<string, AgentActivity>();
	let foundFirstText = false;
	for (const event of events) {
		const elapsed = event.timestamp - startTime;
		switch (event.type) {
			case 'text-delta': {
				// Only the FIRST text chunk sets the latency metric.
				if (!foundFirstText) {
					timeToFirstTextMs = elapsed;
					foundFirstText = true;
				}
				break;
			}
			case 'tool-call': {
				totalToolCalls++;
				break;
			}
			case 'agent-spawned': {
				subAgentsSpawned++;
				// agentId may be top-level or nested under payload.
				const payload = getRecord(event.data, 'payload') ?? event.data;
				const agentId = getString(event.data, 'agentId') ?? getString(payload, 'agentId') ?? '';
				const role = getString(payload, 'role') ?? '';
				const parentId = getString(payload, 'parentId');
				agentMap.set(agentId, {
					agentId,
					role,
					parentId,
					toolCalls: [],
					textContent: '',
					reasoning: '',
					status: 'running',
				});
				break;
			}
			case 'agent-completed': {
				const payload = getRecord(event.data, 'payload') ?? event.data;
				const agentId = getString(event.data, 'agentId') ?? getString(payload, 'agentId') ?? '';
				const status = getString(payload, 'status') ?? 'completed';
				const activity = agentMap.get(agentId);
				if (activity) {
					activity.status = status;
				}
				break;
			}
			case 'confirmation-request': {
				confirmationRequests++;
				break;
			}
			case 'run-finish': {
				// If several run-finish events occur, the last one wins.
				timeToRunFinishMs = elapsed;
				break;
			}
			default:
				break;
		}
	}
	for (const activity of agentMap.values()) {
		agentActivities.push(activity);
	}
	// Total time spans start -> last captured event; 0 when nothing was captured.
	const totalTimeMs = events.length > 0 ? events[events.length - 1].timestamp - startTime : 0;
	return {
		totalTimeMs,
		timeToFirstTextMs,
		timeToRunFinishMs,
		totalToolCalls,
		subAgentsSpawned,
		confirmationRequests,
		agentActivities,
		events,
	};
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Push any resource ID found in a tool result into the collector array that
 * matches the tool which produced it (workflow / execution / data table).
 */
function extractResourceIds(
	toolName: string,
	result: unknown,
	workflowIds: string[],
	executionIds: string[],
	dataTableIds: string[],
): void {
	let target: string[] | undefined;
	let keys: [string, string] | undefined;
	if (WORKFLOW_TOOLS.has(toolName)) {
		target = workflowIds;
		keys = ['workflowId', 'id'];
	} else if (toolName === EXECUTION_TOOL) {
		target = executionIds;
		keys = ['executionId', 'id'];
	} else if (toolName === DATA_TABLE_TOOL) {
		target = dataTableIds;
		keys = ['dataTableId', 'id'];
	}
	if (target && keys) {
		const id = extractIdFromResult(result, ...keys);
		if (id) target.push(id);
	}
}
/**
 * Pull the first string/number value stored under any of `keys` from a tool
 * result, tolerating results that arrive as stringified JSON. Empty strings
 * are skipped; numbers are stringified.
 */
function extractIdFromResult(result: unknown, ...keys: string[]): string | undefined {
	const asRecord = (value: unknown): Record<string, unknown> | undefined =>
		typeof value === 'object' && value !== null && !Array.isArray(value)
			? (value as Record<string, unknown>)
			: undefined;
	const pick = (record: Record<string, unknown>): string | undefined => {
		for (const key of keys) {
			const value = record[key];
			if (typeof value === 'string' && value.length > 0) return value;
			// Some APIs return numeric IDs
			if (typeof value === 'number') return String(value);
		}
		return undefined;
	};
	const direct = asRecord(result);
	if (direct) return pick(direct);
	// Result might be a stringified JSON
	if (typeof result === 'string') {
		try {
			const parsed = asRecord(JSON.parse(result) as unknown);
			return parsed ? pick(parsed) : undefined;
		} catch {
			return undefined;
		}
	}
	return undefined;
}
// Return the first non-empty string (or stringified number) stored under
// any of `keys`, scanning in key order.
function extractIdFromRecord(record: Record<string, unknown>, keys: string[]): string | undefined {
	for (const key of keys) {
		const value = record[key];
		switch (typeof value) {
			case 'string':
				if (value.length > 0) return value;
				break;
			case 'number':
				// Some APIs return numeric IDs
				return String(value);
			default:
				break;
		}
	}
	return undefined;
}
// Remove duplicate entries, preserving first-seen order.
function dedupe(arr: string[]): string[] {
	return Array.from(new Set(arr));
}

View file

@ -0,0 +1,213 @@
// ---------------------------------------------------------------------------
// Workflow discovery: snapshot IDs, build agent outcome, extract IDs from messages
// ---------------------------------------------------------------------------
import type { InstanceAiAgentNode, InstanceAiMessage } from '@n8n/api-types';
import type { N8nClient, WorkflowResponse } from '../clients/n8n-client';
import type { AgentOutcome, EventOutcome, ExecutionSummary, WorkflowSummary } from '../types';
// ---------------------------------------------------------------------------
// Tool names whose results contain workflow IDs
// ---------------------------------------------------------------------------
// Tool calls whose results are scanned for workflow IDs (see collectWorkflowIds).
const WORKFLOW_TOOLS = new Set([
	'build-workflow',
	'submit-workflow',
	'patch-workflow',
	'build-workflow-with-agent',
]);
// ---------------------------------------------------------------------------
// snapshotWorkflowIds -- call before the run to know what existed prior
// ---------------------------------------------------------------------------
/**
 * Capture the IDs of every workflow that exists before the run starts.
 * A failed listing is non-fatal and yields an empty snapshot.
 */
export async function snapshotWorkflowIds(client: N8nClient): Promise<Set<string>> {
	const ids = new Set<string>();
	try {
		for (const wf of await client.listWorkflows()) {
			ids.add(wf.id);
		}
	} catch {
		// Listing failed — fall through with the empty set.
	}
	return ids;
}
// ---------------------------------------------------------------------------
// buildAgentOutcome
// ---------------------------------------------------------------------------
/**
 * Resolve the final outcome of an agent run: fetch details for every
 * workflow and execution the run produced, combining IDs surfaced in SSE
 * events with a diff against the pre-run workflow snapshot.
 *
 * @param client - n8n API client used to fetch workflow/execution details.
 * @param eventOutcome - IDs and final text extracted from captured events.
 * @param preRunWorkflowIds - Optional snapshot taken before the run; any
 *   workflow absent from it (and unclaimed by a concurrent run) is
 *   attributed to this run.
 * @param claimedWorkflowIds - Optional set shared across concurrent runs so
 *   the same workflow is not attributed twice; mutated in place.
 */
export async function buildAgentOutcome(
	client: N8nClient,
	eventOutcome: EventOutcome,
	preRunWorkflowIds?: Set<string>,
	claimedWorkflowIds?: Set<string>,
): Promise<AgentOutcome> {
	const workflowsCreated: WorkflowSummary[] = [];
	const workflowJsons: WorkflowResponse[] = [];
	const executionsRun: ExecutionSummary[] = [];
	// Collect workflow IDs from events
	const knownWfIds = new Set(eventOutcome.workflowIds);
	// Mark event-based workflow IDs as claimed so concurrent runs skip them
	if (claimedWorkflowIds) {
		for (const id of knownWfIds) {
			claimedWorkflowIds.add(id);
		}
	}
	// Diff against pre-run snapshot to find workflows created by background tasks
	// that didn't surface in the SSE events we parsed.
	// When running concurrently, skip workflows already claimed by another run.
	if (preRunWorkflowIds) {
		try {
			const currentWorkflows = await client.listWorkflows();
			for (const wf of currentWorkflows) {
				if (
					!preRunWorkflowIds.has(wf.id) &&
					!knownWfIds.has(wf.id) &&
					!claimedWorkflowIds?.has(wf.id)
				) {
					knownWfIds.add(wf.id);
					claimedWorkflowIds?.add(wf.id);
				}
			}
		} catch {
			// Non-fatal -- fall back to event-based IDs only
		}
	}
	// Fetch workflow details
	for (const wfId of knownWfIds) {
		try {
			const wf = await client.getWorkflow(wfId);
			workflowsCreated.push({
				id: wfId,
				name: wf.name,
				nodeCount: wf.nodes.length,
				active: wf.active,
			});
			workflowJsons.push(wf);
		} catch {
			// Workflow may have been deleted or is inaccessible
			workflowsCreated.push({
				id: wfId,
				name: '(fetch failed)',
				nodeCount: 0,
				active: false,
			});
		}
	}
	// Fetch the execution list ONCE and match locally. Previously the full
	// list was re-fetched inside the loop, once per execution ID.
	if (eventOutcome.executionIds.length > 0) {
		let executions: Awaited<ReturnType<N8nClient['listExecutions']>> | undefined;
		try {
			executions = await client.listExecutions();
		} catch {
			executions = undefined;
		}
		for (const execId of eventOutcome.executionIds) {
			if (!executions) {
				// The single fetch failed — every execution is marked accordingly.
				executionsRun.push({
					id: execId,
					workflowId: 'unknown',
					status: 'fetch-failed',
				});
				continue;
			}
			const match = executions.find((e) => e.id === execId);
			if (match) {
				executionsRun.push({
					id: match.id,
					workflowId: match.workflowId,
					status: match.status,
				});
			} else {
				executionsRun.push({
					id: execId,
					workflowId: 'unknown',
					status: 'not-found',
				});
			}
		}
	}
	return {
		workflowsCreated,
		executionsRun,
		dataTablesCreated: eventOutcome.dataTableIds,
		finalText: eventOutcome.finalText,
		workflowJsons,
	};
}
// ---------------------------------------------------------------------------
// extractWorkflowIdsFromMessages
//
// Extracts workflow IDs from agent tree targetResource fields AND from
// tool call results (build-workflow, submit-workflow, etc.).
// Thread-scoped -- avoids cross-run workflow attribution.
// ---------------------------------------------------------------------------
/**
 * Collect workflow IDs referenced by assistant messages: from agent tree
 * targetResource fields and from workflow-builder tool call results.
 * Thread-scoped — avoids cross-run workflow attribution.
 */
export function extractWorkflowIdsFromMessages(messages: InstanceAiMessage[]): string[] {
	const ids = new Set<string>();
	for (const message of messages) {
		if (message.role !== 'assistant') continue;
		const tree = message.agentTree;
		if (tree) collectWorkflowIds(tree, ids);
	}
	return Array.from(ids);
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/** Recursively gather workflow IDs from an agent tree node and its children. */
function collectWorkflowIds(node: InstanceAiAgentNode, ids: Set<string>): void {
	const resource = node.targetResource;
	if (resource?.type === 'workflow' && resource.id) {
		ids.add(resource.id);
	}
	// Workflow-builder tool results may also carry the created workflow's ID.
	for (const tc of node.toolCalls) {
		if (!WORKFLOW_TOOLS.has(tc.toolName)) continue;
		const id = extractIdFromResult(tc.result);
		if (id) ids.add(id);
	}
	node.children.forEach((child) => {
		collectWorkflowIds(child, ids);
	});
}
// Narrow to a plain object — rejects null, arrays, and primitives.
function isRecord(value: unknown): value is Record<string, unknown> {
	return value !== null && typeof value === 'object' && !Array.isArray(value);
}
function extractIdFromResult(result: unknown): string | undefined {
const keys = ['workflowId', 'id'];
if (!isRecord(result)) {
if (typeof result === 'string') {
try {
const parsed: unknown = JSON.parse(result);
if (isRecord(parsed)) {
return extractIdFromRecord(parsed, keys);
}
} catch {
return undefined;
}
}
return undefined;
}
return extractIdFromRecord(result, keys);
}
// First key holding a non-empty string (or any number, stringified) wins.
function extractIdFromRecord(record: Record<string, unknown>, keys: string[]): string | undefined {
	for (const key of keys) {
		const value = record[key];
		if (typeof value === 'number') return String(value);
		if (typeof value === 'string' && value !== '') return value;
	}
	return undefined;
}

View file

@ -0,0 +1,501 @@
/**
* HTML report generator for workflow test case evaluations.
*
* Produces a self-contained HTML file optimized for three tasks:
 * 1. Triage — which scenarios failed? (seconds)
 * 2. Diagnose — why did they fail? (minutes)
 * 3. Compare — what changed between runs? (cross-report)
*/
import fs from 'fs';
import path from 'path';
import type { WorkflowTestCaseResult, ScenarioResult } from '../types';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Escape the five HTML-special characters for safe interpolation into markup. */
function escapeHtml(str: string): string {
	const entities: Record<string, string> = {
		'&': '&amp;',
		'<': '&lt;',
		'>': '&gt;',
		'"': '&quot;',
		"'": '&#39;',
	};
	// Single pass; equivalent to chained replaces with '&' handled first.
	return str.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
}
// ---------------------------------------------------------------------------
// Scenario rendering
// ---------------------------------------------------------------------------
/**
 * Render one scenario result as HTML. Passing scenarios collapse to a
 * one-liner with expandable detail; failing scenarios render expanded by
 * default with the description visible.
 *
 * @param sr - The scenario result to render.
 * @param index - Used only to build a unique `id` for the detail element.
 */
function renderScenario(sr: ScenarioResult, index: number): string {
	const icon = sr.success ? '&#10003;' : '&#10007;';
	const statusClass = sr.success ? 'pass' : 'fail';
	// Passing scenarios: compact one-liner with collapsible detail
	if (sr.success) {
		const summary = sr.reasoning ? sr.reasoning.slice(0, 150) : 'All checks passed';
		return `<div class="scenario ${statusClass}">
  <div class="scenario-header" onclick="this.parentElement.classList.toggle('expanded')">
    <span class="scenario-icon ${statusClass}">${icon}</span>
    <span class="scenario-name">${escapeHtml(sr.scenario.name)}</span>
    <span class="scenario-summary-inline">${escapeHtml(summary)}${sr.reasoning && sr.reasoning.length > 150 ? '...' : ''}</span>
  </div>
  <div class="scenario-detail" id="scenario-${String(index)}">
    ${renderScenarioDetail(sr)}
  </div>
</div>`;
	}
	// Failing scenarios: show error prominently, detail expanded by default
	return `<div class="scenario ${statusClass} expanded">
  <div class="scenario-header" onclick="this.parentElement.classList.toggle('expanded')">
    <span class="scenario-icon ${statusClass}">${icon}</span>
    <span class="scenario-name">${escapeHtml(sr.scenario.name)}</span>
    <span class="scenario-desc">${escapeHtml(sr.scenario.description)}</span>
  </div>
  <div class="scenario-detail" id="scenario-${String(index)}">
    ${renderScenarioDetail(sr)}
  </div>
</div>`;
}
/**
 * Render the expandable detail body for a scenario: failure category badge,
 * errors/warnings, verifier diagnosis, the Phase-1 mock data plan, and the
 * per-node execution trace (intercepted requests + node output).
 * Returns only the reasoning (if any) when no eval result exists.
 */
function renderScenarioDetail(sr: ScenarioResult): string {
	let html = '';
	if (!sr.evalResult) {
		if (sr.reasoning) {
			html += `<div class="diagnosis">${escapeHtml(sr.reasoning)}</div>`;
		}
		return html;
	}
	// Failure category badge
	if (!sr.success && sr.failureCategory) {
		const catClass =
			sr.failureCategory === 'builder_issue'
				? 'warn'
				: sr.failureCategory === 'mock_issue'
					? 'fail'
					: 'info';
		html += `<div class="category-badge category-${catClass}">${escapeHtml(sr.failureCategory)}${sr.rootCause ? ': ' + escapeHtml(sr.rootCause) : ''}</div>`;
	}
	// 1. Error — what broke
	if (sr.evalResult.errors.length > 0) {
		html += `<div class="error-box">${escapeHtml(sr.evalResult.errors.join('; '))}</div>`;
	}
	// Phase 1 warnings
	const warnings = sr.evalResult.hints?.warnings ?? [];
	if (warnings.length > 0) {
		html += `<div class="warning-box">${escapeHtml(warnings.join('; '))}</div>`;
	}
	// 2. Diagnosis — verifier's reasoning
	if (sr.reasoning) {
		html += '<details class="section" open><summary>Diagnosis</summary>';
		html += `<div class="diagnosis">${escapeHtml(sr.reasoning)}</div>`;
		html += '</details>';
	}
	// 3. Mock data plan — Phase 1 hints
	if (sr.evalResult.hints) {
		html += '<details class="section"><summary>Mock data plan</summary>';
		const { globalContext, triggerContent, nodeHints } = sr.evalResult.hints;
		if (globalContext) {
			html += '<div class="subsection-label">Global context</div>';
			html += `<div class="hint-text">${escapeHtml(globalContext)}</div>`;
		}
		if (Object.keys(triggerContent ?? {}).length > 0) {
			html += '<div class="subsection-label">Trigger content</div>';
			html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(triggerContent, null, 2))}</code></pre>`;
		} else {
			html +=
				'<div class="warning-inline">No trigger content generated \u2014 start node has no input data</div>';
		}
		if (nodeHints && Object.keys(nodeHints).length > 0) {
			html += '<div class="subsection-label">Per-node hints</div>';
			for (const [nodeName, hint] of Object.entries(nodeHints)) {
				html += `<details class="node-hint"><summary>${escapeHtml(nodeName)}</summary>`;
				html += `<div class="hint-text">${escapeHtml(hint)}</div>`;
				html += '</details>';
			}
		}
		html += '</details>';
	}
	// 4. Execution trace — per-node results
	const nodeEntries = Object.entries(sr.evalResult.nodeResults);
	if (nodeEntries.length > 0) {
		html += '<details class="section"><summary>Execution trace</summary>';
		html +=
			'<div class="trace-legend"><span class="node-mode-mocked">mocked</span> <span class="node-mode-pinned">pinned</span> <span class="node-mode-real">real</span></div>';
		for (const [nodeName, nr] of nodeEntries) {
			const modeClass = `node-mode-${nr.executionMode}`;
			// Non-empty configIssues means the builder produced a broken node config.
			const hasError = nr.configIssues && Object.keys(nr.configIssues).length > 0;
			const configWarning = hasError
				? `<span class="build-issue">Build issue: ${escapeHtml(Object.values(nr.configIssues!).flat().join('; '))}</span>`
				: '';
			html += '<div class="trace-node">';
			html += '<div class="trace-node-header">';
			html += `<span class="${modeClass}">[${nr.executionMode}]</span> <strong>${escapeHtml(nodeName)}</strong>`;
			if (nr.interceptedRequests.length > 0) {
				html += ` <span class="request-count">${String(nr.interceptedRequests.length)} request(s)</span>`;
			}
			html += '</div>';
			if (configWarning) html += configWarning;
			// Intercepted requests
			for (const req of nr.interceptedRequests) {
				html += '<div class="request-pair">';
				html += '<div class="request-header">Request sent</div>';
				html += `<div class="request-method">${escapeHtml(req.method)} ${escapeHtml(req.url)}</div>`;
				if (req.requestBody) {
					html += `<pre class="json-block json-sm"><code>${escapeHtml(JSON.stringify(req.requestBody, null, 2))}</code></pre>`;
				}
				html += '<div class="response-header">Mock returned</div>';
				if (req.mockResponse) {
					html += `<pre class="json-block json-sm"><code>${escapeHtml(JSON.stringify(req.mockResponse, null, 2))}</code></pre>`;
				} else {
					html += '<div class="muted">no mock response</div>';
				}
				html += '</div>';
			}
			// Node output
			if (nr.output !== null && nr.output !== undefined) {
				html += '<details class="node-output-toggle"><summary>Node output</summary>';
				html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(nr.output, null, 2))}</code></pre>`;
				html += '</details>';
			} else {
				html += '<div class="muted">no output</div>';
			}
			html += '</div>';
		}
		html += '</details>';
	}
	return html;
}
// ---------------------------------------------------------------------------
// Workflow summary
// ---------------------------------------------------------------------------
/**
 * Render the built-workflow overview for a test case: the per-node list
 * (mode, request count, build issues) taken from the FIRST scenario's eval
 * result, plus a collapsible raw-JSON dump of the agent's workflow output.
 */
function renderWorkflowSummary(result: WorkflowTestCaseResult): string {
	// The node structure is identical across scenarios, so the first eval
	// result is used as the representative source.
	const firstEval = result.scenarioResults[0]?.evalResult;
	let nodesHtml = '';
	if (firstEval) {
		const nodes = Object.entries(firstEval.nodeResults);
		if (nodes.length > 0) {
			const nodeList = nodes
				.map(([name, nr]) => {
					const mode = nr.executionMode;
					const requests = nr.interceptedRequests.length;
					const issues = nr.configIssues ? Object.values(nr.configIssues).flat().join('; ') : '';
					let line = `<span class="node-mode-${mode}">[${mode}]</span> ${escapeHtml(name)}`;
					if (requests > 0) line += ` <span class="muted">(${String(requests)} req)</span>`;
					if (issues)
						line += ` <span class="build-issue">Build issue: ${escapeHtml(issues)}</span>`;
					return `<li>${line}</li>`;
				})
				.join('');
			nodesHtml = `<details class="section"><summary>Built workflow (${String(nodes.length)} nodes)</summary><ul class="node-list">${nodeList}</ul></details>`;
		}
	}
	let jsonHtml = '';
	if (result.workflowJson) {
		const raw = JSON.stringify(result.workflowJson, null, 2);
		jsonHtml = `<details class="section"><summary>Agent output (raw JSON)</summary><pre class="json-block"><code>${escapeHtml(raw)}</code></pre></details>`;
	}
	return nodesHtml + jsonHtml;
}
// ---------------------------------------------------------------------------
// Test case rendering
// ---------------------------------------------------------------------------
/**
 * Render one test case card: build/score badges, truncated prompt, inline
 * pass/fail indicators per scenario, and an expandable detail section with
 * the full prompt, workflow summary, and scenario breakdowns.
 *
 * @param result - The test case result to render.
 * @param tcIndex - Test case position; scenario detail IDs are derived from
 *   `tcIndex * 100 + scenarioIndex` to stay unique across the report.
 */
function renderTestCase(result: WorkflowTestCaseResult, tcIndex: number): string {
	const passCount = result.scenarioResults.filter((sr) => sr.success).length;
	const totalCount = result.scenarioResults.length;
	const allPass = passCount === totalCount && totalCount > 0;
	// Card accent: build failure trumps scenario results.
	const statusClass = result.workflowBuildSuccess ? (allPass ? 'pass' : 'mixed') : 'fail';
	const buildBadge = result.workflowBuildSuccess
		? '<span class="badge badge-pass">BUILT</span>'
		: '<span class="badge badge-fail">BUILD FAILED</span>';
	const scoreBadge =
		totalCount > 0
			? `<span class="badge badge-${allPass ? 'pass' : 'fail'}">${String(passCount)}/${String(totalCount)}</span>`
			: '';
	const prompt = result.testCase.prompt;
	const truncatedPrompt = prompt.length > 100 ? prompt.slice(0, 100) + '...' : prompt;
	// Inline scenario indicators for quick triage without expanding
	const scenarioIndicators = result.scenarioResults
		.map(
			(sr) =>
				`<span class="scenario-indicator ${sr.success ? 'pass' : 'fail'}" title="${escapeHtml(sr.scenario.name)}">${sr.success ? '✓' : '✗'} ${escapeHtml(sr.scenario.name)}</span>`,
		)
		.join(' ');
	let scenariosHtml = '';
	if (result.scenarioResults.length > 0) {
		scenariosHtml = result.scenarioResults
			.map((sr, i) => renderScenario(sr, tcIndex * 100 + i))
			.join('');
	} else if (!result.workflowBuildSuccess) {
		const errorDetail = result.buildError
			? `<div class="error-box">${escapeHtml(result.buildError)}</div>`
			: '';
		scenariosHtml = `<div class="muted">Workflow failed to build — no scenarios executed</div>${errorDetail}`;
	}
	return `<div class="test-case ${statusClass}">
  <div class="test-case-header" onclick="this.parentElement.classList.toggle('expanded')">
    <div class="test-case-title">
      ${buildBadge} ${scoreBadge}
      <span class="test-case-prompt">${escapeHtml(truncatedPrompt)}</span>
    </div>
    <div class="test-case-meta">
      <span class="badge badge-tag">${escapeHtml(result.testCase.complexity)}</span>
      ${result.workflowId ? `<span class="workflow-id">${escapeHtml(result.workflowId)}</span>` : ''}
    </div>
    <div class="scenario-indicators">${scenarioIndicators}</div>
  </div>
  <div class="test-case-detail">
    <details class="section"><summary>Prompt</summary><div class="prompt-text">${escapeHtml(prompt)}</div></details>
    ${renderWorkflowSummary(result)}
    ${scenariosHtml}
  </div>
</div>`;
}
// ---------------------------------------------------------------------------
// Full report
// ---------------------------------------------------------------------------
export function generateWorkflowReport(results: WorkflowTestCaseResult[]): string {
const totalTestCases = results.length;
const builtCount = results.filter((r) => r.workflowBuildSuccess).length;
const allScenarios = results.flatMap((r) => r.scenarioResults);
const passCount = allScenarios.filter((sr) => sr.success).length;
const failCount = allScenarios.length - passCount;
const totalScenarios = allScenarios.length;
const passRate = totalScenarios > 0 ? Math.round((passCount / totalScenarios) * 100) : 0;
return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Workflow evaluation report</title>
<style>
:root {
--bg-primary: #0d1117;
--bg-secondary: #161b22;
--bg-tertiary: #1c2129;
--border: #30363d;
--border-light: #21262d;
--text-primary: #f0f6fc;
--text-secondary: #c9d1d9;
--text-muted: #8b949e;
--color-pass: #3fb950;
--color-fail: #f85149;
--color-warn: #d29922;
--color-info: #58a6ff;
--color-purple: #bc8cff;
--color-pass-bg: #23863622;
--color-fail-bg: #da363322;
--color-warn-bg: #d2992222;
}
* { margin: 0; padding: 0; box-sizing: border-box; }
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: var(--bg-primary); color: var(--text-secondary); padding: 24px; max-width: 1400px; margin: 0 auto; font-size: 14px; line-height: 1.5; }
/* Header */
h1 { color: var(--text-primary); font-size: 20px; margin-bottom: 2px; }
.subtitle { color: var(--text-muted); font-size: 13px; margin-bottom: 20px; }
/* Dashboard */
.dashboard { display: flex; gap: 12px; margin-bottom: 24px; flex-wrap: wrap; align-items: stretch; }
.stat-card { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 8px; padding: 14px 20px; min-width: 120px; }
.stat-card .label { color: var(--text-muted); font-size: 12px; }
.stat-card .value { color: var(--text-primary); font-size: 26px; font-weight: 700; margin-top: 2px; }
.stat-card .value.pass { color: var(--color-pass); }
.stat-card .value.fail { color: var(--color-fail); }
.stat-card .value.mixed { color: var(--color-warn); }
/* Toolbar */
.toolbar { display: flex; gap: 8px; margin-bottom: 16px; }
.toolbar button { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 6px; color: var(--text-secondary); padding: 6px 12px; font-size: 12px; cursor: pointer; }
.toolbar button:hover { background: var(--bg-tertiary); color: var(--text-primary); }
.toolbar button.active { border-color: var(--color-info); color: var(--color-info); }
/* Badges */
.badge { display: inline-block; padding: 2px 8px; border-radius: 12px; font-size: 11px; font-weight: 600; margin-right: 4px; }
.badge-pass { background: var(--color-pass-bg); color: var(--color-pass); }
.badge-fail { background: var(--color-fail-bg); color: var(--color-fail); }
.badge-tag { background: var(--border); color: var(--text-muted); }
/* Test case cards */
.test-case { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 8px; margin-bottom: 10px; overflow: hidden; }
.test-case.pass { border-left: 3px solid var(--color-pass); }
.test-case.fail { border-left: 3px solid var(--color-fail); }
.test-case.mixed { border-left: 3px solid var(--color-warn); }
.test-case-header { padding: 12px 16px; cursor: pointer; }
.test-case-header:hover { background: var(--bg-tertiary); }
.test-case-title { display: flex; align-items: center; gap: 8px; margin-bottom: 4px; }
.test-case-prompt { color: var(--text-primary); font-weight: 500; font-size: 13px; }
.test-case-meta { display: flex; align-items: center; gap: 6px; margin-bottom: 6px; }
.workflow-id { color: var(--text-muted); font-size: 11px; font-family: monospace; }
.scenario-indicators { display: flex; gap: 8px; flex-wrap: wrap; }
.scenario-indicator { font-size: 11px; font-family: monospace; }
.scenario-indicator.pass { color: var(--color-pass); }
.scenario-indicator.fail { color: var(--color-fail); }
.test-case-detail { display: none; padding: 0 16px 16px; }
.test-case.expanded .test-case-detail { display: block; }
/* Sections (collapsible) */
.section { margin: 8px 0; }
.section > summary { cursor: pointer; color: var(--color-info); font-size: 12px; font-weight: 600; padding: 4px 0; }
.section > summary:hover { text-decoration: underline; }
/* Scenarios */
.scenario { border: 1px solid var(--border-light); border-radius: 6px; margin-bottom: 6px; overflow: hidden; }
.scenario-header { padding: 8px 12px; cursor: pointer; display: flex; align-items: center; gap: 8px; font-size: 13px; }
.scenario-header:hover { background: var(--bg-tertiary); }
.scenario-icon { font-weight: bold; font-size: 14px; min-width: 16px; }
.scenario-icon.pass { color: var(--color-pass); }
.scenario-icon.fail { color: var(--color-fail); }
.scenario-name { color: var(--text-primary); font-weight: 600; }
.scenario-desc { color: var(--text-muted); font-size: 12px; }
.scenario-summary-inline { color: var(--text-muted); font-size: 12px; flex: 1; }
.scenario-detail { display: none; padding: 10px 12px; border-top: 1px solid var(--border-light); background: var(--bg-primary); }
.scenario.expanded .scenario-detail { display: block; }
/* Error and warning boxes */
.error-box { color: var(--color-fail); font-size: 12px; padding: 6px 10px; background: var(--color-fail-bg); border-radius: 4px; margin-bottom: 8px; border-left: 3px solid var(--color-fail); }
.warning-box { color: var(--color-warn); font-size: 12px; padding: 6px 10px; background: var(--color-warn-bg); border-radius: 4px; margin-bottom: 8px; border-left: 3px solid var(--color-warn); }
.warning-inline { color: var(--color-warn); font-size: 11px; margin: 4px 0; }
.build-issue { color: var(--color-warn); font-size: 11px; display: block; margin-top: 2px; }
/* Diagnosis */
.diagnosis { color: var(--text-secondary); font-size: 12px; line-height: 1.6; padding: 6px 0; }
/* Prompt */
.prompt-text { color: var(--text-secondary); font-size: 13px; line-height: 1.6; padding: 10px; background: var(--bg-primary); border: 1px solid var(--border); border-radius: 6px; white-space: pre-wrap; }
/* Execution trace */
.trace-legend { font-size: 11px; margin-bottom: 8px; display: flex; gap: 12px; }
.trace-node { border: 1px solid var(--border-light); border-radius: 4px; margin-bottom: 6px; padding: 8px; }
.trace-node-header { font-size: 12px; font-family: monospace; margin-bottom: 4px; }
.request-count { color: var(--text-muted); font-size: 11px; }
/* Request/response pairs */
.request-pair { border: 1px solid var(--border-light); border-radius: 4px; margin: 6px 0; overflow: hidden; }
.request-header { background: #1c3a5e; color: var(--color-info); font-size: 10px; font-weight: 700; padding: 3px 8px; letter-spacing: 0.5px; }
.response-header { background: #2a1f3e; color: var(--color-purple); font-size: 10px; font-weight: 700; padding: 3px 8px; letter-spacing: 0.5px; }
.request-method { font-size: 11px; color: var(--text-primary); padding: 4px 8px; font-family: monospace; font-weight: 600; background: var(--bg-primary); }
/* JSON blocks */
.json-block { font-size: 11px; margin: 4px 0; padding: 8px; background: var(--bg-secondary); border: 1px solid var(--border-light); border-radius: 4px; overflow-x: auto; }
.json-sm { font-size: 10px; }
pre { overflow-x: auto; margin: 0; }
code { color: var(--text-secondary); }
/* Node list */
.node-list { list-style: none; padding: 4px 0; font-size: 12px; font-family: monospace; }
.node-list li { padding: 3px 0; }
.node-mode-mocked { color: var(--color-info); font-weight: 600; }
.node-mode-pinned { color: var(--color-warn); font-weight: 600; }
.node-mode-real { color: var(--color-pass); font-weight: 600; }
/* Node output toggle */
.node-output-toggle { margin: 4px 0; }
.node-output-toggle > summary { cursor: pointer; color: var(--text-muted); font-size: 11px; }
/* Node hint */
.node-hint { margin: 2px 0; }
.node-hint > summary { cursor: pointer; color: var(--text-secondary); font-size: 11px; font-family: monospace; }
.hint-text { color: var(--text-muted); font-size: 11px; padding: 4px 0; line-height: 1.5; }
.subsection-label { color: var(--text-primary); font-size: 11px; font-weight: 600; margin-top: 8px; margin-bottom: 2px; }
/* Category badges */
.category-badge { font-size: 11px; font-weight: 600; padding: 4px 10px; border-radius: 4px; margin-bottom: 8px; }
.category-warn { background: var(--color-warn-bg); color: var(--color-warn); border-left: 3px solid var(--color-warn); }
.category-fail { background: var(--color-fail-bg); color: var(--color-fail); border-left: 3px solid var(--color-fail); }
.category-info { background: #1c3a5e33; color: var(--color-info); border-left: 3px solid var(--color-info); }
/* Utilities */
.muted { color: var(--text-muted); font-size: 12px; }
</style>
</head>
<body>
<h1>Workflow evaluation report</h1>
<p class="subtitle">Generated ${new Date().toLocaleString()} &mdash; ${String(totalScenarios)} scenarios across ${String(totalTestCases)} test cases</p>
<div class="dashboard">
<div class="stat-card">
<div class="label">Pass rate</div>
<div class="value${passRate >= 80 ? ' pass' : passRate >= 50 ? ' mixed' : ' fail'}">${String(passRate)}%</div>
</div>
<div class="stat-card">
<div class="label">Passed</div>
<div class="value pass">${String(passCount)}</div>
</div>
<div class="stat-card">
<div class="label">Failed</div>
<div class="value${failCount > 0 ? ' fail' : ''}">${String(failCount)}</div>
</div>
<div class="stat-card">
<div class="label">Built</div>
<div class="value${builtCount === totalTestCases ? ' pass' : ' mixed'}">${String(builtCount)}/${String(totalTestCases)}</div>
</div>
</div>
<div class="toolbar">
<button onclick="document.querySelectorAll('.test-case').forEach(e => e.classList.add('expanded'))">Expand all</button>
<button onclick="document.querySelectorAll('.test-case').forEach(e => e.classList.remove('expanded'))">Collapse all</button>
<button onclick="document.querySelectorAll('.test-case').forEach(e => { e.style.display = e.classList.contains('pass') ? 'none' : '' }); this.classList.toggle('active')">Show failures only</button>
</div>
${results.map((r, i) => renderTestCase(r, i)).join('')}
</body>
</html>`;
}
// ---------------------------------------------------------------------------
// Write report to disk
// ---------------------------------------------------------------------------
/**
 * Render the HTML report for a set of workflow test case results and persist it.
 *
 * Writes two files into the package-local `.data` directory (created on demand):
 * a timestamped `workflow-eval-<timestamp>.html` plus a stable
 * `workflow-eval-report.html` that always holds the latest run.
 *
 * @param results - Per-test-case results to render.
 * @returns Path of the timestamped report file.
 */
export function writeWorkflowReport(results: WorkflowTestCaseResult[]): string {
	const outputDir = path.join(__dirname, '..', '..', '.data');
	// recursive:true is a no-op when the directory already exists
	fs.mkdirSync(outputDir, { recursive: true });
	const reportHtml = generateWorkflowReport(results);
	// e.g. "2026-04-07T14-31-16" — colons/dots replaced so the name is filesystem-safe
	const stamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
	const timestampedPath = path.join(outputDir, `workflow-eval-${stamp}.html`);
	// Write the timestamped copy and the stable filename for quick access
	for (const target of [timestampedPath, path.join(outputDir, 'workflow-eval-report.html')]) {
		fs.writeFileSync(target, reportHtml);
	}
	return timestampedPath;
}

View file

@ -0,0 +1,85 @@
/**
 * System prompt for the LLM verifier that judges checklist items against a
 * mock-execution artifact. The verifier must return ONLY a JSON array of
 * per-item results ({id, pass, reasoning, failureCategory, rootCause}).
 *
 * Fix: the em-dash ("—") and arrow ("→") separators were lost in this text
 * (e.g. "This is by design the purpose is…", "response? mock generation
 * failure"), garbling the instructions; they are restored below.
 */
export const MOCK_EXECUTION_VERIFY_PROMPT = `You are an expert evaluator for n8n workflow execution tests. Workflows are built by an AI agent and executed with LLM-generated mock HTTP responses. Your job is to verify whether each checklist item is satisfied AND diagnose the root cause of any failure.
## How mock execution works
This is a test environment. No real credentials or API connections exist. ALL HTTP calls are intercepted and answered by an LLM mock. This is by design — the purpose is to test the workflow structure and data flow without real services.
- **Mocked nodes**: Made HTTP requests that were intercepted. An LLM generated the response. The node then processed the mock response using its real code. These nodes have NO real credentials — they use mock credentials that allow the node code to run but never reach real APIs.
- **Pinned nodes**: Trigger/start nodes whose output was generated by an LLM to simulate incoming data (webhooks, schedules). They didn't execute — their output was injected directly.
- **Real nodes**: Logic nodes (Code, Set, Merge, Filter, Sort, IF, Switch) that executed their actual code on data from mocked/pinned upstream nodes.
IMPORTANT: Nodes receiving mock responses instead of real API responses is EXPECTED. Missing or mock credentials is EXPECTED. Don't flag these as issues — they are the testing mechanism itself.
## What you receive
The verification artifact contains:
- **Pre-analysis**: Automated flags for known issues (builder config problems, mock generation failures)
- **Execution summary**: Which nodes were mocked, pinned, or real
- **Errors**: Any runtime errors from the execution
- **Workflow structure**: ALL nodes that were built, whether they executed or not, plus the full connections JSON showing how nodes are wired. Use this to verify node existence and wiring before making claims about missing nodes or wrong connections.
- **Execution trace**: Per-node detail including HTTP requests sent, mock responses returned, and node output. Only includes nodes that actually ran. **IMPORTANT: The trace is NOT in chronological order.** Do not infer execution sequence from the order nodes appear in the trace. Use the connections JSON in the workflow structure to determine execution flow.
## How to evaluate
1. Check the **pre-analysis flags first** — they identify known issues programmatically
2. Check if the workflow **executed without errors** (unless the scenario specifically tests error handling)
3. For each error, **trace the root cause** through the execution trace:
- Did a mocked node receive "_evalMockError" in the response? → mock generation failure
- Did a node have configIssues flagged? → builder issue (agent misconfigured the node)
- Did a real node crash because a field is missing? **check the request that was sent**: if the HTTP request (e.g., GraphQL query) didn't ask for that field, the mock correctly omitted it — that's a builder issue (wrong query or wrong node choice), NOT a mock issue. The mock can only return what was requested.
- Did the mock response have the wrong shape for the endpoint? (e.g., returning a write response for a GET request) → mock issue
- Did the mock return identical responses for multiple calls to the same endpoint with different request bodies? → mock issue
- Did the workflow handle an error scenario but the success criteria is ambiguous about what "graceful" means? → evaluate based on whether data was lost or the workflow crashed entirely
KEY PRINCIPLE: A mock response that faithfully matches the HTTP request is NEVER a mock issue, even if downstream nodes needed different data. If the request didn't ask for a field, the mock shouldn't invent it. The fault lies with whatever built the request (the node choice or its configuration).
4. **Be definitive, not speculative.** You have the full execution trace, node configurations, request bodies, mock responses, and node outputs. Use this data to give exact answers. Say "the expression references $json.firstName but the upstream output has the field as firstname (lowercase)" not "likely references a field that doesn't resolve correctly." If a node errored, quote the exact error. If a field is missing, name it and trace where it should have come from. Never use "likely", "might", "probably", or "possibly" when the data in the artifact gives you a definitive answer.
5. **Always check the "Workflow structure" section before claiming a node is missing or miswired.** The workflow structure lists ALL nodes that were built AND the connections JSON showing exactly how they are wired. The execution trace only shows nodes that actually ran. Before claiming a branch is wired to the wrong node, verify against the connections JSON. If a node exists in the structure but not in the trace, check why: was an upstream condition met unexpectedly? Was the IF/Switch node's condition misconfigured? Was the input data wrong? Don't assume miswiring — check the connections first.
6. **Workflows can branch.** Not every node runs in every execution. A crashed or misconfigured node prevents all downstream branches from running. When diagnosing, identify the single root cause (the first node that crashed) rather than listing each unexecuted downstream node as a separate issue.
7. Check the **success criteria** against the execution trace and node outputs
8. For scenarios with no errors and no output beyond the trigger: this usually means the workflow handled empty data gracefully (no crash = success for empty-input scenarios)
## Failure categories
When a checklist item fails, categorize the root cause:
- **builder_issue**: The AI agent that built the workflow misconfigured a node (missing parameters, wrong settings, incomplete config, wrong routing logic, missing nodes). Evidence: configIssues flags, nodes crashing before making HTTP requests, Switch/IF nodes missing required options, workflow structure doesn't match what the prompt asked for.
- **mock_issue**: The LLM mock handler returned incorrect or missing data. Evidence: _evalMockError in responses, mock response shape doesn't match what the node expects, identical responses for different requests, mock data missing fields that downstream nodes reference.
- **legitimate_failure**: The workflow genuinely doesn't meet the success criteria and neither the builder nor mock is at fault. The test is working as designed — for example, the workflow lacks error handling that the scenario tests for.
- **framework_issue**: The evaluation framework itself failed — Phase 1 returned an error or empty trigger content, causing cascading failures. Evidence: pre-analysis flags starting with "FRAMEWORK ISSUE", empty trigger node output (empty JSON object), "Phase 1 error" warnings. When this happens, downstream node crashes are a consequence of the empty input, NOT a builder or mock problem.
- **verification_gap**: You don't have enough information in the artifact to make a determination.
NOT failure categories:
- Nodes using mock credentials instead of real ones — this is expected
- HTTP responses coming from the LLM mock instead of real APIs — this is expected
- Trigger nodes having pinned/generated data instead of real events — this is expected
## Output format
Return ONLY a JSON array:
\`\`\`json
[
{
"id": 1,
"pass": true,
"reasoning": "All nodes executed without errors. The webhook data flowed through Gmail, Telegram, and Google Sheets correctly.",
"failureCategory": null,
"rootCause": null
}
]
\`\`\`
For failures:
\`\`\`json
[
{
"id": 1,
"pass": false,
"reasoning": "The Sort node crashed because the upstream Filter & Count node produced {noData: true} instead of items with a 'count' field.",
"failureCategory": "mock_issue",
"rootCause": "The Linear node's mock response didn't include creator.email, so the Filter code node filtered out all items."
}
]
\`\`\`
`;

View file

@ -0,0 +1,171 @@
// ---------------------------------------------------------------------------
// Shared types for the instance-ai workflow test case evaluator
// ---------------------------------------------------------------------------
import type { InstanceAiEvalExecutionResult } from '@n8n/api-types';
import type { WorkflowResponse } from './clients/n8n-client';
// ---------------------------------------------------------------------------
// Checklist items and verification
// ---------------------------------------------------------------------------
/** Aspect of the built workflow a checklist item verifies. */
export type ChecklistCategory = 'structure' | 'data' | 'behavior' | 'execution';
/** How a checklist item is checked: in code ('programmatic') or by an LLM verifier ('llm'). */
export type VerificationStrategy = 'programmatic' | 'llm';
/** A single verifiable expectation about the built workflow or its execution. */
export interface ChecklistItem {
	// Stable numeric id — results reference items by this id
	id: number;
	description: string;
	category: ChecklistCategory;
	strategy: VerificationStrategy;
}
/** Outcome of verifying one ChecklistItem. */
export interface ChecklistResult {
	// Id of the ChecklistItem this result refers to
	id: number;
	pass: boolean;
	// Explanation of why the item passed or failed
	reasoning: string;
	strategy: VerificationStrategy;
	/** Root-cause category when the item fails (e.g. builder vs mock issue). */
	failureCategory?: string;
	rootCause?: string;
}
// ---------------------------------------------------------------------------
// SSE event capture
// ---------------------------------------------------------------------------
/** One server-sent event captured from the agent stream. */
export interface CapturedEvent {
	// NOTE(review): presumably epoch milliseconds — confirm against the producer
	timestamp: number;
	type: string;
	data: Record<string, unknown>;
}
/** A tool invocation observed during the agent run. */
export interface CapturedToolCall {
	toolCallId: string;
	toolName: string;
	args: Record<string, unknown>;
	// Tool result payload, when the call completed
	result?: unknown;
	// Error message, when the call failed
	error?: string;
	durationMs: number;
}
/** Aggregated activity of one agent (root or sub-agent) over the run. */
export interface AgentActivity {
	agentId: string;
	role: string;
	// Id of the spawning agent, when this is a sub-agent
	parentId?: string;
	toolCalls: CapturedToolCall[];
	textContent: string;
	reasoning: string;
	status: string;
}
// ---------------------------------------------------------------------------
// Metrics
// ---------------------------------------------------------------------------
/** Timing and volume metrics collected over a full agent interaction. */
export interface InstanceAiMetrics {
	totalTimeMs: number;
	timeToFirstTextMs: number;
	timeToRunFinishMs: number;
	totalToolCalls: number;
	subAgentsSpawned: number;
	confirmationRequests: number;
	agentActivities: AgentActivity[];
	// Raw captured event stream backing the aggregate numbers above
	events: CapturedEvent[];
}
// ---------------------------------------------------------------------------
// Outcome types
// ---------------------------------------------------------------------------
/** Minimal summary of a workflow the agent created. */
export interface WorkflowSummary {
	id: string;
	name: string;
	nodeCount: number;
	active: boolean;
}
/** Output items produced by a single node during execution. */
export interface NodeOutputData {
	nodeName: string;
	data: Array<Record<string, unknown>>;
}
/** HTTP response returned by a webhook-triggered execution. */
export interface WebhookResponse {
	status: number;
	body: unknown;
}
/** Summary of one workflow execution observed during the eval. */
export interface ExecutionSummary {
	id: string;
	workflowId: string;
	status: string;
	// Runtime error message, when the execution failed
	error?: string;
	// Name of the node the failure originated from, when known
	failedNode?: string;
	// True when the eval harness (not the agent) triggered this execution
	triggeredByEval?: boolean;
	outputData?: NodeOutputData[];
	webhookResponse?: WebhookResponse;
}
/** Everything the agent produced over the interaction, in summarized form. */
export interface AgentOutcome {
	workflowsCreated: WorkflowSummary[];
	executionsRun: ExecutionSummary[];
	dataTablesCreated: string[];
	// The agent's final assistant text
	finalText: string;
	// Full workflow JSON payloads for the created workflows
	workflowJsons: WorkflowResponse[];
}
/** Ids and activity extracted purely from the captured event stream. */
export interface EventOutcome {
	workflowIds: string[];
	executionIds: string[];
	dataTableIds: string[];
	finalText: string;
	toolCalls: CapturedToolCall[];
	agentActivities: AgentActivity[];
}
// ---------------------------------------------------------------------------
// Workflow evaluation test cases
// ---------------------------------------------------------------------------
/** One execution scenario run against a built workflow. */
export interface TestScenario {
	name: string;
	description: string;
	/** Instructions for mock data generation — passed as scenario hints to the LLM mock endpoint */
	dataSetup: string;
	/** Criteria the LLM verifier checks against the execution result */
	successCriteria: string;
}
/** A prompt-driven test case: the workflow to build plus scenarios to execute. */
export interface WorkflowTestCase {
	// Natural-language build prompt handed to the AI builder
	prompt: string;
	complexity: 'simple' | 'medium' | 'complex';
	tags: string[];
	triggerType?: 'manual' | 'webhook' | 'schedule' | 'form';
	scenarios: TestScenario[];
}
// ---------------------------------------------------------------------------
// Workflow test case results
// ---------------------------------------------------------------------------
/** Result of running one TestScenario against the built workflow. */
export interface ScenarioResult {
	scenario: TestScenario;
	success: boolean;
	// Raw execution result from the eval endpoint, when the run produced one
	evalResult?: InstanceAiEvalExecutionResult;
	score: number;
	reasoning: string;
	/** Root cause category when the scenario fails */
	failureCategory?: string;
	/** Detailed root cause explanation */
	rootCause?: string;
}
/** Result for a whole WorkflowTestCase: build outcome plus per-scenario results. */
export interface WorkflowTestCaseResult {
	testCase: WorkflowTestCase;
	// Id of the built workflow, when the build succeeded
	workflowId?: string;
	workflowBuildSuccess: boolean;
	// Error message, when the build failed
	buildError?: string;
	scenarioResults: ScenarioResult[];
	/** The built workflow JSON — saved for debugging and cross-run comparison */
	workflowJson?: WorkflowResponse;
}

View file

@ -9,7 +9,8 @@
"format:check": "biome ci src",
"test": "jest",
"lint": "eslint . --quiet",
"lint:fix": "eslint . --fix"
"lint:fix": "eslint . --fix",
"eval:instance-ai": "tsx evaluations/cli/index.ts"
},
"main": "dist/index.js",
"module": "src/index.ts",
@ -43,12 +44,15 @@
"turndown": "^7.2.0",
"zod": "catalog:",
"@ai-sdk/provider-v5": "npm:@ai-sdk/provider@2.0.0",
"zod-from-json-schema-v3": "npm:zod-from-json-schema@^0.0.5"
"zod-from-json-schema-v3": "npm:zod-from-json-schema@^0.0.5",
"@n8n/agents": "workspace:*",
"flatted": "catalog:"
},
"devDependencies": {
"@ai-sdk/anthropic": "2.0.61",
"@n8n/typescript-config": "workspace:*",
"@types/luxon": "3.2.0",
"@types/turndown": "^5.0.5"
"@types/turndown": "^5.0.5",
"tsx": "catalog:"
}
}

View file

@ -48,6 +48,7 @@ export { truncateToTitle, generateThreadTitle } from './memory/title-utils';
export { McpClientManager } from './mcp/mcp-client-manager';
export { mapMastraChunkToEvent } from './stream/map-chunk';
export { isRecord, parseSuspension, asResumable } from './utils/stream-helpers';
export { createEvalAgent, extractText, Tool, SONNET_MODEL, HAIKU_MODEL } from './utils/eval-agents';
export type { SuspensionInfo, Resumable } from './utils/stream-helpers';
export { buildAgentTreeFromEvents, findAgentNodeInTree } from './utils/agent-tree';
export { registerWithMastra } from './agent/register-with-mastra';

View file

@ -0,0 +1,87 @@
/**
* Shared agent factory for eval LLM calls.
*
* Centralizes model config, API key resolution, and text extraction
* for the 3 eval call sites (hint generation, mock responses, pin data).
*/
import { Agent, Tool, type GenerateResult } from '@n8n/agents';
export { Tool };
// ---------------------------------------------------------------------------
// Model constants
// ---------------------------------------------------------------------------
/** Default model id for eval agents — used when createEvalAgent gets no options.model. */
export const SONNET_MODEL = 'anthropic/claude-sonnet-4-6';
/** Alternative (smaller) Claude model id callers can pass explicitly via options.model. */
export const HAIKU_MODEL = 'anthropic/claude-haiku-4-5-20251001';
// ---------------------------------------------------------------------------
// API key resolution
// ---------------------------------------------------------------------------
/**
 * Resolve the API key for eval LLM calls from the environment.
 *
 * Prefers N8N_INSTANCE_AI_MODEL_API_KEY and falls back to N8N_AI_ANTHROPIC_KEY.
 *
 * @throws Error when neither environment variable holds a non-empty value.
 */
function getApiKey(): string {
	const apiKey =
		process.env.N8N_INSTANCE_AI_MODEL_API_KEY ?? process.env.N8N_AI_ANTHROPIC_KEY;
	if (apiKey) {
		return apiKey;
	}
	throw new Error(
		'Missing API key. Set N8N_INSTANCE_AI_MODEL_API_KEY (or N8N_AI_ANTHROPIC_KEY) in your environment.',
	);
}
// ---------------------------------------------------------------------------
// Agent factory
// ---------------------------------------------------------------------------
/**
 * Build a named eval Agent wired to the configured Anthropic model.
 *
 * @param name - Agent name.
 * @param options.model - Model id override; defaults to SONNET_MODEL.
 * @param options.instructions - System instructions for the agent.
 * @param options.cache - When true, attaches Anthropic ephemeral cacheControl
 *   provider options to the instructions.
 */
export function createEvalAgent(
	name: string,
	options: {
		model?: string;
		instructions: string;
		cache?: boolean;
	},
): Agent {
	const { model, instructions, cache } = options;
	const evalAgent = new Agent(name).model({
		id: model ?? SONNET_MODEL,
		apiKey: getApiKey(),
	});
	if (cache) {
		// Mark the instructions with Anthropic ephemeral cacheControl
		evalAgent.instructions(instructions, {
			providerOptions: {
				anthropic: { cacheControl: { type: 'ephemeral' as const } },
			},
		});
	} else {
		evalAgent.instructions(instructions);
	}
	return evalAgent;
}
// ---------------------------------------------------------------------------
// Text extraction
// ---------------------------------------------------------------------------
/**
 * Concatenate all assistant text parts from a GenerateResult.
 *
 * Non-assistant messages, messages whose content is not an array, and
 * non-text content parts are all skipped.
 *
 * @returns The joined text, or '' when no assistant text parts exist.
 */
export function extractText(result: GenerateResult): string {
	let combined = '';
	for (const message of result.messages) {
		const isAssistant = 'role' in message && message.role === 'assistant';
		if (!isAssistant || !('content' in message) || !Array.isArray(message.content)) {
			continue;
		}
		for (const part of message.content) {
			if (typeof part !== 'object' || part === null) continue;
			if (!('type' in part) || part.type !== 'text') continue;
			// String() guards against non-string `text` payloads
			if ('text' in part) combined += String(part.text);
		}
	}
	return combined;
}

View file

@ -16,5 +16,5 @@
"tsBuildInfoFile": "dist/typecheck.tsbuildinfo",
"types": ["node", "jest"]
},
"include": ["src/**/*.ts"]
"include": ["src/**/*.ts", "evaluations/**/*.ts"]
}

View file

@ -8,6 +8,10 @@ jest.mock('@n8n/instance-ai', () => ({
workflowBuildOutcomeSchema: z.string(),
}));
jest.mock('../eval/execution.service', () => ({
EvalExecutionService: jest.fn(),
}));
import type {
InstanceAiSendMessageRequest,
InstanceAiCorrectTaskRequest,
@ -38,6 +42,7 @@ import { ForbiddenError } from '@/errors/response-errors/forbidden.error';
import { NotFoundError } from '@/errors/response-errors/not-found.error';
import type { Push } from '@/push';
import type { EvalExecutionService } from '../eval/execution.service';
import type { InProcessEventBus } from '../event-bus/in-process-event-bus';
import type { InstanceAiMemoryService } from '../instance-ai-memory.service';
import type { InstanceAiSettingsService } from '../instance-ai-settings.service';
@ -75,6 +80,7 @@ describe('InstanceAiController', () => {
instanceAiService,
memoryService,
settingsService,
mock<EvalExecutionService>(),
eventBus,
moduleRegistry,
push,

View file

@ -0,0 +1,253 @@
// Shared logger double — the mocked DI container below hands this to the
// module under test, so its log calls land on these spies.
const mockLogger = {
	warn: jest.fn(),
	debug: jest.fn(),
};
jest.mock('@n8n/backend-common', () => ({
	Logger: class Logger {},
}));
jest.mock('@n8n/di', () => ({
	Container: {
		get: jest.fn().mockReturnValue(mockLogger),
	},
	Service: () => jest.fn(),
}));
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
// Exact fallback text the tests assert on (returned when Context7 lookup fails)
const FALLBACK_INSTRUCTIONS =
	'No API documentation was found for this endpoint. Generate the response based on your knowledge of this API. Follow standard REST conventions for the HTTP method: GET returns resource data, POST returns the created resource, PUT/PATCH returns the updated resource, DELETE returns 204 or confirmation.';
// mockFetch replaces global.fetch per test; savedApiKey restores CONTEXT7_API_KEY
let mockFetch: jest.Mock;
let savedApiKey: string | undefined;
/**
 * Queue one fetch() resolution on the shared mockFetch.
 *
 * `ok` defaults to the conventional 2xx check; non-string bodies are
 * JSON-serialized for the `text()` reader while `json()` returns them as-is.
 */
function mockFetchResponse(status: number, body: unknown, ok?: boolean) {
	const serialized = typeof body === 'string' ? body : JSON.stringify(body);
	const response = {
		ok: ok ?? (status >= 200 && status < 300),
		status,
		text: jest.fn().mockResolvedValue(serialized),
		json: jest.fn().mockResolvedValue(body),
	};
	mockFetch.mockResolvedValueOnce(response);
}
beforeEach(() => {
	// Fresh module registry so api-docs' module-level caches reset between tests
	jest.resetModules();
	jest.clearAllMocks();
	mockFetch = jest.fn();
	global.fetch = mockFetch;
	// Tests control CONTEXT7_API_KEY themselves; stash and clear any real value
	savedApiKey = process.env.CONTEXT7_API_KEY;
	delete process.env.CONTEXT7_API_KEY;
});
afterEach(() => {
	// Restore the caller's environment exactly as it was
	if (savedApiKey !== undefined) {
		process.env.CONTEXT7_API_KEY = savedApiKey;
	} else {
		delete process.env.CONTEXT7_API_KEY;
	}
});
// ---------------------------------------------------------------------------
// resolveLibraryId (tested indirectly through fetchApiDocs)
// ---------------------------------------------------------------------------
// resolveLibraryId is module-private, so it is exercised through fetchApiDocs:
// the first mocked fetch is the Context7 library search, the second the docs
// fetch. Each test dynamically imports '../api-docs' after jest.resetModules(),
// giving it a module instance with empty caches.
describe('resolveLibraryId (via fetchApiDocs)', () => {
	it('should return docs when library search and docs fetch both succeed', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		// Library search
		mockFetchResponse(200, [{ id: '/lib/slack-api', trust_score: 90 }]);
		// Docs fetch
		mockFetchResponse(200, 'POST /chat.postMessage — sends a message to a channel');
		const result = await fetchApiDocs('Slack', 'chat.postMessage');
		expect(result).toBe('POST /chat.postMessage — sends a message to a channel');
		expect(mockFetch).toHaveBeenCalledTimes(2);
	});
	it('should cache library ID — second call does not re-fetch library search', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		// First call: library search + docs
		mockFetchResponse(200, [{ id: '/lib/slack-api', trust_score: 90 }]);
		mockFetchResponse(200, 'docs for postMessage');
		await fetchApiDocs('Slack', 'chat.postMessage');
		// Second call: only docs fetch needed (library ID is cached)
		mockFetchResponse(200, 'docs for conversations.list');
		const result = await fetchApiDocs('Slack', 'conversations.list');
		expect(result).toBe('docs for conversations.list');
		// 2 from first call + 1 from second call = 3
		expect(mockFetch).toHaveBeenCalledTimes(3);
	});
	it('should return fallback when library search returns non-200', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		mockFetchResponse(500, 'Internal Server Error');
		const result = await fetchApiDocs('FakeService', 'endpoint');
		expect(result).toBe(FALLBACK_INSTRUCTIONS);
		// Docs fetch is never attempted when library resolution fails
		expect(mockFetch).toHaveBeenCalledTimes(1);
	});
	it('should return fallback when library search returns empty results', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		mockFetchResponse(200, []);
		const result = await fetchApiDocs('ObscureService', 'endpoint');
		expect(result).toBe(FALLBACK_INSTRUCTIONS);
		expect(mockFetch).toHaveBeenCalledTimes(1);
	});
	it('should log warning on 429 status (quota exceeded)', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		mockFetchResponse(429, 'Rate limit exceeded');
		await fetchApiDocs('Slack', 'endpoint');
		expect(mockLogger.warn).toHaveBeenCalledWith(
			expect.stringContaining('Context7 quota exceeded'),
		);
	});
	it('should log warning when response body contains "Quota"', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		mockFetchResponse(403, 'Quota limit reached for your plan');
		await fetchApiDocs('Slack', 'endpoint');
		expect(mockLogger.warn).toHaveBeenCalledWith(
			expect.stringContaining('Context7 quota exceeded'),
		);
	});
	it('should log warning only once per session (context7WarningLogged flag)', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		// First call — 429, should warn
		mockFetchResponse(429, 'Rate limit');
		await fetchApiDocs('Service1', 'endpoint');
		// Second call — also 429, should NOT warn again
		mockFetchResponse(429, 'Rate limit');
		await fetchApiDocs('Service2', 'endpoint');
		expect(mockLogger.warn).toHaveBeenCalledTimes(1);
	});
	it('should return fallback on fetch timeout / network error', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		mockFetch.mockRejectedValueOnce(new Error('network timeout'));
		const result = await fetchApiDocs('Slack', 'endpoint');
		expect(result).toBe(FALLBACK_INSTRUCTIONS);
		expect(mockLogger.debug).toHaveBeenCalledWith(
			expect.stringContaining('Context7 library search failed'),
		);
	});
});
// ---------------------------------------------------------------------------
// fetchApiDocs
// ---------------------------------------------------------------------------
// End-to-end behaviour of fetchApiDocs: happy path, per-(service, query)
// docs caching, fallback paths, and Authorization header handling.
describe('fetchApiDocs', () => {
	it('should return docs text from Context7 on success', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		mockFetchResponse(200, [{ id: '/lib/github-api', trust_score: 80 }]);
		mockFetchResponse(200, 'GET /repos/{owner}/{repo} — returns a repository');
		const result = await fetchApiDocs('GitHub', 'repos');
		expect(result).toBe('GET /repos/{owner}/{repo} — returns a repository');
	});
	it('should cache docs per serviceName + endpointQuery', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		// First call: library search + docs
		mockFetchResponse(200, [{ id: '/lib/slack-api', trust_score: 90 }]);
		mockFetchResponse(200, 'cached docs content');
		const first = await fetchApiDocs('Slack', 'chat.postMessage');
		// Second call with same args: should hit cache, no fetch
		const second = await fetchApiDocs('Slack', 'chat.postMessage');
		expect(first).toBe('cached docs content');
		expect(second).toBe('cached docs content');
		// Only 2 fetches total (library search + docs from first call)
		expect(mockFetch).toHaveBeenCalledTimes(2);
	});
	it('should return fallback when docs endpoint returns non-200', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		mockFetchResponse(200, [{ id: '/lib/slack-api', trust_score: 90 }]);
		mockFetchResponse(500, 'Server Error');
		const result = await fetchApiDocs('Slack', 'endpoint');
		expect(result).toBe(FALLBACK_INSTRUCTIONS);
	});
	it('should return fallback when docs response is empty text', async () => {
		const { fetchApiDocs } = await import('../api-docs');
		mockFetchResponse(200, [{ id: '/lib/slack-api', trust_score: 90 }]);
		// Whitespace-only body counts as empty
		mockFetchResponse(200, ' ');
		const result = await fetchApiDocs('Slack', 'endpoint');
		expect(result).toBe(FALLBACK_INSTRUCTIONS);
	});
	it('should pass CONTEXT7_API_KEY as Bearer Authorization header when env var is set', async () => {
		process.env.CONTEXT7_API_KEY = 'test-api-key-123';
		const { fetchApiDocs } = await import('../api-docs');
		mockFetchResponse(200, [{ id: '/lib/slack-api', trust_score: 90 }]);
		mockFetchResponse(200, 'docs');
		await fetchApiDocs('Slack', 'endpoint');
		// Both the library search and docs fetch should include the header
		for (const call of mockFetch.mock.calls) {
			const options = call[1] as { headers: Record<string, string> };
			expect(options.headers.Authorization).toBe('Bearer test-api-key-123');
		}
	});
	it('should not send Authorization header when CONTEXT7_API_KEY is not set', async () => {
		delete process.env.CONTEXT7_API_KEY;
		const { fetchApiDocs } = await import('../api-docs');
		mockFetchResponse(200, [{ id: '/lib/slack-api', trust_score: 90 }]);
		mockFetchResponse(200, 'docs');
		await fetchApiDocs('Slack', 'endpoint');
		for (const call of mockFetch.mock.calls) {
			const options = call[1] as { headers: Record<string, string> };
			expect(options.headers).not.toHaveProperty('Authorization');
		}
	});
});

View file

@ -0,0 +1,521 @@
import { mock } from 'jest-mock-extended';
import type { User } from '@n8n/db';
import type { Logger } from '@n8n/backend-common';
import type {
INode,
IRunExecutionData,
IRun,
IWorkflowBase,
INodeTypeDescription,
} from 'n8n-workflow';
import type { WorkflowFinderService } from '@/workflows/workflow-finder.service';
import type { NodeTypes } from '@/node-types';
// ---------------------------------------------------------------------------
// Mocks — must be before the import of the class under test
// ---------------------------------------------------------------------------
// Replace the LLM helpers so no real agent/model calls happen in tests
jest.mock('@n8n/instance-ai', () => ({
	createEvalAgent: jest.fn(),
	extractText: jest.fn(),
}));
jest.mock('../pin-data-generator', () => ({
	generatePinData: jest.fn(),
}));
jest.mock('../mock-handler', () => ({
	createLlmMockHandler: jest.fn(),
}));
jest.mock('../workflow-analysis', () => ({
	generateMockHints: jest.fn(),
	identifyNodesForHints: jest.fn(),
	identifyNodesForPinData: jest.fn(),
}));
// Pass pin data through unchanged so tests can assert on the exact input
jest.mock('@n8n/workflow-sdk', () => ({
	normalizePinData: jest.fn((pd: unknown) => pd),
}));
jest.mock('@/workflow-execute-additional-data', () => ({
	getBase: jest.fn().mockResolvedValue({
		hooks: undefined,
		evalLlmMockHandler: undefined,
	}),
}));
// WorkflowExecute is a class instantiated with `new` — mock it so
// processRunExecutionData returns a controllable IRun.
const mockProcessRunExecutionData = jest.fn();
jest.mock('n8n-core', () => {
	const actual = jest.requireActual('n8n-core');
	return {
		...actual,
		WorkflowExecute: jest.fn().mockImplementation(() => ({
			processRunExecutionData: mockProcessRunExecutionData,
		})),
		ExecutionLifecycleHooks: jest.fn().mockImplementation(() => ({})),
	};
});
// Workflow is a class instantiated with `new` — mock getStartNode
const mockGetStartNode = jest.fn();
jest.mock('n8n-workflow', () => {
	const actual = jest.requireActual('n8n-workflow');
	return {
		...actual,
		Workflow: jest.fn().mockImplementation(() => ({
			getStartNode: mockGetStartNode,
			nodes: {},
		})),
	};
});
// ---------------------------------------------------------------------------
// Import SUT and mocked modules (after jest.mock calls)
// ---------------------------------------------------------------------------
import { EvalExecutionService } from '../execution.service';
import {
generateMockHints,
identifyNodesForHints,
identifyNodesForPinData,
} from '../workflow-analysis';
import { createLlmMockHandler } from '../mock-handler';
import type { MockHints } from '../workflow-analysis';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
// Typed handles to the auto-mocked module functions, so tests can configure
// per-case return values with full type information.
const generateMockHintsMock = jest.mocked(generateMockHints);
const identifyNodesForHintsMock = jest.mocked(identifyNodesForHints);
const identifyNodesForPinDataMock = jest.mocked(identifyNodesForPinData);
const createLlmMockHandlerMock = jest.mocked(createLlmMockHandler);
/**
 * Builds a minimal two-node workflow entity (Webhook -> HTTP Request) for
 * the service under test; any field can be overridden per test case.
 */
function makeWorkflowEntity(overrides: Partial<IWorkflowBase> = {}) {
	const webhookNode = {
		id: 'node-1',
		name: 'Webhook',
		type: 'n8n-nodes-base.webhook',
		typeVersion: 1,
		position: [0, 0],
		parameters: {},
	} as INode;
	const httpRequestNode = {
		id: 'node-2',
		name: 'HTTP Request',
		type: 'n8n-nodes-base.httpRequest',
		typeVersion: 1,
		position: [200, 0],
		parameters: {},
	} as INode;
	const entity = {
		id: 'wf-1',
		name: 'Test Workflow',
		active: false,
		nodes: [webhookNode, httpRequestNode],
		connections: {},
		createdAt: new Date().toISOString(),
		updatedAt: new Date().toISOString(),
		...overrides,
	};
	return entity as unknown as IWorkflowBase;
}
/** Returns a stub user with only the id the service reads. */
function makeUser(): User {
	const stub = { id: 'user-1' };
	return stub as User;
}
/** Baseline Phase-1 hints fixture; tests mutate individual fields as needed. */
function makeEmptyHints(): MockHints {
	const hints = {
		globalContext: 'Test context',
		triggerContent: { body: { email: 'test@example.com' } },
		nodeHints: { 'HTTP Request': 'Return user data' },
		warnings: [],
		bypassPinData: {},
	};
	return hints as MockHints;
}
/** Builds a successful, empty IRun; overrides let tests inject runData/errors. */
function makeIRun(overrides: Partial<IRun> = {}): IRun {
	const base = {
		data: {
			resultData: {
				runData: {},
			},
		} as unknown as IRunExecutionData,
		mode: 'evaluation',
		startedAt: new Date(),
		status: 'success',
	};
	return { ...base, ...overrides } as IRun;
}
/** The Webhook trigger node returned by the mocked Workflow.getStartNode. */
function makeStartNode(): INode {
	const startNode = {
		id: 'node-1',
		name: 'Webhook',
		type: 'n8n-nodes-base.webhook',
		typeVersion: 1,
		position: [0, 0] as [number, number],
		parameters: {},
	};
	return startNode as INode;
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
describe('EvalExecutionService', () => {
	let service: EvalExecutionService;
	// jest-mock-extended deep mocks for the service's constructor dependencies.
	const workflowFinderService = mock<WorkflowFinderService>();
	const nodeTypes = mock<NodeTypes>();
	const logger = mock<Logger>();
	beforeEach(() => {
		jest.clearAllMocks();
		service = new EvalExecutionService(workflowFinderService, nodeTypes, logger);
		// Default mock returns — happy path
		identifyNodesForHintsMock.mockReturnValue([]);
		identifyNodesForPinDataMock.mockReturnValue([]);
		generateMockHintsMock.mockResolvedValue(makeEmptyHints());
		createLlmMockHandlerMock.mockReturnValue(jest.fn());
		mockGetStartNode.mockReturnValue(makeStartNode());
		mockProcessRunExecutionData.mockResolvedValue(makeIRun());
		// NodeTypes.getByNameAndVersion returns a minimal node type with no webhook
		nodeTypes.getByNameAndVersion.mockReturnValue({
			description: { properties: [] } as unknown as INodeTypeDescription,
		} as never);
	});
	// ── errorResult (workflow not found) ─────────────────────────────
	describe('when workflow is not found', () => {
		it('returns error result with descriptive message', async () => {
			workflowFinderService.findWorkflowForUser.mockResolvedValue(null);
			const result = await service.executeWithLlmMock('wf-missing', makeUser());
			expect(result.success).toBe(false);
			// The missing workflow id should be surfaced in the error text.
			expect(result.errors).toEqual(
				expect.arrayContaining([expect.stringContaining('wf-missing')]),
			);
		});
		it('returns empty nodeResults', async () => {
			workflowFinderService.findWorkflowForUser.mockResolvedValue(null);
			const result = await service.executeWithLlmMock('wf-missing', makeUser());
			expect(result.nodeResults).toEqual({});
		});
		it('returns empty hints with defaults', async () => {
			workflowFinderService.findWorkflowForUser.mockResolvedValue(null);
			const result = await service.executeWithLlmMock('wf-missing', makeUser());
			expect(result.hints.globalContext).toBe('');
			expect(result.hints.triggerContent).toEqual({});
			expect(result.hints.nodeHints).toEqual({});
		});
	});
	// ── executeWithLlmMock orchestration ─────────────────────────────
	describe('orchestration', () => {
		beforeEach(() => {
			workflowFinderService.findWorkflowForUser.mockResolvedValue(makeWorkflowEntity() as never);
		});
		it('calls generateMockHints with workflow and node names', async () => {
			const hintNodes = [makeStartNode()];
			identifyNodesForHintsMock.mockReturnValue(hintNodes);
			await service.executeWithLlmMock('wf-1', makeUser());
			expect(generateMockHintsMock).toHaveBeenCalledWith(
				expect.objectContaining({
					workflow: expect.objectContaining({ id: 'wf-1' }),
					nodeNames: ['Webhook'],
				}),
			);
		});
		it('forwards scenarioHints to generateMockHints', async () => {
			identifyNodesForHintsMock.mockReturnValue([]);
			await service.executeWithLlmMock('wf-1', makeUser(), {
				scenarioHints: 'error scenario',
			});
			expect(generateMockHintsMock).toHaveBeenCalledWith(
				expect.objectContaining({ scenarioHints: 'error scenario' }),
			);
		});
		it('calls createLlmMockHandler with hints from Phase 1', async () => {
			const hints = makeEmptyHints();
			hints.globalContext = 'shared context';
			hints.nodeHints = { 'HTTP Request': 'return list of users' };
			generateMockHintsMock.mockResolvedValue(hints);
			await service.executeWithLlmMock('wf-1', makeUser());
			expect(createLlmMockHandlerMock).toHaveBeenCalledWith(
				expect.objectContaining({
					globalContext: 'shared context',
					nodeHints: { 'HTTP Request': 'return list of users' },
				}),
			);
		});
		it('returns executionId in the result', async () => {
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			expect(result.executionId).toBeDefined();
			expect(typeof result.executionId).toBe('string');
			expect(result.executionId.length).toBeGreaterThan(0);
		});
	});
	// ── buildResult behavior ─────────────────────────────────────────
	describe('buildResult (via execution)', () => {
		beforeEach(() => {
			workflowFinderService.findWorkflowForUser.mockResolvedValue(makeWorkflowEntity() as never);
		});
		it('returns success=true when no errors in run data', async () => {
			mockProcessRunExecutionData.mockResolvedValue(
				makeIRun({
					data: {
						resultData: {
							runData: {
								'HTTP Request': [
									{
										startTime: 1000,
										executionTime: 200,
										executionIndex: 0,
										source: [],
										data: { main: [[{ json: { id: 1 } }]] },
									},
								],
							},
						},
					} as unknown as IRunExecutionData,
				}),
			);
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			expect(result.success).toBe(true);
			expect(result.errors).toEqual([]);
		});
		it('captures node errors from run data', async () => {
			// A node-level `error` entry in runData must flip success to false.
			mockProcessRunExecutionData.mockResolvedValue(
				makeIRun({
					data: {
						resultData: {
							runData: {
								'HTTP Request': [
									{
										startTime: 1000,
										executionTime: 200,
										executionIndex: 0,
										source: [],
										data: { main: [[{ json: {} }]] },
										error: { message: 'Connection refused' } as Error,
									},
								],
							},
						},
					} as unknown as IRunExecutionData,
				}),
			);
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			expect(result.success).toBe(false);
			expect(result.errors).toEqual(
				expect.arrayContaining([expect.stringContaining('Connection refused')]),
			);
		});
		it('captures workflow-level execution error', async () => {
			mockProcessRunExecutionData.mockResolvedValue(
				makeIRun({
					data: {
						resultData: {
							runData: {},
							error: { message: 'Workflow timed out' } as Error,
						},
					} as unknown as IRunExecutionData,
				}),
			);
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			expect(result.success).toBe(false);
			expect(result.errors).toEqual(
				expect.arrayContaining([expect.stringContaining('Workflow timed out')]),
			);
		});
		it('sets executionMode to "real" for logic nodes in run data', async () => {
			mockProcessRunExecutionData.mockResolvedValue(
				makeIRun({
					data: {
						resultData: {
							runData: {
								'Set Node': [
									{
										startTime: 2000,
										executionTime: 50,
										executionIndex: 0,
										source: [],
										data: { main: [[{ json: { key: 'value' } }]] },
									},
								],
							},
						},
					} as unknown as IRunExecutionData,
				}),
			);
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			expect(result.nodeResults['Set Node']).toBeDefined();
			expect(result.nodeResults['Set Node'].executionMode).toBe('real');
		});
		it('captures startTime from run data', async () => {
			mockProcessRunExecutionData.mockResolvedValue(
				makeIRun({
					data: {
						resultData: {
							runData: {
								'HTTP Request': [
									{
										startTime: 1710000000,
										executionTime: 200,
										executionIndex: 0,
										source: [],
										data: { main: [[{ json: { ok: true } }]] },
									},
								],
							},
						},
					} as unknown as IRunExecutionData,
				}),
			);
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			expect(result.nodeResults['HTTP Request'].startTime).toBe(1710000000);
		});
		it('captures output from run data, limited to MAX_OUTPUT_ITEMS_PER_NODE', async () => {
			// 10 items in, only the truncated prefix should come out.
			const items = Array.from({ length: 10 }, (_, i) => ({ json: { idx: i } }));
			mockProcessRunExecutionData.mockResolvedValue(
				makeIRun({
					data: {
						resultData: {
							runData: {
								'HTTP Request': [
									{
										startTime: 1000,
										executionTime: 200,
										executionIndex: 0,
										source: [],
										data: { main: [items] },
									},
								],
							},
						},
					} as unknown as IRunExecutionData,
				}),
			);
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			// MAX_OUTPUT_ITEMS_PER_NODE is 5
			expect(result.nodeResults['HTTP Request'].output).toHaveLength(5);
		});
	});
	// ── buildTriggerPinData (via execution) ──────────────────────────
	describe('buildTriggerPinData (via execution)', () => {
		beforeEach(() => {
			workflowFinderService.findWorkflowForUser.mockResolvedValue(makeWorkflowEntity() as never);
		});
		it('marks the trigger node as pinned when triggerContent is present', async () => {
			const hints = makeEmptyHints();
			hints.triggerContent = { body: { email: 'test@example.com' } };
			generateMockHintsMock.mockResolvedValue(hints);
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			expect(result.nodeResults['Webhook']).toBeDefined();
			expect(result.nodeResults['Webhook'].executionMode).toBe('pinned');
		});
		it('does not create pin data when triggerContent is empty', async () => {
			const hints = makeEmptyHints();
			hints.triggerContent = {};
			generateMockHintsMock.mockResolvedValue(hints);
			mockProcessRunExecutionData.mockResolvedValue(makeIRun());
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			// When triggerContent is empty, the start node is NOT marked as pinned
			// (it may still appear in results from run data as 'real')
			const webhookResult = result.nodeResults['Webhook'];
			if (webhookResult) {
				expect(webhookResult.executionMode).not.toBe('pinned');
			}
		});
	});
	// ── No start node ────────────────────────────────────────────────
	describe('when no start node is found', () => {
		it('returns error result', async () => {
			workflowFinderService.findWorkflowForUser.mockResolvedValue(makeWorkflowEntity() as never);
			mockGetStartNode.mockReturnValue(undefined);
			// Also ensure findWebhookNode returns nothing — nodeTypes must not match webhook
			nodeTypes.getByNameAndVersion.mockReturnValue({
				description: { properties: [] } as unknown as INodeTypeDescription,
			} as never);
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			expect(result.success).toBe(false);
			expect(result.errors).toEqual(
				expect.arrayContaining([expect.stringContaining('No trigger or start node')]),
			);
		});
	});
	// ── hints passthrough ────────────────────────────────────────────
	describe('hints in result', () => {
		it('includes Phase 1 hints in the execution result', async () => {
			workflowFinderService.findWorkflowForUser.mockResolvedValue(makeWorkflowEntity() as never);
			const hints = makeEmptyHints();
			hints.globalContext = 'Users: jane@example.com, john@example.com';
			hints.nodeHints = { 'HTTP Request': 'Return user profiles' };
			hints.triggerContent = { body: { action: 'create' } };
			generateMockHintsMock.mockResolvedValue(hints);
			const result = await service.executeWithLlmMock('wf-1', makeUser());
			expect(result.hints.globalContext).toBe('Users: jane@example.com, john@example.com');
			expect(result.hints.nodeHints).toEqual({ 'HTTP Request': 'Return user profiles' });
		});
	});
});

View file

@ -0,0 +1,483 @@
// Shared agent stub: createEvalAgent always hands back `mockAgent`, whose
// generate() result each test controls through `mockGenerate`.
// NOTE: these names must start with "mock" so Jest's hoisted module factories
// below are permitted to reference them.
const mockGenerate = jest.fn();
const mockAgent = { tool: jest.fn().mockReturnThis(), generate: mockGenerate };
// Mirrors the real extractText contract for these tests: read `_text` off the
// generate() result, defaulting to the empty string.
const mockExtractText = jest.fn((result: { _text?: string }) => result._text ?? '');
jest.mock('@n8n/instance-ai', () => ({
	createEvalAgent: jest.fn(() => mockAgent),
	extractText: mockExtractText,
	// Minimal fluent Tool builder stub — every chained call returns the stub.
	Tool: jest.fn().mockImplementation(() => ({
		description: jest.fn().mockReturnThis(),
		input: jest.fn().mockReturnThis(),
		handler: jest.fn().mockReturnThis(),
		build: jest.fn().mockReturnValue({}),
	})),
}));
jest.mock('../api-docs', () => ({ fetchApiDocs: jest.fn() }));
jest.mock('../node-config', () => ({
	extractNodeConfig: jest.fn().mockReturnValue('{}'),
}));
// Container.get(...) yields a no-op logger for the module under test.
jest.mock('@n8n/di', () => ({
	Container: {
		get: jest.fn().mockReturnValue({
			warn: jest.fn(),
			error: jest.fn(),
			debug: jest.fn(),
		}),
	},
}));
import type { IHttpRequestOptions, INode } from 'n8n-workflow';
import { createLlmMockHandler } from '../mock-handler';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Queue a resolved agent response whose extracted text will be `text`. */
function llmReturns(text: string) {
	mockGenerate.mockResolvedValue({ messages: [], _text: text });
}
/** Make the stubbed agent's next generate() call reject with `failure`. */
function llmRejects(failure: Error) {
	mockGenerate.mockRejectedValue(failure);
}
// Canonical request/node fixtures used by most tests; individual tests
// override them by passing explicit arguments to the handler or callHandler.
const baseRequest = {
	url: 'https://api.slack.com/chat.postMessage',
	method: 'POST',
} as IHttpRequestOptions;
const baseNode = { name: 'Slack', type: 'n8n-nodes-base.slack' } as INode;
/**
 * Invokes the mock handler and asserts that it produced a response;
 * throws when the handler opted out (returned undefined).
 */
async function callHandler(
	handler: ReturnType<typeof createLlmMockHandler>,
	request = baseRequest,
	node = baseNode,
) {
	const response = await handler(request, node);
	if (!response) {
		throw new Error('Expected mock handler to return a response');
	}
	return response;
}
// Reset all mock state so expectations from one test cannot leak into another.
beforeEach(() => jest.clearAllMocks());
// ---------------------------------------------------------------------------
// createLlmMockHandler — response materialization via agent mock
// ---------------------------------------------------------------------------
describe('createLlmMockHandler', () => {
	it('should return a function', () => {
		const handler = createLlmMockHandler();
		expect(typeof handler).toBe('function');
	});
	it('should materialize clean JSON spec', async () => {
		llmReturns('{ "type": "json", "body": { "id": 1, "ok": true } }');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result).toEqual({
			body: { id: 1, ok: true },
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
	it('should parse JSON from fenced code block', async () => {
		llmReturns('```json\n{"type":"json","body":{"ok":true}}\n```');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result).toEqual({
			body: { ok: true },
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
	it('should parse JSON from fenced block without language tag', async () => {
		llmReturns('```\n{"type":"json","body":{"data":[]}}\n```');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result).toEqual({
			body: { data: [] },
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
	it('should extract JSON wrapped in prose', async () => {
		llmReturns('Based on the API docs, here is the response: {"type":"json","body":{"data":[]}}');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result).toEqual({
			body: { data: [] },
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
	it('should default to json type when type field is missing', async () => {
		llmReturns('{"body":{"id":1}}');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		// When type is missing, parseResponseText wraps the whole parsed object as body
		expect(result).toEqual({
			body: { body: { id: 1 } },
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
	it('should default to json type when type field is unrecognized', async () => {
		llmReturns('{"type":"xml","body":{"id":1}}');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		// Unrecognized type wraps the entire parsed object as body
		expect(result).toEqual({
			body: { type: 'xml', body: { id: 1 } },
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
	it('should return _evalMockError on unparseable text', async () => {
		llmReturns('I cannot generate this response');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result).toEqual({
			body: expect.objectContaining({ _evalMockError: true }),
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
	it('should materialize binary spec with Buffer body', async () => {
		llmReturns('{"type":"binary","contentType":"application/pdf","filename":"doc.pdf"}');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result.statusCode).toBe(200);
		expect(result.headers['content-type']).toBe('application/pdf');
		expect(Buffer.isBuffer(result.body)).toBe(true);
		expect((result.body as Buffer).toString()).toContain('doc.pdf');
	});
	it('should use default filename and content-type for binary when omitted', async () => {
		llmReturns('{"type":"binary"}');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result.statusCode).toBe(200);
		expect(result.headers['content-type']).toBe('application/octet-stream');
		expect(Buffer.isBuffer(result.body)).toBe(true);
		expect((result.body as Buffer).toString()).toContain('mock-file.dat');
	});
	it('should materialize error spec with correct status code', async () => {
		llmReturns('{"type":"error","statusCode":404,"body":{"error":"not found"}}');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result).toEqual({
			body: { error: 'not found' },
			headers: { 'content-type': 'application/json' },
			statusCode: 404,
		});
	});
	it('should default error status code to 500 when omitted', async () => {
		llmReturns('{"type":"error"}');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result).toEqual({
			body: { error: 'Mock error' },
			headers: { 'content-type': 'application/json' },
			statusCode: 500,
		});
	});
	it('should default json body to { ok: true } when body is omitted', async () => {
		llmReturns('{"type":"json"}');
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result).toEqual({
			body: { ok: true },
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
	it('should return _evalMockError when agent.generate rejects', async () => {
		// Agent failures are swallowed and reported as a 200 with an error marker.
		llmRejects(new Error('LLM timeout'));
		const handler = createLlmMockHandler();
		const result = await callHandler(handler);
		expect(result).toEqual({
			body: expect.objectContaining({
				_evalMockError: true,
				message: expect.stringContaining('LLM timeout'),
			}),
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
	it('should cache node config across calls for the same node name', async () => {
		// require() resolves to the jest-mocked module declared above.
		const { extractNodeConfig } = require('../node-config') as {
			extractNodeConfig: jest.Mock;
		};
		extractNodeConfig.mockReturnValue('{"resource":"message"}');
		llmReturns('{"type":"json","body":{"ok":true}}');
		const handler = createLlmMockHandler();
		await handler(baseRequest, baseNode);
		await handler(baseRequest, baseNode);
		expect(extractNodeConfig).toHaveBeenCalledTimes(1);
	});
	it('should extract config separately for different node names', async () => {
		const { extractNodeConfig } = require('../node-config') as {
			extractNodeConfig: jest.Mock;
		};
		extractNodeConfig.mockReturnValue('{}');
		llmReturns('{"type":"json","body":{}}');
		const handler = createLlmMockHandler();
		await handler(baseRequest, { name: 'Slack', type: 'n8n-nodes-base.slack' } as INode);
		await handler(baseRequest, { name: 'Gmail', type: 'n8n-nodes-base.gmail' } as INode);
		expect(extractNodeConfig).toHaveBeenCalledTimes(2);
	});
});
// ---------------------------------------------------------------------------
// Prompt construction — verify request details reach the agent
// ---------------------------------------------------------------------------
describe('prompt construction', () => {
	it('should include request body in prompt', async () => {
		llmReturns('{"type":"json","body":{}}');
		const handler = createLlmMockHandler();
		await handler(
			{ url: 'https://api.slack.com/chat.postMessage', method: 'POST', body: { text: 'hi' } },
			baseNode,
		);
		// First positional argument of generate() is the prompt string.
		const prompt: string = mockGenerate.mock.calls[0][0];
		expect(prompt).toContain('"text":"hi"');
	});
	it('should include query string in prompt', async () => {
		llmReturns('{"type":"json","body":{}}');
		const handler = createLlmMockHandler();
		await handler(
			{ url: 'https://api.slack.com/channels', method: 'GET', qs: { limit: 10 } },
			baseNode,
		);
		const prompt: string = mockGenerate.mock.calls[0][0];
		expect(prompt).toContain('"limit":10');
	});
	it('should include scenario hints when provided', async () => {
		llmReturns('{"type":"json","body":{}}');
		const handler = createLlmMockHandler({ scenarioHints: 'return rate-limited error' });
		await handler(baseRequest, baseNode);
		const prompt: string = mockGenerate.mock.calls[0][0];
		expect(prompt).toContain('return rate-limited error');
	});
	it('should include global context and node hints when provided', async () => {
		llmReturns('{"type":"json","body":{}}');
		const handler = createLlmMockHandler({
			globalContext: 'project-id=abc123',
			nodeHints: { Slack: 'channel=#general' },
		});
		await handler(baseRequest, baseNode);
		const prompt: string = mockGenerate.mock.calls[0][0];
		expect(prompt).toContain('project-id=abc123');
		expect(prompt).toContain('channel=#general');
	});
	it('should add GraphQL format guidance for /graphql endpoints', async () => {
		llmReturns('{"type":"json","body":{"data":{"viewer":{"id":"1"}}}}');
		const handler = createLlmMockHandler();
		await handler(
			{
				url: 'https://api.github.com/graphql',
				method: 'POST',
				body: { query: '{ viewer { id } }' },
			} as IHttpRequestOptions,
			{ name: 'GitHub', type: 'n8n-nodes-base.github' } as INode,
		);
		const prompt: string = mockGenerate.mock.calls[0][0];
		expect(prompt).toContain('GraphQL');
	});
	it('should add GraphQL format guidance when body contains query field', async () => {
		// GraphQL detection also triggers on a `query` body field, not just the URL.
		llmReturns('{"type":"json","body":{"data":{}}}');
		const handler = createLlmMockHandler();
		await handler(
			{
				url: 'https://api.linear.app/v1',
				method: 'POST',
				body: { query: '{ issues { nodes { id } } }' },
			} as IHttpRequestOptions,
			{ name: 'Linear', type: 'n8n-nodes-base.httpRequest' } as INode,
		);
		const prompt: string = mockGenerate.mock.calls[0][0];
		expect(prompt).toContain('GraphQL');
	});
	it('should default method to GET when not specified', async () => {
		llmReturns('{"type":"json","body":{}}');
		const handler = createLlmMockHandler();
		await handler({ url: 'https://api.slack.com/channels' }, baseNode);
		const prompt: string = mockGenerate.mock.calls[0][0];
		expect(prompt).toContain('GET');
	});
});
// ---------------------------------------------------------------------------
// Edge cases for JSON extraction
// ---------------------------------------------------------------------------
describe('JSON extraction edge cases', () => {
	it('should handle JSON with nested braces in string values', async () => {
		llmReturns('Here: {"type":"json","body":{"msg":"value with {braces}"}}');
		const mockHandler = createLlmMockHandler();
		const response = await callHandler(mockHandler);
		expect(response.body).toEqual({ msg: 'value with {braces}' });
		expect(response.statusCode).toBe(200);
	});
	it('should handle extra whitespace around fenced blocks', async () => {
		llmReturns(' ```json \n {"type":"json","body":{"ok":true}} \n ``` ');
		const mockHandler = createLlmMockHandler();
		const response = await callHandler(mockHandler);
		expect(response.body).toEqual({ ok: true });
	});
	it('should handle a raw object without type as entire body', async () => {
		// When the LLM returns a plain object that isn't a spec, the whole thing becomes the body
		llmReturns('{"id": 42, "name": "test"}');
		const mockHandler = createLlmMockHandler();
		const response = await callHandler(mockHandler);
		expect(response).toEqual({
			body: { id: 42, name: 'test' },
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		});
	});
});
// ---------------------------------------------------------------------------
// extractServiceName — tested indirectly through prompt content
// ---------------------------------------------------------------------------
describe('service name extraction (via prompt)', () => {
	it('should extract "Slack" from api.slack.com', async () => {
		llmReturns('{"type":"json","body":{}}');
		const mockHandler = createLlmMockHandler();
		await mockHandler({ url: 'https://api.slack.com/chat.postMessage' }, baseNode);
		const promptText: string = mockGenerate.mock.calls[0][0];
		expect(promptText).toContain('Service: Slack');
	});
	it('should extract "Googleapis" from www.googleapis.com', async () => {
		llmReturns('{"type":"json","body":{}}');
		const mockHandler = createLlmMockHandler();
		await mockHandler(
			{ url: 'https://www.googleapis.com/sheets/v4/spreadsheets' } as IHttpRequestOptions,
			{ name: 'Sheets', type: 'n8n-nodes-base.googleSheets' } as INode,
		);
		const promptText: string = mockGenerate.mock.calls[0][0];
		expect(promptText).toContain('Service: Googleapis');
	});
	it('should return "Unknown" for invalid URLs', async () => {
		llmReturns('{"type":"json","body":{}}');
		const mockHandler = createLlmMockHandler();
		await mockHandler({ url: 'not-a-url' }, baseNode);
		const promptText: string = mockGenerate.mock.calls[0][0];
		expect(promptText).toContain('Service: Unknown');
	});
});
// ---------------------------------------------------------------------------
// extractEndpoint — tested indirectly through prompt content
// ---------------------------------------------------------------------------
describe('endpoint extraction (via prompt)', () => {
	it('should extract path and query from URL', async () => {
		llmReturns('{"type":"json","body":{}}');
		const mockHandler = createLlmMockHandler();
		await mockHandler(
			{ url: 'https://api.slack.com/conversations.list?limit=100&cursor=abc' },
			baseNode,
		);
		const promptText: string = mockGenerate.mock.calls[0][0];
		expect(promptText).toContain('/conversations.list?limit=100&cursor=abc');
	});
	it('should extract path without query when there is none', async () => {
		llmReturns('{"type":"json","body":{}}');
		const mockHandler = createLlmMockHandler();
		await mockHandler({ url: 'https://api.slack.com/chat.postMessage' }, baseNode);
		const promptText: string = mockGenerate.mock.calls[0][0];
		expect(promptText).toContain('/chat.postMessage');
		// Should not have a '?' in the endpoint portion
		const endpointMatch = promptText.match(/(?:GET|POST)\s+(\S+)/);
		expect(endpointMatch?.[1]).not.toContain('?');
	});
	it('should fall back to raw url for invalid URLs', async () => {
		llmReturns('{"type":"json","body":{}}');
		const mockHandler = createLlmMockHandler();
		await mockHandler({ url: 'not-a-url', method: 'GET' }, baseNode);
		const promptText: string = mockGenerate.mock.calls[0][0];
		expect(promptText).toContain('GET not-a-url');
	});
});

View file

@ -0,0 +1,24 @@
import type { INode } from 'n8n-workflow';
import { extractNodeConfig } from '../node-config';
// Wraps a bare parameters object in the minimal INode shape the helper reads.
const makeNode = (parameters: Record<string, unknown>): INode => {
	const node = { parameters };
	return node as unknown as INode;
};
describe('extractNodeConfig', () => {
	it('should return JSON.stringify of node.parameters', () => {
		const configured = makeNode({ url: 'https://example.com', method: 'GET' });
		expect(extractNodeConfig(configured)).toBe('{"url":"https://example.com","method":"GET"}');
	});
	it('should return empty string for falsy parameters', () => {
		// Both `undefined` and `null` parameters yield the empty string.
		for (const parameters of [undefined, null]) {
			expect(extractNodeConfig({ parameters } as unknown as INode)).toBe('');
		}
	});
	it('should return empty string when JSON.stringify throws', () => {
		// A self-referencing object makes JSON.stringify throw.
		const selfReferencing: Record<string, unknown> = { key: 'value' };
		selfReferencing.self = selfReferencing;
		expect(extractNodeConfig(makeNode(selfReferencing))).toBe('');
	});
});

View file

@ -0,0 +1,222 @@
// Stub the agent layer and node-config extraction — these tests exercise only
// the graph-analysis helpers, which must not reach the LLM.
jest.mock('@n8n/instance-ai', () => ({
	createEvalAgent: jest.fn(),
	extractText: jest.fn(),
}));
jest.mock('../node-config', () => ({
	extractNodeConfig: jest.fn(),
}));
import type { IConnections, INode, IWorkflowBase } from 'n8n-workflow';
import { identifyNodesForHints, identifyNodesForPinData } from '../workflow-analysis';
/**
 * Builds an INode whose id defaults to its name; `name` and `type` are
 * required, everything else can be overridden.
 */
function makeNode(overrides: Partial<INode> & { name: string; type: string }): INode {
	const defaults = {
		id: overrides.name,
		typeVersion: 1,
		position: [0, 0] as [number, number],
		parameters: {},
	};
	return { ...defaults, ...overrides };
}
/** Wraps a node list (and optional connections) in a minimal workflow entity. */
function makeWorkflow(nodes: INode[], connections: IConnections = {}): IWorkflowBase {
	const base = {
		id: 'test-workflow',
		name: 'Test',
		active: false,
		isArchived: false,
		activeVersionId: null,
		createdAt: new Date(),
		updatedAt: new Date(),
	};
	return { ...base, nodes, connections };
}
describe('identifyNodesForPinData', () => {
	it('should identify AI root nodes as needing pin data', () => {
		// The agent (AI root) needs pin data; its sub-node (the model) does not.
		const nodes = [
			makeNode({ name: 'ChatOpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
			makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
			makeNode({ name: 'Set', type: 'n8n-nodes-base.set' }),
		];
		const connections: IConnections = {
			ChatOpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
		};
		const result = identifyNodesForPinData(makeWorkflow(nodes, connections));
		const names = result.map((n) => n.name);
		expect(names).toContain('Agent');
		expect(names).not.toContain('ChatOpenAI');
		expect(names).not.toContain('Set');
	});
	it('should identify protocol/bypass nodes as needing pin data', () => {
		// Non-HTTP protocol nodes (DB/queue clients) cannot be mocked over HTTP,
		// so they need pin data; HTTP and pure logic nodes do not.
		const nodes = [
			makeNode({ name: 'My Redis', type: 'n8n-nodes-base.redis' }),
			makeNode({ name: 'My Postgres', type: 'n8n-nodes-base.postgres' }),
			makeNode({ name: 'My Kafka', type: 'n8n-nodes-base.kafka' }),
			makeNode({ name: 'HTTP Request', type: 'n8n-nodes-base.httpRequest' }),
			makeNode({ name: 'Set', type: 'n8n-nodes-base.set' }),
		];
		const result = identifyNodesForPinData(makeWorkflow(nodes));
		const names = result.map((n) => n.name);
		expect(names).toContain('My Redis');
		expect(names).toContain('My Postgres');
		expect(names).toContain('My Kafka');
		expect(names).not.toContain('HTTP Request');
		expect(names).not.toContain('Set');
	});
	it('should exclude disabled nodes', () => {
		const nodes = [
			makeNode({ name: 'My Redis', type: 'n8n-nodes-base.redis', disabled: true }),
			makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent', disabled: true }),
		];
		const connections: IConnections = {
			ChatOpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
		};
		const result = identifyNodesForPinData(makeWorkflow(nodes, connections));
		expect(result).toHaveLength(0);
	});
	it('should return empty for workflow with only logic nodes', () => {
		const nodes = [
			makeNode({ name: 'Set', type: 'n8n-nodes-base.set' }),
			makeNode({ name: 'IF', type: 'n8n-nodes-base.if' }),
			makeNode({ name: 'Merge', type: 'n8n-nodes-base.merge' }),
		];
		const result = identifyNodesForPinData(makeWorkflow(nodes));
		expect(result).toHaveLength(0);
	});
	it('should handle Agent with multiple sub-nodes', () => {
		// Only the agent root is selected; model/memory/tool sub-nodes are not.
		const nodes = [
			makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
			makeNode({ name: 'Memory', type: '@n8n/n8n-nodes-langchain.memoryBufferWindow' }),
			makeNode({ name: 'Calculator', type: '@n8n/n8n-nodes-langchain.toolCalculator' }),
			makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
		];
		const connections: IConnections = {
			OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
			Memory: { ai_memory: [[{ node: 'Agent', type: 'ai_memory', index: 0 }]] },
			Calculator: { ai_tool: [[{ node: 'Agent', type: 'ai_tool', index: 0 }]] },
		};
		const result = identifyNodesForPinData(makeWorkflow(nodes, connections));
		const names = result.map((n) => n.name);
		expect(names).toEqual(['Agent']);
	});
	it('should identify all bypass node types', () => {
		const bypassTypes = [
			'n8n-nodes-base.redis',
			'n8n-nodes-base.mongoDb',
			'n8n-nodes-base.mySql',
			'n8n-nodes-base.postgres',
			'n8n-nodes-base.microsoftSql',
			'n8n-nodes-base.snowflake',
			'n8n-nodes-base.kafka',
			'n8n-nodes-base.rabbitmq',
			'n8n-nodes-base.mqtt',
			'n8n-nodes-base.amqp',
			'n8n-nodes-base.ftp',
			'n8n-nodes-base.ssh',
			'n8n-nodes-base.ldap',
			'n8n-nodes-base.emailSend',
			'n8n-nodes-base.rssFeedRead',
			'n8n-nodes-base.git',
		];
		const nodes = bypassTypes.map((type, i) => makeNode({ name: `Node${i}`, type }));
		const result = identifyNodesForPinData(makeWorkflow(nodes));
		expect(result).toHaveLength(bypassTypes.length);
	});
});
describe('identifyNodesForHints', () => {
it('should exclude AI sub-nodes from hints', () => {
const nodes = [
makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
makeNode({ name: 'Slack', type: 'n8n-nodes-base.slack' }),
];
const connections: IConnections = {
OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
};
const result = identifyNodesForHints(makeWorkflow(nodes, connections));
const names = result.map((n) => n.name);
expect(names).not.toContain('OpenAI');
expect(names).not.toContain('Agent');
expect(names).toContain('Slack');
});
it('should exclude pinned bypass nodes from hints', () => {
const nodes = [
makeNode({ name: 'Webhook', type: 'n8n-nodes-base.webhook' }),
makeNode({ name: 'Redis', type: 'n8n-nodes-base.redis' }),
makeNode({ name: 'Slack', type: 'n8n-nodes-base.slack' }),
makeNode({ name: 'Set', type: 'n8n-nodes-base.set' }),
];
const result = identifyNodesForHints(makeWorkflow(nodes));
const names = result.map((n) => n.name);
expect(names).not.toContain('Redis');
expect(names).toContain('Webhook');
expect(names).toContain('Slack');
expect(names).toContain('Set');
});
it('should exclude disabled nodes', () => {
const nodes = [
makeNode({ name: 'Slack', type: 'n8n-nodes-base.slack', disabled: true }),
makeNode({ name: 'Gmail', type: 'n8n-nodes-base.gmail' }),
];
const result = identifyNodesForHints(makeWorkflow(nodes));
const names = result.map((n) => n.name);
expect(names).not.toContain('Slack');
expect(names).toContain('Gmail');
});
it('should return only HTTP-interceptible nodes for a mixed workflow', () => {
const nodes = [
makeNode({ name: 'Webhook', type: 'n8n-nodes-base.webhook' }),
makeNode({ name: 'OpenAI', type: '@n8n/n8n-nodes-langchain.lmChatOpenAi' }),
makeNode({ name: 'Agent', type: '@n8n/n8n-nodes-langchain.agent' }),
makeNode({ name: 'Postgres', type: 'n8n-nodes-base.postgres' }),
makeNode({ name: 'Slack', type: 'n8n-nodes-base.slack' }),
makeNode({ name: 'Set', type: 'n8n-nodes-base.set' }),
];
const connections: IConnections = {
OpenAI: { ai_languageModel: [[{ node: 'Agent', type: 'ai_languageModel', index: 0 }]] },
};
const result = identifyNodesForHints(makeWorkflow(nodes, connections));
const names = result.map((n) => n.name);
// Should include: Webhook (trigger, gets hints), Slack (HTTP service), Set (logic)
expect(names).toContain('Webhook');
expect(names).toContain('Slack');
expect(names).toContain('Set');
// Should exclude: OpenAI (AI sub-node), Agent (AI root, pinned), Postgres (bypass, pinned)
expect(names).not.toContain('OpenAI');
expect(names).not.toContain('Agent');
expect(names).not.toContain('Postgres');
});
});

View file

@ -0,0 +1,106 @@
/**
* API documentation fetcher for eval mock generation.
*
* Uses Context7 to fetch real API documentation at runtime so the
* Phase 2 LLM can generate accurate mock responses for any service.
* Results are cached per query to avoid redundant fetches.
*/
import { Logger } from '@n8n/backend-common';
import { Container } from '@n8n/di';
const CONTEXT7_BASE_URL = 'https://context7.com/api/v2';
/** Hard per-request timeout for Context7 calls — keeps eval runs from hanging on network issues. */
const FETCH_TIMEOUT_MS = 10_000;
/** Returned when docs can't be fetched — tells the LLM to fall back to its training data. */
const FALLBACK_INSTRUCTIONS =
	'No API documentation was found for this endpoint. Generate the response based on your knowledge of this API. Follow standard REST conventions for the HTTP method: GET returns resource data, POST returns the created resource, PUT/PATCH returns the updated resource, DELETE returns 204 or confirmation.';
// Process-lifetime cache; keys are prefixed `lib:<service>` (library ids)
// and `docs:<service>:<query>` (fetched doc text) to avoid collisions.
const docsCache = new Map<string, string>();
/** Track whether we've warned about context7 issues this session */
let context7WarningLogged = false;
/**
 * Resolve a Context7 library id for a service name, preferring the search
 * result with the highest trust score. Successful lookups are cached for the
 * process lifetime; any failure (quota, network, empty results) yields
 * `undefined` so callers can fall back to LLM training data.
 */
async function resolveLibraryId(serviceName: string): Promise<string | undefined> {
	const cacheKey = `lib:${serviceName}`;
	const cached = docsCache.get(cacheKey);
	if (cached !== undefined) return cached;

	try {
		const headers: Record<string, string> = {};
		const token = process.env.CONTEXT7_API_KEY;
		if (token) headers.Authorization = `Bearer ${token}`;

		const searchUrl = `${CONTEXT7_BASE_URL}/libs/search?libraryName=${encodeURIComponent(serviceName + ' API')}&query=REST+API+endpoints+response+format`;
		const response = await fetch(searchUrl, {
			signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
			headers,
		});

		if (!response.ok) {
			const responseText = await response.text().catch(() => '');
			const quotaHit =
				response.status === 429 ||
				responseText.includes('Quota') ||
				responseText.includes('quota');
			// Warn about quota exhaustion only once per process to avoid log spam.
			if (quotaHit && !context7WarningLogged) {
				Container.get(Logger).warn(
					'[EvalMock] Context7 quota exceeded — mock responses will rely on LLM training data. Set CONTEXT7_API_KEY for higher limits.',
				);
				context7WarningLogged = true;
			}
			return undefined;
		}

		const candidates = (await response.json()) as Array<{ id: string; trust_score?: number }>;
		candidates.sort((a, b) => (b.trust_score ?? 0) - (a.trust_score ?? 0));
		const top = candidates[0];
		if (top?.id) {
			docsCache.set(cacheKey, top.id);
			return top.id;
		}
	} catch (error) {
		Container.get(Logger).debug(
			`[EvalMock] Context7 library search failed for "${serviceName}": ${error instanceof Error ? error.message : String(error)}`,
		);
	}
	return undefined;
}
/**
 * Fetch API documentation for a specific endpoint from Context7.
 * Returns plain text ready for LLM consumption.
 * When docs aren't available (no library id, quota, network error, empty
 * body), returns explicit fallback instructions so the LLM knows to use its
 * training data. Only successful fetches are cached, allowing later retries.
 */
export async function fetchApiDocs(serviceName: string, endpointQuery: string): Promise<string> {
	const cacheKey = `docs:${serviceName}:${endpointQuery}`;
	const cached = docsCache.get(cacheKey);
	if (cached !== undefined) return cached;

	const libraryId = await resolveLibraryId(serviceName);
	if (!libraryId) return FALLBACK_INSTRUCTIONS;

	try {
		const headers: Record<string, string> = {};
		const token = process.env.CONTEXT7_API_KEY;
		if (token) headers.Authorization = `Bearer ${token}`;

		const docsUrl = `${CONTEXT7_BASE_URL}/context?libraryId=${encodeURIComponent(libraryId)}&query=${encodeURIComponent(endpointQuery)}&type=txt`;
		const response = await fetch(docsUrl, {
			signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
			headers,
		});

		if (!response.ok) {
			// Warn about quota exhaustion only once per process to avoid log spam.
			if (response.status === 429 && !context7WarningLogged) {
				Container.get(Logger).warn(
					'[EvalMock] Context7 quota exceeded — mock responses will rely on LLM training data. Set CONTEXT7_API_KEY for higher limits.',
				);
				context7WarningLogged = true;
			}
			return FALLBACK_INSTRUCTIONS;
		}

		const docsText = await response.text();
		if (!docsText.trim()) return FALLBACK_INSTRUCTIONS;
		docsCache.set(cacheKey, docsText);
		return docsText;
	} catch (error) {
		Container.get(Logger).debug(
			`[EvalMock] Context7 docs fetch failed for "${serviceName}": ${error instanceof Error ? error.message : String(error)}`,
		);
		return FALLBACK_INSTRUCTIONS;
	}
}

View file

@ -0,0 +1,480 @@
import type {
InstanceAiEvalExecutionRequest,
InstanceAiEvalNodeResult,
InstanceAiEvalExecutionResult,
} from '@n8n/api-types';
import { Logger } from '@n8n/backend-common';
import type { User } from '@n8n/db';
import { Service } from '@n8n/di';
import {
type EvalLlmMockHandler,
type EvalMockHttpResponse,
ExecutionLifecycleHooks,
WorkflowExecute,
} from 'n8n-core';
import {
type IDataObject,
type IHttpRequestOptions,
type INode,
type IPinData,
type IRun,
type IRunExecutionData,
type IWorkflowBase,
type IWorkflowExecuteAdditionalData,
createRunExecutionData,
NodeHelpers,
Workflow,
} from 'n8n-workflow';
import { randomUUID } from 'node:crypto';
import { NodeTypes } from '@/node-types';
import { getBase } from '@/workflow-execute-additional-data';
import { WorkflowFinderService } from '@/workflows/workflow-finder.service';
import type { WorkflowJSON } from '@n8n/workflow-sdk';
import { normalizePinData } from '@n8n/workflow-sdk';
import { generatePinData } from './pin-data-generator';
import {
generateMockHints,
identifyNodesForHints,
identifyNodesForPinData,
type MockHints,
} from './workflow-analysis';
import { createLlmMockHandler } from './mock-handler';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Maximum number of output items to include per node in the result */
// Keeps the eval report compact — node output can be arbitrarily large.
const MAX_OUTPUT_ITEMS_PER_NODE = 5;
// ---------------------------------------------------------------------------
// Service
// ---------------------------------------------------------------------------
/**
* Executes workflows with LLM-based HTTP mocking for evaluation purposes.
*
* Orchestrates two phases:
* Phase 1: Analyze the workflow and generate consistent per-node mock hints
* (one LLM call, ensures cross-node data consistency)
* Phase 2: Execute the workflow with a mock HTTP handler that uses the hints
* to generate realistic API responses at interception time
*
* Safety: The mock handler is set per-execution on a fresh additionalData instance.
* No global state is modified. Normal workflow executions are never affected.
*/
@Service()
export class EvalExecutionService {
	constructor(
		private readonly workflowFinderService: WorkflowFinderService,
		private readonly nodeTypes: NodeTypes,
		private readonly logger: Logger,
	) {}

	/**
	 * Entry point: load the workflow (requires `workflow:execute` scope for
	 * the user), run Phase 1/1.5 analysis to build mock hints and bypass pin
	 * data, then execute with the LLM mock handler (Phase 2).
	 *
	 * Never throws for "workflow missing" — returns an error result instead,
	 * so callers always receive an InstanceAiEvalExecutionResult.
	 */
	async executeWithLlmMock(
		workflowId: string,
		user: User,
		options: InstanceAiEvalExecutionRequest = {},
	): Promise<InstanceAiEvalExecutionResult> {
		const executionId = randomUUID();
		const workflowEntity = await this.workflowFinderService.findWorkflowForUser(workflowId, user, [
			'workflow:execute',
		]);
		if (!workflowEntity) {
			return this.errorResult(executionId, `Workflow ${workflowId} not found or not accessible`);
		}
		const hints = await this.analyzeWorkflow(workflowEntity, options.scenarioHints);
		return await this.execute(workflowEntity, user, executionId, hints, options.scenarioHints);
	}

	// ── Phase 1: Workflow analysis ─────────────────────────────────────────

	/**
	 * Phase 1: generate per-node mock hints for HTTP-interceptible nodes
	 * (single LLM call for cross-node consistency). Phase 1.5: when the
	 * workflow contains nodes that bypass HTTP interception, generate pin
	 * data for them, seeded with Phase 1's globalContext.
	 */
	private async analyzeWorkflow(
		workflowEntity: IWorkflowBase,
		scenarioHints?: string,
	): Promise<MockHints> {
		// Phase 1: Generate mock hints for HTTP-interceptible nodes
		const hintNodes = identifyNodesForHints(workflowEntity);
		const nodeNames = hintNodes.map((n) => n.name);
		this.logger.debug(
			`[EvalMock] Generating hints for ${nodeNames.length} nodes: ${nodeNames.join(', ')}`,
		);
		const hints = await generateMockHints({
			workflow: workflowEntity,
			nodeNames,
			scenarioHints,
		});
		// Empty globalContext with hint-eligible nodes present means Phase 1
		// effectively failed — warn but continue; execution can still run.
		if (!hints.globalContext && nodeNames.length > 0) {
			this.logger.warn(
				'[EvalMock] Phase 1 hint generation returned empty — mock responses will lack cross-node consistency',
			);
		}
		this.logger.debug(
			`[EvalMock] Phase 1 result — globalContext: ${hints.globalContext ? 'present' : 'EMPTY'}, triggerContent keys: ${JSON.stringify(Object.keys(hints.triggerContent))}, nodeHints: ${Object.keys(hints.nodeHints).join(', ')}`,
		);
		// Phase 1.5: Generate pin data for nodes that bypass the HTTP mock layer
		const bypassNodes = identifyNodesForPinData(workflowEntity);
		const bypassNodeNames = bypassNodes.map((n) => n.name);
		if (bypassNodeNames.length > 0) {
			this.logger.debug(
				`[EvalMock] Generating pin data for ${bypassNodeNames.length} bypass nodes: ${bypassNodeNames.join(', ')}`,
			);
			hints.bypassPinData = await this.generateBypassPinData(
				workflowEntity,
				bypassNodeNames,
				hints.globalContext,
			);
			this.logger.debug(
				`[EvalMock] Phase 1.5 result — pinned nodes: ${Object.keys(hints.bypassPinData).join(', ') || 'none'}`,
			);
		}
		return hints;
	}

	// ── Phase 1.5: Pin data for bypass nodes ─────────────────────────────

	/**
	 * Generate pin data for nodes that bypass the HTTP mock layer.
	 * Uses the existing LLM-based pin data generator with Phase 1's globalContext
	 * for cross-node data consistency.
	 *
	 * On generator failure, falls back to empty-object pin data for every
	 * bypass node so execution can still proceed past them.
	 */
	private async generateBypassPinData(
		workflowEntity: IWorkflowBase,
		bypassNodeNames: string[],
		globalContext: string,
	): Promise<IPinData> {
		if (bypassNodeNames.length === 0) return {};
		try {
			const result = await generatePinData({
				workflow: workflowEntity as unknown as WorkflowJSON,
				nodeNames: bypassNodeNames,
				instructions: globalContext ? { dataDescription: globalContext } : undefined,
			});
			return normalizePinData(result as unknown as IPinData);
		} catch (error) {
			const errorMsg = error instanceof Error ? error.message : String(error);
			this.logger.error(`[EvalMock] Phase 1.5 pin data generation failed: ${errorMsg}`);
			// Fallback: pin an empty item per node — keeps the run alive.
			return normalizePinData(
				Object.fromEntries(
					bypassNodeNames.map((nodeName) => [nodeName, [{ json: {} }]]),
				) as IPinData,
			);
		}
	}

	// ── Phase 2: Mock execution ────────────────────────────────────────────

	/**
	 * Phase 2: run the workflow with the LLM mock handler installed on a
	 * fresh additionalData instance. Pin data (trigger + bypass nodes) is
	 * merged into the execution, config issues are recorded up front, and
	 * any execution failure is converted into an error result (never thrown).
	 */
	private async execute(
		workflowEntity: IWorkflowBase,
		user: User,
		executionId: string,
		hints: MockHints,
		scenarioHints?: string,
	): Promise<InstanceAiEvalExecutionResult> {
		const nodeResults: Record<string, InstanceAiEvalNodeResult> = {};
		const workflow = this.buildWorkflow(workflowEntity);
		const startNode = this.findStartNode(workflow);
		if (!startNode) {
			return this.errorResult(executionId, 'No trigger or start node found in the workflow');
		}
		const mockHandler = createLlmMockHandler({
			scenarioHints,
			globalContext: hints.globalContext,
			nodeHints: hints.nodeHints,
		});
		const additionalData = await getBase({
			userId: user.id,
			workflowId: workflowEntity.id,
			workflowSettings: workflowEntity.settings ?? {},
		});
		// Per-execution handler/hooks — no global state is touched.
		additionalData.evalLlmMockHandler = this.createInterceptingHandler(mockHandler, nodeResults);
		additionalData.hooks = new ExecutionLifecycleHooks('evaluation', executionId, workflowEntity);
		const triggerPinData = this.buildTriggerPinData(startNode, hints.triggerContent);
		const pinData: IPinData = { ...triggerPinData, ...hints.bypassPinData };
		const pinDataNodeNames = Object.keys(pinData);
		// Check config completeness before execution — detect missing required parameters
		this.checkNodeConfig(workflow, nodeResults, pinDataNodeNames);
		const executionData = this.buildExecutionData(startNode, pinData);
		// Mark the trigger node as pinned (it gets its output from pin data, not execution)
		// Preserve any configIssues that checkNodeConfig may have already recorded.
		if (Object.keys(triggerPinData).length > 0) {
			const existing = nodeResults[startNode.name];
			nodeResults[startNode.name] = {
				output: null,
				interceptedRequests: [],
				executionMode: 'pinned',
				...(existing?.configIssues ? { configIssues: existing.configIssues } : {}),
			};
		}
		// Mark bypass nodes as pinned
		for (const nodeName of Object.keys(hints.bypassPinData)) {
			const existing = nodeResults[nodeName];
			nodeResults[nodeName] = {
				output: null,
				interceptedRequests: [],
				executionMode: 'pinned',
				...(existing?.configIssues ? { configIssues: existing.configIssues } : {}),
			};
		}
		try {
			const result = await this.runWorkflow(workflow, additionalData, executionData);
			return this.buildResult(executionId, result, nodeResults, hints);
		} catch (error: unknown) {
			const message = error instanceof Error ? error.message : String(error);
			this.logger.error(`[EvalMock] Workflow execution failed: ${message}`);
			// Return whatever nodeResults were collected before the failure.
			return {
				executionId,
				success: false,
				nodeResults,
				errors: [`Execution failed: ${message}`],
				hints,
			};
		}
	}

	// ── Workflow construction ──────────────────────────────────────────────

	/** Build an executable Workflow instance from the stored entity (inactive). */
	private buildWorkflow(workflowEntity: IWorkflowBase): Workflow {
		return new Workflow({
			id: workflowEntity.id,
			name: workflowEntity.name,
			nodes: workflowEntity.nodes,
			connections: workflowEntity.connections,
			active: false,
			nodeTypes: this.nodeTypes,
			staticData: workflowEntity.staticData,
			settings: workflowEntity.settings ?? {},
		});
	}

	/**
	 * Find the workflow's trigger/start node.
	 * Uses Workflow.getStartNode() first (handles trigger, poll, and STARTING_NODE_TYPES),
	 * then falls back to checking for webhook nodes which getStartNode() doesn't cover.
	 */
	private findStartNode(workflow: Workflow): INode | undefined {
		return workflow.getStartNode() ?? this.findWebhookNode(workflow);
	}

	/** Find the first enabled node whose node type defines a `webhook` member. */
	private findWebhookNode(workflow: Workflow): INode | undefined {
		return Object.values(workflow.nodes).find((node) => {
			if (node.disabled) return false;
			const nodeType = this.nodeTypes.getByNameAndVersion(node.type, node.typeVersion);
			return nodeType !== undefined && 'webhook' in nodeType;
		});
	}

	/**
	 * Check each node for missing required parameters and record issues
	 * in nodeResults. This runs before execution so the report shows
	 * configuration completeness regardless of whether the node crashes.
	 */
	private checkNodeConfig(
		workflow: Workflow,
		nodeResults: Record<string, InstanceAiEvalNodeResult>,
		pinDataNodeNames: string[],
	): void {
		for (const node of Object.values(workflow.nodes)) {
			if (node.disabled) continue;
			const nodeType = this.nodeTypes.getByNameAndVersion(node.type, node.typeVersion);
			if (!nodeType) continue;
			const issues = NodeHelpers.getNodeParametersIssues(
				nodeType.description.properties,
				node,
				nodeType.description,
				pinDataNodeNames,
			);
			if (issues?.parameters && Object.keys(issues.parameters).length > 0) {
				// Entry may be overwritten to 'pinned'/'mocked' later; configIssues
				// are preserved by the callers that rewrite it.
				const entry = (nodeResults[node.name] ??= {
					output: null,
					interceptedRequests: [],
					executionMode: 'real',
				});
				entry.configIssues = issues.parameters;
			}
		}
	}

	// ── Execution data ────────────────────────────────────────────────────

	/**
	 * Build pin data for the trigger/start node from LLM-generated content.
	 * Pin data provides the trigger's output — the node doesn't execute,
	 * since trigger nodes receive external events that don't fire in eval mode.
	 */
	private buildTriggerPinData(startNode: INode, triggerContent: Record<string, unknown>): IPinData {
		if (Object.keys(triggerContent).length === 0) return {};
		return { [startNode.name]: [{ json: triggerContent as IDataObject }] };
	}

	/**
	 * Build execution data with the trigger node on the execution stack.
	 * We use processRunExecutionData() instead of run() because run() relies on
	 * getStartNode() which doesn't find webhook nodes (they define `webhook`,
	 * not `trigger`). This follows the same pattern as InstanceAiAdapterService.
	 * Pin data carries the trigger's output; the execution stack just marks where to start.
	 */
	private buildExecutionData(startNode: INode, pinData: IPinData): IRunExecutionData {
		return createRunExecutionData({
			startData: {},
			resultData: { pinData, runData: {} },
			executionData: {
				contextData: {},
				metadata: {},
				nodeExecutionStack: [
					{
						node: startNode,
						data: { main: [[{ json: {} }]] },
						source: null,
					},
				],
				waitingExecution: {},
				waitingExecutionSource: {},
			},
		});
	}

	/** Run the prepared execution data through WorkflowExecute in 'evaluation' mode. */
	private async runWorkflow(
		workflow: Workflow,
		additionalData: IWorkflowExecuteAdditionalData,
		executionData: IRunExecutionData,
	): Promise<IRun> {
		const workflowExecute = new WorkflowExecute(additionalData, 'evaluation', executionData);
		return await workflowExecute.processRunExecutionData(workflow);
	}

	// ── Request interception ─────────────────────────────────────────────

	/**
	 * Wraps the mock handler to collect intercepted request metadata for diagnostics.
	 */
	private createInterceptingHandler(
		mockHandler: EvalLlmMockHandler,
		nodeResults: Record<string, InstanceAiEvalNodeResult>,
	): EvalLlmMockHandler {
		return async (
			requestOptions: IHttpRequestOptions,
			node: INode,
		): Promise<EvalMockHttpResponse | undefined> => {
			// A node may make multiple HTTP requests — ensure it's marked as mocked.
			// checkNodeConfig may have pre-created the entry as 'real', so always override.
			const entry = (nodeResults[node.name] ??= {
				output: null,
				interceptedRequests: [],
				executionMode: 'mocked',
			});
			entry.executionMode = 'mocked';
			const response = await mockHandler(requestOptions, node);
			entry.interceptedRequests.push({
				url: requestOptions.url,
				method: requestOptions.method ?? 'GET',
				nodeType: node.type,
				requestBody: requestOptions.body,
				mockResponse: response?.body,
			});
			this.logger.debug(
				`[EvalMock] Intercepted ${requestOptions.method ?? 'GET'} ${requestOptions.url} from "${node.name}" (${node.type})`,
			);
			return response;
		};
	}

	// ── Result extraction ─────────────────────────────────────────────────

	/**
	 * Convert raw run data into the eval result: capture each node's last-run
	 * output (truncated to MAX_OUTPUT_ITEMS_PER_NODE), collect node/workflow
	 * errors, and classify nodes not seen during interception as 'real'.
	 */
	private buildResult(
		executionId: string,
		result: IRun,
		nodeResults: Record<string, InstanceAiEvalNodeResult>,
		hints: MockHints,
	): InstanceAiEvalExecutionResult {
		const errors: string[] = [];
		const runData = result.data?.resultData?.runData ?? {};
		for (const [nodeName, nodeRuns] of Object.entries(runData)) {
			// Nodes already in nodeResults were intercepted (mocked) or pinned.
			// Nodes appearing here for the first time executed for real (logic nodes).
			const entry = (nodeResults[nodeName] ??= {
				output: null,
				interceptedRequests: [],
				executionMode: 'real',
			});
			const lastRun = nodeRuns[nodeRuns.length - 1];
			if (lastRun?.startTime) {
				entry.startTime = lastRun.startTime;
			}
			if (lastRun?.data?.main) {
				// Capture output from all branches (Switch/IF nodes have multiple outputs)
				const allOutputs = lastRun.data.main
					.flat()
					.filter(Boolean)
					.slice(0, MAX_OUTPUT_ITEMS_PER_NODE);
				if (allOutputs.length > 0) {
					entry.output = allOutputs;
				}
			}
			if (lastRun?.error) {
				errors.push(`Node "${nodeName}": ${lastRun.error.message}`);
			}
		}
		const executionError = result.data?.resultData?.error;
		if (executionError) {
			errors.push(`Workflow error: ${executionError.message}`);
		}
		return {
			executionId,
			success: executionError === undefined && errors.length === 0,
			nodeResults,
			errors,
			hints,
		};
	}

	/** Build a failed result with empty hints — used before execution can start. */
	private errorResult(executionId: string, message: string): InstanceAiEvalExecutionResult {
		return {
			executionId,
			success: false,
			nodeResults: {},
			errors: [message],
			hints: {
				globalContext: '',
				triggerContent: {},
				nodeHints: {},
				warnings: [],
				bypassPinData: {},
			},
		};
	}
}

View file

@ -0,0 +1,391 @@
/**
* LLM-powered HTTP mock handler for evaluation.
*
* Generates realistic API responses on-the-fly based on the intercepted
* request (URL, method, body) and optional scenario hints. Uses Claude Sonnet
* with tool access to API documentation (Context7) and node configuration.
*
* The LLM returns a structured **response spec** (json, binary, or error)
* which the handler materializes into the correct runtime format. This lets
* the LLM decide whether an endpoint returns JSON, a file download, or an
* error without us maintaining per-service detection rules.
*
* Used by:
* - Instance AI agent tools (self-validation during workflow building)
* - Eval CLI test suite (scenario-based testing via REST endpoint)
*/
import { Logger } from '@n8n/backend-common';
import { Container } from '@n8n/di';
import type { EvalLlmMockHandler, EvalMockHttpResponse } from 'n8n-core';
import { jsonParse } from 'n8n-workflow';
import { z } from 'zod';
import { createEvalAgent, extractText, Tool } from '@n8n/instance-ai';
import { fetchApiDocs } from './api-docs';
import { extractNodeConfig } from './node-config';
// ---------------------------------------------------------------------------
// System prompt
// ---------------------------------------------------------------------------
// System prompt for the mock-responder agent. The output contract described
// here ("type": json | binary | error) must stay in sync with the
// MockResponseSpec interface and the validation in parseResponseText().
const MOCK_SYSTEM_PROMPT = `You are an API mock server generating realistic HTTP responses for n8n workflow evaluation.
## Your tools
You have two tools. Call them before generating your response:
"lookup_api_docs" — Fetches real API documentation for a service endpoint. Use this to learn the correct response STRUCTURE (what fields, what nesting, what types the real API returns). Pay special attention to what the real API returns for the exact HTTP method and URL path you're responding to.
"get_node_config" — Returns the n8n node's configuration parameters. This tells you what the node is set up to work with. The configuration contains the values the node expects to find in API responses — resource IDs, field names, column names, etc. Every node type has different parameters, so you need to interpret the config intelligently. Key patterns:
- Objects with "__rl" are resource selectors — "value" is the selected resource (a document ID, channel, project, etc.)
- "schema" arrays list the columns/fields the node expects. CRITICAL: use the "id" field as the exact column/field name in your response — NOT "displayName". For example, if schema has {"id": "timestamp", "displayName": "Timestamp"}, the API response must use "timestamp" (lowercase), not "Timestamp"
- "operation" and "resource" describe what the node does (e.g. "send" a "message", "create" an "issue")
- Strings starting with "=" are expressions (ignore these) — all other strings are literal values
## How to combine them
The API docs tell you the response SHAPE. The node config tells you the exact DATA VALUES to put in that shape. All names, IDs, and identifiers from the node config are case-sensitive — use them character-for-character.
## Output format
Respond with ONLY a JSON object. No explanation, no markdown, no prose.
{ "type": "json", "body": { ...realistic API response... } }
{ "type": "binary", "contentType": "application/pdf", "filename": "doc.pdf" }
{ "type": "error", "statusCode": 404, "body": { ...service error format... } }
## Rules
- A node may make MULTIPLE sequential HTTP requests in a single execution (e.g., first GET metadata, then GET headers, then POST data). You are responding to ONE specific request. Match your response to the URL + method of THIS request only. A GET to a metadata endpoint must return metadata — not a write result — even if the node's overall purpose is to write data.
- Echo request values faithfully. If the request contains an identifier, name, or reference value (even one that looks like a placeholder such as "YOUR_CHAT_ID" or "YOUR_API_KEY"), echo it back exactly in the corresponding response field. The real API would reflect the same value the client sent.
- Some APIs return empty or minimal responses on success (204 with no body, 202 with empty body). If the API documentation indicates an empty response body, return { "type": "json", "body": {} }. Don't invent additional response fields.
- No pagination — always indicate end of results (has_more=false, nextPageToken=null, etc.)`;
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/** Options accepted by createLlmMockHandler; all fields are optional. */
interface MockHandlerOptions {
	/** Optional scenario description — steers the LLM toward specific behavior (errors, edge cases) */
	scenarioHints?: string;
	/** Pre-generated consistent data context from Phase 1 (generateMockHints) */
	globalContext?: string;
	/** Per-node data hints from Phase 1, keyed by node name */
	nodeHints?: Record<string, string>;
}
/** Structured response spec returned by the LLM */
interface MockResponseSpec {
	/** Discriminator: plain JSON body, binary download, or HTTP error */
	type: 'json' | 'binary' | 'error';
	/** Response body (used by 'json' and 'error' specs) */
	body?: unknown;
	/** HTTP status code (used by 'error' specs) */
	statusCode?: number;
	/** MIME type of the generated content (used by 'binary' specs) */
	contentType?: string;
	/** Suggested download filename (used by 'binary' specs) */
	filename?: string;
}
// ---------------------------------------------------------------------------
// Handler factory
// ---------------------------------------------------------------------------
/**
 * Creates an LLM-powered mock handler that generates realistic API responses.
 *
 * The handler is called for each intercepted HTTP request during eval execution.
 * It uses the request URL + method + body + node type to generate an appropriate
 * response spec, then materializes it into the correct format (JSON, Buffer, error).
 */
export function createLlmMockHandler(options?: MockHandlerOptions): EvalLlmMockHandler {
	// Node configs are extracted lazily and memoized per handler instance —
	// a node may make many requests in one run, but its config doesn't change.
	const configByNodeName = new Map<string, string>();

	return async (requestOptions, node) => {
		let nodeConfig = configByNodeName.get(node.name);
		if (nodeConfig === undefined) {
			nodeConfig = extractNodeConfig(node);
			configByNodeName.set(node.name, nodeConfig);
		}
		return await generateMockResponse(requestOptions, node, {
			scenarioHints: options?.scenarioHints,
			globalContext: options?.globalContext,
			nodeHint: options?.nodeHints?.[node.name],
			nodeConfig,
		});
	};
}
// ---------------------------------------------------------------------------
// LLM mock generation
// ---------------------------------------------------------------------------
/** Per-request context assembled by the handler and passed to generateMockResponse. */
interface MockResponseContext {
	/** Scenario description forwarded from the caller (error/edge-case steering) */
	scenarioHints?: string;
	/** Cross-node data context from Phase 1 hint generation */
	globalContext?: string;
	/** Phase 1 hint specific to the node making this request */
	nodeHint?: string;
	/** Serialized node parameters from extractNodeConfig; may be empty */
	nodeConfig: string;
}
/**
 * Assemble a per-request prompt (request line, body/query, optional GraphQL
 * format constraint, node config, Phase 1 context) and ask the LLM for a
 * response spec, then materialize it into an EvalMockHttpResponse.
 *
 * Never throws: on any failure it returns a 200 JSON body flagged with
 * `_evalMockError` so the workflow run continues and the failure is visible.
 */
async function generateMockResponse(
	request: { url: string; method?: string; body?: unknown; qs?: Record<string, unknown> },
	node: { name: string; type: string },
	context: MockResponseContext,
): Promise<EvalMockHttpResponse> {
	const serviceName = extractServiceName(request.url);
	const endpoint = extractEndpoint(request.url);
	// Build user prompt with clearly separated sections
	const sections: string[] = [
		'## Request',
		`Service: ${serviceName}`,
		`Node: "${node.name}" (${node.type})`,
		(request.method ?? 'GET') + ' ' + endpoint,
		'Generate the response for this EXACT endpoint and method.',
	];
	if (request.body) {
		sections.push(`Body: ${JSON.stringify(request.body)}`);
	}
	if (request.qs && Object.keys(request.qs).length > 0) {
		sections.push(`Query: ${JSON.stringify(request.qs)}`);
	}
	// Detect GraphQL and add format constraint
	// (either a /graphql path or a request body carrying a "query" key)
	const isGraphQL =
		endpoint.includes('/graphql') ||
		(typeof request.body === 'object' && request.body !== null && 'query' in request.body);
	if (isGraphQL) {
		sections.push('', '## GraphQL format requirement');
		sections.push(
			'This is a GraphQL endpoint. ALL responses MUST use GraphQL response format:',
			'- Success: { "data": { ...fields matching the query... } }',
			'- Error: { "errors": [{ "message": "...", "extensions": { "code": "..." } }], "data": null }',
			'Never return flat REST-style error objects.',
		);
	}
	if (context.nodeConfig) {
		sections.push('', '## Node Configuration', context.nodeConfig);
	}
	if (context.globalContext || context.nodeHint || context.scenarioHints) {
		sections.push('', '## Context');
		if (context.globalContext) sections.push(`Data: ${context.globalContext}`);
		if (context.nodeHint) sections.push(`Hint: ${context.nodeHint}`);
		if (context.scenarioHints) {
			sections.push(`Scenario: ${context.scenarioHints}`);
			sections.push(
				isGraphQL
					? '(For error scenarios, use GraphQL error format with "data": null. Don\'t use "type": "error" wrapper.)'
					: '(Use "error" type with appropriate statusCode for error scenarios.)',
			);
		}
	}
	const userPrompt = sections.join('\n');
	try {
		const spec = await callLlm(userPrompt, context.nodeConfig);
		return materializeSpec(spec);
	} catch (error) {
		const errorMsg = error instanceof Error ? error.message : String(error);
		// NOTE(review): the name suggests extractEndpoint sanitizes the URL for
		// logging (drops host/query) — confirm against its definition.
		const safeUrl = extractEndpoint(request.url);
		Container.get(Logger).error(
			`[EvalMock] Mock generation failed for ${request.method ?? 'GET'} ${safeUrl}: ${errorMsg}`,
		);
		// Return 200 with an error marker instead of throwing — the node keeps
		// running and the eval report can surface the mock failure.
		return {
			body: { _evalMockError: true, message: `Mock generation failed: ${errorMsg}` },
			headers: { 'content-type': 'application/json' },
			statusCode: 200,
		};
	}
}
// ---------------------------------------------------------------------------
// Tool definitions (@n8n/agents)
// ---------------------------------------------------------------------------
// Shared "lookup_api_docs" tool — delegates to fetchApiDocs (Context7) so the
// LLM can match the real API's response structure. It carries no per-request
// state, so a single module-level instance is reused across all requests.
const apiDocsTool = new Tool('lookup_api_docs')
	.description(
		'Look up official API documentation for a specific REST endpoint to understand the exact response format.',
	)
	.input(
		z.object({
			serviceName: z
				.string()
				.describe('The API service name (e.g. "Google Sheets", "Gmail", "Slack")'),
			endpointDescription: z
				.string()
				.describe('Description of the endpoint (e.g. "GET spreadsheets values response format")'),
		}),
	)
	.handler(async (input: { serviceName: string; endpointDescription: string }) => {
		return await fetchApiDocs(input.serviceName, input.endpointDescription);
	})
	.build();
/**
 * Build a per-request tool that exposes the n8n node's serialized
 * configuration to the agent. The closure captures the config string,
 * so the handler itself is stateless.
 */
function createNodeConfigTool(nodeConfig: string) {
	const description =
		"Get the n8n node's configuration parameters — resource IDs, field names, settings, etc. Your mock data must match these exact values.";
	return new Tool('get_node_config')
		.description(description)
		.input(z.object({}))
		.handler(async () => nodeConfig)
		.build();
}
// ---------------------------------------------------------------------------
// LLM call with tool use (agent handles multi-round loop automatically)
// ---------------------------------------------------------------------------

/**
 * Ask the eval-mock agent for a mock response spec.
 *
 * The agent gets two tools: API-docs lookup and the node's configuration.
 * The raw text reply is parsed into a MockResponseSpec by parseResponseText.
 */
async function callLlm(userPrompt: string, nodeConfig: string): Promise<MockResponseSpec> {
	const agent = createEvalAgent('eval-mock-responder', { instructions: MOCK_SYSTEM_PROMPT })
		.tool(apiDocsTool)
		.tool(createNodeConfigTool(nodeConfig));
	const generated = await agent.generate(userPrompt, {
		providerOptions: { anthropic: { maxTokens: 4096 } },
	});
	return parseResponseText(extractText(generated));
}
/**
 * Parse raw LLM output into a MockResponseSpec.
 *
 * Tolerates markdown code fencing and surrounding prose. When the parsed JSON
 * lacks a recognized "type" discriminator, the whole object is treated as a
 * plain JSON response body.
 */
function parseResponseText(raw: string): MockResponseSpec {
	let candidate = raw.trim();

	// Prefer the contents of a ```json fenced block anywhere in the text.
	// The newline after the opening fence is optional — LLMs sometimes omit it.
	const fenced = /```(?:json)?\s*\n?([\s\S]*?)\n?\s*```/i.exec(candidate);
	if (fenced) {
		candidate = fenced[1].trim();
	}

	// Strip stray fences left at either boundary.
	candidate = candidate
		.replace(/^```(?:json)?\s*\n?/i, '')
		.replace(/\n?\s*```\s*$/i, '')
		.trim();

	// If prose still precedes the payload, pull out the first balanced JSON object.
	const startsWithJson = candidate.startsWith('{') || candidate.startsWith('[');
	if (candidate.length > 0 && !startsWithJson) {
		candidate = extractJsonObject(candidate) ?? candidate;
	}

	const spec = jsonParse<MockResponseSpec>(candidate);
	const knownTypes = ['json', 'binary', 'error'];
	return spec.type && knownTypes.includes(spec.type) ? spec : { type: 'json', body: spec };
}
// ---------------------------------------------------------------------------
// Spec materialization
// ---------------------------------------------------------------------------

/**
 * Turn a parsed MockResponseSpec into a concrete HTTP response object.
 *
 * - `json`: 200 with the spec body (falls back to `{ ok: true }`).
 * - `binary`: 200 with a small synthetic placeholder buffer naming the file —
 *   real file contents are never generated for evals.
 * - `error`: the spec's statusCode (default 500) with an error body.
 * - Any unrecognized type falls back to the `json` behavior.
 */
function materializeSpec(spec: MockResponseSpec): EvalMockHttpResponse {
	switch (spec.type) {
		case 'json':
			return {
				body: spec.body ?? { ok: true },
				headers: { 'content-type': 'application/json' },
				statusCode: 200,
			};
		case 'binary': {
			const filename = spec.filename ?? 'mock-file.dat';
			const contentType = spec.contentType ?? 'application/octet-stream';
			// Fix: interpolate the resolved filename. Previously the template
			// contained a broken "$(unknown)" literal and `filename` was unused.
			const content = `[eval-mock] Synthetic file: ${filename} (${contentType})`;
			return {
				body: Buffer.from(content),
				headers: { 'content-type': contentType },
				statusCode: 200,
			};
		}
		case 'error':
			return {
				body: spec.body ?? { error: 'Mock error' },
				headers: { 'content-type': 'application/json' },
				statusCode: spec.statusCode ?? 500,
			};
		default:
			return {
				body: spec.body ?? { ok: true },
				headers: { 'content-type': 'application/json' },
				statusCode: 200,
			};
	}
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Extract a JSON object from text by finding balanced braces.
 * Handles the case where the LLM wraps JSON in prose. Braces inside string
 * literals (including escaped quotes) are ignored. If the braces never
 * balance, the text from the first '{' onward is returned as a best effort.
 */
function extractJsonObject(text: string): string | undefined {
	const start = text.indexOf('{');
	if (start === -1) return undefined;

	let depth = 0;
	let inString = false;
	let escaped = false;

	for (let i = start; i < text.length; i++) {
		const ch = text[i];
		if (escaped) {
			escaped = false;
			continue;
		}
		if (inString) {
			if (ch === '\\') escaped = true;
			else if (ch === '"') inString = false;
			continue;
		}
		switch (ch) {
			case '"':
				inString = true;
				break;
			case '{':
				depth += 1;
				break;
			case '}':
				depth -= 1;
				if (depth === 0) return text.slice(start, i + 1);
				break;
		}
	}

	// Unbalanced — return everything from the opening brace.
	return text.slice(start);
}
/**
 * Derive a human-readable service name from a URL's hostname.
 * Strips a leading "api." or "www." prefix and capitalizes the first label,
 * e.g. "api.slack.com" → "Slack". Returns "Unknown" for invalid URLs.
 */
function extractServiceName(url: string): string {
	try {
		const host = new URL(url).hostname.replace(/^api\./, '').replace(/^www\./, '');
		const [firstLabel] = host.split('.');
		return firstLabel.charAt(0).toUpperCase() + firstLabel.slice(1);
	} catch {
		return 'Unknown';
	}
}
/**
 * Reduce a URL to its path plus query string for safe logging (drops scheme,
 * host, and credentials). Falls back to the raw input when it is not a
 * parseable URL.
 */
function extractEndpoint(url: string): string {
	try {
		const { pathname, search } = new URL(url);
		return `${pathname}${search}`;
	} catch {
		return url;
	}
}

View file

@ -0,0 +1,18 @@
/**
* Node configuration extractor for eval mock prompts.
*
* Serializes node.parameters as JSON for LLM context.
*/
import type { INode } from 'n8n-workflow';
/**
 * Serialize a node's parameters to a compact JSON string for LLM prompt
 * context. Returns '' when parameters are absent, not an object, or not
 * serializable (e.g. circular references).
 */
export function extractNodeConfig(node: INode): string {
	const { parameters } = node;
	if (!parameters || typeof parameters !== 'object') return '';
	try {
		return JSON.stringify(parameters);
	} catch {
		return '';
	}
}

View file

@ -0,0 +1,454 @@
/**
* LLM-based pin data generator.
*
* Generates realistic mock output data for specified nodes in a workflow
* via a single LLM call, ensuring cross-node data consistency. The caller
 * decides which nodes need pin data — this module only generates it.
*
* NOTE: Adapted from @n8n/ai-workflow-builder.ee/evaluations/support/pin-data-generator.ts
* This should be extracted to a shared package (e.g., @n8n/eval-utils) for reuse
* by MCP, frontend, instance-ai evaluations, and other teams.
*/
import type { WorkflowJSON, NodeJSON } from '@n8n/workflow-sdk';
import { existsSync, readFileSync, readdirSync } from 'fs';
import { join } from 'path';
import { createEvalAgent, extractText } from '@n8n/instance-ai';
/** Map of node name → generated output items, in n8n pin-data shape. */
type PinData = Record<string, Array<Record<string, unknown>>>;
/** Free-text guidance forwarded into the generation prompt. */
interface PinDataGenerationInstructions {
	dataDescription: string;
}
// ---------------------------------------------------------------------------
// Public types
// ---------------------------------------------------------------------------
export interface GeneratePinDataOptions {
	/** The workflow containing the nodes to generate data for */
	workflow: WorkflowJSON;
	/** Node names to generate pin data for */
	nodeNames: string[];
	/** Optional instructions to enrich the LLM prompt */
	instructions?: PinDataGenerationInstructions;
}
// ---------------------------------------------------------------------------
// Internal types
// ---------------------------------------------------------------------------
// Per-node context assembled for the prompt (see buildSchemaContexts).
interface NodeSchemaContext {
	/** Display name; falls back to the node type when the node is unnamed */
	nodeName: string;
	nodeType: string;
	typeVersion: number;
	/** Taken from node.parameters.resource when it is a string */
	resource?: string;
	/** Taken from node.parameters.operation when it is a string */
	operation?: string;
	/** Resolved __schema__ JSON Schema for the node's output, when found */
	schema?: Record<string, unknown>;
}
// ---------------------------------------------------------------------------
// nodesBasePath auto-resolution
// ---------------------------------------------------------------------------

// Memoized result: undefined = not yet attempted, null = resolution failed.
let _resolvedNodesBasePath: string | undefined | null;

/**
 * Walk upward from this file's directory (at most 10 levels) looking for the
 * monorepo's packages/nodes-base/nodes directory. The outcome — success or
 * failure — is cached for the process lifetime; returns undefined when the
 * directory cannot be found.
 */
function resolveNodesBasePath(): string | undefined {
	if (_resolvedNodesBasePath !== undefined) {
		return _resolvedNodesBasePath ?? undefined;
	}

	let current = __dirname;
	for (let depth = 0; depth < 10; depth++) {
		const candidate = join(current, 'packages', 'nodes-base', 'nodes');
		if (existsSync(candidate)) {
			_resolvedNodesBasePath = candidate;
			return candidate;
		}
		const parent = join(current, '..');
		if (parent === current) break; // reached the filesystem root
		current = parent;
	}

	_resolvedNodesBasePath = null;
	return undefined;
}
// ---------------------------------------------------------------------------
// Schema resolution
// ---------------------------------------------------------------------------
// Cache keyed by nodesBasePath — the recursive directory scan is expensive,
// so each base path is only walked once per process.
const schemaMapCache = new Map<string, Map<string, string>>();
/**
 * Build a map from node type name (e.g., "n8n-nodes-base.linear") to the
 * directory containing its __schema__ folder by scanning .node.ts files.
 *
 * A directory is indexed only when it contains a `__schema__` subfolder; the
 * node's type name is read from the first `name: '...'` property found in a
 * sibling `*.node.ts` / `*.node.js` file. Unreadable files or directories are
 * skipped silently (best-effort scan).
 */
function buildSchemaMap(nodesBasePath: string): Map<string, string> {
	const cached = schemaMapCache.get(nodesBasePath);
	if (cached) return cached;
	const result = new Map<string, string>();
	function scanDir(dir: string) {
		try {
			for (const entry of readdirSync(dir, { withFileTypes: true })) {
				if (!entry.isDirectory()) continue;
				const entryPath = join(dir, entry.name);
				const schemaDir = join(entryPath, '__schema__');
				if (existsSync(schemaDir)) {
					const nodeFiles = readdirSync(entryPath).filter(
						(f) => f.endsWith('.node.ts') || f.endsWith('.node.js'),
					);
					for (const nodeFile of nodeFiles) {
						try {
							const content = readFileSync(join(entryPath, nodeFile), 'utf-8');
							// First `name: '...'` match is taken as the node's type name.
							const nameMatch = content.match(/name:\s*['"]([^'"]+)['"]/);
							if (nameMatch) {
								result.set(`n8n-nodes-base.${nameMatch[1]}`, entryPath);
							}
						} catch {
							// Skip files that can't be read
						}
					}
				}
				// Recurse — node folders (and their schemas) can live at any depth.
				scanDir(entryPath);
			}
		} catch {
			// Directory doesn't exist or can't be read
		}
	}
	scanDir(nodesBasePath);
	schemaMapCache.set(nodesBasePath, result);
	return result;
}
/**
 * Pad a numeric node version out to a full "x.y.z" string,
 * e.g. 1 → "1.0.0", 1.1 → "1.1.0".
 */
function normalizeVersion(version: number): string {
	const segments = String(version).split('.');
	return [...segments, '0', '0', '0'].slice(0, 3).join('.');
}
/**
 * Resolve the __schema__ JSON Schema for a node's output, if available.
 *
 * Looks up the node's directory via buildSchemaMap, then tries version
 * directories in order: the exact (normalized) requested version first,
 * followed by all v* directories sorted newest-first. Within a version
 * directory the expected file is `[<resource>/]<operation>.json`.
 * Returns undefined when no schema file exists or it fails to parse.
 */
function resolveSchemaForNode(
	nodeType: string,
	typeVersion: number,
	resource: string | undefined,
	operation: string | undefined,
	nodesBasePath: string,
): Record<string, unknown> | undefined {
	const schemaMap = buildSchemaMap(nodesBasePath);
	const nodeDir = schemaMap.get(nodeType);
	if (!nodeDir) return undefined;
	const schemaBaseDir = join(nodeDir, '__schema__');
	if (!existsSync(schemaBaseDir)) return undefined;
	const versionStr = normalizeVersion(typeVersion);
	// Candidate order: exact version dir first, then every v* dir newest-first.
	const versionDirs = [
		`v${versionStr}`,
		...readdirSync(schemaBaseDir)
			.filter((d) => d.startsWith('v'))
			.sort((a, b) => {
				// Descending compare of dot-separated numeric version parts.
				const partsA = a.slice(1).split('.').map(Number);
				const partsB = b.slice(1).split('.').map(Number);
				for (let i = 0; i < Math.max(partsA.length, partsB.length); i++) {
					const diff = (partsB[i] ?? 0) - (partsA[i] ?? 0);
					if (diff !== 0) return diff;
				}
				return 0;
			}),
	];
	// Set dedupes the exact-version entry when it also appears in the listing.
	for (const vDir of [...new Set(versionDirs)]) {
		const versionPath = join(schemaBaseDir, vDir);
		if (!existsSync(versionPath)) continue;
		// Path shape: <versionDir>[/<resource>][/<operation>.json]
		const parts = [versionPath, resource, operation ? `${operation}.json` : undefined].filter(
			Boolean,
		) as string[];
		const schemaFile = join(...parts);
		if (existsSync(schemaFile)) {
			try {
				return JSON.parse(readFileSync(schemaFile, 'utf-8')) as Record<string, unknown>;
			} catch {
				return undefined;
			}
		}
	}
	return undefined;
}
// ---------------------------------------------------------------------------
// Schema context building
// ---------------------------------------------------------------------------

/**
 * Assemble one prompt context per node. `resource`/`operation` are read from
 * node.parameters when they are strings; the output JSON Schema is resolved
 * from the nodes-base __schema__ folders when a base path is available.
 */
function buildSchemaContexts(nodes: NodeJSON[], nodesBasePath?: string): NodeSchemaContext[] {
	const toContext = (node: NodeJSON): NodeSchemaContext => {
		const params = node.parameters as Record<string, unknown> | undefined;
		const resource = typeof params?.resource === 'string' ? params.resource : undefined;
		const operation = typeof params?.operation === 'string' ? params.operation : undefined;
		const schema = nodesBasePath
			? resolveSchemaForNode(node.type, node.typeVersion, resource, operation, nodesBasePath)
			: undefined;
		return {
			nodeName: node.name ?? node.type,
			nodeType: node.type,
			typeVersion: node.typeVersion,
			resource,
			operation,
			schema,
		};
	};
	return nodes.map(toContext);
}
// ---------------------------------------------------------------------------
// Mermaid diagram generation
// ---------------------------------------------------------------------------
/**
 * Convert a workflow's nodes and connections to a mermaid flowchart string.
 * Provides the LLM with workflow structure context for consistent data generation.
 */
function workflowToMermaid(workflow: WorkflowJSON): string {
	const lines: string[] = ['flowchart LR'];

	// Stable short ids (n0, n1, …) keyed by node name; unnamed nodes get none.
	const idByName = new Map<string, string>();
	workflow.nodes.forEach((node, index) => {
		if (node.name) idByName.set(node.name, `n${String(index)}`);
	});

	// One labelled box per named node: name, short type, version, resource/op.
	for (const node of workflow.nodes) {
		if (!node.name) continue;
		const id = idByName.get(node.name);
		if (!id) continue;
		const params = node.parameters as Record<string, unknown> | undefined;
		const resource = typeof params?.resource === 'string' ? params.resource : undefined;
		const operation = typeof params?.operation === 'string' ? params.operation : undefined;
		const shortType = node.type.split('.').pop() ?? node.type;
		const details = [
			`${shortType} v${String(node.typeVersion)}`,
			...(resource ? [`resource:${resource}`] : []),
			...(operation ? [`op:${operation}`] : []),
		].join(', ');
		lines.push(`  ${id}["${node.name} (${details})"]`);
	}

	// One edge per connection entry; the connection type itself is not drawn.
	for (const [sourceName, nodeConns] of Object.entries(workflow.connections)) {
		const sourceId = idByName.get(sourceName);
		if (!sourceId) continue;
		for (const outputConnections of Object.values(nodeConns as Record<string, unknown>)) {
			if (!Array.isArray(outputConnections)) continue;
			for (const outputGroup of outputConnections) {
				if (!Array.isArray(outputGroup)) continue;
				for (const conn of outputGroup) {
					if (typeof conn !== 'object' || conn === null || !('node' in conn)) continue;
					const targetId = idByName.get((conn as { node: string }).node);
					if (targetId) {
						lines.push(`  ${sourceId} --> ${targetId}`);
					}
				}
			}
		}
	}

	return lines.join('\n');
}
// ---------------------------------------------------------------------------
// LLM prompt construction
// ---------------------------------------------------------------------------
const SYSTEM_PROMPT = `You are a test data generator for n8n workflow automation. Generate realistic mock API response data for service nodes in a workflow.
RULES:
1. Data must be consistent across nodes. If node A creates an entity with id "abc-123", downstream nodes referencing that entity must use "abc-123".
2. Generate 1-2 items per node.
3. When a JSON Schema is provided, follow its structure exactly.
4. When no schema is provided, generate a realistic response based on the node type, resource, and operation.
5. Use realistic but clearly fake values (e.g., "jane@example.com", "proj_abc123", "2024-01-15T10:30:00Z").
6. Return ONLY a valid JSON object, no explanation or markdown fencing.
7. CRITICAL: You MUST generate data for EVERY node listed in "Nodes Requiring Mock Data". Never skip a node, even if the test scenario describes an empty or error response. An empty response is still valid data.`;
/**
 * Build the user prompt for pin-data generation.
 *
 * Sections, in order: optional data-generation instructions, a mermaid graph
 * of the workflow, one subsection per target node (with its output JSON
 * Schema when available, truncated to 3000 chars), and an example of the
 * expected `{ "<node>": [{ "json": ... }] }` output shape.
 */
function buildUserPrompt(
	workflow: WorkflowJSON,
	contexts: NodeSchemaContext[],
	instructions?: PinDataGenerationInstructions,
): string {
	const mermaid = workflowToMermaid(workflow);
	const sections: string[] = ['Generate mock output data for service nodes in this workflow.'];
	if (instructions?.dataDescription) {
		sections.push('');
		sections.push('## Data Generation Instructions');
		sections.push('');
		sections.push(instructions.dataDescription);
	}
	sections.push('');
	sections.push('## Workflow Graph');
	sections.push('');
	sections.push('```mermaid');
	sections.push(mermaid);
	sections.push('```');
	sections.push('');
	sections.push('## Nodes Requiring Mock Data');
	for (const ctx of contexts) {
		sections.push('');
		sections.push(`### ${ctx.nodeName} (${ctx.nodeType} v${String(ctx.typeVersion)})`);
		if (ctx.resource || ctx.operation) {
			const parts: string[] = [];
			if (ctx.resource) parts.push(`Resource: ${ctx.resource}`);
			if (ctx.operation) parts.push(`Operation: ${ctx.operation}`);
			sections.push(`- ${parts.join(' | ')}`);
		}
		if (ctx.schema) {
			// Cap schema size so one huge schema can't blow the prompt budget.
			const schemaStr = JSON.stringify(ctx.schema, null, 2);
			const truncated = schemaStr.length > 3000 ? schemaStr.slice(0, 3000) + '\n...' : schemaStr;
			sections.push('- Output JSON Schema:');
			sections.push('```json');
			sections.push(truncated);
			sections.push('```');
		} else {
			sections.push('(no schema available — generate based on API knowledge)');
		}
	}
	sections.push('');
	sections.push('## Expected Output Format');
	sections.push('');
	sections.push(
		'Return a JSON object where each key is the exact node name and the value is an array of items, each wrapped in a "json" key:',
	);
	sections.push('');
	sections.push('```json');
	sections.push('{');
	// Show at most two concrete example keys, then an ellipsis.
	for (let i = 0; i < Math.min(contexts.length, 2); i++) {
		const ctx = contexts[i];
		const comma = i < Math.min(contexts.length, 2) - 1 ? ',' : '';
		sections.push(` "${ctx.nodeName}": [{ "json": { ... } }]${comma}`);
	}
	if (contexts.length > 2) {
		sections.push(' ...');
	}
	sections.push('}');
	sections.push('```');
	return sections.join('\n');
}
// ---------------------------------------------------------------------------
// Response parsing
// ---------------------------------------------------------------------------
/**
 * Parse the LLM response into PinData format.
 * Handles both `{ "json": {...} }` wrapped and unwrapped items; unparseable
 * responses yield an empty map.
 */
function parsePinDataResponse(responseText: string, expectedNodes: string[]): PinData {
	let cleaned = responseText.trim();
	if (cleaned.startsWith('```')) {
		cleaned = cleaned.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '');
	}

	let parsed: Record<string, unknown>;
	try {
		parsed = JSON.parse(cleaned) as Record<string, unknown>;
	} catch {
		return {};
	}

	// The execution engine expects { json: IDataObject } items; the LLM may or
	// may not have wrapped each item already.
	const wrapItem = (item: unknown): Record<string, unknown> => {
		if (typeof item === 'object' && item !== null && 'json' in item) {
			return item as Record<string, unknown>;
		}
		return { json: item ?? {} };
	};

	const pinData: PinData = {};
	for (const nodeName of expectedNodes) {
		const nodeData = parsed[nodeName];
		if (Array.isArray(nodeData) && nodeData.length > 0) {
			pinData[nodeName] = nodeData.map(wrapItem);
		}
	}
	return pinData;
}
// ---------------------------------------------------------------------------
// Main entry point
// ---------------------------------------------------------------------------
/**
 * Generate pin data for specified nodes in a workflow using an LLM.
 * Produces consistent cross-node mock data in a single LLM call.
 *
 * The caller decides which nodes need pin data (via nodeNames).
 * This function only generates it.
 *
 * Any failure — agent error or unparseable response — degrades to an empty
 * result rather than throwing.
 *
 * @returns PinData map (node name → data items). Returns {} on failure.
 */
export async function generatePinData(options: GeneratePinDataOptions): Promise<PinData> {
	const { workflow, nodeNames, instructions } = options;
	if (nodeNames.length === 0) return {};
	// Resolve target nodes from the workflow
	const targetNodes = workflow.nodes.filter((n) => n.name && nodeNames.includes(n.name));
	if (targetNodes.length === 0) return {};
	// Build schema contexts with optional __schema__ enrichment
	const nodesBasePath = resolveNodesBasePath();
	const contexts = buildSchemaContexts(targetNodes, nodesBasePath);
	// Build prompt and call LLM
	const userPrompt = buildUserPrompt(workflow, contexts, instructions);
	const expectedNodeNames = contexts.map((c) => c.nodeName);
	try {
		const agent = createEvalAgent('eval-pin-data-generator', {
			instructions: SYSTEM_PROMPT,
			cache: true,
		});
		const result = await agent.generate(userPrompt, {
			providerOptions: { anthropic: { maxTokens: 16_384 } },
		});
		const responseText = extractText(result);
		return parsePinDataResponse(responseText, expectedNodeNames);
	} catch {
		// Best-effort: callers treat an empty map as "no pin data available".
		return {};
	}
}

View file

@ -0,0 +1,308 @@
/**
* Workflow analysis utilities for evaluation mock execution.
*
* Identifies which nodes should receive mock hints and generates consistent
* per-node hints + trigger data via a single LLM call.
*
 * Adapted from @n8n/instance-ai/evaluations/support/ — this copy lives
* in the CLI package because it runs in-process during workflow execution.
* TODO: Extract to a shared @n8n/eval-utils package for reuse by
* the eval CLI, MCP, and other consumers.
*/
import { Logger } from '@n8n/backend-common';
import { Container } from '@n8n/di';
import { type INode, type IPinData, type IWorkflowBase, jsonParse } from 'n8n-workflow';
import { createEvalAgent, extractText } from '@n8n/instance-ai';
import { extractNodeConfig } from './node-config';
// ---------------------------------------------------------------------------
// Node classification
// ---------------------------------------------------------------------------
/**
 * Find node names that are targets of ai_* connections (Agent, Chain nodes).
 * These are "root" AI nodes whose sub-nodes use vendor SDKs. Pinning the root
 * prevents supplyData() on all connected sub-nodes, avoiding SDK calls entirely.
 *
 * Ported from @n8n/instance-ai/evaluations/support/service-node-classifier.ts
 */
function findAiRootNodeNames(workflow: IWorkflowBase): Set<string> {
	const rootNames = new Set<string>();
	for (const nodeConns of Object.values(workflow.connections)) {
		for (const [connType, outputs] of Object.entries(nodeConns)) {
			// Only ai_* connection types point at AI root nodes.
			if (!connType.startsWith('ai_') || !Array.isArray(outputs)) continue;
			const targets = outputs
				.filter((group): group is unknown[] => Array.isArray(group))
				.flat()
				.filter(
					(conn): conn is { node: string } =>
						typeof conn === 'object' && conn !== null && 'node' in conn,
				);
			for (const { node } of targets) {
				rootNames.add(node);
			}
		}
	}
	return rootNames;
}
/**
 * Find node names that are sources of ai_* connections (LLM models, tools,
 * memory). These are sub-nodes handled via their root — they should not be
 * pinned individually or receive mock hints.
 *
 * Ported from @n8n/instance-ai/evaluations/support/service-node-classifier.ts
 */
function findAiSubNodeNames(workflow: IWorkflowBase): Set<string> {
	const subNodeNames = new Set<string>();
	for (const [sourceName, nodeConns] of Object.entries(workflow.connections)) {
		const hasAiConnection = Object.keys(nodeConns).some((connType) =>
			connType.startsWith('ai_'),
		);
		if (hasAiConnection) {
			subNodeNames.add(sourceName);
		}
	}
	return subNodeNames;
}
// ---------------------------------------------------------------------------
// Bypass node types — nodes that use non-HTTP protocols or bypass n8n's
// request helper functions. These can't be intercepted by the eval mock handler.
// They are pinned via pin data instead — see identifyNodesForPinData.
// ---------------------------------------------------------------------------
const BYPASS_NODE_TYPES = new Set([
	// Databases (TCP/binary protocol)
	'n8n-nodes-base.redis',
	'n8n-nodes-base.mongoDb',
	'n8n-nodes-base.mySql',
	'n8n-nodes-base.postgres',
	'n8n-nodes-base.microsoftSql',
	'n8n-nodes-base.snowflake',
	// Message queues (TCP/binary protocol)
	'n8n-nodes-base.kafka',
	'n8n-nodes-base.rabbitmq',
	'n8n-nodes-base.mqtt',
	'n8n-nodes-base.amqp',
	// File/network protocols
	'n8n-nodes-base.ftp',
	'n8n-nodes-base.ssh',
	'n8n-nodes-base.ldap',
	'n8n-nodes-base.emailSend',
	// Non-helper HTTP
	'n8n-nodes-base.rssFeedRead',
	'n8n-nodes-base.git',
]);
/**
 * Identify nodes that bypass the HTTP mock layer and need pin data instead.
 * Returns AI root nodes (Agent, Chain) and protocol/bypass nodes; disabled
 * nodes are never included.
 */
export function identifyNodesForPinData(workflow: IWorkflowBase): INode[] {
	const aiRootNames = findAiRootNodeNames(workflow);
	const needsPinData = (node: INode): boolean =>
		!node.disabled && (aiRootNames.has(node.name) || BYPASS_NODE_TYPES.has(node.type));
	return workflow.nodes.filter(needsPinData);
}
/**
 * Identify which nodes in a workflow should receive mock hints.
 * Excludes AI sub-nodes (handled via their root) and nodes that will be
 * pinned (they don't execute, so hints are irrelevant for them), as well as
 * disabled nodes.
 */
export function identifyNodesForHints(workflow: IWorkflowBase): INode[] {
	const excludedNames = new Set<string>([
		...findAiSubNodeNames(workflow),
		...findAiRootNodeNames(workflow),
		...identifyNodesForPinData(workflow).map((n) => n.name),
	]);
	return workflow.nodes.filter((node) => !node.disabled && !excludedNames.has(node.name));
}
// ---------------------------------------------------------------------------
// Mock hints generation
// ---------------------------------------------------------------------------
export interface MockHints {
	/** Shared data context for all nodes (user IDs, entity names, relationships) */
	globalContext: string;
	/** Per-node hints describing what data to return for each service node */
	nodeHints: Record<string, string>;
	/** Generated trigger output matching what the start node would produce */
	triggerContent: Record<string, unknown>;
	/** Errors encountered during hint generation or mock execution */
	warnings: string[];
	/** Pin data for nodes that bypass the HTTP mock layer (AI roots, protocol nodes) */
	bypassPinData: IPinData;
}
export interface GenerateMockHintsOptions {
	/** The workflow to generate hints for */
	workflow: IWorkflowBase;
	/** Names of the nodes that should receive per-node hints */
	nodeNames: string[];
	/** Optional test-scenario description that overrides default data generation */
	scenarioHints?: string;
}
const SYSTEM_PROMPT = `You are a test data planner for n8n workflow automation. Your job is to create a consistent data context, trigger output data, and per-node hints that will guide an API mock server to generate realistic, coherent responses across all nodes in a workflow.
RULES:
1. Create a "globalContext" that defines the shared world user IDs, entity names, channel names, email addresses, and relationships that ALL nodes should reference consistently.
2. Create a "triggerContent" object that represents the exact output the workflow's trigger/start node would produce. This is used as pin data (the node's output), so it must match what downstream nodes reference:
- Look at the trigger node's type to determine the output structure
- For webhook triggers: include { headers: {}, query: {}, body: { ...fields } } since downstream nodes reference $json.body.fieldName
- For service-specific triggers (Gmail Trigger, Slack Trigger, etc.): match the service's real event/message output format
- For schedule triggers: include timestamp fields
- For manual triggers: include the fields that downstream nodes reference
- CRITICAL: check what downstream nodes reference (e.g., $json.body.email, $json.subject, $json.text) and ensure those paths exist in triggerContent
3. Create a "nodeHints" object with one entry per node. Each hint describes what data that specific node's API response should contain, referencing entities from the global context.
4. Hints should describe the DATA CONTENT, not the API response format. The mock server already knows the API schema.
5. Ensure data flows logically through the workflow. If node A fetches items that node B processes, the items in A's hint should match what B expects.
6. Use realistic but clearly fake values (e.g., "jane@example.com", "U_abc123").
7. **If a "Test Scenario" section is provided, it OVERRIDES your default data generation.** Use the exact names, emails, values, and conditions described in the scenario. If the scenario says "no name field", do NOT include a name. If it says "email is not-an-email", use that exact value. The scenario defines the test follow it precisely.
8. Return ONLY valid JSON, no explanation or markdown fencing.`;
/**
 * Build the user prompt for mock-hint generation.
 *
 * Sections, in order: optional test scenario, the workflow's nodes (with
 * their serialized configuration), the connection graph as
 * `source -[type]-> target` lines, and an example of the expected JSON
 * output shape (globalContext / triggerContent / nodeHints).
 */
function buildUserPrompt(
	workflow: IWorkflowBase,
	nodeNames: string[],
	scenarioHints?: string,
): string {
	const sections: string[] = [
		'Generate a consistent data context and per-node mock hints for this workflow.',
	];
	if (scenarioHints) {
		sections.push('', '## Test Scenario', '', scenarioHints);
	}
	sections.push('', '## Workflow Nodes', '');
	for (const node of workflow.nodes) {
		let line = `- ${node.name} (${node.type})`;
		// Append the node's parameters so hints can reference exact settings.
		const config = extractNodeConfig(node);
		if (config) {
			line += ` ${config}`;
		}
		sections.push(line);
	}
	sections.push('', '## Connections', '');
	for (const [sourceName, nodeConns] of Object.entries(workflow.connections)) {
		for (const [connType, outputs] of Object.entries(nodeConns)) {
			if (!Array.isArray(outputs)) continue;
			for (const group of outputs) {
				if (!Array.isArray(group)) continue;
				for (const conn of group) {
					if (typeof conn === 'object' && conn !== null && 'node' in conn) {
						sections.push(` ${sourceName} -[${connType}]-> ${(conn as { node: string }).node}`);
					}
				}
			}
		}
	}
	sections.push('', '## Expected Output', '', '```json', '{');
	sections.push(' "globalContext": "Shared entities: ...",');
	sections.push(' "triggerContent": { "...exact output the trigger node would produce..." },');
	sections.push(' "nodeHints": {');
	// Show at most three concrete example keys, then an ellipsis.
	for (let i = 0; i < Math.min(nodeNames.length, 3); i++) {
		const comma = i < Math.min(nodeNames.length, 3) - 1 ? ',' : '';
		sections.push(`  "${nodeNames[i]}": "What data to return..."${comma}`);
	}
	if (nodeNames.length > 3) sections.push(' ...');
	sections.push(' }', '}', '```');
	return sections.join('\n');
}
/**
 * Generate consistent mock hints for service nodes in a workflow.
 * One LLM call produces a global context, trigger data, and per-node hints.
 *
 * On any failure (agent error, unparseable or malformed response) this
 * degrades to an empty result with a warning describing the problem — it
 * never throws.
 *
 * @param options Workflow, target node names, and optional scenario hints.
 * @returns MockHints; `bypassPinData` is always empty here (filled by callers).
 */
export async function generateMockHints(options: GenerateMockHintsOptions): Promise<MockHints> {
	const { workflow, nodeNames, scenarioHints } = options;
	// Baseline returned (possibly with warnings) on every failure path.
	const emptyResult: MockHints = {
		globalContext: '',
		nodeHints: {},
		triggerContent: {},
		warnings: [],
		bypassPinData: {},
	};
	if (nodeNames.length === 0) return emptyResult;
	const userPrompt = buildUserPrompt(workflow, nodeNames, scenarioHints);
	try {
		const agent = createEvalAgent('eval-hint-generator', {
			instructions: SYSTEM_PROMPT,
		});
		const result = await agent.generate(userPrompt, {
			providerOptions: { anthropic: { maxTokens: 4096 } },
		});
		let text: string = extractText(result);
		// Strip markdown fences the model may add despite instructions.
		text = text
			.replace(/^```(?:json)?\s*\n?/i, '')
			.replace(/\n?\s*```\s*$/i, '')
			.trim();
		const parsed: Record<string, unknown> = jsonParse(text);
		// globalContext may come back as a string or object — normalize to string
		let globalContext = '';
		if (typeof parsed.globalContext === 'string') {
			globalContext = parsed.globalContext;
		} else if (typeof parsed.globalContext === 'object' && parsed.globalContext !== null) {
			globalContext = JSON.stringify(parsed.globalContext);
		}
		// nodeHints must be a plain object; anything else is a hard failure.
		if (
			typeof parsed.nodeHints !== 'object' ||
			parsed.nodeHints === null ||
			Array.isArray(parsed.nodeHints)
		) {
			const preview = text.slice(0, 300);
			return {
				...emptyResult,
				warnings: [`Phase 1: LLM returned invalid structure. Raw: ${preview}`],
			};
		}
		const warnings: string[] = [];
		// triggerContent must be a plain object; otherwise fall back to {}.
		const triggerContent =
			typeof parsed.triggerContent === 'object' &&
			parsed.triggerContent !== null &&
			!Array.isArray(parsed.triggerContent)
				? parsed.triggerContent
				: {};
		if (Object.keys(triggerContent).length === 0) {
			warnings.push('Phase 1: LLM returned empty triggerContent — trigger node will have no data');
		}
		// Coerce nodeHints values to strings — LLM may return objects instead of strings
		const nodeHints: Record<string, string> = {};
		for (const [key, value] of Object.entries(parsed.nodeHints as Record<string, unknown>)) {
			nodeHints[key] = typeof value === 'string' ? value : JSON.stringify(value);
		}
		return {
			globalContext,
			nodeHints,
			triggerContent: triggerContent as Record<string, unknown>,
			warnings,
			bypassPinData: {},
		};
	} catch (error) {
		const errorMsg = error instanceof Error ? error.message : String(error);
		Container.get(Logger).error(`[EvalMock] Phase 1 hint generation failed: ${errorMsg}`);
		return { ...emptyResult, warnings: [`Phase 1 error: ${errorMsg}`] };
	}
}

View file

@ -12,6 +12,7 @@ import {
InstanceAiThreadMessagesQuery,
InstanceAiAdminSettingsUpdateRequest,
InstanceAiUserPreferencesUpdateRequest,
InstanceAiEvalExecutionRequest,
} from '@n8n/api-types';
import { ModuleRegistry } from '@n8n/backend-common';
import { GlobalConfig } from '@n8n/config';
@ -33,6 +34,7 @@ import type { StoredEvent } from '@n8n/instance-ai';
import { buildAgentTreeFromEvents } from '@n8n/instance-ai';
import type { NextFunction, Request, Response } from 'express';
import { randomUUID, timingSafeEqual } from 'node:crypto';
import { EvalExecutionService } from './eval/execution.service';
import { InProcessEventBus } from './event-bus/in-process-event-bus';
import { InstanceAiMemoryService } from './instance-ai-memory.service';
import { InstanceAiSettingsService } from './instance-ai-settings.service';
@ -58,6 +60,7 @@ export class InstanceAiController {
private readonly instanceAiService: InstanceAiService,
private readonly memoryService: InstanceAiMemoryService,
private readonly settingsService: InstanceAiSettingsService,
private readonly evalExecutionService: EvalExecutionService,
private readonly eventBus: InProcessEventBus,
private readonly moduleRegistry: ModuleRegistry,
private readonly push: Push,
@ -507,6 +510,19 @@ export class InstanceAiController {
return await this.memoryService.getThreadContext(req.user.id, threadId);
}
// ── Evaluation endpoints ──────────────────────────────────────────────────
/**
 * Execute a workflow with LLM-generated mocks for its external calls.
 * Requires the instanceAi:message global scope; delegates to
 * EvalExecutionService with the authenticated user and request payload.
 */
@Post('/eval/execute-with-llm-mock/:workflowId')
@GlobalScope('instanceAi:message')
async executeWithLlmMock(
	req: AuthenticatedRequest,
	_res: Response,
	@Param('workflowId') workflowId: string,
	@Body payload: InstanceAiEvalExecutionRequest,
) {
	return await this.evalExecutionService.executeWithLlmMock(workflowId, req.user, payload);
}
// ── Gateway endpoints (daemon ↔ server) ──────────────────────────────────
@Post('/gateway/create-link')

View file

@ -0,0 +1,303 @@
import { mock } from 'jest-mock-extended';
import type { IHttpRequestOptions, INode, IRequestOptions } from 'n8n-workflow';
import {
buildEvalMockCredentials,
callEvalMockHandler,
normalizeLegacyRequest,
serializeMockToHttpResponse,
} from '../eval-mock-helpers';
import type { EvalLlmMockHandler, EvalMockHttpResponse } from '../index';
/**
 * Unit tests for the eval-mock helper functions used by the LLM-based
 * workflow evaluation framework: mock credential construction, mock HTTP
 * response serialization, legacy request normalization, and the mock
 * handler call wrapper (including its axios/legacy error shaping).
 */
describe('eval-mock-helpers', () => {
// -----------------------------------------------------------------------
// buildEvalMockCredentials
// -----------------------------------------------------------------------
describe('buildEvalMockCredentials', () => {
it('should populate each property with eval-mock-value', () => {
const result = buildEvalMockCredentials([{ name: 'apiKey' }, { name: 'domain' }]);
expect(result.apiKey).toBe('eval-mock-value');
expect(result.domain).toBe('eval-mock-value');
});
it('should always include oauthTokenData with access_token, token_type, and refresh_token', () => {
const result = buildEvalMockCredentials([{ name: 'apiKey' }]);
expect(result.oauthTokenData).toEqual({
access_token: 'eval-mock-access-token',
token_type: 'Bearer',
refresh_token: 'eval-mock-refresh-token',
});
});
it('should always include a privateKey containing an RSA key', () => {
const result = buildEvalMockCredentials([{ name: 'apiKey' }]);
expect(result.privateKey).toEqual(expect.stringContaining('BEGIN RSA PRIVATE KEY'));
expect(result.privateKey).toEqual(expect.stringContaining('END RSA PRIVATE KEY'));
});
it('should return oauthTokenData and privateKey even with empty properties array', () => {
const result = buildEvalMockCredentials([]);
expect(Object.keys(result)).toEqual(expect.arrayContaining(['oauthTokenData', 'privateKey']));
expect(result.oauthTokenData).toBeDefined();
expect(result.privateKey).toBeDefined();
});
});
// -----------------------------------------------------------------------
// serializeMockToHttpResponse
// -----------------------------------------------------------------------
describe('serializeMockToHttpResponse', () => {
it('should convert a JSON body to a Buffer', () => {
const mockResponse: EvalMockHttpResponse = {
body: { message: 'hello' },
headers: { 'content-type': 'application/json' },
statusCode: 200,
};
const result = serializeMockToHttpResponse(mockResponse);
expect(Buffer.isBuffer(result.body)).toBe(true);
expect(result.body.toString()).toBe(JSON.stringify({ message: 'hello' }));
});
it('should preserve headers and statusCode', () => {
const mockResponse: EvalMockHttpResponse = {
body: { ok: true },
headers: { 'x-custom': 'value' },
statusCode: 201,
};
const result = serializeMockToHttpResponse(mockResponse);
expect(result.headers).toEqual({ 'x-custom': 'value' });
expect(result.statusCode).toBe(201);
// statusMessage is hard-coded to 'OK' by the serializer, even for 201.
expect(result.statusMessage).toBe('OK');
});
it('should pass through a Buffer body without double-encoding', () => {
const originalBuffer = Buffer.from('raw-binary-data');
const mockResponse: EvalMockHttpResponse = {
body: originalBuffer,
headers: {},
statusCode: 200,
};
const result = serializeMockToHttpResponse(mockResponse);
// Same Buffer instance must be returned, not a re-encoded copy.
expect(result.body).toBe(originalBuffer);
expect(result.body.toString()).toBe('raw-binary-data');
});
});
// -----------------------------------------------------------------------
// normalizeLegacyRequest
// -----------------------------------------------------------------------
describe('normalizeLegacyRequest', () => {
it('should convert string URI + options object into IHttpRequestOptions', () => {
const options: IRequestOptions = {
method: 'POST',
headers: { Authorization: 'Bearer token' },
body: { key: 'value' },
qs: { page: '1' },
uri: 'ignored-when-string-first-arg',
};
const result = normalizeLegacyRequest('https://api.example.com/data', options);
expect(result.url).toBe('https://api.example.com/data');
expect(result.method).toBe('POST');
expect(result.headers).toEqual({ Authorization: 'Bearer token' });
expect(result.body).toEqual({ key: 'value' });
expect(result.qs).toEqual({ page: '1' });
});
it('should use uri field when given an IRequestOptions object with uri', () => {
const requestObj: IRequestOptions = {
uri: 'https://api.example.com/from-uri',
method: 'GET',
};
const result = normalizeLegacyRequest(requestObj);
expect(result.url).toBe('https://api.example.com/from-uri');
expect(result.method).toBe('GET');
});
it('should use url field when given an IRequestOptions object with url', () => {
const requestObj: IRequestOptions = {
url: 'https://api.example.com/from-url',
method: 'DELETE',
};
const result = normalizeLegacyRequest(requestObj);
expect(result.url).toBe('https://api.example.com/from-url');
expect(result.method).toBe('DELETE');
});
it('should fall back to empty string when neither uri nor url is present', () => {
const requestObj: IRequestOptions = {
method: 'PATCH',
};
const result = normalizeLegacyRequest(requestObj);
expect(result.url).toBe('');
});
it('should preserve method, headers, body, and qs from an IRequestOptions object', () => {
const requestObj: IRequestOptions = {
uri: 'https://api.example.com',
method: 'PUT',
headers: { 'Content-Type': 'application/json' },
body: { update: true },
qs: { version: '2' },
};
const result = normalizeLegacyRequest(requestObj);
expect(result.method).toBe('PUT');
expect(result.headers).toEqual({ 'Content-Type': 'application/json' });
expect(result.body).toEqual({ update: true });
expect(result.qs).toEqual({ version: '2' });
});
});
// -----------------------------------------------------------------------
// callEvalMockHandler
// -----------------------------------------------------------------------
describe('callEvalMockHandler', () => {
// Shared fixtures: a mocked node, a minimal GET request, and a 200 response.
const node = mock<INode>();
const requestOptions: IHttpRequestOptions = {
url: 'https://api.example.com/test',
method: 'GET',
};
const successResponse: EvalMockHttpResponse = {
body: { data: 'mocked' },
headers: { 'content-type': 'application/json' },
statusCode: 200,
};
it('should return body when handler responds and returnFullResponse is false', async () => {
const handler: EvalLlmMockHandler = jest.fn().mockResolvedValue(successResponse);
const result = await callEvalMockHandler(handler, requestOptions, node);
expect(result).toEqual({ data: 'mocked' });
});
it('should return serialized response when returnFullResponse is true', async () => {
const handler: EvalLlmMockHandler = jest.fn().mockResolvedValue(successResponse);
const result = await callEvalMockHandler(handler, requestOptions, node, true);
expect(result).toHaveProperty('body');
expect(result).toHaveProperty('headers');
expect(result).toHaveProperty('statusCode', 200);
const typedResult = result as ReturnType<typeof serializeMockToHttpResponse>;
expect(Buffer.isBuffer(typedResult.body)).toBe(true);
});
it('should return undefined when handler returns undefined', async () => {
// undefined means "no mock — pass through to real HTTP".
const handler: EvalLlmMockHandler = jest.fn().mockResolvedValue(undefined);
const result = await callEvalMockHandler(handler, requestOptions, node);
expect(result).toBeUndefined();
});
it('should throw axios-shaped error for status >= 400 with httpLibrary=axios', async () => {
const errorResponse: EvalMockHttpResponse = {
body: { error: 'Not Found' },
headers: { 'content-type': 'application/json' },
statusCode: 404,
};
const handler: EvalLlmMockHandler = jest.fn().mockResolvedValue(errorResponse);
await expect(
callEvalMockHandler(handler, requestOptions, node, false, 'axios'),
).rejects.toThrow('Request failed with status code 404');
});
it('should throw legacy-shaped error for status >= 400 with httpLibrary=legacy', async () => {
const errorResponse: EvalMockHttpResponse = {
body: { error: 'Server Error' },
headers: {},
statusCode: 500,
};
const handler: EvalLlmMockHandler = jest.fn().mockResolvedValue(errorResponse);
await expect(
callEvalMockHandler(handler, requestOptions, node, false, 'legacy'),
).rejects.toThrow('Request failed with status code 500');
});
it('should include isAxiosError=true on axios-shaped errors', async () => {
const errorResponse: EvalMockHttpResponse = {
body: { error: 'Bad Request' },
headers: { 'x-error': 'true' },
statusCode: 400,
};
const handler: EvalLlmMockHandler = jest.fn().mockResolvedValue(errorResponse);
try {
await callEvalMockHandler(handler, requestOptions, node, false, 'axios');
fail('Expected error to be thrown');
} catch (error: unknown) {
const err = error as Error & {
isAxiosError: boolean;
response: { status: number; data: unknown; headers: Record<string, string> };
};
expect(err.isAxiosError).toBe(true);
expect(err.response.status).toBe(400);
expect(err.response.data).toEqual({ error: 'Bad Request' });
expect(err.response.headers).toEqual({ 'x-error': 'true' });
}
});
it('should include statusCode and response.body on legacy-shaped errors', async () => {
const errorResponse: EvalMockHttpResponse = {
body: { error: 'Forbidden' },
headers: { 'x-reason': 'denied' },
statusCode: 403,
};
const handler: EvalLlmMockHandler = jest.fn().mockResolvedValue(errorResponse);
try {
await callEvalMockHandler(handler, requestOptions, node, false, 'legacy');
fail('Expected error to be thrown');
} catch (error: unknown) {
const err = error as Error & {
statusCode: number;
response: { statusCode: number; body: unknown; headers: Record<string, string> };
};
expect(err.statusCode).toBe(403);
expect(err.response.statusCode).toBe(403);
expect(err.response.body).toEqual({ error: 'Forbidden' });
expect(err.response.headers).toEqual({ 'x-reason': 'denied' });
}
});
it('should default to axios httpLibrary when not specified', async () => {
const errorResponse: EvalMockHttpResponse = {
body: { error: 'Unauthorized' },
headers: {},
statusCode: 401,
};
const handler: EvalLlmMockHandler = jest.fn().mockResolvedValue(errorResponse);
try {
await callEvalMockHandler(handler, requestOptions, node);
fail('Expected error to be thrown');
} catch (error: unknown) {
const err = error as Error & { isAxiosError: boolean };
expect(err.isAxiosError).toBe(true);
}
});
});
});

View file

@ -0,0 +1,157 @@
/**
* Eval mock helpers support code for LLM-based HTTP mocking during evaluation.
*
* Used by the credential bypass (node-execution-context.ts), HTTP interception
* (request-helper-functions.ts), and workflow validation bypass (workflow-execute.ts).
* All eval-specific logic in packages/core is either in this file or in the
* type definitions in index.ts (module augmentation).
*/
import type { IHttpRequestOptions, INode, IRequestOptions } from 'n8n-workflow';
import type { EvalLlmMockHandler, EvalMockHttpResponse } from './index';
// ---------------------------------------------------------------------------
// Mock credentials
// ---------------------------------------------------------------------------
// NOT A SECRET — throwaway RSA key used only in eval mode to satisfy OAuth
// signing requirements so HTTP requests reach the interception layer. This key
// has no access to any real service. It never leaves the process. The mock
// credential system needs a structurally valid key for jwt.sign() to succeed;
// without it, OAuth nodes crash before the HTTP interceptor can capture the request.
// Generated once offline via: openssl genrsa 2048
// prettier-ignore
const EVAL_MOCK_RSA_KEY =
'-----BEGIN RSA PRIVATE KEY-----\n' +
'MIIEpAIBAAKCAQEA0Z3VS5JJcds3xfn/ygWep4PAtGoRBh2hHiwxBgNlHOVMSMk7\n' +
'R1ueXBOwqmLMSsGCnl1kV2QLFG6mnMBOxJBbXGLuzJsFMDPCnZGfnJBfcCnxGYCE\n' +
'c0bO3GN/S4Lk1eTarfEDQC/k0GFyyGPMQ5rnmZxSOqX1MtVCoB5FEGnLJEMqNFDt\n' +
'tJmYMmzxR9Lgd7bVMOYG8xDT/PYWw28GdgNZhAIPqFVHqMjUFWC76Q8rA6OF4OU0\n' +
'S0IAejdh3LGAzMIjCMfmSBn+VaRzcBVoKBpZgN0a1YjqFCr8LpqpMIxLfm+7SIdB\n' +
'Z6YWxEeOwKoiMIB9drmHO2lNzSTmblOKMPmqJwIDAQABAoIBAC5RgZ+hBx7xHNaM\n' +
'pPgwGMnCd3KE2M8RMBx1bfOUEODjQx7E3fOtqqa4HNqVGz9HBfVzL4JBpYCknI1X\n' +
'p9Dxd6hf0Ht5BPMWxPBqKGhqCSxIxwvGLShDANGKbjilSTkmhGBDrGj3U0DRXKxmU\n' +
'i6jDP0VJwy9ZmkBqxJvYEhW0m+fQd0JJKQ5HRk2RNXoP+GBmZsBeIs4uAt14i6n4\n' +
'kfYCR9CMSBC6DlNWxqGSAWzPrKAMPMiL5GJWGhy+A4DEXPewYQ6LpbD4xXEJN2v7\n' +
'Tae0YYjM/B7oy3JV5UsMaQKBgQDjYKMcn8io6Ei7RDYH8sMpKLejIEjE7ksMvYCk\n' +
'1RGx/w0Q3n5FVjMP3oG3UcUx9EB7GD8NMo74J/lEJ2UsBnIP3ggOb3AE+pWHNE0K\n' +
'-----END RSA PRIVATE KEY-----';
/**
 * Build mock credentials for eval mode from the credential type's property definitions.
 *
 * Every declared property is filled with the placeholder `'eval-mock-value'`.
 * Auth-related fields (OAuth token data, an RSA private key) that nodes may
 * need beyond the UI properties are always added on top — nodes pick what
 * they need and ignore the rest.
 */
export function buildEvalMockCredentials(
properties: Array<{ name: string }>,
): Record<string, unknown> {
// Placeholder values for every declared credential property.
const fromProperties = Object.fromEntries(
properties.map(({ name }) => [name, 'eval-mock-value']),
);
return {
...fromProperties,
// Structurally valid OAuth token payload so OAuth nodes don't crash early.
oauthTokenData: {
access_token: 'eval-mock-access-token',
token_type: 'Bearer',
refresh_token: 'eval-mock-refresh-token',
},
// Structurally valid RSA key for jwt.sign() — see note above, not a secret.
privateKey: EVAL_MOCK_RSA_KEY,
};
}
// ---------------------------------------------------------------------------
// HTTP response helpers
// ---------------------------------------------------------------------------
/**
 * Convert an EvalMockHttpResponse into the full-response shape that callers expect
 * when `returnFullResponse` / `resolveWithFullResponse` is true.
 *
 * The body is serialized to a Buffer so downstream processing (binary detection,
 * encoding detection, stream handling) works exactly as with a real HTTP response.
 * Buffers pass through untouched, strings are encoded directly, and any other
 * value is JSON-stringified first. `statusMessage` is always 'OK'.
 */
export function serializeMockToHttpResponse(mock: EvalMockHttpResponse) {
const { body: rawBody, headers, statusCode } = mock;
let body: Buffer;
if (rawBody instanceof Buffer) {
// Already binary — avoid double-encoding.
body = rawBody;
} else if (typeof rawBody === 'string') {
body = Buffer.from(rawBody);
} else {
body = Buffer.from(JSON.stringify(rawBody));
}
return { body, headers, statusCode, statusMessage: 'OK' };
}
/**
 * Normalize legacy IRequestOptions or (uri, options) args into IHttpRequestOptions
 * for the eval mock handler.
 *
 * When the first argument is a string it wins as the URL and any `uri`/`url`
 * inside `options` is ignored; otherwise the URL is taken from `uri`, falling
 * back to `url`, then to ''.
 */
export function normalizeLegacyRequest(
uriOrObject: string | IRequestOptions,
options?: IRequestOptions,
): IHttpRequestOptions {
const calledWithString = typeof uriOrObject === 'string';
// The object holding method/headers/body/qs — `options` in the (uri, options)
// form, the request object itself otherwise.
const source = calledWithString ? (options ?? {}) : uriOrObject;
const url = calledWithString ? uriOrObject : (uriOrObject.uri ?? uriOrObject.url ?? '');
return {
url,
method: source.method,
headers: source.headers,
body: source.body as IHttpRequestOptions['body'],
qs: source.qs,
};
}
/**
 * Call the eval mock handler and format its response for the calling helper.
 *
 * When `returnFullResponse` is true, serializes to `{ body: Buffer, headers, statusCode }`
 * matching the shape that nodes expect from real HTTP responses.
 * For error responses (status >= 400), throws an error matching the HTTP library's
 * error shape so nodes handle it identically to real HTTP failures.
 * Returns `undefined` if the handler did not produce a response.
 */
export async function callEvalMockHandler(
handler: EvalLlmMockHandler,
requestOptions: IHttpRequestOptions,
node: INode,
returnFullResponse?: boolean,
httpLibrary: 'axios' | 'legacy' = 'axios',
): Promise<unknown> {
const mockResponse = await handler(requestOptions, node);
// No mock produced — the caller falls through to a real HTTP request.
if (!mockResponse) return undefined;
// Error statuses are surfaced as thrown errors, like the real libraries do.
if (mockResponse.statusCode >= 400) {
throwHttpError(mockResponse, httpLibrary);
}
if (returnFullResponse) {
return serializeMockToHttpResponse(mockResponse);
}
return mockResponse.body;
}
/**
 * Throw an error matching what the real HTTP library would throw,
 * so node error handling (retries, continueOnFail, NodeApiError) works identically.
 */
function throwHttpError(response: EvalMockHttpResponse, library: 'axios' | 'legacy'): never {
const message = `Request failed with status code ${response.statusCode}`;
const extras =
library === 'axios'
? // Match AxiosError shape: error.response.{status, data, headers}, error.isAxiosError
{
isAxiosError: true,
response: {
status: response.statusCode,
statusText: message,
data: response.body,
headers: response.headers,
},
}
: // Match legacy request-promise error shape: error.statusCode, error.response.body
{
statusCode: response.statusCode,
response: {
statusCode: response.statusCode,
body: response.body,
headers: response.headers,
},
};
throw Object.assign(new Error(message), extras);
}

View file

@ -2,6 +2,8 @@ import type {
DataTableProxyProvider,
DynamicCredentialCheckProxyProvider,
IExecutionContext,
IHttpRequestOptions,
INode,
IWorkflowSettings,
Result,
} from 'n8n-workflow';
@ -10,6 +12,23 @@ import type { LookupFunction } from 'node:net';
import type { ExecutionLifecycleHooks } from './execution-lifecycle-hooks';
import type { ExternalSecretsProxy } from './external-secrets-proxy';
/** Standardized mock HTTP response returned by the eval mock handler. */
export interface EvalMockHttpResponse {
/** Response payload — a JSON value, a string, or a Buffer. */
body: unknown;
/** HTTP response headers as a name → value map. */
headers: Record<string, string>;
/** HTTP status code; values >= 400 are surfaced to nodes as thrown errors. */
statusCode: number;
}
/**
 * Handler for LLM-based HTTP mocking during evaluation.
 * Receives the fully-built request (after credential auth) and the executing node.
 * Return a full mock response, or `undefined` to pass through to real HTTP.
 */
export type EvalLlmMockHandler = (
requestOptions: IHttpRequestOptions,
node: INode,
) => Promise<EvalMockHttpResponse | undefined>;
export type SsrfCheckResult = Result<void, Error>;
/**
@ -35,6 +54,13 @@ declare module 'n8n-workflow' {
externalSecretProviderKeysAccessibleByCredential?: Set<string>;
/** SSRF protection bridge — present only when N8N_SSRF_PROTECTION_ENABLED=true */
ssrfBridge?: SsrfBridge;
/**
* LLM-based HTTP mock handler for evaluation mode.
* When set, HTTP requests from service nodes are intercepted and routed
* through this handler instead of making real API calls.
* Only set by the eval execution service — never present in normal executions.
*/
evalLlmMockHandler?: EvalLlmMockHandler;
'data-table'?: { dataTableProxyProvider: DataTableProxyProvider };
'dynamic-credentials'?: { credentialCheckProxy: DynamicCredentialCheckProxyProvider };
// Project ID is currently only added on the additionalData if the user

View file

@ -304,6 +304,18 @@ export abstract class NodeExecutionContext implements Omit<FunctionsBase, 'getCr
itemIndex?: number,
): Promise<T> {
const { workflow, node, additionalData, mode, runExecutionData, runIndex } = this;
// Eval-mode bypass: when executing with LLM mock handler and node has no credentials
// configured, return mock credentials built from the credential type's property definitions.
// This allows the node to execute (build requests, parse responses) without real credentials.
// Triple-gated: evaluation mode + mock handler present + credentials actually missing.
if (mode === 'evaluation' && additionalData.evalLlmMockHandler && !node.credentials?.[type]) {
const { buildEvalMockCredentials } = await import('../eval-mock-helpers');
return buildEvalMockCredentials(
additionalData.credentialsHelper.getCredentialsProperties(type),
) as T;
}
// Get the NodeType as it has the information if the credentials are required
const nodeType = workflow.nodeTypes.getByNameAndVersion(node.type, node.typeVersion);

View file

@ -67,6 +67,7 @@ import { stringify } from 'qs';
import { Readable } from 'stream';
import type { SsrfBridge } from '@/execution-engine';
import { callEvalMockHandler, normalizeLegacyRequest } from '@/execution-engine/eval-mock-helpers';
import type { IResponseError } from '@/interfaces';
import { binaryToString } from './binary-helper-functions';
@ -1061,6 +1062,18 @@ export async function httpRequestWithAuthentication(
}
let credentialsDecrypted: ICredentialDataDecryptedObject | undefined;
// Eval LLM mock: intercept before credential auth and OAuth signing
if (additionalData.evalLlmMockHandler) {
const evalMockResponse = await callEvalMockHandler(
additionalData.evalLlmMockHandler,
requestOptions,
node,
requestOptions.returnFullResponse,
);
if (evalMockResponse !== undefined) return evalMockResponse;
}
try {
const parentTypes = additionalData.credentialsHelper.getParentTypes(credentialsType);
@ -1148,7 +1161,6 @@ export async function httpRequestWithAuthentication(
node,
);
}
// retry the request
return await httpRequest(requestOptions, additionalData.ssrfBridge);
} catch (error) {
throw new NodeApiError(this.getNode(), error);
@ -1174,6 +1186,18 @@ export async function requestWithAuthentication(
let credentialsDecrypted: ICredentialDataDecryptedObject | undefined;
// Eval LLM mock: intercept before credential auth and OAuth signing (legacy path)
if (additionalData.evalLlmMockHandler) {
const evalMockResponse = await callEvalMockHandler(
additionalData.evalLlmMockHandler,
normalizeLegacyRequest(requestOptions),
node,
requestOptions.resolveWithFullResponse,
'legacy',
);
if (evalMockResponse !== undefined) return evalMockResponse;
}
try {
const parentTypes = additionalData.credentialsHelper.getParentTypes(credentialsType);
@ -1254,7 +1278,6 @@ export async function requestWithAuthentication(
workflow,
node,
)) as IRequestOptions;
// retry the request
return await proxyRequestToAxios(workflow, additionalData, node, requestOptions);
}
}
@ -1518,9 +1541,22 @@ export const getRequestHelperFunctions = (
return responseData;
}
// Eval LLM mock handler: extract once for use in direct helpers below
const evalLlmMock = additionalData.evalLlmMockHandler;
return {
httpRequest: async (requestOptions: IHttpRequestOptions) =>
await httpRequest(requestOptions, additionalData.ssrfBridge),
httpRequest: async (requestOptions: IHttpRequestOptions) => {
if (evalLlmMock) {
const evalMockResponse = await callEvalMockHandler(
evalLlmMock,
requestOptions,
node,
requestOptions.returnFullResponse,
);
if (evalMockResponse !== undefined) return evalMockResponse;
}
return await httpRequest(requestOptions, additionalData.ssrfBridge);
},
requestWithAuthenticationPaginated,
async httpRequestWithAuthentication(
this,
@ -1553,8 +1589,20 @@ export const getRequestHelperFunctions = (
);
},
request: async (uriOrObject, options) =>
await proxyRequestToAxios(workflow, additionalData, node, uriOrObject, options),
request: async (uriOrObject, options) => {
if (evalLlmMock) {
const wantsFull = typeof uriOrObject !== 'string' && uriOrObject.resolveWithFullResponse;
const evalMockResponse = await callEvalMockHandler(
evalLlmMock,
normalizeLegacyRequest(uriOrObject, options),
node,
wantsFull,
'legacy',
);
if (evalMockResponse !== undefined) return evalMockResponse;
}
return await proxyRequestToAxios(workflow, additionalData, node, uriOrObject, options);
},
async requestWithAuthentication(
this,
@ -1580,6 +1628,16 @@ export const getRequestHelperFunctions = (
credentialsType: string,
requestOptions: IRequestOptions,
): Promise<any> {
if (evalLlmMock) {
const evalMockResponse = await callEvalMockHandler(
evalLlmMock,
normalizeLegacyRequest(requestOptions),
node,
requestOptions.resolveWithFullResponse,
'legacy',
);
if (evalMockResponse !== undefined) return evalMockResponse;
}
return await requestOAuth1.call(this, credentialsType, requestOptions);
},
@ -1589,6 +1647,16 @@ export const getRequestHelperFunctions = (
requestOptions: IRequestOptions,
oAuth2Options?: IOAuth2Options,
): Promise<any> {
if (evalLlmMock) {
const evalMockResponse = await callEvalMockHandler(
evalLlmMock,
normalizeLegacyRequest(requestOptions),
node,
requestOptions.resolveWithFullResponse,
'legacy',
);
if (evalMockResponse !== undefined) return evalMockResponse;
}
return await requestOAuth2.call(
this,
credentialsType,

View file

@ -1578,6 +1578,9 @@ importers:
'@mozilla/readability':
specifier: ^0.6.0
version: 0.6.0
'@n8n/agents':
specifier: workspace:*
version: link:../agents
'@n8n/api-types':
specifier: workspace:*
version: link:../api-types
@ -1587,6 +1590,9 @@ importers:
'@n8n/workflow-sdk':
specifier: workspace:*
version: link:../workflow-sdk
flatted:
specifier: 3.4.2
version: 3.4.2
langsmith:
specifier: '>=0.4.6'
version: 0.4.12(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.213.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.6.0(@opentelemetry/api@1.9.0))(openai@6.19.0(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))
@ -1624,6 +1630,9 @@ importers:
'@types/turndown':
specifier: ^5.0.5
version: 5.0.6
tsx:
specifier: 'catalog:'
version: 4.19.3
packages/@n8n/json-schema-to-zod:
devDependencies: