test: Add Instance AI workflow evals CI pipeline (no-changelog) (#28366)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
José Braulio González Valido 2026-04-20 15:15:41 +01:00 committed by GitHub
parent 73d93d4edf
commit 560f300716
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 557 additions and 669 deletions

View file

@ -27,6 +27,7 @@ jobs:
db: ${{ fromJSON(steps.ci-filter.outputs.results).db == true }}
performance: ${{ fromJSON(steps.ci-filter.outputs.results).performance == true }}
e2e_performance: ${{ fromJSON(steps.ci-filter.outputs.results)['e2e-performance'] == true }}
instance_ai_workflow_eval: ${{ fromJSON(steps.ci-filter.outputs.results)['instance-ai-workflow-eval'] == true }}
commit_sha: ${{ steps.commit-sha.outputs.sha }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@ -70,6 +71,12 @@ jobs:
packages/testing/playwright/utils/performance-helper.ts
packages/testing/containers/**
.github/workflows/test-e2e-performance-reusable.yml
instance-ai-workflow-eval:
packages/@n8n/instance-ai/src/**
packages/@n8n/instance-ai/evaluations/**
packages/cli/src/modules/instance-ai/**
packages/core/src/execution-engine/eval-mock-helpers.ts
.github/workflows/test-evals-instance-ai*.yml
db:
packages/cli/src/databases/**
packages/cli/src/modules/*/database/**
@ -196,6 +203,18 @@ jobs:
ref: ${{ needs.install-and-build.outputs.commit_sha }}
secrets: inherit
instance-ai-workflow-evals:
name: Instance AI Workflow Evals
needs: install-and-build
if: >-
needs.install-and-build.outputs.instance_ai_workflow_eval == 'true' &&
github.repository == 'n8n-io/n8n' &&
(github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork)
uses: ./.github/workflows/test-evals-instance-ai.yml
with:
branch: ${{ needs.install-and-build.outputs.commit_sha }}
secrets: inherit
# This job is required by GitHub branch protection rules.
# PRs cannot be merged unless this job passes.
required-checks:

View file

@ -0,0 +1,141 @@
# Reusable workflow: runs the Instance AI workflow evals against a locally
# built Docker image of n8n. Callable from CI (workflow_call) or manually
# (workflow_dispatch). The eval job itself is non-blocking; results are
# posted as a PR comment and uploaded as an artifact.
name: 'Test: Instance AI Exec Evals'

on:
  workflow_call:
    inputs:
      branch:
        description: 'GitHub branch to test'
        required: false
        type: string
        default: 'master'
      filter:
        description: 'Filter test cases by name (e.g. "contact-form")'
        required: false
        type: string
        default: ''
  workflow_dispatch:
    inputs:
      branch:
        description: 'GitHub branch to test'
        required: false
        default: 'master'
      filter:
        description: 'Filter test cases by name (e.g. "contact-form")'
        required: false
        default: ''

jobs:
  run-evals:
    name: 'Run Evals'
    runs-on: blacksmith-4vcpu-ubuntu-2204
    timeout-minutes: 45
    permissions:
      contents: read
      # Needed to post/update the eval results comment on the PR.
      pull-requests: write
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          ref: ${{ inputs.branch || github.ref }}
          fetch-depth: 1

      - name: Setup Environment
        uses: ./.github/actions/setup-nodejs
        with:
          build-command: 'pnpm build'

      - name: Build Docker image
        run: pnpm build:docker
        env:
          # Bakes the E2E test controller into the image so /rest/e2e/reset works.
          INCLUDE_TEST_CONTROLLER: 'true'

      - name: Start n8n container
        run: |
          docker run -d --name n8n-eval \
            -e E2E_TESTS=true \
            -e N8N_ENABLED_MODULES=instance-ai \
            -e N8N_AI_ENABLED=true \
            -e N8N_INSTANCE_AI_MODEL_API_KEY=${{ secrets.EVALS_ANTHROPIC_KEY }} \
            -e N8N_LICENSE_ACTIVATION_KEY=${{ secrets.N8N_LICENSE_ACTIVATION_KEY }} \
            -e N8N_LICENSE_CERT=${{ secrets.N8N_LICENSE_CERT }} \
            -e N8N_ENCRYPTION_KEY=${{ secrets.N8N_ENCRYPTION_KEY }} \
            -p 5678:5678 \
            n8nio/n8n:local
          echo "Waiting for n8n to be ready..."
          # Poll the readiness endpoint for up to 60 seconds.
          for i in $(seq 1 60); do
            if curl -s http://localhost:5678/healthz/readiness -o /dev/null -w "%{http_code}" | grep -q 200; then
              echo "n8n ready after ${i}s"
              exit 0
            fi
            sleep 1
          done
          echo "::error::n8n failed to start within 60s"
          docker logs n8n-eval --tail 30
          exit 1

      - name: Create test user
        run: |
          curl -sf -X POST http://localhost:5678/rest/e2e/reset \
            -H "Content-Type: application/json" \
            -d '{
              "owner":{"email":"nathan@n8n.io","password":"PlaywrightTest123","firstName":"Eval","lastName":"Owner"},
              "admin":{"email":"admin@n8n.io","password":"PlaywrightTest123","firstName":"Admin","lastName":"User"},
              "members":[],
              "chat":{"email":"chat@n8n.io","password":"PlaywrightTest123","firstName":"Chat","lastName":"User"}
            }'

      - name: Run Instance AI Evals
        # Non-blocking: eval failures must not fail the pipeline.
        continue-on-error: true
        working-directory: packages/@n8n/instance-ai
        # NOTE(review): inputs.filter is interpolated into the shell command;
        # a filter value containing quotes would break (or inject into) the
        # command line. Acceptable for trusted dispatch inputs — confirm.
        run: >-
          pnpm eval:instance-ai
          --base-url http://localhost:5678
          --verbose
          ${{ inputs.filter && format('--filter "{0}"', inputs.filter) || '' }}
        env:
          N8N_INSTANCE_AI_MODEL_API_KEY: ${{ secrets.EVALS_ANTHROPIC_KEY }}

      - name: Stop n8n container
        if: ${{ always() }}
        # Run stop and rm independently: with `stop && rm || true`, a failed
        # stop (e.g. container never started) would skip the rm and leave the
        # `n8n-eval` name allocated for subsequent runs on the same runner.
        run: |
          docker stop n8n-eval || true
          docker rm n8n-eval || true

      - name: Post eval results to PR
        if: ${{ always() && github.event_name == 'pull_request' }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          RESULTS_FILE="packages/@n8n/instance-ai/eval-results.json"
          if [ ! -f "$RESULTS_FILE" ]; then
            echo "No eval results file found"
            exit 0
          fi
          # Build the full comment body with jq
          jq -r '
            "### Instance AI Workflow Eval Results\n\n" +
            "**\(.summary.built)/\(.summary.testCases) built | \(.summary.scenariosPassed)/\(.summary.scenariosTotal) passed (\(.summary.passRate * 100 | floor)%)**\n\n" +
            "| Workflow | Build | Passed |\n|---|---|---|\n" +
            ([.testCases[] | "| \(.name) | \(if .built then "✅" else "❌" end) | \([.scenarios[] | select(.passed)] | length)/\(.scenarios | length) |"] | join("\n")) +
            "\n\n<details><summary>Failure details</summary>\n\n" +
            ([.testCases[].scenarios[] | select(.passed == false) | "**\(.name)** \(if .failureCategory then "[\(.failureCategory)]" else "" end)\n> \(.reasoning | .[0:200])\n"] | join("\n")) +
            "\n</details>"
          ' "$RESULTS_FILE" > /tmp/eval-comment.md
          # Find and update existing eval comment, or create new one
          COMMENT_ID=$(gh api "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \
            --jq '.[] | select(.body | startswith("### Instance AI Workflow Eval")) | .id' | tail -1)
          if [ -n "$COMMENT_ID" ]; then
            gh api "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}" -X PATCH -F body=@/tmp/eval-comment.md
          else
            gh pr comment "${{ github.event.pull_request.number }}" --body-file /tmp/eval-comment.md
          fi

      - name: Upload Results
        if: ${{ always() }}
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
          name: instance-ai-workflow-eval-results
          path: packages/@n8n/instance-ai/eval-results.json
          retention-days: 14

2
.gitignore vendored
View file

@ -33,6 +33,8 @@ test-results.json
*.0x
packages/testing/playwright/playwright-report
packages/testing/playwright/test-results
packages/testing/playwright/eval-results.json
packages/@n8n/instance-ai/eval-results.json
packages/testing/playwright/.playwright-browsers
packages/testing/playwright/.playwright-cli
test-results/

View file

@ -2,19 +2,49 @@
Tests whether workflows built by Instance AI actually work by executing them with LLM-generated mock HTTP responses.
## Quick start
## Running evals
### CLI
```bash
# From packages/@n8n/instance-ai/
# From packages/@n8n/instance-ai/, with n8n running via pnpm dev:ai
# Run all test cases
dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai workflows --verbose
dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --verbose
# Run a single test case
dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai workflows --filter contact-form --verbose
dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --filter contact-form --verbose
# Keep built workflows for inspection
dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --filter contact-form --keep-workflows --verbose
```
The n8n server must be running with `N8N_ENABLED_MODULES=instance-ai`.
Results are printed to the console and written to `eval-results.json`.
### Docker (without pnpm dev:ai)
```bash
# Build the Docker image
INCLUDE_TEST_CONTROLLER=true pnpm build:docker
# Start a container
docker run -d --name n8n-eval \
-e E2E_TESTS=true \
-e N8N_ENABLED_MODULES=instance-ai \
-e N8N_AI_ENABLED=true \
-e N8N_INSTANCE_AI_MODEL_API_KEY=your-key \
-p 5678:5678 \
n8nio/n8n:local
# Run evals against it
pnpm eval:instance-ai --base-url http://localhost:5678 --verbose
```
### CI
Evals run automatically on PRs that change Instance AI code (path-filtered). The CI workflow starts a single Docker container and runs the CLI against it. See `.github/workflows/test-evals-instance-ai.yml`.
The eval job is **non-blocking**. Results are posted as a PR comment and uploaded as artifacts.
### Environment variables
@ -22,9 +52,9 @@ Set these in `.env.local`:
| Variable | Required | Description |
|----------|----------|-------------|
| `N8N_INSTANCE_AI_MODEL_API_KEY` | Yes | Anthropic API key — shared with the Instance AI agent and used for Phase 1 hints, Phase 2 mock generation, and verification |
| `N8N_EVAL_EMAIL` | Yes | n8n login email for the eval runner |
| `N8N_EVAL_PASSWORD` | Yes | n8n login password |
| `N8N_INSTANCE_AI_MODEL_API_KEY` | Yes | Anthropic API key for the Instance AI agent, mock generation, and verification |
| `N8N_EVAL_EMAIL` | No | n8n login email (defaults to E2E test owner) |
| `N8N_EVAL_PASSWORD` | No | n8n login password (defaults to E2E test owner) |
| `CONTEXT7_API_KEY` | No | Context7 API key for higher rate limits on API doc lookups. Free tier is 1,000 req/month |
## How it works
@ -34,7 +64,7 @@ Each test run:
1. **Build** — sends the test case prompt to Instance AI, which builds a workflow
2. **Phase 1** — analyzes the workflow and generates consistent mock data hints (one Sonnet call per scenario)
3. **Phase 2** — executes the workflow with all HTTP requests intercepted. Each request goes to an LLM that generates a realistic API response using the node's configuration and API documentation from Context7
4. **Verify** — an LLM evaluates whether the scenario's success criteria were met, categorizes any failure as `builder_issue`, `mock_issue`, `legitimate_failure`, or `verification_gap`
4. **Verify** — an LLM evaluates whether the scenario's success criteria were met and categorizes any failure by root cause (see Failure categories below)
### What gets mocked
@ -83,66 +113,43 @@ Test cases live in `evaluations/data/workflows/*.json`:
- Edge cases — empty data, missing fields, single vs multiple items
- Error scenarios only if the workflow is expected to handle them gracefully. Most agent-built workflows don't include error handling, so testing "the workflow crashes on invalid input" is a legitimate finding, not a test case failure.
## Understanding the report
Each run generates a timestamped HTML report in `.data/` plus a stable `workflow-eval-report.html`.
### Failure categories
## Failure categories
When a scenario fails, the verifier categorizes the root cause:
- **builder_issue** (amber) — the agent misconfigured a node, chose the wrong node type, or the workflow structure doesn't match what was asked. Examples: Switch node missing required `conditions.options`, Linear node not querying `creator.email`, missing error handling.
- **mock_issue** (red) — the LLM mock returned incorrect data. Examples: `_evalMockError` (JSON parse failure), wrong response shape for the endpoint, identical responses for repeated calls.
- **legitimate_failure** — the workflow genuinely doesn't meet the success criteria. Neither builder nor mock is at fault.
- **verification_gap** — not enough information to determine the cause.
### Report sections
- **Dashboard** — pass rate, counts at a glance
- **Scenario indicators** — inline pass/fail on the collapsed test case card
- **Built workflow** — node list with execution modes and config issues
- **Agent output** — raw workflow JSON for cross-run comparison
- **Execution trace** — per-node detail with request/response pairs for mocked nodes
- **Mock data plan** — Phase 1 hints (global context, trigger content, per-node hints)
- **Diagnosis** — verifier reasoning with failure category and root cause
## Known limitations
- **LangChain/AI nodes** — use their own SDKs, not intercepted by the HTTP mock layer. These nodes will fail with credential errors. Use pin data for these (tracked in AI-2297).
- **GraphQL APIs** — response shape depends on the query, not just the endpoint. The mock handles this when the request body (containing the query) is passed to the LLM, but quality depends on the LLM knowing the API schema.
- **Context7 quota** — free tier is 1,000 requests/month, 60/hour. A full suite run uses ~100 requests. Set `CONTEXT7_API_KEY` for sustained use. When quota is exceeded, a warning is logged and the LLM falls back to its training data.
- **Non-determinism** — the agent builds different workflows each run. Some configurations work, some don't. Contact Form is stable at 5/5. Other test cases vary based on how the agent configures nodes.
- **Switch/IF nodes** — the agent sometimes builds these without the required `conditions.options` block, causing a `caseSensitive` runtime crash. This is a known agent builder issue.
- **builder_issue** — the agent misconfigured a node, chose the wrong node type, or the workflow structure doesn't match what was asked
- **mock_issue** — the LLM mock returned incorrect data (e.g., `_evalMockError`, wrong response shape)
- **framework_issue** — Phase 1 failed (empty trigger content), cascading errors from the eval framework itself
- **verification_failure** — the LLM verifier couldn't produce a valid result
- **build_failure** — Instance AI failed to build the workflow or a scenario timed out
## Architecture
```
evaluations/
├── index.ts # Public API
├── cli/ # CLI entry point and args parsing
├── clients/ # n8n REST + SSE clients
├── checklist/ # Verification (programmatic + LLM)
├── checklist/ # LLM verification with retry
├── credentials/ # Test credential seeding
├── data/
│ ├── prompts.ts # Original prompt-based eval prompts
│ └── workflows/ # Workflow test case JSON files
├── harness/ # Runner orchestration
├── outcome/ # Outcome extraction (original flow)
├── execution/ # Post-build execution (original flow)
├── report/ # HTML report generators
└── system-prompts/ # LLM prompts (builder-* for original flow, mock-* for mock execution)
├── data/workflows/ # Test case JSON files
├── harness/ # Runner: buildWorkflow, executeScenario, cleanupBuild
├── outcome/ # SSE event parsing, workflow discovery
└── system-prompts/ # LLM prompts for verification
packages/cli/src/modules/instance-ai/eval/
├── execution.service.ts # Phase 1 + Phase 2 orchestration
├── workflow-analysis.ts # Hint generation (Phase 1)
├── mock-handler.ts # Per-request mock generation (Phase 2)
├── api-docs.ts # Context7 API doc fetcher
├── node-config.ts # Node config serializer
├── pin-data-generator.ts # LLM pin data for bypass nodes (Phase 1.5)
packages/core/src/execution-engine/
├── eval-mock-helpers.ts # HTTP interception utilities
├── execution.service.ts # Phase 1 + Phase 2 orchestration
├── workflow-analysis.ts # Hint generation (Phase 1)
├── mock-handler.ts # Per-request mock generation (Phase 2)
├── api-docs.ts # Context7 API doc fetcher
├── node-config.ts # Node config serializer
└── pin-data-generator.ts # LLM pin data for bypass nodes (Phase 1.5)
```
Two evaluation approaches coexist:
- **Original** (`pnpm eval:instance-ai`) — prompt-based builder evaluation using checklists
- **Workflow test cases** (`pnpm eval:instance-ai workflows`) — mock execution evaluation
## Known limitations
- **LangChain/AI nodes** — use their own SDKs, not intercepted by the HTTP mock layer. These nodes will fail with credential errors. Use pin data for these.
- **GraphQL APIs** — response shape depends on the query, not just the endpoint. Quality depends on the LLM knowing the API schema.
- **Context7 quota** — free tier is 1,000 requests/month, 60/hour. A full suite run uses ~100 requests. When quota is exceeded, the LLM falls back to its training data.
- **Non-determinism** — the agent builds different workflows each run. Pass rates vary between 40-65%.
- **Large workflows** — the verification artifact includes full execution traces. For complex workflows (12+ nodes) this can hit token limits. See TRUST-43 for the tool-based verifier approach.

View file

@ -25,16 +25,17 @@ const checklistResultSchema = z.object({
// Public API
// ---------------------------------------------------------------------------
const MAX_VERIFY_ATTEMPTS = 2;
export async function verifyChecklist(
checklist: ChecklistItem[],
verificationArtifact: string,
_workflowJsons: WorkflowResponse[],
): Promise<ChecklistResult[]> {
const llmItems = checklist.filter((i) => i.strategy === 'llm');
const results: ChecklistResult[] = [];
if (llmItems.length === 0) return [];
if (llmItems.length > 0) {
const userMessage = `## Checklist
const userMessage = `## Checklist
${JSON.stringify(llmItems, null, 2)}
@ -44,6 +45,9 @@ ${verificationArtifact}
Verify each checklist item against the artifact above.`;
const validIds = new Set(llmItems.map((i) => i.id));
for (let attempt = 0; attempt < MAX_VERIFY_ATTEMPTS; attempt++) {
const agent = createEvalAgent('eval-checklist-verifier', {
instructions: MOCK_EXECUTION_VERIFY_PROMPT,
cache: true,
@ -51,8 +55,8 @@ Verify each checklist item against the artifact above.`;
const result = await agent.generate(userMessage);
const validIds = new Set(llmItems.map((i) => i.id));
const parsed = result.structuredOutput as z.infer<typeof checklistResultSchema> | undefined;
const results: ChecklistResult[] = [];
if (parsed?.results) {
for (const entry of parsed.results) {
@ -66,20 +70,19 @@ Verify each checklist item against the artifact above.`;
pass: entry.pass,
reasoning: entry.reasoning ?? '',
strategy: 'llm',
failureCategory: entry.failureCategory,
failureCategory:
entry.failureCategory ?? (!entry.pass ? 'verification_failure' : undefined),
rootCause: entry.rootCause,
});
}
}
} else {
console.warn(
'[verifier] structuredOutput returned null — LLM did not produce parseable results',
);
}
if (results.length > 0) {
results.sort((a, b) => a.id - b.id);
return results;
}
}
// Sort results by id for deterministic output
results.sort((a, b) => a.id - b.id);
return results;
return [];
}

View file

@ -21,6 +21,8 @@ export interface CliArgs {
filter?: string;
/** Keep built workflows after evaluation instead of deleting them */
keepWorkflows: boolean;
/** Directory to write eval-results.json (defaults to cwd) */
outputDir?: string;
}
// ---------------------------------------------------------------------------
@ -35,6 +37,7 @@ const cliArgsSchema = z.object({
verbose: z.boolean().default(false),
filter: z.string().optional(),
keepWorkflows: z.boolean().default(false),
outputDir: z.string().optional(),
});
// ---------------------------------------------------------------------------
@ -53,6 +56,7 @@ export function parseCliArgs(argv: string[]): CliArgs {
verbose: validated.verbose,
filter: validated.filter,
keepWorkflows: validated.keepWorkflows,
outputDir: validated.outputDir,
};
}
@ -68,6 +72,7 @@ interface RawArgs {
verbose: boolean;
filter?: string;
keepWorkflows: boolean;
outputDir?: string;
}
function parseRawArgs(argv: string[]): RawArgs {
@ -76,6 +81,7 @@ function parseRawArgs(argv: string[]): RawArgs {
baseUrl: 'http://localhost:5678',
verbose: false,
keepWorkflows: false,
outputDir: undefined,
};
for (let i = 0; i < argv.length; i++) {
@ -115,6 +121,11 @@ function parseRawArgs(argv: string[]): RawArgs {
result.keepWorkflows = true;
break;
case '--output-dir':
result.outputDir = nextArg(argv, i, '--output-dir');
i++;
break;
default:
// Ignore unknown flags
break;

View file

@ -1,4 +1,7 @@
#!/usr/bin/env node
import { mkdirSync, writeFileSync } from 'fs';
import { join } from 'path';
import { parseCliArgs } from './args';
import { N8nClient } from '../clients/n8n-client';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
@ -6,7 +9,7 @@ import { loadWorkflowTestCases } from '../data/workflows';
import { createLogger } from '../harness/logger';
import { runWorkflowTestCase, runWithConcurrency } from '../harness/runner';
import { snapshotWorkflowIds } from '../outcome/workflow-discovery';
import { writeWorkflowReport } from '../report/workflow-report';
import type { WorkflowTestCaseResult } from '../types';
async function main(): Promise<void> {
const args = parseCliArgs(process.argv.slice(2));
@ -31,7 +34,7 @@ async function main(): Promise<void> {
logger.success('Authenticated');
logger.info('Seeding credentials...');
const seedResult = await seedCredentials(client);
const seedResult = await seedCredentials(client, undefined, logger);
logger.info(`Seeded ${String(seedResult.credentialIds.length)} credential(s)`);
const preRunWorkflowIds = await snapshotWorkflowIds(client);
@ -40,8 +43,8 @@ async function main(): Promise<void> {
// Run test cases with bounded concurrency.
// Each test case builds a workflow (uses n8n's agent) then runs scenarios
// (uses our Anthropic key for Phase 1 + Phase 2 mock generation).
// At Tier 4 (20K RPM) no practical limit is needed — set high to run all in parallel.
const MAX_CONCURRENT_TEST_CASES = 4;
const startTime = Date.now();
let results;
try {
results = await runWithConcurrency(
@ -64,12 +67,60 @@ async function main(): Promise<void> {
await cleanupCredentials(client, seedResult.credentialIds).catch(() => {});
}
// Generate HTML report
const reportPath = writeWorkflowReport(results);
console.log(`Report: ${reportPath}`);
const totalDuration = Date.now() - startTime;
// Print summary
console.log('\n=== Workflow Test Case Results ===\n');
// Write eval-results.json for CI consumption (PR comments, artifacts)
const outputPath = writeEvalResults(results, totalDuration, args.outputDir);
console.log(`Results: ${outputPath}`);
// Print console summary
printSummary(results);
}
/**
 * Write structured JSON results for CI (PR comments, artifact upload).
 *
 * Produces an `eval-results.json` file whose shape matches what the CI
 * workflow's jq script reads: `.summary.{testCases,built,scenariosTotal,
 * scenariosPassed,passRate}` plus a per-test-case array with per-scenario
 * pass/fail, score, reasoning, and failure categorization.
 *
 * @param results   One entry per workflow test case run.
 * @param duration  Total wall-clock duration of the run, in milliseconds.
 * @param outputDir Directory to write into; created if missing. Defaults to
 *                  the current working directory.
 * @returns Absolute-or-relative path of the written file (outputDir joined
 *          with 'eval-results.json').
 */
function writeEvalResults(
  results: WorkflowTestCaseResult[],
  duration: number,
  outputDir?: string,
): string {
  const allScenarios = results.flatMap((r) => r.scenarioResults);
  const passed = allScenarios.filter((s) => s.success).length;
  const report = {
    timestamp: new Date().toISOString(),
    duration,
    summary: {
      testCases: results.length,
      built: results.filter((r) => r.workflowBuildSuccess).length,
      scenariosTotal: allScenarios.length,
      scenariosPassed: passed,
      // Guard against division by zero when no scenarios ran.
      passRate: allScenarios.length > 0 ? passed / allScenarios.length : 0,
    },
    testCases: results.map((r) => ({
      // Test cases have no separate name field here; use a prompt prefix.
      name: r.testCase.prompt.slice(0, 70),
      built: r.workflowBuildSuccess,
      buildError: r.buildError,
      workflowId: r.workflowId,
      scenarios: r.scenarioResults.map((sr) => ({
        name: sr.scenario.name,
        passed: sr.success,
        score: sr.score,
        reasoning: sr.reasoning,
        failureCategory: sr.failureCategory,
        rootCause: sr.rootCause,
      })),
    })),
  };
  const dir = outputDir ?? process.cwd();
  mkdirSync(dir, { recursive: true });
  const outputPath = join(dir, 'eval-results.json');
  writeFileSync(outputPath, JSON.stringify(report, null, 2));
  return outputPath;
}
function printSummary(results: WorkflowTestCaseResult[]): void {
console.log('\n=== Workflow Eval Results ===\n');
for (const r of results) {
const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
console.log(`${r.testCase.prompt.slice(0, 70)}...`);
@ -89,6 +140,14 @@ async function main(): Promise<void> {
}
console.log('');
}
// Totals
const allScenarios = results.flatMap((r) => r.scenarioResults);
const passed = allScenarios.filter((s) => s.success).length;
const built = results.filter((r) => r.workflowBuildSuccess).length;
console.log(
`${String(built)}/${String(results.length)} built | ${String(passed)}/${String(allScenarios.length)} passed (${String(allScenarios.length > 0 ? Math.round((passed / allScenarios.length) * 100) : 0)}%)`,
);
}
main().catch((error) => {

View file

@ -75,17 +75,23 @@ interface ThreadStatus {
export class N8nClient {
private sessionCookie?: string;
constructor(private readonly baseUrl: string) {}
constructor(readonly baseUrl: string) {}
// -- Auth ----------------------------------------------------------------
/** Set the session cookie directly (for sharing across workers). */
setSessionCookie(cookie: string): void {
this.sessionCookie = cookie;
}
/**
* Authenticate with the n8n instance via POST /rest/login.
* Captures the `n8n-auth` cookie for subsequent requests.
*/
async login(email?: string, password?: string): Promise<void> {
const loginEmail = email ?? process.env.N8N_EVAL_EMAIL ?? 'admin@n8n.io';
const loginPassword = password ?? process.env.N8N_EVAL_PASSWORD ?? 'password';
// Defaults match the E2E test owner created by the E2E_TESTS=true bootstrap
const loginEmail = email ?? process.env.N8N_EVAL_EMAIL ?? 'nathan@n8n.io';
const loginPassword = password ?? process.env.N8N_EVAL_PASSWORD ?? 'PlaywrightTest123';
await this.fetch('/rest/login', {
method: 'POST',

View file

@ -10,6 +10,7 @@
// ---------------------------------------------------------------------------
import type { N8nClient } from '../clients/n8n-client';
import type { EvalLogger } from '../harness/logger';
// ---------------------------------------------------------------------------
// Config types
@ -102,6 +103,7 @@ export interface SeedResult {
export async function seedCredentials(
client: N8nClient,
requiredTypes?: string[],
logger?: EvalLogger,
): Promise<SeedResult> {
const credentialIds: string[] = [];
const seededTypes: string[] = [];
@ -113,7 +115,7 @@ export async function seedCredentials(
const token = process.env[config.envVar];
if (!token) {
console.log(` Skipping ${config.name}: ${config.envVar} not set`);
logger?.verbose(` Skipping ${config.name}: ${config.envVar} not set`);
continue;
}

View file

@ -18,8 +18,8 @@
},
{
"name": "high-volume",
"description": "Channels return many messages, testing merge and aggregation",
"dataSetup": "Each Slack channel history node returns 8-10 messages covering diverse topics: hiring updates, infrastructure changes, feature launches, customer feedback, and team announcements. The AI/LLM node should return a comprehensive summary. The Slack post-message node returns a success response.",
"description": "Channels return several messages each, testing merge and aggregation",
"dataSetup": "Each Slack channel history node returns 4-5 messages. Channel 1: hiring updates and a new joiner announcement. Channel 2: infrastructure migration discussion and CI/CD pipeline changes. Channel 3: feature launch announcement and customer feedback. The AI/LLM node should return a comprehensive summary. The Slack post-message node returns a success response.",
"successCriteria": "The workflow handles the volume without errors. All messages are merged/aggregated correctly. The summary covers the major topics and is posted successfully. No data is lost in the merge step."
},
{

View file

@ -1,5 +1,5 @@
{
"prompt": "Every hour, check the current weather for London, New York, and Tokyo using the OpenWeatherMap API (GET https://api.openweathermap.org/data/2.5/weather?q={city}&units=metric&appid=YOUR_API_KEY). Use 3 separate HTTP Request nodes, one per city. If any city has a temperature above 30°C, send a Telegram alert to chat ID -1001234567890 listing the hot cities. Log all readings to an Airtable table with columns: city, temperature, humidity, timestamp. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"prompt": "Every hour, check the current weather for London, New York, and Tokyo using the OpenWeatherMap API (GET https://api.openweathermap.org/data/2.5/weather?q={city}&units=metric&appid=YOUR_API_KEY). Use 3 separate HTTP Request nodes, one per city. If any city has a temperature above 30°C, send a Telegram alert to chat ID -1001234567890 listing the hot cities. Log all readings to an Airtable table (base ID: 'appK2xGfGNOIrl2gT', table name: 'Weather Logs') with columns: city, temperature, humidity, timestamp. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "complex",
"tags": [
"build",

View file

@ -9,6 +9,7 @@
import type { InstanceAiEvalExecutionResult } from '@n8n/api-types';
import crypto from 'node:crypto';
import { type EvalLogger } from './logger';
import { verifyChecklist } from '../checklist/verifier';
import type { N8nClient, WorkflowResponse } from '../clients/n8n-client';
import { consumeSseStream } from '../clients/sse-client';
@ -22,7 +23,6 @@ import type {
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
import { type EvalLogger } from './logger';
// ---------------------------------------------------------------------------
// Constants
@ -54,12 +54,15 @@ interface WorkflowTestCaseConfig {
keepWorkflows: boolean;
}
/**
* All-in-one test case runner: build workflow + run all scenarios + cleanup.
* Used by the CLI. The split API (buildWorkflow + executeScenario + cleanupBuild)
* is available for custom orchestration (e.g. LangSmith evaluate).
*/
export async function runWorkflowTestCase(
config: WorkflowTestCaseConfig,
): Promise<WorkflowTestCaseResult> {
const { client, testCase, logger } = config;
const threadId = `eval-${crypto.randomUUID()}`;
const startTime = Date.now();
const timeoutMs = config.timeoutMs > 0 ? config.timeoutMs : DEFAULT_TIMEOUT_MS;
const result: WorkflowTestCaseResult = {
@ -68,25 +71,110 @@ export async function runWorkflowTestCase(
scenarioResults: [],
};
const build = await buildWorkflow({
client,
prompt: testCase.prompt,
timeoutMs,
preRunWorkflowIds: config.preRunWorkflowIds,
claimedWorkflowIds: config.claimedWorkflowIds,
logger,
});
if (!build.success || !build.workflowId) {
result.buildError = build.error;
return result;
}
result.workflowBuildSuccess = true;
result.workflowId = build.workflowId;
result.workflowJson = build.workflowJsons[0];
const scenarioStart = Date.now();
result.scenarioResults = await runWithConcurrency(
testCase.scenarios,
async (scenario) => {
try {
return await executeScenario(
client,
build.workflowId!,
scenario,
build.workflowJsons,
logger,
);
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error(` ERROR [${scenario.name}]: ${errorMessage}`);
return {
scenario,
success: false,
score: 0,
reasoning: `Error: ${errorMessage}`,
} satisfies ScenarioResult;
}
},
MAX_CONCURRENT_SCENARIOS,
);
const scenarioMs = Date.now() - scenarioStart;
logger.info(
` Scenarios done: ${String(result.scenarioResults.length)} scenarios [${String(Math.round(scenarioMs / 1000))}s]`,
);
if (!config.keepWorkflows) {
await cleanupBuild(client, build, logger);
}
return result;
}
// ---------------------------------------------------------------------------
// Split API: build once, run scenarios independently
// ---------------------------------------------------------------------------
/**
 * Outcome of a single Instance AI workflow build attempt.
 * Returned by buildWorkflow(); pass the created-ID fields to cleanupBuild().
 */
export interface BuildResult {
  // True when the agent produced at least one workflow.
  success: boolean;
  // ID of the primary built workflow; unset when the build failed.
  workflowId?: string;
  // Raw workflow JSON payloads captured from the build.
  workflowJsons: WorkflowResponse[];
  // Human-readable build error details when success is false.
  error?: string;
  /** IDs to pass to cleanupBuild() */
  createdWorkflowIds: string[];
  createdDataTableIds: string[];
}
/** Configuration for buildWorkflow(): client, prompt, and run bookkeeping. */
export interface BuildWorkflowConfig {
  // Authenticated n8n REST/SSE client.
  client: N8nClient;
  // Natural-language prompt sent to the Instance AI builder.
  prompt: string;
  // Build timeout in milliseconds; falls back to a default when unset.
  timeoutMs?: number;
  // Workflow IDs that existed before the run — presumably used to tell
  // newly created workflows apart from pre-existing ones (TODO confirm).
  preRunWorkflowIds: Set<string>;
  // Workflow IDs already claimed by concurrent test cases — assumed to
  // prevent two builds attributing the same workflow (TODO confirm).
  claimedWorkflowIds: Set<string>;
  logger: EvalLogger;
}
/**
* Build a workflow via Instance AI. Returns the workflow ID for use with
* executeScenario(). Call cleanupBuild() when done.
*/
export async function buildWorkflow(config: BuildWorkflowConfig): Promise<BuildResult> {
const { client, prompt, logger } = config;
const threadId = `eval-${crypto.randomUUID()}`;
const startTime = Date.now();
const timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const abortController = new AbortController();
const events: CapturedEvent[] = [];
const approvedRequests = new Set<string>();
try {
// 1. Send prompt to Instance AI and wait for workflow to be built (ONCE)
logger.info(` Building workflow: "${truncate(testCase.prompt, 60)}"`);
const buildStart = Date.now();
logger.info(` Building workflow: "${truncate(prompt, 60)}"`);
const ssePromise = startSseConnection(client, threadId, events, abortController.signal).catch(
() => {
// SSE errors are non-fatal — workflow discovery falls back to event-based approach
},
() => {},
);
await delay(SSE_SETTLE_DELAY_MS);
await client.sendMessage(threadId, prompt);
await client.sendMessage(threadId, testCase.prompt);
// Wait with shorter timeout for scenario mode
await waitForAllActivity({
client,
threadId,
@ -100,14 +188,13 @@ export async function runWorkflowTestCase(
abortController.abort();
await ssePromise.catch(() => {});
// 2. Capture the built workflow
let threadMessages;
try {
threadMessages = await client.getThreadMessages(threadId);
} catch {
logger.verbose(`[${threadId}] Thread messages unavailable — using SSE events only`);
threadMessages = { messages: [] as never[] };
threadMessages = { messages: [] };
}
const messageWorkflowIds = extractWorkflowIdsFromMessages(threadMessages.messages);
const eventOutcome = extractOutcomeFromEvents(events);
const outcome = await buildAgentOutcome(
@ -126,7 +213,6 @@ export async function runWorkflowTestCase(
}
if (outcome.workflowsCreated.length === 0) {
// Extract error information from SSE events and thread messages
const toolErrors = events
.filter((e) => e.type === 'tool-error')
.map((e) => {
@ -160,85 +246,87 @@ export async function runWorkflowTestCase(
? `Agent response: ${agentText.slice(0, 500)}`
: 'No workflow produced — no error details captured';
result.buildError = buildError;
logger.warn(` No workflow created for: "${truncate(testCase.prompt, 60)}"`);
logger.warn(` ${buildError.slice(0, 200)}`);
return result;
return {
success: false,
error: buildError,
workflowJsons: [],
createdWorkflowIds: [],
createdDataTableIds: outcome.dataTablesCreated,
};
}
result.workflowBuildSuccess = true;
result.workflowId = outcome.workflowsCreated[0].id;
result.workflowJson = outcome.workflowJsons[0];
const buildMs = Date.now() - buildStart;
logger.info(
` Workflow built: ${outcome.workflowsCreated[0].name} (${String(outcome.workflowsCreated[0].nodeCount)} nodes)`,
` Workflow built: ${outcome.workflowsCreated[0].name} (${String(outcome.workflowsCreated[0].nodeCount)} nodes) [${String(Math.round(buildMs / 1000))}s]`,
);
// 3. Run scenarios with bounded concurrency to avoid API rate limits
const workflowId = outcome.workflowsCreated[0].id;
return {
success: true,
workflowId: outcome.workflowsCreated[0].id,
workflowJsons: outcome.workflowJsons,
createdWorkflowIds: outcome.workflowsCreated.map((wf) => wf.id),
createdDataTableIds: outcome.dataTablesCreated,
};
} catch (error: unknown) {
abortController.abort();
return {
success: false,
error: error instanceof Error ? error.message : String(error),
workflowJsons: [],
createdWorkflowIds: [],
createdDataTableIds: [],
};
}
}
for (const scenario of testCase.scenarios) {
logger.info(` Scenario: ${scenario.name}`);
/**
 * Run one scenario against an already-built workflow and return its
 * verified result. The workflow ID must come from a prior buildWorkflow()
 * call; release resources afterwards via cleanupBuild().
 */
export async function executeScenario(
client: N8nClient,
workflowId: string,
scenario: TestScenario,
workflowJsons: WorkflowResponse[],
logger: EvalLogger,
): Promise<ScenarioResult> {
const outcome = await runScenario(client, scenario, workflowId, workflowJsons, logger);
return outcome;
}
/**
* Clean up workflows and data tables created during a build.
*/
export async function cleanupBuild(
client: N8nClient,
build: BuildResult,
logger: EvalLogger,
): Promise<void> {
for (const id of build.createdWorkflowIds) {
try {
await client.deleteWorkflow(id);
} catch {
// Best-effort cleanup
}
}
result.scenarioResults = await runWithConcurrency(
testCase.scenarios,
async (scenario) => {
if (build.createdDataTableIds.length > 0) {
try {
const projectId = await client.getPersonalProjectId();
for (const dtId of build.createdDataTableIds) {
try {
return await runScenario(client, scenario, workflowId, outcome.workflowJsons, logger);
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error(` ERROR [${scenario.name}]: ${errorMessage}`);
return {
scenario,
success: false,
score: 0,
reasoning: `Error: ${errorMessage}`,
} satisfies ScenarioResult;
}
},
MAX_CONCURRENT_SCENARIOS,
);
// 4. Cleanup — delete workflows created during build (unless --keep-workflows)
if (!config.keepWorkflows) {
for (const wf of outcome.workflowsCreated) {
try {
await client.deleteWorkflow(wf.id);
await client.deleteDataTable(projectId, dtId);
} catch {
// Best-effort cleanup
}
}
logger.verbose(` Cleaned up ${String(build.createdDataTableIds.length)} data table(s)`);
} catch {
// Non-fatal — project ID lookup may fail
}
// Clean up data tables created during this run
if (outcome.dataTablesCreated.length > 0) {
try {
const projectId = await client.getPersonalProjectId();
for (const dtId of outcome.dataTablesCreated) {
try {
await client.deleteDataTable(projectId, dtId);
} catch {
// Best-effort cleanup
}
}
logger.verbose(` Cleaned up ${String(outcome.dataTablesCreated.length)} data table(s)`);
} catch {
// Non-fatal — project ID lookup may fail
}
}
} catch (error: unknown) {
abortController.abort();
const errorMessage = error instanceof Error ? error.message : String(error);
result.buildError = errorMessage;
logger.error(` Build failed: ${errorMessage}`);
}
return result;
}
// ---------------------------------------------------------------------------
// Scenario execution
// Scenario execution (internal)
// ---------------------------------------------------------------------------
async function runScenario(
@ -248,13 +336,15 @@ async function runScenario(
workflowJsons: WorkflowResponse[],
logger: EvalLogger,
): Promise<ScenarioResult> {
const execStart = Date.now();
const evalResult = await client.executeWithLlmMock(workflowId, scenario.dataSetup);
const execMs = Date.now() - execStart;
logger.verbose(
` [${scenario.name}] Execution ${evalResult.executionId}: ${evalResult.success ? 'success' : 'failed'}` +
` (${Object.keys(evalResult.nodeResults).length} nodes, ${evalResult.errors.length} errors)`,
logger.info(
` [${scenario.name}] exec=${String(Math.round(execMs / 1000))}s (${Object.keys(evalResult.nodeResults).length} nodes)`,
);
const verifyStart = Date.now();
const verificationArtifact = buildVerificationArtifact(scenario, evalResult, workflowJsons);
const scenarioChecklist: ChecklistItem[] = [
@ -272,16 +362,20 @@ async function runScenario(
workflowJsons,
);
const verifyMs = Date.now() - verifyStart;
const passed = verificationResults.length > 0 && verificationResults[0].pass;
const result = verificationResults[0];
const reasoning = result?.reasoning ?? 'No verification result';
const failureCategory = result?.failureCategory;
const reasoning = result?.reasoning ?? 'No verification result — LLM verifier returned empty';
const failureCategory = result?.failureCategory ?? (result ? undefined : 'verification_failure');
const rootCause = result?.rootCause;
const categoryLabel = failureCategory ? ` [${failureCategory}]` : '';
logger.info(
` [${scenario.name}] ${passed ? 'PASS' : 'FAIL'}${categoryLabel}: ${reasoning.slice(0, 100)}`,
` [${scenario.name}] ${passed ? 'PASS' : 'FAIL'}${categoryLabel} verify=${String(Math.round(verifyMs / 1000))}s`,
);
if (!passed) {
logger.info(` [${scenario.name}] ${reasoning}`);
}
return {
scenario,

View file

@ -0,0 +1,40 @@
// ---------------------------------------------------------------------------
// Public API for the instance-ai workflow evaluation framework
//
// This module exports the domain logic used by the CLI (evaluations/cli/)
// and available for custom orchestration (e.g. LangSmith evaluate).
// ---------------------------------------------------------------------------
// -- Client & Auth --
export { N8nClient } from './clients/n8n-client';
export type { WorkflowResponse, WorkflowNodeResponse, ExecutionDetail } from './clients/n8n-client';
// -- Test case data --
export { loadWorkflowTestCases } from './data/workflows';
// -- Credentials --
export { seedCredentials, cleanupCredentials } from './credentials/seeder';
export type { SeedResult } from './credentials/seeder';
// -- Runner (all-in-one) --
export { runWorkflowTestCase, runWithConcurrency } from './harness/runner';
// -- Runner (split API: build once, run scenarios independently) --
export { buildWorkflow, executeScenario, cleanupBuild } from './harness/runner';
export type { BuildResult, BuildWorkflowConfig } from './harness/runner';
// -- Workflow discovery --
export { snapshotWorkflowIds } from './outcome/workflow-discovery';
// -- Logger --
export { type EvalLogger, createLogger } from './harness/logger';
// -- Types --
export type {
WorkflowTestCase,
TestScenario,
WorkflowTestCaseResult,
ScenarioResult,
ChecklistItem,
ChecklistResult,
} from './types';

View file

@ -1,501 +0,0 @@
/**
* HTML report generator for workflow test case evaluations.
*
* Produces a self-contained HTML file optimized for three tasks:
* 1. Triage which scenarios failed? (seconds)
* 2. Diagnose why did they fail? (minutes)
* 3. Compare what changed between runs? (cross-report)
*/
import fs from 'fs';
import path from 'path';
import type { WorkflowTestCaseResult, ScenarioResult } from '../types';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Escape the five HTML-special characters (& < > " ') so a string can be
 * embedded safely in element content or attribute values.
 */
function escapeHtml(str: string): string {
	const entities: Record<string, string> = {
		'&': '&amp;',
		'<': '&lt;',
		'>': '&gt;',
		'"': '&quot;',
		"'": '&#39;',
	};
	// Single-pass replacement; equivalent to the chained .replace() form
	// because '&' never appears in any produced entity's tail.
	return str.replace(/[&<>"']/g, (ch) => entities[ch]);
}
// ---------------------------------------------------------------------------
// Scenario rendering
// ---------------------------------------------------------------------------
/**
 * Render one scenario result as a collapsible card.
 *
 * Passing scenarios render collapsed with a one-line summary (first 150
 * chars of the verifier reasoning); failing scenarios render with the
 * `expanded` class so their detail is visible immediately. `index` only
 * feeds the detail element's DOM id.
 */
function renderScenario(sr: ScenarioResult, index: number): string {
const icon = sr.success ? '&#10003;' : '&#10007;';
const statusClass = sr.success ? 'pass' : 'fail';
// Passing scenarios: compact one-liner with collapsible detail
if (sr.success) {
const summary = sr.reasoning ? sr.reasoning.slice(0, 150) : 'All checks passed';
return `<div class="scenario ${statusClass}">
<div class="scenario-header" onclick="this.parentElement.classList.toggle('expanded')">
<span class="scenario-icon ${statusClass}">${icon}</span>
<span class="scenario-name">${escapeHtml(sr.scenario.name)}</span>
<span class="scenario-summary-inline">${escapeHtml(summary)}${sr.reasoning && sr.reasoning.length > 150 ? '...' : ''}</span>
</div>
<div class="scenario-detail" id="scenario-${String(index)}">
${renderScenarioDetail(sr)}
</div>
</div>`;
}
// Failing scenarios: show error prominently, detail expanded by default
return `<div class="scenario ${statusClass} expanded">
<div class="scenario-header" onclick="this.parentElement.classList.toggle('expanded')">
<span class="scenario-icon ${statusClass}">${icon}</span>
<span class="scenario-name">${escapeHtml(sr.scenario.name)}</span>
<span class="scenario-desc">${escapeHtml(sr.scenario.description)}</span>
</div>
<div class="scenario-detail" id="scenario-${String(index)}">
${renderScenarioDetail(sr)}
</div>
</div>`;
}
/**
 * Render the expandable detail body for one scenario: failure-category
 * badge, execution errors, mock-data warnings, the verifier's diagnosis,
 * the mock data plan (Phase 1 hints), and a per-node execution trace with
 * intercepted requests and node outputs.
 */
function renderScenarioDetail(sr: ScenarioResult): string {
let html = '';
// No execution result at all — show the verifier reasoning (if any) and stop.
if (!sr.evalResult) {
if (sr.reasoning) {
html += `<div class="diagnosis">${escapeHtml(sr.reasoning)}</div>`;
}
return html;
}
// Failure category badge
// builder_issue → warn styling, mock_issue → fail styling, anything else → info.
if (!sr.success && sr.failureCategory) {
const catClass =
sr.failureCategory === 'builder_issue'
? 'warn'
: sr.failureCategory === 'mock_issue'
? 'fail'
: 'info';
html += `<div class="category-badge category-${catClass}">${escapeHtml(sr.failureCategory)}${sr.rootCause ? ': ' + escapeHtml(sr.rootCause) : ''}</div>`;
}
// 1. Error — what broke
if (sr.evalResult.errors.length > 0) {
html += `<div class="error-box">${escapeHtml(sr.evalResult.errors.join('; '))}</div>`;
}
// Phase 1 warnings
const warnings = sr.evalResult.hints?.warnings ?? [];
if (warnings.length > 0) {
html += `<div class="warning-box">${escapeHtml(warnings.join('; '))}</div>`;
}
// 2. Diagnosis — verifier's reasoning
if (sr.reasoning) {
html += '<details class="section" open><summary>Diagnosis</summary>';
html += `<div class="diagnosis">${escapeHtml(sr.reasoning)}</div>`;
html += '</details>';
}
// 3. Mock data plan — Phase 1 hints
if (sr.evalResult.hints) {
html += '<details class="section"><summary>Mock data plan</summary>';
const { globalContext, triggerContent, nodeHints } = sr.evalResult.hints;
if (globalContext) {
html += '<div class="subsection-label">Global context</div>';
html += `<div class="hint-text">${escapeHtml(globalContext)}</div>`;
}
if (Object.keys(triggerContent ?? {}).length > 0) {
html += '<div class="subsection-label">Trigger content</div>';
html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(triggerContent, null, 2))}</code></pre>`;
} else {
html +=
'<div class="warning-inline">No trigger content generated \u2014 start node has no input data</div>';
}
if (nodeHints && Object.keys(nodeHints).length > 0) {
html += '<div class="subsection-label">Per-node hints</div>';
for (const [nodeName, hint] of Object.entries(nodeHints)) {
html += `<details class="node-hint"><summary>${escapeHtml(nodeName)}</summary>`;
html += `<div class="hint-text">${escapeHtml(hint)}</div>`;
html += '</details>';
}
}
html += '</details>';
}
// 4. Execution trace — per-node results
const nodeEntries = Object.entries(sr.evalResult.nodeResults);
if (nodeEntries.length > 0) {
html += '<details class="section"><summary>Execution trace</summary>';
html +=
'<div class="trace-legend"><span class="node-mode-mocked">mocked</span> <span class="node-mode-pinned">pinned</span> <span class="node-mode-real">real</span></div>';
for (const [nodeName, nr] of nodeEntries) {
const modeClass = `node-mode-${nr.executionMode}`;
// configIssues present and non-empty marks a build-time configuration problem.
const hasError = nr.configIssues && Object.keys(nr.configIssues).length > 0;
const configWarning = hasError
? `<span class="build-issue">Build issue: ${escapeHtml(Object.values(nr.configIssues!).flat().join('; '))}</span>`
: '';
html += '<div class="trace-node">';
html += '<div class="trace-node-header">';
html += `<span class="${modeClass}">[${nr.executionMode}]</span> <strong>${escapeHtml(nodeName)}</strong>`;
if (nr.interceptedRequests.length > 0) {
html += ` <span class="request-count">${String(nr.interceptedRequests.length)} request(s)</span>`;
}
html += '</div>';
if (configWarning) html += configWarning;
// Intercepted requests
for (const req of nr.interceptedRequests) {
html += '<div class="request-pair">';
html += '<div class="request-header">Request sent</div>';
html += `<div class="request-method">${escapeHtml(req.method)} ${escapeHtml(req.url)}</div>`;
if (req.requestBody) {
html += `<pre class="json-block json-sm"><code>${escapeHtml(JSON.stringify(req.requestBody, null, 2))}</code></pre>`;
}
html += '<div class="response-header">Mock returned</div>';
if (req.mockResponse) {
html += `<pre class="json-block json-sm"><code>${escapeHtml(JSON.stringify(req.mockResponse, null, 2))}</code></pre>`;
} else {
html += '<div class="muted">no mock response</div>';
}
html += '</div>';
}
// Node output
// Explicit null/undefined check so falsy-but-real outputs (0, '', false) still render.
if (nr.output !== null && nr.output !== undefined) {
html += '<details class="node-output-toggle"><summary>Node output</summary>';
html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(nr.output, null, 2))}</code></pre>`;
html += '</details>';
} else {
html += '<div class="muted">no output</div>';
}
html += '</div>';
}
html += '</details>';
}
return html;
}
// ---------------------------------------------------------------------------
// Workflow summary
// ---------------------------------------------------------------------------
/**
 * Render the built-workflow summary for a test case: a node list taken
 * from the first scenario's execution result, plus the raw workflow JSON
 * when available. Returns '' when neither is present.
 */
function renderWorkflowSummary(result: WorkflowTestCaseResult): string {
	const parts: string[] = [];

	// Node list comes from the first scenario's evaluation (all scenarios
	// share the same built workflow).
	const firstEval = result.scenarioResults[0]?.evalResult;
	if (firstEval) {
		const nodeEntries = Object.entries(firstEval.nodeResults);
		if (nodeEntries.length > 0) {
			const items: string[] = [];
			for (const [name, nr] of nodeEntries) {
				const mode = nr.executionMode;
				const requestCount = nr.interceptedRequests.length;
				const issueText = nr.configIssues
					? Object.values(nr.configIssues).flat().join('; ')
					: '';
				let line = `<span class="node-mode-${mode}">[${mode}]</span> ${escapeHtml(name)}`;
				if (requestCount > 0) line += ` <span class="muted">(${String(requestCount)} req)</span>`;
				if (issueText) line += ` <span class="build-issue">Build issue: ${escapeHtml(issueText)}</span>`;
				items.push(`<li>${line}</li>`);
			}
			parts.push(
				`<details class="section"><summary>Built workflow (${String(nodeEntries.length)} nodes)</summary><ul class="node-list">${items.join('')}</ul></details>`,
			);
		}
	}

	if (result.workflowJson) {
		const raw = JSON.stringify(result.workflowJson, null, 2);
		parts.push(
			`<details class="section"><summary>Agent output (raw JSON)</summary><pre class="json-block"><code>${escapeHtml(raw)}</code></pre></details>`,
		);
	}

	return parts.join('');
}
// ---------------------------------------------------------------------------
// Test case rendering
// ---------------------------------------------------------------------------
/**
 * Render one test case as a collapsible card: build/score badges, the
 * (truncated) prompt, inline per-scenario pass/fail indicators, and an
 * expandable detail section with the full prompt, workflow summary, and
 * scenario cards.
 *
 * `tcIndex * 100 + i` keeps scenario DOM ids unique across test cases
 * (assumes fewer than 100 scenarios per test case).
 */
function renderTestCase(result: WorkflowTestCaseResult, tcIndex: number): string {
const passCount = result.scenarioResults.filter((sr) => sr.success).length;
const totalCount = result.scenarioResults.length;
// allPass requires at least one scenario — zero scenarios never counts as passing.
const allPass = passCount === totalCount && totalCount > 0;
const statusClass = result.workflowBuildSuccess ? (allPass ? 'pass' : 'mixed') : 'fail';
const buildBadge = result.workflowBuildSuccess
? '<span class="badge badge-pass">BUILT</span>'
: '<span class="badge badge-fail">BUILD FAILED</span>';
const scoreBadge =
totalCount > 0
? `<span class="badge badge-${allPass ? 'pass' : 'fail'}">${String(passCount)}/${String(totalCount)}</span>`
: '';
const prompt = result.testCase.prompt;
const truncatedPrompt = prompt.length > 100 ? prompt.slice(0, 100) + '...' : prompt;
// Inline scenario indicators for quick triage without expanding
const scenarioIndicators = result.scenarioResults
.map(
(sr) =>
`<span class="scenario-indicator ${sr.success ? 'pass' : 'fail'}" title="${escapeHtml(sr.scenario.name)}">${sr.success ? '✓' : '✗'} ${escapeHtml(sr.scenario.name)}</span>`,
)
.join(' ');
let scenariosHtml = '';
if (result.scenarioResults.length > 0) {
scenariosHtml = result.scenarioResults
.map((sr, i) => renderScenario(sr, tcIndex * 100 + i))
.join('');
} else if (!result.workflowBuildSuccess) {
// Build failed and no scenarios ran — surface the build error instead.
const errorDetail = result.buildError
? `<div class="error-box">${escapeHtml(result.buildError)}</div>`
: '';
scenariosHtml = `<div class="muted">Workflow failed to build — no scenarios executed</div>${errorDetail}`;
}
return `<div class="test-case ${statusClass}">
<div class="test-case-header" onclick="this.parentElement.classList.toggle('expanded')">
<div class="test-case-title">
${buildBadge} ${scoreBadge}
<span class="test-case-prompt">${escapeHtml(truncatedPrompt)}</span>
</div>
<div class="test-case-meta">
<span class="badge badge-tag">${escapeHtml(result.testCase.complexity)}</span>
${result.workflowId ? `<span class="workflow-id">${escapeHtml(result.workflowId)}</span>` : ''}
</div>
<div class="scenario-indicators">${scenarioIndicators}</div>
</div>
<div class="test-case-detail">
<details class="section"><summary>Prompt</summary><div class="prompt-text">${escapeHtml(prompt)}</div></details>
${renderWorkflowSummary(result)}
${scenariosHtml}
</div>
</div>`;
}
// ---------------------------------------------------------------------------
// Full report
// ---------------------------------------------------------------------------
/**
 * Render the complete self-contained HTML report (markup, CSS, and inline
 * JS) for a set of workflow test case results.
 *
 * The page contains a stats dashboard (pass rate, pass/fail counts, built
 * count), a toolbar (expand/collapse all, failures-only filter), and one
 * collapsible card per test case. No external assets are referenced, so
 * the file can be opened directly from disk or a CI artifact.
 */
export function generateWorkflowReport(results: WorkflowTestCaseResult[]): string {
const totalTestCases = results.length;
const builtCount = results.filter((r) => r.workflowBuildSuccess).length;
// Scenario-level aggregates across all test cases.
const allScenarios = results.flatMap((r) => r.scenarioResults);
const passCount = allScenarios.filter((sr) => sr.success).length;
const failCount = allScenarios.length - passCount;
const totalScenarios = allScenarios.length;
// Guard against division by zero when no scenarios ran.
const passRate = totalScenarios > 0 ? Math.round((passCount / totalScenarios) * 100) : 0;
return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Workflow evaluation report</title>
<style>
:root {
--bg-primary: #0d1117;
--bg-secondary: #161b22;
--bg-tertiary: #1c2129;
--border: #30363d;
--border-light: #21262d;
--text-primary: #f0f6fc;
--text-secondary: #c9d1d9;
--text-muted: #8b949e;
--color-pass: #3fb950;
--color-fail: #f85149;
--color-warn: #d29922;
--color-info: #58a6ff;
--color-purple: #bc8cff;
--color-pass-bg: #23863622;
--color-fail-bg: #da363322;
--color-warn-bg: #d2992222;
}
* { margin: 0; padding: 0; box-sizing: border-box; }
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: var(--bg-primary); color: var(--text-secondary); padding: 24px; max-width: 1400px; margin: 0 auto; font-size: 14px; line-height: 1.5; }
/* Header */
h1 { color: var(--text-primary); font-size: 20px; margin-bottom: 2px; }
.subtitle { color: var(--text-muted); font-size: 13px; margin-bottom: 20px; }
/* Dashboard */
.dashboard { display: flex; gap: 12px; margin-bottom: 24px; flex-wrap: wrap; align-items: stretch; }
.stat-card { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 8px; padding: 14px 20px; min-width: 120px; }
.stat-card .label { color: var(--text-muted); font-size: 12px; }
.stat-card .value { color: var(--text-primary); font-size: 26px; font-weight: 700; margin-top: 2px; }
.stat-card .value.pass { color: var(--color-pass); }
.stat-card .value.fail { color: var(--color-fail); }
.stat-card .value.mixed { color: var(--color-warn); }
/* Toolbar */
.toolbar { display: flex; gap: 8px; margin-bottom: 16px; }
.toolbar button { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 6px; color: var(--text-secondary); padding: 6px 12px; font-size: 12px; cursor: pointer; }
.toolbar button:hover { background: var(--bg-tertiary); color: var(--text-primary); }
.toolbar button.active { border-color: var(--color-info); color: var(--color-info); }
/* Badges */
.badge { display: inline-block; padding: 2px 8px; border-radius: 12px; font-size: 11px; font-weight: 600; margin-right: 4px; }
.badge-pass { background: var(--color-pass-bg); color: var(--color-pass); }
.badge-fail { background: var(--color-fail-bg); color: var(--color-fail); }
.badge-tag { background: var(--border); color: var(--text-muted); }
/* Test case cards */
.test-case { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 8px; margin-bottom: 10px; overflow: hidden; }
.test-case.pass { border-left: 3px solid var(--color-pass); }
.test-case.fail { border-left: 3px solid var(--color-fail); }
.test-case.mixed { border-left: 3px solid var(--color-warn); }
.test-case-header { padding: 12px 16px; cursor: pointer; }
.test-case-header:hover { background: var(--bg-tertiary); }
.test-case-title { display: flex; align-items: center; gap: 8px; margin-bottom: 4px; }
.test-case-prompt { color: var(--text-primary); font-weight: 500; font-size: 13px; }
.test-case-meta { display: flex; align-items: center; gap: 6px; margin-bottom: 6px; }
.workflow-id { color: var(--text-muted); font-size: 11px; font-family: monospace; }
.scenario-indicators { display: flex; gap: 8px; flex-wrap: wrap; }
.scenario-indicator { font-size: 11px; font-family: monospace; }
.scenario-indicator.pass { color: var(--color-pass); }
.scenario-indicator.fail { color: var(--color-fail); }
.test-case-detail { display: none; padding: 0 16px 16px; }
.test-case.expanded .test-case-detail { display: block; }
/* Sections (collapsible) */
.section { margin: 8px 0; }
.section > summary { cursor: pointer; color: var(--color-info); font-size: 12px; font-weight: 600; padding: 4px 0; }
.section > summary:hover { text-decoration: underline; }
/* Scenarios */
.scenario { border: 1px solid var(--border-light); border-radius: 6px; margin-bottom: 6px; overflow: hidden; }
.scenario-header { padding: 8px 12px; cursor: pointer; display: flex; align-items: center; gap: 8px; font-size: 13px; }
.scenario-header:hover { background: var(--bg-tertiary); }
.scenario-icon { font-weight: bold; font-size: 14px; min-width: 16px; }
.scenario-icon.pass { color: var(--color-pass); }
.scenario-icon.fail { color: var(--color-fail); }
.scenario-name { color: var(--text-primary); font-weight: 600; }
.scenario-desc { color: var(--text-muted); font-size: 12px; }
.scenario-summary-inline { color: var(--text-muted); font-size: 12px; flex: 1; }
.scenario-detail { display: none; padding: 10px 12px; border-top: 1px solid var(--border-light); background: var(--bg-primary); }
.scenario.expanded .scenario-detail { display: block; }
/* Error and warning boxes */
.error-box { color: var(--color-fail); font-size: 12px; padding: 6px 10px; background: var(--color-fail-bg); border-radius: 4px; margin-bottom: 8px; border-left: 3px solid var(--color-fail); }
.warning-box { color: var(--color-warn); font-size: 12px; padding: 6px 10px; background: var(--color-warn-bg); border-radius: 4px; margin-bottom: 8px; border-left: 3px solid var(--color-warn); }
.warning-inline { color: var(--color-warn); font-size: 11px; margin: 4px 0; }
.build-issue { color: var(--color-warn); font-size: 11px; display: block; margin-top: 2px; }
/* Diagnosis */
.diagnosis { color: var(--text-secondary); font-size: 12px; line-height: 1.6; padding: 6px 0; }
/* Prompt */
.prompt-text { color: var(--text-secondary); font-size: 13px; line-height: 1.6; padding: 10px; background: var(--bg-primary); border: 1px solid var(--border); border-radius: 6px; white-space: pre-wrap; }
/* Execution trace */
.trace-legend { font-size: 11px; margin-bottom: 8px; display: flex; gap: 12px; }
.trace-node { border: 1px solid var(--border-light); border-radius: 4px; margin-bottom: 6px; padding: 8px; }
.trace-node-header { font-size: 12px; font-family: monospace; margin-bottom: 4px; }
.request-count { color: var(--text-muted); font-size: 11px; }
/* Request/response pairs */
.request-pair { border: 1px solid var(--border-light); border-radius: 4px; margin: 6px 0; overflow: hidden; }
.request-header { background: #1c3a5e; color: var(--color-info); font-size: 10px; font-weight: 700; padding: 3px 8px; letter-spacing: 0.5px; }
.response-header { background: #2a1f3e; color: var(--color-purple); font-size: 10px; font-weight: 700; padding: 3px 8px; letter-spacing: 0.5px; }
.request-method { font-size: 11px; color: var(--text-primary); padding: 4px 8px; font-family: monospace; font-weight: 600; background: var(--bg-primary); }
/* JSON blocks */
.json-block { font-size: 11px; margin: 4px 0; padding: 8px; background: var(--bg-secondary); border: 1px solid var(--border-light); border-radius: 4px; overflow-x: auto; }
.json-sm { font-size: 10px; }
pre { overflow-x: auto; margin: 0; }
code { color: var(--text-secondary); }
/* Node list */
.node-list { list-style: none; padding: 4px 0; font-size: 12px; font-family: monospace; }
.node-list li { padding: 3px 0; }
.node-mode-mocked { color: var(--color-info); font-weight: 600; }
.node-mode-pinned { color: var(--color-warn); font-weight: 600; }
.node-mode-real { color: var(--color-pass); font-weight: 600; }
/* Node output toggle */
.node-output-toggle { margin: 4px 0; }
.node-output-toggle > summary { cursor: pointer; color: var(--text-muted); font-size: 11px; }
/* Node hint */
.node-hint { margin: 2px 0; }
.node-hint > summary { cursor: pointer; color: var(--text-secondary); font-size: 11px; font-family: monospace; }
.hint-text { color: var(--text-muted); font-size: 11px; padding: 4px 0; line-height: 1.5; }
.subsection-label { color: var(--text-primary); font-size: 11px; font-weight: 600; margin-top: 8px; margin-bottom: 2px; }
/* Category badges */
.category-badge { font-size: 11px; font-weight: 600; padding: 4px 10px; border-radius: 4px; margin-bottom: 8px; }
.category-warn { background: var(--color-warn-bg); color: var(--color-warn); border-left: 3px solid var(--color-warn); }
.category-fail { background: var(--color-fail-bg); color: var(--color-fail); border-left: 3px solid var(--color-fail); }
.category-info { background: #1c3a5e33; color: var(--color-info); border-left: 3px solid var(--color-info); }
/* Utilities */
.muted { color: var(--text-muted); font-size: 12px; }
</style>
</head>
<body>
<h1>Workflow evaluation report</h1>
<p class="subtitle">Generated ${new Date().toLocaleString()} &mdash; ${String(totalScenarios)} scenarios across ${String(totalTestCases)} test cases</p>
<div class="dashboard">
<div class="stat-card">
<div class="label">Pass rate</div>
<div class="value${passRate >= 80 ? ' pass' : passRate >= 50 ? ' mixed' : ' fail'}">${String(passRate)}%</div>
</div>
<div class="stat-card">
<div class="label">Passed</div>
<div class="value pass">${String(passCount)}</div>
</div>
<div class="stat-card">
<div class="label">Failed</div>
<div class="value${failCount > 0 ? ' fail' : ''}">${String(failCount)}</div>
</div>
<div class="stat-card">
<div class="label">Built</div>
<div class="value${builtCount === totalTestCases ? ' pass' : ' mixed'}">${String(builtCount)}/${String(totalTestCases)}</div>
</div>
</div>
<div class="toolbar">
<button onclick="document.querySelectorAll('.test-case').forEach(e => e.classList.add('expanded'))">Expand all</button>
<button onclick="document.querySelectorAll('.test-case').forEach(e => e.classList.remove('expanded'))">Collapse all</button>
<button onclick="document.querySelectorAll('.test-case').forEach(e => { e.style.display = e.classList.contains('pass') ? 'none' : '' }); this.classList.toggle('active')">Show failures only</button>
</div>
${results.map((r, i) => renderTestCase(r, i)).join('')}
</body>
</html>`;
}
// ---------------------------------------------------------------------------
// Write report to disk
// ---------------------------------------------------------------------------
/**
 * Render the HTML report and write it to the package's `.data` directory.
 *
 * Writes two files: a timestamped report (whose path is returned) and a
 * stable `workflow-eval-report.html` that always holds the latest run.
 *
 * @param results - Test case results to render.
 * @returns Path of the timestamped report file.
 */
export function writeWorkflowReport(results: WorkflowTestCaseResult[]): string {
const reportDir = path.join(__dirname, '..', '..', '.data');
// `recursive: true` makes mkdirSync a no-op when the directory exists, so
// the previous existsSync() pre-check was redundant (and a TOCTOU race).
fs.mkdirSync(reportDir, { recursive: true });
const html = generateWorkflowReport(results);
// ISO timestamp with ':' and '.' replaced by '-' so the name is valid on
// all filesystems; slice(0, 19) keeps YYYY-MM-DDTHH-MM-SS.
const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
const reportPath = path.join(reportDir, `workflow-eval-${timestamp}.html`);
fs.writeFileSync(reportPath, html);
// Also write to the stable filename for quick access
fs.writeFileSync(path.join(reportDir, 'workflow-eval-report.html'), html);
return reportPath;
}

View file

@ -23,7 +23,8 @@
"require": "./dist/index.js",
"import": "./src/index.ts",
"types": "./dist/index.d.ts"
}
},
"./evaluations": "./evaluations/index.ts"
},
"dependencies": {
"@daytonaio/sdk": "0.149.0",

View file

@ -42,6 +42,7 @@
"@n8n/api-types": "workspace:*",
"@n8n/constants": "workspace:*",
"@n8n/db": "workspace:*",
"@n8n/instance-ai": "workspace:*",
"@n8n/permissions": "workspace:*",
"@n8n/playwright-janitor": "workspace:*",
"@n8n/workflow-sdk": "workspace:*",

View file

@ -4408,6 +4408,9 @@ importers:
'@n8n/db':
specifier: workspace:*
version: link:../../@n8n/db
'@n8n/instance-ai':
specifier: workspace:*
version: link:../../@n8n/instance-ai
'@n8n/permissions':
specifier: workspace:*
version: link:../../@n8n/permissions