refactor(ci): strip E2E smoke tests to bare minimum for speed

Claude CLI is extremely slow with structured output (~4 min) and tool use
(~2 min) in CI, making the previous multi-workflow approach take 10+ min.

Radical simplification:
- Remove e2e-all-nodes (redundant with deterministic + claude-smoke)
- Remove e2e-skills-mcp (advanced features too slow for per-commit smoke)
- Remove structured output and tool use from Claude smoke test (too slow)
- Strip Claude smoke to: 1 prompt + 1 command + 1 bash verify node
- Keep mixed providers (simplified: 1 Claude + 1 Codex + bash verify)
- All node idle timeouts reduced to 30s, all job timeouts to 5 min
- Remove MCP test fixtures and e2e-test-skill (no longer needed)

Expected: Claude job ~15s of AI time, Codex ~5s, mixed ~10s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Cole Medin 2026-04-16 10:50:11 -05:00
parent 4c259e7a0a
commit bf9091159c
8 changed files with 24 additions and 196 deletions

View file

@ -1,6 +0,0 @@
{
  "filesystem": {
    "command": "npx",
    "args": [
      "-y",
      "@modelcontextprotocol/server-filesystem",
      "/tmp"
    ]
  }
}

View file

@ -1,54 +0,0 @@
# E2E smoke test — all node types
# Verifies: bash, prompt, script (bun), structured output, effort control, $nodeId.output refs
# NOTE: AI nodes run sequentially to avoid concurrent Claude CLI subprocess issues in CI
#
# Dependency chain declared below:
#   bash-check -> script-bun -> prompt-simple -> structured -> { bash-read-output, prompt-effort }
name: e2e-all-nodes
description: "Comprehensive E2E test exercising bash, prompt, script, and structured output nodes."
provider: claude
model: haiku
nodes:
  # 1. Bash node — no AI, runs shell, stdout captured as output
  - id: bash-check
    bash: "echo '{\"status\":\"ok\",\"cwd\":\"'$(pwd)'\"}'"

  # 2. Script node (bun runtime) — verifies script execution
  - id: script-bun
    script: echo-args
    runtime: bun
    depends_on: [bash-check]
    timeout: 30000

  # 3. Prompt node — simple AI call, verifies sendQuery works
  #    The prompt embeds the first node's output through a $bash-check.output reference.
  - id: prompt-simple
    prompt: "The bash node returned: $bash-check.output — confirm you received it by saying 'received'. Say nothing else."
    depends_on: [script-bun]
    allowed_tools: []
    idle_timeout: 120000

  # 4. Structured output node — verifies output_format translation
  #    The schema is a single-field object whose `category` must be one of two enum values.
  - id: structured
    prompt: "Classify the text 'hello world' as either 'greeting' or 'math'."
    output_format:
      type: object
      properties:
        category:
          type: string
          enum: ["greeting", "math"]
      required: ["category"]
      additionalProperties: false
    depends_on: [prompt-simple]
    allowed_tools: []
    idle_timeout: 120000

  # 5. Bash node using $nodeId.output from structured node
  - id: bash-read-output
    bash: "echo 'Structured output category: $structured.output'"
    depends_on: [structured]

  # 6. Prompt with effort control — verifies effort passes through to SDK
  - id: prompt-effort
    prompt: "Say 'effort-ok' and nothing else."
    effort: low
    depends_on: [structured]
    allowed_tools: []
    idle_timeout: 120000

View file

@ -1,48 +1,26 @@
# E2E smoke test — Claude provider
# Verifies: provider selection, sendQuery, structured output, tool use,
# command node, workflow-level model, node-level model override
# NOTE: Nodes run sequentially to avoid concurrent Claude CLI subprocess issues in CI
# Verifies: Claude connectivity (sendQuery), command node loading, $nodeId.output refs
# Design: Only uses allowed_tools: [] (no tool use) and no output_format (no structured output)
# because the Claude CLI subprocess is extremely slow with those features in CI.
name: e2e-claude-smoke
description: "E2E smoke test for Claude provider. Tests prompt, structured output, tool use, command node, and model overrides."
description: "Smoke test for Claude provider. Verifies prompt response and command node loading."
provider: claude
model: haiku
nodes:
# 1. Simple prompt — verifies basic sendQuery
# 1. Simple prompt — verifies Claude API connectivity via sendQuery
- id: simple
prompt: "What is 2+2? Answer with just the number, nothing else."
allowed_tools: []
idle_timeout: 120000
idle_timeout: 30000
# 2. Structured output — verifies output_format translation
- id: structured
prompt: "Classify this input as 'math' or 'text': '2+2=4'"
output_format:
type: object
properties:
category:
type: string
enum: ["math", "text"]
required: ["category"]
additionalProperties: false
allowed_tools: []
idle_timeout: 120000
depends_on: [simple]
# 3. Tool use — verifies agent can use tools
- id: tool-use
prompt: "Read the file package.json and tell me the 'name' field value. Answer with just the name, nothing else."
allowed_tools: [Read]
idle_timeout: 120000
depends_on: [structured]
# 4. Command node — verifies command file loading
# 2. Command node — verifies command file discovery and loading
- id: command-test
command: e2e-echo-command
idle_timeout: 120000
depends_on: [tool-use]
idle_timeout: 30000
depends_on: [simple]
# 5. Bash node reads structured output field
- id: verify-structured
bash: "echo 'category=$structured.output.category'"
depends_on: [structured]
# 3. Bash node — verifies $nodeId.output substitution from AI node
- id: verify-output
bash: "echo 'simple=$simple.output command=$command-test.output'"
depends_on: [simple, command-test]

View file

@ -8,7 +8,7 @@ model: gpt-5.1-codex-mini
nodes:
- id: simple
prompt: "What is 2+2? Answer with just the number, nothing else."
idle_timeout: 60000
idle_timeout: 30000
- id: structured
prompt: "Classify this input as 'math' or 'text': '2+2=4'. Return JSON only."
@ -20,5 +20,5 @@ nodes:
enum: ["math", "text"]
required: ["category"]
additionalProperties: false
idle_timeout: 60000
idle_timeout: 30000
depends_on: [simple]

View file

@ -12,23 +12,16 @@ nodes:
- id: claude-node
prompt: "Say 'claude-ok' and nothing else."
allowed_tools: []
idle_timeout: 60000
idle_timeout: 30000
# 2. Codex node — provider override
# 2. Codex node — provider override (runs parallel with claude-node, different providers)
- id: codex-node
prompt: "Say 'codex-ok' and nothing else."
provider: codex
model: gpt-5.1-codex-mini
idle_timeout: 60000
idle_timeout: 30000
# 3. Claude node reads Codex output — cross-provider ref
- id: claude-reads-codex
prompt: "The codex node said: '$codex-node.output'. Confirm you received it by saying 'cross-provider-ok'. Say nothing else."
allowed_tools: []
idle_timeout: 60000
depends_on: [codex-node]
# 4. Bash node verifies both outputs
# 3. Bash node verifies both outputs — cross-provider ref
- id: verify
bash: "echo 'claude=$claude-node.output codex=$codex-node.output cross=$claude-reads-codex.output'"
depends_on: [claude-node, codex-node, claude-reads-codex]
bash: "echo 'claude=$claude-node.output codex=$codex-node.output'"
depends_on: [claude-node, codex-node]

View file

@ -1,56 +0,0 @@
# E2E smoke test — Claude advanced features (skills, MCP, effort, systemPrompt, shared context)
# Verifies: skills injection, MCP server loading, effort control, custom system prompt,
#           and session continuity between two nodes via `context: shared`
# NOTE: AI nodes run sequentially to avoid concurrent Claude CLI subprocess issues in CI
# NOTE: MCP test uses model: sonnet because Haiku does not support MCP tool search
name: e2e-skills-mcp
description: "Tests Claude-specific advanced features: skills injection, MCP server, effort control, and systemPrompt."
provider: claude
model: haiku
nodes:
  # 1. Skills injection — verifies AgentDefinition wrapping
  - id: skill-test
    prompt: "Confirm your skill loading status. If the E2E test skill is loaded, follow its instructions."
    skills:
      - e2e-test-skill
    allowed_tools: [Read]
    idle_timeout: 120000

  # 2. MCP server — verifies MCP config loading and tool availability
  #    Uses sonnet because Haiku does not support MCP tool search
  - id: mcp-test
    prompt: "You have a filesystem MCP server available. Use it to list the contents of /tmp. Report what you find briefly."
    model: sonnet  # node-level override of the workflow-level `model: haiku`
    mcp: .archon/test-fixtures/mcp/e2e-filesystem.json
    idle_timeout: 120000
    depends_on: [skill-test]

  # 3. Effort control — verifies effort passes through to SDK
  - id: effort-test
    prompt: "Say 'effort-ok' and nothing else."
    effort: low
    allowed_tools: []
    idle_timeout: 120000
    depends_on: [mcp-test]

  # 4. Custom system prompt — verifies systemPrompt injection
  - id: system-prompt-test
    prompt: "What is your role? Answer in 5 words or fewer."
    systemPrompt: "You are a smoke test validator. Always start your response with 'VALIDATOR:'"
    allowed_tools: []
    idle_timeout: 120000
    depends_on: [effort-test]

  # 5a. Context shared (setup) — plants a value for the next node to recall
  - id: context-shared-setup
    prompt: "Remember the secret code: ORANGE-42. Say 'stored' and nothing else."
    allowed_tools: []
    idle_timeout: 120000
    depends_on: [system-prompt-test]

  # 5b. Context shared (verify) — verifies session continuity by asking for the planted value
  - id: context-shared-verify
    prompt: "What was the secret code I told you to remember? Say just the code, nothing else."
    context: shared
    allowed_tools: []
    idle_timeout: 120000
    depends_on: [context-shared-setup]

View file

@ -1,8 +0,0 @@
---
name: E2E Test Skill
description: Minimal skill for smoke testing skill injection in CI
---
# E2E Test Skill
You have the E2E test skill loaded. When asked to confirm skill loading, respond with exactly: "skill-loaded-ok"

View file

@ -34,7 +34,7 @@ jobs:
# ─── Tier 2a: Claude provider ──────────────────────────────────────────
e2e-claude:
runs-on: ubuntu-latest
timeout-minutes: 20
timeout-minutes: 5
steps:
- uses: actions/checkout@v4
@ -43,11 +43,6 @@ jobs:
with:
bun-version: 1.3.11
- name: Setup Node.js (for npx/MCP servers)
uses: actions/setup-node@v4
with:
node-version: 22
- name: Install Claude Code CLI
run: |
curl -fsSL https://claude.ai/install.sh | bash
@ -63,24 +58,10 @@ jobs:
run: |
bun run cli workflow run e2e-claude-smoke --no-worktree "smoke test"
- name: Run all-nodes test
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
CLAUDE_BIN_PATH: ~/.local/bin/claude
run: |
bun run cli workflow run e2e-all-nodes --no-worktree "smoke test"
- name: Run skills + MCP test
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
CLAUDE_BIN_PATH: ~/.local/bin/claude
run: |
bun run cli workflow run e2e-skills-mcp --no-worktree "smoke test"
# ─── Tier 2b: Codex provider ───────────────────────────────────────────
e2e-codex:
runs-on: ubuntu-latest
timeout-minutes: 10
timeout-minutes: 5
steps:
- uses: actions/checkout@v4
@ -110,7 +91,7 @@ jobs:
# ─── Tier 3: Mixed providers ───────────────────────────────────────────
e2e-mixed:
runs-on: ubuntu-latest
timeout-minutes: 10
timeout-minutes: 5
needs: [e2e-claude, e2e-codex]
steps:
- uses: actions/checkout@v4