mirror of
https://github.com/coleam00/Archon
synced 2026-04-21 13:37:41 +00:00
fix(workflows): fail loudly on SDK isError results (#1208) (#1291)
Some checks are pending
E2E Smoke Tests / e2e-deterministic (push) Waiting to run
E2E Smoke Tests / e2e-mixed (push) Blocked by required conditions
Test Suite / test (windows-latest) (push) Waiting to run
Test Suite / docker-build (push) Waiting to run
E2E Smoke Tests / e2e-codex (push) Waiting to run
E2E Smoke Tests / e2e-claude (push) Waiting to run
Test Suite / test (ubuntu-latest) (push) Waiting to run
Some checks are pending
E2E Smoke Tests / e2e-deterministic (push) Waiting to run
E2E Smoke Tests / e2e-mixed (push) Blocked by required conditions
Test Suite / test (windows-latest) (push) Waiting to run
Test Suite / docker-build (push) Waiting to run
E2E Smoke Tests / e2e-codex (push) Waiting to run
E2E Smoke Tests / e2e-claude (push) Waiting to run
Test Suite / test (ubuntu-latest) (push) Waiting to run
Previously, `dag-executor` only failed nodes/iterations when the SDK returned an `error_max_budget_usd` result. Every other `isError: true` subtype — including `error_during_execution` — was silently `break`ed out of the stream with whatever partial output had accumulated, letting failed runs masquerade as successful ones with empty output. This is the most likely explanation for the "5-second crash" symptom in #1208: iterations finish instantly with empty text, the loop keeps going, and only the `claude.result_is_error` log tips the user off. Changes: - Capture the SDK's `errors: string[]` detail on result messages (previously discarded) and surface it through `MessageChunk.errors`. - Log `errors`, `stopReason` alongside `errorSubtype` in `claude.result_is_error` so users can see what actually failed. - Throw from both the general node path and the loop iteration path on any `isError: true` result, including the subtype and SDK errors detail in the thrown message. Note: this does not implement auto-retry. See PR comments on #1121 and the analysis on #1208 — a retry-with-fresh-session approach for loop iterations is not obviously correct until we see what `error_during_execution` actually carries in the reporter's env. This change is the observability + fail-loud step that has to come first so that signal is no longer silent. Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
d89bc767d2
commit
4c6ddd994f
4 changed files with 168 additions and 1 deletions
|
|
@ -740,6 +740,7 @@ async function* streamClaudeMessages(
|
|||
total_cost_usd?: number;
|
||||
stop_reason?: string | null;
|
||||
num_turns?: number;
|
||||
errors?: string[];
|
||||
model_usage?: Record<
|
||||
string,
|
||||
{
|
||||
|
|
@ -751,9 +752,15 @@ async function* streamClaudeMessages(
|
|||
>;
|
||||
};
|
||||
const tokens = normalizeClaudeUsage(resultMsg.usage);
|
||||
const sdkErrors = Array.isArray(resultMsg.errors) ? resultMsg.errors : undefined;
|
||||
if (resultMsg.is_error) {
|
||||
getLog().error(
|
||||
{ sessionId: resultMsg.session_id, errorSubtype: resultMsg.subtype },
|
||||
{
|
||||
sessionId: resultMsg.session_id,
|
||||
errorSubtype: resultMsg.subtype,
|
||||
stopReason: resultMsg.stop_reason,
|
||||
errors: sdkErrors,
|
||||
},
|
||||
'claude.result_is_error'
|
||||
);
|
||||
}
|
||||
|
|
@ -765,6 +772,7 @@ async function* streamClaudeMessages(
|
|||
? { structuredOutput: resultMsg.structured_output }
|
||||
: {}),
|
||||
...(resultMsg.is_error ? { isError: true, errorSubtype: resultMsg.subtype } : {}),
|
||||
...(resultMsg.is_error && sdkErrors?.length ? { errors: sdkErrors } : {}),
|
||||
...(resultMsg.total_cost_usd !== undefined ? { cost: resultMsg.total_cost_usd } : {}),
|
||||
...(resultMsg.stop_reason != null ? { stopReason: resultMsg.stop_reason } : {}),
|
||||
...(resultMsg.num_turns !== undefined ? { numTurns: resultMsg.num_turns } : {}),
|
||||
|
|
|
|||
|
|
@ -72,6 +72,8 @@ export type MessageChunk =
|
|||
structuredOutput?: unknown;
|
||||
isError?: boolean;
|
||||
errorSubtype?: string;
|
||||
/** SDK-provided error detail strings. Populated when isError is true. */
|
||||
errors?: string[];
|
||||
cost?: number;
|
||||
stopReason?: string;
|
||||
numTurns?: number;
|
||||
|
|
|
|||
|
|
@ -3594,6 +3594,70 @@ describe('executeDagWorkflow -- resume with priorCompletedNodes', () => {
|
|||
expect(sessionArg).toBe('loop-session-1');
|
||||
});
|
||||
|
||||
it('loop iteration fails loudly when SDK returns error_during_execution', async () => {
|
||||
// Regression test for #1208: previously the loop silently broke on isError
|
||||
// results and kept iterating with empty output, producing "5-second crashes"
|
||||
// that masqueraded as successful iterations.
|
||||
mockSendQueryDag.mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'result',
|
||||
isError: true,
|
||||
errorSubtype: 'error_during_execution',
|
||||
errors: ['Subprocess crashed mid-turn'],
|
||||
sessionId: 'bad-session',
|
||||
};
|
||||
});
|
||||
|
||||
const store = createMockStore();
|
||||
const mockDeps = createMockDeps(store);
|
||||
const platform = createMockPlatform();
|
||||
const workflowRun = makeWorkflowRun();
|
||||
|
||||
await executeDagWorkflow(
|
||||
mockDeps,
|
||||
platform,
|
||||
'conv-dag',
|
||||
testDir,
|
||||
{
|
||||
name: 'loop-iteration-err',
|
||||
nodes: [
|
||||
{
|
||||
id: 'work',
|
||||
loop: {
|
||||
prompt: 'Do the work. Say DONE.',
|
||||
until: 'DONE',
|
||||
max_iterations: 5,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
workflowRun,
|
||||
'claude',
|
||||
undefined,
|
||||
join(testDir, 'artifacts'),
|
||||
join(testDir, 'logs'),
|
||||
'main',
|
||||
'docs/',
|
||||
minimalConfig
|
||||
);
|
||||
|
||||
// Should fail after one iteration rather than burning through max_iterations
|
||||
expect(mockSendQueryDag.mock.calls.length).toBe(1);
|
||||
// The loop_iteration_failed event should carry the subtype and SDK errors detail
|
||||
const eventCalls = (store.createWorkflowEvent as ReturnType<typeof mock>).mock.calls;
|
||||
const iterFailedEvents = eventCalls.filter(
|
||||
(call: unknown[]) =>
|
||||
(call[0] as Record<string, unknown>).event_type === 'loop_iteration_failed'
|
||||
);
|
||||
expect(iterFailedEvents.length).toBeGreaterThan(0);
|
||||
const failedData = (iterFailedEvents[0][0] as Record<string, unknown>).data as Record<
|
||||
string,
|
||||
unknown
|
||||
>;
|
||||
expect(failedData.error).toContain('error_during_execution');
|
||||
expect(failedData.error).toContain('Subprocess crashed mid-turn');
|
||||
});
|
||||
|
||||
it('non-interactive loop is unaffected (no pause)', async () => {
|
||||
mockSendQueryDag.mockImplementation(function* () {
|
||||
yield { type: 'assistant', content: 'Still working...' };
|
||||
|
|
@ -4617,6 +4681,58 @@ describe('executeDagWorkflow -- Claude SDK advanced options', () => {
|
|||
expect(capMessage).toBeDefined();
|
||||
});
|
||||
|
||||
it('fails node when SDK returns error_during_execution result', async () => {
|
||||
// Regression test for #1208: previously we only failed on error_max_budget_usd
|
||||
// and silently broke on all other isError subtypes, letting failed nodes
|
||||
// masquerade as successes with empty output.
|
||||
mockSendQueryDag.mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'result',
|
||||
isError: true,
|
||||
errorSubtype: 'error_during_execution',
|
||||
errors: ['Tool call failed: permission denied'],
|
||||
sessionId: 'sid-err',
|
||||
};
|
||||
});
|
||||
|
||||
const store = createMockStore();
|
||||
const mockDeps = createMockDeps(store);
|
||||
const platform = createMockPlatform();
|
||||
const workflowRun = makeWorkflowRun();
|
||||
|
||||
await executeDagWorkflow(
|
||||
mockDeps,
|
||||
platform,
|
||||
'conv-dag',
|
||||
testDir,
|
||||
{
|
||||
name: 'err-exec-test',
|
||||
nodes: [{ id: 'step1', command: 'my-cmd' }],
|
||||
},
|
||||
workflowRun,
|
||||
'claude',
|
||||
undefined,
|
||||
join(testDir, 'artifacts'),
|
||||
join(testDir, 'logs'),
|
||||
'main',
|
||||
'docs/',
|
||||
minimalConfig
|
||||
);
|
||||
|
||||
// The node_failed event should carry the subtype and SDK errors detail
|
||||
const eventCalls = (store.createWorkflowEvent as ReturnType<typeof mock>).mock.calls;
|
||||
const nodeFailedEvents = eventCalls.filter(
|
||||
(call: unknown[]) => (call[0] as Record<string, unknown>).event_type === 'node_failed'
|
||||
);
|
||||
expect(nodeFailedEvents.length).toBeGreaterThan(0);
|
||||
const failedData = (nodeFailedEvents[0][0] as Record<string, unknown>).data as Record<
|
||||
string,
|
||||
unknown
|
||||
>;
|
||||
expect(failedData.error).toContain('error_during_execution');
|
||||
expect(failedData.error).toContain('permission denied');
|
||||
});
|
||||
|
||||
it('forwards workflow-level effort to node when no per-node override', async () => {
|
||||
const mockDeps = createMockDeps();
|
||||
const platform = createMockPlatform();
|
||||
|
|
|
|||
|
|
@ -764,6 +764,25 @@ async function executeNodeInternal(
|
|||
`Node '${node.id}' exceeded cost cap${cap !== undefined ? ` of $${cap.toFixed(2)}` : ''}.`
|
||||
);
|
||||
}
|
||||
// Fail loudly on any other SDK error result. Previously we broke out of
|
||||
// the stream silently, producing empty/partial output without signaling
|
||||
// failure — which let failed iterations masquerade as successes (#1208).
|
||||
if (msg.isError) {
|
||||
const subtype = msg.errorSubtype ?? 'unknown';
|
||||
const errorsDetail = msg.errors?.length ? ` — ${msg.errors.join('; ')}` : '';
|
||||
getLog().error(
|
||||
{
|
||||
nodeId: node.id,
|
||||
errorSubtype: subtype,
|
||||
errors: msg.errors,
|
||||
sessionId: msg.sessionId,
|
||||
stopReason: msg.stopReason,
|
||||
durationMs: Date.now() - nodeStartTime,
|
||||
},
|
||||
'dag.node_sdk_error_result'
|
||||
);
|
||||
throw new Error(`Node '${node.id}' failed: SDK returned ${subtype}${errorsDetail}`);
|
||||
}
|
||||
break; // Result is the "I'm done" signal — don't wait for subprocess to exit
|
||||
} else if (msg.type === 'system' && msg.content) {
|
||||
// Forward provider warnings (⚠️) and MCP connection failures to the user.
|
||||
|
|
@ -1626,6 +1645,28 @@ async function executeLoopNode(
|
|||
if (msg.numTurns !== undefined) {
|
||||
loopTotalNumTurns = (loopTotalNumTurns ?? 0) + msg.numTurns;
|
||||
}
|
||||
// Fail the iteration loudly on SDK error results. Previously we broke
|
||||
// silently, producing empty output and continuing to the next iteration —
|
||||
// which made `error_during_execution` on resumed interactive loops look
|
||||
// like a "5-second crash" that kept burning iterations (#1208).
|
||||
if (msg.isError) {
|
||||
const subtype = msg.errorSubtype ?? 'unknown';
|
||||
const errorsDetail = msg.errors?.length ? ` — ${msg.errors.join('; ')}` : '';
|
||||
getLog().error(
|
||||
{
|
||||
nodeId: node.id,
|
||||
iteration: i,
|
||||
errorSubtype: subtype,
|
||||
errors: msg.errors,
|
||||
sessionId: msg.sessionId,
|
||||
stopReason: msg.stopReason,
|
||||
},
|
||||
'loop_node.iteration_sdk_error'
|
||||
);
|
||||
throw new Error(
|
||||
`Loop '${node.id}' iteration ${String(i)} failed: SDK returned ${subtype}${errorsDetail}`
|
||||
);
|
||||
}
|
||||
break; // Result is the "I'm done" signal — don't wait for subprocess to exit
|
||||
} else if (msg.type === 'tool' && msg.toolName) {
|
||||
const now = Date.now();
|
||||
|
|
|
|||
Loading…
Reference in a new issue