fix(workflows): fail loudly on SDK isError results (#1208) (#1291)
Some checks are pending
E2E Smoke Tests / e2e-deterministic (push) Waiting to run
E2E Smoke Tests / e2e-mixed (push) Blocked by required conditions
Test Suite / test (windows-latest) (push) Waiting to run
Test Suite / docker-build (push) Waiting to run
E2E Smoke Tests / e2e-codex (push) Waiting to run
E2E Smoke Tests / e2e-claude (push) Waiting to run
Test Suite / test (ubuntu-latest) (push) Waiting to run

Previously, `dag-executor` only failed nodes/iterations when the SDK
returned an `error_max_budget_usd` result. Every other `isError: true`
subtype — including `error_during_execution` — was silently `break`ed
out of the stream with whatever partial output had accumulated, letting
failed runs masquerade as successful ones with empty output.

This is the most likely explanation for the "5-second crash" symptom in
#1208: iterations finish instantly with empty text, the loop keeps
going, and only the `claude.result_is_error` log tips the user off.

Changes:
- Capture the SDK's `errors: string[]` detail on result messages
  (previously discarded) and surface it through `MessageChunk.errors`.
- Log `errors`, `stopReason` alongside `errorSubtype` in
  `claude.result_is_error` so users can see what actually failed.
- Throw from both the general node path and the loop iteration path
  on any `isError: true` result, including the subtype and SDK errors
  detail in the thrown message.

Note: this does not implement auto-retry. See PR comments on #1121 and
the analysis on #1208 — a retry-with-fresh-session approach for loop
iterations is not obviously correct until we see what
`error_during_execution` actually carries in the reporter's env.
This change is the observability + fail-loud step that has to come
first so that signal is no longer silent.

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Cole Medin 2026-04-18 15:02:35 -05:00 committed by GitHub
parent d89bc767d2
commit 4c6ddd994f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 168 additions and 1 deletions

View file

@ -740,6 +740,7 @@ async function* streamClaudeMessages(
total_cost_usd?: number;
stop_reason?: string | null;
num_turns?: number;
errors?: string[];
model_usage?: Record<
string,
{
@ -751,9 +752,15 @@ async function* streamClaudeMessages(
>;
};
const tokens = normalizeClaudeUsage(resultMsg.usage);
const sdkErrors = Array.isArray(resultMsg.errors) ? resultMsg.errors : undefined;
if (resultMsg.is_error) {
getLog().error(
{ sessionId: resultMsg.session_id, errorSubtype: resultMsg.subtype },
{
sessionId: resultMsg.session_id,
errorSubtype: resultMsg.subtype,
stopReason: resultMsg.stop_reason,
errors: sdkErrors,
},
'claude.result_is_error'
);
}
@ -765,6 +772,7 @@ async function* streamClaudeMessages(
? { structuredOutput: resultMsg.structured_output }
: {}),
...(resultMsg.is_error ? { isError: true, errorSubtype: resultMsg.subtype } : {}),
...(resultMsg.is_error && sdkErrors?.length ? { errors: sdkErrors } : {}),
...(resultMsg.total_cost_usd !== undefined ? { cost: resultMsg.total_cost_usd } : {}),
...(resultMsg.stop_reason != null ? { stopReason: resultMsg.stop_reason } : {}),
...(resultMsg.num_turns !== undefined ? { numTurns: resultMsg.num_turns } : {}),

View file

@ -72,6 +72,8 @@ export type MessageChunk =
structuredOutput?: unknown;
isError?: boolean;
errorSubtype?: string;
/** SDK-provided error detail strings. Populated when isError is true. */
errors?: string[];
cost?: number;
stopReason?: string;
numTurns?: number;

View file

@ -3594,6 +3594,70 @@ describe('executeDagWorkflow -- resume with priorCompletedNodes', () => {
expect(sessionArg).toBe('loop-session-1');
});
it('loop iteration fails loudly when SDK returns error_during_execution', async () => {
// Regression test for #1208: previously the loop silently broke on isError
// results and kept iterating with empty output, producing "5-second crashes"
// that masqueraded as successful iterations.
mockSendQueryDag.mockImplementation(function* () {
yield {
type: 'result',
isError: true,
errorSubtype: 'error_during_execution',
errors: ['Subprocess crashed mid-turn'],
sessionId: 'bad-session',
};
});
const store = createMockStore();
const mockDeps = createMockDeps(store);
const platform = createMockPlatform();
const workflowRun = makeWorkflowRun();
await executeDagWorkflow(
mockDeps,
platform,
'conv-dag',
testDir,
{
name: 'loop-iteration-err',
nodes: [
{
id: 'work',
loop: {
prompt: 'Do the work. Say DONE.',
until: 'DONE',
max_iterations: 5,
},
},
],
},
workflowRun,
'claude',
undefined,
join(testDir, 'artifacts'),
join(testDir, 'logs'),
'main',
'docs/',
minimalConfig
);
// Should fail after one iteration rather than burning through max_iterations
expect(mockSendQueryDag.mock.calls.length).toBe(1);
// The loop_iteration_failed event should carry the subtype and SDK errors detail
const eventCalls = (store.createWorkflowEvent as ReturnType<typeof mock>).mock.calls;
const iterFailedEvents = eventCalls.filter(
(call: unknown[]) =>
(call[0] as Record<string, unknown>).event_type === 'loop_iteration_failed'
);
expect(iterFailedEvents.length).toBeGreaterThan(0);
const failedData = (iterFailedEvents[0][0] as Record<string, unknown>).data as Record<
string,
unknown
>;
expect(failedData.error).toContain('error_during_execution');
expect(failedData.error).toContain('Subprocess crashed mid-turn');
});
it('non-interactive loop is unaffected (no pause)', async () => {
mockSendQueryDag.mockImplementation(function* () {
yield { type: 'assistant', content: 'Still working...' };
@ -4617,6 +4681,58 @@ describe('executeDagWorkflow -- Claude SDK advanced options', () => {
expect(capMessage).toBeDefined();
});
it('fails node when SDK returns error_during_execution result', async () => {
// Regression test for #1208: previously we only failed on error_max_budget_usd
// and silently broke on all other isError subtypes, letting failed nodes
// masquerade as successes with empty output.
mockSendQueryDag.mockImplementation(function* () {
yield {
type: 'result',
isError: true,
errorSubtype: 'error_during_execution',
errors: ['Tool call failed: permission denied'],
sessionId: 'sid-err',
};
});
const store = createMockStore();
const mockDeps = createMockDeps(store);
const platform = createMockPlatform();
const workflowRun = makeWorkflowRun();
await executeDagWorkflow(
mockDeps,
platform,
'conv-dag',
testDir,
{
name: 'err-exec-test',
nodes: [{ id: 'step1', command: 'my-cmd' }],
},
workflowRun,
'claude',
undefined,
join(testDir, 'artifacts'),
join(testDir, 'logs'),
'main',
'docs/',
minimalConfig
);
// The node_failed event should carry the subtype and SDK errors detail
const eventCalls = (store.createWorkflowEvent as ReturnType<typeof mock>).mock.calls;
const nodeFailedEvents = eventCalls.filter(
(call: unknown[]) => (call[0] as Record<string, unknown>).event_type === 'node_failed'
);
expect(nodeFailedEvents.length).toBeGreaterThan(0);
const failedData = (nodeFailedEvents[0][0] as Record<string, unknown>).data as Record<
string,
unknown
>;
expect(failedData.error).toContain('error_during_execution');
expect(failedData.error).toContain('permission denied');
});
it('forwards workflow-level effort to node when no per-node override', async () => {
const mockDeps = createMockDeps();
const platform = createMockPlatform();

View file

@ -764,6 +764,25 @@ async function executeNodeInternal(
`Node '${node.id}' exceeded cost cap${cap !== undefined ? ` of $${cap.toFixed(2)}` : ''}.`
);
}
// Fail loudly on any other SDK error result. Previously we broke out of
// the stream silently, producing empty/partial output without signaling
// failure — which let failed iterations masquerade as successes (#1208).
if (msg.isError) {
const subtype = msg.errorSubtype ?? 'unknown';
const errorsDetail = msg.errors?.length ? `${msg.errors.join('; ')}` : '';
getLog().error(
{
nodeId: node.id,
errorSubtype: subtype,
errors: msg.errors,
sessionId: msg.sessionId,
stopReason: msg.stopReason,
durationMs: Date.now() - nodeStartTime,
},
'dag.node_sdk_error_result'
);
throw new Error(`Node '${node.id}' failed: SDK returned ${subtype}${errorsDetail}`);
}
break; // Result is the "I'm done" signal — don't wait for subprocess to exit
} else if (msg.type === 'system' && msg.content) {
// Forward provider warnings (⚠️) and MCP connection failures to the user.
@ -1626,6 +1645,28 @@ async function executeLoopNode(
if (msg.numTurns !== undefined) {
loopTotalNumTurns = (loopTotalNumTurns ?? 0) + msg.numTurns;
}
// Fail the iteration loudly on SDK error results. Previously we broke
// silently, producing empty output and continuing to the next iteration —
// which made `error_during_execution` on resumed interactive loops look
// like a "5-second crash" that kept burning iterations (#1208).
if (msg.isError) {
const subtype = msg.errorSubtype ?? 'unknown';
const errorsDetail = msg.errors?.length ? `${msg.errors.join('; ')}` : '';
getLog().error(
{
nodeId: node.id,
iteration: i,
errorSubtype: subtype,
errors: msg.errors,
sessionId: msg.sessionId,
stopReason: msg.stopReason,
},
'loop_node.iteration_sdk_error'
);
throw new Error(
`Loop '${node.id}' iteration ${String(i)} failed: SDK returned ${subtype}${errorsDetail}`
);
}
break; // Result is the "I'm done" signal — don't wait for subprocess to exit
} else if (msg.type === 'tool' && msg.toolName) {
const now = Date.now();