fix(workflows): fail loudly on SDK isError results (#1208) (#1291)

Previously, `dag-executor` only failed nodes/iterations when the SDK returned an `error_max_budget_usd` result. Every other `isError: true` subtype — including `error_during_execution` — was silently `break`ed out of the stream with whatever partial output had accumulated, letting failed runs masquerade as successful ones with empty output. This is the most likely explanation for the "5-second crash" symptom in #1208: iterations finish instantly with empty text, the loop keeps going, and only the `claude.result_is_error` log tips the user off. Changes: - Capture the SDK's `errors: string[]` detail on result messages (previously discarded) and surface it through `MessageChunk.errors`. - Log `errors`, `stopReason` alongside `errorSubtype` in `claude.result_is_error` so users can see what actually failed. - Throw from both the general node path and the loop iteration path on any `isError: true` result, including the subtype and SDK errors detail in the thrown message. Note: this does not implement auto-retry. See PR comments on #1121 and the analysis on #1208 — a retry-with-fresh-session approach for loop iterations is not obviously correct until we see what `error_during_execution` actually carries in the reporter's env. This change is the observability + fail-loud step that has to come first so that signal is no longer silent. Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-21 13:37:41 +00:00 · 2026-04-18 15:02:35 -05:00 · 2026-04-18 15:02:35 -05:00 · 4c6ddd994f
commit 4c6ddd994f
parent d89bc767d2
4 changed files with 168 additions and 1 deletions
--- a/packages/providers/src/claude/provider.ts
+++ b/packages/providers/src/claude/provider.ts
@ -740,6 +740,7 @@ async function* streamClaudeMessages(
        total_cost_usd?: number;
        stop_reason?: string | null;
        num_turns?: number;
+        errors?: string[];
        model_usage?: Record<
          string,
          {
@ -751,9 +752,15 @@ async function* streamClaudeMessages(
        >;
      };
      const tokens = normalizeClaudeUsage(resultMsg.usage);
+      const sdkErrors = Array.isArray(resultMsg.errors) ? resultMsg.errors : undefined;
      if (resultMsg.is_error) {
        getLog().error(
-          { sessionId: resultMsg.session_id, errorSubtype: resultMsg.subtype },
+          {
+            sessionId: resultMsg.session_id,
+            errorSubtype: resultMsg.subtype,
+            stopReason: resultMsg.stop_reason,
+            errors: sdkErrors,
+          },
          'claude.result_is_error'
        );
      }
@ -765,6 +772,7 @@ async function* streamClaudeMessages(
          ? { structuredOutput: resultMsg.structured_output }
          : {}),
        ...(resultMsg.is_error ? { isError: true, errorSubtype: resultMsg.subtype } : {}),
+        ...(resultMsg.is_error && sdkErrors?.length ? { errors: sdkErrors } : {}),
        ...(resultMsg.total_cost_usd !== undefined ? { cost: resultMsg.total_cost_usd } : {}),
        ...(resultMsg.stop_reason != null ? { stopReason: resultMsg.stop_reason } : {}),
        ...(resultMsg.num_turns !== undefined ? { numTurns: resultMsg.num_turns } : {}),
--- a/packages/providers/src/types.ts
+++ b/packages/providers/src/types.ts
@ -72,6 +72,8 @@ export type MessageChunk =
      structuredOutput?: unknown;
      isError?: boolean;
      errorSubtype?: string;
+      /** SDK-provided error detail strings. Populated when isError is true. */
+      errors?: string[];
      cost?: number;
      stopReason?: string;
      numTurns?: number;
--- a/packages/workflows/src/dag-executor.test.ts
+++ b/packages/workflows/src/dag-executor.test.ts
@ -3594,6 +3594,70 @@ describe('executeDagWorkflow -- resume with priorCompletedNodes', () => {
      expect(sessionArg).toBe('loop-session-1');
    });

+    it('loop iteration fails loudly when SDK returns error_during_execution', async () => {
+      // Regression test for #1208: previously the loop silently broke on isError
+      // results and kept iterating with empty output, producing "5-second crashes"
+      // that masqueraded as successful iterations.
+      mockSendQueryDag.mockImplementation(function* () {
+        yield {
+          type: 'result',
+          isError: true,
+          errorSubtype: 'error_during_execution',
+          errors: ['Subprocess crashed mid-turn'],
+          sessionId: 'bad-session',
+        };
+      });
+
+      const store = createMockStore();
+      const mockDeps = createMockDeps(store);
+      const platform = createMockPlatform();
+      const workflowRun = makeWorkflowRun();
+
+      await executeDagWorkflow(
+        mockDeps,
+        platform,
+        'conv-dag',
+        testDir,
+        {
+          name: 'loop-iteration-err',
+          nodes: [
+            {
+              id: 'work',
+              loop: {
+                prompt: 'Do the work. Say DONE.',
+                until: 'DONE',
+                max_iterations: 5,
+              },
+            },
+          ],
+        },
+        workflowRun,
+        'claude',
+        undefined,
+        join(testDir, 'artifacts'),
+        join(testDir, 'logs'),
+        'main',
+        'docs/',
+        minimalConfig
+      );
+
+      // Should fail after one iteration rather than burning through max_iterations
+      expect(mockSendQueryDag.mock.calls.length).toBe(1);
+      // The loop_iteration_failed event should carry the subtype and SDK errors detail
+      const eventCalls = (store.createWorkflowEvent as ReturnType<typeof mock>).mock.calls;
+      const iterFailedEvents = eventCalls.filter(
+        (call: unknown[]) =>
+          (call[0] as Record<string, unknown>).event_type === 'loop_iteration_failed'
+      );
+      expect(iterFailedEvents.length).toBeGreaterThan(0);
+      const failedData = (iterFailedEvents[0][0] as Record<string, unknown>).data as Record<
+        string,
+        unknown
+      >;
+      expect(failedData.error).toContain('error_during_execution');
+      expect(failedData.error).toContain('Subprocess crashed mid-turn');
+    });
+
    it('non-interactive loop is unaffected (no pause)', async () => {
      mockSendQueryDag.mockImplementation(function* () {
        yield { type: 'assistant', content: 'Still working...' };
@ -4617,6 +4681,58 @@ describe('executeDagWorkflow -- Claude SDK advanced options', () => {
    expect(capMessage).toBeDefined();
  });

+  it('fails node when SDK returns error_during_execution result', async () => {
+    // Regression test for #1208: previously we only failed on error_max_budget_usd
+    // and silently broke on all other isError subtypes, letting failed nodes
+    // masquerade as successes with empty output.
+    mockSendQueryDag.mockImplementation(function* () {
+      yield {
+        type: 'result',
+        isError: true,
+        errorSubtype: 'error_during_execution',
+        errors: ['Tool call failed: permission denied'],
+        sessionId: 'sid-err',
+      };
+    });
+
+    const store = createMockStore();
+    const mockDeps = createMockDeps(store);
+    const platform = createMockPlatform();
+    const workflowRun = makeWorkflowRun();
+
+    await executeDagWorkflow(
+      mockDeps,
+      platform,
+      'conv-dag',
+      testDir,
+      {
+        name: 'err-exec-test',
+        nodes: [{ id: 'step1', command: 'my-cmd' }],
+      },
+      workflowRun,
+      'claude',
+      undefined,
+      join(testDir, 'artifacts'),
+      join(testDir, 'logs'),
+      'main',
+      'docs/',
+      minimalConfig
+    );
+
+    // The node_failed event should carry the subtype and SDK errors detail
+    const eventCalls = (store.createWorkflowEvent as ReturnType<typeof mock>).mock.calls;
+    const nodeFailedEvents = eventCalls.filter(
+      (call: unknown[]) => (call[0] as Record<string, unknown>).event_type === 'node_failed'
+    );
+    expect(nodeFailedEvents.length).toBeGreaterThan(0);
+    const failedData = (nodeFailedEvents[0][0] as Record<string, unknown>).data as Record<
+      string,
+      unknown
+    >;
+    expect(failedData.error).toContain('error_during_execution');
+    expect(failedData.error).toContain('permission denied');
+  });
+
  it('forwards workflow-level effort to node when no per-node override', async () => {
    const mockDeps = createMockDeps();
    const platform = createMockPlatform();
--- a/packages/workflows/src/dag-executor.ts
+++ b/packages/workflows/src/dag-executor.ts
@ -764,6 +764,25 @@ async function executeNodeInternal(
            `Node '${node.id}' exceeded cost cap${cap !== undefined ? ` of $${cap.toFixed(2)}` : ''}.`
          );
        }
+        // Fail loudly on any other SDK error result. Previously we broke out of
+        // the stream silently, producing empty/partial output without signaling
+        // failure — which let failed iterations masquerade as successes (#1208).
+        if (msg.isError) {
+          const subtype = msg.errorSubtype ?? 'unknown';
+          const errorsDetail = msg.errors?.length ? ` — ${msg.errors.join('; ')}` : '';
+          getLog().error(
+            {
+              nodeId: node.id,
+              errorSubtype: subtype,
+              errors: msg.errors,
+              sessionId: msg.sessionId,
+              stopReason: msg.stopReason,
+              durationMs: Date.now() - nodeStartTime,
+            },
+            'dag.node_sdk_error_result'
+          );
+          throw new Error(`Node '${node.id}' failed: SDK returned ${subtype}${errorsDetail}`);
+        }
        break; // Result is the "I'm done" signal — don't wait for subprocess to exit
      } else if (msg.type === 'system' && msg.content) {
        // Forward provider warnings (⚠️) and MCP connection failures to the user.
@ -1626,6 +1645,28 @@ async function executeLoopNode(
          if (msg.numTurns !== undefined) {
            loopTotalNumTurns = (loopTotalNumTurns ?? 0) + msg.numTurns;
          }
+          // Fail the iteration loudly on SDK error results. Previously we broke
+          // silently, producing empty output and continuing to the next iteration —
+          // which made `error_during_execution` on resumed interactive loops look
+          // like a "5-second crash" that kept burning iterations (#1208).
+          if (msg.isError) {
+            const subtype = msg.errorSubtype ?? 'unknown';
+            const errorsDetail = msg.errors?.length ? ` — ${msg.errors.join('; ')}` : '';
+            getLog().error(
+              {
+                nodeId: node.id,
+                iteration: i,
+                errorSubtype: subtype,
+                errors: msg.errors,
+                sessionId: msg.sessionId,
+                stopReason: msg.stopReason,
+              },
+              'loop_node.iteration_sdk_error'
+            );
+            throw new Error(
+              `Loop '${node.id}' iteration ${String(i)} failed: SDK returned ${subtype}${errorsDetail}`
+            );
+          }
          break; // Result is the "I'm done" signal — don't wait for subprocess to exit
        } else if (msg.type === 'tool' && msg.toolName) {
          const now = Date.now();