From 17e47f3f8c4e7ffdc80420847ae202e4abc2f9ad Mon Sep 17 00:00:00 2001 From: Rasmus Widing Date: Sat, 11 Apr 2026 08:55:28 +0300 Subject: [PATCH] fix(claude): surface nested Claude Code hangs with timeout + diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes addressing the silent-hang class of bug in #1067: 1. CLAUDECODE=1 startup warning in the CLI When `archon` is invoked from inside a Claude Code session (parent env has CLAUDECODE=1), print a warning with a pointer to #1067 and the `archon serve` workaround. Suppressable via ARCHON_SUPPRESS_NESTED_CLAUDE_WARNING=1. 2. First-event timeout on the SDK query generator Wrap the Claude Agent SDK's `query()` async iterable so the first yielded message is raced against a 60-second timeout (configurable via ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS). If the subprocess spawns but produces no output within the window, log a diagnostic dump (subprocess env keys, parent CLAUDE_CODE_* keys, SDK model, stderr tail, process state) and throw a descriptive error pointing at #1067. Uses Promise.race on the first iterator.next() rather than relying on abortController.abort() alone — this unblocks us even if the SDK ignores the abort, which is the exact pathological case we're defending against. The reporter in #1067 saw 30+ minutes of silence; with this change they would have seen a clear error with actionable evidence within 60 seconds. Context: the hang does not reproduce on current dev in our environment (see the RCA at .agents/rca/issue-1067-nested-claude-hang.md), so this is defense + diagnostics rather than a root-cause fix. If the next reporter hits the same hang, the diagnostic dump will pinpoint the cause. Validation: - All 9 packages type-check clean, lint clean, format clean - 3 new unit tests for the timeout path (fast path, timeout path, error message discoverability) - E2E: `archon version` with CLAUDECODE=1 prints the warning; `archon workflow run archon-assist` still completes normally (timeout doesn't fire on fast path) --- packages/cli/src/cli.ts | 15 +++ packages/core/src/clients/claude.test.ts | 87 ++++++++++++++ packages/core/src/clients/claude.ts | 137 ++++++++++++++++++++++- 3 files changed, 238 insertions(+), 1 deletion(-) diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts index 96c02096..508baa7e 100755 --- a/packages/cli/src/cli.ts +++ b/packages/cli/src/cli.ts @@ -38,6 +38,21 @@ if (!process.env.CLAUDE_API_KEY && !process.env.CLAUDE_CODE_OAUTH_TOKEN) { } } +// Warn if running from inside a Claude Code session. Nested Claude Code +// subprocess launches have a history of hanging (keychain/session/TTY state, +// upstream SDK quirks) even when Archon's subprocess env strip works. Users +// hitting this class of issue should use `archon serve` + web UI / HTTP API +// from a non-nested shell. See coleam00/Archon#1067. +if (process.env.CLAUDECODE === '1' && process.env.ARCHON_SUPPRESS_NESTED_CLAUDE_WARNING !== '1') { + console.warn( + '⚠ Detected CLAUDECODE=1 — you appear to be running `archon` from inside a Claude Code session.\n' + + ' If workflows hang silently at dag_node_started, this is a known class of issue.\n' + + ' Workaround: run `archon serve` from a regular shell and use the web UI or HTTP API.\n' + + ' Details: https://github.com/coleam00/Archon/issues/1067\n' + + ' Suppress this warning: export ARCHON_SUPPRESS_NESTED_CLAUDE_WARNING=1' + ); +} + // DATABASE_URL is no longer required - SQLite will be used as default // Import commands after dotenv is loaded diff --git a/packages/core/src/clients/claude.test.ts b/packages/core/src/clients/claude.test.ts index fd79d162..44f49963 100644 --- a/packages/core/src/clients/claude.test.ts +++ b/packages/core/src/clients/claude.test.ts @@ -1105,4 +1105,91 @@ describe('ClaudeClient', () => { expect(spyScan).not.toHaveBeenCalled(); }); }); + + describe('first-event timeout', () => { + const originalTimeoutEnv = process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS; + + beforeEach(() => { + process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS = '50'; + }); + + afterEach(() => { + if (originalTimeoutEnv === undefined) { + delete process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS; + } else { + process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS = originalTimeoutEnv; + } + }); + + test('throws a descriptive error when SDK never yields a first message', async () => { + // Generator that sleeps forever — simulates a hung subprocess + mockQuery.mockImplementation(async function* () { + await new Promise(() => { + // never resolves; the abortController.abort() inside the generator's + // internal loop is what actually unblocks it in production. For the + // test, we simulate that by awaiting a promise that rejects on abort. + }); + // unreachable + yield { type: 'assistant', message: { content: [{ type: 'text', text: 'x' }] } }; + }); + + // The for-await loop in sendQuery() will be blocked on the generator's + // first `next()` call. The timeout fires, aborts the controller, and the + // SDK generator is expected to throw or return. Since our mock doesn't + // actually react to abort, we test that sendQuery() itself raises the + // descriptive timeout error. We use a very short timeout (50ms) to keep + // the test fast. + await expect(async () => { + for await (const _ of client.sendQuery('test', '/workspace')) { + // consume — should never produce any chunks + } + }).toThrow(/produced no output within/); + + // The diagnostic dump should have been logged at error level + expect(mockLogger.error).toHaveBeenCalledWith( + expect.objectContaining({ + timeoutMs: 50, + parentProcess: expect.objectContaining({ + platform: process.platform, + }), + }), + 'claude.first_event_timeout' + ); + }); + + test('does not throw when SDK yields a message within the timeout', async () => { + mockQuery.mockImplementation(async function* () { + yield { + type: 'assistant', + message: { content: [{ type: 'text', text: 'fast response' }] }, + }; + }); + + const chunks = []; + for await (const chunk of client.sendQuery('test', '/workspace')) { + chunks.push(chunk); + } + + expect(chunks).toHaveLength(1); + expect(chunks[0]).toEqual({ type: 'assistant', content: 'fast response' }); + // The timeout should not have fired + expect(mockLogger.error).not.toHaveBeenCalledWith( + expect.anything(), + 'claude.first_event_timeout' + ); + }); + + test('the timeout error mentions issue #1067 for discoverability', async () => { + mockQuery.mockImplementation(async function* () { + await new Promise(() => {}); + yield { type: 'assistant', message: { content: [{ type: 'text', text: 'x' }] } }; + }); + + await expect(async () => { + for await (const _ of client.sendQuery('test', '/workspace')) { + // consume + } + }).toThrow(/coleam00\/Archon\/issues\/1067/); + }); + }); }); diff --git a/packages/core/src/clients/claude.ts b/packages/core/src/clients/claude.ts index 1d2bd664..69c88256 100644 --- a/packages/core/src/clients/claude.ts +++ b/packages/core/src/clients/claude.ts @@ -195,6 +195,98 @@ const MAX_SUBPROCESS_RETRIES = 3; /** Delay between retries in milliseconds */ const RETRY_BASE_DELAY_MS = 2000; +/** Timeout (ms) for the SDK's *first* yielded message. If the subprocess spawns + * but produces no output within this window, we abort it and emit a diagnostic + * dump instead of hanging forever. Defaults to 60s — normal first events arrive + * in <5s, so 60s catches real hangs without flagging legitimately slow starts. + * Override via ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS. See coleam00/Archon#1067. */ +function getFirstEventTimeoutMs(): number { + const raw = process.env.ARCHON_CLAUDE_FIRST_EVENT_TIMEOUT_MS; + if (raw === undefined) return 60_000; + const parsed = Number.parseInt(raw, 10); + if (!Number.isFinite(parsed) || parsed <= 0) { + getLog().warn({ raw }, 'claude.invalid_first_event_timeout_env'); + return 60_000; + } + return parsed; +} + +/** + * Wrap an async iterable so that its first `next()` call is raced against a + * timeout. On timeout, `onTimeout()` is called to build and throw the error. + * Subsequent iterations are unaffected — only the *first* yield has a deadline. + * + * Uses Promise.race rather than relying on abortController so we unblock even + * if the source ignores abort (the exact pathological case we're defending). + */ +async function* withFirstMessageTimeout( + source: AsyncIterable, + timeoutMs: number, + onTimeout: () => Error +): AsyncGenerator { + const it = source[Symbol.asyncIterator](); + let first = true; + while (true) { + let result: IteratorResult; + if (first) { + first = false; + let timer: ReturnType | undefined; + try { + result = await Promise.race([ + it.next(), + new Promise((_, reject) => { + timer = setTimeout(() => { + reject(onTimeout()); + }, timeoutMs); + }), + ]); + } finally { + if (timer !== undefined) clearTimeout(timer); + } + } else { + result = await it.next(); + } + if (result.done) return; + yield result.value; + } + // No `finally` with `await it.return()` — on the timeout path, the source + // generator is stuck inside an `await` that never resolves (that's literally + // the bug we're defending against). Calling `it.return()` would itself hang + // waiting for the generator to unwind. In production, the SDK's own abort + // handling cleans up; in tests, the stuck generator is GC'd. +} + +/** Build a diagnostic snapshot for the hang case. Includes env var names + * (not values — secrets), SDK options (redacted), and process state. Logged + * at error level when the first-event timeout fires so the next reporter has + * actionable evidence instead of 30 minutes of silence. */ +function buildFirstEventHangDiagnostics(params: { + cwd: string; + attempt: number; + subprocessEnvKeys: string[]; + parentClaudeEnvKeys: string[]; + model: string | undefined; + timeoutMs: number; + stderrLines: string[]; +}): Record { + return { + cwd: params.cwd, + attempt: params.attempt, + timeoutMs: params.timeoutMs, + model: params.model, + subprocessEnvKeys: params.subprocessEnvKeys.sort(), + parentClaudeEnvKeys: params.parentClaudeEnvKeys.sort(), + parentProcess: { + platform: process.platform, + uid: getProcessUid(), + isTTY: process.stdout.isTTY ?? false, + claudeCode: process.env.CLAUDECODE ?? null, + claudeCodeEntrypoint: process.env.CLAUDE_CODE_ENTRYPOINT ?? null, + }, + stderrTail: params.stderrLines.slice(-20), + }; +} + /** Patterns indicating rate limiting in stderr/error messages */ const RATE_LIMIT_PATTERNS = ['rate limit', 'too many requests', '429', 'overloaded']; @@ -478,8 +570,43 @@ export class ClaudeClient implements IAssistantClient { getLog().debug({ cwd, attempt }, 'starting_new_session'); } + // First-event timeout: if the SDK produces no output within + // `firstEventTimeoutMs`, log a diagnostic dump and throw. Uses Promise.race + // on the first iterator.next() so we unblock even if the SDK ignores abort + // (the pathological case we're defending against). + // See coleam00/Archon#1067. + const firstEventTimeoutMs = getFirstEventTimeoutMs(); + let firstEventTimedOut = false; + const buildTimeoutError = (): Error => { + firstEventTimedOut = true; + const subprocessEnvKeys = Object.keys(options.env ?? {}); + const parentClaudeEnvKeys = Object.keys(process.env).filter( + k => k === 'CLAUDECODE' || k.startsWith('CLAUDE_CODE_') || k.startsWith('ANTHROPIC_') + ); + const diag = buildFirstEventHangDiagnostics({ + cwd, + attempt, + subprocessEnvKeys, + parentClaudeEnvKeys, + model: requestOptions?.model, + timeoutMs: firstEventTimeoutMs, + stderrLines, + }); + getLog().error(diag, 'claude.first_event_timeout'); + controller.abort(); // best effort — Promise.race has already unblocked us + return new Error( + `Claude Code subprocess produced no output within ${firstEventTimeoutMs}ms. ` + + "See logs for 'claude.first_event_timeout' diagnostics. " + + 'Details: https://github.com/coleam00/Archon/issues/1067' + ); + }; + try { - for await (const msg of query({ prompt, options })) { + for await (const msg of withFirstMessageTimeout( + query({ prompt, options }), + firstEventTimeoutMs, + buildTimeoutError + )) { // Drain tool results captured by PostToolUse hook before processing the next message while (toolResultQueue.length > 0) { const tr = toolResultQueue.shift(); @@ -588,6 +715,14 @@ export class ClaudeClient implements IAssistantClient { } catch (error) { const err = error as Error; + // First-event timeout fired: the SDK never yielded a message within + // `firstEventTimeoutMs`. `buildTimeoutError()` already built, logged, + // and aborted. Re-throw the error as-is — no retry, because a wedged + // subprocess will produce the same hang next time. + if (firstEventTimedOut) { + throw err; + } + // Don't retry aborted queries if (controller.signal.aborted) { throw new Error('Query aborted');