mirror of
https://github.com/stablyai/orca
synced 2026-04-21 14:17:16 +00:00
fix(terminal): auto-respawn PTY daemon when it dies mid-session
When the daemon process died (e.g. from a signal, OOM, or cascading from force-quitting child processes), all terminals froze permanently with "connect ENOENT daemon-v1.sock" because there was no recovery path — DaemonSpawner.handle stayed non-null and ensureRunning() never re-spawned. Add lazy daemon respawn: DaemonPtyAdapter.withDaemonRetry() catches daemon-death errors (ENOENT, ECONNREFUSED, "Connection lost"), tears down stale client state, forks a fresh daemon, and retries the operation once.
This commit is contained in:
parent
b3f99b5ae1
commit
f6d5f62e60
4 changed files with 112 additions and 1 deletions
|
|
@ -160,7 +160,16 @@ export async function initDaemonPtyProvider(): Promise<void> {
|
|||
const newAdapter = new DaemonPtyAdapter({
|
||||
socketPath: info.socketPath,
|
||||
tokenPath: info.tokenPath,
|
||||
historyPath: getHistoryDir()
|
||||
historyPath: getHistoryDir(),
|
||||
// Why: when the daemon process dies (e.g. killed by a signal, OOM, or
|
||||
// cascading from a force-quit of child processes), the adapter's
|
||||
// withDaemonRetry() catches the daemon-death error and calls this to fork a
|
||||
// replacement daemon before retrying the connection.
|
||||
respawn: async () => {
|
||||
console.warn('[daemon] Daemon process died — respawning')
|
||||
newSpawner.resetHandle()
|
||||
await newSpawner.ensureRunning()
|
||||
}
|
||||
})
|
||||
|
||||
spawner = newSpawner
|
||||
|
|
|
|||
|
|
@ -641,4 +641,50 @@ describe('DaemonPtyAdapter (IPtyProvider)', () => {
|
|||
)
|
||||
})
|
||||
})
|
||||
|
||||
describe('respawn on daemon death', () => {
  it('respawns the daemon and retries when the socket disappears', async () => {
    // Captures the replacement server started by the respawn callback so the
    // test can shut it down afterwards.
    let respawnServer: DaemonServer | undefined
    const respawnFn = vi.fn(async () => {
      respawnServer = new DaemonServer({
        socketPath,
        tokenPath,
        spawnSubprocess: () => createMockSubprocess()
      })
      await respawnServer.start()
    })

    const respawnAdapter = new DaemonPtyAdapter({ socketPath, tokenPath, respawn: respawnFn })

    // Why: cleanup lives in `finally` so a failed assertion or a rejected
    // spawn cannot leave the adapter or the replacement server running after
    // this test ends.
    try {
      // First spawn succeeds normally
      const r1 = await respawnAdapter.spawn({ cols: 80, rows: 24 })
      expect(r1.id).toBeDefined()

      // Kill the server to simulate daemon death
      await server.shutdown()

      // Next spawn should detect the dead socket, call respawn, and succeed
      const r2 = await respawnAdapter.spawn({ cols: 80, rows: 24 })
      expect(r2.id).toBeDefined()
      expect(respawnFn).toHaveBeenCalledOnce()
    } finally {
      respawnAdapter.dispose()
      await respawnServer?.shutdown()
    }
  })

  it('propagates the error when no respawn callback is provided', async () => {
    const noRespawnAdapter = new DaemonPtyAdapter({ socketPath, tokenPath })

    // Why: dispose in `finally` so the adapter is released even when one of
    // the steps below throws.
    try {
      // First spawn succeeds
      await noRespawnAdapter.spawn({ cols: 80, rows: 24 })

      // Kill the server
      await server.shutdown()

      // Next spawn should fail with the original socket error
      await expect(noRespawnAdapter.spawn({ cols: 80, rows: 24 })).rejects.toThrow()
    } finally {
      noRespawnAdapter.dispose()
    }
  })
})
|
||||
})
|
||||
|
|
|
|||
|
|
@ -17,6 +17,9 @@ export type DaemonPtyAdapterOptions = {
|
|||
/** Directory for disk-based terminal history. When set, the adapter writes
|
||||
* raw PTY output to disk for cold restore on daemon crash. */
|
||||
historyPath?: string
|
||||
/** Called when the daemon socket is unreachable (process died). Expected to
|
||||
* fork a fresh daemon so the next connection attempt can succeed. */
|
||||
respawn?: () => Promise<void>
|
||||
}
|
||||
|
||||
const MAX_TOMBSTONES = 1000
|
||||
|
|
@ -32,6 +35,7 @@ export class DaemonPtyAdapter implements IPtyProvider {
|
|||
private client: DaemonClient
|
||||
private historyManager: HistoryManager | null
|
||||
private historyReader: HistoryReader | null
|
||||
private respawnFn: (() => Promise<void>) | null
|
||||
private dataListeners: ((payload: { id: string; data: string }) => void)[] = []
|
||||
private exitListeners: ((payload: { id: string; code: number }) => void)[] = []
|
||||
private removeEventListener: (() => void) | null = null
|
||||
|
|
@ -54,6 +58,7 @@ export class DaemonPtyAdapter implements IPtyProvider {
|
|||
})
|
||||
this.historyManager = opts.historyPath ? new HistoryManager(opts.historyPath) : null
|
||||
this.historyReader = opts.historyPath ? new HistoryReader(opts.historyPath) : null
|
||||
this.respawnFn = opts.respawn ?? null
|
||||
}
|
||||
|
||||
getHistoryManager(): HistoryManager | null {
|
||||
|
|
@ -61,6 +66,10 @@ export class DaemonPtyAdapter implements IPtyProvider {
|
|||
}
|
||||
|
||||
// Why: spawning goes through withDaemonRetry() so a failed attempt gets the
// retry handling implemented there; the actual spawn work is in doSpawn().
async spawn(opts: PtySpawnOptions): Promise<PtySpawnResult> {
  const attempt = (): Promise<PtySpawnResult> => this.doSpawn(opts)
  return this.withDaemonRetry(attempt)
}
|
||||
|
||||
private async doSpawn(opts: PtySpawnOptions): Promise<PtySpawnResult> {
|
||||
await this.ensureConnected()
|
||||
|
||||
const sessionId =
|
||||
|
|
@ -366,6 +375,28 @@ export class DaemonPtyAdapter implements IPtyProvider {
|
|||
this.setupEventRouting()
|
||||
}
|
||||
|
||||
// Why: a dead daemon makes operations fail with ENOENT (socket gone),
// ECONNREFUSED, or "Connection lost" (socket closed mid-request). Instead of
// leaving every terminal broken until the app restarts, this wrapper runs
// `fn`, and when it fails with a daemon-death error and a respawn callback is
// configured, it drops the stale event subscription and client connection,
// awaits the respawn callback, and runs `fn` one more time. Any other error,
// a failure of the respawn callback, or a failure of the second attempt is
// thrown to the caller unchanged.
private async withDaemonRetry<T>(fn: () => Promise<T>): Promise<T> {
  let failure: unknown
  try {
    return await fn()
  } catch (caught) {
    failure = caught
  }

  // Without a respawn callback, or for an unrelated error, there is nothing
  // to recover from here: surface the original failure.
  if (!(this.respawnFn && isDaemonGoneError(failure))) {
    throw failure
  }

  console.warn('[daemon] Operation failed, respawning:', (failure as Error).message)

  // Tear down the stale state before the respawn: unsubscribe from events
  // first, then disconnect the client.
  if (this.removeEventListener) {
    this.removeEventListener()
  }
  this.removeEventListener = null
  this.client.disconnect()

  await this.respawnFn()

  // Single retry only: a second failure is not caught here.
  return await fn()
}
|
||||
|
||||
private setupEventRouting(): void {
|
||||
if (this.removeEventListener) {
|
||||
return
|
||||
|
|
@ -402,3 +433,21 @@ export class DaemonPtyAdapter implements IPtyProvider {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Why: these are the failures treated as "the daemon is gone, try a respawn":
//   - code ENOENT: the socket file no longer exists (daemon crashed and
//     cleaned up, or was killed).
//   - code ECONNREFUSED: the socket file exists but nothing is listening
//     (rare race).
//   - message "Connection lost" / "Not connected": the daemon died while we
//     held an active or stale connection and we still tried to use it.
// Messages are compared exactly (case-sensitive). Anything that is not an
// Error instance is never treated as a daemon-death error.
function isDaemonGoneError(err: unknown): boolean {
  if (err instanceof Error) {
    // `code` is not part of the base Error type; system errors carry it.
    const { code } = err as Error & { code?: unknown }
    return (
      code === 'ENOENT' ||
      code === 'ECONNREFUSED' ||
      err.message === 'Connection lost' ||
      err.message === 'Not connected'
    )
  }
  return false
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,13 @@ export class DaemonSpawner {
|
|||
return { socketPath: this.socketPath, tokenPath: this.tokenPath }
|
||||
}
|
||||
|
||||
// Why: once the daemon process has died unexpectedly, the cached handle is
// stale. Dropping it here lets the next ensureRunning() fork a fresh daemon
// instead of returning the dead socket path.
resetHandle(): void {
  this.handle = null
}
|
||||
|
||||
async shutdown(): Promise<void> {
|
||||
if (!this.handle) {
|
||||
return
|
||||
|
|
|
|||
Loading…
Reference in a new issue