Fix browser wait timeout and focus behavior

2026-04-21 14:17:16 +00:00 · 2026-04-20 22:11:43 -04:00 · 2026-04-20 22:11:43 -04:00 · 19c49e601f
commit 19c49e601f
parent 99c7f59968
7 changed files with 366 additions and 45 deletions
--- a/docs/browser-automation-review-findings.md
+++ b/docs/browser-automation-review-findings.md
@ -83,6 +83,32 @@ Last updated: 2026-04-20
  Resolution: command responses are now sent back to the websocket that originated the request,
  so late results from a closed client are dropped instead of leaking into a newer connection.

+## Wait and emulation behavior
+
+- `fixed` `src/cli/index.ts`, `src/cli/index.test.ts`
+  Issue: `orca wait --selector ...` used the generic RPC timeout budget and then surfaced a
+  misleading "Orca is not running" hint when the wait outlived that budget.
+  Resolution: browser waits now use an explicit RPC timeout budget keyed to the requested wait
+  duration, and generic runtime timeouts no longer claim the app is down.
+
+- `fixed` `src/main/browser/agent-browser-bridge.ts`, `src/main/browser/agent-browser-bridge.test.ts`
+  Issue: selector waits relied on undocumented `agent-browser` behavior. `--state visible`
+  timed out even when the selector was already visible, and timed selector waits could still
+  bubble up as generic runtime timeouts when the underlying command never resolved itself.
+  Resolution: Orca now treats `--state visible` as the default selector-wait semantics instead
+  of forwarding an unsupported flag, and it enforces selector/text/url/function/load wait
+  timeouts at the bridge layer so missing conditions fail as `browser_timeout` with the
+  requested timeout in the message. Live verification against `orca-dev` confirmed both
+  `wait --selector h1 --state visible --timeout 2000` and
+  `wait --selector .definitely-not-present --timeout 1000`.
+
+- `fixed` `src/main/browser/agent-browser-bridge.ts`, `src/main/browser/agent-browser-bridge.test.ts`
+  Issue: `orca viewport --mobile` never took effect because the bridge shelled out to
+  `agent-browser set viewport`, which does not support mobile emulation.
+  Resolution: viewport emulation now goes straight through CDP
+  `Emulation.setDeviceMetricsOverride`, so width, scale, and the `mobile` flag all stay aligned
+  with the CLI contract.
+
 ## Console behavior

 - `not-a-bug` `src/main/browser/agent-browser-bridge.ts:1252`, `src/main/browser/cdp-ws-proxy.ts`
@ -92,6 +118,16 @@ Last updated: 2026-04-20
  emitted inside each guest renderer context, so the command is page-scoped even though the output
  includes Chromium/Electron-flavored warnings.

+## Focus behavior
+
+- `fixed` `src/main/browser/cdp-ws-proxy.ts`, `src/main/browser/cdp-ws-proxy.test.ts`
+  Issue: background browser automation could repeatedly steal app focus because the CDP proxy
+  auto-focused the guest for every `Runtime.evaluate` / `Runtime.callFunctionOn` request, and
+  commands like `wait --selector` poll through those methods.
+  Resolution: proxy auto-focus is now limited to `Input.insertText`. Generic eval/function calls
+  stay background-safe, so waits and read-only probes no longer keep dragging the browser window
+  to the foreground.
+
 ## Session lifecycle and process swaps

 - `fixed` `src/main/browser/agent-browser-bridge.ts:264`, `src/main/browser/agent-browser-bridge.ts:1148`, `src/main/browser/agent-browser-bridge.ts:1169`, `src/main/browser/agent-browser-bridge.ts:1559`, `src/main/browser/agent-browser-bridge.test.ts`
--- a/src/cli/index.test.ts
+++ b/src/cli/index.test.ts
@ -43,6 +43,7 @@ import {
  main,
  normalizeWorktreeSelector
 } from './index'
+import { RuntimeClientError } from './runtime-client'

 describe('COMMAND_SPECS collision check', () => {
  it('has no duplicate command paths', () => {
@ -499,3 +500,134 @@ describe('orca cli browser page targeting', () => {
    })
  })
 })
+
+describe('orca cli browser waits and viewport flags', () => {
+  beforeEach(() => {
+    callMock.mockReset()
+    process.exitCode = undefined
+  })
+
+  afterEach(() => {
+    vi.restoreAllMocks()
+  })
+
+  it('gives selector waits an explicit RPC timeout budget', async () => {
+    callMock.mockResolvedValueOnce({
+      id: 'req_wait',
+      ok: true,
+      result: { ok: true },
+      _meta: {
+        runtimeId: 'runtime-1'
+      }
+    })
+    vi.spyOn(console, 'log').mockImplementation(() => {})
+
+    await main(
+      ['wait', '--selector', '#ready', '--worktree', 'all', '--json'],
+      '/tmp/not-an-orca-worktree'
+    )
+
+    expect(callMock).toHaveBeenCalledWith(
+      'browser.wait',
+      {
+        selector: '#ready',
+        timeout: undefined,
+        text: undefined,
+        url: undefined,
+        load: undefined,
+        fn: undefined,
+        state: undefined,
+        worktree: undefined
+      },
+      { timeoutMs: 60_000 }
+    )
+  })
+
+  it('extends selector wait RPC timeout when the user passes --timeout', async () => {
+    callMock.mockResolvedValueOnce({
+      id: 'req_wait',
+      ok: true,
+      result: { ok: true },
+      _meta: {
+        runtimeId: 'runtime-1'
+      }
+    })
+    vi.spyOn(console, 'log').mockImplementation(() => {})
+
+    await main(
+      ['wait', '--selector', '#ready', '--timeout', '12000', '--worktree', 'all', '--json'],
+      '/tmp/not-an-orca-worktree'
+    )
+
+    expect(callMock).toHaveBeenCalledWith(
+      'browser.wait',
+      {
+        selector: '#ready',
+        timeout: 12000,
+        text: undefined,
+        url: undefined,
+        load: undefined,
+        fn: undefined,
+        state: undefined,
+        worktree: undefined
+      },
+      { timeoutMs: 17000 }
+    )
+  })
+
+  it('does not tell users Orca is down for a generic runtime timeout', async () => {
+    callMock.mockRejectedValueOnce(
+      new RuntimeClientError(
+        'runtime_timeout',
+        'Timed out waiting for the Orca runtime to respond.'
+      )
+    )
+    const errorSpy = vi.spyOn(console, 'error').mockImplementation(() => {})
+
+    await main(['wait', '--selector', '#ready', '--worktree', 'all'], '/tmp/not-an-orca-worktree')
+
+    expect(errorSpy).toHaveBeenCalledWith('Timed out waiting for the Orca runtime to respond.')
+  })
+
+  it('passes the mobile viewport flag through to browser.viewport', async () => {
+    callMock.mockResolvedValueOnce({
+      id: 'req_viewport',
+      ok: true,
+      result: {
+        width: 375,
+        height: 812,
+        deviceScaleFactor: 2,
+        mobile: true
+      },
+      _meta: {
+        runtimeId: 'runtime-1'
+      }
+    })
+    vi.spyOn(console, 'log').mockImplementation(() => {})
+
+    await main(
+      [
+        'viewport',
+        '--width',
+        '375',
+        '--height',
+        '812',
+        '--scale',
+        '2',
+        '--mobile',
+        '--worktree',
+        'all',
+        '--json'
+      ],
+      '/tmp/not-an-orca-worktree'
+    )
+
+    expect(callMock).toHaveBeenCalledWith('browser.viewport', {
+      width: 375,
+      height: 812,
+      deviceScaleFactor: 2,
+      mobile: true,
+      worktree: undefined
+    })
+  })
+})
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@ -78,6 +78,7 @@ type BrowserCliTarget = {
 }

 const DEFAULT_TERMINAL_WAIT_RPC_TIMEOUT_MS = 5 * 60 * 1000
+const DEFAULT_BROWSER_WAIT_RPC_TIMEOUT_MS = 60_000
 const GLOBAL_FLAGS = ['help', 'json']
 export const COMMAND_SPECS: CommandSpec[] = [
  {
@ -1021,16 +1022,26 @@ export async function main(argv = process.argv.slice(2), cwd = process.cwd()): P
      const fn = getOptionalStringFlag(parsed.flags, 'fn')
      const state = getOptionalStringFlag(parsed.flags, 'state')
      const target = await getBrowserCommandTarget(parsed.flags, cwd, client)
-      const result = await client.call<BrowserWaitResult>('browser.wait', {
-        selector,
-        timeout,
-        text,
-        url,
-        load,
-        fn,
-        state,
-        ...target
-      })
+      const result = await client.call<BrowserWaitResult>(
+        'browser.wait',
+        {
+          selector,
+          timeout,
+          text,
+          url,
+          load,
+          fn,
+          state,
+          ...target
+        },
+        {
+          // Why: selector/text/url waits can legitimately take longer than a
+          // normal RPC round-trip, even when Orca is healthy. Give browser.wait
+          // an explicit timeout budget so slow waits do not get mislabeled as
+          // "Orca is not running" by the generic client timeout path.
+          timeoutMs: timeout ? timeout + 5000 : DEFAULT_BROWSER_WAIT_RPC_TIMEOUT_MS
+        }
+      )
      return printResult(result, json, (v) => JSON.stringify(v, null, 2))
    }

@ -1955,10 +1966,7 @@ function formatCliStatus(status: CliStatusResult): string {

 function formatCliError(error: unknown): string {
  const message = error instanceof Error ? error.message : String(error)
-  if (
-    error instanceof RuntimeClientError &&
-    (error.code === 'runtime_unavailable' || error.code === 'runtime_timeout')
-  ) {
+  if (error instanceof RuntimeClientError && error.code === 'runtime_unavailable') {
    return `${message}\nOrca is not running. Run 'orca open' first.`
  }
  if (
--- a/src/main/browser/agent-browser-bridge.test.ts
+++ b/src/main/browser/agent-browser-bridge.test.ts
@ -830,17 +830,69 @@ describe('AgentBrowserBridge', () => {

  // ── Viewport command arg building ──

-  it('builds viewport args with scale', async () => {
-    succeedWith({ width: 375, height: 812, mobile: true })
+  it('applies viewport emulation through CDP so mobile mode is preserved', async () => {
+    const wc = mockWebContents(100)
+    webContentsFromIdMock.mockReturnValue(wc)
+
    await bridge.setViewport(375, 812, 2, true)

-    // Why: calls[0] is the stale-session 'close'; the actual command is the last call
+    expect(wc.debugger.sendCommand).toHaveBeenCalledWith('Emulation.setDeviceMetricsOverride', {
+      width: 375,
+      height: 812,
+      deviceScaleFactor: 2,
+      mobile: true
+    })
+    const viewportCall = execFileMock.mock.calls.find((call: unknown[]) =>
+      (call[1] as string[]).includes('viewport')
+    )
+    expect(viewportCall).toBeUndefined()
+  })
+
+  it('normalizes selector wait state=visible to the default supported semantics', async () => {
+    succeedWith({ selector: 'h1', waited: 'selector' })
+
+    await bridge.wait({ selector: 'h1', state: 'visible' })
+
    const args = execFileMock.mock.calls.at(-1)![1] as string[]
-    expect(args).toContain('set')
-    expect(args).toContain('viewport')
-    expect(args).toContain('375')
-    expect(args).toContain('812')
-    expect(args).toContain('2')
+    expect(args).toContain('wait')
+    expect(args).toContain('h1')
+    expect(args).not.toContain('--state')
+  })
+
+  it('enforces conditional wait timeouts at the bridge layer', async () => {
+    succeedWith({ selector: '#ready', waited: 'selector' })
+
+    await bridge.wait({ selector: '#ready', timeout: 1200 })
+
+    const args = execFileMock.mock.calls.at(-1)![1] as string[]
+    const options = execFileMock.mock.calls.at(-1)![2] as { timeout?: number; env?: unknown }
+    expect(args).toContain('wait')
+    expect(args).toContain('#ready')
+    expect(options.timeout).toBe(2200)
+    expect(options.env).toBe(process.env)
+  })
+
+  it('returns browser_timeout for timed conditional waits without recycling the session', async () => {
+    const killedError = Object.assign(new Error('timeout'), { killed: true })
+    execFileMock.mockImplementation(
+      (_bin: string, args: string[], _opts: unknown, cb: Function) => {
+        if (args.includes('wait')) {
+          cb(killedError, '', '')
+          return
+        }
+        cb(null, JSON.stringify({ success: true, data: { snapshot: 'fresh' } }), '')
+      }
+    )
+
+    for (let i = 0; i < 3; i++) {
+      await expect(bridge.wait({ selector: '.missing', timeout: 1200 })).rejects.toThrow(
+        'Timed out waiting for browser condition after 1200ms.'
+      )
+    }
+
+    await bridge.snapshot()
+
+    expect(CdpWsProxyMock.instances).toHaveLength(1)
  })

  // ── Stderr passthrough on non-timeout errors ──
--- a/src/main/browser/agent-browser-bridge.ts
+++ b/src/main/browser/agent-browser-bridge.ts
@ -52,6 +52,7 @@ import type {
 // agent-browser's own timeout fires and returns a proper error.
 const EXEC_TIMEOUT_MS = 90_000
 const CONSECUTIVE_TIMEOUT_LIMIT = 3
+const WAIT_PROCESS_TIMEOUT_GRACE_MS = 1_000

 type SessionState = {
  proxy: CdpWsProxy
@ -78,6 +79,12 @@ type ResolvedBrowserCommandTarget = {
  webContentsId: number
 }

+type AgentBrowserExecOptions = {
+  envOverrides?: NodeJS.ProcessEnv
+  timeoutMs?: number
+  timeoutError?: BrowserError
+}
+
 function agentBrowserNativeName(): string {
  const ext = process.platform === 'win32' ? '.exe' : ''
  return `agent-browser-${platform()}-${arch()}${ext}`
@ -982,9 +989,11 @@ export class AgentBrowserBridge {
  ): Promise<BrowserWaitResult> {
    return this.enqueueTargetedCommand(worktreeId, browserPageId, async (sessionName) => {
      const args = ['wait']
+      const hasCondition =
+        !!options?.selector || !!options?.text || !!options?.url || !!options?.load || !!options?.fn
      if (options?.selector) {
        args.push(options.selector)
-      } else if (options?.timeout != null) {
+      } else if (options?.timeout != null && !hasCondition) {
        args.push(String(options.timeout))
      }
      if (options?.text) {
@ -999,10 +1008,28 @@ export class AgentBrowserBridge {
      if (options?.fn) {
        args.push('--fn', options.fn)
      }
-      if (options?.state) {
-        args.push('--state', options.state)
+      const normalizedState = options?.state === 'visible' ? undefined : options?.state
+      if (normalizedState) {
+        args.push('--state', normalizedState)
      }
-      return (await this.execAgentBrowser(sessionName, args)) as BrowserWaitResult
+      // Why: agent-browser's selector wait surface does not support `--state visible`
+      // or a documented per-command `--timeout`. Orca normalizes "visible" back
+      // to the default selector wait semantics and enforces the requested timeout
+      // at the bridge layer so missing selectors fail as browser_timeout instead
+      // of hanging until the generic runtime RPC timeout fires.
+      return (await this.execAgentBrowser(sessionName, args, {
+        timeoutMs:
+          options?.timeout != null && hasCondition
+            ? options.timeout + WAIT_PROCESS_TIMEOUT_GRACE_MS
+            : undefined,
+        timeoutError:
+          options?.timeout != null && hasCondition
+            ? new BrowserError(
+                'browser_timeout',
+                `Timed out waiting for browser condition after ${options.timeout}ms.`
+              )
+            : undefined
+      })) as BrowserWaitResult
    })
  }

@ -1149,17 +1176,37 @@ export class AgentBrowserBridge {
  async setViewport(
    width: number,
    height: number,
-    scale?: number,
-    _mobile?: boolean,
+    scale = 1,
+    mobile = false,
    worktreeId?: string,
    browserPageId?: string
  ): Promise<BrowserViewportResult> {
-    return this.enqueueTargetedCommand(worktreeId, browserPageId, async (sessionName) => {
-      const args = ['set', 'viewport', String(width), String(height)]
-      if (scale != null) {
-        args.push(String(scale))
+    return this.enqueueTargetedCommand(worktreeId, browserPageId, async (_sessionName, target) => {
+      const wc = this.getWebContents(target.webContentsId)
+      if (!wc) {
+        throw new BrowserError('browser_tab_not_found', 'Tab is no longer available')
+      }
+      const dbg = wc.debugger
+      if (!dbg.isAttached()) {
+        throw new BrowserError('browser_error', 'Debugger not attached')
+      }
+
+      // Why: agent-browser only supports width/height/scale for `set viewport`;
+      // it has no `mobile` flag. Orca's CLI exposes `--mobile`, so apply the
+      // emulation directly through CDP to keep the public CLI contract honest.
+      await dbg.sendCommand('Emulation.setDeviceMetricsOverride', {
+        width,
+        height,
+        deviceScaleFactor: scale,
+        mobile
+      })
+
+      return {
+        width,
+        height,
+        deviceScaleFactor: scale,
+        mobile
      }
-      return (await this.execAgentBrowser(sessionName, args)) as BrowserViewportResult
    })
  }

@ -1575,7 +1622,11 @@ export class AgentBrowserBridge {
    }
  }

-  private async execAgentBrowser(sessionName: string, commandArgs: string[]): Promise<unknown> {
+  private async execAgentBrowser(
+    sessionName: string,
+    commandArgs: string[],
+    execOptions?: AgentBrowserExecOptions
+  ): Promise<unknown> {
    const session = this.sessions.get(sessionName)
    if (!session) {
      throw new BrowserError('browser_error', 'Session not found')
@ -1604,7 +1655,7 @@ export class AgentBrowserBridge {

    args.push(...commandArgs, '--json')

-    const stdout = await this.runAgentBrowserRaw(sessionName, args)
+    const stdout = await this.runAgentBrowserRaw(sessionName, args, execOptions)
    const translated = translateResult(stdout)

    if (!translated.ok) {
@ -1645,7 +1696,11 @@ export class AgentBrowserBridge {
    return translated.result
  }

-  private runAgentBrowserRaw(sessionName: string, args: string[]): Promise<string> {
+  private runAgentBrowserRaw(
+    sessionName: string,
+    args: string[],
+    execOptions?: AgentBrowserExecOptions
+  ): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      const session = this.sessions.get(sessionName)
      let child: ChildProcess | null = null
@ -1654,7 +1709,13 @@ export class AgentBrowserBridge {
        args,
        // Why: screenshots return large base64 strings that exceed Node's default
        // 1MB maxBuffer, causing ENOBUFS and a timeout-like failure.
-        { timeout: EXEC_TIMEOUT_MS, maxBuffer: 50 * 1024 * 1024 },
+        {
+          timeout: execOptions?.timeoutMs ?? EXEC_TIMEOUT_MS,
+          maxBuffer: 50 * 1024 * 1024,
+          env: execOptions?.envOverrides
+            ? { ...process.env, ...execOptions.envOverrides }
+            : process.env
+        },
        (error, stdout, stderr) => {
          if (session && session.activeProcess === child) {
            session.activeProcess = null
@ -1668,6 +1729,10 @@ export class AgentBrowserBridge {
          const liveSession = this.sessions.get(sessionName)

          if (error && (error as NodeJS.ErrnoException & { killed?: boolean }).killed) {
+            if (execOptions?.timeoutError) {
+              reject(execOptions.timeoutError)
+              return
+            }
            if (liveSession) {
              liveSession.consecutiveTimeouts++
              if (liveSession.consecutiveTimeouts >= CONSECUTIVE_TIMEOUT_LIMIT) {
--- a/src/main/browser/cdp-ws-proxy.test.ts
+++ b/src/main/browser/cdp-ws-proxy.test.ts
@ -33,7 +33,10 @@ function createMockWebContents() {
  return {
    webContents: {
      debugger: debuggerObj,
-      isDestroyed: () => false
+      isDestroyed: () => false,
+      focus: vi.fn(),
+      getTitle: vi.fn(() => 'Example'),
+      getURL: vi.fn(() => 'https://example.com')
    },
    listeners,
    emit(event: string, ...args: unknown[]) {
@ -242,6 +245,32 @@ describe('CdpWsProxy', () => {
    client.close()
  })

+  it('does not focus the guest for Runtime.evaluate polling commands', async () => {
+    const client = await connect()
+
+    await sendAndReceive(client, {
+      id: 9,
+      method: 'Runtime.evaluate',
+      params: { expression: 'document.readyState' }
+    })
+
+    expect(mock.webContents.focus).not.toHaveBeenCalled()
+    client.close()
+  })
+
+  it('still focuses the guest for Input.insertText', async () => {
+    const client = await connect()
+
+    await sendAndReceive(client, {
+      id: 10,
+      method: 'Input.insertText',
+      params: { text: 'hello' }
+    })
+
+    expect(mock.webContents.focus).toHaveBeenCalledTimes(1)
+    client.close()
+  })
+
  // ── Page.frameNavigated interception ──

  // ── Cleanup ──
--- a/src/main/browser/cdp-ws-proxy.ts
+++ b/src/main/browser/cdp-ws-proxy.ts
@ -233,13 +233,12 @@ export class CdpWsProxy {
      this.handleScreenshot(client, clientId, msg.params)
      return
    }
-    // Why: Input.insertText/Runtime.evaluate need native focus for clipboard APIs.
-    if (
-      (msg.method === 'Input.insertText' ||
-        msg.method === 'Runtime.evaluate' ||
-        msg.method === 'Runtime.callFunctionOn') &&
-      !this.webContents.isDestroyed()
-    ) {
+    // Why: Input.insertText can still require native focus in Electron webviews.
+    // Do not auto-focus generic Runtime.evaluate/callFunctionOn traffic: wait
+    // polling and read-only JS probes use those methods heavily, and focusing on
+    // every eval steals the user's foreground window while background automation
+    // is running.
+    if (msg.method === 'Input.insertText' && !this.webContents.isDestroyed()) {
      this.webContents.focus()
    }
    // Why: agent-browser waits for network idle to detect navigation completion.