🐛 fix(agent-runtime): harden classifyLLMError so it never masks the original provider error (#13774)

* 🐛 fix(agent-runtime): harden classifyLLMError so it never masks the original provider error

Production traces across multiple providers (openrouter, openai, google)
surface a single opaque error — `e.trim is not a function` with
`errorType: 'unknown'` — hiding whatever the upstream actually returned.

Root cause: `normalizeCode` / `normalizeErrorType` assumed their input is
always `string | undefined` (matching the TypeScript signature), but real
provider error objects frequently carry a numeric `code` (HTTP status) or
a structured object in `errorType`. `value?.trim()` short-circuits only
on null/undefined, so a truthy non-string turns into a TypeError that
the outer catch records as the "final" error, erasing the upstream one.
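The masking can be reproduced in isolation (a sketch, not the runtime's actual code; the variable name and cast are illustrative):

```typescript
// Sketch: why `value?.trim()` throws on a numeric provider code.
// Optional chaining short-circuits only on null/undefined, so a number
// reaches the `.trim()` call and raises a TypeError.
const value = 429 as unknown as string | undefined; // type lies, as in prod

let caught = '';
try {
  value?.trim();
} catch (e) {
  caught = (e as Error).message; // e.g. "value.trim is not a function"
}

console.log(caught.includes('is not a function')); // true on V8 / Node
```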

Fixes:
- Guard `normalizeCode` / `normalizeErrorType` on `typeof value ===
  'string'`, widen parameter type to `unknown`.
- Wrap the whole `classifyLLMError` in a try/catch that falls back to a
  conservative `stop` decision and preserves the best-effort message of
  the ORIGINAL error. A classifier that throws is worse than a
  classifier that's wrong — it must never shadow the real failure.
- `bestEffortMessage` swallows property-access errors (hostile Proxy
  etc.) to guarantee the fallback itself can't throw.
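To see why the fallback itself needs its own try/catch, consider a Proxy whose `get` trap always throws (a standalone sketch of the failure mode, not the shipped helper):

```typescript
// Sketch: even reading `.message` can throw when the error object is a
// Proxy whose `get` trap always throws, so a safe fallback must wrap
// property access in its own try/catch.
const hostile = new Proxy({} as { message?: string }, {
  get() {
    throw new Error('property access explodes');
  },
});

let message = 'unknown error';
try {
  message = String(hostile.message);
} catch {
  // swallow: keep the conservative default
}

console.log(message); // 'unknown error'
```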

Regression tests cover: numeric `code`, structured `errorType`, nested
OpenAI-SDK-shaped `error.error.code`, and a hostile Proxy that throws on
every property access.

This is a forcing function for root-cause diagnosis: after this lands,
the real upstream errors behind the 'e.trim' mask will finally surface.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Remove fallback warning in classifyLLMError

Removed console warning for classification failure.

* 🐛 fix(agent-runtime): treat numeric provider code as status fallback

Bare HTTP proxies sometimes surface the HTTP status ONLY as a numeric `code`
on the error object (no `status`/`statusCode`, no digits in the message).
After the previous fix guarded `normalizeCode` with `typeof value === 'string'`,
those numeric codes were dropped entirely, and auth/permission failures fell
through to retry, wasting the full retry budget on permanent errors.

Forward numeric `raw.code` / `nested?.code` / `nestedError?.code` into the
status chain (after the real status/statusCode lookups, before the
message-digit extractor) so classifyKind still maps 401/403 → stop and
429/5xx → retry.
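The resolution order can be sketched as a single chain (simplified and with illustrative names; the real logic lives inline in `normalizeSignal`):

```typescript
// Simplified sketch of the status resolution chain described above:
// explicit status/statusCode first, then a finite numeric `code`, then
// digits extracted from the message.
const resolveStatus = (raw: {
  status?: number;
  statusCode?: number;
  code?: unknown;
  message: string;
}): number | undefined => {
  const numericCode =
    typeof raw.code === 'number' && Number.isFinite(raw.code) ? raw.code : undefined;
  const fromMessage = raw.message.match(/\b([45]\d{2})\b/)?.[1];

  return (
    raw.status ?? raw.statusCode ?? numericCode ?? (fromMessage ? Number(fromMessage) : undefined)
  );
};

console.log(resolveStatus({ code: 401, message: 'upstream refused' })); // 401
console.log(resolveStatus({ code: 401, status: 500, message: 'oops' })); // 500 (status wins)
```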

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Arvin Xu 2026-04-14 23:23:21 +08:00 committed by GitHub
parent 1c75686b70
commit d6f11f80b6
2 changed files with 153 additions and 22 deletions

View file

@@ -49,4 +49,86 @@ describe('classifyLLMError', () => {
   it('should default unknown errors to retry', () => {
     expect(classifyLLMError(new Error('unexpected upstream issue')).kind).toBe('retry');
   });
+
+  describe('non-string code / errorType defensive handling', () => {
+    // Regression: real-world provider errors sometimes carry numeric `code`
+    // (HTTP status) or a structured object in the error fields. Earlier versions
+    // called `.trim()` on these and threw TypeError, which masked the original
+    // provider error behind "e.trim is not a function".
+    it('does not throw when error.code is a number', () => {
+      const result = classifyLLMError({ code: 429, message: 'rate limit' });
+      expect(result.message).toBe('rate limit');
+      // Classifier should still land on a valid kind, not crash.
+      expect(['retry', 'stop']).toContain(result.kind);
+    });
+
+    it('does not throw when errorType is an object', () => {
+      const result = classifyLLMError({
+        errorType: { nested: 'structured error' },
+        message: 'upstream returned structured type',
+      });
+      expect(result.message).toBe('upstream returned structured type');
+      expect(['retry', 'stop']).toContain(result.kind);
+    });
+
+    it('does not throw when nested error.code is a number (OpenAI SDK shape)', () => {
+      const result = classifyLLMError({
+        error: { error: { code: 402, message: 'payment required' } },
+        errorType: 'ProviderBizError',
+      });
+      expect(result.message).toBe('payment required');
+      expect(['retry', 'stop']).toContain(result.kind);
+    });
+
+    // Regression: some third-party proxies surface HTTP status ONLY as a
+    // numeric `code` (no `status`/`statusCode`, no status digits in the
+    // message). Previously these fell through to `retry`, causing wasteful
+    // retry loops on permanent auth/permission failures.
+    it('treats numeric code=401 as stop when no status field is present', () => {
+      const result = classifyLLMError({ code: 401, message: 'upstream refused' });
+      expect(result.kind).toBe('stop');
+    });
+
+    it('treats numeric code=403 as stop when no status field is present', () => {
+      const result = classifyLLMError({ code: 403, message: 'upstream refused' });
+      expect(result.kind).toBe('stop');
+    });
+
+    it('treats numeric code=429 as retry when no status field is present', () => {
+      const result = classifyLLMError({ code: 429, message: 'upstream refused' });
+      expect(result.kind).toBe('retry');
+    });
+
+    it('treats nested numeric code as stop (proxy-wrapped auth failure)', () => {
+      const result = classifyLLMError({
+        error: { error: { code: 401, message: 'proxy refused upstream' } },
+      });
+      expect(result.kind).toBe('stop');
+    });
+
+    it('prefers explicit status over numeric code fallback', () => {
+      // status says 500 (retry), code says 401 (stop) — status wins.
+      const result = classifyLLMError({ code: 401, message: 'oops', status: 500 });
+      expect(result.kind).toBe('retry');
+    });
+
+    it('preserves the original error message when normalizeSignal itself would throw', () => {
+      // Force a normalizeSignal crash by making `.toLowerCase()` blow up on a
+      // non-string message (via a Proxy that throws on Symbol.toPrimitive).
+      const hostile = new Proxy(
+        {},
+        {
+          get: () => {
+            throw new Error('property access explodes');
+          },
+        },
+      );
+
+      const result = classifyLLMError(hostile);
+      // Falls back to 'stop' when classifier throws; message is best-effort.
+      expect(result.kind).toBe('stop');
+      expect(typeof result.message).toBe('string');
+    });
+  });
 });

View file

@@ -64,8 +64,8 @@ const STOP_KEYWORDS = [
 const hasAnyKeyword = (text: string, keywords: string[]) =>
   keywords.some((keyword) => text.includes(keyword));
 
-const normalizeCode = (value?: string) => {
-  if (!value) return;
+const normalizeCode = (value?: unknown): string | undefined => {
+  if (typeof value !== 'string' || !value) return;
   return value
     .trim()
@@ -73,7 +73,11 @@ const normalizeCode = (value?: string) => {
     .replaceAll(/[\s-]+/g, '_');
 };
 
-const normalizeErrorType = (value?: string) => value?.trim();
+const normalizeErrorType = (value?: unknown): string | undefined => {
+  if (typeof value !== 'string') return undefined;
+  const trimmed = value.trim();
+  return trimmed || undefined;
+};
 
 const tryExtractStatus = (message: string) => {
   const matches = message.match(/\b([45]\d{2})\b/);
@@ -83,6 +87,17 @@ const tryExtractStatus = (message: string) => {
   return Number.isNaN(status) ? undefined : status;
 };
 
+// Some providers (notably bare HTTP proxies) only surface the HTTP status as a
+// numeric `code` on the error object, with no `status`/`statusCode`. Treat
+// those numeric codes as status so classifyKind can still map 401/403 to stop
+// and 429/5xx to retry without falling through to message-keyword matching.
+const numericStatusFromCode = (...codes: unknown[]): number | undefined => {
+  for (const code of codes) {
+    if (typeof code === 'number' && Number.isFinite(code)) return code;
+  }
+  return undefined;
+};
+
 const normalizeSignal = (error: unknown): LLMErrorSignal => {
   if (typeof error === 'string') {
     const message = error.toLowerCase();
@@ -91,11 +106,11 @@ const normalizeSignal = (error: unknown): LLMErrorSignal => {
   if (error instanceof Error) {
     const raw = error as Error & {
-      code?: string;
-      errorType?: string;
+      code?: unknown;
+      errorType?: unknown;
       status?: number;
       statusCode?: number;
-      type?: string;
+      type?: unknown;
     };
 
     const message = (raw.message || raw.name || 'unknown error').toLowerCase();
@@ -108,26 +123,26 @@ const normalizeSignal = (error: unknown): LLMErrorSignal => {
         ? raw.status
         : typeof raw.statusCode === 'number'
           ? raw.statusCode
-          : tryExtractStatus(message),
+          : (numericStatusFromCode(raw.code) ?? tryExtractStatus(message)),
     };
   }
 
   if (error && typeof error === 'object') {
     const raw = error as {
-      code?: string;
+      code?: unknown;
       error?: {
-        code?: string;
-        error?: { code?: string; message?: string; status?: number; type?: string };
-        errorType?: string;
+        code?: unknown;
+        error?: { code?: unknown; message?: string; status?: number; type?: unknown };
+        errorType?: unknown;
         message?: string;
         status?: number;
-        type?: string;
+        type?: unknown;
       };
-      errorType?: string;
+      errorType?: unknown;
       message?: string;
       status?: number;
       statusCode?: number;
-      type?: string;
+      type?: unknown;
     };
 
     const nested = raw.error;
     const nestedError = nested?.error;
@@ -153,7 +168,8 @@ const normalizeSignal = (error: unknown): LLMErrorSignal => {
         ? nested.status
         : typeof nestedError?.status === 'number'
           ? nestedError.status
-          : tryExtractStatus(message),
+          : (numericStatusFromCode(raw.code, nested?.code, nestedError?.code) ??
+            tryExtractStatus(message)),
     };
   }
@@ -198,14 +214,47 @@ const classifyKind = ({ code, errorType, message, status }: LLMErrorSignal): LLM
   return 'retry';
 };
 
-export const classifyLLMError = (error: unknown): ClassifiedLLMError => {
-  const signal = normalizeSignal(error);
+/**
+ * Extract a human-readable message for the fallback path without relying on
+ * normalizeSignal (which might be the thing that just threw).
+ */
+const bestEffortMessage = (error: unknown): string => {
+  try {
+    if (error instanceof Error && typeof error.message === 'string' && error.message.length > 0) {
+      return error.message;
+    }
+
+    if (typeof error === 'string' && error.length > 0) return error;
+
+    if (error && typeof error === 'object') {
+      const e = error as { message?: unknown; error?: { message?: unknown } };
+      if (typeof e.message === 'string' && e.message.length > 0) return e.message;
+      const nested = e.error?.message;
+      if (typeof nested === 'string' && nested.length > 0) return nested;
+    }
+  } catch {
+    // Property access itself can throw (e.g. hostile Proxy). Fall through to default.
+  }
 
-  return {
-    code: signal.code || signal.errorType,
-    kind: classifyKind(signal),
-    message: signal.message,
-  };
+  return 'unknown error';
+};
+
+export const classifyLLMError = (error: unknown): ClassifiedLLMError => {
+  // Defensive: a classifier that throws would mask the original provider error
+  // behind the classifier's own TypeError (e.g. `e.trim is not a function`),
+  // making prod debugging impossible. If anything below throws, fall back to a
+  // conservative "stop" decision that preserves the original error message.
+  try {
+    const signal = normalizeSignal(error);
+
+    return {
+      code: signal.code || signal.errorType,
+      kind: classifyKind(signal),
+      message: signal.message,
+    };
+  } catch (classificationError) {
+    return {
+      kind: 'stop',
+      message: bestEffortMessage(error),
+    };
+  }
 };
 
 export type { ClassifiedLLMError, LLMErrorKind };