mirror of
https://github.com/lobehub/lobehub
synced 2026-04-21 17:47:27 +00:00
🐛 fix(agent-runtime): harden classifyLLMError so it never masks the original provider error (#13774)
* 🐛 fix(agent-runtime): harden classifyLLMError so it never masks the original provider error Production traces across multiple providers (openrouter, openai, google) surface a single opaque error — `e.trim is not a function` with `errorType: 'unknown'` — hiding whatever the upstream actually returned. Root cause: `normalizeCode` / `normalizeErrorType` assumed their input is always `string | undefined` (matching the TypeScript signature), but real provider error objects frequently carry a numeric `code` (HTTP status) or a structured object in `errorType`. `value?.trim()` short-circuits only on null/undefined, so a truthy non-string turns into a TypeError that the outer catch records as the "final" error, erasing the upstream one. Fixes: - Guard `normalizeCode` / `normalizeErrorType` on `typeof value === 'string'`, widen parameter type to `unknown`. - Wrap the whole `classifyLLMError` in a try/catch that falls back to a conservative `stop` decision and preserves the best-effort message of the ORIGINAL error. A classifier that throws is worse than a classifier that's wrong — it must never shadow the real failure. - `bestEffortMessage` swallows property-access errors (hostile Proxy etc.) to guarantee the fallback itself can't throw. Regression tests cover: numeric `code`, structured `errorType`, nested OpenAI-SDK-shaped `error.error.code`, and a hostile Proxy that throws on every property access. This is a forcing function for root-cause diagnosis: after this lands, the real upstream errors behind the 'e.trim' mask will finally surface. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Remove fallback warning in classifyLLMError Removed console warning for classification failure. * 🐛 fix(agent-runtime): treat numeric provider code as status fallback Bare HTTP proxies sometimes surface the HTTP status ONLY as a numeric `code` on the error object (no `status`/`statusCode`, no digits in the message). After widening `normalizeCode` to require `typeof === 'string'`, those numeric codes were dropped entirely and auth/permission failures fell through to retry — wasting the full retry budget on permanent errors. Forward numeric `raw.code` / `nested?.code` / `nestedError?.code` into the status chain (after the real status/statusCode lookups, before the message-digit extractor) so classifyKind still maps 401/403 → stop and 429/5xx → retry. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1c75686b70
commit
d6f11f80b6
2 changed files with 153 additions and 22 deletions
|
|
@ -49,4 +49,86 @@ describe('classifyLLMError', () => {
|
|||
it('should default unknown errors to retry', () => {
|
||||
expect(classifyLLMError(new Error('unexpected upstream issue')).kind).toBe('retry');
|
||||
});
|
||||
|
||||
describe('non-string code / errorType defensive handling', () => {
|
||||
// Regression: real-world provider errors sometimes carry numeric `code`
|
||||
// (HTTP status) or a structured object in the error fields. Earlier versions
|
||||
// called `.trim()` on these and threw TypeError, which masked the original
|
||||
// provider error behind "e.trim is not a function".
|
||||
|
||||
it('does not throw when error.code is a number', () => {
|
||||
const result = classifyLLMError({ code: 429, message: 'rate limit' });
|
||||
expect(result.message).toBe('rate limit');
|
||||
// Classifier should still land on a valid kind, not crash.
|
||||
expect(['retry', 'stop']).toContain(result.kind);
|
||||
});
|
||||
|
||||
it('does not throw when errorType is an object', () => {
|
||||
const result = classifyLLMError({
|
||||
errorType: { nested: 'structured error' },
|
||||
message: 'upstream returned structured type',
|
||||
});
|
||||
expect(result.message).toBe('upstream returned structured type');
|
||||
expect(['retry', 'stop']).toContain(result.kind);
|
||||
});
|
||||
|
||||
it('does not throw when nested error.code is a number (OpenAI SDK shape)', () => {
|
||||
const result = classifyLLMError({
|
||||
error: { error: { code: 402, message: 'payment required' } },
|
||||
errorType: 'ProviderBizError',
|
||||
});
|
||||
expect(result.message).toBe('payment required');
|
||||
expect(['retry', 'stop']).toContain(result.kind);
|
||||
});
|
||||
|
||||
// Regression: some third-party proxies surface HTTP status ONLY as a
|
||||
// numeric `code` (no `status`/`statusCode`, no status digits in the
|
||||
// message). Previously these fell through to `retry`, causing wasteful
|
||||
// retry loops on permanent auth/permission failures.
|
||||
|
||||
it('treats numeric code=401 as stop when no status field is present', () => {
|
||||
const result = classifyLLMError({ code: 401, message: 'upstream refused' });
|
||||
expect(result.kind).toBe('stop');
|
||||
});
|
||||
|
||||
it('treats numeric code=403 as stop when no status field is present', () => {
|
||||
const result = classifyLLMError({ code: 403, message: 'upstream refused' });
|
||||
expect(result.kind).toBe('stop');
|
||||
});
|
||||
|
||||
it('treats numeric code=429 as retry when no status field is present', () => {
|
||||
const result = classifyLLMError({ code: 429, message: 'upstream refused' });
|
||||
expect(result.kind).toBe('retry');
|
||||
});
|
||||
|
||||
it('treats nested numeric code as stop (proxy-wrapped auth failure)', () => {
|
||||
const result = classifyLLMError({
|
||||
error: { error: { code: 401, message: 'proxy refused upstream' } },
|
||||
});
|
||||
expect(result.kind).toBe('stop');
|
||||
});
|
||||
|
||||
it('prefers explicit status over numeric code fallback', () => {
|
||||
// status says 500 (retry), code says 401 (stop) — status wins.
|
||||
const result = classifyLLMError({ code: 401, message: 'oops', status: 500 });
|
||||
expect(result.kind).toBe('retry');
|
||||
});
|
||||
|
||||
it('preserves the original error message when normalizeSignal itself would throw', () => {
|
||||
// Force a normalizeSignal crash by making `.toLowerCase()` blow up on a
|
||||
// non-string message (via a Proxy that throws on Symbol.toPrimitive).
|
||||
const hostile = new Proxy(
|
||||
{},
|
||||
{
|
||||
get: () => {
|
||||
throw new Error('property access explodes');
|
||||
},
|
||||
},
|
||||
);
|
||||
const result = classifyLLMError(hostile);
|
||||
// Falls back to 'stop' when classifier throws; message is best-effort.
|
||||
expect(result.kind).toBe('stop');
|
||||
expect(typeof result.message).toBe('string');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -64,8 +64,8 @@ const STOP_KEYWORDS = [
|
|||
const hasAnyKeyword = (text: string, keywords: string[]) =>
|
||||
keywords.some((keyword) => text.includes(keyword));
|
||||
|
||||
const normalizeCode = (value?: string) => {
|
||||
if (!value) return;
|
||||
const normalizeCode = (value?: unknown): string | undefined => {
|
||||
if (typeof value !== 'string' || !value) return;
|
||||
|
||||
return value
|
||||
.trim()
|
||||
|
|
@ -73,7 +73,11 @@ const normalizeCode = (value?: string) => {
|
|||
.replaceAll(/[\s-]+/g, '_');
|
||||
};
|
||||
|
||||
const normalizeErrorType = (value?: string) => value?.trim();
|
||||
const normalizeErrorType = (value?: unknown): string | undefined => {
|
||||
if (typeof value !== 'string') return undefined;
|
||||
const trimmed = value.trim();
|
||||
return trimmed || undefined;
|
||||
};
|
||||
|
||||
const tryExtractStatus = (message: string) => {
|
||||
const matches = message.match(/\b([45]\d{2})\b/);
|
||||
|
|
@ -83,6 +87,17 @@ const tryExtractStatus = (message: string) => {
|
|||
return Number.isNaN(status) ? undefined : status;
|
||||
};
|
||||
|
||||
// Some providers (notably bare HTTP proxies) only surface the HTTP status as a
|
||||
// numeric `code` on the error object, with no `status`/`statusCode`. Treat
|
||||
// those numeric codes as status so classifyKind can still map 401/403 to stop
|
||||
// and 429/5xx to retry without falling through to message-keyword matching.
|
||||
const numericStatusFromCode = (...codes: unknown[]): number | undefined => {
|
||||
for (const code of codes) {
|
||||
if (typeof code === 'number' && Number.isFinite(code)) return code;
|
||||
}
|
||||
return undefined;
|
||||
};
|
||||
|
||||
const normalizeSignal = (error: unknown): LLMErrorSignal => {
|
||||
if (typeof error === 'string') {
|
||||
const message = error.toLowerCase();
|
||||
|
|
@ -91,11 +106,11 @@ const normalizeSignal = (error: unknown): LLMErrorSignal => {
|
|||
|
||||
if (error instanceof Error) {
|
||||
const raw = error as Error & {
|
||||
code?: string;
|
||||
errorType?: string;
|
||||
code?: unknown;
|
||||
errorType?: unknown;
|
||||
status?: number;
|
||||
statusCode?: number;
|
||||
type?: string;
|
||||
type?: unknown;
|
||||
};
|
||||
const message = (raw.message || raw.name || 'unknown error').toLowerCase();
|
||||
|
||||
|
|
@ -108,26 +123,26 @@ const normalizeSignal = (error: unknown): LLMErrorSignal => {
|
|||
? raw.status
|
||||
: typeof raw.statusCode === 'number'
|
||||
? raw.statusCode
|
||||
: tryExtractStatus(message),
|
||||
: (numericStatusFromCode(raw.code) ?? tryExtractStatus(message)),
|
||||
};
|
||||
}
|
||||
|
||||
if (error && typeof error === 'object') {
|
||||
const raw = error as {
|
||||
code?: string;
|
||||
code?: unknown;
|
||||
error?: {
|
||||
code?: string;
|
||||
error?: { code?: string; message?: string; status?: number; type?: string };
|
||||
errorType?: string;
|
||||
code?: unknown;
|
||||
error?: { code?: unknown; message?: string; status?: number; type?: unknown };
|
||||
errorType?: unknown;
|
||||
message?: string;
|
||||
status?: number;
|
||||
type?: string;
|
||||
type?: unknown;
|
||||
};
|
||||
errorType?: string;
|
||||
errorType?: unknown;
|
||||
message?: string;
|
||||
status?: number;
|
||||
statusCode?: number;
|
||||
type?: string;
|
||||
type?: unknown;
|
||||
};
|
||||
const nested = raw.error;
|
||||
const nestedError = nested?.error;
|
||||
|
|
@ -153,7 +168,8 @@ const normalizeSignal = (error: unknown): LLMErrorSignal => {
|
|||
? nested.status
|
||||
: typeof nestedError?.status === 'number'
|
||||
? nestedError.status
|
||||
: tryExtractStatus(message),
|
||||
: (numericStatusFromCode(raw.code, nested?.code, nestedError?.code) ??
|
||||
tryExtractStatus(message)),
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -198,14 +214,47 @@ const classifyKind = ({ code, errorType, message, status }: LLMErrorSignal): LLM
|
|||
return 'retry';
|
||||
};
|
||||
|
||||
export const classifyLLMError = (error: unknown): ClassifiedLLMError => {
|
||||
const signal = normalizeSignal(error);
|
||||
/**
|
||||
* Extract a human-readable message for the fallback path without relying on
|
||||
* normalizeSignal (which might be the thing that just threw).
|
||||
*/
|
||||
const bestEffortMessage = (error: unknown): string => {
|
||||
try {
|
||||
if (error instanceof Error && typeof error.message === 'string' && error.message.length > 0) {
|
||||
return error.message;
|
||||
}
|
||||
if (typeof error === 'string' && error.length > 0) return error;
|
||||
if (error && typeof error === 'object') {
|
||||
const e = error as { message?: unknown; error?: { message?: unknown } };
|
||||
if (typeof e.message === 'string' && e.message.length > 0) return e.message;
|
||||
const nested = e.error?.message;
|
||||
if (typeof nested === 'string' && nested.length > 0) return nested;
|
||||
}
|
||||
} catch {
|
||||
// Property access itself can throw (e.g. hostile Proxy). Fall through to default.
|
||||
}
|
||||
return 'unknown error';
|
||||
};
|
||||
|
||||
return {
|
||||
code: signal.code || signal.errorType,
|
||||
kind: classifyKind(signal),
|
||||
message: signal.message,
|
||||
};
|
||||
export const classifyLLMError = (error: unknown): ClassifiedLLMError => {
|
||||
// Defensive: a classifier that throws would mask the original provider error
|
||||
// behind the classifier's own TypeError (e.g. `e.trim is not a function`),
|
||||
// making prod debugging impossible. If anything below throws, fall back to a
|
||||
// conservative "stop" decision that preserves the original error message.
|
||||
try {
|
||||
const signal = normalizeSignal(error);
|
||||
|
||||
return {
|
||||
code: signal.code || signal.errorType,
|
||||
kind: classifyKind(signal),
|
||||
message: signal.message,
|
||||
};
|
||||
} catch (classificationError) {
|
||||
return {
|
||||
kind: 'stop',
|
||||
message: bestEffortMessage(error),
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
export type { ClassifiedLLMError, LLMErrorKind };
|
||||
|
|
|
|||
Loading…
Reference in a new issue