track token usage for openai compatible model (#4)

This commit is contained in:
davi0015 2026-04-21 01:24:07 +08:00 committed by GitHub
parent 37c21e3b9f
commit e5baa8d169
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 41 additions and 2 deletions

View file

@ -333,11 +333,18 @@ const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, c
const displayPct = rawPct < 0.01 ? '<0.01%' : rawPct < 1 ? `${rawPct.toFixed(2)}%` : `${rawPct.toFixed(1)}%`
// Use plain text (no HTML) because the renderer enforces Trusted Types and
// react-tooltip's html mode would set innerHTML directly, which is blocked.
// `cachedInputTokens` is the portion of `inputTokens` served from the provider's
// prompt cache (OpenAI `prompt_tokens_details.cached_tokens`, mirrored by OpenRouter,
// DeepSeek, etc.). Only show the line when the server actually reported a value —
// an undefined field means the server doesn't expose it, which is different from 0.
const inputLine = usage.cachedInputTokens !== undefined
? `Input: ${formatTokenCount(usage.inputTokens)} (${formatTokenCount(usage.cachedInputTokens)} cached)`
: `Input: ${formatTokenCount(usage.inputTokens)}`
tooltipContent = [
`Context window usage`,
`${formatTokenCount(total)} / ${formatTokenCount(contextWindow)} (${displayPct})`,
``,
`Input: ${formatTokenCount(usage.inputTokens)}`,
inputLine,
`Output: ${formatTokenCount(usage.outputTokens)}`,
usage.reasoningTokens !== undefined ? `Reasoning: ${formatTokenCount(usage.reasoningTokens)}` : null,
`Total: ${formatTokenCount(total)}`,

View file

@ -89,6 +89,7 @@ export class LLMMessageService extends Disposable implements ILLMMessageService
outputTokens: e.usage.outputTokens,
reasoningTokens: e.usage.reasoningTokens,
totalTokens: e.usage.totalTokens,
cachedInputTokens: e.usage.cachedInputTokens,
})
}
this.llmMessageHooks.onFinalMessage[e.requestId]?.(e);

View file

@ -99,6 +99,11 @@ export type LLMUsage = {
outputTokens?: number;
totalTokens?: number;
reasoningTokens?: number;
// Portion of `inputTokens` that was served from prompt/context cache.
// Populated by OpenAI-compatible servers via `usage.prompt_tokens_details.cached_tokens`
// (OpenAI's implicit prompt cache; DeepSeek and a few others mirror the schema).
// Undefined on servers that don't return the field.
cachedInputTokens?: number;
}
// Streaming progress callback. Receives the text and reasoning accumulated so far,
// plus (when available) the tool call currently being assembled and the token
// usage reported by the server.
export type OnText = (p: {
	fullText: string;
	fullReasoning: string;
	toolCall?: RawToolCallObj;
	usage?: LLMUsage;
}) => void

View file

@ -305,6 +305,13 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
model: modelName,
messages: messages as any,
stream: true,
// Ask the server to emit a final usage chunk. Per the OpenAI spec this adds a
// trailing chunk with `choices: []` and a populated `usage`. Most OAI-compatible
// servers (DeepSeek, OpenRouter, Groq, vLLM, LM Studio, LiteLLM, etc.) honor this;
// ones that don't just ignore the field and we get no usage, same as before.
// Declared before the spreads so `additionalOpenAIPayload` can override if a
// particular model/provider needs a different setting.
stream_options: { include_usage: true },
...nativeToolsObj,
...additionalOpenAIPayload
// max_completion_tokens: maxTokens,
@ -333,6 +340,10 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
let toolId = ''
let toolParamsStr = ''
// Usage only arrives in the final chunk (and only if the server honored
// stream_options.include_usage). The SDK types `chunk.usage` as nullable, so we
// remember the most recent non-null value here instead of reading it per chunk.
let latestUsage: LLMUsage | undefined = undefined
openai.chat.completions
.create(options)
.then(async response => {
@ -362,11 +373,26 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
fullReasoningSoFar += newReasoning
}
// usage — present only on the final chunk (which typically has empty choices).
// `prompt_tokens_details.cached_tokens` is OpenAI's implicit prompt-cache hit
// count; non-OpenAI servers that mimic the schema (DeepSeek, OpenRouter for
// OpenAI-routed models, some vLLM deployments) populate it too. When the
// details object is missing, `cachedInputTokens` is left undefined.
if (chunk.usage) {
latestUsage = {
inputTokens: chunk.usage.prompt_tokens,
outputTokens: chunk.usage.completion_tokens,
totalTokens: chunk.usage.total_tokens,
reasoningTokens: chunk.usage.completion_tokens_details?.reasoning_tokens,
cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens,
}
}
// call onText
onText({
fullText: fullTextSoFar,
fullReasoning: fullReasoningSoFar,
toolCall: !toolName ? undefined : { name: toolName, rawParams: {}, isDone: false, doneParams: [], id: toolId },
usage: latestUsage,
})
}
@ -377,7 +403,7 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
else {
const toolCall = rawToolCallObjOfParamsStr(toolName, toolParamsStr, toolId)
const toolCallObj = toolCall ? { toolCall } : {}
onFinalMessage({ fullText: fullTextSoFar, fullReasoning: fullReasoningSoFar, anthropicReasoning: null, ...toolCallObj });
onFinalMessage({ fullText: fullTextSoFar, fullReasoning: fullReasoningSoFar, anthropicReasoning: null, usage: latestUsage, ...toolCallObj });
}
})
// when error/fail - this catches errors of both .create() and .then(for await)