From e5baa8d169d129839b76cd2f1a999812757f86b6 Mon Sep 17 00:00:00 2001 From: davi0015 Date: Tue, 21 Apr 2026 01:24:07 +0800 Subject: [PATCH] track token usage for openai compatible model (#4) --- .../react/src/sidebar-tsx/SidebarChat.tsx | 9 +++++- .../void/common/sendLLMMessageService.ts | 1 + .../void/common/sendLLMMessageTypes.ts | 5 ++++ .../llmMessage/sendLLMMessage.impl.ts | 28 ++++++++++++++++++- 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx b/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx index 9496d02a..c1a13547 100644 --- a/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx +++ b/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx @@ -333,11 +333,18 @@ const TokenUsageRing: React.FC = ({ usage, contextWindow, c const displayPct = rawPct < 0.01 ? '<0.01%' : rawPct < 1 ? `${rawPct.toFixed(2)}%` : `${rawPct.toFixed(1)}%` // Use plain text (no HTML) because the renderer enforces Trusted Types and // react-tooltip's html mode would set innerHTML directly, which is blocked. + // `cachedInputTokens` is the portion of `inputTokens` served from the provider's + // prompt cache (OpenAI `prompt_tokens_details.cached_tokens`, mirrored by OpenRouter, + // DeepSeek, etc.). Only append the cached count when the server actually reported a value — + // an undefined field means the server doesn't expose it, which is different from 0. + const inputLine = usage.cachedInputTokens !== undefined + ? 
`Input: ${formatTokenCount(usage.inputTokens)} (${formatTokenCount(usage.cachedInputTokens)} cached)` + : `Input: ${formatTokenCount(usage.inputTokens)}` tooltipContent = [ `Context window usage`, `${formatTokenCount(total)} / ${formatTokenCount(contextWindow)} (${displayPct})`, ``, - `Input: ${formatTokenCount(usage.inputTokens)}`, + inputLine, `Output: ${formatTokenCount(usage.outputTokens)}`, usage.reasoningTokens !== undefined ? `Reasoning: ${formatTokenCount(usage.reasoningTokens)}` : null, `Total: ${formatTokenCount(total)}`, diff --git a/src/vs/workbench/contrib/void/common/sendLLMMessageService.ts b/src/vs/workbench/contrib/void/common/sendLLMMessageService.ts index d1c10082..dfa8a2d0 100644 --- a/src/vs/workbench/contrib/void/common/sendLLMMessageService.ts +++ b/src/vs/workbench/contrib/void/common/sendLLMMessageService.ts @@ -89,6 +89,7 @@ export class LLMMessageService extends Disposable implements ILLMMessageService outputTokens: e.usage.outputTokens, reasoningTokens: e.usage.reasoningTokens, totalTokens: e.usage.totalTokens, + cachedInputTokens: e.usage.cachedInputTokens, }) } this.llmMessageHooks.onFinalMessage[e.requestId]?.(e); diff --git a/src/vs/workbench/contrib/void/common/sendLLMMessageTypes.ts b/src/vs/workbench/contrib/void/common/sendLLMMessageTypes.ts index 9c45f36e..865cea85 100644 --- a/src/vs/workbench/contrib/void/common/sendLLMMessageTypes.ts +++ b/src/vs/workbench/contrib/void/common/sendLLMMessageTypes.ts @@ -99,6 +99,11 @@ export type LLMUsage = { outputTokens?: number; totalTokens?: number; reasoningTokens?: number; + // Portion of `inputTokens` that was served from prompt/context cache. + // Populated by OpenAI-compatible servers via `usage.prompt_tokens_details.cached_tokens` + // (OpenAI's implicit prompt cache; DeepSeek and a few others mirror the schema). + // Undefined on servers that don't return the field. 
+ cachedInputTokens?: number; } export type OnText = (p: { fullText: string; fullReasoning: string; toolCall?: RawToolCallObj; usage?: LLMUsage }) => void diff --git a/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts b/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts index ae71fdf4..3527b76a 100644 --- a/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts +++ b/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts @@ -305,6 +305,13 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE model: modelName, messages: messages as any, stream: true, + // Ask the server to emit a final usage chunk. Per the OpenAI spec this adds a + // trailing chunk with `choices: []` and a populated `usage`. Most OAI-compatible + // servers (DeepSeek, OpenRouter, Groq, vLLM, LM Studio, LiteLLM, etc.) honor this; + // ones that don't just ignore the field and we get no usage, same as before. + // Declared before the spreads so `additionalOpenAIPayload` can override if a + // particular model/provider needs a different setting. + stream_options: { include_usage: true }, ...nativeToolsObj, ...additionalOpenAIPayload // max_completion_tokens: maxTokens, @@ -333,6 +340,10 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE let toolId = '' let toolParamsStr = '' + // Usage only arrives in the final chunk (and only if the server honored + // stream_options.include_usage). `chunk.usage` is typed as `| null` there. + let latestUsage: LLMUsage | undefined = undefined + openai.chat.completions .create(options) .then(async response => { @@ -362,11 +373,26 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE fullReasoningSoFar += newReasoning } + // usage — present only on the final chunk (which typically has empty choices). 
+ // `prompt_tokens_details.cached_tokens` is OpenAI's implicit prompt-cache hit + // count; non-OpenAI servers that mimic the schema (DeepSeek, OpenRouter-for- + // OpenAI-routed models, some vLLM deployments) populate it too. + if (chunk.usage) { + latestUsage = { + inputTokens: chunk.usage.prompt_tokens, + outputTokens: chunk.usage.completion_tokens, + totalTokens: chunk.usage.total_tokens, + reasoningTokens: chunk.usage.completion_tokens_details?.reasoning_tokens, + cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens, + } + } + // call onText onText({ fullText: fullTextSoFar, fullReasoning: fullReasoningSoFar, toolCall: !toolName ? undefined : { name: toolName, rawParams: {}, isDone: false, doneParams: [], id: toolId }, + usage: latestUsage, }) } @@ -377,7 +403,7 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE else { const toolCall = rawToolCallObjOfParamsStr(toolName, toolParamsStr, toolId) const toolCallObj = toolCall ? { toolCall } : {} - onFinalMessage({ fullText: fullTextSoFar, fullReasoning: fullReasoningSoFar, anthropicReasoning: null, ...toolCallObj }); + onFinalMessage({ fullText: fullTextSoFar, fullReasoning: fullReasoningSoFar, anthropicReasoning: null, usage: latestUsage, ...toolCallObj }); } }) // when error/fail - this catches errors of both .create() and .then(for await)