track token usage for openai compatible model (#4)

2026-05-23 01:18:25 +00:00 · 2026-04-21 01:24:07 +08:00 · 2026-04-21 01:24:07 +08:00 · e5baa8d169
commit e5baa8d169
parent 37c21e3b9f
4 changed files with 41 additions and 2 deletions
--- a/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx
+++ b/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx
@ -333,11 +333,18 @@ const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, c
 		const displayPct = rawPct < 0.01 ? '<0.01%' : rawPct < 1 ? `${rawPct.toFixed(2)}%` : `${rawPct.toFixed(1)}%`
 		// Use plain text (no HTML) because the renderer enforces Trusted Types and
 		// react-tooltip's html mode would set innerHTML directly, which is blocked.
+		// `cachedInputTokens` is the portion of `inputTokens` served from the provider's
+		// prompt cache (OpenAI `prompt_tokens_details.cached_tokens`, mirrored by OpenRouter,
+		// DeepSeek, etc.). Only append the "(… cached)" suffix to the Input line when the server
+		// actually reported a value — an undefined field means the server doesn't expose it, which is different from 0.
+		const inputLine = usage.cachedInputTokens !== undefined
+			? `Input: ${formatTokenCount(usage.inputTokens)} (${formatTokenCount(usage.cachedInputTokens)} cached)`
+			: `Input: ${formatTokenCount(usage.inputTokens)}`
 		tooltipContent = [
 			`Context window usage`,
 			`${formatTokenCount(total)} / ${formatTokenCount(contextWindow)} (${displayPct})`,
 			``,
-			`Input: ${formatTokenCount(usage.inputTokens)}`,
+			inputLine,
 			`Output: ${formatTokenCount(usage.outputTokens)}`,
 			usage.reasoningTokens !== undefined ? `Reasoning: ${formatTokenCount(usage.reasoningTokens)}` : null,
 			`Total: ${formatTokenCount(total)}`,
--- a/src/vs/workbench/contrib/void/common/sendLLMMessageService.ts
+++ b/src/vs/workbench/contrib/void/common/sendLLMMessageService.ts
@ -89,6 +89,7 @@ export class LLMMessageService extends Disposable implements ILLMMessageService
 					outputTokens: e.usage.outputTokens,
 					reasoningTokens: e.usage.reasoningTokens,
 					totalTokens: e.usage.totalTokens,
+					cachedInputTokens: e.usage.cachedInputTokens,
 				})
 			}
 			this.llmMessageHooks.onFinalMessage[e.requestId]?.(e);
--- a/src/vs/workbench/contrib/void/common/sendLLMMessageTypes.ts
+++ b/src/vs/workbench/contrib/void/common/sendLLMMessageTypes.ts
@ -99,6 +99,11 @@ export type LLMUsage = {
 	outputTokens?: number;
 	totalTokens?: number;
 	reasoningTokens?: number;
+	// Portion of `inputTokens` that was served from prompt/context cache.
+	// Populated by OpenAI-compatible servers via `usage.prompt_tokens_details.cached_tokens`
+	// (OpenAI's implicit prompt cache; DeepSeek and a few others mirror the schema).
+	// Undefined on servers that don't return the field.
+	cachedInputTokens?: number;
 }

 export type OnText = (p: { fullText: string; fullReasoning: string; toolCall?: RawToolCallObj; usage?: LLMUsage }) => void
--- a/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts
+++ b/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts
@ -305,6 +305,13 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
 		model: modelName,
 		messages: messages as any,
 		stream: true,
+		// Ask the server to emit a final usage chunk. Per the OpenAI spec this adds a
+		// trailing chunk with `choices: []` and a populated `usage`. Most OAI-compatible
+		// servers (DeepSeek, OpenRouter, Groq, vLLM, LM Studio, LiteLLM, etc.) honor this;
+		// ones that don't just ignore the field and we get no usage, same as before.
+		// Declared before the spreads so `additionalOpenAIPayload` can override if a
+		// particular model/provider needs a different setting.
+		stream_options: { include_usage: true },
 		...nativeToolsObj,
 		...additionalOpenAIPayload
 		// max_completion_tokens: maxTokens,
@ -333,6 +340,10 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
 	let toolId = ''
 	let toolParamsStr = ''

+	// Usage only arrives in the final chunk (and only if the server honored
+	// stream_options.include_usage). `chunk.usage` is nullable, so it is checked before being read.
+	let latestUsage: LLMUsage | undefined = undefined
+
 	openai.chat.completions
 		.create(options)
 		.then(async response => {
@ -362,11 +373,26 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
 					fullReasoningSoFar += newReasoning
 				}

+				// usage — present only on the final chunk (which typically has empty choices).
+				// `prompt_tokens_details.cached_tokens` is OpenAI's implicit prompt-cache hit
+				// count; non-OpenAI servers that mimic the schema (DeepSeek, OpenRouter when
+				// routing to OpenAI models, some vLLM deployments) populate it too.
+				if (chunk.usage) {
+					latestUsage = {
+						inputTokens: chunk.usage.prompt_tokens,
+						outputTokens: chunk.usage.completion_tokens,
+						totalTokens: chunk.usage.total_tokens,
+						reasoningTokens: chunk.usage.completion_tokens_details?.reasoning_tokens,
+						cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens,
+					}
+				}
+
 				// call onText
 				onText({
 					fullText: fullTextSoFar,
 					fullReasoning: fullReasoningSoFar,
 					toolCall: !toolName ? undefined : { name: toolName, rawParams: {}, isDone: false, doneParams: [], id: toolId },
+					usage: latestUsage,
 				})

 			}
@ -377,7 +403,7 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
 			else {
 				const toolCall = rawToolCallObjOfParamsStr(toolName, toolParamsStr, toolId)
 				const toolCallObj = toolCall ? { toolCall } : {}
-				onFinalMessage({ fullText: fullTextSoFar, fullReasoning: fullReasoningSoFar, anthropicReasoning: null, ...toolCallObj });
+				onFinalMessage({ fullText: fullTextSoFar, fullReasoning: fullReasoningSoFar, anthropicReasoning: null, usage: latestUsage, ...toolCallObj });
 			}
 		})
 		// when error/fail - this catches errors of both .create() and .then(for await)