mirror of
https://github.com/voideditor/void
synced 2026-05-23 01:18:25 +00:00
track token usage for openai compatible model (#4)
This commit is contained in:
parent
37c21e3b9f
commit
e5baa8d169
4 changed files with 41 additions and 2 deletions
|
|
@ -333,11 +333,18 @@ const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, c
|
|||
const displayPct = rawPct < 0.01 ? '<0.01%' : rawPct < 1 ? `${rawPct.toFixed(2)}%` : `${rawPct.toFixed(1)}%`
|
||||
// Use plain text (no HTML) because the renderer enforces Trusted Types and
|
||||
// react-tooltip's html mode would set innerHTML directly, which is blocked.
|
||||
// `cachedInputTokens` is the portion of `inputTokens` served from the provider's
|
||||
// prompt cache (OpenAI `prompt_tokens_details.cached_tokens`, mirrored by OpenRouter,
|
||||
// DeepSeek, etc.). Only show the line when the server actually reported a value —
|
||||
// an undefined field means the server doesn't expose it, which is different from 0.
|
||||
const inputLine = usage.cachedInputTokens !== undefined
|
||||
? `Input: ${formatTokenCount(usage.inputTokens)} (${formatTokenCount(usage.cachedInputTokens)} cached)`
|
||||
: `Input: ${formatTokenCount(usage.inputTokens)}`
|
||||
tooltipContent = [
|
||||
`Context window usage`,
|
||||
`${formatTokenCount(total)} / ${formatTokenCount(contextWindow)} (${displayPct})`,
|
||||
``,
|
||||
`Input: ${formatTokenCount(usage.inputTokens)}`,
|
||||
inputLine,
|
||||
`Output: ${formatTokenCount(usage.outputTokens)}`,
|
||||
usage.reasoningTokens !== undefined ? `Reasoning: ${formatTokenCount(usage.reasoningTokens)}` : null,
|
||||
`Total: ${formatTokenCount(total)}`,
|
||||
|
|
|
|||
|
|
@ -89,6 +89,7 @@ export class LLMMessageService extends Disposable implements ILLMMessageService
|
|||
outputTokens: e.usage.outputTokens,
|
||||
reasoningTokens: e.usage.reasoningTokens,
|
||||
totalTokens: e.usage.totalTokens,
|
||||
cachedInputTokens: e.usage.cachedInputTokens,
|
||||
})
|
||||
}
|
||||
this.llmMessageHooks.onFinalMessage[e.requestId]?.(e);
|
||||
|
|
|
|||
|
|
@ -99,6 +99,11 @@ export type LLMUsage = {
|
|||
outputTokens?: number;
|
||||
totalTokens?: number;
|
||||
reasoningTokens?: number;
|
||||
// Portion of `inputTokens` that was served from prompt/context cache.
|
||||
// Populated by OpenAI-compatible servers via `usage.prompt_tokens_details.cached_tokens`
|
||||
// (OpenAI's implicit prompt cache; DeepSeek and a few others mirror the schema).
|
||||
// Undefined on servers that don't return the field.
|
||||
cachedInputTokens?: number;
|
||||
}
|
||||
|
||||
/**
 * Callback type for streaming progress updates from an LLM request.
 *
 * The payload carries:
 * - `fullText`: the response text (presumably cumulative rather than a delta, going by the `full` prefix — confirm against senders).
 * - `fullReasoning`: the reasoning text, with the same presumed cumulative semantics.
 * - `toolCall`: optional raw tool-call object; omitted when no tool call is in progress.
 * - `usage`: optional token usage; omitted when the sender has no usage figures to report.
 */
export type OnText = (p: { fullText: string; fullReasoning: string; toolCall?: RawToolCallObj; usage?: LLMUsage }) => void
|
||||
|
|
|
|||
|
|
@ -305,6 +305,13 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
|
|||
model: modelName,
|
||||
messages: messages as any,
|
||||
stream: true,
|
||||
// Ask the server to emit a final usage chunk. Per the OpenAI spec this adds a
|
||||
// trailing chunk with `choices: []` and a populated `usage`. Most OAI-compatible
|
||||
// servers (DeepSeek, OpenRouter, Groq, vLLM, LM Studio, LiteLLM, etc.) honor this;
|
||||
// ones that don't just ignore the field and we get no usage, same as before.
|
||||
// Declared before the spreads so `additionalOpenAIPayload` can override if a
|
||||
// particular model/provider needs a different setting.
|
||||
stream_options: { include_usage: true },
|
||||
...nativeToolsObj,
|
||||
...additionalOpenAIPayload
|
||||
// max_completion_tokens: maxTokens,
|
||||
|
|
@ -333,6 +340,10 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
|
|||
let toolId = ''
|
||||
let toolParamsStr = ''
|
||||
|
||||
// Usage only arrives in the final chunk (and only if the server honored
|
||||
// stream_options.include_usage). The SDK types `chunk.usage` as nullable (`| null`).
|
||||
let latestUsage: LLMUsage | undefined = undefined
|
||||
|
||||
openai.chat.completions
|
||||
.create(options)
|
||||
.then(async response => {
|
||||
|
|
@ -362,11 +373,26 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
|
|||
fullReasoningSoFar += newReasoning
|
||||
}
|
||||
|
||||
// usage — present only on the final chunk (which typically has empty choices).
|
||||
// `prompt_tokens_details.cached_tokens` is OpenAI's implicit prompt-cache hit
|
||||
// count; non-OpenAI servers that mimic the schema (DeepSeek, OpenRouter-for-
|
||||
// OpenAI-routed models, some vLLM deployments) populate it too.
|
||||
if (chunk.usage) {
|
||||
latestUsage = {
|
||||
inputTokens: chunk.usage.prompt_tokens,
|
||||
outputTokens: chunk.usage.completion_tokens,
|
||||
totalTokens: chunk.usage.total_tokens,
|
||||
reasoningTokens: chunk.usage.completion_tokens_details?.reasoning_tokens,
|
||||
cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens,
|
||||
}
|
||||
}
|
||||
|
||||
// call onText
|
||||
onText({
|
||||
fullText: fullTextSoFar,
|
||||
fullReasoning: fullReasoningSoFar,
|
||||
toolCall: !toolName ? undefined : { name: toolName, rawParams: {}, isDone: false, doneParams: [], id: toolId },
|
||||
usage: latestUsage,
|
||||
})
|
||||
|
||||
}
|
||||
|
|
@ -377,7 +403,7 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
|
|||
else {
|
||||
const toolCall = rawToolCallObjOfParamsStr(toolName, toolParamsStr, toolId)
|
||||
const toolCallObj = toolCall ? { toolCall } : {}
|
||||
onFinalMessage({ fullText: fullTextSoFar, fullReasoning: fullReasoningSoFar, anthropicReasoning: null, ...toolCallObj });
|
||||
onFinalMessage({ fullText: fullTextSoFar, fullReasoning: fullReasoningSoFar, anthropicReasoning: null, usage: latestUsage, ...toolCallObj });
|
||||
}
|
||||
})
|
||||
// when error/fail - this catches errors of both .create() and .then(for await)
|
||||
|
|
|
|||
Loading…
Reference in a new issue