track accumulative token usage in a conversation thread (#9)

This commit is contained in:
davi0015 2026-04-22 14:36:08 +08:00 committed by GitHub
parent bbef0faa3b
commit 8d3b7400ff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 156 additions and 10 deletions

View file

@ -124,6 +124,14 @@ export type ThreadType = {
// after the user sends a new message).
latestUsage?: LLMUsage;
// Sum of `LLMUsage` across every API request ever made on this thread.
// In an agent loop with N tool calls, the loop fires N sequential requests
// each carrying the full history + accumulated tool results — total billed
// tokens are O(N²) while `latestUsage` only shows the latest request (O(N)).
// This field surfaces the real cumulative cost so the user can see actual
// billing impact, not just the last sample. Persisted alongside latestUsage.
cumulativeUsageThisThread?: LLMUsage;
// Model used to send the most recent user message on this thread. Captured
// on send, restored on `switchToThread` (writes to settings' `Chat` model
// selection). `null` means "no message was sent on this thread yet"; if the
@ -252,6 +260,15 @@ export interface IChatThreadService {
readonly state: ThreadsState;
readonly streamState: ThreadStreamState; // not persistent
readonly latestUsageOfThreadId: { [threadId: string]: LLMUsage | undefined }; // hydrated from persisted threads on startup; updated as the model streams
// Cumulative usage across all requests in the *current* user turn. Reset
// when a new user message is sent; a thread that has not made a request
// since startup simply has no entry.
// Only lives in memory — not persisted, since "this turn" doesn't survive
// a reload anyway.
readonly cumulativeUsageThisTurnOfThreadId: { [threadId: string]: LLMUsage | undefined };
// Cumulative usage across the entire thread history. Hydrated from the
// persisted thread on startup so the user can see lifetime cost across
// reloads.
readonly cumulativeUsageThisThreadOfThreadId: { [threadId: string]: LLMUsage | undefined };
onDidChangeCurrentThread: Event<void>;
onDidChangeStreamState: Event<{ threadId: string }>
@ -330,6 +347,8 @@ class ChatThreadService extends Disposable implements IChatThreadService {
readonly streamState: ThreadStreamState = {}
readonly latestUsageOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
readonly cumulativeUsageThisTurnOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
readonly cumulativeUsageThisThreadOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
state: ThreadsState // allThreads is persisted, currentThread is not
// used in checkpointing
@ -375,6 +394,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {
// Hydrate the in-memory usage maps from the persisted threads.
for (const id in allThreads) {
	const t = allThreads[id]
	if (t?.latestUsage) this.latestUsageOfThreadId[id] = t.latestUsage
	if (t?.cumulativeUsageThisThread) {
		this.cumulativeUsageThisThreadOfThreadId[id] = t.cumulativeUsageThisThread
		// Seed the lifetime baseline as well. `_setLatestUsage` recomputes the
		// thread total as (baseline + current request), so with an empty
		// baseline the first request after a reload would replace the persisted
		// lifetime total with just that one request's usage.
		this._cumulativeThisThreadBaselineOfThreadId[id] = { ...t.cumulativeUsageThisThread }
	}
}
// always be in a thread
@ -576,14 +596,70 @@ class ChatThreadService extends Disposable implements IChatThreadService {
// Also persists on the thread so the ring shows the last-known value after a reload.
private _setLatestUsage(threadId: string, usage: LLMUsage) {
	// Both cumulative totals are rebuilt from their baseline (usage of
	// already-finalized requests) plus this request's running total. Streaming
	// updates report a running total rather than a delta, so summing onto the
	// previous cumulative value would count the same tokens repeatedly.
	const turnTotal = this._addUsage(this._cumulativeThisTurnBaselineOfThreadId[threadId], usage)
	const threadTotal = this._addUsage(this._cumulativeThisThreadBaselineOfThreadId[threadId], usage)

	// Publish the in-memory values.
	this.latestUsageOfThreadId[threadId] = usage
	this.cumulativeUsageThisTurnOfThreadId[threadId] = turnTotal
	this.cumulativeUsageThisThreadOfThreadId[threadId] = threadTotal

	// Mirror the latest and lifetime values onto the thread and store it.
	// The per-turn total is deliberately not written to the thread.
	const persistedThread = this.state.allThreads[threadId]
	if (persistedThread) {
		persistedThread.latestUsage = usage
		persistedThread.cumulativeUsageThisThread = threadTotal
		this._storeAllThreads(this.state.allThreads)
	}

	// Notify listeners that this thread's usage changed.
	this._onDidChangeStreamState.fire({ threadId })
}
// Per-thread baselines: the cumulative usage of requests that have already
// been finalized, for the current turn and for the whole thread respectively.
// `_setLatestUsage` adds the in-flight request's running total on top of
// these to produce the published cumulative values.
// `_lockInCurrentRequestUsage` advances them once a request finishes, so the
// next request starts counting from where the previous one left off.
// `_resetCumulativeThisTurn` clears only the per-turn baseline.
// These maps are private to the service and are not written to the thread.
private readonly _cumulativeThisTurnBaselineOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
private readonly _cumulativeThisThreadBaselineOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
// Field-wise sum of two LLMUsage values; always returns a fresh object (or
// undefined when neither input is present), never one of the inputs.
// A field is undefined in the result only when it is undefined on both
// sides; if just one side reports it, the missing side counts as 0, so a
// request that omits e.g. `cachedInputTokens` does not wipe out the cached
// count accumulated so far.
private _addUsage(a: LLMUsage | undefined, b: LLMUsage | undefined): LLMUsage | undefined {
	if (a && b) {
		const sumField = (left: number | undefined, right: number | undefined): number | undefined =>
			left === undefined && right === undefined ? undefined : (left ?? 0) + (right ?? 0)

		return {
			inputTokens: sumField(a.inputTokens, b.inputTokens),
			outputTokens: sumField(a.outputTokens, b.outputTokens),
			totalTokens: sumField(a.totalTokens, b.totalTokens),
			reasoningTokens: sumField(a.reasoningTokens, b.reasoningTokens),
			cachedInputTokens: sumField(a.cachedInputTokens, b.cachedInputTokens),
		}
	}

	// At most one side is present: return a shallow copy of it, if any.
	const onlySide = a || b
	return onlySide ? { ...onlySide } : undefined
}
// Lock the finished request's usage into the cumulative baselines so the
// next request's running total is added on top of everything counted so far.
// Called once per request (on `onFinalMessage`).
//
// The baselines are advanced to the currently published cumulative totals,
// which `_setLatestUsage` maintains as (baseline + this request's running
// total). They are deliberately NOT advanced by re-adding
// `latestUsageOfThreadId`: when the request that just finished reported no
// usage, that map still holds the previous request's value (or the one
// hydrated from disk), and adding it again would count that request twice.
// If no usage arrived for this request, the published totals still equal the
// baselines and this call changes nothing.
private _lockInCurrentRequestUsage(threadId: string) {
	const turnTotal = this.cumulativeUsageThisTurnOfThreadId[threadId]
	const threadTotal = this.cumulativeUsageThisThreadOfThreadId[threadId]
	// Copy so the baseline never aliases an object that is also published/persisted.
	this._cumulativeThisTurnBaselineOfThreadId[threadId] = turnTotal ? { ...turnTotal } : undefined
	this._cumulativeThisThreadBaselineOfThreadId[threadId] = threadTotal ? { ...threadTotal } : undefined
}
// Start a fresh turn for `threadId`: clears both the per-turn baseline and
// the published per-turn total, then notifies stream-state listeners.
// The lifetime ("this thread") counters are left untouched.
private _resetCumulativeThisTurn(threadId: string) {
	this._cumulativeThisTurnBaselineOfThreadId[threadId] = undefined
	this.cumulativeUsageThisTurnOfThreadId[threadId] = undefined

	this._onDidChangeStreamState.fire({ threadId })
}
// ---------- streaming ----------
@ -960,6 +1036,9 @@ class ChatThreadService extends Disposable implements IChatThreadService {
},
// Final-message handler for a request: records the reported usage (when
// present), locks it into the cumulative baselines, then passes the result
// to `resMessageIsDonePromise`.
onFinalMessage: async ({ fullText, fullReasoning, toolCall, anthropicReasoning, usage }) => {
// `usage` may be absent; the usage counters are only updated when it is provided.
if (usage) this._setLatestUsage(threadId, usage)
// Lock in this request's usage so the next loop iteration's
// running total is added to (not replacing) what we already counted.
// Note this runs whether or not `usage` was provided above.
this._lockInCurrentRequestUsage(threadId)
resMessageIsDonePromise({ type: 'llmDone', toolCall, info: { fullText, fullReasoning, anthropicReasoning } }) // resolve with tool calls
},
onError: async (error) => {
@ -1385,6 +1464,11 @@ We only need to do it for files that were edited since `from`, ie files between
await this.abortRunning(threadId)
}
// A new user message starts a new "turn" — zero out this-turn cumulative
// before any LLM requests fire. Lifetime/this-thread cumulative keeps
// accumulating across turns.
this._resetCumulativeThisTurn(threadId)
// add dummy before this message to keep checkpoint before user message idea consistent
if (thread.messages.length === 0) {
this._addUserCheckpoint({ threadId })

View file

@ -6,7 +6,7 @@
import React, { ButtonHTMLAttributes, FormEvent, FormHTMLAttributes, Fragment, KeyboardEvent, useCallback, useEffect, useMemo, useRef, useState } from 'react';
import { useAccessor, useChatThreadsState, useChatThreadsStreamState, useSettingsState, useActiveURI, useCommandBarState, useFullChatThreadsStreamState, useChatThreadLatestUsage } from '../util/services.js';
import { useAccessor, useChatThreadsState, useChatThreadsStreamState, useSettingsState, useActiveURI, useCommandBarState, useFullChatThreadsStreamState, useChatThreadLatestUsage, useChatThreadCumulativeUsage } from '../util/services.js';
import { ScrollType } from '../../../../../../../editor/common/editorCommon.js';
import { ChatMarkdownRender, ChatMessageLocation, getApplyBoxId } from '../markdown/ChatMarkdownRender.js';
@ -311,10 +311,30 @@ interface TokenUsageRingProps {
// ring is drawn — this prevents the send button from shifting once usage arrives
usage: LLMUsage | undefined;
contextWindow: number; // model's max input context, in tokens
cumulativeThisTurn?: LLMUsage | undefined;
cumulativeThisThread?: LLMUsage | undefined;
children: React.ReactNode;
size?: number;
}
const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, children, size = 34 }) => {
// Build the tooltip lines for one usage sample under the heading `label`.
// Lines are plain text (react-tooltip's html mode is blocked by Trusted Types).
// A missing sample yields a single "<label>: -" line. The reasoning slot is
// `null` when the sample has no reasoning count, so callers can filter it out.
const formatUsageBlock = (label: string, u: LLMUsage | undefined): (string | null)[] => {
	if (!u) {
		return [`${label}: -`]
	}

	const { inputTokens, outputTokens, reasoningTokens, cachedInputTokens, totalTokens } = u

	// Prefer the reported total; otherwise derive it from the parts.
	const grandTotal = totalTokens ?? ((inputTokens ?? 0) + (outputTokens ?? 0) + (reasoningTokens ?? 0))

	// The cached count is appended only when the sample actually carries one.
	let inputLine = ` Input: ${formatTokenCount(inputTokens)}`
	if (cachedInputTokens !== undefined) {
		inputLine += ` (${formatTokenCount(cachedInputTokens)} cached)`
	}

	const outputLine = ` Output: ${formatTokenCount(outputTokens)}`
	const reasoningLine = reasoningTokens === undefined
		? null
		: ` Reasoning: ${formatTokenCount(reasoningTokens)}`
	const totalLine = ` Total: ${formatTokenCount(grandTotal)}`

	return [`${label}:`, inputLine, outputLine, reasoningLine, totalLine]
}
const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, cumulativeThisTurn, cumulativeThisThread, children, size = 34 }) => {
const strokeWidth = 3
const radius = (size - strokeWidth) / 2
const hasData = !!usage && contextWindow > 0
@ -337,17 +357,23 @@ const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, c
// prompt cache (OpenAI `prompt_tokens_details.cached_tokens`, mirrored by OpenRouter,
// DeepSeek, etc.). Only show the line when the server actually reported a value —
// an undefined field means the server doesn't expose it, which is different from 0.
const inputLine = usage.cachedInputTokens !== undefined
? `Input: ${formatTokenCount(usage.inputTokens)} (${formatTokenCount(usage.cachedInputTokens)} cached)`
: `Input: ${formatTokenCount(usage.inputTokens)}`
// Tooltip layout:
// 1. Context-window ring summary (per-request, drives the ring color)
// 2. Last request breakdown (the per-request snapshot the ring is based on)
// 3. Cumulative this turn (sum across all loop iterations of the current user turn)
// 4. Cumulative this thread (lifetime sum across the whole chat history)
// The cumulative blocks are critical because agent loops issue many requests
// per turn — total billed tokens grow ~O(N²) while the ring only shows the
// last request's input.
tooltipContent = [
`Context window usage`,
`${formatTokenCount(total)} / ${formatTokenCount(contextWindow)} (${displayPct})`,
``,
inputLine,
`Output: ${formatTokenCount(usage.outputTokens)}`,
usage.reasoningTokens !== undefined ? `Reasoning: ${formatTokenCount(usage.reasoningTokens)}` : null,
`Total: ${formatTokenCount(total)}`,
...formatUsageBlock('Last request', usage),
``,
...formatUsageBlock('Cumulative this turn', cumulativeThisTurn),
``,
...formatUsageBlock('Cumulative this thread', cumulativeThisThread),
].filter(s => s !== null).join('\n')
svgEl = (
@ -400,6 +426,7 @@ const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, c
const SubmitButtonWithUsageRing: React.FC<{ threadId: string; featureName: FeatureName; children: React.ReactNode }> = ({ threadId, featureName, children }) => {
const settingsState = useSettingsState()
const usage = useChatThreadLatestUsage(threadId)
const cumulative = useChatThreadCumulativeUsage(threadId)
const modelSelection = settingsState.modelSelectionOfFeature[featureName]
// Always render the wrapper so the send button doesn't jump sideways when
@ -410,7 +437,7 @@ const SubmitButtonWithUsageRing: React.FC<{ threadId: string; featureName: Featu
: 0
return (
<TokenUsageRing usage={usage} contextWindow={contextWindow}>
<TokenUsageRing usage={usage} contextWindow={contextWindow} cumulativeThisTurn={cumulative.thisTurn} cumulativeThisThread={cumulative.thisThread}>
{children}
</TokenUsageRing>
)

View file

@ -69,6 +69,8 @@ let chatThreadsStreamState: ThreadStreamState
const chatThreadsStreamStateListeners: Set<(threadId: string) => void> = new Set()
let chatThreadsLatestUsageOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
// Module-level references to the chat-thread service's cumulative usage maps
// (per-turn and per-thread, keyed by thread id). Assigned from the service in
// `_registerServices` and re-read on every stream-state change event.
let chatThreadsCumulativeUsageThisTurnOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
let chatThreadsCumulativeUsageThisThreadOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
let settingsState: VoidSettingsState
const settingsStateListeners: Set<(s: VoidSettingsState) => void> = new Set()
@ -122,10 +124,14 @@ export const _registerServices = (accessor: ServicesAccessor) => {
// same service, different state
chatThreadsStreamState = chatThreadsStateService.streamState
chatThreadsLatestUsageOfThreadId = chatThreadsStateService.latestUsageOfThreadId
chatThreadsCumulativeUsageThisTurnOfThreadId = chatThreadsStateService.cumulativeUsageThisTurnOfThreadId
chatThreadsCumulativeUsageThisThreadOfThreadId = chatThreadsStateService.cumulativeUsageThisThreadOfThreadId
disposables.push(
chatThreadsStateService.onDidChangeStreamState(({ threadId }) => {
chatThreadsStreamState = chatThreadsStateService.streamState
chatThreadsLatestUsageOfThreadId = chatThreadsStateService.latestUsageOfThreadId
chatThreadsCumulativeUsageThisTurnOfThreadId = chatThreadsStateService.cumulativeUsageThisTurnOfThreadId
chatThreadsCumulativeUsageThisThreadOfThreadId = chatThreadsStateService.cumulativeUsageThisThreadOfThreadId
chatThreadsStreamStateListeners.forEach(l => l(threadId))
})
)
@ -323,6 +329,35 @@ export const useChatThreadLatestUsage = (threadId: string) => {
return u
}
// Cumulative token usage across all LLM requests fired in the current user turn
// (`thisTurn`) and across the entire thread history (`thisThread`). In an agent
// loop with N tool calls each request resends the full history, so total billed
// tokens grow ~O(N²) — these counters expose that real cost vs. `latestUsage`'s
// per-request snapshot.
//
// Returns `{ thisTurn, thisThread }` for `threadId`, kept in sync via the shared
// stream-state listeners. Those listeners fire for any stream-state change on
// the thread, not only usage changes, so the state object is replaced only when
// one of the two usage references actually changed. Handing React the previous
// object lets it skip the re-render.
export const useChatThreadCumulativeUsage = (threadId: string) => {
	// Lazy initializer: read the module-level snapshots once, on mount.
	const [u, su] = useState<{ thisTurn: LLMUsage | undefined, thisThread: LLMUsage | undefined }>(() => ({
		thisTurn: chatThreadsCumulativeUsageThisTurnOfThreadId[threadId],
		thisThread: chatThreadsCumulativeUsageThisThreadOfThreadId[threadId],
	}))
	useEffect(() => {
		const sync = () => {
			const thisTurn = chatThreadsCumulativeUsageThisTurnOfThreadId[threadId]
			const thisThread = chatThreadsCumulativeUsageThisThreadOfThreadId[threadId]
			// Keep the previous object when nothing changed so React bails out.
			su(prev => (prev.thisTurn === thisTurn && prev.thisThread === thisThread)
				? prev
				: { thisTurn, thisThread })
		}
		// Re-read immediately: `threadId` may have changed since the last render.
		sync()
		const listener = (threadId_: string) => {
			if (threadId_ !== threadId) return
			sync()
		}
		chatThreadsStreamStateListeners.add(listener)
		return () => { chatThreadsStreamStateListeners.delete(listener) }
	}, [su, threadId])
	return u
}
export const useFullChatThreadsStreamState = () => {
const [s, ss] = useState(chatThreadsStreamState)
useEffect(() => {