track accumulative token usage in a conversation thread (#9)

2026-05-22 17:08:25 +00:00 · 2026-04-22 14:36:08 +08:00 · 2026-04-22 14:36:08 +08:00 · 8d3b7400ff
commit 8d3b7400ff
parent bbef0faa3b
3 changed files with 156 additions and 10 deletions
--- a/src/vs/workbench/contrib/void/browser/chatThreadService.ts
+++ b/src/vs/workbench/contrib/void/browser/chatThreadService.ts
@ -124,6 +124,14 @@ export type ThreadType = {
 	// after the user sends a new message).
 	latestUsage?: LLMUsage;

+	// Sum of `LLMUsage` across every API request ever made on this thread.
+	// In an agent loop with N tool calls, the loop fires N sequential requests
+	// each carrying the full history + accumulated tool results — total billed
+	// tokens are O(N²) while `latestUsage` only shows the latest request (O(N)).
+	// This field surfaces the real cumulative cost so the user can see actual
+	// billing impact, not just the last sample. Persisted alongside latestUsage.
+	cumulativeUsageThisThread?: LLMUsage;
+
 	// Model used to send the most recent user message on this thread. Captured
 	// on send, restored on `switchToThread` (writes to settings' `Chat` model
 	// selection). `null` means "no message was sent on this thread yet"; if the
@ -252,6 +260,15 @@ export interface IChatThreadService {
 	readonly state: ThreadsState;
 	readonly streamState: ThreadStreamState; // not persistent
 	readonly latestUsageOfThreadId: { [threadId: string]: LLMUsage | undefined }; // hydrated from persisted threads on startup; updated as the model streams
+	// Cumulative usage across all requests in the *current* user turn (reset
+	// when a new user message is sent or a thread is opened/switched-to fresh).
+	// Only lives in memory — not persisted, since "this turn" doesn't survive
+	// a reload anyway.
+	readonly cumulativeUsageThisTurnOfThreadId: { [threadId: string]: LLMUsage | undefined };
+	// Cumulative usage across the entire thread history. Hydrated from the
+	// persisted thread on startup so the user can see lifetime cost across
+	// reloads.
+	readonly cumulativeUsageThisThreadOfThreadId: { [threadId: string]: LLMUsage | undefined };

 	onDidChangeCurrentThread: Event<void>;
 	onDidChangeStreamState: Event<{ threadId: string }>
@ -330,6 +347,8 @@ class ChatThreadService extends Disposable implements IChatThreadService {

 	readonly streamState: ThreadStreamState = {}
 	readonly latestUsageOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
+	readonly cumulativeUsageThisTurnOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
+	readonly cumulativeUsageThisThreadOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
 	state: ThreadsState // allThreads is persisted, currentThread is not

 	// used in checkpointing
@ -375,6 +394,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 		for (const id in allThreads) {
 			const t = allThreads[id]
 			if (t?.latestUsage) this.latestUsageOfThreadId[id] = t.latestUsage
+			if (t?.cumulativeUsageThisThread) this.cumulativeUsageThisThreadOfThreadId[id] = this._cumulativeThisThreadBaselineOfThreadId[id] = t.cumulativeUsageThisThread // also seeds the baseline so post-reload requests add to the restored total instead of overwriting it
 		}

 		// always be in a thread
@ -576,14 +596,70 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 	// Also persists on the thread so the ring shows the last-known value after a reload.
 	private _setLatestUsage(threadId: string, usage: LLMUsage) {
 		this.latestUsageOfThreadId[threadId] = usage
+
+		// Cumulative = baseline (usage locked in from already-finished requests of
+		// this turn/thread) + this request's running total. `usage` is a running
+		// total for the request, not a delta, so always recompute from the baseline
+		// rather than adding onto the previous cumulative (that would double-count).
+		this.cumulativeUsageThisTurnOfThreadId[threadId] = this._addUsage(this._cumulativeThisTurnBaselineOfThreadId[threadId], usage)
+		this.cumulativeUsageThisThreadOfThreadId[threadId] = this._addUsage(this._cumulativeThisThreadBaselineOfThreadId[threadId], usage)
+
 		const thread = this.state.allThreads[threadId]
 		if (thread) {
 			thread.latestUsage = usage
+			thread.cumulativeUsageThisThread = this.cumulativeUsageThisThreadOfThreadId[threadId]
 			this._storeAllThreads(this.state.allThreads)
 		}
 		this._onDidChangeStreamState.fire({ threadId })
 	}

+	// Baselines: usage already locked in from finished requests, tracked per
+	// turn and per thread. `_setLatestUsage` adds the in-flight request's running
+	// total on top of these. `_lockInCurrentRequestUsage` moves both forward when
+	// a request finishes; `_resetCumulativeThisTurn` clears the turn baseline
+	// when a new user message starts a turn.
+	private readonly _cumulativeThisTurnBaselineOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
+	private readonly _cumulativeThisThreadBaselineOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
+
+	// Field-wise sum of two LLMUsage values. When one operand is missing the
+	// result is a copy of the other (undefined when both are missing). A field
+	// that is undefined on both sides stays undefined; if only one side reports
+	// it, the other counts as 0 (an absent `cachedInputTokens` never erases a total).
+	private _addUsage(a: LLMUsage | undefined, b: LLMUsage | undefined): LLMUsage | undefined {
+		if (!a || !b) {
+			const present = a || b
+			return present ? { ...present } : undefined
+		}
+		const sumDefined = (x: number | undefined, y: number | undefined): number | undefined =>
+			x === undefined && y === undefined ? undefined : (x ?? 0) + (y ?? 0)
+		return {
+			inputTokens: sumDefined(a.inputTokens, b.inputTokens),
+			outputTokens: sumDefined(a.outputTokens, b.outputTokens),
+			totalTokens: sumDefined(a.totalTokens, b.totalTokens),
+			reasoningTokens: sumDefined(a.reasoningTokens, b.reasoningTokens),
+			cachedInputTokens: sumDefined(a.cachedInputTokens, b.cachedInputTokens),
+		}
+	}
+
+	// Lock the finished request into the baselines so the next request's running
+	// total is added on top. Copies the current cumulative totals (baseline + this
+	// request) instead of re-adding `latestUsage`, which may be stale and double-count.
+	private _lockInCurrentRequestUsage(threadId: string) {
+		const turnTotal = this.cumulativeUsageThisTurnOfThreadId[threadId]
+		const threadTotal = this.cumulativeUsageThisThreadOfThreadId[threadId]
+		this._cumulativeThisTurnBaselineOfThreadId[threadId] = turnTotal ? { ...turnTotal } : undefined
+		this._cumulativeThisThreadBaselineOfThreadId[threadId] = threadTotal ? { ...threadTotal } : undefined
+	}
+
+	// Begin a fresh turn: clear the "this turn" baseline and its visible total,
+	// then notify stream-state listeners. "This thread" totals are not touched.
+	private _resetCumulativeThisTurn(threadId: string) {
+		// the baseline and the total are cleared together so they never disagree
+		this._cumulativeThisTurnBaselineOfThreadId[threadId] = undefined
+		this.cumulativeUsageThisTurnOfThreadId[threadId] = undefined
+		this._onDidChangeStreamState.fire({ threadId })
+	}
+

 	// ---------- streaming ----------

@ -960,6 +1036,9 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 					},
 					onFinalMessage: async ({ fullText, fullReasoning, toolCall, anthropicReasoning, usage }) => {
 						if (usage) this._setLatestUsage(threadId, usage)
+						// Lock in this request's usage so the next loop iteration's
+						// running total is added to (not replacing) what we already counted.
+						this._lockInCurrentRequestUsage(threadId)
 						resMessageIsDonePromise({ type: 'llmDone', toolCall, info: { fullText, fullReasoning, anthropicReasoning } }) // resolve with tool calls
 					},
 					onError: async (error) => {
@ -1385,6 +1464,11 @@ We only need to do it for files that were edited since `from`, ie files between
 			await this.abortRunning(threadId)
 		}

+		// A new user message starts a new "turn" — zero out this-turn cumulative
+		// before any LLM requests fire. Lifetime/this-thread cumulative keeps
+		// accumulating across turns.
+		this._resetCumulativeThisTurn(threadId)
+
 		// add dummy before this message to keep checkpoint before user message idea consistent
 		if (thread.messages.length === 0) {
 			this._addUserCheckpoint({ threadId })
--- a/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx
+++ b/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx
@ -6,7 +6,7 @@
 import React, { ButtonHTMLAttributes, FormEvent, FormHTMLAttributes, Fragment, KeyboardEvent, useCallback, useEffect, useMemo, useRef, useState } from 'react';


-import { useAccessor, useChatThreadsState, useChatThreadsStreamState, useSettingsState, useActiveURI, useCommandBarState, useFullChatThreadsStreamState, useChatThreadLatestUsage } from '../util/services.js';
+import { useAccessor, useChatThreadsState, useChatThreadsStreamState, useSettingsState, useActiveURI, useCommandBarState, useFullChatThreadsStreamState, useChatThreadLatestUsage, useChatThreadCumulativeUsage } from '../util/services.js';
 import { ScrollType } from '../../../../../../../editor/common/editorCommon.js';

 import { ChatMarkdownRender, ChatMessageLocation, getApplyBoxId } from '../markdown/ChatMarkdownRender.js';
@ -311,10 +311,30 @@ interface TokenUsageRingProps {
 	// ring is drawn — this prevents the send button from shifting once usage arrives
 	usage: LLMUsage | undefined;
 	contextWindow: number; // model's max input context, in tokens
+	cumulativeThisTurn?: LLMUsage | undefined;
+	cumulativeThisThread?: LLMUsage | undefined;
 	children: React.ReactNode;
 	size?: number;
 }
-const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, children, size = 34 }) => {
+
+// Render one LLMUsage as tooltip lines: a `label:` header followed by indented
+// rows; `null` marks a row to omit. Plain text only (react-tooltip's html mode is blocked by Trusted Types).
+const formatUsageBlock = (label: string, u: LLMUsage | undefined): (string | null)[] => {
+	if (!u) return [`${label}: -`]
+	const { inputTokens, outputTokens, reasoningTokens, cachedInputTokens, totalTokens } = u
+	const summed = totalTokens ?? ((inputTokens ?? 0) + (outputTokens ?? 0) + (reasoningTokens ?? 0))
+	const cachedSuffix = cachedInputTokens === undefined ? `` : ` (${formatTokenCount(cachedInputTokens)} cached)`
+	const reasoningRow = reasoningTokens === undefined ? null : `  Reasoning: ${formatTokenCount(reasoningTokens)}`
+	return [
+		`${label}:`,
+		`  Input: ${formatTokenCount(inputTokens)}${cachedSuffix}`,
+		`  Output: ${formatTokenCount(outputTokens)}`,
+		reasoningRow,
+		`  Total: ${formatTokenCount(summed)}`,
+	]
+}
+
+const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, cumulativeThisTurn, cumulativeThisThread, children, size = 34 }) => {
 	const strokeWidth = 3
 	const radius = (size - strokeWidth) / 2
 	const hasData = !!usage && contextWindow > 0
@ -337,17 +357,23 @@ const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, c
 		// prompt cache (OpenAI `prompt_tokens_details.cached_tokens`, mirrored by OpenRouter,
 		// DeepSeek, etc.). Only show the line when the server actually reported a value —
 		// an undefined field means the server doesn't expose it, which is different from 0.
-		const inputLine = usage.cachedInputTokens !== undefined
-			? `Input: ${formatTokenCount(usage.inputTokens)} (${formatTokenCount(usage.cachedInputTokens)} cached)`
-			: `Input: ${formatTokenCount(usage.inputTokens)}`
+		// Tooltip layout:
+		//   1. Context-window ring summary (per-request, drives the ring color)
+		//   2. Last request breakdown (the per-request snapshot the ring is based on)
+		//   3. Cumulative this turn (sum across all loop iterations of the current user turn)
+		//   4. Cumulative this thread (lifetime sum across the whole chat history)
+		// The cumulative blocks are critical because agent loops issue many requests
+		// per turn — total billed tokens grow ~O(N²) while the ring only shows the
+		// last request's input.
 		tooltipContent = [
 			`Context window usage`,
 			`${formatTokenCount(total)} / ${formatTokenCount(contextWindow)} (${displayPct})`,
 			``,
-			inputLine,
-			`Output: ${formatTokenCount(usage.outputTokens)}`,
-			usage.reasoningTokens !== undefined ? `Reasoning: ${formatTokenCount(usage.reasoningTokens)}` : null,
-			`Total: ${formatTokenCount(total)}`,
+			...formatUsageBlock('Last request', usage),
+			``,
+			...formatUsageBlock('Cumulative this turn', cumulativeThisTurn),
+			``,
+			...formatUsageBlock('Cumulative this thread', cumulativeThisThread),
 		].filter(s => s !== null).join('\n')

 		svgEl = (
@ -400,6 +426,7 @@ const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, c
 const SubmitButtonWithUsageRing: React.FC<{ threadId: string; featureName: FeatureName; children: React.ReactNode }> = ({ threadId, featureName, children }) => {
 	const settingsState = useSettingsState()
 	const usage = useChatThreadLatestUsage(threadId)
+	const cumulative = useChatThreadCumulativeUsage(threadId)

 	const modelSelection = settingsState.modelSelectionOfFeature[featureName]
 	// Always render the wrapper so the send button doesn't jump sideways when
@ -410,7 +437,7 @@ const SubmitButtonWithUsageRing: React.FC<{ threadId: string; featureName: Featu
 		: 0

 	return (
-		<TokenUsageRing usage={usage} contextWindow={contextWindow}>
+		<TokenUsageRing usage={usage} contextWindow={contextWindow} cumulativeThisTurn={cumulative.thisTurn} cumulativeThisThread={cumulative.thisThread}>
 			{children}
 		</TokenUsageRing>
 	)
--- a/src/vs/workbench/contrib/void/browser/react/src/util/services.tsx
+++ b/src/vs/workbench/contrib/void/browser/react/src/util/services.tsx
@ -69,6 +69,8 @@ let chatThreadsStreamState: ThreadStreamState
 const chatThreadsStreamStateListeners: Set<(threadId: string) => void> = new Set()

 let chatThreadsLatestUsageOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
+let chatThreadsCumulativeUsageThisTurnOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
+let chatThreadsCumulativeUsageThisThreadOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}

 let settingsState: VoidSettingsState
 const settingsStateListeners: Set<(s: VoidSettingsState) => void> = new Set()
@ -122,10 +124,14 @@ export const _registerServices = (accessor: ServicesAccessor) => {
 	// same service, different state
 	chatThreadsStreamState = chatThreadsStateService.streamState
 	chatThreadsLatestUsageOfThreadId = chatThreadsStateService.latestUsageOfThreadId
+	chatThreadsCumulativeUsageThisTurnOfThreadId = chatThreadsStateService.cumulativeUsageThisTurnOfThreadId
+	chatThreadsCumulativeUsageThisThreadOfThreadId = chatThreadsStateService.cumulativeUsageThisThreadOfThreadId
 	disposables.push(
 		chatThreadsStateService.onDidChangeStreamState(({ threadId }) => {
 			chatThreadsStreamState = chatThreadsStateService.streamState
 			chatThreadsLatestUsageOfThreadId = chatThreadsStateService.latestUsageOfThreadId
+			chatThreadsCumulativeUsageThisTurnOfThreadId = chatThreadsStateService.cumulativeUsageThisTurnOfThreadId
+			chatThreadsCumulativeUsageThisThreadOfThreadId = chatThreadsStateService.cumulativeUsageThisThreadOfThreadId
 			chatThreadsStreamStateListeners.forEach(l => l(threadId))
 		})
 	)
@ -323,6 +329,35 @@ export const useChatThreadLatestUsage = (threadId: string) => {
 	return u
 }

+// Cumulative token usage across all LLM requests fired in the current user turn
+// (this-turn) and across the entire thread history (this-thread). In an agent
+// loop with N tool calls each request resends the full history, so total billed
+// tokens grow ~O(N²) — these counters expose that real cost vs. `latestUsage`'s
+// per-request snapshot. Re-renders only when one of the two values changes.
+export const useChatThreadCumulativeUsage = (threadId: string) => {
+	type CumulativeUsage = { thisTurn: LLMUsage | undefined, thisThread: LLMUsage | undefined }
+	const [u, su] = useState<CumulativeUsage>(() => ({
+		thisTurn: chatThreadsCumulativeUsageThisTurnOfThreadId[threadId],
+		thisThread: chatThreadsCumulativeUsageThisThreadOfThreadId[threadId],
+	}))
+	useEffect(() => {
+		// The shared listeners run on every stream-state change for the thread, not
+		// just usage changes; returning `prev` unchanged lets React skip the re-render.
+		const sync = () => su(prev => {
+			const thisTurn = chatThreadsCumulativeUsageThisTurnOfThreadId[threadId]
+			const thisThread = chatThreadsCumulativeUsageThisThreadOfThreadId[threadId]
+			return prev.thisTurn === thisTurn && prev.thisThread === thisThread ? prev : { thisTurn, thisThread }
+		})
+		sync() // re-read on mount and whenever `threadId` changes
+		const listener = (threadId_: string) => {
+			if (threadId_ === threadId) sync()
+		}
+		chatThreadsStreamStateListeners.add(listener)
+		return () => { chatThreadsStreamStateListeners.delete(listener) }
+	}, [su, threadId])
+	return u
+}
+
 export const useFullChatThreadsStreamState = () => {
 	const [s, ss] = useState(chatThreadsStreamState)
 	useEffect(() => {