mirror of
https://github.com/voideditor/void
synced 2026-05-22 17:08:25 +00:00
track accumulative token usage in a conversation thread (#9)
This commit is contained in:
parent
bbef0faa3b
commit
8d3b7400ff
3 changed files with 156 additions and 10 deletions
|
|
@ -124,6 +124,14 @@ export type ThreadType = {
|
|||
// after the user sends a new message).
|
||||
latestUsage?: LLMUsage;
|
||||
|
||||
// Sum of `LLMUsage` across every API request ever made on this thread.
|
||||
// In an agent loop with N tool calls, the loop fires N sequential requests
|
||||
// each carrying the full history + accumulated tool results — total billed
|
||||
// tokens are O(N²) while `latestUsage` only shows the latest request (O(N)).
|
||||
// This field surfaces the real cumulative cost so the user can see actual
|
||||
// billing impact, not just the last sample. Persisted alongside latestUsage.
|
||||
cumulativeUsageThisThread?: LLMUsage;
|
||||
|
||||
// Model used to send the most recent user message on this thread. Captured
|
||||
// on send, restored on `switchToThread` (writes to settings' `Chat` model
|
||||
// selection). `null` means "no message was sent on this thread yet"; if the
|
||||
|
|
@ -252,6 +260,15 @@ export interface IChatThreadService {
|
|||
readonly state: ThreadsState;
|
||||
readonly streamState: ThreadStreamState; // not persistent
|
||||
readonly latestUsageOfThreadId: { [threadId: string]: LLMUsage | undefined }; // hydrated from persisted threads on startup; updated as the model streams
|
||||
// Cumulative usage across all requests in the *current* user turn (reset
|
||||
// when a new user message is sent or a thread is opened/switched-to fresh).
|
||||
// Only lives in memory — not persisted, since "this turn" doesn't survive
|
||||
// a reload anyway.
|
||||
readonly cumulativeUsageThisTurnOfThreadId: { [threadId: string]: LLMUsage | undefined };
|
||||
// Cumulative usage across the entire thread history. Hydrated from the
|
||||
// persisted thread on startup so the user can see lifetime cost across
|
||||
// reloads.
|
||||
readonly cumulativeUsageThisThreadOfThreadId: { [threadId: string]: LLMUsage | undefined };
|
||||
|
||||
onDidChangeCurrentThread: Event<void>;
|
||||
onDidChangeStreamState: Event<{ threadId: string }>
|
||||
|
|
@ -330,6 +347,8 @@ class ChatThreadService extends Disposable implements IChatThreadService {
|
|||
|
||||
readonly streamState: ThreadStreamState = {}
|
||||
readonly latestUsageOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
|
||||
readonly cumulativeUsageThisTurnOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
|
||||
readonly cumulativeUsageThisThreadOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
|
||||
state: ThreadsState // allThreads is persisted, currentThread is not
|
||||
|
||||
// used in checkpointing
|
||||
|
|
@ -375,6 +394,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {
|
|||
// Hydrate the in-memory usage maps from the persisted threads.
for (const id in allThreads) {
	const t = allThreads[id]
	if (t?.latestUsage) this.latestUsageOfThreadId[id] = t.latestUsage
	if (t?.cumulativeUsageThisThread) {
		this.cumulativeUsageThisThreadOfThreadId[id] = t.cumulativeUsageThisThread
		// Seed the lifetime baseline as well. `_setLatestUsage` recomputes the thread
		// total as (baseline + current request), so with an empty baseline the first
		// request after a reload would replace the persisted lifetime total with that
		// single request's usage. Usage objects are never mutated in place (new objects
		// come out of `_addUsage`), so sharing the reference is safe.
		this._cumulativeThisThreadBaselineOfThreadId[id] = t.cumulativeUsageThisThread
	}
}
|
||||
|
||||
// always be in a thread
|
||||
|
|
@ -576,14 +596,70 @@ class ChatThreadService extends Disposable implements IChatThreadService {
|
|||
// Also persists on the thread so the ring shows the last-known value after a reload.
|
||||
private _setLatestUsage(threadId: string, usage: LLMUsage) {
	// `usage` is the running total of the request in flight, not a delta, so both
	// cumulative figures are rebuilt from their baselines (the sums locked in by
	// earlier finished requests) on every update instead of being incremented.
	const turnTotal = this._addUsage(this._cumulativeThisTurnBaselineOfThreadId[threadId], usage)
	const threadTotal = this._addUsage(this._cumulativeThisThreadBaselineOfThreadId[threadId], usage)

	this.latestUsageOfThreadId[threadId] = usage
	this.cumulativeUsageThisTurnOfThreadId[threadId] = turnTotal
	this.cumulativeUsageThisThreadOfThreadId[threadId] = threadTotal

	// Mirror the latest sample and the lifetime total onto the stored thread, then
	// write all threads back so both values are available after a reload.
	const storedThread = this.state.allThreads[threadId]
	if (storedThread) {
		storedThread.latestUsage = usage
		storedThread.cumulativeUsageThisThread = threadTotal
		this._storeAllThreads(this.state.allThreads)
	}

	// Listeners are notified whether or not the thread exists in `state.allThreads`.
	this._onDidChangeStreamState.fire({ threadId })
}
|
||||
|
||||
// Baseline = cumulative usage from previously-finalized requests in this
|
||||
// turn/thread. The current request's running total gets added on top in
|
||||
// `_setLatestUsage`. Moved forward by `_lockInCurrentRequestUsage` once a
|
||||
// request finishes, so the next request starts counting from where we
|
||||
// left off.
|
||||
private readonly _cumulativeThisTurnBaselineOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
|
||||
private readonly _cumulativeThisThreadBaselineOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
|
||||
|
||||
// Field-wise sum of two LLMUsage values; always returns a new object (or
// undefined when both operands are missing), never one of the inputs.
// A field is undefined in the result only when it is undefined on both sides;
// if just one side reports it, the other side counts as 0, so a request that
// omits e.g. `cachedInputTokens` does not wipe the count accumulated so far.
private _addUsage(a: LLMUsage | undefined, b: LLMUsage | undefined): LLMUsage | undefined {
	// With at most one operand present there is nothing to add: return a shallow
	// copy of whichever one exists.
	if (!a || !b) {
		const only = a ?? b
		return only ? { ...only } : undefined
	}

	const sum = (left: number | undefined, right: number | undefined): number | undefined =>
		left === undefined && right === undefined
			? undefined
			: (left ?? 0) + (right ?? 0)

	return {
		inputTokens: sum(a.inputTokens, b.inputTokens),
		outputTokens: sum(a.outputTokens, b.outputTokens),
		totalTokens: sum(a.totalTokens, b.totalTokens),
		reasoningTokens: sum(a.reasoningTokens, b.reasoningTokens),
		cachedInputTokens: sum(a.cachedInputTokens, b.cachedInputTokens),
	}
}
|
||||
|
||||
// Lock in the usage of the request that just finished so the next request's
// running total is added on top of it instead of replacing it. Called once per
// request (from `onFinalMessage`).
//
// The baselines are advanced to the cumulative totals that `_setLatestUsage`
// last computed (baseline + this request's usage) rather than by re-adding
// `latestUsageOfThreadId[threadId]`. In the code visible here that entry is not
// cleared between requests (and is hydrated from disk on startup), so when a
// request finishes without reporting any usage it still holds an older
// request's numbers; adding it again would count that request twice.
// When the finished request did report usage, both approaches give the same sum.
private _lockInCurrentRequestUsage(threadId: string) {
	// Copy so the baseline never aliases the object exposed to the UI / persisted
	// on the thread.
	const turnTotal = this.cumulativeUsageThisTurnOfThreadId[threadId]
	if (turnTotal) this._cumulativeThisTurnBaselineOfThreadId[threadId] = { ...turnTotal }

	const threadTotal = this.cumulativeUsageThisThreadOfThreadId[threadId]
	if (threadTotal) this._cumulativeThisThreadBaselineOfThreadId[threadId] = { ...threadTotal }
}
|
||||
|
||||
// Start a fresh "this turn" tally for the thread: clears both the locked-in
// baseline and the displayed total, then notifies stream-state listeners.
// The "this thread" (lifetime) counters are intentionally left untouched.
private _resetCumulativeThisTurn(threadId: string) {
	this._cumulativeThisTurnBaselineOfThreadId[threadId] = undefined
	this.cumulativeUsageThisTurnOfThreadId[threadId] = undefined

	this._onDidChangeStreamState.fire({ threadId })
}
|
||||
|
||||
|
||||
// ---------- streaming ----------
|
||||
|
||||
|
|
@ -960,6 +1036,9 @@ class ChatThreadService extends Disposable implements IChatThreadService {
|
|||
},
|
||||
onFinalMessage: async ({ fullText, fullReasoning, toolCall, anthropicReasoning, usage }) => {
|
||||
if (usage) this._setLatestUsage(threadId, usage)
|
||||
// Lock in this request's usage so the next loop iteration's
|
||||
// running total is added to (not replacing) what we already counted.
|
||||
this._lockInCurrentRequestUsage(threadId)
|
||||
resMessageIsDonePromise({ type: 'llmDone', toolCall, info: { fullText, fullReasoning, anthropicReasoning } }) // resolve with tool calls
|
||||
},
|
||||
onError: async (error) => {
|
||||
|
|
@ -1385,6 +1464,11 @@ We only need to do it for files that were edited since `from`, ie files between
|
|||
await this.abortRunning(threadId)
|
||||
}
|
||||
|
||||
// A new user message starts a new "turn" — zero out this-turn cumulative
|
||||
// before any LLM requests fire. Lifetime/this-thread cumulative keeps
|
||||
// accumulating across turns.
|
||||
this._resetCumulativeThisTurn(threadId)
|
||||
|
||||
// add dummy before this message to keep checkpoint before user message idea consistent
|
||||
if (thread.messages.length === 0) {
|
||||
this._addUserCheckpoint({ threadId })
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@
|
|||
import React, { ButtonHTMLAttributes, FormEvent, FormHTMLAttributes, Fragment, KeyboardEvent, useCallback, useEffect, useMemo, useRef, useState } from 'react';
|
||||
|
||||
|
||||
import { useAccessor, useChatThreadsState, useChatThreadsStreamState, useSettingsState, useActiveURI, useCommandBarState, useFullChatThreadsStreamState, useChatThreadLatestUsage } from '../util/services.js';
|
||||
import { useAccessor, useChatThreadsState, useChatThreadsStreamState, useSettingsState, useActiveURI, useCommandBarState, useFullChatThreadsStreamState, useChatThreadLatestUsage, useChatThreadCumulativeUsage } from '../util/services.js';
|
||||
import { ScrollType } from '../../../../../../../editor/common/editorCommon.js';
|
||||
|
||||
import { ChatMarkdownRender, ChatMessageLocation, getApplyBoxId } from '../markdown/ChatMarkdownRender.js';
|
||||
|
|
@ -311,10 +311,30 @@ interface TokenUsageRingProps {
|
|||
// ring is drawn — this prevents the send button from shifting once usage arrives
|
||||
usage: LLMUsage | undefined;
|
||||
contextWindow: number; // model's max input context, in tokens
|
||||
cumulativeThisTurn?: LLMUsage | undefined;
|
||||
cumulativeThisThread?: LLMUsage | undefined;
|
||||
children: React.ReactNode;
|
||||
size?: number;
|
||||
}
|
||||
const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, children, size = 34 }) => {
|
||||
|
||||
// Render one LLMUsage as tooltip text: a `label:` header followed by
// Input / Output / (optional) Reasoning / Total lines. Entries that should not
// be shown are returned as `null` for the caller to filter out. Plain text only
// (no HTML — react-tooltip's html mode is blocked by Trusted Types).
const formatUsageBlock = (label: string, u: LLMUsage | undefined): (string | null)[] => {
	// No usage recorded: a single placeholder line.
	if (!u) {
		return [`${label}: -`]
	}

	const { inputTokens, outputTokens, reasoningTokens, cachedInputTokens, totalTokens } = u

	// Prefer the reported total; otherwise add up the parts that were reported.
	const total = totalTokens ?? ((inputTokens ?? 0) + (outputTokens ?? 0) + (reasoningTokens ?? 0))

	// The cached count is appended only when the field was actually reported.
	let inputLine = ` Input: ${formatTokenCount(inputTokens)}`
	if (cachedInputTokens !== undefined) {
		inputLine += ` (${formatTokenCount(cachedInputTokens)} cached)`
	}

	const lines: (string | null)[] = [`${label}:`, inputLine]
	lines.push(` Output: ${formatTokenCount(outputTokens)}`)
	lines.push(reasoningTokens !== undefined ? ` Reasoning: ${formatTokenCount(reasoningTokens)}` : null)
	lines.push(` Total: ${formatTokenCount(total)}`)
	return lines
}
|
||||
|
||||
const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, cumulativeThisTurn, cumulativeThisThread, children, size = 34 }) => {
|
||||
const strokeWidth = 3
|
||||
const radius = (size - strokeWidth) / 2
|
||||
const hasData = !!usage && contextWindow > 0
|
||||
|
|
@ -337,17 +357,23 @@ const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, c
|
|||
// prompt cache (OpenAI `prompt_tokens_details.cached_tokens`, mirrored by OpenRouter,
|
||||
// DeepSeek, etc.). Only show the line when the server actually reported a value —
|
||||
// an undefined field means the server doesn't expose it, which is different from 0.
|
||||
const inputLine = usage.cachedInputTokens !== undefined
|
||||
? `Input: ${formatTokenCount(usage.inputTokens)} (${formatTokenCount(usage.cachedInputTokens)} cached)`
|
||||
: `Input: ${formatTokenCount(usage.inputTokens)}`
|
||||
// Tooltip layout:
|
||||
// 1. Context-window ring summary (per-request, drives the ring color)
|
||||
// 2. Last request breakdown (the per-request snapshot the ring is based on)
|
||||
// 3. Cumulative this turn (sum across all loop iterations of the current user turn)
|
||||
// 4. Cumulative this thread (lifetime sum across the whole chat history)
|
||||
// The cumulative blocks are critical because agent loops issue many requests
|
||||
// per turn — total billed tokens grow ~O(N²) while the ring only shows the
|
||||
// last request's input.
|
||||
tooltipContent = [
|
||||
`Context window usage`,
|
||||
`${formatTokenCount(total)} / ${formatTokenCount(contextWindow)} (${displayPct})`,
|
||||
``,
|
||||
inputLine,
|
||||
`Output: ${formatTokenCount(usage.outputTokens)}`,
|
||||
usage.reasoningTokens !== undefined ? `Reasoning: ${formatTokenCount(usage.reasoningTokens)}` : null,
|
||||
`Total: ${formatTokenCount(total)}`,
|
||||
...formatUsageBlock('Last request', usage),
|
||||
``,
|
||||
...formatUsageBlock('Cumulative this turn', cumulativeThisTurn),
|
||||
``,
|
||||
...formatUsageBlock('Cumulative this thread', cumulativeThisThread),
|
||||
].filter(s => s !== null).join('\n')
|
||||
|
||||
svgEl = (
|
||||
|
|
@ -400,6 +426,7 @@ const TokenUsageRing: React.FC<TokenUsageRingProps> = ({ usage, contextWindow, c
|
|||
const SubmitButtonWithUsageRing: React.FC<{ threadId: string; featureName: FeatureName; children: React.ReactNode }> = ({ threadId, featureName, children }) => {
|
||||
const settingsState = useSettingsState()
|
||||
const usage = useChatThreadLatestUsage(threadId)
|
||||
const cumulative = useChatThreadCumulativeUsage(threadId)
|
||||
|
||||
const modelSelection = settingsState.modelSelectionOfFeature[featureName]
|
||||
// Always render the wrapper so the send button doesn't jump sideways when
|
||||
|
|
@ -410,7 +437,7 @@ const SubmitButtonWithUsageRing: React.FC<{ threadId: string; featureName: Featu
|
|||
: 0
|
||||
|
||||
return (
|
||||
<TokenUsageRing usage={usage} contextWindow={contextWindow}>
|
||||
<TokenUsageRing usage={usage} contextWindow={contextWindow} cumulativeThisTurn={cumulative.thisTurn} cumulativeThisThread={cumulative.thisThread}>
|
||||
{children}
|
||||
</TokenUsageRing>
|
||||
)
|
||||
|
|
|
|||
|
|
@ -69,6 +69,8 @@ let chatThreadsStreamState: ThreadStreamState
|
|||
const chatThreadsStreamStateListeners: Set<(threadId: string) => void> = new Set()
|
||||
|
||||
let chatThreadsLatestUsageOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
|
||||
let chatThreadsCumulativeUsageThisTurnOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
|
||||
let chatThreadsCumulativeUsageThisThreadOfThreadId: { [threadId: string]: LLMUsage | undefined } = {}
|
||||
|
||||
let settingsState: VoidSettingsState
|
||||
const settingsStateListeners: Set<(s: VoidSettingsState) => void> = new Set()
|
||||
|
|
@ -122,10 +124,14 @@ export const _registerServices = (accessor: ServicesAccessor) => {
|
|||
// same service, different state
|
||||
chatThreadsStreamState = chatThreadsStateService.streamState
|
||||
chatThreadsLatestUsageOfThreadId = chatThreadsStateService.latestUsageOfThreadId
|
||||
chatThreadsCumulativeUsageThisTurnOfThreadId = chatThreadsStateService.cumulativeUsageThisTurnOfThreadId
|
||||
chatThreadsCumulativeUsageThisThreadOfThreadId = chatThreadsStateService.cumulativeUsageThisThreadOfThreadId
|
||||
disposables.push(
|
||||
chatThreadsStateService.onDidChangeStreamState(({ threadId }) => {
|
||||
chatThreadsStreamState = chatThreadsStateService.streamState
|
||||
chatThreadsLatestUsageOfThreadId = chatThreadsStateService.latestUsageOfThreadId
|
||||
chatThreadsCumulativeUsageThisTurnOfThreadId = chatThreadsStateService.cumulativeUsageThisTurnOfThreadId
|
||||
chatThreadsCumulativeUsageThisThreadOfThreadId = chatThreadsStateService.cumulativeUsageThisThreadOfThreadId
|
||||
chatThreadsStreamStateListeners.forEach(l => l(threadId))
|
||||
})
|
||||
)
|
||||
|
|
@ -323,6 +329,35 @@ export const useChatThreadLatestUsage = (threadId: string) => {
|
|||
return u
|
||||
}
|
||||
|
||||
// React hook exposing the cumulative token usage of a thread:
//   - `thisTurn`:   sum over the LLM requests fired in the current user turn
//   - `thisThread`: sum over the whole thread history
// In an agent loop every request resends the full history, so total billed
// tokens grow much faster than the per-request `latestUsage` snapshot suggests;
// these counters expose that real cost.
//
// The hook subscribes to the shared stream-state listener set, which is invoked
// for every stream-state event of the thread, not only when usage changes. To
// avoid re-rendering on events that left usage untouched, the state object is
// replaced only when one of the two usage references actually changed (the
// service swaps in a new object whenever a value changes, so reference
// equality is sufficient).
export const useChatThreadCumulativeUsage = (threadId: string) => {
	type CumulativeUsage = { thisTurn: LLMUsage | undefined, thisThread: LLMUsage | undefined }

	const [u, su] = useState<CumulativeUsage>(() => ({
		thisTurn: chatThreadsCumulativeUsageThisTurnOfThreadId[threadId],
		thisThread: chatThreadsCumulativeUsageThisThreadOfThreadId[threadId],
	}))

	useEffect(() => {
		// Pull the current values; returning `prev` unchanged lets React skip the re-render.
		const sync = () => su(prev => {
			const thisTurn = chatThreadsCumulativeUsageThisTurnOfThreadId[threadId]
			const thisThread = chatThreadsCumulativeUsageThisThreadOfThreadId[threadId]
			return prev.thisTurn === thisTurn && prev.thisThread === thisThread
				? prev
				: { thisTurn, thisThread }
		})

		// Re-sync immediately: `threadId` may have changed since the state was initialised.
		sync()

		const listener = (threadId_: string) => {
			if (threadId_ !== threadId) return
			sync()
		}
		chatThreadsStreamStateListeners.add(listener)
		return () => { chatThreadsStreamStateListeners.delete(listener) }
	}, [su, threadId])

	return u
}
|
||||
|
||||
export const useFullChatThreadsStreamState = () => {
|
||||
const [s, ss] = useState(chatThreadsStreamState)
|
||||
useEffect(() => {
|
||||
|
|
|
|||
Loading…
Reference in a new issue