Feature/improve chat experience (#7)

* handle relative paths when the agent asks to interact with files or directories
* handle Gemini reasoning output
* move volatile system-message context out of the system prompt and into the user message
* handle reasoning output from OpenAI-compatible gateways (e.g. opencode)
* preserve the original serialized tool-call arguments to improve cache hit rate
2026-05-22 17:08:25 +00:00 · 2026-04-22 11:29:49 +08:00 · 2026-04-22 11:29:49 +08:00 · 7603d8f9a2
commit 7603d8f9a2
parent f4e5b9e91a
9 changed files with 238 additions and 81 deletions
--- a/src/vs/workbench/contrib/void/browser/chatThreadService.ts
+++ b/src/vs/workbench/contrib/void/browser/chatThreadService.ts
@ -205,6 +205,7 @@ export type ThreadStreamState = {
 			id: string;
 			content: string;
 			rawParams: RawToolParamsObj;
+			rawParamsStr?: string;
 			mcpServerName: string | undefined;
 		};
 		interrupt: Promise<() => void>;
@ -536,7 +537,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 			// if running now but stream state doesn't indicate it (happens if restart Void), cancel that last tool
 			if (lastMessage && lastMessage.role === 'tool' && lastMessage.type === 'running_now') {

-				this._updateLatestTool(threadId, { role: 'tool', type: 'rejected', content: lastMessage.content, id: lastMessage.id, rawParams: lastMessage.rawParams, result: null, name: lastMessage.name, params: lastMessage.params, mcpServerName: lastMessage.mcpServerName })
+				this._updateLatestTool(threadId, { role: 'tool', type: 'rejected', content: lastMessage.content, id: lastMessage.id, rawParams: lastMessage.rawParams, rawParamsStr: lastMessage.rawParamsStr, result: null, name: lastMessage.name, params: lastMessage.params, mcpServerName: lastMessage.mcpServerName })
 			}

 		}
@ -681,10 +682,10 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 		}
 		else return

-		const { name, id, rawParams, mcpServerName } = lastMsg
+		const { name, id, rawParams, rawParamsStr, mcpServerName } = lastMsg

 		const errorMessage = this.toolErrMsgs.rejected
-		this._updateLatestTool(threadId, { role: 'tool', type: 'rejected', params: params, name: name, content: errorMessage, result: null, id, rawParams, mcpServerName })
+		this._updateLatestTool(threadId, { role: 'tool', type: 'rejected', params: params, name: name, content: errorMessage, result: null, id, rawParams, rawParamsStr, mcpServerName })
 		this._setStreamState(threadId, undefined)
 	}

@ -704,9 +705,9 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 		}
 		// add tool that's running
 		else if (this.streamState[threadId]?.isRunning === 'tool') {
-			const { toolName, toolParams, id, content: content_, rawParams, mcpServerName } = this.streamState[threadId].toolInfo
+			const { toolName, toolParams, id, content: content_, rawParams, rawParamsStr, mcpServerName } = this.streamState[threadId].toolInfo
 			const content = content_ || this.toolErrMsgs.interrupted
-			this._updateLatestTool(threadId, { role: 'tool', name: toolName, params: toolParams, id, content, rawParams, type: 'rejected', result: null, mcpServerName })
+			this._updateLatestTool(threadId, { role: 'tool', name: toolName, params: toolParams, id, content, rawParams, rawParamsStr, type: 'rejected', result: null, mcpServerName })
 		}
 		// reject the tool for the user if relevant
 		else if (this.streamState[threadId]?.isRunning === 'awaiting_user') {
@ -745,8 +746,12 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 		toolName: ToolName,
 		toolId: string,
 		mcpServerName: string | undefined,
-		opts: { preapproved: true, unvalidatedToolParams: RawToolParamsObj, validatedParams: ToolCallParams<ToolName> } | { preapproved: false, unvalidatedToolParams: RawToolParamsObj },
+		opts: { preapproved: true, unvalidatedToolParams: RawToolParamsObj, validatedParams: ToolCallParams<ToolName>, rawParamsStr?: string } | { preapproved: false, unvalidatedToolParams: RawToolParamsObj, rawParamsStr?: string },
 	): Promise<{ awaitingUserApproval?: boolean, interrupted?: boolean }> => {
+		// Carry the model's original serialized arguments string (when available) into
+		// every tool message we persist. This lets the replay path send byte-identical
+		// tool_calls back to the provider, preserving the prefix cache across turns.
+		const rawParamsStr = opts.rawParamsStr

 		// compute these below
 		let toolParams: ToolCallParams<ToolName>
@ -770,7 +775,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 			}
 			catch (error) {
 				const errorMessage = getErrorMessage(error)
-				this._addMessageToThread(threadId, { role: 'tool', type: 'invalid_params', rawParams: opts.unvalidatedToolParams, result: null, name: toolName, content: errorMessage, id: toolId, mcpServerName })
+				this._addMessageToThread(threadId, { role: 'tool', type: 'invalid_params', rawParams: opts.unvalidatedToolParams, rawParamsStr, result: null, name: toolName, content: errorMessage, id: toolId, mcpServerName })
 				return {}
 			}
 			// once validated, add checkpoint for edit
@ -783,7 +788,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 			if (approvalType) {
 				const autoApprove = this._settingsService.state.globalSettings.autoApprove[approvalType]
 				// add a tool_request because we use it for UI if a tool is loading (this should be improved in the future)
-				this._addMessageToThread(threadId, { role: 'tool', type: 'tool_request', content: '(Awaiting user permission...)', result: null, name: toolName, params: toolParams, id: toolId, rawParams: opts.unvalidatedToolParams, mcpServerName })
+				this._addMessageToThread(threadId, { role: 'tool', type: 'tool_request', content: '(Awaiting user permission...)', result: null, name: toolName, params: toolParams, id: toolId, rawParams: opts.unvalidatedToolParams, rawParamsStr, mcpServerName })
 				if (!autoApprove) {
 					return { awaitingUserApproval: true }
 				}
@ -800,7 +805,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {

 		// 3. call the tool
 		// this._setStreamState(threadId, { isRunning: 'tool' }, 'merge')
-		const runningTool = { role: 'tool', type: 'running_now', name: toolName, params: toolParams, content: '(value not received yet...)', result: null, id: toolId, rawParams: opts.unvalidatedToolParams, mcpServerName } as const
+		const runningTool = { role: 'tool', type: 'running_now', name: toolName, params: toolParams, content: '(value not received yet...)', result: null, id: toolId, rawParams: opts.unvalidatedToolParams, rawParamsStr, mcpServerName } as const
 		this._updateLatestTool(threadId, runningTool)


@ -810,7 +815,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 		try {

 			// set stream state
-			this._setStreamState(threadId, { isRunning: 'tool', interrupt: interruptorPromise, toolInfo: { toolName, toolParams, id: toolId, content: 'interrupted...', rawParams: opts.unvalidatedToolParams, mcpServerName } })
+			this._setStreamState(threadId, { isRunning: 'tool', interrupt: interruptorPromise, toolInfo: { toolName, toolParams, id: toolId, content: 'interrupted...', rawParams: opts.unvalidatedToolParams, rawParamsStr, mcpServerName } })

 			if (isBuiltInTool) {
 				const { result, interruptTool } = await this._toolsService.callTool[toolName](toolParams as any)
@ -840,7 +845,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 			if (interrupted) { return { interrupted: true } } // the tool result is added where we interrupt, not here

 			const errorMessage = getErrorMessage(error)
-			this._updateLatestTool(threadId, { role: 'tool', type: 'tool_error', params: toolParams, result: errorMessage, name: toolName, content: errorMessage, id: toolId, rawParams: opts.unvalidatedToolParams, mcpServerName })
+			this._updateLatestTool(threadId, { role: 'tool', type: 'tool_error', params: toolParams, result: errorMessage, name: toolName, content: errorMessage, id: toolId, rawParams: opts.unvalidatedToolParams, rawParamsStr, mcpServerName })
 			return {}
 		}

@ -855,12 +860,12 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 			}
 		} catch (error) {
 			const errorMessage = this.toolErrMsgs.errWhenStringifying(error)
-			this._updateLatestTool(threadId, { role: 'tool', type: 'tool_error', params: toolParams, result: errorMessage, name: toolName, content: errorMessage, id: toolId, rawParams: opts.unvalidatedToolParams, mcpServerName })
+			this._updateLatestTool(threadId, { role: 'tool', type: 'tool_error', params: toolParams, result: errorMessage, name: toolName, content: errorMessage, id: toolId, rawParams: opts.unvalidatedToolParams, rawParamsStr, mcpServerName })
 			return {}
 		}

 		// 5. add to history and keep going
-		this._updateLatestTool(threadId, { role: 'tool', type: 'success', params: toolParams, result: toolResult, name: toolName, content: toolResultStr, id: toolId, rawParams: opts.unvalidatedToolParams, mcpServerName })
+		this._updateLatestTool(threadId, { role: 'tool', type: 'success', params: toolParams, result: toolResult, name: toolName, content: toolResultStr, id: toolId, rawParams: opts.unvalidatedToolParams, rawParamsStr, mcpServerName })
 		return {}
 	};

@ -895,7 +900,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {

 		// before enter loop, call tool
 		if (callThisToolFirst) {
-			const { interrupted } = await this._runToolCall(threadId, callThisToolFirst.name, callThisToolFirst.id, callThisToolFirst.mcpServerName, { preapproved: true, unvalidatedToolParams: callThisToolFirst.rawParams, validatedParams: callThisToolFirst.params })
+			const { interrupted } = await this._runToolCall(threadId, callThisToolFirst.name, callThisToolFirst.id, callThisToolFirst.mcpServerName, { preapproved: true, unvalidatedToolParams: callThisToolFirst.rawParams, rawParamsStr: callThisToolFirst.rawParamsStr, validatedParams: callThisToolFirst.params })
 			if (interrupted) {
 				this._setStreamState(threadId, undefined)
 				this._addUserCheckpoint({ threadId })
@ -1026,7 +1031,7 @@ class ChatThreadService extends Disposable implements IChatThreadService {
 					const mcpTools = this._mcpService.getMCPTools()
 					const mcpTool = mcpTools?.find(t => t.name === toolCall.name)

-					const { awaitingUserApproval, interrupted } = await this._runToolCall(threadId, toolCall.name, toolCall.id, mcpTool?.mcpServerName, { preapproved: false, unvalidatedToolParams: toolCall.rawParams })
+					const { awaitingUserApproval, interrupted } = await this._runToolCall(threadId, toolCall.name, toolCall.id, mcpTool?.mcpServerName, { preapproved: false, unvalidatedToolParams: toolCall.rawParams, rawParamsStr: toolCall.rawParamsStr })
 					if (interrupted) {
 						this._setStreamState(threadId, undefined)
 						return
@ -1391,7 +1396,19 @@ We only need to do it for files that were edited since `from`, ie files between
 		const currSelns: StagingSelectionItem[] = _chatSelections ?? thread.state.stagingSelections

 		const userMessageContent = await chat_userMessageContent(instructions, currSelns, { directoryStrService: this._directoryStringService, fileService: this._fileService }) // user message + names of files (NOT content)
-		const userHistoryElt: ChatMessage = { role: 'user', content: userMessageContent, displayContent: instructions, selections: currSelns, state: defaultMessageState }
+
+		// Snapshot the volatile runtime context (date, open files, active URI,
+		// directory listing, terminal IDs) into this user message's stored content
+		// so past turns stay byte-identical across subsequent requests. The volatile
+		// block goes into `content` (what the LLM sees) but NOT into `displayContent`
+		// (what the UI renders), so the chat bubble shows only the user's words.
+		const { chatMode } = this._settingsService.state.globalSettings
+		const volatileBlock = await this._convertToLLMMessagesService.generateChatVolatileContext({ chatMode })
+		const contentWithVolatile = volatileBlock
+			? `${volatileBlock}\n\n${userMessageContent}`
+			: userMessageContent
+
+		const userHistoryElt: ChatMessage = { role: 'user', content: contentWithVolatile, displayContent: instructions, selections: currSelns, state: defaultMessageState }
 		this._addMessageToThread(threadId, userHistoryElt)

 		this._setThreadState(threadId, { currCheckpointIdx: null }) // no longer at a checkpoint because started streaming
--- a/src/vs/workbench/contrib/void/browser/convertToLLMMessageService.ts
+++ b/src/vs/workbench/contrib/void/browser/convertToLLMMessageService.ts
@ -7,7 +7,7 @@ import { IWorkspaceContextService } from '../../../../platform/workspace/common/
 import { IEditorService } from '../../../services/editor/common/editorService.js';
 import { ChatMessage } from '../common/chatThreadServiceTypes.js';
 import { getIsReasoningEnabledState, getReservedOutputTokenSpace, getModelCapabilities } from '../common/modelCapabilities.js';
-import { reParsedToolXMLString, chat_systemMessage } from '../common/prompt/prompts.js';
+import { reParsedToolXMLString, chat_systemMessage, chat_volatileContext } from '../common/prompt/prompts.js';
 import { AnthropicLLMChatMessage, AnthropicReasoning, GeminiLLMChatMessage, LLMChatMessage, LLMFIMMessage, OpenAILLMChatMessage, RawToolParamsObj } from '../common/sendLLMMessageTypes.js';
 import { IVoidSettingsService } from '../common/voidSettingsService.js';
 import { ChatMode, FeatureName, ModelSelection, ProviderName } from '../common/voidSettingsTypes.js';
@ -29,6 +29,10 @@ type SimpleLLMMessage = {
 	id: string;
 	name: ToolName;
 	rawParams: RawToolParamsObj;
+	// Original serialized arguments string from the model's tool call (OpenAI-compat
+	// only). When present, used verbatim on replay to keep the provider's prefix cache
+	// matching across turns. Falls back to JSON.stringify(rawParams) when absent.
+	rawParamsStr?: string;
 } | {
 	role: 'user';
 	content: string;
@ -84,12 +88,17 @@ const prepareMessages_openai_tools = (messages: SimpleLLMMessage[]): AnthropicOr
 		// edit previous assistant message to have called the tool
 		const prevMsg = 0 <= i - 1 && i - 1 <= newMessages.length ? newMessages[i - 1] : undefined
 		if (prevMsg?.role === 'assistant') {
+			// Prefer the model's original serialized argument string when we have it
+			// (OpenAI-compatible providers expose it in the streaming delta). Sending
+			// the exact same bytes back preserves the provider's prefix cache past the
+			// tool call. Fall back to re-serializing when the raw string is unavailable
+			// (e.g. conversations from before this field existed, or non-OpenAI provenance).
 			prevMsg.tool_calls = [{
 				type: 'function',
 				id: currMsg.id,
 				function: {
 					name: currMsg.name,
-					arguments: JSON.stringify(currMsg.rawParams)
+					arguments: currMsg.rawParamsStr ?? JSON.stringify(currMsg.rawParams)
 				}
 			}]
 		}
@ -524,6 +533,12 @@ export interface IConvertToLLMMessageService {
 	prepareLLMSimpleMessages: (opts: { simpleMessages: SimpleLLMMessage[], systemMessage: string, modelSelection: ModelSelection | null, featureName: FeatureName }) => { messages: LLMChatMessage[], separateSystemMessage: string | undefined }
 	prepareLLMChatMessages: (opts: { chatMessages: ChatMessage[], chatMode: ChatMode, modelSelection: ModelSelection | null }) => Promise<{ messages: LLMChatMessage[], separateSystemMessage: string | undefined }>
 	prepareFIMMessage(opts: { messages: LLMFIMMessage, }): { prefix: string, suffix: string, stopTokens: string[] }
+	// Called by chat creation paths to snapshot runtime grounding (date, open files,
+	// active URI, directory listing, terminal IDs) into a user message at storage time.
+	// Baking volatile into the stored content (rather than prepending at send time)
+	// keeps prior turns byte-identical across requests so the provider's prefix cache
+	// stays warm turn-over-turn.
+	generateChatVolatileContext: (opts: { chatMode: ChatMode }) => Promise<string>
 }

 export const IConvertToLLMMessageService = createDecorator<IConvertToLLMMessageService>('ConvertToLLMMessageService');
@ -575,26 +590,30 @@ class ConvertToLLMMessageService extends Disposable implements IConvertToLLMMess
 	}


-	// system message
-	private _generateChatMessagesSystemMessage = async (chatMode: ChatMode, specialToolFormat: 'openai-style' | 'anthropic-style' | 'gemini-style' | undefined) => {
-		const workspaceFolders = this.workspaceContextService.getWorkspace().folders.map(f => f.uri.fsPath)
+	// Computes the stable system message only; the volatile-context block is built separately.
+	// The stable system message contains only cacheable content (persona, rules, tool
+	// definitions). The volatile block (runtime grounding: date, open files, active
+	// URI, directory listing, terminal IDs) is generated separately via
+	// `generateChatVolatileContext` and baked into the user message at storage time
+	// by the chat thread creation path — that keeps historical turns byte-identical
+	// across requests so the provider's prefix cache stays warm.
+	private _generateChatSystemMessage = (chatMode: ChatMode, specialToolFormat: 'openai-style' | 'anthropic-style' | 'gemini-style' | undefined) => {
+		const includeXMLToolDefinitions = !specialToolFormat
+		const mcpTools = this.mcpService.getMCPTools()
+		return chat_systemMessage({ chatMode, mcpTools, includeXMLToolDefinitions })
+	}

+	generateChatVolatileContext: IConvertToLLMMessageService['generateChatVolatileContext'] = async ({ chatMode }) => {
+		const workspaceFolders = this.workspaceContextService.getWorkspace().folders.map(f => f.uri.fsPath)
 		const openedURIs = this.modelService.getModels().filter(m => m.isAttachedToEditor()).map(m => m.uri.fsPath) || [];
 		const activeURI = this.editorService.activeEditor?.resource?.fsPath;
-
 		const directoryStr = await this.directoryStrService.getAllDirectoriesStr({
 			cutOffMessage: chatMode === 'agent' || chatMode === 'gather' ?
 				`...Directories string cut off, use tools to read more...`
 				: `...Directories string cut off, ask user for more if necessary...`
 		})
-
-		const includeXMLToolDefinitions = !specialToolFormat
-
-		const mcpTools = this.mcpService.getMCPTools()
-
 		const persistentTerminalIDs = this.terminalToolService.listPersistentTerminalIds()
-		const systemMessage = chat_systemMessage({ workspaceFolders, openedURIs, directoryStr, activeURI, persistentTerminalIDs, chatMode, mcpTools, includeXMLToolDefinitions })
-		return systemMessage
+		return chat_volatileContext({ workspaceFolders, openedURIs, activeURI, persistentTerminalIDs, directoryStr, chatMode })
 	}


@ -622,6 +641,7 @@ class ConvertToLLMMessageService extends Disposable implements IConvertToLLMMess
 					name: m.name,
 					id: m.id,
 					rawParams: m.rawParams,
+					rawParamsStr: m.rawParamsStr,
 				})
 			}
 			else if (m.role === 'user') {
@ -680,7 +700,7 @@ class ConvertToLLMMessageService extends Disposable implements IConvertToLLMMess
 		} = getModelCapabilities(providerName, modelName, overridesOfModel)

 		const { disableSystemMessage } = this.voidSettingsService.state.globalSettings;
-		const fullSystemMessage = await this._generateChatMessagesSystemMessage(chatMode, specialToolFormat)
+		const fullSystemMessage = this._generateChatSystemMessage(chatMode, specialToolFormat)
 		const systemMessage = disableSystemMessage ? '' : fullSystemMessage;

 		const modelSelectionOptions = this.voidSettingsService.state.optionsOfModelSelection['Chat'][modelSelection.providerName]?.[modelSelection.modelName]
@ -689,6 +709,11 @@ class ConvertToLLMMessageService extends Disposable implements IConvertToLLMMess
 		const aiInstructions = this._getCombinedAIInstructions();
 		const isReasoningEnabled = getIsReasoningEnabledState('Chat', providerName, modelName, modelSelectionOptions, overridesOfModel)
 		const reservedOutputTokenSpace = getReservedOutputTokenSpace(providerName, modelName, { isReasoningEnabled, overridesOfModel })
+		// Volatile context is baked into each user message when it is added to the thread
+		// (see `chatThreadService._addUserMessageAndStreamResponse`). At send time
+		// the stored content is passed through verbatim so each past turn is
+		// byte-identical to what was sent before, keeping the provider's prefix
+		// cache warm across turns.
 		const llmMessages = this._chatMessagesToSimpleMessages(chatMessages)

 		const { messages, separateSystemMessage } = prepareMessages({
--- a/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx
+++ b/src/vs/workbench/contrib/void/browser/react/src/sidebar-tsx/SidebarChat.tsx
@ -1515,12 +1515,23 @@ const ReasoningWrapper = ({ isDoneReasoning, isStreaming, children }: { isDoneRe
 	const isDone = isDoneReasoning || !isStreaming
 	const isWriting = !isDone
 	const [isOpen, setIsOpen] = useState(isWriting)
+	const scrollRef = useRef<HTMLDivElement>(null)
 	useEffect(() => {
 		if (!isWriting) setIsOpen(false) // if just finished reasoning, close
 	}, [isWriting])
+	// While streaming, keep the box pinned to the bottom so the user sees the
+	// latest thoughts without having to scroll. Once done, respect user scroll.
+	useEffect(() => {
+		if (!isWriting || !isOpen) return
+		const el = scrollRef.current
+		if (el) el.scrollTop = el.scrollHeight
+	}, [children, isWriting, isOpen])
 	return <ToolHeaderWrapper title='Reasoning' desc1={isWriting ? <IconLoading /> : ''} isOpen={isOpen} onClick={() => setIsOpen(v => !v)}>
 		<ToolChildrenWrapper>
-			<div className='!select-text cursor-auto'>
+			<div
+				ref={scrollRef}
+				className='!select-text cursor-auto max-h-60 overflow-y-auto'
+			>
 				{children}
 			</div>
 		</ToolChildrenWrapper>
--- a/src/vs/workbench/contrib/void/browser/toolsService.ts
+++ b/src/vs/workbench/contrib/void/browser/toolsService.ts
@ -38,38 +38,60 @@ const validateStr = (argName: string, value: unknown) => {
 }


-// We are NOT checking to make sure in workspace
-const validateURI = (uriStr: unknown) => {
+// Detects whether a plain path string is absolute.
+// - Unix absolute: starts with '/'
+// - Windows absolute: drive letter followed by ':\' or ':/' (e.g. 'C:\...', 'c:/...')
+// - UNC path: starts with '\\'
+const isAbsolutePathString = (s: string) => {
+	if (s.startsWith('/')) return true
+	if (s.startsWith('\\\\')) return true
+	if (/^[a-zA-Z]:[\\/]/.test(s)) return true
+	return false
+}
+
+// We are NOT checking to make sure in workspace.
+// workspaceRoot is optional; when provided, bare relative paths like "src/foo.ts" or
+// "./README.md" are resolved against it. Without it (or when no workspace is open),
+// we fall back to URI.file which resolves relative paths against the filesystem root —
+// same as the legacy behavior, but that's the pathological case we want to avoid.
+// Prefer the workspace-aware `validateURI` bound inside ToolsService; this raw
+// version is module-private (not exported) and exists only for that wrapper to reuse.
+const validateURIWithRoot = (uriStr: unknown, workspaceRoot?: URI | null) => {
 	if (uriStr === null) throw new Error(`Invalid LLM output: uri was null.`)
 	if (typeof uriStr !== 'string') throw new Error(`Invalid LLM output format: Provided uri must be a string, but it's a(n) ${typeof uriStr}. Full value: ${JSON.stringify(uriStr)}.`)

-	// Check if it's already a full URI with scheme (e.g., vscode-remote://, file://, etc.)
-	// Look for :// pattern which indicates a scheme is present
-	// Examples of supported URIs:
-	// - vscode-remote://wsl+Ubuntu/home/user/file.txt (WSL)
-	// - vscode-remote://ssh-remote+myserver/home/user/file.txt (SSH)
-	// - file:///home/user/file.txt (local file with scheme)
-	// - /home/user/file.txt (local file path, will be converted to file://)
-	// - C:\Users\file.txt (Windows local path, will be converted to file://)
+	// Scheme-qualified URI (e.g. vscode-remote://, file://, etc.) — parse as-is.
 	if (uriStr.includes('://')) {
 		try {
 			const uri = URI.parse(uriStr)
 			return uri
 		} catch (e) {
-			// If parsing fails, it's a malformed URI
 			throw new Error(`Invalid URI format: ${uriStr}. Error: ${e}`)
 		}
-	} else {
-		// No scheme present, treat as file path
-		// This handles regular file paths like /home/user/file.txt or C:\Users\file.txt
-		const uri = URI.file(uriStr)
-		return uri
 	}
+
+	// Absolute path — safe to pass to URI.file.
+	if (isAbsolutePathString(uriStr)) {
+		return URI.file(uriStr)
+	}
+
+	// Relative path (e.g. "README.md", "src/foo.ts", "./foo", "../bar").
+	// Resolve against workspace root when available. This is the critical branch:
+	// weak models naturally produce bare filenames, and without this resolution
+	// URI.file("README.md") would become file:///README.md (root of filesystem),
+	// forcing models to fall back to terminal commands.
+	if (workspaceRoot) {
+		return URI.joinPath(workspaceRoot, uriStr)
+	}
+
+	// No workspace — legacy fallback. Will resolve from filesystem root and likely fail,
+	// but preserves prior behavior for the (rare) no-workspace case.
+	return URI.file(uriStr)
 }

-const validateOptionalURI = (uriStr: unknown) => {
+const validateOptionalURIWithRoot = (uriStr: unknown, workspaceRoot?: URI | null) => {
 	if (isFalsy(uriStr)) return null
-	return validateURI(uriStr)
+	return validateURIWithRoot(uriStr, workspaceRoot)
 }

 const validateOptionalStr = (argName: string, str: unknown) => {
@ -156,6 +178,16 @@ export class ToolsService implements IToolsService {
 	) {
 		const queryBuilder = instantiationService.createInstance(QueryBuilder);

+		// Resolve the workspace root lazily (at call time, not at construction time) so that a
+		// workspace switch is picked up. In a multi-root workspace only the first folder is used.
+		// These wrap the module-level `*WithRoot` helpers so the 11+ call sites below stay terse.
+		const getWorkspaceRoot = (): URI | null => {
+			const folders = workspaceContextService.getWorkspace().folders
+			return folders.length > 0 ? folders[0].uri : null
+		}
+		const validateURI = (uriStr: unknown) => validateURIWithRoot(uriStr, getWorkspaceRoot())
+		const validateOptionalURI = (uriStr: unknown) => validateOptionalURIWithRoot(uriStr, getWorkspaceRoot())
+
 		this.validateParams = {
 			read_file: (params: RawToolParamsObj) => {
 				const { uri: uriStr, start_line: startLineUnknown, end_line: endLineUnknown, page_number: pageNumberUnknown } = params
--- a/src/vs/workbench/contrib/void/common/chatThreadServiceTypes.ts
+++ b/src/vs/workbench/contrib/void/common/chatThreadServiceTypes.ts
@ -13,6 +13,10 @@ export type ToolMessage<T extends ToolName> = {
 	content: string; // give this result to LLM (string of value)
 	id: string;
 	rawParams: RawToolParamsObj;
+	// Original serialized `arguments` string from the model's tool call (when available
+	// from the provider stream — OpenAI-compatible only). Used on replay to send
+	// byte-identical tool_calls back, preserving the provider's prefix cache.
+	rawParamsStr?: string;
 	mcpServerName: string | undefined; // the server name at the time of the call
 } & (
 		// in order of events:
--- a/src/vs/workbench/contrib/void/common/modelCapabilities.ts
+++ b/src/vs/workbench/contrib/void/common/modelCapabilities.ts
@ -227,9 +227,12 @@ type ProviderReasoningIOSettings = {
 	// include this in payload to get reasoning
 	input?: { includeInPayload?: (reasoningState: SendableReasoningInfo) => null | { [key: string]: any }, };
 	// nameOfFieldInDelta: reasoning output is in response.choices[0].delta[deltaReasoningField]
+	// may be a single field name or a list of candidates tried in order (first non-empty wins) —
+	// lets one provider entry cover gateways that standardize on different field names
+	// (e.g. DeepSeek uses `reasoning_content`, OpenRouter uses `reasoning`).
 	// needsManualParse: whether we must manually parse out the <think> tags
 	output?:
-	| { nameOfFieldInDelta?: string, needsManualParse?: undefined, }
+	| { nameOfFieldInDelta?: string | string[], needsManualParse?: undefined, }
 	| { nameOfFieldInDelta?: undefined, needsManualParse?: true, };
 }

@ -1254,9 +1257,12 @@ const openaiCompatible: VoidStaticProviderInfo = {
 	modelOptionsFallback: (modelName) => extensiveModelOptionsFallback(modelName),
 	modelOptions: {},
 	providerReasoningIOSettings: {
-		// reasoning: we have no idea what endpoint they used, so we can't consistently parse out reasoning
+		// reasoning: we have no idea what endpoint they used, so cover the common field names.
+		// `reasoning_content` — DeepSeek, vLLM, many self-hosted servers
+		// `reasoning`         — OpenRouter, opencode-style gateways
+		// `thinking`          — some Chinese gateways (Moonshot, Zhipu)
 		input: { includeInPayload: openAICompatIncludeInPayloadReasoning },
-		output: { nameOfFieldInDelta: 'reasoning_content' },
+		output: { nameOfFieldInDelta: ['reasoning_content', 'reasoning', 'thinking'] },
 	},
 }

--- a/src/vs/workbench/contrib/void/common/prompt/prompts.ts
+++ b/src/vs/workbench/contrib/void/common/prompt/prompts.ts
@ -425,17 +425,25 @@ const systemToolsXMLPrompt = (chatMode: ChatMode, mcpTools: InternalToolInfo[] |
 // ======================================================== chat (normal, gather, agent) ========================================================


-export const chat_systemMessage = ({ workspaceFolders, openedURIs, activeURI, persistentTerminalIDs, directoryStr, chatMode: mode, mcpTools, includeXMLToolDefinitions }: { workspaceFolders: string[], directoryStr: string, openedURIs: string[], activeURI: string | undefined, persistentTerminalIDs: string[], chatMode: ChatMode, mcpTools: InternalToolInfo[] | undefined, includeXMLToolDefinitions: boolean }) => {
-	const header = (`You are an expert coding ${mode === 'agent' ? 'agent' : 'assistant'} whose job is \
-${mode === 'agent' ? `to help the user develop, run, and make changes to their codebase.`
-			: mode === 'gather' ? `to search, understand, and reference files in the user's codebase.`
-				: mode === 'normal' ? `to assist the user with their coding tasks.`
-					: ''}
-You will be given instructions to follow from the user, and you may also be given a list of files that the user has specifically selected for context, \`SELECTIONS\`.
-Please assist the user with their query.`)
-
+// Shared input type between the stable system message and the volatile context.
+// Kept together so callers can compute the workspace snapshot once and feed both.
+export type ChatPromptContext = {
+	workspaceFolders: string[]
+	directoryStr: string
+	openedURIs: string[]
+	activeURI: string | undefined
+	persistentTerminalIDs: string[]
+	chatMode: ChatMode
+	mcpTools: InternalToolInfo[] | undefined
+	includeXMLToolDefinitions: boolean
+}


+// Returns the volatile runtime-grounding block as a standalone string. Callers
+// should prepend this to the user message's stored content rather
+// than embed it in the system message — keeping it out of the system message lets
+// the stable prefix and the full conversation history be prefix-cached across turns.
+export const chat_volatileContext = ({ workspaceFolders, openedURIs, activeURI, persistentTerminalIDs, directoryStr, chatMode: mode }: Pick<ChatPromptContext, 'workspaceFolders' | 'directoryStr' | 'openedURIs' | 'activeURI' | 'persistentTerminalIDs' | 'chatMode'>) => {
 	const sysInfo = (`Here is the user's system information:
 <system_info>
 - ${os}
@ -459,6 +467,29 @@ ${directoryStr}
 </files_overview>`)


+	// XML tag is self-describing; no narration prefix. Keep field order stable
+	// so that on turns where volatile fields happen to match the previous turn,
+	// the cache can extend further into the prefix.
+	return (`<volatile_context>
+Today's date is ${new Date().toDateString()}.
+
+${sysInfo}
+
+${fsInfo}
+</volatile_context>`)
+}
+
+
+export const chat_systemMessage = ({ chatMode: mode, mcpTools, includeXMLToolDefinitions }: Pick<ChatPromptContext, 'chatMode' | 'mcpTools' | 'includeXMLToolDefinitions'>) => {
+	const header = (`You are an expert coding ${mode === 'agent' ? 'agent' : 'assistant'} whose job is \
+${mode === 'agent' ? `to help the user develop, run, and make changes to their codebase.`
+			: mode === 'gather' ? `to search, understand, and reference files in the user's codebase.`
+				: mode === 'normal' ? `to assist the user with their coding tasks.`
+					: ''}
+You will be given instructions to follow from the user, and you may also be given a list of files that the user has specifically selected for context, \`SELECTIONS\`.
+Please assist the user with their query.`)
+
+
 	const toolDefinitions = includeXMLToolDefinitions ? systemToolsXMLPrompt(mode, mcpTools) : null

 	const details: string[] = []
@ -506,19 +537,21 @@ Here's an example of a good code block:\n${chatSuggestionDiffExample}`)

 	details.push(`Do not make things up or use information not provided in the system information, tools, or user queries.`)
 	details.push(`Always use MARKDOWN to format lists, bullet points, etc. Do NOT write tables.`)
-	details.push(`Today's date is ${new Date().toDateString()}.`)

 	const importantDetails = (`Important notes:
 ${details.map((d, i) => `${i + 1}. ${d}`).join('\n\n')}`)

-
-	// return answer
+	// System message contains ONLY stable content (persona, rules, tool definitions)
+	// so the entire system prefix is eligible for cross-turn prefix caching. Anything
+	// that can change between turns (active file, open tabs, today's date, directory
+	// listing, terminal IDs) lives in `chat_volatileContext` and is baked into each
+	// user message's stored content at thread-creation time by chatThreadService.
+	// That keeps historical turns byte-identical across subsequent requests so the
+	// provider's prefix cache stays warm as the conversation grows.
 	const ansStrs: string[] = []
 	ansStrs.push(header)
-	ansStrs.push(sysInfo)
-	if (toolDefinitions) ansStrs.push(toolDefinitions)
 	ansStrs.push(importantDetails)
-	ansStrs.push(fsInfo)
+	if (toolDefinitions) ansStrs.push(toolDefinitions)

 	const fullSystemMsgStr = ansStrs
 		.join('\n\n\n')
--- a/src/vs/workbench/contrib/void/common/sendLLMMessageTypes.ts
+++ b/src/vs/workbench/contrib/void/common/sendLLMMessageTypes.ts
@ -84,6 +84,12 @@ export type RawToolParamsObj = {
 export type RawToolCallObj = {
 	name: ToolName;
 	rawParams: RawToolParamsObj;
+	// Original serialized `arguments` string as the model emitted it (OpenAI-compatible
+	// path only — Anthropic/Gemini deliver tool input as structured JSON with no raw
+	// source string). Preserved so that on replay we can send byte-identical content
+	// back to the provider, which keeps the prefix cache warm past the tool call.
+	// Absent/undefined when not available; callers should fall back to JSON.stringify(rawParams).
+	rawParamsStr?: string;
 	doneParams: ToolParamName<ToolName>[];
 	id: string;
 	isDone: boolean;
--- a/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts
+++ b/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts
@ -252,7 +252,10 @@ const rawToolCallObjOfParamsStr = (name: string, toolParamsStr: string, id: stri
 	if (typeof input !== 'object') return null

 	const rawParams: RawToolParamsObj = input
-	return { id, name, rawParams, doneParams: Object.keys(rawParams), isDone: true }
+	// Preserve the original argument string exactly as the model emitted it. On replay
+	// we'll send this back verbatim inside `tool_calls[].function.arguments` so the
+	// provider sees byte-identical content and the prefix cache stays warm.
+	return { id, name, rawParams, rawParamsStr: toolParamsStr, doneParams: Object.keys(rawParams), isDone: true }
 }


@ -365,11 +368,17 @@ const _sendOpenAICompatibleChat = async ({ messages, onText, onFinalMessage, onE
 				}


-				// reasoning
+				// reasoning — nameOfReasoningFieldInDelta may be a single field name or a list of
+				// candidates (some providers, e.g. OpenRouter, use `reasoning`; others, e.g. DeepSeek,
+				// use `reasoning_content`). Take the first non-empty one this chunk provides.
 				let newReasoning = ''
 				if (nameOfReasoningFieldInDelta) {
-					// @ts-ignore
-					newReasoning = (chunk.choices[0]?.delta?.[nameOfReasoningFieldInDelta] || '') + ''
+					const fields = Array.isArray(nameOfReasoningFieldInDelta) ? nameOfReasoningFieldInDelta : [nameOfReasoningFieldInDelta]
+					for (const f of fields) {
+						// @ts-ignore
+						const val = (chunk.choices[0]?.delta?.[f] || '') + ''
+						if (val) { newReasoning = val; break }
+					}
 					fullReasoningSoFar += newReasoning
 				}

@ -821,9 +830,19 @@ const sendGeminiChat = async ({

 			// Process the stream
 			for await (const chunk of stream) {
-				// message
-				const newText = chunk.text ?? ''
-				fullTextSoFar += newText
+				// message — split thought-tagged parts from answer parts.
+				// Gemini 2.5 Pro / Gemma 4 route internal reasoning through parts with
+				// `thought: true`; the visible answer lives in plain text parts. Using
+				// `chunk.text` (SDK shortcut) would concatenate both, polluting the
+				// chat view and the stored message history.
+				const parts = chunk.candidates?.[0]?.content?.parts
+				if (parts) {
+					for (const part of parts) {
+						if (typeof part.text !== 'string') continue // skip functionCall / inlineData / etc.
+						if (part.thought === true) fullReasoningSoFar += part.text
+						else fullTextSoFar += part.text
+					}
+				}

 				// tool call
 				const functionCalls = chunk.functionCalls
@ -834,17 +853,21 @@ const sendGeminiChat = async ({
 					toolId = functionCall.id ?? ''
 				}

-				// (do not handle reasoning yet)
-
 				// usage (Gemini exposes promptTokenCount / candidatesTokenCount / totalTokenCount /
-				// thoughtsTokenCount via usageMetadata). Only update when the chunk reports it.
+				// thoughtsTokenCount / cachedContentTokenCount via usageMetadata). Multiple
+				// chunks can carry usageMetadata during a stream, and the field set is NOT
+				// consistent across chunks — notably, cachedContentTokenCount often appears
+				// on an early chunk and is absent from the final summary. Merge per-field
+				// with `??` so we preserve the best value seen so far instead of flickering
+				// to `undefined` when Google stops reporting a field.
 				const usageMetadata = chunk.usageMetadata
 				if (usageMetadata) {
 					latestUsage = {
-						inputTokens: usageMetadata.promptTokenCount,
-						outputTokens: usageMetadata.candidatesTokenCount,
-						totalTokens: usageMetadata.totalTokenCount,
-						reasoningTokens: usageMetadata.thoughtsTokenCount,
+						inputTokens: usageMetadata.promptTokenCount ?? latestUsage?.inputTokens,
+						outputTokens: usageMetadata.candidatesTokenCount ?? latestUsage?.outputTokens,
+						totalTokens: usageMetadata.totalTokenCount ?? latestUsage?.totalTokens,
+						reasoningTokens: usageMetadata.thoughtsTokenCount ?? latestUsage?.reasoningTokens,
+						cachedInputTokens: usageMetadata.cachedContentTokenCount ?? latestUsage?.cachedInputTokens,
 					}
 				}