diff --git a/src/vs/workbench/contrib/void/browser/toolsService.ts b/src/vs/workbench/contrib/void/browser/toolsService.ts index 439bed62..1936be08 100644 --- a/src/vs/workbench/contrib/void/browser/toolsService.ts +++ b/src/vs/workbench/contrib/void/browser/toolsService.ts @@ -437,7 +437,6 @@ export class ToolsService implements IToolsService { return `Change successfully made to ${params.uri.fsPath}. ${additionalStr}` }, run_terminal_command: (params, result) => { - const { terminalId, didCreateTerminal, diff --git a/src/vs/workbench/contrib/void/common/modelCapabilities.ts b/src/vs/workbench/contrib/void/common/modelCapabilities.ts index 5417649c..ad76ebec 100644 --- a/src/vs/workbench/contrib/void/common/modelCapabilities.ts +++ b/src/vs/workbench/contrib/void/common/modelCapabilities.ts @@ -67,8 +67,8 @@ export const defaultModelsOfProvider = { type ModelOptions = { - contextWindow: number; // input tokens // <-- UNUSED - maxOutputTokens: number | null; // output tokens // <-- UNUSED + contextWindow: number; // input tokens + maxOutputTokens: number | null; // output tokens, defaults to 4096 cost: { // <-- UNUSED input: number; output: number; @@ -113,9 +113,9 @@ type ProviderSettings = { const modelOptionsDefaults: ModelOptions = { - contextWindow: 32_000, // unused - maxOutputTokens: null, // unused - cost: { input: 0, output: 0 }, // unused + contextWindow: 32_000, + maxOutputTokens: 4_096, + cost: { input: 0, output: 0 }, supportsSystemMessage: false, supportsTools: false, supportsFIM: false, @@ -493,7 +493,7 @@ const xAISettings: ProviderSettings = { const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing 'gemini-2.5-pro-exp-03-25': { contextWindow: 1_048_576, - maxOutputTokens: null, // 8_192, + maxOutputTokens: 8_192, cost: { input: 0, output: 0 }, supportsFIM: false, supportsSystemMessage: 'system-role', @@ -502,7 +502,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing }, 'gemini-2.0-flash': { 
contextWindow: 1_048_576, - maxOutputTokens: null, // 8_192, + maxOutputTokens: 8_192, // 8_192, cost: { input: 0.10, output: 0.40 }, supportsFIM: false, supportsSystemMessage: 'system-role', @@ -511,7 +511,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing }, 'gemini-2.0-flash-lite-preview-02-05': { contextWindow: 1_048_576, - maxOutputTokens: null, // 8_192, + maxOutputTokens: 8_192, // 8_192, cost: { input: 0.075, output: 0.30 }, supportsFIM: false, supportsSystemMessage: 'system-role', @@ -520,7 +520,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing }, 'gemini-1.5-flash': { contextWindow: 1_048_576, - maxOutputTokens: null, // 8_192, + maxOutputTokens: 8_192, // 8_192, cost: { input: 0.075, output: 0.30 }, // TODO!!! price doubles after 128K tokens, we are NOT encoding that info right now supportsFIM: false, supportsSystemMessage: 'system-role', @@ -529,7 +529,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing }, 'gemini-1.5-pro': { contextWindow: 2_097_152, - maxOutputTokens: null, // 8_192, + maxOutputTokens: 8_192, cost: { input: 1.25, output: 5.00 }, // TODO!!! price doubles after 128K tokens, we are NOT encoding that info right now supportsFIM: false, supportsSystemMessage: 'system-role', @@ -538,7 +538,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing }, 'gemini-1.5-flash-8b': { contextWindow: 1_048_576, - maxOutputTokens: null, // 8_192, + maxOutputTokens: 8_192, cost: { input: 0.0375, output: 0.15 }, // TODO!!! 
price doubles after 128K tokens, we are NOT encoding that info right now supportsFIM: false, supportsSystemMessage: 'system-role', @@ -559,13 +559,13 @@ const deepseekModelOptions = { 'deepseek-chat': { ...openSourceModelOptions_assumingOAICompat.deepseekR1, contextWindow: 64_000, // https://api-docs.deepseek.com/quick_start/pricing - maxOutputTokens: null, // 8_000, + maxOutputTokens: 8_000, // 8_000, cost: { cache_read: .07, input: .27, output: 1.10, }, }, 'deepseek-reasoner': { ...openSourceModelOptions_assumingOAICompat.deepseekCoderV2, contextWindow: 64_000, - maxOutputTokens: null, // 8_000, + maxOutputTokens: 8_000, // 8_000, cost: { cache_read: .14, input: .55, output: 2.19, }, }, } as const satisfies { [s: string]: ModelOptions } @@ -584,7 +584,7 @@ const deepseekSettings: ProviderSettings = { const groqModelOptions = { // https://console.groq.com/docs/models, https://groq.com/pricing/ 'llama-3.3-70b-versatile': { contextWindow: 128_000, - maxOutputTokens: null, // 32_768, + maxOutputTokens: 32_768, // 32_768, cost: { input: 0.59, output: 0.79 }, supportsFIM: false, supportsSystemMessage: 'system-role', @@ -593,7 +593,7 @@ const groqModelOptions = { // https://console.groq.com/docs/models, https://groq }, 'llama-3.1-8b-instant': { contextWindow: 128_000, - maxOutputTokens: null, // 8_192, + maxOutputTokens: 8_192, cost: { input: 0.05, output: 0.08 }, supportsFIM: false, supportsSystemMessage: 'system-role', diff --git a/src/vs/workbench/contrib/void/electron-main/llmMessage/preprocessLLMMessages.ts b/src/vs/workbench/contrib/void/electron-main/llmMessage/preprocessLLMMessages.ts index ab9991c1..21995562 100644 --- a/src/vs/workbench/contrib/void/electron-main/llmMessage/preprocessLLMMessages.ts +++ b/src/vs/workbench/contrib/void/electron-main/llmMessage/preprocessLLMMessages.ts @@ -40,17 +40,10 @@ const prepareMessages_normalize = ({ messages: messages_ }: { messages: LLMChatM const newMessages: LLMChatMessage[] = [] if (messages.length >= 0) 
newMessages.push(messages[0]) - // remove duplicate roles + // remove duplicate roles - we used to do this, but we don't anymore for (let i = 1; i < messages.length; i += 1) { - const curr = messages[i] - // const prev = messages[i - 1] - // // if found a repeated role, put the current content in the prev - // if ((curr.role === 'assistant' && prev.role === 'assistant')) { - // prev.content += '\n' + curr.content - // continue - // } - // add the message - newMessages.push(curr) + const m = messages[i] + newMessages.push(m) } const finalMessages = newMessages.map(m => ({ ...m, content: m.content.trim() })) return { messages: finalMessages } @@ -61,6 +54,94 @@ const prepareMessages_normalize = ({ messages: messages_ }: { messages: LLMChatM +const CHARS_PER_TOKEN = 4 +const TRIM_TO_LEN = 60 + +const prepareMessages_fitIntoContext = ({ messages, contextWindow, maxOutputTokens }: { messages: LLMChatMessage[], contextWindow: number, maxOutputTokens: number }): { messages: LLMChatMessage[] } => { + + // the higher the weight, the higher the desire to truncate + const alreadyTrimmedIdxes = new Set() + const weight = (message: LLMChatMessage, messages: LLMChatMessage[], idx: number) => { + const base = message.content.length + + let multiplier: number + if (message.role === 'system') + return 0 // never erase system message + + if (message.role === 'user') { + multiplier = 4 + } + else { + multiplier = 8 + } + + // last 3 msgs are very important + if (idx >= messages.length - 1 - 3 || alreadyTrimmedIdxes.has(idx)) { + multiplier *= .05 + } + + return base * multiplier + + } + const _findLargestByWeight = (messages: LLMChatMessage[]) => { + let largestIndex = -1 + let largestWeight = -Infinity + for (let i = 0; i < messages.length; i += 1) { + const m = messages[i] + const w = weight(m, messages, i) + if (w > largestWeight) { + largestWeight = w + largestIndex = i + } + } + return largestIndex + } + + let totalLen = 0 + for (const m of messages) { totalLen += 
m.content.length } + const charsNeedToTrim = totalLen - (contextWindow - maxOutputTokens) * CHARS_PER_TOKEN + if (charsNeedToTrim <= 0) return { messages } + + // <-----------------------------------------> + // 0 | | | + // | contextWindow | + // contextWindow - maxOut|putTokens + // | + // totalLen + + + // TRIM HIGHEST WEIGHT MESSAGES + let remainingCharsToTrim = charsNeedToTrim + let i = 0 + + while (remainingCharsToTrim > 0) { + i += 1 + if (i > 100) break + + const trimIdx = _findLargestByWeight(messages) + const m = messages[trimIdx] + + // if can finish here, do + const numCharsWillTrim = m.content.length - TRIM_TO_LEN + if (numCharsWillTrim > remainingCharsToTrim) { + m.content = m.content.slice(0, m.content.length - remainingCharsToTrim) + break + } + + remainingCharsToTrim -= numCharsWillTrim + m.content = m.content.substring(0, TRIM_TO_LEN - 3) + '...' + alreadyTrimmedIdxes.add(trimIdx) + } + + return { messages } + +} + + + + + + // no matter whether the model supports a system message or not (or what format it supports), add it in some way const prepareMessages_systemMessage = ({ @@ -378,14 +459,21 @@ export const prepareMessages = ({ supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature, + contextWindow, + maxOutputTokens, }: { messages: LLMChatMessage[], aiInstructions: string, supportsSystemMessage: false | 'system-role' | 'developer-role' | 'separated', supportsTools: false | 'anthropic-style' | 'openai-style', supportsAnthropicReasoningSignature: boolean, + contextWindow: number, + maxOutputTokens: number | null | undefined, }) => { - const { messages: messages1 } = prepareMessages_normalize({ messages }) + maxOutputTokens = maxOutputTokens ?? 
4_096 // default to 4096 + + const { messages: messages0 } = prepareMessages_normalize({ messages }) + const { messages: messages1 } = prepareMessages_fitIntoContext({ messages: messages0, contextWindow, maxOutputTokens }) const { messages: messages2 } = prepareMessages_anthropicContent({ messages: messages1, supportsAnthropicReasoningSignature }) const { messages: messages3, separateSystemMessageStr } = prepareMessages_systemMessage({ messages: messages2, aiInstructions, supportsSystemMessage }) const { messages: messages4 } = prepareMessages_tools({ messages: messages3, supportsTools }) diff --git a/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts b/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts index a4bce5b9..05610f6a 100644 --- a/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts +++ b/src/vs/workbench/contrib/void/electron-main/llmMessage/sendLLMMessage.impl.ts @@ -157,7 +157,8 @@ const _sendOpenAICompatibleChat = ({ messages: messages_, onText, onFinalMessage modelName, supportsSystemMessage, supportsTools, - // maxOutputTokens, + contextWindow, + maxOutputTokens, reasoningCapabilities, } = getModelCapabilities(providerName, modelName_) @@ -173,10 +174,10 @@ const _sendOpenAICompatibleChat = ({ messages: messages_, onText, onFinalMessage const toolsObj = tools ? { tools: tools, tool_choice: 'auto', parallel_tool_calls: false, } as const : {} // max tokens - // const maxTokens = reasoningInfo?.isReasoningEnabled && reasoningCapabilities ? reasoningCapabilities.reasoningMaxOutputTokens : maxOutputTokens + const maxTokens = reasoningInfo?.isReasoningEnabled && reasoningCapabilities ? 
reasoningCapabilities.reasoningMaxOutputTokens : maxOutputTokens // instance - const { messages } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: false }) + const { messages } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: false, contextWindow, maxOutputTokens: maxTokens }) const openai: OpenAI = newOpenAICompatibleSDK({ providerName, settingsOfProvider, includeInPayload }) const options: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = { model: modelName, @@ -316,6 +317,7 @@ const sendAnthropicChat = ({ messages: messages_, providerName, onText, onFinalM const { modelName, supportsSystemMessage, + contextWindow, supportsTools, maxOutputTokens, reasoningCapabilities, @@ -339,7 +341,7 @@ const sendAnthropicChat = ({ messages: messages_, providerName, onText, onFinalM const maxTokens = reasoningInfo?.isReasoningEnabled && reasoningCapabilities ? reasoningCapabilities.reasoningMaxOutputTokens : maxOutputTokens // instance - const { messages, separateSystemMessageStr } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: true }) + const { messages, separateSystemMessageStr } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: true, contextWindow, maxOutputTokens: maxTokens }) const anthropic = new Anthropic({ apiKey: thisConfig.apiKey, dangerouslyAllowBrowser: true