mirror of
https://github.com/voideditor/void
synced 2026-05-24 09:58:23 +00:00
context trimming
This commit is contained in:
parent
7c4e92a030
commit
7aa5f1e4ec
4 changed files with 120 additions and 31 deletions
|
|
@ -437,7 +437,6 @@ export class ToolsService implements IToolsService {
|
|||
return `Change successfully made to ${params.uri.fsPath}. ${additionalStr}`
|
||||
},
|
||||
run_terminal_command: (params, result) => {
|
||||
|
||||
const {
|
||||
terminalId,
|
||||
didCreateTerminal,
|
||||
|
|
|
|||
|
|
@ -67,8 +67,8 @@ export const defaultModelsOfProvider = {
|
|||
|
||||
|
||||
type ModelOptions = {
|
||||
contextWindow: number; // input tokens // <-- UNUSED
|
||||
maxOutputTokens: number | null; // output tokens // <-- UNUSED
|
||||
contextWindow: number; // input tokens
|
||||
maxOutputTokens: number | null; // output tokens, defaults to 4096 when null
|
||||
cost: { // <-- UNUSED
|
||||
input: number;
|
||||
output: number;
|
||||
|
|
@ -113,9 +113,9 @@ type ProviderSettings = {
|
|||
|
||||
|
||||
const modelOptionsDefaults: ModelOptions = {
|
||||
contextWindow: 32_000, // unused
|
||||
maxOutputTokens: null, // unused
|
||||
cost: { input: 0, output: 0 }, // unused
|
||||
contextWindow: 32_000,
|
||||
maxOutputTokens: 4_096,
|
||||
cost: { input: 0, output: 0 },
|
||||
supportsSystemMessage: false,
|
||||
supportsTools: false,
|
||||
supportsFIM: false,
|
||||
|
|
@ -493,7 +493,7 @@ const xAISettings: ProviderSettings = {
|
|||
const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
|
||||
'gemini-2.5-pro-exp-03-25': {
|
||||
contextWindow: 1_048_576,
|
||||
maxOutputTokens: null, // 8_192,
|
||||
maxOutputTokens: 8_192,
|
||||
cost: { input: 0, output: 0 },
|
||||
supportsFIM: false,
|
||||
supportsSystemMessage: 'system-role',
|
||||
|
|
@ -502,7 +502,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
|
|||
},
|
||||
'gemini-2.0-flash': {
|
||||
contextWindow: 1_048_576,
|
||||
maxOutputTokens: null, // 8_192,
|
||||
maxOutputTokens: 8_192, // 8_192,
|
||||
cost: { input: 0.10, output: 0.40 },
|
||||
supportsFIM: false,
|
||||
supportsSystemMessage: 'system-role',
|
||||
|
|
@ -511,7 +511,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
|
|||
},
|
||||
'gemini-2.0-flash-lite-preview-02-05': {
|
||||
contextWindow: 1_048_576,
|
||||
maxOutputTokens: null, // 8_192,
|
||||
maxOutputTokens: 8_192, // 8_192,
|
||||
cost: { input: 0.075, output: 0.30 },
|
||||
supportsFIM: false,
|
||||
supportsSystemMessage: 'system-role',
|
||||
|
|
@ -520,7 +520,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
|
|||
},
|
||||
'gemini-1.5-flash': {
|
||||
contextWindow: 1_048_576,
|
||||
maxOutputTokens: null, // 8_192,
|
||||
maxOutputTokens: 8_192, // 8_192,
|
||||
cost: { input: 0.075, output: 0.30 }, // TODO!!! price doubles after 128K tokens, we are NOT encoding that info right now
|
||||
supportsFIM: false,
|
||||
supportsSystemMessage: 'system-role',
|
||||
|
|
@ -529,7 +529,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
|
|||
},
|
||||
'gemini-1.5-pro': {
|
||||
contextWindow: 2_097_152,
|
||||
maxOutputTokens: null, // 8_192,
|
||||
maxOutputTokens: 8_192,
|
||||
cost: { input: 1.25, output: 5.00 }, // TODO!!! price doubles after 128K tokens, we are NOT encoding that info right now
|
||||
supportsFIM: false,
|
||||
supportsSystemMessage: 'system-role',
|
||||
|
|
@ -538,7 +538,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
|
|||
},
|
||||
'gemini-1.5-flash-8b': {
|
||||
contextWindow: 1_048_576,
|
||||
maxOutputTokens: null, // 8_192,
|
||||
maxOutputTokens: 8_192,
|
||||
cost: { input: 0.0375, output: 0.15 }, // TODO!!! price doubles after 128K tokens, we are NOT encoding that info right now
|
||||
supportsFIM: false,
|
||||
supportsSystemMessage: 'system-role',
|
||||
|
|
@ -559,13 +559,13 @@ const deepseekModelOptions = {
|
|||
'deepseek-chat': {
|
||||
...openSourceModelOptions_assumingOAICompat.deepseekR1,
|
||||
contextWindow: 64_000, // https://api-docs.deepseek.com/quick_start/pricing
|
||||
maxOutputTokens: null, // 8_000,
|
||||
maxOutputTokens: 8_000, // 8_000,
|
||||
cost: { cache_read: .07, input: .27, output: 1.10, },
|
||||
},
|
||||
'deepseek-reasoner': {
|
||||
...openSourceModelOptions_assumingOAICompat.deepseekCoderV2,
|
||||
contextWindow: 64_000,
|
||||
maxOutputTokens: null, // 8_000,
|
||||
maxOutputTokens: 8_000, // 8_000,
|
||||
cost: { cache_read: .14, input: .55, output: 2.19, },
|
||||
},
|
||||
} as const satisfies { [s: string]: ModelOptions }
|
||||
|
|
@ -584,7 +584,7 @@ const deepseekSettings: ProviderSettings = {
|
|||
const groqModelOptions = { // https://console.groq.com/docs/models, https://groq.com/pricing/
|
||||
'llama-3.3-70b-versatile': {
|
||||
contextWindow: 128_000,
|
||||
maxOutputTokens: null, // 32_768,
|
||||
maxOutputTokens: 32_768, // 32_768,
|
||||
cost: { input: 0.59, output: 0.79 },
|
||||
supportsFIM: false,
|
||||
supportsSystemMessage: 'system-role',
|
||||
|
|
@ -593,7 +593,7 @@ const groqModelOptions = { // https://console.groq.com/docs/models, https://groq
|
|||
},
|
||||
'llama-3.1-8b-instant': {
|
||||
contextWindow: 128_000,
|
||||
maxOutputTokens: null, // 8_192,
|
||||
maxOutputTokens: 8_192,
|
||||
cost: { input: 0.05, output: 0.08 },
|
||||
supportsFIM: false,
|
||||
supportsSystemMessage: 'system-role',
|
||||
|
|
|
|||
|
|
@ -40,17 +40,10 @@ const prepareMessages_normalize = ({ messages: messages_ }: { messages: LLMChatM
|
|||
const newMessages: LLMChatMessage[] = []
|
||||
if (messages.length >= 0) newMessages.push(messages[0])
|
||||
|
||||
// remove duplicate roles
|
||||
// remove duplicate roles - we used to do this, but we don't anymore
|
||||
for (let i = 1; i < messages.length; i += 1) {
|
||||
const curr = messages[i]
|
||||
// const prev = messages[i - 1]
|
||||
// // if found a repeated role, put the current content in the prev
|
||||
// if ((curr.role === 'assistant' && prev.role === 'assistant')) {
|
||||
// prev.content += '\n' + curr.content
|
||||
// continue
|
||||
// }
|
||||
// add the message
|
||||
newMessages.push(curr)
|
||||
const m = messages[i]
|
||||
newMessages.push(m)
|
||||
}
|
||||
const finalMessages = newMessages.map(m => ({ ...m, content: m.content.trim() }))
|
||||
return { messages: finalMessages }
|
||||
|
|
@ -61,6 +54,94 @@ const prepareMessages_normalize = ({ messages: messages_ }: { messages: LLMChatM
|
|||
|
||||
|
||||
|
||||
// Heuristic ratio used to turn a token budget into a character budget.
const CHARS_PER_TOKEN = 4
// Length (in characters, including the trailing '...') that a fully trimmed message is cut down to.
const TRIM_TO_LEN = 60

/**
 * Shortens message contents so the conversation roughly fits into the model's input budget.
 *
 * The budget is `(contextWindow - maxOutputTokens) * CHARS_PER_TOKEN` characters, compared
 * against the summed `content.length` of all messages. While the conversation is over budget,
 * the message with the highest trim weight is shortened:
 *   - if trimming it down to `TRIM_TO_LEN` would remove more than is still needed, only the
 *     needed number of characters is cut from its end and trimming stops;
 *   - otherwise it is cut to `TRIM_TO_LEN` characters (ending in '...') and the next
 *     highest-weight message is considered.
 *
 * System messages are never trimmed, and messages that are already at or below `TRIM_TO_LEN`
 * characters are left alone (there is nothing to gain from them). If no trimmable message is
 * left, the function returns the best effort it reached, which may still exceed the budget.
 *
 * @param messages - conversation to fit; the array and its message objects are not modified.
 * @param contextWindow - model context window, in tokens.
 * @param maxOutputTokens - tokens reserved for the model's output.
 * @returns `{ messages }` - the original array when nothing had to be trimmed, otherwise a new
 *          array of shallow-copied messages with shortened `content`.
 */
const prepareMessages_fitIntoContext = ({ messages, contextWindow, maxOutputTokens }: { messages: LLMChatMessage[], contextWindow: number, maxOutputTokens: number }): { messages: LLMChatMessage[] } => {

	let totalLen = 0
	for (const m of messages) { totalLen += m.content.length }

	// number of characters by which the conversation overshoots the input budget
	const charsNeedToTrim = totalLen - (contextWindow - maxOutputTokens) * CHARS_PER_TOKEN
	if (charsNeedToTrim <= 0) return { messages }

	// shallow copies so the caller's message objects are never mutated
	const result: LLMChatMessage[] = messages.map(m => ({ ...m }))

	// the higher the weight, the higher the desire to truncate
	const weight = (message: LLMChatMessage, idx: number): number => {
		// assistant/tool output is trimmed more eagerly than user input
		let multiplier = message.role === 'user' ? 4 : 8

		// the most recent messages are very important; note this condition covers the
		// final four indices (length-4 .. length-1)
		if (idx >= result.length - 1 - 3) {
			multiplier *= .05
		}

		return message.content.length * multiplier
	}

	// index of the trimmable message with the largest weight, or -1 if none can be trimmed
	const findLargestByWeight = (): number => {
		let largestIndex = -1
		let largestWeight = -Infinity
		for (let i = 0; i < result.length; i += 1) {
			const m = result[i]
			if (m.role === 'system') continue // never erase system message
			if (m.content.length <= TRIM_TO_LEN) continue // nothing left to gain from this message
			const w = weight(m, i)
			if (w > largestWeight) {
				largestWeight = w
				largestIndex = i
			}
		}
		return largestIndex
	}

	// TRIM HIGHEST WEIGHT MESSAGES
	// Terminates without an iteration cap: every pass either finishes or brings one message
	// down to TRIM_TO_LEN, which removes it from the candidate set.
	let remainingCharsToTrim = charsNeedToTrim
	while (remainingCharsToTrim > 0) {
		const trimIdx = findLargestByWeight()
		if (trimIdx === -1) break // only system / already-minimal messages remain

		const m = result[trimIdx]
		const numCharsWillTrim = m.content.length - TRIM_TO_LEN

		// if can finish here, cut only what is still needed and stop
		if (numCharsWillTrim > remainingCharsToTrim) {
			m.content = m.content.slice(0, m.content.length - remainingCharsToTrim)
			break
		}

		remainingCharsToTrim -= numCharsWillTrim
		m.content = m.content.substring(0, TRIM_TO_LEN - 3) + '...'
	}

	return { messages: result }
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// no matter whether the model supports a system message or not (or what format it supports), add it in some way
|
||||
const prepareMessages_systemMessage = ({
|
||||
|
|
@ -378,14 +459,21 @@ export const prepareMessages = ({
|
|||
supportsSystemMessage,
|
||||
supportsTools,
|
||||
supportsAnthropicReasoningSignature,
|
||||
contextWindow,
|
||||
maxOutputTokens,
|
||||
}: {
|
||||
messages: LLMChatMessage[],
|
||||
aiInstructions: string,
|
||||
supportsSystemMessage: false | 'system-role' | 'developer-role' | 'separated',
|
||||
supportsTools: false | 'anthropic-style' | 'openai-style',
|
||||
supportsAnthropicReasoningSignature: boolean,
|
||||
contextWindow: number,
|
||||
maxOutputTokens: number | null | undefined,
|
||||
}) => {
|
||||
const { messages: messages1 } = prepareMessages_normalize({ messages })
|
||||
maxOutputTokens = maxOutputTokens ?? 4_096 // default to 4096
|
||||
|
||||
const { messages: messages0 } = prepareMessages_normalize({ messages })
|
||||
const { messages: messages1 } = prepareMessages_fitIntoContext({ messages: messages0, contextWindow, maxOutputTokens })
|
||||
const { messages: messages2 } = prepareMessages_anthropicContent({ messages: messages1, supportsAnthropicReasoningSignature })
|
||||
const { messages: messages3, separateSystemMessageStr } = prepareMessages_systemMessage({ messages: messages2, aiInstructions, supportsSystemMessage })
|
||||
const { messages: messages4 } = prepareMessages_tools({ messages: messages3, supportsTools })
|
||||
|
|
|
|||
|
|
@ -157,7 +157,8 @@ const _sendOpenAICompatibleChat = ({ messages: messages_, onText, onFinalMessage
|
|||
modelName,
|
||||
supportsSystemMessage,
|
||||
supportsTools,
|
||||
// maxOutputTokens,
|
||||
contextWindow,
|
||||
maxOutputTokens,
|
||||
reasoningCapabilities,
|
||||
} = getModelCapabilities(providerName, modelName_)
|
||||
|
||||
|
|
@ -173,10 +174,10 @@ const _sendOpenAICompatibleChat = ({ messages: messages_, onText, onFinalMessage
|
|||
const toolsObj = tools ? { tools: tools, tool_choice: 'auto', parallel_tool_calls: false, } as const : {}
|
||||
|
||||
// max tokens
|
||||
// const maxTokens = reasoningInfo?.isReasoningEnabled && reasoningCapabilities ? reasoningCapabilities.reasoningMaxOutputTokens : maxOutputTokens
|
||||
const maxTokens = reasoningInfo?.isReasoningEnabled && reasoningCapabilities ? reasoningCapabilities.reasoningMaxOutputTokens : maxOutputTokens
|
||||
|
||||
// instance
|
||||
const { messages } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: false })
|
||||
const { messages } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: false, contextWindow, maxOutputTokens: maxTokens })
|
||||
const openai: OpenAI = newOpenAICompatibleSDK({ providerName, settingsOfProvider, includeInPayload })
|
||||
const options: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
|
||||
model: modelName,
|
||||
|
|
@ -316,6 +317,7 @@ const sendAnthropicChat = ({ messages: messages_, providerName, onText, onFinalM
|
|||
const {
|
||||
modelName,
|
||||
supportsSystemMessage,
|
||||
contextWindow,
|
||||
supportsTools,
|
||||
maxOutputTokens,
|
||||
reasoningCapabilities,
|
||||
|
|
@ -339,7 +341,7 @@ const sendAnthropicChat = ({ messages: messages_, providerName, onText, onFinalM
|
|||
const maxTokens = reasoningInfo?.isReasoningEnabled && reasoningCapabilities ? reasoningCapabilities.reasoningMaxOutputTokens : maxOutputTokens
|
||||
|
||||
// instance
|
||||
const { messages, separateSystemMessageStr } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: true })
|
||||
const { messages, separateSystemMessageStr } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: true, contextWindow, maxOutputTokens: maxTokens })
|
||||
const anthropic = new Anthropic({
|
||||
apiKey: thisConfig.apiKey,
|
||||
dangerouslyAllowBrowser: true
|
||||
|
|
|
|||
Loading…
Reference in a new issue