context trimming

This commit is contained in:
Andrew Pareles 2025-04-07 01:46:07 -07:00
parent 7c4e92a030
commit 7aa5f1e4ec
4 changed files with 120 additions and 31 deletions

View file

@ -437,7 +437,6 @@ export class ToolsService implements IToolsService {
return `Change successfully made to ${params.uri.fsPath}. ${additionalStr}`
},
run_terminal_command: (params, result) => {
const {
terminalId,
didCreateTerminal,

View file

@ -67,8 +67,8 @@ export const defaultModelsOfProvider = {
type ModelOptions = {
contextWindow: number; // input tokens // <-- UNUSED
maxOutputTokens: number | null; // output tokens // <-- UNUSED
contextWindow: number; // input tokens
maxOutputTokens: number | null; // output tokens, defaults to 4096
cost: { // <-- UNUSED
input: number;
output: number;
@ -113,9 +113,9 @@ type ProviderSettings = {
const modelOptionsDefaults: ModelOptions = {
contextWindow: 32_000, // unused
maxOutputTokens: null, // unused
cost: { input: 0, output: 0 }, // unused
contextWindow: 32_000,
maxOutputTokens: 4_096,
cost: { input: 0, output: 0 },
supportsSystemMessage: false,
supportsTools: false,
supportsFIM: false,
@ -493,7 +493,7 @@ const xAISettings: ProviderSettings = {
const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
'gemini-2.5-pro-exp-03-25': {
contextWindow: 1_048_576,
maxOutputTokens: null, // 8_192,
maxOutputTokens: 8_192,
cost: { input: 0, output: 0 },
supportsFIM: false,
supportsSystemMessage: 'system-role',
@ -502,7 +502,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
},
'gemini-2.0-flash': {
contextWindow: 1_048_576,
maxOutputTokens: null, // 8_192,
maxOutputTokens: 8_192, // 8_192,
cost: { input: 0.10, output: 0.40 },
supportsFIM: false,
supportsSystemMessage: 'system-role',
@ -511,7 +511,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
},
'gemini-2.0-flash-lite-preview-02-05': {
contextWindow: 1_048_576,
maxOutputTokens: null, // 8_192,
maxOutputTokens: 8_192, // 8_192,
cost: { input: 0.075, output: 0.30 },
supportsFIM: false,
supportsSystemMessage: 'system-role',
@ -520,7 +520,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
},
'gemini-1.5-flash': {
contextWindow: 1_048_576,
maxOutputTokens: null, // 8_192,
maxOutputTokens: 8_192, // 8_192,
cost: { input: 0.075, output: 0.30 }, // TODO!!! price doubles after 128K tokens, we are NOT encoding that info right now
supportsFIM: false,
supportsSystemMessage: 'system-role',
@ -529,7 +529,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
},
'gemini-1.5-pro': {
contextWindow: 2_097_152,
maxOutputTokens: null, // 8_192,
maxOutputTokens: 8_192,
cost: { input: 1.25, output: 5.00 }, // TODO!!! price doubles after 128K tokens, we are NOT encoding that info right now
supportsFIM: false,
supportsSystemMessage: 'system-role',
@ -538,7 +538,7 @@ const geminiModelOptions = { // https://ai.google.dev/gemini-api/docs/pricing
},
'gemini-1.5-flash-8b': {
contextWindow: 1_048_576,
maxOutputTokens: null, // 8_192,
maxOutputTokens: 8_192,
cost: { input: 0.0375, output: 0.15 }, // TODO!!! price doubles after 128K tokens, we are NOT encoding that info right now
supportsFIM: false,
supportsSystemMessage: 'system-role',
@ -559,13 +559,13 @@ const deepseekModelOptions = {
'deepseek-chat': {
...openSourceModelOptions_assumingOAICompat.deepseekR1,
contextWindow: 64_000, // https://api-docs.deepseek.com/quick_start/pricing
maxOutputTokens: null, // 8_000,
maxOutputTokens: 8_000, // 8_000,
cost: { cache_read: .07, input: .27, output: 1.10, },
},
'deepseek-reasoner': {
...openSourceModelOptions_assumingOAICompat.deepseekCoderV2,
contextWindow: 64_000,
maxOutputTokens: null, // 8_000,
maxOutputTokens: 8_000, // 8_000,
cost: { cache_read: .14, input: .55, output: 2.19, },
},
} as const satisfies { [s: string]: ModelOptions }
@ -584,7 +584,7 @@ const deepseekSettings: ProviderSettings = {
const groqModelOptions = { // https://console.groq.com/docs/models, https://groq.com/pricing/
'llama-3.3-70b-versatile': {
contextWindow: 128_000,
maxOutputTokens: null, // 32_768,
maxOutputTokens: 32_768, // 32_768,
cost: { input: 0.59, output: 0.79 },
supportsFIM: false,
supportsSystemMessage: 'system-role',
@ -593,7 +593,7 @@ const groqModelOptions = { // https://console.groq.com/docs/models, https://groq
},
'llama-3.1-8b-instant': {
contextWindow: 128_000,
maxOutputTokens: null, // 8_192,
maxOutputTokens: 8_192,
cost: { input: 0.05, output: 0.08 },
supportsFIM: false,
supportsSystemMessage: 'system-role',

View file

@ -40,17 +40,10 @@ const prepareMessages_normalize = ({ messages: messages_ }: { messages: LLMChatM
const newMessages: LLMChatMessage[] = []
if (messages.length >= 0) newMessages.push(messages[0])
// remove duplicate roles
// remove duplicate roles - we used to do this, but we don't anymore
for (let i = 1; i < messages.length; i += 1) {
const curr = messages[i]
// const prev = messages[i - 1]
// // if found a repeated role, put the current content in the prev
// if ((curr.role === 'assistant' && prev.role === 'assistant')) {
// prev.content += '\n' + curr.content
// continue
// }
// add the message
newMessages.push(curr)
const m = messages[i]
newMessages.push(m)
}
const finalMessages = newMessages.map(m => ({ ...m, content: m.content.trim() }))
return { messages: finalMessages }
@ -61,6 +54,94 @@ const prepareMessages_normalize = ({ messages: messages_ }: { messages: LLMChatM
// Rough characters-per-token ratio used to convert the token budget into a character budget.
const CHARS_PER_TOKEN = 4
// Length (in characters, including the trailing '...') that a fully-trimmed message is cut down to.
const TRIM_TO_LEN = 60

/**
 * Shortens message contents so the conversation fits into the model's input budget.
 *
 * The budget is `(contextWindow - maxOutputTokens) * CHARS_PER_TOKEN` characters:
 *
 *   0 ............ contextWindow - maxOutputTokens ............ contextWindow
 *   <------------- room for input ---------------><-- reserved for output -->
 *
 * While the total content length exceeds the budget, the non-system message with the
 * highest "weight" is trimmed: either all the way down to `TRIM_TO_LEN` characters
 * (ending in '...'), or by just enough characters to land exactly on the budget.
 *
 * @param messages        The conversation. Never mutated; if trimming is required, shallow copies are returned.
 * @param contextWindow   Total context size of the model, in tokens.
 * @param maxOutputTokens Tokens reserved for the model's reply.
 * @returns The original array when everything already fits, otherwise a new array of (possibly shortened) copies.
 *          If nothing more can be trimmed the result may still exceed the budget.
 */
const prepareMessages_fitIntoContext = ({ messages, contextWindow, maxOutputTokens }: { messages: LLMChatMessage[], contextWindow: number, maxOutputTokens: number }): { messages: LLMChatMessage[] } => {

	let totalLen = 0
	for (const m of messages) { totalLen += m.content.length }

	const charsNeedToTrim = totalLen - (contextWindow - maxOutputTokens) * CHARS_PER_TOKEN
	if (charsNeedToTrim <= 0) return { messages }

	// Work on shallow copies so the caller's message objects are left untouched.
	const trimmedMessages: LLMChatMessage[] = messages.map(m => ({ ...m }))

	// The higher the weight, the higher the desire to truncate.
	const weight = (message: LLMChatMessage, idx: number): number => {
		// Non-user messages are trimmed twice as eagerly as user messages.
		let multiplier = message.role === 'user' ? 4 : 8
		// The trailing messages (idx >= length - 4) are very important, so strongly de-prioritize them.
		if (idx >= trimmedMessages.length - 1 - 3) {
			multiplier *= .05
		}
		return message.content.length * multiplier
	}

	// Picks the heaviest message that can still shrink. System messages are never trimmed,
	// and messages already at or below TRIM_TO_LEN are skipped because "trimming" them
	// would only append '...' and make them longer.
	const findTrimCandidate = (): LLMChatMessage | undefined => {
		let best: LLMChatMessage | undefined = undefined
		let bestWeight = -Infinity
		for (const [idx, m] of trimmedMessages.entries()) {
			if (m.role === 'system') continue
			if (m.content.length <= TRIM_TO_LEN) continue
			const w = weight(m, idx)
			if (w > bestWeight) {
				bestWeight = w
				best = m
			}
		}
		return best
	}

	// TRIM HIGHEST WEIGHT MESSAGES
	// Every iteration either finishes or removes one message from the candidate pool,
	// so the loop terminates after at most `messages.length` rounds.
	let remainingCharsToTrim = charsNeedToTrim
	while (remainingCharsToTrim > 0) {
		const m = findTrimCandidate()
		if (m === undefined) break // nothing left that can be shortened

		const numCharsWillTrim = m.content.length - TRIM_TO_LEN

		// If a partial trim of this message is enough, cut exactly what is needed and stop.
		if (numCharsWillTrim > remainingCharsToTrim) {
			m.content = m.content.slice(0, m.content.length - remainingCharsToTrim)
			break
		}

		// Otherwise collapse the message to TRIM_TO_LEN characters and keep going.
		remainingCharsToTrim -= numCharsWillTrim
		m.content = m.content.substring(0, TRIM_TO_LEN - 3) + '...'
	}

	return { messages: trimmedMessages }
}
// no matter whether the model supports a system message or not (or what format it supports), add it in some way
const prepareMessages_systemMessage = ({
@ -378,14 +459,21 @@ export const prepareMessages = ({
supportsSystemMessage,
supportsTools,
supportsAnthropicReasoningSignature,
contextWindow,
maxOutputTokens,
}: {
messages: LLMChatMessage[],
aiInstructions: string,
supportsSystemMessage: false | 'system-role' | 'developer-role' | 'separated',
supportsTools: false | 'anthropic-style' | 'openai-style',
supportsAnthropicReasoningSignature: boolean,
contextWindow: number,
maxOutputTokens: number | null | undefined,
}) => {
const { messages: messages1 } = prepareMessages_normalize({ messages })
maxOutputTokens = maxOutputTokens ?? 4_096 // default to 4096
const { messages: messages0 } = prepareMessages_normalize({ messages })
const { messages: messages1 } = prepareMessages_fitIntoContext({ messages: messages0, contextWindow, maxOutputTokens })
const { messages: messages2 } = prepareMessages_anthropicContent({ messages: messages1, supportsAnthropicReasoningSignature })
const { messages: messages3, separateSystemMessageStr } = prepareMessages_systemMessage({ messages: messages2, aiInstructions, supportsSystemMessage })
const { messages: messages4 } = prepareMessages_tools({ messages: messages3, supportsTools })

View file

@ -157,7 +157,8 @@ const _sendOpenAICompatibleChat = ({ messages: messages_, onText, onFinalMessage
modelName,
supportsSystemMessage,
supportsTools,
// maxOutputTokens,
contextWindow,
maxOutputTokens,
reasoningCapabilities,
} = getModelCapabilities(providerName, modelName_)
@ -173,10 +174,10 @@ const _sendOpenAICompatibleChat = ({ messages: messages_, onText, onFinalMessage
const toolsObj = tools ? { tools: tools, tool_choice: 'auto', parallel_tool_calls: false, } as const : {}
// max tokens
// const maxTokens = reasoningInfo?.isReasoningEnabled && reasoningCapabilities ? reasoningCapabilities.reasoningMaxOutputTokens : maxOutputTokens
const maxTokens = reasoningInfo?.isReasoningEnabled && reasoningCapabilities ? reasoningCapabilities.reasoningMaxOutputTokens : maxOutputTokens
// instance
const { messages } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: false })
const { messages } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: false, contextWindow, maxOutputTokens: maxTokens })
const openai: OpenAI = newOpenAICompatibleSDK({ providerName, settingsOfProvider, includeInPayload })
const options: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
model: modelName,
@ -316,6 +317,7 @@ const sendAnthropicChat = ({ messages: messages_, providerName, onText, onFinalM
const {
modelName,
supportsSystemMessage,
contextWindow,
supportsTools,
maxOutputTokens,
reasoningCapabilities,
@ -339,7 +341,7 @@ const sendAnthropicChat = ({ messages: messages_, providerName, onText, onFinalM
const maxTokens = reasoningInfo?.isReasoningEnabled && reasoningCapabilities ? reasoningCapabilities.reasoningMaxOutputTokens : maxOutputTokens
// instance
const { messages, separateSystemMessageStr } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: true })
const { messages, separateSystemMessageStr } = prepareMessages({ messages: messages_, aiInstructions, supportsSystemMessage, supportsTools, supportsAnthropicReasoningSignature: true, contextWindow, maxOutputTokens: maxTokens })
const anthropic = new Anthropic({
apiKey: thisConfig.apiKey,
dangerouslyAllowBrowser: true