Autocomplete (max number of pending requests, filter for matchup with user's text, better prompt)

This commit is contained in:
mp 2024-11-19 03:57:06 -08:00
parent 6df0093786
commit cd77542a9e
5 changed files with 86 additions and 80 deletions

View file

@ -1,32 +0,0 @@
import { LRUCache } from 'lru-cache';
const DEFAULT_MAX_SIZE = 20
/**
 * Fixed-capacity, append-only collection backed by `LRUCache`.
 *
 * Values are stored under internally generated numeric keys. When more than
 * `maxSize` values have been pushed, the underlying cache evicts entries to
 * make room for the new one.
 */
export class SimpleLRUCache<T extends {}> {
	private cache: LRUCache<number, T>;
	private maxSize: number
	// Key handed to the next pushed value. It only ever increases, so a key is
	// never reused. (Deriving the key from `cache.size` breaks once the cache is
	// full: `size` stops growing, so every later push would overwrite the same
	// slot instead of evicting the oldest entry.)
	private nextKey = 0
	/** Number of values currently held; never exceeds the configured maximum. */
	public length: number

	/**
	 * @param maxSize Maximum number of values to retain. Falls back to
	 * `DEFAULT_MAX_SIZE` when omitted.
	 */
	constructor(maxSize?: number) {
		maxSize = maxSize ?? DEFAULT_MAX_SIZE
		this.cache = new LRUCache<number, T>({ max: maxSize });
		this.length = 0
		this.maxSize = maxSize
	}

	/**
	 * Appends `value` under a fresh key and updates `length`.
	 * @param value The value to store.
	 */
	push(value: T): void {
		const key = this.nextKey++
		this.cache.set(key, value);
		this.length = Math.min(this.length + 1, this.maxSize)
	}

	/** Returns the underlying cache's iterator over the stored values. */
	values() {
		return this.cache.values()
	}
}

View file

@ -22,9 +22,10 @@ export const getFIMSystem: GetFIMPrompt = ({ voidConfig, fimInfo }) => {
Instruction summary:
1. Return the MIDDLE of the code between the START and END.
2. Do not give an explanation, description, or any other code besides the middle.
2. Do not return duplicate code from either START or END.
3. Make sure the MIDDLE piece of code has balanced brackets that match the START and END.
4. The MIDDLE begins on the same line as START. Please include a newline character if you want to begin on the next line.
3. Do not return duplicate code from either START or END.
4. Make sure the MIDDLE piece of code has balanced brackets that match the START and END.
5. The MIDDLE begins on the same line as START. Please include a newline character if you want to begin on the next line.
6. Around 90% of the time, you should return just one or a few lines of code. You should keep your outputs short unless you are confident the user is trying to write boilderplate code.
# EXAMPLE
@ -75,11 +76,19 @@ export const getFIMPrompt: GetFIMPrompt = ({ voidConfig, fimInfo }) => {
// if no prefix or suffix, return empty string
if (!fimInfo.prefix.trim() && !fimInfo.suffix.trim()) return ''
// instruct model to generate a single line if there is text immediately after the cursor
const suffixLines = fimInfo.suffix.split('\n');
const afterCursor = suffixLines[0] || '';
const generateSingleLine = afterCursor.trim().length > 0;
const singleLinePrompt = generateSingleLine ? `Please produce a single line of code that fills in the middle.` : ''
// TODO may want to trim the prefix and suffix
switch (voidConfig.default.whichApi) {
case 'ollama':
if (voidConfig.ollama.model === 'codestral') {
return `[SUFFIX]${fimInfo.suffix}[PREFIX] ${fimInfo.prefix}`
return `${singleLinePrompt}[SUFFIX]${fimInfo.suffix}[PREFIX] ${fimInfo.prefix}`
} else if (voidConfig.ollama.model.includes('qwen')) {
return `${singleLinePrompt}<|fim_prefix|>${fimInfo.prefix}<|fim_suffix|>${fimInfo.suffix}<|fim_middle|>`
}
return ''
case 'anthropic':

View file

@ -247,12 +247,12 @@ export const sendOllamaMsg: SendLLMMessageFnTypeInternal = ({ mode, messages, on
let didAbort = false
let fullText = ""
const ollama = new Ollama({ host: voidConfig.ollama.endpoint })
abortRef.current = () => {
didAbort = true;
};
const ollama = new Ollama({ host: voidConfig.ollama.endpoint })
type GenerateResponse = Awaited<ReturnType<(typeof ollama.generate)>>
type ChatResponse = Awaited<ReturnType<(typeof ollama.chat)>>
@ -271,7 +271,6 @@ export const sendOllamaMsg: SendLLMMessageFnTypeInternal = ({ mode, messages, on
}
if (mode === 'fim') {
// the fim prompt is the last message
let prompt = messages[messages.length - 1].content
return ollama.generate({

View file

@ -2,22 +2,24 @@ import * as vscode from 'vscode';
import { AbortRef, LLMMessage, sendLLMMessage } from '../common/sendLLMMessage';
import { getVoidConfigFromPartial, VoidConfig } from '../webviews/common/contextForConfig';
import { LRUCache } from 'lru-cache';
import { SimpleLRUCache } from '../common/SimpleLruCache';
type AutocompletionStatus = 'pending' | 'finished' | 'error';
type Autocompletion = {
id: number,
prefix: string,
suffix: string,
startTime: number,
endTime: number | undefined,
abortRef: AbortRef,
status: AutocompletionStatus,
promise: Promise<string> | undefined,
llmPromise: Promise<string> | undefined,
result: string,
}
const DEBOUNCE_TIME = 300
const DEBOUNCE_TIME = 500
const TIMEOUT_TIME = 60000
const MAX_CACHE_SIZE = 20
const MAX_PENDING_REQUESTS = 2
// postprocesses the result
const postprocessResult = (result: string) => {
@ -72,10 +74,6 @@ const toInlineCompletion = ({ prefix, autocompletion, position }: { prefix: stri
const lastMatchupIndex = trimmedCurrentPrefix.length - trimmedOriginalPrefix.length
console.log('generatedMiddle ', generatedMiddle)
console.log('trimmedOriginalPrefix ', trimmedOriginalPrefix)
console.log('trimmedCurrentPrefix ', trimmedCurrentPrefix)
console.log('index: ', lastMatchupIndex)
if (lastMatchupIndex < 0) {
return new vscode.InlineCompletionItem('')
}
@ -90,19 +88,19 @@ const toInlineCompletion = ({ prefix, autocompletion, position }: { prefix: stri
}
// returns whether we can use this autocompletion to complete the prefix
// returns whether this autocompletion can be used for the given prefix, i.e. whether (original prefix + generated text) starts with the current prefix
const doesPrefixMatchAutocompletion = ({ prefix, autocompletion }: { prefix: string, autocompletion: Autocompletion }): boolean => {
const originalPrefix = autocompletion.prefix
const generatedMiddle = autocompletion.result
const trimmedOriginalPrefix = trimPrefix(originalPrefix)
const trimmedCurrentPrefix = trimPrefix(prefix)
const originalPrefixTrimmed = trimPrefix(originalPrefix)
const currentPrefixTrimmed = trimPrefix(prefix)
if (trimmedCurrentPrefix.length < trimmedOriginalPrefix.length) {
if (currentPrefixTrimmed.length < originalPrefixTrimmed.length) {
return false
}
const isMatch = (trimmedOriginalPrefix + generatedMiddle).startsWith(trimmedCurrentPrefix)
const isMatch = (originalPrefixTrimmed + generatedMiddle).startsWith(currentPrefixTrimmed)
return isMatch
}
@ -111,11 +109,14 @@ const doesPrefixMatchAutocompletion = ({ prefix, autocompletion }: { prefix: str
export class AutocompleteProvider implements vscode.InlineCompletionItemProvider {
private _extensionContext: vscode.ExtensionContext;
private _autocompletionsOfDocument: { [docUriStr: string]: SimpleLRUCache<Autocompletion> } = {}
private _autocompletionId: number = 0;
private _autocompletionsOfDocument: { [docUriStr: string]: LRUCache<number, Autocompletion> } = {}
private _lastTime = 0
private _lastCompletionTime = 0
private _lastPrefix: string = ''
constructor(context: vscode.ExtensionContext) {
this._extensionContext = context
@ -130,7 +131,7 @@ export class AutocompleteProvider implements vscode.InlineCompletionItemProvider
token: vscode.CancellationToken,
): Promise<vscode.InlineCompletionItem[]> {
const disabled = true
const disabled = false
if (disabled) { return []; }
const docUriStr = document.uri.toString()
@ -139,20 +140,26 @@ export class AutocompleteProvider implements vscode.InlineCompletionItemProvider
const cursorOffset = document.offsetAt(position);
const prefix = fullText.substring(0, cursorOffset)
const suffix = fullText.substring(cursorOffset)
if (!this._autocompletionsOfDocument[docUriStr]) {
this._autocompletionsOfDocument[docUriStr] = new SimpleLRUCache()
}
const voidConfig = getVoidConfigFromPartial(this._extensionContext.globalState.get('partialVoidConfig') ?? {})
// initialize cache and other variables
// note that whenever an autocompletion is rejected, it is removed from cache
if (!this._autocompletionsOfDocument[docUriStr]) {
this._autocompletionsOfDocument[docUriStr] = new LRUCache<number, Autocompletion>({
max: MAX_CACHE_SIZE,
dispose: (autocompletion) => { autocompletion.abortRef.current() }
})
}
this._lastPrefix = prefix
console.log('cache size: ', this._autocompletionsOfDocument[docUriStr].size)
// get autocompletion from cache
let cachedAutocompletion: Autocompletion | undefined = undefined
loop: for (const autocompletion of this._autocompletionsOfDocument[docUriStr].values()) {
for (const autocompletion of this._autocompletionsOfDocument[docUriStr].values()) {
// if the user's change matches up with the generated text
if (doesPrefixMatchAutocompletion({ prefix, autocompletion })) {
cachedAutocompletion = autocompletion
break loop;
break
}
}
@ -169,11 +176,12 @@ export class AutocompleteProvider implements vscode.InlineCompletionItemProvider
console.log('AAA2')
try {
await cachedAutocompletion.promise;
await cachedAutocompletion.llmPromise;
const inlineCompletion = toInlineCompletion({ autocompletion: cachedAutocompletion, prefix, position })
return [inlineCompletion]
} catch (e) {
this._autocompletionsOfDocument[docUriStr].delete(cachedAutocompletion.id)
console.error('Error creating autocompletion (1): ' + e)
}
@ -184,15 +192,13 @@ export class AutocompleteProvider implements vscode.InlineCompletionItemProvider
return []
}
// if there is no cached autocompletion, create it and add it to cache
// else if no more typing happens, then go forwards with the request
// wait DEBOUNCE_TIME for the user to stop typing
const thisTime = Date.now()
this._lastTime = thisTime
this._lastCompletionTime = thisTime
const didTypingHappenDuringDebounce = await new Promise((resolve, reject) =>
setTimeout(() => {
if (this._lastTime === thisTime) {
if (this._lastCompletionTime === thisTime) {
resolve(false)
} else {
resolve(true)
@ -207,27 +213,50 @@ export class AutocompleteProvider implements vscode.InlineCompletionItemProvider
console.log('BBB')
// else if no more typing happens, then go forwards with the request
// if there are too many pending requests, cancel the oldest one
let numPending = 0
let oldestPending: Autocompletion | undefined = undefined
for (const autocompletion of this._autocompletionsOfDocument[docUriStr].values()) {
if (autocompletion.status === 'pending') {
numPending += 1
if (oldestPending === undefined) {
oldestPending = autocompletion
}
if (numPending >= MAX_PENDING_REQUESTS) {
// cancel the oldest pending request and remove it from cache
this._autocompletionsOfDocument[docUriStr].delete(oldestPending.id)
break
}
}
}
// create a new autocompletion and add it to cache
const newAutocompletion: Autocompletion = {
id: this._autocompletionId++,
prefix: prefix,
suffix: suffix,
startTime: Date.now(),
endTime: undefined,
abortRef: { current: () => { } },
status: 'pending',
promise: undefined,
llmPromise: undefined,
result: '',
}
// set parameters of `newAutocompletion` appropriately
newAutocompletion.promise = new Promise((resolve, reject) => {
newAutocompletion.llmPromise = new Promise((resolve, reject) => {
sendLLMMessage({
mode: 'fim',
fimInfo: { prefix, suffix },
onText: async (tokenStr, completionStr) => {
// TODO filter out bad responses here
newAutocompletion.result = completionStr
// if generation doesn't match the prefix for the first few tokens generated, reject it
if (completionStr.length < 20 && !doesPrefixMatchAutocompletion({ prefix: this._lastPrefix, autocompletion: newAutocompletion })) {
reject('LLM response did not match user\'s text.')
}
},
onFinalMessage: (finalMessage) => {
@ -252,24 +281,28 @@ export class AutocompleteProvider implements vscode.InlineCompletionItemProvider
abortRef: newAutocompletion.abortRef,
})
setTimeout(() => { // if the request hasnt resolved in TIMEOUT_TIME seconds, reject it
// if the request hasn't resolved within TIMEOUT_TIME milliseconds, reject it
setTimeout(() => {
if (newAutocompletion.status === 'pending') {
reject('Timeout')
reject('Timeout receiving message to LLM.')
}
}, TIMEOUT_TIME)
})
// add autocompletion to cache
this._autocompletionsOfDocument[docUriStr].push(newAutocompletion)
this._autocompletionsOfDocument[docUriStr].set(newAutocompletion.id, newAutocompletion)
// show autocompletion
try {
await newAutocompletion.promise;
await newAutocompletion.llmPromise;
const inlineCompletion = toInlineCompletion({ autocompletion: newAutocompletion, prefix, position })
return [inlineCompletion]
} catch (e) {
this._autocompletionsOfDocument[docUriStr].delete(newAutocompletion.id)
console.error('Error creating autocompletion (2): ' + e)
return []
}
@ -277,6 +310,4 @@ export class AutocompleteProvider implements vscode.InlineCompletionItemProvider
}
}

View file

@ -1,5 +1,6 @@
import React, { ReactNode, createContext, useCallback, useContext, useEffect, useRef, useState, } from "react"
import { awaitVSCodeResponse, getVSCodeAPI, useOnVSCodeMessage } from "./getVscodeApi"
import { Ollama } from "ollama/browser"
const configEnum = <EnumArr extends readonly string[]>(description: string, defaultVal: EnumArr[number], enumArr: EnumArr) => {
return {
@ -29,8 +30,6 @@ export const configFields = [
'azure',
] as const
const voidConfigInfo: Record<
typeof configFields[number] | 'default', {
[prop: string]: {
@ -122,7 +121,7 @@ const voidConfigInfo: Record<
model: configEnum(
'Ollama model to use.',
'codestral',
["codestral", "codegemma", "codegemma:2b", "codegemma:7b", "codellama", "codellama:7b", "codellama:13b", "codellama:34b", "codellama:70b", "codellama:code", "codellama:python", "command-r", "command-r:35b", "command-r-plus", "command-r-plus:104b", "deepseek-coder-v2", "deepseek-coder-v2:16b", "deepseek-coder-v2:236b", "falcon2", "falcon2:11b", "firefunction-v2", "firefunction-v2:70b", "gemma", "gemma:2b", "gemma:7b", "gemma2", "gemma2:2b", "gemma2:9b", "gemma2:27b", "llama2", "llama2:7b", "llama2:13b", "llama2:70b", "llama3", "llama3:8b", "llama3:70b", "llama3-chatqa", "llama3-chatqa:8b", "llama3-chatqa:70b", "llama3-gradient", "llama3-gradient:8b", "llama3-gradient:70b", "llama3.1", "llama3.2", "llama3.1:8b", "llama3.1:70b", "llama3.1:405b", "llava", "llava:7b", "llava:13b", "llava:34b", "llava-llama3", "llava-llama3:8b", "llava-phi3", "llava-phi3:3.8b", "mistral", "mistral:7b", "mistral-large", "mistral-large:123b", "mistral-nemo", "mistral-nemo:12b", "mixtral", "mixtral:8x7b", "mixtral:8x22b", "moondream", "moondream:1.8b", "openhermes", "openhermes:v2.5", "phi3", "phi3:3.8b", "phi3:14b", "phi3.5", "phi3.5:3.8b", "qwen", "qwen:7b", "qwen:14b", "qwen:32b", "qwen:72b", "qwen:110b", "qwen2", "qwen2:0.5b", "qwen2:1.5b", "qwen2:7b", "qwen2:72b", "smollm", "smollm:135m", "smollm:360m", "smollm:1.7b"] as const
["codestral", "qwen2.5-coder", "qwen2.5-coder:0.5B", "qwen2.5-coder:1.5B", "qwen2.5-coder:3B", "qwen2.5-coder:7B", "qwen2.5-coder:14B", "qwen2.5-coder:32B", "codegemma", "codegemma:2b", "codegemma:7b", "codellama", "codellama:7b", "codellama:13b", "codellama:34b", "codellama:70b", "codellama:code", "codellama:python", "command-r", "command-r:35b", "command-r-plus", "command-r-plus:104b", "deepseek-coder-v2", "deepseek-coder-v2:16b", "deepseek-coder-v2:236b", "falcon2", "falcon2:11b", "firefunction-v2", "firefunction-v2:70b", "gemma", "gemma:2b", "gemma:7b", "gemma2", "gemma2:2b", "gemma2:9b", "gemma2:27b", "llama2", "llama2:7b", "llama2:13b", "llama2:70b", "llama3", "llama3:8b", "llama3:70b", "llama3-chatqa", "llama3-chatqa:8b", "llama3-chatqa:70b", "llama3-gradient", "llama3-gradient:8b", "llama3-gradient:70b", "llama3.1", "llama3.2", "llama3.1:8b", "llama3.1:70b", "llama3.1:405b", "llava", "llava:7b", "llava:13b", "llava:34b", "llava-llama3", "llava-llama3:8b", "llava-phi3", "llava-phi3:3.8b", "mistral", "mistral:7b", "mistral-large", "mistral-large:123b", "mistral-nemo", "mistral-nemo:12b", "mixtral", "mixtral:8x7b", "mixtral:8x22b", "moondream", "moondream:1.8b", "openhermes", "openhermes:v2.5", "phi3", "phi3:3.8b", "phi3:14b", "phi3.5", "phi3.5:3.8b", "qwen", "qwen:7b", "qwen:14b", "qwen:32b", "qwen:72b", "qwen:110b", "qwen2", "qwen2:0.5b", "qwen2:1.5b", "qwen2:7b", "qwen2:72b", "smollm", "smollm:135m", "smollm:360m", "smollm:1.7b"] as const
),
},
openRouter: {