mirror of
https://github.com/mudler/LocalAI
synced 2026-04-21 13:27:21 +00:00
fix(reasoning): warm-up
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
6d9d77d590
commit
c5a840f6af
2 changed files with 41 additions and 11 deletions
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
LLAMA_VERSION?=d006858316d4650bb4da0c6923294ccd741caefd
|
||||
LLAMA_VERSION?=b8635075ffe27b135c49afb9a8b5c434bd42c502
|
||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
|
|
|||
|
|
@ -84,18 +84,26 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
|||
_, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
|
||||
var reasoningDelta, contentDelta string
|
||||
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available
|
||||
// Always keep the Go-side extractor in sync with raw tokens
|
||||
// (needed for backends that never send chat deltas).
|
||||
goReasoning, goContent := extractor.ProcessToken(s)
|
||||
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available.
|
||||
if tokenUsage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
|
||||
contentDelta = cd
|
||||
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
|
||||
// the C++ autoparser includes as part of reasoning content.
|
||||
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
// Keep extractor state consistent for fallback
|
||||
extractor.ProcessToken(s)
|
||||
} else if config.TemplateConfig.UseTokenizerTemplate {
|
||||
// C++ autoparser is active (jinja templates) but hasn't emitted
|
||||
// chat deltas for this chunk yet — PEG parser is still warming up
|
||||
// (e.g. accumulating "<|channel>thought\n" for Gemma 4).
|
||||
// Suppress Go-side output to avoid leaking partial tag tokens.
|
||||
} else {
|
||||
// Fallback: Go-side extraction from raw text
|
||||
reasoningDelta, contentDelta = extractor.ProcessToken(s)
|
||||
// No autoparser — use Go-side extraction as the sole source.
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
|
||||
usage := schema.OpenAIUsage{
|
||||
|
|
@ -151,18 +159,22 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
|||
|
||||
var reasoningDelta, contentDelta string
|
||||
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available
|
||||
// Always keep the Go-side extractor in sync with raw tokens
|
||||
goReasoning, goContent := extractor.ProcessToken(s)
|
||||
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available.
|
||||
if usage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
|
||||
contentDelta = cd
|
||||
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
|
||||
// the C++ autoparser includes as part of reasoning content.
|
||||
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
// Keep extractor state consistent for fallback
|
||||
extractor.ProcessToken(s)
|
||||
} else if config.TemplateConfig.UseTokenizerTemplate {
|
||||
// C++ autoparser is active but still warming up — suppress Go-side output to avoid leaking partial tag tokens.
|
||||
} else {
|
||||
// Fallback: Go-side extraction from raw text
|
||||
reasoningDelta, contentDelta = extractor.ProcessToken(s)
|
||||
// No autoparser — use Go-side extraction.
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
|
||||
// Emit reasoning deltas in their own SSE chunks before any tool-call chunks
|
||||
|
|
@ -993,6 +1005,24 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
|||
return err
|
||||
}
|
||||
|
||||
// For non-tool requests: prefer C++ autoparser chat deltas over
|
||||
// Go-side tag extraction (which can mangle output when thinkingStartToken
|
||||
// differs from the model's actual reasoning tags, e.g. Gemma 4).
|
||||
if !shouldUseFn && len(chatDeltas) > 0 {
|
||||
deltaContent := functions.ContentFromChatDeltas(chatDeltas)
|
||||
deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
|
||||
if deltaContent != "" || deltaReasoning != "" {
|
||||
xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
|
||||
"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
|
||||
stopReason := FinishReasonStop
|
||||
message := &schema.Message{Role: "assistant", Content: &deltaContent}
|
||||
if deltaReasoning != "" {
|
||||
message.Reasoning = &deltaReasoning
|
||||
}
|
||||
result = []schema.Choice{{FinishReason: &stopReason, Index: 0, Message: message}}
|
||||
}
|
||||
}
|
||||
|
||||
// Tool parsing is deferred here (only when shouldUseFn) so chat deltas are available
|
||||
if shouldUseFn {
|
||||
var funcResults []functions.FuncCallResults
|
||||
|
|
|
|||
Loading…
Reference in a new issue