fix(reasoning): warm-up

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2026-04-04 20:25:24 +00:00
parent 6d9d77d590
commit c5a840f6af
2 changed files with 41 additions and 11 deletions

View file

@@ -1,5 +1,5 @@
LLAMA_VERSION?=d006858316d4650bb4da0c6923294ccd741caefd
LLAMA_VERSION?=b8635075ffe27b135c49afb9a8b5c434bd42c502
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=

View file

@@ -84,18 +84,26 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
_, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
// Always keep the Go-side extractor in sync with raw tokens
// (needed for backends that never send chat deltas).
goReasoning, goContent := extractor.ProcessToken(s)
// Prefer pre-parsed chat deltas from C++ autoparser when available.
if tokenUsage.HasChatDeltaContent() {
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
contentDelta = cd
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
// the C++ autoparser includes as part of reasoning content.
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
// Keep extractor state consistent for fallback
extractor.ProcessToken(s)
} else if config.TemplateConfig.UseTokenizerTemplate {
// C++ autoparser is active (jinja templates) but hasn't emitted
// chat deltas for this chunk yet — PEG parser is still warming up
// (e.g. accumulating "<|channel>thought\n" for Gemma 4).
// Suppress Go-side output to avoid leaking partial tag tokens.
} else {
// Fallback: Go-side extraction from raw text
reasoningDelta, contentDelta = extractor.ProcessToken(s)
// No autoparser — use Go-side extraction as the sole source.
reasoningDelta = goReasoning
contentDelta = goContent
}
usage := schema.OpenAIUsage{
@@ -151,18 +159,22 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
// Always keep the Go-side extractor in sync with raw tokens
goReasoning, goContent := extractor.ProcessToken(s)
// Prefer pre-parsed chat deltas from C++ autoparser when available.
if usage.HasChatDeltaContent() {
rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
contentDelta = cd
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
// the C++ autoparser includes as part of reasoning content.
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
// Keep extractor state consistent for fallback
extractor.ProcessToken(s)
} else if config.TemplateConfig.UseTokenizerTemplate {
// C++ autoparser warming up — suppress Go-side to avoid tag leaks.
} else {
// Fallback: Go-side extraction from raw text
reasoningDelta, contentDelta = extractor.ProcessToken(s)
// No autoparser — use Go-side extraction.
reasoningDelta = goReasoning
contentDelta = goContent
}
// Emit reasoning deltas in their own SSE chunks before any tool-call chunks
@@ -993,6 +1005,24 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
return err
}
// For non-tool requests: prefer C++ autoparser chat deltas over
// Go-side tag extraction (which can mangle output when thinkingStartToken
// differs from the model's actual reasoning tags, e.g. Gemma 4).
if !shouldUseFn && len(chatDeltas) > 0 {
deltaContent := functions.ContentFromChatDeltas(chatDeltas)
deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
if deltaContent != "" || deltaReasoning != "" {
xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
stopReason := FinishReasonStop
message := &schema.Message{Role: "assistant", Content: &deltaContent}
if deltaReasoning != "" {
message.Reasoning = &deltaReasoning
}
result = []schema.Choice{{FinishReason: &stopReason, Index: 0, Message: message}}
}
}
// Tool parsing is deferred here (only when shouldUseFn) so chat deltas are available
if shouldUseFn {
var funcResults []functions.FuncCallResults