mirror of
https://github.com/mudler/LocalAI
synced 2026-04-21 13:27:21 +00:00
fix: thinking models with tools returning empty content (reasoning-only retry loop) (#9290)
When clients like Nextcloud or Home Assistant send requests with tools to thinking models (e.g. Gemma 4 with <|channel>thought tags), the response was empty despite the backend producing valid content. Root cause: the C++ autoparser puts clean content in both the raw Response and ChatDeltas. The Go-side PrependThinkingTokenIfNeeded then prepends the thinking start token to the already-clean content, causing ExtractReasoning to classify the entire response as unclosed reasoning. This made cbRawResult empty, triggering a retry loop that never succeeds. Two fixes: - inference.go: check ChatDeltas for content/tool_calls regardless of whether Response is empty, so skipCallerRetry fires correctly - chat.go: when ChatDeltas have content but no tool calls, use that content directly instead of falling back to the empty cbRawResult
This commit is contained in:
parent
85be4ff03c
commit
13a6ed709c
6 changed files with 151 additions and 8 deletions
|
|
@@ -1045,6 +1045,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
|||
funcResults = deltaToolCalls
|
||||
textContentToReturn = functions.ContentFromChatDeltas(chatDeltas)
|
||||
cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas)
|
||||
} else if deltaContent := functions.ContentFromChatDeltas(chatDeltas); len(chatDeltas) > 0 && deltaContent != "" {
|
||||
// ChatDeltas have content but no tool calls — model answered without using tools.
|
||||
// This happens with thinking models (e.g. Gemma 4) where the Go-side reasoning
|
||||
// extraction misclassifies clean content as reasoning, leaving cbRawResult empty.
|
||||
xlog.Debug("[ChatDeltas] non-SSE: using C++ autoparser content (no tool calls)", "content_len", len(deltaContent))
|
||||
textContentToReturn = deltaContent
|
||||
cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas)
|
||||
} else {
|
||||
// Fallback: parse tool calls from raw text
|
||||
xlog.Debug("[ChatDeltas] non-SSE: no chat deltas, falling back to Go-side text parsing")
|
||||
|
|
@@ -1067,7 +1074,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
|||
|
||||
switch {
|
||||
case noActionsToRun:
|
||||
qResult, qErr := handleQuestion(config, funcResults, cbRawResult, predInput)
|
||||
// Use textContentToReturn if available (e.g. from ChatDeltas),
|
||||
// otherwise fall back to cbRawResult for legacy Go-side parsing.
|
||||
questionInput := cbRawResult
|
||||
if textContentToReturn != "" {
|
||||
questionInput = textContentToReturn
|
||||
}
|
||||
qResult, qErr := handleQuestion(config, funcResults, questionInput, predInput)
|
||||
if qErr != nil {
|
||||
xlog.Error("error handling question", "error", qErr)
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -143,12 +143,15 @@ func ComputeChoices(
|
|||
cb(finetunedResponse, &result)
|
||||
|
||||
// Caller-driven retry (tool parsing, reasoning-only, etc.).
|
||||
// When the C++ autoparser is active, it clears the raw response
|
||||
// and delivers data via ChatDeltas. If the response is empty but
|
||||
// ChatDeltas contain actionable data, skip the caller retry —
|
||||
// the autoparser already parsed the response successfully.
|
||||
// When the C++ autoparser is active, it may deliver parsed data
|
||||
// via ChatDeltas while also keeping the raw response. If ChatDeltas
|
||||
// contain actionable data (content or tool calls), skip the caller
|
||||
// retry — the autoparser already parsed the response successfully.
|
||||
// Note: we check ChatDeltas regardless of whether Response is empty,
|
||||
// because thinking models (e.g. Gemma 4) produce a non-empty Response
|
||||
// that the Go-side reasoning extraction can misclassify as reasoning-only.
|
||||
skipCallerRetry := false
|
||||
if strings.TrimSpace(prediction.Response) == "" && len(prediction.ChatDeltas) > 0 {
|
||||
if len(prediction.ChatDeltas) > 0 {
|
||||
for _, d := range prediction.ChatDeltas {
|
||||
if d.Content != "" || len(d.ToolCalls) > 0 {
|
||||
skipCallerRetry = true
|
||||
|
|
|
|||
|
|
@@ -242,11 +242,13 @@ var _ = Describe("ComputeChoices", func() {
|
|||
})
|
||||
|
||||
Context("chat deltas from latest attempt", func() {
|
||||
It("should return chat deltas from the last attempt only", func() {
|
||||
It("should return chat deltas from the last attempt when retry is allowed", func() {
|
||||
// When the first attempt has only reasoning (no content/tool calls),
|
||||
// the caller-driven retry proceeds and we get deltas from the last attempt.
|
||||
mockInference([]backend.LLMResponse{
|
||||
{
|
||||
Response: "retry-me",
|
||||
ChatDeltas: []*pb.ChatDelta{{Content: "old"}},
|
||||
ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
|
||||
},
|
||||
{
|
||||
Response: "final",
|
||||
|
|
@@ -266,6 +268,40 @@ var _ = Describe("ComputeChoices", func() {
|
|||
Expect(deltas).To(HaveLen(1))
|
||||
Expect(deltas[0].Content).To(Equal("new"))
|
||||
})
|
||||
|
||||
It("should keep first attempt deltas when ChatDeltas have content (skip retry)", func() {
|
||||
// When the first attempt has content in ChatDeltas, skipCallerRetry
|
||||
// prevents the retry — the autoparser already parsed successfully.
|
||||
mockInference([]backend.LLMResponse{
|
||||
{
|
||||
Response: "autoparser-content",
|
||||
ChatDeltas: []*pb.ChatDelta{{Content: "first-content"}},
|
||||
},
|
||||
{
|
||||
Response: "should-not-reach",
|
||||
ChatDeltas: []*pb.ChatDelta{{Content: "second-content"}},
|
||||
},
|
||||
})
|
||||
|
||||
retryRequested := false
|
||||
_, _, deltas, err := ComputeChoices(
|
||||
makeReq(), "test", cfg, nil, appCfg, nil,
|
||||
func(s string, c *[]schema.Choice) {
|
||||
*c = append(*c, schema.Choice{Text: s})
|
||||
},
|
||||
nil,
|
||||
func(attempt int) bool {
|
||||
retryRequested = true
|
||||
return true
|
||||
},
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(retryRequested).To(BeFalse(),
|
||||
"shouldRetry should not be called when ChatDeltas have content")
|
||||
Expect(deltas).To(HaveLen(1))
|
||||
Expect(deltas[0].Content).To(Equal("first-content"))
|
||||
})
|
||||
|
||||
})
|
||||
|
||||
Context("result choices cleared on retry", func() {
|
||||
|
|
|
|||
|
|
@@ -121,6 +121,31 @@ var _ = BeforeSuite(func() {
|
|||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(autoparserPath, autoparserYAML, 0644)).To(Succeed())
|
||||
|
||||
// Create model config for thinking model + autoparser tests.
|
||||
// The chat template ends with <|channel>thought to simulate Gemma 4 thinking models.
|
||||
// This triggers DetectThinkingStartToken and PrependThinkingTokenIfNeeded in the
|
||||
// reasoning extraction path, reproducing a bug where clean content from the C++
|
||||
// autoparser gets misclassified as unclosed reasoning.
|
||||
thinkingAutoparserConfig := map[string]any{
|
||||
"name": "mock-model-thinking-autoparser",
|
||||
"backend": "mock-backend",
|
||||
"parameters": map[string]any{
|
||||
"model": "mock-model.bin",
|
||||
},
|
||||
"template": map[string]any{
|
||||
"chat": "{{.Input}}\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
},
|
||||
"function": map[string]any{
|
||||
"grammar": map[string]any{
|
||||
"disable": true,
|
||||
},
|
||||
},
|
||||
}
|
||||
thinkingAutoparserPath := filepath.Join(modelsPath, "mock-model-thinking-autoparser.yaml")
|
||||
thinkingAutoparserYAML, err := yaml.Marshal(thinkingAutoparserConfig)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(thinkingAutoparserPath, thinkingAutoparserYAML, 0644)).To(Succeed())
|
||||
|
||||
// Start mock MCP server and create MCP-enabled model config
|
||||
mcpServerURL, mcpServerShutdown = startMockMCPServer()
|
||||
mcpConfig := mcpModelConfig(mcpServerURL)
|
||||
|
|
|
|||
|
|
@@ -95,6 +95,26 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
|
|||
}, nil
|
||||
}
|
||||
|
||||
// Simulate Gemma 4 / thinking model with C++ autoparser:
|
||||
// - Message contains the clean content (autoparser extracts it from OAI choices[0].message.content)
|
||||
// - ChatDeltas contain both reasoning and content separately
|
||||
// This reproduces the bug where Go-side PrependThinkingTokenIfNeeded
|
||||
// incorrectly prepends a thinking start token to the clean content,
|
||||
// causing the entire response to be classified as unclosed reasoning.
|
||||
if strings.Contains(in.Prompt, "AUTOPARSER_THINKING_CONTENT") {
|
||||
return &pb.Reply{
|
||||
Message: []byte("I am a helpful AI assistant designed to assist you with a wide range of tasks."),
|
||||
Tokens: 20,
|
||||
PromptTokens: 50,
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{
|
||||
ReasoningContent: "The user is asking a simple introductory question. I should respond directly.",
|
||||
Content: "I am a helpful AI assistant designed to assist you with a wide range of tasks.",
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
var response string
|
||||
toolName := mockToolNameFromRequest(in)
|
||||
if toolName != "" && !promptHasToolResults(in.Prompt) {
|
||||
|
|
|
|||
|
|
@@ -462,5 +462,51 @@ var _ = Describe("Mock Backend E2E Tests", Label("MockBackend"), func() {
|
|||
Expect(fn["name"]).To(Equal("search_collections"))
|
||||
})
|
||||
})
|
||||
|
||||
// Regression test: thinking model (Gemma 4-style) with tools, where the
|
||||
// model responds with content only (no tool calls). The C++ autoparser
|
||||
// puts clean content in Message AND reasoning+content in ChatDeltas.
|
||||
// Bug: Go-side PrependThinkingTokenIfNeeded prepends <|channel>thought
|
||||
// to the clean content, causing it to be classified as unclosed reasoning,
|
||||
// leading to "Backend produced reasoning without actionable content, retrying".
|
||||
Context("Non-streaming thinking model with tools and ChatDelta content (no tool calls)", func() {
|
||||
It("should return content without retrying", func() {
|
||||
body := `{
|
||||
"model": "mock-model-thinking-autoparser",
|
||||
"messages": [{"role": "user", "content": "AUTOPARSER_THINKING_CONTENT"}],
|
||||
"tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}]
|
||||
}`
|
||||
req, err := http.NewRequest("POST", apiURL+"/chat/completions", strings.NewReader(body))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
httpClient := &http.Client{Timeout: 60 * time.Second}
|
||||
resp, err := httpClient.Do(req)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
defer resp.Body.Close()
|
||||
Expect(resp.StatusCode).To(Equal(200))
|
||||
|
||||
data, err := io.ReadAll(resp.Body)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
var result map[string]any
|
||||
Expect(json.Unmarshal(data, &result)).To(Succeed())
|
||||
|
||||
choices, ok := result["choices"].([]any)
|
||||
Expect(ok).To(BeTrue(), "Expected choices array, got: %s", string(data))
|
||||
Expect(choices).To(HaveLen(1))
|
||||
|
||||
choice := choices[0].(map[string]any)
|
||||
msg, _ := choice["message"].(map[string]any)
|
||||
Expect(msg).ToNot(BeNil())
|
||||
|
||||
content, _ := msg["content"].(string)
|
||||
Expect(content).ToNot(BeEmpty(),
|
||||
"Expected non-empty content in thinking model response with tools, "+
|
||||
"but got empty content. Full response: %s", string(data))
|
||||
Expect(content).To(ContainSubstring("helpful AI assistant"),
|
||||
"Expected content to contain the model's response text, got: %s", content)
|
||||
})
|
||||
})
|
||||
})
|
||||
})
|
||||
|
|
|
|||
Loading…
Reference in a new issue