fix: thinking models with tools returning empty content (reasoning-only retry loop) (#9290)

When clients like Nextcloud or Home Assistant send requests with tools
to thinking models (e.g. Gemma 4 with <|channel>thought tags), the
response was empty despite the backend producing valid content.

Root cause: the C++ autoparser puts clean content in both the raw
Response and ChatDeltas. The Go-side PrependThinkingTokenIfNeeded
then prepends the thinking start token to the already-clean content,
causing ExtractReasoning to classify the entire response as unclosed
reasoning. This makes cbRawResult empty, triggering a retry loop that
never succeeds.

Two fixes:
- inference.go: check ChatDeltas for content/tool_calls regardless of
  whether Response is empty, so skipCallerRetry fires correctly
- chat.go: when ChatDeltas have content but no tool calls, use that
  content directly instead of falling back to the empty cbRawResult.
This commit is contained in:
Ettore Di Giacinto 2026-04-09 18:30:31 +02:00 committed by GitHub
parent 85be4ff03c
commit 13a6ed709c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 151 additions and 8 deletions

View file

@ -1045,6 +1045,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
funcResults = deltaToolCalls
textContentToReturn = functions.ContentFromChatDeltas(chatDeltas)
cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas)
} else if deltaContent := functions.ContentFromChatDeltas(chatDeltas); len(chatDeltas) > 0 && deltaContent != "" {
// ChatDeltas have content but no tool calls — model answered without using tools.
// This happens with thinking models (e.g. Gemma 4) where the Go-side reasoning
// extraction misclassifies clean content as reasoning, leaving cbRawResult empty.
xlog.Debug("[ChatDeltas] non-SSE: using C++ autoparser content (no tool calls)", "content_len", len(deltaContent))
textContentToReturn = deltaContent
cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas)
} else {
// Fallback: parse tool calls from raw text
xlog.Debug("[ChatDeltas] non-SSE: no chat deltas, falling back to Go-side text parsing")
@ -1067,7 +1074,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
switch {
case noActionsToRun:
qResult, qErr := handleQuestion(config, funcResults, cbRawResult, predInput)
// Use textContentToReturn if available (e.g. from ChatDeltas),
// otherwise fall back to cbRawResult for legacy Go-side parsing.
questionInput := cbRawResult
if textContentToReturn != "" {
questionInput = textContentToReturn
}
qResult, qErr := handleQuestion(config, funcResults, questionInput, predInput)
if qErr != nil {
xlog.Error("error handling question", "error", qErr)
}

View file

@ -143,12 +143,15 @@ func ComputeChoices(
cb(finetunedResponse, &result)
// Caller-driven retry (tool parsing, reasoning-only, etc.).
// When the C++ autoparser is active, it clears the raw response
// and delivers data via ChatDeltas. If the response is empty but
// ChatDeltas contain actionable data, skip the caller retry —
// the autoparser already parsed the response successfully.
// When the C++ autoparser is active, it may deliver parsed data
// via ChatDeltas while also keeping the raw response. If ChatDeltas
// contain actionable data (content or tool calls), skip the caller
// retry — the autoparser already parsed the response successfully.
// Note: we check ChatDeltas regardless of whether Response is empty,
// because thinking models (e.g. Gemma 4) produce a non-empty Response
// that the Go-side reasoning extraction can misclassify as reasoning-only.
skipCallerRetry := false
if strings.TrimSpace(prediction.Response) == "" && len(prediction.ChatDeltas) > 0 {
if len(prediction.ChatDeltas) > 0 {
for _, d := range prediction.ChatDeltas {
if d.Content != "" || len(d.ToolCalls) > 0 {
skipCallerRetry = true

View file

@ -242,11 +242,13 @@ var _ = Describe("ComputeChoices", func() {
})
Context("chat deltas from latest attempt", func() {
It("should return chat deltas from the last attempt only", func() {
It("should return chat deltas from the last attempt when retry is allowed", func() {
// When the first attempt has only reasoning (no content/tool calls),
// the caller-driven retry proceeds and we get deltas from the last attempt.
mockInference([]backend.LLMResponse{
{
Response: "retry-me",
ChatDeltas: []*pb.ChatDelta{{Content: "old"}},
ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
},
{
Response: "final",
@ -266,6 +268,40 @@ var _ = Describe("ComputeChoices", func() {
Expect(deltas).To(HaveLen(1))
Expect(deltas[0].Content).To(Equal("new"))
})
It("should keep first attempt deltas when ChatDeltas have content (skip retry)", func() {
// When the first attempt has content in ChatDeltas, skipCallerRetry
// prevents the retry — the autoparser already parsed successfully.
mockInference([]backend.LLMResponse{
{
Response: "autoparser-content",
ChatDeltas: []*pb.ChatDelta{{Content: "first-content"}},
},
{
Response: "should-not-reach",
ChatDeltas: []*pb.ChatDelta{{Content: "second-content"}},
},
})
retryRequested := false
_, _, deltas, err := ComputeChoices(
makeReq(), "test", cfg, nil, appCfg, nil,
func(s string, c *[]schema.Choice) {
*c = append(*c, schema.Choice{Text: s})
},
nil,
func(attempt int) bool {
retryRequested = true
return true
},
)
Expect(err).ToNot(HaveOccurred())
Expect(retryRequested).To(BeFalse(),
"shouldRetry should not be called when ChatDeltas have content")
Expect(deltas).To(HaveLen(1))
Expect(deltas[0].Content).To(Equal("first-content"))
})
})
Context("result choices cleared on retry", func() {

View file

@ -121,6 +121,31 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())
Expect(os.WriteFile(autoparserPath, autoparserYAML, 0644)).To(Succeed())
// Create model config for thinking model + autoparser tests.
// The chat template ends with <|channel>thought to simulate Gemma 4 thinking models.
// This triggers DetectThinkingStartToken and PrependThinkingTokenIfNeeded in the
// reasoning extraction path, reproducing a bug where clean content from the C++
// autoparser gets misclassified as unclosed reasoning.
thinkingAutoparserConfig := map[string]any{
"name": "mock-model-thinking-autoparser",
"backend": "mock-backend",
"parameters": map[string]any{
"model": "mock-model.bin",
},
"template": map[string]any{
"chat": "{{.Input}}\n<|turn>model\n<|channel>thought\n<channel|>",
},
"function": map[string]any{
"grammar": map[string]any{
"disable": true,
},
},
}
thinkingAutoparserPath := filepath.Join(modelsPath, "mock-model-thinking-autoparser.yaml")
thinkingAutoparserYAML, err := yaml.Marshal(thinkingAutoparserConfig)
Expect(err).ToNot(HaveOccurred())
Expect(os.WriteFile(thinkingAutoparserPath, thinkingAutoparserYAML, 0644)).To(Succeed())
// Start mock MCP server and create MCP-enabled model config
mcpServerURL, mcpServerShutdown = startMockMCPServer()
mcpConfig := mcpModelConfig(mcpServerURL)

View file

@ -95,6 +95,26 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
}, nil
}
// Simulate Gemma 4 / thinking model with C++ autoparser:
// - Message contains the clean content (autoparser extracts it from OAI choices[0].message.content)
// - ChatDeltas contain both reasoning and content separately
// This reproduces the bug where Go-side PrependThinkingTokenIfNeeded
// incorrectly prepends a thinking start token to the clean content,
// causing the entire response to be classified as unclosed reasoning.
if strings.Contains(in.Prompt, "AUTOPARSER_THINKING_CONTENT") {
return &pb.Reply{
Message: []byte("I am a helpful AI assistant designed to assist you with a wide range of tasks."),
Tokens: 20,
PromptTokens: 50,
ChatDeltas: []*pb.ChatDelta{
{
ReasoningContent: "The user is asking a simple introductory question. I should respond directly.",
Content: "I am a helpful AI assistant designed to assist you with a wide range of tasks.",
},
},
}, nil
}
var response string
toolName := mockToolNameFromRequest(in)
if toolName != "" && !promptHasToolResults(in.Prompt) {

View file

@ -462,5 +462,51 @@ var _ = Describe("Mock Backend E2E Tests", Label("MockBackend"), func() {
Expect(fn["name"]).To(Equal("search_collections"))
})
})
// Regression test: thinking model (Gemma 4-style) with tools, where the
// model responds with content only (no tool calls). The C++ autoparser
// puts clean content in Message AND reasoning+content in ChatDeltas.
// Bug: Go-side PrependThinkingTokenIfNeeded prepends <|channel>thought
// to the clean content, causing it to be classified as unclosed reasoning,
// leading to "Backend produced reasoning without actionable content, retrying".
Context("Non-streaming thinking model with tools and ChatDelta content (no tool calls)", func() {
It("should return content without retrying", func() {
body := `{
"model": "mock-model-thinking-autoparser",
"messages": [{"role": "user", "content": "AUTOPARSER_THINKING_CONTENT"}],
"tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}]
}`
req, err := http.NewRequest("POST", apiURL+"/chat/completions", strings.NewReader(body))
Expect(err).ToNot(HaveOccurred())
req.Header.Set("Content-Type", "application/json")
httpClient := &http.Client{Timeout: 60 * time.Second}
resp, err := httpClient.Do(req)
Expect(err).ToNot(HaveOccurred())
defer resp.Body.Close()
Expect(resp.StatusCode).To(Equal(200))
data, err := io.ReadAll(resp.Body)
Expect(err).ToNot(HaveOccurred())
var result map[string]any
Expect(json.Unmarshal(data, &result)).To(Succeed())
choices, ok := result["choices"].([]any)
Expect(ok).To(BeTrue(), "Expected choices array, got: %s", string(data))
Expect(choices).To(HaveLen(1))
choice := choices[0].(map[string]any)
msg, _ := choice["message"].(map[string]any)
Expect(msg).ToNot(BeNil())
content, _ := msg["content"].(string)
Expect(content).ToNot(BeEmpty(),
"Expected non-empty content in thinking model response with tools, "+
"but got empty content. Full response: %s", string(data))
Expect(content).To(ContainSubstring("helpful AI assistant"),
"Expected content to contain the model's response text, got: %s", content)
})
})
})
})