fix: thinking models with tools returning empty content (reasoning-only retry loop) (#9290)

When clients like Nextcloud or Home Assistant send requests with tools
to thinking models (e.g. Gemma 4 with <|channel>thought tags), the
response was empty despite the backend producing valid content.

Root cause: the C++ autoparser puts clean content in both the raw
Response and ChatDeltas. The Go-side PrependThinkingTokenIfNeeded
then prepends the thinking start token to the already-clean content,
causing ExtractReasoning to classify the entire response as unclosed
reasoning. This makes cbRawResult empty, triggering a retry loop that
never succeeds.

Two fixes:
- inference.go: check ChatDeltas for content/tool_calls regardless of
  whether Response is empty, so skipCallerRetry fires correctly
- chat.go: when ChatDeltas have content but no tool calls, use that
  content directly instead of falling back to the empty cbRawResult.
This commit is contained in:
Ettore Di Giacinto 2026-04-09 18:30:31 +02:00 committed by GitHub
parent 85be4ff03c
commit 13a6ed709c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 151 additions and 8 deletions

View file

@ -1045,6 +1045,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
funcResults = deltaToolCalls
textContentToReturn = functions.ContentFromChatDeltas(chatDeltas)
cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas)
} else if deltaContent := functions.ContentFromChatDeltas(chatDeltas); len(chatDeltas) > 0 && deltaContent != "" {
// ChatDeltas have content but no tool calls — model answered without using tools.
// This happens with thinking models (e.g. Gemma 4) where the Go-side reasoning
// extraction misclassifies clean content as reasoning, leaving cbRawResult empty.
xlog.Debug("[ChatDeltas] non-SSE: using C++ autoparser content (no tool calls)", "content_len", len(deltaContent))
textContentToReturn = deltaContent
cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas)
} else {
// Fallback: parse tool calls from raw text
xlog.Debug("[ChatDeltas] non-SSE: no chat deltas, falling back to Go-side text parsing")
@ -1067,7 +1074,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
switch {
case noActionsToRun:
qResult, qErr := handleQuestion(config, funcResults, cbRawResult, predInput)
// Use textContentToReturn if available (e.g. from ChatDeltas),
// otherwise fall back to cbRawResult for legacy Go-side parsing.
questionInput := cbRawResult
if textContentToReturn != "" {
questionInput = textContentToReturn
}
qResult, qErr := handleQuestion(config, funcResults, questionInput, predInput)
if qErr != nil {
xlog.Error("error handling question", "error", qErr)
}

View file

@ -143,12 +143,15 @@ func ComputeChoices(
cb(finetunedResponse, &result)
// Caller-driven retry (tool parsing, reasoning-only, etc.).
// When the C++ autoparser is active, it clears the raw response
// and delivers data via ChatDeltas. If the response is empty but
// ChatDeltas contain actionable data, skip the caller retry —
// the autoparser already parsed the response successfully.
// When the C++ autoparser is active, it may deliver parsed data
// via ChatDeltas while also keeping the raw response. If ChatDeltas
// contain actionable data (content or tool calls), skip the caller
// retry — the autoparser already parsed the response successfully.
// Note: we check ChatDeltas regardless of whether Response is empty,
// because thinking models (e.g. Gemma 4) produce a non-empty Response
// that the Go-side reasoning extraction can misclassify as reasoning-only.
skipCallerRetry := false
if strings.TrimSpace(prediction.Response) == "" && len(prediction.ChatDeltas) > 0 {
if len(prediction.ChatDeltas) > 0 {
for _, d := range prediction.ChatDeltas {
if d.Content != "" || len(d.ToolCalls) > 0 {
skipCallerRetry = true

View file

@ -242,11 +242,13 @@ var _ = Describe("ComputeChoices", func() {
})
Context("chat deltas from latest attempt", func() {
It("should return chat deltas from the last attempt only", func() {
It("should return chat deltas from the last attempt when retry is allowed", func() {
// When the first attempt has only reasoning (no content/tool calls),
// the caller-driven retry proceeds and we get deltas from the last attempt.
mockInference([]backend.LLMResponse{
{
Response: "retry-me",
ChatDeltas: []*pb.ChatDelta{{Content: "old"}},
ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
},
{
Response: "final",
@ -266,6 +268,40 @@ var _ = Describe("ComputeChoices", func() {
Expect(deltas).To(HaveLen(1))
Expect(deltas[0].Content).To(Equal("new"))
})
It("should keep first attempt deltas when ChatDeltas have content (skip retry)", func() {
// When the first attempt has content in ChatDeltas, skipCallerRetry
// prevents the retry — the autoparser already parsed successfully.
mockInference([]backend.LLMResponse{
{
Response: "autoparser-content",
ChatDeltas: []*pb.ChatDelta{{Content: "first-content"}},
},
{
Response: "should-not-reach",
ChatDeltas: []*pb.ChatDelta{{Content: "second-content"}},
},
})
retryRequested := false
_, _, deltas, err := ComputeChoices(
makeReq(), "test", cfg, nil, appCfg, nil,
func(s string, c *[]schema.Choice) {
*c = append(*c, schema.Choice{Text: s})
},
nil,
func(attempt int) bool {
retryRequested = true
return true
},
)
Expect(err).ToNot(HaveOccurred())
Expect(retryRequested).To(BeFalse(),
"shouldRetry should not be called when ChatDeltas have content")
Expect(deltas).To(HaveLen(1))
Expect(deltas[0].Content).To(Equal("first-content"))
})
})
Context("result choices cleared on retry", func() {

View file

@ -121,6 +121,31 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())
Expect(os.WriteFile(autoparserPath, autoparserYAML, 0644)).To(Succeed())
// Create model config for thinking model + autoparser tests.
// The chat template ends with <|channel>thought to simulate Gemma 4 thinking models.
// This triggers DetectThinkingStartToken and PrependThinkingTokenIfNeeded in the
// reasoning extraction path, reproducing a bug where clean content from the C++
// autoparser gets misclassified as unclosed reasoning.
thinkingAutoparserConfig := map[string]any{
"name": "mock-model-thinking-autoparser",
"backend": "mock-backend",
"parameters": map[string]any{
"model": "mock-model.bin",
},
"template": map[string]any{
"chat": "{{.Input}}\n<|turn>model\n<|channel>thought\n<channel|>",
},
"function": map[string]any{
"grammar": map[string]any{
"disable": true,
},
},
}
thinkingAutoparserPath := filepath.Join(modelsPath, "mock-model-thinking-autoparser.yaml")
thinkingAutoparserYAML, err := yaml.Marshal(thinkingAutoparserConfig)
Expect(err).ToNot(HaveOccurred())
Expect(os.WriteFile(thinkingAutoparserPath, thinkingAutoparserYAML, 0644)).To(Succeed())
// Start mock MCP server and create MCP-enabled model config
mcpServerURL, mcpServerShutdown = startMockMCPServer()
mcpConfig := mcpModelConfig(mcpServerURL)

View file

@ -95,6 +95,26 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
}, nil
}
// Simulate Gemma 4 / thinking model with C++ autoparser:
// - Message contains the clean content (autoparser extracts it from OAI choices[0].message.content)
// - ChatDeltas contain both reasoning and content separately
// This reproduces the bug where Go-side PrependThinkingTokenIfNeeded
// incorrectly prepends a thinking start token to the clean content,
// causing the entire response to be classified as unclosed reasoning.
if strings.Contains(in.Prompt, "AUTOPARSER_THINKING_CONTENT") {
return &pb.Reply{
Message: []byte("I am a helpful AI assistant designed to assist you with a wide range of tasks."),
Tokens: 20,
PromptTokens: 50,
ChatDeltas: []*pb.ChatDelta{
{
ReasoningContent: "The user is asking a simple introductory question. I should respond directly.",
Content: "I am a helpful AI assistant designed to assist you with a wide range of tasks.",
},
},
}, nil
}
var response string
toolName := mockToolNameFromRequest(in)
if toolName != "" && !promptHasToolResults(in.Prompt) {

View file

@ -462,5 +462,51 @@ var _ = Describe("Mock Backend E2E Tests", Label("MockBackend"), func() {
Expect(fn["name"]).To(Equal("search_collections"))
})
})
// Regression test: thinking model (Gemma 4-style) with tools, where the
// model responds with content only (no tool calls). The C++ autoparser
// puts clean content in Message AND reasoning+content in ChatDeltas.
// Bug: Go-side PrependThinkingTokenIfNeeded prepends <|channel>thought
// to the clean content, causing it to be classified as unclosed reasoning,
// leading to "Backend produced reasoning without actionable content, retrying".
Context("Non-streaming thinking model with tools and ChatDelta content (no tool calls)", func() {
It("should return content without retrying", func() {
body := `{
"model": "mock-model-thinking-autoparser",
"messages": [{"role": "user", "content": "AUTOPARSER_THINKING_CONTENT"}],
"tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}]
}`
req, err := http.NewRequest("POST", apiURL+"/chat/completions", strings.NewReader(body))
Expect(err).ToNot(HaveOccurred())
req.Header.Set("Content-Type", "application/json")
httpClient := &http.Client{Timeout: 60 * time.Second}
resp, err := httpClient.Do(req)
Expect(err).ToNot(HaveOccurred())
defer resp.Body.Close()
Expect(resp.StatusCode).To(Equal(200))
data, err := io.ReadAll(resp.Body)
Expect(err).ToNot(HaveOccurred())
var result map[string]any
Expect(json.Unmarshal(data, &result)).To(Succeed())
choices, ok := result["choices"].([]any)
Expect(ok).To(BeTrue(), "Expected choices array, got: %s", string(data))
Expect(choices).To(HaveLen(1))
choice := choices[0].(map[string]any)
msg, _ := choice["message"].(map[string]any)
Expect(msg).ToNot(BeNil())
content, _ := msg["content"].(string)
Expect(content).ToNot(BeEmpty(),
"Expected non-empty content in thinking model response with tools, "+
"but got empty content. Full response: %s", string(data))
Expect(content).To(ContainSubstring("helpful AI assistant"),
"Expected content to contain the model's response text, got: %s", content)
})
})
})
})