diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index bc66d1aef..b2ec1ac44 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -1045,6 +1045,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator funcResults = deltaToolCalls textContentToReturn = functions.ContentFromChatDeltas(chatDeltas) cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas) + } else if deltaContent := functions.ContentFromChatDeltas(chatDeltas); len(chatDeltas) > 0 && deltaContent != "" { + // ChatDeltas have content but no tool calls — model answered without using tools. + // This happens with thinking models (e.g. Gemma 4) where the Go-side reasoning + // extraction misclassifies clean content as reasoning, leaving cbRawResult empty. + xlog.Debug("[ChatDeltas] non-SSE: using C++ autoparser content (no tool calls)", "content_len", len(deltaContent)) + textContentToReturn = deltaContent + cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas) } else { // Fallback: parse tool calls from raw text xlog.Debug("[ChatDeltas] non-SSE: no chat deltas, falling back to Go-side text parsing") @@ -1067,7 +1074,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator switch { case noActionsToRun: - qResult, qErr := handleQuestion(config, funcResults, cbRawResult, predInput) + // Use textContentToReturn if available (e.g. from ChatDeltas), + // otherwise fall back to cbRawResult for legacy Go-side parsing. + questionInput := cbRawResult + if textContentToReturn != "" { + questionInput = textContentToReturn + } + qResult, qErr := handleQuestion(config, funcResults, questionInput, predInput) if qErr != nil { xlog.Error("error handling question", "error", qErr) } diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go index f18e0e20a..bc770dce7 100644 --- a/core/http/endpoints/openai/inference.go +++ b/core/http/endpoints/openai/inference.go @@ -143,12 +143,15 @@ func ComputeChoices( cb(finetunedResponse, &result) // Caller-driven retry (tool parsing, reasoning-only, etc.). - // When the C++ autoparser is active, it clears the raw response - // and delivers data via ChatDeltas. If the response is empty but - // ChatDeltas contain actionable data, skip the caller retry — - // the autoparser already parsed the response successfully. + // When the C++ autoparser is active, it may deliver parsed data + // via ChatDeltas while also keeping the raw response. If ChatDeltas + // contain actionable data (content or tool calls), skip the caller + // retry — the autoparser already parsed the response successfully. + // Note: we check ChatDeltas regardless of whether Response is empty, + // because thinking models (e.g. Gemma 4) produce a non-empty Response + // that the Go-side reasoning extraction can misclassify as reasoning-only. skipCallerRetry := false - if strings.TrimSpace(prediction.Response) == "" && len(prediction.ChatDeltas) > 0 { + if len(prediction.ChatDeltas) > 0 { for _, d := range prediction.ChatDeltas { if d.Content != "" || len(d.ToolCalls) > 0 { skipCallerRetry = true diff --git a/core/http/endpoints/openai/inference_test.go b/core/http/endpoints/openai/inference_test.go index f540a4d8f..a7ead8fd1 100644 --- a/core/http/endpoints/openai/inference_test.go +++ b/core/http/endpoints/openai/inference_test.go @@ -242,11 +242,13 @@ var _ = Describe("ComputeChoices", func() { }) Context("chat deltas from latest attempt", func() { - It("should return chat deltas from the last attempt only", func() { + It("should return chat deltas from the last attempt when retry is allowed", func() { + // When the first attempt has only reasoning (no content/tool calls), + // the caller-driven retry proceeds and we get deltas from the last attempt. mockInference([]backend.LLMResponse{ { Response: "retry-me", - ChatDeltas: []*pb.ChatDelta{{Content: "old"}}, + ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}}, }, { Response: "final", @@ -266,6 +268,40 @@ var _ = Describe("ComputeChoices", func() { Expect(deltas).To(HaveLen(1)) Expect(deltas[0].Content).To(Equal("new")) }) + + It("should keep first attempt deltas when ChatDeltas have content (skip retry)", func() { + // When the first attempt has content in ChatDeltas, skipCallerRetry + // prevents the retry — the autoparser already parsed successfully. + mockInference([]backend.LLMResponse{ + { + Response: "autoparser-content", + ChatDeltas: []*pb.ChatDelta{{Content: "first-content"}}, + }, + { + Response: "should-not-reach", + ChatDeltas: []*pb.ChatDelta{{Content: "second-content"}}, + }, + }) + + retryRequested := false + _, _, deltas, err := ComputeChoices( + makeReq(), "test", cfg, nil, appCfg, nil, + func(s string, c *[]schema.Choice) { + *c = append(*c, schema.Choice{Text: s}) + }, + nil, + func(attempt int) bool { + retryRequested = true + return true + }, + ) + Expect(err).ToNot(HaveOccurred()) + Expect(retryRequested).To(BeFalse(), + "shouldRetry should not be called when ChatDeltas have content") + Expect(deltas).To(HaveLen(1)) + Expect(deltas[0].Content).To(Equal("first-content")) + }) + }) Context("result choices cleared on retry", func() { diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go index c68484ae1..65af629e0 100644 --- a/tests/e2e/e2e_suite_test.go +++ b/tests/e2e/e2e_suite_test.go @@ -121,6 +121,31 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) Expect(os.WriteFile(autoparserPath, autoparserYAML, 0644)).To(Succeed()) + // Create model config for thinking model + autoparser tests. + // The chat template ends with <|channel>thought to simulate Gemma 4 thinking models. + // This triggers DetectThinkingStartToken and PrependThinkingTokenIfNeeded in the + // reasoning extraction path, reproducing a bug where clean content from the C++ + // autoparser gets misclassified as unclosed reasoning. + thinkingAutoparserConfig := map[string]any{ + "name": "mock-model-thinking-autoparser", + "backend": "mock-backend", + "parameters": map[string]any{ + "model": "mock-model.bin", + }, + "template": map[string]any{ + "chat": "{{.Input}}\n<|turn>model\n<|channel>thought\n", + }, + "function": map[string]any{ + "grammar": map[string]any{ + "disable": true, + }, + }, + } + thinkingAutoparserPath := filepath.Join(modelsPath, "mock-model-thinking-autoparser.yaml") + thinkingAutoparserYAML, err := yaml.Marshal(thinkingAutoparserConfig) + Expect(err).ToNot(HaveOccurred()) + Expect(os.WriteFile(thinkingAutoparserPath, thinkingAutoparserYAML, 0644)).To(Succeed()) + // Start mock MCP server and create MCP-enabled model config mcpServerURL, mcpServerShutdown = startMockMCPServer() mcpConfig := mcpModelConfig(mcpServerURL) diff --git a/tests/e2e/mock-backend/main.go b/tests/e2e/mock-backend/main.go index 0db9b49c1..469795e25 100644 --- a/tests/e2e/mock-backend/main.go +++ b/tests/e2e/mock-backend/main.go @@ -95,6 +95,26 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R }, nil } + // Simulate Gemma 4 / thinking model with C++ autoparser: + // - Message contains the clean content (autoparser extracts it from OAI choices[0].message.content) + // - ChatDeltas contain both reasoning and content separately + // This reproduces the bug where Go-side PrependThinkingTokenIfNeeded + // incorrectly prepends a thinking start token to the clean content, + // causing the entire response to be classified as unclosed reasoning. + if strings.Contains(in.Prompt, "AUTOPARSER_THINKING_CONTENT") { + return &pb.Reply{ + Message: []byte("I am a helpful AI assistant designed to assist you with a wide range of tasks."), + Tokens: 20, + PromptTokens: 50, + ChatDeltas: []*pb.ChatDelta{ + { + ReasoningContent: "The user is asking a simple introductory question. I should respond directly.", + Content: "I am a helpful AI assistant designed to assist you with a wide range of tasks.", + }, + }, + }, nil + } + var response string toolName := mockToolNameFromRequest(in) if toolName != "" && !promptHasToolResults(in.Prompt) { diff --git a/tests/e2e/mock_backend_test.go b/tests/e2e/mock_backend_test.go index 6c18bd881..d45301ef0 100644 --- a/tests/e2e/mock_backend_test.go +++ b/tests/e2e/mock_backend_test.go @@ -462,5 +462,51 @@ var _ = Describe("Mock Backend E2E Tests", Label("MockBackend"), func() { Expect(fn["name"]).To(Equal("search_collections")) }) }) + + // Regression test: thinking model (Gemma 4-style) with tools, where the + // model responds with content only (no tool calls). The C++ autoparser + // puts clean content in Message AND reasoning+content in ChatDeltas. + // Bug: Go-side PrependThinkingTokenIfNeeded prepends <|channel>thought + // to the clean content, causing it to be classified as unclosed reasoning, + // leading to "Backend produced reasoning without actionable content, retrying". + Context("Non-streaming thinking model with tools and ChatDelta content (no tool calls)", func() { + It("should return content without retrying", func() { + body := `{ + "model": "mock-model-thinking-autoparser", + "messages": [{"role": "user", "content": "AUTOPARSER_THINKING_CONTENT"}], + "tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}] + }` + req, err := http.NewRequest("POST", apiURL+"/chat/completions", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + httpClient := &http.Client{Timeout: 60 * time.Second} + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + Expect(resp.StatusCode).To(Equal(200)) + + data, err := io.ReadAll(resp.Body) + Expect(err).ToNot(HaveOccurred()) + + var result map[string]any + Expect(json.Unmarshal(data, &result)).To(Succeed()) + + choices, ok := result["choices"].([]any) + Expect(ok).To(BeTrue(), "Expected choices array, got: %s", string(data)) + Expect(choices).To(HaveLen(1)) + + choice := choices[0].(map[string]any) + msg, _ := choice["message"].(map[string]any) + Expect(msg).ToNot(BeNil()) + + content, _ := msg["content"].(string) + Expect(content).ToNot(BeEmpty(), + "Expected non-empty content in thinking model response with tools, "+ + "but got empty content. Full response: %s", string(data)) + Expect(content).To(ContainSubstring("helpful AI assistant"), + "Expected content to contain the model's response text, got: %s", content) + }) + }) }) })