fix(anthropic): do not emit empty tokens and fix SSE tool calls (#9258)

This fixes Claude Code compatibility

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2026-04-07 00:38:21 +02:00 committed by GitHub
parent 33b124c6f1
commit 0f9d516a6c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 296 additions and 4 deletions

View file

@@ -3,6 +3,8 @@ package anthropic
import (
"encoding/json"
"fmt"
"sync"
"time"
"github.com/google/uuid"
"github.com/labstack/echo/v4"
@@ -366,7 +368,33 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
// Collect tool calls for MCP execution
var collectedToolCalls []functions.FuncCallResults
// SSE keepalive: send comment pings every 3s until the first token arrives.
// This prevents clients (e.g. Claude Code) from timing out while the model loads or processes the prompt.
firstTokenReceived := make(chan struct{})
keepaliveDone := make(chan struct{})
go func() {
defer close(keepaliveDone)
ticker := time.NewTicker(3 * time.Second)
defer ticker.Stop()
for {
select {
case <-firstTokenReceived:
return
case <-c.Request().Context().Done():
return
case <-ticker.C:
fmt.Fprintf(c.Response().Writer, "event: ping\ndata: {\"type\": \"ping\"}\n\n")
c.Response().Flush()
}
}
}()
firstTokenOnce := sync.Once{}
tokenCallback := func(token string, usage backend.TokenUsage) bool {
firstTokenOnce.Do(func() {
close(firstTokenReceived)
<-keepaliveDone // wait for keepalive goroutine to exit before writing
})
accumulatedContent += token
if shouldUseFn {
@@ -414,7 +442,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
}
}
if !inToolCall {
if !inToolCall && token != "" {
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_delta",
Index: intPtr(0),
@@ -433,6 +461,11 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
openAIReq.Metadata = input.Metadata
_, tokenUsage, chatDeltas, err := openaiEndpoint.ComputeChoices(openAIReq, predInput, cfg, cl, appConfig, ml, func(s string, c *[]schema.Choice) {}, tokenCallback)
// Stop the keepalive goroutine now that inference is done
firstTokenOnce.Do(func() { close(firstTokenReceived) })
<-keepaliveDone
if err != nil {
xlog.Error("Anthropic stream model inference failed", "error", err)
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
@@ -445,9 +478,68 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
return nil
}
// Also check chat deltas for tool calls
if deltaToolCalls := functions.ToolCallsFromChatDeltas(chatDeltas); len(deltaToolCalls) > 0 && len(collectedToolCalls) == 0 {
collectedToolCalls = deltaToolCalls
// Check chat deltas from C++ autoparser — when active, the raw
// message is cleared and content/tool calls arrive via ChatDeltas.
if len(chatDeltas) > 0 {
deltaContent := functions.ContentFromChatDeltas(chatDeltas)
deltaToolCalls := functions.ToolCallsFromChatDeltas(chatDeltas)
// Emit text content from ChatDeltas only when the tokenCallback
// didn't already stream it (autoparser clears raw text, so
// accumulatedContent will be empty in that case).
if deltaContent != "" && !inToolCall && accumulatedContent == "" {
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_delta",
Index: intPtr(0),
Delta: &schema.AnthropicStreamDelta{
Type: "text_delta",
Text: deltaContent,
},
})
}
// Emit tool_use blocks from ChatDeltas
if len(deltaToolCalls) > 0 && len(collectedToolCalls) == 0 {
collectedToolCalls = deltaToolCalls
if !inToolCall && currentBlockIndex == 0 {
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_stop",
Index: intPtr(currentBlockIndex),
})
currentBlockIndex++
inToolCall = true
}
for i, tc := range deltaToolCalls {
toolCallID := tc.ID
if toolCallID == "" {
toolCallID = fmt.Sprintf("toolu_%s_%d", id, i)
}
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_start",
Index: intPtr(currentBlockIndex),
ContentBlock: &schema.AnthropicContentBlock{
Type: "tool_use",
ID: toolCallID,
Name: tc.Name,
},
})
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_delta",
Index: intPtr(currentBlockIndex),
Delta: &schema.AnthropicStreamDelta{
Type: "input_json_delta",
PartialJSON: tc.Arguments,
},
})
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_stop",
Index: intPtr(currentBlockIndex),
})
currentBlockIndex++
toolCallsEmitted++
}
}
}
// MCP streaming tool execution: if we collected MCP tool calls, execute and loop

View file

@@ -166,6 +166,67 @@ This section provides step-by-step instructions for configuring specific softwar
After saving the configuration file, restart OpenCode for the changes to take effect.
### Claude Code
[Claude Code](https://docs.anthropic.com/en/docs/claude-code) is Anthropic's official CLI tool for coding with Claude. LocalAI implements the Anthropic Messages API (`/v1/messages`), so Claude Code can be pointed directly at a LocalAI instance.
#### Prerequisites
- LocalAI must be running and accessible (either locally or on a network)
- You need to know your LocalAI server's IP address/hostname and port (default is `8080`)
- An API key configured in your LocalAI instance
#### Running Claude Code with LocalAI
Set the `ANTHROPIC_BASE_URL` and `ANTHROPIC_API_KEY` environment variables to point Claude Code at your LocalAI server:
```bash
ANTHROPIC_BASE_URL=http://127.0.0.1:8080 \
ANTHROPIC_API_KEY=your-localai-api-key \
claude --model your-model-name
```
For example, if you have a Gemma model loaded:
```bash
ANTHROPIC_BASE_URL=http://127.0.0.1:8080 \
ANTHROPIC_API_KEY=your-localai-api-key \
claude --model gemma-4-12B-it-GGUF
```
You can also run a single prompt non-interactively:
```bash
ANTHROPIC_BASE_URL=http://127.0.0.1:8080 \
ANTHROPIC_API_KEY=your-localai-api-key \
claude -p "list the files in /tmp" --model your-model-name
```
#### Configuration
To avoid setting environment variables every time, you can add them to your shell profile (e.g., `~/.bashrc` or `~/.zshrc`):
```bash
export ANTHROPIC_BASE_URL=http://127.0.0.1:8080
export ANTHROPIC_API_KEY=your-localai-api-key
```
#### Verify available models
Check which models are available in your LocalAI instance:
```bash
curl http://127.0.0.1:8080/v1/models
```
Use one of the listed model IDs as the `--model` argument.
#### Notes
- Models with tool calling support (e.g., Gemma 4, Qwen 3) work best, as Claude Code relies heavily on tool use for file operations and code editing.
- Larger models generally produce better results for complex coding tasks.
- The Anthropic Messages API endpoint supports both streaming and non-streaming modes.
### Charm Crush
You can ask [Charm Crush](https://charm.land/crush) to generate your config by giving it this documentation's URL and your LocalAI instance URL. The configuration will look something like the following and goes in `~/.config/crush/crush.json`:

View file

@@ -383,5 +383,144 @@ var _ = Describe("Anthropic API E2E test", func() {
Expect(string(message.StopReason)).To(Equal("tool_use"))
})
})
// E2E coverage for the ChatDeltas path: when the C++ autoparser is active,
// the raw token stream is cleared and content/tool calls arrive as structured
// deltas, which the Anthropic endpoint must translate into SSE events.
Context("ChatDeltas (C++ autoparser)", func() {
// Tool calls delivered via ChatDeltas must surface as tool_use
// content_block_start events and accumulate into the final message.
It("streams tool calls via ChatDeltas", func() {
stream := client.Messages.NewStreaming(context.TODO(), anthropic.MessageNewParams{
Model: "mock-model-autoparser",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
// Prompt prefix triggers the mock backend's autoparser tool-call path.
anthropic.NewUserMessage(anthropic.NewTextBlock("AUTOPARSER_TOOL_CALL What's the weather like in San Francisco?")),
},
Tools: []anthropic.ToolUnionParam{
anthropic.ToolUnionParam{
OfTool: &anthropic.ToolParam{
Name: "get_weather",
Description: anthropic.Opt("Get the current weather in a given location"),
InputSchema: anthropic.ToolInputSchemaParam{
Type: constant.ValueOf[constant.Object](),
Properties: map[string]any{
"location": map[string]any{
"type": "string",
"description": "The city and state",
},
},
Required: []string{"location"},
},
},
},
},
})
message := anthropic.Message{}
hasToolUseStart := false
// Drain the stream, accumulating every event into `message` and
// watching for a tool_use content_block_start along the way.
for stream.Next() {
event := stream.Current()
err := message.Accumulate(event)
Expect(err).ToNot(HaveOccurred())
if e, ok := event.AsAny().(anthropic.ContentBlockStartEvent); ok {
if e.ContentBlock.Type == "tool_use" {
hasToolUseStart = true
}
}
}
Expect(stream.Err()).ToNot(HaveOccurred())
Expect(hasToolUseStart).To(BeTrue(), "Should have tool_use content_block_start event from ChatDeltas")
Expect(string(message.StopReason)).To(Equal("tool_use"))
// Verify tool call is present in accumulated message
foundToolUse := false
for _, block := range message.Content {
if block.Type == "tool_use" {
foundToolUse = true
// Tool-use blocks must carry a non-empty ID (synthesized
// server-side when the backend omits one).
Expect(block.ID).ToNot(BeEmpty())
}
}
Expect(foundToolUse).To(BeTrue(), "Accumulated message should contain tool_use block from ChatDeltas")
})
// Text content arriving via ChatDeltas must be emitted exactly once,
// guarding against double emission from both the token callback and
// the delta path.
It("streams content via ChatDeltas without duplication", func() {
stream := client.Messages.NewStreaming(context.TODO(), anthropic.MessageNewParams{
Model: "mock-model-autoparser",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
// Prompt prefix triggers the mock backend's autoparser text-content path.
anthropic.NewUserMessage(anthropic.NewTextBlock("AUTOPARSER_CONTENT Tell me about LocalAI")),
},
})
message := anthropic.Message{}
var textDeltas []string
for stream.Next() {
event := stream.Current()
err := message.Accumulate(event)
Expect(err).ToNot(HaveOccurred())
if e, ok := event.AsAny().(anthropic.ContentBlockDeltaEvent); ok {
if e.Delta.Type == "text_delta" && e.Delta.Text != "" {
textDeltas = append(textDeltas, e.Delta.Text)
}
}
}
Expect(stream.Err()).ToNot(HaveOccurred())
Expect(message.Content).ToNot(BeEmpty())
Expect(string(message.StopReason)).To(Equal("end_turn"))
// Content should appear exactly once (no duplication)
fullText := ""
for _, block := range message.Content {
if block.Type == "text" {
fullText += block.Text
}
}
Expect(fullText).To(ContainSubstring("LocalAI"))
// Check that the content is not duplicated by counting occurrences
// NOTE(review): the 200-char bound assumes the mock reply is short
// enough that a duplicate would exceed it — confirm against the mock.
Expect(len(fullText)).To(BeNumerically("<", 200), "Content should not be duplicated")
})
// Same ChatDeltas tool-call scenario, exercised through the
// non-streaming /v1/messages request path.
It("handles tool calls via ChatDeltas in non-streaming mode", func() {
message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
Model: "mock-model-autoparser",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("AUTOPARSER_TOOL_CALL What's the weather like in San Francisco?")),
},
Tools: []anthropic.ToolUnionParam{
anthropic.ToolUnionParam{
OfTool: &anthropic.ToolParam{
Name: "get_weather",
Description: anthropic.Opt("Get the current weather"),
InputSchema: anthropic.ToolInputSchemaParam{
Type: constant.ValueOf[constant.Object](),
Properties: map[string]any{
"location": map[string]any{
"type": "string",
},
},
Required: []string{"location"},
},
},
},
},
})
Expect(err).ToNot(HaveOccurred())
Expect(message.Content).ToNot(BeEmpty())
Expect(string(message.StopReason)).To(Equal("tool_use"))
foundToolUse := false
for _, block := range message.Content {
if block.Type == "tool_use" {
foundToolUse = true
Expect(block.ID).ToNot(BeEmpty())
}
}
Expect(foundToolUse).To(BeTrue(), "Should have tool_use block from ChatDeltas")
})
})
})
})