fix(vision): propagate mtmd media marker from backend via ModelMetadata (#9412)

Upstream llama.cpp (PR #21962) switched the server-side mtmd media
marker to a random per-server string and removed the legacy
"<__media__>" backward-compat replacement in mtmd_tokenizer. The
Go layer still emitted the hardcoded "<__media__>", so on the
non-tokenizer-template path the prompt arrived with a marker mtmd
did not recognize and tokenization failed with "number of bitmaps
(1) does not match number of markers (0)".

Report the active media marker via ModelMetadataResponse.media_marker
and substitute the sentinel "<__media__>" with it right before the
gRPC call, after the backend has been loaded and probed. Also skip
the Go-side multimodal templating entirely when UseTokenizerTemplate
is true — llama.cpp's oaicompat_chat_params_parse already injects its
own marker and StringContent is unused in that path. Backends that do
not expose the field keep the legacy "<__media__>" behavior.
This commit is contained in:
Ettore Di Giacinto 2026-04-18 20:30:13 +02:00 committed by GitHub
parent ad742738cb
commit 7809c5f5d0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 96 additions and 20 deletions

View file

@ -557,6 +557,7 @@ message ModelMetadataResponse {
bool supports_thinking = 1;
string rendered_template = 2; // The rendered chat template with enable_thinking=true (empty if not applicable)
ToolFormatMarkers tool_format = 3; // Auto-detected tool format markers from differential template analysis
string media_marker = 4; // Marker the backend expects in the prompt for each multimodal input (images/audio/video). Empty when the backend does not use a marker.
}
// Fine-tuning messages

View file

@ -2814,6 +2814,13 @@ public:
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
// Report the active multimodal media marker so the Go layer can emit the
// same string when rendering prompts outside the tokenizer-template path.
// Only meaningful when an mtmd context was initialized (vision/audio models).
if (ctx_server.impl->mctx != nullptr) {
response->set_media_marker(get_media_marker());
}
// Check if chat templates are initialized
if (ctx_server.impl->chat_params.tmpls == nullptr) {
// If templates are not initialized, we can't detect thinking support

View file

@ -15,6 +15,7 @@ import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/core/services/galleryop"
"github.com/mudler/LocalAI/core/templates"
"github.com/mudler/LocalAI/core/trace"
"github.com/mudler/LocalAI/core/gallery"
@ -94,15 +95,25 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
return nil, err
}
// Detect thinking support after model load (only if not already detected)
// This needs to happen after LoadModel succeeds so the backend can render templates
if (c.ReasoningConfig.DisableReasoning == nil && c.ReasoningConfig.DisableReasoningTagPrefill == nil) && c.TemplateConfig.UseTokenizerTemplate {
// Probe the backend for model-scoped metadata after LoadModel succeeds.
// Two signals are captured: thinking-mode detection (only meaningful when the
// tokenizer template path is active) and the multimodal media marker (needed
// by custom chat templates so markers line up with what mtmd expects).
// We probe whenever any of those slots is still empty.
needsThinkingProbe := c.TemplateConfig.UseTokenizerTemplate &&
c.ReasoningConfig.DisableReasoning == nil &&
c.ReasoningConfig.DisableReasoningTagPrefill == nil
needsMarkerProbe := c.MediaMarker == ""
if needsThinkingProbe || needsMarkerProbe {
modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath)
config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
// Update the config in the loader so it persists for future requests
cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) {
cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
if c.MediaMarker != "" {
cfg.MediaMarker = c.MediaMarker
}
})
}
@ -121,7 +132,17 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
for k, v := range metadata {
opts.Metadata[k] = v
}
opts.Prompt = s
// The prompt was rendered with the sentinel "<__media__>" marker because
// middleware templating runs before the backend is loaded and probed.
// Once we know the backend's actual media marker, substitute so marker
// count matches the bitmap count passed through opts.Images/Videos/Audios.
// No-op when MediaMarker is unset, matches the sentinel, or the prompt has
// no media placeholders.
prompt := s
if c.MediaMarker != "" && c.MediaMarker != templates.DefaultMultiMediaMarker {
prompt = strings.ReplaceAll(prompt, templates.DefaultMultiMediaMarker, c.MediaMarker)
}
opts.Prompt = prompt
opts.Messages = protoMessages
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
opts.Images = images

View file

@ -84,6 +84,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
// if the model supports thinking mode and if the template ends with a thinking start token.
// This should be called after the model is loaded.
// The results are stored in cfg.SupportsThinking and cfg.ThinkingForcedOpen.
// The backend-reported multimodal marker is also captured into cfg.MediaMarker.
func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, backendClient grpc.Backend, modelOptions *pb.ModelOptions) {
if backendClient == nil {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: backend client is nil, skipping detection")
@ -95,9 +96,10 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac
return
}
// Only detect for llama-cpp backend when using tokenizer templates
if cfg.Backend != "llama-cpp" || !cfg.TemplateConfig.UseTokenizerTemplate {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend, "useTokenizerTemplate", cfg.TemplateConfig.UseTokenizerTemplate)
// Only llama-cpp exposes ModelMetadata today. Other backends will either error
// or return an empty response — both are fine, we just bail before calling.
if cfg.Backend != "llama-cpp" {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend)
return
}
@ -108,6 +110,21 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac
}
if metadata != nil {
// The multimodal media marker is backend-controlled (llama.cpp may pick a
// random per-server string). Empty means "no mtmd context" — Go falls back
// to templates.DefaultMultiMediaMarker at render time.
if metadata.MediaMarker != "" {
cfg.MediaMarker = metadata.MediaMarker
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: media marker captured", "marker", metadata.MediaMarker)
}
// Thinking / tool-format detection only applies when we rely on the
// backend-side tokenizer template — otherwise the rendered-template based
// heuristics below aren't meaningful.
if !cfg.TemplateConfig.UseTokenizerTemplate {
return
}
cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking)
// Use the rendered template to detect if thinking token is at the end

View file

@ -52,6 +52,12 @@ type ModelConfig struct {
ResponseFormat string `yaml:"-" json:"-"`
ResponseFormatMap map[string]any `yaml:"-" json:"-"`
// MediaMarker is the runtime-discovered multimodal marker the backend expects
// in the prompt (e.g. "<__media__>" or a random "<__media_<rand>__>" picked by
// llama.cpp). Populated on first successful ModelMetadata call. Empty until
// then — callers must fall back to templates.DefaultMultiMediaMarker.
MediaMarker string `yaml:"-" json:"-"`
FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"`
ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"`

View file

@ -1179,7 +1179,7 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation,
nrOfImgsInMessage++
}
}
if nrOfImgsInMessage > 0 {
if nrOfImgsInMessage > 0 && !config.TemplateConfig.UseTokenizerTemplate {
templated, err := templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
TotalImages: imgIndex,
ImagesInMessage: nrOfImgsInMessage,

View file

@ -709,8 +709,10 @@ func convertORMessageItem(itemMap map[string]any, cfg *config.ModelConfig) (sche
msg.StringVideos = stringVideos
msg.StringAudios = stringAudios
// Template multimodal content
if len(stringImages) > 0 || len(stringVideos) > 0 || len(stringAudios) > 0 {
// Template multimodal content. Skipped when the backend handles templating
// itself (UseTokenizerTemplate) — it also injects markers server-side and
// StringContent is not consumed by the evaluator in that path.
if (len(stringImages) > 0 || len(stringVideos) > 0 || len(stringAudios) > 0) && !cfg.TemplateConfig.UseTokenizerTemplate {
msg.StringContent, _ = templates.TemplateMultiModal(cfg.TemplateConfig.Multimodal, templates.MultiModalOptions{
TotalImages: len(stringImages),
TotalVideos: len(stringVideos),

View file

@ -398,14 +398,23 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema.
}
}
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
TotalImages: imgIndex,
TotalVideos: vidIndex,
TotalAudios: audioIndex,
ImagesInMessage: nrOfImgsInMessage,
VideosInMessage: nrOfVideosInMessage,
AudiosInMessage: nrOfAudiosInMessage,
}, textContent)
// When the backend handles templating itself (UseTokenizerTemplate),
// it also injects media markers server-side (see
// oaicompat_chat_params_parse in llama.cpp). Emitting our own markers
// here would double-mark them and downstream consumers ignore
// StringContent in that path anyway, so just pass through plain text.
if config.TemplateConfig.UseTokenizerTemplate {
input.Messages[i].StringContent = textContent
} else {
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
TotalImages: imgIndex,
TotalVideos: vidIndex,
TotalAudios: audioIndex,
ImagesInMessage: nrOfImgsInMessage,
VideosInMessage: nrOfVideosInMessage,
AudiosInMessage: nrOfAudiosInMessage,
}, textContent)
}
}
}

View file

@ -21,8 +21,21 @@ type MultimodalContent struct {
ID int
}
// https://github.com/ggml-org/llama.cpp/blob/be1d4a13db26750fac702ceb3af88ae4f39dc9f4/tools/mtmd/mtmd.h#L42
// from <__image__> to <__media__> https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83
// DefaultMultiMediaMarker is the sentinel marker LocalAI emits in the rendered
// prompt for each image/audio item. It matches llama.cpp's historical
// mtmd_default_marker() ("<__media__>"). llama.cpp's server now picks a random
// per-server marker (see PR #21962) and reports it via ModelMetadataResponse.media_marker;
// callers substitute this sentinel with the backend-reported marker right before
// the gRPC call (core/backend/llm.go).
const DefaultMultiMediaMarker = "<__media__>"
// DefaultMultiModalTemplate renders a per-message media-marker prefix followed
// by the text content. The sentinel marker is substituted late, so this
// template does not need to know the backend-specific marker.
//
// The template is built from DefaultMultiMediaMarker (rather than repeating the
// literal) so the sentinel emitted here can never drift from the one the
// substitution step in core/backend/llm.go searches for.
//
// References:
// - https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83
// - https://github.com/ggml-org/llama.cpp/pull/21962
const DefaultMultiModalTemplate = "{{ range .Audio }}" + DefaultMultiMediaMarker + "{{end}}{{ range .Images }}" + DefaultMultiMediaMarker + "{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"
func TemplateMultiModal(templateString string, opts MultiModalOptions, text string) (string, error) {