fix(traces): cap captured body size to keep admin Traces UI responsive (#9946)

The trace middleware buffered the full request and response bodies for every JSON exchange. With a chatty agent-pool RAG workload, /embeddings responses (large vector arrays) accumulated to tens of MB in the in-memory buffer; the admin Traces page would then download and parse 40+ MB on every load and on every 5s auto-refresh, locking the UI in a loading state. Add LOCALAI_TRACING_MAX_BODY_BYTES (default 64 KiB) that caps each captured body. The full payload still flows through to the real client; only the trace copy is bounded. Exchanges record body_truncated and original body_bytes so the dashboard can show that truncation happened. The cap is configurable via env, CLI, and runtime_settings.json. Also unblock recovery: the Traces page now keeps the Clear button enabled while loading, since "buffer too large to render" is exactly when the user needs to clear it. Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-24 09:28:23 +00:00 · 2026-05-22 15:29:24 +02:00 · 2026-05-22 15:29:24 +02:00 · 61bf34ea2f
commit 61bf34ea2f
parent 0b2ae3c6ca
7 changed files with 212 additions and 19 deletions
--- a/core/application/startup.go
+++ b/core/application/startup.go
@ -552,6 +552,13 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 			options.TracingMaxItems = *settings.TracingMaxItems
 		}
 	}
+	if settings.TracingMaxBodyBytes != nil {
+		// Allow the on-disk setting to override the CLI/env default. The
+		// startup default is non-zero (see NewApplicationConfig), so a plain
+		// `== 0` guard like the others would never trigger; we instead respect
+		// any value the file specifies. 0 in the file means "uncapped".
+		options.TracingMaxBodyBytes = *settings.TracingMaxBodyBytes
+	}

 	// Branding / whitelabeling. There are no env vars for these — the file is
 	// the only source — so apply unconditionally. Without this block a server
--- a/core/cli/run.go
+++ b/core/cli/run.go
@ -100,6 +100,7 @@ type RunCMD struct {
 	LoadToMemory                       []string `env:"LOCALAI_LOAD_TO_MEMORY,LOAD_TO_MEMORY" help:"A list of models to load into memory at startup" group:"models"`
 	EnableTracing                      bool     `env:"LOCALAI_ENABLE_TRACING,ENABLE_TRACING" help:"Enable API tracing" group:"api"`
 	TracingMaxItems                    int      `env:"LOCALAI_TRACING_MAX_ITEMS" default:"1024" help:"Maximum number of traces to keep" group:"api"`
+	TracingMaxBodyBytes                int      `env:"LOCALAI_TRACING_MAX_BODY_BYTES" default:"65536" help:"Maximum bytes captured per request/response body in the trace buffer (0 = uncapped). Caps memory growth from chatty endpoints like /embeddings." group:"api"`
 	AgentJobRetentionDays              int      `env:"LOCALAI_AGENT_JOB_RETENTION_DAYS,AGENT_JOB_RETENTION_DAYS" default:"30" help:"Number of days to keep agent job history (default: 30)" group:"api"`
 	OpenResponsesStoreTTL              string   `env:"LOCALAI_OPEN_RESPONSES_STORE_TTL,OPEN_RESPONSES_STORE_TTL" default:"0" help:"TTL for Open Responses store (e.g., 1h, 30m, 0 = no expiration)" group:"api"`

@ -273,6 +274,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		opts = append(opts, config.EnableTracing)
 	}
 	opts = append(opts, config.WithTracingMaxItems(r.TracingMaxItems))
+	opts = append(opts, config.WithTracingMaxBodyBytes(r.TracingMaxBodyBytes))

 	token := ""
 	if r.Peer2Peer || r.Peer2PeerToken != "" {
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@ -21,6 +21,7 @@ type ApplicationConfig struct {
 	Debug                               bool
 	EnableTracing                       bool
 	TracingMaxItems                     int
+	TracingMaxBodyBytes                 int // Per-body cap for captured request/response bodies; 0 disables the cap
 	EnableBackendLogging                bool
 	GeneratedContentDir                 string

@ -187,6 +188,7 @@ func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
 		LRUEvictionRetryInterval: 1 * time.Second,        // Default: 1 second
 		WatchDogInterval:         500 * time.Millisecond, // Default: 500ms
 		TracingMaxItems:          1024,
+		TracingMaxBodyBytes:      64 * 1024, // 64 KiB - caps each request/response body in the trace buffer
 		AgentPool: AgentPoolConfig{
 			Enabled:         true,
 			Timeout:         "5m",
@ -578,6 +580,12 @@ func WithTracingMaxItems(items int) AppOption {
 	}
 }

+// WithTracingMaxBodyBytes builds the AppOption that records the per-body
+// trace capture cap, in bytes, on the application config. The value is
+// stored exactly as passed.
+func WithTracingMaxBodyBytes(bytes int) AppOption {
+	setCap := func(cfg *ApplicationConfig) {
+		cfg.TracingMaxBodyBytes = bytes
+	}
+	return setCap
+}
+
 func WithGeneratedContentDir(generatedContentDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.GeneratedContentDir = generatedContentDir
@ -920,6 +928,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	f16 := o.F16
 	debug := o.Debug
 	tracingMaxItems := o.TracingMaxItems
+	tracingMaxBodyBytes := o.TracingMaxBodyBytes
 	enableTracing := o.EnableTracing
 	enableBackendLogging := o.EnableBackendLogging
 	cors := o.CORS
@ -1008,6 +1017,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		F16:                       &f16,
 		Debug:                     &debug,
 		TracingMaxItems:           &tracingMaxItems,
+		TracingMaxBodyBytes:       &tracingMaxBodyBytes,
 		EnableTracing:             &enableTracing,
 		EnableBackendLogging:      &enableBackendLogging,
 		CORS:                      &cors,
@ -1146,6 +1156,9 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
 	if settings.TracingMaxItems != nil {
 		o.TracingMaxItems = *settings.TracingMaxItems
 	}
+	if settings.TracingMaxBodyBytes != nil {
+		o.TracingMaxBodyBytes = *settings.TracingMaxBodyBytes
+	}
 	if settings.EnableBackendLogging != nil {
 		o.EnableBackendLogging = *settings.EnableBackendLogging
 	}
--- a/core/config/runtime_settings.go
+++ b/core/config/runtime_settings.go
@ -38,6 +38,7 @@ type RuntimeSettings struct {
 	Debug                *bool `json:"debug,omitempty"`
 	EnableTracing        *bool `json:"enable_tracing,omitempty"`
 	TracingMaxItems      *int  `json:"tracing_max_items,omitempty"`
+	TracingMaxBodyBytes  *int  `json:"tracing_max_body_bytes,omitempty"` // Per-body cap in bytes; 0 disables the cap
 	EnableBackendLogging *bool `json:"enable_backend_logging,omitempty"`

 	// Security/CORS settings
--- a/core/http/middleware/trace.go
+++ b/core/http/middleware/trace.go
@ -17,16 +17,20 @@ import (
 )

 type APIExchangeRequest struct {
-	Method  string       `json:"method"`
-	Path    string       `json:"path"`
-	Headers *http.Header `json:"headers"`
-	Body    *[]byte      `json:"body"`
+	Method        string       `json:"method"`
+	Path          string       `json:"path"`
+	Headers       *http.Header `json:"headers"`
+	Body          *[]byte      `json:"body"`
+	BodyTruncated bool         `json:"body_truncated,omitempty"` // true when Body holds only a capped prefix of the request body
+	BodyBytes     int          `json:"body_bytes,omitempty"`     // original size before truncation
 }

 type APIExchangeResponse struct {
-	Status  int          `json:"status"`
-	Headers *http.Header `json:"headers"`
-	Body    *[]byte      `json:"body"`
+	Status        int          `json:"status"`
+	Headers       *http.Header `json:"headers"`
+	Body          *[]byte      `json:"body"`
+	BodyTruncated bool         `json:"body_truncated,omitempty"` // true when bytes written by the handler were dropped from Body
+	BodyBytes     int          `json:"body_bytes,omitempty"`     // original size before truncation (all bytes the handler wrote)
 }

 type APIExchange struct {
@ -66,11 +70,29 @@ var doInitializeTracing = sync.OnceFunc(func() {

 type bodyWriter struct {
 	http.ResponseWriter
-	body *bytes.Buffer
+	body       *bytes.Buffer // bounded trace copy of the response body
+	maxBytes   int           // capture cap in bytes; <= 0 means unlimited capture
+	truncated  bool          // set once at least one byte was dropped from body
+	totalBytes int           // bytes the upstream handler wrote, even past the cap
 }

+// Write forwards b unchanged to the wrapped ResponseWriter and mirrors as
+// much of it as still fits under maxBytes into the trace buffer, so a chatty
+// endpoint can't grow the buffer without bound. The trace copy never
+// influences the result: callers get the real writer's byte count and error.
 func (w *bodyWriter) Write(b []byte) (int, error) {
-	w.body.Write(b)
+	w.totalBytes += len(b)
+
+	// keep is the length of the prefix of b that still fits in the trace
+	// buffer; with no cap configured the whole slice is kept.
+	keep := len(b)
+	if w.maxBytes > 0 {
+		room := w.maxBytes - w.body.Len()
+		if room < 0 {
+			room = 0
+		}
+		if keep > room {
+			keep = room
+		}
+	}
+	// Flag truncation only when bytes were actually dropped: an empty
+	// write arriving after the buffer is exactly full loses nothing.
+	if keep < len(b) {
+		w.truncated = true
+	}
+	if keep > 0 {
+		w.body.Write(b[:keep])
+	}
 	return w.ResponseWriter.Write(b)
 }

@ -80,6 +102,20 @@ func (w *bodyWriter) Flush() {
 	}
 }

+// truncateForTrace copies body into a freshly allocated slice, keeping at
+// most maxBytes bytes, so the trace never aliases the caller's buffer. The
+// boolean result is true only when bytes were dropped; a maxBytes of zero
+// or less turns the cap off.
+func truncateForTrace(body []byte, maxBytes int) ([]byte, bool) {
+	keep := len(body)
+	capped := maxBytes > 0 && keep > maxBytes
+	if capped {
+		keep = maxBytes
+	}
+	snapshot := make([]byte, keep)
+	copy(snapshot, body)
+	return snapshot, capped
+}
+
 func initializeTracing(maxItems int) {
 	tracingMaxItems = maxItems
 	doInitializeTracing()
@ -134,11 +170,18 @@ func TraceMiddleware(app *application.Application) echo.MiddlewareFunc {

 			startTime := time.Now()

+			// Cap the captured payload size. Without a cap, /embeddings and
+			// streaming /chat/completions grow the in-memory trace buffer to
+			// tens of MB, and the admin Traces UI then locks up because each
+			// fetch of the JSON dump takes longer than the 5s auto-refresh.
+			maxBodyBytes := app.ApplicationConfig().TracingMaxBodyBytes
+
 			// Wrap response writer to capture body
 			resBody := new(bytes.Buffer)
 			mw := &bodyWriter{
 				ResponseWriter: c.Response().Writer,
 				body:           resBody,
+				maxBytes:       maxBodyBytes,
 			}
 			c.Response().Writer = mw

@ -159,8 +202,7 @@ func TraceMiddleware(app *application.Application) echo.MiddlewareFunc {
 			// via any heap-dump-style introspection, and tokens shouldn't
 			// outlive the request that carried them.
 			requestHeaders := redactSensitiveHeaders(c.Request().Header)
-			requestBody := make([]byte, len(body))
-			copy(requestBody, body)
+			requestBody, requestTruncated := truncateForTrace(body, maxBodyBytes)
 			responseHeaders := redactSensitiveHeaders(c.Response().Header())
 			responseBody := make([]byte, resBody.Len())
 			copy(responseBody, resBody.Bytes())
@ -168,15 +210,19 @@ func TraceMiddleware(app *application.Application) echo.MiddlewareFunc {
 				Timestamp: startTime,
 				Duration:  time.Since(startTime),
 				Request: APIExchangeRequest{
-					Method:  c.Request().Method,
-					Path:    c.Path(),
-					Headers: &requestHeaders,
-					Body:    &requestBody,
+					Method:        c.Request().Method,
+					Path:          c.Path(),
+					Headers:       &requestHeaders,
+					Body:          &requestBody,
+					BodyTruncated: requestTruncated,
+					BodyBytes:     len(body),
 				},
 				Response: APIExchangeResponse{
-					Status:  status,
-					Headers: &responseHeaders,
-					Body:    &responseBody,
+					Status:        status,
+					Headers:       &responseHeaders,
+					Body:          &responseBody,
+					BodyTruncated: mw.truncated,
+					BodyBytes:     mw.totalBytes,
 				},
 			}
 			if handlerErr != nil {
--- a/core/http/middleware/trace_body_cap_test.go
+++ b/core/http/middleware/trace_body_cap_test.go
@ -0,0 +1,116 @@
+package middleware
+
+import (
+	"bytes"
+	"net/http/httptest"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// The trace middleware copies request and response bodies into an in-memory
+// buffer that backs the admin /api/traces endpoint. With no upper bound a
+// chatty workload (embeddings, large completions) trivially produces a
+// multi-MB response that locks the Traces UI in a loading state — fetching
+// and parsing the payload outruns the 5-second auto-refresh. These specs
+// pin the capping contract so future refactors keep both the cap and the
+// passthrough to the real client intact.
+
+// Specs for bodyWriter.Write: the trace buffer must stop growing at maxBytes,
+// the wrapped writer must still receive every byte, and totalBytes must
+// report the full size the handler wrote.
+var _ = Describe("bodyWriter capping", func() {
+	It("captures the full body when maxBytes is 0 (unlimited)", func() {
+		downstream := httptest.NewRecorder()
+		buf := &bytes.Buffer{}
+		bw := &bodyWriter{ResponseWriter: downstream, body: buf, maxBytes: 0}
+
+		payload := []byte(strings.Repeat("x", 4096))
+		n, err := bw.Write(payload)
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(n).To(Equal(len(payload)))
+		Expect(buf.Len()).To(Equal(len(payload)))
+		Expect(downstream.Body.Len()).To(Equal(len(payload)))
+		Expect(bw.truncated).To(BeFalse())
+		Expect(bw.totalBytes).To(Equal(len(payload)))
+	})
+
+	It("stops appending to the trace buffer once maxBytes is reached but still forwards to the client", func() {
+		downstream := httptest.NewRecorder()
+		buf := &bytes.Buffer{}
+		bw := &bodyWriter{ResponseWriter: downstream, body: buf, maxBytes: 100}
+
+		payload := []byte(strings.Repeat("a", 250))
+		n, err := bw.Write(payload)
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(n).To(Equal(len(payload)), "Write must return the full byte count so callers see no short write")
+		Expect(buf.Len()).To(Equal(100), "trace buffer should hold exactly maxBytes")
+		Expect(downstream.Body.Len()).To(Equal(len(payload)), "client must still receive every byte")
+		Expect(bw.truncated).To(BeTrue())
+		Expect(bw.totalBytes).To(Equal(len(payload)), "totalBytes must report the original size, not the capped size")
+	})
+
+	It("does not flag truncation when the body exactly fills the cap", func() {
+		downstream := httptest.NewRecorder()
+		buf := &bytes.Buffer{}
+		bw := &bodyWriter{ResponseWriter: downstream, body: buf, maxBytes: 4}
+
+		_, err := bw.Write([]byte("AAAA"))
+		Expect(err).ToNot(HaveOccurred())
+
+		Expect(buf.String()).To(Equal("AAAA"))
+		Expect(downstream.Body.String()).To(Equal("AAAA"))
+		Expect(bw.truncated).To(BeFalse(), "nothing was dropped, so the exchange must not be marked truncated")
+		Expect(bw.totalBytes).To(Equal(4))
+	})
+
+	It("handles a write that straddles the cap by keeping only the leading slice", func() {
+		downstream := httptest.NewRecorder()
+		buf := &bytes.Buffer{}
+		bw := &bodyWriter{ResponseWriter: downstream, body: buf, maxBytes: 10}
+
+		_, err := bw.Write([]byte("12345"))
+		Expect(err).ToNot(HaveOccurred())
+		Expect(bw.truncated).To(BeFalse())
+
+		_, err = bw.Write([]byte("67890ABCDE"))
+		Expect(err).ToNot(HaveOccurred())
+
+		Expect(buf.String()).To(Equal("1234567890"))
+		Expect(downstream.Body.String()).To(Equal("1234567890ABCDE"))
+		Expect(bw.truncated).To(BeTrue())
+		Expect(bw.totalBytes).To(Equal(15))
+	})
+
+	It("ignores further writes after the cap was already hit", func() {
+		downstream := httptest.NewRecorder()
+		buf := &bytes.Buffer{}
+		bw := &bodyWriter{ResponseWriter: downstream, body: buf, maxBytes: 4}
+
+		// Errors are irrelevant here: the recorder never fails a Write.
+		_, _ = bw.Write([]byte("AAAA"))
+		_, _ = bw.Write([]byte("BBBB"))
+		_, _ = bw.Write([]byte("CCCC"))
+
+		Expect(buf.String()).To(Equal("AAAA"))
+		Expect(downstream.Body.String()).To(Equal("AAAABBBBCCCC"))
+		Expect(bw.truncated).To(BeTrue())
+		Expect(bw.totalBytes).To(Equal(12))
+	})
+})
+
+// Specs for truncateForTrace: the result is always a copy, is cut to
+// maxBytes only when the input is longer than a positive cap, and the flag
+// reports whether bytes were dropped.
+var _ = Describe("truncateForTrace", func() {
+	It("returns the input unchanged when below the cap", func() {
+		in := []byte("hello")
+		out, truncated := truncateForTrace(in, 1024)
+		Expect(truncated).To(BeFalse())
+		Expect(out).To(Equal(in))
+	})
+
+	It("does not signal truncation when the input length equals the cap", func() {
+		in := []byte("hello")
+		out, truncated := truncateForTrace(in, len(in))
+		Expect(truncated).To(BeFalse())
+		Expect(out).To(Equal(in))
+	})
+
+	It("truncates when the input exceeds the cap and signals truncation", func() {
+		in := []byte(strings.Repeat("z", 200))
+		out, truncated := truncateForTrace(in, 64)
+		Expect(truncated).To(BeTrue())
+		Expect(out).To(HaveLen(64))
+		Expect(string(out)).To(Equal(strings.Repeat("z", 64)))
+	})
+
+	It("treats maxBytes <= 0 as unlimited", func() {
+		in := []byte(strings.Repeat("q", 10_000))
+
+		out, truncated := truncateForTrace(in, 0)
+		Expect(truncated).To(BeFalse())
+		Expect(out).To(HaveLen(len(in)))
+
+		out, truncated = truncateForTrace(in, -1)
+		Expect(truncated).To(BeFalse())
+		Expect(out).To(HaveLen(len(in)))
+	})
+
+	It("does not retain the caller's backing array (defensive copy)", func() {
+		in := []byte("abcdefghij")
+		out, truncated := truncateForTrace(in, 4)
+		Expect(truncated).To(BeTrue())
+		Expect(string(out)).To(Equal("abcd"))
+
+		// Mutating the source must not corrupt the trace copy.
+		in[0] = 'Z'
+		Expect(string(out)).To(Equal("abcd"))
+	})
+
+	It("copies the body even when no truncation is needed", func() {
+		in := []byte("abcd")
+		out, truncated := truncateForTrace(in, 1024)
+		Expect(truncated).To(BeFalse())
+
+		// The untruncated path must be a copy too, not an alias of the input.
+		in[0] = 'Z'
+		Expect(string(out)).To(Equal("abcd"))
+	})
+})
--- a/core/http/react-ui/src/pages/Traces.jsx
+++ b/core/http/react-ui/src/pages/Traces.jsx
@ -406,7 +406,15 @@ export default function Traces() {
        <button className="btn btn-secondary btn-sm" onClick={fetchTraces}><i className="fas fa-rotate" /> Refresh</button>
        <button className="btn btn-secondary btn-sm" onClick={handleExport} disabled={traces.length === 0}><i className="fas fa-download" /> Export</button>
        <div style={{ flex: 1 }} />
-        <button className="btn btn-danger btn-sm" onClick={handleClear} disabled={traces.length === 0}><i className="fas fa-trash" /> Clear</button>
+        <button
+          className="btn btn-danger btn-sm"
+          onClick={handleClear}
+          /* Stay enabled while loading: a massive in-memory trace buffer is
+             precisely the case where the user can't see the table yet and
+             needs Clear to recover. Clearing an already-empty server-side
+             buffer is a harmless no-op. */
+          disabled={!loading && traces.length === 0}
+        ><i className="fas fa-trash" /> Clear</button>
      </div>

      {settings && (() => {