// Package ctxerr provides functions to wrap errors with annotations and // stack traces, and to handle those errors such that unique instances of // those errors will be stored for an amount of time so that it can be // used to troubleshoot issues. // // Typical uses of this package should be to call New or Wrap[f] as close as // possible from where the error is encountered (or where it needs to be // created for New), and then to call Handle with the error only once, after it // bubbled back to the top of the call stack (e.g. in the HTTP handler, or in // the CLI command, etc.). It is fine to wrap the error with more annotations // along the way, by calling Wrap[f]. package ctxerr import ( "context" "encoding/json" "errors" "fmt" "maps" "runtime" "strings" "time" platform_http "github.com/fleetdm/fleet/v4/server/platform/http" "github.com/getsentry/sentry-go" "go.elastic.co/apm/v2" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/trace" ) type key int const errHandlerKey key = 0 // Defining here for testing purposes var nowFn = time.Now // FleetError is the error implementation used by this package. type FleetError struct { msg string // error message to be prepended to cause stack stackTracer // stack trace where this error was created cause error // original error that caused this error if non-nil data json.RawMessage // additional metadata about the error (timestamps, etc) } type fleetErrorJSON struct { Message string `json:"message,omitempty"` Data json.RawMessage `json:"data,omitempty"` Stack []string `json:"stack,omitempty"` } // Error implements the error interface. func (e FleetError) Error() string { if e.cause == nil { return e.msg } return fmt.Sprintf("%s: %s", e.msg, e.cause.Error()) } // Unwrap implements the error Unwrap interface introduced in go1.13. 
func (e *FleetError) Unwrap() error {
	return e.cause
}

// Stack returns a call stack for the error, as a list of strings produced by
// the underlying stack tracer.
func (e *FleetError) Stack() []string {
	return e.stack.List()
}

// StackTrace implements the runtimeStackTracer interface understood by the
// elastic APM package to reuse already-captured stack traces.
// https://github.com/elastic/apm-agent-go/blob/main/stacktrace/errors.go#L45-L47
//
// Outside of tests, e.stack is always a stack type. If it is not, an empty
// (but non-nil) *runtime.Frames is returned instead of panicking, so that
// reporting an error can never itself crash the caller.
func (e *FleetError) StackTrace() *runtime.Frames {
	st, ok := e.stack.(stack)
	if !ok {
		return runtime.CallersFrames(nil)
	}
	return runtime.CallersFrames(st)
}

// StackFrames implements the reflection-based method that Sentry's Go SDK
// uses to look for a stack trace. It abuses the internals a bit, as it uses
// the name that sentry looks for, but returns the []uintptr slice (which works
// because of how they handle the returned value via reflection). A cleaner
// approach would be if they used an interface detection like APM does.
// https://github.com/getsentry/sentry-go/blob/master/stacktrace.go#L44-L49
//
// Outside of tests, e.stack is always a stack type. If it is not, nil is
// returned instead of panicking.
func (e *FleetError) StackFrames() []uintptr {
	st, ok := e.stack.(stack)
	if !ok {
		return nil
	}
	return st
}

// LogFields implements fleet.ErrWithLogFields, so attached error data can be
// logged along with the error.
//
// The result is a flat list of alternating keys and values decoded from the
// error's JSON data. It is nil when there is no data to report. The order of
// the key/value pairs follows map iteration order and is therefore not stable
// between calls.
func (e *FleetError) LogFields() []any {
	if len(e.data) == 0 {
		return nil
	}

	// if we fail to unmarshal the data, return it as a raw string. It
	// won't be as easy to read but it will be there.
	var data map[string]any
	if err := json.Unmarshal(e.data, &data); err != nil {
		return []any{"data", string(e.data)}
	}
	if len(data) == 0 {
		return nil
	}

	// two entries (key, value) per decoded attribute
	fields := make([]any, 0, 2*len(data))
	for k, v := range data {
		fields = append(fields, k, v)
	}
	return fields
}

// setMetadata adds common metadata attributes to the `data` map provided.
// NOTE: this will mutate the data provided and override other values with the same keys.
func setMetadata(ctx context.Context, data map[string]interface{}) map[string]interface{} { if data == nil { data = map[string]interface{}{} } data["timestamp"] = nowFn().Format(time.RFC3339) // Get diagnostic context from all registered providers for _, provider := range getErrorContextProviders(ctx) { maps.Copy(data, provider.GetDiagnosticContext()) } return data } func encodeData(ctx context.Context, data map[string]interface{}, augment bool) json.RawMessage { if augment { data = setMetadata(ctx, data) } encoded, err := json.Marshal(data) if err != nil { msg := fmt.Sprintf(`{"error": "there was an error encoding additional data: %s"}`, err.Error()) encoded = json.RawMessage(msg) } return encoded } func newError(ctx context.Context, msg string, cause error, data map[string]interface{}) error { stack := newStack(2) edata := encodeData(ctx, data, true) return &FleetError{msg, stack, cause, edata} } func wrapError(ctx context.Context, msg string, cause error, data map[string]interface{}) error { if cause == nil { return nil } stack := newStack(2) var ferr *FleetError isFleetError := errors.As(cause, &ferr) // If the error is a FleetError, don't add the full stack trace as it should // already be present. if isFleetError { stack = stack[:1] } edata := encodeData(ctx, data, !isFleetError) return &FleetError{msg, stack, cause, edata} } // New creates a new error with the given message. func New(ctx context.Context, msg string) error { return newError(ctx, msg, nil, nil) } // NewWithData creates a new error and attaches additional metadata to it func NewWithData(ctx context.Context, msg string, data map[string]interface{}) error { return newError(ctx, msg, nil, data) } // Errorf creates a new error with the given message. func Errorf(ctx context.Context, format string, args ...interface{}) error { msg := fmt.Sprintf(format, args...) return newError(ctx, msg, nil, nil) } // Wrap creates a new error with the given message, wrapping another error. 
func Wrap(ctx context.Context, cause error, msgs ...string) error { msg := strings.Join(msgs, " ") return wrapError(ctx, msg, cause, nil) } // WrapWithData creates a new error with the given message, wrapping another // error and attaching the data provided to it. func WrapWithData(ctx context.Context, cause error, msg string, data map[string]interface{}) error { return wrapError(ctx, msg, cause, data) } // Wrapf creates a new error with the given message, wrapping another error. func Wrapf(ctx context.Context, cause error, format string, args ...interface{}) error { msg := fmt.Sprintf(format, args...) return wrapError(ctx, msg, cause, nil) } // Cause returns the root error in err's chain. func Cause(err error) error { return platform_http.Cause(err) } // FleetCause is similar to Cause, but returns the root-most // FleetError in the chain func FleetCause(err error) *FleetError { var ferr, aux *FleetError var ok bool for err != nil { if aux, ok = err.(*FleetError); ok { ferr = aux } err = Unwrap(err) } return ferr } // Unwrap is a wrapper of built-in errors.Unwrap. It returns the result of // calling the Unwrap method on err, if err's type contains an Unwrap method // returning error. Otherwise, Unwrap returns nil. func Unwrap(err error) error { return errors.Unwrap(err) } // MarshalJSON provides a JSON representation of a whole error chain. func MarshalJSON(err error) ([]byte, error) { chain := make([]fleetErrorJSON, 0) for err != nil { switch v := err.(type) { case *FleetError: chain = append(chain, fleetErrorJSON{ Message: v.msg, Data: v.data, Stack: v.stack.List(), }) default: chain = append(chain, fleetErrorJSON{Message: v.Error()}) } err = Unwrap(err) } // reverse the chain to present errors in chronological order. 
for i := len(chain)/2 - 1; i >= 0; i-- { opp := len(chain) - 1 - i chain[i], chain[opp] = chain[opp], chain[i] } return json.MarshalIndent(chain, "", " ") } // StoredError represents the structure we use to de-serialize errors and // counts stored in Redis type StoredError struct { Count int `json:"count"` Chain json.RawMessage `json:"chain"` } type Handler interface { Store(error) Retrieve(flush bool) ([]*StoredError, error) } // NewContext returns a context derived from ctx that contains the provided // error handler. func NewContext(ctx context.Context, eh Handler) context.Context { return context.WithValue(ctx, errHandlerKey, eh) } func FromContext(ctx context.Context) Handler { v, _ := ctx.Value(errHandlerKey).(Handler) return v } // Handle handles err by passing it to the registered error handler, // deduplicating it and storing it for a configured duration. It also takes // care of sending it to the configured OpenTelemetry/APM/Sentry, if any. func Handle(ctx context.Context, err error) { if err == nil { return } // as a last resource, wrap the error if there isn't // a FleetError in the chain var ferr *FleetError if !errors.As(err, &ferr) { wrapped := wrapError(ctx, "missing FleetError in chain", err, nil) if wrapped == nil { return // shouldn't happen since err is not nil, but be safe } // wrapError returns an error interface, but we know it's a *FleetError ferr = wrapped.(*FleetError) } cause := ferr if rootCause := FleetCause(ferr); rootCause != nil { // use the FleetCause error so we send the most relevant stacktrace to APM // (the one from the initial New/Wrap call). cause = rootCause } // Collect telemetry context from registered providers telemetryAttrs := collectTelemetryContext(ctx) // Check if this is a client error. Per OTEL semantic conventions, // 4xx errors on server spans MUST NOT set span status to Error. 
// See: https://opentelemetry.io/docs/specs/semconv/http/http-spans/ clientErr := isClientError(err) exceptionType := fmt.Sprintf("%T", Cause(cause)) // type of root error // Record metrics for both client and server errors if clientErr { clientErrorsCounter.Add(ctx, 1, clientErrorCounterAttrs(exceptionType)) } else { serverErrorsCounter.Add(ctx, 1, serverErrorCounterAttrs(exceptionType)) } // Only record exception events for server errors (5xx). // Per OTEL spec, handled errors (like 4xx responses) should not be recorded as exceptions. // See: https://opentelemetry.io/docs/specs/semconv/general/recording-errors/ if !clientErr { if span := trace.SpanFromContext(ctx); span != nil && span.IsRecording() { span.SetStatus(codes.Error, exceptionType) // Build attributes for the event using OTEL semantic conventions. // See: https://opentelemetry.io/docs/specs/semconv/exceptions/exceptions-spans/ attrs := []attribute.KeyValue{ attribute.String("exception.type", exceptionType), attribute.String("exception.message", cause.Error()), attribute.String("exception.stacktrace", strings.Join(cause.Stack(), "\n")), } // Add contextual information from telemetry providers. // OpenTelemetry requires typed attributes, so we convert the values to the appropriate type. 
for k, v := range telemetryAttrs { switch val := v.(type) { case string: attrs = append(attrs, attribute.String(k, val)) case int: attrs = append(attrs, attribute.Int64(k, int64(val))) case int64: attrs = append(attrs, attribute.Int64(k, val)) case uint: attrs = append(attrs, attribute.Int64(k, int64(val))) //nolint:gosec case uint64: attrs = append(attrs, attribute.Int64(k, int64(val))) //nolint:gosec case bool: attrs = append(attrs, attribute.Bool(k, val)) default: attrs = append(attrs, attribute.String(k, fmt.Sprint(val))) } } span.AddEvent("exception", trace.WithAttributes(attrs...)) } // send to elastic APM apm.CaptureError(ctx, cause).Send() // if Sentry is configured, capture the error there if sentryClient := sentry.CurrentHub().Client(); sentryClient != nil { if len(telemetryAttrs) > 0 { // we have contextual information, use it to enrich the error ctxHub := sentry.CurrentHub().Clone() ctxHub.ConfigureScope(func(scope *sentry.Scope) { for k, v := range telemetryAttrs { scope.SetTag(k, fmt.Sprint(v)) } }) ctxHub.CaptureException(cause) } else { sentry.CaptureException(cause) } } } if eh := FromContext(ctx); eh != nil { eh.Store(ferr) } } // collectTelemetryContext gathers telemetry context from all registered providers. func collectTelemetryContext(ctx context.Context) map[string]any { attrs := make(map[string]any) for _, provider := range getErrorContextProviders(ctx) { if telemetry := provider.GetTelemetryContext(); telemetry != nil { maps.Copy(attrs, telemetry) } } return attrs } // isClientError checks if the error is a client error (4xx). func isClientError(err error) bool { // Check for explicit client error interface var clientErr platform_http.ErrWithIsClientError if errors.As(err, &clientErr) { return clientErr.IsClientError() } // Treat context.Canceled as a client error. In HTTP handlers, this typically // indicates client disconnection. 
While it could theoretically come from // server-side cancellation, detecting true client disconnection at the // transport layer is complex. Go's HTTP server doesn't provide a distinct // error type for client disconnection (see https://github.com/golang/go/issues/64465). // The occasional misclassification is acceptable given that most context // cancellations in request handling are client-initiated. return errors.Is(err, context.Canceled) } // Retrieve retrieves an error from the registered error handler func Retrieve(ctx context.Context) ([]*StoredError, error) { eh := FromContext(ctx) if eh == nil { return nil, New(ctx, "missing handler in context") } return eh.Retrieve(false) } // MockHandler is a mock implementation of an error handler that allows to test // ctxerr features that retrieve and store information in Redis without a // server running. // Ideally this should live in errorstore/errors, but that creates a circular // dependency. type MockHandler struct { StoreImpl func(err error) RetrieveImpl func(flush bool) ([]*StoredError, error) } func (h MockHandler) Store(err error) { h.StoreImpl(err) } func (h MockHandler) Retrieve(flush bool) ([]*StoredError, error) { return h.RetrieveImpl(flush) }