mirror of
https://github.com/fleetdm/fleet
synced 2026-05-14 12:38:41 +00:00
<!-- Add the related story/sub-task/bug number, like Resolves #123, or remove if NA --> **Related issue:** Resolves #38756 - Changed to NOT mark many client errors as exceptions - Instead, added client_error and server_error metrics that can be used to alert on unusual error rates # Checklist for submitter - [x] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. ## Testing - [x] Added/updated automated tests - [x] QA'd all new/changed functionality manually <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Added separate metrics for distinguishing between client and server errors, enhancing observability and monitoring capabilities. * **Bug Fixes** * Client request errors no longer incorrectly appear in error tracking as exceptions; improved accuracy of error reporting to external services. * Adjusted logging levels for authentication and enrollment operations to provide clearer diagnostics. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai -->
443 lines
14 KiB
Go
443 lines
14 KiB
Go
// Package ctxerr provides functions to wrap errors with annotations and
|
|
// stack traces, and to handle those errors such that unique instances of
|
|
// those errors will be stored for an amount of time so that it can be
|
|
// used to troubleshoot issues.
|
|
//
|
|
// Typical uses of this package should be to call New or Wrap[f] as close as
|
|
// possible from where the error is encountered (or where it needs to be
|
|
// created for New), and then to call Handle with the error only once, after it
|
|
// bubbled back to the top of the call stack (e.g. in the HTTP handler, or in
|
|
// the CLI command, etc.). It is fine to wrap the error with more annotations
|
|
// along the way, by calling Wrap[f].
|
|
package ctxerr
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"maps"
|
|
"runtime"
|
|
"strings"
|
|
"time"
|
|
|
|
platform_http "github.com/fleetdm/fleet/v4/server/platform/http"
|
|
"github.com/getsentry/sentry-go"
|
|
"go.elastic.co/apm/v2"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/codes"
|
|
"go.opentelemetry.io/otel/trace"
|
|
)
|
|
|
|
// key is an unexported context-key type, preventing collisions with context
// values set by other packages.
type key int

// errHandlerKey is the context key under which the error Handler is stored
// (see NewContext / FromContext).
const errHandlerKey key = 0

// nowFn returns the current time; declared as a variable so tests can stub it.
var nowFn = time.Now
|
|
|
|
// FleetError is the error implementation used by this package. It carries a
// message, the stack trace captured where it was created, an optional wrapped
// cause, and JSON-encoded metadata attached at creation time.
type FleetError struct {
	msg   string          // error message to be prepended to cause
	stack stackTracer     // stack trace where this error was created
	cause error           // original error that caused this error if non-nil
	data  json.RawMessage // additional metadata about the error (timestamps, etc)
}
|
|
|
|
// fleetErrorJSON is the serialized representation of a single error in a
// chain, used by MarshalJSON.
type fleetErrorJSON struct {
	Message string          `json:"message,omitempty"`
	Data    json.RawMessage `json:"data,omitempty"`
	Stack   []string        `json:"stack,omitempty"`
}
|
|
|
|
// Error implements the error interface.
|
|
func (e FleetError) Error() string {
|
|
if e.cause == nil {
|
|
return e.msg
|
|
}
|
|
return fmt.Sprintf("%s: %s", e.msg, e.cause.Error())
|
|
}
|
|
|
|
// Unwrap implements the error Unwrap interface introduced in go1.13, exposing
// the wrapped cause (nil if this error has none).
func (e *FleetError) Unwrap() error {
	return e.cause
}
|
|
|
|
// Stack returns the call stack captured when this error was created, as a
// slice of formatted frames.
func (e *FleetError) Stack() []string {
	return e.stack.List()
}
|
|
|
|
// StackTrace implements the runtimeStackTracer interface understood by the
|
|
// elastic APM package to reuse already-captured stack traces.
|
|
// https://github.com/elastic/apm-agent-go/blob/main/stacktrace/errors.go#L45-L47
|
|
func (e *FleetError) StackTrace() *runtime.Frames {
|
|
st := e.stack.(stack) // outside of tests, e.stack is always a stack type
|
|
return runtime.CallersFrames(st)
|
|
}
|
|
|
|
// StackFrames implements the reflection-based method that Sentry's Go SDK
// uses to look for a stack trace. It abuses the internals a bit, as it uses
// the name that sentry looks for, but returns the []uintptr slice (which works
// because of how they handle the returned value via reflection). A cleaner
// approach would be if they used an interface detection like APM does.
// https://github.com/getsentry/sentry-go/blob/master/stacktrace.go#L44-L49
func (e *FleetError) StackFrames() []uintptr {
	return e.stack.(stack) // outside of tests, e.stack is always a stack type
}
|
|
|
|
// LogFields implements fleet.ErrWithLogFields, so attached error data can be
|
|
// logged along with the error
|
|
func (e *FleetError) LogFields() []any {
|
|
var fields []any
|
|
var data map[string]any
|
|
|
|
if len(e.data) == 0 {
|
|
return fields
|
|
}
|
|
|
|
// if we fail to unmarshal the data, return it as a raw string. It
|
|
// won't be as easy to read but it will be there.
|
|
if err := json.Unmarshal(e.data, &data); err != nil {
|
|
return []any{
|
|
"data", string(e.data),
|
|
}
|
|
}
|
|
|
|
for k, v := range data {
|
|
fields = append(fields, k, v)
|
|
}
|
|
|
|
return fields
|
|
}
|
|
|
|
// setMetadata adds common metadata attributes to the `data` map provided.
|
|
// NOTE: this will mutate the data provided and override other values with the same keys.
|
|
func setMetadata(ctx context.Context, data map[string]interface{}) map[string]interface{} {
|
|
if data == nil {
|
|
data = map[string]interface{}{}
|
|
}
|
|
|
|
data["timestamp"] = nowFn().Format(time.RFC3339)
|
|
|
|
// Get diagnostic context from all registered providers
|
|
for _, provider := range getErrorContextProviders(ctx) {
|
|
maps.Copy(data, provider.GetDiagnosticContext())
|
|
}
|
|
|
|
return data
|
|
}
|
|
|
|
func encodeData(ctx context.Context, data map[string]interface{}, augment bool) json.RawMessage {
|
|
if augment {
|
|
data = setMetadata(ctx, data)
|
|
}
|
|
|
|
encoded, err := json.Marshal(data)
|
|
if err != nil {
|
|
msg := fmt.Sprintf(`{"error": "there was an error encoding additional data: %s"}`, err.Error())
|
|
encoded = json.RawMessage(msg)
|
|
}
|
|
return encoded
|
|
}
|
|
|
|
func newError(ctx context.Context, msg string, cause error, data map[string]interface{}) error {
|
|
stack := newStack(2)
|
|
edata := encodeData(ctx, data, true)
|
|
return &FleetError{msg, stack, cause, edata}
|
|
}
|
|
|
|
func wrapError(ctx context.Context, msg string, cause error, data map[string]interface{}) error {
|
|
if cause == nil {
|
|
return nil
|
|
}
|
|
|
|
stack := newStack(2)
|
|
var ferr *FleetError
|
|
isFleetError := errors.As(cause, &ferr)
|
|
|
|
// If the error is a FleetError, don't add the full stack trace as it should
|
|
// already be present.
|
|
if isFleetError {
|
|
stack = stack[:1]
|
|
}
|
|
|
|
edata := encodeData(ctx, data, !isFleetError)
|
|
return &FleetError{msg, stack, cause, edata}
|
|
}
|
|
|
|
// New creates a new error with the given message, capturing the current
// stack trace and attaching common metadata (e.g. a timestamp).
func New(ctx context.Context, msg string) error {
	return newError(ctx, msg, nil, nil)
}
|
|
|
|
// NewWithData creates a new error with the given message and attaches the
// additional metadata provided to it.
func NewWithData(ctx context.Context, msg string, data map[string]interface{}) error {
	return newError(ctx, msg, nil, data)
}
|
|
|
|
// Errorf creates a new error with the given message.
|
|
func Errorf(ctx context.Context, format string, args ...interface{}) error {
|
|
msg := fmt.Sprintf(format, args...)
|
|
return newError(ctx, msg, nil, nil)
|
|
}
|
|
|
|
// Wrap creates a new error with the given message, wrapping another error.
|
|
func Wrap(ctx context.Context, cause error, msgs ...string) error {
|
|
msg := strings.Join(msgs, " ")
|
|
return wrapError(ctx, msg, cause, nil)
|
|
}
|
|
|
|
// WrapWithData creates a new error with the given message, wrapping another
// error and attaching the data provided to it. Returns nil if cause is nil.
func WrapWithData(ctx context.Context, cause error, msg string, data map[string]interface{}) error {
	return wrapError(ctx, msg, cause, data)
}
|
|
|
|
// Wrapf creates a new error with the given message, wrapping another error.
|
|
func Wrapf(ctx context.Context, cause error, format string, args ...interface{}) error {
|
|
msg := fmt.Sprintf(format, args...)
|
|
return wrapError(ctx, msg, cause, nil)
|
|
}
|
|
|
|
// Cause returns the root error in err's chain, delegating to
// platform_http.Cause.
func Cause(err error) error {
	return platform_http.Cause(err)
}
|
|
|
|
// FleetCause is similar to Cause, but returns the root-most
|
|
// FleetError in the chain
|
|
func FleetCause(err error) *FleetError {
|
|
var ferr, aux *FleetError
|
|
var ok bool
|
|
|
|
for err != nil {
|
|
if aux, ok = err.(*FleetError); ok {
|
|
ferr = aux
|
|
}
|
|
err = Unwrap(err)
|
|
}
|
|
|
|
return ferr
|
|
}
|
|
|
|
// Unwrap is a wrapper of built-in errors.Unwrap. It returns the result of
|
|
// calling the Unwrap method on err, if err's type contains an Unwrap method
|
|
// returning error. Otherwise, Unwrap returns nil.
|
|
func Unwrap(err error) error {
|
|
return errors.Unwrap(err)
|
|
}
|
|
|
|
// MarshalJSON provides a JSON representation of a whole error chain.
|
|
func MarshalJSON(err error) ([]byte, error) {
|
|
chain := make([]fleetErrorJSON, 0)
|
|
|
|
for err != nil {
|
|
switch v := err.(type) {
|
|
case *FleetError:
|
|
chain = append(chain, fleetErrorJSON{
|
|
Message: v.msg,
|
|
Data: v.data,
|
|
Stack: v.stack.List(),
|
|
})
|
|
default:
|
|
chain = append(chain, fleetErrorJSON{Message: v.Error()})
|
|
}
|
|
|
|
err = Unwrap(err)
|
|
}
|
|
|
|
// reverse the chain to present errors in chronological order.
|
|
for i := len(chain)/2 - 1; i >= 0; i-- {
|
|
opp := len(chain) - 1 - i
|
|
chain[i], chain[opp] = chain[opp], chain[i]
|
|
}
|
|
|
|
return json.MarshalIndent(chain, "", " ")
|
|
}
|
|
|
|
// StoredError represents the structure we use to de-serialize errors and
// counts stored in Redis.
type StoredError struct {
	Count int             `json:"count"` // number of occurrences recorded for this error
	Chain json.RawMessage `json:"chain"` // serialized error chain, decoded lazily
}
|
|
|
|
// Handler is the interface implemented by error stores: Store records an
// error, and Retrieve returns the stored errors, optionally flushing the
// store when flush is true.
type Handler interface {
	Store(error)
	Retrieve(flush bool) ([]*StoredError, error)
}
|
|
|
|
// NewContext returns a context derived from ctx that carries the provided
// error handler, retrievable later via FromContext.
func NewContext(ctx context.Context, eh Handler) context.Context {
	return context.WithValue(ctx, errHandlerKey, eh)
}
|
|
|
|
func FromContext(ctx context.Context) Handler {
|
|
v, _ := ctx.Value(errHandlerKey).(Handler)
|
|
return v
|
|
}
|
|
|
|
// Handle handles err by passing it to the registered error handler,
// deduplicating it and storing it for a configured duration. It also takes
// care of sending it to the configured OpenTelemetry/APM/Sentry, if any.
// Client errors (4xx-style) are counted in a metric but deliberately NOT
// reported as exceptions to the telemetry backends.
func Handle(ctx context.Context, err error) {
	if err == nil {
		return
	}

	// as a last resource, wrap the error if there isn't
	// a FleetError in the chain
	var ferr *FleetError
	if !errors.As(err, &ferr) {
		wrapped := wrapError(ctx, "missing FleetError in chain", err, nil)
		if wrapped == nil {
			return // shouldn't happen since err is not nil, but be safe
		}
		// wrapError returns an error interface, but we know it's a *FleetError
		ferr = wrapped.(*FleetError)
	}

	cause := ferr
	if rootCause := FleetCause(ferr); rootCause != nil {
		// use the FleetCause error so we send the most relevant stacktrace to APM
		// (the one from the initial New/Wrap call).
		cause = rootCause
	}

	// Collect telemetry context from all registered providers; reused below
	// for both the OTEL event attributes and the Sentry tags.
	telemetryAttrs := collectTelemetryContext(ctx)

	// Check if this is a client error. Per OTEL semantic conventions,
	// 4xx errors on server spans MUST NOT set span status to Error.
	// See: https://opentelemetry.io/docs/specs/semconv/http/http-spans/
	clientErr := isClientError(err)
	exceptionType := fmt.Sprintf("%T", Cause(cause)) // type of root error

	// Record metrics for both client and server errors.
	// NOTE(review): clientErrorsCounter/serverErrorsCounter and their attrs
	// helpers are defined elsewhere in this package.
	if clientErr {
		clientErrorsCounter.Add(ctx, 1, clientErrorCounterAttrs(exceptionType))
	} else {
		serverErrorsCounter.Add(ctx, 1, serverErrorCounterAttrs(exceptionType))
	}

	// Only record exception events for server errors (5xx).
	// Per OTEL spec, handled errors (like 4xx responses) should not be recorded as exceptions.
	// See: https://opentelemetry.io/docs/specs/semconv/general/recording-errors/
	if !clientErr {
		if span := trace.SpanFromContext(ctx); span != nil && span.IsRecording() {
			span.SetStatus(codes.Error, exceptionType)

			// Build attributes for the event using OTEL semantic conventions.
			// See: https://opentelemetry.io/docs/specs/semconv/exceptions/exceptions-spans/
			attrs := []attribute.KeyValue{
				attribute.String("exception.type", exceptionType),
				attribute.String("exception.message", cause.Error()),
				attribute.String("exception.stacktrace", strings.Join(cause.Stack(), "\n")),
			}

			// Add contextual information from telemetry providers.
			// OpenTelemetry requires typed attributes, so we convert the values to the appropriate type.
			for k, v := range telemetryAttrs {
				switch val := v.(type) {
				case string:
					attrs = append(attrs, attribute.String(k, val))
				case int:
					attrs = append(attrs, attribute.Int64(k, int64(val)))
				case int64:
					attrs = append(attrs, attribute.Int64(k, val))
				case uint:
					attrs = append(attrs, attribute.Int64(k, int64(val))) //nolint:gosec
				case uint64:
					attrs = append(attrs, attribute.Int64(k, int64(val))) //nolint:gosec
				case bool:
					attrs = append(attrs, attribute.Bool(k, val))
				default:
					// fall back to a string rendering for any other type
					attrs = append(attrs, attribute.String(k, fmt.Sprint(val)))
				}
			}

			span.AddEvent("exception", trace.WithAttributes(attrs...))
		}

		// send to elastic APM
		apm.CaptureError(ctx, cause).Send()

		// if Sentry is configured, capture the error there
		if sentryClient := sentry.CurrentHub().Client(); sentryClient != nil {
			if len(telemetryAttrs) > 0 {
				// we have contextual information, use it to enrich the error;
				// clone the hub so the tags don't leak into unrelated events
				ctxHub := sentry.CurrentHub().Clone()
				ctxHub.ConfigureScope(func(scope *sentry.Scope) {
					for k, v := range telemetryAttrs {
						scope.SetTag(k, fmt.Sprint(v))
					}
				})
				ctxHub.CaptureException(cause)
			} else {
				sentry.CaptureException(cause)
			}
		}
	}

	// Always store the error (client or server) in the registered handler.
	if eh := FromContext(ctx); eh != nil {
		eh.Store(ferr)
	}
}
|
|
|
|
// collectTelemetryContext gathers telemetry context from all registered providers.
|
|
func collectTelemetryContext(ctx context.Context) map[string]any {
|
|
attrs := make(map[string]any)
|
|
for _, provider := range getErrorContextProviders(ctx) {
|
|
if telemetry := provider.GetTelemetryContext(); telemetry != nil {
|
|
maps.Copy(attrs, telemetry)
|
|
}
|
|
}
|
|
return attrs
|
|
}
|
|
|
|
// isClientError checks if the error is a client error (4xx).
|
|
func isClientError(err error) bool {
|
|
// Check for explicit client error interface
|
|
var clientErr platform_http.ErrWithIsClientError
|
|
if errors.As(err, &clientErr) {
|
|
return clientErr.IsClientError()
|
|
}
|
|
|
|
// Treat context.Canceled as a client error. In HTTP handlers, this typically
|
|
// indicates client disconnection. While it could theoretically come from
|
|
// server-side cancellation, detecting true client disconnection at the
|
|
// transport layer is complex. Go's HTTP server doesn't provide a distinct
|
|
// error type for client disconnection (see https://github.com/golang/go/issues/64465).
|
|
// The occasional misclassification is acceptable given that most context
|
|
// cancellations in request handling are client-initiated.
|
|
return errors.Is(err, context.Canceled)
|
|
}
|
|
|
|
// Retrieve retrieves an error from the registered error handler
|
|
func Retrieve(ctx context.Context) ([]*StoredError, error) {
|
|
eh := FromContext(ctx)
|
|
if eh == nil {
|
|
return nil, New(ctx, "missing handler in context")
|
|
}
|
|
return eh.Retrieve(false)
|
|
}
|
|
|
|
// MockHandler is a mock implementation of an error handler that allows to test
// ctxerr features that retrieve and store information in Redis without a
// server running. Each method simply delegates to the corresponding Impl
// function field, which tests provide.
// Ideally this should live in errorstore/errors, but that creates a circular
// dependency.
type MockHandler struct {
	StoreImpl    func(err error)                          // backs Store
	RetrieveImpl func(flush bool) ([]*StoredError, error) // backs Retrieve
}
|
|
|
|
// Store implements Handler by delegating to StoreImpl.
func (h MockHandler) Store(err error) {
	h.StoreImpl(err)
}
|
|
|
|
// Retrieve implements Handler by delegating to RetrieveImpl.
func (h MockHandler) Retrieve(flush bool) ([]*StoredError, error) {
	return h.RetrieveImpl(flush)
}
|