docs(router): metrics (#7696)

2026-04-21 14:37:17 +00:00 · 2026-03-12 10:32:41 +01:00 · 2026-03-12 10:32:41 +01:00 · a3acbd892f
commit a3acbd892f
parent 14c73e5751
5 changed files with 1386 additions and 1 deletions
--- a/packages/web/docs/src/components/otel-metrics/label-card.tsx
+++ b/packages/web/docs/src/components/otel-metrics/label-card.tsx
@ -0,0 +1,58 @@
+import { Info, Lightbulb, Tag } from 'lucide-react';
+
+/** Props accepted by `LabelCard`. */
+interface LabelCardProps {
+  /** Label name; rendered inside a `<code>` element at the top of the card. */
+  name: string;
+  /** Explanation of the label; rendered as the paragraph directly under the name. */
+  meaning: string;
+  /** Example values; each one is rendered as its own chip and doubles as that chip's React key. */
+  typicalValues: string[];
+  /** Optional remark; the lightbulb row is rendered only when this is truthy. */
+  notes?: string;
+}
+
+/**
+ * Renders the reference entry for a single metric label: a tag icon next to the
+ * label name and its meaning, a "Typical Values" row of chips, and an optional
+ * note row that is shown only when `notes` is truthy.
+ */
+export function LabelCard({ name, meaning, typicalValues, notes }: LabelCardProps) {
+  // One chip per typical value; the value itself is used as the React key.
+  const valueChips = typicalValues.map(typicalValue => (
+    <code
+      key={typicalValue}
+      className="rounded-md border border-slate-200 bg-slate-50 px-2.5 py-1 text-xs font-medium text-slate-700 dark:border-neutral-700 dark:bg-neutral-800 dark:text-slate-200"
+    >
+      {typicalValue}
+    </code>
+  ));
+
+  // Evaluates to the falsy `notes` value itself when there is nothing to show,
+  // which React renders as nothing.
+  const notesRow = notes && (
+    <div className="pt-1">
+      <div className="flex items-start gap-2">
+        <Lightbulb className="mt-0.5 h-3.5 w-3.5 shrink-0 text-amber-600 dark:text-amber-400" />
+        <p className="text-sm leading-relaxed text-gray-600 dark:text-slate-100">{notes}</p>
+      </div>
+    </div>
+  );
+
+  return (
+    <div>
+      {/* Header: icon, label name and meaning */}
+      <div className="mb-3 flex items-start gap-3">
+        <div className="shrink-0 rounded-md border border-gray-200 bg-gray-100 p-1.5 dark:border-neutral-700 dark:bg-neutral-800">
+          <Tag className="h-4 w-4 text-gray-600 dark:text-slate-100" />
+        </div>
+        <div className="min-w-0 flex-1">
+          <code className="break-all text-sm font-semibold text-gray-900 dark:text-slate-100">
+            {name}
+          </code>
+          <p className="mt-1 text-sm leading-relaxed text-gray-600 dark:text-slate-100">
+            {meaning}
+          </p>
+        </div>
+      </div>
+
+      {/* Body: typical values followed by the optional note */}
+      <div className="mt-4 space-y-3">
+        <div>
+          <div className="mb-2 flex items-center gap-1.5">
+            <Info className="h-3.5 w-3.5 text-gray-500 dark:text-slate-400" />
+            <span className="text-xs font-semibold uppercase text-gray-700 dark:text-slate-100">
+              Typical Values
+            </span>
+          </div>
+          <div className="flex flex-wrap gap-1.5">{valueChips}</div>
+        </div>
+
+        {notesRow}
+      </div>
+    </div>
+  );
+}
--- a/packages/web/docs/src/components/otel-metrics/metric-card.tsx
+++ b/packages/web/docs/src/components/otel-metrics/metric-card.tsx
@ -0,0 +1,163 @@
+import { useEffect, useRef, useState } from 'react';
+import { Activity, BarChart3, Gauge, TrendingUp } from 'lucide-react';
+
+/** Props accepted by `MetricCard`. */
+interface MetricCardProps {
+  /** Metric name; shown in the card header and used to derive the card's DOM id. */
+  name: string;
+  /** Instrument kind; selects the icon and colour classes from `typeConfig`. */
+  type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge';
+  /** Optional unit; the "Unit:" badge is rendered only when this is truthy. */
+  unit?: string;
+  /** Optional description paragraph; rendered only when truthy. */
+  description?: string;
+  /** Optional label names; the "Labels" section is rendered only for a non-empty array. */
+  labels?: string[];
+}
+
+// Per-instrument-kind presentation settings, keyed by the `type` prop of MetricCard.
+// `icon` is the lucide-react component and `color` the Tailwind classes applied to
+// the type badge.
+// NOTE(review): `badge` is not read by MetricCard in this file — confirm whether it
+// is used elsewhere or can be dropped.
+const typeConfig = {
+  Counter: {
+    icon: TrendingUp,
+    color:
+      'bg-emerald-50 text-emerald-700 border-emerald-200 dark:bg-emerald-900/30 dark:text-emerald-300 dark:border-emerald-700/50',
+    badge: 'bg-emerald-100 text-emerald-800',
+  },
+  Histogram: {
+    icon: BarChart3,
+    color:
+      'bg-blue-50 text-blue-700 border-blue-200 dark:bg-blue-900/30 dark:text-blue-300 dark:border-blue-700/50',
+    badge: 'bg-blue-100 text-blue-800',
+  },
+  UpDownCounter: {
+    icon: Activity,
+    color:
+      'bg-amber-50 text-amber-700 border-amber-200 dark:bg-amber-900/30 dark:text-amber-300 dark:border-amber-700/50',
+    badge: 'bg-amber-100 text-amber-800',
+  },
+  Gauge: {
+    icon: Gauge,
+    color:
+      'bg-slate-50 text-slate-700 border-slate-200 dark:bg-slate-800/60 dark:text-slate-100 dark:border-slate-700',
+    badge: 'bg-slate-100 text-slate-800',
+  },
+};
+
+/**
+ * Card describing a single metric: name, instrument type badge, optional unit,
+ * optional description and optional list of label names.
+ *
+ * The card's DOM id is derived from the metric name, and a small button next to
+ * the name copies a URL pointing at that id to the clipboard. While the "copied"
+ * state is active (1200 ms) the button shows a check mark and a polite live
+ * region announces the copy.
+ */
+export function MetricCard({ name, type, unit, description, labels }: MetricCardProps) {
+  const appearance = typeConfig[type];
+  const TypeIcon = appearance.icon;
+  const [isCopied, setIsCopied] = useState(false);
+  // Handle of the pending "reset copied state" timer, if any.
+  const resetTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+
+  // Lower-case the name, collapse every run of non-alphanumerics into a single
+  // dash, then strip a leading/trailing dash.
+  const slug = name
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, '-')
+    .replace(/(^-|-$)/g, '');
+  const metricId = `metric-${slug}`;
+
+  // Cancel a still-pending reset timer when the card unmounts.
+  useEffect(
+    () => () => {
+      if (resetTimerRef.current) {
+        clearTimeout(resetTimerRef.current);
+      }
+    },
+    [],
+  );
+
+  /** Flags the link as copied and (re)starts the timer that clears the flag. */
+  const showCopiedState = () => {
+    if (resetTimerRef.current) {
+      clearTimeout(resetTimerRef.current);
+    }
+
+    setIsCopied(true);
+    resetTimerRef.current = setTimeout(() => {
+      setIsCopied(false);
+    }, 1200);
+  };
+
+  /**
+   * Writes the card's URL to the clipboard. If that throws, the location hash is
+   * set to the card id instead, so the link is still reachable from the address bar.
+   */
+  const copyMetricLink = async () => {
+    if (typeof window === 'undefined') {
+      return;
+    }
+
+    const { origin, pathname, search } = window.location;
+    const metricUrl = `${origin}${pathname}${search}#${metricId}`;
+
+    try {
+      await navigator.clipboard.writeText(metricUrl);
+      showCopiedState();
+    } catch {
+      window.location.hash = metricId;
+    }
+  };
+
+  const handleCopyClick = () => {
+    void copyMetricLink();
+  };
+
+  // The copy button is always visible while "copied" is shown; otherwise it only
+  // appears on hover or when focus is inside the card.
+  const copyButtonStateClasses = isCopied
+    ? 'translate-y-0 text-gray-500 opacity-100 dark:text-slate-500'
+    : 'translate-y-0 text-gray-500 opacity-0 hover:text-gray-700 focus:text-gray-700 group-focus-within:opacity-100 group-hover:opacity-100 dark:text-slate-500 dark:hover:text-slate-200 dark:focus:text-slate-200';
+
+  const labelChips = (labels ?? []).map(label => (
+    <code
+      key={label}
+      className="rounded border border-gray-200 bg-gray-50 px-2 py-1 text-xs text-gray-700 transition-colors hover:border-gray-300 dark:border-neutral-700 dark:bg-neutral-800 dark:text-slate-200 dark:hover:border-neutral-600"
+    >
+      {label}
+    </code>
+  ));
+
+  return (
+    <div
+      id={metricId}
+      className="group scroll-mt-20 overflow-hidden rounded-lg border border-gray-200 bg-white transition-shadow duration-200 hover:shadow-md dark:border-neutral-800 dark:bg-neutral-900 dark:hover:shadow-black/30"
+    >
+      <div className="p-5">
+        <div className="mb-3 flex items-center justify-between gap-4">
+          <div className="min-w-0 flex-1">
+            <div className="flex items-center gap-1.5">
+              <code className="break-all text-sm font-semibold text-gray-900 dark:text-slate-100">
+                {name}
+              </code>
+              <button
+                type="button"
+                onClick={handleCopyClick}
+                className={`hive-focus inline-flex items-center justify-center rounded font-mono text-sm font-semibold leading-none transition-all duration-200 ${copyButtonStateClasses}`}
+                aria-label={`Copy link to ${name}`}
+                title="Copy metric link"
+              >
+                {isCopied ? (
+                  <>
+                    <span>✓</span>
+                    <span className="ml-1 text-xs">copied</span>
+                  </>
+                ) : (
+                  '#'
+                )}
+              </button>
+              <span className="sr-only" aria-live="polite">
+                {isCopied ? `Copied link to ${name}` : ''}
+              </span>
+            </div>
+          </div>
+          <div className="flex shrink-0 items-center gap-2">
+            {unit && (
+              <div className="flex items-center gap-1.5 rounded-md border border-gray-200 bg-gray-100 px-2.5 py-1 text-xs text-gray-700 dark:border-neutral-700 dark:bg-neutral-800 dark:text-slate-200">
+                <span className="font-medium text-gray-500 dark:text-slate-300">Unit:</span>
+                <code>{unit}</code>
+              </div>
+            )}
+            <div
+              className={`flex items-center gap-1.5 rounded-md border px-2.5 py-1 ${appearance.color}`}
+            >
+              <TypeIcon className="h-3.5 w-3.5" />
+              <span className="text-xs font-medium">{type}</span>
+            </div>
+          </div>
+        </div>
+
+        {description && (
+          <p className="mb-4 text-sm leading-relaxed text-gray-600 dark:text-slate-100">
+            {description}
+          </p>
+        )}
+
+        {labelChips.length > 0 && (
+          <div className="mt-4 border-t border-gray-100 pt-4 dark:border-neutral-800">
+            <div className="mb-2 flex items-center gap-2">
+              <span className="text-xs font-semibold uppercase text-gray-700 dark:text-slate-100">
+                Labels
+              </span>
+            </div>
+            <div className="flex flex-wrap gap-1.5">{labelChips}</div>
+          </div>
+        )}
+      </div>
+    </div>
+  );
+}
--- a/packages/web/docs/src/components/otel-metrics/metrics-section.tsx
+++ b/packages/web/docs/src/components/otel-metrics/metrics-section.tsx
@ -0,0 +1,80 @@
+'use client';
+
+import { useId, useState } from 'react';
+import { ChevronDown } from 'lucide-react';
+import { LabelCard } from './label-card';
+import { MetricCard } from './metric-card';
+
+/** One entry of the `metrics` list; spread as props onto `MetricCard`. */
+interface Metric {
+  /** Metric name; also used as the React key of its card. */
+  name: string;
+  /** Instrument kind. */
+  type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge';
+  /** Optional unit of the metric. */
+  unit?: string;
+  /** Optional description of the metric. */
+  description?: string;
+  /** Optional names of the labels attached to the metric. */
+  labels?: string[];
+}
+
+/** One entry of the `labels` list; spread as props onto `LabelCard`. */
+interface Label {
+  /** Label name; also used as the React key of its row. */
+  name: string;
+  /** Explanation of the label. */
+  meaning: string;
+  /** Example values of the label. */
+  typicalValues: string[];
+  /** Optional extra remark about the label. */
+  notes?: string;
+}
+
+/** Props accepted by `MetricsSection`. */
+interface MetricsSectionProps {
+  // NOTE(review): `title` and `description` are declared here but are not
+  // destructured or rendered by MetricsSection — confirm whether they are
+  // still needed.
+  title?: string;
+  description?: string;
+  /** Metrics to list; the "Metrics" block is rendered only for a non-empty array. */
+  metrics?: Metric[];
+  /** Labels to list; the "Labels Reference" panel is rendered only for a non-empty array. */
+  labels?: Label[];
+}
+/**
+ * Renders a documentation section for a group of metrics.
+ *
+ * - `metrics` (when non-empty) are listed as a grid of `MetricCard`s under a
+ *   "Metrics" heading.
+ * - `labels` (when non-empty) are listed inside a collapsible "Labels Reference"
+ *   panel that starts collapsed and is toggled by the header button.
+ *
+ * The panel is collapsed with `max-height: 0` so the open/close transition can
+ * animate. Because that only hides the content visually, the region is also
+ * marked `aria-hidden` while collapsed so assistive technology does not read
+ * content that the toggle reports as not expanded.
+ */
+export function MetricsSection({ metrics, labels }: MetricsSectionProps) {
+  const [isLabelsOpen, setIsLabelsOpen] = useState(false);
+  // Stable id linking the toggle button (`aria-controls`) to the panel it controls.
+  const labelsRegionId = useId();
+
+  return (
+    <div className="space-y-6">
+      {metrics && metrics.length > 0 && (
+        <div className="space-y-4">
+          <h4 className="mt-8 text-xl font-semibold tracking-tight text-slate-900 dark:text-slate-100">
+            Metrics
+          </h4>
+          <div className="grid gap-4">
+            {metrics.map(metric => (
+              <MetricCard key={metric.name} {...metric} />
+            ))}
+          </div>
+        </div>
+      )}
+
+      {labels && labels.length > 0 && (
+        <div className="overflow-hidden rounded-lg border border-gray-200 bg-white dark:border-neutral-800 dark:bg-neutral-900">
+          <button
+            type="button"
+            onClick={() => setIsLabelsOpen(current => !current)}
+            aria-expanded={isLabelsOpen}
+            aria-controls={labelsRegionId}
+            className="hive-focus flex w-full items-center justify-between px-5 py-4 text-left text-xl font-semibold tracking-tight text-slate-900 dark:text-slate-100"
+          >
+            <span>Labels Reference</span>
+            <ChevronDown
+              className={`h-5 w-5 transition-transform duration-200 ${isLabelsOpen ? 'rotate-180' : ''}`}
+            />
+          </button>
+          {/* Kept mounted so max-height/opacity can transition; hidden from the
+              accessibility tree while collapsed. */}
+          <div
+            id={labelsRegionId}
+            aria-hidden={!isLabelsOpen}
+            className={`overflow-hidden transition-[max-height,opacity] duration-300 ease-out ${isLabelsOpen ? 'max-h-[4000px] opacity-100' : 'max-h-0 opacity-90'}`}
+          >
+            <div className="border-t border-gray-100 px-5 pb-5 dark:border-neutral-800">
+              <div className="divide-y divide-gray-100 pt-2 dark:divide-neutral-800">
+                {labels.map(label => (
+                  <div key={label.name} className="py-6">
+                    <LabelCard {...label} />
+                  </div>
+                ))}
+              </div>
+            </div>
+          </div>
+        </div>
+      )}
+    </div>
+  );
+}
--- a/packages/web/docs/src/content/router/configuration/telemetry.mdx
+++ b/packages/web/docs/src/content/router/configuration/telemetry.mdx
@ -5,7 +5,7 @@ title: 'telemetry'
 # telemetry

 The `telemetry` configuration controls client identification, Hive reporting, and OpenTelemetry
-tracing behavior in Hive Router.
+tracing and metrics behavior in Hive Router.

 ## client_identification

@ -232,6 +232,265 @@ telemetry:
            x-api-key: key
 ```

+</details>
+
+</div>
+
+</details>
+
+## metrics
+
+Top-level OpenTelemetry metrics configuration.
+
+<details>
+  <summary>Show metrics configuration</summary>
+
+Metrics are enabled when at least one exporter is configured and enabled.
+
+| Field             | Type     | Default | Notes                                                                                 |
+| ----------------- | -------- | ------- | ------------------------------------------------------------------------------------- |
+| `exporters`       | `array`  | `[]`    | List of exporters used to send metrics.                                               |
+| `instrumentation` | `object` | `{}`    | Instrument behavior for metrics (histogram aggregation and per-instrument overrides). |
+
+<div id="telemetry-metrics-exporters" style={{marginTop: 10}}>
+<details>
+  <summary>`exporters`</summary>
+
+Each item in this array configures one metrics exporter instance, so you can send metrics to
+multiple destinations if needed.
+
+This reference documents OTLP and Prometheus exporter configuration.
+
+| Field     | Type      | Default | Notes                                                  |
+| --------- | --------- | ------- | ------------------------------------------------------ |
+| `kind`    | `string`  | -       | Exporter kind. Supported values: `otlp`, `prometheus`. |
+| `enabled` | `boolean` | `true`  | Enables or disables this exporter.                     |
+
+<details>
+  <summary>`otlp`</summary>
+
+| Field                                                                                                               | Type                 | Default      | Notes                                                             |
+| ------------------------------------------------------------------------------------------------------------------- | -------------------- | ------------ | ----------------------------------------------------------------- |
+| `kind`                                                                                                              | `string`             | -            | Must be `otlp`.                                                   |
+| `enabled`                                                                                                           | `boolean`            | `true`       | Enables or disables this exporter.                                |
+| `endpoint`                                                                                                          | `StringOrExpression` | -            | OTLP endpoint. Must be set explicitly.                            |
+| `protocol`                                                                                                          | `string`             | -            | OTLP transport protocol. Supported values: `http`, `grpc`.        |
+| `interval`                                                                                                          | `string`             | `60s`        | Interval between OTLP export attempts.                            |
+| [`temporality`](https://opentelemetry.io/docs/specs/otel/metrics/supplementary-guidelines/#aggregation-temporality) | `string`             | `cumulative` | Aggregation temporality. Supported values: `cumulative`, `delta`. |
+| `max_export_timeout`                                                                                                | `string`             | `5s`         | Maximum time for one OTLP export attempt.                         |
+| `http`                                                                                                              | `object`             | -            | HTTP-specific OTLP settings (for `protocol: http`).               |
+| `grpc`                                                                                                              | `object`             | -            | gRPC-specific OTLP settings (for `protocol: grpc`).               |
+
+OTLP over HTTP:
+
+| Field          | Type     | Value / Default | Notes                                                         |
+| -------------- | -------- | --------------- | ------------------------------------------------------------- |
+| `protocol`     | `string` | `http`          | OTLP transport protocol.                                      |
+| `http.headers` | `object` | `{}`            | Map of header names to values (`string` or `{ expression }`). |
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    exporters:
+      - kind: otlp
+        enabled: true
+        protocol: http
+        endpoint: https://otel-collector.example.com/v1/metrics
+        interval: 60s
+        temporality: cumulative
+        max_export_timeout: 5s
+        http:
+          headers:
+            x-otlp-header: value
+```
+
+OTLP over gRPC:
+
+| Field                  | Type     | Value / Default | Notes                                                                        |
+| ---------------------- | -------- | --------------- | ---------------------------------------------------------------------------- |
+| `protocol`             | `string` | `grpc`          | OTLP transport protocol.                                                     |
+| `grpc.metadata`        | `object` | `{}`            | Map of metadata keys to values (`string` or `{ expression }`).               |
+| `grpc.tls.domain_name` | `string` | -               | Domain name used to verify the server certificate.                           |
+| `grpc.tls.key`         | `string` | -               | Path to the client private key file.                                         |
+| `grpc.tls.cert`        | `string` | -               | Path to the client certificate file (PEM).                                   |
+| `grpc.tls.ca`          | `string` | -               | Path to the CA certificate file (PEM) used to verify the server certificate. |
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    exporters:
+      - kind: otlp
+        enabled: true
+        protocol: grpc
+        endpoint: https://otel-collector.example.com:4317
+        interval: 60s
+        temporality: cumulative
+        max_export_timeout: 5s
+        grpc:
+          metadata:
+            x-api-key: key
+```
+
+</details>
+
+<details>
+  <summary>`prometheus`</summary>
+
+| Field     | Type      | Default    | Notes                                       |
+| --------- | --------- | ---------- | ------------------------------------------- |
+| `kind`    | `string`  | -          | Must be `prometheus`.                       |
+| `enabled` | `boolean` | `true`     | Enables/disables Prometheus metrics export. |
+| `port`    | `integer` | -          | Optional port for metrics endpoint.         |
+| `path`    | `string`  | `/metrics` | HTTP path exposed for scraping.             |
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    exporters:
+      - kind: prometheus
+        enabled: true
+        port: 9090
+        path: /metrics
+```
+
+</details>
+
+</details>
+</div>
+
+<div id="telemetry-metrics-instrumentation" style={{marginTop: 10}}>
+<details>
+  <summary>`instrumentation`</summary>
+
+Controls histogram aggregation and per-instrument overrides.
+
+| Field              | Type     | Default                 | Notes                                                       |
+| ------------------ | -------- | ----------------------- | ----------------------------------------------------------- |
+| `common.histogram` | `object` | explicit aggregation    | Histogram aggregation strategy for instrumented histograms. |
+| `instruments`      | `object` | `{}`                    | Map of metric name to `false`, `true`, or object override.  |
+
+<details>
+  <summary>`common.histogram`</summary>
+
+Set aggregation mode with `aggregation`.
+
+`explicit` aggregation (default):
+
+| Field         | Type     | Default | Notes                                                 |
+| ------------- | -------- | ------- | ----------------------------------------------------- |
+| `aggregation` | `string` | -       | Must be `explicit`.                                   |
+| `seconds`     | `object` | -       | Explicit histogram config for metrics with unit `s`.  |
+| `bytes`       | `object` | -       | Explicit histogram config for metrics with unit `By`. |
+
+`seconds` and `bytes` fields:
+
+| Field            | Type                   | Default | Notes                                                           |
+| ---------------- | ---------------------- | ------- | --------------------------------------------------------------- |
+| `buckets`        | `number[] \| string[]` | varies  | Explicit bucket upper bounds. Must be non-empty and increasing. |
+| `record_min_max` | `boolean`              | `false` | Record min/max values for this unit bucket set.                 |
+
+Default explicit buckets:
+
+- `seconds.buckets`: `[0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10]`
+- `bytes.buckets`:
+  `[128, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 3145728, 4194304, 5242880]`
+
+Bucket format rules:
+
+- `buckets` can be either all numbers or all strings.
+- mixed arrays are not allowed.
+- for `seconds.buckets`, string values are parsed as durations (for example `"5ms"`, `"1s"`).
+- for `bytes.buckets`, string values are parsed as human-readable sizes (for example `"1KB"`,
+  `"5MB"`).
+
+[`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram)
+aggregation:
+
+| Field                                                                                                               | Type      | Default | Notes                                     |
+| ------------------------------------------------------------------------------------------------------------------- | --------- | ------- | ----------------------------------------- |
+| `aggregation`                                                                                                       | `string`  | -       | Must be `exponential`.                    |
+| [`max_size`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation)  | `integer` | -       | Max bucket count. Required.               |
+| [`max_scale`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation) | `integer` | -       | Max scale for bucket precision. Required. |
+| `record_min_max`                                                                                                    | `boolean` | `false` | Record min/max values.                    |
+
+</details>
+
+<details>
+  <summary>`instruments`</summary>
+
+`instruments` is a map keyed by metric name. Value can be:
+
+- `false` to disable a metric
+- `true` to keep defaults
+- object to keep metric enabled and override attributes
+
+Object form supports:
+
+| Field        | Type     | Notes                                                                          |
+| ------------ | -------- | ------------------------------------------------------------------------------ |
+| `attributes` | `object` | Map of attribute name to `boolean` (`false` drops attribute, `true` keeps it). |
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    instrumentation:
+      common:
+        histogram:
+          aggregation: explicit
+          seconds:
+            buckets:
+              [
+                '5ms',
+                '10ms',
+                '25ms',
+                '50ms',
+                '75ms',
+                '100ms',
+                '250ms',
+                '500ms',
+                '750ms',
+                '1s',
+                '2.5s',
+                '5s',
+                '7.5s',
+                '10s'
+              ]
+            record_min_max: false
+          bytes:
+            buckets:
+              [
+                '128B',
+                '512B',
+                '1KB',
+                '2KB',
+                '4KB',
+                '8KB',
+                '16KB',
+                '32KB',
+                '64KB',
+                '128KB',
+                '256KB',
+                '512KB',
+                '1MB',
+                '2MB',
+                '3MB',
+                '4MB',
+                '5MB'
+              ]
+            record_min_max: false
+      instruments:
+        http.server.request.duration: true
+        http.client.request.duration:
+          attributes:
+            subgraph.name: true
+            http.response.status_code: true
+            server.address: false
+```
+
+</details>
+
 </details>
 </div>

--- a/packages/web/docs/src/content/router/observability/metrics.mdx
+++ b/packages/web/docs/src/content/router/observability/metrics.mdx
@ -0,0 +1,825 @@
+---
+title: 'OpenTelemetry Metrics'
+---
+
+import { Callout } from '#components/callout'
+import { MetricsSection } from '#components/otel-metrics/metrics-section'
+import { Tabs } from '@theguild/components'
+
+# OpenTelemetry Metrics
+
+Hive Router exposes OpenTelemetry metrics for gateway traffic, subgraph traffic, cache behavior,
+supergraph lifecycle, and GraphQL errors.
+
+This guide explains where to export metrics, how to configure OTLP and Prometheus, how to customize
+instruments, and what each metric/label means in practice.
+
+## Choose your metrics destination
+
+Hive Router exposes metrics through two widely used integration patterns:
+
+- OTLP-based observability backends
+- Prometheus scrape endpoints
+
+Most teams already running an OpenTelemetry pipeline tend to integrate via OTLP, while teams built
+around Prometheus and Grafana typically stick with Prometheus scraping.
+
+### Send metrics to OTLP-compatible backends
+
+Hive Router can export metrics using OTLP to standard OpenTelemetry pipelines, including the
+OpenTelemetry Collector and vendor backends that support OTLP ingestion over HTTP or gRPC.
+
+After enabling the exporter, generate some traffic through the router and confirm that new metric
+series appear in your backend (for example HTTP server/client latency, cache metrics, and supergraph
+execution metrics).
+
+If metrics do not appear, verify:
+
+- Endpoint reachability (network, DNS, TLS)
+- Authentication credentials or headers
+- Exporter protocol matches the backend (OTLP/HTTP vs OTLP/gRPC)
+
+<Tabs items={["OTLP over HTTP", "OTLP over gRPC"]}>
+
+<Tabs.Tab>
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    exporters:
+      - kind: otlp
+        enabled: true
+        protocol: http
+        endpoint: https://otel-collector.example.com/v1/metrics
+        interval: 30s
+        max_export_timeout: 5s
+        http:
+          headers:
+            authorization:
+              expression: |
+                "Bearer " + env("OTLP_TOKEN")
+```
+
+</Tabs.Tab>
+
+<Tabs.Tab>
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    exporters:
+      - kind: otlp
+        enabled: true
+        protocol: grpc
+        endpoint: https://otel-collector.example.com:4317
+        interval: 30s
+        max_export_timeout: 5s
+        grpc:
+          metadata:
+            x-api-key:
+              expression: env("OTEL_API_KEY")
+          tls:
+            domain_name: otel-collector.example.com
+            ca: /etc/certs/ca.pem
+            cert: /etc/certs/client.pem
+            key: /etc/certs/client.key
+```
+
+</Tabs.Tab>
+
+</Tabs>
+
+### Expose metrics for Prometheus scraping
+
+If your observability stack is Prometheus-first, Hive Router can expose an HTTP endpoint that
+Prometheus scrapes at its configured interval.
+
+The `port` and `path` settings define the address where the Router exposes metrics. Prometheus must
+be able to reach that address from its runtime environment (local network, Kubernetes service, or VM
+network path).
+
+<Callout type="note">
+  If `port` is not set, or is the same as the main HTTP server port, the Router exposes metrics
+  through the same HTTP server that serves the GraphQL API. If the port is different, the Router
+  starts a separate HTTP server dedicated solely to the Prometheus metrics endpoint.
+</Callout>
+
+In production, make sure this endpoint is reachable only by trusted scrapers (for example via
+network policy, firewall rules, or private ingress). Once configured, confirm the target appears as
+healthy in Prometheus and then verify expected series are present (for example
+`http.server.request.duration`, `http.client.request.duration`).
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    exporters:
+      - kind: prometheus
+        enabled: true
+        port: 9090
+        path: /metrics
+```
+
+## Production baseline
+
+For production workloads, start with a single primary exporter, define a clear service identity, and
+keep default instrumentation settings.
+
+```yaml filename="router.config.yaml"
+telemetry:
+  resource:
+    attributes:
+      service.name: hive-router
+      service.namespace: your-platform
+      deployment.environment:
+        expression: env("ENVIRONMENT")
+  metrics:
+    exporters:
+      - kind: otlp
+        enabled: true
+        protocol: grpc
+        endpoint: https://otel-collector.example.com:4317
+        interval: 30s
+        max_export_timeout: 5s
+```
+
+This is a safe baseline and works well before introducing instrumentation-level customization.
+Additional exporters can be added later, but starting with one simplifies validation and
+troubleshooting.
+
+### Cardinality considerations
+
+For production workloads, consider disabling the `graphql.operation.name` label, or even the
+`graphql.operation.type` label, on high-volume metrics.
+
+<Callout type="warning">
+  `graphql.operation.name` can create very high-cardinality metrics.
+
+Operation names come from client requests. Without persisted operations, clients can send many
+distinct operation names (or random names), which can rapidly increase cardinality and cost in
+Prometheus and OTLP backends.
+
+</Callout>
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    instrumentation:
+      instruments:
+        http.server.request.duration:
+          attributes:
+            graphql.operation.name: false
+        http.server.request.body.size:
+          attributes:
+            graphql.operation.name: false
+        http.server.response.body.size:
+          attributes:
+            graphql.operation.name: false
+```
+
+## Customize instrumentation
+
+You can override behavior per metric under `telemetry.metrics.instrumentation.instruments`.
+
+<Callout type="tip">
+  Disable non-essential labels to control cost, since each additional label value increases active
+  time-series cardinality - a primary billing and performance driver in platforms like Grafana and
+  Datadog.
+</Callout>
+
+For production guidance on label cardinality (especially `graphql.operation.name`), see
+[Cardinality considerations](#cardinality-considerations).
+
+- `false` disables a metric.
+- `true` keeps default behavior.
+- object form keeps the metric enabled and optionally overrides its attributes.
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    instrumentation:
+      instruments:
+        # Disable HTTP server request duration metric
+        http.server.request.duration: false
+        http.client.request.duration:
+          attributes:
+            # Disable the label
+            subgraph.name: false
+            # Enable the label (labels are enabled by default)
+            http.response.status_code: true
+```
+
+Attribute override behavior:
+
+- `false` - drop label from that metric
+- `true` - keep label (all labels are enabled by default)
+
+Histogram aggregation can also be customized under
+`telemetry.metrics.instrumentation.common.histogram`.
+
+- [`explicit`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#histogram) (default)
+  uses a separate bucket set per histogram unit, which you can configure under:
+  - `seconds` for histogram unit `s`
+  - `bytes` for histogram unit `By`
+- [`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram)
+  uses one shared exponential strategy for all histogram metrics.
+- `record_min_max` controls whether min and max are reported for histogram points.
+
+Bucket format rules:
+
+- [`buckets`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#explicit-bucket-histogram-aggregation)
+  can be either all numbers or all strings.
+- mixed arrays are not allowed.
+- `seconds.buckets` string values are parsed as durations (for example `"5ms"`, `"1s"`).
+- `bytes.buckets` string values are parsed as human-readable sizes (for example `"1KB"`, `"5MB"`).
+
+In `explicit` mode, any histogram whose unit is not `s` or `By` causes startup to fail.
+
+```yaml filename="router.config.yaml"
+telemetry:
+  metrics:
+    instrumentation:
+      common:
+        histogram:
+          aggregation: explicit
+          seconds:
+            buckets:
+              [
+                '5ms',
+                '10ms',
+                '25ms',
+                '50ms',
+                '75ms',
+                '100ms',
+                '250ms',
+                '500ms',
+                '750ms',
+                '1s',
+                '2.5s',
+                '5s',
+                '7.5s',
+                '10s'
+              ]
+            record_min_max: false
+          bytes:
+            buckets:
+              [
+                '128B',
+                '512B',
+                '1KB',
+                '2KB',
+                '4KB',
+                '8KB',
+                '16KB',
+                '32KB',
+                '64KB',
+                '128KB',
+                '256KB',
+                '512KB',
+                '1MB',
+                '2MB',
+                '3MB',
+                '4MB',
+                '5MB'
+              ]
+            record_min_max: false
+```
+
+## Metrics reference
+
+### GraphQL
+
+GraphQL metrics capture errors surfaced by the router across all stages of a GraphQL request
+lifecycle.
+
+<MetricsSection
+  metrics={[
+    {
+      name: 'hive.router.graphql.errors_total',
+      type: 'Counter',
+      unit: '{error}',
+      description:
+        'Total count of GraphQL errors encountered during query processing and execution, categorized by error code.',
+      labels: ['code']
+    }
+  ]}
+  labels={[
+    {
+      name: 'code',
+      meaning: 'GraphQL error code',
+      typicalValues: [
+        'GRAPHQL_PARSE_FAILED',
+        'GRAPHQL_VALIDATION_FAILED',
+        'PLAN_EXECUTION_FAILED',
+        'UNKNOWN',
+        '...'
+      ],
+      notes: `Uses "extensions.code" values and router's error codes. "UNKNOWN" is used when no code is available.`
+    }
+  ]}
+/>
+
+### Supergraph
+
+Supergraph metrics cover the polling and processing lifecycle of schema updates.
+
+<MetricsSection
+  metrics={[
+    {
+      name: 'hive.router.supergraph.poll.total',
+      type: 'Counter',
+      description: 'Total number of supergraph polling attempts, categorized by poll result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.supergraph.poll.duration',
+      type: 'Histogram',
+      unit: 'Seconds',
+      description: 'Duration of supergraph polling attempts, categorized by poll result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.supergraph.process.duration',
+      type: 'Histogram',
+      unit: 'Seconds',
+      description: 'Time spent processing supergraph updates, categorized by status.',
+      labels: ['status']
+    }
+  ]}
+  labels={[
+    {
+      name: 'result',
+      meaning: 'Result of the poll',
+      typicalValues: ['updated', 'not_modified', 'error'],
+      notes: 'Used by "hive.router.supergraph.poll.*" metrics only'
+    },
+    {
+      name: 'status',
+      meaning: 'Supergraph processing status',
+      typicalValues: ['ok', 'error'],
+      notes: 'Used by "hive.router.supergraph.process.*" metrics only'
+    }
+  ]}
+/>
+
+### HTTP server
+
+HTTP server metrics capture inbound client traffic processed by the router.
+
+<MetricsSection
+  metrics={[
+    {
+      name: 'http.server.request.duration',
+      type: 'Histogram',
+      unit: 'Seconds',
+      description: 'Duration of inbound HTTP requests handled by the router.',
+      labels: [
+        'http.request.method',
+        'http.response.status_code',
+        'http.route',
+        'network.protocol.name',
+        'network.protocol.version',
+        'url.scheme',
+        'error.type',
+        'graphql.operation.name',
+        'graphql.operation.type',
+        'graphql.response.status'
+      ]
+    },
+    {
+      name: 'http.server.request.body.size',
+      type: 'Histogram',
+      unit: 'Bytes',
+      description: 'Size of inbound HTTP request bodies handled by the router.',
+      labels: [
+        'http.request.method',
+        'http.response.status_code',
+        'http.route',
+        'network.protocol.name',
+        'network.protocol.version',
+        'url.scheme',
+        'error.type',
+        'graphql.operation.name',
+        'graphql.operation.type',
+        'graphql.response.status'
+      ]
+    },
+    {
+      name: 'http.server.response.body.size',
+      type: 'Histogram',
+      unit: 'Bytes',
+      description: 'Size of outbound HTTP response bodies returned by the router.',
+      labels: [
+        'http.request.method',
+        'http.response.status_code',
+        'http.route',
+        'network.protocol.name',
+        'network.protocol.version',
+        'url.scheme',
+        'error.type',
+        'graphql.operation.name',
+        'graphql.operation.type',
+        'graphql.response.status'
+      ]
+    },
+    {
+      name: 'http.server.active_requests',
+      type: 'UpDownCounter',
+      unit: '{request}',
+      description: 'Current number of in-flight inbound HTTP requests.',
+      labels: ['http.request.method', 'network.protocol.name', 'url.scheme']
+    }
+  ]}
+  labels={[
+    {
+      name: 'http.request.method',
+      meaning: 'HTTP method',
+      typicalValues: [
+        'GET',
+        'POST',
+        'PUT',
+        'PATCH',
+        'DELETE',
+        'HEAD',
+        'OPTIONS',
+        'CONNECT',
+        'TRACE',
+        'QUERY',
+        '_OTHER'
+      ],
+      notes: '_OTHER is fallback for unknown methods'
+    },
+    {
+      name: 'http.response.status_code',
+      meaning: 'Response status code',
+      typicalValues: ['200', '400', '500', '...']
+    },
+    {
+      name: 'http.route',
+      meaning: 'Normalized router path',
+      typicalValues: ['/graphql']
+    },
+    {
+      name: 'network.protocol.name',
+      meaning: 'Protocol name',
+      typicalValues: ['http']
+    },
+    {
+      name: 'network.protocol.version',
+      meaning: 'Protocol version',
+      typicalValues: ['0.9', '1.0', '1.1', '2', '3']
+    },
+    {
+      name: 'url.scheme',
+      meaning: 'URL scheme',
+      typicalValues: ['http', 'https']
+    },
+    {
+      name: 'error.type',
+      meaning: 'Error classification for failed requests',
+      typicalValues: ['status code >= 400'],
+      notes: 'Only set for failed requests'
+    },
+    {
+      name: 'graphql.operation.name',
+      meaning: 'GraphQL operation name associated with the HTTP request',
+      typicalValues: ['UsersQuery', 'IntrospectionQuery', 'UNKNOWN'],
+      notes:
+        'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. High-cardinality risk: value is client-controlled and can explode without persisted operations.'
+    },
+    {
+      name: 'graphql.operation.type',
+      meaning: 'GraphQL operation type',
+      typicalValues: ['query', 'mutation', 'subscription'],
+      notes:
+        'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. Omitted when unknown'
+    },
+    {
+      name: 'graphql.response.status',
+      meaning: 'GraphQL response status for the request',
+      typicalValues: ['ok', 'error'],
+      notes:
+        'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. "error" indicates the GraphQL response contains at least one error'
+    }
+  ]}
+/>
+
+### HTTP client
+
+HTTP client metrics capture outbound requests the router makes to subgraphs.
+
+<MetricsSection
+  metrics={[
+    {
+      name: 'http.client.request.duration',
+      type: 'Histogram',
+      unit: 'Seconds',
+      description: 'Duration of outbound HTTP requests sent from router to subgraphs.',
+      labels: [
+        'http.request.method',
+        'server.address',
+        'server.port',
+        'network.protocol.name',
+        'network.protocol.version',
+        'url.scheme',
+        'subgraph.name',
+        'http.response.status_code',
+        'error.type',
+        'graphql.response.status'
+      ]
+    },
+    {
+      name: 'http.client.request.body.size',
+      type: 'Histogram',
+      unit: 'Bytes',
+      description: 'Size of outbound HTTP request bodies sent to subgraphs.',
+      labels: [
+        'http.request.method',
+        'server.address',
+        'server.port',
+        'network.protocol.name',
+        'network.protocol.version',
+        'url.scheme',
+        'subgraph.name',
+        'http.response.status_code',
+        'error.type',
+        'graphql.response.status'
+      ]
+    },
+    {
+      name: 'http.client.response.body.size',
+      type: 'Histogram',
+      unit: 'Bytes',
+      description: 'Size of HTTP response bodies returned by subgraphs.',
+      labels: [
+        'http.request.method',
+        'server.address',
+        'server.port',
+        'network.protocol.name',
+        'network.protocol.version',
+        'url.scheme',
+        'subgraph.name',
+        'http.response.status_code',
+        'error.type',
+        'graphql.response.status'
+      ]
+    },
+    {
+      name: 'http.client.active_requests',
+      type: 'UpDownCounter',
+      unit: '{request}',
+      description: 'Current number of in-flight outbound HTTP requests to subgraphs.',
+      labels: [
+        'http.request.method',
+        'server.address',
+        'server.port',
+        'url.scheme',
+        'subgraph.name'
+      ]
+    }
+  ]}
+  labels={[
+    {
+      name: 'http.request.method',
+      meaning: 'HTTP method',
+      typicalValues: [
+        'GET',
+        'POST',
+        'PUT',
+        'PATCH',
+        'DELETE',
+        'HEAD',
+        'OPTIONS',
+        'CONNECT',
+        'TRACE',
+        'QUERY',
+        '_OTHER'
+      ],
+      notes: '_OTHER is fallback for unknown methods'
+    },
+    {
+      name: 'http.response.status_code',
+      meaning: 'Response status code',
+      typicalValues: ['200', '400', '500', '...']
+    },
+    {
+      name: 'network.protocol.name',
+      meaning: 'Protocol name',
+      typicalValues: ['http']
+    },
+    {
+      name: 'network.protocol.version',
+      meaning: 'Protocol version',
+      typicalValues: ['0.9', '1.0', '1.1', '2', '3']
+    },
+    {
+      name: 'url.scheme',
+      meaning: 'URL scheme',
+      typicalValues: ['http', 'https']
+    },
+    {
+      name: 'server.address',
+      meaning: 'Subgraph host',
+      typicalValues: ['URI host', 'unknown'],
+      notes: 'URI host, or unknown fallback'
+    },
+    {
+      name: 'server.port',
+      meaning: 'Subgraph port',
+      typicalValues: ['80', '443'],
+      notes: 'Explicit URI port, or fallback 80/443'
+    },
+    {
+      name: 'subgraph.name',
+      meaning: 'Subgraph identifier',
+      typicalValues: ['accounts'],
+      notes: 'Configured names (for example "accounts")'
+    },
+    {
+      name: 'error.type',
+      meaning: 'Error classification',
+      typicalValues: ['400', 'SUBGRAPH_REQUEST_FAILURE', '...'],
+      notes: 'Numeric status code >= 400 or execution error code string'
+    },
+    {
+      name: 'graphql.response.status',
+      meaning: 'GraphQL response status for the subgraph request',
+      typicalValues: ['ok', 'error'],
+      notes:
+        'Set to "ok" when the parsed subgraph response has no GraphQL errors. Set to "error" when the subgraph response includes GraphQL errors or when transport/deserialization fails.'
+    }
+  ]}
+/>
+
+### Cache
+
+Cache metrics track lookup behavior and cache size across router caches used during request
+preparation and planning stages.
+
+#### Parsing cache
+
+Parsing cache metrics measure query parse cache hit/miss behavior and cache size.
+
+<MetricsSection
+  metrics={[
+    {
+      name: 'hive.router.parse_cache.requests_total',
+      type: 'Counter',
+      description: 'Total number of parsing cache lookups, categorized by result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.parse_cache.duration',
+      type: 'Histogram',
+      unit: 'Seconds',
+      description: 'Duration of parsing cache lookups, categorized by result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.parse_cache.size',
+      type: 'Gauge',
+      description: 'Current number of entries stored in the parsing cache.'
+    }
+  ]}
+/>
+
+#### Validation cache
+
+Validation cache metrics measure query validation cache hit/miss behavior and cache size.
+
+<MetricsSection
+  metrics={[
+    {
+      name: 'hive.router.validate_cache.requests_total',
+      type: 'Counter',
+      description: 'Total number of validation cache lookups, categorized by result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.validate_cache.duration',
+      type: 'Histogram',
+      unit: 'Seconds',
+      description: 'Duration of validation cache lookups, categorized by result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.validate_cache.size',
+      type: 'Gauge',
+      description: 'Current number of entries stored in the validation cache.'
+    }
+  ]}
+/>
+
+#### Normalization cache
+
+Normalization cache metrics measure query normalization cache hit/miss behavior and cache size.
+
+<MetricsSection
+  metrics={[
+    {
+      name: 'hive.router.normalize_cache.requests_total',
+      type: 'Counter',
+      description: 'Total number of normalization cache lookups, categorized by result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.normalize_cache.duration',
+      type: 'Histogram',
+      unit: 'Seconds',
+      description: 'Duration of normalization cache lookups, categorized by result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.normalize_cache.size',
+      type: 'Gauge',
+      description: 'Current number of entries stored in the normalization cache.'
+    }
+  ]}
+/>
+
+#### Planning cache
+
+Planning cache metrics measure query planning cache hit/miss behavior and cache size.
+
+<MetricsSection
+  metrics={[
+    {
+      name: 'hive.router.plan_cache.requests_total',
+      type: 'Counter',
+      description: 'Total number of planning cache lookups, categorized by result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.plan_cache.duration',
+      type: 'Histogram',
+      unit: 'Seconds',
+      description: 'Duration of planning cache lookups, categorized by result.',
+      labels: ['result']
+    },
+    {
+      name: 'hive.router.plan_cache.size',
+      type: 'Gauge',
+      description: 'Current number of entries stored in the planning cache.'
+    }
+  ]}
+/>
+
+#### Labels
+
+These labels are shared by cache lookup counters and duration histograms.
+
+<div className="mt-4">
+  <MetricsSection
+    labels={[
+      {
+        name: 'result',
+        meaning: 'Cache lookup outcome',
+        typicalValues: ['hit', 'miss'],
+        notes: 'Used by cache `requests_total` and `duration` metrics'
+      }
+    ]}
+  />
+</div>
+
+## What to monitor in production
+
+The examples below show which signals to monitor in production and how to break them down so you can
+quickly isolate API, subgraph, cache, and GraphQL issues.
+
+### Monitor end-to-end latency of your GraphQL API
+
+Use [`http.server.request.duration`](#metric-http-server-request-duration) as your primary latency
+signal.
+
+In production, break this metric down by `http.route`, `http.request.method`,
+`http.response.status_code`, and/or `graphql.response.status`, then track p95 and p99 latency per
+route and method. Keep successful and failed responses separated so error-path latency does not get
+hidden by healthy traffic.
+
+### Monitor health of your subgraphs
+
+Use [`http.client.request.duration`](#metric-http-client-request-duration) and
+[`http.client.active_requests`](#metric-http-client-active-requests) to monitor dependency health
+across your federated graph.
+
+Break these metrics down by `subgraph.name`, `http.response.status_code`, and `error.type` to
+identify which subgraph is driving tail latency or error spikes.
+
+### Monitor cache effectiveness and planning pressure
+
+Use the cache metrics to evaluate cache hit ratio, miss cost, and pressure over time.
+
+For request and duration metrics, split by `result` (`hit` and `miss`) so you can track hit ratio
+and miss latency per cache kind.
+
+### Monitor GraphQL errors over time
+
+Use [`hive.router.graphql.errors_total`](#metric-hive-router-graphql-errors-total) and break it down
+by `code` to track both volume and error distribution.
+
+In production, monitor how error-code distribution changes over time, not only total count, so you
+can separate validation issues from execution failures.
+
+## Configuration reference
+
+For full options and defaults, see
+[telemetry configuration reference](/docs/router/configuration/telemetry).