docs(router): metrics (#7696)

This commit is contained in:
Kamil Kisiela 2026-03-12 10:32:41 +01:00 committed by GitHub
parent 14c73e5751
commit a3acbd892f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 1386 additions and 1 deletions

View file

@ -0,0 +1,58 @@
import { Info, Lightbulb, Tag } from 'lucide-react';
/** Props accepted by `LabelCard`. */
interface LabelCardProps {
/** Label name; rendered inside a `<code>` element as the card heading. */
name: string;
/** Text rendered in the paragraph directly beneath the name. */
meaning: string;
/**
 * Values rendered as individual chips under the "Typical Values" heading.
 * Each value is also used as the React `key` of its chip, so values should be
 * unique within the array.
 */
typicalValues: string[];
/** Optional text; when truthy it is rendered next to a lightbulb icon. */
notes?: string;
}
/**
 * Reference entry for a single metric label.
 *
 * Layout, top to bottom:
 *  1. a heading row with a tag icon, the label `name` and its `meaning`;
 *  2. a "Typical Values" list where every entry of `typicalValues` is a chip;
 *  3. an optional notes row (lightbulb icon + text), rendered only when
 *     `notes` is truthy.
 */
export function LabelCard({ name, meaning, typicalValues, notes }: LabelCardProps) {
  // Heading: icon tile on the left, name + meaning stacked on the right.
  const heading = (
    <div className="mb-3 flex items-start gap-3">
      <div className="shrink-0 rounded-md border border-gray-200 bg-gray-100 p-1.5 dark:border-neutral-700 dark:bg-neutral-800">
        <Tag className="h-4 w-4 text-gray-600 dark:text-slate-100" />
      </div>
      <div className="min-w-0 flex-1">
        <code className="break-all text-sm font-semibold text-gray-900 dark:text-slate-100">
          {name}
        </code>
        <p className="mt-1 text-sm leading-relaxed text-gray-600 dark:text-slate-100">{meaning}</p>
      </div>
    </div>
  );

  // One chip per typical value; the value doubles as the React key.
  const valueChips = typicalValues.map(typicalValue => (
    <code
      key={typicalValue}
      className="rounded-md border border-slate-200 bg-slate-50 px-2.5 py-1 text-xs font-medium text-slate-700 dark:border-neutral-700 dark:bg-neutral-800 dark:text-slate-200"
    >
      {typicalValue}
    </code>
  ));

  // Short-circuits to the falsy `notes` value (rendering nothing) when absent.
  const notesRow = notes && (
    <div className="pt-1">
      <div className="flex items-start gap-2">
        <Lightbulb className="mt-0.5 h-3.5 w-3.5 shrink-0 text-amber-600 dark:text-amber-400" />
        <p className="text-sm leading-relaxed text-gray-600 dark:text-slate-100">{notes}</p>
      </div>
    </div>
  );

  return (
    <div>
      {heading}
      <div className="mt-4 space-y-3">
        <div>
          <div className="mb-2 flex items-center gap-1.5">
            <Info className="h-3.5 w-3.5 text-gray-500 dark:text-slate-400" />
            <span className="text-xs font-semibold uppercase text-gray-700 dark:text-slate-100">
              Typical Values
            </span>
          </div>
          <div className="flex flex-wrap gap-1.5">{valueChips}</div>
        </div>
        {notesRow}
      </div>
    </div>
  );
}

View file

@ -0,0 +1,163 @@
import { useEffect, useRef, useState } from 'react';
import { Activity, BarChart3, Gauge, TrendingUp } from 'lucide-react';
/** Props accepted by `MetricCard`. */
interface MetricCardProps {
/**
 * Metric name. Rendered as the card heading and also slugified into the
 * card's DOM `id` (`metric-<slug>`), which the copy-link button points at.
 */
name: string;
/** Instrument kind; selects the icon and colour classes from `typeConfig`. */
type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge';
/** Optional unit text; when truthy it is shown in a "Unit:" badge. */
unit?: string;
/** Optional description paragraph rendered below the heading row. */
description?: string;
/**
 * Optional label names listed in the "Labels" footer. The footer is rendered
 * only when the array is non-empty; each label is used as a React `key`.
 */
labels?: string[];
}
/**
 * Per-instrument-type presentation settings, keyed by `MetricCardProps['type']`.
 *
 * - `icon`:  lucide-react icon component rendered inside the type badge.
 * - `color`: Tailwind classes (light + dark variants) applied to the type badge
 *            container in `MetricCard`.
 * - `badge`: additional Tailwind classes; not read by `MetricCard` in this file.
 */
const typeConfig = {
Counter: {
icon: TrendingUp,
color:
'bg-emerald-50 text-emerald-700 border-emerald-200 dark:bg-emerald-900/30 dark:text-emerald-300 dark:border-emerald-700/50',
badge: 'bg-emerald-100 text-emerald-800',
},
Histogram: {
icon: BarChart3,
color:
'bg-blue-50 text-blue-700 border-blue-200 dark:bg-blue-900/30 dark:text-blue-300 dark:border-blue-700/50',
badge: 'bg-blue-100 text-blue-800',
},
UpDownCounter: {
icon: Activity,
color:
'bg-amber-50 text-amber-700 border-amber-200 dark:bg-amber-900/30 dark:text-amber-300 dark:border-amber-700/50',
badge: 'bg-amber-100 text-amber-800',
},
Gauge: {
icon: Gauge,
color:
'bg-slate-50 text-slate-700 border-slate-200 dark:bg-slate-800/60 dark:text-slate-100 dark:border-slate-700',
badge: 'bg-slate-100 text-slate-800',
},
};
/**
 * Card describing a single metric: name, instrument type, optional unit,
 * optional description and optional list of label names.
 *
 * The card's root element gets a DOM id of the form `metric-<slug>`, where the
 * slug is the lower-cased name with every run of characters outside `a-z0-9`
 * collapsed to `-` and leading/trailing dashes stripped. A small `#` button
 * next to the name copies a URL pointing at that id to the clipboard and shows
 * a "copied" state for 1200 ms. If the clipboard write throws, the button
 * instead navigates to the anchor by setting `window.location.hash`.
 */
export function MetricCard({ name, type, unit, description, labels }: MetricCardProps) {
  const { icon: Icon, color: typeBadgeClasses } = typeConfig[type];
  const [isCopied, setIsCopied] = useState(false);
  // Handle of the pending "reset copied state" timer, if any.
  const copiedTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);

  const slug = name
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/(^-|-$)/g, '');
  const metricId = `metric-${slug}`;

  // On unmount, cancel a pending reset timer.
  useEffect(
    () => () => {
      if (copiedTimeoutRef.current) {
        clearTimeout(copiedTimeoutRef.current);
      }
    },
    [],
  );

  // Flip to the "copied" state and (re)start the timer that flips it back.
  const showCopiedState = () => {
    if (copiedTimeoutRef.current) {
      clearTimeout(copiedTimeoutRef.current);
    }
    setIsCopied(true);
    copiedTimeoutRef.current = setTimeout(() => {
      setIsCopied(false);
    }, 1200);
  };

  const copyMetricLink = async () => {
    // No `window` means there is nothing to copy from.
    if (typeof window === 'undefined') {
      return;
    }
    const { origin, pathname, search } = window.location;
    const metricUrl = `${origin}${pathname}${search}#${metricId}`;
    try {
      await navigator.clipboard.writeText(metricUrl);
      showCopiedState();
    } catch {
      // Clipboard write failed: fall back to jumping to the anchor.
      window.location.hash = metricId;
    }
  };

  // The button is invisible until the card is hovered/focused, unless it is
  // currently showing the "copied" confirmation.
  const copyButtonStateClasses = isCopied
    ? 'translate-y-0 text-gray-500 opacity-100 dark:text-slate-500'
    : 'translate-y-0 text-gray-500 opacity-0 hover:text-gray-700 focus:text-gray-700 group-focus-within:opacity-100 group-hover:opacity-100 dark:text-slate-500 dark:hover:text-slate-200 dark:focus:text-slate-200';

  const copyButtonContent = isCopied ? (
    <>
      <span></span>
      <span className="ml-1 text-xs">copied</span>
    </>
  ) : (
    '#'
  );

  const unitBadge = unit && (
    <div className="flex items-center gap-1.5 rounded-md border border-gray-200 bg-gray-100 px-2.5 py-1 text-xs text-gray-700 dark:border-neutral-700 dark:bg-neutral-800 dark:text-slate-200">
      <span className="font-medium text-gray-500 dark:text-slate-300">Unit:</span>
      <code>{unit}</code>
    </div>
  );

  const descriptionParagraph = description && (
    <p className="mb-4 text-sm leading-relaxed text-gray-600 dark:text-slate-100">{description}</p>
  );

  const labelsFooter = labels && labels.length > 0 && (
    <div className="mt-4 border-t border-gray-100 pt-4 dark:border-neutral-800">
      <div className="mb-2 flex items-center gap-2">
        <span className="text-xs font-semibold uppercase text-gray-700 dark:text-slate-100">
          Labels
        </span>
      </div>
      <div className="flex flex-wrap gap-1.5">
        {labels.map(labelName => (
          <code
            key={labelName}
            className="rounded border border-gray-200 bg-gray-50 px-2 py-1 text-xs text-gray-700 transition-colors hover:border-gray-300 dark:border-neutral-700 dark:bg-neutral-800 dark:text-slate-200 dark:hover:border-neutral-600"
          >
            {labelName}
          </code>
        ))}
      </div>
    </div>
  );

  return (
    <div
      id={metricId}
      className="group scroll-mt-20 overflow-hidden rounded-lg border border-gray-200 bg-white transition-shadow duration-200 hover:shadow-md dark:border-neutral-800 dark:bg-neutral-900 dark:hover:shadow-black/30"
    >
      <div className="p-5">
        <div className="mb-3 flex items-center justify-between gap-4">
          <div className="min-w-0 flex-1">
            <div className="flex items-center gap-1.5">
              <code className="break-all text-sm font-semibold text-gray-900 dark:text-slate-100">
                {name}
              </code>
              <button
                type="button"
                onClick={() => {
                  void copyMetricLink();
                }}
                className={`hive-focus inline-flex items-center justify-center rounded font-mono text-sm font-semibold leading-none transition-all duration-200 ${copyButtonStateClasses}`}
                aria-label={`Copy link to ${name}`}
                title="Copy metric link"
              >
                {copyButtonContent}
              </button>
              {/* Announces the copy confirmation to screen readers. */}
              <span className="sr-only" aria-live="polite">
                {isCopied ? `Copied link to ${name}` : ''}
              </span>
            </div>
          </div>
          <div className="flex shrink-0 items-center gap-2">
            {unitBadge}
            <div
              className={`flex items-center gap-1.5 rounded-md border px-2.5 py-1 ${typeBadgeClasses}`}
            >
              <Icon className="h-3.5 w-3.5" />
              <span className="text-xs font-medium">{type}</span>
            </div>
          </div>
        </div>
        {descriptionParagraph}
        {labelsFooter}
      </div>
    </div>
  );
}

View file

@ -0,0 +1,80 @@
'use client';
import { useId, useState } from 'react';
import { ChevronDown } from 'lucide-react';
import { LabelCard } from './label-card';
import { MetricCard } from './metric-card';
/**
 * One metric entry of a `MetricsSection`. Each entry is spread into a
 * `MetricCard`, so the fields mirror that component's props.
 */
interface Metric {
/** Metric name; also used as the React `key` of its card. */
name: string;
/** Instrument kind shown in the card's type badge. */
type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge';
/** Optional unit text. */
unit?: string;
/** Optional description text. */
description?: string;
/** Optional names of the labels attached to this metric. */
labels?: string[];
}
/**
 * One label entry of a `MetricsSection`. Each entry is spread into a
 * `LabelCard`, so the fields mirror that component's props.
 */
interface Label {
/** Label name; also used as the React `key` of its row. */
name: string;
/** Short explanation of what the label represents. */
meaning: string;
/** Example values shown as chips. */
typicalValues: string[];
/** Optional additional remarks. */
notes?: string;
}
/** Props accepted by `MetricsSection`. */
interface MetricsSectionProps {
/** Declared but not read by `MetricsSection` as written in this file. */
title?: string;
/** Declared but not read by `MetricsSection` as written in this file. */
description?: string;
/** Metrics to list; the "Metrics" block is rendered only when non-empty. */
metrics?: Metric[];
/** Labels to list; the "Labels Reference" block is rendered only when non-empty. */
labels?: Label[];
}
/**
 * Documentation block listing a group of metrics and, optionally, a
 * collapsible reference of the labels those metrics carry.
 *
 * - `metrics`: when non-empty, rendered as a "Metrics" heading followed by one
 *   `MetricCard` per entry (keyed by metric name).
 * - `labels`: when non-empty, rendered as a "Labels Reference" disclosure that
 *   starts collapsed; each entry becomes a `LabelCard` (keyed by label name).
 *
 * `title` and `description` exist on the props type but are not rendered here.
 *
 * The disclosure is collapsed with CSS only (`max-h-0` + `overflow-hidden`) so
 * that it can animate. Because the content therefore stays in the DOM, the
 * region is additionally marked `aria-hidden` while collapsed; otherwise
 * assistive technology would still read it even though the toggle reports
 * `aria-expanded="false"`.
 */
export function MetricsSection({ metrics, labels }: MetricsSectionProps) {
  const [isLabelsOpen, setIsLabelsOpen] = useState(false);
  // Stable id linking the toggle button (`aria-controls`) to the region.
  const labelsRegionId = useId();

  return (
    <div className="space-y-6">
      {metrics && metrics.length > 0 && (
        <div className="space-y-4">
          <h4 className="mt-8 text-xl font-semibold tracking-tight text-slate-900 dark:text-slate-100">
            Metrics
          </h4>
          <div className="grid gap-4">
            {metrics.map(metric => (
              <MetricCard key={metric.name} {...metric} />
            ))}
          </div>
        </div>
      )}
      {labels && labels.length > 0 && (
        <div className="overflow-hidden rounded-lg border border-gray-200 bg-white dark:border-neutral-800 dark:bg-neutral-900">
          <button
            type="button"
            onClick={() => setIsLabelsOpen(current => !current)}
            aria-expanded={isLabelsOpen}
            aria-controls={labelsRegionId}
            className="hive-focus flex w-full items-center justify-between px-5 py-4 text-left text-xl font-semibold tracking-tight text-slate-900 dark:text-slate-100"
          >
            <span>Labels Reference</span>
            <ChevronDown
              className={`h-5 w-5 transition-transform duration-200 ${isLabelsOpen ? 'rotate-180' : ''}`}
            />
          </button>
          {/* Visually collapsed via max-height; aria-hidden keeps the collapsed
              content out of the accessibility tree as well. */}
          <div
            id={labelsRegionId}
            aria-hidden={!isLabelsOpen}
            className={`overflow-hidden transition-[max-height,opacity] duration-300 ease-out ${isLabelsOpen ? 'max-h-[4000px] opacity-100' : 'max-h-0 opacity-90'}`}
          >
            <div className="border-t border-gray-100 px-5 pb-5 dark:border-neutral-800">
              <div className="divide-y divide-gray-100 pt-2 dark:divide-neutral-800">
                {labels.map(label => (
                  <div key={label.name} className="py-6">
                    <LabelCard {...label} />
                  </div>
                ))}
              </div>
            </div>
          </div>
        </div>
      )}
    </div>
  );
}

View file

@ -5,7 +5,7 @@ title: 'telemetry'
# telemetry
The `telemetry` configuration controls client identification, Hive reporting, and OpenTelemetry
tracing behavior in Hive Router.
tracing and metrics behavior in Hive Router.
## client_identification
@ -232,6 +232,265 @@ telemetry:
x-api-key: key
```
</details>
</div>
</details>
## metrics
Top-level OpenTelemetry metrics configuration.
<details>
<summary>Show metrics configuration</summary>
Metrics are enabled when at least one exporter is configured and enabled.
| Field | Type | Default | Notes |
| ----------------- | -------- | ------- | ------------------------------------------------------------------------------------- |
| `exporters` | `array` | `[]` | List of exporters used to send metrics. |
| `instrumentation` | `object` | `{}` | Instrument behavior for metrics (histogram aggregation and per-instrument overrides). |
<div id="telemetry-metrics-exporters" style={{marginTop: 10}}>
<details>
<summary>`exporters`</summary>
Each item in this array configures one metrics exporter instance, so you can send metrics to
multiple destinations if needed.
This reference documents OTLP and Prometheus exporter configuration.
| Field | Type | Default | Notes |
| --------- | --------- | ------- | ------------------------------------------------------ |
| `kind` | `string` | - | Exporter kind. Supported values: `otlp`, `prometheus`. |
| `enabled` | `boolean` | `true` | Enables or disables this exporter. |
<details>
<summary>`otlp`</summary>
| Field | Type | Default | Notes |
| ------------------------------------------------------------------------------------------------------------------- | -------------------- | ------------ | ----------------------------------------------------------------- |
| `kind` | `string` | - | Must be `otlp`. |
| `enabled` | `boolean` | `true` | Enables or disables this exporter. |
| `endpoint` | `StringOrExpression` | - | OTLP endpoint. Must be set explicitly. |
| `protocol` | `string` | - | OTLP transport protocol. Supported values: `http`, `grpc`. |
| `interval` | `string` | `60s` | Interval between OTLP export attempts. |
| [`temporality`](https://opentelemetry.io/docs/specs/otel/metrics/supplementary-guidelines/#aggregation-temporality) | `string` | `cumulative` | Aggregation temporality. Supported values: `cumulative`, `delta`. |
| `max_export_timeout` | `string` | `5s` | Maximum time for one OTLP export attempt. |
| `http` | `object` | - | HTTP-specific OTLP settings (for `protocol: http`). |
| `grpc` | `object` | - | gRPC-specific OTLP settings (for `protocol: grpc`). |
OTLP over HTTP:
| Field | Type | Value / Default | Notes |
| -------------- | -------- | --------------- | ------------------------------------------------------------- |
| `protocol` | `string` | `http` | OTLP transport protocol. |
| `http.headers` | `object` | `{}` | Map of header names to values (`string` or `{ expression }`). |
```yaml filename="router.config.yaml"
telemetry:
metrics:
exporters:
- kind: otlp
enabled: true
protocol: http
endpoint: https://otel-collector.example.com/v1/metrics
interval: 60s
temporality: cumulative
max_export_timeout: 5s
http:
headers:
x-otlp-header: value
```
OTLP over gRPC:
| Field | Type | Value / Default | Notes |
| ---------------------- | -------- | --------------- | ---------------------------------------------------------------------------- |
| `protocol` | `string` | `grpc` | OTLP transport protocol. |
| `grpc.metadata` | `object` | `{}` | Map of metadata keys to values (`string` or `{ expression }`). |
| `grpc.tls.domain_name` | `string` | - | Domain name used to verify the server certificate. |
| `grpc.tls.key` | `string` | - | Path to the client private key file. |
| `grpc.tls.cert` | `string` | - | Path to the client certificate file (PEM). |
| `grpc.tls.ca` | `string` | - | Path to the CA certificate file (PEM) used to verify the server certificate. |
```yaml filename="router.config.yaml"
telemetry:
metrics:
exporters:
- kind: otlp
enabled: true
protocol: grpc
endpoint: https://otel-collector.example.com:4317
interval: 60s
temporality: cumulative
max_export_timeout: 5s
grpc:
metadata:
x-api-key: key
```
</details>
<details>
<summary>`prometheus`</summary>
| Field | Type | Default | Notes |
| --------- | --------- | ---------- | ------------------------------------------- |
| `kind` | `string` | - | Must be `prometheus`. |
| `enabled` | `boolean` | `true` | Enables/disables Prometheus metrics export. |
| `port` | `integer` | - | Optional port for metrics endpoint. |
| `path` | `string` | `/metrics` | HTTP path exposed for scraping. |
```yaml filename="router.config.yaml"
telemetry:
metrics:
exporters:
- kind: prometheus
enabled: true
port: 9090
path: /metrics
```
</details>
</details>
</div>
<div id="telemetry-metrics-instrumentation" style={{marginTop: 10}}>
<details>
<summary>`instrumentation`</summary>
Controls histogram aggregation and per-instrument overrides.
| Field | Type | Default | Notes |
| ------------------ | -------- | ----------------------- | ----------------------------------------------------------- |
| `common.histogram` | `object` | explicit aggregation | Histogram aggregation strategy for instrumented histograms. |
| `instruments` | `object` | `{}` | Map of metric name to `false`, `true`, or object override. |
<details>
<summary>`common.histogram`</summary>
Set aggregation mode with `aggregation`.
`explicit` aggregation (default):
| Field | Type | Default | Notes |
| ------------- | -------- | ------- | ----------------------------------------------------- |
| `aggregation` | `string` | - | Must be `explicit`. |
| `seconds` | `object` | - | Explicit histogram config for metrics with unit `s`. |
| `bytes` | `object` | - | Explicit histogram config for metrics with unit `By`. |
`seconds` and `bytes` fields:
| Field | Type | Default | Notes |
| ---------------- | ---------------------- | ------- | --------------------------------------------------------------- |
| `buckets` | `number[] \| string[]` | varies | Explicit bucket upper bounds. Must be non-empty and increasing. |
| `record_min_max` | `boolean` | `false` | Record min/max values for this unit bucket set. |
Default explicit buckets:
- `seconds.buckets`: `[0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10]`
- `bytes.buckets`:
`[128, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 3145728, 4194304, 5242880]`
Bucket format rules:
- `buckets` can be either all numbers or all strings.
- mixed arrays are not allowed.
- for `seconds.buckets`, string values are parsed as durations (for example `"5ms"`, `"1s"`).
- for `bytes.buckets`, string values are parsed as human-readable sizes (for example `"1KB"`,
`"5MB"`).
[`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram)
aggregation:
| Field | Type | Default | Notes |
| ------------------------------------------------------------------------------------------------------------------- | --------- | ------- | ----------------------------------------- |
| `aggregation` | `string` | - | Must be `exponential`. |
| [`max_size`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation) | `integer` | - | Max bucket count. Required. |
| [`max_scale`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation) | `integer` | - | Max scale for bucket precision. Required. |
| `record_min_max` | `boolean` | `false` | Record min/max values. |
</details>
<details>
<summary>`instruments`</summary>
`instruments` is a map keyed by metric name. Value can be:
- `false` to disable a metric
- `true` to keep defaults
- object to keep metric enabled and override attributes
Object form supports:
| Field | Type | Notes |
| ------------ | -------- | ------------------------------------------------------------------------------ |
| `attributes` | `object` | Map of attribute name to `boolean` (`false` drops attribute, `true` keeps it). |
```yaml filename="router.config.yaml"
telemetry:
metrics:
instrumentation:
common:
histogram:
aggregation: explicit
seconds:
buckets:
[
'5ms',
'10ms',
'25ms',
'50ms',
'75ms',
'100ms',
'250ms',
'500ms',
'750ms',
'1s',
'2.5s',
'5s',
'7.5s',
'10s'
]
record_min_max: false
bytes:
buckets:
[
'128B',
'512B',
'1KB',
'2KB',
'4KB',
'8KB',
'16KB',
'32KB',
'64KB',
'128KB',
'256KB',
'512KB',
'1MB',
'2MB',
'3MB',
'4MB',
'5MB'
]
record_min_max: false
instruments:
http.server.request.duration: true
http.client.request.duration:
attributes:
subgraph.name: true
http.response.status_code: true
server.address: false
```
</details>
</details>
</div>

View file

@ -0,0 +1,825 @@
---
title: 'OpenTelemetry Metrics'
---
import { Callout } from '#components/callout'
import { MetricsSection } from '#components/otel-metrics/metrics-section'
import { Tabs } from '@theguild/components'
# OpenTelemetry Metrics
Hive Router exposes OpenTelemetry metrics for gateway traffic, subgraph traffic, cache behavior,
supergraph lifecycle, and GraphQL errors.
This guide explains where to export metrics, how to configure OTLP and Prometheus, how to customize
instruments, and what each metric/label means in practice.
## Choose your metrics destination
Hive Router exposes metrics through two widely used integration patterns:
- OTLP-based observability backends
- Prometheus scrape endpoints
Most teams already running an OpenTelemetry pipeline tend to integrate via OTLP, while teams built
around Prometheus and Grafana typically stick with Prometheus scraping.
### Send metrics to OTLP-compatible backends
Hive Router can export metrics using OTLP to standard OpenTelemetry pipelines, including the
OpenTelemetry Collector and vendor backends that support OTLP ingestion over HTTP or gRPC.
After enabling the exporter, generate some traffic through the router and confirm that new metric
series appear in your backend (for example HTTP server/client latency, cache metrics, and supergraph
execution metrics).
If metrics do not appear, verify:
- Endpoint reachability (network, DNS, TLS)
- Authentication credentials or headers
- Exporter protocol matches the backend (OTLP/HTTP vs OTLP/gRPC)
<Tabs items={["OTLP over HTTP", "OTLP over gRPC"]}>
<Tabs.Tab>
```yaml filename="router.config.yaml"
telemetry:
metrics:
exporters:
- kind: otlp
enabled: true
protocol: http
endpoint: https://otel-collector.example.com/v1/metrics
interval: 30s
max_export_timeout: 5s
http:
headers:
authorization:
expression: |
"Bearer " + env("OTLP_TOKEN")
```
</Tabs.Tab>
<Tabs.Tab>
```yaml filename="router.config.yaml"
telemetry:
metrics:
exporters:
- kind: otlp
enabled: true
protocol: grpc
endpoint: https://otel-collector.example.com:4317
interval: 30s
max_export_timeout: 5s
grpc:
metadata:
x-api-key:
expression: env("OTEL_API_KEY")
tls:
domain_name: otel-collector.example.com
ca: /etc/certs/ca.pem
cert: /etc/certs/client.pem
key: /etc/certs/client.key
```
</Tabs.Tab>
</Tabs>
### Expose metrics for Prometheus scraping
If your observability stack is Prometheus-first, Hive Router can expose an HTTP endpoint that
Prometheus scrapes at its configured interval.
The `port` and `path` settings define the address where the Router exposes metrics. Prometheus must
be able to reach that address from its runtime environment (local network, Kubernetes service, or VM
network path).
<Callout type="note">
If `port` is not set, or is the same as the main HTTP server port, the Router exposes metrics
through the same HTTP server that serves the GraphQL API. If the port is different, the Router
starts a separate HTTP server dedicated solely to the Prometheus metrics endpoint.
</Callout>
In production, make sure this endpoint is reachable only by trusted scrapers (for example via
network policy, firewall rules, or private ingress). Once configured, confirm the target appears as
healthy in Prometheus and then verify expected series are present (for example
`http.server.request.duration`, `http.client.request.duration`).
```yaml filename="router.config.yaml"
telemetry:
metrics:
exporters:
- kind: prometheus
enabled: true
port: 9090
path: /metrics
```
## Production baseline
For production workloads, start with a single primary exporter, define a clear service identity, and
keep default instrumentation settings.
```yaml filename="router.config.yaml"
telemetry:
resource:
attributes:
service.name: hive-router
service.namespace: your-platform
deployment.environment:
expression: env("ENVIRONMENT")
metrics:
exporters:
- kind: otlp
enabled: true
protocol: grpc
endpoint: https://otel-collector.example.com:4317
interval: 30s
max_export_timeout: 5s
```
This is a safe baseline and works well before introducing instrumentation-level customization.
Additional exporters can be added later, but starting with one simplifies validation and
troubleshooting.
### Cardinality considerations
For production workloads, consider disabling the `graphql.operation.name` label, or even
`graphql.operation.type`, on high-volume metrics.
<Callout type="warning">
`graphql.operation.name` can create very high-cardinality metrics.
Operation names come from client requests. Without persisted operations, clients can send many
distinct operation names (or random names), which can rapidly increase cardinality and cost in
Prometheus and OTLP backends.
</Callout>
```yaml filename="router.config.yaml"
telemetry:
metrics:
instrumentation:
instruments:
http.server.request.duration:
attributes:
graphql.operation.name: false
http.server.request.body.size:
attributes:
graphql.operation.name: false
http.server.response.body.size:
attributes:
graphql.operation.name: false
```
## Customize instrumentation
You can override behavior per metric under `telemetry.metrics.instrumentation.instruments`.
<Callout type="tip">
Disable non-essential labels to control cost, since each additional label value increases active
time-series cardinality - a primary billing and performance driver in platforms like Grafana and
Datadog.
</Callout>
For production guidance on label cardinality (especially `graphql.operation.name`), see
[Cardinality considerations](#cardinality-considerations).
- `false` disables a metric.
- `true` keeps default behavior.
- object form enables metric + optional attribute overrides.
```yaml filename="router.config.yaml"
telemetry:
metrics:
instrumentation:
instruments:
# Disable HTTP server request duration metric
http.server.request.duration: false
http.client.request.duration:
attributes:
# Disable the label
subgraph.name: false
# Enable the label (labels are enabled by default)
http.response.status_code: true
```
Attribute override behavior:
- `false` - drop label from that metric
- `true` - keep label (all labels are enabled by default)
Histogram aggregation can also be customized under
`telemetry.metrics.instrumentation.common.histogram`.
- [`explicit`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#histogram) (default)
uses unit-specific bucket sets, which you can configure per unit:
- `seconds` for histogram unit `s`
- `bytes` for histogram unit `By`
- [`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram)
uses one shared exponential strategy for all histogram metrics.
- `record_min_max` controls whether min and max are reported for histogram points.
Bucket format rules:
- [`buckets`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#explicit-bucket-histogram-aggregation)
can be either all numbers or all strings.
- mixed arrays are not allowed.
- `seconds.buckets` string values are parsed as durations (for example `"5ms"`, `"1s"`).
- `bytes.buckets` string values are parsed as human-readable sizes (for example `"1KB"`, `"5MB"`).
In `explicit` mode, histogram units other than `s` and `By` fail startup.
```yaml filename="router.config.yaml"
telemetry:
metrics:
instrumentation:
common:
histogram:
aggregation: explicit
seconds:
buckets:
[
'5ms',
'10ms',
'25ms',
'50ms',
'75ms',
'100ms',
'250ms',
'500ms',
'750ms',
'1s',
'2.5s',
'5s',
'7.5s',
'10s'
]
record_min_max: false
bytes:
buckets:
[
'128B',
'512B',
'1KB',
'2KB',
'4KB',
'8KB',
'16KB',
'32KB',
'64KB',
'128KB',
'256KB',
'512KB',
'1MB',
'2MB',
'3MB',
'4MB',
'5MB'
]
record_min_max: false
```
## Metrics reference
### GraphQL
GraphQL metrics capture errors surfaced by the router across all stages of a GraphQL request
lifecycle.
<MetricsSection
metrics={[
{
name: 'hive.router.graphql.errors_total',
type: 'Counter',
unit: '{error}',
description:
'Total count of GraphQL errors encountered during query processing and execution, categorized by error code.',
labels: ['code']
}
]}
labels={[
{
name: 'code',
meaning: 'GraphQL error code',
typicalValues: [
'GRAPHQL_PARSE_FAILED',
'GRAPHQL_VALIDATION_FAILED',
'PLAN_EXECUTION_FAILED',
'UNKNOWN',
'...'
],
notes: `Uses "extensions.code" values and router's error codes. "UNKNOWN" is used when no code is available.`
}
]}
/>
### Supergraph
Supergraph metrics cover polling and processing lifecycle of schema updates.
<MetricsSection
metrics={[
{
name: 'hive.router.supergraph.poll.total',
type: 'Counter',
description: 'Total number of supergraph polling attempts, categorized by poll result.',
labels: ['result']
},
{
name: 'hive.router.supergraph.poll.duration',
type: 'Histogram',
unit: 'Seconds',
description: 'Duration of supergraph polling attempts, categorized by poll result.',
labels: ['result']
},
{
name: 'hive.router.supergraph.process.duration',
type: 'Histogram',
unit: 'Seconds',
description: 'Time spent processing supergraph updates, categorized by status.',
labels: ['status']
}
]}
labels={[
{
name: 'result',
meaning: 'Result of the poll',
typicalValues: ['updated', 'not_modified', 'error'],
notes: 'Used by "hive.router.supergraph.poll.*" metrics only'
},
{
name: 'status',
meaning: 'Supergraph processing status',
typicalValues: ['ok', 'error'],
notes: 'Used by "hive.router.supergraph.process.*" metrics only'
}
]}
/>
### HTTP server
HTTP server metrics capture inbound client traffic processed by the router.
<MetricsSection
metrics={[
{
name: 'http.server.request.duration',
type: 'Histogram',
unit: 'Seconds',
description: 'Duration of inbound HTTP requests handled by the router.',
labels: [
'http.request.method',
'http.response.status_code',
'http.route',
'network.protocol.name',
'network.protocol.version',
'url.scheme',
'error.type',
'graphql.operation.name',
'graphql.operation.type',
'graphql.response.status'
]
},
{
name: 'http.server.request.body.size',
type: 'Histogram',
unit: 'Bytes',
description: 'Size of inbound HTTP request bodies handled by the router.',
labels: [
'http.request.method',
'http.response.status_code',
'http.route',
'network.protocol.name',
'network.protocol.version',
'url.scheme',
'error.type',
'graphql.operation.name',
'graphql.operation.type',
'graphql.response.status'
]
},
{
name: 'http.server.response.body.size',
type: 'Histogram',
unit: 'Bytes',
description: 'Size of outbound HTTP response bodies returned by the router.',
labels: [
'http.request.method',
'http.response.status_code',
'http.route',
'network.protocol.name',
'network.protocol.version',
'url.scheme',
'error.type',
'graphql.operation.name',
'graphql.operation.type',
'graphql.response.status'
]
},
{
name: 'http.server.active_requests',
type: 'UpDownCounter',
unit: '{request}',
description: 'Current number of in-flight inbound HTTP requests.',
labels: ['http.request.method', 'network.protocol.name', 'url.scheme']
}
]}
labels={[
{
name: 'http.request.method',
meaning: 'HTTP method',
typicalValues: [
'GET',
'POST',
'PUT',
'PATCH',
'DELETE',
'HEAD',
'OPTIONS',
'CONNECT',
'TRACE',
'QUERY',
'_OTHER'
],
notes: '_OTHER is fallback for unknown methods'
},
{
name: 'http.response.status_code',
meaning: 'Response status code',
typicalValues: ['200', '400', '500', '...']
},
{
name: 'http.route',
meaning: 'Normalized router path',
typicalValues: ['/graphql']
},
{
name: 'network.protocol.name',
meaning: 'Protocol name',
typicalValues: ['http']
},
{
name: 'network.protocol.version',
meaning: 'Protocol version',
typicalValues: ['0.9', '1.0', '1.1', '2', '3']
},
{
name: 'url.scheme',
meaning: 'URL scheme',
typicalValues: ['http', 'https']
},
{
name: 'error.type',
meaning: 'Error classification for failed requests',
typicalValues: ['status code >= 400'],
notes: 'Only set for failed requests'
},
{
name: 'graphql.operation.name',
meaning: 'GraphQL operation name associated with the HTTP request',
typicalValues: ['UsersQuery', 'IntrospectionQuery', 'UNKNOWN'],
notes:
'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. High-cardinality risk: value is client-controlled and can explode without persisted operations.'
},
{
name: 'graphql.operation.type',
meaning: 'GraphQL operation type',
typicalValues: ['query', 'mutation', 'subscription'],
notes:
'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. Omitted when unknown'
},
{
name: 'graphql.response.status',
meaning: 'GraphQL response status for the request',
typicalValues: ['ok', 'error'],
notes:
'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. "error" indicates the GraphQL response contains at least one error'
}
]}
/>
### HTTP client
HTTP client metrics capture outbound requests the router makes to subgraphs.
<MetricsSection
metrics={[
{
name: 'http.client.request.duration',
type: 'Histogram',
unit: 'Seconds',
description: 'Duration of outbound HTTP requests sent from router to subgraphs.',
labels: [
'http.request.method',
'server.address',
'server.port',
'network.protocol.name',
'network.protocol.version',
'url.scheme',
'subgraph.name',
'http.response.status_code',
'error.type',
'graphql.response.status'
]
},
{
name: 'http.client.request.body.size',
type: 'Histogram',
unit: 'Bytes',
description: 'Size of outbound HTTP request bodies sent to subgraphs.',
labels: [
'http.request.method',
'server.address',
'server.port',
'network.protocol.name',
'network.protocol.version',
'url.scheme',
'subgraph.name',
'http.response.status_code',
'error.type',
'graphql.response.status'
]
},
{
name: 'http.client.response.body.size',
type: 'Histogram',
unit: 'Bytes',
description: 'Size of HTTP response bodies returned by subgraphs.',
labels: [
'http.request.method',
'server.address',
'server.port',
'network.protocol.name',
'network.protocol.version',
'url.scheme',
'subgraph.name',
'http.response.status_code',
'error.type',
'graphql.response.status'
]
},
{
name: 'http.client.active_requests',
type: 'UpDownCounter',
unit: '{request}',
description: 'Current number of in-flight outbound HTTP requests to subgraphs.',
labels: [
'http.request.method',
'server.address',
'server.port',
'url.scheme',
'subgraph.name'
]
}
]}
labels={[
{
name: 'http.request.method',
meaning: 'HTTP method',
typicalValues: [
'GET',
'POST',
'PUT',
'PATCH',
'DELETE',
'HEAD',
'OPTIONS',
'CONNECT',
'TRACE',
'QUERY',
'_OTHER'
],
notes: '_OTHER is fallback for unknown methods'
},
{
name: 'http.response.status_code',
meaning: 'Response status code',
typicalValues: ['200', '400', '500', '...']
},
{
name: 'network.protocol.name',
meaning: 'Protocol name',
typicalValues: ['http']
},
{
name: 'network.protocol.version',
meaning: 'Protocol version',
typicalValues: ['0.9', '1.0', '1.1', '2', '3']
},
{
name: 'url.scheme',
meaning: 'URL scheme',
typicalValues: ['http', 'https']
},
{
name: 'server.address',
meaning: 'Subgraph host',
typicalValues: ['URI host', 'unknown'],
notes: 'URI host, or unknown fallback'
},
{
name: 'server.port',
meaning: 'Subgraph port',
typicalValues: ['80', '443'],
notes: 'Explicit URI port, or fallback 80/443'
},
{
name: 'subgraph.name',
meaning: 'Subgraph identifier',
typicalValues: ['accounts'],
notes: 'Configured names (for example "accounts")'
},
{
name: 'error.type',
meaning: 'Error classification',
typicalValues: ['400', 'SUBGRAPH_REQUEST_FAILURE', '...'],
notes: 'Numeric status code >= 400 or execution error code string'
},
{
name: 'graphql.response.status',
meaning: 'GraphQL response status for the subgraph request',
typicalValues: ['ok', 'error'],
notes:
'Set to "ok" when the parsed subgraph response has no GraphQL errors. Set to "error" when the subgraph response includes GraphQL errors or when transport/deserialization fails.'
}
]}
/>
### Cache
Cache metrics track lookup behavior and cache size across router caches used during request
preparation and planning stages.
#### Parsing cache
Parsing cache metrics measure query parse cache hit/miss behavior and cache size.
<MetricsSection
metrics={[
{
name: 'hive.router.parse_cache.requests_total',
type: 'Counter',
description: 'Total number of parsing cache lookups, categorized by result.',
labels: ['result']
},
{
name: 'hive.router.parse_cache.duration',
type: 'Histogram',
unit: 'Seconds',
description: 'Duration of parsing cache lookups, categorized by result.',
labels: ['result']
},
{
name: 'hive.router.parse_cache.size',
type: 'Gauge',
description: 'Current number of entries stored in the parsing cache.'
}
]}
/>
#### Validation cache
Validation cache metrics measure query validation cache hit/miss behavior and cache size.
<MetricsSection
metrics={[
{
name: 'hive.router.validate_cache.requests_total',
type: 'Counter',
description: 'Total number of validation cache lookups, categorized by result.',
labels: ['result']
},
{
name: 'hive.router.validate_cache.duration',
type: 'Histogram',
unit: 'Seconds',
description: 'Duration of validation cache lookups, categorized by result.',
labels: ['result']
},
{
name: 'hive.router.validate_cache.size',
type: 'Gauge',
description: 'Current number of entries stored in the validation cache.'
}
]}
/>
#### Normalization cache
Normalization cache metrics measure query normalization cache hit/miss behavior and cache size.
<MetricsSection
metrics={[
{
name: 'hive.router.normalize_cache.requests_total',
type: 'Counter',
description: 'Total number of normalization cache lookups, categorized by result.',
labels: ['result']
},
{
name: 'hive.router.normalize_cache.duration',
type: 'Histogram',
unit: 'Seconds',
description: 'Duration of normalization cache lookups, categorized by result.',
labels: ['result']
},
{
name: 'hive.router.normalize_cache.size',
type: 'Gauge',
description: 'Current number of entries stored in the normalization cache.'
}
]}
/>
#### Planning cache
Planning cache metrics measure query planning cache hit/miss behavior and cache size.
<MetricsSection
metrics={[
{
name: 'hive.router.plan_cache.requests_total',
type: 'Counter',
description: 'Total number of planning cache lookups, categorized by result.',
labels: ['result']
},
{
name: 'hive.router.plan_cache.duration',
type: 'Histogram',
unit: 'Seconds',
description: 'Duration of planning cache lookups, categorized by result.',
labels: ['result']
},
{
name: 'hive.router.plan_cache.size',
type: 'Gauge',
description: 'Current number of entries stored in the planning cache.'
}
]}
/>
#### Labels
The `result` label is shared by the `requests_total` counters and `duration` histograms of every cache listed above.
<div className="mt-4">
<MetricsSection
labels={[
{
name: 'result',
meaning: 'Cache lookup outcome',
typicalValues: ['hit', 'miss'],
notes: 'Used by cache `requests_total` and `duration` metrics'
}
]}
/>
</div>
## What to monitor in production
The examples below show which signals to monitor in production and how to break them down so you can
quickly isolate API, subgraph, cache, and GraphQL issues.
### Monitor end-to-end latency of your GraphQL API
Use [`http.server.request.duration`](#metric-http-server-request-duration) as your primary latency
signal.
In production, break this metric down by `http.route`, `http.request.method`,
`http.response.status_code`, and/or `graphql.response.status`, then track p95 and p99 latency per
route and method. Keep successful and failed responses separated so error-path latency does not get
hidden by healthy traffic.
### Monitor health of your subgraphs
Use [`http.client.request.duration`](#metric-http-client-request-duration) and
[`http.client.active_requests`](#metric-http-client-active-requests) to monitor dependency health
across your federated graph.
Break these metrics down by `subgraph.name`, `http.response.status_code`, and `error.type` to
identify which subgraph is driving tail latency or error spikes.
### Monitor cache effectiveness and planning pressure
Use the cache metrics to evaluate cache hit ratio, miss cost, and pressure over time.
For request and duration metrics, split by `result` (`hit` and `miss`) so you can track hit ratio
and miss latency per cache kind.
### Monitor GraphQL errors over time
Use [`hive.router.graphql.errors_total`](#metric-hive-router-graphql-errors-total) and break it down
by `code` to track both volume and error distribution.
In production, monitor how the error-code distribution changes over time, not only the total count,
so you can separate validation issues from execution failures.
## Configuration reference
For full options and defaults, see the
[telemetry configuration reference](/docs/router/configuration/telemetry).