mirror of
https://github.com/graphql-hive/console
synced 2026-04-21 14:37:17 +00:00
docs(router): metrics (#7696)
This commit is contained in:
parent
14c73e5751
commit
a3acbd892f
5 changed files with 1386 additions and 1 deletions
58
packages/web/docs/src/components/otel-metrics/label-card.tsx
Normal file
58
packages/web/docs/src/components/otel-metrics/label-card.tsx
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
import { Info, Lightbulb, Tag } from 'lucide-react';
|
||||
|
||||
/** Props accepted by `LabelCard`. */
interface LabelCardProps {
  /** Label key, rendered verbatim in a monospace `<code>` element. */
  name: string;
  /** Short explanation of what the label represents. */
  meaning: string;
  /** Example values, each rendered as a chip under the "Typical Values" heading. */
  typicalValues: string[];
  /** Optional remark rendered next to a lightbulb icon below the values. */
  notes?: string;
}
|
||||
|
||||
/**
 * Reference entry for a single metric label.
 *
 * Shows the label name and its meaning, followed by a chip list of typical
 * values and, when provided, a free-form note.
 *
 * @param name - Label key shown in monospace.
 * @param meaning - One-line description rendered under the name.
 * @param typicalValues - Example values; the whole "Typical Values" section is
 *   omitted when this list is empty so no orphan heading is rendered.
 * @param notes - Optional remark; omitted from the output when falsy.
 */
export function LabelCard({ name, meaning, typicalValues, notes }: LabelCardProps) {
  return (
    <div>
      <div className="mb-3 flex items-start gap-3">
        <div className="shrink-0 rounded-md border border-gray-200 bg-gray-100 p-1.5 dark:border-neutral-700 dark:bg-neutral-800">
          {/* Decorative icon: the label name next to it carries the meaning. */}
          <Tag aria-hidden="true" className="h-4 w-4 text-gray-600 dark:text-slate-100" />
        </div>
        <div className="min-w-0 flex-1">
          <code className="break-all text-sm font-semibold text-gray-900 dark:text-slate-100">
            {name}
          </code>
          <p className="mt-1 text-sm leading-relaxed text-gray-600 dark:text-slate-100">
            {meaning}
          </p>
        </div>
      </div>

      <div className="mt-4 space-y-3">
        {typicalValues.length > 0 && (
          <div>
            <div className="mb-2 flex items-center gap-1.5">
              <Info aria-hidden="true" className="h-3.5 w-3.5 text-gray-500 dark:text-slate-400" />
              <span className="text-xs font-semibold uppercase text-gray-700 dark:text-slate-100">
                Typical Values
              </span>
            </div>
            <div className="flex flex-wrap gap-1.5">
              {typicalValues.map((value, index) => (
                <code
                  // The list is static, so the index is stable; including it keeps
                  // keys unique even if the same value (e.g. '...') appears twice.
                  key={`${index}-${value}`}
                  className="rounded-md border border-slate-200 bg-slate-50 px-2.5 py-1 text-xs font-medium text-slate-700 dark:border-neutral-700 dark:bg-neutral-800 dark:text-slate-200"
                >
                  {value}
                </code>
              ))}
            </div>
          </div>
        )}

        {notes && (
          <div className="pt-1">
            <div className="flex items-start gap-2">
              <Lightbulb
                aria-hidden="true"
                className="mt-0.5 h-3.5 w-3.5 shrink-0 text-amber-600 dark:text-amber-400"
              />
              <p className="text-sm leading-relaxed text-gray-600 dark:text-slate-100">{notes}</p>
            </div>
          </div>
        )}
      </div>
    </div>
  );
}
|
||||
163
packages/web/docs/src/components/otel-metrics/metric-card.tsx
Normal file
163
packages/web/docs/src/components/otel-metrics/metric-card.tsx
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
import { useEffect, useRef, useState } from 'react';
|
||||
import { Activity, BarChart3, Gauge, TrendingUp } from 'lucide-react';
|
||||
|
||||
/** Props accepted by `MetricCard`. */
interface MetricCardProps {
  /** Metric name; displayed verbatim and used to derive the card's anchor id. */
  name: string;
  /** Instrument kind; selects the icon and color of the type badge. */
  type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge';
  /** Unit shown in the "Unit:" pill; the pill is omitted when not provided. */
  unit?: string;
  /** Descriptive paragraph; omitted when not provided. */
  description?: string;
  /** Label keys attached to the metric, rendered as chips; section omitted when empty. */
  labels?: string[];
}
|
||||
|
||||
/**
 * Presentation of the type badge for each instrument kind.
 *
 * Typed as an exhaustive record over `MetricCardProps['type']` so that adding a
 * new instrument kind to the props without a matching entry here is a compile
 * error instead of an `undefined` lookup at render time.
 *
 * - `icon`: lucide-react icon component rendered inside the badge.
 * - `color`: Tailwind classes for background, text and border (light + dark).
 */
const typeConfig: Readonly<
  Record<MetricCardProps['type'], { icon: typeof TrendingUp; color: string }>
> = {
  Counter: {
    icon: TrendingUp,
    color:
      'bg-emerald-50 text-emerald-700 border-emerald-200 dark:bg-emerald-900/30 dark:text-emerald-300 dark:border-emerald-700/50',
  },
  Histogram: {
    icon: BarChart3,
    color:
      'bg-blue-50 text-blue-700 border-blue-200 dark:bg-blue-900/30 dark:text-blue-300 dark:border-blue-700/50',
  },
  UpDownCounter: {
    icon: Activity,
    color:
      'bg-amber-50 text-amber-700 border-amber-200 dark:bg-amber-900/30 dark:text-amber-300 dark:border-amber-700/50',
  },
  Gauge: {
    icon: Gauge,
    color:
      'bg-slate-50 text-slate-700 border-slate-200 dark:bg-slate-800/60 dark:text-slate-100 dark:border-slate-700',
  },
};
|
||||
|
||||
/** How long the "copied" confirmation stays visible, in milliseconds. */
const COPIED_FEEDBACK_MS = 1200;

/**
 * Card describing a single metric: name, type badge, optional unit,
 * description and label chips.
 *
 * The card root carries an id derived from the metric name (lower-cased, runs
 * of non-alphanumerics collapsed to `-`, prefixed with `metric-`) so it can be
 * deep-linked. A button next to the name copies that deep link to the
 * clipboard and briefly shows a "copied" confirmation.
 *
 * @param name - Metric name; shown verbatim and used to build the anchor id.
 * @param type - Instrument kind; picks the badge icon/colors from `typeConfig`.
 * @param unit - Optional unit shown in a pill next to the badge.
 * @param description - Optional descriptive paragraph.
 * @param labels - Optional label keys rendered as chips.
 */
export function MetricCard({ name, type, unit, description, labels }: MetricCardProps) {
  const config = typeConfig[type];
  const Icon = config.icon;
  const [isCopied, setIsCopied] = useState(false);
  const copiedTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // Tracks whether the component is still mounted, because the clipboard
  // promise can settle after the card has been removed from the page.
  const isMountedRef = useRef(false);
  const metricId = `metric-${name
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/(^-|-$)/g, '')}`;

  useEffect(() => {
    isMountedRef.current = true;

    return () => {
      isMountedRef.current = false;

      if (copiedTimeoutRef.current) {
        clearTimeout(copiedTimeoutRef.current);
        copiedTimeoutRef.current = null;
      }
    };
  }, []);

  /** Shows the "copied" state and schedules its reset, restarting any pending reset. */
  function showCopiedState() {
    // Do not arm a timer that the unmount cleanup can no longer clear.
    if (!isMountedRef.current) {
      return;
    }

    setIsCopied(true);

    if (copiedTimeoutRef.current) {
      clearTimeout(copiedTimeoutRef.current);
    }

    copiedTimeoutRef.current = setTimeout(() => {
      copiedTimeoutRef.current = null;
      setIsCopied(false);
    }, COPIED_FEEDBACK_MS);
  }

  /**
   * Copies the absolute URL of this card (current origin + path + query + `#id`)
   * to the clipboard. If the clipboard write throws or rejects, falls back to
   * navigating to the anchor so the link is at least available in the address bar.
   */
  async function copyMetricLink() {
    if (typeof window === 'undefined') {
      return;
    }

    const metricUrl = `${window.location.origin}${window.location.pathname}${window.location.search}#${metricId}`;

    try {
      await navigator.clipboard.writeText(metricUrl);
      showCopiedState();
    } catch {
      window.location.hash = metricId;
    }
  }

  return (
    <div
      id={metricId}
      className="group scroll-mt-20 overflow-hidden rounded-lg border border-gray-200 bg-white transition-shadow duration-200 hover:shadow-md dark:border-neutral-800 dark:bg-neutral-900 dark:hover:shadow-black/30"
    >
      <div className="p-5">
        <div className="mb-3 flex items-center justify-between gap-4">
          <div className="min-w-0 flex-1">
            <div className="flex items-center gap-1.5">
              <code className="break-all text-sm font-semibold text-gray-900 dark:text-slate-100">
                {name}
              </code>
              {/* Hidden until the card is hovered or focused; stays visible while "copied" is shown. */}
              <button
                type="button"
                onClick={() => {
                  void copyMetricLink();
                }}
                className={`hive-focus inline-flex items-center justify-center rounded font-mono text-sm font-semibold leading-none transition-all duration-200 ${isCopied ? 'translate-y-0 text-gray-500 opacity-100 dark:text-slate-500' : 'translate-y-0 text-gray-500 opacity-0 hover:text-gray-700 focus:text-gray-700 group-focus-within:opacity-100 group-hover:opacity-100 dark:text-slate-500 dark:hover:text-slate-200 dark:focus:text-slate-200'}`}
                aria-label={`Copy link to ${name}`}
                title="Copy metric link"
              >
                {isCopied ? (
                  <>
                    <span>✓</span>
                    <span className="ml-1 text-xs">copied</span>
                  </>
                ) : (
                  '#'
                )}
              </button>
              {/* Announces the copy confirmation to screen readers. */}
              <span className="sr-only" aria-live="polite">
                {isCopied ? `Copied link to ${name}` : ''}
              </span>
            </div>
          </div>
          <div className="flex shrink-0 items-center gap-2">
            {unit && (
              <div className="flex items-center gap-1.5 rounded-md border border-gray-200 bg-gray-100 px-2.5 py-1 text-xs text-gray-700 dark:border-neutral-700 dark:bg-neutral-800 dark:text-slate-200">
                <span className="font-medium text-gray-500 dark:text-slate-300">Unit:</span>
                <code>{unit}</code>
              </div>
            )}
            <div
              className={`flex items-center gap-1.5 rounded-md border px-2.5 py-1 ${config.color}`}
            >
              <Icon className="h-3.5 w-3.5" />
              <span className="text-xs font-medium">{type}</span>
            </div>
          </div>
        </div>

        {description && (
          <p className="mb-4 text-sm leading-relaxed text-gray-600 dark:text-slate-100">
            {description}
          </p>
        )}

        {labels && labels.length > 0 && (
          <div className="mt-4 border-t border-gray-100 pt-4 dark:border-neutral-800">
            <div className="mb-2 flex items-center gap-2">
              <span className="text-xs font-semibold uppercase text-gray-700 dark:text-slate-100">
                Labels
              </span>
            </div>
            <div className="flex flex-wrap gap-1.5">
              {labels.map(label => (
                <code
                  key={label}
                  className="rounded border border-gray-200 bg-gray-50 px-2 py-1 text-xs text-gray-700 transition-colors hover:border-gray-300 dark:border-neutral-700 dark:bg-neutral-800 dark:text-slate-200 dark:hover:border-neutral-600"
                >
                  {label}
                </code>
              ))}
            </div>
          </div>
        )}
      </div>
    </div>
  );
}
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
'use client';
|
||||
|
||||
import { useId, useState } from 'react';
|
||||
import { ChevronDown } from 'lucide-react';
|
||||
import { LabelCard } from './label-card';
|
||||
import { MetricCard } from './metric-card';
|
||||
|
||||
/** One metric entry; each entry is spread into a `MetricCard`. */
interface Metric {
  /** Metric name; also used as the React key of its card. */
  name: string;
  /** Instrument kind shown in the card's type badge. */
  type: 'Counter' | 'Histogram' | 'UpDownCounter' | 'Gauge';
  /** Optional unit shown on the card. */
  unit?: string;
  /** Optional descriptive text shown on the card. */
  description?: string;
  /** Optional label keys listed on the card. */
  labels?: string[];
}

/** One label entry; each entry is spread into a `LabelCard`. */
interface Label {
  /** Label key; also used as the React key of its row. */
  name: string;
  /** Short explanation of what the label represents. */
  meaning: string;
  /** Example values shown for the label. */
  typicalValues: string[];
  /** Optional extra remark. */
  notes?: string;
}

/** Props accepted by `MetricsSection`. */
interface MetricsSectionProps {
  /** Accepted but not read: `MetricsSection` destructures only `metrics` and `labels`. */
  title?: string;
  /** Accepted but not read: `MetricsSection` destructures only `metrics` and `labels`. */
  description?: string;
  /** Metrics to list; the "Metrics" block is omitted when missing or empty. */
  metrics?: Metric[];
  /** Labels for the collapsible "Labels Reference"; omitted when missing or empty. */
  labels?: Label[];
}
|
||||
/**
 * Renders a metrics reference section: a list of metric cards followed by a
 * collapsible "Labels Reference" panel.
 *
 * Either part is omitted when its array is missing or empty. The labels panel
 * starts collapsed; its toggle button is wired to the panel via
 * `aria-expanded` / `aria-controls`.
 *
 * @param metrics - Metrics to render, one `MetricCard` each.
 * @param labels - Labels to render inside the collapsible panel, one `LabelCard` each.
 */
export function MetricsSection({ metrics, labels }: MetricsSectionProps) {
  const [isLabelsOpen, setIsLabelsOpen] = useState(false);
  const labelsRegionId = useId();

  return (
    <div className="space-y-6">
      {metrics && metrics.length > 0 && (
        <div className="space-y-4">
          <h4 className="mt-8 text-xl font-semibold tracking-tight text-slate-900 dark:text-slate-100">
            Metrics
          </h4>
          <div className="grid gap-4">
            {metrics.map(metric => (
              <MetricCard key={metric.name} {...metric} />
            ))}
          </div>
        </div>
      )}

      {labels && labels.length > 0 && (
        <div className="overflow-hidden rounded-lg border border-gray-200 bg-white dark:border-neutral-800 dark:bg-neutral-900">
          <button
            type="button"
            onClick={() => setIsLabelsOpen(current => !current)}
            aria-expanded={isLabelsOpen}
            aria-controls={labelsRegionId}
            className="hive-focus flex w-full items-center justify-between px-5 py-4 text-left text-xl font-semibold tracking-tight text-slate-900 dark:text-slate-100"
          >
            <span>Labels Reference</span>
            <ChevronDown
              className={`h-5 w-5 transition-transform duration-200 ${isLabelsOpen ? 'rotate-180' : ''}`}
            />
          </button>
          {/*
            The panel is collapsed by clipping (max-height) so it can animate,
            which leaves its content in the accessibility tree. aria-hidden
            keeps assistive technology in sync with the button's aria-expanded.
          */}
          <div
            id={labelsRegionId}
            aria-hidden={!isLabelsOpen}
            className={`overflow-hidden transition-[max-height,opacity] duration-300 ease-out ${isLabelsOpen ? 'max-h-[4000px] opacity-100' : 'max-h-0 opacity-90'}`}
          >
            <div className="border-t border-gray-100 px-5 pb-5 dark:border-neutral-800">
              <div className="divide-y divide-gray-100 pt-2 dark:divide-neutral-800">
                {labels.map(label => (
                  <div key={label.name} className="py-6">
                    <LabelCard {...label} />
                  </div>
                ))}
              </div>
            </div>
          </div>
        </div>
      )}
    </div>
  );
}
|
||||
|
|
@ -5,7 +5,7 @@ title: 'telemetry'
|
|||
# telemetry
|
||||
|
||||
The `telemetry` configuration controls client identification, Hive reporting, and OpenTelemetry
|
||||
tracing behavior in Hive Router.
|
||||
tracing and metrics behavior in Hive Router.
|
||||
|
||||
## client_identification
|
||||
|
||||
|
|
@ -232,6 +232,265 @@ telemetry:
|
|||
x-api-key: key
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
</div>
|
||||
|
||||
</details>
|
||||
|
||||
## metrics
|
||||
|
||||
Top-level OpenTelemetry metrics configuration.
|
||||
|
||||
<details>
|
||||
<summary>Show metrics configuration</summary>
|
||||
|
||||
Metrics are enabled when at least one exporter is configured and enabled.
|
||||
|
||||
| Field | Type | Default | Notes |
|
||||
| ----------------- | -------- | ------- | ------------------------------------------------------------------------------------- |
|
||||
| `exporters` | `array` | `[]` | List of exporters used to send metrics. |
|
||||
| `instrumentation` | `object` | `{}` | Instrument behavior for metrics (histogram aggregation and per-instrument overrides). |
|
||||
|
||||
<div id="telemetry-metrics-exporters" style={{marginTop: 10}}>
|
||||
<details>
|
||||
<summary>`exporters`</summary>
|
||||
|
||||
Each item configures one metrics exporter.
|
||||
|
||||
Each item in this array defines one exporter instance, so you can configure multiple metrics
|
||||
destinations if needed.
|
||||
|
||||
This reference documents OTLP and Prometheus exporter configuration.
|
||||
|
||||
| Field | Type | Default | Notes |
|
||||
| --------- | --------- | ------- | ------------------------------------------------------ |
|
||||
| `kind` | `string` | - | Exporter kind. Supported values: `otlp`, `prometheus`. |
|
||||
| `enabled` | `boolean` | `true` | Enables or disables this exporter. |
|
||||
|
||||
<details>
|
||||
<summary>`otlp`</summary>
|
||||
|
||||
| Field | Type | Default | Notes |
|
||||
| ------------------------------------------------------------------------------------------------------------------- | -------------------- | ------------ | ----------------------------------------------------------------- |
|
||||
| `kind` | `string` | - | Must be `otlp`. |
|
||||
| `enabled` | `boolean` | `true` | Enables or disables this exporter. |
|
||||
| `endpoint` | `StringOrExpression` | - | OTLP endpoint. Must be set explicitly. |
|
||||
| `protocol` | `string` | - | OTLP transport protocol. Supported values: `http`, `grpc`. |
|
||||
| `interval` | `string` | `60s` | Interval between OTLP export attempts. |
|
||||
| [`temporality`](https://opentelemetry.io/docs/specs/otel/metrics/supplementary-guidelines/#aggregation-temporality) | `string` | `cumulative` | Aggregation temporality. Supported values: `cumulative`, `delta`. |
|
||||
| `max_export_timeout` | `string` | `5s` | Maximum time for one OTLP export attempt. |
|
||||
| `http` | `object` | - | HTTP-specific OTLP settings (for `protocol: http`). |
|
||||
| `grpc` | `object` | - | gRPC-specific OTLP settings (for `protocol: grpc`). |
|
||||
|
||||
OTLP over HTTP:
|
||||
|
||||
| Field | Type | Value / Default | Notes |
|
||||
| -------------- | -------- | --------------- | ------------------------------------------------------------- |
|
||||
| `protocol` | `string` | `http` | OTLP transport protocol. |
|
||||
| `http.headers` | `object` | `{}` | Map of header names to values (`string` or `{ expression }`). |
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
exporters:
|
||||
- kind: otlp
|
||||
enabled: true
|
||||
protocol: http
|
||||
endpoint: https://otel-collector.example.com/v1/metrics
|
||||
interval: 60s
|
||||
temporality: cumulative
|
||||
max_export_timeout: 5s
|
||||
http:
|
||||
headers:
|
||||
x-otlp-header: value
|
||||
```
|
||||
|
||||
OTLP over gRPC:
|
||||
|
||||
| Field | Type | Value / Default | Notes |
|
||||
| ---------------------- | -------- | --------------- | ---------------------------------------------------------------------------- |
|
||||
| `protocol` | `string` | `grpc` | OTLP transport protocol. |
|
||||
| `grpc.metadata` | `object` | `{}` | Map of metadata keys to values (`string` or `{ expression }`). |
|
||||
| `grpc.tls.domain_name` | `string` | - | Domain name used to verify the server certificate. |
|
||||
| `grpc.tls.key` | `string` | - | Path to the client private key file. |
|
||||
| `grpc.tls.cert` | `string` | - | Path to the client certificate file (PEM). |
|
||||
| `grpc.tls.ca` | `string` | - | Path to the CA certificate file (PEM) used to verify the server certificate. |
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
exporters:
|
||||
- kind: otlp
|
||||
enabled: true
|
||||
protocol: grpc
|
||||
endpoint: https://otel-collector.example.com:4317
|
||||
interval: 60s
|
||||
temporality: cumulative
|
||||
max_export_timeout: 5s
|
||||
grpc:
|
||||
metadata:
|
||||
x-api-key: key
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>`prometheus`</summary>
|
||||
|
||||
| Field | Type | Default | Notes |
|
||||
| --------- | --------- | ---------- | ------------------------------------------- |
|
||||
| `kind` | `string` | - | Must be `prometheus`. |
|
||||
| `enabled` | `boolean` | `true` | Enables/disables Prometheus metrics export. |
|
||||
| `port` | `integer` | - | Optional port for metrics endpoint. |
|
||||
| `path` | `string` | `/metrics` | HTTP path exposed for scraping. |
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
exporters:
|
||||
- kind: prometheus
|
||||
enabled: true
|
||||
port: 9090
|
||||
path: /metrics
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
</details>
|
||||
</div>
|
||||
|
||||
<div id="telemetry-metrics-instrumentation" style={{marginTop: 10}}>
|
||||
<details>
|
||||
<summary>`instrumentation`</summary>
|
||||
|
||||
Controls histogram aggregation and per-instrument overrides.
|
||||
|
||||
| Field | Type | Default | Notes |
|
||||
| ------------------ | -------- | ----------------------- | ----------------------------------------------------------- |
|
||||
| `common.histogram` | `object` | explicit aggregation | Histogram aggregation strategy for instrumented histograms. |
|
||||
| `instruments` | `object` | `{}` | Map of metric name to `false`, `true`, or object override. |
|
||||
|
||||
<details>
|
||||
<summary>`common.histogram`</summary>
|
||||
|
||||
Set aggregation mode with `aggregation`.
|
||||
|
||||
`explicit` aggregation (default):
|
||||
|
||||
| Field | Type | Default | Notes |
|
||||
| ------------- | -------- | ------- | ----------------------------------------------------- |
|
||||
| `aggregation` | `string` | - | Must be `explicit`. |
|
||||
| `seconds` | `object` | - | Explicit histogram config for metrics with unit `s`. |
|
||||
| `bytes` | `object` | - | Explicit histogram config for metrics with unit `By`. |
|
||||
|
||||
`seconds` and `bytes` fields:
|
||||
|
||||
| Field | Type | Default | Notes |
|
||||
| ---------------- | ---------------------- | ------- | --------------------------------------------------------------- |
|
||||
| `buckets` | `number[] \| string[]` | varies | Explicit bucket upper bounds. Must be non-empty and increasing. |
|
||||
| `record_min_max` | `boolean` | `false` | Record min/max values for this unit bucket set. |
|
||||
|
||||
Default explicit buckets:
|
||||
|
||||
- `seconds.buckets`: `[0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10]`
|
||||
- `bytes.buckets`:
|
||||
`[128, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 3145728, 4194304, 5242880]`
|
||||
|
||||
Bucket format rules:
|
||||
|
||||
- `buckets` can be either all numbers or all strings.
|
||||
- mixed arrays are not allowed.
|
||||
- for `seconds.buckets`, string values are parsed as durations (for example `"5ms"`, `"1s"`).
|
||||
- for `bytes.buckets`, string values are parsed as human-readable sizes (for example `"1KB"`,
|
||||
`"5MB"`).
|
||||
|
||||
[`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram)
|
||||
aggregation:
|
||||
|
||||
| Field | Type | Default | Notes |
|
||||
| ------------------------------------------------------------------------------------------------------------------- | --------- | ------- | ----------------------------------------- |
|
||||
| `aggregation` | `string` | - | Must be `exponential`. |
|
||||
| [`max_size`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation) | `integer` | - | Max bucket count. Required. |
|
||||
| [`max_scale`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation) | `integer` | - | Max scale for bucket precision. Required. |
|
||||
| `record_min_max` | `boolean` | `false` | Record min/max values. |
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>`instruments`</summary>
|
||||
|
||||
`instruments` is a map keyed by metric name. Value can be:
|
||||
|
||||
- `false` to disable a metric
|
||||
- `true` to keep defaults
|
||||
- object to keep metric enabled and override attributes
|
||||
|
||||
Object form supports:
|
||||
|
||||
| Field | Type | Notes |
|
||||
| ------------ | -------- | ------------------------------------------------------------------------------ |
|
||||
| `attributes` | `object` | Map of attribute name to `boolean` (`false` drops attribute, `true` keeps it). |
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
instrumentation:
|
||||
common:
|
||||
histogram:
|
||||
aggregation: explicit
|
||||
seconds:
|
||||
buckets:
|
||||
[
|
||||
'5ms',
|
||||
'10ms',
|
||||
'25ms',
|
||||
'50ms',
|
||||
'75ms',
|
||||
'100ms',
|
||||
'250ms',
|
||||
'500ms',
|
||||
'750ms',
|
||||
'1s',
|
||||
'2.5s',
|
||||
'5s',
|
||||
'7.5s',
|
||||
'10s'
|
||||
]
|
||||
record_min_max: false
|
||||
bytes:
|
||||
buckets:
|
||||
[
|
||||
'128B',
|
||||
'512B',
|
||||
'1KB',
|
||||
'2KB',
|
||||
'4KB',
|
||||
'8KB',
|
||||
'16KB',
|
||||
'32KB',
|
||||
'64KB',
|
||||
'128KB',
|
||||
'256KB',
|
||||
'512KB',
|
||||
'1MB',
|
||||
'2MB',
|
||||
'3MB',
|
||||
'4MB',
|
||||
'5MB'
|
||||
]
|
||||
record_min_max: false
|
||||
instruments:
|
||||
http.server.request.duration: true
|
||||
http.client.request.duration:
|
||||
attributes:
|
||||
subgraph.name: true
|
||||
http.response.status_code: true
|
||||
server.address: false
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
</details>
|
||||
</div>
|
||||
|
||||
|
|
|
|||
825
packages/web/docs/src/content/router/observability/metrics.mdx
Normal file
825
packages/web/docs/src/content/router/observability/metrics.mdx
Normal file
|
|
@ -0,0 +1,825 @@
|
|||
---
|
||||
title: 'OpenTelemetry Metrics'
|
||||
---
|
||||
|
||||
import { Callout } from '#components/callout'
|
||||
import { MetricsSection } from '#components/otel-metrics/metrics-section'
|
||||
import { Tabs } from '@theguild/components'
|
||||
|
||||
# OpenTelemetry Metrics
|
||||
|
||||
Hive Router exposes OpenTelemetry metrics for gateway traffic, subgraph traffic, cache behavior,
|
||||
supergraph lifecycle, and GraphQL errors.
|
||||
|
||||
This guide explains where to export metrics, how to configure OTLP and Prometheus, how to customize
|
||||
instruments, and what each metric/label means in practice.
|
||||
|
||||
## Choose your metrics destination
|
||||
|
||||
Hive Router exposes metrics through two widely used integration patterns:
|
||||
|
||||
- OTLP-based observability backends
|
||||
- Prometheus scrape endpoints
|
||||
|
||||
Most teams already running an OpenTelemetry pipeline tend to integrate via OTLP, while teams built
|
||||
around Prometheus and Grafana typically stick with Prometheus scraping.
|
||||
|
||||
### Send metrics to OTLP-compatible backends
|
||||
|
||||
Hive Router can export metrics using OTLP to standard OpenTelemetry pipelines, including the
|
||||
OpenTelemetry Collector and vendor backends that support OTLP ingestion over HTTP or gRPC.
|
||||
|
||||
After enabling the exporter, generate some traffic through the router and confirm that new metric
|
||||
series appear in your backend (for example HTTP server/client latency, cache metrics, and supergraph
|
||||
execution metrics).
|
||||
|
||||
If metrics do not appear, verify:
|
||||
|
||||
- Endpoint reachability (network, DNS, TLS)
|
||||
- Authentication credentials or headers
|
||||
- Exporter protocol matches the backend (OTLP/HTTP vs OTLP/gRPC)
|
||||
|
||||
<Tabs items={["OTLP over HTTP", "OTLP over gRPC"]}>
|
||||
|
||||
<Tabs.Tab>
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
exporters:
|
||||
- kind: otlp
|
||||
enabled: true
|
||||
protocol: http
|
||||
endpoint: https://otel-collector.example.com/v1/metrics
|
||||
interval: 30s
|
||||
max_export_timeout: 5s
|
||||
http:
|
||||
headers:
|
||||
authorization:
|
||||
expression: |
|
||||
"Bearer " + env("OTLP_TOKEN")
|
||||
```
|
||||
|
||||
</Tabs.Tab>
|
||||
|
||||
<Tabs.Tab>
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
exporters:
|
||||
- kind: otlp
|
||||
enabled: true
|
||||
protocol: grpc
|
||||
endpoint: https://otel-collector.example.com:4317
|
||||
interval: 30s
|
||||
max_export_timeout: 5s
|
||||
grpc:
|
||||
metadata:
|
||||
x-api-key:
|
||||
expression: env("OTEL_API_KEY")
|
||||
tls:
|
||||
domain_name: otel-collector.example.com
|
||||
ca: /etc/certs/ca.pem
|
||||
cert: /etc/certs/client.pem
|
||||
key: /etc/certs/client.key
|
||||
```
|
||||
|
||||
</Tabs.Tab>
|
||||
|
||||
</Tabs>
|
||||
|
||||
### Expose metrics for Prometheus scraping
|
||||
|
||||
If your observability stack is Prometheus-first, Hive Router can expose an HTTP endpoint that
|
||||
Prometheus scrapes at its configured interval.
|
||||
|
||||
The `port` and `path` settings define the address where the Router exposes metrics. Prometheus must
|
||||
be able to reach that address from its runtime environment (local network, Kubernetes service, or VM
|
||||
network path).
|
||||
|
||||
<Callout type="note">
|
||||
If `port` is not set, or is the same as the main HTTP server port, the Router exposes metrics
|
||||
through the same HTTP server that serves the GraphQL API. If the port is different, the Router
|
||||
starts a separate HTTP server dedicated solely to the Prometheus metrics endpoint.
|
||||
</Callout>
|
||||
|
||||
In production, make sure this endpoint is reachable only by trusted scrapers (for example via
|
||||
network policy, firewall rules, or private ingress). Once configured, confirm the target appears as
|
||||
healthy in Prometheus and then verify expected series are present (for example
|
||||
`http.server.request.duration`, `http.client.request.duration`).
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
exporters:
|
||||
- kind: prometheus
|
||||
enabled: true
|
||||
port: 9090
|
||||
path: /metrics
|
||||
```
|
||||
|
||||
## Production baseline
|
||||
|
||||
For production workloads, start with a single primary exporter, define a clear service identity, and
|
||||
keep default instrumentation settings.
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
resource:
|
||||
attributes:
|
||||
service.name: hive-router
|
||||
service.namespace: your-platform
|
||||
deployment.environment:
|
||||
expression: env("ENVIRONMENT")
|
||||
metrics:
|
||||
exporters:
|
||||
- kind: otlp
|
||||
enabled: true
|
||||
protocol: grpc
|
||||
endpoint: https://otel-collector.example.com:4317
|
||||
interval: 30s
|
||||
max_export_timeout: 5s
|
||||
```
|
||||
|
||||
This is a safe baseline and works well before introducing instrumentation-level customization.
|
||||
Additional exporters can be added later, but starting with one simplifies validation and
|
||||
troubleshooting.
|
||||
|
||||
### Cardinality considerations
|
||||
|
||||
For production workloads, consider disabling the `graphql.operation.name` label or even
|
||||
`graphql.operation.type` on high-volume metrics.
|
||||
|
||||
<Callout type="warning">
|
||||
`graphql.operation.name` can create very high-cardinality metrics.
|
||||
|
||||
Operation names come from client requests. Without persisted operations, clients can send many
|
||||
distinct operation names (or random names), which can rapidly increase cardinality and cost in
|
||||
Prometheus and OTLP backends.
|
||||
|
||||
</Callout>
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
instrumentation:
|
||||
instruments:
|
||||
http.server.request.duration:
|
||||
attributes:
|
||||
graphql.operation.name: false
|
||||
http.server.request.body.size:
|
||||
attributes:
|
||||
graphql.operation.name: false
|
||||
http.server.response.body.size:
|
||||
attributes:
|
||||
graphql.operation.name: false
|
||||
```
|
||||
|
||||
## Customize instrumentation
|
||||
|
||||
You can override behavior per metric under `telemetry.metrics.instrumentation.instruments`.
|
||||
|
||||
<Callout type="tip">
|
||||
Disable non-essential labels to control cost, since each additional label value increases active
|
||||
time-series cardinality - a primary billing and performance driver in platforms like Grafana and
|
||||
Datadog.
|
||||
</Callout>
|
||||
|
||||
For production guidance on label cardinality (especially `graphql.operation.name`), see
|
||||
[Cardinality considerations](#cardinality-considerations).
|
||||
|
||||
- `false` disables a metric.
|
||||
- `true` keeps default behavior.
|
||||
- object form keeps the metric enabled and optionally overrides its attributes.
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
instrumentation:
|
||||
instruments:
|
||||
# Disable HTTP server request duration metric
|
||||
http.server.request.duration: false
|
||||
http.client.request.duration:
|
||||
attributes:
|
||||
# Disable the label
|
||||
subgraph.name: false
|
||||
# Enable the label (labels are enabled by default)
|
||||
http.response.status_code: true
|
||||
```
|
||||
|
||||
Attribute override behavior:
|
||||
|
||||
- `false` - drop label from that metric
|
||||
- `true` - keep label (all labels are enabled by default)
|
||||
|
||||
Histogram aggregation can also be customized under
|
||||
`telemetry.metrics.instrumentation.common.histogram`.
|
||||
|
||||
- [`explicit`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#histogram) (default)
|
||||
uses unit-specific bucket sets, which you configure per unit:
|
||||
- `seconds` for histogram unit `s`
|
||||
- `bytes` for histogram unit `By`
|
||||
- [`exponential`](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram)
|
||||
uses one shared exponential strategy for all histogram metrics.
|
||||
- `record_min_max` controls whether min and max are reported for histogram points.
|
||||
|
||||
Bucket format rules:
|
||||
|
||||
- [`buckets`](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#explicit-bucket-histogram-aggregation)
|
||||
can be either all numbers or all strings.
|
||||
- mixed arrays are not allowed.
|
||||
- `seconds.buckets` string values are parsed as durations (for example `"5ms"`, `"1s"`).
|
||||
- `bytes.buckets` string values are parsed as human-readable sizes (for example `"1KB"`, `"5MB"`).
|
||||
|
||||
In `explicit` mode, histograms with units other than `s` and `By` cause the router to fail at startup.
|
||||
|
||||
```yaml filename="router.config.yaml"
|
||||
telemetry:
|
||||
metrics:
|
||||
instrumentation:
|
||||
common:
|
||||
histogram:
|
||||
aggregation: explicit
|
||||
seconds:
|
||||
buckets:
|
||||
[
|
||||
'5ms',
|
||||
'10ms',
|
||||
'25ms',
|
||||
'50ms',
|
||||
'75ms',
|
||||
'100ms',
|
||||
'250ms',
|
||||
'500ms',
|
||||
'750ms',
|
||||
'1s',
|
||||
'2.5s',
|
||||
'5s',
|
||||
'7.5s',
|
||||
'10s'
|
||||
]
|
||||
record_min_max: false
|
||||
bytes:
|
||||
buckets:
|
||||
[
|
||||
'128B',
|
||||
'512B',
|
||||
'1KB',
|
||||
'2KB',
|
||||
'4KB',
|
||||
'8KB',
|
||||
'16KB',
|
||||
'32KB',
|
||||
'64KB',
|
||||
'128KB',
|
||||
'256KB',
|
||||
'512KB',
|
||||
'1MB',
|
||||
'2MB',
|
||||
'3MB',
|
||||
'4MB',
|
||||
'5MB'
|
||||
]
|
||||
record_min_max: false
|
||||
```
|
||||
|
||||
## Metrics reference
|
||||
|
||||
### GraphQL
|
||||
|
||||
GraphQL metrics capture errors surfaced by the router across all stages of a GraphQL request
|
||||
lifecycle.
|
||||
|
||||
<MetricsSection
|
||||
metrics={[
|
||||
{
|
||||
name: 'hive.router.graphql.errors_total',
|
||||
type: 'Counter',
|
||||
unit: '{error}',
|
||||
description:
|
||||
'Total count of GraphQL errors encountered during query processing and execution, categorized by error code.',
|
||||
labels: ['code']
|
||||
}
|
||||
]}
|
||||
labels={[
|
||||
{
|
||||
name: 'code',
|
||||
meaning: 'GraphQL error code',
|
||||
typicalValues: [
|
||||
'GRAPHQL_PARSE_FAILED',
|
||||
'GRAPHQL_VALIDATION_FAILED',
|
||||
'PLAN_EXECUTION_FAILED',
|
||||
'UNKNOWN',
|
||||
'...'
|
||||
],
|
||||
notes: `Uses "extensions.code" values and router's error codes. "UNKNOWN" is used when no code is available.`
|
||||
}
|
||||
]}
|
||||
/>
|
||||
|
||||
### Supergraph
|
||||
|
||||
Supergraph metrics cover the polling and processing lifecycle of schema updates.
|
||||
|
||||
<MetricsSection
|
||||
metrics={[
|
||||
{
|
||||
name: 'hive.router.supergraph.poll.total',
|
||||
type: 'Counter',
|
||||
description: 'Total number of supergraph polling attempts, categorized by poll result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.supergraph.poll.duration',
|
||||
type: 'Histogram',
|
||||
unit: 'Seconds',
|
||||
description: 'Duration of supergraph polling attempts, categorized by poll result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.supergraph.process.duration',
|
||||
type: 'Histogram',
|
||||
unit: 'Seconds',
|
||||
description: 'Time spent processing supergraph updates, categorized by status.',
|
||||
labels: ['status']
|
||||
}
|
||||
]}
|
||||
labels={[
|
||||
{
|
||||
name: 'result',
|
||||
meaning: 'Result of the poll',
|
||||
typicalValues: ['updated', 'not_modified', 'error'],
|
||||
notes: 'Used by "hive.router.supergraph.poll.*" metrics only'
|
||||
},
|
||||
{
|
||||
name: 'status',
|
||||
meaning: 'Supergraph processing status',
|
||||
typicalValues: ['ok', 'error'],
|
||||
notes: 'Used by "hive.router.supergraph.process.*" metrics only'
|
||||
}
|
||||
]}
|
||||
/>
|
||||
|
||||
### HTTP server
|
||||
|
||||
HTTP server metrics capture inbound client traffic processed by the router.
|
||||
|
||||
<MetricsSection
|
||||
metrics={[
|
||||
{
|
||||
name: 'http.server.request.duration',
|
||||
type: 'Histogram',
|
||||
unit: 'Seconds',
|
||||
description: 'Duration of inbound HTTP requests handled by the router.',
|
||||
labels: [
|
||||
'http.request.method',
|
||||
'http.response.status_code',
|
||||
'http.route',
|
||||
'network.protocol.name',
|
||||
'network.protocol.version',
|
||||
'url.scheme',
|
||||
'error.type',
|
||||
'graphql.operation.name',
|
||||
'graphql.operation.type',
|
||||
'graphql.response.status'
|
||||
]
|
||||
},
|
||||
{
|
||||
name: 'http.server.request.body.size',
|
||||
type: 'Histogram',
|
||||
unit: 'Bytes',
|
||||
description: 'Size of inbound HTTP request bodies handled by the router.',
|
||||
labels: [
|
||||
'http.request.method',
|
||||
'http.response.status_code',
|
||||
'http.route',
|
||||
'network.protocol.name',
|
||||
'network.protocol.version',
|
||||
'url.scheme',
|
||||
'error.type',
|
||||
'graphql.operation.name',
|
||||
'graphql.operation.type',
|
||||
'graphql.response.status'
|
||||
]
|
||||
},
|
||||
{
|
||||
name: 'http.server.response.body.size',
|
||||
type: 'Histogram',
|
||||
unit: 'Bytes',
|
||||
description: 'Size of outbound HTTP response bodies returned by the router.',
|
||||
labels: [
|
||||
'http.request.method',
|
||||
'http.response.status_code',
|
||||
'http.route',
|
||||
'network.protocol.name',
|
||||
'network.protocol.version',
|
||||
'url.scheme',
|
||||
'error.type',
|
||||
'graphql.operation.name',
|
||||
'graphql.operation.type',
|
||||
'graphql.response.status'
|
||||
]
|
||||
},
|
||||
{
|
||||
name: 'http.server.active_requests',
|
||||
type: 'UpDownCounter',
|
||||
unit: '{request}',
|
||||
description: 'Current number of in-flight inbound HTTP requests.',
|
||||
labels: ['http.request.method', 'network.protocol.name', 'url.scheme']
|
||||
}
|
||||
]}
|
||||
labels={[
|
||||
{
|
||||
name: 'http.request.method',
|
||||
meaning: 'HTTP method',
|
||||
typicalValues: [
|
||||
'GET',
|
||||
'POST',
|
||||
'PUT',
|
||||
'PATCH',
|
||||
'DELETE',
|
||||
'HEAD',
|
||||
'OPTIONS',
|
||||
'CONNECT',
|
||||
'TRACE',
|
||||
'QUERY',
|
||||
'_OTHER'
|
||||
],
|
||||
notes: '_OTHER is fallback for unknown methods'
|
||||
},
|
||||
{
|
||||
name: 'http.response.status_code',
|
||||
meaning: 'Response status code',
|
||||
typicalValues: ['200', '400', '500', '...']
|
||||
},
|
||||
{
|
||||
name: 'http.route',
|
||||
meaning: 'Normalized router path',
|
||||
typicalValues: ['/graphql']
|
||||
},
|
||||
{
|
||||
name: 'network.protocol.name',
|
||||
meaning: 'Protocol name',
|
||||
typicalValues: ['http']
|
||||
},
|
||||
{
|
||||
name: 'network.protocol.version',
|
||||
meaning: 'Protocol version',
|
||||
typicalValues: ['0.9', '1.0', '1.1', '2', '3']
|
||||
},
|
||||
{
|
||||
name: 'url.scheme',
|
||||
meaning: 'URL scheme',
|
||||
typicalValues: ['http', 'https']
|
||||
},
|
||||
{
|
||||
name: 'error.type',
|
||||
meaning: 'Error classification for failed requests',
|
||||
typicalValues: ['status code >= 400'],
|
||||
notes: 'Only set for failed requests'
|
||||
},
|
||||
{
|
||||
name: 'graphql.operation.name',
|
||||
meaning: 'GraphQL operation name associated with the HTTP request',
|
||||
typicalValues: ['UsersQuery', 'IntrospectionQuery', 'UNKNOWN'],
|
||||
notes:
|
||||
'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. High-cardinality risk: value is client-controlled and can explode without persisted operations.'
|
||||
},
|
||||
{
|
||||
name: 'graphql.operation.type',
|
||||
meaning: 'GraphQL operation type',
|
||||
typicalValues: ['query', 'mutation', 'subscription'],
|
||||
notes:
|
||||
'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. Omitted when unknown'
|
||||
},
|
||||
{
|
||||
name: 'graphql.response.status',
|
||||
meaning: 'GraphQL response status for the request',
|
||||
typicalValues: ['ok', 'error'],
|
||||
notes:
|
||||
'Used by http.server.request.duration, http.server.request.body.size, and http.server.response.body.size. "error" indicates the GraphQL response contains at least one error'
|
||||
}
|
||||
]}
|
||||
/>
|
||||
|
||||
### HTTP client
|
||||
|
||||
HTTP client metrics capture outbound requests the router makes to subgraphs.
|
||||
|
||||
<MetricsSection
|
||||
metrics={[
|
||||
{
|
||||
name: 'http.client.request.duration',
|
||||
type: 'Histogram',
|
||||
unit: 'Seconds',
|
||||
description: 'Duration of outbound HTTP requests sent from router to subgraphs.',
|
||||
labels: [
|
||||
'http.request.method',
|
||||
'server.address',
|
||||
'server.port',
|
||||
'network.protocol.name',
|
||||
'network.protocol.version',
|
||||
'url.scheme',
|
||||
'subgraph.name',
|
||||
'http.response.status_code',
|
||||
'error.type',
|
||||
'graphql.response.status'
|
||||
]
|
||||
},
|
||||
{
|
||||
name: 'http.client.request.body.size',
|
||||
type: 'Histogram',
|
||||
unit: 'Bytes',
|
||||
description: 'Size of outbound HTTP request bodies sent to subgraphs.',
|
||||
labels: [
|
||||
'http.request.method',
|
||||
'server.address',
|
||||
'server.port',
|
||||
'network.protocol.name',
|
||||
'network.protocol.version',
|
||||
'url.scheme',
|
||||
'subgraph.name',
|
||||
'http.response.status_code',
|
||||
'error.type',
|
||||
'graphql.response.status'
|
||||
]
|
||||
},
|
||||
{
|
||||
name: 'http.client.response.body.size',
|
||||
type: 'Histogram',
|
||||
unit: 'Bytes',
|
||||
description: 'Size of HTTP response bodies returned by subgraphs.',
|
||||
labels: [
|
||||
'http.request.method',
|
||||
'server.address',
|
||||
'server.port',
|
||||
'network.protocol.name',
|
||||
'network.protocol.version',
|
||||
'url.scheme',
|
||||
'subgraph.name',
|
||||
'http.response.status_code',
|
||||
'error.type',
|
||||
'graphql.response.status'
|
||||
]
|
||||
},
|
||||
{
|
||||
name: 'http.client.active_requests',
|
||||
type: 'UpDownCounter',
|
||||
unit: '{request}',
|
||||
description: 'Current number of in-flight outbound HTTP requests to subgraphs.',
|
||||
labels: [
|
||||
'http.request.method',
|
||||
'server.address',
|
||||
'server.port',
|
||||
'url.scheme',
|
||||
'subgraph.name'
|
||||
]
|
||||
}
|
||||
]}
|
||||
labels={[
|
||||
{
|
||||
name: 'http.request.method',
|
||||
meaning: 'HTTP method',
|
||||
typicalValues: [
|
||||
'GET',
|
||||
'POST',
|
||||
'PUT',
|
||||
'PATCH',
|
||||
'DELETE',
|
||||
'HEAD',
|
||||
'OPTIONS',
|
||||
'CONNECT',
|
||||
'TRACE',
|
||||
'QUERY',
|
||||
'_OTHER'
|
||||
],
|
||||
notes: '_OTHER is fallback for unknown methods'
|
||||
},
|
||||
{
|
||||
name: 'http.response.status_code',
|
||||
meaning: 'Response status code',
|
||||
typicalValues: ['200', '400', '500', '...']
|
||||
},
|
||||
{
|
||||
name: 'network.protocol.name',
|
||||
meaning: 'Protocol name',
|
||||
typicalValues: ['http']
|
||||
},
|
||||
{
|
||||
name: 'network.protocol.version',
|
||||
meaning: 'Protocol version',
|
||||
typicalValues: ['0.9', '1.0', '1.1', '2', '3']
|
||||
},
|
||||
{
|
||||
name: 'url.scheme',
|
||||
meaning: 'URL scheme',
|
||||
typicalValues: ['http', 'https']
|
||||
},
|
||||
{
|
||||
name: 'server.address',
|
||||
meaning: 'Subgraph host',
|
||||
typicalValues: ['URI host', 'unknown'],
|
||||
notes: 'URI host, or unknown fallback'
|
||||
},
|
||||
{
|
||||
name: 'server.port',
|
||||
meaning: 'Subgraph port',
|
||||
typicalValues: ['80', '443'],
|
||||
notes: 'Explicit URI port, or fallback 80/443'
|
||||
},
|
||||
{
|
||||
name: 'subgraph.name',
|
||||
meaning: 'Subgraph identifier',
|
||||
typicalValues: ['accounts'],
|
||||
notes: 'Configured names (for example "accounts")'
|
||||
},
|
||||
{
|
||||
name: 'error.type',
|
||||
meaning: 'Error classification',
|
||||
typicalValues: ['400', 'SUBGRAPH_REQUEST_FAILURE', '...'],
|
||||
notes: 'Numeric status code >= 400 or execution error code string'
|
||||
},
|
||||
{
|
||||
name: 'graphql.response.status',
|
||||
meaning: 'GraphQL response status for the subgraph request',
|
||||
typicalValues: ['ok', 'error'],
|
||||
notes:
|
||||
'Set to "ok" when the parsed subgraph response has no GraphQL errors. Set to "error" when the subgraph response includes GraphQL errors or when transport/deserialization fails.'
|
||||
}
|
||||
]}
|
||||
/>
|
||||
|
||||
### Cache
|
||||
|
||||
Cache metrics track lookup behavior and cache size across router caches used during request
|
||||
preparation and planning stages.
|
||||
|
||||
#### Parsing cache
|
||||
|
||||
Parsing cache metrics measure query parse cache hit/miss behavior and cache size.
|
||||
|
||||
<MetricsSection
|
||||
metrics={[
|
||||
{
|
||||
name: 'hive.router.parse_cache.requests_total',
|
||||
type: 'Counter',
|
||||
description: 'Total number of parsing cache lookups, categorized by result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.parse_cache.duration',
|
||||
type: 'Histogram',
|
||||
unit: 'Seconds',
|
||||
description: 'Duration of parsing cache lookups, categorized by result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.parse_cache.size',
|
||||
type: 'Gauge',
|
||||
description: 'Current number of entries stored in the parsing cache.'
|
||||
}
|
||||
]}
|
||||
/>
|
||||
|
||||
#### Validation cache
|
||||
|
||||
Validation cache metrics measure query validation cache hit/miss behavior and cache size.
|
||||
|
||||
<MetricsSection
|
||||
metrics={[
|
||||
{
|
||||
name: 'hive.router.validate_cache.requests_total',
|
||||
type: 'Counter',
|
||||
description: 'Total number of validation cache lookups, categorized by result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.validate_cache.duration',
|
||||
type: 'Histogram',
|
||||
unit: 'Seconds',
|
||||
description: 'Duration of validation cache lookups, categorized by result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.validate_cache.size',
|
||||
type: 'Gauge',
|
||||
description: 'Current number of entries stored in the validation cache.'
|
||||
}
|
||||
]}
|
||||
/>
|
||||
|
||||
#### Normalization cache
|
||||
|
||||
Normalization cache metrics measure query normalization cache hit/miss behavior and cache size.
|
||||
|
||||
<MetricsSection
|
||||
metrics={[
|
||||
{
|
||||
name: 'hive.router.normalize_cache.requests_total',
|
||||
type: 'Counter',
|
||||
description: 'Total number of normalization cache lookups, categorized by result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.normalize_cache.duration',
|
||||
type: 'Histogram',
|
||||
unit: 'Seconds',
|
||||
description: 'Duration of normalization cache lookups, categorized by result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.normalize_cache.size',
|
||||
type: 'Gauge',
|
||||
description: 'Current number of entries stored in the normalization cache.'
|
||||
}
|
||||
]}
|
||||
/>
|
||||
|
||||
#### Planning cache
|
||||
|
||||
Planning cache metrics measure query planning cache hit/miss behavior and cache size.
|
||||
|
||||
<MetricsSection
|
||||
metrics={[
|
||||
{
|
||||
name: 'hive.router.plan_cache.requests_total',
|
||||
type: 'Counter',
|
||||
description: 'Total number of planning cache lookups, categorized by result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.plan_cache.duration',
|
||||
type: 'Histogram',
|
||||
unit: 'Seconds',
|
||||
description: 'Duration of planning cache lookups, categorized by result.',
|
||||
labels: ['result']
|
||||
},
|
||||
{
|
||||
name: 'hive.router.plan_cache.size',
|
||||
type: 'Gauge',
|
||||
description: 'Current number of entries stored in the planning cache.'
|
||||
}
|
||||
]}
|
||||
/>
|
||||
|
||||
#### Labels
|
||||
|
||||
These labels are shared by cache lookup counters and duration histograms.
|
||||
|
||||
<div className="mt-4">
|
||||
<MetricsSection
|
||||
labels={[
|
||||
{
|
||||
name: 'result',
|
||||
meaning: 'Cache lookup outcome',
|
||||
typicalValues: ['hit', 'miss'],
|
||||
notes: 'Used by cache `requests_total` and `duration` metrics'
|
||||
}
|
||||
]}
|
||||
/>
|
||||
</div>
|
||||
|
||||
## What to monitor in production
|
||||
|
||||
The examples below show which signals to monitor in production and how to break them down so you can
|
||||
quickly isolate API, subgraph, cache, and GraphQL issues.
|
||||
|
||||
### Monitor end-to-end latency of your GraphQL API
|
||||
|
||||
Use [`http.server.request.duration`](#metric-http-server-request-duration) as your primary latency
|
||||
signal.
|
||||
|
||||
In production, break this metric down by `http.route`, `http.request.method`,
|
||||
`http.response.status_code`, and/or `graphql.response.status`, then track p95 and p99 latency per
|
||||
route and method. Keep successful and failed responses separated so error-path latency does not get
|
||||
hidden by healthy traffic.
|
||||
|
||||
### Monitor health of your subgraphs
|
||||
|
||||
Use [`http.client.request.duration`](#metric-http-client-request-duration) and
|
||||
[`http.client.active_requests`](#metric-http-client-active-requests) to monitor dependency health
|
||||
across your federated graph.
|
||||
|
||||
Break these metrics down by `subgraph.name`, `http.response.status_code`, and `error.type` to
|
||||
identify which subgraph is driving tail latency or error spikes.
|
||||
|
||||
### Monitor cache effectiveness and planning pressure
|
||||
|
||||
Use the cache metrics to evaluate cache hit ratio, miss cost, and pressure over time.
|
||||
|
||||
For request and duration metrics, split by `result` (`hit` and `miss`) so you can track hit ratio
|
||||
and miss latency per cache kind.
|
||||
|
||||
### Monitor GraphQL errors over time
|
||||
|
||||
Use [`hive.router.graphql.errors_total`](#metric-hive-router-graphql-errors-total) and break it down
|
||||
by `code` to track both volume and error distribution.
|
||||
|
||||
In production, monitor how error-code distribution changes over time, not only total count, so you
|
||||
can separate validation issues from execution failures.
|
||||
|
||||
## Configuration reference
|
||||
|
||||
For full options and defaults, see the
|
||||
[telemetry configuration reference](/docs/router/configuration/telemetry).
|
||||
Loading…
Reference in a new issue