2022-05-18 07:26:57 +00:00
import * as k8s from '@pulumi/kubernetes' ;
2022-12-28 19:22:54 +00:00
import { interpolate , Output } from '@pulumi/pulumi' ;
2025-03-04 13:50:12 +00:00
import { Environment } from '../services/environment' ;
2023-02-28 06:07:45 +00:00
import { helmChart } from './helm' ;
2024-03-13 10:17:52 +00:00
import { Values as OpenTelemetryCollectorValues } from './opentelemetry-collector.types' ;
import { VectorValues } from './vector.types' ;
2022-05-18 07:26:57 +00:00
2024-11-25 10:34:42 +00:00
/** Endpoint and basic-auth credentials of one remote telemetry backend. */
type RemoteBackendCredentials = {
  endpoint: Output<string> | string;
  username: Output<string> | string;
  password: Output<string>;
};

/**
 * Observability targets: either the literal `'local'`, or the credentials of the three
 * remote backends, keyed as `loki`, `prom` and `tempo`.
 */
export type ObservabilityConfig =
  | 'local'
  | Record<'loki' | 'prom' | 'tempo', RemoteBackendCredentials>;
2024-03-13 10:17:52 +00:00
// Helm chart reference for the OpenTelemetry Collector. It is spread into the
// `k8s.helm.v3.Chart` arguments of the 'metrics' chart in `Observability.deploy`.
// NOTE(review): the arguments look like (repository URL, chart name, pinned version) — confirm against `./helm`.
// prettier-ignore
2025-01-18 06:01:05 +00:00
export const OTLP_COLLECTOR_CHART = helmChart ( 'https://open-telemetry.github.io/opentelemetry-helm-charts' , 'opentelemetry-collector' , '0.111.2' ) ;
2024-03-13 10:17:52 +00:00
// Helm chart reference for Vector. It is spread into the `k8s.helm.v3.Chart`
// arguments of the 'vector-logging' chart in `Observability.deploy`.
// NOTE(review): the arguments look like (repository URL, chart name, pinned version) — confirm against `./helm`.
// prettier-ignore
2024-08-18 12:24:23 +00:00
export const VECTOR_HELM_CHART = helmChart ( 'https://helm.vector.dev' , 'vector' , '0.35.0' ) ;
2024-03-13 10:17:52 +00:00
2022-05-18 07:26:57 +00:00
/**
 * Deploys the in-cluster observability stack into the `observability` namespace:
 * an OpenTelemetry Collector (Helm release `metrics`) and Vector (Helm release `vector-logging`).
 *
 * With the `'local'` config nothing is configured to leave the cluster: the collector only uses
 * its `debug` exporter and Vector writes Pod logs to a console sink. With a remote config,
 * metrics go to the `prom` endpoint, traces to the `tempo` endpoint and logs to the `loki` endpoint.
 */
export class Observability {
  constructor(
    private readonly environment: Environment,
    private readonly config: ObservabilityConfig,
  ) {}

  /**
   * Creates the namespace and both Helm charts.
   *
   * @returns the collector's `v1/Service` resource, looked up from the rendered chart.
   */
  deploy() {
    const nsName = 'observability';
    const ns = new k8s.core.v1.Namespace(nsName, {
      metadata: {
        name: nsName,
      },
    });

    // Narrow the config once: `undefined` for the 'local' setup, the remote credentials otherwise.
    const remote = this.config === 'local' ? undefined : this.config;

    // We are using otel-collector to scrape metrics and collect traces from Pods
    const otlpCollector = new k8s.helm.v3.Chart('metrics', {
      ...OTLP_COLLECTOR_CHART,
      namespace: ns.metadata.name,
      values: this.collectorValues(remote),
    });

    // The lookup key is "<namespace>/<name>" and must match exactly (no surrounding whitespace).
    const otlpCollectorService = otlpCollector.getResource(
      'v1/Service',
      `${nsName}/metrics-opentelemetry-collector`,
    );

    // We are using Vector to scrape logs from the K8s Pods, and send it to Grafana Cloud
    new k8s.helm.v3.Chart(
      'vector-logging',
      {
        ...VECTOR_HELM_CHART,
        namespace: ns.metadata.name,
        values: this.vectorValues(remote),
      },
      {
        dependsOn: [ns],
      },
    );

    return {
      otlpCollectorService,
    };
  }

  /**
   * Builds the Helm values for the OpenTelemetry Collector chart.
   * https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-collector/values.yaml
   *
   * @param remote remote backend credentials, or `undefined` for the local setup.
   */
  private collectorValues(
    remote: Exclude<ObservabilityConfig, 'local'> | undefined,
  ): OpenTelemetryCollectorValues {
    // Basic-auth extension used by the traces exporter; only defined for a remote setup.
    const extensions = remote
      ? {
          'basicauth/grafana_cloud_traces': {
            client_auth: {
              username: remote.tempo.username,
              password: remote.tempo.password,
            },
          },
        }
      : {};

    const exporters = remote
      ? {
          prometheusremotewrite: {
            // Credentials are embedded in the URL as `user:password@host`.
            endpoint: interpolate`https://${remote.prom.username}:${remote.prom.password}@${remote.prom.endpoint}`,
          },
          'otlp/grafana_cloud_traces': {
            endpoint: remote.tempo.endpoint,
            auth: {
              authenticator: 'basicauth/grafana_cloud_traces',
            },
          },
        }
      : {};

    return {
      image: {
        repository: 'otel/opentelemetry-collector-contrib',
      },
      mode: 'deployment',
      replicaCount: 1,
      resources: {
        limits: {
          cpu: this.environment.podsConfig.internalObservability.cpuLimit,
          memory: this.environment.podsConfig.internalObservability.memoryLimit,
        },
      },
      podAnnotations: {
        // This is done because open-telemetry collector doesn't always update the deployment
        // when the config file changes. This will force-restart it.
        'pulumi.com/update-timestamp': Date.now().toString(),
      },
      clusterRole: {
        create: true,
        rules: [
          {
            apiGroups: [''],
            resources: [
              'events',
              'namespaces',
              'namespaces/status',
              'nodes',
              'nodes/spec',
              'pods',
              'pods/metrics',
              'nodes/metrics',
              'pods/status',
              'replicationcontrollers',
              'replicationcontrollers/status',
              'resourcequotas',
              'services',
              'endpoints',
            ],
            verbs: ['get', 'list', 'watch'],
          },
          {
            apiGroups: ['apps'],
            resources: ['daemonsets', 'deployments', 'replicasets', 'statefulsets'],
            verbs: ['get', 'list', 'watch'],
          },
          {
            apiGroups: ['extensions'],
            resources: ['daemonsets', 'deployments', 'replicasets'],
            verbs: ['get', 'list', 'watch'],
          },
          {
            apiGroups: ['batch'],
            resources: ['jobs', 'cronjobs'],
            verbs: ['get', 'list', 'watch'],
          },
          {
            apiGroups: ['autoscaling'],
            resources: ['horizontalpodautoscalers'],
            verbs: ['get', 'list', 'watch'],
          },
        ],
      },
      config: {
        exporters: {
          ...exporters,
          debug: {
            verbosity: 'basic',
          },
        },
        extensions: {
          ...extensions,
          health_check: {},
        },
        processors: {
          batch: {},
          memory_limiter: {
            check_interval: '5s',
            limit_mib: 409,
            spike_limit_mib: 128,
          },
          // Filter OpenTelemetry traces that are not needed for debugging.
          'filter/traces': {
            error_mode: 'ignore',
            traces: {
              span: [
                // Ignore all HEAD/OPTIONS requests
                'attributes["component"] == "proxy" and attributes["http.method"] == "HEAD"',
                'attributes["component"] == "proxy" and attributes["http.method"] == "OPTIONS"',
                // Ignore health checks
                'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/_readiness"',
                'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/_health"',
                'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and IsMatch(attributes["http.url"], ".*/_health") == true',
                // Ignore Contour/Envoy traces for /usage requests
                'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and attributes["http.url"] == "/usage" and (attributes["http.status_code"] == "200" or attributes["http.status_code"] == "429")',
                // Ignore metrics scraping
                'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and attributes["http.url"] == "/metrics"',
                // Ignore webapp HTTP calls
                'attributes["component"] == "proxy" and attributes["http.method"] == "POST" and IsMatch(attributes["upstream_cluster.name"], "default_app-.*") == true',
                'attributes["component"] == "proxy" and attributes["http.method"] == "GET" and IsMatch(attributes["upstream_cluster.name"], "default_app-.*") == true',
              ],
            },
          },
          // Remove raw trace information that we don't really need and exposed by default.
          'attributes/trace_filter': {
            actions: [
              'downstream_cluster',
              'podName',
              'podNamespace',
              'zone',
              'upstream_cluster',
              'peer.address',
            ].map(key => ({
              key,
              action: 'delete',
            })),
          },
          // Remove attributes that are not needed for debugging.
          'resource/trace_cleanup': {
            attributes: [
              'host.arch',
              'process.command',
              'process.command_args',
              'process.executable.path',
              'process.executable.name',
              'process.owner',
              'process.pid',
              'process.runtime.description',
              'process.runtime.name',
              'process.runtime.version',
              'telemetry.sdk.language',
              'telemetry.sdk.name',
              'telemetry.sdk.version',
            ].map(key => ({
              key,
              action: 'delete',
            })),
          },
          // Contour spans are not very human-readable by default, so we are transforming them
          // into a format that is easier to understand.
          // First, we modify the span URL to be relative, and remove the hostname and full path,
          // Then, we rename to be "METHOD /path"
          'transform/patch_envoy_spans': {
            error_mode: 'ignore',
            trace_statements: [
              {
                context: 'span',
                statements: [
                  // By default, Envoy reports this as full URL, but we only want the path
                  // NOTE(review): "$$1" presumably escapes "$1" from the collector's env-var expansion — confirm.
                  'replace_pattern(attributes["http.url"], "https?://[^/]+(/[^?#]*)", "$$1") where attributes["component"] == "proxy"',
                  // Replace Envoy default span name with a more human-readable one (e.g. "METHOD /path")
                  'set(name, Concat([attributes["http.method"], attributes["http.url"]], " ")) where attributes["component"] == "proxy" and attributes["http.method"] != nil',
                ],
              },
            ],
          },
        },
        receivers: {
          otlp: {
            protocols: {
              grpc: {},
              http: {},
            },
          },
          prometheus: {
            config: {
              global: {
                evaluation_interval: '10s',
                scrape_interval: '30s',
                scrape_timeout: '10s',
              },
              scrape_configs: [
                {
                  job_name: 'service-metrics',
                  scheme: 'http',
                  honor_labels: true,
                  honor_timestamps: true,
                  metrics_path: '/metrics',
                  kubernetes_sd_configs: [
                    {
                      role: 'pod',
                      namespaces: {
                        names: ['default'],
                      },
                    },
                  ],
                  relabel_configs: [
                    // compares the name of the port to == "metrics"
                    {
                      source_labels: ['__meta_kubernetes_pod_container_port_name'],
                      action: 'keep',
                      regex: 'metrics',
                    },
                    // compares "scrape" label to "true"
                    {
                      source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_scrape'],
                      action: 'keep',
                      regex: true,
                    },
                    {
                      source_labels: ['__meta_kubernetes_pod_name'],
                      action: 'replace',
                      target_label: 'instance',
                      regex: '(.*redis.*)',
                    },
                    {
                      source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_scheme'],
                      action: 'replace',
                      target_label: '__scheme__',
                      regex: '(https?)',
                    },
                    {
                      source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_path'],
                      action: 'replace',
                      target_label: '__metrics_path__',
                      regex: '(.+)',
                    },
                    {
                      action: 'labelmap',
                      regex: '__meta_kubernetes_service_label_(.+)',
                    },
                    {
                      action: 'replace',
                      source_labels: ['__meta_kubernetes_namespace'],
                      target_label: 'namespace',
                    },
                    {
                      action: 'replace',
                      source_labels: ['__meta_kubernetes_service_name'],
                      target_label: 'service',
                    },
                    {
                      action: 'replace',
                      source_labels: ['__meta_kubernetes_pod_name'],
                      target_label: 'pod',
                    },
                    {
                      action: 'replace',
                      source_labels: ['__meta_kubernetes_pod_node_name'],
                      target_label: 'kubernetes_node',
                    },
                  ],
                },
              ],
            },
          },
        },
        service: {
          // The basic-auth extension is only enabled when it is defined (remote setup).
          extensions: remote
            ? ['health_check', 'basicauth/grafana_cloud_traces']
            : ['health_check'],
          pipelines: {
            traces: {
              receivers: ['otlp'],
              processors: [
                'resource/trace_cleanup',
                'attributes/trace_filter',
                'transform/patch_envoy_spans',
                'filter/traces',
                'batch',
              ],
              exporters: remote ? ['debug', 'otlp/grafana_cloud_traces'] : ['debug'],
            },
            metrics: {
              exporters: remote ? ['debug', 'prometheusremotewrite'] : ['debug'],
              processors: ['memory_limiter', 'batch'],
              receivers: ['prometheus'],
            },
          },
        },
      },
    };
  }

  /**
   * Builds the Helm values for the Vector chart.
   * https://vector.dev/docs/reference/configuration/
   *
   * @param remote remote backend credentials, or `undefined` for the local setup.
   */
  private vectorValues(remote: Exclude<ObservabilityConfig, 'local'> | undefined): VectorValues {
    // Local setup: print the Pod logs to the console. Remote setup: ship them to Loki.
    const sinks = remote
      ? {
          grafana_lab: {
            type: 'loki',
            inputs: ['kubernetes_logs', 'envoy_error_logs'],
            endpoint: interpolate`https://${remote.loki.endpoint}`,
            auth: {
              strategy: 'basic',
              user: remote.loki.username,
              password: remote.loki.password,
            },
            // Based on https://vector.dev/docs/reference/configuration/sources/kubernetes_logs/#output-types
            // NOTE(review): the {{`...`}} wrapping presumably shields Vector's own template syntax
            // from Helm templating — confirm against the chart.
            labels: {
              namespace: '{{`{{ kubernetes.pod_namespace }}`}}',
              pod_name: '{{`{{ kubernetes.pod_name }}`}}',
              node: '{{`{{ kubernetes.pod_node_name }}`}}',
              container_name: '{{`{{ kubernetes.container_name }}`}}',
            },
            encoding: {
              codec: 'text',
            },
            out_of_order_action: 'accept',
            remove_timestamp: false,
          },
        }
      : {
          stdout: {
            type: 'console',
            inputs: ['kubernetes_logs'],
            encoding: { codec: 'json' },
          },
        };

    return {
      role: 'Agent',
      customConfig: {
        data_dir: '/vector-data-dir',
        api: {
          enabled: true,
          playground: false,
          address: '127.0.0.1:7676',
        },
        sources: {
          kubernetes_logs: {
            type: 'kubernetes_logs',
            extra_field_selector: 'metadata.namespace=default',
          },
          envoy_logs: {
            type: 'kubernetes_logs',
            extra_field_selector: 'metadata.namespace=contour',
          },
        },
        transforms: {
          envoy_json_logs: {
            type: 'remap',
            inputs: ['envoy_logs'],
            // Avoid sending the event to the sink
            drop_on_error: true,
            // Route the dropped events to the debug_dropped sink
            reroute_dropped: true,
            source: '. |= object!(parse_json!(.message))',
          },
          envoy_error_logs: {
            type: 'filter',
            inputs: ['envoy_json_logs'],
            condition: '.response_code != 200 && .response_code != 401',
          },
        },
        sinks: {
          // enable if you need to debug the raw vector messages
          // stdout: {
          //   type: 'console',
          //   inputs: ['kubernetes_logs'],
          //   encoding: { codec: 'json' },
          // },
          // Debug dropped messages (envoy_json_logs)
          debug_dropped: {
            type: 'console',
            inputs: ['envoy_json_logs.dropped'],
            encoding: { codec: 'json' },
          },
          ...sinks,
        },
      },
    };
  }
}