diff --git a/.changeset/config.json b/.changeset/config.json
index cfef3e6f..3c2da191 100644
--- a/.changeset/config.json
+++ b/.changeset/config.json
@@ -2,7 +2,7 @@
   "$schema": "https://unpkg.com/@changesets/config@2.3.1/schema.json",
   "changelog": "@changesets/cli/changelog",
   "commit": false,
-  "fixed": [["@hyperdx/api", "@hyperdx/app"]],
+  "fixed": [["@hyperdx/api", "@hyperdx/app", "@hyperdx/otel-collector"]],
   "linked": [],
   "access": "restricted",
   "baseBranch": "main",
diff --git a/.changeset/smooth-bananas-hammer.md b/.changeset/smooth-bananas-hammer.md
new file mode 100644
index 00000000..bea9279f
--- /dev/null
+++ b/.changeset/smooth-bananas-hammer.md
@@ -0,0 +1,5 @@
+---
+"@hyperdx/otel-collector": patch
+---
+
+feat: allow otel-collector to run without OpAMP server
diff --git a/docker/otel-collector/Dockerfile b/docker/otel-collector/Dockerfile
index dbe3b316..5033a762 100644
--- a/docker/otel-collector/Dockerfile
+++ b/docker/otel-collector/Dockerfile
@@ -13,7 +13,8 @@ ARG USER_GID=10001
 RUN apk add --no-cache ca-certificates && \
     addgroup -S -g ${USER_GID} otel && \
     adduser -S -u ${USER_UID} -G otel otel && \
-    install -d -m 0777 -o ${USER_UID} -g ${USER_GID} /etc/otel/supervisor-data
+    install -d -m 0777 -o ${USER_UID} -g ${USER_GID} /etc/otel/supervisor-data && \
+    install -d -m 0755 -o ${USER_UID} -g ${USER_GID} /etc/otelcol-contrib
 
 # Copy gomplate binary from the gomplate image
 COPY --from=gomplate /bin/gomplate /usr/local/bin/gomplate
@@ -30,8 +31,9 @@ COPY --chmod=755 ./log-tailer.sh /log-tailer.sh
 ## dev ##############################################################################################
 FROM base AS dev
 
-COPY ./config.yaml /etc/otelcol-contrib/config.yaml
-COPY ./supervisor_docker.yaml.tmpl /etc/otel/supervisor.yaml.tmpl
+COPY --chown=10001:10001 ./config.yaml /etc/otelcol-contrib/config.yaml
+COPY --chown=10001:10001 ./config.standalone.yaml /etc/otelcol-contrib/standalone-config.yaml
+COPY --chown=10001:10001 ./supervisor_docker.yaml.tmpl /etc/otel/supervisor.yaml.tmpl
 
 EXPOSE 4317 4318 13133
 
@@ -40,8 +42,9 @@ ENTRYPOINT ["/entrypoint.sh", "/opampsupervisor"]
 ## prod #############################################################################################
 FROM base AS prod
 
-COPY ./config.yaml /etc/otelcol-contrib/config.yaml
-COPY ./supervisor_docker.yaml.tmpl /etc/otel/supervisor.yaml.tmpl
+COPY --chown=10001:10001 ./config.yaml /etc/otelcol-contrib/config.yaml
+COPY --chown=10001:10001 ./config.standalone.yaml /etc/otelcol-contrib/standalone-config.yaml
+COPY --chown=10001:10001 ./supervisor_docker.yaml.tmpl /etc/otel/supervisor.yaml.tmpl
 
 EXPOSE 4317 4318 13133
 
diff --git a/docker/otel-collector/config.deprecated.yaml b/docker/otel-collector/config.deprecated.yaml
deleted file mode 100644
index afe54dc4..00000000
--- a/docker/otel-collector/config.deprecated.yaml
+++ /dev/null
@@ -1,174 +0,0 @@
-receivers:
-  # Troubleshooting
-  prometheus:
-    config:
-      scrape_configs:
-        - job_name: 'otelcol'
-          scrape_interval: 30s
-          static_configs:
-            - targets:
-                - '0.0.0.0:8888'
-                - ${env:CLICKHOUSE_PROMETHEUS_METRICS_ENDPOINT}
-  # Data sources: logs
-  fluentforward:
-    endpoint: '0.0.0.0:24225'
-  # Configured via OpAMP w/ authentication
-  # Data sources: traces, metrics, logs
-  # otlp/hyperdx:
-  #   protocols:
-  #     grpc:
-  #       include_metadata: true
-  #       endpoint: '0.0.0.0:4317'
-  #     http:
-  #       cors:
-  #         allowed_origins: ['*']
-  #         allowed_headers: ['*']
-  #       include_metadata: true
-  #       endpoint: '0.0.0.0:4318'
-processors:
-  transform:
-    log_statements:
-      - context: log
-        error_mode: ignore
-        statements:
-          # JSON parsing: Extends log attributes with the fields from structured log body content, either as an OTEL map or
-          # as a string containing JSON content.
-          - set(log.cache, ExtractPatterns(log.body, "(?P<0>(\\{.*\\}))")) where
-            IsString(log.body)
-          - merge_maps(log.attributes, ParseJSON(log.cache["0"]), "upsert")
-            where IsMap(log.cache)
-          - flatten(log.attributes) where IsMap(log.cache)
-          - merge_maps(log.attributes, log.body, "upsert") where IsMap(log.body)
-      - context: log
-        error_mode: ignore
-        conditions:
-          - severity_number == 0 and severity_text == ""
-        statements:
-          # Infer: extract the first log level keyword from the first 256 characters of the body
-          - set(log.cache["substr"], log.body.string) where Len(log.body.string)
-            < 256
-          - set(log.cache["substr"], Substring(log.body.string, 0, 256)) where
-            Len(log.body.string) >= 256
-          - set(log.cache, ExtractPatterns(log.cache["substr"],
-            "(?i)(?P<0>(alert|crit|emerg|fatal|error|err|warn|notice|debug|dbug|trace))"))
-          # Infer: detect FATAL
-          - set(log.severity_number, SEVERITY_NUMBER_FATAL) where
-            IsMatch(log.cache["0"], "(?i)(alert|crit|emerg|fatal)")
-          - set(log.severity_text, "fatal") where log.severity_number ==
-            SEVERITY_NUMBER_FATAL
-          # Infer: detect ERROR
-          - set(log.severity_number, SEVERITY_NUMBER_ERROR) where
-            IsMatch(log.cache["0"], "(?i)(error|err)")
-          - set(log.severity_text, "error") where log.severity_number ==
-            SEVERITY_NUMBER_ERROR
-          # Infer: detect WARN
-          - set(log.severity_number, SEVERITY_NUMBER_WARN) where
-            IsMatch(log.cache["0"], "(?i)(warn|notice)")
-          - set(log.severity_text, "warn") where log.severity_number ==
-            SEVERITY_NUMBER_WARN
-          # Infer: detect DEBUG
-          - set(log.severity_number, SEVERITY_NUMBER_DEBUG) where
-            IsMatch(log.cache["0"], "(?i)(debug|dbug)")
-          - set(log.severity_text, "debug") where log.severity_number ==
-            SEVERITY_NUMBER_DEBUG
-          # Infer: detect TRACE
-          - set(log.severity_number, SEVERITY_NUMBER_TRACE) where
-            IsMatch(log.cache["0"], "(?i)(trace)")
-          - set(log.severity_text, "trace") where log.severity_number ==
-            SEVERITY_NUMBER_TRACE
-          # Infer: else
-          - set(log.severity_text, "info") where log.severity_number == 0
-          - set(log.severity_number, SEVERITY_NUMBER_INFO) where
-            log.severity_number == 0
-      - context: log
-        error_mode: ignore
-        statements:
-          # Normalize the severity_text case
-          - set(log.severity_text, ConvertCase(log.severity_text, "lower"))
-  resourcedetection:
-    detectors:
-      - env
-      - system
-      - docker
-    timeout: 5s
-    override: false
-  batch:
-  memory_limiter:
-    # 80% of maximum memory up to 2G
-    limit_mib: 1500
-    # 25% of limit up to 2G
-    spike_limit_mib: 512
-    check_interval: 5s
-connectors:
-  routing/logs:
-    default_pipelines: [logs/out-default]
-    error_mode: ignore
-    table:
-      - context: log
-        statement: route() where IsMatch(attributes["rr-web.event"], ".*")
-        pipelines: [logs/out-rrweb]
-exporters:
-  debug:
-    verbosity: detailed
-    sampling_initial: 5
-    sampling_thereafter: 200
-  clickhouse/rrweb:
-    database: ${env:HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE}
-    endpoint: ${env:CLICKHOUSE_ENDPOINT}
-    password: ${env:CLICKHOUSE_PASSWORD}
-    username: ${env:CLICKHOUSE_USER}
-    ttl: 720h
-    logs_table_name: hyperdx_sessions
-    timeout: 5s
-    retry_on_failure:
-      enabled: true
-      initial_interval: 5s
-      max_interval: 30s
-      max_elapsed_time: 300s
-  clickhouse:
-    database: ${env:HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE}
-    endpoint: ${env:CLICKHOUSE_ENDPOINT}
-    password: ${env:CLICKHOUSE_PASSWORD}
-    username: ${env:CLICKHOUSE_USER}
-    ttl: 720h
-    timeout: 5s
-    retry_on_failure:
-      enabled: true
-      initial_interval: 5s
-      max_interval: 30s
-      max_elapsed_time: 300s
-extensions:
-  health_check:
-    endpoint: :13133
-service:
-  telemetry:
-    metrics:
-      readers:
-        - pull:
-            exporter:
-              prometheus:
-                host: '0.0.0.0'
-                port: 8888
-    logs:
-      level: ${HYPERDX_LOG_LEVEL}
-  extensions: [health_check]
-  pipelines:
-    traces:
-      # receivers: [otlp/hyperdx]
-      processors: [memory_limiter, batch]
-      exporters: [clickhouse]
-    metrics:
-      # receivers: [otlp/hyperdx, prometheus]
-      processors: [memory_limiter, batch]
-      exporters: [clickhouse]
-    logs/in:
-      # receivers: [otlp/hyperdx, fluentforward]
-      exporters: [routing/logs]
-    logs/out-default:
-      receivers: [routing/logs]
-      processors: [memory_limiter, transform, batch]
-      exporters: [clickhouse]
-    logs/out-rrweb:
-      receivers: [routing/logs]
-      processors: [memory_limiter, batch]
-      exporters: [clickhouse/rrweb]
diff --git a/docker/otel-collector/config.standalone.yaml b/docker/otel-collector/config.standalone.yaml
new file mode 100644
index 00000000..bf374623
--- /dev/null
+++ b/docker/otel-collector/config.standalone.yaml
@@ -0,0 +1,73 @@
+# This configuration is derived from packages/api/src/opamp/controllers/opampController.ts
+# When updating this file, ensure it stays in sync with buildOtelCollectorConfig()
+
+receivers:
+  otlp/hyperdx:
+    protocols:
+      grpc:
+        include_metadata: true
+        endpoint: "0.0.0.0:4317"
+      http:
+        cors:
+          allowed_origins: ["*"]
+          allowed_headers: ["*"]
+        include_metadata: true
+        endpoint: "0.0.0.0:4318"
+
+connectors:
+  routing/logs:
+    default_pipelines: [logs/out-default]
+    error_mode: ignore
+    table:
+      - context: log
+        statement: route() where IsMatch(attributes["rr-web.event"], ".*")
+        pipelines: [logs/out-rrweb]
+
+exporters:
+  clickhouse/rrweb:
+    database: ${env:HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE}
+    endpoint: ${env:CLICKHOUSE_ENDPOINT}
+    password: ${env:CLICKHOUSE_PASSWORD}
+    username: ${env:CLICKHOUSE_USER}
+    ttl: 720h
+    logs_table_name: hyperdx_sessions
+    timeout: 5s
+    retry_on_failure:
+      enabled: true
+      initial_interval: 5s
+      max_interval: 30s
+      max_elapsed_time: 300s
+  clickhouse:
+    database: ${env:HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE}
+    endpoint: ${env:CLICKHOUSE_ENDPOINT}
+    password: ${env:CLICKHOUSE_PASSWORD}
+    username: ${env:CLICKHOUSE_USER}
+    ttl: 720h
+    timeout: 5s
+    retry_on_failure:
+      enabled: true
+      initial_interval: 5s
+      max_interval: 30s
+      max_elapsed_time: 300s
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp/hyperdx]
+      processors: [memory_limiter, batch]
+      exporters: [clickhouse]
+    metrics:
+      receivers: [otlp/hyperdx]
+      processors: [memory_limiter, batch]
+      exporters: [clickhouse]
+    logs/in:
+      receivers: [otlp/hyperdx]
+      exporters: [routing/logs]
+    logs/out-default:
+      receivers: [routing/logs]
+      processors: [memory_limiter, transform, batch]
+      exporters: [clickhouse]
+    logs/out-rrweb:
+      receivers: [routing/logs]
+      processors: [memory_limiter, batch]
+      exporters: [clickhouse/rrweb]
diff --git a/docker/otel-collector/entrypoint.sh b/docker/otel-collector/entrypoint.sh
index 2c425851..c327efe6 100644
--- a/docker/otel-collector/entrypoint.sh
+++ b/docker/otel-collector/entrypoint.sh
@@ -1,34 +1,56 @@
 #!/bin/sh
 set -e
 
-if [ "$OTEL_SUPERVISOR_LOGS" = "true" ]; then
-  # Start log tailer process in background for agent.log
-  # Arguments: log_file_path [check_interval_seconds]
-  /log-tailer.sh /etc/otel/supervisor-data/agent.log 1 &
+# Check if OPAMP_SERVER_URL is defined to determine mode
+if [ -z "$OPAMP_SERVER_URL" ]; then
+  # Standalone mode - run collector directly without supervisor
+  echo "Running in standalone mode (OPAMP_SERVER_URL not set)"
 
-  # Create a agent log file for the supervisor and collector child process. Normally
-  # this file would be created as a standard file but we just want a FIFO pipe that
-  # will pass data over to the tail process in the entrypoint script. This avoids
-  # the need to the supervisor to store and forward the logs in its memory while also
-  # eliminating the need for volume based storage.
-  if [ ! -e /etc/otel/supervisor-data/agent.log ]; then
-    mkfifo /etc/otel/supervisor-data/agent.log || echo "Failed to create FIFO" >&2
+  # Build the collector arguments as positional parameters rather than a flat
+  # string, so a config path containing spaces or glob characters stays one word
+  set -- --config /etc/otelcol-contrib/config.yaml --config /etc/otelcol-contrib/standalone-config.yaml
+
+  # Add custom config file if specified
+  if [ -n "$CUSTOM_OTELCOL_CONFIG_FILE" ]; then
+    echo "Including custom config: $CUSTOM_OTELCOL_CONFIG_FILE"
+    set -- "$@" --config "$CUSTOM_OTELCOL_CONFIG_FILE"
   fi
-fi
 
-# Render the supervisor config template using gomplate
-# Write to supervisor-data directory which has proper permissions for otel user
-gomplate -f /etc/otel/supervisor.yaml.tmpl -o /etc/otel/supervisor-data/supervisor-runtime.yaml
-
-# Log the configuration being used
-if [ -n "$CUSTOM_OTELCOL_CONFIG_FILE" ]; then
-  echo "Using custom OTEL config file: $CUSTOM_OTELCOL_CONFIG_FILE"
+  # Execute collector directly
+  exec /otelcontribcol "$@"
 else
-  echo "CUSTOM_OTELCOL_CONFIG_FILE not set, using default configuration"
+  # Supervisor mode - run with OpAMP supervisor
+  echo "Running in supervisor mode (OPAMP_SERVER_URL: $OPAMP_SERVER_URL)"
+
+  if [ "$OTEL_SUPERVISOR_LOGS" = "true" ]; then
+    # Start log tailer process in background for agent.log
+    # Arguments: log_file_path [check_interval_seconds]
+    /log-tailer.sh /etc/otel/supervisor-data/agent.log 1 &
+
+    # Create an agent log file for the supervisor and collector child process. Normally
+    # this file would be created as a standard file but we just want a FIFO pipe that
+    # will pass data over to the tail process in the entrypoint script. This avoids
+    # the need for the supervisor to store and forward the logs in its memory while also
+    # eliminating the need for volume based storage.
+    if [ ! -e /etc/otel/supervisor-data/agent.log ]; then
+      mkfifo /etc/otel/supervisor-data/agent.log || echo "Failed to create FIFO" >&2
+    fi
+  fi
+
+  # Render the supervisor config template using gomplate
+  # Write to supervisor-data directory which has proper permissions for otel user
+  gomplate -f /etc/otel/supervisor.yaml.tmpl -o /etc/otel/supervisor-data/supervisor-runtime.yaml
+
+  # Log the configuration being used
+  if [ -n "$CUSTOM_OTELCOL_CONFIG_FILE" ]; then
+    echo "Using custom OTEL config file: $CUSTOM_OTELCOL_CONFIG_FILE"
+  else
+    echo "CUSTOM_OTELCOL_CONFIG_FILE not set, using default configuration"
+  fi
+
+  # Update the command arguments to use the rendered config file
+  set -- "$1" --config /etc/otel/supervisor-data/supervisor-runtime.yaml
+
+  # Execute the supervisor with all passed arguments
+  exec "$@"
 fi
-
-# Update the command arguments to use the rendered config file
-set -- "$1" --config /etc/otel/supervisor-data/supervisor-runtime.yaml
-
-# Execute the supervisor with all passed arguments
-exec "$@"
diff --git a/packages/otel-collector/package.json b/packages/otel-collector/package.json
new file mode 100644
index 00000000..77099131
--- /dev/null
+++ b/packages/otel-collector/package.json
@@ -0,0 +1,7 @@
+{
+  "name": "@hyperdx/otel-collector",
+  "description": "HyperDX OpenTelemetry Collector configuration and Docker image",
+  "version": "2.13.0",
+  "license": "MIT",
+  "private": true
+}
diff --git a/smoke-tests/otel-collector/docker-compose.yaml b/smoke-tests/otel-collector/docker-compose.yaml
index 1c3d6008..16171bfd 100644
--- a/smoke-tests/otel-collector/docker-compose.yaml
+++ b/smoke-tests/otel-collector/docker-compose.yaml
@@ -19,20 +19,17 @@ services:
       retries: 5
       start_period: 10s
   otel-collector:
-    image: otel/opentelemetry-collector-contrib:0.129.1
-    volumes:
-      - ../../docker/otel-collector/config.deprecated.yaml:/etc/otelcol-contrib/config.yaml
-      - ./receiver-config.yaml:/etc/otelcol-contrib/receiver-config.yaml
-    command:
-      [
-        '--config=/etc/otelcol-contrib/receiver-config.yaml',
-        '--config=/etc/otelcol-contrib/config.yaml',
-      ]
+    build:
+      context: ../../docker/otel-collector
+      target: dev
     environment:
       - CLICKHOUSE_ENDPOINT=tcp://ch-server:9000?dial_timeout=10s
       - CLICKHOUSE_PROMETHEUS_METRICS_ENDPOINT=ch-server:9363
+      - CLICKHOUSE_USER=default
+      - CLICKHOUSE_PASSWORD=
       - HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE=default
       - HYPERDX_LOG_LEVEL=info
+      # OPAMP_SERVER_URL is intentionally not set to run in standalone mode
     ports:
       - 4318:4318 # OTLP http receiver
       - 13133:13133 # health check
diff --git a/yarn.lock b/yarn.lock
index 2d0951c5..d3a57995 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -4581,6 +4581,12 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@hyperdx/otel-collector@workspace:packages/otel-collector":
+  version: 0.0.0-use.local
+  resolution: "@hyperdx/otel-collector@workspace:packages/otel-collector"
+  languageName: unknown
+  linkType: soft
+
 "@hyperdx/otel-web-session-recorder@npm:0.16.2":
   version: 0.16.2
   resolution: "@hyperdx/otel-web-session-recorder@npm:0.16.2"