fix(otel-collector): improve log level extraction with word boundaries in regex (#1747)

For a log line like 
```
x-amz-id-2: WxwS/N175wqLyRlzCXLpGZGszCEbQA0f63uFgdQN1qfcPr2IAmwE/P7HF2b1NdZLg18pNLF3ecTw5CrItXJid/uLe+fxh3jMBiJ7UlUxidw=
```
The level is incorrectly inferred as `fatal` because the body contains the substring `CrIt`, which matches the `crit` keyword case-insensitively.

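To fix this, we add a word boundary (`\b`) at the start of the keyword pattern, so a keyword only matches at the beginning of a word (e.g. `CRITICAL` still matches, but `COuECrITmh` does not).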

Ref: HDX-3439


Note: Claude made a mistake when it stated:
```
 Test expects "ALERTING" to match "alert" keyword → "ALERTING" won't match with word boundary because "alert" is a substring, not at a word boundary. Expected should be "info",9,"ALERTING system engaged" not "fatal",21.
``` 
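-> This statement is incorrect: the word boundary is only added at the start of the keyword, so "ALERTING" still matches `alert` and is correctly inferred as "fatal" (21).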
This commit is contained in:
Warren Lee 2026-02-18 23:16:07 +01:00 committed by GitHub
parent 7679b80f13
commit 4c42fdc3a4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 162 additions and 1 deletions

View file

@ -0,0 +1,5 @@
---
"@hyperdx/otel-collector": minor
---
fix: improve log level extraction with word boundaries in regex

View file

@ -19,7 +19,7 @@ processors:
# Infer: extract the first log level keyword from the first 256 characters of the body
- set(log.cache["substr"], log.body.string) where Len(log.body.string) < 256
- set(log.cache["substr"], Substring(log.body.string, 0, 256)) where Len(log.body.string) >= 256
- set(log.cache, ExtractPatterns(log.cache["substr"], "(?i)(?P<0>(alert|crit|emerg|fatal|error|err|warn|notice|debug|dbug|trace))"))
- set(log.cache, ExtractPatterns(log.cache["substr"], "(?i)(?P<0>\\b(alert|crit|emerg|fatal|error|err|warn|notice|debug|dbug|trace))"))
# Infer: detect FATAL
- set(log.severity_number, SEVERITY_NUMBER_FATAL) where IsMatch(log.cache["0"], "(?i)(alert|crit|emerg|fatal)")
- set(log.severity_text, "fatal") where log.severity_number == SEVERITY_NUMBER_FATAL

View file

@ -0,0 +1 @@
/*
 * Expected-output query for the severity-inference / infer-superstring fixture.
 * Returns the severity text, severity number and body of every log row tagged
 * with that suite-id / test-id pair, formatted as CSV.
 *
 * The fixture records differ only by nanoseconds inside the same second, so
 * Timestamp is added as a tie-breaker after TimestampTime to keep the row
 * order (and therefore the CSV comparison) deterministic.
 * NOTE(review): assumes otel_logs exposes the full-precision Timestamp column
 * alongside TimestampTime - confirm against the table schema.
 */
SELECT
    SeverityText,
    SeverityNumber,
    Body
FROM otel_logs
WHERE ResourceAttributes['suite-id'] = 'severity-inference'
    AND ResourceAttributes['test-id'] = 'infer-superstring'
ORDER BY
    TimestampTime,
    Timestamp
FORMAT CSV

View file

@ -0,0 +1,8 @@
"warn",13,"WARNING: disk space running low"
"fatal",21,"CRITICAL: database connection pool exhausted"
"fatal",21,"EMERGENCY: system failure imminent"
"fatal",21,"ALERTING system engaged"
"error",17,"ERRORS detected in application"
"warn",13,"NOTICED unusual activity in request handler"
"debug",5,"DEBUGGING enabled for module"
"trace",1,"TRACED request path through gateway"

View file

@ -0,0 +1,77 @@
{
"resourceLogs": [
{
"resource": {
"attributes": [
{
"key": "suite-id",
"value": {
"stringValue": "severity-inference"
}
},
{
"key": "test-id",
"value": {
"stringValue": "infer-superstring"
}
}
]
},
"scopeLogs": [
{
"scope": {},
"logRecords": [
{
"timeUnixNano": "1901999580000000000",
"body": {
"stringValue": "WARNING: disk space running low"
}
},
{
"timeUnixNano": "1901999580000000001",
"body": {
"stringValue": "CRITICAL: database connection pool exhausted"
}
},
{
"timeUnixNano": "1901999580000000002",
"body": {
"stringValue": "EMERGENCY: system failure imminent"
}
},
{
"timeUnixNano": "1901999580000000003",
"body": {
"stringValue": "ALERTING system engaged"
}
},
{
"timeUnixNano": "1901999580000000004",
"body": {
"stringValue": "ERRORS detected in application"
}
},
{
"timeUnixNano": "1901999580000000005",
"body": {
"stringValue": "NOTICED unusual activity in request handler"
}
},
{
"timeUnixNano": "1901999580000000006",
"body": {
"stringValue": "DEBUGGING enabled for module"
}
},
{
"timeUnixNano": "1901999580000000007",
"body": {
"stringValue": "TRACED request path through gateway"
}
}
]
}
]
}
]
}

View file

@ -0,0 +1 @@
/*
 * Expected-output query for the severity-inference / no-infer-substring fixture.
 * Returns the severity text, severity number and body of every log row tagged
 * with that suite-id / test-id pair, formatted as CSV.
 *
 * The fixture records differ only by nanoseconds inside the same second, so
 * Timestamp is added as a tie-breaker after TimestampTime to keep the row
 * order (and therefore the CSV comparison) deterministic.
 * NOTE(review): assumes otel_logs exposes the full-precision Timestamp column
 * alongside TimestampTime - confirm against the table schema.
 */
SELECT
    SeverityText,
    SeverityNumber,
    Body
FROM otel_logs
WHERE ResourceAttributes['suite-id'] = 'severity-inference'
    AND ResourceAttributes['test-id'] = 'no-infer-substring'
ORDER BY
    TimestampTime,
    Timestamp
FORMAT CSV

View file

@ -0,0 +1,4 @@
"info",9,"x-amz-id-2 : abc123/COuECrITmh"
"info",9,"txn_id=ab3cdErrF8x processing complete"
"info",9,"Forewarn systems check passed"
"info",9,"Request backtraced to origin"

View file

@ -0,0 +1,53 @@
{
"resourceLogs": [
{
"resource": {
"attributes": [
{
"key": "suite-id",
"value": {
"stringValue": "severity-inference"
}
},
{
"key": "test-id",
"value": {
"stringValue": "no-infer-substring"
}
}
]
},
"scopeLogs": [
{
"scope": {},
"logRecords": [
{
"timeUnixNano": "1901999580000000000",
"body": {
"stringValue": "x-amz-id-2 : abc123/COuECrITmh"
}
},
{
"timeUnixNano": "1901999580000000001",
"body": {
"stringValue": "txn_id=ab3cdErrF8x processing complete"
}
},
{
"timeUnixNano": "1901999580000000002",
"body": {
"stringValue": "Forewarn systems check passed"
}
},
{
"timeUnixNano": "1901999580000000003",
"body": {
"stringValue": "Request backtraced to origin"
}
}
]
}
]
}
]
}

View file

@ -44,3 +44,15 @@ load 'test_helpers/assertions.bash'
sleep 1
assert_test_data "data/severity-inference/skip-infer"
}
# Negative case: bodies that contain a level keyword only in the middle of a
# word (e.g. "COuECrITmh", "Forewarn") are expected to stay at "info".
@test "should not infer severity from keywords embedded mid-word" {
    local otlp_endpoint="http://localhost:4318"
    local fixture_dir="data/severity-inference/no-infer-substring"

    emit_otel_data "$otlp_endpoint" "$fixture_dir"
    # Fixed pause before asserting; presumably lets the emitted logs become
    # queryable - confirm against the helper implementations.
    sleep 1
    assert_test_data "$fixture_dir"
}
# Positive case: bodies whose first word merely starts with a level keyword
# (e.g. "WARNING", "CRITICAL", "ALERTING") are expected to get that level.
@test "should infer severity from superstring keywords like WARNING and CRITICAL" {
    local otlp_endpoint="http://localhost:4318"
    local fixture_dir="data/severity-inference/infer-superstring"

    emit_otel_data "$otlp_endpoint" "$fixture_dir"
    # Fixed pause before asserting; presumably lets the emitted logs become
    # queryable - confirm against the helper implementations.
    sleep 1
    assert_test_data "$fixture_dir"
}