feat: improved attribute sorting with entropy scoring (#1854)

## Summary
Closes #1826

Replaces the basic max-delta sorting for event delta attributes with smarter scoring algorithms:

- **Proportional comparison scoring** (`computeComparisonScore`): Normalizes each group's percentages to sum to 100% before computing max delta, so fields with identical proportional distributions score 0 regardless of coverage rate differences. This makes the sorting resilient to different sample sizes between outlier and inlier groups.
- **Shannon entropy scoring** (`computeEntropyScore`): Returns a score in [0, 1], where 1 means maximally useful (low entropy: one dominant value among several) and 0 means not useful (a single value, no values, or a perfectly uniform distribution). Prepared for future use in distribution mode.
- **Semantic boost** (`semanticBoost`): Provides a tiebreaker boost for well-known OTel attributes (e.g., `service.name`, `http.status_code`, `error`) to surface the most operationally relevant fields.

Currently uses `computeComparisonScore` for sorting. Entropy scoring will be integrated when #1824 (always-on distribution mode) merges.

## Test plan
- [x] 16 unit tests covering all three scoring functions pass
- [ ] Verify attribute order improves in practice: fields with proportionally different distributions should rank higher than fields that differ only due to sample size
- [ ] Confirm no regression in existing event delta chart behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
Alex Fedotyev 2026-03-12 14:46:45 -07:00 committed by GitHub
parent 33edc7e5be
commit 26759f794f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 294 additions and 11 deletions

View file

@ -0,0 +1,5 @@
---
"@hyperdx/app": patch
---
feat: improved attribute sorting with entropy scoring and proportional comparison

View file

@ -22,13 +22,14 @@ import { getFirstTimestampValueExpression } from '@/source';
import { SQLPreview } from './ChartSQLPreview';
import type { AddFilterFn } from './deltaChartUtils';
import {
computeComparisonScore,
flattenedKeyToFilterKey,
getPropertyStatistics,
getStableSampleExpression,
isDenylisted,
isHighCardinality,
mergeValueStatisticsMaps,
SAMPLE_SIZE,
semanticBoost,
} from './deltaChartUtils';
import {
CHART_GAP,
@ -257,7 +258,10 @@ export default function DBDeltaChart({
if (uniqueKeys.size === 0) {
uniqueKeys = new Set([...inlierValueOccurences.keys()]);
}
// Now process the keys to find the ones with the highest delta between outlier and inlier percentages
// Sort by proportional comparison score (normalizes group sizes).
// TODO: When #1824 (always-on distribution) merges, use computeEntropyScore
// for distribution mode (no selection) and computeComparisonScore only when
// a selection is active (hasSelection flag from #1824).
const sortedProperties = Array.from(uniqueKeys)
.map(key => {
const inlierCount =
@ -265,16 +269,14 @@ export default function DBDeltaChart({
const outlierCount =
outlierValueOccurences.get(key) ?? new Map<string, number>();
const mergedArray = mergeValueStatisticsMaps(outlierCount, inlierCount);
let maxValueDelta = 0;
mergedArray.forEach(item => {
const delta = Math.abs(item.outlierCount - item.inlierCount);
if (delta > maxValueDelta) {
maxValueDelta = delta;
}
});
// Use proportional comparison scoring which normalizes group sizes.
// Semantic boost acts as a tiebreaker for well-known OTel attributes
// (only applied when the field has actual variance).
const baseScore = computeComparisonScore(outlierCount, inlierCount);
const boost = baseScore > 0 ? semanticBoost(key) * 0.1 : 0;
const sortScore = baseScore + boost;
return [key, maxValueDelta] as const;
return [key, sortScore] as const;
})
.sort((a, b) => b[1] - a[1])
.map(a => a[0]);

View file

@ -0,0 +1,162 @@
import {
computeComparisonScore,
computeEntropyScore,
semanticBoost,
} from '../deltaChartUtils';
// Jest suite pinning computeComparisonScore: zero for empty or proportionally
// identical groups, a large score for diverging groups, and the special
// handling applied when only one group has data.
describe('computeComparisonScore', () => {
  // Builds a value → count map from an object literal, keeping key order.
  const counts = (byValue: Record<string, number>): Map<string, number> =>
    new Map(Object.entries(byValue));

  it('returns 0 for empty maps', () => {
    const score = computeComparisonScore(
      new Map<string, number>(),
      new Map<string, number>(),
    );
    expect(score).toBe(0);
  });

  it('returns 0 when both groups have identical proportions', () => {
    // Same 80/20 split in both groups; only the raw totals differ.
    const score = computeComparisonScore(
      counts({ GET: 80, POST: 20 }),
      counts({ GET: 40, POST: 10 }),
    );
    expect(score).toBeCloseTo(0);
  });

  it('returns high score for different proportions', () => {
    const score = computeComparisonScore(
      counts({ error: 90, ok: 10 }),
      counts({ error: 10, ok: 90 }),
    );
    expect(score).toBeGreaterThan(70);
  });

  it('returns 0 for single-value field when other group is empty', () => {
    // A lone value with nothing to compare against tells us nothing
    // (e.g., Events.Name[N] = "message" at 100% with no inlier data).
    const score = computeComparisonScore(
      counts({ error: 50 }),
      new Map<string, number>(),
    );
    expect(score).toBe(0);
  });

  it('normalizes multi-value field to [0, 100] when other group is empty', () => {
    // Several values with no comparison group still reveal a distribution,
    // so the dominant value's share is reported.
    const score = computeComparisonScore(
      counts({ error: 80, ok: 20 }),
      new Map<string, number>(),
    );
    expect(score).toBe(80);
  });

  it('normalizes by group sum so different sample sizes produce same score', () => {
    const evenInlier = counts({ GET: 50, POST: 50 });
    // Identical 80/20 proportions at two sample sizes 100x apart.
    const fromSmallSample = computeComparisonScore(
      counts({ GET: 8, POST: 2 }),
      evenInlier,
    );
    const fromLargeSample = computeComparisonScore(
      counts({ GET: 800, POST: 200 }),
      evenInlier,
    );
    expect(fromSmallSample).toBeCloseTo(fromLargeSample, 1);
  });
});
// Jest suite pinning computeEntropyScore: zero for degenerate or uniform
// distributions, higher scores the more a distribution is skewed.
describe('computeEntropyScore', () => {
  // Builds a value → percentage map from an object literal, keeping key order.
  const dist = (byValue: Record<string, number>): Map<string, number> =>
    new Map(Object.entries(byValue));

  it('returns 0 for single-value fields', () => {
    const score = computeEntropyScore(dist({ only: 100 }));
    expect(score).toBe(0);
  });

  it('returns 0 for empty map', () => {
    const score = computeEntropyScore(new Map<string, number>());
    expect(score).toBe(0);
  });

  it('returns ~0 for perfectly uniform distribution', () => {
    const evenSplit = dist({ a: 50, b: 50 });
    expect(computeEntropyScore(evenSplit)).toBeCloseTo(0);
  });

  it('returns high score for skewed distribution', () => {
    const mostlyOk = dist({ ok: 99, error: 1 });
    expect(computeEntropyScore(mostlyOk)).toBeGreaterThan(0.5);
  });

  it('ranks more-skewed fields higher', () => {
    const stronglySkewed = computeEntropyScore(dist({ a: 95, b: 5 }));
    const mildlySkewed = computeEntropyScore(dist({ a: 60, b: 40 }));
    expect(stronglySkewed).toBeGreaterThan(mildlySkewed);
  });

  it('returns ~0 for uniform 3-value field', () => {
    // Thirds cannot be written exactly, hence the 2-digit tolerance.
    const thirds = dist({ a: 33.33, b: 33.33, c: 33.34 });
    expect(computeEntropyScore(thirds)).toBeCloseTo(0, 2);
  });

  it('handles power-law distributions', () => {
    // Each value carries roughly half the weight of the previous one.
    const score = computeEntropyScore(
      dist({ v1: 50, v2: 25, v3: 12, v4: 6, v5: 4, v6: 2, v7: 1 }),
    );
    expect(score).toBeGreaterThan(0.1);
    expect(score).toBeLessThan(1);
  });
});
// Jest suite pinning semanticBoost: 1 for keys ending in a well-known OTel
// attribute name, 0 otherwise, with case ignored.
describe('semanticBoost', () => {
  it('boosts well-known OTel attributes', () => {
    const wellKnownKeys = [
      'ResourceAttributes.service.name',
      'SpanAttributes.http.method',
      'SpanAttributes.http.status_code',
      'SpanAttributes.error',
    ];
    for (const key of wellKnownKeys) {
      expect(semanticBoost(key)).toBe(1);
    }
  });

  it('returns 0 for non-boosted attributes', () => {
    const ordinaryKeys = ['SpanAttributes.custom.field', 'TraceId'];
    for (const key of ordinaryKeys) {
      expect(semanticBoost(key)).toBe(0);
    }
  });

  it('is case-insensitive', () => {
    const mixedCaseKeys = [
      'ResourceAttributes.Service.Name',
      'SpanAttributes.HTTP.METHOD',
    ];
    for (const key of mixedCaseKeys) {
      expect(semanticBoost(key)).toBe(1);
    }
  });
});

View file

@ -400,3 +400,117 @@ export function computeEffectiveSampleSize(totalCount: number): number {
Math.max(MIN_SAMPLE_SIZE, Math.ceil(totalCount * SAMPLE_RATIO)),
);
}
// ---------------------------------------------------------------------------
// Attribute sorting and scoring
// ---------------------------------------------------------------------------
/**
 * Scores how differently a field's values are distributed between the outlier
 * and inlier groups.
 *
 * Each group's numbers are rescaled so that the group totals 100, and the score
 * is the largest absolute gap between the two rescaled shares of any single
 * value. Because of the rescaling, two groups with the same proportions score
 * 0 even when their raw totals differ.
 *
 * When exactly one group has a non-zero total:
 * - a field with at most one distinct value in that group scores 0, since a
 *   lone value with nothing to compare against is uninformative;
 * - otherwise the score is the largest rescaled share inside that group, which
 *   keeps the result on the same scale as the two-group case.
 *
 * @param outlierValues - Value → count (or percentage) for the outlier group.
 * @param inlierValues - Value → count (or percentage) for the inlier group.
 * @returns The comparison score; 0 when both maps are empty or both total 0.
 */
export function computeComparisonScore(
  outlierValues: Map<string, number>,
  inlierValues: Map<string, number>,
): number {
  const distinctValues = new Set([
    ...outlierValues.keys(),
    ...inlierValues.keys(),
  ]);
  if (distinctValues.size === 0) return 0;

  let outlierTotal = 0;
  for (const amount of outlierValues.values()) outlierTotal += amount;
  let inlierTotal = 0;
  for (const amount of inlierValues.values()) inlierTotal += amount;

  if (outlierTotal === 0 && inlierTotal === 0) return 0;

  if (outlierTotal === 0 || inlierTotal === 0) {
    // Exactly one group carries data; pick it.
    const outlierHasData = outlierTotal > 0;
    const present = outlierHasData ? outlierValues : inlierValues;
    if (present.size <= 1) return 0;

    const presentTotal = outlierHasData ? outlierTotal : inlierTotal;
    let largestShare = 0;
    for (const amount of present.values()) {
      const share = (amount / presentTotal) * 100;
      if (share > largestShare) largestShare = share;
    }
    return largestShare;
  }

  // Both groups have data: find the value whose rescaled shares differ most.
  let widestGap = 0;
  for (const value of distinctValues) {
    const outlierShare = ((outlierValues.get(value) ?? 0) / outlierTotal) * 100;
    const inlierShare = ((inlierValues.get(value) ?? 0) / inlierTotal) * 100;
    const gap = Math.abs(outlierShare - inlierShare);
    if (gap > widestGap) widestGap = gap;
  }
  return widestGap;
}
/**
 * Shannon-entropy-based usefulness score for one field's value distribution.
 *
 * The weights are normalized into probabilities, their entropy is divided by
 * the maximum entropy possible for the number of observed values, and the
 * ratio is inverted so that concentrated distributions score high.
 *
 * Only finite, strictly positive weights count as observed values. An entry
 * with a zero weight carries no probability mass, so it must not raise the
 * maximum entropy: otherwise `{a: 100, b: 0}` (effectively a single-value
 * field) would score 1, and `{a: 50, b: 50, c: 0}` (effectively uniform) would
 * score well above 0.
 *
 * @param valuePercentages - Value → weight (percentage or count). Weights are
 *   renormalized, so they do not need to sum to 100.
 * @returns A score clamped to [0, 1]: 0 when fewer than two values are
 *   observed or the observed values are uniformly distributed, approaching 1
 *   as one value dominates among several.
 */
export function computeEntropyScore(
  valuePercentages: Map<string, number>,
): number {
  let observedCount = 0;
  let totalPct = 0;
  for (const pct of valuePercentages.values()) {
    if (Number.isFinite(pct) && pct > 0) {
      observedCount += 1;
      totalPct += pct;
    }
  }
  // Zero or one observed value has no spread to measure.
  if (observedCount <= 1) return 0;

  let entropy = 0;
  for (const pct of valuePercentages.values()) {
    if (!Number.isFinite(pct) || pct <= 0) continue;
    const p = pct / totalPct;
    // p can underflow to 0 for extreme ratios; 0 * log2(0) would be NaN.
    if (p > 0) {
      entropy -= p * Math.log2(p);
    }
  }

  // observedCount >= 2 here, so maxEntropy >= 1 and the division is safe.
  const maxEntropy = Math.log2(observedCount);
  const score = 1 - entropy / maxEntropy;
  // Rounding can push a uniform distribution a hair below 0; keep the
  // documented [0, 1] range.
  return Math.min(1, Math.max(0, score));
}
/**
 * Lowercase names of well-known OTel attributes that receive a score boost.
 * A key is boosted when it equals one of these names or ends with one of them
 * preceded by a dot.
 */
const BOOSTED_ATTRIBUTE_SUFFIXES: readonly string[] = [
  'service.name',
  'http.method',
  'http.request.method',
  'http.status_code',
  'http.response.status_code',
  'error',
  'error.type',
  'deployment.environment',
  'deployment.environment.name',
  'rpc.method',
  'rpc.service',
  'db.system',
  'db.operation',
  'messaging.system',
  'messaging.operation',
];

/**
 * Flags keys that name a well-known OTel attribute.
 *
 * Matching is case-insensitive and anchored on a dot boundary: the key must
 * either be exactly a boosted name or end with `.` followed by one, so
 * 'SpanAttributes.myerror' is not mistaken for the 'error' entry.
 *
 * The result is a raw 0/1 flag; scaling it down into a tiebreaker and deciding
 * when to apply it are left to the caller.
 *
 * @param key - Attribute key, e.g. 'ResourceAttributes.service.name'.
 * @returns 1 when the key matches a boosted attribute, otherwise 0.
 */
export function semanticBoost(key: string): number {
  const normalizedKey = key.toLowerCase();
  const isBoosted = BOOSTED_ATTRIBUTE_SUFFIXES.some(
    suffix =>
      normalizedKey === suffix || normalizedKey.endsWith(`.${suffix}`),
  );
  return isBoosted ? 1 : 0;
}