mirror of
https://github.com/hyperdxio/hyperdx
synced 2026-04-21 13:37:15 +00:00
feat: improved attribute sorting with entropy scoring (#1854)
## Summary

Closes #1826

Replaces the basic max-delta sorting for event delta attributes with smarter scoring algorithms:

- **Proportional comparison scoring** (`computeComparisonScore`): Normalizes each group's percentages to sum to 100% before computing the max delta, so fields with identical proportional distributions score 0 regardless of coverage rate differences. This makes the sorting resilient to different sample sizes between the outlier and inlier groups.
- **Shannon entropy scoring** (`computeEntropyScore`): Returns a value in [0, 1], where 1 = maximally useful (low entropy, a dominant value among several) and 0 = not useful (single value, empty, or perfectly uniform). Prepared for future use in distribution mode.
- **Semantic boost** (`semanticBoost`): Provides a tiebreaker boost for well-known OTel attributes (e.g., `service.name`, `http.status_code`, `error`) to surface the most operationally relevant fields.

Currently uses `computeComparisonScore` for sorting. Entropy scoring will be integrated when #1824 (always-on distribution mode) merges.

## Test plan

- [x] 15 unit tests covering all three scoring functions pass
- [ ] Verify attribute order improves in practice: fields with proportionally different distributions should rank higher than fields that differ only due to sample size
- [ ] Confirm no regression in existing event delta chart behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
parent
33edc7e5be
commit
26759f794f
4 changed files with 294 additions and 11 deletions
5
.changeset/attribute-sorting.md
Normal file
5
.changeset/attribute-sorting.md
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
"@hyperdx/app": patch
|
||||
---
|
||||
|
||||
feat: improved attribute sorting with entropy scoring and proportional comparison
|
||||
|
|
@ -22,13 +22,14 @@ import { getFirstTimestampValueExpression } from '@/source';
|
|||
import { SQLPreview } from './ChartSQLPreview';
|
||||
import type { AddFilterFn } from './deltaChartUtils';
|
||||
import {
|
||||
computeComparisonScore,
|
||||
flattenedKeyToFilterKey,
|
||||
getPropertyStatistics,
|
||||
getStableSampleExpression,
|
||||
isDenylisted,
|
||||
isHighCardinality,
|
||||
mergeValueStatisticsMaps,
|
||||
SAMPLE_SIZE,
|
||||
semanticBoost,
|
||||
} from './deltaChartUtils';
|
||||
import {
|
||||
CHART_GAP,
|
||||
|
|
@ -257,7 +258,10 @@ export default function DBDeltaChart({
|
|||
if (uniqueKeys.size === 0) {
|
||||
uniqueKeys = new Set([...inlierValueOccurences.keys()]);
|
||||
}
|
||||
// Now process the keys to find the ones with the highest delta between outlier and inlier percentages
|
||||
// Sort by proportional comparison score (normalizes group sizes).
|
||||
// TODO: When #1824 (always-on distribution) merges, use computeEntropyScore
|
||||
// for distribution mode (no selection) and computeComparisonScore only when
|
||||
// a selection is active (hasSelection flag from #1824).
|
||||
const sortedProperties = Array.from(uniqueKeys)
|
||||
.map(key => {
|
||||
const inlierCount =
|
||||
|
|
@ -265,16 +269,14 @@ export default function DBDeltaChart({
|
|||
const outlierCount =
|
||||
outlierValueOccurences.get(key) ?? new Map<string, number>();
|
||||
|
||||
const mergedArray = mergeValueStatisticsMaps(outlierCount, inlierCount);
|
||||
let maxValueDelta = 0;
|
||||
mergedArray.forEach(item => {
|
||||
const delta = Math.abs(item.outlierCount - item.inlierCount);
|
||||
if (delta > maxValueDelta) {
|
||||
maxValueDelta = delta;
|
||||
}
|
||||
});
|
||||
// Use proportional comparison scoring which normalizes group sizes.
|
||||
// Semantic boost acts as a tiebreaker for well-known OTel attributes
|
||||
// (only applied when the field has actual variance).
|
||||
const baseScore = computeComparisonScore(outlierCount, inlierCount);
|
||||
const boost = baseScore > 0 ? semanticBoost(key) * 0.1 : 0;
|
||||
const sortScore = baseScore + boost;
|
||||
|
||||
return [key, maxValueDelta] as const;
|
||||
return [key, sortScore] as const;
|
||||
})
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.map(a => a[0]);
|
||||
|
|
|
|||
162
packages/app/src/components/__tests__/deltaChartScoring.test.ts
Normal file
162
packages/app/src/components/__tests__/deltaChartScoring.test.ts
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
import {
|
||||
computeComparisonScore,
|
||||
computeEntropyScore,
|
||||
semanticBoost,
|
||||
} from '../deltaChartUtils';
|
||||
|
||||
// Unit tests for computeComparisonScore: empty input, proportional equality,
// one-sided data, and independence from absolute sample size.
describe('computeComparisonScore', () => {
  // Builds a value -> count map from an object literal (insertion order kept).
  const counts = (entries: Record<string, number>) =>
    new Map(Object.entries(entries));

  it('returns 0 for empty maps', () => {
    const score = computeComparisonScore(new Map(), new Map());
    expect(score).toBe(0);
  });

  it('returns 0 when both groups have identical proportions', () => {
    // Both groups split 80/20; only the absolute counts differ.
    const outlier = counts({ GET: 80, POST: 20 });
    const inlier = counts({ GET: 40, POST: 10 });
    const score = computeComparisonScore(outlier, inlier);
    expect(score).toBeCloseTo(0);
  });

  it('returns high score for different proportions', () => {
    const outlier = counts({ error: 90, ok: 10 });
    const inlier = counts({ error: 10, ok: 90 });
    const score = computeComparisonScore(outlier, inlier);
    expect(score).toBeGreaterThan(70);
  });

  it('returns 0 for single-value field when other group is empty', () => {
    // A lone value with nothing to compare against carries no signal
    // (e.g., Events.Name[N] = "message" at 100% with no inlier data).
    const outlier = counts({ error: 50 });
    const score = computeComparisonScore(outlier, new Map());
    expect(score).toBe(0);
  });

  it('normalizes multi-value field to [0, 100] when other group is empty', () => {
    // Several values with no comparison group still describe a distribution.
    const outlier = counts({ error: 80, ok: 20 });
    const score = computeComparisonScore(outlier, new Map());
    expect(score).toBe(80);
  });

  it('normalizes by group sum so different sample sizes produce same score', () => {
    const inlier = counts({ GET: 50, POST: 50 });
    const outlierSmall = counts({ GET: 8, POST: 2 });
    const outlierLarge = counts({ GET: 800, POST: 200 });

    const scoreSmall = computeComparisonScore(outlierSmall, inlier);
    const scoreLarge = computeComparisonScore(outlierLarge, inlier);

    expect(scoreSmall).toBeCloseTo(scoreLarge, 1);
  });
});
|
||||
|
||||
// Unit tests for computeEntropyScore: degenerate inputs score 0, uniform
// distributions score ~0, and more skewed distributions score higher.
describe('computeEntropyScore', () => {
  // Builds a value -> percentage map from an object literal.
  const dist = (weights: Record<string, number>) =>
    new Map(Object.entries(weights));

  it('returns 0 for single-value fields', () => {
    const score = computeEntropyScore(dist({ only: 100 }));
    expect(score).toBe(0);
  });

  it('returns 0 for empty map', () => {
    const score = computeEntropyScore(new Map());
    expect(score).toBe(0);
  });

  it('returns ~0 for perfectly uniform distribution', () => {
    const score = computeEntropyScore(dist({ a: 50, b: 50 }));
    expect(score).toBeCloseTo(0);
  });

  it('returns high score for skewed distribution', () => {
    const score = computeEntropyScore(dist({ ok: 99, error: 1 }));
    expect(score).toBeGreaterThan(0.5);
  });

  it('ranks more-skewed fields higher', () => {
    const scoreA = computeEntropyScore(dist({ a: 95, b: 5 }));
    const scoreB = computeEntropyScore(dist({ a: 60, b: 40 }));
    expect(scoreA).toBeGreaterThan(scoreB);
  });

  it('returns ~0 for uniform 3-value field', () => {
    const score = computeEntropyScore(dist({ a: 33.33, b: 33.33, c: 33.34 }));
    expect(score).toBeCloseTo(0, 2);
  });

  it('handles power-law distributions', () => {
    // Each value carries roughly half the weight of the previous one.
    const powerLaw = dist({
      v1: 50,
      v2: 25,
      v3: 12,
      v4: 6,
      v5: 4,
      v6: 2,
      v7: 1,
    });
    const score = computeEntropyScore(powerLaw);
    expect(score).toBeGreaterThan(0.1);
    expect(score).toBeLessThan(1);
  });
});
|
||||
|
||||
// Unit tests for semanticBoost: known attribute suffixes yield 1, everything
// else yields 0, and matching ignores letter case.
describe('semanticBoost', () => {
  // Asserts that every key in `keys` produces the same boost value.
  const expectBoost = (keys: string[], expected: number) => {
    for (const key of keys) {
      expect(semanticBoost(key)).toBe(expected);
    }
  };

  it('boosts well-known OTel attributes', () => {
    expectBoost(
      [
        'ResourceAttributes.service.name',
        'SpanAttributes.http.method',
        'SpanAttributes.http.status_code',
        'SpanAttributes.error',
      ],
      1,
    );
  });

  it('returns 0 for non-boosted attributes', () => {
    expectBoost(['SpanAttributes.custom.field', 'TraceId'], 0);
  });

  it('is case-insensitive', () => {
    expectBoost(
      ['ResourceAttributes.Service.Name', 'SpanAttributes.HTTP.METHOD'],
      1,
    );
  });
});
|
||||
|
|
@ -400,3 +400,117 @@ export function computeEffectiveSampleSize(totalCount: number): number {
|
|||
Math.max(MIN_SAMPLE_SIZE, Math.ceil(totalCount * SAMPLE_RATIO)),
|
||||
);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Attribute sorting and scoring
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Comparison-mode score for a single attribute.
 *
 * Each group's values are first normalized so that they sum to 100%, then the
 * largest absolute per-value difference between the two groups is returned.
 * Fields whose two groups have the same proportional distribution therefore
 * score 0 regardless of how large either group is.
 *
 * When only one group has data there is nothing to compare against:
 * - if at most one value was actually observed (count > 0), the field is
 *   uninformative and scores 0 — zero-count entries do not count as values;
 * - otherwise the score is the largest normalized share in the present group,
 *   which keeps the result on the same [0, 100] scale as the two-group case.
 *
 * @param outlierValues - value -> count (or percentage) for the outlier group
 * @param inlierValues - value -> count (or percentage) for the inlier group
 * @returns a score in [0, 100]; higher means the groups differ more
 */
export function computeComparisonScore(
  outlierValues: Map<string, number>,
  inlierValues: Map<string, number>,
): number {
  const allValues = new Set([...outlierValues.keys(), ...inlierValues.keys()]);
  if (allValues.size === 0) return 0;

  let outlierSum = 0;
  let inlierSum = 0;
  outlierValues.forEach(v => (outlierSum += v));
  inlierValues.forEach(v => (inlierSum += v));

  if (outlierSum === 0 && inlierSum === 0) return 0;

  if (outlierSum === 0 || inlierSum === 0) {
    // One group has data, the other doesn't.
    const presentValues = outlierSum > 0 ? outlierValues : inlierValues;
    const presentSum = outlierSum > 0 ? outlierSum : inlierSum;

    // Count only values that were actually observed. Using the raw map size
    // would let a zero-count entry turn a single-value field
    // (e.g. { error: 50, ok: 0 }) into a "multi-value" one scoring 100.
    let observedValues = 0;
    let maxNormPct = 0;
    presentValues.forEach(v => {
      if (v > 0) observedValues += 1;
      const pct = (v / presentSum) * 100;
      if (pct > maxNormPct) maxNormPct = pct;
    });

    // Single value with no comparison group is uninformative — score 0.
    // (e.g., Events.Name[N] = "message" at 100% with no inlier data)
    if (observedValues <= 1) return 0;

    // Multi-value fields with no comparison group ARE informative — they show
    // that the present group has a distinctive distribution.
    return maxNormPct;
  }

  // Both groups have data: compare normalized shares value by value.
  let maxDelta = 0;
  allValues.forEach(value => {
    const outlierNorm = ((outlierValues.get(value) ?? 0) / outlierSum) * 100;
    const inlierNorm = ((inlierValues.get(value) ?? 0) / inlierSum) * 100;
    const delta = Math.abs(outlierNorm - inlierNorm);
    if (delta > maxDelta) maxDelta = delta;
  });
  return maxDelta;
}
|
||||
|
||||
/**
 * Shannon entropy-based distribution score for sorting properties.
 *
 * The input weights are normalized to probabilities, their Shannon entropy is
 * divided by the maximum possible entropy for that many values, and the result
 * is inverted so that low entropy yields a high score.
 *
 * Only entries with a positive, finite weight count as observed values. A
 * value listed at 0% therefore neither contributes entropy nor inflates the
 * number of values, so `{ only: 100, absent: 0 }` scores 0 like any other
 * single-value field.
 *
 * @param valuePercentages - value -> percentage (any non-negative weights
 *   work; they are re-normalized by their sum)
 * @returns a score in [0, 1]: 1 = maximally useful (low entropy, dominant
 *   value among several), 0 = not useful (single value, empty, or perfectly
 *   uniform)
 */
export function computeEntropyScore(
  valuePercentages: Map<string, number>,
): number {
  const weights: number[] = [];
  let totalPct = 0;
  valuePercentages.forEach(pct => {
    if (pct > 0 && Number.isFinite(pct)) {
      weights.push(pct);
      totalPct += pct;
    }
  });

  // Zero or one observed value: there is no distribution to measure.
  if (weights.length <= 1) return 0;

  let entropy = 0;
  for (const weight of weights) {
    const p = weight / totalPct;
    // p can underflow to 0 for extremely small weights; 0 * log2(0) is NaN.
    if (p > 0) {
      entropy -= p * Math.log2(p);
    }
  }

  // weights.length >= 2 here, so maxEntropy is at least 1.
  const maxEntropy = Math.log2(weights.length);
  const score = 1 - entropy / maxEntropy;

  // Floating-point rounding can land a hair outside the documented range
  // (e.g. a tiny negative number for a perfectly uniform distribution).
  return Math.min(1, Math.max(0, score));
}
|
||||
|
||||
/**
 * Well-known OTel attribute suffixes that get a score boost.
 *
 * Entries are lower-case and are matched against the tail of a lower-cased
 * attribute key.
 */
const BOOSTED_ATTRIBUTE_SUFFIXES = [
  // Service identity
  'service.name',
  // HTTP request method and response status
  'http.method',
  'http.request.method',
  'http.status_code',
  'http.response.status_code',
  // Errors
  'error',
  'error.type',
  // Deployment environment
  'deployment.environment',
  'deployment.environment.name',
  // RPC
  'rpc.method',
  'rpc.service',
  // Databases
  'db.system',
  'db.operation',
  // Messaging
  'messaging.system',
  'messaging.operation',
];
|
||||
|
||||
/**
 * Flags well-known OTel attributes: returns 1 when `key` names one of the
 * entries in BOOSTED_ATTRIBUTE_SUFFIXES, and 0 otherwise.
 *
 * Matching is case-insensitive and respects dot-segment boundaries: the key
 * must either equal an entry or end with "." followed by the entry, so
 * 'SpanAttributes.myerror' does not match the 'error' entry.
 *
 * Callers scale this as a tiebreaker (e.g., * 0.1) and only apply it when
 * baseScore > 0.
 *
 * @param key - attribute key to test
 * @returns 1 for a boosted attribute, 0 otherwise
 */
export function semanticBoost(key: string): number {
  const normalizedKey = key.toLowerCase();
  const isBoosted = BOOSTED_ATTRIBUTE_SUFFIXES.some(
    suffix =>
      normalizedKey === suffix || normalizedKey.endsWith('.' + suffix),
  );
  return isBoosted ? 1 : 0;
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue