feat: improved attribute sorting with entropy scoring (#1854)

## Summary

Closes #1826

Replaces the basic max-delta sorting for event delta attributes with smarter scoring algorithms:

- **Proportional comparison scoring** (`computeComparisonScore`): Normalizes each group's percentages to sum to 100% before computing max delta, so fields with identical proportional distributions score 0 regardless of coverage rate differences. This makes the sorting resilient to different sample sizes between outlier and inlier groups.
- **Shannon entropy scoring** (`computeEntropyScore`): Returns [0, 1] where 1 = maximally useful (low entropy, dominant value among several) and 0 = not useful (single value, empty, or perfectly uniform). Prepared for future use in distribution mode.
- **Semantic boost** (`semanticBoost`): Provides a tiebreaker boost for well-known OTel attributes (e.g., `service.name`, `http.status_code`, `error`) to surface the most operationally relevant fields.

Currently uses `computeComparisonScore` for sorting. Entropy scoring will be integrated when #1824 (always-on distribution mode) merges.

## Test plan

- [x] 15 unit tests covering all three scoring functions pass
- [ ] Verify attribute order improves in practice: fields with proportionally different distributions should rank higher than fields that differ only due to sample size
- [ ] Confirm no regression in existing event delta chart behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-04-21 13:37:15 +00:00 · 2026-03-12 14:46:45 -07:00 · 2026-03-12 14:46:45 -07:00 · 26759f794f
commit 26759f794f
parent 33edc7e5be
4 changed files with 294 additions and 11 deletions
--- a/.changeset/attribute-sorting.md
+++ b/.changeset/attribute-sorting.md
@@ -0,0 +1,5 @@
+---
+"@hyperdx/app": patch
+---
+
+feat: improved attribute sorting with entropy scoring and proportional comparison
--- a/packages/app/src/components/DBDeltaChart.tsx
+++ b/packages/app/src/components/DBDeltaChart.tsx
@@ -22,13 +22,14 @@ import { getFirstTimestampValueExpression } from '@/source';
 import { SQLPreview } from './ChartSQLPreview';
 import type { AddFilterFn } from './deltaChartUtils';
 import {
+  computeComparisonScore,
  flattenedKeyToFilterKey,
  getPropertyStatistics,
  getStableSampleExpression,
  isDenylisted,
  isHighCardinality,
-  mergeValueStatisticsMaps,
  SAMPLE_SIZE,
+  semanticBoost,
 } from './deltaChartUtils';
 import {
  CHART_GAP,
@@ -257,7 +258,10 @@ export default function DBDeltaChart({
    if (uniqueKeys.size === 0) {
      uniqueKeys = new Set([...inlierValueOccurences.keys()]);
    }
-    // Now process the keys to find the ones with the highest delta between outlier and inlier percentages
+    // Sort by proportional comparison score (normalizes group sizes).
+    // TODO: When #1824 (always-on distribution) merges, use computeEntropyScore
+    // for distribution mode (no selection) and computeComparisonScore only when
+    // a selection is active (hasSelection flag from #1824).
    const sortedProperties = Array.from(uniqueKeys)
      .map(key => {
        const inlierCount =
@@ -265,16 +269,14 @@ export default function DBDeltaChart({
        const outlierCount =
          outlierValueOccurences.get(key) ?? new Map<string, number>();

-        const mergedArray = mergeValueStatisticsMaps(outlierCount, inlierCount);
-        let maxValueDelta = 0;
-        mergedArray.forEach(item => {
-          const delta = Math.abs(item.outlierCount - item.inlierCount);
-          if (delta > maxValueDelta) {
-            maxValueDelta = delta;
-          }
-        });
+        // Use proportional comparison scoring which normalizes group sizes.
+        // Semantic boost acts as a tiebreaker for well-known OTel attributes
+        // (only applied when the field has actual variance).
+        const baseScore = computeComparisonScore(outlierCount, inlierCount);
+        const boost = baseScore > 0 ? semanticBoost(key) * 0.1 : 0;
+        const sortScore = baseScore + boost;

-        return [key, maxValueDelta] as const;
+        return [key, sortScore] as const;
      })
      .sort((a, b) => b[1] - a[1])
      .map(a => a[0]);
--- a/packages/app/src/components/tests/deltaChartScoring.test.ts
+++ b/packages/app/src/components/tests/deltaChartScoring.test.ts
@@ -0,0 +1,162 @@
+import {
+  computeComparisonScore,
+  computeEntropyScore,
+  semanticBoost,
+} from '../deltaChartUtils';
+
+// Unit tests for computeComparisonScore: proportional (size-normalized)
+// max-delta scoring between an outlier and an inlier value distribution.
+describe('computeComparisonScore', () => {
+  // Shorthand: build a value -> weight map from [value, weight] pairs.
+  const dist = (...pairs: [string, number][]) =>
+    new Map<string, number>(pairs);
+
+  it('returns 0 for empty maps', () => {
+    const score = computeComparisonScore(new Map(), new Map());
+    expect(score).toBe(0);
+  });
+
+  it('returns 0 when both groups have identical proportions', () => {
+    // 80/20 and 40/10 are the same split once each group is normalized.
+    const score = computeComparisonScore(
+      dist(['GET', 80], ['POST', 20]),
+      dist(['GET', 40], ['POST', 10]),
+    );
+    expect(score).toBeCloseTo(0);
+  });
+
+  it('returns high score for different proportions', () => {
+    const score = computeComparisonScore(
+      dist(['error', 90], ['ok', 10]),
+      dist(['error', 10], ['ok', 90]),
+    );
+    expect(score).toBeGreaterThan(70);
+  });
+
+  it('returns 0 for single-value field when other group is empty', () => {
+    // A lone value with nothing to compare against carries no signal
+    // (e.g., Events.Name[N] = "message" at 100% with no inlier data).
+    const score = computeComparisonScore(dist(['error', 50]), new Map());
+    expect(score).toBe(0);
+  });
+
+  it('normalizes multi-value field to [0, 100] when other group is empty', () => {
+    // Several values with no comparison group still describe a distribution.
+    const score = computeComparisonScore(
+      dist(['error', 80], ['ok', 20]),
+      new Map(),
+    );
+    expect(score).toBe(80);
+  });
+
+  it('normalizes by group sum so different sample sizes produce same score', () => {
+    const inlier = dist(['GET', 50], ['POST', 50]);
+    // Same 80/20 split at two very different sample sizes.
+    const scoreSmall = computeComparisonScore(
+      dist(['GET', 8], ['POST', 2]),
+      inlier,
+    );
+    const scoreLarge = computeComparisonScore(
+      dist(['GET', 800], ['POST', 200]),
+      inlier,
+    );
+    expect(scoreSmall).toBeCloseTo(scoreLarge, 1);
+  });
+});
+
+// Unit tests for computeEntropyScore: 1 - normalized Shannon entropy of a
+// single value distribution.
+describe('computeEntropyScore', () => {
+  // Shorthand: score a distribution given as [value, weight] pairs.
+  const scoreOf = (...pairs: [string, number][]) =>
+    computeEntropyScore(new Map<string, number>(pairs));
+
+  it('returns 0 for single-value fields', () => {
+    expect(scoreOf(['only', 100])).toBe(0);
+  });
+
+  it('returns 0 for empty map', () => {
+    expect(scoreOf()).toBe(0);
+  });
+
+  it('returns ~0 for perfectly uniform distribution', () => {
+    expect(scoreOf(['a', 50], ['b', 50])).toBeCloseTo(0);
+  });
+
+  it('returns high score for skewed distribution', () => {
+    expect(scoreOf(['ok', 99], ['error', 1])).toBeGreaterThan(0.5);
+  });
+
+  it('ranks more-skewed fields higher', () => {
+    const heavilySkewed = scoreOf(['a', 95], ['b', 5]);
+    const mildlySkewed = scoreOf(['a', 60], ['b', 40]);
+    expect(heavilySkewed).toBeGreaterThan(mildlySkewed);
+  });
+
+  it('returns ~0 for uniform 3-value field', () => {
+    // Weights are only approximately equal, hence the 2-digit tolerance.
+    expect(scoreOf(['a', 33.33], ['b', 33.33], ['c', 33.34])).toBeCloseTo(0, 2);
+  });
+
+  it('handles power-law distributions', () => {
+    const score = scoreOf(
+      ['v1', 50],
+      ['v2', 25],
+      ['v3', 12],
+      ['v4', 6],
+      ['v5', 4],
+      ['v6', 2],
+      ['v7', 1],
+    );
+    expect(score).toBeGreaterThan(0.1);
+    expect(score).toBeLessThan(1);
+  });
+});
+
+// Unit tests for semanticBoost: 1 for well-known attribute names (matched
+// case-insensitively on a dot-segment boundary), 0 otherwise.
+describe('semanticBoost', () => {
+  it('boosts well-known OTel attributes', () => {
+    expect(semanticBoost('ResourceAttributes.service.name')).toBe(1);
+    expect(semanticBoost('SpanAttributes.http.method')).toBe(1);
+    expect(semanticBoost('SpanAttributes.http.status_code')).toBe(1);
+    expect(semanticBoost('SpanAttributes.error')).toBe(1);
+  });
+
+  it('returns 0 for non-boosted attributes', () => {
+    expect(semanticBoost('SpanAttributes.custom.field')).toBe(0);
+    expect(semanticBoost('TraceId')).toBe(0);
+  });
+
+  it('is case-insensitive', () => {
+    expect(semanticBoost('ResourceAttributes.Service.Name')).toBe(1);
+    expect(semanticBoost('SpanAttributes.HTTP.METHOD')).toBe(1);
+  });
+
+  it('only matches on dot-segment boundaries', () => {
+    // A boosted name appearing as a substring of a segment must not match.
+    expect(semanticBoost('SpanAttributes.myerror')).toBe(0);
+    expect(semanticBoost('xservice.name')).toBe(0);
+    // A boosted name in the middle of the key is not a suffix match.
+    expect(semanticBoost('SpanAttributes.error.details')).toBe(0);
+  });
+
+  it('matches a bare attribute name with no prefix', () => {
+    expect(semanticBoost('error')).toBe(1);
+    expect(semanticBoost('service.name')).toBe(1);
+  });
+});
--- a/packages/app/src/components/deltaChartUtils.ts
+++ b/packages/app/src/components/deltaChartUtils.ts
@@ -400,3 +400,117 @@ export function computeEffectiveSampleSize(totalCount: number): number {
    Math.max(MIN_SAMPLE_SIZE, Math.ceil(totalCount * SAMPLE_RATIO)),
  );
 }
+
+// ---------------------------------------------------------------------------
+// Attribute sorting and scoring
+// ---------------------------------------------------------------------------
+
+/**
+ * Comparison-mode score for one attribute: how differently its values are
+ * distributed in the outlier group versus the inlier group.
+ *
+ * Each group's weights are divided by that group's own sum and scaled to 100,
+ * so the result does not depend on group size or coverage rate. The score is
+ * the largest absolute difference between the two normalized shares of any
+ * single value; identical proportional distributions therefore score 0.
+ *
+ * When exactly one group sums to 0, the score is the largest normalized share
+ * inside the other group, unless that group has at most one value with a
+ * positive weight, in which case the score is 0.
+ *
+ * @param outlierValues value -> weight for the outlier group
+ * @param inlierValues value -> weight for the inlier group
+ * @returns 0 when there are no values or both groups sum to 0; otherwise the
+ *   max normalized delta (within [0, 100] for non-negative weights)
+ */
+export function computeComparisonScore(
+  outlierValues: Map<string, number>,
+  inlierValues: Map<string, number>,
+): number {
+  const allValues = new Set([...outlierValues.keys(), ...inlierValues.keys()]);
+  if (allValues.size === 0) return 0;
+
+  let outlierSum = 0;
+  let inlierSum = 0;
+  outlierValues.forEach(v => (outlierSum += v));
+  inlierValues.forEach(v => (inlierSum += v));
+
+  if (outlierSum === 0 && inlierSum === 0) return 0;
+
+  if (outlierSum === 0 || inlierSum === 0) {
+    // One group has data, the other doesn't.
+    const presentValues = outlierSum > 0 ? outlierValues : inlierValues;
+    const presentSum = outlierSum > 0 ? outlierSum : inlierSum;
+
+    // Count values that actually occur. Map.size would also count
+    // zero-weight entries and make an effectively single-valued field
+    // (e.g. { error: 50, ok: 0 }) look like a multi-value distribution.
+    let positiveCount = 0;
+    let maxNormPct = 0;
+    presentValues.forEach(v => {
+      if (v > 0) positiveCount += 1;
+      // Normalize to [0, 100] so the score is scale-consistent with the
+      // two-group case.
+      const pct = (v / presentSum) * 100;
+      if (pct > maxNormPct) maxNormPct = pct;
+    });
+
+    // Single value with no comparison group is uninformative — score 0.
+    // (e.g., Events.Name[N] = "message" at 100% with no inlier data)
+    // Multi-value fields with no comparison group ARE informative.
+    return positiveCount > 1 ? maxNormPct : 0;
+  }
+
+  let maxDelta = 0;
+  allValues.forEach(value => {
+    // A value missing from one group contributes a 0% share for that group.
+    const outlierNorm = ((outlierValues.get(value) ?? 0) / outlierSum) * 100;
+    const inlierNorm = ((inlierValues.get(value) ?? 0) / inlierSum) * 100;
+    const delta = Math.abs(outlierNorm - inlierNorm);
+    if (delta > maxDelta) maxDelta = delta;
+  });
+  return maxDelta;
+}
+
+/**
+ * Shannon entropy-based distribution score for sorting properties.
+ *
+ * Computes 1 - H / log2(k), where H is the Shannon entropy (base 2) of the
+ * weights normalized to probabilities and k is the number of values that
+ * actually occur (finite weight > 0). Entries with a zero, negative or
+ * non-finite weight are ignored so they can neither make a single-valued
+ * field look informative nor inflate the maximum entropy.
+ *
+ * @param valuePercentages value -> weight; weights need not sum to 100
+ * @returns a score in [0, 1]: 1 = maximally useful (low entropy, dominant
+ *   value among several), 0 = not useful (fewer than two occurring values,
+ *   or a perfectly uniform distribution)
+ */
+export function computeEntropyScore(
+  valuePercentages: Map<string, number>,
+): number {
+  let totalPct = 0;
+  let nValues = 0;
+  for (const pct of valuePercentages.values()) {
+    if (Number.isFinite(pct) && pct > 0) {
+      totalPct += pct;
+      nValues += 1;
+    }
+  }
+  // With zero or one occurring value there is no distribution to score.
+  if (nValues <= 1) return 0;
+
+  let entropy = 0;
+  for (const pct of valuePercentages.values()) {
+    if (!Number.isFinite(pct) || pct <= 0) continue;
+    const p = pct / totalPct;
+    // p can underflow to 0 for a tiny weight; 0 * log2(0) would be NaN.
+    if (p > 0) {
+      entropy -= p * Math.log2(p);
+    }
+  }
+
+  // nValues >= 2 here, so maxEntropy >= 1 and the division is safe.
+  const maxEntropy = Math.log2(nValues);
+  const score = 1 - entropy / maxEntropy;
+
+  // Floating-point rounding can land a hair outside [0, 1]; clamp so the
+  // documented range holds.
+  return Math.min(1, Math.max(0, score));
+}
+
+/**
+ * Well-known OTel attribute names (lower case) that get a score boost.
+ * A key is boosted when it equals one of these names or ends with `.`
+ * followed by one of them.
+ */
+const BOOSTED_ATTRIBUTE_SUFFIXES: readonly string[] = [
+  'service.name',
+  'http.method',
+  'http.request.method',
+  'http.status_code',
+  'http.response.status_code',
+  'error',
+  'error.type',
+  'deployment.environment',
+  'deployment.environment.name',
+  'rpc.method',
+  'rpc.service',
+  'db.system',
+  'db.operation',
+  'messaging.system',
+  'messaging.operation',
+];
+
+/**
+ * Semantic tiebreaker for attribute sorting.
+ *
+ * The key is lower-cased and compared against BOOSTED_ATTRIBUTE_SUFFIXES on a
+ * dot-segment boundary, so 'SpanAttributes.error' matches the 'error' entry
+ * but 'SpanAttributes.myerror' does not.
+ *
+ * @param key attribute key, e.g. 'ResourceAttributes.service.name'
+ * @returns 1 for a well-known attribute, 0 otherwise; callers are expected to
+ *   scale this (e.g., * 0.1) and apply it only when the base score is > 0
+ */
+export function semanticBoost(key: string): number {
+  const normalizedKey = key.toLowerCase();
+  const isBoosted = BOOSTED_ATTRIBUTE_SUFFIXES.some(
+    name => normalizedKey === name || normalizedKey.endsWith(`.${name}`),
+  );
+  return isBoosted ? 1 : 0;
+}