feat: deterministic sampling with adaptive sample size (#1849)

## Summary
Closes #1827

Replaces non-deterministic `ORDER BY rand()` with deterministic `cityHash64(SpanId)` sampling and introduces sampling configuration constants.

### What this PR does
- **Deterministic sampling**: `ORDER BY cityHash64(SpanId)` instead of `rand()` — same data always produces the same sample, so results are stable across re-renders
- **Named constant and helper**: `SAMPLE_SIZE` and `getStableSampleExpression()` replace the hardcoded `1000` and `'rand()'` in query configs
- **Adaptive sizing foundation**: `computeEffectiveSampleSize()` function with `MIN_SAMPLE_SIZE`/`MAX_SAMPLE_SIZE`/`SAMPLE_RATIO` constants, exported and tested (6 unit tests)

### What this PR does NOT do (follow-up)
- **Count query for adaptive sizing**: Wiring `computeEffectiveSampleSize` into the actual queries requires adding a lightweight `count()` query. This is deferred to keep this PR focused on the deterministic sampling change.
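- **Dynamic column detection**: The sample expression is built from the source's `spanIdExpression` (typically `SpanId`), which is trace-specific. Event Deltas currently only renders on the traces search page where `SpanId` is always present; when no span ID expression is configured, sampling falls back to non-deterministic `rand()`. If the feature expands to logs/metrics, a stable key should be chosen per source (documented in code comment).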

## Test plan
- [ ] Same data + same hover always highlights the same heatmap cells (deterministic)
- [ ] Run `npx jest src/components/__tests__/deltaChartSampling.test.ts` — all 10 tests pass (4 for `getStableSampleExpression`, 6 for `computeEffectiveSampleSize`)

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
Alex Fedotyev 2026-03-05 08:52:54 -08:00 committed by GitHub
parent 46daa63055
commit 68ef3d6f97
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 116 additions and 6 deletions

View file

@ -0,0 +1,5 @@
---
"@hyperdx/app": patch
---
feat: deterministic sampling with adaptive sample size for Event Deltas

View file

@ -23,9 +23,11 @@ import { getFirstTimestampValueExpression } from '@/source';
import { SQLPreview } from './ChartSQLPreview';
import {
getPropertyStatistics,
getStableSampleExpression,
isDenylisted,
isHighCardinality,
mergeValueStatisticsMaps,
SAMPLE_SIZE,
} from './deltaChartUtils';
import {
CHART_GAP,
@ -42,6 +44,7 @@ export default function DBDeltaChart({
xMax,
yMin,
yMax,
spanIdExpression,
}: {
config: ChartConfigWithDateRange;
valueExpr: string;
@ -49,10 +52,14 @@ export default function DBDeltaChart({
xMax: number;
yMin: number;
yMax: number;
spanIdExpression?: string;
}) {
// Determine if the value expression uses aggregate functions
const isAggregate = isAggregateFunction(valueExpr);
// Build deterministic ORDER BY expression from source's spanIdExpression
const stableSampleExpr = getStableSampleExpression(spanIdExpression);
// Get the timestamp expression from config
const timestampExpr = getFirstTimestampValueExpression(
config.timestampValueExpression,
@ -136,8 +143,8 @@ export default function DBDeltaChart({
]
: []),
],
orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }],
limit: { limit: 1000 },
orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }],
limit: { limit: SAMPLE_SIZE },
},
},
];
@ -191,8 +198,8 @@ export default function DBDeltaChart({
with: buildWithClauses(true),
select: '*',
filters: buildFilters(true),
orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }],
limit: { limit: 1000 },
orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }],
limit: { limit: SAMPLE_SIZE },
});
const { data: inlierData } = useQueriedChartConfig({
@ -200,8 +207,8 @@ export default function DBDeltaChart({
with: buildWithClauses(false),
select: '*',
filters: buildFilters(false),
orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }],
limit: { limit: 1000 },
orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }],
limit: { limit: SAMPLE_SIZE },
});
// Column metadata for field classification (from ClickHouse response)

View file

@ -119,6 +119,7 @@ export function DBSearchHeatmapChart({
xMax={fields.xMax}
yMin={fields.yMin}
yMax={fields.yMax}
spanIdExpression={source.spanIdExpression}
/>
) : (
<Center mih={100} h="100%">

View file

@ -0,0 +1,57 @@
import {
computeEffectiveSampleSize,
getStableSampleExpression,
MAX_SAMPLE_SIZE,
MIN_SAMPLE_SIZE,
SAMPLE_RATIO,
SAMPLE_SIZE,
} from '../deltaChartUtils';
// Covers how the sampling ORDER BY expression is derived from a span ID
// expression, including the rand() fallback for missing/empty input.
describe('getStableSampleExpression', () => {
  // Each row: [test title, input passed to the helper, expected expression].
  const cases: Array<[string, string | undefined, string]> = [
    [
      'returns cityHash64 of spanIdExpression when provided',
      'SpanId',
      'cityHash64(SpanId)',
    ],
    ['uses custom spanId column name', 'my_span_id', 'cityHash64(my_span_id)'],
    [
      'falls back to rand() when spanIdExpression is undefined',
      undefined,
      'rand()',
    ],
    ['falls back to rand() when spanIdExpression is empty', '', 'rand()'],
  ];

  for (const [title, input, expected] of cases) {
    it(title, () => {
      expect(getStableSampleExpression(input)).toBe(expected);
    });
  }
});
// Covers the fallback, lower clamp, proportional range, and upper clamp of
// the adaptive sample size calculation.
describe('computeEffectiveSampleSize', () => {
  // Asserts that a given total row count maps to the expected sample size.
  const expectSize = (totalCount: number, expected: number): void => {
    expect(computeEffectiveSampleSize(totalCount)).toBe(expected);
  };

  it('returns SAMPLE_SIZE when totalCount is 0 (fallback)', () => {
    expectSize(0, SAMPLE_SIZE);
  });

  it('returns SAMPLE_SIZE when totalCount is negative', () => {
    expectSize(-1, SAMPLE_SIZE);
  });

  it('returns MIN_SAMPLE_SIZE for small datasets', () => {
    expectSize(100, MIN_SAMPLE_SIZE);
  });

  it('returns SAMPLE_RATIO * totalCount for mid-size datasets', () => {
    const totalCount = 200_000;
    const sampled = computeEffectiveSampleSize(totalCount);
    expect(sampled).toBe(Math.ceil(totalCount * SAMPLE_RATIO));
    // The proportional value must sit strictly between the two clamps.
    expect(sampled).toBeGreaterThan(MIN_SAMPLE_SIZE);
    expect(sampled).toBeLessThan(MAX_SAMPLE_SIZE);
  });

  it('caps at MAX_SAMPLE_SIZE for very large datasets', () => {
    expectSize(10_000_000, MAX_SAMPLE_SIZE);
  });

  it('returns exact 1% for datasets where 1% falls in the valid range', () => {
    expectSize(100_000, 1000);
  });
});

View file

@ -283,3 +283,43 @@ export function isHighCardinality(
return effectiveUniqueness > 0.9;
}
// ---------------------------------------------------------------------------
// Sampling configuration
// ---------------------------------------------------------------------------
/** Fallback row limit for sampling queries, used when the total row count is not known. */
export const SAMPLE_SIZE = 1_000;
/** Lower bound on the number of rows to sample. */
export const MIN_SAMPLE_SIZE = 500;
/** Upper bound on the number of rows to sample. */
export const MAX_SAMPLE_SIZE = 5_000;
/** Share of the total row count to sample, expressed as a fraction (0.01 means 1%). */
export const SAMPLE_RATIO = 0.01;
/**
 * Builds the ORDER BY expression used to pick the sampled rows.
 *
 * When a span ID expression is available the result is
 * `cityHash64(<expression>)`, so the same rows always sort the same way and
 * the sample is stable. When it is missing (undefined, empty, or
 * whitespace-only) the result is `rand()`, which is NOT deterministic.
 *
 * @param spanIdExpression - SQL expression identifying a span (for example
 *   `SpanId`). Surrounding whitespace is ignored.
 * @returns The expression to place in the ORDER BY clause.
 */
export function getStableSampleExpression(spanIdExpression?: string): string {
  // Trim first: a blank string is truthy and would otherwise produce
  // `cityHash64(   )`, which does not hash any column.
  const hashKey = spanIdExpression?.trim();
  if (!hashKey) {
    return 'rand()';
  }
  return `cityHash64(${hashKey})`;
}
/**
 * Computes how many rows to sample for a given total row count.
 *
 * The result is `ceil(totalCount * SAMPLE_RATIO)` clamped to the inclusive
 * range [MIN_SAMPLE_SIZE, MAX_SAMPLE_SIZE].
 *
 * @param totalCount - Total number of rows available to sample from.
 * @returns The clamped sample size, or SAMPLE_SIZE when `totalCount` is zero,
 *   negative, or NaN (i.e. the count is unavailable).
 */
export function computeEffectiveSampleSize(totalCount: number): number {
  // `!(x > 0)` rather than `x <= 0`: every comparison with NaN is false, so
  // `NaN <= 0` would let NaN through and Math.min/Math.max would return NaN.
  if (!(totalCount > 0)) return SAMPLE_SIZE;

  const proportional = Math.ceil(totalCount * SAMPLE_RATIO);
  return Math.min(MAX_SAMPLE_SIZE, Math.max(MIN_SAMPLE_SIZE, proportional));
}