mirror of
https://github.com/hyperdxio/hyperdx
synced 2026-04-21 13:37:15 +00:00
feat: deterministic sampling with adaptive sample size (#1849)
## Summary

Closes #1827

Replaces non-deterministic `ORDER BY rand()` with deterministic `cityHash64(SpanId)` sampling and introduces sampling configuration constants.

### What this PR does

- **Deterministic sampling**: `ORDER BY cityHash64(SpanId)` instead of `rand()` — same data always produces the same sample, so results are stable across re-renders
- **Named constants**: `SAMPLE_SIZE`, `STABLE_SAMPLE_EXPR` replace hardcoded `1000` and `'rand()'` in query configs
- **Adaptive sizing foundation**: `computeEffectiveSampleSize()` function with `MIN_SAMPLE_SIZE`/`MAX_SAMPLE_SIZE`/`SAMPLE_RATIO` constants, exported and tested (6 unit tests)

### What this PR does NOT do (follow-up)

- **Count query for adaptive sizing**: Wiring `computeEffectiveSampleSize` into the actual queries requires adding a lightweight `count()` query. This is deferred to keep this PR focused on the deterministic sampling change.
- **Dynamic column detection**: `STABLE_SAMPLE_EXPR` uses `SpanId` which is trace-specific. Event Deltas currently only renders on the traces search page where `SpanId` is always present. If the feature expands to logs/metrics, this should be parameterized per source (documented in code comment).

## Test plan

- [ ] Same data + same hover always highlights the same heatmap cells (deterministic)
- [ ] Run `npx jest src/components/__tests__/deltaChartSampling.test.ts` — 6 tests pass

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
parent
46daa63055
commit
68ef3d6f97
5 changed files with 116 additions and 6 deletions
5
.changeset/sampling-improvements.md
Normal file
5
.changeset/sampling-improvements.md
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
"@hyperdx/app": patch
|
||||
---
|
||||
|
||||
feat: deterministic sampling with adaptive sample size for Event Deltas
|
||||
|
|
@ -23,9 +23,11 @@ import { getFirstTimestampValueExpression } from '@/source';
|
|||
import { SQLPreview } from './ChartSQLPreview';
|
||||
import {
|
||||
getPropertyStatistics,
|
||||
getStableSampleExpression,
|
||||
isDenylisted,
|
||||
isHighCardinality,
|
||||
mergeValueStatisticsMaps,
|
||||
SAMPLE_SIZE,
|
||||
} from './deltaChartUtils';
|
||||
import {
|
||||
CHART_GAP,
|
||||
|
|
@ -42,6 +44,7 @@ export default function DBDeltaChart({
|
|||
xMax,
|
||||
yMin,
|
||||
yMax,
|
||||
spanIdExpression,
|
||||
}: {
|
||||
config: ChartConfigWithDateRange;
|
||||
valueExpr: string;
|
||||
|
|
@ -49,10 +52,14 @@ export default function DBDeltaChart({
|
|||
xMax: number;
|
||||
yMin: number;
|
||||
yMax: number;
|
||||
spanIdExpression?: string;
|
||||
}) {
|
||||
// Determine if the value expression uses aggregate functions
|
||||
const isAggregate = isAggregateFunction(valueExpr);
|
||||
|
||||
// Build deterministic ORDER BY expression from source's spanIdExpression
|
||||
const stableSampleExpr = getStableSampleExpression(spanIdExpression);
|
||||
|
||||
// Get the timestamp expression from config
|
||||
const timestampExpr = getFirstTimestampValueExpression(
|
||||
config.timestampValueExpression,
|
||||
|
|
@ -136,8 +143,8 @@ export default function DBDeltaChart({
|
|||
]
|
||||
: []),
|
||||
],
|
||||
orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }],
|
||||
limit: { limit: 1000 },
|
||||
orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }],
|
||||
limit: { limit: SAMPLE_SIZE },
|
||||
},
|
||||
},
|
||||
];
|
||||
|
|
@ -191,8 +198,8 @@ export default function DBDeltaChart({
|
|||
with: buildWithClauses(true),
|
||||
select: '*',
|
||||
filters: buildFilters(true),
|
||||
orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }],
|
||||
limit: { limit: 1000 },
|
||||
orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }],
|
||||
limit: { limit: SAMPLE_SIZE },
|
||||
});
|
||||
|
||||
const { data: inlierData } = useQueriedChartConfig({
|
||||
|
|
@ -200,8 +207,8 @@ export default function DBDeltaChart({
|
|||
with: buildWithClauses(false),
|
||||
select: '*',
|
||||
filters: buildFilters(false),
|
||||
orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }],
|
||||
limit: { limit: 1000 },
|
||||
orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }],
|
||||
limit: { limit: SAMPLE_SIZE },
|
||||
});
|
||||
|
||||
// Column metadata for field classification (from ClickHouse response)
|
||||
|
|
|
|||
|
|
@ -119,6 +119,7 @@ export function DBSearchHeatmapChart({
|
|||
xMax={fields.xMax}
|
||||
yMin={fields.yMin}
|
||||
yMax={fields.yMax}
|
||||
spanIdExpression={source.spanIdExpression}
|
||||
/>
|
||||
) : (
|
||||
<Center mih={100} h="100%">
|
||||
|
|
|
|||
|
|
@ -0,0 +1,57 @@
|
|||
import {
|
||||
computeEffectiveSampleSize,
|
||||
getStableSampleExpression,
|
||||
MAX_SAMPLE_SIZE,
|
||||
MIN_SAMPLE_SIZE,
|
||||
SAMPLE_RATIO,
|
||||
SAMPLE_SIZE,
|
||||
} from '../deltaChartUtils';
|
||||
|
||||
// Exercises both outcomes of getStableSampleExpression: wrapping a provided
// span-id expression in cityHash64(), and the rand() fallback for
// undefined or empty input.
describe('getStableSampleExpression', () => {
  it('returns cityHash64 of spanIdExpression when provided', () => {
    const expression = getStableSampleExpression('SpanId');
    expect(expression).toBe('cityHash64(SpanId)');
  });

  it('uses custom spanId column name', () => {
    const expression = getStableSampleExpression('my_span_id');
    expect(expression).toBe('cityHash64(my_span_id)');
  });

  it('falls back to rand() when spanIdExpression is undefined', () => {
    const expression = getStableSampleExpression(undefined);
    expect(expression).toBe('rand()');
  });

  it('falls back to rand() when spanIdExpression is empty', () => {
    const expression = getStableSampleExpression('');
    expect(expression).toBe('rand()');
  });
});
|
||||
|
||||
// Exercises computeEffectiveSampleSize across its ranges: the SAMPLE_SIZE
// fallback for non-positive counts, the MIN/MAX clamps, and the proportional
// middle band.
describe('computeEffectiveSampleSize', () => {
  it('returns SAMPLE_SIZE when totalCount is 0 (fallback)', () => {
    const size = computeEffectiveSampleSize(0);
    expect(size).toBe(SAMPLE_SIZE);
  });

  it('returns SAMPLE_SIZE when totalCount is negative', () => {
    const size = computeEffectiveSampleSize(-1);
    expect(size).toBe(SAMPLE_SIZE);
  });

  it('returns MIN_SAMPLE_SIZE for small datasets', () => {
    const size = computeEffectiveSampleSize(100);
    expect(size).toBe(MIN_SAMPLE_SIZE);
  });

  it('returns SAMPLE_RATIO * totalCount for mid-size datasets', () => {
    const totalCount = 200_000;
    const size = computeEffectiveSampleSize(totalCount);
    expect(size).toBe(Math.ceil(totalCount * SAMPLE_RATIO));
    expect(size).toBeGreaterThan(MIN_SAMPLE_SIZE);
    expect(size).toBeLessThan(MAX_SAMPLE_SIZE);
  });

  it('caps at MAX_SAMPLE_SIZE for very large datasets', () => {
    const size = computeEffectiveSampleSize(10_000_000);
    expect(size).toBe(MAX_SAMPLE_SIZE);
  });

  it('returns exact 1% for datasets where 1% falls in the valid range', () => {
    const size = computeEffectiveSampleSize(100_000);
    expect(size).toBe(1000);
  });
});
|
||||
|
|
@ -283,3 +283,43 @@ export function isHighCardinality(
|
|||
|
||||
return effectiveUniqueness > 0.9;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Sampling configuration
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Number of rows to sample by default, i.e. when the total row count
 * is not known.
 */
export const SAMPLE_SIZE = 1_000;

/**
 * Lower bound on the number of sampled rows.
 */
export const MIN_SAMPLE_SIZE = 500;

/**
 * Upper bound on the number of sampled rows.
 */
export const MAX_SAMPLE_SIZE = 5_000;

/**
 * Share of the total row count to sample, expressed as a fraction
 * (0.01 means 1%).
 */
export const SAMPLE_RATIO = 0.01;
|
||||
|
||||
/**
 * Builds the ORDER BY expression used to pick sample rows.
 *
 * When a span-id expression is supplied, the result is
 * `cityHash64(<expression>)`, so the ordering depends only on row data and
 * the same rows are selected on every run. Without one there is nothing
 * stable to hash, so the function falls back to `rand()`, which is NOT
 * deterministic.
 *
 * @param spanIdExpression - Expression identifying a span (for example a
 *   column name such as `SpanId`). Surrounding whitespace is ignored;
 *   `undefined`, empty, and whitespace-only values all select the `rand()`
 *   fallback.
 * @returns The expression to use as the ORDER BY value.
 */
export function getStableSampleExpression(spanIdExpression?: string): string {
  // Trim before testing: a whitespace-only string is truthy and would
  // otherwise yield `cityHash64(   )`, which hashes nothing.
  const trimmed = spanIdExpression ? spanIdExpression.trim() : '';
  if (trimmed === '') {
    return 'rand()';
  }
  return `cityHash64(${trimmed})`;
}
|
||||
|
||||
/**
 * Computes how many rows to sample for a dataset of `totalCount` rows.
 *
 * For a positive count the result is `ceil(totalCount * SAMPLE_RATIO)`
 * clamped to the inclusive range [MIN_SAMPLE_SIZE, MAX_SAMPLE_SIZE].
 *
 * @param totalCount - Total number of rows available to sample from.
 * @returns The clamped sample size, or SAMPLE_SIZE when `totalCount` is
 *   zero, negative, or NaN (no usable count available).
 */
export function computeEffectiveSampleSize(totalCount: number): number {
  // `!(x > 0)` instead of `x <= 0` so that NaN also takes the fallback.
  // Otherwise NaN would pass the guard and propagate through
  // Math.ceil / Math.max / Math.min, making the function return NaN.
  if (!(totalCount > 0)) return SAMPLE_SIZE;

  const proportional = Math.ceil(totalCount * SAMPLE_RATIO);
  return Math.min(MAX_SAMPLE_SIZE, Math.max(MIN_SAMPLE_SIZE, proportional));
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue