hyperdx/packages/cli/src/components/EventViewer/usePatternData.ts


[HDX-3964] Add event pattern mining to CLI (Shift+P) (#2106)

## Summary

Adds a pattern mining feature to the CLI, accessible via `Shift+P`. This mirrors the web app's Pattern Table functionality but runs entirely in TypeScript; no Pyodide/Python WASM is needed.

**Linear:** https://linear.app/hyperdx/issue/HDX-3964

## What changed

### 1. Drain library in common-utils (`packages/common-utils/src/drain/`)

Ported the [browser-drain](https://github.com/DeploySentinel/browser-drain) TypeScript library into `@hyperdx/common-utils`. This is a pure TypeScript implementation of the Drain3 log template mining algorithm, including:

- `TemplateMiner` / `TemplateMinerConfig`: main API
- `Drain`: core algorithm with prefix tree and LRU cluster cache
- `LogMasker`: regex-based token masking (IPs, numbers, etc.)
- `LruCache`: custom LRU cache matching Python Drain3's eviction semantics
- 11 Jest tests ported from the original `node:test` suite

### 2. CLI pattern view (`packages/cli/src/components/EventViewer/`)

**Keybinding:** `Shift+P` toggles the pattern view (pauses follow mode, restores it on exit)

**Data flow (mirrors web app's `useGroupedPatterns`):**

- Issues `SELECT ... ORDER BY rand() LIMIT 100000` to randomly sample up to 100K events
- Issues a parallel `SELECT count()` to get the true total event count
- Feeds sampled log bodies through the TypeScript `TemplateMiner`
- Estimates pattern counts via `sampleMultiplier = totalCount / sampledRowCount`
- Computes time-bucketed trend data per pattern

**UI:**

- Pattern list with columns: Est. Count (with `~` prefix), Pattern
- `l`/`Enter` expands a pattern to show its sample events (full table columns)
- `h`/`Esc` returns to the pattern list
- `j/k/G/g/Ctrl+D/Ctrl+U` navigation throughout
- Loading spinner while the sampling query runs

**Alias fix:** Pattern and count queries compute `WITH` clauses from the source's `defaultTableSelectExpression` so Lucene searches using aliases (e.g. `level:error`, where `level` is an alias for `SeverityText`) resolve correctly.

### New files

- `packages/common-utils/src/drain/`: 7 source files + barrel index
- `packages/common-utils/src/__tests__/drain.test.ts`
- `packages/cli/src/components/EventViewer/usePatternData.ts`
- `packages/cli/src/components/EventViewer/PatternView.tsx`
- `packages/cli/src/components/EventViewer/PatternSamplesView.tsx`

### Modified files

- `packages/cli/src/api/eventQuery.ts`: added `buildPatternSampleQuery`, `buildTotalCountQuery`, `buildAliasWithClauses`
- `packages/cli/src/components/EventViewer/EventViewer.tsx`: wired in pattern state + rendering
- `packages/cli/src/components/EventViewer/useKeybindings.ts`: added `P`, `l`, `h` keybindings + pattern/sample navigation
- `packages/cli/src/components/EventViewer/SubComponents.tsx`: added `P` to the help screen

### Demo

https://github.com/user-attachments/assets/50a2edfc-8891-43ae-ab86-b96fca778c66
2026-04-14 18:03:56 +00:00
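
As a quick orientation before the source below, here is a minimal sketch of how a consuming component might call the hook; `isPatternViewOpen` and the surrounding variables are illustrative stand-ins, not the actual `PatternView` wiring:

```ts
// Hypothetical usage sketch; prop values are examples only.
const { patterns, loading, error, totalCount, sampledRowCount } =
  usePatternData({
    clickhouseClient,
    metadata,
    source,
    submittedQuery: 'level:error', // Lucene query; aliases resolve via WITH clauses
    startTime: new Date(Date.now() - 60 * 60 * 1000), // last hour
    endTime: new Date(),
    enabled: isPatternViewOpen, // e.g. toggled by Shift+P
  });
```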
/**
 * Fetches a random sample of events from ClickHouse, mines patterns
 * using the Drain algorithm, and estimates total counts using a
 * sampleMultiplier mirroring the web frontend's useGroupedPatterns.
 */
import { useState, useEffect, useCallback, useRef } from 'react';
import {
  TemplateMiner,
  TemplateMinerConfig,
} from '@hyperdx/common-utils/dist/drain';
import type { Metadata } from '@hyperdx/common-utils/dist/core/metadata';
import { convertDateRangeToGranularityString } from '@hyperdx/common-utils/dist/core/utils';
import type { SourceResponse, ProxyClickhouseClient } from '@/api/client';
import {
  buildPatternSampleQuery,
  buildTotalCountQuery,
} from '@/api/eventQuery';
import { getEventBody } from '@/shared/source';
import type { EventRow } from './types';
import { flatten } from './utils';

// ---- Constants -----------------------------------------------------

const SAMPLES = 10_000;

// ---- Time bucketing utilities --------------------------------------

/** Parse a granularity string like "5 minute" into seconds. */
function granularityToSeconds(granularity: string): number {
  const [num, unit] = granularity.split(' ');
  const n = parseInt(num, 10);
  switch (unit) {
    case 'second':
      return n;
    case 'minute':
      return n * 60;
    case 'hour':
      return n * 3600;
    case 'day':
      return n * 86400;
    default:
      return n * 60;
  }
}
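// Illustrative examples: granularityToSeconds('5 minute') === 300 and
// granularityToSeconds('1 hour') === 3600. Unhandled units fall back to
// minutes, so granularityToSeconds('1 week') === 60.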

/** Round a timestamp down to the start of its granularity bucket. */
function toStartOfBucket(ts: number, granularityMs: number): number {
  return Math.floor(ts / granularityMs) * granularityMs;
}
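// Illustrative example: with 1-minute buckets (granularityMs = 60_000),
// toStartOfBucket(1_700_000_030_000, 60_000) === 1_699_999_980_000.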

/** Generate all bucket start timestamps between start and end. */
function generateBuckets(
  startMs: number,
  endMs: number,
  granularityMs: number,
): number[] {
  const buckets: number[] = [];
  let current = toStartOfBucket(startMs, granularityMs);
  while (current < endMs) {
    buckets.push(current);
    current += granularityMs;
  }
  return buckets;
}
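// Illustrative example: generateBuckets(0, 180_000, 60_000) yields
// [0, 60_000, 120_000], i.e. one start timestamp per minute bucket.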

// ---- Types ---------------------------------------------------------

export interface TrendBucket {
  ts: number;
  count: number;
}

export interface PatternGroup {
  id: string;
  pattern: string;
  /** Raw count within the sample */
  count: number;
  /** Estimated total count (count * sampleMultiplier), prefixed with ~ in display */
  estimatedCount: number;
  samples: EventRow[];
  /** Time-bucketed trend data for sparkline */
  trend: TrendBucket[];
}

export interface UsePatternDataParams {
  clickhouseClient: ProxyClickhouseClient;
  metadata: Metadata;
  source: SourceResponse;
  submittedQuery: string;
  startTime: Date;
  endTime: Date;
  /** Only fetch when true (i.e., pattern view is open) */
  enabled: boolean;
}

export interface UsePatternDataReturn {
  patterns: PatternGroup[];
  loading: boolean;
  error: Error | null;
  totalCount: number | null;
  sampledRowCount: number;
}

// ---- Hook ----------------------------------------------------------

export function usePatternData({
  clickhouseClient,
  metadata,
  source,
  submittedQuery,
  startTime,
  endTime,
  enabled,
}: UsePatternDataParams): UsePatternDataReturn {
  const [patterns, setPatterns] = useState<PatternGroup[]>([]);
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState<Error | null>(null);
  const [totalCount, setTotalCount] = useState<number | null>(null);
  const [sampledRowCount, setSampledRowCount] = useState(0);

  // Track the last query params to avoid redundant fetches
  const lastFetchRef = useRef<string>('');

  // Body column expression from the source config, if one is set
  const bodyColumn = getEventBody(source) || undefined;

  const fetchPatterns = useCallback(async () => {
    const fetchKey = JSON.stringify({
      source: source.id,
      submittedQuery,
      startTime: startTime.getTime(),
      endTime: endTime.getTime(),
    });
    // Skip if we already fetched for these exact params
    if (lastFetchRef.current === fetchKey) return;
    lastFetchRef.current = fetchKey;

    setLoading(true);
    setError(null);
    try {
      // Build both queries in parallel
      const [sampleChSql, countChSql] = await Promise.all([
        buildPatternSampleQuery(
          {
            source,
            searchQuery: submittedQuery,
            startTime,
            endTime,
            sampleLimit: SAMPLES,
          },
          metadata,
        ),
        buildTotalCountQuery(
          { source, searchQuery: submittedQuery, startTime, endTime },
          metadata,
        ),
      ]);

      // Fire both queries in parallel
      const [sampleResult, countResult] = await Promise.all([
        clickhouseClient.query({
          query: sampleChSql.sql,
          query_params: sampleChSql.params,
          format: 'JSON',
          connectionId: source.connection,
        }),
        clickhouseClient.query({
          query: countChSql.sql,
          query_params: countChSql.params,
          format: 'JSON',
          connectionId: source.connection,
        }),
      ]);

      const sampleJson = (await sampleResult.json()) as { data: EventRow[] };
      const countJson = (await countResult.json()) as {
        data: Array<Record<string, string | number>>;
      };
      const sampleRows = sampleJson.data ?? [];
      const total = Number(countJson.data?.[0]?.total ?? 0);
      setTotalCount(total);
      setSampledRowCount(sampleRows.length);

      if (sampleRows.length === 0) {
        setPatterns([]);
        return;
      }

      // Determine columns from the result keys
      const resultKeys = Object.keys(sampleRows[0]);
      const effectiveBodyColumn =
        bodyColumn ?? resultKeys[resultKeys.length - 1];
      // Use the source's timestamp expression, falling back to the first column
      const tsExpr = source.timestampValueExpression ?? 'TimestampTime';
      const tsColumn = resultKeys.find(k => k === tsExpr) ?? resultKeys[0];

      // Compute granularity for trend buckets
      const granularity = convertDateRangeToGranularityString(
        [startTime, endTime],
        24,
      );
      const granularityMs = granularityToSeconds(granularity) * 1000;
      const allBuckets = generateBuckets(
        startTime.getTime(),
        endTime.getTime(),
        granularityMs,
      );

      // Mine patterns
      const config = new TemplateMinerConfig();
      const miner = new TemplateMiner(config);
      const clustered: Array<{
        clusterId: number;
        row: EventRow;
        tsMs: number;
      }> = [];
      for (const row of sampleRows) {
        const body = row[effectiveBodyColumn];
        const text = body != null ? flatten(String(body)) : '';
        const result = miner.addLogMessage(text);
        const tsRaw = row[tsColumn];
        const tsMs =
          tsRaw != null
            ? new Date(String(tsRaw)).getTime()
            : startTime.getTime();
        clustered.push({ clusterId: result.clusterId, row, tsMs });
      }

      // Group by cluster ID
      const groups = new Map<
        number,
        {
          rows: EventRow[];
          template: string;
          bucketCounts: Map<number, number>;
        }
      >();
      for (const { clusterId, row, tsMs } of clustered) {
        const bucket = toStartOfBucket(tsMs, granularityMs);
        const existing = groups.get(clusterId);
        if (existing) {
          existing.rows.push(row);
          existing.bucketCounts.set(
            bucket,
            (existing.bucketCounts.get(bucket) ?? 0) + 1,
          );
        } else {
          const body = row[effectiveBodyColumn];
          const text = body != null ? flatten(String(body)) : '';
          const match = miner.match(text, 'fallback');
          const bucketCounts = new Map<number, number>();
          bucketCounts.set(bucket, 1);
          groups.set(clusterId, {
            rows: [row],
            template: match?.getTemplate() ?? text,
            bucketCounts,
          });
        }
      }

      // Compute sampleMultiplier
      const sampleMultiplier =
        total > 0 && sampleRows.length > 0 ? total / sampleRows.length : 1;
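      // Worked example (illustrative): total = 1_000_000 with 10_000
      // sampled rows gives sampleMultiplier = 100, so a cluster of 42
      // sampled rows is reported as an estimated ~4200 events.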

      // Convert to sorted array with estimated counts and trend data
      const result: PatternGroup[] = [];
      for (const [id, { rows, template, bucketCounts }] of groups) {
        const trend: TrendBucket[] = allBuckets.map(bucketTs => ({
          ts: bucketTs,
          count: Math.round(
            (bucketCounts.get(bucketTs) ?? 0) * sampleMultiplier,
          ),
        }));
        result.push({
          id: String(id),
          pattern: template,
          count: rows.length,
          estimatedCount: Math.max(
            Math.round(rows.length * sampleMultiplier),
            1,
          ),
          samples: rows,
          trend,
        });
      }
      result.sort((a, b) => b.estimatedCount - a.estimatedCount);
      setPatterns(result);
    } catch (err: unknown) {
      setError(err instanceof Error ? err : new Error(String(err)));
      // Clear the fetch key so a retry will re-fetch
      lastFetchRef.current = '';
    } finally {
      setLoading(false);
    }
  }, [
    clickhouseClient,
    metadata,
    source,
    submittedQuery,
    startTime,
    endTime,
    bodyColumn,
  ]);

  useEffect(() => {
    if (enabled) {
      fetchPatterns();
    }
  }, [enabled, fetchPatterns]);

  // Reset the fetch key when the view closes so reopening re-fetches
  useEffect(() => {
    if (!enabled) {
      lastFetchRef.current = '';
    }
  }, [enabled]);

  return { patterns, loading, error, totalCount, sampledRowCount };
}