feat: Add percentages to filter values (#1250)

# Summary

Closes HDX-1960

This PR adds a button to our search filters component which can be used to show the _approximate_ percentage of rows which have each filter value.

https://github.com/user-attachments/assets/2dba1b28-d2b9-4414-986c-0c515d252c89

Notes:
- The percentages are based on a sample of 100k rows. The sampling is done similarly to how the EE version samples logs for patterns.
- We only fetch the most common 100 values in the sample. All other values are assumed to represent <1% of the data.
- The percentages represent the distribution within the dataset after it has been filtered by the selected filters and the where clause.
- This is a potentially expensive query, even with sampling, so the percentages are only queried if they're toggled on for a particular filter, and do not refresh in live mode. They do refresh if the search or date range changes (outside of live mode).
This commit is contained in:
Drew Davis 2025-10-09 15:26:39 -04:00 committed by GitHub
parent 13b191c8a0
commit daffcf3594
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 480 additions and 17 deletions

View file

@ -0,0 +1,6 @@
---
"@hyperdx/common-utils": patch
"@hyperdx/app": patch
---
feat: Add percentages to filter values

View file

@ -55,7 +55,7 @@ import {
useDocumentVisibility,
} from '@mantine/hooks';
import { notifications } from '@mantine/notifications';
import { useIsFetching } from '@tanstack/react-query';
import { keepPreviousData, useIsFetching } from '@tanstack/react-query';
import { SortingState } from '@tanstack/react-table';
import CodeMirror from '@uiw/react-codemirror';
@ -1099,7 +1099,10 @@ function DBSearchPage() {
}
}, [isReady, queryReady, isChartConfigLoading, onSearch]);
const { data: aliasMap } = useAliasMapFromChartConfig(dbSqlRowTableConfig);
const { data: aliasMap } = useAliasMapFromChartConfig(dbSqlRowTableConfig, {
placeholderData: keepPreviousData,
queryKey: ['aliasMap', dbSqlRowTableConfig, 'withPlaceholder'],
});
const aliasWith = useMemo(
() =>

View file

@ -40,6 +40,7 @@ import { useExplainQuery } from '@/hooks/useExplainQuery';
import {
useAllFields,
useGetKeyValues,
useGetValuesDistribution,
useJsonColumns,
useTableMetadata,
} from '@/hooks/useMetadata';
@ -76,6 +77,8 @@ type FilterCheckboxProps = {
onClickExclude?: VoidFunction;
onClickPin: VoidFunction;
className?: string;
percentage?: number;
isPercentageLoading?: boolean;
};
export const TextButton = ({
@ -105,6 +108,26 @@ export const TextButton = ({
);
};
/** Props accepted by `FilterPercentage`. */
type FilterPercentageProps = {
  /** Approximate share of rows with this value, on a 0-100 scale. */
  percentage: number;
  /** When true, the `effect-pulse` class is applied to the label. */
  isLoading?: boolean;
};

/**
 * Renders an approximate percentage label for a filter value.
 *
 * Values below 1 render as `<1%`, values of 99.5 and above render as `>99%`,
 * and everything in between is rounded to the nearest integer and prefixed
 * with `~`.
 */
const FilterPercentage = ({ percentage, isLoading }: FilterPercentageProps) => {
  let formattedPercentage: string;
  if (percentage < 1) {
    formattedPercentage = `<1%`;
  } else if (percentage >= 99.5) {
    formattedPercentage = `>99%`;
  } else {
    formattedPercentage = `~${Math.round(percentage)}%`;
  }

  const pulseClassName = isLoading ? 'effect-pulse' : '';

  return (
    <Text size="xs" c="gray.3" className={pulseClassName}>
      {formattedPercentage}
    </Text>
  );
};
const emptyFn = () => {};
export const FilterCheckbox = ({
value,
@ -115,6 +138,8 @@ export const FilterCheckbox = ({
onClickExclude,
onClickPin,
className,
percentage,
isPercentageLoading,
}: FilterCheckboxProps) => {
return (
<div
@ -146,15 +171,30 @@ export const FilterCheckbox = ({
fz="xxs"
color="gray"
>
<Text
size="xs"
c={value === 'excluded' ? 'red.4' : 'gray.3'}
truncate="end"
<Group
w="100%"
title={label}
gap="xs"
wrap="nowrap"
justify="space-between"
pe={'11px'}
miw={0}
>
{label}
</Text>
<Text
size="xs"
c={value === 'excluded' ? 'red.4' : 'gray.3'}
truncate="end"
flex={1}
title={label}
>
{label}
</Text>
{percentage != null && (
<FilterPercentage
percentage={percentage}
isLoading={isPercentageLoading}
/>
)}
</Group>
</Tooltip>
</Group>
<div className={classes.filterActions}>
@ -208,6 +248,8 @@ export type FilterGroupProps = {
hasLoadedMore: boolean;
isDefaultExpanded?: boolean;
'data-testid'?: string;
chartConfig: ChartConfigWithDateRange;
isLive?: boolean;
};
const MAX_FILTER_GROUP_ITEMS = 10;
@ -230,6 +272,8 @@ export const FilterGroup = ({
hasLoadedMore,
isDefaultExpanded,
'data-testid': dataTestId,
chartConfig,
isLive,
}: FilterGroupProps) => {
const [search, setSearch] = useState('');
// "Show More" button when there's lots of options
@ -238,6 +282,26 @@ export const FilterGroup = ({
const [isExpanded, setExpanded] = useState(isDefaultExpanded ?? false);
// Track recently moved items for highlight animation
const [recentlyMoved, setRecentlyMoved] = useState<Set<string>>(new Set());
// Show what percentage of the data has each value
const [showDistributions, setShowDistributions] = useState(false);
// For live searches, don't refresh percentages when date range changes
const [dateRange, setDateRange] = useState<[Date, Date]>(
chartConfig.dateRange,
);
// Flip the distribution toggle. When switching it on, also expand the group
// and snapshot the chart config's current date range for the distribution query.
const toggleShowDistributions = () => {
  const isTurningOn = !showDistributions;
  if (isTurningOn) {
    setExpanded(true);
    setDateRange(chartConfig.dateRange);
  }
  setShowDistributions(current => !current);
};
useEffect(() => {
if (!isLive) {
setDateRange(chartConfig.dateRange);
}
}, [chartConfig.dateRange, isLive]);
useEffect(() => {
if (isDefaultExpanded) {
@ -245,6 +309,33 @@ export const FilterGroup = ({
}
}, [isDefaultExpanded]);
const {
data: distributionData,
isFetching: isFetchingDistribution,
error: distributionError,
} = useGetValuesDistribution(
{
chartConfig: { ...chartConfig, dateRange },
key: name,
limit: 100, // The 100 most common values are enough to find any values that are present in at least 1% of rows
},
{
enabled: showDistributions,
},
);
useEffect(() => {
if (distributionError) {
notifications.show({
color: 'red',
title: 'Error loading filter distribution',
message: distributionError?.message,
autoClose: 5000,
});
setShowDistributions(false);
}
}, [distributionError]);
const totalFiltersSize =
selectedValues.included.size + selectedValues.excluded.size;
@ -292,6 +383,13 @@ export const FilterGroup = ({
if (aExcluded && !bExcluded) return -1;
if (!aExcluded && bExcluded) return 1;
// Then sort by estimated percentage of rows with this value, if available
const aPercentage = distributionData?.get(a.value) ?? 0;
const bPercentage = distributionData?.get(b.value) ?? 0;
if (aPercentage !== bPercentage) {
return bPercentage - aPercentage;
}
// Finally sort alphabetically/numerically
return a.value.localeCompare(b.value, undefined, { numeric: true });
});
@ -310,6 +408,7 @@ export const FilterGroup = ({
augmentedOptions,
selectedValues,
totalFiltersSize,
distributionData,
]);
// Simple highlight animation when checkbox is checked
@ -402,6 +501,22 @@ export const FilterGroup = ({
</Tooltip>
</Accordion.Control>
<Group gap="xxxs" wrap="nowrap">
<ActionIcon
size="xs"
variant="subtle"
color="gray"
onClick={toggleShowDistributions}
title={
showDistributions ? 'Hide distribution' : 'Show distribution'
}
data-testid={`toggle-distribution-button-${name}`}
aria-checked={showDistributions}
role="checkbox"
>
<i
className={`bi ${isFetchingDistribution ? 'spinner-border spinner-border-sm' : showDistributions ? 'bi-bar-chart-line-fill' : 'bi-bar-chart-line'}`}
/>
</ActionIcon>
{onFieldPinClick && (
<ActionIcon
size="xs"
@ -409,6 +524,7 @@ export const FilterGroup = ({
color="gray"
onClick={onFieldPinClick}
title={isFieldPinned ? 'Unpin field' : 'Pin field'}
me={'4px'}
>
<i
className={`bi bi-pin-angle${isFieldPinned ? '-fill' : ''}`}
@ -452,6 +568,12 @@ export const FilterGroup = ({
onClickOnly={() => onOnlyClick(option.value)}
onClickExclude={() => onExcludeClick(option.value)}
onClickPin={() => onPinClick(option.value)}
isPercentageLoading={isFetchingDistribution}
percentage={
showDistributions && distributionData
? (distributionData.get(option.value) ?? 0)
: undefined
}
/>
))}
{optionsLoading ? (
@ -900,6 +1022,8 @@ const DBSearchPageFiltersComponent = ({
(filterState[facet.key].included.size > 0 ||
filterState[facet.key].excluded.size > 0))
}
chartConfig={chartConfig}
isLive={isLive}
/>
))}

View file

@ -1,12 +1,21 @@
import { UseQueryOptions, UseQueryResult } from '@tanstack/react-query';
import { screen, within } from '@testing-library/react';
import userEvent from '@testing-library/user-event';
import { useGetValuesDistribution } from '@/hooks/useMetadata';
import {
cleanedFacetName,
FilterGroup,
type FilterGroupProps,
} from '../DBSearchPageFilters';
// Replace the metadata hooks module with a stub. By default the stubbed
// useGetValuesDistribution reports no data, no in-flight fetch and no error;
// individual tests override the return value via jest.mocked(...).
jest.mock('@/hooks/useMetadata', () => ({
useGetValuesDistribution: jest
.fn()
.mockReturnValue({ data: undefined, isFetching: false, error: undefined }),
}));
describe('cleanedFacetName', () => {
describe('basic functionality', () => {
it('should return non-toString strings unchanged', () => {
@ -202,12 +211,24 @@ describe('FilterGroup', () => {
loadMoreLoading: false,
hasLoadedMore: false,
isDefaultExpanded: true,
chartConfig: {
from: {
databaseName: 'test_db',
tableName: 'test_table',
},
select: '',
where: '',
whereLanguage: 'sql',
timestampValueExpression: '',
connection: 'test_connection',
dateRange: [new Date('2024-01-01'), new Date('2024-01-02')],
},
};
it('should sort options alphabetically by default', () => {
renderWithMantine(<FilterGroup {...defaultProps} />);
const options = screen.getAllByRole('checkbox');
const options = screen.getAllByTestId(/filter-checkbox-input/g);
expect(options).toHaveLength(3);
const labels = screen.getAllByText(/apple|banana|zebra/);
expect(labels[0]).toHaveTextContent('apple');
@ -226,7 +247,7 @@ describe('FilterGroup', () => {
/>,
);
const options = screen.getAllByRole('checkbox');
const options = screen.getAllByTestId(/filter-checkbox-input/g);
expect(options).toHaveLength(3);
const labels = screen.getAllByText(/apple|banana|zebra/);
expect(labels[0]).toHaveTextContent('apple');
@ -234,6 +255,68 @@ describe('FilterGroup', () => {
expect(labels[2]).toHaveTextContent('banana');
});
it('should show selected items first, then sort by counts, if percentages when they are enabled', () => {
  // The distribution toggle is never clicked here: ordering is driven by
  // whatever data the (mocked) hook returns.
  jest.mocked(useGetValuesDistribution).mockReturnValue({
    data: new Map([
      ['apple', 30],
      ['banana', 20],
      ['zebra', 50],
    ]),
    isFetching: false,
    error: null,
  } as UseQueryResult<Map<string, number>>);

  renderWithMantine(
    <FilterGroup
      {...defaultProps}
      selectedValues={{
        included: new Set(['banana']),
        excluded: new Set(),
      }}
    />,
  );

  // No `g` flag: a global RegExp keeps `lastIndex` between `.test()` calls,
  // which makes it unsuitable as a Testing Library matcher.
  const options = screen.getAllByTestId(/filter-checkbox-input/);
  expect(options).toHaveLength(3);

  const labels = screen.getAllByText(/apple|banana|zebra/);
  expect(labels[0]).toHaveTextContent('banana'); // Selected
  expect(labels[1]).toHaveTextContent('zebra'); // 50%
  expect(labels[2]).toHaveTextContent('apple'); // 30%
});
it('should show percentages, if enabled', async () => {
  // 'banana' is deliberately absent from the distribution so that the
  // fallback label for values missing from the sample is exercised.
  jest.mocked(useGetValuesDistribution).mockReturnValue({
    data: new Map([
      ['apple', 99.2],
      ['zebra', 0.6],
    ]),
    isFetching: false,
    error: null,
  } as UseQueryResult<Map<string, number>>);

  renderWithMantine(
    <FilterGroup
      {...defaultProps}
      selectedValues={{
        included: new Set(),
        excluded: new Set(),
      }}
    />,
  );

  // Percentages are only rendered once the distribution toggle is switched on.
  const showPercentages = screen.getByTestId(
    'toggle-distribution-button-Test Filter',
  );
  await userEvent.click(showPercentages);

  // No `g` flag: a global RegExp keeps `lastIndex` between `.test()` calls,
  // which makes it unsuitable as a Testing Library matcher.
  const options = screen.getAllByTestId(/filter-checkbox-input/);
  expect(options).toHaveLength(3);

  const labels = screen.getAllByText(/%/);
  expect(labels[0]).toHaveTextContent('~99%'); // apple
  expect(labels[1]).toHaveTextContent('<1%'); // zebra
  expect(labels[2]).toHaveTextContent('<1%'); // banana
});
it('should handle excluded items', () => {
renderWithMantine(
<FilterGroup
@ -245,7 +328,7 @@ describe('FilterGroup', () => {
/>,
);
const options = screen.getAllByRole('checkbox');
const options = screen.getAllByTestId(/filter-checkbox-input/g);
expect(options).toHaveLength(3);
const labels = screen.getAllByText(/apple|banana|zebra/);
expect(labels[0]).toHaveTextContent('apple'); // included first
@ -276,7 +359,7 @@ describe('FilterGroup', () => {
);
// Should show MAX_FILTER_GROUP_ITEMS (10) by default
let options = screen.getAllByRole('checkbox');
let options = screen.getAllByTestId(/filter-checkbox-input/g);
expect(options).toHaveLength(10);
// Selected items should be visible even if they would be beyond MAX_FILTER_GROUP_ITEMS
@ -289,7 +372,7 @@ describe('FilterGroup', () => {
await userEvent.click(showMoreButton);
// Should show all items
options = screen.getAllByRole('checkbox');
options = screen.getAllByTestId(/filter-checkbox-input/g);
expect(options).toHaveLength(15);
});

View file

@ -210,6 +210,36 @@ export function useMultipleGetKeyValues(
});
}
/**
 * React Query hook that loads the distribution of values for `key` by calling
 * `metadata.getValuesDistribution`.
 *
 * @param params.chartConfig - Chart config (including date range) passed to the metadata call.
 * @param params.key - Expression whose values are counted; the query is disabled by default when empty.
 * @param params.limit - Maximum number of distinct values to fetch.
 * @param options - Extra query options. They are spread last, so they override
 *   the defaults set here (including `enabled`).
 * @returns The query result; `data` is a map from value to percentage.
 */
export function useGetValuesDistribution(
  {
    chartConfig,
    key,
    limit,
  }: {
    chartConfig: ChartConfigWithDateRange;
    key: string;
    limit: number;
  },
  options?: Omit<UseQueryOptions<Map<string, number>, Error>, 'queryKey'>,
) {
  const metadata = useMetadataWithSettings();
  return useQuery<Map<string, number>>({
    // `limit` determines which values are returned, so it must be part of the
    // key; otherwise calls with different limits would share one cache entry.
    queryKey: ['useMetadata.useGetValuesDistribution', chartConfig, key, limit],
    queryFn: () =>
      metadata.getValuesDistribution({
        chartConfig,
        key,
        limit,
      }),
    staleTime: Infinity,
    enabled: !!key,
    placeholderData: keepPreviousData,
    retry: false,
    ...options,
  });
}
export function useGetKeyValues(
{
chartConfig,

View file

@ -59,7 +59,7 @@
.filterCheckbox {
width: 100%;
display: grid;
grid-template-columns: 1fr 20px;
grid-template-columns: 1fr 13px;
padding: 2px 6px;
justify-content: space-between;
align-items: center;
@ -81,12 +81,12 @@
backdrop-filter: blur(4px);
border-radius: 4px;
align-items: center;
padding: 0 8px;
padding: 0 4px;
gap: 4px;
background-color: $slate-950;
.textButton {
padding: 2px 6px;
padding: 2px 4px;
border-radius: 3px;
&:hover {

View file

@ -422,4 +422,139 @@ describe('Metadata', () => {
);
});
});
// Tests for Metadata.getValuesDistribution: result parsing, and that the
// caller's CTEs, where clause and filters are forwarded to renderChartConfig.
describe('getValuesDistribution', () => {
// Minimal chart config shared by every test in this suite.
const mockChartConfig: ChartConfigWithDateRange = {
from: {
databaseName: 'test_db',
tableName: 'test_table',
},
select: '',
where: '',
whereLanguage: 'sql',
timestampValueExpression: '',
connection: 'test_connection',
dateRange: [new Date('2024-01-01'), new Date('2024-01-02')],
};
// The mocked client returns percentages as strings; the method under test
// is expected to convert them to numbers.
beforeEach(() => {
(mockClickhouseClient.query as jest.Mock).mockResolvedValue({
json: () =>
Promise.resolve({
data: [
{
__hdx_value: 'info',
__hdx_percentage: '85.9',
},
{
__hdx_value: 'debug',
__hdx_percentage: '3.0',
},
{
__hdx_value: 'warn',
__hdx_percentage: '6.5',
},
{
__hdx_value: 'error',
__hdx_percentage: '4.1',
},
],
}),
});
});
it('should fetch and return values distribution for severity', async () => {
const result = await metadata.getValuesDistribution({
chartConfig: mockChartConfig,
key: 'severity',
});
expect(result).toEqual(
new Map([
['info', Number(85.9)],
['debug', Number(3.0)],
['warn', Number(6.5)],
['error', Number(4.1)],
]),
);
});
it('should include alias CTEs when provided in the config', async () => {
const configWithAliases = {
...mockChartConfig,
with: [
{
name: 'service',
sql: {
sql: 'ServiceName',
params: {},
},
},
{
name: 'severity',
sql: {
sql: 'SeverityText',
params: {},
},
},
],
where: "severity = 'info'",
};
const renderChartConfigSpy = jest.spyOn(
renderChartConfigModule,
'renderChartConfig',
);
await metadata.getValuesDistribution({
chartConfig: configWithAliases,
key: 'severity',
});
// First argument of the first recorded renderChartConfig call.
// NOTE(review): relies on the spy's call history being empty when this test
// starts - confirm mocks are cleared/restored between tests.
const actualConfig = renderChartConfigSpy.mock.calls[0][0];
expect(actualConfig.with).toContainEqual({
name: 'service',
sql: {
sql: 'ServiceName',
params: {},
},
});
expect(actualConfig.with).toContainEqual({
name: 'severity',
sql: {
sql: 'SeverityText',
params: {},
},
});
expect(actualConfig.where).toBe("severity = 'info'");
});
it('should include filters from the config in the query', async () => {
const configWithFilters: ChartConfigWithDateRange = {
...mockChartConfig,
filters: [
{
type: 'sql',
condition: "ServiceName IN ('clickhouse')",
},
],
};
const renderChartConfigSpy = jest.spyOn(
renderChartConfigModule,
'renderChartConfig',
);
await metadata.getValuesDistribution({
chartConfig: configWithFilters,
key: 'severity',
});
// First argument of the first recorded renderChartConfig call.
// NOTE(review): same assumption as above about the spy's call history.
const actualConfig = renderChartConfigSpy.mock.calls[0][0];
expect(actualConfig.filters).toContainEqual({
type: 'sql',
condition: "ServiceName IN ('clickhouse')",
});
});
});
});

View file

@ -1,4 +1,5 @@
import type { ClickHouseSettings } from '@clickhouse/client-common';
import { omit, pick } from 'lodash';
import {
BaseClickhouseClient,
@ -570,6 +571,87 @@ export class Metadata {
return tableMetadata;
}
/**
 * Estimates what percentage of rows hold each of the most common values of
 * `key`, based on a random sample of the rows matched by `chartConfig`.
 *
 * @param chartConfig - Source, date range, filters, where clause and CTEs that scope the rows.
 * @param key - SQL expression whose values are grouped and counted.
 * @param samples - Target number of sampled rows; used to derive the sampling factor.
 * @param limit - Maximum number of distinct values returned (the most common ones).
 * @returns A map from value to its percentage (0-100) within the sample.
 */
async getValuesDistribution({
  chartConfig,
  key,
  samples = 100_000,
  limit = 100,
}: {
  chartConfig: ChartConfigWithDateRange;
  key: string;
  samples?: number;
  limit?: number;
}) {
  const cacheKeyConfig = pick(chartConfig, [
    'connection',
    'from',
    'dateRange',
    'filters',
    'where',
    'with',
  ]);

  // `samples` and `limit` both shape the result, so they are part of the cache
  // key; otherwise a call with different values could be served a distribution
  // that was computed with other settings.
  const cacheKey = `${JSON.stringify(cacheKeyConfig)}.${key}.${samples}.${limit}.valuesDistribution`;

  return this.cache.getOrFetch(cacheKey, async () => {
    const config: ChartConfigWithDateRange = {
      ...chartConfig,
      with: [
        ...(chartConfig.with || []),
        // Add CTE to get total row count and sample factor
        {
          name: 'tableStats',
          chartConfig: {
            ...omit(chartConfig, ['with', 'groupBy', 'orderBy', 'limit']),
            select: `count() as total, greatest(CAST(total / ${samples} AS UInt32), 1) as sample_factor`,
          },
        },
      ],
      // Add sampling condition as a filter. The query will still read all rows to evaluate
      // the sampling condition, but will only read values column from selected rows.
      filters: [
        ...(chartConfig.filters || []),
        {
          type: 'sql',
          condition: `cityHash64(${chartConfig.timestampValueExpression}, rand()) % (SELECT sample_factor FROM tableStats) = 0`,
        },
      ],
      select: `${key} AS __hdx_value, count() as __hdx_count, __hdx_count / (sum(__hdx_count) OVER ()) * 100 AS __hdx_percentage`,
      orderBy: '__hdx_percentage DESC',
      groupBy: `__hdx_value`,
      limit: { limit },
    };

    const sql = await renderChartConfig(config, this);

    const resultSet = await this.clickhouseClient.query<'JSON'>({
      query: sql.sql,
      query_params: sql.params,
      connectionId: chartConfig.connection,
      clickhouse_settings: {
        ...this.getClickHouseSettings(),
        // Set max_rows_to_group_by to avoid using too much memory when grouping on high cardinality key columns
        max_rows_to_group_by: `${limit * 10}`,
        group_by_overflow_mode: 'any',
      },
    });

    const json = await resultSet.json<{
      __hdx_value: string;
      __hdx_percentage: string | number;
    }>();

    // Percentages may arrive as strings; normalise them to numbers.
    return new Map(
      json.data.map(({ __hdx_value, __hdx_percentage }) => [
        __hdx_value,
        Number(__hdx_percentage),
      ]),
    );
  });
}
async getKeyValues({
chartConfig,
keys,