perf: Improve getKeyValues query performance for JSON keys (#1284)

Closes HDX-2623

# Summary

This change improves the performance of `getKeyValues` when getting values of a JSON key. 

Generally, columns that are not referenced outside of a CTE will be pruned by the query planner. For JSON however, if the outer select references one field in a JSON column, then the inner select will read (it seems) the entire JSON object.

This PR also adds integration tests for `getKeyValues` to ensure that the function generates queries that work as expected in ClickHouse.

##  Performance impact (on single JSON Dashboard Filter)

- Original: 15.03s

<img width="584" height="71" alt="Screenshot 2025-10-21 at 3 28 07 PM" src="https://github.com/user-attachments/assets/184de198-cee1-4b1d-beed-ec4465d3e248" />

- Optimized: 0.443s

<img width="590" height="61" alt="Screenshot 2025-10-21 at 3 25 47 PM" src="https://github.com/user-attachments/assets/690d0ef0-15b8-47c5-9a7e-8b8f6b8f5e92" />
This commit is contained in:
Drew Davis 2025-10-27 17:46:46 +01:00 committed by GitHub
parent 93edb6f84f
commit 8190ee8f6a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 255 additions and 89 deletions

View file

@ -0,0 +1,5 @@
---
"@hyperdx/common-utils": patch
---
perf: Improve getKeyValues query performance for JSON keys

View file

@ -45,10 +45,16 @@ dev-int:
npx nx run @hyperdx/api:dev:int $(FILE)
docker compose -p int -f ./docker-compose.ci.yml down
.PHONY: dev-int-common-utils
dev-int-common-utils:
docker compose -p int -f ./docker-compose.ci.yml up -d
npx nx run @hyperdx/common-utils:dev:int $(FILE)
docker compose -p int -f ./docker-compose.ci.yml down
.PHONY: ci-int
ci-int:
docker compose -p int -f ./docker-compose.ci.yml up -d
npx nx run @hyperdx/api:ci:int
npx nx run-many -t ci:int --parallel=false
docker compose -p int -f ./docker-compose.ci.yml down
.PHONY: dev-unit

View file

@ -0,0 +1,4 @@
CLICKHOUSE_HOST=http://localhost:8123
CLICKHOUSE_PASSWORD=
CLICKHOUSE_USER=default
NODE_ENV=test

View file

@ -5,6 +5,7 @@ module.exports = {
verbose: true,
rootDir: './src',
testMatch: ['**/__tests__/*.test.ts?(x)'],
testPathIgnorePatterns: ['.*\\.int\\.test\\.ts$'],
testTimeout: 30000,
moduleNameMapper: {
'@/(.*)$': '<rootDir>/$1',

View file

@ -0,0 +1,13 @@
/** @type {import('ts-jest/dist/types').InitialOptionsTsJest} */
module.exports = {
setupFiles: ['dotenv/config'],
preset: 'ts-jest',
testEnvironment: 'node',
verbose: true,
rootDir: './src',
testMatch: ['**/__tests__/*.int.test.ts?(x)'],
testTimeout: 30000,
moduleNameMapper: {
'@/(.*)$': '<rootDir>/$1',
},
};

View file

@ -36,6 +36,7 @@
"@types/sqlstring": "^2.3.0",
"@types/supertest": "^2.0.12",
"@types/uuid": "^8.3.4",
"dotenv": "^17.2.3",
"jest": "^28.1.1",
"nodemon": "^2.0.20",
"rimraf": "^4.4.1",
@ -56,6 +57,8 @@
"lint:fix": "npx eslint . --ext .ts --fix",
"ci:lint": "yarn lint && yarn tsc --noEmit",
"ci:unit": "jest --runInBand --ci --forceExit --coverage",
"dev:unit": "jest --watchAll --runInBand --detectOpenHandles"
"dev:unit": "jest --watchAll --runInBand --detectOpenHandles",
"ci:int": "DOTENV_CONFIG_PATH=.env.test jest --config jest.int.config.js --runInBand --ci --forceExit --coverage",
"dev:int": "DOTENV_CONFIG_PATH=.env.test jest --config jest.int.config.js --watchAll --runInBand --detectOpenHandles"
}
}

View file

@ -0,0 +1,193 @@
import { createClient } from '@clickhouse/client';
import { ClickHouseClient } from '@clickhouse/client-common';
import { ClickhouseClient as HdxClickhouseClient } from '@/clickhouse/node';
import { Metadata, MetadataCache } from '@/metadata';
import { ChartConfigWithDateRange } from '@/types';
describe('Metadata Integration Tests', () => {
let client: ClickHouseClient;
let hdxClient: HdxClickhouseClient;
beforeAll(() => {
const host = process.env.CLICKHOUSE_HOST || 'http://localhost:8123';
const username = process.env.CLICKHOUSE_USER || 'default';
const password = process.env.CLICKHOUSE_PASSWORD || '';
client = createClient({
url: host,
username,
password,
});
hdxClient = new HdxClickhouseClient({
host,
username,
password,
});
});
describe('getKeyValues', () => {
let metadata: Metadata;
const chartConfig: ChartConfigWithDateRange = {
connection: 'test_connection',
from: {
databaseName: 'default',
tableName: 'test_table',
},
dateRange: [new Date('2023-01-01'), new Date('2025-01-01')],
select: 'col1, col2',
timestampValueExpression: 'Timestamp',
where: '',
};
beforeAll(async () => {
await client.command({
query: `CREATE OR REPLACE TABLE default.test_table (
Timestamp DateTime64(9) CODEC(Delta(8), ZSTD(1)),
SeverityText LowCardinality(String) CODEC(ZSTD(1)),
TraceId String,
LogAttributes JSON CODEC(ZSTD(1)),
ResourceAttributes Map(LowCardinality(String), String) CODEC(ZSTD(1)),
\`__hdx_materialized_k8s.pod.name\` String MATERIALIZED ResourceAttributes['k8s.pod.name'] CODEC(ZSTD(1)),
)
ENGINE = MergeTree()
ORDER BY (Timestamp)
`,
});
await client.command({
query: `INSERT INTO default.test_table (Timestamp, SeverityText, TraceId, ResourceAttributes, LogAttributes) VALUES
('2023-06-01 12:00:00', 'info', '1o2udn120d8n', { 'k8s.pod.name': 'pod1', 'env': 'prod' },'{"action":"ping"}'),
('2024-06-01 12:00:00', 'error', '67890-09098', { 'k8s.pod.name': 'pod2', 'env': 'prod' },'{}'),
('2024-06-01 12:00:00', 'info', '11h9238re1h92', { 'env': 'staging' },'{"user":"john"}'),
('2024-06-01 12:00:00', 'warning', '1o2udn120d8n', { 'k8s.pod.name': 'pod1', 'env': 'prod' }, '{"user":"jack","action":"login"}'),
('2024-06-01 12:00:00', '', '1o2udn120d8n', { 'env': 'prod' }, '{"user":"jane","action":"login"}')
`,
});
});
beforeEach(async () => {
metadata = new Metadata(hdxClient, new MetadataCache());
});
afterAll(async () => {
await client.command({
query: 'DROP TABLE IF EXISTS default.test_table',
});
await client.close();
});
describe.each([true, false])('with disableRowLimit=%s', disableRowLimit => {
it('should return key-value pairs for a given metadata key', async () => {
const resultSeverityText = await metadata.getKeyValues({
chartConfig,
keys: ['SeverityText'],
disableRowLimit,
});
expect(resultSeverityText).toHaveLength(1);
expect(resultSeverityText[0].key).toBe('SeverityText');
expect(resultSeverityText[0].value).toHaveLength(3);
expect(resultSeverityText[0].value).toEqual(
expect.arrayContaining(['info', 'error', 'warning']),
);
const resultTraceId = await metadata.getKeyValues({
chartConfig,
keys: ['TraceId'],
disableRowLimit,
});
expect(resultTraceId).toHaveLength(1);
expect(resultTraceId[0].key).toBe('TraceId');
expect(resultTraceId[0].value).toHaveLength(3);
expect(resultTraceId[0].value).toEqual(
expect.arrayContaining([
'1o2udn120d8n',
'67890-09098',
'11h9238re1h92',
]),
);
const resultBoth = await metadata.getKeyValues({
chartConfig,
keys: ['TraceId', 'SeverityText'],
disableRowLimit,
});
expect(resultBoth).toEqual([
{
key: 'TraceId',
value: expect.arrayContaining([
'1o2udn120d8n',
'67890-09098',
'11h9238re1h92',
]),
},
{
key: 'SeverityText',
value: expect.arrayContaining(['info', 'error', 'warning']),
},
]);
});
it('should handle materialized columns correctly', async () => {
const resultPodName = await metadata.getKeyValues({
chartConfig,
keys: ['__hdx_materialized_k8s.pod.name'],
disableRowLimit,
});
expect(resultPodName).toHaveLength(1);
expect(resultPodName[0].key).toBe('__hdx_materialized_k8s.pod.name');
expect(resultPodName[0].value).toHaveLength(2);
expect(resultPodName[0].value).toEqual(
expect.arrayContaining(['pod1', 'pod2']),
);
});
it('should handle JSON columns correctly', async () => {
const resultLogAttributes = await metadata.getKeyValues({
chartConfig,
keys: ['LogAttributes.user'],
disableRowLimit,
});
expect(resultLogAttributes).toHaveLength(1);
expect(resultLogAttributes[0].key).toBe('LogAttributes.user');
expect(resultLogAttributes[0].value).toHaveLength(3);
expect(resultLogAttributes[0].value).toEqual(
expect.arrayContaining(['john', 'jack', 'jane']),
);
});
it('should return an empty list when no keys are provided', async () => {
const resultEmpty = await metadata.getKeyValues({
chartConfig,
keys: [],
});
expect(resultEmpty).toEqual([]);
});
it('should correctly limit the number of returned values', async () => {
const resultLimited = await metadata.getKeyValues({
chartConfig,
keys: ['SeverityText'],
limit: 2,
});
expect(resultLimited).toHaveLength(1);
expect(resultLimited[0].key).toBe('SeverityText');
expect(resultLimited[0].value).toHaveLength(2);
expect(
resultLimited[0].value.every(v =>
['info', 'error', 'warning'].includes(v),
),
).toBeTruthy();
});
});
});
});

View file

@ -251,22 +251,6 @@ describe('Metadata', () => {
],
}),
});
// Mock getColumns to return columns including materialized ones
mockCache.getOrFetch.mockImplementation((key, queryFn) => {
if (key.includes('.columns')) {
return Promise.resolve([
{ name: 'regular_column', type: 'String', default_type: '' },
{
name: 'materialized_column',
type: 'String',
default_type: 'MATERIALIZED',
},
{ name: 'default_column', type: 'String', default_type: 'DEFAULT' },
]);
}
return queryFn();
});
});
it('should apply row limit when disableRowLimit is false', async () => {
@ -355,71 +339,20 @@ describe('Metadata', () => {
expect(result).toEqual([{ key: 'column1', value: ['value1', 'value2'] }]);
});
it('should include materialized fields when selecting all columns', async () => {
it('should return an empty list when no keys are provided', async () => {
const renderChartConfigSpy = jest.spyOn(
renderChartConfigModule,
'renderChartConfig',
);
await metadata.getKeyValues({
const results = await metadata.getKeyValues({
chartConfig: mockChartConfig,
keys: ['column1'],
keys: [],
limit: 10,
});
// Verify that renderChartConfig was called with the expanded select list
// that includes all columns by name (including materialized ones)
expect(renderChartConfigSpy).toHaveBeenCalledWith(
expect.objectContaining({
with: [
expect.objectContaining({
name: 'sampledData',
chartConfig: expect.objectContaining({
// Should expand to all column names instead of using '*'
select:
'`regular_column`, `materialized_column`, `default_column`',
}),
}),
],
}),
metadata,
);
});
it('should fallback to * when no columns are found', async () => {
// Mock getColumns to return empty array
mockCache.getOrFetch.mockImplementation((key, queryFn) => {
if (key.includes('.columns')) {
return Promise.resolve([]);
}
return queryFn();
});
const renderChartConfigSpy = jest.spyOn(
renderChartConfigModule,
'renderChartConfig',
);
await metadata.getKeyValues({
chartConfig: mockChartConfig,
keys: ['column1'],
limit: 10,
});
// Should fallback to '*' when no columns are found
expect(renderChartConfigSpy).toHaveBeenCalledWith(
expect.objectContaining({
with: [
expect.objectContaining({
name: 'sampledData',
chartConfig: expect.objectContaining({
select: '*',
}),
}),
],
}),
metadata,
);
expect(results).toEqual([]);
expect(renderChartConfigSpy).not.toHaveBeenCalled();
});
});

View file

@ -668,29 +668,24 @@ export class Metadata {
return this.cache.getOrFetch(
`${chartConfig.connection}.${chartConfig.from.databaseName}.${chartConfig.from.tableName}.${keys.join(',')}.${chartConfig.dateRange.toString()}.${disableRowLimit}.values`,
async () => {
const selectClause = keys
.map((k, i) => `groupUniqArray(${limit})(${k}) AS param${i}`)
.join(', ');
if (keys.length === 0) return [];
// When disableRowLimit is true, query directly without CTE
// Otherwise, use CTE with row limits for sampling
const sqlConfig = disableRowLimit
? {
...chartConfig,
select: selectClause,
select: keys
.map((k, i) => `groupUniqArray(${limit})(${k}) AS param${i}`)
.join(', '),
}
: await (async () => {
// Get all columns including materialized ones
const columns = await this.getColumns({
databaseName: chartConfig.from.databaseName,
tableName: chartConfig.from.tableName,
connectionId: chartConfig.connection,
});
// Build select expression that includes all columns by name
// This ensures materialized columns are included
// Build select expression that includes each of the given keys.
// This avoids selecting entire JSON columns, which is significantly slower
// than selecting just the JSON paths corresponding to the given keys.
// paramN aliases are used to avoid issues with special characters or complex expressions in keys.
const selectExpr =
columns.map(col => `\`${col.name}\``).join(', ') || '*';
keys.map((k, i) => `${k} as param${i}`).join(', ') || '*';
return {
with: [
@ -710,7 +705,12 @@ export class Metadata {
isSubquery: true,
},
],
select: selectClause,
select: keys
.map(
(_, i) =>
`groupUniqArray(${limit})(param${i}) AS param${i}`,
)
.join(', '),
connection: chartConfig.connection,
from: { databaseName: '', tableName: 'sampledData' },
where: '',

View file

@ -4535,6 +4535,7 @@ __metadata:
"@types/uuid": "npm:^8.3.4"
date-fns: "npm:^2.28.0"
date-fns-tz: "npm:^2.0.0"
dotenv: "npm:^17.2.3"
jest: "npm:^28.1.1"
lodash: "npm:^4.17.21"
node-sql-parser: "npm:^5.3.5"
@ -14690,6 +14691,13 @@ __metadata:
languageName: node
linkType: hard
"dotenv@npm:^17.2.3":
version: 17.2.3
resolution: "dotenv@npm:17.2.3"
checksum: 10c0/c884403209f713214a1b64d4d1defa4934c2aa5b0002f5a670ae298a51e3c3ad3ba79dfee2f8df49f01ae74290fcd9acdb1ab1d09c7bfb42b539036108bb2ba0
languageName: node
linkType: hard
"dunder-proto@npm:^1.0.1":
version: 1.0.1
resolution: "dunder-proto@npm:1.0.1"