hyperdx/packages/common-utils/src/__tests__/metadata.int.test.ts

678 lines
22 KiB
TypeScript
Raw Normal View History

import { createClient } from '@clickhouse/client';
import { ClickHouseClient } from '@clickhouse/client-common';
import { ClickhouseClient as HdxClickhouseClient } from '@/clickhouse/node';
import { Metadata, MetadataCache } from '@/core/metadata';
import { ChartConfigWithDateRange, SourceKind, TSource } from '@/types';
describe('Metadata Integration Tests', () => {
let client: ClickHouseClient;
let hdxClient: HdxClickhouseClient;
const source: TSource = {
id: 'test-source',
name: 'Test',
kind: SourceKind.Log,
connection: 'conn-1',
from: { databaseName: 'default', tableName: 'logs' },
timestampValueExpression: 'Timestamp',
defaultTableSelectExpression: '*',
querySettings: [
{ setting: 'optimize_read_in_order', value: '0' },
{ setting: 'cast_keep_nullable', value: '0' },
],
};
beforeAll(() => {
const host = process.env.CLICKHOUSE_HOST || 'http://localhost:8123';
const username = process.env.CLICKHOUSE_USER || 'default';
const password = process.env.CLICKHOUSE_PASSWORD || '';
client = createClient({
url: host,
username,
password,
});
hdxClient = new HdxClickhouseClient({
host,
username,
password,
});
});
describe('getKeyValues', () => {
let metadata: Metadata;
const chartConfig: ChartConfigWithDateRange = {
connection: 'test_connection',
from: {
databaseName: 'default',
tableName: 'test_table',
},
dateRange: [new Date('2023-01-01'), new Date('2025-01-01')],
select: 'col1, col2',
timestampValueExpression: 'Timestamp',
where: '',
};
beforeAll(async () => {
await client.command({
query: `CREATE OR REPLACE TABLE default.test_table (
Timestamp DateTime64(9) CODEC(Delta(8), ZSTD(1)),
SeverityText LowCardinality(String) CODEC(ZSTD(1)),
TraceId String,
LogAttributes JSON CODEC(ZSTD(1)),
ResourceAttributes Map(LowCardinality(String), String) CODEC(ZSTD(1)),
\`__hdx_materialized_k8s.pod.name\` String MATERIALIZED ResourceAttributes['k8s.pod.name'] CODEC(ZSTD(1)),
)
ENGINE = MergeTree()
ORDER BY (Timestamp)
`,
});
await client.command({
query: `INSERT INTO default.test_table (Timestamp, SeverityText, TraceId, ResourceAttributes, LogAttributes) VALUES
('2023-06-01 12:00:00', 'info', '1o2udn120d8n', { 'k8s.pod.name': 'pod1', 'env': 'prod' },'{"action":"ping"}'),
('2024-06-01 12:00:00', 'error', '67890-09098', { 'k8s.pod.name': 'pod2', 'env': 'prod' },'{}'),
('2024-06-01 12:00:00', 'info', '11h9238re1h92', { 'env': 'staging' },'{"user":"john"}'),
('2024-06-01 12:00:00', 'warning', '1o2udn120d8n', { 'k8s.pod.name': 'pod1', 'env': 'prod' }, '{"user":"jack","action":"login"}'),
('2024-06-01 12:00:00', '', '1o2udn120d8n', { 'env': 'prod' }, '{"user":"jane","action":"login"}')
`,
});
});
beforeEach(async () => {
metadata = new Metadata(hdxClient, new MetadataCache());
});
afterAll(async () => {
await client.command({
query: 'DROP TABLE IF EXISTS default.test_table',
});
await client.close();
});
describe.each([true, false])('with disableRowLimit=%s', disableRowLimit => {
it('should return key-value pairs for a given metadata key', async () => {
const resultSeverityText = await metadata.getKeyValues({
chartConfig,
keys: ['SeverityText'],
disableRowLimit,
source,
});
expect(resultSeverityText).toHaveLength(1);
expect(resultSeverityText[0].key).toBe('SeverityText');
expect(resultSeverityText[0].value).toHaveLength(3);
expect(resultSeverityText[0].value).toEqual(
expect.arrayContaining(['info', 'error', 'warning']),
);
const resultTraceId = await metadata.getKeyValues({
chartConfig,
keys: ['TraceId'],
disableRowLimit,
source,
});
expect(resultTraceId).toHaveLength(1);
expect(resultTraceId[0].key).toBe('TraceId');
expect(resultTraceId[0].value).toHaveLength(3);
expect(resultTraceId[0].value).toEqual(
expect.arrayContaining([
'1o2udn120d8n',
'67890-09098',
'11h9238re1h92',
]),
);
const resultBoth = await metadata.getKeyValues({
chartConfig,
keys: ['TraceId', 'SeverityText'],
disableRowLimit,
source,
});
expect(resultBoth).toEqual([
{
key: 'TraceId',
value: expect.arrayContaining([
'1o2udn120d8n',
'67890-09098',
'11h9238re1h92',
]),
},
{
key: 'SeverityText',
value: expect.arrayContaining(['info', 'error', 'warning']),
},
]);
});
it('should handle materialized columns correctly', async () => {
const resultPodName = await metadata.getKeyValues({
chartConfig,
keys: ['__hdx_materialized_k8s.pod.name'],
disableRowLimit,
source,
});
expect(resultPodName).toHaveLength(1);
expect(resultPodName[0].key).toBe('__hdx_materialized_k8s.pod.name');
expect(resultPodName[0].value).toHaveLength(2);
expect(resultPodName[0].value).toEqual(
expect.arrayContaining(['pod1', 'pod2']),
);
});
it('should handle JSON columns correctly', async () => {
const resultLogAttributes = await metadata.getKeyValues({
chartConfig,
keys: ['LogAttributes.user'],
disableRowLimit,
source,
});
expect(resultLogAttributes).toHaveLength(1);
expect(resultLogAttributes[0].key).toBe('LogAttributes.user');
expect(resultLogAttributes[0].value).toHaveLength(3);
expect(resultLogAttributes[0].value).toEqual(
expect.arrayContaining(['john', 'jack', 'jane']),
);
});
it('should return an empty list when no keys are provided', async () => {
const resultEmpty = await metadata.getKeyValues({
chartConfig,
keys: [],
source,
});
expect(resultEmpty).toEqual([]);
});
it('should correctly limit the number of returned values', async () => {
const resultLimited = await metadata.getKeyValues({
chartConfig,
keys: ['SeverityText'],
limit: 2,
source,
});
expect(resultLimited).toHaveLength(1);
expect(resultLimited[0].key).toBe('SeverityText');
expect(resultLimited[0].value).toHaveLength(2);
expect(
resultLimited[0].value.every(
v =>
typeof v === 'string' && ['info', 'error', 'warning'].includes(v),
),
).toBeTruthy();
});
});
});
perf: Query filter values from MVs (#1591) Closes HDX-3066 # Summary This PR improves the performance of Search and Dashboard filters by querying available filter values from materialized views, when possible. The existing `useMultipleGetKeyValues` has been updated to make use of `getKeyValuesWithMVs`, which works as follows: 1. Identify which materialized views support each of the requested keys. Keys must be `dimensionColumns` in the materialized view, the materialized view must support the provided date range, and the materialized view must support the provided filters (determined by running an EXPLAIN query). 2. Split the keys into groups based on which Materialized view can provide their values. Query values for each group using the existing `getKeyValues` function. Sampling is disabled because it is assumed that MVs are small enough to be queried without sampling. 3. Query any keys which are not supported by any materialized view from the base table. To reduce the number of EXPLAIN queries required to support this, and to generally decrease the number of concurrent requests for filters, Dashboard filter value queries are now batched by source. Values for each batch are then queried using `getKeyValuesWithMVs` (described above). Other fixes: 1. I've also updated the various filter functions and hooks to support abort signals, so that filter queries are canceled when a query value is no longer needed. 2. The getKeyValues cache key now includes `where` and `filters`, so that the filter values correctly update when new filters or where conditions are added on the search page.
2026-01-14 18:05:11 +00:00
describe('getKeyValuesWithMVs', () => {
let metadata: Metadata;
const baseTableName = 'test_logs_base';
const mvTableName = 'test_logs_mv_1m';
const chartConfig: ChartConfigWithDateRange = {
connection: 'test_connection',
from: {
databaseName: 'default',
tableName: baseTableName,
},
dateRange: [new Date('2024-01-01'), new Date('2024-01-31')],
select: '',
timestampValueExpression: 'Timestamp',
where: '',
};
beforeAll(async () => {
// Create base table
await client.command({
query: `CREATE OR REPLACE TABLE default.${baseTableName} (
Timestamp DateTime64(9) CODEC(Delta(8), ZSTD(1)),
environment LowCardinality(String) CODEC(ZSTD(1)),
service LowCardinality(String) CODEC(ZSTD(1)),
status_code LowCardinality(String) CODEC(ZSTD(1)),
region LowCardinality(String) CODEC(ZSTD(1)),
message String CODEC(ZSTD(1))
)
ENGINE = MergeTree()
ORDER BY (Timestamp)
`,
});
// Create materialized view
await client.command({
query: `CREATE MATERIALIZED VIEW IF NOT EXISTS default.${mvTableName}
ENGINE = SummingMergeTree()
ORDER BY (environment, service, status_code, Timestamp)
AS SELECT
toStartOfMinute(Timestamp) as Timestamp,
environment,
service,
status_code,
count() as count
FROM default.${baseTableName}
GROUP BY Timestamp, environment, service, status_code
`,
});
// Insert data again to populate the MV (MVs don't get historical data)
await client.command({
query: `INSERT INTO default.${baseTableName}
(Timestamp, environment, service, status_code, region, message) VALUES
('2024-01-10 12:00:00', 'production', 'api', '200', 'us-east', 'Success'),
('2024-01-10 13:00:00', 'production', 'web', '200', 'us-west', 'Success'),
('2024-01-10 14:00:00', 'staging', 'api', '500', 'us-east', 'Error'),
('2024-01-10 15:00:00', 'staging', 'worker', '200', 'eu-west', 'Success'),
('2024-01-10 16:00:00', 'production', 'api', '404', 'us-east', 'Not found')
`,
});
});
beforeEach(async () => {
metadata = new Metadata(hdxClient, new MetadataCache());
});
afterAll(async () => {
await client.command({
query: `DROP VIEW IF EXISTS default.${mvTableName}`,
});
await client.command({
query: `DROP TABLE IF EXISTS default.${baseTableName}`,
});
});
it('should fetch key values using materialized views when available', async () => {
const source = {
id: 'test-source',
name: 'Test Logs',
kind: 'otel-logs',
from: { databaseName: 'default', tableName: baseTableName },
timestampValueExpression: 'Timestamp',
connection: 'test_connection',
materializedViews: [
{
databaseName: 'default',
tableName: mvTableName,
dimensionColumns: 'environment, service, status_code',
minGranularity: '1 minute',
timestampColumn: 'Timestamp',
aggregatedColumns: [{ aggFn: 'count', mvColumn: 'count' }],
},
],
};
const result = await metadata.getKeyValuesWithMVs({
chartConfig,
keys: ['environment', 'service', 'status_code'],
source: source as any,
});
expect(result).toHaveLength(3);
const environmentResult = result.find(r => r.key === 'environment');
expect(environmentResult?.value).toEqual(
expect.arrayContaining(['production', 'staging']),
);
const serviceResult = result.find(r => r.key === 'service');
expect(serviceResult?.value).toEqual(
expect.arrayContaining(['api', 'web', 'worker']),
);
const statusCodeResult = result.find(r => r.key === 'status_code');
expect(statusCodeResult?.value).toEqual(
expect.arrayContaining(['200', '404', '500']),
);
});
it('should fall back to base table for keys not in materialized view', async () => {
const source = {
id: 'test-source',
name: 'Test Logs',
kind: 'otel-logs',
from: { databaseName: 'default', tableName: baseTableName },
timestampValueExpression: 'Timestamp',
connection: 'test_connection',
materializedViews: [
{
databaseName: 'default',
tableName: mvTableName,
dimensionColumns: 'environment, service, status_code',
minGranularity: '1 minute',
timestampColumn: 'Timestamp',
aggregatedColumns: [{ aggFn: 'count', mvColumn: 'count' }],
},
],
};
// Query for keys both in and not in the MV
const result = await metadata.getKeyValuesWithMVs({
chartConfig,
keys: ['environment', 'region'], // 'region' is NOT in the MV
source: source as any,
});
expect(result).toHaveLength(2);
const environmentResult = result.find(r => r.key === 'environment');
expect(environmentResult?.value).toEqual(
expect.arrayContaining(['production', 'staging']),
);
const regionResult = result.find(r => r.key === 'region');
expect(regionResult?.value).toEqual(
expect.arrayContaining(['us-east', 'us-west', 'eu-west']),
);
});
it('should work without materialized views (fall back to base table)', async () => {
const source = {
id: 'test-source',
name: 'Test Logs',
kind: 'otel-logs',
from: { databaseName: 'default', tableName: baseTableName },
timestampValueExpression: 'Timestamp',
connection: 'test_connection',
materializedViews: [], // No MVs
};
const result = await metadata.getKeyValuesWithMVs({
chartConfig,
keys: ['environment', 'service'],
source: source as any,
});
expect(result).toHaveLength(2);
const environmentResult = result.find(r => r.key === 'environment');
expect(environmentResult?.value).toEqual(
expect.arrayContaining(['production', 'staging']),
);
const serviceResult = result.find(r => r.key === 'service');
expect(serviceResult?.value).toEqual(
expect.arrayContaining(['api', 'web', 'worker']),
);
});
it('should work with an undefined source parameter (fall back to base table)', async () => {
perf: Query filter values from MVs (#1591) Closes HDX-3066 # Summary This PR improves the performance of Search and Dashboard filters by querying available filter values from materialized views, when possible. The existing `useMultipleGetKeyValues` has been updated to make use of `getKeyValuesWithMVs`, which works as follows: 1. Identify which materialized views support each of the requested keys. Keys must be `dimensionColumns` in the materialized view, the materialized view must support the provided date range, and the materialized view must support the provided filters (determined by running an EXPLAIN query). 2. Split the keys into groups based on which Materialized view can provide their values. Query values for each group using the existing `getKeyValues` function. Sampling is disabled because it is assumed that MVs are small enough to be queried without sampling. 3. Query any keys which are not supported by any materialized view from the base table. To reduce the number of EXPLAIN queries required to support this, and to generally decrease the number of concurrent requests for filters, Dashboard filter value queries are now batched by source. Values for each batch are then queried using `getKeyValuesWithMVs` (described above). Other fixes: 1. I've also updated the various filter functions and hooks to support abort signals, so that filter queries are canceled when a query value is no longer needed. 2. The getKeyValues cache key now includes `where` and `filters`, so that the filter values correctly update when new filters or where conditions are added on the search page.
2026-01-14 18:05:11 +00:00
const result = await metadata.getKeyValuesWithMVs({
chartConfig,
keys: ['environment', 'service'],
// No source parameter
source: undefined,
perf: Query filter values from MVs (#1591) Closes HDX-3066 # Summary This PR improves the performance of Search and Dashboard filters by querying available filter values from materialized views, when possible. The existing `useMultipleGetKeyValues` has been updated to make use of `getKeyValuesWithMVs`, which works as follows: 1. Identify which materialized views support each of the requested keys. Keys must be `dimensionColumns` in the materialized view, the materialized view must support the provided date range, and the materialized view must support the provided filters (determined by running an EXPLAIN query). 2. Split the keys into groups based on which Materialized view can provide their values. Query values for each group using the existing `getKeyValues` function. Sampling is disabled because it is assumed that MVs are small enough to be queried without sampling. 3. Query any keys which are not supported by any materialized view from the base table. To reduce the number of EXPLAIN queries required to support this, and to generally decrease the number of concurrent requests for filters, Dashboard filter value queries are now batched by source. Values for each batch are then queried using `getKeyValuesWithMVs` (described above). Other fixes: 1. I've also updated the various filter functions and hooks to support abort signals, so that filter queries are canceled when a query value is no longer needed. 2. The getKeyValues cache key now includes `where` and `filters`, so that the filter values correctly update when new filters or where conditions are added on the search page.
2026-01-14 18:05:11 +00:00
});
expect(result).toHaveLength(2);
const environmentResult = result.find(r => r.key === 'environment');
expect(environmentResult?.value).toEqual(
expect.arrayContaining(['production', 'staging']),
);
const serviceResult = result.find(r => r.key === 'service');
expect(serviceResult?.value).toEqual(
expect.arrayContaining(['api', 'web', 'worker']),
);
});
it('should return empty array for empty keys', async () => {
const source = {
id: 'test-source',
name: 'Test Logs',
kind: 'otel-logs',
from: { databaseName: 'default', tableName: baseTableName },
timestampValueExpression: 'Timestamp',
connection: 'test_connection',
materializedViews: [
{
databaseName: 'default',
tableName: mvTableName,
dimensionColumns: 'environment, service, status_code',
minGranularity: '1 minute',
timestampColumn: 'Timestamp',
aggregatedColumns: [{ aggFn: 'count', mvColumn: 'count' }],
},
],
};
const result = await metadata.getKeyValuesWithMVs({
chartConfig,
keys: [],
source: source as any,
});
expect(result).toEqual([]);
});
it('should respect limit parameter', async () => {
const source = {
id: 'test-source',
name: 'Test Logs',
kind: 'otel-logs',
from: { databaseName: 'default', tableName: baseTableName },
timestampValueExpression: 'Timestamp',
connection: 'test_connection',
materializedViews: [
{
databaseName: 'default',
tableName: mvTableName,
dimensionColumns: 'environment, service, status_code',
minGranularity: '1 minute',
timestampColumn: 'Timestamp',
aggregatedColumns: [{ aggFn: 'count', mvColumn: 'count' }],
},
],
};
const result = await metadata.getKeyValuesWithMVs({
chartConfig,
keys: ['service'],
source: source as any,
limit: 2,
});
expect(result).toHaveLength(1);
expect(result[0].key).toBe('service');
expect(result[0].value.length).toBeLessThanOrEqual(2);
});
it('should work with disableRowLimit: true', async () => {
const source = {
id: 'test-source',
name: 'Test Logs',
kind: 'otel-logs',
from: { databaseName: 'default', tableName: baseTableName },
timestampValueExpression: 'Timestamp',
connection: 'test_connection',
materializedViews: [
{
databaseName: 'default',
tableName: mvTableName,
dimensionColumns: 'environment, service, status_code',
minGranularity: '1 minute',
timestampColumn: 'Timestamp',
aggregatedColumns: [{ aggFn: 'count', mvColumn: 'count' }],
},
],
};
// Should work with disableRowLimit: true (no row limits applied)
const result = await metadata.getKeyValuesWithMVs({
chartConfig,
keys: ['environment', 'service', 'status_code'],
source: source as any,
disableRowLimit: true,
});
expect(result).toHaveLength(3);
const environmentResult = result.find(r => r.key === 'environment');
expect(environmentResult?.value).toEqual(
expect.arrayContaining(['production', 'staging']),
);
const serviceResult = result.find(r => r.key === 'service');
expect(serviceResult?.value).toEqual(
expect.arrayContaining(['api', 'web', 'worker']),
);
const statusCodeResult = result.find(r => r.key === 'status_code');
expect(statusCodeResult?.value).toEqual(
expect.arrayContaining(['200', '404', '500']),
);
});
});
feat: Support fetching table metadata for Distributed tables (#1920) ## Summary This PR updates the `getTableMetadata` and `getSkipIndices` functions to handle distributed tables by looking up primary keys and indexes (respectively) from the underlying local table (since the distributed table does not have them). - Source config inference works again - The default order by optimization (adding `toStartOfXX()` to the search page order by when it's present in the primary key) now correctly applies when querying a distributed table source - The date range filter now correctly filters on both `toStartOfXX(TimestampTime)` and `TimestampTime` when `toStartOfXX(TimestampTime)` is present in the primary key of the local table. - Source schema preview now shows both the distributed table and the local table, when the source is defined by a distributed table. - Text indexes are now detected correctly for distributed tables ### Screenshots or video https://github.com/user-attachments/assets/d1c60964-99f0-4470-9378-a812f963c692 When text index is present, hasAllTokens is used: <img width="848" height="139" alt="Screenshot 2026-03-16 at 10 55 24 AM" src="https://github.com/user-attachments/assets/2bd780dc-291d-495f-bd12-c636988648c1" /> ### How to test locally or on Vercel <details> <summary>Testing locally, you'll need to create a distributed logs table with a local table that has a timestamp optimization:</summary> ```sql CREATE TABLE default.otel_logs_toStartOf on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), INDEX idx_trace_id TraceId TYPE bloom_filter(0.001) GRANULARITY 1, INDEX idx_res_attr_key mapKeys(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_res_attr_value mapValues(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_key mapKeys(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_value mapValues(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_value mapValues(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_body Body TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8 ) ENGINE = MergeTree PARTITION BY toDate(TimestampTime) PRIMARY KEY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime) ORDER BY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime, Timestamp) TTL TimestampTime + toIntervalDay(30) SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; CREATE TABLE default.otel_logs_toStartOf_distributed on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)) ) ENGINE = Distributed('hdx_cluster', 'default', 'otel_logs_toStartOf', rand()); ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster ADD INDEX text_idx(Body) TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body)) SETTINGS enable_full_text_index=1; ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster MATERIALIZE INDEX text_idx; ``` </details> <details> <summary>To test text index detection, first enable full text indexes locally in your users.xml file</summary> ```xml <clickhouse> <profiles> <default> ... <enable_full_text_index>1</enable_full_text_index> </default> </profiles> ... <clickhouse> ``` </details> ### References - Linear Issue: Closes HDX-3703 - Related PRs:
2026-03-17 14:35:08 +00:00
describe('Distributed tables support', () => {
let metadata: Metadata;
const localTableName = 'test_dist_local';
const distributedTableName = 'test_dist';
beforeAll(async () => {
await client.command({
query: `CREATE OR REPLACE TABLE default.${localTableName} (
Timestamp DateTime64(9) CODEC(Delta(8), ZSTD(1)),
ServiceName LowCardinality(String) CODEC(ZSTD(1)),
Body String CODEC(ZSTD(1)),
ResourceAttributes Map(LowCardinality(String), String) CODEC(ZSTD(1)),
INDEX idx_body Body TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8,
INDEX idx_ts Timestamp TYPE minmax GRANULARITY 1
)
ENGINE = MergeTree()
PARTITION BY toDate(Timestamp)
ORDER BY (ServiceName, Timestamp)
`,
});
await client.command({
query: `CREATE OR REPLACE TABLE default.${distributedTableName}
AS default.${localTableName}
ENGINE = Distributed('hdx_cluster', 'default', '${localTableName}', rand())
`,
});
});
beforeEach(() => {
metadata = new Metadata(hdxClient, new MetadataCache());
});
afterAll(async () => {
await client.command({
query: `DROP TABLE IF EXISTS default.${distributedTableName}`,
});
await client.command({
query: `DROP TABLE IF EXISTS default.${localTableName}`,
});
});
it('should return local table keys for a distributed table', async () => {
const result = await metadata.getTableMetadata({
databaseName: 'default',
tableName: distributedTableName,
connectionId: 'test_connection',
});
// Should have the distributed table's create_table_query
expect(result!.create_table_query).toContain(distributedTableName);
expect(result!.create_table_query).toContain('Distributed');
feat: Support fetching table metadata for Distributed tables (#1920) ## Summary This PR updates the `getTableMetadata` and `getSkipIndices` functions to handle distributed tables by looking up primary keys and indexes (respectively) from the underlying local table (since the distributed table does not have them). - Source config inference works again - The default order by optimization (adding `toStartOfXX()` to the search page order by when it's present in the primary key) now correctly applies when querying a distributed table source - The date range filter now correctly filters on both `toStartOfXX(TimestampTime)` and `TimestampTime` when `toStartOfXX(TimestampTime)` is present in the primary key of the local table. - Source schema preview now shows both the distributed table and the local table, when the source is defined by a distributed table. - Text indexes are now detected correctly for distributed tables ### Screenshots or video https://github.com/user-attachments/assets/d1c60964-99f0-4470-9378-a812f963c692 When text index is present, hasAllTokens is used: <img width="848" height="139" alt="Screenshot 2026-03-16 at 10 55 24 AM" src="https://github.com/user-attachments/assets/2bd780dc-291d-495f-bd12-c636988648c1" /> ### How to test locally or on Vercel <details> <summary>Testing locally, you'll need to create a distributed logs table with a local table that has a timestamp optimization:</summary> ```sql CREATE TABLE default.otel_logs_toStartOf on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), INDEX idx_trace_id TraceId TYPE bloom_filter(0.001) GRANULARITY 1, INDEX idx_res_attr_key mapKeys(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_res_attr_value mapValues(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_key mapKeys(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_value mapValues(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_value mapValues(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_body Body TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8 ) ENGINE = MergeTree PARTITION BY toDate(TimestampTime) PRIMARY KEY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime) ORDER BY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime, Timestamp) TTL TimestampTime + toIntervalDay(30) SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; CREATE TABLE default.otel_logs_toStartOf_distributed on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)) ) ENGINE = Distributed('hdx_cluster', 'default', 'otel_logs_toStartOf', rand()); ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster ADD INDEX text_idx(Body) TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body)) SETTINGS enable_full_text_index=1; ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster MATERIALIZE INDEX text_idx; ``` </details> <details> <summary>To test text index detection, first enable full text indexes locally in your users.xml file</summary> ```xml <clickhouse> <profiles> <default> ... <enable_full_text_index>1</enable_full_text_index> </default> </profiles> ... <clickhouse> ``` </details> ### References - Linear Issue: Closes HDX-3703 - Related PRs:
2026-03-17 14:35:08 +00:00
// Should have the local table's create_table_query
expect(result!.create_local_table_query).toContain(localTableName);
expect(result!.create_local_table_query).toContain('MergeTree');
feat: Support fetching table metadata for Distributed tables (#1920) ## Summary This PR updates the `getTableMetadata` and `getSkipIndices` functions to handle distributed tables by looking up primary keys and indexes (respectively) from the underlying local table (since the distributed table does not have them). - Source config inference works again - The default order by optimization (adding `toStartOfXX()` to the search page order by when it's present in the primary key) now correctly applies when querying a distributed table source - The date range filter now correctly filters on both `toStartOfXX(TimestampTime)` and `TimestampTime` when `toStartOfXX(TimestampTime)` is present in the primary key of the local table. - Source schema preview now shows both the distributed table and the local table, when the source is defined by a distributed table. - Text indexes are now detected correctly for distributed tables ### Screenshots or video https://github.com/user-attachments/assets/d1c60964-99f0-4470-9378-a812f963c692 When text index is present, hasAllTokens is used: <img width="848" height="139" alt="Screenshot 2026-03-16 at 10 55 24 AM" src="https://github.com/user-attachments/assets/2bd780dc-291d-495f-bd12-c636988648c1" /> ### How to test locally or on Vercel <details> <summary>Testing locally, you'll need to create a distributed logs table with a local table that has a timestamp optimization:</summary> ```sql CREATE TABLE default.otel_logs_toStartOf on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), INDEX idx_trace_id TraceId TYPE bloom_filter(0.001) GRANULARITY 1, INDEX idx_res_attr_key mapKeys(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_res_attr_value mapValues(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_key mapKeys(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_value mapValues(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_value mapValues(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_body Body TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8 ) ENGINE = MergeTree PARTITION BY toDate(TimestampTime) PRIMARY KEY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime) ORDER BY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime, Timestamp) TTL TimestampTime + toIntervalDay(30) SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; CREATE TABLE default.otel_logs_toStartOf_distributed on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)) ) ENGINE = Distributed('hdx_cluster', 'default', 'otel_logs_toStartOf', rand()); ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster ADD INDEX text_idx(Body) TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body)) SETTINGS enable_full_text_index=1; ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster MATERIALIZE INDEX text_idx; ``` </details> <details> <summary>To test text index detection, first enable full text indexes locally in your users.xml file</summary> ```xml <clickhouse> <profiles> <default> ... <enable_full_text_index>1</enable_full_text_index> </default> </profiles> ... <clickhouse> ``` </details> ### References - Linear Issue: Closes HDX-3703 - Related PRs:
2026-03-17 14:35:08 +00:00
// Keys should come from the local table
expect(result!.primary_key).toBe('ServiceName, Timestamp');
expect(result!.sorting_key).toBe('ServiceName, Timestamp');
expect(result!.partition_key).toBe('toDate(Timestamp)');
feat: Support fetching table metadata for Distributed tables (#1920) ## Summary This PR updates the `getTableMetadata` and `getSkipIndices` functions to handle distributed tables by looking up primary keys and indexes (respectively) from the underlying local table (since the distributed table does not have them). - Source config inference works again - The default order by optimization (adding `toStartOfXX()` to the search page order by when it's present in the primary key) now correctly applies when querying a distributed table source - The date range filter now correctly filters on both `toStartOfXX(TimestampTime)` and `TimestampTime` when `toStartOfXX(TimestampTime)` is present in the primary key of the local table. - Source schema preview now shows both the distributed table and the local table, when the source is defined by a distributed table. - Text indexes are now detected correctly for distributed tables ### Screenshots or video https://github.com/user-attachments/assets/d1c60964-99f0-4470-9378-a812f963c692 When text index is present, hasAllTokens is used: <img width="848" height="139" alt="Screenshot 2026-03-16 at 10 55 24 AM" src="https://github.com/user-attachments/assets/2bd780dc-291d-495f-bd12-c636988648c1" /> ### How to test locally or on Vercel <details> <summary>Testing locally, you'll need to create a distributed logs table with a local table that has a timestamp optimization:</summary> ```sql CREATE TABLE default.otel_logs_toStartOf on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), INDEX idx_trace_id TraceId TYPE bloom_filter(0.001) GRANULARITY 1, INDEX idx_res_attr_key mapKeys(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_res_attr_value mapValues(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_key mapKeys(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_value mapValues(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_value mapValues(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_body Body TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8 ) ENGINE = MergeTree PARTITION BY toDate(TimestampTime) PRIMARY KEY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime) ORDER BY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime, Timestamp) TTL TimestampTime + toIntervalDay(30) SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; CREATE TABLE default.otel_logs_toStartOf_distributed on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)) ) ENGINE = Distributed('hdx_cluster', 'default', 'otel_logs_toStartOf', rand()); ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster ADD INDEX text_idx(Body) TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body)) SETTINGS enable_full_text_index=1; ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster MATERIALIZE INDEX text_idx; ``` </details> <details> <summary>To test text index detection, first enable full text indexes locally in your users.xml file</summary> ```xml <clickhouse> <profiles> <default> ... <enable_full_text_index>1</enable_full_text_index> </default> </profiles> ... <clickhouse> ``` </details> ### References - Linear Issue: Closes HDX-3703 - Related PRs:
2026-03-17 14:35:08 +00:00
// Engine should be overridden with local table's engine
expect(result!.engine).toBe('MergeTree');
feat: Support fetching table metadata for Distributed tables (#1920) ## Summary This PR updates the `getTableMetadata` and `getSkipIndices` functions to handle distributed tables by looking up primary keys and indexes (respectively) from the underlying local table (since the distributed table does not have them). - Source config inference works again - The default order by optimization (adding `toStartOfXX()` to the search page order by when it's present in the primary key) now correctly applies when querying a distributed table source - The date range filter now correctly filters on both `toStartOfXX(TimestampTime)` and `TimestampTime` when `toStartOfXX(TimestampTime)` is present in the primary key of the local table. - Source schema preview now shows both the distributed table and the local table, when the source is defined by a distributed table. - Text indexes are now detected correctly for distributed tables ### Screenshots or video https://github.com/user-attachments/assets/d1c60964-99f0-4470-9378-a812f963c692 When text index is present, hasAllTokens is used: <img width="848" height="139" alt="Screenshot 2026-03-16 at 10 55 24 AM" src="https://github.com/user-attachments/assets/2bd780dc-291d-495f-bd12-c636988648c1" /> ### How to test locally or on Vercel <details> <summary>Testing locally, you'll need to create a distributed logs table with a local table that has a timestamp optimization:</summary> ```sql CREATE TABLE default.otel_logs_toStartOf on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), INDEX idx_trace_id TraceId TYPE bloom_filter(0.001) GRANULARITY 1, INDEX idx_res_attr_key mapKeys(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_res_attr_value mapValues(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_key mapKeys(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_value mapValues(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_value mapValues(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_body Body TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8 ) ENGINE = MergeTree PARTITION BY toDate(TimestampTime) PRIMARY KEY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime) ORDER BY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime, Timestamp) TTL TimestampTime + toIntervalDay(30) SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; CREATE TABLE default.otel_logs_toStartOf_distributed on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)) ) ENGINE = Distributed('hdx_cluster', 'default', 'otel_logs_toStartOf', rand()); ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster ADD INDEX text_idx(Body) TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body)) SETTINGS enable_full_text_index=1; ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster MATERIALIZE INDEX text_idx; ``` </details> <details> <summary>To test text index detection, first enable full text indexes locally in your users.xml file</summary> ```xml <clickhouse> <profiles> <default> ... <enable_full_text_index>1</enable_full_text_index> </default> </profiles> ... <clickhouse> ``` </details> ### References - Linear Issue: Closes HDX-3703 - Related PRs:
2026-03-17 14:35:08 +00:00
});
it('should not set create_local_table_query for non-distributed tables', async () => {
const result = await metadata.getTableMetadata({
databaseName: 'default',
tableName: localTableName,
connectionId: 'test_connection',
});
expect(result!.create_local_table_query).toBeUndefined();
expect(result!.engine).toBe('MergeTree');
expect(result!.primary_key).toBe('ServiceName, Timestamp');
feat: Support fetching table metadata for Distributed tables (#1920) ## Summary This PR updates the `getTableMetadata` and `getSkipIndices` functions to handle distributed tables by looking up primary keys and indexes (respectively) from the underlying local table (since the distributed table does not have them). - Source config inference works again - The default order by optimization (adding `toStartOfXX()` to the search page order by when it's present in the primary key) now correctly applies when querying a distributed table source - The date range filter now correctly filters on both `toStartOfXX(TimestampTime)` and `TimestampTime` when `toStartOfXX(TimestampTime)` is present in the primary key of the local table. - Source schema preview now shows both the distributed table and the local table, when the source is defined by a distributed table. - Text indexes are now detected correctly for distributed tables ### Screenshots or video https://github.com/user-attachments/assets/d1c60964-99f0-4470-9378-a812f963c692 When text index is present, hasAllTokens is used: <img width="848" height="139" alt="Screenshot 2026-03-16 at 10 55 24 AM" src="https://github.com/user-attachments/assets/2bd780dc-291d-495f-bd12-c636988648c1" /> ### How to test locally or on Vercel <details> <summary>Testing locally, you'll need to create a distributed logs table with a local table that has a timestamp optimization:</summary> ```sql CREATE TABLE default.otel_logs_toStartOf on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), INDEX idx_trace_id TraceId TYPE bloom_filter(0.001) GRANULARITY 1, INDEX idx_res_attr_key mapKeys(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_res_attr_value mapValues(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_key mapKeys(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_scope_attr_value mapValues(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_log_attr_value mapValues(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_body Body TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8 ) ENGINE = MergeTree PARTITION BY toDate(TimestampTime) PRIMARY KEY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime) ORDER BY (toStartOfMinute(TimestampTime), ServiceName, TimestampTime, Timestamp) TTL TimestampTime + toIntervalDay(30) SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; CREATE TABLE default.otel_logs_toStartOf_distributed on cluster hdx_cluster ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TimestampTime` DateTime DEFAULT toDateTime(Timestamp), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), `TraceFlags` UInt8, `SeverityText` LowCardinality(String) CODEC(ZSTD(1)), `SeverityNumber` UInt8, `ServiceName` LowCardinality(String) CODEC(ZSTD(1)), `Body` String CODEC(ZSTD(1)), `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)), `ScopeName` String CODEC(ZSTD(1)), `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)), `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)), `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)) ) ENGINE = Distributed('hdx_cluster', 'default', 'otel_logs_toStartOf', rand()); ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster ADD INDEX text_idx(Body) TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body)) SETTINGS enable_full_text_index=1; ALTER TABLE otel_logs_toStartOf ON CLUSTER hdx_cluster MATERIALIZE INDEX text_idx; ``` </details> <details> <summary>To test text index detection, first enable full text indexes locally in your users.xml file</summary> ```xml <clickhouse> <profiles> <default> ... <enable_full_text_index>1</enable_full_text_index> </default> </profiles> ... <clickhouse> ``` </details> ### References - Linear Issue: Closes HDX-3703 - Related PRs:
2026-03-17 14:35:08 +00:00
});
it('should return skip indices from the local table when querying a distributed table', async () => {
const result = await metadata.getSkipIndices({
databaseName: 'default',
tableName: distributedTableName,
connectionId: 'test_connection',
});
expect(result).toHaveLength(2);
const bodyIndex = result.find(idx => idx.name === 'idx_body');
expect(bodyIndex).toBeDefined();
expect(bodyIndex!.type).toBe('tokenbf_v1');
expect(bodyIndex!.expression).toBe('Body');
expect(bodyIndex!.granularity).toBe(8);
const tsIndex = result.find(idx => idx.name === 'idx_ts');
expect(tsIndex).toBeDefined();
expect(tsIndex!.type).toBe('minmax');
expect(tsIndex!.granularity).toBe(1);
});
it('should return skip indices directly for a non-distributed table', async () => {
const result = await metadata.getSkipIndices({
databaseName: 'default',
tableName: localTableName,
connectionId: 'test_connection',
});
expect(result).toHaveLength(2);
expect(result.map(idx => idx.name)).toEqual(
expect.arrayContaining(['idx_body', 'idx_ts']),
);
});
});
feat: Add hasAllTokens for text index support (#1637) Closes HDX-3245 # Summary This PR updates the Lucene to SQL compilation process to generate conditions using `hasAllTokens` when the target column has a text index defined. `hasAllTokens` has a couple of limitations which are solved for: 1. The `needle` argument must be no more than 64 tokens, or `hasAllTokens` will error. To support search terms with more than 64 tokens, terms are first broken up into batches of 50 tokens, each batch is passed to a separate `hasAllTokens` call. When multiple `hasAllTokens` calls are used, we also use substring matching `lower(Body) LIKE '%term with many tokens...%'`. 2. `hasAllTokens` may only be used when `enable_full_text_index = 1`. The existence of a text index does not guarantee that `enable_full_text_index = 1`, since the text index could have been created with a query that explicitly specified `SETTINGS enable_full_text_index = 1`. We cannot set this option in every query HyperDX makes, because the setting was not available prior to v25.12. To solve for this, we check the value of `enable_full_text_index` in `system.settings`, and only use `hasAllTokens` if the setting exists and is enabled. ## Testing Setup ### Enable Full Text Index First, make sure you're running at least ClickHouse 25.12. Then, update the ClickHouse `users.xml`'s default profile with the following (or otherwise update your user's profile): ```xml <clickhouse> <profiles> <default> ... <enable_full_text_index>1</enable_full_text_index> </default> </profiles> ... <clickhouse> ``` ### Add a Full Text Index ```sql ALTER TABLE otel_logs ADD INDEX text_idx(Body) TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body)) SETTINGS enable_full_text_index=1; ALTER TABLE otel_logs MATERIALIZE INDEX text_idx; ``` ## Limitations 1. We currently only support the `splitByNonAlpha` tokenizer. If the text index is created with a different tokenizer, `hasAllTokens` will not be used. If needed, this limitation can be removed in the future by implementing `tokenizeTerm`, `termContainsSeparators`, and token batching logic specific to the other tokenizers. 2. This requires the latest (Beta) version of the full text index and related setting, available in ClickHouse v25.12.
2026-01-22 19:50:08 +00:00
describe('getSetting', () => {
let metadata: Metadata;
beforeEach(async () => {
metadata = new Metadata(hdxClient, new MetadataCache());
});
it('should get setting that exists and is enabled', async () => {
const settingValue = await metadata.getSetting({
settingName: 'format_csv_allow_single_quotes',
connectionId: 'test_connection',
});
expect(settingValue).toBe('0');
});
it('should get setting that exists and is disabled', async () => {
const settingValue = await metadata.getSetting({
settingName: 'format_csv_allow_double_quotes',
connectionId: 'test_connection',
});
expect(settingValue).toBe('1');
});
it('should return undefined for setting that does not exist', async () => {
const settingValue = await metadata.getSetting({
settingName: 'enable_quantum_tunnelling',
connectionId: 'test_connection',
});
expect(settingValue).toBeUndefined();
});
});
});