diff --git a/.changeset/tough-swans-doubt.md b/.changeset/tough-swans-doubt.md new file mode 100644 index 00000000..a1207d58 --- /dev/null +++ b/.changeset/tough-swans-doubt.md @@ -0,0 +1,7 @@ +--- +"@hyperdx/common-utils": patch +"@hyperdx/api": patch +"@hyperdx/app": patch +--- + +feat: Add hasAllTokens for text index support diff --git a/packages/common-utils/src/__tests__/metadata.int.test.ts b/packages/common-utils/src/__tests__/metadata.int.test.ts index 3743d047..94c7892f 100644 --- a/packages/common-utils/src/__tests__/metadata.int.test.ts +++ b/packages/common-utils/src/__tests__/metadata.int.test.ts @@ -522,4 +522,35 @@ describe('Metadata Integration Tests', () => { ); }); }); + + describe('getSetting', () => { + let metadata: Metadata; + beforeEach(async () => { + metadata = new Metadata(hdxClient, new MetadataCache()); + }); + + it('should get setting that exists and is enabled', async () => { + const settingValue = await metadata.getSetting({ + settingName: 'format_csv_allow_single_quotes', + connectionId: 'test_connection', + }); + expect(settingValue).toBe('0'); + }); + + it('should get setting that exists and is disabled', async () => { + const settingValue = await metadata.getSetting({ + settingName: 'format_csv_allow_double_quotes', + connectionId: 'test_connection', + }); + expect(settingValue).toBe('1'); + }); + + it('should return undefined for setting that does not exist', async () => { + const settingValue = await metadata.getSetting({ + settingName: 'enable_quantum_tunnelling', + connectionId: 'test_connection', + }); + expect(settingValue).toBeUndefined(); + }); + }); }); diff --git a/packages/common-utils/src/__tests__/queryParser.test.ts b/packages/common-utils/src/__tests__/queryParser.test.ts index 60f6011a..765e9333 100644 --- a/packages/common-utils/src/__tests__/queryParser.test.ts +++ b/packages/common-utils/src/__tests__/queryParser.test.ts @@ -767,6 +767,343 @@ describe('CustomSchemaSQLSerializerV2 - bloom_filter tokens() 
indices', () => { }); }); +describe('CustomSchemaSQLSerializerV2 - text indices', () => { + const metadata = getMetadata( + new ClickhouseClient({ host: 'http://localhost:8123' }), + ); + + const databaseName = 'default'; + const tableName = 'otel_logs'; + const connectionId = 'test'; + + beforeEach(() => { + // Mock getColumn to return Body as String column + metadata.getColumn = jest.fn().mockImplementation(async ({ column }) => { + if (column === 'Body') { + return { name: 'Body', type: 'String' }; + } else if (column === 'ServiceName') { + return { name: 'ServiceName', type: 'String' }; + } + return undefined; + }); + + metadata.getSetting = jest + .fn() + .mockImplementation(async ({ settingName }) => { + if (settingName === 'enable_full_text_index') { + return '1'; + } + return undefined; + }); + }); + + it('should use hasAllTokens when text index exists', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + typeFull: 'text(tokenizer=splitByNonAlpha)', + expression: 'Body', + granularity: '8', + }, + ]); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder('foo', serializer); + const sql = await builder.build(); + + expect(sql).toBe("((hasAllTokens(Body, 'foo')))"); + }); + + it('should use hasAllTokens for multi-token terms with single call', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + typeFull: 'text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))', + expression: 'Body', + granularity: '8', + }, + ]); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder('"foo bar"', serializer); + const sql = await builder.build(); + + 
expect(sql).toContain("hasAllTokens(Body, 'foo bar')"); + expect(sql).toContain("(lower(Body) LIKE lower('%foo bar%'))"); + }); + + it('should fallback to hasToken when no text indexes are found', async () => { + // Mock getSkipIndices to return empty + metadata.getSkipIndices = jest.fn().mockResolvedValue([]); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder('foo', serializer); + const sql = await builder.build(); + + // Should use hasToken (existing behavior) + expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))"); + }); + + it('should handle text index on a different column', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + typeFull: 'text(tokenizer=splitByNonAlpha)', + expression: 'OtherBody', + granularity: '8', + }, + ]); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder('foo', serializer); + const sql = await builder.build(); + + // Should fallback to hasToken (index doesn't use tokens()) + expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))"); + }); + + it('should handle negated searches with hasAllTokens', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + typeFull: 'text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))', + expression: 'Body', + granularity: '8', + }, + ]); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder('-foo', serializer); + const sql = await builder.build(); + + // Should use NOT hasAllTokens + expect(sql).toBe("((NOT hasAllTokens(Body, 
'foo')))"); + }); + + it('should not use text index for explicit field searches', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + typeFull: 'text(tokenizer=splitByNonAlpha)', + expression: 'Body', + granularity: '8', + }, + ]); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + // Query: 'ServiceName:foo' + const builder = new SearchQueryBuilder('ServiceName:foo', serializer); + const sql = await builder.build(); + + // Should use ILIKE, not hasAll or hasToken + expect(sql).toContain('ILIKE'); + expect(sql).not.toContain('hasAll'); + expect(sql).not.toContain('hasToken'); + }); + + it('should batch tokens into groups to avoid hitting the hasAllTokens limit', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + typeFull: 'text(tokenizer=splitByNonAlpha)', + expression: 'Body', + granularity: '8', + }, + ]); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder( + '"1 2 3 4 5 6 7 8 9 10; 11 12 13 14 15 16 17 18 19 20; 21 22 23 24 25 26 27 28 29 30; 31 32 33 34 35 36 37 38 39 40; 41 42 43 44 45 46 47 48 49 50; 51 52 53 54 55 56 57 58 59 60;"', + serializer, + ); + const sql = await builder.build(); + + // Should generate separate hasAllTokens for each term (not single statement) + expect(sql).toContain( + "hasAllTokens(Body, '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50') AND hasAllTokens(Body, '51 52 53 54 55 56 57 58 59 60') AND (lower(Body) LIKE lower('%1 2 3 4 5 6 7 8 9 10; 11 12 13 14 15 16 17 18 19 20; 21 22 23 24 25 26 27 28 29 30; 31 32 33 34 35 36 37 38 39 40; 41 42 43 44 45 46 47 
48 49 50; 51 52 53 54 55 56 57 58 59 60;%'))", + ); + }); + + it('should use hasAllTokens for multiple separate terms', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + typeFull: 'text(tokenizer=splitByNonAlpha)', + expression: 'Body', + granularity: '8', + }, + ]); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder('foo NOT bar baz', serializer); + const sql = await builder.build(); + + // Should generate separate hasAllTokens for each term (not single statement) + expect(sql).toContain("hasAllTokens(Body, 'foo')"); + expect(sql).toContain("NOT (hasAllTokens(Body, 'bar'))"); + expect(sql).toContain("hasAllTokens(Body, 'baz')"); + }); + + it('should not use text index when enable_full_text_index is disabled', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + typeFull: 'text(tokenizer=splitByNonAlpha)', + expression: 'Body', + granularity: '8', + }, + ]); + + // Mock getSetting to disable full text index + metadata.getSetting = jest + .fn() + .mockImplementation(async ({ settingName }) => { + if (settingName === 'enable_full_text_index') { + return '0'; + } + return undefined; + }); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder('foo', serializer); + const sql = await builder.build(); + + // Should fallback to hasToken (full text index disabled) + expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))"); + }); + + it('should not use text index when enable_full_text_index is unavailable (ClickHouse version is old)', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + 
typeFull: 'text(tokenizer=splitByNonAlpha)', + expression: 'Body', + granularity: '8', + }, + ]); + + // Mock getSetting to disable full text index + metadata.getSetting = jest.fn().mockResolvedValue(undefined); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder('foo', serializer); + const sql = await builder.build(); + + // Should fallback to hasToken (full text index disabled) + expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))"); + }); + + it('should not use text index when getSetting throws an error', async () => { + metadata.getSkipIndices = jest.fn().mockResolvedValue([ + { + name: 'idx_body_text', + type: 'text', + typeFull: 'text(tokenizer=splitByNonAlpha)', + expression: 'Body', + granularity: '8', + }, + ]); + + // Mock getSetting to disable full text index + metadata.getSetting = jest + .fn() + .mockRejectedValue(new Error('Failed to get setting')); + + const serializer = new CustomSchemaSQLSerializerV2({ + metadata, + databaseName, + tableName, + connectionId, + implicitColumnExpression: 'Body', + }); + + const builder = new SearchQueryBuilder('foo', serializer); + const sql = await builder.build(); + + // Should fallback to hasToken (full text index disabled) + expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))"); + }); +}); + describe('CustomSchemaSQLSerializerV2 - indexCoversColumn', () => { const metadata = getMetadata( new ClickhouseClient({ host: 'http://localhost:8123' }), diff --git a/packages/common-utils/src/__tests__/renderChartConfig.test.ts b/packages/common-utils/src/__tests__/renderChartConfig.test.ts index a9988a62..fe6bf728 100644 --- a/packages/common-utils/src/__tests__/renderChartConfig.test.ts +++ b/packages/common-utils/src/__tests__/renderChartConfig.test.ts @@ -38,6 +38,7 @@ describe('renderChartConfig', () => { .fn() .mockResolvedValue({ primary_key: 'timestamp' }), 
getSkipIndices: jest.fn().mockResolvedValue([]), + getSetting: jest.fn().mockResolvedValue(undefined), } as unknown as jest.Mocked<Metadata>; }); diff --git a/packages/common-utils/src/__tests__/utils.test.ts b/packages/common-utils/src/__tests__/utils.test.ts index b26f997e..1dfd74f2 100644 --- a/packages/common-utils/src/__tests__/utils.test.ts +++ b/packages/common-utils/src/__tests__/utils.test.ts @@ -20,11 +20,13 @@ import { isTimestampExpressionInFirstOrderBy, joinQuerySettings, optimizeTimestampValueExpression, + parseTokenizerFromTextIndex, parseToNumber, parseToStartOfFunction, replaceJsonExpressions, splitAndTrimCSV, splitAndTrimWithBracket, + TextIndexTokenizer, } from '../core/utils'; describe('utils', () => { @@ -1557,4 +1559,117 @@ describe('utils', () => { ).toEqual("setting_name = 'Infinity'"); }); }); + describe('parseTokenizerFromTextIndex', () => { + it.each([ + { + type: 'text', + expected: undefined, + }, + { + type: 'text()', + expected: undefined, + }, + { + type: ' text ( tokenizer= array ) ', + expected: { type: 'array' }, + }, + { + type: 'text(tokenizer=splitByNonAlpha)', + expected: { type: 'splitByNonAlpha' }, + }, + { + type: 'text( tokenizer = splitByNonAlpha )', + expected: { type: 'splitByNonAlpha' }, + }, + { + type: 'text(tokenizer = splitByString())', + expected: { type: 'splitByString', separators: [' '] }, + }, + { + type: `text(tokenizer = splitByString([', ', '; ', '\\n', '" ', '\\\\', '\\t', '(', ')']))`, + expected: { + type: 'splitByString', + separators: [', ', '; ', '\n', '" ', '\\', '\t', '(', ')'], + }, + }, + { + type: 'text(preprocessor=lower(s), tokenizer=sparseGrams(2, 5, 10))', + expected: { + type: 'sparseGrams', + minLength: 2, + maxLength: 5, + minCutoffLength: 10, + }, + }, + { + type: 'text(preprocessor=lower(s), tokenizer=sparseGrams(2, 5))', + expected: { + type: 'sparseGrams', + minLength: 2, + maxLength: 5, + minCutoffLength: undefined, + }, + }, + { + type: 'text(preprocessor=lower(s), tokenizer=sparseGrams(2))', 
+ expected: { + type: 'sparseGrams', + minLength: 2, + maxLength: 10, + minCutoffLength: undefined, + }, + }, + { + type: 'text(preprocessor=lower(s), tokenizer=sparseGrams)', + expected: { + type: 'sparseGrams', + minLength: 3, + maxLength: 10, + minCutoffLength: undefined, + }, + }, + { + type: 'text(preprocessor=lower(s), tokenizer= sparseGrams ())', + expected: { + type: 'sparseGrams', + minLength: 3, + maxLength: 10, + minCutoffLength: undefined, + }, + }, + { + type: 'text(preprocessor=lower(s), tokenizer=unknown)', + expected: undefined, + }, + { + type: '', + expected: undefined, + }, + { + type: 'text(preprocessor=lower(s), tokenizer=array)', + expected: { type: 'array' }, + }, + { + type: 'text(preprocessor=lower(s), tokenizer=ngrams)', + expected: { type: 'ngrams', n: 3 }, + }, + { + type: 'text(tokenizer=ngrams())', + expected: { type: 'ngrams', n: 3 }, + }, + { + type: 'text(tokenizer=ngrams(20))', + expected: { type: 'ngrams', n: 20 }, + }, + ])('should correctly parse tokenizer from: $type', ({ type, expected }) => { + const result = parseTokenizerFromTextIndex({ + type: 'text', + typeFull: type, + name: 'text_idx', + expression: 'Body', + granularity: 1000, + }); + expect(result).toEqual(expected); + }); + }); }); diff --git a/packages/common-utils/src/core/metadata.ts b/packages/common-utils/src/core/metadata.ts index fbc119ec..6cb56014 100644 --- a/packages/common-utils/src/core/metadata.ts +++ b/packages/common-utils/src/core/metadata.ts @@ -103,6 +103,7 @@ export type TableMetadata = { export type SkipIndexMetadata = { name: string; type: string; // 'bloom_filter', 'tokenbf_v1', 'minmax', etc. + typeFull: string; // e.g., 'text(tokenizer='splitByNonAlpha')' expression: string; // e.g., "tokens(lower(Body))" granularity: number; }; @@ -616,6 +617,48 @@ export class Metadata { return tableMetadata; } + /** Reads the value of the setting with the given name from system.settings. 
*/ + async getSetting({ + settingName, + connectionId, + }: { + settingName: string; + connectionId: string; + }) { + return this.cache.getOrFetch(`${connectionId}.${settingName}`, async () => { + const sql = chSql` + SELECT name, value + FROM system.settings + WHERE name = ${{ String: settingName }} + `; + + try { + const json = await this.clickhouseClient + .query<'JSON'>({ + connectionId, + query: sql.sql, + query_params: sql.params, + clickhouse_settings: this.getClickHouseSettings(), + }) + .then(res => res.json<{ name: string; value: string }>()); + + if (json.data.length > 0) { + return json.data[0].value; + } + + return undefined; + } catch (e) { + // Don't retry permissions errors, just silently return undefined + if (e instanceof Error && e.message.includes('Not enough privileges')) { + console.warn('Not enough privileges to fetch settings:', e); + return undefined; + } + + throw e; + } + }); + } + /** * Queries system.data_skipping_indices to retrieve skip index metadata for a table. * Results are cached using MetadataCache. 
@@ -639,6 +682,7 @@ export class Metadata { SELECT name, type, + type_full as typeFull, expr as expression, granularity FROM system.data_skipping_indices diff --git a/packages/common-utils/src/core/utils.ts b/packages/common-utils/src/core/utils.ts index e0f9e99c..a03638de 100644 --- a/packages/common-utils/src/core/utils.ts +++ b/packages/common-utils/src/core/utils.ts @@ -19,6 +19,8 @@ import { TSourceUnion, } from '@/types'; +import { SkipIndexMetadata } from './metadata'; + /** The default maximum number of buckets setting when determining a bucket duration for 'auto' granularity */ export const DEFAULT_AUTO_GRANULARITY_MAX_BUCKETS = 60; @@ -747,3 +749,181 @@ export function joinQuerySettings( return formattedPairs.join(', '); } + +// A discriminated union type for different tokenizers above +export type TextIndexTokenizer = + | { type: 'splitByNonAlpha' } + | { type: 'splitByString'; separators: string[] } + | { type: 'ngrams'; n: number } + | { + type: 'sparseGrams'; + minLength: number; + maxLength: number; + minCutoffLength?: number; + } + | { type: 'array' }; + +/** + * Parses the tokenizer and any associated tokenizer parameters from a text index type definition. 
+ * + * Examples: + * - `text(tokenizer = splitByNonAlpha)` -> `{ type: 'splitByNonAlpha' }` + * - `text(tokenizer = splitByString([', ', '; ', '\n', '\\']))` -> `{ type: 'splitByString', separators: [', ', '; ', '\n', '\\'] }` + * - `text(preprocessor=lower(s), tokenizer=sparseGrams(2, 5, 10))` -> `{ type: 'sparseGrams', minLength: 2, maxLength: 5, minCutoffLength: 10 }` + */ +export function parseTokenizerFromTextIndex({ + typeFull, +}: SkipIndexMetadata): TextIndexTokenizer | undefined { + const textPattern = /^\s*text\s*\((.+)\)\s*$/; + const match = typeFull.match(textPattern); + if (!match) { + console.error(`Invalid text index type ${typeFull}.`); + return undefined; + } + + const argsString = match[1].trim(); + const args = splitAndTrimWithBracket(argsString).map(arg => { + const [key, value] = arg.split('=').map(s => s.trim()); + return { key, value }; + }); + + const tokenizerArg = args.find(arg => arg.key === 'tokenizer')?.value; + if (!tokenizerArg) { + console.error( + `Invalid tokenizer argument in index type ${typeFull}: ${tokenizerArg}`, + argsString, + splitAndTrimWithBracket(argsString), + ); + return undefined; + } + + const tokenizerName = tokenizerArg.split('(')[0].trim(); + const tokenizerArgsString = tokenizerArg + .substring(tokenizerArg.indexOf('(') + 1, tokenizerArg.lastIndexOf(')')) + .trim(); + + switch (tokenizerName) { + case 'splitByNonAlpha': + return { type: 'splitByNonAlpha' }; + + case 'array': + return { type: 'array' }; + + case 'ngrams': { + // Default n is 3 + if (!tokenizerArgsString) { + return { type: 'ngrams', n: 3 }; + } + + return { type: 'ngrams', n: Number.parseInt(tokenizerArgsString, 10) }; + } + + case 'sparseGrams': { + const args = tokenizerArgsString + .split(',') + .map(s => s.trim()) + .filter(s => !!s); + + const tokenizer: TextIndexTokenizer = { + type: 'sparseGrams', + minLength: 3, + maxLength: 10, + }; + + if (args.length >= 1) tokenizer.minLength = Number.parseInt(args[0], 10); + if (args.length >= 2) 
tokenizer.maxLength = Number.parseInt(args[1], 10); + if (args.length >= 3) + tokenizer.minCutoffLength = Number.parseInt(args[2], 10); + + return tokenizer; + } + + case 'splitByString': { + if (!tokenizerArgsString) { + // Default separator is space + return { type: 'splitByString', separators: [' '] }; + } + + const unescape = (str: string) => { + const escapeCharacters = [ + { pattern: /\\a/g, replacement: 'a' }, + { pattern: /\\b/g, replacement: 'b' }, + { pattern: /\\e/g, replacement: 'e' }, + { pattern: /\\f/g, replacement: '\f' }, + { pattern: /\\n/g, replacement: '\n' }, + { pattern: /\\r/g, replacement: '\r' }, + { pattern: /\\t/g, replacement: '\t' }, + { pattern: /\\v/g, replacement: '\v' }, + { pattern: /\\0/g, replacement: '\0' }, + { pattern: /\\\\/g, replacement: '\\' }, + { pattern: /\\'/g, replacement: "'" }, + { pattern: /\\"/g, replacement: '"' }, + { pattern: /\\`/g, replacement: '`' }, + { pattern: /\\\//g, replacement: '/' }, + { pattern: /\\=/g, replacement: '=' }, + ]; + + for (const { pattern, replacement } of escapeCharacters) { + str = str.replace(pattern, replacement); + } + + return str; + }; + + const separatorsString = tokenizerArgsString.match(/\[(.*)\]/); + if (!separatorsString) { + // If no array is provided, default to space + return { type: 'splitByString', separators: [' '] }; + } + + const arrayContent = separatorsString[1]; + + // Split by commas outside of quotes + const separators: string[] = []; + let current = ''; + let inQuote = false; + let quoteChar = ''; + + for (let i = 0; i < arrayContent.length; i++) { + const char = arrayContent[i]; + + if ((char === "'" || char === '"') && !inQuote) { + inQuote = true; + quoteChar = char; + } else if (char === quoteChar && inQuote) { + if (arrayContent[i - 1] !== '\\' || arrayContent[i - 2] === '\\') { + inQuote = false; + quoteChar = ''; + } + } else if (char === ',' && !inQuote) { + const trimmed = current.trim(); + if (trimmed) { + // Remove quotes and unescape characters + 
const value = trimmed.replace(/^['"]|['"]$/g, ''); + const unescapedValue = unescape(value); + separators.push(unescapedValue); + } + + current = ''; + continue; + } + + current += char; + } + + // Add last separator + const trimmed = current.trim(); + if (trimmed) { + const value = trimmed.replace(/^['"]|['"]$/g, ''); + const unescapedValue = unescape(value); + separators.push(unescapedValue); + } + + return { type: 'splitByString', separators }; + } + + default: + console.error(`Unknown tokenizer ${tokenizerName} in type ${typeFull}.`); + return undefined; + } +} diff --git a/packages/common-utils/src/queryParser.ts b/packages/common-utils/src/queryParser.ts index 0ea54b69..182d558a 100644 --- a/packages/common-utils/src/queryParser.ts +++ b/packages/common-utils/src/queryParser.ts @@ -1,9 +1,16 @@ import lucene from '@hyperdx/lucene'; +import { chunk } from 'lodash'; import SqlString from 'sqlstring'; import { convertCHTypeToPrimitiveJSType, JSDataType } from '@/clickhouse'; import { Metadata, SkipIndexMetadata } from '@/core/metadata'; -import { splitAndTrimWithBracket } from '@/core/utils'; +import { + parseTokenizerFromTextIndex, + splitAndTrimWithBracket, +} from '@/core/utils'; + +/** Max number of tokens to pass to hasAllTokens(), which supports up to 64 tokens as of ClickHouse v25.12. 
*/ +const HAS_ALL_TOKENS_CHUNK_SIZE = 50; function encodeSpecialTokens(query: string): string { return query @@ -493,6 +500,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer { private implicitColumnExpression?: string; private connectionId: string; private skipIndicesPromise?: Promise<SkipIndexMetadata[]>; + private enableTextIndexPromise?: Promise<boolean>; constructor({ metadata, @@ -519,6 +527,18 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer { console.error('Error fetching skip indices:', error); return []; }); + + // Pre-fetch value of the enable_full_text_index setting + this.enableTextIndexPromise = this.metadata + .getSetting({ + settingName: 'enable_full_text_index', + connectionId, + }) + .then(value => value === '1') + .catch(error => { + console.error('Error fetching enable_full_text_index setting:', error); + return false; + }); } /** @@ -583,15 +603,54 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer { return SqlString.format( `(lower(?) ${isNegatedField ? 'NOT ' : ''}LIKE lower(?))`, [ - SqlString.raw(column ?? ''), + SqlString.raw(column), `${prefixWildcard ? '%' : ''}${term}${suffixWildcard ? '%' : ''}`, ], ); } else if (shouldUseTokenBf) { - const hasSeparators = this.termHasSeparators(term); + // First check for a text index, and use it if possible + // Note: We check that enable_full_text_index = 1, otherwise hasAllTokens() errors + const isTextIndexEnabled = await this.enableTextIndexPromise; + const textIndex = isTextIndexEnabled + ? 
await this.findTextIndex(column) + : undefined; + + if (textIndex) { + const tokenizer = parseTokenizerFromTextIndex(textIndex); + + // HDX-3259: Support other tokenizers by overriding tokenizeTerm, termHasSeparators, and batching logic + if (tokenizer?.type === 'splitByNonAlpha') { + const tokens = this.tokenizeTerm(term); + const hasSeparators = this.termHasSeparators(term); + + // Batch tokens to avoid exceeding hasAllTokens limit (64) + const tokenBatches = chunk(tokens, HAS_ALL_TOKENS_CHUNK_SIZE); + const hasAllTokensExpressions = tokenBatches.map(batch => + SqlString.format(`hasAllTokens(?, ?)`, [ + SqlString.raw(column), + batch.join(' '), + ]), + ); + + if (hasSeparators || tokenBatches.length > 1) { + // Multi-token, or term containing token separators: hasAllTokens(..., 'foo bar') AND lower(...) LIKE '%foo bar%' + return `(${isNegatedField ? 'NOT (' : ''}${[ + ...hasAllTokensExpressions, + SqlString.format(`(lower(?) LIKE lower(?))`, [ + SqlString.raw(column), + `%${term}%`, + ]), + ].join(' AND ')}${isNegatedField ? ')' : ''})`; + } else { + // Single token, without token separators: hasAllTokens(..., 'term') + return `(${isNegatedField ? 'NOT ' : ''}${hasAllTokensExpressions.join(' AND ')})`; + } + } + } // Check for bloom_filter tokens() index first - const bloomIndex = await this.findBloomFilterTokensIndex(column ?? ''); + const hasSeparators = this.termHasSeparators(term); + const bloomIndex = await this.findBloomFilterTokensIndex(column); if (bloomIndex.found) { const indexHasLower = /\blower\s*\(/.test(bloomIndex.indexExpression); @@ -607,7 +666,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer { `hasAll(${bloomIndex.indexExpression}, ${termTokensExpression})`, // If there are token separators in the term, try to match the whole term as well SqlString.format(`(lower(?) LIKE lower(?))`, [ - SqlString.raw(column ?? ''), + SqlString.raw(column), `%${term}%`, ]), ].join(' AND ')}${isNegatedField ? 
')' : ''})`; @@ -623,20 +682,20 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer { return `(${isNegatedField ? 'NOT (' : ''}${[ ...tokens.map(token => SqlString.format(`hasToken(lower(?), lower(?))`, [ - SqlString.raw(column ?? ''), + SqlString.raw(column), token, ]), ), // If there are symbols in the term, try to match the whole term as well SqlString.format(`(lower(?) LIKE lower(?))`, [ - SqlString.raw(column ?? ''), + SqlString.raw(column), `%${term}%`, ]), ].join(' AND ')}${isNegatedField ? ')' : ''})`; } else { return SqlString.format( `(${isNegatedField ? 'NOT ' : ''}hasToken(lower(?), lower(?)))`, - [SqlString.raw(column ?? ''), term], + [SqlString.raw(column), term], ); } } @@ -778,6 +837,23 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer { // throw new Error(`Column not found: ${field}`); } + private async findTextIndex( + columnExpression: string, + ): Promise<SkipIndexMetadata | undefined> { + const skipIndices = await this.skipIndicesPromise; + + if (!skipIndices || skipIndices.length === 0) { + return undefined; + } + + // Note: Text index expressions should not be wrapped in tokens() or preprocessing functions like lower(). + return skipIndices.find( + idx => + idx.type === 'text' && + this.indexCoversColumn(idx.expression, columnExpression), + ); + } + /** * Finds a bloom_filter skip index that uses tokens() on the given column expression. * Returns the full index expression if found, otherwise returns not found.