mirror of
https://github.com/hyperdxio/hyperdx
synced 2026-04-21 13:37:15 +00:00
feat: Add hasAllTokens for text index support (#1637)
Closes HDX-3245
# Summary
This PR updates the Lucene to SQL compilation process to generate conditions using `hasAllTokens` when the target column has a text index defined.
`hasAllTokens` has a couple of limitations which are solved for:
1. The `needle` argument must be no more than 64 tokens, or `hasAllTokens` will error. To support search terms with more than 64 tokens, terms are first broken up into batches of 50 tokens, each batch is passed to a separate `hasAllTokens` call. When multiple `hasAllTokens` calls are used, we also use substring matching `lower(Body) LIKE '%term with many tokens...%'`.
2. `hasAllTokens` may only be used when `enable_full_text_index = 1`. The existence of a text index does not guarantee that `enable_full_text_index = 1`, since the text index could have been created with a query that explicitly specified `SETTINGS enable_full_text_index = 1`. We cannot set this option in every query HyperDX makes, because the setting was not available prior to v25.12. To solve for this, we check the value of `enable_full_text_index` in `system.settings`, and only use `hasAllTokens` if the setting exists and is enabled.
## Testing Setup
### Enable Full Text Index
First, make sure you're running at least ClickHouse 25.12.
Then, update the ClickHouse `users.xml`'s default profile with the following (or otherwise update your user's profile):
```xml
<clickhouse>
<profiles>
<default>
...
<enable_full_text_index>1</enable_full_text_index>
</default>
</profiles>
...
</clickhouse>
```
### Add a Full Text Index
```sql
ALTER TABLE otel_logs ADD INDEX text_idx(Body)
TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))
SETTINGS enable_full_text_index=1;
ALTER TABLE otel_logs MATERIALIZE INDEX text_idx;
```
## Limitations
1. We currently only support the `splitByNonAlpha` tokenizer. If the text index is created with a different tokenizer, `hasAllTokens` will not be used. If needed, this limitation can be removed in the future by implementing `tokenizeTerm`, `termContainsSeparators`, and token batching logic specific to the other tokenizers.
2. This requires the latest (Beta) version of the full text index and related setting, available in ClickHouse v25.12.
This commit is contained in:
parent
9f51920b5d
commit
4a85617320
8 changed files with 799 additions and 8 deletions
7
.changeset/tough-swans-doubt.md
Normal file
7
.changeset/tough-swans-doubt.md
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
---
|
||||
"@hyperdx/common-utils": patch
|
||||
"@hyperdx/api": patch
|
||||
"@hyperdx/app": patch
|
||||
---
|
||||
|
||||
feat: Add hasAllTokens for text index support
|
||||
|
|
@ -522,4 +522,35 @@ describe('Metadata Integration Tests', () => {
|
|||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('getSetting', () => {
|
||||
let metadata: Metadata;
|
||||
beforeEach(async () => {
|
||||
metadata = new Metadata(hdxClient, new MetadataCache());
|
||||
});
|
||||
|
||||
it('should get setting that exists and is enabled', async () => {
|
||||
const settingValue = await metadata.getSetting({
|
||||
settingName: 'format_csv_allow_single_quotes',
|
||||
connectionId: 'test_connection',
|
||||
});
|
||||
expect(settingValue).toBe('0');
|
||||
});
|
||||
|
||||
it('should get setting that exists and is disabled', async () => {
|
||||
const settingValue = await metadata.getSetting({
|
||||
settingName: 'format_csv_allow_double_quotes',
|
||||
connectionId: 'test_connection',
|
||||
});
|
||||
expect(settingValue).toBe('1');
|
||||
});
|
||||
|
||||
it('should return undefined for setting that does not exist', async () => {
|
||||
const settingValue = await metadata.getSetting({
|
||||
settingName: 'enable_quantum_tunnelling',
|
||||
connectionId: 'test_connection',
|
||||
});
|
||||
expect(settingValue).toBeUndefined();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -767,6 +767,343 @@ describe('CustomSchemaSQLSerializerV2 - bloom_filter tokens() indices', () => {
|
|||
});
|
||||
});
|
||||
|
||||
// Verifies that the Lucene -> SQL serializer emits hasAllTokens() when a
// text-type skip index covers the implicit column AND enable_full_text_index
// is on, and that it falls back to hasToken()/LIKE otherwise.
describe('CustomSchemaSQLSerializerV2 - text indices', () => {
  const metadata = getMetadata(
    new ClickhouseClient({ host: 'http://localhost:8123' }),
  );

  const databaseName = 'default';
  const tableName = 'otel_logs';
  const connectionId = 'test';

  beforeEach(() => {
    // Mock getColumn to return Body as String column
    metadata.getColumn = jest.fn().mockImplementation(async ({ column }) => {
      if (column === 'Body') {
        return { name: 'Body', type: 'String' };
      } else if (column === 'ServiceName') {
        return { name: 'ServiceName', type: 'String' };
      }
      return undefined;
    });

    // Default: enable_full_text_index is enabled; individual tests override.
    metadata.getSetting = jest
      .fn()
      .mockImplementation(async ({ settingName }) => {
        if (settingName === 'enable_full_text_index') {
          return '1';
        }
        return undefined;
      });
  });

  it('should use hasAllTokens when text index exists', async () => {
    // NOTE(review): granularity is passed as the string '8' here although
    // SkipIndexMetadata types it as number — confirm the intended type.
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    expect(sql).toBe("((hasAllTokens(Body, 'foo')))");
  });

  it('should use hasAllTokens for multi-token terms with single call', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    // Quoted phrase: tokens fit in one hasAllTokens call, plus a LIKE to
    // enforce the exact (separator-preserving) phrase match.
    const builder = new SearchQueryBuilder('"foo bar"', serializer);
    const sql = await builder.build();

    expect(sql).toContain("hasAllTokens(Body, 'foo bar')");
    expect(sql).toContain("(lower(Body) LIKE lower('%foo bar%'))");
  });

  it('should fallback to hasToken when no text indexes are found', async () => {
    // Mock getSkipIndices to return empty
    metadata.getSkipIndices = jest.fn().mockResolvedValue([]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should use hasToken (existing behavior)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });

  it('should handle text index on a different column', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'OtherBody',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should fall back to hasToken (the text index covers OtherBody, not Body)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });

  it('should handle negated searches with hasAllTokens', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('-foo', serializer);
    const sql = await builder.build();

    // Should use NOT hasAllTokens
    expect(sql).toBe("((NOT hasAllTokens(Body, 'foo')))");
  });

  it('should not use text index for explicit field searches', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    // Query: 'ServiceName:foo'
    const builder = new SearchQueryBuilder('ServiceName:foo', serializer);
    const sql = await builder.build();

    // Should use ILIKE, not hasAll or hasToken
    expect(sql).toContain('ILIKE');
    expect(sql).not.toContain('hasAll');
    expect(sql).not.toContain('hasToken');
  });

  it('should batch tokens into groups to avoid hitting the hasAllTokens limit', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    // 60 tokens: exceeds the 50-token chunk size, forcing two batches.
    const builder = new SearchQueryBuilder(
      '"1 2 3 4 5 6 7 8 9 10; 11 12 13 14 15 16 17 18 19 20; 21 22 23 24 25 26 27 28 29 30; 31 32 33 34 35 36 37 38 39 40; 41 42 43 44 45 46 47 48 49 50; 51 52 53 54 55 56 57 58 59 60;"',
      serializer,
    );
    const sql = await builder.build();

    // Two AND-ed hasAllTokens batches, plus a LIKE for the exact phrase.
    expect(sql).toContain(
      "hasAllTokens(Body, '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50') AND hasAllTokens(Body, '51 52 53 54 55 56 57 58 59 60') AND (lower(Body) LIKE lower('%1 2 3 4 5 6 7 8 9 10; 11 12 13 14 15 16 17 18 19 20; 21 22 23 24 25 26 27 28 29 30; 31 32 33 34 35 36 37 38 39 40; 41 42 43 44 45 46 47 48 49 50; 51 52 53 54 55 56 57 58 59 60;%'))",
    );
  });

  it('should use hasAllTokens for multiple separate terms', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo NOT bar baz', serializer);
    const sql = await builder.build();

    // Should generate separate hasAllTokens for each term (not single statement)
    expect(sql).toContain("hasAllTokens(Body, 'foo')");
    expect(sql).toContain("NOT (hasAllTokens(Body, 'bar'))");
    expect(sql).toContain("hasAllTokens(Body, 'baz')");
  });

  it('should not use text index when enable_full_text_index is disabled', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    // Mock getSetting to disable full text index
    metadata.getSetting = jest
      .fn()
      .mockImplementation(async ({ settingName }) => {
        if (settingName === 'enable_full_text_index') {
          return '0';
        }
        return undefined;
      });

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should fallback to hasToken (full text index disabled)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });

  it('should not use text index when enable_full_text_index is unavailable (ClickHouse version is old)', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    // Setting missing entirely (pre-25.12 ClickHouse)
    metadata.getSetting = jest.fn().mockResolvedValue(undefined);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should fallback to hasToken (full text index disabled)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });

  it('should not use text index when getSetting throws an error', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    // Mock getSetting to fail outright
    metadata.getSetting = jest
      .fn()
      .mockRejectedValue(new Error('Failed to get setting'));

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should fallback to hasToken (full text index disabled)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });
});
|
||||
|
||||
describe('CustomSchemaSQLSerializerV2 - indexCoversColumn', () => {
|
||||
const metadata = getMetadata(
|
||||
new ClickhouseClient({ host: 'http://localhost:8123' }),
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ describe('renderChartConfig', () => {
|
|||
.fn()
|
||||
.mockResolvedValue({ primary_key: 'timestamp' }),
|
||||
getSkipIndices: jest.fn().mockResolvedValue([]),
|
||||
getSetting: jest.fn().mockResolvedValue(undefined),
|
||||
} as unknown as jest.Mocked<Metadata>;
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -20,11 +20,13 @@ import {
|
|||
isTimestampExpressionInFirstOrderBy,
|
||||
joinQuerySettings,
|
||||
optimizeTimestampValueExpression,
|
||||
parseTokenizerFromTextIndex,
|
||||
parseToNumber,
|
||||
parseToStartOfFunction,
|
||||
replaceJsonExpressions,
|
||||
splitAndTrimCSV,
|
||||
splitAndTrimWithBracket,
|
||||
TextIndexTokenizer,
|
||||
} from '../core/utils';
|
||||
|
||||
describe('utils', () => {
|
||||
|
|
@ -1557,4 +1559,117 @@ describe('utils', () => {
|
|||
).toEqual("setting_name = 'Infinity'");
|
||||
});
|
||||
});
|
||||
// Table-driven coverage of parseTokenizerFromTextIndex across all supported
// tokenizers, default-argument forms, whitespace tolerance, and invalid input.
describe('parseTokenizerFromTextIndex', () => {
  it.each([
    // Bare `text` / `text()` carry no tokenizer information.
    {
      type: 'text',
      expected: undefined,
    },
    {
      type: 'text()',
      expected: undefined,
    },
    // Whitespace around the type, parens, and `=` is tolerated.
    {
      type: ' text ( tokenizer= array ) ',
      expected: { type: 'array' },
    },
    {
      type: 'text(tokenizer=splitByNonAlpha)',
      expected: { type: 'splitByNonAlpha' },
    },
    {
      type: 'text( tokenizer = splitByNonAlpha )',
      expected: { type: 'splitByNonAlpha' },
    },
    // splitByString with no array defaults to a single space separator.
    {
      type: 'text(tokenizer = splitByString())',
      expected: { type: 'splitByString', separators: [' '] },
    },
    // Escaped characters inside separator strings must be unescaped.
    {
      type: `text(tokenizer = splitByString([', ', '; ', '\\n', '" ', '\\\\', '\\t', '(', ')']))`,
      expected: {
        type: 'splitByString',
        separators: [', ', '; ', '\n', '" ', '\\', '\t', '(', ')'],
      },
    },
    // sparseGrams: positional args are minLength, maxLength, minCutoffLength;
    // defaults are 3 and 10 with no cutoff.
    {
      type: 'text(preprocessor=lower(s), tokenizer=sparseGrams(2, 5, 10))',
      expected: {
        type: 'sparseGrams',
        minLength: 2,
        maxLength: 5,
        minCutoffLength: 10,
      },
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer=sparseGrams(2, 5))',
      expected: {
        type: 'sparseGrams',
        minLength: 2,
        maxLength: 5,
        minCutoffLength: undefined,
      },
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer=sparseGrams(2))',
      expected: {
        type: 'sparseGrams',
        minLength: 2,
        maxLength: 10,
        minCutoffLength: undefined,
      },
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer=sparseGrams)',
      expected: {
        type: 'sparseGrams',
        minLength: 3,
        maxLength: 10,
        minCutoffLength: undefined,
      },
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer= sparseGrams ())',
      expected: {
        type: 'sparseGrams',
        minLength: 3,
        maxLength: 10,
        minCutoffLength: undefined,
      },
    },
    // Unrecognized tokenizers and malformed types yield undefined.
    {
      type: 'text(preprocessor=lower(s), tokenizer=unknown)',
      expected: undefined,
    },
    {
      type: '',
      expected: undefined,
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer=array)',
      expected: { type: 'array' },
    },
    // ngrams defaults to n = 3 when no argument is given.
    {
      type: 'text(preprocessor=lower(s), tokenizer=ngrams)',
      expected: { type: 'ngrams', n: 3 },
    },
    {
      type: 'text(tokenizer=ngrams())',
      expected: { type: 'ngrams', n: 3 },
    },
    {
      type: 'text(tokenizer=ngrams(20))',
      expected: { type: 'ngrams', n: 20 },
    },
  ])('should correctly parse tokenizer from: $type', ({ type, expected }) => {
    const result = parseTokenizerFromTextIndex({
      type: 'text',
      typeFull: type,
      name: 'text_idx',
      expression: 'Body',
      granularity: 1000,
    });
    expect(result).toEqual(expected);
  });
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -103,6 +103,7 @@ export type TableMetadata = {
|
|||
export type SkipIndexMetadata = {
|
||||
name: string;
|
||||
type: string; // 'bloom_filter', 'tokenbf_v1', 'minmax', etc.
|
||||
typeFull: string; // e.g., 'text(tokenizer='splitByNonAlpha')'
|
||||
expression: string; // e.g., "tokens(lower(Body))"
|
||||
granularity: number;
|
||||
};
|
||||
|
|
@ -616,6 +617,48 @@ export class Metadata {
|
|||
return tableMetadata;
|
||||
}
|
||||
|
||||
/** Reads the value of the setting with the given name from system.settings. */
|
||||
async getSetting({
|
||||
settingName,
|
||||
connectionId,
|
||||
}: {
|
||||
settingName: string;
|
||||
connectionId: string;
|
||||
}) {
|
||||
return this.cache.getOrFetch(`${connectionId}.${settingName}`, async () => {
|
||||
const sql = chSql`
|
||||
SELECT name, value
|
||||
FROM system.settings
|
||||
WHERE name = ${{ String: settingName }}
|
||||
`;
|
||||
|
||||
try {
|
||||
const json = await this.clickhouseClient
|
||||
.query<'JSON'>({
|
||||
connectionId,
|
||||
query: sql.sql,
|
||||
query_params: sql.params,
|
||||
clickhouse_settings: this.getClickHouseSettings(),
|
||||
})
|
||||
.then(res => res.json<{ name: string; value: string }>());
|
||||
|
||||
if (json.data.length > 0) {
|
||||
return json.data[0].value;
|
||||
}
|
||||
|
||||
return undefined;
|
||||
} catch (e) {
|
||||
// Don't retry permissions errors, just silently return undefined
|
||||
if (e instanceof Error && e.message.includes('Not enough privileges')) {
|
||||
console.warn('Not enough privileges to fetch settings:', e);
|
||||
return undefined;
|
||||
}
|
||||
|
||||
throw e;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Queries system.data_skipping_indices to retrieve skip index metadata for a table.
|
||||
* Results are cached using MetadataCache.
|
||||
|
|
@ -639,6 +682,7 @@ export class Metadata {
|
|||
SELECT
|
||||
name,
|
||||
type,
|
||||
type_full as typeFull,
|
||||
expr as expression,
|
||||
granularity
|
||||
FROM system.data_skipping_indices
|
||||
|
|
|
|||
|
|
@ -19,6 +19,8 @@ import {
|
|||
TSourceUnion,
|
||||
} from '@/types';
|
||||
|
||||
import { SkipIndexMetadata } from './metadata';
|
||||
|
||||
/** The default maximum number of buckets setting when determining a bucket duration for 'auto' granularity */
|
||||
export const DEFAULT_AUTO_GRANULARITY_MAX_BUCKETS = 60;
|
||||
|
||||
|
|
@ -747,3 +749,181 @@ export function joinQuerySettings(
|
|||
|
||||
return formattedPairs.join(', ');
|
||||
}
|
||||
|
||||
// Discriminated union describing the tokenizer configuration declared by a
// ClickHouse text index (parsed from system.data_skipping_indices.type_full).
export type TextIndexTokenizer =
  // Tokenizer named `splitByNonAlpha` (no parameters).
  | { type: 'splitByNonAlpha' }
  // Tokenizer that splits on the given literal separator strings.
  | { type: 'splitByString'; separators: string[] }
  // Fixed-length n-gram tokenizer; `n` is the gram length.
  | { type: 'ngrams'; n: number }
  | {
      type: 'sparseGrams';
      // Positional args of sparseGrams(minLength, maxLength, minCutoffLength);
      // the cutoff is optional.
      minLength: number;
      maxLength: number;
      minCutoffLength?: number;
    }
  // Tokenizer named `array` (no parameters).
  | { type: 'array' };
|
||||
|
||||
/**
|
||||
* Parses the tokenizer and any associated tokenizer parameters from a text index type definition.
|
||||
*
|
||||
* Examples:
|
||||
* - `text(tokenizer = splitByNonAlpha)` -> `{ type: 'splitByNonAlpha' }`
|
||||
* - `text(tokenizer = splitByString([', ', '; ', '\n', '\\']))` -> `{ type: 'splitByString', separators: [', ', '; ', '\n', '\\'] }`
|
||||
* - `text(preprocessor=lower(s), tokenizer=sparseGrams(2, 5, 10))` -> `{ type: 'sparseGrams', minLength: 2, maxLength: 5, minCutoffLength: 10 }`
|
||||
*/
|
||||
export function parseTokenizerFromTextIndex({
|
||||
typeFull,
|
||||
}: SkipIndexMetadata): TextIndexTokenizer | undefined {
|
||||
const textPattern = /^\s*text\s*\((.+)\)\s*$/;
|
||||
const match = typeFull.match(textPattern);
|
||||
if (!match) {
|
||||
console.error(`Invalid text index type ${typeFull}.`);
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const argsString = match[1].trim();
|
||||
const args = splitAndTrimWithBracket(argsString).map(arg => {
|
||||
const [key, value] = arg.split('=').map(s => s.trim());
|
||||
return { key, value };
|
||||
});
|
||||
|
||||
const tokenizerArg = args.find(arg => arg.key === 'tokenizer')?.value;
|
||||
if (!tokenizerArg) {
|
||||
console.error(
|
||||
`Invalid tokenizer argument in index type ${typeFull}: ${tokenizerArg}`,
|
||||
argsString,
|
||||
splitAndTrimWithBracket(argsString),
|
||||
);
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const tokenizerName = tokenizerArg.split('(')[0].trim();
|
||||
const tokenizerArgsString = tokenizerArg
|
||||
.substring(tokenizerArg.indexOf('(') + 1, tokenizerArg.lastIndexOf(')'))
|
||||
.trim();
|
||||
|
||||
switch (tokenizerName) {
|
||||
case 'splitByNonAlpha':
|
||||
return { type: 'splitByNonAlpha' };
|
||||
|
||||
case 'array':
|
||||
return { type: 'array' };
|
||||
|
||||
case 'ngrams': {
|
||||
// Default n is 3
|
||||
if (!tokenizerArgsString) {
|
||||
return { type: 'ngrams', n: 3 };
|
||||
}
|
||||
|
||||
return { type: 'ngrams', n: Number.parseInt(tokenizerArgsString, 10) };
|
||||
}
|
||||
|
||||
case 'sparseGrams': {
|
||||
const args = tokenizerArgsString
|
||||
.split(',')
|
||||
.map(s => s.trim())
|
||||
.filter(s => !!s);
|
||||
|
||||
const tokenizer: TextIndexTokenizer = {
|
||||
type: 'sparseGrams',
|
||||
minLength: 3,
|
||||
maxLength: 10,
|
||||
};
|
||||
|
||||
if (args.length >= 1) tokenizer.minLength = Number.parseInt(args[0], 10);
|
||||
if (args.length >= 2) tokenizer.maxLength = Number.parseInt(args[1], 10);
|
||||
if (args.length >= 3)
|
||||
tokenizer.minCutoffLength = Number.parseInt(args[2], 10);
|
||||
|
||||
return tokenizer;
|
||||
}
|
||||
|
||||
case 'splitByString': {
|
||||
if (!tokenizerArgsString) {
|
||||
// Default separator is space
|
||||
return { type: 'splitByString', separators: [' '] };
|
||||
}
|
||||
|
||||
const unescape = (str: string) => {
|
||||
const escapeCharacters = [
|
||||
{ pattern: /\\a/g, replacement: 'a' },
|
||||
{ pattern: /\\b/g, replacement: 'b' },
|
||||
{ pattern: /\\e/g, replacement: 'e' },
|
||||
{ pattern: /\\f/g, replacement: '\f' },
|
||||
{ pattern: /\\n/g, replacement: '\n' },
|
||||
{ pattern: /\\r/g, replacement: '\r' },
|
||||
{ pattern: /\\t/g, replacement: '\t' },
|
||||
{ pattern: /\\v/g, replacement: '\v' },
|
||||
{ pattern: /\\0/g, replacement: '\0' },
|
||||
{ pattern: /\\\\/g, replacement: '\\' },
|
||||
{ pattern: /\\'/g, replacement: "'" },
|
||||
{ pattern: /\\"/g, replacement: '"' },
|
||||
{ pattern: /\\`/g, replacement: '`' },
|
||||
{ pattern: /\\\//g, replacement: '/' },
|
||||
{ pattern: /\\=/g, replacement: '=' },
|
||||
];
|
||||
|
||||
for (const { pattern, replacement } of escapeCharacters) {
|
||||
str = str.replace(pattern, replacement);
|
||||
}
|
||||
|
||||
return str;
|
||||
};
|
||||
|
||||
const separatorsString = tokenizerArgsString.match(/\[(.*)\]/);
|
||||
if (!separatorsString) {
|
||||
// If no array is provided, default to space
|
||||
return { type: 'splitByString', separators: [' '] };
|
||||
}
|
||||
|
||||
const arrayContent = separatorsString[1];
|
||||
|
||||
// Split by commas outside of quotes
|
||||
const separators: string[] = [];
|
||||
let current = '';
|
||||
let inQuote = false;
|
||||
let quoteChar = '';
|
||||
|
||||
for (let i = 0; i < arrayContent.length; i++) {
|
||||
const char = arrayContent[i];
|
||||
|
||||
if ((char === "'" || char === '"') && !inQuote) {
|
||||
inQuote = true;
|
||||
quoteChar = char;
|
||||
} else if (char === quoteChar && inQuote) {
|
||||
if (arrayContent[i - 1] !== '\\' || arrayContent[i - 2] === '\\') {
|
||||
inQuote = false;
|
||||
quoteChar = '';
|
||||
}
|
||||
} else if (char === ',' && !inQuote) {
|
||||
const trimmed = current.trim();
|
||||
if (trimmed) {
|
||||
// Remove quotes and unescape characters
|
||||
const value = trimmed.replace(/^['"]|['"]$/g, '');
|
||||
const unescapedValue = unescape(value);
|
||||
separators.push(unescapedValue);
|
||||
}
|
||||
|
||||
current = '';
|
||||
continue;
|
||||
}
|
||||
|
||||
current += char;
|
||||
}
|
||||
|
||||
// Add last separator
|
||||
const trimmed = current.trim();
|
||||
if (trimmed) {
|
||||
const value = trimmed.replace(/^['"]|['"]$/g, '');
|
||||
const unescapedValue = unescape(value);
|
||||
separators.push(unescapedValue);
|
||||
}
|
||||
|
||||
return { type: 'splitByString', separators };
|
||||
}
|
||||
|
||||
default:
|
||||
console.error(`Unknown tokenizer ${tokenizerName} in type ${typeFull}.`);
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,16 @@
|
|||
import lucene from '@hyperdx/lucene';
|
||||
import { chunk } from 'lodash';
|
||||
import SqlString from 'sqlstring';
|
||||
|
||||
import { convertCHTypeToPrimitiveJSType, JSDataType } from '@/clickhouse';
|
||||
import { Metadata, SkipIndexMetadata } from '@/core/metadata';
|
||||
import { splitAndTrimWithBracket } from '@/core/utils';
|
||||
import {
|
||||
parseTokenizerFromTextIndex,
|
||||
splitAndTrimWithBracket,
|
||||
} from '@/core/utils';
|
||||
|
||||
/** Max number of tokens to pass to hasAllTokens(), which supports up to 64 tokens as of ClickHouse v25.12. */
|
||||
const HAS_ALL_TOKENS_CHUNK_SIZE = 50;
|
||||
|
||||
function encodeSpecialTokens(query: string): string {
|
||||
return query
|
||||
|
|
@ -493,6 +500,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
private implicitColumnExpression?: string;
|
||||
private connectionId: string;
|
||||
private skipIndicesPromise?: Promise<SkipIndexMetadata[]>;
|
||||
private enableTextIndexPromise?: Promise<boolean>;
|
||||
|
||||
constructor({
|
||||
metadata,
|
||||
|
|
@ -519,6 +527,18 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
console.error('Error fetching skip indices:', error);
|
||||
return [];
|
||||
});
|
||||
|
||||
// Pre-fetch value of the enable_full_text_index setting
|
||||
this.enableTextIndexPromise = this.metadata
|
||||
.getSetting({
|
||||
settingName: 'enable_full_text_index',
|
||||
connectionId,
|
||||
})
|
||||
.then(value => value === '1')
|
||||
.catch(error => {
|
||||
console.error('Error fetching enable_full_text_index setting:', error);
|
||||
return false;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -583,15 +603,54 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
return SqlString.format(
|
||||
`(lower(?) ${isNegatedField ? 'NOT ' : ''}LIKE lower(?))`,
|
||||
[
|
||||
SqlString.raw(column ?? ''),
|
||||
SqlString.raw(column),
|
||||
`${prefixWildcard ? '%' : ''}${term}${suffixWildcard ? '%' : ''}`,
|
||||
],
|
||||
);
|
||||
} else if (shouldUseTokenBf) {
|
||||
const hasSeparators = this.termHasSeparators(term);
|
||||
// First check for a text index, and use it if possible
|
||||
// Note: We check that enable_full_text_index = 1, otherwise hasAllTokens() errors
|
||||
const isTextIndexEnabled = await this.enableTextIndexPromise;
|
||||
const textIndex = isTextIndexEnabled
|
||||
? await this.findTextIndex(column)
|
||||
: undefined;
|
||||
|
||||
if (textIndex) {
|
||||
const tokenizer = parseTokenizerFromTextIndex(textIndex);
|
||||
|
||||
// HDX-3259: Support other tokenizers by overriding tokenizeTerm, termHasSeparators, and batching logic
|
||||
if (tokenizer?.type === 'splitByNonAlpha') {
|
||||
const tokens = this.tokenizeTerm(term);
|
||||
const hasSeparators = this.termHasSeparators(term);
|
||||
|
||||
// Batch tokens to avoid exceeding hasAllTokens limit (64)
|
||||
const tokenBatches = chunk(tokens, HAS_ALL_TOKENS_CHUNK_SIZE);
|
||||
const hasAllTokensExpressions = tokenBatches.map(batch =>
|
||||
SqlString.format(`hasAllTokens(?, ?)`, [
|
||||
SqlString.raw(column),
|
||||
batch.join(' '),
|
||||
]),
|
||||
);
|
||||
|
||||
if (hasSeparators || tokenBatches.length > 1) {
|
||||
// Multi-token, or term containing token separators: hasAllTokens(..., 'foo bar') AND lower(...) LIKE '%foo bar%'
|
||||
return `(${isNegatedField ? 'NOT (' : ''}${[
|
||||
...hasAllTokensExpressions,
|
||||
SqlString.format(`(lower(?) LIKE lower(?))`, [
|
||||
SqlString.raw(column),
|
||||
`%${term}%`,
|
||||
]),
|
||||
].join(' AND ')}${isNegatedField ? ')' : ''})`;
|
||||
} else {
|
||||
// Single token, without token separators: hasAllTokens(..., 'term')
|
||||
return `(${isNegatedField ? 'NOT ' : ''}${hasAllTokensExpressions.join(' AND ')})`;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for bloom_filter tokens() index first
|
||||
const bloomIndex = await this.findBloomFilterTokensIndex(column ?? '');
|
||||
const hasSeparators = this.termHasSeparators(term);
|
||||
const bloomIndex = await this.findBloomFilterTokensIndex(column);
|
||||
|
||||
if (bloomIndex.found) {
|
||||
const indexHasLower = /\blower\s*\(/.test(bloomIndex.indexExpression);
|
||||
|
|
@ -607,7 +666,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
`hasAll(${bloomIndex.indexExpression}, ${termTokensExpression})`,
|
||||
// If there are token separators in the term, try to match the whole term as well
|
||||
SqlString.format(`(lower(?) LIKE lower(?))`, [
|
||||
SqlString.raw(column ?? ''),
|
||||
SqlString.raw(column),
|
||||
`%${term}%`,
|
||||
]),
|
||||
].join(' AND ')}${isNegatedField ? ')' : ''})`;
|
||||
|
|
@ -623,20 +682,20 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
return `(${isNegatedField ? 'NOT (' : ''}${[
|
||||
...tokens.map(token =>
|
||||
SqlString.format(`hasToken(lower(?), lower(?))`, [
|
||||
SqlString.raw(column ?? ''),
|
||||
SqlString.raw(column),
|
||||
token,
|
||||
]),
|
||||
),
|
||||
// If there are symbols in the term, try to match the whole term as well
|
||||
SqlString.format(`(lower(?) LIKE lower(?))`, [
|
||||
SqlString.raw(column ?? ''),
|
||||
SqlString.raw(column),
|
||||
`%${term}%`,
|
||||
]),
|
||||
].join(' AND ')}${isNegatedField ? ')' : ''})`;
|
||||
} else {
|
||||
return SqlString.format(
|
||||
`(${isNegatedField ? 'NOT ' : ''}hasToken(lower(?), lower(?)))`,
|
||||
[SqlString.raw(column ?? ''), term],
|
||||
[SqlString.raw(column), term],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -778,6 +837,23 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
// throw new Error(`Column not found: ${field}`);
|
||||
}
|
||||
|
||||
private async findTextIndex(
|
||||
columnExpression: string,
|
||||
): Promise<SkipIndexMetadata | undefined> {
|
||||
const skipIndices = await this.skipIndicesPromise;
|
||||
|
||||
if (!skipIndices || skipIndices.length === 0) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
// Note: Text index expressions should not be wrapped in tokens() or preprocessing functions like lower().
|
||||
return skipIndices.find(
|
||||
idx =>
|
||||
idx.type === 'text' &&
|
||||
this.indexCoversColumn(idx.expression, columnExpression),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds a bloom_filter skip index that uses tokens() on the given column expression.
|
||||
* Returns the full index expression if found, otherwise returns not found.
|
||||
|
|
|
|||
Loading…
Reference in a new issue