mirror of
https://github.com/hyperdxio/hyperdx
synced 2026-04-21 13:37:15 +00:00
feat: Add hasAllTokens for text index support (#1637)
Closes HDX-3245
# Summary
This PR updates the Lucene to SQL compilation process to generate conditions using `hasAllTokens` when the target column has a text index defined.
`hasAllTokens` has a couple of limitations which are solved for:
1. The `needle` argument must be no more than 64 tokens, or `hasAllTokens` will error. To support search terms with more than 64 tokens, terms are first broken up into batches of 50 tokens, each batch is passed to a separate `hasAllTokens` call. When multiple `hasAllTokens` calls are used, we also use substring matching `lower(Body) LIKE '%term with many tokens...%'`.
2. `hasAllTokens` may only be used when `enable_full_text_index = 1`. The existence of a text index does not guarantee that `enable_full_text_index = 1`, since the text index could have been created with a query that explicitly specified `SETTINGS enable_full_text_index = 1`. We cannot set this option in every query HyperDX makes, because the setting was not available prior to v25.12. To solve for this, we check the value of `enable_full_text_index` in `system.settings`, and only use `hasAllTokens` if the setting exists and is enabled.
## Testing Setup
### Enable Full Text Index
First, make sure you're running at least ClickHouse 25.12.
Then, update the ClickHouse `users.xml`'s default profile with the following (or otherwise update your user's profile):
```xml
<clickhouse>
<profiles>
<default>
...
<enable_full_text_index>1</enable_full_text_index>
</default>
</profiles>
...
</clickhouse>
```
### Add a Full Text Index
```sql
ALTER TABLE otel_logs ADD INDEX text_idx(Body)
TYPE text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))
SETTINGS enable_full_text_index=1;
ALTER TABLE otel_logs MATERIALIZE INDEX text_idx;
```
## Limitations
1. We currently only support the `splitByNonAlpha` tokenizer. If the text index is created with a different tokenizer, `hasAllTokens` will not be used. If needed, this limitation can be removed in the future by implementing `tokenizeTerm`, `termContainsSeparators`, and token batching logic specific to the other tokenizers.
2. This requires the latest (Beta) version of the full text index and related setting, available in ClickHouse v25.12.
This commit is contained in:
parent
9f51920b5d
commit
4a85617320
8 changed files with 799 additions and 8 deletions
7
.changeset/tough-swans-doubt.md
Normal file
7
.changeset/tough-swans-doubt.md
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
---
|
||||
"@hyperdx/common-utils": patch
|
||||
"@hyperdx/api": patch
|
||||
"@hyperdx/app": patch
|
||||
---
|
||||
|
||||
feat: Add hasAllTokens for text index support
|
||||
|
|
@ -522,4 +522,35 @@ describe('Metadata Integration Tests', () => {
|
|||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('getSetting', () => {
|
||||
let metadata: Metadata;
|
||||
beforeEach(async () => {
|
||||
metadata = new Metadata(hdxClient, new MetadataCache());
|
||||
});
|
||||
|
||||
it('should get setting that exists and is enabled', async () => {
|
||||
const settingValue = await metadata.getSetting({
|
||||
settingName: 'format_csv_allow_single_quotes',
|
||||
connectionId: 'test_connection',
|
||||
});
|
||||
expect(settingValue).toBe('0');
|
||||
});
|
||||
|
||||
it('should get setting that exists and is disabled', async () => {
|
||||
const settingValue = await metadata.getSetting({
|
||||
settingName: 'format_csv_allow_double_quotes',
|
||||
connectionId: 'test_connection',
|
||||
});
|
||||
expect(settingValue).toBe('1');
|
||||
});
|
||||
|
||||
it('should return undefined for setting that does not exist', async () => {
|
||||
const settingValue = await metadata.getSetting({
|
||||
settingName: 'enable_quantum_tunnelling',
|
||||
connectionId: 'test_connection',
|
||||
});
|
||||
expect(settingValue).toBeUndefined();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -767,6 +767,343 @@ describe('CustomSchemaSQLSerializerV2 - bloom_filter tokens() indices', () => {
|
|||
});
|
||||
});
|
||||
|
||||
// Verifies that the Lucene -> SQL serializer emits hasAllTokens() when a
// text-type skip index covers the implicit column AND enable_full_text_index
// is on, and that it falls back to hasToken()/LIKE otherwise.
describe('CustomSchemaSQLSerializerV2 - text indices', () => {
  const metadata = getMetadata(
    new ClickhouseClient({ host: 'http://localhost:8123' }),
  );

  const databaseName = 'default';
  const tableName = 'otel_logs';
  const connectionId = 'test';

  beforeEach(() => {
    // Mock getColumn to return Body as String column
    metadata.getColumn = jest.fn().mockImplementation(async ({ column }) => {
      if (column === 'Body') {
        return { name: 'Body', type: 'String' };
      } else if (column === 'ServiceName') {
        return { name: 'ServiceName', type: 'String' };
      }
      return undefined;
    });

    // Default: enable_full_text_index is enabled; individual tests override.
    metadata.getSetting = jest
      .fn()
      .mockImplementation(async ({ settingName }) => {
        if (settingName === 'enable_full_text_index') {
          return '1';
        }
        return undefined;
      });
  });

  it('should use hasAllTokens when text index exists', async () => {
    // NOTE(review): granularity is passed as the string '8' here although
    // SkipIndexMetadata types it as number — confirm the intended type.
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    expect(sql).toBe("((hasAllTokens(Body, 'foo')))");
  });

  it('should use hasAllTokens for multi-token terms with single call', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    // Quoted phrase: tokens fit in one hasAllTokens call, plus a LIKE to
    // enforce the exact (separator-preserving) phrase match.
    const builder = new SearchQueryBuilder('"foo bar"', serializer);
    const sql = await builder.build();

    expect(sql).toContain("hasAllTokens(Body, 'foo bar')");
    expect(sql).toContain("(lower(Body) LIKE lower('%foo bar%'))");
  });

  it('should fallback to hasToken when no text indexes are found', async () => {
    // Mock getSkipIndices to return empty
    metadata.getSkipIndices = jest.fn().mockResolvedValue([]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should use hasToken (existing behavior)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });

  it('should handle text index on a different column', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'OtherBody',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should fall back to hasToken (the text index covers OtherBody, not Body)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });

  it('should handle negated searches with hasAllTokens', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha, preprocessor=lower(Body))',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('-foo', serializer);
    const sql = await builder.build();

    // Should use NOT hasAllTokens
    expect(sql).toBe("((NOT hasAllTokens(Body, 'foo')))");
  });

  it('should not use text index for explicit field searches', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    // Query: 'ServiceName:foo'
    const builder = new SearchQueryBuilder('ServiceName:foo', serializer);
    const sql = await builder.build();

    // Should use ILIKE, not hasAll or hasToken
    expect(sql).toContain('ILIKE');
    expect(sql).not.toContain('hasAll');
    expect(sql).not.toContain('hasToken');
  });

  it('should batch tokens into groups to avoid hitting the hasAllTokens limit', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    // 60 tokens: exceeds the 50-token chunk size, forcing two batches.
    const builder = new SearchQueryBuilder(
      '"1 2 3 4 5 6 7 8 9 10; 11 12 13 14 15 16 17 18 19 20; 21 22 23 24 25 26 27 28 29 30; 31 32 33 34 35 36 37 38 39 40; 41 42 43 44 45 46 47 48 49 50; 51 52 53 54 55 56 57 58 59 60;"',
      serializer,
    );
    const sql = await builder.build();

    // Two AND-ed hasAllTokens batches, plus a LIKE for the exact phrase.
    expect(sql).toContain(
      "hasAllTokens(Body, '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50') AND hasAllTokens(Body, '51 52 53 54 55 56 57 58 59 60') AND (lower(Body) LIKE lower('%1 2 3 4 5 6 7 8 9 10; 11 12 13 14 15 16 17 18 19 20; 21 22 23 24 25 26 27 28 29 30; 31 32 33 34 35 36 37 38 39 40; 41 42 43 44 45 46 47 48 49 50; 51 52 53 54 55 56 57 58 59 60;%'))",
    );
  });

  it('should use hasAllTokens for multiple separate terms', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo NOT bar baz', serializer);
    const sql = await builder.build();

    // Should generate separate hasAllTokens for each term (not single statement)
    expect(sql).toContain("hasAllTokens(Body, 'foo')");
    expect(sql).toContain("NOT (hasAllTokens(Body, 'bar'))");
    expect(sql).toContain("hasAllTokens(Body, 'baz')");
  });

  it('should not use text index when enable_full_text_index is disabled', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    // Mock getSetting to disable full text index
    metadata.getSetting = jest
      .fn()
      .mockImplementation(async ({ settingName }) => {
        if (settingName === 'enable_full_text_index') {
          return '0';
        }
        return undefined;
      });

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should fallback to hasToken (full text index disabled)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });

  it('should not use text index when enable_full_text_index is unavailable (ClickHouse version is old)', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    // Setting missing entirely (pre-25.12 ClickHouse)
    metadata.getSetting = jest.fn().mockResolvedValue(undefined);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should fallback to hasToken (full text index disabled)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });

  it('should not use text index when getSetting throws an error', async () => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([
      {
        name: 'idx_body_text',
        type: 'text',
        typeFull: 'text(tokenizer=splitByNonAlpha)',
        expression: 'Body',
        granularity: '8',
      },
    ]);

    // Mock getSetting to fail outright
    metadata.getSetting = jest
      .fn()
      .mockRejectedValue(new Error('Failed to get setting'));

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    const builder = new SearchQueryBuilder('foo', serializer);
    const sql = await builder.build();

    // Should fallback to hasToken (full text index disabled)
    expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
  });
});
|
||||
|
||||
describe('CustomSchemaSQLSerializerV2 - indexCoversColumn', () => {
|
||||
const metadata = getMetadata(
|
||||
new ClickhouseClient({ host: 'http://localhost:8123' }),
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ describe('renderChartConfig', () => {
|
|||
.fn()
|
||||
.mockResolvedValue({ primary_key: 'timestamp' }),
|
||||
getSkipIndices: jest.fn().mockResolvedValue([]),
|
||||
getSetting: jest.fn().mockResolvedValue(undefined),
|
||||
} as unknown as jest.Mocked<Metadata>;
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -20,11 +20,13 @@ import {
|
|||
isTimestampExpressionInFirstOrderBy,
|
||||
joinQuerySettings,
|
||||
optimizeTimestampValueExpression,
|
||||
parseTokenizerFromTextIndex,
|
||||
parseToNumber,
|
||||
parseToStartOfFunction,
|
||||
replaceJsonExpressions,
|
||||
splitAndTrimCSV,
|
||||
splitAndTrimWithBracket,
|
||||
TextIndexTokenizer,
|
||||
} from '../core/utils';
|
||||
|
||||
describe('utils', () => {
|
||||
|
|
@ -1557,4 +1559,117 @@ describe('utils', () => {
|
|||
).toEqual("setting_name = 'Infinity'");
|
||||
});
|
||||
});
|
||||
// Table-driven coverage of parseTokenizerFromTextIndex across all supported
// tokenizers, default-argument forms, whitespace tolerance, and invalid input.
describe('parseTokenizerFromTextIndex', () => {
  it.each([
    // Bare `text` / `text()` carry no tokenizer information.
    {
      type: 'text',
      expected: undefined,
    },
    {
      type: 'text()',
      expected: undefined,
    },
    // Whitespace around the type, parens, and `=` is tolerated.
    {
      type: ' text ( tokenizer= array ) ',
      expected: { type: 'array' },
    },
    {
      type: 'text(tokenizer=splitByNonAlpha)',
      expected: { type: 'splitByNonAlpha' },
    },
    {
      type: 'text( tokenizer = splitByNonAlpha )',
      expected: { type: 'splitByNonAlpha' },
    },
    // splitByString with no array defaults to a single space separator.
    {
      type: 'text(tokenizer = splitByString())',
      expected: { type: 'splitByString', separators: [' '] },
    },
    // Escaped characters inside separator strings must be unescaped.
    {
      type: `text(tokenizer = splitByString([', ', '; ', '\\n', '" ', '\\\\', '\\t', '(', ')']))`,
      expected: {
        type: 'splitByString',
        separators: [', ', '; ', '\n', '" ', '\\', '\t', '(', ')'],
      },
    },
    // sparseGrams: positional args are minLength, maxLength, minCutoffLength;
    // defaults are 3 and 10 with no cutoff.
    {
      type: 'text(preprocessor=lower(s), tokenizer=sparseGrams(2, 5, 10))',
      expected: {
        type: 'sparseGrams',
        minLength: 2,
        maxLength: 5,
        minCutoffLength: 10,
      },
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer=sparseGrams(2, 5))',
      expected: {
        type: 'sparseGrams',
        minLength: 2,
        maxLength: 5,
        minCutoffLength: undefined,
      },
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer=sparseGrams(2))',
      expected: {
        type: 'sparseGrams',
        minLength: 2,
        maxLength: 10,
        minCutoffLength: undefined,
      },
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer=sparseGrams)',
      expected: {
        type: 'sparseGrams',
        minLength: 3,
        maxLength: 10,
        minCutoffLength: undefined,
      },
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer= sparseGrams ())',
      expected: {
        type: 'sparseGrams',
        minLength: 3,
        maxLength: 10,
        minCutoffLength: undefined,
      },
    },
    // Unrecognized tokenizers and malformed types yield undefined.
    {
      type: 'text(preprocessor=lower(s), tokenizer=unknown)',
      expected: undefined,
    },
    {
      type: '',
      expected: undefined,
    },
    {
      type: 'text(preprocessor=lower(s), tokenizer=array)',
      expected: { type: 'array' },
    },
    // ngrams defaults to n = 3 when no argument is given.
    {
      type: 'text(preprocessor=lower(s), tokenizer=ngrams)',
      expected: { type: 'ngrams', n: 3 },
    },
    {
      type: 'text(tokenizer=ngrams())',
      expected: { type: 'ngrams', n: 3 },
    },
    {
      type: 'text(tokenizer=ngrams(20))',
      expected: { type: 'ngrams', n: 20 },
    },
  ])('should correctly parse tokenizer from: $type', ({ type, expected }) => {
    const result = parseTokenizerFromTextIndex({
      type: 'text',
      typeFull: type,
      name: 'text_idx',
      expression: 'Body',
      granularity: 1000,
    });
    expect(result).toEqual(expected);
  });
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -103,6 +103,7 @@ export type TableMetadata = {
|
|||
export type SkipIndexMetadata = {
|
||||
name: string;
|
||||
type: string; // 'bloom_filter', 'tokenbf_v1', 'minmax', etc.
|
||||
typeFull: string; // e.g., 'text(tokenizer='splitByNonAlpha')'
|
||||
expression: string; // e.g., "tokens(lower(Body))"
|
||||
granularity: number;
|
||||
};
|
||||
|
|
@ -616,6 +617,48 @@ export class Metadata {
|
|||
return tableMetadata;
|
||||
}
|
||||
|
||||
/** Reads the value of the setting with the given name from system.settings. */
|
||||
async getSetting({
|
||||
settingName,
|
||||
connectionId,
|
||||
}: {
|
||||
settingName: string;
|
||||
connectionId: string;
|
||||
}) {
|
||||
return this.cache.getOrFetch(`${connectionId}.${settingName}`, async () => {
|
||||
const sql = chSql`
|
||||
SELECT name, value
|
||||
FROM system.settings
|
||||
WHERE name = ${{ String: settingName }}
|
||||
`;
|
||||
|
||||
try {
|
||||
const json = await this.clickhouseClient
|
||||
.query<'JSON'>({
|
||||
connectionId,
|
||||
query: sql.sql,
|
||||
query_params: sql.params,
|
||||
clickhouse_settings: this.getClickHouseSettings(),
|
||||
})
|
||||
.then(res => res.json<{ name: string; value: string }>());
|
||||
|
||||
if (json.data.length > 0) {
|
||||
return json.data[0].value;
|
||||
}
|
||||
|
||||
return undefined;
|
||||
} catch (e) {
|
||||
// Don't retry permissions errors, just silently return undefined
|
||||
if (e instanceof Error && e.message.includes('Not enough privileges')) {
|
||||
console.warn('Not enough privileges to fetch settings:', e);
|
||||
return undefined;
|
||||
}
|
||||
|
||||
throw e;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Queries system.data_skipping_indices to retrieve skip index metadata for a table.
|
||||
* Results are cached using MetadataCache.
|
||||
|
|
@ -639,6 +682,7 @@ export class Metadata {
|
|||
SELECT
|
||||
name,
|
||||
type,
|
||||
type_full as typeFull,
|
||||
expr as expression,
|
||||
granularity
|
||||
FROM system.data_skipping_indices
|
||||
|
|
|
|||
|
|
@ -19,6 +19,8 @@ import {
|
|||
TSourceUnion,
|
||||
} from '@/types';
|
||||
|
||||
import { SkipIndexMetadata } from './metadata';
|
||||
|
||||
/** The default maximum number of buckets setting when determining a bucket duration for 'auto' granularity */
|
||||
export const DEFAULT_AUTO_GRANULARITY_MAX_BUCKETS = 60;
|
||||
|
||||
|
|
@ -747,3 +749,181 @@ export function joinQuerySettings(
|
|||
|
||||
return formattedPairs.join(', ');
|
||||
}
|
||||
|
||||
// Discriminated union describing the tokenizer configuration declared by a
// ClickHouse text index (parsed from system.data_skipping_indices.type_full).
export type TextIndexTokenizer =
  // Tokenizer named `splitByNonAlpha` (no parameters).
  | { type: 'splitByNonAlpha' }
  // Tokenizer that splits on the given literal separator strings.
  | { type: 'splitByString'; separators: string[] }
  // Fixed-length n-gram tokenizer; `n` is the gram length.
  | { type: 'ngrams'; n: number }
  | {
      type: 'sparseGrams';
      // Positional args of sparseGrams(minLength, maxLength, minCutoffLength);
      // the cutoff is optional.
      minLength: number;
      maxLength: number;
      minCutoffLength?: number;
    }
  // Tokenizer named `array` (no parameters).
  | { type: 'array' };
|
||||
|
||||
/**
|
||||
* Parses the tokenizer and any associated tokenizer parameters from a text index type definition.
|
||||
*
|
||||
* Examples:
|
||||
* - `text(tokenizer = splitByNonAlpha)` -> `{ type: 'splitByNonAlpha' }`
|
||||
* - `text(tokenizer = splitByString([', ', '; ', '\n', '\\']))` -> `{ type: 'splitByString', separators: [', ', '; ', '\n', '\\'] }`
|
||||
* - `text(preprocessor=lower(s), tokenizer=sparseGrams(2, 5, 10))` -> `{ type: 'sparseGrams', minLength: 2, maxLength: 5, minCutoffLength: 10 }`
|
||||
*/
|
||||
export function parseTokenizerFromTextIndex({
|
||||
typeFull,
|
||||
}: SkipIndexMetadata): TextIndexTokenizer | undefined {
|
||||
const textPattern = /^\s*text\s*\((.+)\)\s*$/;
|
||||
const match = typeFull.match(textPattern);
|
||||
if (!match) {
|
||||
console.error(`Invalid text index type ${typeFull}.`);
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const argsString = match[1].trim();
|
||||
const args = splitAndTrimWithBracket(argsString).map(arg => {
|
||||
const [key, value] = arg.split('=').map(s => s.trim());
|
||||
return { key, value };
|
||||
});
|
||||
|
||||
const tokenizerArg = args.find(arg => arg.key === 'tokenizer')?.value;
|
||||
if (!tokenizerArg) {
|
||||
console.error(
|
||||
`Invalid tokenizer argument in index type ${typeFull}: ${tokenizerArg}`,
|
||||
argsString,
|
||||
splitAndTrimWithBracket(argsString),
|
||||
);
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const tokenizerName = tokenizerArg.split('(')[0].trim();
|
||||
const tokenizerArgsString = tokenizerArg
|
||||
.substring(tokenizerArg.indexOf('(') + 1, tokenizerArg.lastIndexOf(')'))
|
||||
.trim();
|
||||
|
||||
switch (tokenizerName) {
|
||||
case 'splitByNonAlpha':
|
||||
return { type: 'splitByNonAlpha' };
|
||||
|
||||
case 'array':
|
||||
return { type: 'array' };
|
||||
|
||||
case 'ngrams': {
|
||||
// Default n is 3
|
||||
if (!tokenizerArgsString) {
|
||||
return { type: 'ngrams', n: 3 };
|
||||
}
|
||||
|
||||
return { type: 'ngrams', n: Number.parseInt(tokenizerArgsString, 10) };
|
||||
}
|
||||
|
||||
case 'sparseGrams': {
|
||||
const args = tokenizerArgsString
|
||||
.split(',')
|
||||
.map(s => s.trim())
|
||||
.filter(s => !!s);
|
||||
|
||||
const tokenizer: TextIndexTokenizer = {
|
||||
type: 'sparseGrams',
|
||||
minLength: 3,
|
||||
maxLength: 10,
|
||||
};
|
||||
|
||||
if (args.length >= 1) tokenizer.minLength = Number.parseInt(args[0], 10);
|
||||
if (args.length >= 2) tokenizer.maxLength = Number.parseInt(args[1], 10);
|
||||
if (args.length >= 3)
|
||||
tokenizer.minCutoffLength = Number.parseInt(args[2], 10);
|
||||
|
||||
return tokenizer;
|
||||
}
|
||||
|
||||
case 'splitByString': {
|
||||
if (!tokenizerArgsString) {
|
||||
// Default separator is space
|
||||
return { type: 'splitByString', separators: [' '] };
|
||||
}
|
||||
|
||||
const unescape = (str: string) => {
|
||||
const escapeCharacters = [
|
||||
{ pattern: /\\a/g, replacement: 'a' },
|
||||
{ pattern: /\\b/g, replacement: 'b' },
|
||||
{ pattern: /\\e/g, replacement: 'e' },
|
||||
{ pattern: /\\f/g, replacement: '\f' },
|
||||
{ pattern: /\\n/g, replacement: '\n' },
|
||||
{ pattern: /\\r/g, replacement: '\r' },
|
||||
{ pattern: /\\t/g, replacement: '\t' },
|
||||
{ pattern: /\\v/g, replacement: '\v' },
|
||||
{ pattern: /\\0/g, replacement: '\0' },
|
||||
{ pattern: /\\\\/g, replacement: '\\' },
|
||||
{ pattern: /\\'/g, replacement: "'" },
|
||||
{ pattern: /\\"/g, replacement: '"' },
|
||||
{ pattern: /\\`/g, replacement: '`' },
|
||||
{ pattern: /\\\//g, replacement: '/' },
|
||||
{ pattern: /\\=/g, replacement: '=' },
|
||||
];
|
||||
|
||||
for (const { pattern, replacement } of escapeCharacters) {
|
||||
str = str.replace(pattern, replacement);
|
||||
}
|
||||
|
||||
return str;
|
||||
};
|
||||
|
||||
const separatorsString = tokenizerArgsString.match(/\[(.*)\]/);
|
||||
if (!separatorsString) {
|
||||
// If no array is provided, default to space
|
||||
return { type: 'splitByString', separators: [' '] };
|
||||
}
|
||||
|
||||
const arrayContent = separatorsString[1];
|
||||
|
||||
// Split by commas outside of quotes
|
||||
const separators: string[] = [];
|
||||
let current = '';
|
||||
let inQuote = false;
|
||||
let quoteChar = '';
|
||||
|
||||
for (let i = 0; i < arrayContent.length; i++) {
|
||||
const char = arrayContent[i];
|
||||
|
||||
if ((char === "'" || char === '"') && !inQuote) {
|
||||
inQuote = true;
|
||||
quoteChar = char;
|
||||
} else if (char === quoteChar && inQuote) {
|
||||
if (arrayContent[i - 1] !== '\\' || arrayContent[i - 2] === '\\') {
|
||||
inQuote = false;
|
||||
quoteChar = '';
|
||||
}
|
||||
} else if (char === ',' && !inQuote) {
|
||||
const trimmed = current.trim();
|
||||
if (trimmed) {
|
||||
// Remove quotes and unescape characters
|
||||
const value = trimmed.replace(/^['"]|['"]$/g, '');
|
||||
const unescapedValue = unescape(value);
|
||||
separators.push(unescapedValue);
|
||||
}
|
||||
|
||||
current = '';
|
||||
continue;
|
||||
}
|
||||
|
||||
current += char;
|
||||
}
|
||||
|
||||
// Add last separator
|
||||
const trimmed = current.trim();
|
||||
if (trimmed) {
|
||||
const value = trimmed.replace(/^['"]|['"]$/g, '');
|
||||
const unescapedValue = unescape(value);
|
||||
separators.push(unescapedValue);
|
||||
}
|
||||
|
||||
return { type: 'splitByString', separators };
|
||||
}
|
||||
|
||||
default:
|
||||
console.error(`Unknown tokenizer ${tokenizerName} in type ${typeFull}.`);
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,16 @@
|
|||
import lucene from '@hyperdx/lucene';
|
||||
import { chunk } from 'lodash';
|
||||
import SqlString from 'sqlstring';
|
||||
|
||||
import { convertCHTypeToPrimitiveJSType, JSDataType } from '@/clickhouse';
|
||||
import { Metadata, SkipIndexMetadata } from '@/core/metadata';
|
||||
import { splitAndTrimWithBracket } from '@/core/utils';
|
||||
import {
|
||||
parseTokenizerFromTextIndex,
|
||||
splitAndTrimWithBracket,
|
||||
} from '@/core/utils';
|
||||
|
||||
/** Max number of tokens to pass to hasAllTokens(), which supports up to 64 tokens as of ClickHouse v25.12. */
|
||||
const HAS_ALL_TOKENS_CHUNK_SIZE = 50;
|
||||
|
||||
function encodeSpecialTokens(query: string): string {
|
||||
return query
|
||||
|
|
@ -493,6 +500,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
private implicitColumnExpression?: string;
|
||||
private connectionId: string;
|
||||
private skipIndicesPromise?: Promise<SkipIndexMetadata[]>;
|
||||
private enableTextIndexPromise?: Promise<boolean>;
|
||||
|
||||
constructor({
|
||||
metadata,
|
||||
|
|
@ -519,6 +527,18 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
console.error('Error fetching skip indices:', error);
|
||||
return [];
|
||||
});
|
||||
|
||||
// Pre-fetch value of the enable_full_text_index setting
|
||||
this.enableTextIndexPromise = this.metadata
|
||||
.getSetting({
|
||||
settingName: 'enable_full_text_index',
|
||||
connectionId,
|
||||
})
|
||||
.then(value => value === '1')
|
||||
.catch(error => {
|
||||
console.error('Error fetching enable_full_text_index setting:', error);
|
||||
return false;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -583,15 +603,54 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
return SqlString.format(
|
||||
`(lower(?) ${isNegatedField ? 'NOT ' : ''}LIKE lower(?))`,
|
||||
[
|
||||
SqlString.raw(column ?? ''),
|
||||
SqlString.raw(column),
|
||||
`${prefixWildcard ? '%' : ''}${term}${suffixWildcard ? '%' : ''}`,
|
||||
],
|
||||
);
|
||||
} else if (shouldUseTokenBf) {
|
||||
const hasSeparators = this.termHasSeparators(term);
|
||||
// First check for a text index, and use it if possible
|
||||
// Note: We check that enable_full_text_index = 1, otherwise hasAllTokens() errors
|
||||
const isTextIndexEnabled = await this.enableTextIndexPromise;
|
||||
const textIndex = isTextIndexEnabled
|
||||
? await this.findTextIndex(column)
|
||||
: undefined;
|
||||
|
||||
if (textIndex) {
|
||||
const tokenizer = parseTokenizerFromTextIndex(textIndex);
|
||||
|
||||
// HDX-3259: Support other tokenizers by overriding tokenizeTerm, termHasSeparators, and batching logic
|
||||
if (tokenizer?.type === 'splitByNonAlpha') {
|
||||
const tokens = this.tokenizeTerm(term);
|
||||
const hasSeparators = this.termHasSeparators(term);
|
||||
|
||||
// Batch tokens to avoid exceeding hasAllTokens limit (64)
|
||||
const tokenBatches = chunk(tokens, HAS_ALL_TOKENS_CHUNK_SIZE);
|
||||
const hasAllTokensExpressions = tokenBatches.map(batch =>
|
||||
SqlString.format(`hasAllTokens(?, ?)`, [
|
||||
SqlString.raw(column),
|
||||
batch.join(' '),
|
||||
]),
|
||||
);
|
||||
|
||||
if (hasSeparators || tokenBatches.length > 1) {
|
||||
// Multi-token, or term containing token separators: hasAllTokens(..., 'foo bar') AND lower(...) LIKE '%foo bar%'
|
||||
return `(${isNegatedField ? 'NOT (' : ''}${[
|
||||
...hasAllTokensExpressions,
|
||||
SqlString.format(`(lower(?) LIKE lower(?))`, [
|
||||
SqlString.raw(column),
|
||||
`%${term}%`,
|
||||
]),
|
||||
].join(' AND ')}${isNegatedField ? ')' : ''})`;
|
||||
} else {
|
||||
// Single token, without token separators: hasAllTokens(..., 'term')
|
||||
return `(${isNegatedField ? 'NOT ' : ''}${hasAllTokensExpressions.join(' AND ')})`;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for bloom_filter tokens() index first
|
||||
const bloomIndex = await this.findBloomFilterTokensIndex(column ?? '');
|
||||
const hasSeparators = this.termHasSeparators(term);
|
||||
const bloomIndex = await this.findBloomFilterTokensIndex(column);
|
||||
|
||||
if (bloomIndex.found) {
|
||||
const indexHasLower = /\blower\s*\(/.test(bloomIndex.indexExpression);
|
||||
|
|
@ -607,7 +666,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
`hasAll(${bloomIndex.indexExpression}, ${termTokensExpression})`,
|
||||
// If there are token separators in the term, try to match the whole term as well
|
||||
SqlString.format(`(lower(?) LIKE lower(?))`, [
|
||||
SqlString.raw(column ?? ''),
|
||||
SqlString.raw(column),
|
||||
`%${term}%`,
|
||||
]),
|
||||
].join(' AND ')}${isNegatedField ? ')' : ''})`;
|
||||
|
|
@ -623,20 +682,20 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
return `(${isNegatedField ? 'NOT (' : ''}${[
|
||||
...tokens.map(token =>
|
||||
SqlString.format(`hasToken(lower(?), lower(?))`, [
|
||||
SqlString.raw(column ?? ''),
|
||||
SqlString.raw(column),
|
||||
token,
|
||||
]),
|
||||
),
|
||||
// If there are symbols in the term, try to match the whole term as well
|
||||
SqlString.format(`(lower(?) LIKE lower(?))`, [
|
||||
SqlString.raw(column ?? ''),
|
||||
SqlString.raw(column),
|
||||
`%${term}%`,
|
||||
]),
|
||||
].join(' AND ')}${isNegatedField ? ')' : ''})`;
|
||||
} else {
|
||||
return SqlString.format(
|
||||
`(${isNegatedField ? 'NOT ' : ''}hasToken(lower(?), lower(?)))`,
|
||||
[SqlString.raw(column ?? ''), term],
|
||||
[SqlString.raw(column), term],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -778,6 +837,23 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
// throw new Error(`Column not found: ${field}`);
|
||||
}
|
||||
|
||||
private async findTextIndex(
|
||||
columnExpression: string,
|
||||
): Promise<SkipIndexMetadata | undefined> {
|
||||
const skipIndices = await this.skipIndicesPromise;
|
||||
|
||||
if (!skipIndices || skipIndices.length === 0) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
// Note: Text index expressions should not be wrapped in tokens() or preprocessing functions like lower().
|
||||
return skipIndices.find(
|
||||
idx =>
|
||||
idx.type === 'text' &&
|
||||
this.indexCoversColumn(idx.expression, columnExpression),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds a bloom_filter skip index that uses tokens() on the given column expression.
|
||||
* Returns the full index expression if found, otherwise returns not found.
|
||||
|
|
|
|||
Loading…
Reference in a new issue