feat: force usage of the map key index with lucene rendered queries (#1606)

We have indices like `INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1` that track whether a key likely exists for one of the maps in a granule. ClickHouse actually rarely uses this, mostly just for strict equality queries. We have many more scenarios where this is useful. 

This PR adds an expression to the rendered SQL for many Lucene queries. For a full list of scenarios, check the added test cases.

The condition added is `indexHint(mapContains(LogAttributes, 'key'))`. This should never change the outcome of a query because `indexHint` always returns `true`, it just hints to the planner that an index can be used. `mapContains` is the specific condition that tells ClickHouse.

Closes HDX-3070
This commit is contained in:
Aaron Knudtson 2026-01-15 11:28:51 -05:00 committed by GitHub
parent acefcbeda7
commit f20fac30b1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 296 additions and 41 deletions

View file

@ -0,0 +1,5 @@
---
"@hyperdx/common-utils": patch
---
feat: force usage of the map key index with lucene rendered queries

View file

@ -21,10 +21,19 @@ describe('CustomSchemaSQLSerializerV2 - json', () => {
return { name: 'SeverityNumber', type: 'UInt8' };
} else if (column === 'foo') {
return { name: 'foo', type: 'String' };
} else if (column === 'MaterializedExample') {
return { name: 'MaterializedExample', type: 'String' };
} else {
return undefined;
}
});
metadata.getMaterializedColumnsLookupTable = jest
.fn()
.mockImplementation(async () => {
return new Map([
["LogAttributes['materialized.example']", 'MaterializedExample'],
]);
});
const databaseName = 'testName';
const tableName = 'testTable';
const connectionId = 'testId';
@ -199,7 +208,7 @@ describe('CustomSchemaSQLSerializerV2 - json', () => {
},
{
lucene: 'LogAttributes.error.message:("Failed to fetch")',
sql: "(((`LogAttributes`['error.message'] ILIKE '%Failed to fetch%')))",
sql: "(((`LogAttributes`['error.message'] ILIKE '%Failed to fetch%' AND indexHint(mapContains(`LogAttributes`, 'error.message')))))",
english: '(LogAttributes.error.message contains "Failed to fetch")',
},
{
@ -288,6 +297,121 @@ describe('CustomSchemaSQLSerializerV2 - json', () => {
sql: "((((bar ILIKE '%baz%')) AND (foo ILIKE '%qux%')))",
english: '((bar contains baz) AND foo contains qux)',
},
// indexHint related cases
{
// We can probably trust CH to use the map keys index, but let's be explicit anyways
lucene: 'LogAttributes.error.message:"Failed to fetch"',
sql: "((`LogAttributes`['error.message'] = 'Failed to fetch' AND indexHint(mapContains(`LogAttributes`, 'error.message'))))",
english: "'LogAttributes.error.message' is Failed to fetch",
},
{
// Can't really use the map keys index
lucene: '-LogAttributes.error.message:"Failed to fetch"',
sql: "((`LogAttributes`['error.message'] != 'Failed to fetch'))",
english: "'LogAttributes.error.message' is not Failed to fetch",
},
{
// Explicitly hint to CH to use the map keys index
lucene: 'LogAttributes.error.message:Failed',
sql: "((`LogAttributes`['error.message'] ILIKE '%Failed%' AND indexHint(mapContains(`LogAttributes`, 'error.message'))))",
english: "'LogAttributes.error.message' contains Failed",
},
{
// Can't really use the map keys index
lucene: '-LogAttributes.error.message:Failed',
sql: "((`LogAttributes`['error.message'] NOT ILIKE '%Failed%'))",
english: "'LogAttributes.error.message' does not contain Failed",
},
{
// Explicitly hint to CH to use the map keys index
lucene: 'LogAttributes.error.message:*',
sql: "(notEmpty(`LogAttributes`['error.message']) = 1 AND indexHint(mapContains(`LogAttributes`, 'error.message')))",
english: "'LogAttributes.error.message' is not null",
},
{
// Can't really use the map keys index
lucene: '-LogAttributes.error.message:*',
sql: "(notEmpty(`LogAttributes`['error.message']) != 1)",
english: "'LogAttributes.error.message' is null",
},
{
// Explicitly hint to CH to use the map keys index for the non-materialized entry
lucene: 'MaterializedExample:"foo"',
sql: "((MaterializedExample = 'foo' AND indexHint(mapContains(`LogAttributes`, 'materialized.example'))))",
english: "'MaterializedExample' is foo",
},
{
// Can't really use the map keys index
lucene: '-MaterializedExample:"foo"',
sql: "((MaterializedExample != 'foo'))",
english: "'MaterializedExample' is not foo",
},
{
// Explicitly hint to CH to use the map keys index for the non-materialized entry
lucene: 'MaterializedExample:foo',
sql: "((MaterializedExample ILIKE '%foo%' AND indexHint(mapContains(`LogAttributes`, 'materialized.example'))))",
english: "'MaterializedExample' contains foo",
},
{
// Can't really use the map keys index
lucene: '-MaterializedExample:foo',
sql: "((MaterializedExample NOT ILIKE '%foo%'))",
english: "'MaterializedExample' does not contain foo",
},
{
// Explicitly hint to CH to use the map keys index
lucene: 'LogAttributes.materialized.example:foo',
sql: "((`LogAttributes`['materialized.example'] ILIKE '%foo%' AND indexHint(mapContains(`LogAttributes`, 'materialized.example'))))",
english: "'LogAttributes.materialized.example' contains foo",
},
{
// Explicitly hint to CH to use the map keys index
lucene: 'LogAttributes.example.number:>1',
sql: "((`LogAttributes`['example.number'] > '1' AND indexHint(mapContains(`LogAttributes`, 'example.number'))))",
english: "'LogAttributes.example.number' is greater than 1",
},
{
// Explicitly hint to CH to use the map keys index
lucene: 'LogAttributes.example.number:>=1',
sql: "((`LogAttributes`['example.number'] >= '1' AND indexHint(mapContains(`LogAttributes`, 'example.number'))))",
english: "'LogAttributes.example.number' is greater than or equal to 1",
},
{
// Explicitly hint to CH to use the map keys index
lucene: 'LogAttributes.example.number:<1',
sql: "((`LogAttributes`['example.number'] < '1' AND indexHint(mapContains(`LogAttributes`, 'example.number'))))",
english: "'LogAttributes.example.number' is less than 1",
},
{
// Explicitly hint to CH to use the map keys index
lucene: 'LogAttributes.example.number:<=1',
sql: "((`LogAttributes`['example.number'] <= '1' AND indexHint(mapContains(`LogAttributes`, 'example.number'))))",
english: "'LogAttributes.example.number' is less than or equal to 1",
},
{
// Explicitly hint to CH to use the map keys index
lucene: 'LogAttributes.example.number:[1 TO 5]',
sql: "((`LogAttributes`['example.number'] BETWEEN 1 AND 5 AND indexHint(mapContains(`LogAttributes`, 'example.number'))))",
english: 'LogAttributes.example.number is between 1 and 5',
},
{
// Can't really use the map keys index
lucene: '-LogAttributes.example.number:[1 TO 5]',
sql: "((`LogAttributes`['example.number'] NOT BETWEEN 1 AND 5))",
english: 'LogAttributes.example.number is not between 1 and 5',
},
{
// Explicitly hint to CH to use the map keys index
lucene: 'LogAttributes.error.message:("A B")',
sql: "(((`LogAttributes`['error.message'] ILIKE '%A B%' AND indexHint(mapContains(`LogAttributes`, 'error.message')))))",
english: '(LogAttributes.error.message contains "A B")',
},
{
// Can't really use the index here
lucene: '-LogAttributes.error.message:("A B")',
sql: "(NOT ((`LogAttributes`['error.message'] ILIKE '%A B%')))",
english: 'NOT (LogAttributes.error.message contains "A B")',
},
];
it.each(testCases)(

View file

@ -27,6 +27,21 @@ export function parse(query: string): lucene.AST {
return lucene.parse(encodeSpecialTokens(query));
}
function buildMapContains(mapField: string) {
const splitMapKey = (
field: string,
): { map: string; key: string } | undefined => {
const bracketIndex = field.indexOf("['");
if (bracketIndex === -1) return undefined;
const map = field.slice(0, bracketIndex);
const key = field.slice(bracketIndex + 2, -2);
return { map, key };
};
const val = splitMapKey(mapField);
if (!val) return undefined;
return SqlString.format('mapContains(??, ?)', [val.map, val.key]);
}
const IMPLICIT_FIELD = '<implicit>';
// Type guards for lucene AST types
@ -72,6 +87,7 @@ const CLICK_HOUSE_JSON_NUMBER_TYPES = [
interface SerializerContext {
/** The current implicit column expression, indicating which SQL expression to use when comparing a term to the '<implicit>' field */
implicitColumnExpression?: string;
isNegatedAndParenthesized?: boolean;
}
interface Serializer {
@ -238,6 +254,7 @@ export abstract class SQLSerializer implements Serializer {
columnJSON?: { string: string; number: string };
propertyType?: JSDataType;
found: boolean;
mapKeyIndexExpression?: string;
}>;
operator(op: lucene.Operator) {
@ -268,32 +285,44 @@ export abstract class SQLSerializer implements Serializer {
isNegatedField: boolean,
context: SerializerContext,
) {
const { column, columnJSON, found, propertyType } =
const { column, columnJSON, found, propertyType, mapKeyIndexExpression } =
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
const expressionPostfix =
mapKeyIndexExpression && !isNegatedField
? ` AND ${mapKeyIndexExpression}`
: '';
if (propertyType === JSDataType.Bool) {
// numeric and boolean fields must be equality matched
const normTerm = `${term}`.trim().toLowerCase();
return SqlString.format(`(?? ${isNegatedField ? '!' : ''}= ?)`, [
column,
normTerm === 'true' ? 1 : normTerm === 'false' ? 0 : parseInt(normTerm),
]);
return SqlString.format(
`(?? ${isNegatedField ? '!' : ''}= ?${expressionPostfix})`,
[
column,
normTerm === 'true'
? 1
: normTerm === 'false'
? 0
: parseInt(normTerm),
],
);
} else if (propertyType === JSDataType.Number) {
return SqlString.format(
`(${column} ${isNegatedField ? '!' : ''}= CAST(?, 'Float64'))`,
`(${column} ${isNegatedField ? '!' : ''}= CAST(?, 'Float64')${expressionPostfix})`,
[term],
);
} else if (propertyType === JSDataType.JSON) {
return SqlString.format(
`(${columnJSON?.string} ${isNegatedField ? '!' : ''}= ?)`,
`(${columnJSON?.string} ${isNegatedField ? '!' : ''}= ?${expressionPostfix})`,
[term],
);
}
return SqlString.format(`(${column} ${isNegatedField ? '!' : ''}= ?)`, [
term,
]);
return SqlString.format(
`(${column} ${isNegatedField ? '!' : ''}= ?${expressionPostfix})`,
[term],
);
}
async isNotNull(
@ -301,63 +330,91 @@ export abstract class SQLSerializer implements Serializer {
isNegatedField: boolean,
context: SerializerContext,
) {
const { column, columnJSON, found, propertyType } =
const { column, columnJSON, found, propertyType, mapKeyIndexExpression } =
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
const expressionPostfix =
mapKeyIndexExpression && !isNegatedField
? ` AND ${mapKeyIndexExpression}`
: '';
if (propertyType === JSDataType.JSON) {
return `notEmpty(${columnJSON?.string}) ${isNegatedField ? '!' : ''}= 1`;
return `notEmpty(${columnJSON?.string}) ${isNegatedField ? '!' : ''}= 1${expressionPostfix}`;
}
return `notEmpty(${column}) ${isNegatedField ? '!' : ''}= 1`;
return `notEmpty(${column}) ${isNegatedField ? '!' : ''}= 1${expressionPostfix}`;
}
async gte(field: string, term: string, context: SerializerContext) {
const { column, columnJSON, found, propertyType } =
const { column, columnJSON, found, propertyType, mapKeyIndexExpression } =
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
const expressionPostfix = mapKeyIndexExpression
? ` AND ${mapKeyIndexExpression}`
: '';
if (propertyType === JSDataType.JSON) {
return SqlString.format(`(${columnJSON?.number} >= ?)`, [term]);
return SqlString.format(
`(${columnJSON?.number} >= ?${expressionPostfix})`,
[term],
);
}
return SqlString.format(`(${column} >= ?)`, [term]);
return SqlString.format(`(${column} >= ?${expressionPostfix})`, [term]);
}
async lte(field: string, term: string, context: SerializerContext) {
const { column, columnJSON, found, propertyType } =
const { column, columnJSON, found, propertyType, mapKeyIndexExpression } =
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
const expressionPostfix = mapKeyIndexExpression
? ` AND ${mapKeyIndexExpression}`
: '';
if (propertyType === JSDataType.JSON) {
return SqlString.format(`(${columnJSON?.number} <= ?)`, [term]);
return SqlString.format(
`(${columnJSON?.number} <= ?${expressionPostfix})`,
[term],
);
}
return SqlString.format(`(${column} <= ?)`, [term]);
return SqlString.format(`(${column} <= ?${expressionPostfix})`, [term]);
}
async lt(field: string, term: string, context: SerializerContext) {
const { column, columnJSON, found, propertyType } =
const { column, columnJSON, found, propertyType, mapKeyIndexExpression } =
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
const expressionPostfix = mapKeyIndexExpression
? ` AND ${mapKeyIndexExpression}`
: '';
if (propertyType === JSDataType.JSON) {
return SqlString.format(`(${columnJSON?.number} < ?)`, [term]);
return SqlString.format(
`(${columnJSON?.number} < ?${expressionPostfix})`,
[term],
);
}
return SqlString.format(`(${column} < ?)`, [term]);
return SqlString.format(`(${column} < ?${expressionPostfix})`, [term]);
}
async gt(field: string, term: string, context: SerializerContext) {
const { column, columnJSON, found, propertyType } =
const { column, columnJSON, found, propertyType, mapKeyIndexExpression } =
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
const expressionPostfix = mapKeyIndexExpression
? ` AND ${mapKeyIndexExpression}`
: '';
if (propertyType === JSDataType.JSON) {
return SqlString.format(`(${columnJSON?.number} > ?)`, [term]);
return SqlString.format(
`(${columnJSON?.number} > ?${expressionPostfix})`,
[term],
);
}
return SqlString.format(`(${column} > ?)`, [term]);
return SqlString.format(`(${column} > ?${expressionPostfix})`, [term]);
}
// TODO: Not sure if SQL really needs this or if it'll coerce itself
@ -388,28 +445,41 @@ export abstract class SQLSerializer implements Serializer {
context: SerializerContext,
) {
const isImplicitField = field === IMPLICIT_FIELD;
const { column, columnJSON, found, propertyType } =
const { column, columnJSON, found, propertyType, mapKeyIndexExpression } =
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
const expressionPostfix =
mapKeyIndexExpression &&
!isNegatedField &&
(!isImplicitField || !context.isNegatedAndParenthesized)
? ` AND ${mapKeyIndexExpression}`
: '';
// If it's a string field, we will always try to match with ilike
if (propertyType === JSDataType.Bool) {
// numeric and boolean fields must be equality matched
const normTerm = `${term}`.trim().toLowerCase();
return SqlString.format(`(?? ${isNegatedField ? '!' : ''}= ?)`, [
column,
normTerm === 'true' ? 1 : normTerm === 'false' ? 0 : parseInt(normTerm),
]);
return SqlString.format(
`(?? ${isNegatedField ? '!' : ''}= ?${expressionPostfix})`,
[
column,
normTerm === 'true'
? 1
: normTerm === 'false'
? 0
: parseInt(normTerm),
],
);
} else if (propertyType === JSDataType.Number) {
return SqlString.format(
`(?? ${isNegatedField ? '!' : ''}= CAST(?, 'Float64'))`,
`(?? ${isNegatedField ? '!' : ''}= CAST(?, 'Float64')${expressionPostfix})`,
[column, term],
);
} else if (propertyType === JSDataType.JSON) {
return SqlString.format(
`(${columnJSON?.string} ${isNegatedField ? 'NOT ' : ''}ILIKE ?)`,
`(${columnJSON?.string} ${isNegatedField ? 'NOT ' : ''}ILIKE ?${expressionPostfix})`,
[`%${term}%`],
);
}
@ -464,10 +534,10 @@ export abstract class SQLSerializer implements Serializer {
}
}
return SqlString.format(`(${column} ${isNegatedField ? 'NOT ' : ''}? ?)`, [
SqlString.raw('ILIKE'),
`%${term}%`,
]);
return SqlString.format(
`(${column} ${isNegatedField ? 'NOT ' : ''}? ?${expressionPostfix})`,
[SqlString.raw('ILIKE'), `%${term}%`],
);
}
async range(
@ -477,17 +547,33 @@ export abstract class SQLSerializer implements Serializer {
isNegatedField: boolean,
context: SerializerContext,
) {
const { column, found } = await this.getColumnForField(field, context);
const { column, found, mapKeyIndexExpression } =
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
const expressionPostfix =
mapKeyIndexExpression && !isNegatedField
? ` AND ${mapKeyIndexExpression}`
: '';
return SqlString.format(
`(${column} ${isNegatedField ? 'NOT ' : ''}BETWEEN ? AND ?)`,
`(${column} ${isNegatedField ? 'NOT ' : ''}BETWEEN ? AND ?${expressionPostfix})`,
[this.attemptToParseNumber(start), this.attemptToParseNumber(end)],
);
}
}
type CustomSchemaSQLColumnExpression = {
found: boolean;
columnType: string;
columnExpression: string;
columnExpressionJSON?: {
string: string;
number: string;
};
mapKeyIndexExpression?: string;
};
export type CustomSchemaConfig = {
databaseName: string;
implicitColumnExpression?: string;
@ -527,7 +613,9 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
* - Nested Map
* - JSONExtract for non-string types
*/
private async buildColumnExpressionFromField(field: string) {
private async buildColumnExpressionFromField(
field: string,
): Promise<CustomSchemaSQLColumnExpression> {
const exactMatch = await this.metadata.getColumn({
databaseName: this.databaseName,
tableName: this.tableName,
@ -536,13 +624,46 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
});
if (exactMatch) {
return {
const columnExpression: CustomSchemaSQLColumnExpression = {
found: true,
columnType: exactMatch.type,
columnExpression: exactMatch.name,
// TODO
// Add JSON excatMatch if want to support whole json compare in future, ex: json:"{a: 1234}""
};
let materializedColumns: Map<string, string>;
try {
// This won't work for CTEs
materializedColumns =
await this.metadata.getMaterializedColumnsLookupTable({
databaseName: this.databaseName,
tableName: this.tableName,
connectionId: this.connectionId,
});
} catch (e) {
console.debug('Error in getMaterializedColumnsLookupTable', e);
materializedColumns = new Map();
}
const materializedColumn = (() => {
for (const [
materializedTarget,
materializedName,
] of materializedColumns.entries()) {
if (materializedName === field) {
return { materializedTarget, materializedName };
}
}
return undefined;
})();
if (materializedColumn) {
const mapContainsStatement = buildMapContains(
materializedColumn.materializedTarget,
);
if (mapContainsStatement) {
columnExpression.mapKeyIndexExpression = `indexHint(${mapContainsStatement})`;
}
}
return columnExpression;
}
const fieldPrefix = field.split('.')[0];
@ -564,6 +685,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
prefixMatch.name,
fieldPostfix,
]),
mapKeyIndexExpression: `indexHint(${buildMapContains(`${fieldPrefix}['${fieldPostfix}']`)})`,
columnType: valueType ?? 'Unknown',
};
} else if (prefixMatch.type.startsWith('JSON')) {
@ -650,6 +772,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
propertyType:
convertCHTypeToPrimitiveJSType(expression.columnType) ?? undefined,
found: expression.found,
mapKeyIndexExpression: expression.mapKeyIndexExpression,
};
}
}
@ -764,6 +887,9 @@ function createSerializerContext(
return {
...currentContext,
implicitColumnExpression: fieldWithoutNegation,
...(isNegatedAndParenthesized(ast)
? { isNegatedAndParenthesized: true }
: {}),
};
} else {
return currentContext;