feat: Support field:(<term>...) Lucene searches (#1315)

# Summary

This PR updates HyperDX's lucene support to include parenthesized field searches of the form `<field>:(<term>...)`.

Prior to these changes, HyperDX would ignore the `<field>` entirely and search as if the query were just `<term>...`.

With these changes, the search is performed just like a `<term>...` search except:

1. The `field` is used for the search, instead of the implicit field expression (eg. `Body` for `otel_logs`)
2. The search is performed without `hasToken()`, as we assume that fields do not have bloom filters set up (matching the current behavior for how we search fields)

This support has the added benefit of unlocking multi-token substring searches (Ref HDX-1931)
- Previously, you could not search a field for a substring with multiple tokens. eg. `error.message:*Method not allowed*` is interpreted as 3 separate terms, and only `*Method` would be associated with `error.message`. `error.message:"Method not allowed"` and `error.message:"*Method not allowed*"` look for exact matches instead of substrings.
- Now, this can be accomplished with `error.message:("Method not allowed")`. This matches the current behavior of a search like `"Method not allowed"`, which would search the source's default implicit column (eg. `Body`) for the substring "Method not allowed".

## Testing

To test these changes, this PR adds a few dozen query parser unit test cases.
This commit is contained in:
Drew Davis 2025-11-04 18:39:58 -05:00 committed by GitHub
parent f612bf3c00
commit 6e628bcded
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 438 additions and 131 deletions

View file

@ -0,0 +1,5 @@
---
"@hyperdx/common-utils": patch
---
feat: Support field:(<term>...) Lucene searches

View file

@ -1,23 +1,30 @@
import { ClickhouseClient } from '@/clickhouse/node';
import { getMetadata } from '@/core/metadata';
import { CustomSchemaSQLSerializerV2 } from '@/queryParser';
import {
CustomSchemaSQLSerializerV2,
genEnglishExplanation,
SearchQueryBuilder,
} from '@/queryParser';
describe('CustomSchemaSQLSerializerV2 - json', () => {
function getTestTable(field) {
return { name: field, type: 'JSON' };
}
const metadata = getMetadata(
new ClickhouseClient({ host: 'http://localhost:8123' }),
);
// @ts-ignore
metadata.getColumn = ({ column }) => {
return new Promise((resolve, reject) => {
if (column.indexOf('.') >= 0) return resolve(undefined);
const testTable = getTestTable(column);
// @ts-ignore
return resolve(testTable);
});
};
metadata.getColumn = jest.fn().mockImplementation(async ({ column }) => {
if (column === 'ResourceAttributesJSON') {
return { name: 'ResourceAttributesJSON', type: 'JSON' };
} else if (column === 'LogAttributes') {
return { name: 'LogAttributes', type: 'Map' };
} else if (column === 'ServiceName') {
return { name: 'ServiceName', type: 'String' };
} else if (column === 'SeverityNumber') {
return { name: 'SeverityNumber', type: 'UInt8' };
} else if (column === 'foo') {
return { name: 'foo', type: 'String' };
} else {
return undefined;
}
});
const databaseName = 'testName';
const tableName = 'testTable';
const connectionId = 'testId';
@ -26,29 +33,30 @@ describe('CustomSchemaSQLSerializerV2 - json', () => {
databaseName,
tableName,
connectionId,
implicitColumnExpression: 'Body',
});
it('getColumnForField', async () => {
const field1 = 'serviceName.test';
const res1 = await serializer.getColumnForField(field1);
const field1 = 'ResourceAttributesJSON.test';
const res1 = await serializer.getColumnForField(field1, {});
expect(res1).toEqual({
column: '',
columnJSON: {
number:
"dynamicType(`serviceName`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`test`",
string: 'toString(`serviceName`.`test`)',
"dynamicType(`ResourceAttributesJSON`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`test`",
string: 'toString(`ResourceAttributesJSON`.`test`)',
},
found: true,
propertyType: 'json',
});
const field2 = 'logBody.test.nest';
const res2 = await serializer.getColumnForField(field2);
const field2 = 'ResourceAttributesJSON.test.nest';
const res2 = await serializer.getColumnForField(field2, {});
expect(res2).toEqual({
column: '',
columnJSON: {
number:
"dynamicType(`logBody`.`test`.`nest`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `logBody`.`test`.`nest`",
string: 'toString(`logBody`.`test`.`nest`)',
"dynamicType(`ResourceAttributesJSON`.`test`.`nest`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`test`.`nest`",
string: 'toString(`ResourceAttributesJSON`.`test`.`nest`)',
},
found: true,
propertyType: 'json',
@ -56,59 +64,263 @@ describe('CustomSchemaSQLSerializerV2 - json', () => {
});
it('compare - eq, isNotNull, gte, lte, lt, gt', async () => {
const eqField = 'serviceName.eq.test';
const eqField = 'ResourceAttributesJSON.eq.test';
const eqTerm = 'testTerm';
const eq1 = await serializer.eq(eqField, eqTerm, false);
expect(eq1).toBe("(toString(`serviceName`.`eq`.`test`) = 'testTerm')");
const eq2 = await serializer.eq(eqField, eqTerm, true);
expect(eq2).toBe("(toString(`serviceName`.`eq`.`test`) != 'testTerm')");
const eq1 = await serializer.eq(eqField, eqTerm, false, {});
expect(eq1).toBe(
"(toString(`ResourceAttributesJSON`.`eq`.`test`) = 'testTerm')",
);
const eq2 = await serializer.eq(eqField, eqTerm, true, {});
expect(eq2).toBe(
"(toString(`ResourceAttributesJSON`.`eq`.`test`) != 'testTerm')",
);
});
it('compare - isNotNull', async () => {
const isNotNullField = 'serviceName.isNotNull.test';
const isNotNull1 = await serializer.isNotNull(isNotNullField, false);
const isNotNullField = 'ResourceAttributesJSON.isNotNull.test';
const isNotNull1 = await serializer.isNotNull(isNotNullField, false, {});
expect(isNotNull1).toBe(
'notEmpty(toString(`serviceName`.`isNotNull`.`test`)) = 1',
'notEmpty(toString(`ResourceAttributesJSON`.`isNotNull`.`test`)) = 1',
);
const isNotNull2 = await serializer.isNotNull(isNotNullField, true);
const isNotNull2 = await serializer.isNotNull(isNotNullField, true, {});
expect(isNotNull2).toBe(
'notEmpty(toString(`serviceName`.`isNotNull`.`test`)) != 1',
'notEmpty(toString(`ResourceAttributesJSON`.`isNotNull`.`test`)) != 1',
);
});
it('compare - gte', async () => {
const gteField = 'serviceName.gte.test';
const gteField = 'ResourceAttributesJSON.gte.test';
const gteTerm = '30';
const gte = await serializer.gte(gteField, gteTerm);
const gte = await serializer.gte(gteField, gteTerm, {});
expect(gte).toBe(
"(dynamicType(`serviceName`.`gte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`gte`.`test` >= '30')",
"(dynamicType(`ResourceAttributesJSON`.`gte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`gte`.`test` >= '30')",
);
});
it('compare - lte', async () => {
const lteField = 'serviceName.lte.test';
const lteField = 'ResourceAttributesJSON.lte.test';
const lteTerm = '40';
const lte = await serializer.lte(lteField, lteTerm);
const lte = await serializer.lte(lteField, lteTerm, {});
expect(lte).toBe(
"(dynamicType(`serviceName`.`lte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`lte`.`test` <= '40')",
"(dynamicType(`ResourceAttributesJSON`.`lte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`lte`.`test` <= '40')",
);
});
it('compare - gt', async () => {
const gtField = 'serviceName.gt.test';
const gtField = 'ResourceAttributesJSON.gt.test';
const gtTerm = '70';
const gt = await serializer.gt(gtField, gtTerm);
const gt = await serializer.gt(gtField, gtTerm, {});
expect(gt).toBe(
"(dynamicType(`serviceName`.`gt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`gt`.`test` > '70')",
"(dynamicType(`ResourceAttributesJSON`.`gt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`gt`.`test` > '70')",
);
});
it('compare - lt', async () => {
const ltField = 'serviceName.lt.test';
const ltField = 'ResourceAttributesJSON.lt.test';
const ltTerm = '2';
const lt = await serializer.lt(ltField, ltTerm);
const lt = await serializer.lt(ltField, ltTerm, {});
expect(lt).toBe(
"(dynamicType(`serviceName`.`lt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`lt`.`test` < '2')",
"(dynamicType(`ResourceAttributesJSON`.`lt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`lt`.`test` < '2')",
);
});
const testCases = [
{
lucene: '"foo bar baz"',
sql: "((hasToken(lower(Body), lower('foo')) AND hasToken(lower(Body), lower('bar')) AND hasToken(lower(Body), lower('baz')) AND (lower(Body) LIKE lower('%foo bar baz%'))))",
english: 'event has whole word "foo bar baz"',
},
{
lucene: 'foo bar baz',
sql: "((hasToken(lower(Body), lower('foo'))) AND (hasToken(lower(Body), lower('bar'))) AND (hasToken(lower(Body), lower('baz'))))",
english:
'event has whole word foo AND event has whole word bar AND event has whole word baz',
},
{
lucene: 'ServiceName:foo bar baz',
sql: "((ServiceName ILIKE '%foo%') AND (hasToken(lower(Body), lower('bar'))) AND (hasToken(lower(Body), lower('baz'))))",
english:
"'ServiceName' contains foo AND event has whole word bar AND event has whole word baz",
},
{
lucene: 'ServiceName:"foo bar baz"',
sql: "((ServiceName = 'foo bar baz'))",
english: "'ServiceName' is foo bar baz",
},
{
lucene: 'ServiceName:("foo bar baz")',
sql: "(((ServiceName ILIKE '%foo bar baz%')))",
english: '(ServiceName contains "foo bar baz")',
},
{
lucene: 'ServiceName:(abc def)',
sql: "(((ServiceName ILIKE '%abc%') AND (ServiceName ILIKE '%def%')))",
english: '(ServiceName contains abc AND ServiceName contains def)',
},
{
lucene: '(abc def)',
sql: "(((hasToken(lower(Body), lower('abc'))) AND (hasToken(lower(Body), lower('def')))))",
english: '(event has whole word abc AND event has whole word def)',
},
{
lucene: '("abc def")',
sql: "(((hasToken(lower(Body), lower('abc')) AND hasToken(lower(Body), lower('def')) AND (lower(Body) LIKE lower('%abc def%')))))",
english: '(event has whole word "abc def")',
},
{
lucene: 'foo:bar',
sql: "((foo ILIKE '%bar%'))",
english: "'foo' contains bar",
},
{
lucene: '(foo:bar)',
sql: "(((foo ILIKE '%bar%')))",
english: "('foo' contains bar)",
},
{
lucene: 'bar',
sql: "((hasToken(lower(Body), lower('bar'))))",
english: 'event has whole word bar',
},
{
lucene: '(bar)',
sql: "(((hasToken(lower(Body), lower('bar')))))",
english: '(event has whole word bar)',
},
{
lucene: 'foo:(bar)',
sql: "(((foo ILIKE '%bar%')))",
english: '(foo contains bar)',
},
{
lucene: 'foo:(bar) baz',
sql: "(((foo ILIKE '%bar%')) AND (hasToken(lower(Body), lower('baz'))))",
english: '(foo contains bar) AND event has whole word baz',
},
{
lucene: 'LogAttributes.error.message:("Failed to fetch")',
sql: "(((`LogAttributes`['error.message'] ILIKE '%Failed to fetch%')))",
english: '(LogAttributes.error.message contains "Failed to fetch")',
},
{
lucene: 'ResourceAttributesJSON.error.message:("Failed to fetch")',
sql: "(((toString(`ResourceAttributesJSON`.`error`.`message`) ILIKE '%Failed to fetch%')))",
english:
'(ResourceAttributesJSON.error.message contains "Failed to fetch")',
},
{
lucene: 'SeverityNumber:>10',
sql: "((SeverityNumber > '10'))",
english: "'SeverityNumber' is greater than 10",
},
{
lucene: 'ResourceAttributesJSON.error.severity:>10',
sql: "((dynamicType(`ResourceAttributesJSON`.`error`.`severity`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`error`.`severity` > '10'))",
english: "'ResourceAttributesJSON.error.severity' is greater than 10",
},
{
lucene: 'foo:(bar baz)',
sql: "(((foo ILIKE '%bar%') AND (foo ILIKE '%baz%')))",
english: '(foo contains bar AND foo contains baz)',
},
{
lucene: '-foo:bar',
sql: "((foo NOT ILIKE '%bar%'))",
english: "'foo' does not contain bar",
},
{
lucene: 'NOT foo:(bar baz)',
sql: "(NOT ((foo ILIKE '%bar%') AND (foo ILIKE '%baz%')))",
english: 'NOT (foo contains bar AND foo contains baz)',
},
{
lucene: '-foo:(bar baz)',
sql: "(NOT ((foo ILIKE '%bar%') AND (foo ILIKE '%baz%')))",
english: 'NOT (foo contains bar AND foo contains baz)',
},
{
lucene: '-foo:(bar)',
sql: "(NOT ((foo ILIKE '%bar%')))",
english: 'NOT (foo contains bar)',
},
{
lucene: '-foo:(-bar)',
sql: "(NOT ((foo NOT ILIKE '%bar%')))",
english: 'NOT (foo does not contain bar)',
},
{
lucene: '*bar',
sql: "((lower(Body) LIKE lower('%bar')))",
english: 'event ends with bar',
},
{
lucene: 'foo:*bar',
sql: "((foo ILIKE '%bar%'))",
english: "'foo' contains bar",
},
{
lucene: 'foo:*bar*',
sql: "((foo ILIKE '%bar%'))",
english: "'foo' contains bar",
},
{
lucene: 'foo:(*bar)',
sql: "(((lower(foo) LIKE lower('%bar'))))",
english: '(foo ends with bar)',
},
{
lucene: 'foo:(bar*)',
sql: "(((lower(foo) LIKE lower('bar%'))))",
english: '(foo starts with bar)',
},
{
lucene: 'foo:(*bar*)',
sql: "(((lower(foo) LIKE lower('%bar%'))))",
english: '(foo contains bar)',
},
{
lucene: 'foo:[1 TO 5]',
sql: '((foo BETWEEN 1 AND 5))',
english: 'foo is between 1 and 5',
},
{
lucene: 'foo:(bar:(baz) qux)',
sql: "((((bar ILIKE '%baz%')) AND (foo ILIKE '%qux%')))",
english: '((bar contains baz) AND foo contains qux)',
},
];
it.each(testCases)(
'converts "$lucene" to SQL "$sql"',
async ({ lucene, sql }) => {
const builder = new SearchQueryBuilder(lucene, serializer);
const actualSql = await builder.build();
expect(actualSql).toBe(sql);
},
);
it.each(testCases)(
'converts "$lucene" to english "$english"',
async ({ lucene, english }) => {
const actualEnglish = await genEnglishExplanation(lucene);
expect(actualEnglish).toBe(english);
},
);
it('correctly searches multi-column implicit field', async () => {
const serializer = new CustomSchemaSQLSerializerV2({
metadata,
databaseName,
tableName,
connectionId,
implicitColumnExpression: 'Body, OtherColumn',
});
const lucene = 'foo bar';
const builder = new SearchQueryBuilder(lucene, serializer);
const actualSql = await builder.build();
const expectedSql =
"((hasToken(lower(concatWithSeparator(';',Body,OtherColumn)), lower('foo'))) AND (hasToken(lower(concatWithSeparator(';',Body,OtherColumn)), lower('bar'))))";
expect(actualSql).toBe(expectedSql);
});
});

View file

@ -46,33 +46,49 @@ const CLICK_HOUSE_JSON_NUMBER_TYPES = [
'Float64',
];
interface SerializerContext {
/** The current implicit column expression, indicating which SQL expression to use when comparing a term to the '<implicit>' field */
implicitColumnExpression?: string;
}
interface Serializer {
operator(op: lucene.Operator): string;
eq(field: string, term: string, isNegatedField: boolean): Promise<string>;
isNotNull(field: string, isNegatedField: boolean): Promise<string>;
gte(field: string, term: string): Promise<string>;
lte(field: string, term: string): Promise<string>;
lt(field: string, term: string): Promise<string>;
gt(field: string, term: string): Promise<string>;
operator(op: lucene.Operator, context: SerializerContext): string;
eq(
field: string,
term: string,
isNegatedField: boolean,
context: SerializerContext,
): Promise<string>;
isNotNull(
field: string,
isNegatedField: boolean,
context: SerializerContext,
): Promise<string>;
gte(field: string, term: string, context: SerializerContext): Promise<string>;
lte(field: string, term: string, context: SerializerContext): Promise<string>;
lt(field: string, term: string, context: SerializerContext): Promise<string>;
gt(field: string, term: string, context: SerializerContext): Promise<string>;
fieldSearch(
field: string,
term: string,
isNegatedField: boolean,
prefixWildcard: boolean,
suffixWildcard: boolean,
context: SerializerContext,
): Promise<string>;
range(
field: string,
start: string,
end: string,
isNegatedField: boolean,
context: SerializerContext,
): Promise<string>;
}
class EnglishSerializer implements Serializer {
private translateField(field: string) {
private translateField(field: string, context: SerializerContext) {
if (field === IMPLICIT_FIELD) {
return 'event';
return context.implicitColumnExpression ?? 'event';
}
return `'${field}'`;
@ -99,49 +115,56 @@ class EnglishSerializer implements Serializer {
}
}
async eq(field: string, term: string, isNegatedField: boolean) {
return `${this.translateField(field)} ${
async eq(
field: string,
term: string,
isNegatedField: boolean,
context: SerializerContext,
) {
return `${this.translateField(field, context)} ${
isNegatedField ? 'is not' : 'is'
} ${term}`;
}
async isNotNull(field: string, isNegatedField: boolean) {
return `${this.translateField(field)} ${
async isNotNull(
field: string,
isNegatedField: boolean,
context: SerializerContext,
) {
return `${this.translateField(field, context)} ${
isNegatedField ? 'is null' : 'is not null'
}`;
}
async gte(field: string, term: string) {
return `${this.translateField(field)} is greater than or equal to ${term}`;
async gte(field: string, term: string, context: SerializerContext) {
return `${this.translateField(field, context)} is greater than or equal to ${term}`;
}
async lte(field: string, term: string) {
return `${this.translateField(field)} is less than or equal to ${term}`;
async lte(field: string, term: string, context: SerializerContext) {
return `${this.translateField(field, context)} is less than or equal to ${term}`;
}
async lt(field: string, term: string) {
return `${this.translateField(field)} is less than ${term}`;
async lt(field: string, term: string, context: SerializerContext) {
return `${this.translateField(field, context)} is less than ${term}`;
}
async gt(field: string, term: string) {
return `${this.translateField(field)} is greater than ${term}`;
async gt(field: string, term: string, context: SerializerContext) {
return `${this.translateField(field, context)} is greater than ${term}`;
}
// async fieldSearch(field: string, term: string, isNegatedField: boolean) {
// return `${this.translateField(field)} ${
// isNegatedField ? 'does not contain' : 'contains'
// } ${term}`;
// }
async fieldSearch(
field: string,
term: string,
isNegatedField: boolean,
prefixWildcard: boolean,
suffixWildcard: boolean,
context: SerializerContext,
) {
const formattedTerm = term.trim().match(/\s/) ? `"${term}"` : term;
if (field === IMPLICIT_FIELD) {
return `${this.translateField(field)} ${
const isUsingTokenSearch = !context.implicitColumnExpression; // Source's implicit column has not been overridden
return `${this.translateField(field, context)} ${
prefixWildcard && suffixWildcard
? isNegatedField
? 'does not contain'
@ -154,14 +177,18 @@ class EnglishSerializer implements Serializer {
? isNegatedField
? 'does not start with'
: 'starts with'
: isNegatedField
? 'does not have whole word'
: 'has whole word'
} ${term}`;
: isUsingTokenSearch
? isNegatedField
? 'does not have whole word'
: 'has whole word'
: isNegatedField
? 'does not contain'
: 'contains'
} ${formattedTerm}`;
} else {
return `${this.translateField(field)} ${
return `${this.translateField(field, context)} ${
isNegatedField ? 'does not contain' : 'contains'
} ${term}`;
} ${formattedTerm}`;
}
}
@ -180,7 +207,10 @@ class EnglishSerializer implements Serializer {
export abstract class SQLSerializer implements Serializer {
private NOT_FOUND_QUERY = '(1 = 0)';
abstract getColumnForField(field: string): Promise<{
abstract getColumnForField(
field: string,
context: SerializerContext,
): Promise<{
column?: string;
columnJSON?: { string: string; number: string };
propertyType?: JSDataType;
@ -209,9 +239,14 @@ export abstract class SQLSerializer implements Serializer {
}
// Only for exact string matches
async eq(field: string, term: string, isNegatedField: boolean) {
async eq(
field: string,
term: string,
isNegatedField: boolean,
context: SerializerContext,
) {
const { column, columnJSON, found, propertyType } =
await this.getColumnForField(field);
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
@ -238,9 +273,13 @@ export abstract class SQLSerializer implements Serializer {
]);
}
async isNotNull(field: string, isNegatedField: boolean) {
async isNotNull(
field: string,
isNegatedField: boolean,
context: SerializerContext,
) {
const { column, columnJSON, found, propertyType } =
await this.getColumnForField(field);
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
@ -250,9 +289,9 @@ export abstract class SQLSerializer implements Serializer {
return `notEmpty(${column}) ${isNegatedField ? '!' : ''}= 1`;
}
async gte(field: string, term: string) {
async gte(field: string, term: string, context: SerializerContext) {
const { column, columnJSON, found, propertyType } =
await this.getColumnForField(field);
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
@ -262,9 +301,9 @@ export abstract class SQLSerializer implements Serializer {
return SqlString.format(`(${column} >= ?)`, [term]);
}
async lte(field: string, term: string) {
async lte(field: string, term: string, context: SerializerContext) {
const { column, columnJSON, found, propertyType } =
await this.getColumnForField(field);
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
@ -274,9 +313,9 @@ export abstract class SQLSerializer implements Serializer {
return SqlString.format(`(${column} <= ?)`, [term]);
}
async lt(field: string, term: string) {
async lt(field: string, term: string, context: SerializerContext) {
const { column, columnJSON, found, propertyType } =
await this.getColumnForField(field);
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
@ -286,9 +325,9 @@ export abstract class SQLSerializer implements Serializer {
return SqlString.format(`(${column} < ?)`, [term]);
}
async gt(field: string, term: string) {
async gt(field: string, term: string, context: SerializerContext) {
const { column, columnJSON, found, propertyType } =
await this.getColumnForField(field);
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
@ -323,10 +362,11 @@ export abstract class SQLSerializer implements Serializer {
isNegatedField: boolean,
prefixWildcard: boolean,
suffixWildcard: boolean,
context: SerializerContext,
) {
const isImplicitField = field === IMPLICIT_FIELD;
const { column, columnJSON, found, propertyType } =
await this.getColumnForField(field);
await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
@ -358,8 +398,13 @@ export abstract class SQLSerializer implements Serializer {
}
if (isImplicitField) {
// For implicit fields that come directly from the Source, we assume there is a bloom filter that can be used to
// optimize searches with hasToken. Overridden implicit columns (eg. "foo" in "foo:("bar baz")") are assumed
// to not have bloom filters.
const shouldUseTokenBf = !context.implicitColumnExpression;
// For the _source column, we'll try to do whole word searches by default
// to utilize the token bloom filter unless a prefix/sufix wildcard is specified
// to utilize the token bloom filter unless a prefix/suffix wildcard is specified
if (prefixWildcard || suffixWildcard) {
return SqlString.format(
`(lower(?) ${isNegatedField ? 'NOT ' : ''}LIKE lower(?))`,
@ -368,7 +413,7 @@ export abstract class SQLSerializer implements Serializer {
`${prefixWildcard ? '%' : ''}${term}${suffixWildcard ? '%' : ''}`,
],
);
} else {
} else if (shouldUseTokenBf) {
// TODO: Check case sensitivity of the index before lowering by default
// We can't search multiple tokens with `hasToken`, so we need to split up the term into tokens
const hasSeperators = this.termHasSeperators(term);
@ -394,13 +439,12 @@ export abstract class SQLSerializer implements Serializer {
);
}
}
} else {
const shoudUseTokenBf = isImplicitField;
return SqlString.format(
`(${column} ${isNegatedField ? 'NOT ' : ''}? ?)`,
[SqlString.raw(shoudUseTokenBf ? 'LIKE' : 'ILIKE'), `%${term}%`],
);
}
return SqlString.format(`(${column} ${isNegatedField ? 'NOT ' : ''}? ?)`, [
SqlString.raw('ILIKE'),
`%${term}%`,
]);
}
async range(
@ -408,8 +452,9 @@ export abstract class SQLSerializer implements Serializer {
start: string,
end: string,
isNegatedField: boolean,
context: SerializerContext,
) {
const { column, found } = await this.getColumnForField(field);
const { column, found } = await this.getColumnForField(field, context);
if (!found) {
return this.NOT_FOUND_QUERY;
}
@ -543,30 +588,38 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
// throw new Error(`Column not found: ${field}`);
}
async getColumnForField(field: string) {
if (field === IMPLICIT_FIELD) {
if (!this.implicitColumnExpression) {
throw new Error(
'Can not search bare text without an implicit column set.',
);
}
const expressions = splitAndTrimWithBracket(
this.implicitColumnExpression,
async getColumnForField(field: string, context: SerializerContext) {
const implicitColumnExpression =
context.implicitColumnExpression ?? this.implicitColumnExpression;
if (field === IMPLICIT_FIELD && !implicitColumnExpression) {
throw new Error(
'Can not search bare text without an implicit column set.',
);
}
const fieldFinal =
field === IMPLICIT_FIELD ? implicitColumnExpression! : field;
if (
field === IMPLICIT_FIELD &&
implicitColumnExpression === this.implicitColumnExpression // Source's implicit column has not been overridden
) {
// Sources can specify multi-column implicit columns, eg. Body and Message, in
// which case we search the combined string `concatWithSeparator(';', Body, Message)`.
const expressions = splitAndTrimWithBracket(fieldFinal);
return {
column:
expressions.length > 1
? `concatWithSeparator(';',${expressions.join(',')})`
: this.implicitColumnExpression,
: fieldFinal,
columnJSON: undefined,
propertyType: JSDataType.String,
found: true,
};
}
const expression = await this.buildColumnExpressionFromField(field);
const expression = await this.buildColumnExpressionFromField(fieldFinal);
return {
column: expression.columnExpression,
@ -581,6 +634,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
async function nodeTerm(
node: lucene.Node,
serializer: Serializer,
context: SerializerContext,
): Promise<string> {
const field = node.field[0] === '-' ? node.field.slice(1) : node.field;
let isNegatedField = node.field[0] === '-';
@ -605,36 +659,36 @@ async function nodeTerm(
// If the term is quoted, we should search for the exact term in a property (ex. foo:"bar")
// Implicit field searches should still use substring matching (ex. "foo bar")
if (nodeTerm.quoted && !isImplicitField) {
return serializer.eq(field, term, isNegatedField);
return serializer.eq(field, term, isNegatedField, context);
}
if (!nodeTerm.quoted && term === '*') {
return serializer.isNotNull(field, isNegatedField);
return serializer.isNotNull(field, isNegatedField, context);
}
if (!nodeTerm.quoted && term.substring(0, 2) === '>=') {
if (isNegatedField) {
return serializer.lt(field, term.slice(2));
return serializer.lt(field, term.slice(2), context);
}
return serializer.gte(field, term.slice(2));
return serializer.gte(field, term.slice(2), context);
}
if (!nodeTerm.quoted && term.substring(0, 2) === '<=') {
if (isNegatedField) {
return serializer.gt(field, term.slice(2));
return serializer.gt(field, term.slice(2), context);
}
return serializer.lte(field, term.slice(2));
return serializer.lte(field, term.slice(2), context);
}
if (!nodeTerm.quoted && term[0] === '>') {
if (isNegatedField) {
return serializer.lte(field, term.slice(1));
return serializer.lte(field, term.slice(1), context);
}
return serializer.gt(field, term.slice(1));
return serializer.gt(field, term.slice(1), context);
}
if (!nodeTerm.quoted && term[0] === '<') {
if (isNegatedField) {
return serializer.gte(field, term.slice(1));
return serializer.gte(field, term.slice(1), context);
}
return serializer.lt(field, term.slice(1));
return serializer.lt(field, term.slice(1), context);
}
let prefixWildcard = false;
@ -654,6 +708,7 @@ async function nodeTerm(
isNegatedField,
prefixWildcard,
suffixWildcard,
context,
);
// TODO: Handle regex, similarity, boost, prefix
@ -666,24 +721,50 @@ async function nodeTerm(
rangedTerm.term_min,
rangedTerm.term_max,
isNegatedField,
context,
);
}
throw new Error(`Unexpected Node type. ${node}`);
}
function createSerializerContext(
currentContext: SerializerContext,
ast: lucene.BinaryAST | lucene.LeftOnlyAST,
) {
// For syntax like `foo:(bar baz)` or `foo:("bar baz")`, the implicit field for the inner expression must be `foo`
if (ast.field && ast.parenthesized && ast.field !== IMPLICIT_FIELD) {
const fieldWithoutNegation = ast.field?.startsWith('-')
? ast.field.slice(1)
: ast.field;
return {
...currentContext,
implicitColumnExpression: fieldWithoutNegation,
};
} else {
return currentContext;
}
}
/** Returns true if the AST is of the form `-[field]:([terms...])` */
function isNegatedAndParenthesized(ast: lucene.BinaryAST | lucene.LeftOnlyAST) {
return ast.parenthesized && ast.field?.startsWith('-');
}
async function serialize(
ast: lucene.AST | lucene.Node,
serializer: Serializer,
context: SerializerContext,
): Promise<string> {
// Node Scenarios:
// 1. NodeTerm: Single term ex. "foo:bar"
// 2. NodeRangedTerm: Two terms ex. "foo:[bar TO qux]"
if ((ast as lucene.NodeTerm).term != null) {
return await nodeTerm(ast as lucene.NodeTerm, serializer);
return await nodeTerm(ast as lucene.NodeTerm, serializer, context);
}
if ((ast as lucene.NodeRangedTerm).inclusive != null) {
return await nodeTerm(ast as lucene.NodeTerm, serializer);
return await nodeTerm(ast as lucene.NodeTerm, serializer, context);
}
// AST Scenarios:
@ -691,25 +772,34 @@ async function serialize(
// 2. LeftOnlyAST: Single term ex. "foo:bar"
if ((ast as lucene.BinaryAST).right != null) {
const binaryAST = ast as lucene.BinaryAST;
const operator = serializer.operator(binaryAST.operator);
const operator = serializer.operator(binaryAST.operator, context);
const parenthesized = binaryAST.parenthesized;
return `${parenthesized ? '(' : ''}${await serialize(
const newContext = createSerializerContext(context, binaryAST);
const serialized = `${isNegatedAndParenthesized(binaryAST) ? 'NOT ' : ''}${parenthesized ? '(' : ''}${await serialize(
binaryAST.left,
serializer,
)} ${operator} ${await serialize(binaryAST.right, serializer)}${
newContext,
)} ${operator} ${await serialize(binaryAST.right, serializer, newContext)}${
parenthesized ? ')' : ''
}`;
return serialized;
}
if ((ast as lucene.LeftOnlyAST).left != null) {
const leftOnlyAST = ast as lucene.LeftOnlyAST;
const parenthesized = leftOnlyAST.parenthesized;
const newContext = createSerializerContext(context, leftOnlyAST);
// start is used when ex. "NOT foo:bar"
return `${parenthesized ? '(' : ''}${
const serialized = `${isNegatedAndParenthesized(leftOnlyAST) ? 'NOT ' : ''}${parenthesized ? '(' : ''}${
leftOnlyAST.start != undefined ? `${leftOnlyAST.start} ` : ''
}${await serialize(leftOnlyAST.left, serializer)}${
}${await serialize(leftOnlyAST.left, serializer, newContext)}${
parenthesized ? ')' : ''
}`;
return serialized;
}
// Blank AST, means no text was parsed
@ -721,7 +811,7 @@ export async function genWhereSQL(
ast: lucene.AST,
serializer: Serializer,
): Promise<string> {
return await serialize(ast, serializer);
return await serialize(ast, serializer, {});
}
export class SearchQueryBuilder {
@ -793,7 +883,7 @@ export async function genEnglishExplanation(query: string): Promise<string> {
if (parsedQ) {
const serializer = new EnglishSerializer();
return await serialize(parsedQ, serializer);
return await serialize(parsedQ, serializer, {});
}
} catch (e) {
console.warn('Parse failure', query, e);