mirror of
https://github.com/hyperdxio/hyperdx
synced 2026-04-21 13:37:15 +00:00
feat: Support field:(<term>...) Lucene searches (#1315)
# Summary
This PR updates HyperDX's lucene support to include parenthesized field searches of the form `<field>:(<term>...)`.
Prior to these changes, HyperDX would ignore the `<field>` entirely and search as if the query were just `<term>...`.
With these changes, the search is performed just like a `<term>...` search except:
1. The `field` is used for the search, instead of the implicit field expression (eg. `Body` for `otel_logs`)
2. The search is performed without `hasToken()`, as we assume that fields do not have bloom filters set up (matching the current behavior for how we search fields)
This support has the added benefit of unlocking multi-token substring searches (Ref HDX-1931)
- Previously, you could not search a field for a substring with multiple tokens, eg `error.message:*Method not allowed*` is interpreted as 3 separate terms, and only `*Method` would be associated with `error.message`. `error.message:"Method not allowed"` and `error.message:"*Method not allowed*"` look for exact matches, instead of substrings.
- Now, this can be accomplished with `error.message:("Method not allowed")`. This matches the current behavior of a search like `"Method not allowed"`, which would search the source's default implicit column (eg. `Body`) for the substring "Method not allowed".
## Testing
To test these changes, this PR adds a few dozen query parser unit test cases.
This commit is contained in:
parent
f612bf3c00
commit
6e628bcded
3 changed files with 438 additions and 131 deletions
5
.changeset/hungry-ways-rush.md
Normal file
5
.changeset/hungry-ways-rush.md
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
"@hyperdx/common-utils": patch
|
||||
---
|
||||
|
||||
feat: Support field:(<term>...) Lucene searches
|
||||
|
|
@ -1,23 +1,30 @@
|
|||
import { ClickhouseClient } from '@/clickhouse/node';
|
||||
import { getMetadata } from '@/core/metadata';
|
||||
import { CustomSchemaSQLSerializerV2 } from '@/queryParser';
|
||||
import {
|
||||
CustomSchemaSQLSerializerV2,
|
||||
genEnglishExplanation,
|
||||
SearchQueryBuilder,
|
||||
} from '@/queryParser';
|
||||
|
||||
describe('CustomSchemaSQLSerializerV2 - json', () => {
|
||||
function getTestTable(field) {
|
||||
return { name: field, type: 'JSON' };
|
||||
}
|
||||
const metadata = getMetadata(
|
||||
new ClickhouseClient({ host: 'http://localhost:8123' }),
|
||||
);
|
||||
// @ts-ignore
|
||||
metadata.getColumn = ({ column }) => {
|
||||
return new Promise((resolve, reject) => {
|
||||
if (column.indexOf('.') >= 0) return resolve(undefined);
|
||||
const testTable = getTestTable(column);
|
||||
// @ts-ignore
|
||||
return resolve(testTable);
|
||||
});
|
||||
};
|
||||
metadata.getColumn = jest.fn().mockImplementation(async ({ column }) => {
|
||||
if (column === 'ResourceAttributesJSON') {
|
||||
return { name: 'ResourceAttributesJSON', type: 'JSON' };
|
||||
} else if (column === 'LogAttributes') {
|
||||
return { name: 'LogAttributes', type: 'Map' };
|
||||
} else if (column === 'ServiceName') {
|
||||
return { name: 'ServiceName', type: 'String' };
|
||||
} else if (column === 'SeverityNumber') {
|
||||
return { name: 'SeverityNumber', type: 'UInt8' };
|
||||
} else if (column === 'foo') {
|
||||
return { name: 'foo', type: 'String' };
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
});
|
||||
const databaseName = 'testName';
|
||||
const tableName = 'testTable';
|
||||
const connectionId = 'testId';
|
||||
|
|
@ -26,29 +33,30 @@ describe('CustomSchemaSQLSerializerV2 - json', () => {
|
|||
databaseName,
|
||||
tableName,
|
||||
connectionId,
|
||||
implicitColumnExpression: 'Body',
|
||||
});
|
||||
|
||||
it('getColumnForField', async () => {
|
||||
const field1 = 'serviceName.test';
|
||||
const res1 = await serializer.getColumnForField(field1);
|
||||
const field1 = 'ResourceAttributesJSON.test';
|
||||
const res1 = await serializer.getColumnForField(field1, {});
|
||||
expect(res1).toEqual({
|
||||
column: '',
|
||||
columnJSON: {
|
||||
number:
|
||||
"dynamicType(`serviceName`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`test`",
|
||||
string: 'toString(`serviceName`.`test`)',
|
||||
"dynamicType(`ResourceAttributesJSON`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`test`",
|
||||
string: 'toString(`ResourceAttributesJSON`.`test`)',
|
||||
},
|
||||
found: true,
|
||||
propertyType: 'json',
|
||||
});
|
||||
const field2 = 'logBody.test.nest';
|
||||
const res2 = await serializer.getColumnForField(field2);
|
||||
const field2 = 'ResourceAttributesJSON.test.nest';
|
||||
const res2 = await serializer.getColumnForField(field2, {});
|
||||
expect(res2).toEqual({
|
||||
column: '',
|
||||
columnJSON: {
|
||||
number:
|
||||
"dynamicType(`logBody`.`test`.`nest`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `logBody`.`test`.`nest`",
|
||||
string: 'toString(`logBody`.`test`.`nest`)',
|
||||
"dynamicType(`ResourceAttributesJSON`.`test`.`nest`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`test`.`nest`",
|
||||
string: 'toString(`ResourceAttributesJSON`.`test`.`nest`)',
|
||||
},
|
||||
found: true,
|
||||
propertyType: 'json',
|
||||
|
|
@ -56,59 +64,263 @@ describe('CustomSchemaSQLSerializerV2 - json', () => {
|
|||
});
|
||||
|
||||
it('compare - eq, isNotNull, gte, lte, lt, gt', async () => {
|
||||
const eqField = 'serviceName.eq.test';
|
||||
const eqField = 'ResourceAttributesJSON.eq.test';
|
||||
const eqTerm = 'testTerm';
|
||||
const eq1 = await serializer.eq(eqField, eqTerm, false);
|
||||
expect(eq1).toBe("(toString(`serviceName`.`eq`.`test`) = 'testTerm')");
|
||||
const eq2 = await serializer.eq(eqField, eqTerm, true);
|
||||
expect(eq2).toBe("(toString(`serviceName`.`eq`.`test`) != 'testTerm')");
|
||||
const eq1 = await serializer.eq(eqField, eqTerm, false, {});
|
||||
expect(eq1).toBe(
|
||||
"(toString(`ResourceAttributesJSON`.`eq`.`test`) = 'testTerm')",
|
||||
);
|
||||
const eq2 = await serializer.eq(eqField, eqTerm, true, {});
|
||||
expect(eq2).toBe(
|
||||
"(toString(`ResourceAttributesJSON`.`eq`.`test`) != 'testTerm')",
|
||||
);
|
||||
});
|
||||
|
||||
it('compare - isNotNull', async () => {
|
||||
const isNotNullField = 'serviceName.isNotNull.test';
|
||||
const isNotNull1 = await serializer.isNotNull(isNotNullField, false);
|
||||
const isNotNullField = 'ResourceAttributesJSON.isNotNull.test';
|
||||
const isNotNull1 = await serializer.isNotNull(isNotNullField, false, {});
|
||||
expect(isNotNull1).toBe(
|
||||
'notEmpty(toString(`serviceName`.`isNotNull`.`test`)) = 1',
|
||||
'notEmpty(toString(`ResourceAttributesJSON`.`isNotNull`.`test`)) = 1',
|
||||
);
|
||||
const isNotNull2 = await serializer.isNotNull(isNotNullField, true);
|
||||
const isNotNull2 = await serializer.isNotNull(isNotNullField, true, {});
|
||||
expect(isNotNull2).toBe(
|
||||
'notEmpty(toString(`serviceName`.`isNotNull`.`test`)) != 1',
|
||||
'notEmpty(toString(`ResourceAttributesJSON`.`isNotNull`.`test`)) != 1',
|
||||
);
|
||||
});
|
||||
|
||||
it('compare - gte', async () => {
|
||||
const gteField = 'serviceName.gte.test';
|
||||
const gteField = 'ResourceAttributesJSON.gte.test';
|
||||
const gteTerm = '30';
|
||||
const gte = await serializer.gte(gteField, gteTerm);
|
||||
const gte = await serializer.gte(gteField, gteTerm, {});
|
||||
expect(gte).toBe(
|
||||
"(dynamicType(`serviceName`.`gte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`gte`.`test` >= '30')",
|
||||
"(dynamicType(`ResourceAttributesJSON`.`gte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`gte`.`test` >= '30')",
|
||||
);
|
||||
});
|
||||
|
||||
it('compare - lte', async () => {
|
||||
const lteField = 'serviceName.lte.test';
|
||||
const lteField = 'ResourceAttributesJSON.lte.test';
|
||||
const lteTerm = '40';
|
||||
const lte = await serializer.lte(lteField, lteTerm);
|
||||
const lte = await serializer.lte(lteField, lteTerm, {});
|
||||
expect(lte).toBe(
|
||||
"(dynamicType(`serviceName`.`lte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`lte`.`test` <= '40')",
|
||||
"(dynamicType(`ResourceAttributesJSON`.`lte`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`lte`.`test` <= '40')",
|
||||
);
|
||||
});
|
||||
|
||||
it('compare - gt', async () => {
|
||||
const gtField = 'serviceName.gt.test';
|
||||
const gtField = 'ResourceAttributesJSON.gt.test';
|
||||
const gtTerm = '70';
|
||||
const gt = await serializer.gt(gtField, gtTerm);
|
||||
const gt = await serializer.gt(gtField, gtTerm, {});
|
||||
expect(gt).toBe(
|
||||
"(dynamicType(`serviceName`.`gt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`gt`.`test` > '70')",
|
||||
"(dynamicType(`ResourceAttributesJSON`.`gt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`gt`.`test` > '70')",
|
||||
);
|
||||
});
|
||||
|
||||
it('compare - lt', async () => {
|
||||
const ltField = 'serviceName.lt.test';
|
||||
const ltField = 'ResourceAttributesJSON.lt.test';
|
||||
const ltTerm = '2';
|
||||
const lt = await serializer.lt(ltField, ltTerm);
|
||||
const lt = await serializer.lt(ltField, ltTerm, {});
|
||||
expect(lt).toBe(
|
||||
"(dynamicType(`serviceName`.`lt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `serviceName`.`lt`.`test` < '2')",
|
||||
"(dynamicType(`ResourceAttributesJSON`.`lt`.`test`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`lt`.`test` < '2')",
|
||||
);
|
||||
});
|
||||
|
||||
const testCases = [
|
||||
{
|
||||
lucene: '"foo bar baz"',
|
||||
sql: "((hasToken(lower(Body), lower('foo')) AND hasToken(lower(Body), lower('bar')) AND hasToken(lower(Body), lower('baz')) AND (lower(Body) LIKE lower('%foo bar baz%'))))",
|
||||
english: 'event has whole word "foo bar baz"',
|
||||
},
|
||||
{
|
||||
lucene: 'foo bar baz',
|
||||
sql: "((hasToken(lower(Body), lower('foo'))) AND (hasToken(lower(Body), lower('bar'))) AND (hasToken(lower(Body), lower('baz'))))",
|
||||
english:
|
||||
'event has whole word foo AND event has whole word bar AND event has whole word baz',
|
||||
},
|
||||
{
|
||||
lucene: 'ServiceName:foo bar baz',
|
||||
sql: "((ServiceName ILIKE '%foo%') AND (hasToken(lower(Body), lower('bar'))) AND (hasToken(lower(Body), lower('baz'))))",
|
||||
english:
|
||||
"'ServiceName' contains foo AND event has whole word bar AND event has whole word baz",
|
||||
},
|
||||
{
|
||||
lucene: 'ServiceName:"foo bar baz"',
|
||||
sql: "((ServiceName = 'foo bar baz'))",
|
||||
english: "'ServiceName' is foo bar baz",
|
||||
},
|
||||
{
|
||||
lucene: 'ServiceName:("foo bar baz")',
|
||||
sql: "(((ServiceName ILIKE '%foo bar baz%')))",
|
||||
english: '(ServiceName contains "foo bar baz")',
|
||||
},
|
||||
{
|
||||
lucene: 'ServiceName:(abc def)',
|
||||
sql: "(((ServiceName ILIKE '%abc%') AND (ServiceName ILIKE '%def%')))",
|
||||
english: '(ServiceName contains abc AND ServiceName contains def)',
|
||||
},
|
||||
{
|
||||
lucene: '(abc def)',
|
||||
sql: "(((hasToken(lower(Body), lower('abc'))) AND (hasToken(lower(Body), lower('def')))))",
|
||||
english: '(event has whole word abc AND event has whole word def)',
|
||||
},
|
||||
{
|
||||
lucene: '("abc def")',
|
||||
sql: "(((hasToken(lower(Body), lower('abc')) AND hasToken(lower(Body), lower('def')) AND (lower(Body) LIKE lower('%abc def%')))))",
|
||||
english: '(event has whole word "abc def")',
|
||||
},
|
||||
{
|
||||
lucene: 'foo:bar',
|
||||
sql: "((foo ILIKE '%bar%'))",
|
||||
english: "'foo' contains bar",
|
||||
},
|
||||
{
|
||||
lucene: '(foo:bar)',
|
||||
sql: "(((foo ILIKE '%bar%')))",
|
||||
english: "('foo' contains bar)",
|
||||
},
|
||||
{
|
||||
lucene: 'bar',
|
||||
sql: "((hasToken(lower(Body), lower('bar'))))",
|
||||
english: 'event has whole word bar',
|
||||
},
|
||||
{
|
||||
lucene: '(bar)',
|
||||
sql: "(((hasToken(lower(Body), lower('bar')))))",
|
||||
english: '(event has whole word bar)',
|
||||
},
|
||||
{
|
||||
lucene: 'foo:(bar)',
|
||||
sql: "(((foo ILIKE '%bar%')))",
|
||||
english: '(foo contains bar)',
|
||||
},
|
||||
{
|
||||
lucene: 'foo:(bar) baz',
|
||||
sql: "(((foo ILIKE '%bar%')) AND (hasToken(lower(Body), lower('baz'))))",
|
||||
english: '(foo contains bar) AND event has whole word baz',
|
||||
},
|
||||
{
|
||||
lucene: 'LogAttributes.error.message:("Failed to fetch")',
|
||||
sql: "(((`LogAttributes`['error.message'] ILIKE '%Failed to fetch%')))",
|
||||
english: '(LogAttributes.error.message contains "Failed to fetch")',
|
||||
},
|
||||
{
|
||||
lucene: 'ResourceAttributesJSON.error.message:("Failed to fetch")',
|
||||
sql: "(((toString(`ResourceAttributesJSON`.`error`.`message`) ILIKE '%Failed to fetch%')))",
|
||||
english:
|
||||
'(ResourceAttributesJSON.error.message contains "Failed to fetch")',
|
||||
},
|
||||
{
|
||||
lucene: 'SeverityNumber:>10',
|
||||
sql: "((SeverityNumber > '10'))",
|
||||
english: "'SeverityNumber' is greater than 10",
|
||||
},
|
||||
{
|
||||
lucene: 'ResourceAttributesJSON.error.severity:>10',
|
||||
sql: "((dynamicType(`ResourceAttributesJSON`.`error`.`severity`) in ('Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', 'Float32', 'Float64') and `ResourceAttributesJSON`.`error`.`severity` > '10'))",
|
||||
english: "'ResourceAttributesJSON.error.severity' is greater than 10",
|
||||
},
|
||||
{
|
||||
lucene: 'foo:(bar baz)',
|
||||
sql: "(((foo ILIKE '%bar%') AND (foo ILIKE '%baz%')))",
|
||||
english: '(foo contains bar AND foo contains baz)',
|
||||
},
|
||||
{
|
||||
lucene: '-foo:bar',
|
||||
sql: "((foo NOT ILIKE '%bar%'))",
|
||||
english: "'foo' does not contain bar",
|
||||
},
|
||||
{
|
||||
lucene: 'NOT foo:(bar baz)',
|
||||
sql: "(NOT ((foo ILIKE '%bar%') AND (foo ILIKE '%baz%')))",
|
||||
english: 'NOT (foo contains bar AND foo contains baz)',
|
||||
},
|
||||
{
|
||||
lucene: '-foo:(bar baz)',
|
||||
sql: "(NOT ((foo ILIKE '%bar%') AND (foo ILIKE '%baz%')))",
|
||||
english: 'NOT (foo contains bar AND foo contains baz)',
|
||||
},
|
||||
{
|
||||
lucene: '-foo:(bar)',
|
||||
sql: "(NOT ((foo ILIKE '%bar%')))",
|
||||
english: 'NOT (foo contains bar)',
|
||||
},
|
||||
{
|
||||
lucene: '-foo:(-bar)',
|
||||
sql: "(NOT ((foo NOT ILIKE '%bar%')))",
|
||||
english: 'NOT (foo does not contain bar)',
|
||||
},
|
||||
{
|
||||
lucene: '*bar',
|
||||
sql: "((lower(Body) LIKE lower('%bar')))",
|
||||
english: 'event ends with bar',
|
||||
},
|
||||
{
|
||||
lucene: 'foo:*bar',
|
||||
sql: "((foo ILIKE '%bar%'))",
|
||||
english: "'foo' contains bar",
|
||||
},
|
||||
{
|
||||
lucene: 'foo:*bar*',
|
||||
sql: "((foo ILIKE '%bar%'))",
|
||||
english: "'foo' contains bar",
|
||||
},
|
||||
{
|
||||
lucene: 'foo:(*bar)',
|
||||
sql: "(((lower(foo) LIKE lower('%bar'))))",
|
||||
english: '(foo ends with bar)',
|
||||
},
|
||||
{
|
||||
lucene: 'foo:(bar*)',
|
||||
sql: "(((lower(foo) LIKE lower('bar%'))))",
|
||||
english: '(foo starts with bar)',
|
||||
},
|
||||
{
|
||||
lucene: 'foo:(*bar*)',
|
||||
sql: "(((lower(foo) LIKE lower('%bar%'))))",
|
||||
english: '(foo contains bar)',
|
||||
},
|
||||
{
|
||||
lucene: 'foo:[1 TO 5]',
|
||||
sql: '((foo BETWEEN 1 AND 5))',
|
||||
english: 'foo is between 1 and 5',
|
||||
},
|
||||
{
|
||||
lucene: 'foo:(bar:(baz) qux)',
|
||||
sql: "((((bar ILIKE '%baz%')) AND (foo ILIKE '%qux%')))",
|
||||
english: '((bar contains baz) AND foo contains qux)',
|
||||
},
|
||||
];
|
||||
|
||||
it.each(testCases)(
|
||||
'converts "$lucene" to SQL "$sql"',
|
||||
async ({ lucene, sql }) => {
|
||||
const builder = new SearchQueryBuilder(lucene, serializer);
|
||||
const actualSql = await builder.build();
|
||||
expect(actualSql).toBe(sql);
|
||||
},
|
||||
);
|
||||
|
||||
it.each(testCases)(
|
||||
'converts "$lucene" to english "$english"',
|
||||
async ({ lucene, english }) => {
|
||||
const actualEnglish = await genEnglishExplanation(lucene);
|
||||
expect(actualEnglish).toBe(english);
|
||||
},
|
||||
);
|
||||
|
||||
it('correctly searches multi-column implicit field', async () => {
|
||||
const serializer = new CustomSchemaSQLSerializerV2({
|
||||
metadata,
|
||||
databaseName,
|
||||
tableName,
|
||||
connectionId,
|
||||
implicitColumnExpression: 'Body, OtherColumn',
|
||||
});
|
||||
|
||||
const lucene = 'foo bar';
|
||||
const builder = new SearchQueryBuilder(lucene, serializer);
|
||||
const actualSql = await builder.build();
|
||||
const expectedSql =
|
||||
"((hasToken(lower(concatWithSeparator(';',Body,OtherColumn)), lower('foo'))) AND (hasToken(lower(concatWithSeparator(';',Body,OtherColumn)), lower('bar'))))";
|
||||
expect(actualSql).toBe(expectedSql);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -46,33 +46,49 @@ const CLICK_HOUSE_JSON_NUMBER_TYPES = [
|
|||
'Float64',
|
||||
];
|
||||
|
||||
interface SerializerContext {
|
||||
/** The current implicit column expression, indicating which SQL expression to use when comparing a term to the '<implicit>' field */
|
||||
implicitColumnExpression?: string;
|
||||
}
|
||||
|
||||
interface Serializer {
|
||||
operator(op: lucene.Operator): string;
|
||||
eq(field: string, term: string, isNegatedField: boolean): Promise<string>;
|
||||
isNotNull(field: string, isNegatedField: boolean): Promise<string>;
|
||||
gte(field: string, term: string): Promise<string>;
|
||||
lte(field: string, term: string): Promise<string>;
|
||||
lt(field: string, term: string): Promise<string>;
|
||||
gt(field: string, term: string): Promise<string>;
|
||||
operator(op: lucene.Operator, context: SerializerContext): string;
|
||||
eq(
|
||||
field: string,
|
||||
term: string,
|
||||
isNegatedField: boolean,
|
||||
context: SerializerContext,
|
||||
): Promise<string>;
|
||||
isNotNull(
|
||||
field: string,
|
||||
isNegatedField: boolean,
|
||||
context: SerializerContext,
|
||||
): Promise<string>;
|
||||
gte(field: string, term: string, context: SerializerContext): Promise<string>;
|
||||
lte(field: string, term: string, context: SerializerContext): Promise<string>;
|
||||
lt(field: string, term: string, context: SerializerContext): Promise<string>;
|
||||
gt(field: string, term: string, context: SerializerContext): Promise<string>;
|
||||
fieldSearch(
|
||||
field: string,
|
||||
term: string,
|
||||
isNegatedField: boolean,
|
||||
prefixWildcard: boolean,
|
||||
suffixWildcard: boolean,
|
||||
context: SerializerContext,
|
||||
): Promise<string>;
|
||||
range(
|
||||
field: string,
|
||||
start: string,
|
||||
end: string,
|
||||
isNegatedField: boolean,
|
||||
context: SerializerContext,
|
||||
): Promise<string>;
|
||||
}
|
||||
|
||||
class EnglishSerializer implements Serializer {
|
||||
private translateField(field: string) {
|
||||
private translateField(field: string, context: SerializerContext) {
|
||||
if (field === IMPLICIT_FIELD) {
|
||||
return 'event';
|
||||
return context.implicitColumnExpression ?? 'event';
|
||||
}
|
||||
|
||||
return `'${field}'`;
|
||||
|
|
@ -99,49 +115,56 @@ class EnglishSerializer implements Serializer {
|
|||
}
|
||||
}
|
||||
|
||||
async eq(field: string, term: string, isNegatedField: boolean) {
|
||||
return `${this.translateField(field)} ${
|
||||
async eq(
|
||||
field: string,
|
||||
term: string,
|
||||
isNegatedField: boolean,
|
||||
context: SerializerContext,
|
||||
) {
|
||||
return `${this.translateField(field, context)} ${
|
||||
isNegatedField ? 'is not' : 'is'
|
||||
} ${term}`;
|
||||
}
|
||||
|
||||
async isNotNull(field: string, isNegatedField: boolean) {
|
||||
return `${this.translateField(field)} ${
|
||||
async isNotNull(
|
||||
field: string,
|
||||
isNegatedField: boolean,
|
||||
context: SerializerContext,
|
||||
) {
|
||||
return `${this.translateField(field, context)} ${
|
||||
isNegatedField ? 'is null' : 'is not null'
|
||||
}`;
|
||||
}
|
||||
|
||||
async gte(field: string, term: string) {
|
||||
return `${this.translateField(field)} is greater than or equal to ${term}`;
|
||||
async gte(field: string, term: string, context: SerializerContext) {
|
||||
return `${this.translateField(field, context)} is greater than or equal to ${term}`;
|
||||
}
|
||||
|
||||
async lte(field: string, term: string) {
|
||||
return `${this.translateField(field)} is less than or equal to ${term}`;
|
||||
async lte(field: string, term: string, context: SerializerContext) {
|
||||
return `${this.translateField(field, context)} is less than or equal to ${term}`;
|
||||
}
|
||||
|
||||
async lt(field: string, term: string) {
|
||||
return `${this.translateField(field)} is less than ${term}`;
|
||||
async lt(field: string, term: string, context: SerializerContext) {
|
||||
return `${this.translateField(field, context)} is less than ${term}`;
|
||||
}
|
||||
|
||||
async gt(field: string, term: string) {
|
||||
return `${this.translateField(field)} is greater than ${term}`;
|
||||
async gt(field: string, term: string, context: SerializerContext) {
|
||||
return `${this.translateField(field, context)} is greater than ${term}`;
|
||||
}
|
||||
|
||||
// async fieldSearch(field: string, term: string, isNegatedField: boolean) {
|
||||
// return `${this.translateField(field)} ${
|
||||
// isNegatedField ? 'does not contain' : 'contains'
|
||||
// } ${term}`;
|
||||
// }
|
||||
|
||||
async fieldSearch(
|
||||
field: string,
|
||||
term: string,
|
||||
isNegatedField: boolean,
|
||||
prefixWildcard: boolean,
|
||||
suffixWildcard: boolean,
|
||||
context: SerializerContext,
|
||||
) {
|
||||
const formattedTerm = term.trim().match(/\s/) ? `"${term}"` : term;
|
||||
|
||||
if (field === IMPLICIT_FIELD) {
|
||||
return `${this.translateField(field)} ${
|
||||
const isUsingTokenSearch = !context.implicitColumnExpression; // Source's implicit column has not been overridden
|
||||
return `${this.translateField(field, context)} ${
|
||||
prefixWildcard && suffixWildcard
|
||||
? isNegatedField
|
||||
? 'does not contain'
|
||||
|
|
@ -154,14 +177,18 @@ class EnglishSerializer implements Serializer {
|
|||
? isNegatedField
|
||||
? 'does not start with'
|
||||
: 'starts with'
|
||||
: isNegatedField
|
||||
? 'does not have whole word'
|
||||
: 'has whole word'
|
||||
} ${term}`;
|
||||
: isUsingTokenSearch
|
||||
? isNegatedField
|
||||
? 'does not have whole word'
|
||||
: 'has whole word'
|
||||
: isNegatedField
|
||||
? 'does not contain'
|
||||
: 'contains'
|
||||
} ${formattedTerm}`;
|
||||
} else {
|
||||
return `${this.translateField(field)} ${
|
||||
return `${this.translateField(field, context)} ${
|
||||
isNegatedField ? 'does not contain' : 'contains'
|
||||
} ${term}`;
|
||||
} ${formattedTerm}`;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -180,7 +207,10 @@ class EnglishSerializer implements Serializer {
|
|||
export abstract class SQLSerializer implements Serializer {
|
||||
private NOT_FOUND_QUERY = '(1 = 0)';
|
||||
|
||||
abstract getColumnForField(field: string): Promise<{
|
||||
abstract getColumnForField(
|
||||
field: string,
|
||||
context: SerializerContext,
|
||||
): Promise<{
|
||||
column?: string;
|
||||
columnJSON?: { string: string; number: string };
|
||||
propertyType?: JSDataType;
|
||||
|
|
@ -209,9 +239,14 @@ export abstract class SQLSerializer implements Serializer {
|
|||
}
|
||||
|
||||
// Only for exact string matches
|
||||
async eq(field: string, term: string, isNegatedField: boolean) {
|
||||
async eq(
|
||||
field: string,
|
||||
term: string,
|
||||
isNegatedField: boolean,
|
||||
context: SerializerContext,
|
||||
) {
|
||||
const { column, columnJSON, found, propertyType } =
|
||||
await this.getColumnForField(field);
|
||||
await this.getColumnForField(field, context);
|
||||
if (!found) {
|
||||
return this.NOT_FOUND_QUERY;
|
||||
}
|
||||
|
|
@ -238,9 +273,13 @@ export abstract class SQLSerializer implements Serializer {
|
|||
]);
|
||||
}
|
||||
|
||||
async isNotNull(field: string, isNegatedField: boolean) {
|
||||
async isNotNull(
|
||||
field: string,
|
||||
isNegatedField: boolean,
|
||||
context: SerializerContext,
|
||||
) {
|
||||
const { column, columnJSON, found, propertyType } =
|
||||
await this.getColumnForField(field);
|
||||
await this.getColumnForField(field, context);
|
||||
if (!found) {
|
||||
return this.NOT_FOUND_QUERY;
|
||||
}
|
||||
|
|
@ -250,9 +289,9 @@ export abstract class SQLSerializer implements Serializer {
|
|||
return `notEmpty(${column}) ${isNegatedField ? '!' : ''}= 1`;
|
||||
}
|
||||
|
||||
async gte(field: string, term: string) {
|
||||
async gte(field: string, term: string, context: SerializerContext) {
|
||||
const { column, columnJSON, found, propertyType } =
|
||||
await this.getColumnForField(field);
|
||||
await this.getColumnForField(field, context);
|
||||
if (!found) {
|
||||
return this.NOT_FOUND_QUERY;
|
||||
}
|
||||
|
|
@ -262,9 +301,9 @@ export abstract class SQLSerializer implements Serializer {
|
|||
return SqlString.format(`(${column} >= ?)`, [term]);
|
||||
}
|
||||
|
||||
async lte(field: string, term: string) {
|
||||
async lte(field: string, term: string, context: SerializerContext) {
|
||||
const { column, columnJSON, found, propertyType } =
|
||||
await this.getColumnForField(field);
|
||||
await this.getColumnForField(field, context);
|
||||
if (!found) {
|
||||
return this.NOT_FOUND_QUERY;
|
||||
}
|
||||
|
|
@ -274,9 +313,9 @@ export abstract class SQLSerializer implements Serializer {
|
|||
return SqlString.format(`(${column} <= ?)`, [term]);
|
||||
}
|
||||
|
||||
async lt(field: string, term: string) {
|
||||
async lt(field: string, term: string, context: SerializerContext) {
|
||||
const { column, columnJSON, found, propertyType } =
|
||||
await this.getColumnForField(field);
|
||||
await this.getColumnForField(field, context);
|
||||
if (!found) {
|
||||
return this.NOT_FOUND_QUERY;
|
||||
}
|
||||
|
|
@ -286,9 +325,9 @@ export abstract class SQLSerializer implements Serializer {
|
|||
return SqlString.format(`(${column} < ?)`, [term]);
|
||||
}
|
||||
|
||||
async gt(field: string, term: string) {
|
||||
async gt(field: string, term: string, context: SerializerContext) {
|
||||
const { column, columnJSON, found, propertyType } =
|
||||
await this.getColumnForField(field);
|
||||
await this.getColumnForField(field, context);
|
||||
if (!found) {
|
||||
return this.NOT_FOUND_QUERY;
|
||||
}
|
||||
|
|
@ -323,10 +362,11 @@ export abstract class SQLSerializer implements Serializer {
|
|||
isNegatedField: boolean,
|
||||
prefixWildcard: boolean,
|
||||
suffixWildcard: boolean,
|
||||
context: SerializerContext,
|
||||
) {
|
||||
const isImplicitField = field === IMPLICIT_FIELD;
|
||||
const { column, columnJSON, found, propertyType } =
|
||||
await this.getColumnForField(field);
|
||||
await this.getColumnForField(field, context);
|
||||
if (!found) {
|
||||
return this.NOT_FOUND_QUERY;
|
||||
}
|
||||
|
|
@ -358,8 +398,13 @@ export abstract class SQLSerializer implements Serializer {
|
|||
}
|
||||
|
||||
if (isImplicitField) {
|
||||
// For implicit fields that come directly from the Source, we assume there is a bloom filter that can be used to
|
||||
// optimize searches with hasToken. Overridden implicit columns (eg. "foo" in "foo:("bar baz")") are assumed
|
||||
// to not have bloom filters.
|
||||
const shouldUseTokenBf = !context.implicitColumnExpression;
|
||||
|
||||
// For the _source column, we'll try to do whole word searches by default
|
||||
// to utilize the token bloom filter unless a prefix/sufix wildcard is specified
|
||||
// to utilize the token bloom filter unless a prefix/suffix wildcard is specified
|
||||
if (prefixWildcard || suffixWildcard) {
|
||||
return SqlString.format(
|
||||
`(lower(?) ${isNegatedField ? 'NOT ' : ''}LIKE lower(?))`,
|
||||
|
|
@ -368,7 +413,7 @@ export abstract class SQLSerializer implements Serializer {
|
|||
`${prefixWildcard ? '%' : ''}${term}${suffixWildcard ? '%' : ''}`,
|
||||
],
|
||||
);
|
||||
} else {
|
||||
} else if (shouldUseTokenBf) {
|
||||
// TODO: Check case sensitivity of the index before lowering by default
|
||||
// We can't search multiple tokens with `hasToken`, so we need to split up the term into tokens
|
||||
const hasSeperators = this.termHasSeperators(term);
|
||||
|
|
@ -394,13 +439,12 @@ export abstract class SQLSerializer implements Serializer {
|
|||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const shoudUseTokenBf = isImplicitField;
|
||||
return SqlString.format(
|
||||
`(${column} ${isNegatedField ? 'NOT ' : ''}? ?)`,
|
||||
[SqlString.raw(shoudUseTokenBf ? 'LIKE' : 'ILIKE'), `%${term}%`],
|
||||
);
|
||||
}
|
||||
|
||||
return SqlString.format(`(${column} ${isNegatedField ? 'NOT ' : ''}? ?)`, [
|
||||
SqlString.raw('ILIKE'),
|
||||
`%${term}%`,
|
||||
]);
|
||||
}
|
||||
|
||||
async range(
|
||||
|
|
@ -408,8 +452,9 @@ export abstract class SQLSerializer implements Serializer {
|
|||
start: string,
|
||||
end: string,
|
||||
isNegatedField: boolean,
|
||||
context: SerializerContext,
|
||||
) {
|
||||
const { column, found } = await this.getColumnForField(field);
|
||||
const { column, found } = await this.getColumnForField(field, context);
|
||||
if (!found) {
|
||||
return this.NOT_FOUND_QUERY;
|
||||
}
|
||||
|
|
@ -543,30 +588,38 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
// throw new Error(`Column not found: ${field}`);
|
||||
}
|
||||
|
||||
async getColumnForField(field: string) {
|
||||
if (field === IMPLICIT_FIELD) {
|
||||
if (!this.implicitColumnExpression) {
|
||||
throw new Error(
|
||||
'Can not search bare text without an implicit column set.',
|
||||
);
|
||||
}
|
||||
|
||||
const expressions = splitAndTrimWithBracket(
|
||||
this.implicitColumnExpression,
|
||||
async getColumnForField(field: string, context: SerializerContext) {
|
||||
const implicitColumnExpression =
|
||||
context.implicitColumnExpression ?? this.implicitColumnExpression;
|
||||
if (field === IMPLICIT_FIELD && !implicitColumnExpression) {
|
||||
throw new Error(
|
||||
'Can not search bare text without an implicit column set.',
|
||||
);
|
||||
}
|
||||
|
||||
const fieldFinal =
|
||||
field === IMPLICIT_FIELD ? implicitColumnExpression! : field;
|
||||
|
||||
if (
|
||||
field === IMPLICIT_FIELD &&
|
||||
implicitColumnExpression === this.implicitColumnExpression // Source's implicit column has not been overridden
|
||||
) {
|
||||
// Sources can specify multi-column implicit columns, eg. Body and Message, in
|
||||
// which case we search the combined string `concatWithSeparator(';', Body, Message)`.
|
||||
const expressions = splitAndTrimWithBracket(fieldFinal);
|
||||
|
||||
return {
|
||||
column:
|
||||
expressions.length > 1
|
||||
? `concatWithSeparator(';',${expressions.join(',')})`
|
||||
: this.implicitColumnExpression,
|
||||
: fieldFinal,
|
||||
columnJSON: undefined,
|
||||
propertyType: JSDataType.String,
|
||||
found: true,
|
||||
};
|
||||
}
|
||||
|
||||
const expression = await this.buildColumnExpressionFromField(field);
|
||||
const expression = await this.buildColumnExpressionFromField(fieldFinal);
|
||||
|
||||
return {
|
||||
column: expression.columnExpression,
|
||||
|
|
@ -581,6 +634,7 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
|
|||
async function nodeTerm(
|
||||
node: lucene.Node,
|
||||
serializer: Serializer,
|
||||
context: SerializerContext,
|
||||
): Promise<string> {
|
||||
const field = node.field[0] === '-' ? node.field.slice(1) : node.field;
|
||||
let isNegatedField = node.field[0] === '-';
|
||||
|
|
@ -605,36 +659,36 @@ async function nodeTerm(
|
|||
// If the term is quoted, we should search for the exact term in a property (ex. foo:"bar")
|
||||
// Implicit field searches should still use substring matching (ex. "foo bar")
|
||||
if (nodeTerm.quoted && !isImplicitField) {
|
||||
return serializer.eq(field, term, isNegatedField);
|
||||
return serializer.eq(field, term, isNegatedField, context);
|
||||
}
|
||||
|
||||
if (!nodeTerm.quoted && term === '*') {
|
||||
return serializer.isNotNull(field, isNegatedField);
|
||||
return serializer.isNotNull(field, isNegatedField, context);
|
||||
}
|
||||
|
||||
if (!nodeTerm.quoted && term.substring(0, 2) === '>=') {
|
||||
if (isNegatedField) {
|
||||
return serializer.lt(field, term.slice(2));
|
||||
return serializer.lt(field, term.slice(2), context);
|
||||
}
|
||||
return serializer.gte(field, term.slice(2));
|
||||
return serializer.gte(field, term.slice(2), context);
|
||||
}
|
||||
if (!nodeTerm.quoted && term.substring(0, 2) === '<=') {
|
||||
if (isNegatedField) {
|
||||
return serializer.gt(field, term.slice(2));
|
||||
return serializer.gt(field, term.slice(2), context);
|
||||
}
|
||||
return serializer.lte(field, term.slice(2));
|
||||
return serializer.lte(field, term.slice(2), context);
|
||||
}
|
||||
if (!nodeTerm.quoted && term[0] === '>') {
|
||||
if (isNegatedField) {
|
||||
return serializer.lte(field, term.slice(1));
|
||||
return serializer.lte(field, term.slice(1), context);
|
||||
}
|
||||
return serializer.gt(field, term.slice(1));
|
||||
return serializer.gt(field, term.slice(1), context);
|
||||
}
|
||||
if (!nodeTerm.quoted && term[0] === '<') {
|
||||
if (isNegatedField) {
|
||||
return serializer.gte(field, term.slice(1));
|
||||
return serializer.gte(field, term.slice(1), context);
|
||||
}
|
||||
return serializer.lt(field, term.slice(1));
|
||||
return serializer.lt(field, term.slice(1), context);
|
||||
}
|
||||
|
||||
let prefixWildcard = false;
|
||||
|
|
@ -654,6 +708,7 @@ async function nodeTerm(
|
|||
isNegatedField,
|
||||
prefixWildcard,
|
||||
suffixWildcard,
|
||||
context,
|
||||
);
|
||||
|
||||
// TODO: Handle regex, similarity, boost, prefix
|
||||
|
|
@ -666,24 +721,50 @@ async function nodeTerm(
|
|||
rangedTerm.term_min,
|
||||
rangedTerm.term_max,
|
||||
isNegatedField,
|
||||
context,
|
||||
);
|
||||
}
|
||||
|
||||
throw new Error(`Unexpected Node type. ${node}`);
|
||||
}
|
||||
|
||||
function createSerializerContext(
|
||||
currentContext: SerializerContext,
|
||||
ast: lucene.BinaryAST | lucene.LeftOnlyAST,
|
||||
) {
|
||||
// For syntax like `foo:(bar baz)` or `foo:("bar baz")`, the implicit field for the inner expression must be `foo`
|
||||
if (ast.field && ast.parenthesized && ast.field !== IMPLICIT_FIELD) {
|
||||
const fieldWithoutNegation = ast.field?.startsWith('-')
|
||||
? ast.field.slice(1)
|
||||
: ast.field;
|
||||
|
||||
return {
|
||||
...currentContext,
|
||||
implicitColumnExpression: fieldWithoutNegation,
|
||||
};
|
||||
} else {
|
||||
return currentContext;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if the AST is of the form `-[field]:([terms...])` */
|
||||
function isNegatedAndParenthesized(ast: lucene.BinaryAST | lucene.LeftOnlyAST) {
|
||||
return ast.parenthesized && ast.field?.startsWith('-');
|
||||
}
|
||||
|
||||
async function serialize(
|
||||
ast: lucene.AST | lucene.Node,
|
||||
serializer: Serializer,
|
||||
context: SerializerContext,
|
||||
): Promise<string> {
|
||||
// Node Scenarios:
|
||||
// 1. NodeTerm: Single term ex. "foo:bar"
|
||||
// 2. NodeRangedTerm: Two terms ex. "foo:[bar TO qux]"
|
||||
if ((ast as lucene.NodeTerm).term != null) {
|
||||
return await nodeTerm(ast as lucene.NodeTerm, serializer);
|
||||
return await nodeTerm(ast as lucene.NodeTerm, serializer, context);
|
||||
}
|
||||
if ((ast as lucene.NodeRangedTerm).inclusive != null) {
|
||||
return await nodeTerm(ast as lucene.NodeTerm, serializer);
|
||||
return await nodeTerm(ast as lucene.NodeTerm, serializer, context);
|
||||
}
|
||||
|
||||
// AST Scenarios:
|
||||
|
|
@ -691,25 +772,34 @@ async function serialize(
|
|||
// 2. LeftOnlyAST: Single term ex. "foo:bar"
|
||||
if ((ast as lucene.BinaryAST).right != null) {
|
||||
const binaryAST = ast as lucene.BinaryAST;
|
||||
const operator = serializer.operator(binaryAST.operator);
|
||||
const operator = serializer.operator(binaryAST.operator, context);
|
||||
const parenthesized = binaryAST.parenthesized;
|
||||
return `${parenthesized ? '(' : ''}${await serialize(
|
||||
|
||||
const newContext = createSerializerContext(context, binaryAST);
|
||||
const serialized = `${isNegatedAndParenthesized(binaryAST) ? 'NOT ' : ''}${parenthesized ? '(' : ''}${await serialize(
|
||||
binaryAST.left,
|
||||
serializer,
|
||||
)} ${operator} ${await serialize(binaryAST.right, serializer)}${
|
||||
newContext,
|
||||
)} ${operator} ${await serialize(binaryAST.right, serializer, newContext)}${
|
||||
parenthesized ? ')' : ''
|
||||
}`;
|
||||
return serialized;
|
||||
}
|
||||
|
||||
if ((ast as lucene.LeftOnlyAST).left != null) {
|
||||
const leftOnlyAST = ast as lucene.LeftOnlyAST;
|
||||
const parenthesized = leftOnlyAST.parenthesized;
|
||||
|
||||
const newContext = createSerializerContext(context, leftOnlyAST);
|
||||
|
||||
// start is used when ex. "NOT foo:bar"
|
||||
return `${parenthesized ? '(' : ''}${
|
||||
const serialized = `${isNegatedAndParenthesized(leftOnlyAST) ? 'NOT ' : ''}${parenthesized ? '(' : ''}${
|
||||
leftOnlyAST.start != undefined ? `${leftOnlyAST.start} ` : ''
|
||||
}${await serialize(leftOnlyAST.left, serializer)}${
|
||||
}${await serialize(leftOnlyAST.left, serializer, newContext)}${
|
||||
parenthesized ? ')' : ''
|
||||
}`;
|
||||
|
||||
return serialized;
|
||||
}
|
||||
|
||||
// Blank AST, means no text was parsed
|
||||
|
|
@ -721,7 +811,7 @@ export async function genWhereSQL(
|
|||
ast: lucene.AST,
|
||||
serializer: Serializer,
|
||||
): Promise<string> {
|
||||
return await serialize(ast, serializer);
|
||||
return await serialize(ast, serializer, {});
|
||||
}
|
||||
|
||||
export class SearchQueryBuilder {
|
||||
|
|
@ -793,7 +883,7 @@ export async function genEnglishExplanation(query: string): Promise<string> {
|
|||
|
||||
if (parsedQ) {
|
||||
const serializer = new EnglishSerializer();
|
||||
return await serialize(parsedQ, serializer);
|
||||
return await serialize(parsedQ, serializer, {});
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('Parse failure', query, e);
|
||||
|
|
|
|||
Loading…
Reference in a new issue