mirror of
https://github.com/hyperdxio/hyperdx
synced 2026-04-21 13:37:15 +00:00
Implement CHART source alert (scheduled task) (#108)
This commit is contained in:
parent
42969f243e
commit
8443a080f9
12 changed files with 699 additions and 119 deletions
6
.changeset/small-buttons-deny.md
Normal file
6
.changeset/small-buttons-deny.md
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
---
|
||||
'@hyperdx/api': minor
|
||||
'@hyperdx/app': minor
|
||||
---
|
||||
|
||||
feat: implement CHART source alert (scheduled task)
|
||||
|
|
@ -44,14 +44,33 @@ HYPERDX_API_KEY=<YOUR_INGESTION_API_KEY_HERE> docker compose -f ./docker-compose
|
|||
The core services are all hot-reloaded, so you can make changes to the code and
|
||||
see them reflected in real-time.
|
||||
|
||||
## Testing
|
||||
|
||||
To run the tests locally, you can run the following command:
|
||||
|
||||
```bash
|
||||
make dev-int
|
||||
```
|
||||
|
||||
If you want to run a specific test file, you can run the following command:
|
||||
|
||||
```bash
|
||||
make dev-int FILE=checkAlerts
|
||||
```
|
||||
|
||||
### Windows
|
||||
|
||||
If you are running WSL 2, Hot module reload on Nextjs (Frontend) does not work out of the box on windows when run natively on docker. The fix here is to open project directory in WSL and run the above docker compose commands directly in WSL. Note that the project directory should not be under /mnt/c/ directory. You can clone the git repo in /home/{username} for example.
|
||||
If you are running WSL 2, Hot module reload on Nextjs (Frontend) does not work
|
||||
out of the box on windows when run natively on docker. The fix here is to open
|
||||
project directory in WSL and run the above docker compose commands directly in
|
||||
WSL. Note that the project directory should not be under /mnt/c/ directory. You
|
||||
can clone the git repo in /home/{username} for example.
|
||||
|
||||
To develop from WSL, follow instructions [here](https://code.visualstudio.com/docs/remote/wsl).
|
||||
To develop from WSL, follow instructions
|
||||
[here](https://code.visualstudio.com/docs/remote/wsl).
|
||||
|
||||
## Additional support
|
||||
|
||||
If you need help getting started,
|
||||
[join our Discord](https://discord.gg/FErRRKU78j) and we're more than happy to
|
||||
get you set up!
|
||||
get you set up!
|
||||
|
|
|
|||
2
Makefile
2
Makefile
|
|
@ -33,7 +33,7 @@ ci-lint:
|
|||
|
||||
.PHONY: dev-int
|
||||
dev-int:
|
||||
docker compose -p int -f ./docker-compose.ci.yml run --rm api dev:int
|
||||
docker compose -p int -f ./docker-compose.ci.yml run --rm api dev:int $(FILE)
|
||||
|
||||
.PHONY: ci-int
|
||||
ci-int:
|
||||
|
|
|
|||
|
|
@ -1,21 +1,12 @@
|
|||
version: '3'
|
||||
services:
|
||||
ch_keeper:
|
||||
container_name: hdx-ci-ch-keeper
|
||||
image: zookeeper:3.7
|
||||
volumes:
|
||||
- ./docker/clickhouse/local/zoo.cfg:/conf/zoo.cfg
|
||||
restart: on-failure
|
||||
networks:
|
||||
- internal
|
||||
ch_server:
|
||||
container_name: hdx-ci-ch-server
|
||||
image: clickhouse/clickhouse-server:23.5.2-alpine
|
||||
image: clickhouse/clickhouse-server:23.7.1-alpine
|
||||
environment:
|
||||
# default settings
|
||||
CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
|
||||
volumes:
|
||||
- ./docker/clickhouse/local/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d
|
||||
- ./docker/clickhouse/local/config.xml:/etc/clickhouse-server/config.xml
|
||||
- ./docker/clickhouse/local/users.xml:/etc/clickhouse-server/users.xml
|
||||
restart: on-failure
|
||||
|
|
@ -24,8 +15,6 @@ services:
|
|||
- 9000:9000 # native
|
||||
networks:
|
||||
- internal
|
||||
depends_on:
|
||||
- ch_keeper
|
||||
db:
|
||||
container_name: hdx-ci-db
|
||||
image: mongo:5.0.14-focal
|
||||
|
|
|
|||
|
|
@ -980,7 +980,15 @@ export const getLogsChart = async ({
|
|||
),
|
||||
},
|
||||
});
|
||||
const result = await rows.json<ResponseJSON<Record<string, unknown>>>();
|
||||
const result = await rows.json<
|
||||
ResponseJSON<{
|
||||
data: string;
|
||||
ts_bucket: number;
|
||||
group: string;
|
||||
rank: string;
|
||||
rank_order_by_value: string;
|
||||
}>
|
||||
>();
|
||||
logger.info({
|
||||
message: 'getChart',
|
||||
query,
|
||||
|
|
@ -1392,8 +1400,8 @@ export const checkAlert = async ({
|
|||
`
|
||||
SELECT
|
||||
?
|
||||
count(*) as count,
|
||||
toStartOfInterval(timestamp, INTERVAL ?) as ts_bucket
|
||||
count(*) as data,
|
||||
toUnixTimestamp(toStartOfInterval(timestamp, INTERVAL ?)) as ts_bucket
|
||||
FROM ??
|
||||
WHERE ? AND (?)
|
||||
GROUP BY ?
|
||||
|
|
@ -1438,7 +1446,7 @@ export const checkAlert = async ({
|
|||
},
|
||||
});
|
||||
const result = await rows.json<
|
||||
ResponseJSON<{ count: string; group?: string; ts_bucket: string }>
|
||||
ResponseJSON<{ data: string; group?: string; ts_bucket: number }>
|
||||
>();
|
||||
logger.info({
|
||||
message: 'checkAlert',
|
||||
|
|
|
|||
|
|
@ -50,6 +50,8 @@ export interface IAlert {
|
|||
chartId?: string;
|
||||
}
|
||||
|
||||
export type AlertDocument = mongoose.HydratedDocument<IAlert>;
|
||||
|
||||
const AlertSchema = new Schema<IAlert>(
|
||||
{
|
||||
type: {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
import mongoose, { Schema } from 'mongoose';
|
||||
|
||||
import { AggFn } from '../clickhouse';
|
||||
|
||||
import type { ObjectId } from '.';
|
||||
|
||||
type Chart = {
|
||||
|
|
@ -9,14 +11,47 @@ type Chart = {
|
|||
y: number;
|
||||
w: number;
|
||||
h: number;
|
||||
series: {
|
||||
table: string;
|
||||
type: 'time' | 'histogram' | 'search' | 'number' | 'table' | 'markdown';
|
||||
aggFn: string;
|
||||
field?: string;
|
||||
where?: string;
|
||||
groupBy?: string[];
|
||||
}[];
|
||||
series: (
|
||||
| {
|
||||
table: string;
|
||||
type: 'time';
|
||||
aggFn: AggFn; // TODO: Type
|
||||
field: string | undefined;
|
||||
where: string;
|
||||
groupBy: string[];
|
||||
}
|
||||
| {
|
||||
table: string;
|
||||
type: 'histogram';
|
||||
field: string | undefined;
|
||||
where: string;
|
||||
}
|
||||
| {
|
||||
type: 'search';
|
||||
fields: string[];
|
||||
where: string;
|
||||
}
|
||||
| {
|
||||
type: 'number';
|
||||
table: string;
|
||||
aggFn: AggFn;
|
||||
field: string | undefined;
|
||||
where: string;
|
||||
}
|
||||
| {
|
||||
type: 'table';
|
||||
table: string;
|
||||
aggFn: AggFn;
|
||||
field: string | undefined;
|
||||
where: string;
|
||||
groupBy: string[];
|
||||
sortOrder: 'desc' | 'asc';
|
||||
}
|
||||
| {
|
||||
type: 'markdown';
|
||||
content: string;
|
||||
}
|
||||
)[];
|
||||
};
|
||||
|
||||
export interface IDashboard {
|
||||
|
|
|
|||
|
|
@ -1,4 +1,18 @@
|
|||
import { buildLogSearchLink, roundDownToXMinutes } from '../checkAlerts';
|
||||
import * as clickhouse from '../../clickhouse';
|
||||
import * as slack from '../../utils/slack';
|
||||
import AlertHistory from '../../models/alertHistory';
|
||||
import Dashboard from '../../models/dashboard';
|
||||
import LogView from '../../models/logView';
|
||||
import Webhook from '../../models/webhook';
|
||||
import {
|
||||
buildLogSearchLink,
|
||||
doesExceedThreshold,
|
||||
processAlert,
|
||||
roundDownToXMinutes,
|
||||
} from '../checkAlerts';
|
||||
import { clearDBCollections, closeDB, getServer } from '../../fixtures';
|
||||
import { createAlert } from '../../controllers/alerts';
|
||||
import { createTeam } from '../../controllers/team';
|
||||
|
||||
describe('checkAlerts', () => {
|
||||
it('roundDownToXMinutes', () => {
|
||||
|
|
@ -49,4 +63,330 @@ describe('checkAlerts', () => {
|
|||
'http://localhost:9090/search/123?from=1679091183103&to=1679091239103&q=%F0%9F%90%B1+foo%3A%22bar%22',
|
||||
);
|
||||
});
|
||||
|
||||
it('doesExceedThreshold', () => {
|
||||
expect(
|
||||
doesExceedThreshold(
|
||||
{
|
||||
type: 'presence',
|
||||
threshold: 10,
|
||||
} as any,
|
||||
11,
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
doesExceedThreshold(
|
||||
{
|
||||
type: 'presence',
|
||||
threshold: 10,
|
||||
} as any,
|
||||
10,
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
doesExceedThreshold(
|
||||
{
|
||||
type: 'absence',
|
||||
threshold: 10,
|
||||
} as any,
|
||||
9,
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
doesExceedThreshold(
|
||||
{
|
||||
type: 'absence',
|
||||
threshold: 10,
|
||||
} as any,
|
||||
10,
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
describe('processAlert', () => {
|
||||
const server = getServer();
|
||||
|
||||
beforeAll(async () => {
|
||||
await server.start();
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
jest.resetAllMocks();
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await clearDBCollections();
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await server.closeHttpServer();
|
||||
await closeDB();
|
||||
});
|
||||
|
||||
it('LOG alert', async () => {
|
||||
jest
|
||||
.spyOn(slack, 'postMessageToWebhook')
|
||||
.mockResolvedValueOnce(null as any);
|
||||
jest
|
||||
.spyOn(clickhouse, 'checkAlert')
|
||||
.mockResolvedValueOnce({
|
||||
rows: 1,
|
||||
data: [
|
||||
{
|
||||
data: '11',
|
||||
group: 'HyperDX',
|
||||
ts_bucket: 1700172600,
|
||||
},
|
||||
],
|
||||
} as any)
|
||||
// no logs found in the next window
|
||||
.mockResolvedValueOnce({
|
||||
rows: 0,
|
||||
data: [],
|
||||
} as any);
|
||||
jest.spyOn(clickhouse, 'getLogBatch').mockResolvedValueOnce({
|
||||
rows: 1,
|
||||
data: [
|
||||
{
|
||||
timestamp: '2023-11-16T22:10:00.000Z',
|
||||
severity_text: 'error',
|
||||
body: 'Oh no! Something went wrong!',
|
||||
},
|
||||
],
|
||||
} as any);
|
||||
|
||||
const team = await createTeam({ name: 'My Team' });
|
||||
const logView = await new LogView({
|
||||
name: 'My Log View',
|
||||
query: `level:error`,
|
||||
team: team._id,
|
||||
}).save();
|
||||
const webhook = await new Webhook({
|
||||
team: team._id,
|
||||
service: 'slack',
|
||||
url: 'https://hooks.slack.com/services/123',
|
||||
name: 'My Webhook',
|
||||
}).save();
|
||||
const alert = await createAlert({
|
||||
source: 'LOG',
|
||||
channel: {
|
||||
type: 'webhook',
|
||||
webhookId: webhook._id.toString(),
|
||||
},
|
||||
interval: '5m',
|
||||
type: 'presence',
|
||||
threshold: 10,
|
||||
groupBy: 'span_name',
|
||||
logViewId: logView._id.toString(),
|
||||
});
|
||||
|
||||
const now = new Date('2023-11-16T22:12:00.000Z');
|
||||
|
||||
// shoud fetch 5m of logs
|
||||
await processAlert(now, alert);
|
||||
// check alert history
|
||||
const alertHistories = await AlertHistory.find({
|
||||
alertId: alert._id,
|
||||
});
|
||||
expect(alertHistories.length).toBe(1);
|
||||
expect(alertHistories[0].counts).toBe(1);
|
||||
expect(alertHistories[0].createdAt).toEqual(
|
||||
new Date('2023-11-16T22:10:00.000Z'),
|
||||
);
|
||||
expect(alert.state).toBe('ALERT');
|
||||
|
||||
// skip since time diff is less than 1 window size
|
||||
const later = new Date('2023-11-16T22:14:00.000Z');
|
||||
await processAlert(later, alert);
|
||||
// alert should still be in alert state
|
||||
expect(alert.state).toBe('ALERT');
|
||||
|
||||
const nextWindow = new Date('2023-11-16T22:16:00.000Z');
|
||||
await processAlert(nextWindow, alert);
|
||||
// alert should be in ok state
|
||||
expect(alert.state).toBe('OK');
|
||||
|
||||
// check if checkAlert query + webhook were triggered
|
||||
expect(clickhouse.checkAlert).toHaveBeenNthCalledWith(1, {
|
||||
endTime: new Date('2023-11-16T22:10:00.000Z'),
|
||||
groupBy: alert.groupBy,
|
||||
q: logView.query,
|
||||
startTime: new Date('2023-11-16T22:05:00.000Z'),
|
||||
tableVersion: team.logStreamTableVersion,
|
||||
teamId: logView.team._id.toString(),
|
||||
windowSizeInMins: 5,
|
||||
});
|
||||
expect(slack.postMessageToWebhook).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
'https://hooks.slack.com/services/123',
|
||||
{
|
||||
text: 'Alert for My Log View - 11 lines found',
|
||||
blocks: [
|
||||
{
|
||||
text: {
|
||||
text: [
|
||||
`*<http://localhost:9090/search/${logView._id}?from=1700172600000&to=1700172900000&q=level%3Aerror+span_name%3A%22HyperDX%22 | Alert for My Log View>*`,
|
||||
'Group: "HyperDX"',
|
||||
'11 lines found, expected less than 10 lines',
|
||||
'```',
|
||||
'Nov 16 22:10:00Z [error] Oh no! Something went wrong!',
|
||||
'```',
|
||||
].join('\n'),
|
||||
type: 'mrkdwn',
|
||||
},
|
||||
type: 'section',
|
||||
},
|
||||
],
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it('CHART alert', async () => {
|
||||
jest
|
||||
.spyOn(slack, 'postMessageToWebhook')
|
||||
.mockResolvedValueOnce(null as any);
|
||||
jest
|
||||
.spyOn(clickhouse, 'getLogsChart')
|
||||
.mockResolvedValueOnce({
|
||||
rows: 1,
|
||||
data: [
|
||||
{
|
||||
data: '11',
|
||||
group: 'HyperDX',
|
||||
rank: '1',
|
||||
rank_order_by_value: '11',
|
||||
ts_bucket: 1700172600,
|
||||
},
|
||||
],
|
||||
} as any)
|
||||
// no logs found in the next window
|
||||
.mockResolvedValueOnce({
|
||||
rows: 0,
|
||||
data: [],
|
||||
} as any);
|
||||
|
||||
const team = await createTeam({ name: 'My Team' });
|
||||
const webhook = await new Webhook({
|
||||
team: team._id,
|
||||
service: 'slack',
|
||||
url: 'https://hooks.slack.com/services/123',
|
||||
name: 'My Webhook',
|
||||
}).save();
|
||||
const dashboard = await new Dashboard({
|
||||
name: 'My Dashboard',
|
||||
team: team._id,
|
||||
charts: [
|
||||
{
|
||||
id: '198hki',
|
||||
name: 'Max Duration',
|
||||
x: 0,
|
||||
y: 0,
|
||||
w: 6,
|
||||
h: 3,
|
||||
series: [
|
||||
{
|
||||
table: 'logs',
|
||||
type: 'time',
|
||||
aggFn: 'max',
|
||||
field: 'duration',
|
||||
where: 'level:error',
|
||||
groupBy: ['span_name'],
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
id: 'obil1',
|
||||
name: 'Min Duratioin',
|
||||
x: 6,
|
||||
y: 0,
|
||||
w: 6,
|
||||
h: 3,
|
||||
series: [
|
||||
{
|
||||
table: 'logs',
|
||||
type: 'time',
|
||||
aggFn: 'min',
|
||||
field: 'duration',
|
||||
where: '',
|
||||
groupBy: [],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
}).save();
|
||||
const alert = await createAlert({
|
||||
source: 'CHART',
|
||||
channel: {
|
||||
type: 'webhook',
|
||||
webhookId: webhook._id.toString(),
|
||||
},
|
||||
interval: '5m',
|
||||
type: 'presence',
|
||||
threshold: 10,
|
||||
dashboardId: dashboard._id.toString(),
|
||||
chartId: '198hki',
|
||||
});
|
||||
|
||||
const now = new Date('2023-11-16T22:12:00.000Z');
|
||||
|
||||
// shoud fetch 5m of logs
|
||||
await processAlert(now, alert);
|
||||
// check alert history
|
||||
const alertHistories = await AlertHistory.find({
|
||||
alertId: alert._id,
|
||||
});
|
||||
expect(alertHistories.length).toBe(1);
|
||||
expect(alertHistories[0].counts).toBe(1);
|
||||
expect(alertHistories[0].createdAt).toEqual(
|
||||
new Date('2023-11-16T22:10:00.000Z'),
|
||||
);
|
||||
expect(alert.state).toBe('ALERT');
|
||||
|
||||
// skip since time diff is less than 1 window size
|
||||
const later = new Date('2023-11-16T22:14:00.000Z');
|
||||
await processAlert(later, alert);
|
||||
// alert should still be in alert state
|
||||
expect(alert.state).toBe('ALERT');
|
||||
|
||||
const nextWindow = new Date('2023-11-16T22:16:00.000Z');
|
||||
await processAlert(nextWindow, alert);
|
||||
// alert should be in ok state
|
||||
expect(alert.state).toBe('OK');
|
||||
|
||||
// check if getLogsChart query + webhook were triggered
|
||||
expect(clickhouse.getLogsChart).toHaveBeenNthCalledWith(1, {
|
||||
aggFn: 'max',
|
||||
endTime: 1700172600000,
|
||||
field: 'duration',
|
||||
granularity: '5 minute',
|
||||
groupBy: 'span_name',
|
||||
maxNumGroups: 20,
|
||||
propertyTypeMappingsModel: expect.any(Object),
|
||||
q: 'level:error',
|
||||
startTime: 1700172300000,
|
||||
tableVersion: team.logStreamTableVersion,
|
||||
teamId: team._id.toString(),
|
||||
});
|
||||
expect(slack.postMessageToWebhook).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
'https://hooks.slack.com/services/123',
|
||||
{
|
||||
text: 'Alert for "Max Duration" in "My Dashboard" - 11 exceeds 10',
|
||||
blocks: [
|
||||
{
|
||||
text: {
|
||||
text: [
|
||||
`*<http://localhost:9090/dashboards/${dashboard._id}?from=1700170500000&granularity=5+minute&to=1700175000000 | Alert for "Max Duration" in "My Dashboard">*`,
|
||||
'Group: "HyperDX"',
|
||||
'11 exceeds 10',
|
||||
].join('\n'),
|
||||
type: 'mrkdwn',
|
||||
},
|
||||
type: 'section',
|
||||
},
|
||||
],
|
||||
},
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -11,8 +11,9 @@ import { serializeError } from 'serialize-error';
|
|||
import * as clickhouse from '@/clickhouse';
|
||||
import * as config from '@/config';
|
||||
import * as slack from '@/utils/slack';
|
||||
import Alert, { AlertState, IAlert, AlertSource } from '@/models/alert';
|
||||
import Alert, { AlertState, AlertDocument } from '@/models/alert';
|
||||
import AlertHistory, { IAlertHistory } from '@/models/alertHistory';
|
||||
import Dashboard, { IDashboard } from '@/models/dashboard';
|
||||
import LogView from '@/models/logView';
|
||||
import Webhook from '@/models/webhook';
|
||||
import logger from '@/utils/logger';
|
||||
|
|
@ -20,8 +21,7 @@ import { ITeam } from '@/models/team';
|
|||
import { ObjectId } from '@/models';
|
||||
import { truncateString } from '@/utils/common';
|
||||
|
||||
import type { ResponseJSON } from '@clickhouse/client';
|
||||
import type { LogSearchRow } from '@/clickhouse';
|
||||
type EnhancedDashboard = Omit<IDashboard, 'team'> & { team: ITeam };
|
||||
|
||||
const MAX_MESSAGE_LENGTH = 500;
|
||||
|
||||
|
|
@ -58,25 +58,109 @@ export const buildLogSearchLink = ({
|
|||
return url.toString();
|
||||
};
|
||||
|
||||
const buildEventSlackMessage = ({
|
||||
// TODO: should link to the chart instead
|
||||
export const buildChartLink = ({
|
||||
dashboardId,
|
||||
endTime,
|
||||
granularity,
|
||||
startTime,
|
||||
}: {
|
||||
dashboardId: string;
|
||||
endTime: Date;
|
||||
granularity: string;
|
||||
startTime: Date;
|
||||
}) => {
|
||||
const url = new URL(`${config.FRONTEND_URL}/dashboards/${dashboardId}`);
|
||||
// extend both start and end time by 7x granularity
|
||||
const from = (startTime.getTime() - ms(granularity) * 7).toString();
|
||||
const to = (endTime.getTime() + ms(granularity) * 7).toString();
|
||||
const queryParams = new URLSearchParams({
|
||||
from,
|
||||
granularity,
|
||||
to,
|
||||
});
|
||||
url.search = queryParams.toString();
|
||||
return url.toString();
|
||||
};
|
||||
|
||||
const buildChartEventSlackMessage = ({
|
||||
alert,
|
||||
dashboard,
|
||||
endTime,
|
||||
granularity,
|
||||
group,
|
||||
startTime,
|
||||
totalCount,
|
||||
}: {
|
||||
alert: AlertDocument;
|
||||
endTime: Date;
|
||||
dashboard: EnhancedDashboard;
|
||||
granularity: string;
|
||||
group?: string;
|
||||
startTime: Date;
|
||||
totalCount: number;
|
||||
}) => {
|
||||
// should be only 1 chart
|
||||
const chart = dashboard.charts[0];
|
||||
const mrkdwn = [
|
||||
`*<${buildChartLink({
|
||||
dashboardId: dashboard._id.toString(),
|
||||
endTime,
|
||||
granularity,
|
||||
startTime,
|
||||
})} | Alert for "${chart.name}" in "${dashboard.name}">*`,
|
||||
...(group != null ? [`Group: "${group}"`] : []),
|
||||
`${totalCount} ${
|
||||
doesExceedThreshold(alert, totalCount) ? 'exceeds' : 'falls below'
|
||||
} ${alert.threshold}`,
|
||||
].join('\n');
|
||||
|
||||
return {
|
||||
text: `Alert for "${chart.name}" in "${dashboard.name}" - ${totalCount} ${
|
||||
doesExceedThreshold(alert, totalCount) ? 'exceeds' : 'falls below'
|
||||
} ${alert.threshold}`,
|
||||
blocks: [
|
||||
{
|
||||
type: 'section',
|
||||
text: {
|
||||
type: 'mrkdwn',
|
||||
text: mrkdwn,
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
};
|
||||
|
||||
const buildLogEventSlackMessage = async ({
|
||||
alert,
|
||||
endTime,
|
||||
group,
|
||||
logView,
|
||||
results,
|
||||
searchQuery,
|
||||
startTime,
|
||||
totalCount,
|
||||
}: {
|
||||
alert: IAlert;
|
||||
alert: AlertDocument;
|
||||
endTime: Date;
|
||||
group?: string;
|
||||
logView: Awaited<ReturnType<typeof getLogViewEnhanced>>;
|
||||
results: ResponseJSON<LogSearchRow> | undefined;
|
||||
searchQuery?: string;
|
||||
startTime: Date;
|
||||
totalCount: number;
|
||||
}) => {
|
||||
const searchQuery = alert.groupBy
|
||||
? `${logView.query} ${alert.groupBy}:"${group}"`
|
||||
: logView.query;
|
||||
// TODO: show group + total count for group-by alerts
|
||||
const results = await clickhouse.getLogBatch({
|
||||
endTime: endTime.getTime(),
|
||||
limit: 5,
|
||||
offset: 0,
|
||||
order: 'desc', // TODO: better to use null
|
||||
q: searchQuery,
|
||||
startTime: startTime.getTime(),
|
||||
tableVersion: logView.team.logStreamTableVersion,
|
||||
teamId: logView.team._id.toString(),
|
||||
});
|
||||
|
||||
const mrkdwn = [
|
||||
`*<${buildLogSearchLink({
|
||||
endTime,
|
||||
|
|
@ -127,34 +211,23 @@ const buildEventSlackMessage = ({
|
|||
|
||||
const fireChannelEvent = async ({
|
||||
alert,
|
||||
logView,
|
||||
totalCount,
|
||||
group,
|
||||
startTime,
|
||||
dashboard,
|
||||
endTime,
|
||||
group,
|
||||
logView,
|
||||
startTime,
|
||||
totalCount,
|
||||
windowSizeInMins,
|
||||
}: {
|
||||
alert: IAlert;
|
||||
logView: Awaited<ReturnType<typeof getLogViewEnhanced>>;
|
||||
totalCount: number;
|
||||
alert: AlertDocument;
|
||||
logView: Awaited<ReturnType<typeof getLogViewEnhanced>> | null;
|
||||
dashboard: EnhancedDashboard | null;
|
||||
endTime: Date;
|
||||
group?: string;
|
||||
startTime: Date;
|
||||
endTime: Date;
|
||||
totalCount: number;
|
||||
windowSizeInMins: number;
|
||||
}) => {
|
||||
const searchQuery = alert.groupBy
|
||||
? `${logView.query} ${alert.groupBy}:"${group}"`
|
||||
: logView.query;
|
||||
// TODO: show group + total count for group-by alerts
|
||||
const results = await clickhouse.getLogBatch({
|
||||
endTime: endTime.getTime(),
|
||||
limit: 5,
|
||||
offset: 0,
|
||||
order: 'desc', // TODO: better to use null
|
||||
q: searchQuery,
|
||||
startTime: startTime.getTime(),
|
||||
tableVersion: logView.team.logStreamTableVersion,
|
||||
teamId: logView.team._id.toString(),
|
||||
});
|
||||
|
||||
switch (alert.channel.type) {
|
||||
case 'webhook': {
|
||||
const webhook = await Webhook.findOne({
|
||||
|
|
@ -162,19 +235,48 @@ const fireChannelEvent = async ({
|
|||
});
|
||||
// ONLY SUPPORTS SLACK WEBHOOKS FOR NOW
|
||||
if (webhook?.service === 'slack') {
|
||||
await slack.postMessageToWebhook(
|
||||
webhook.url,
|
||||
buildEventSlackMessage({
|
||||
let message: {
|
||||
text: string;
|
||||
blocks?: {
|
||||
type: string;
|
||||
text: {
|
||||
type: string;
|
||||
text: string;
|
||||
};
|
||||
}[];
|
||||
} | null = null;
|
||||
|
||||
if (alert.source === 'LOG' && logView) {
|
||||
message = await buildLogEventSlackMessage({
|
||||
alert,
|
||||
endTime,
|
||||
group,
|
||||
logView,
|
||||
results,
|
||||
searchQuery,
|
||||
startTime,
|
||||
totalCount,
|
||||
}),
|
||||
);
|
||||
});
|
||||
} else if (alert.source === 'CHART' && dashboard) {
|
||||
message = buildChartEventSlackMessage({
|
||||
alert,
|
||||
dashboard,
|
||||
endTime,
|
||||
granularity: `${windowSizeInMins} minute`,
|
||||
group,
|
||||
startTime,
|
||||
totalCount,
|
||||
});
|
||||
}
|
||||
|
||||
if (message !== null) {
|
||||
await slack.postMessageToWebhook(webhook.url, message);
|
||||
} else {
|
||||
logger.error({
|
||||
alert,
|
||||
dashboard,
|
||||
logView,
|
||||
message: 'Unsupported alert source',
|
||||
});
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
@ -185,7 +287,10 @@ const fireChannelEvent = async ({
|
|||
}
|
||||
};
|
||||
|
||||
const doesExceedThreshold = (alert: IAlert, totalCount: number) => {
|
||||
export const doesExceedThreshold = (
|
||||
alert: AlertDocument,
|
||||
totalCount: number,
|
||||
) => {
|
||||
if (alert.type === 'presence' && totalCount >= alert.threshold) {
|
||||
return true;
|
||||
} else if (alert.type === 'absence' && totalCount < alert.threshold) {
|
||||
|
|
@ -198,18 +303,8 @@ export const roundDownTo = (roundTo: number) => (x: Date) =>
|
|||
new Date(Math.floor(x.getTime() / roundTo) * roundTo);
|
||||
export const roundDownToXMinutes = (x: number) => roundDownTo(1000 * 60 * x);
|
||||
|
||||
const processAlert = async (now: Date, alert: IAlert) => {
|
||||
export const processAlert = async (now: Date, alert: AlertDocument) => {
|
||||
try {
|
||||
if (alert.source === 'CHART' || !alert.logView) {
|
||||
logger.info({
|
||||
message: `[Not implemented] Skipping Chart alert processing`,
|
||||
alert,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const logView = await getLogViewEnhanced(alert.logView);
|
||||
|
||||
const previous: IAlertHistory | undefined = (
|
||||
await AlertHistory.find({ alert: alert._id })
|
||||
.sort({ createdAt: -1 })
|
||||
|
|
@ -229,59 +324,141 @@ const processAlert = async (now: Date, alert: IAlert) => {
|
|||
previous,
|
||||
now,
|
||||
alert,
|
||||
logView,
|
||||
});
|
||||
return;
|
||||
}
|
||||
const history = await new AlertHistory({
|
||||
alert: alert._id,
|
||||
createdAt: nowInMinsRoundDown,
|
||||
}).save();
|
||||
const checkStartTime = previous
|
||||
? previous.createdAt
|
||||
: fns.subMinutes(nowInMinsRoundDown, windowSizeInMins);
|
||||
const checkEndTime = nowInMinsRoundDown;
|
||||
const check = await clickhouse.checkAlert({
|
||||
endTime: checkEndTime,
|
||||
groupBy: alert.groupBy,
|
||||
q: logView.query,
|
||||
startTime: checkStartTime,
|
||||
tableVersion: logView.team.logStreamTableVersion,
|
||||
teamId: logView.team._id.toString(),
|
||||
windowSizeInMins,
|
||||
});
|
||||
|
||||
logger.info({
|
||||
message: 'Received alert metric',
|
||||
alert,
|
||||
logView,
|
||||
check,
|
||||
checkStartTime,
|
||||
checkEndTime,
|
||||
});
|
||||
// Logs Source
|
||||
let checksData:
|
||||
| Awaited<ReturnType<typeof clickhouse.checkAlert>>
|
||||
| Awaited<ReturnType<typeof clickhouse.getLogsChart>>
|
||||
| null = null;
|
||||
let logView: Awaited<ReturnType<typeof getLogViewEnhanced>> | null = null;
|
||||
let targetDashboard: EnhancedDashboard | null = null;
|
||||
if (alert.source === 'LOG' && alert.logView) {
|
||||
logView = await getLogViewEnhanced(alert.logView);
|
||||
// TODO: use getLogsChart instead so we can deprecate checkAlert
|
||||
checksData = await clickhouse.checkAlert({
|
||||
endTime: checkEndTime,
|
||||
groupBy: alert.groupBy,
|
||||
q: logView.query,
|
||||
startTime: checkStartTime,
|
||||
tableVersion: logView.team.logStreamTableVersion,
|
||||
teamId: logView.team._id.toString(),
|
||||
windowSizeInMins,
|
||||
});
|
||||
logger.info({
|
||||
message: 'Received alert metric [LOG source]',
|
||||
alert,
|
||||
logView,
|
||||
checksData,
|
||||
checkStartTime,
|
||||
checkEndTime,
|
||||
});
|
||||
}
|
||||
// Chart Source
|
||||
else if (alert.source === 'CHART' && alert.dashboardId && alert.chartId) {
|
||||
const dashboard = await Dashboard.findOne(
|
||||
{
|
||||
_id: alert.dashboardId,
|
||||
'charts.id': alert.chartId,
|
||||
},
|
||||
{
|
||||
name: 1,
|
||||
charts: {
|
||||
$elemMatch: {
|
||||
id: alert.chartId,
|
||||
},
|
||||
},
|
||||
},
|
||||
).populate<{
|
||||
team: ITeam;
|
||||
}>('team');
|
||||
if (
|
||||
dashboard &&
|
||||
Array.isArray(dashboard.charts) &&
|
||||
dashboard.charts.length === 1
|
||||
) {
|
||||
const chart = dashboard.charts[0];
|
||||
// TODO: assuming that the chart has only 1 series for now
|
||||
const series = chart.series[0];
|
||||
if (series.type === 'time' && series.table === 'logs') {
|
||||
targetDashboard = dashboard;
|
||||
const MAX_NUM_GROUPS = 20;
|
||||
const startTimeMs = fns.getTime(checkStartTime);
|
||||
const endTimeMs = fns.getTime(checkEndTime);
|
||||
const propertyTypeMappingsModel =
|
||||
await clickhouse.buildLogsPropertyTypeMappingsModel(
|
||||
dashboard.team.logStreamTableVersion,
|
||||
dashboard.team._id.toString(),
|
||||
startTimeMs,
|
||||
endTimeMs,
|
||||
);
|
||||
checksData = await clickhouse.getLogsChart({
|
||||
aggFn: series.aggFn,
|
||||
endTime: endTimeMs,
|
||||
// @ts-expect-error
|
||||
field: series.field,
|
||||
granularity: `${windowSizeInMins} minute`,
|
||||
groupBy: series.groupBy[0],
|
||||
maxNumGroups: MAX_NUM_GROUPS,
|
||||
propertyTypeMappingsModel,
|
||||
q: series.where,
|
||||
startTime: startTimeMs,
|
||||
tableVersion: dashboard.team.logStreamTableVersion,
|
||||
teamId: dashboard.team._id.toString(),
|
||||
});
|
||||
}
|
||||
// TODO: support metrics table
|
||||
}
|
||||
|
||||
logger.info({
|
||||
message: 'Received alert metric [CHART source]',
|
||||
alert,
|
||||
checksData,
|
||||
checkStartTime,
|
||||
checkEndTime,
|
||||
});
|
||||
} else {
|
||||
logger.error({
|
||||
message: `Unsupported alert source: ${alert.source}`,
|
||||
alert,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const history = await new AlertHistory({
|
||||
alert: alert._id,
|
||||
createdAt: nowInMinsRoundDown,
|
||||
}).save();
|
||||
// TODO: support INSUFFICIENT_DATA state
|
||||
let alertState = AlertState.OK;
|
||||
if (check?.rows && check?.rows > 0) {
|
||||
for (const checkData of check.data) {
|
||||
const totalCount = parseInt(checkData.count);
|
||||
if (checksData?.rows && checksData?.rows > 0) {
|
||||
for (const checkData of checksData.data) {
|
||||
const totalCount = parseInt(checkData.data);
|
||||
if (doesExceedThreshold(alert, totalCount)) {
|
||||
alertState = AlertState.ALERT;
|
||||
logger.info({
|
||||
message: `Triggering ${alert.channel.type} alarm!`,
|
||||
alert,
|
||||
logView,
|
||||
totalCount,
|
||||
checkData,
|
||||
});
|
||||
const bucketStart = new Date(checkData.ts_bucket);
|
||||
const bucketStart = new Date(checkData.ts_bucket * 1000);
|
||||
|
||||
await fireChannelEvent({
|
||||
alert,
|
||||
logView,
|
||||
totalCount,
|
||||
group: checkData.group,
|
||||
startTime: bucketStart,
|
||||
dashboard: targetDashboard,
|
||||
endTime: fns.addMinutes(bucketStart, windowSizeInMins),
|
||||
group: checkData.group,
|
||||
logView,
|
||||
startTime: bucketStart,
|
||||
totalCount,
|
||||
windowSizeInMins,
|
||||
});
|
||||
history.counts += 1;
|
||||
}
|
||||
|
|
@ -289,11 +466,15 @@ const processAlert = async (now: Date, alert: IAlert) => {
|
|||
await history.save();
|
||||
}
|
||||
alert.state = alertState;
|
||||
await (alert as any).save();
|
||||
await alert.save();
|
||||
} catch (e) {
|
||||
// Uncomment this for better error messages locally
|
||||
// console.error(e);
|
||||
logger.error(serializeError(e));
|
||||
logger.error({
|
||||
message: 'Failed to process alert',
|
||||
alert,
|
||||
error: serializeError(e),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
import { IncomingWebhook } from '@slack/webhook';
|
||||
import { IncomingWebhook, IncomingWebhookSendArguments } from '@slack/webhook';
|
||||
|
||||
export function postMessageToWebhook(webhookUrl: string, message: any) {
|
||||
export function postMessageToWebhook(
|
||||
webhookUrl: string,
|
||||
message: IncomingWebhookSendArguments,
|
||||
) {
|
||||
const webhook = new IncomingWebhook(webhookUrl);
|
||||
return webhook.send({
|
||||
text: message.text,
|
||||
|
|
|
|||
|
|
@ -970,7 +970,7 @@ export const EditLineChartForm = ({
|
|||
}}
|
||||
/>
|
||||
|
||||
{config.CHART_ALERTS_ENABLED && (
|
||||
{editedChart.series[0].table === 'logs' && (
|
||||
<div className="mt-4 border-top border-bottom border-grey p-2 py-3">
|
||||
{isLocalDashboard ? (
|
||||
<span className="text-gray-600 fs-8">
|
||||
|
|
|
|||
|
|
@ -9,6 +9,3 @@ export const HDX_COLLECTOR_URL =
|
|||
'http://localhost:4318';
|
||||
|
||||
export const IS_OSS = process.env.NEXT_PUBLIC_IS_OSS ?? 'true' === 'true';
|
||||
|
||||
// Features in development
|
||||
export const CHART_ALERTS_ENABLED = process.env.NODE_ENV === 'development';
|
||||
|
|
|
|||
Loading…
Reference in a new issue