ci: Adds Docker stats telemetry (#26196)

This commit is contained in:
Declan Carroll 2026-02-25 08:18:23 +00:00 committed by GitHub
parent 562d867483
commit c92becbc34
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 363 additions and 37 deletions

View file

@ -36,6 +36,7 @@ return 'blacksmith';
| Telemetry | Source | Metrics |
|-----------|--------|---------|
| Build stats | `.github/scripts/send-build-stats.mjs` | Per-package build time, cache hits |
| Docker stats | `.github/scripts/send-docker-stats.mjs` | Image size, compiled artifact size, Docker build time |
| Container stack | `packages/testing/containers/telemetry.ts` | E2E startup times |
## Secrets
@ -44,6 +45,8 @@ return 'blacksmith';
BUILD_STATS_WEBHOOK_URL
BUILD_STATS_WEBHOOK_USER
BUILD_STATS_WEBHOOK_PASSWORD # Alphanumeric + hyphens only (no $!#@)
DOCKER_STATS_WEBHOOK_URL
```
## Adding New Telemetry

View file

@ -74,4 +74,5 @@ runs:
run: |
${{ inputs.build-command }} --summarize
node .github/scripts/send-build-stats.mjs || true
node .github/scripts/send-docker-stats.mjs || true
shell: bash

104
.github/scripts/send-docker-stats.mjs vendored Normal file
View file

@ -0,0 +1,104 @@
#!/usr/bin/env node
/**
* Sends Docker build stats to a webhook for BigQuery ingestion.
*
* Reads manifests produced by build-n8n.mjs and dockerize-n8n.mjs,
* enriches with git/CI/runner context, and POSTs to a webhook.
*
* Usage: node send-docker-stats.mjs
*
* Environment variables:
* DOCKER_STATS_WEBHOOK_URL - Webhook URL (required to send)
*/
import { existsSync, readFileSync } from 'node:fs';
import * as os from 'node:os';
// Manifest locations, resolved against the directory the script is started from.
const buildManifestPath = 'compiled/build-manifest.json';
const dockerManifestPath = 'docker-build-manifest.json';

/** Parses the JSON file at `manifestPath`, or returns null when no such file exists. */
const readManifestIfPresent = (manifestPath) => {
	if (!existsSync(manifestPath)) return null;
	return JSON.parse(readFileSync(manifestPath, 'utf-8'));
};

// Nothing to report when neither manifest is on disk.
const anyManifestFound = existsSync(buildManifestPath) || existsSync(dockerManifestPath);
if (!anyManifestFound) {
	console.log('No build or docker manifests found, skipping.');
	process.exit(0);
}

// Without a destination the script is a no-op (exit code 0, not a failure).
const webhookUrl = process.env.DOCKER_STATS_WEBHOOK_URL;
if (!webhookUrl) {
	console.log('DOCKER_STATS_WEBHOOK_URL not set, skipping.');
	process.exit(0);
}

// Manifests are parsed only after both skip checks have passed.
const buildManifest = readManifestIfPresent(buildManifestPath);
const dockerManifest = readManifestIfPresent(dockerManifestPath);

// Pull request refs look like refs/pull/123/merge; capture group 1 holds the PR number.
const prMatch = (process.env.GITHUB_REF ?? '').match(/refs\/pull\/(\d+)/);
/**
 * Identifies the runner provider executing this script
 * (matches packages/testing/containers/telemetry.ts).
 *
 * @returns 'local' when the CI variable is unset or empty, 'github' when
 *   RUNNER_ENVIRONMENT is 'github-hosted', and 'blacksmith' otherwise.
 */
function getRunnerProvider() {
	const { CI, RUNNER_ENVIRONMENT } = process.env;
	if (CI) {
		return RUNNER_ENVIRONMENT === 'github-hosted' ? 'github' : 'blacksmith';
	}
	return 'local';
}
// Telemetry payload. The `build` and `docker` sections are null when the
// corresponding manifest was not loaded; unset CI variables are reported as null.
const payload = {
	build: buildManifest
		? {
				artifactSize: buildManifest.artifactSize,
				buildDuration: buildManifest.buildDuration,
			}
		: null,
	docker: dockerManifest
		? {
				platform: dockerManifest.platform,
				images: dockerManifest.images,
			}
		: null,
	git: {
		// First 8 characters of the commit SHA.
		sha: process.env.GITHUB_SHA?.slice(0, 8) || null,
		// GITHUB_HEAD_REF only carries a value for pull request events. On other events the
		// runner exposes it as an empty string rather than leaving it unset, so `||` (not `??`)
		// is required for the fallback to GITHUB_REF_NAME to take effect.
		branch: process.env.GITHUB_HEAD_REF || process.env.GITHUB_REF_NAME || null,
		pr: prMatch ? parseInt(prMatch[1], 10) : null,
	},
	ci: {
		runId: process.env.GITHUB_RUN_ID || null,
		runUrl: process.env.GITHUB_RUN_ID
			? `https://github.com/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`
			: null,
		job: process.env.GITHUB_JOB || null,
		workflow: process.env.GITHUB_WORKFLOW || null,
		attempt: process.env.GITHUB_RUN_ATTEMPT ? parseInt(process.env.GITHUB_RUN_ATTEMPT, 10) : null,
	},
	runner: {
		provider: getRunnerProvider(),
		cpuCores: os.cpus().length,
		// Total memory in GiB, rounded to one decimal place.
		memoryGb: Math.round((os.totalmem() / (1024 * 1024 * 1024)) * 10) / 10,
	},
};

// POST the payload as JSON to the configured webhook.
const response = await fetch(webhookUrl, {
	method: 'POST',
	headers: {
		'Content-Type': 'application/json',
	},
	body: JSON.stringify(payload),
});

if (!response.ok) {
	// Log the status plus any response body, then exit non-zero.
	console.error(`Webhook failed: ${response.status} ${response.statusText}`);
	const body = await response.text();
	if (body) console.error(`Response: ${body}`);
	process.exit(1);
}

console.log(`Docker build stats sent: ${response.status}`);

View file

@ -46,6 +46,10 @@ jobs:
IMAGE_BASE_NAME: ghcr.io/${{ github.repository }}
IMAGE_TAG: ci-${{ github.run_id }}
RUNNERS_IMAGE_BASE_NAME: ghcr.io/${{ github.repository_owner }}/runners
BUILD_STATS_WEBHOOK_URL: ${{ secrets.BUILD_STATS_WEBHOOK_URL }}
BUILD_STATS_WEBHOOK_USER: ${{ secrets.BUILD_STATS_WEBHOOK_USER }}
BUILD_STATS_WEBHOOK_PASSWORD: ${{ secrets.BUILD_STATS_WEBHOOK_PASSWORD }}
DOCKER_STATS_WEBHOOK_URL: ${{ secrets.DOCKER_STATS_WEBHOOK_URL }}
- name: Generate shard matrix
id: generate-matrix

1
.gitignore vendored
View file

@ -37,6 +37,7 @@ test-results/
compiled_app_output
trivy_report*
compiled
docker-build-manifest.json
packages/cli/src/modules/my-feature
.secrets
packages/testing/**/.cursor/rules/

View file

@ -1,3 +1,5 @@
import { readFileSync } from 'node:fs';
import { mockInstance } from '@n8n/backend-test-utils';
import { GlobalConfig } from '@n8n/config';
import type { WorkflowRepository, LicenseMetricsRepository } from '@n8n/db';
@ -19,9 +21,12 @@ const mockMiddleware = (
next: express.NextFunction,
) => next();
jest.mock('node:fs', () => ({ readFileSync: jest.fn() }));
jest.mock('prom-client');
jest.mock('express-prom-bundle', () => jest.fn(() => mockMiddleware));
const mockedReadFileSync = jest.mocked(readFileSync);
describe('PrometheusMetricsService', () => {
let globalConfig: GlobalConfig;
let app: express.Application;
@ -85,6 +90,10 @@ describe('PrometheusMetricsService', () => {
promClient.Counter.prototype.inc = jest.fn();
(promClient.validateMetricName as jest.Mock).mockReturnValue(true);
mockedReadFileSync.mockImplementation(() => {
throw new Error('ENOENT: no such file or directory');
});
});
afterEach(() => {
@ -570,4 +579,85 @@ describe('PrometheusMetricsService', () => {
expect(hasInstanceRoleMetric).toBe(false);
});
});
describe('PSS metric', () => {
	const PSS_GAUGE_NAME = 'n8n_process_pss_bytes';

	/** Returns the config object the PSS gauge was constructed with, or undefined if it was never created. */
	function findPssGaugeConfig() {
		const gaugeCalls = (promClient.Gauge as jest.Mock).mock.calls;
		const pssCall = gaugeCalls.find(([gaugeConfig]) => gaugeConfig?.name === PSS_GAUGE_NAME);
		return pssCall?.[0];
	}

	it('should not set up PSS metric when default metrics are disabled', async () => {
		await prometheusMetricsService.init(app);

		const pssConfig = findPssGaugeConfig();
		expect(pssConfig).toBeUndefined();
	});

	it('should not set up PSS metric when smaps_rollup is not readable', async () => {
		// readFileSync keeps the throwing implementation installed in beforeEach.
		prometheusMetricsService.enableMetric('default');

		await prometheusMetricsService.init(app);

		const pssConfig = findPssGaugeConfig();
		expect(pssConfig).toBeUndefined();
	});

	it('should set up PSS metric when default metrics enabled and smaps_rollup is readable', async () => {
		prometheusMetricsService.enableMetric('default');
		mockedReadFileSync.mockReturnValue('Pss: 12345 kB' as never);

		await prometheusMetricsService.init(app);

		const pssConfig = findPssGaugeConfig();
		expect(pssConfig).toMatchObject({
			name: 'n8n_process_pss_bytes',
			help: 'Proportional Set Size of the process in bytes.',
		});
		expect(pssConfig.collect).toBeDefined();
	});

	it('should parse Pss value and convert kB to bytes in collect callback', async () => {
		prometheusMetricsService.enableMetric('default');
		mockedReadFileSync.mockReturnValue(
			'Rss: 100000 kB\nPss: 12345 kB\nShared_Clean: 5000 kB' as never,
		);

		await prometheusMetricsService.init(app);

		// `collect` reports through `this.set`, so a stub gauge is passed as `this`.
		const setSpy = jest.fn();
		const pssConfig = findPssGaugeConfig();
		pssConfig.collect.call({ set: setSpy });

		expect(setSpy).toHaveBeenCalledWith(12345 * 1024);
	});

	it('should not set gauge value when Pss line is not found in smaps_rollup', async () => {
		prometheusMetricsService.enableMetric('default');
		mockedReadFileSync.mockReturnValue('some content without pss' as never);

		await prometheusMetricsService.init(app);

		const setSpy = jest.fn();
		const pssConfig = findPssGaugeConfig();
		pssConfig.collect.call({ set: setSpy });

		expect(setSpy).not.toHaveBeenCalled();
	});

	it('should silently handle readFileSync failure in collect callback', async () => {
		prometheusMetricsService.enableMetric('default');
		// Only the availability probe during init gets a readable file.
		mockedReadFileSync.mockReturnValueOnce('Pss: 1 kB' as never);

		await prometheusMetricsService.init(app);

		// Every later read, including the one inside collect, fails.
		mockedReadFileSync.mockImplementation(() => {
			throw new Error('EACCES: permission denied');
		});

		const setSpy = jest.fn();
		const pssConfig = findPssGaugeConfig();
		const runCollect = () => pssConfig.collect.call({ set: setSpy });

		expect(runCollect).not.toThrow();
		expect(setSpy).not.toHaveBeenCalled();
	});
});
});

View file

@ -1,3 +1,5 @@
import { readFileSync } from 'node:fs';
import { GlobalConfig } from '@n8n/config';
import { Time } from '@n8n/constants';
import { LicenseMetricsRepository, WorkflowRepository } from '@n8n/db';
@ -60,6 +62,7 @@ export class PrometheusMetricsService {
async init(app: express.Application) {
promClient.register.clear(); // clear all metrics in case we call this a second time
this.initDefaultMetrics();
this.initPssMetric();
this.initN8nVersionMetric();
if (this.instanceSettings.instanceType === 'main') this.initInstanceRoleMetric();
this.initCacheMetrics();
@ -145,6 +148,46 @@ export class PrometheusMetricsService {
promClient.collectDefaultMetrics({ prefix: this.globalConfig.endpoints.metrics.prefix });
}
/**
 * Register the PSS (Proportional Set Size) gauge: `n8n_process_pss_bytes`
 *
 * RSS double-counts pages shared between processes, whereas PSS splits shared
 * memory proportionally among the processes using it. That makes it a fairer
 * memory measurement in containerized environments where shared libraries are
 * common. Only available on Linux with kernel 4.14+.
 *
 * Nothing is registered when default metrics are disabled or when
 * `/proc/self/smaps_rollup` cannot be read.
 */
private initPssMetric() {
	if (!this.includes.metrics.default) return;

	const smapsRollupPath = '/proc/self/smaps_rollup';

	// One-off availability probe: bail out before creating the gauge if the file is unreadable.
	try {
		readFileSync(smapsRollupPath, 'utf8');
	} catch {
		return;
	}

	const pssLinePattern = /^Pss:\s+(\d+)\s+kB$/m;

	new promClient.Gauge({
		name: `${this.prefix}process_pss_bytes`,
		help: 'Proportional Set Size of the process in bytes.',
		// Must stay a method (not an arrow function): `this` is the gauge being collected.
		collect() {
			try {
				// Sync read is intentional: /proc is a kernel virtual filesystem (microseconds, no disk I/O).
				// This matches prom-client's own built-in metrics which use process.memoryUsage() (also /proc).
				const rollup = readFileSync(smapsRollupPath, 'utf8');
				const pssLine = rollup.match(pssLinePattern);
				if (pssLine) {
					// The kernel reports kB; the gauge is exposed in bytes.
					this.set(parseInt(pssLine[1], 10) * 1024);
				}
			} catch {
				// Failed to read smaps_rollup, skip this scrape
			}
		},
	});
}
/**
* Set up metrics for server routes with `express-prom-bundle`. The same
* middleware is also utilized for an instance activity metric

View file

@ -8,36 +8,34 @@ test.use({
},
});
test.describe('Memory Consumption @capability:observability', {
annotation: [
{ type: 'owner', description: 'Catalysts' },
],
}, () => {
test('Memory consumption baseline with starter plan resources', async ({
n8nContainer,
services,
}, testInfo) => {
const obs = services.observability;
test.describe(
'Memory Consumption @capability:observability',
{
annotation: [{ type: 'owner', description: 'Catalysts' }],
},
() => {
test('Memory consumption baseline with starter plan resources', async ({
n8nContainer,
services,
}, testInfo) => {
const obs = services.observability;
const { heapUsedMB } = await getStableHeap(n8nContainer.baseUrl, obs.metrics);
const result = await getStableHeap(n8nContainer.baseUrl, obs.metrics);
const [heapTotalResult, rssResult] = await Promise.all([
obs.metrics.waitForMetric('n8n_nodejs_heap_size_total_bytes / 1024 / 1024'),
obs.metrics.waitForMetric('n8n_process_resident_memory_bytes / 1024 / 1024'),
]);
const heapTotalMB = heapTotalResult!.value;
const rssMB = rssResult!.value;
await attachMetric(testInfo, 'memory-heap-used-baseline', result.heapUsedMB, 'MB');
await attachMetric(testInfo, 'memory-heap-total-baseline', result.heapTotalMB, 'MB');
await attachMetric(testInfo, 'memory-rss-baseline', result.rssMB, 'MB');
await attachMetric(testInfo, 'memory-pss-baseline', result.pssMB ?? 0, 'MB');
await attachMetric(
testInfo,
'memory-non-heap-overhead-baseline',
result.nonHeapOverheadMB,
'MB',
);
console.log(
`[MEMORY] Heap Used: ${heapUsedMB.toFixed(2)} MB | Heap Total: ${heapTotalMB.toFixed(2)} MB | RSS: ${rssMB.toFixed(2)} MB`,
);
await attachMetric(testInfo, 'memory-heap-used-baseline', heapUsedMB, 'MB');
await attachMetric(testInfo, 'memory-heap-total-baseline', heapTotalMB, 'MB');
await attachMetric(testInfo, 'memory-rss-baseline', rssMB, 'MB');
expect(heapUsedMB).toBeGreaterThan(0);
expect(heapTotalMB).toBeGreaterThan(0);
expect(rssMB).toBeGreaterThan(0);
});
});
expect(result.heapUsedMB).toBeGreaterThan(0);
expect(result.heapTotalMB).toBeGreaterThan(0);
expect(result.rssMB).toBeGreaterThan(0);
});
},
);

View file

@ -2,6 +2,9 @@ import type { Page, TestInfo } from '@playwright/test';
import type { MetricsHelper } from 'n8n-containers';
// Query expressions handed to MetricsHelper (`query` / `waitForMetric`).
// Each divides a byte-valued metric by 1024 twice, so results are in mebibytes.
const HEAP_USED_QUERY = 'n8n_nodejs_heap_size_used_bytes / 1024 / 1024';
const HEAP_TOTAL_QUERY = 'n8n_nodejs_heap_size_total_bytes / 1024 / 1024';
const RSS_QUERY = 'n8n_process_resident_memory_bytes / 1024 / 1024';
const PSS_QUERY = 'n8n_process_pss_bytes / 1024 / 1024';
export async function measurePerformance(
page: Page,
@ -49,13 +52,18 @@ export interface StableHeapOptions {
/** Memory readings reported once heap usage has stabilized. */
export interface StableHeapResult {
/** Heap-used reading (MB) taken at the moment stability was reached. */
heapUsedMB: number;
/** Median of the heap-total samples (MB) gathered while stabilizing; 0 when no sample was gathered. */
heapTotalMB: number;
/** Median of the RSS samples (MB) gathered while stabilizing; 0 when no sample was gathered. */
rssMB: number;
/** Median of the PSS samples (MB), or null when no PSS sample was gathered. */
pssMB: number | null;
/** `rssMB - heapTotalMB`. Not clamped, so it can be negative. */
nonHeapOverheadMB: number;
/** Milliseconds elapsed between the start of the wait and reaching stability. */
stabilizationTimeMs: number;
/** Number of heap-used readings obtained before stability was reached. */
readingsCount: number;
}
/**
* Trigger GC and wait for heap memory to stabilize.
* Uses consecutive stable readings to ensure metrics have settled post-GC.
* Collects RSS, PSS, and heap total samples during the stabilization window
* and returns median values to reduce point-in-time noise.
*/
export async function getStableHeap(
baseUrl: string,
@ -100,6 +108,38 @@ interface StabilizationConfig {
stableReadingsRequired: number;
}
/** Accumulates the extra memory readings (one array per metric) gathered during stabilization. */
interface MemorySamples {
/** Values returned for HEAP_TOTAL_QUERY. */
heapTotal: number[];
/** Values returned for RSS_QUERY. */
rss: number[];
/** Values returned for PSS_QUERY; stays empty when that query yields no result. */
pss: number[];
}
/**
 * Median of a list of numbers.
 *
 * @param values - Numbers in any order; the array is left untouched.
 * @returns The middle value, the mean of the two middle values for an
 *   even-length list, or 0 for an empty list.
 */
function median(values: number[]): number {
	const count = values.length;
	if (count === 0) return 0;

	// Sort a copy numerically (the default sort compares as strings).
	const ascending = values.slice().sort((left, right) => left - right);
	const upperMiddle = Math.floor(count / 2);

	if (count % 2 === 0) {
		return (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
	}
	return ascending[upperMiddle];
}
/**
 * Queries heap total, RSS and PSS once (in parallel) and appends every value
 * that came back to the matching array in `samples`.
 *
 * Best-effort: if any of the queries rejects, nothing is recorded for this
 * round and the error is swallowed.
 *
 * @param metrics - Helper used to run the three queries.
 * @param samples - Accumulator that is mutated in place.
 */
async function collectAdditionalSamples(
	metrics: MetricsHelper,
	samples: MemorySamples,
): Promise<void> {
	try {
		const [heapTotalSeries, rssSeries, pssSeries] = await Promise.all([
			metrics.query(HEAP_TOTAL_QUERY),
			metrics.query(RSS_QUERY),
			metrics.query(PSS_QUERY),
		]);

		// Only the first entry of each result is used; a missing result adds no sample.
		const heapTotalPoint = heapTotalSeries?.[0];
		if (heapTotalPoint) samples.heapTotal.push(heapTotalPoint.value);

		const rssPoint = rssSeries?.[0];
		if (rssPoint) samples.rss.push(rssPoint.value);

		const pssPoint = pssSeries?.[0];
		if (pssPoint) samples.pss.push(pssPoint.value);
	} catch {
		// Non-critical, skip this sample
	}
}
async function waitForStableMemory(
metrics: MetricsHelper,
config: StabilizationConfig,
@ -109,6 +149,7 @@ async function waitForStableMemory(
let lastValue = 0;
let stableCount = 0;
let readingsCount = 0;
const samples: MemorySamples = { heapTotal: [], rss: [], pss: [] };
while (Date.now() - startTime < maxWaitMs) {
const result = await metrics.waitForMetric(HEAP_USED_QUERY, {
@ -119,17 +160,42 @@ async function waitForStableMemory(
if (result) {
readingsCount++;
const currentValue = result.value;
await collectAdditionalSamples(metrics, samples);
const delta = Math.abs(currentValue - lastValue);
if (lastValue > 0 && delta < thresholdMB) {
stableCount++;
if (stableCount >= stableReadingsRequired) {
const stabilizationTimeMs = Date.now() - startTime;
const heapUsedMB = currentValue;
const heapTotalMB = median(samples.heapTotal);
const rssMB = median(samples.rss);
const pssMB = samples.pss.length > 0 ? median(samples.pss) : null;
// Can theoretically go negative if RSS/heapTotal medians come from slightly
// different sample windows. A negative value would indicate a measurement
// timing issue — don't clamp to 0, surface it for investigation.
const nonHeapOverheadMB = rssMB - heapTotalMB;
console.log(
`[STABILIZATION] Memory stabilized at ${currentValue.toFixed(2)} MB ` +
`after ${stabilizationTimeMs}ms (${readingsCount} readings)`,
`[STABILIZATION] Memory stabilized after ${stabilizationTimeMs}ms (${readingsCount} readings)\n` +
` Heap Used: ${heapUsedMB.toFixed(2)} MB\n` +
` Heap Total: ${heapTotalMB.toFixed(2)} MB (median of ${samples.heapTotal.length})\n` +
` RSS: ${rssMB.toFixed(2)} MB (median of ${samples.rss.length})\n` +
` PSS: ${pssMB?.toFixed(2) ?? 'N/A'} MB${pssMB !== null ? ` (median of ${samples.pss.length})` : ''}\n` +
` Non-Heap Overhead: ${nonHeapOverheadMB.toFixed(2)} MB`,
);
return { heapUsedMB: currentValue, stabilizationTimeMs, readingsCount };
return {
heapUsedMB,
heapTotalMB,
rssMB,
pssMB,
nonHeapOverheadMB,
stabilizationTimeMs,
readingsCount,
};
}
} else {
stableCount = 0;

View file

@ -173,8 +173,7 @@ async function main() {
const n8nImageSize = await getImageSize(config.n8n.fullImageName);
const runnersImageSize = await getImageSize(config.runners.fullImageName);
// Display summary
displaySummary([
const imageStats = [
{
imageName: config.n8n.fullImageName,
platform,
@ -187,7 +186,24 @@ async function main() {
size: runnersImageSize,
buildTime: runnersBuildTime,
},
]);
];
// Write docker build manifest for telemetry collection
const dockerManifest = {
buildTime: new Date().toISOString(),
platform,
images: imageStats.map(({ imageName, size, buildTime }) => ({
imageName,
size,
buildTime,
})),
};
await fs.writeJson(path.join(config.buildContext, 'docker-build-manifest.json'), dockerManifest, {
spaces: 2,
});
// Display summary
displaySummary(imageStats);
}
async function checkPrerequisites() {