ci: Adds Docker stats telemetry (#26196)

2026-04-21 15:47:20 +00:00 · 2026-02-25 08:18:23 +00:00 · 2026-02-25 08:18:23 +00:00 · c92becbc34
commit c92becbc34
parent 562d867483
10 changed files with 363 additions and 37 deletions
--- a/.github/CI-TELEMETRY.md
+++ b/.github/CI-TELEMETRY.md
@ -36,6 +36,7 @@ return 'blacksmith';
 | Telemetry | Source | Metrics |
 |-----------|--------|---------|
 | Build stats | `.github/scripts/send-build-stats.mjs` | Per-package build time, cache hits |
+| Docker stats | `.github/scripts/send-docker-stats.mjs` | Image size, compiled artifact size, docker build time |
 | Container stack | `packages/testing/containers/telemetry.ts` | E2E startup times |

 ## Secrets
@ -44,6 +45,8 @@ return 'blacksmith';
 BUILD_STATS_WEBHOOK_URL
 BUILD_STATS_WEBHOOK_USER
 BUILD_STATS_WEBHOOK_PASSWORD  # Alphanumeric + hyphens only (no $!#@)
+
+DOCKER_STATS_WEBHOOK_URL
 ```

 ## Adding New Telemetry
--- a/.github/actions/setup-nodejs/action.yml
+++ b/.github/actions/setup-nodejs/action.yml
@ -74,4 +74,5 @@ runs:
      run: |
        ${{ inputs.build-command }} --summarize
        node .github/scripts/send-build-stats.mjs || true
+        node .github/scripts/send-docker-stats.mjs || true
      shell: bash
--- a/.github/scripts/send-docker-stats.mjs
+++ b/.github/scripts/send-docker-stats.mjs
@ -0,0 +1,104 @@
+#!/usr/bin/env node
+/**
+ * Sends Docker build stats to a webhook for BigQuery ingestion.
+ *
+ * Reads manifests produced by build-n8n.mjs and dockerize-n8n.mjs,
+ * enriches with git/CI/runner context, and POSTs to a webhook.
+ *
+ * Usage: node send-docker-stats.mjs
+ *
+ * Environment variables:
+ *   DOCKER_STATS_WEBHOOK_URL - Webhook URL (required to send)
+ */
+
+import { existsSync, readFileSync } from 'node:fs';
+import * as os from 'node:os';
+
+const buildManifestPath = 'compiled/build-manifest.json';
+const dockerManifestPath = 'docker-build-manifest.json';
+
+if (!existsSync(buildManifestPath) && !existsSync(dockerManifestPath)) {
+	console.log('No build or docker manifests found, skipping.');
+	process.exit(0);
+}
+
+const webhookUrl = process.env.DOCKER_STATS_WEBHOOK_URL;
+
+if (!webhookUrl) {
+	console.log('DOCKER_STATS_WEBHOOK_URL not set, skipping.');
+	process.exit(0);
+}
+
+const buildManifest = existsSync(buildManifestPath)
+	? JSON.parse(readFileSync(buildManifestPath, 'utf-8'))
+	: null;
+
+const dockerManifest = existsSync(dockerManifestPath)
+	? JSON.parse(readFileSync(dockerManifestPath, 'utf-8'))
+	: null;
+
+// Extract PR number from GITHUB_REF (refs/pull/123/merge)
+const ref = process.env.GITHUB_REF ?? '';
+const prMatch = ref.match(/refs\/pull\/(\d+)/);
+
+// Detect runner provider (matches packages/testing/containers/telemetry.ts)
+function getRunnerProvider() {
+	if (!process.env.CI) return 'local';
+	if (process.env.RUNNER_ENVIRONMENT === 'github-hosted') return 'github';
+	return 'blacksmith';
+}
+
+const payload = {
+	build: buildManifest
+		? {
+				artifactSize: buildManifest.artifactSize,
+				buildDuration: buildManifest.buildDuration,
+			}
+		: null,
+
+	docker: dockerManifest
+		? {
+				platform: dockerManifest.platform,
+				images: dockerManifest.images,
+			}
+		: null,
+
+	git: {
+		sha: process.env.GITHUB_SHA?.slice(0, 8) || null,
+		branch: process.env.GITHUB_HEAD_REF ?? process.env.GITHUB_REF_NAME ?? null,
+		pr: prMatch ? parseInt(prMatch[1], 10) : null,
+	},
+
+	ci: {
+		runId: process.env.GITHUB_RUN_ID || null,
+		runUrl: process.env.GITHUB_RUN_ID
+			? `https://github.com/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`
+			: null,
+		job: process.env.GITHUB_JOB || null,
+		workflow: process.env.GITHUB_WORKFLOW || null,
+		attempt: process.env.GITHUB_RUN_ATTEMPT ? parseInt(process.env.GITHUB_RUN_ATTEMPT, 10) : null,
+	},
+
+	runner: {
+		provider: getRunnerProvider(),
+		cpuCores: os.cpus().length,
+		memoryGb: Math.round((os.totalmem() / (1024 * 1024 * 1024)) * 10) / 10,
+	},
+};
+
+const response = await fetch(webhookUrl, {
+	method: 'POST',
+	headers: {
+		'Content-Type': 'application/json',
+	},
+	body: JSON.stringify(payload),
+});
+
+if (!response.ok) {
+	console.error(`Webhook failed: ${response.status} ${response.statusText}`);
+	const body = await response.text();
+	if (body) console.error(`Response: ${body}`);
+	process.exit(1);
+}
+
+console.log(`Docker build stats sent: ${response.status}`);
--- a/.github/workflows/test-e2e-ci-reusable.yml
+++ b/.github/workflows/test-e2e-ci-reusable.yml
@ -46,6 +46,10 @@ jobs:
          IMAGE_BASE_NAME: ghcr.io/${{ github.repository }}
          IMAGE_TAG: ci-${{ github.run_id }}
          RUNNERS_IMAGE_BASE_NAME: ghcr.io/${{ github.repository_owner }}/runners
+          BUILD_STATS_WEBHOOK_URL: ${{ secrets.BUILD_STATS_WEBHOOK_URL }}
+          BUILD_STATS_WEBHOOK_USER: ${{ secrets.BUILD_STATS_WEBHOOK_USER }}
+          BUILD_STATS_WEBHOOK_PASSWORD: ${{ secrets.BUILD_STATS_WEBHOOK_PASSWORD }}
+          DOCKER_STATS_WEBHOOK_URL: ${{ secrets.DOCKER_STATS_WEBHOOK_URL }}

      - name: Generate shard matrix
        id: generate-matrix
--- a/.gitignore
+++ b/.gitignore
@ -37,6 +37,7 @@ test-results/
 compiled_app_output
 trivy_report*
 compiled
+docker-build-manifest.json
 packages/cli/src/modules/my-feature
 .secrets
 packages/testing/**/.cursor/rules/
--- a/packages/cli/src/metrics/tests/prometheus-metrics.service.test.ts
+++ b/packages/cli/src/metrics/tests/prometheus-metrics.service.test.ts
@ -1,3 +1,5 @@
+import { readFileSync } from 'node:fs';
+
 import { mockInstance } from '@n8n/backend-test-utils';
 import { GlobalConfig } from '@n8n/config';
 import type { WorkflowRepository, LicenseMetricsRepository } from '@n8n/db';
@ -19,9 +21,12 @@ const mockMiddleware = (
 	next: express.NextFunction,
 ) => next();

+jest.mock('node:fs', () => ({ readFileSync: jest.fn() }));
 jest.mock('prom-client');
 jest.mock('express-prom-bundle', () => jest.fn(() => mockMiddleware));

+const mockedReadFileSync = jest.mocked(readFileSync);
+
 describe('PrometheusMetricsService', () => {
 	let globalConfig: GlobalConfig;
 	let app: express.Application;
@ -85,6 +90,10 @@ describe('PrometheusMetricsService', () => {

 		promClient.Counter.prototype.inc = jest.fn();
 		(promClient.validateMetricName as jest.Mock).mockReturnValue(true);
+
+		mockedReadFileSync.mockImplementation(() => {
+			throw new Error('ENOENT: no such file or directory');
+		});
 	});

 	afterEach(() => {
@ -570,4 +579,85 @@ describe('PrometheusMetricsService', () => {
 			expect(hasInstanceRoleMetric).toBe(false);
 		});
 	});
+
+	describe('PSS metric', () => {
+		const PSS_GAUGE_NAME = 'n8n_process_pss_bytes';
+
+		// Options object the PSS gauge was constructed with, or undefined when it was never created
+		const findPssGaugeConfig = () => {
+			const gaugeCalls = (promClient.Gauge as jest.Mock).mock.calls;
+			const pssCall = gaugeCalls.find(([options]) => options?.name === PSS_GAUGE_NAME);
+			return pssCall?.[0];
+		};
+
+		// Enables default metrics, makes every smaps_rollup read return `content`, then initialises the service
+		const initWithSmapsContent = async (content: string) => {
+			prometheusMetricsService.enableMetric('default');
+			mockedReadFileSync.mockReturnValue(content as never);
+
+			await prometheusMetricsService.init(app);
+		};
+
+		it('should not set up PSS metric when default metrics are disabled', async () => {
+			await prometheusMetricsService.init(app);
+
+			const config = findPssGaugeConfig();
+			expect(config).toBeUndefined();
+		});
+
+		it('should not set up PSS metric when smaps_rollup is not readable', async () => {
+			prometheusMetricsService.enableMetric('default');
+
+			await prometheusMetricsService.init(app);
+
+			const config = findPssGaugeConfig();
+			expect(config).toBeUndefined();
+		});
+
+		it('should set up PSS metric when default metrics enabled and smaps_rollup is readable', async () => {
+			await initWithSmapsContent('Pss:    12345 kB');
+
+			const config = findPssGaugeConfig();
+			expect(config).toMatchObject({
+				name: PSS_GAUGE_NAME,
+				help: 'Proportional Set Size of the process in bytes.',
+			});
+			expect(config.collect).toBeDefined();
+		});
+
+		it('should parse Pss value and convert kB to bytes in collect callback', async () => {
+			await initWithSmapsContent('Rss:   100000 kB\nPss:    12345 kB\nShared_Clean:  5000 kB');
+
+			const gaugeStub = { set: jest.fn() };
+			findPssGaugeConfig().collect.call(gaugeStub);
+
+			expect(gaugeStub.set).toHaveBeenCalledWith(12345 * 1024);
+		});
+
+		it('should not set gauge value when Pss line is not found in smaps_rollup', async () => {
+			await initWithSmapsContent('some content without pss');
+
+			const gaugeStub = { set: jest.fn() };
+			findPssGaugeConfig().collect.call(gaugeStub);
+
+			expect(gaugeStub.set).not.toHaveBeenCalled();
+		});
+
+		it('should silently handle readFileSync failure in collect callback', async () => {
+			prometheusMetricsService.enableMetric('default');
+			// Only the availability probe during init gets a readable file
+			mockedReadFileSync.mockReturnValueOnce('Pss:    1 kB' as never);
+
+			await prometheusMetricsService.init(app);
+
+			// Every later read, i.e. the one inside collect, throws
+			mockedReadFileSync.mockImplementation(() => {
+				throw new Error('EACCES: permission denied');
+			});
+
+			const gaugeStub = { set: jest.fn() };
+			const config = findPssGaugeConfig();
+			expect(() => config.collect.call(gaugeStub)).not.toThrow();
+			expect(gaugeStub.set).not.toHaveBeenCalled();
+		});
+	});
 });
--- a/packages/cli/src/metrics/prometheus-metrics.service.ts
+++ b/packages/cli/src/metrics/prometheus-metrics.service.ts
@ -1,3 +1,5 @@
+import { readFileSync } from 'node:fs';
+
 import { GlobalConfig } from '@n8n/config';
 import { Time } from '@n8n/constants';
 import { LicenseMetricsRepository, WorkflowRepository } from '@n8n/db';
@ -60,6 +62,7 @@ export class PrometheusMetricsService {
 	async init(app: express.Application) {
 		promClient.register.clear(); // clear all metrics in case we call this a second time
 		this.initDefaultMetrics();
+		this.initPssMetric();
 		this.initN8nVersionMetric();
 		if (this.instanceSettings.instanceType === 'main') this.initInstanceRoleMetric();
 		this.initCacheMetrics();
@ -145,6 +148,46 @@ export class PrometheusMetricsService {
 		promClient.collectDefaultMetrics({ prefix: this.globalConfig.endpoints.metrics.prefix });
 	}

+	/**
+	 * Register the `n8n_process_pss_bytes` gauge (PSS, Proportional Set Size).
+	 *
+	 * RSS double-counts pages shared between processes; PSS splits shared memory
+	 * proportionally among them, which gives a fairer memory figure in
+	 * containerized environments where shared libraries are common.
+	 * Only available on Linux with kernel 4.14+.
+	 *
+	 * Does nothing when default metrics are disabled or when
+	 * `/proc/self/smaps_rollup` cannot be read at init time.
+	 */
+	private initPssMetric() {
+		if (!this.includes.metrics.default) return;
+
+		// Sync read is intentional: /proc is a kernel virtual filesystem (microseconds, no disk I/O).
+		// This matches prom-client's own built-in metrics which use process.memoryUsage() (also /proc).
+		const readSmapsRollup = () => readFileSync('/proc/self/smaps_rollup', 'utf8');
+
+		// Probe once up front; if the file is unreadable the gauge is never registered
+		try {
+			readSmapsRollup();
+		} catch {
+			return;
+		}
+
+		new promClient.Gauge({
+			name: `${this.prefix}process_pss_bytes`,
+			help: 'Proportional Set Size of the process in bytes.',
+			collect() {
+				try {
+					const pssKb = /^Pss:\s+(\d+)\s+kB$/m.exec(readSmapsRollup())?.[1];
+					// The kernel reports kB; the metric is exposed in bytes
+					if (pssKb !== undefined) this.set(parseInt(pssKb, 10) * 1024);
+				} catch {
+					// Failed to read smaps_rollup, skip this scrape
+				}
+			},
+		});
+	}
+
 	/**
 	 * Set up metrics for server routes with `express-prom-bundle`. The same
 	 * middleware is also utilized for an instance activity metric
--- a/packages/testing/playwright/tests/performance/memory-consumption-cloud.spec.ts
+++ b/packages/testing/playwright/tests/performance/memory-consumption-cloud.spec.ts
@ -8,36 +8,34 @@ test.use({
 	},
 });

-test.describe('Memory Consumption @capability:observability', {
-	annotation: [
-		{ type: 'owner', description: 'Catalysts' },
-	],
-}, () => {
-	test('Memory consumption baseline with starter plan resources', async ({
-		n8nContainer,
-		services,
-	}, testInfo) => {
-		const obs = services.observability;
+test.describe(
+	'Memory Consumption @capability:observability',
+	{
+		annotation: [{ type: 'owner', description: 'Catalysts' }],
+	},
+	() => {
+		test('Memory consumption baseline with starter plan resources', async ({
+			n8nContainer,
+			services,
+		}, testInfo) => {
+			const obs = services.observability;

-		const { heapUsedMB } = await getStableHeap(n8nContainer.baseUrl, obs.metrics);
+			const result = await getStableHeap(n8nContainer.baseUrl, obs.metrics);

-		const [heapTotalResult, rssResult] = await Promise.all([
-			obs.metrics.waitForMetric('n8n_nodejs_heap_size_total_bytes / 1024 / 1024'),
-			obs.metrics.waitForMetric('n8n_process_resident_memory_bytes / 1024 / 1024'),
-		]);
-		const heapTotalMB = heapTotalResult!.value;
-		const rssMB = rssResult!.value;
+			await attachMetric(testInfo, 'memory-heap-used-baseline', result.heapUsedMB, 'MB');
+			await attachMetric(testInfo, 'memory-heap-total-baseline', result.heapTotalMB, 'MB');
+			await attachMetric(testInfo, 'memory-rss-baseline', result.rssMB, 'MB');
+			// pssMB is null when no n8n_process_pss_bytes sample was collected; skip the
+			// attachment rather than recording a misleading 0 MB baseline
+			if (result.pssMB !== null) {
+				await attachMetric(testInfo, 'memory-pss-baseline', result.pssMB, 'MB');
+			}
+			await attachMetric(
+				testInfo,
+				'memory-non-heap-overhead-baseline',
+				result.nonHeapOverheadMB,
+				'MB',
+			);

-		console.log(
-			`[MEMORY] Heap Used: ${heapUsedMB.toFixed(2)} MB | Heap Total: ${heapTotalMB.toFixed(2)} MB | RSS: ${rssMB.toFixed(2)} MB`,
-		);
-
-		await attachMetric(testInfo, 'memory-heap-used-baseline', heapUsedMB, 'MB');
-		await attachMetric(testInfo, 'memory-heap-total-baseline', heapTotalMB, 'MB');
-		await attachMetric(testInfo, 'memory-rss-baseline', rssMB, 'MB');
-
-		expect(heapUsedMB).toBeGreaterThan(0);
-		expect(heapTotalMB).toBeGreaterThan(0);
-		expect(rssMB).toBeGreaterThan(0);
-	});
-});
+			expect(result.heapUsedMB).toBeGreaterThan(0);
+			expect(result.heapTotalMB).toBeGreaterThan(0);
+			expect(result.rssMB).toBeGreaterThan(0);
+		});
+	},
+);
--- a/packages/testing/playwright/utils/performance-helper.ts
+++ b/packages/testing/playwright/utils/performance-helper.ts
@ -2,6 +2,9 @@ import type { Page, TestInfo } from '@playwright/test';
 import type { MetricsHelper } from 'n8n-containers';

 const HEAP_USED_QUERY = 'n8n_nodejs_heap_size_used_bytes / 1024 / 1024';
+const HEAP_TOTAL_QUERY = 'n8n_nodejs_heap_size_total_bytes / 1024 / 1024';
+const RSS_QUERY = 'n8n_process_resident_memory_bytes / 1024 / 1024';
+const PSS_QUERY = 'n8n_process_pss_bytes / 1024 / 1024';

 export async function measurePerformance(
 	page: Page,
@ -49,13 +52,18 @@ export interface StableHeapOptions {

 export interface StableHeapResult {
 	heapUsedMB: number;
+	heapTotalMB: number;
+	rssMB: number;
+	pssMB: number | null;
+	nonHeapOverheadMB: number;
 	stabilizationTimeMs: number;
 	readingsCount: number;
 }

 /**
 * Trigger GC and wait for heap memory to stabilize.
- * Uses consecutive stable readings to ensure metrics have settled post-GC.
+ * Collects RSS, PSS, and heap total samples during the stabilization window
+ * and returns median values to reduce point-in-time noise.
 */
 export async function getStableHeap(
 	baseUrl: string,
@ -100,6 +108,38 @@ interface StabilizationConfig {
 	stableReadingsRequired: number;
 }

+interface MemorySamples { // per-metric sample arrays appended to by collectAdditionalSamples
+	heapTotal: number[]; // values returned for HEAP_TOTAL_QUERY
+	rss: number[]; // values returned for RSS_QUERY
+	pss: number[]; // values returned for PSS_QUERY
+}
+
+/**
+ * Median of a list of numbers.
+ *
+ * @param values Numbers in any order; the array is not modified.
+ * @returns The middle value, the mean of the two middle values for an
+ *   even-length list, or 0 for an empty list.
+ */
+function median(values: number[]): number {
+	const count = values.length;
+	if (count === 0) return 0;
+
+	const ordered = values.slice().sort((x, y) => x - y);
+	const upper = Math.floor(count / 2);
+	if (count % 2 === 0) {
+		return (ordered[upper - 1] + ordered[upper]) / 2;
+	}
+	return ordered[upper];
+}
+
+/**
+ * Query heap total, RSS and PSS once and append the first result of each
+ * query to the matching array in `samples`.
+ *
+ * Best effort: if any query rejects, nothing is appended for this round and
+ * the error is swallowed. A query that yields no first result is skipped on
+ * its own.
+ */
+async function collectAdditionalSamples(
+	metrics: MetricsHelper,
+	samples: MemorySamples,
+): Promise<void> {
+	const targets: Array<[string, number[]]> = [
+		[HEAP_TOTAL_QUERY, samples.heapTotal],
+		[RSS_QUERY, samples.rss],
+		[PSS_QUERY, samples.pss],
+	];
+
+	try {
+		const responses = await Promise.all(targets.map(([query]) => metrics.query(query)));
+
+		responses.forEach((response, index) => {
+			const first = response?.[0];
+			if (first) targets[index][1].push(first.value);
+		});
+	} catch {
+		// Non-critical, skip this sample
+	}
+}
+
 async function waitForStableMemory(
 	metrics: MetricsHelper,
 	config: StabilizationConfig,
@ -109,6 +149,7 @@ async function waitForStableMemory(
 	let lastValue = 0;
 	let stableCount = 0;
 	let readingsCount = 0;
+	const samples: MemorySamples = { heapTotal: [], rss: [], pss: [] };

 	while (Date.now() - startTime < maxWaitMs) {
 		const result = await metrics.waitForMetric(HEAP_USED_QUERY, {
@ -119,17 +160,42 @@ async function waitForStableMemory(
 		if (result) {
 			readingsCount++;
 			const currentValue = result.value;
+
+			await collectAdditionalSamples(metrics, samples);
+
 			const delta = Math.abs(currentValue - lastValue);

 			if (lastValue > 0 && delta < thresholdMB) {
 				stableCount++;
 				if (stableCount >= stableReadingsRequired) {
 					const stabilizationTimeMs = Date.now() - startTime;
+					const heapUsedMB = currentValue;
+					const heapTotalMB = median(samples.heapTotal);
+					const rssMB = median(samples.rss);
+					const pssMB = samples.pss.length > 0 ? median(samples.pss) : null;
+					// Can theoretically go negative if RSS/heapTotal medians come from slightly
+					// different sample windows. A negative value would indicate a measurement
+					// timing issue — don't clamp to 0, surface it for investigation.
+					const nonHeapOverheadMB = rssMB - heapTotalMB;
+
 					console.log(
-						`[STABILIZATION] Memory stabilized at ${currentValue.toFixed(2)} MB ` +
-							`after ${stabilizationTimeMs}ms (${readingsCount} readings)`,
+						`[STABILIZATION] Memory stabilized after ${stabilizationTimeMs}ms (${readingsCount} readings)\n` +
+							`  Heap Used: ${heapUsedMB.toFixed(2)} MB\n` +
+							`  Heap Total: ${heapTotalMB.toFixed(2)} MB (median of ${samples.heapTotal.length})\n` +
+							`  RSS: ${rssMB.toFixed(2)} MB (median of ${samples.rss.length})\n` +
+							`  PSS: ${pssMB?.toFixed(2) ?? 'N/A'} MB${pssMB !== null ? ` (median of ${samples.pss.length})` : ''}\n` +
+							`  Non-Heap Overhead: ${nonHeapOverheadMB.toFixed(2)} MB`,
 					);
-					return { heapUsedMB: currentValue, stabilizationTimeMs, readingsCount };
+
+					return {
+						heapUsedMB,
+						heapTotalMB,
+						rssMB,
+						pssMB,
+						nonHeapOverheadMB,
+						stabilizationTimeMs,
+						readingsCount,
+					};
 				}
 			} else {
 				stableCount = 0;
--- a/scripts/dockerize-n8n.mjs
+++ b/scripts/dockerize-n8n.mjs
@ -173,8 +173,7 @@ async function main() {
 	const n8nImageSize = await getImageSize(config.n8n.fullImageName);
 	const runnersImageSize = await getImageSize(config.runners.fullImageName);

-	// Display summary
-	displaySummary([
+	const imageStats = [
 		{
 			imageName: config.n8n.fullImageName,
 			platform,
@ -187,7 +186,24 @@ async function main() {
 			size: runnersImageSize,
 			buildTime: runnersBuildTime,
 		},
-	]);
+	];
+
+	// Write docker build manifest for telemetry collection
+	const dockerManifest = {
+		buildTime: new Date().toISOString(),
+		platform,
+		images: imageStats.map(({ imageName, size, buildTime }) => ({
+			imageName,
+			size,
+			buildTime,
+		})),
+	};
+	await fs.writeJson(path.join(config.buildContext, 'docker-build-manifest.json'), dockerManifest, {
+		spaces: 2,
+	});
+
+	// Display summary
+	displaySummary(imageStats);
 }

 async function checkPrerequisites() {