feat(telemetry): anonymous PostHog workflow-invocation tracking (#1262)

* feat(telemetry): add anonymous PostHog workflow-invocation tracking

Emits one `workflow_invoked` event per run with workflow name/description,
platform, and Archon version. Uses a stable random UUID persisted to
`$ARCHON_HOME/telemetry-id` for distinct-install counting, with
`$process_person_profile: false` to stay in PostHog's anonymous tier.

Opt out with `ARCHON_TELEMETRY_DISABLED=1` or `DO_NOT_TRACK=1`. Self-host
via `POSTHOG_API_KEY` / `POSTHOG_HOST`.

Closes #1261

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* test(telemetry): stop leaking test events to production PostHog

The `telemetry-id preservation` test exercised the real capture path with
the embedded production key, so every `bun run validate` published a
tombstone `workflow_name: "w"` event. Redirect POSTHOG_HOST to loopback
so the flush fails silently; bump test timeout to accommodate the
retry-then-give-up window.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* fix(telemetry): silence posthog-node stderr leak on network failure

The PostHog SDK's internal logFlushError() writes 'Error while flushing
PostHog' directly to stderr via console.error on any network or HTTP
error, bypassing logger config. For a fire-and-forget telemetry path
this leaked stack traces to users' terminals whenever PostHog was
unreachable (offline, firewalled, DNS broken, rate-limited).

Pass a silentFetch wrapper to the PostHog client that masks failures as
fake 200 responses. The SDK never sees an error, so it never logs.
Original failure is still recorded at debug level for diagnostics.

Side benefit: shutdown is now fast on network failure (no retry loop),
so offline CLI commands no longer hang ~10s on exit.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* test(telemetry): make id-preservation test deterministic

Replace the fire-and-forget capture + setTimeout + POSTHOG_HOST-loopback
dance with a direct synchronous call to getOrCreateTelemetryId(). Export
the function with an @internal marker so tests can exercise the id path
without spinning up the PostHog client. No network, no timer, no flake.

Addresses CodeRabbit feedback on #1262.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Cole Medin 2026-04-16 13:45:55 -05:00 committed by GitHub
parent f1c5dcb231
commit d535c832e3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 463 additions and 3 deletions

View file

@ -187,3 +187,17 @@ MAX_CONCURRENT_CONVERSATIONS=10 # Maximum concurrent AI conversations (default:
# Session Retention
# SESSION_RETENTION_DAYS=30 # Delete inactive sessions older than N days (default: 30)
# Anonymous Telemetry (optional)
# Archon sends anonymous workflow-invocation events to PostHog so maintainers
# can see which workflows get real usage. No PII — workflow name/description +
# platform + Archon version + a random install UUID. No identities, no prompts,
# no paths, no code. See README "Telemetry" for the full list.
#
# Opt out (any one disables telemetry):
# ARCHON_TELEMETRY_DISABLED=1
# DO_NOT_TRACK=1 (de facto standard)
#
# Point at a self-hosted PostHog or a different project:
# POSTHOG_API_KEY=phc_yourKeyHere
# POSTHOG_HOST=https://eu.i.posthog.com (default: https://us.i.posthog.com)

View file

@ -315,6 +315,23 @@ Full documentation is available at **[archon.diy](https://archon.diy)**.
| [Architecture](https://archon.diy/reference/architecture/) | System design and internals |
| [Troubleshooting](https://archon.diy/reference/troubleshooting/) | Common issues and fixes |
## Telemetry
Archon sends a single anonymous event — `workflow_invoked` — each time a workflow starts, so maintainers can see which workflows get real usage and prioritize accordingly. **No PII, ever.**
**What's collected:** the workflow name, the workflow description (both authored by you in YAML; the description is truncated to 500 characters before sending), the platform that triggered it (`cli`, `web`, `slack`, etc.), the Archon version, and a random install UUID stored at `~/.archon/telemetry-id` (or `$ARCHON_HOME/telemetry-id` when `ARCHON_HOME` is set). Nothing else.
**What's *not* collected:** your code, prompts, messages, git remotes, file paths, usernames, tokens, AI output, workflow node details — none of it.
**Opt out:** set any of these in your environment (the value must be exactly `1`):
```bash
ARCHON_TELEMETRY_DISABLED=1
DO_NOT_TRACK=1 # de facto standard honored by Astro, Bun, Prisma, Nuxt, etc.
```
Self-host PostHog or use a different project by setting `POSTHOG_API_KEY` and `POSTHOG_HOST`.
## Contributing
Contributions welcome! See the open [issues](https://github.com/coleam00/Archon/issues) for things to work on.

View file

@ -118,6 +118,7 @@
"dotenv": "^17",
"pino": "^9",
"pino-pretty": "^13",
"posthog-node": "^5.29.2",
},
"peerDependencies": {
"typescript": "^5.0.0",
@ -620,6 +621,8 @@
"@pinojs/redact": ["@pinojs/redact@0.4.0", "", {}, "sha512-k2ENnmBugE/rzQfEcdWHcCY+/FM3VLzH9cYEsbdsoqrvzAKRhUZeRNhAZvB8OitQJ1TBed3yqWtdjzS6wJKBwg=="],
"@posthog/core": ["@posthog/core@1.25.2", "", {}, "sha512-h2FO7ut/BbfwpAXWpwdDHTzQgUo9ibDFEs6ZO+3cI3KPWQt5XwczK1OLAuPprcjm8T/jl0SH8jSFo5XdU4RbTg=="],
"@radix-ui/number": ["@radix-ui/number@1.1.1", "", {}, "sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g=="],
"@radix-ui/primitive": ["@radix-ui/primitive@1.1.3", "", {}, "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg=="],
@ -2010,6 +2013,8 @@
"postgres-interval": ["postgres-interval@1.2.0", "", { "dependencies": { "xtend": "^4.0.0" } }, "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ=="],
"posthog-node": ["posthog-node@5.29.2", "", { "dependencies": { "@posthog/core": "1.25.2" }, "peerDependencies": { "rxjs": "^7.0.0" }, "optionalPeers": ["rxjs"] }, "sha512-rI7kkF0XqDc0G1qjx+Hb4iuY9NAlL+XQNoGOpnEpRNTUcXvjY6WlsRGZ9m2whgc39emrrYdszi/YT8wZkr2xsg=="],
"powershell-utils": ["powershell-utils@0.1.0", "", {}, "sha512-dM0jVuXJPsDN6DvRpea484tCUaMiXWjuCn++HGTqUWzGDjv5tZkEZldAJ/UMlqRYGFrD/etByo4/xOuC/snX2A=="],
"prelude-ls": ["prelude-ls@1.2.1", "", {}, "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g=="],

View file

@ -80,6 +80,7 @@ import {
checkForUpdate,
BUNDLED_IS_BINARY,
BUNDLED_VERSION,
shutdownTelemetry,
} from '@archon/paths';
import * as git from '@archon/git';
@ -573,6 +574,9 @@ async function main(): Promise<number> {
}
return 1;
} finally {
// Flush queued telemetry events before the CLI process exits.
// Short-lived CLI commands lose buffered events if shutdown() is skipped.
await shutdownTelemetry();
// Always close database connection
await closeDb();
}

View file

@ -16,7 +16,8 @@
"dependencies": {
"dotenv": "^17",
"pino": "^9",
"pino-pretty": "^13"
"pino-pretty": "^13",
"posthog-node": "^5.29.2"
},
"peerDependencies": {
"typescript": "^5.0.0"

View file

@ -43,3 +43,7 @@ export {
parseLatestRelease,
} from './update-check';
export type { UpdateCheckResult } from './update-check';
// Anonymous telemetry
export { captureWorkflowInvoked, shutdownTelemetry, isTelemetryDisabled } from './telemetry';
export type { WorkflowInvokedProperties } from './telemetry';

View file

@ -0,0 +1,151 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { tmpdir } from 'os';
import { join } from 'path';
import { existsSync, mkdtempSync, readFileSync, rmSync } from 'fs';
import {
isTelemetryDisabled,
captureWorkflowInvoked,
shutdownTelemetry,
resetTelemetryForTests,
getOrCreateTelemetryId,
} from './telemetry';
// Environment variables these tests mutate; each one is snapshotted before a
// test and put back afterwards so tests cannot leak state into each other.
const ENV_VARS = [
  'ARCHON_HOME',
  'ARCHON_TELEMETRY_DISABLED',
  'DO_NOT_TRACK',
  'POSTHOG_API_KEY',
  'POSTHOG_HOST',
];

/**
 * Snapshot the current value of every tracked variable.
 * Unset variables are recorded with an `undefined` value.
 */
function saveEnv(): Record<string, string | undefined> {
  return Object.fromEntries(
    ENV_VARS.map((name): [string, string | undefined] => [name, process.env[name]])
  );
}

/**
 * Put every tracked variable back to the state recorded in `saved`.
 * Variables that were unset at snapshot time are deleted again.
 */
function restoreEnv(saved: Record<string, string | undefined>): void {
  ENV_VARS.forEach((name) => {
    const previous = saved[name];
    if (previous !== undefined) {
      process.env[name] = previous;
      return;
    }
    delete process.env[name];
  });
}
// Verifies how isTelemetryDisabled() reacts to each opt-out environment variable.
describe('telemetry opt-out detection', () => {
  let envSnapshot: Record<string, string | undefined>;

  /** Remove the named variables so ambient shell settings cannot affect the assertion. */
  const unset = (...names: string[]): void => {
    for (const name of names) delete process.env[name];
  };

  beforeEach(() => {
    envSnapshot = saveEnv();
    resetTelemetryForTests();
  });

  afterEach(() => {
    restoreEnv(envSnapshot);
    resetTelemetryForTests();
  });

  test('enabled by default when no opt-out env vars set', () => {
    unset('ARCHON_TELEMETRY_DISABLED', 'DO_NOT_TRACK', 'POSTHOG_API_KEY');
    expect(isTelemetryDisabled()).toBe(false);
  });

  test('ARCHON_TELEMETRY_DISABLED=1 disables telemetry', () => {
    process.env.ARCHON_TELEMETRY_DISABLED = '1';
    expect(isTelemetryDisabled()).toBe(true);
  });

  test('DO_NOT_TRACK=1 disables telemetry', () => {
    process.env.DO_NOT_TRACK = '1';
    expect(isTelemetryDisabled()).toBe(true);
  });

  test('ARCHON_TELEMETRY_DISABLED=0 does not disable (strict "1" match)', () => {
    process.env.ARCHON_TELEMETRY_DISABLED = '0';
    unset('DO_NOT_TRACK');
    expect(isTelemetryDisabled()).toBe(false);
  });

  test('empty POSTHOG_API_KEY override disables telemetry', () => {
    // An empty string is a deliberate override, not "unset".
    process.env.POSTHOG_API_KEY = '';
    unset('ARCHON_TELEMETRY_DISABLED', 'DO_NOT_TRACK');
    expect(isTelemetryDisabled()).toBe(true);
  });
});
// With telemetry opted out, the public entry points must be harmless no-ops.
describe('captureWorkflowInvoked when disabled', () => {
  let envSnapshot: Record<string, string | undefined>;

  beforeEach(() => {
    envSnapshot = saveEnv();
    resetTelemetryForTests();
    process.env.ARCHON_TELEMETRY_DISABLED = '1';
  });

  afterEach(() => {
    restoreEnv(envSnapshot);
    resetTelemetryForTests();
  });

  test('does not throw when telemetry is disabled', () => {
    const invoke = (): void => {
      captureWorkflowInvoked({
        workflowName: 'test-workflow',
        workflowDescription: 'A test',
        platform: 'cli',
        archonVersion: 'dev',
      });
    };
    expect(invoke).not.toThrow();
  });

  test('shutdownTelemetry is a no-op when never initialized', async () => {
    const shutdown = shutdownTelemetry();
    await expect(shutdown).resolves.toBeUndefined();
  });
});
// Exercises the on-disk install id inside a throwaway ARCHON_HOME directory.
describe('telemetry ID persistence', () => {
  let envSnapshot: Record<string, string | undefined>;
  let tmpHome: string;

  /** Location of the id file inside the temporary home for the current test. */
  const idFile = (): string => join(tmpHome, 'telemetry-id');

  beforeEach(() => {
    envSnapshot = saveEnv();
    tmpHome = mkdtempSync(join(tmpdir(), 'archon-telemetry-test-'));
    process.env.ARCHON_HOME = tmpHome;
    // Keep the capture path switched off; only the id handling is under test.
    process.env.ARCHON_TELEMETRY_DISABLED = '1';
    resetTelemetryForTests();
  });

  afterEach(() => {
    restoreEnv(envSnapshot);
    resetTelemetryForTests();
    rmSync(tmpHome, { recursive: true, force: true });
  });

  test('calling capture while disabled does not create a telemetry-id file', () => {
    captureWorkflowInvoked({ workflowName: 'w' });
    expect(existsSync(idFile())).toBe(false);
  });

  test('an existing telemetry-id file is preserved (not overwritten)', async () => {
    const fs = await import('fs');
    const existingId = '11111111-1111-4111-8111-111111111111';
    fs.mkdirSync(tmpHome, { recursive: true });
    fs.writeFileSync(idFile(), existingId, 'utf8');
    resetTelemetryForTests();

    // Call the resolver directly and synchronously: no client, no network, no timers.
    expect(getOrCreateTelemetryId()).toBe(existingId);

    // The file on disk must still hold the seeded value.
    const onDisk = readFileSync(idFile(), 'utf8').trim();
    expect(onDisk).toBe(existingId);
  });
});

View file

@ -0,0 +1,246 @@
/**
* Anonymous PostHog telemetry for Archon.
*
* Emits one event `workflow_invoked` each time a workflow starts. No PII,
* no user identity. A random UUID is persisted to `${ARCHON_HOME}/telemetry-id`
* so we can count distinct installs; `$process_person_profile: false` keeps
* events in PostHog's anonymous tier (no person profile ever created).
*
* Opt-out (any one disables telemetry):
* - ARCHON_TELEMETRY_DISABLED=1
* - DO_NOT_TRACK=1 (de facto standard)
* - POSTHOG_API_KEY set to an empty string (an unset variable falls back to the embedded key)
*
* All functions are fire-and-forget: telemetry errors are logged at debug level
* and swallowed. Capture must never crash Archon.
*/
import { randomUUID } from 'crypto';
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
import { join } from 'path';
import type { PostHog } from 'posthog-node';
import { getArchonHome } from './archon-paths';
import { createLogger } from './logger';
// Minimal shape of posthog-node's `fetch` option — copied from @posthog/core
// (a transitive dep) to avoid pulling it in as a direct dependency.
/** Options argument accepted by the custom fetch implementation (`silentFetch`) that is passed to the PostHog client. */
interface PostHogFetchOptions {
method: 'GET' | 'POST' | 'PUT' | 'PATCH';
mode?: 'no-cors';
credentials?: 'omit';
headers: Record<string, string>;
body?: string | Blob;
signal?: AbortSignal;
}
/**
 * Response shape returned by the custom fetch implementation. `silentFetch`
 * returns either the real fetch response or a canned object of this shape.
 */
interface PostHogFetchResponse {
status: number;
text: () => Promise<string>;
json: () => Promise<unknown>;
headers?: { get(name: string): string | null };
}
/**
* Embedded write-only PostHog project key. Safe to ship in source: `phc_*`
* keys can only write events, never read data. Override with POSTHOG_API_KEY
* for self-hosted PostHog or a different project.
*/
const EMBEDDED_POSTHOG_API_KEY = 'phc_rR7oacut9mm4upGRbuoMptnyjRium34TTbbqobiQYS7x';
/** PostHog host used when the POSTHOG_HOST environment variable is not set. */
const DEFAULT_POSTHOG_HOST = 'https://us.i.posthog.com';
/** Max length of workflow description sent to PostHog. Guards against unusually long YAML descriptions. */
const DESCRIPTION_MAX_LENGTH = 500;
/** Logger instance for this module; created on first use by `getLog`. */
let cachedLog: ReturnType<typeof createLogger> | undefined;

/** Return the module's `telemetry` logger, constructing it the first time it is requested. */
function getLog(): ReturnType<typeof createLogger> {
  const existing = cachedLog;
  if (existing) return existing;

  const created = createLogger('telemetry');
  cachedLog = created;
  return created;
}
/**
 * Resolve the PostHog project key. POSTHOG_API_KEY wins whenever it is defined,
 * including the empty string; otherwise the embedded key is used.
 */
function getApiKey(): string {
  const override = process.env.POSTHOG_API_KEY;
  return override !== undefined ? override : EMBEDDED_POSTHOG_API_KEY;
}
/** Resolve the PostHog host: POSTHOG_HOST when defined, otherwise the default host. */
function getHost(): string {
  const override = process.env.POSTHOG_HOST;
  return override !== undefined ? override : DEFAULT_POSTHOG_HOST;
}
/**
 * Report whether telemetry is switched off.
 *
 * Telemetry is disabled when ARCHON_TELEMETRY_DISABLED or DO_NOT_TRACK is
 * exactly `'1'`, or when the resolved API key is empty. Exported for tests and
 * for callers that want to bail out before doing any work.
 *
 * @returns `true` when no events should be sent.
 */
export function isTelemetryDisabled(): boolean {
  const { ARCHON_TELEMETRY_DISABLED, DO_NOT_TRACK } = process.env;
  const optedOut = ARCHON_TELEMETRY_DISABLED === '1' || DO_NOT_TRACK === '1';
  // The key is only consulted when neither opt-out variable is set.
  return optedOut || !getApiKey();
}
/**
 * Resolve the anonymous install id stored at `${ARCHON_HOME}/telemetry-id`,
 * generating and persisting a new random UUID when no usable one exists.
 *
 * Read and write failures (permissions, full disk, ...) are logged at debug
 * level and otherwise ignored: a fresh UUID is still returned, so telemetry
 * keeps working — it just isn't correlated across runs.
 *
 * Exported so tests can check the id-resolution rules directly, without
 * constructing a PostHog client.
 *
 * @returns The trimmed contents of the id file when non-empty, otherwise a new UUID.
 * @internal
 */
export function getOrCreateTelemetryId(): string {
  const idPath = join(getArchonHome(), 'telemetry-id');

  // Step 1: reuse a previously persisted id, ignoring blank files.
  try {
    const stored = existsSync(idPath) ? readFileSync(idPath, 'utf8').trim() : '';
    if (stored !== '') return stored;
  } catch (error) {
    getLog().debug({ err: error as Error, idPath }, 'telemetry.id_read_failed');
  }

  // Step 2: mint a new id and make a best-effort attempt to persist it.
  const freshId = randomUUID();
  try {
    mkdirSync(getArchonHome(), { recursive: true });
    writeFileSync(idPath, freshId, 'utf8');
  } catch (error) {
    getLog().debug({ err: error as Error, idPath }, 'telemetry.id_persist_failed');
  }
  return freshId;
}
/** In-process cache of the install id so the id file is consulted only until an id is resolved. */
let telemetryIdCache: string | undefined;

/** Return the cached install id, resolving it via `getOrCreateTelemetryId` when none is cached. */
function getTelemetryId(): string {
  const cached = telemetryIdCache;
  if (cached) return cached;

  const resolved = getOrCreateTelemetryId();
  telemetryIdCache = resolved;
  return resolved;
}
/**
 * Lazily created client-initialization promise.
 *
 * `undefined` means initialization has not been started (or was cleared by
 * `shutdownTelemetry` / `resetTelemetryForTests`). Otherwise it is a promise
 * that resolves to `null` (telemetry disabled or init failed) or to a live
 * `PostHog` client.
 */
let clientInit: Promise<PostHog | null> | undefined;

/** Return the shared client, kicking off initialization if no init promise exists yet. */
async function getClient(): Promise<PostHog | null> {
  clientInit ??= initClient();
  return clientInit;
}
/**
 * Canned "success" response handed to the PostHog SDK in place of any failed
 * request, so the SDK never observes an error.
 */
const FAKE_OK_RESPONSE: PostHogFetchResponse = {
  status: 200,
  text: async () => '{"status":"ok"}',
  json: async () => ({ status: 'ok' }),
  headers: { get: () => null },
};

/**
 * Fetch wrapper that masks all failures as 200 responses.
 *
 * The PostHog SDK's internal `logFlushError` writes to stderr via
 * `console.error` on any network or HTTP error, bypassing logger configuration
 * (see `@posthog/core` `posthog-core-stateless.mjs` `logFlushError`). Telemetry
 * is fire-and-forget, so an unreachable PostHog (offline, firewalled, DNS
 * broken, rate-limited) must produce no user-visible noise. Failures are
 * therefore intercepted here, recorded at debug level, and replaced with
 * `FAKE_OK_RESPONSE`.
 *
 * @param url - Request URL supplied by the SDK.
 * @param options - Request options supplied by the SDK, forwarded to `fetch`.
 * @returns The real response for statuses in [200, 400), otherwise the canned response.
 */
async function silentFetch(
  url: string,
  options: PostHogFetchOptions
): Promise<PostHogFetchResponse> {
  try {
    const response = await fetch(url, options as RequestInit);
    const { status } = response;
    if (status >= 200 && status < 400) return response;

    getLog().debug({ status }, 'telemetry.http_non_2xx_suppressed');
    return FAKE_OK_RESPONSE;
  } catch (error) {
    // Network-level failure (or anything thrown above): hide it from the SDK.
    getLog().debug({ err: error as Error }, 'telemetry.fetch_failed_suppressed');
    return FAKE_OK_RESPONSE;
  }
}
/**
 * Build the PostHog client, or resolve to `null` when telemetry is disabled
 * or construction fails. `posthog-node` is imported dynamically inside this
 * function, after the opt-out check.
 */
async function initClient(): Promise<PostHog | null> {
  if (isTelemetryDisabled()) return null;

  try {
    const { PostHog: PostHogClient } = await import('posthog-node');
    const apiKey = getApiKey();
    const client = new PostHogClient(apiKey, {
      host: getHost(),
      flushAt: 20,
      flushInterval: 10000,
      disableGeoip: true,
      // Route all requests through the failure-masking wrapper.
      fetch: silentFetch,
    });

    // Belt and braces: if a future posthog-node release reports errors on the
    // client's event channel instead of (or as well as) console.error, keep
    // those at debug level too.
    const logClientError = (err: Error): void => {
      getLog().debug({ err }, 'telemetry.client_error');
    };
    client.on('error', logClientError);

    return client;
  } catch (error) {
    getLog().debug({ err: error as Error }, 'telemetry.init_failed');
    return null;
  }
}
/** Input accepted by `captureWorkflowInvoked`; mapped onto the `workflow_invoked` event properties. */
export interface WorkflowInvokedProperties {
/** Sent as `workflow_name`. */
workflowName: string;
/** Sent as `workflow_description` after `slice(0, DESCRIPTION_MAX_LENGTH)`; omitted when empty or undefined. */
workflowDescription?: string;
/** Sent as `platform`; omitted when empty or undefined. */
platform?: string;
/** Sent as `archon_version`; omitted when empty or undefined. */
archonVersion?: string;
}
/**
 * Fire-and-forget capture of a `workflow_invoked` event.
 *
 * Returns immediately without awaiting anything, and any failure is logged at
 * debug level instead of being thrown — safe to call from hot paths.
 *
 * @param props - Workflow metadata to report; optional fields are omitted from
 *   the event when empty.
 */
export function captureWorkflowInvoked(props: WorkflowInvokedProperties): void {
  if (isTelemetryDisabled()) return;

  const send = async (): Promise<void> => {
    const client = await getClient();
    if (client === null) return;

    const description = props.workflowDescription?.slice(0, DESCRIPTION_MAX_LENGTH);

    // `$process_person_profile: false` keeps the event anonymous in PostHog.
    const properties: Record<string, unknown> = {
      $process_person_profile: false,
      workflow_name: props.workflowName,
    };
    if (description) properties.workflow_description = description;
    if (props.platform) properties.platform = props.platform;
    if (props.archonVersion) properties.archon_version = props.archonVersion;

    client.capture({
      distinctId: getTelemetryId(),
      event: 'workflow_invoked',
      properties,
    });
  };

  void send().catch((error: unknown) => {
    getLog().debug({ err: error as Error }, 'telemetry.capture_failed');
  });
}
/**
 * Flush queued events and close the PostHog client.
 *
 * Call this on process exit (server SIGTERM, end of a CLI command) so buffered
 * events are not lost. It returns immediately when no client initialization
 * was ever started. Errors are logged at debug level and swallowed, and the
 * init promise is always cleared afterwards.
 */
export async function shutdownTelemetry(): Promise<void> {
  const pending = clientInit;
  if (pending === undefined) return;

  try {
    const client = await pending;
    if (client !== null) {
      await client.shutdown();
    }
  } catch (error) {
    getLog().debug({ err: error as Error }, 'telemetry.shutdown_failed');
  } finally {
    // Clear the init promise whether or not shutdown succeeded.
    clientInit = undefined;
  }
}
/**
 * Test-only helper that clears this module's cached state. Not part of the
 * public API.
 * @internal
 */
export function resetTelemetryForTests(): void {
  // Forget the cached install id so the next lookup goes back to disk.
  telemetryIdCache = undefined;
  // Drop the init promise so the next client request re-runs initialization.
  clientInit = undefined;
}

View file

@ -79,7 +79,12 @@ import {
getPort,
} from '@archon/core';
import type { IPlatformAdapter } from '@archon/core';
import { createLogger, logArchonPaths, validateAppDefaultsPaths } from '@archon/paths';
import {
createLogger,
logArchonPaths,
validateAppDefaultsPaths,
shutdownTelemetry,
} from '@archon/paths';
/** Lazy-initialized logger (deferred so test mocks can intercept createLogger) */
let cachedLog: ReturnType<typeof createLogger> | undefined;
@ -640,6 +645,9 @@ export async function startServer(opts: ServerOptions = {}): Promise<void> {
getLog().error({ err: error }, 'adapter_stop_error');
}
// Flush queued telemetry events before pool closes the process.
await shutdownTelemetry();
return pool.end();
})
.then(() => {

View file

@ -6,7 +6,7 @@ import { join } from 'path';
import type { IWorkflowPlatform, WorkflowMessageMetadata } from './deps';
import type { WorkflowDeps, WorkflowConfig } from './deps';
import * as archonPaths from '@archon/paths';
import { createLogger } from '@archon/paths';
import { createLogger, captureWorkflowInvoked, BUNDLED_VERSION } from '@archon/paths';
import { getDefaultBranch, toRepoPath } from '@archon/git';
import type { WorkflowDefinition, WorkflowRun, WorkflowExecutionResult } from './schemas';
import { executeDagWorkflow } from './dag-executor';
@ -621,6 +621,16 @@ export async function executeWorkflow(
workflowName: workflow.name,
conversationId: conversationDbId,
});
// Fire-and-forget anonymous usage telemetry. No PII: only workflow name +
// description (authored by the user in their YAML) + platform + version.
// Opt out via ARCHON_TELEMETRY_DISABLED=1 or DO_NOT_TRACK=1.
captureWorkflowInvoked({
workflowName: workflow.name,
workflowDescription: workflow.description,
platform: platform.getPlatformType(),
archonVersion: BUNDLED_VERSION,
});
deps.store
.createWorkflowEvent({
workflow_run_id: workflowRun.id,