From dad92067ff3d15818dd43671609bcda54c38f913 Mon Sep 17 00:00:00 2001 From: Kamil Kisiela Date: Tue, 5 Dec 2023 10:13:49 +0100 Subject: [PATCH] Dynamic sampling in GraphQL Hive client (#3331) --- .changeset/old-pigs-dress.md | 5 + .changeset/twenty-adults-cough.md | 5 + packages/libraries/client/README.md | 83 ++++++++++++ packages/libraries/client/src/index.ts | 1 + .../libraries/client/src/internal/sampling.ts | 20 +++ .../libraries/client/src/internal/types.ts | 17 +++ .../libraries/client/src/internal/usage.ts | 17 ++- packages/libraries/client/src/samplers.ts | 28 ++++ packages/libraries/client/tests/usage.spec.ts | 123 ++++++++++++++++++ .../src/pages/docs/api-reference/client.mdx | 82 +++++++++++- 10 files changed, 372 insertions(+), 9 deletions(-) create mode 100644 .changeset/old-pigs-dress.md create mode 100644 .changeset/twenty-adults-cough.md create mode 100644 packages/libraries/client/src/samplers.ts diff --git a/.changeset/old-pigs-dress.md b/.changeset/old-pigs-dress.md new file mode 100644 index 000000000..0b28a21e3 --- /dev/null +++ b/.changeset/old-pigs-dress.md @@ -0,0 +1,5 @@ +--- +'@graphql-hive/client': minor +--- + +Add atLeastOnceSampler diff --git a/.changeset/twenty-adults-cough.md b/.changeset/twenty-adults-cough.md new file mode 100644 index 000000000..c8b8ae03f --- /dev/null +++ b/.changeset/twenty-adults-cough.md @@ -0,0 +1,5 @@ +--- +'@graphql-hive/client': minor +--- + +Introduce sampler for dynamic sampling diff --git a/packages/libraries/client/README.md b/packages/libraries/client/README.md index 993c2571b..a07ec96cf 100644 --- a/packages/libraries/client/README.md +++ b/packages/libraries/client/README.md @@ -319,6 +319,89 @@ const server = new ApolloServer({ }) ``` +### Sampling + +#### Basic sampling + +With `sampleRate` option, you're able to control the sampling rate of the usage reporting. Setting +it to `0.5` will result in 50% of the operations being sent to Hive. 
There is no guarantee that +every operation will be reported at least once (see `atLeastOnceSampler`). + +Default: `1` (100%) + +```typescript +useHive({ + /* ... other options ... */ + usage: { + sampleRate: 0.6 // 60% of the operations will be sent to Hive + } +}) +``` + +#### Dynamic sampling + +GraphQL Hive client accepts a function that returns a number between 0 and 1. This allows you to +implement dynamic sampling based on the operation's context. + +If `sampler` is defined, `sampleRate` is ignored. + +The function returns a sample rate between 0 and 1 (or a boolean): + +- `0.0` = 0% chance of being sent +- `1.0` = 100% chance of being sent +- `true` = 100% +- `false` = 0% + +```typescript +useHive({ + /* ... other options ... */ + usage: { + sampler(samplingContext) { + if (samplingContext.operationName === 'GetUser') { + return 0.5; // 50% of GetUser operations will be sent to Hive + } + + return 0.7; // 70% of the other operations will be sent to Hive + } + } +}) +``` + +#### At-least-once sampling + +If you want to make sure that every operation is reported at least once, you can use the +`atLeastOnceSampler`. Every operation is reported at least once, but every subsequent occurrence is +decided by the sampler. + +```typescript +import { useHive, atLeastOnceSampler } from '@graphql-hive/client'; + +useHive({ + /* ... other options ... */ + usage: { + sampler: atLeastOnceSampler({ + // Produces a unique key for a given GraphQL request. + // This key is used to determine the uniqueness of a GraphQL operation. + keyFn(samplingContext) { + // Operation name is a good candidate for a key, but not perfect, + // as not all operations have names + // and some operations may have the same name but different body. 
+ return samplingContext.operationName; + }, + sampler(_samplingContext) { + const hour = new Date().getHours(); + + if (hour >= 9 && hour <= 17) { + return 0.3; + } + + return 0.8; + } + }) + } +}) +``` + ## Self-Hosting To align the client with your own instance of GraphQL Hive, you should use `selfHosting` options in diff --git a/packages/libraries/client/src/index.ts b/packages/libraries/client/src/index.ts index 326c3ffdf..21246416c 100644 --- a/packages/libraries/client/src/index.ts +++ b/packages/libraries/client/src/index.ts @@ -4,3 +4,4 @@ export { useHive as useYogaHive } from './yoga.js'; export { hiveApollo, createSupergraphSDLFetcher, createSupergraphManager } from './apollo.js'; export { createSchemaFetcher, createServicesFetcher } from './gateways.js'; export { createHive } from './client.js'; +export { atLeastOnceSampler } from './samplers.js'; diff --git a/packages/libraries/client/src/internal/sampling.ts b/packages/libraries/client/src/internal/sampling.ts index 056af39cd..5edb210d4 100644 --- a/packages/libraries/client/src/internal/sampling.ts +++ b/packages/libraries/client/src/internal/sampling.ts @@ -1,3 +1,5 @@ +import type { SamplingContext } from './types.js'; + export function randomSampling(sampleRate: number) { if (sampleRate > 1 || sampleRate < 0) { throw new Error(`Expected usage.sampleRate to be 0 <= x <= 1, received ${sampleRate}`); @@ -7,3 +9,21 @@ export function randomSampling(sampleRate: number) { return Math.random() <= sampleRate; }; } + +export function dynamicSampling(sampler: (context: SamplingContext) => number | boolean) { + return function shouldInclude(context: SamplingContext): boolean { + let sampleRate = sampler(context); + + if (sampleRate === true) { + sampleRate = 1; + } else if (sampleRate === false) { + sampleRate = 0; + } + + if (sampleRate > 1 || sampleRate < 0) { + throw new Error(`Expected usage.sampleRate to be 0 <= x <= 1, received ${sampleRate}`); + } + + return Math.random() <= sampleRate; + }; +} diff 
--git a/packages/libraries/client/src/internal/types.ts b/packages/libraries/client/src/internal/types.ts index d509a950b..448339830 100644 --- a/packages/libraries/client/src/internal/types.ts +++ b/packages/libraries/client/src/internal/types.ts @@ -78,6 +78,18 @@ export interface HiveUsagePluginOptions { * Default: 1.0 */ sampleRate?: number; + /** + * Compute sample rate dynamically. + * + * If `sampler` is defined, `sampleRate` is ignored. + * + * @returns A sample rate between 0 and 1. + * 0.0 = 0% chance of being sent + * 1.0 = 100% chance of being sent. + * true = 100% + * false = 0% + */ + sampler?: (context: SamplingContext) => number | boolean; /** * (Experimental) Enables collecting Input fields usage based on the variables passed to the operation. * @@ -86,6 +98,11 @@ export interface HiveUsagePluginOptions { processVariables?: boolean; } +export interface SamplingContext + extends Pick<ExecutionArgs, 'document' | 'contextValue' | 'variableValues'> { + operationName: string; +} + export interface HiveReportingPluginOptions { /** * Custom endpoint to collect schema reports diff --git a/packages/libraries/client/src/internal/usage.ts b/packages/libraries/client/src/internal/usage.ts index 544e22d9b..502e960b4 100644 --- a/packages/libraries/client/src/internal/usage.ts +++ b/packages/libraries/client/src/internal/usage.ts @@ -25,7 +25,7 @@ import LRU from 'tiny-lru'; import { normalizeOperation } from '@graphql-hive/core'; import { version } from '../version.js'; import { createAgent } from './agent.js'; -import { randomSampling } from './sampling.js'; +import { dynamicSampling, randomSampling } from './sampling.js'; import type { AbortAction, ClientInfo, @@ -145,7 +145,10 @@ export function createUsage(pluginOptions: HivePluginOptions): UsageCollector { logger.error, ); - const shouldInclude = randomSampling(options.sampleRate ?? 1.0); + const shouldInclude = + options.sampler && typeof options.sampler === 'function' + ? dynamicSampling(options.sampler) + : randomSampling(options.sampleRate ?? 
1.0); return { dispose: agent.dispose, @@ -175,7 +178,15 @@ export function createUsage(pluginOptions: HivePluginOptions): UsageCollector { providedOperationName = args.operationName || rootOperation.name?.value; const operationName = providedOperationName || 'anonymous'; - if (!excludeSet.has(operationName) && shouldInclude()) { + if ( + !excludeSet.has(operationName) && + shouldInclude({ + operationName, + document, + variableValues: args.variableValues, + contextValue: args.contextValue, + }) + ) { const errors = result.errors?.map(error => ({ message: error.message, diff --git a/packages/libraries/client/src/samplers.ts b/packages/libraries/client/src/samplers.ts new file mode 100644 index 000000000..06d540587 --- /dev/null +++ b/packages/libraries/client/src/samplers.ts @@ -0,0 +1,28 @@ +import { dynamicSampling } from './internal/sampling.js'; +import type { SamplingContext } from './internal/types.js'; + +/** + * Every operation is reported at least once, but every next occurrence is decided by the sampler. + */ +export function atLeastOnceSampler(config: { + /** + * Produces a unique key for a given GraphQL request. + * This key is used to determine the uniqueness of a GraphQL operation. 
+ */ + keyFn(context: SamplingContext): string; + sampler(context: SamplingContext): number | boolean; +}) { + const sampler = dynamicSampling(config.sampler); + const reportedKeys = new Set<string>(); + + return function shouldInclude(context: SamplingContext): boolean { + const key = config.keyFn(context); + + if (!reportedKeys.has(key)) { + reportedKeys.add(key); + return true; + } + + return sampler(context); + }; +} diff --git a/packages/libraries/client/tests/usage.spec.ts b/packages/libraries/client/tests/usage.spec.ts index 0bfbcb8f1..24abeee6e 100644 --- a/packages/libraries/client/tests/usage.spec.ts +++ b/packages/libraries/client/tests/usage.spec.ts @@ -3,6 +3,7 @@ import { buildSchema, parse } from 'graphql'; import nock from 'nock'; import { createHive } from '../src/client'; import type { Report } from '../src/internal/usage'; +import { atLeastOnceSampler } from '../src/samplers'; import { version } from '../src/version'; import { waitFor } from './test-utils'; @@ -86,6 +87,21 @@ const op = parse(/* GraphQL */ ` } `); +const op2 = parse(/* GraphQL */ ` + query getProject($selector: ProjectSelectorInput!) 
{ + project(selector: $selector) { + ...ProjectFields + } + } + + fragment ProjectFields on Project { + id + cleanId + name + type + } +`); + beforeEach(() => { vi.restoreAllMocks(); }); @@ -433,3 +449,110 @@ test('sendImmediately should not stop the schedule', async () => { await waitFor(1000); http.done(); }); + +test('should send data to Hive at least once when using atLeastOnceSampler', async () => { + const logger = { + error: vi.fn(), + info: vi.fn(), + }; + + const token = 'Token'; + + let report: Report = { + size: 0, + map: {}, + operations: [], + }; + const http = nock('http://localhost') + .post('/200') + .matchHeader('Authorization', `Bearer ${token}`) + .matchHeader('Content-Type', headers['Content-Type']) + .matchHeader('graphql-client-name', headers['graphql-client-name']) + .matchHeader('graphql-client-version', headers['graphql-client-version']) + .once() + .reply((_, _body) => { + report = _body as any; + return [200]; + }); + + const hive = createHive({ + enabled: true, + debug: true, + agent: { + timeout: 500, + maxRetries: 0, + logger, + }, + token, + selfHosting: { + graphqlEndpoint: 'http://localhost/graphql', + applicationUrl: 'http://localhost/', + usageEndpoint: 'http://localhost/200', + }, + usage: { + sampler: atLeastOnceSampler({ + keyFn(ctx) { + return ctx.operationName; + }, + sampler() { + // only + return 0; + }, + }), + }, + }); + + const collect = hive.collectUsage(); + + await waitFor(2000); + collect( + { + schema, + document: op, + operationName: 'deleteProject', + }, + {}, + ); + // different query + collect( + { + schema, + document: op2, + operationName: 'getProject', + }, + {}, + ); + // duplicated call + collect( + { + schema, + document: op, + operationName: 'deleteProject', + }, + {}, + ); + await hive.dispose(); + await waitFor(1000); + http.done(); + + expect(logger.error).not.toHaveBeenCalled(); + expect(logger.info).toHaveBeenCalledWith(`[hive][usage] Sending (queue 2) (attempt 1)`); + 
expect(logger.info).toHaveBeenCalledWith(`[hive][usage] Sent!`); + + // Map + expect(report.size).toEqual(2); + expect(Object.keys(report.map)).toHaveLength(2); + + const foundRecords: string[] = []; + for (const key in report.map) { + const record = report.map[key]; + + foundRecords.push(record.operationName ?? 'anonymous'); + } + + expect(foundRecords).toContainEqual('deleteProject'); + expect(foundRecords).toContainEqual('getProject'); + + const operations = report.operations; + expect(operations).toHaveLength(2); // two operations +}); diff --git a/packages/web/docs/src/pages/docs/api-reference/client.mdx b/packages/web/docs/src/pages/docs/api-reference/client.mdx index e5b894bab..651e5757b 100644 --- a/packages/web/docs/src/pages/docs/api-reference/client.mdx +++ b/packages/web/docs/src/pages/docs/api-reference/client.mdx @@ -68,15 +68,85 @@ const config: HivePluginOptions = { #### Sampling -You can pass a custom `sampleRate` array to the `HivePluginOptions` to sample a percentage of the -total operations reported. By default, Hive agent reports 100% of the operations (`1.0`). +##### Basic sampling -```ts -const config: HivePluginOptions = { +With `sampleRate` option, you're able to control the sampling rate of the usage reporting. Setting +it to `0.5` will result in 50% of the operations being sent to Hive. There is no guarantee that +every operation will be reported at least once (see `atLeastOnceSampler`). + +Default: `1` (100%) + +```typescript +useHive({ + /* ... other options ... */ usage: { - sampleRate: 0.1 + sampleRate: 0.6 // 60% of the operations will be sent to Hive } -} +}) +``` + +##### Dynamic sampling + +GraphQL Hive client accepts a function that returns a number between 0 and 1. This allows you to +implement dynamic sampling based on the operation's context. + +If `sampler` is defined, `sampleRate` is ignored. + +The function returns a sample rate between 0 and 1 (or a boolean): + +- `0.0` = 0% chance of being sent +- `1.0` = 100% chance of being sent 
+- `true` = 100% +- `false` = 0% + +```typescript +useHive({ + /* ... other options ... */ + usage: { + sampler(samplingContext) { + if (samplingContext.operationName === 'GetUser') { + return 0.5; // 50% of GetUser operations will be sent to Hive + } + + return 0.7; // 70% of the other operations will be sent to Hive + } + } +}) +``` + +##### At-least-once sampling + +If you want to make sure that every operation is reported at least once, you can use the +`atLeastOnceSampler`. Every operation is reported at least once, but every subsequent occurrence is +decided by the sampler. + +```typescript +import { useHive, atLeastOnceSampler } from '@graphql-hive/client'; + +useHive({ + /* ... other options ... */ + usage: { + sampler: atLeastOnceSampler({ + // Produces a unique key for a given GraphQL request. + // This key is used to determine the uniqueness of a GraphQL operation. + keyFn(samplingContext) { + // Operation name is a good candidate for a key, but not perfect, + // as not all operations have names + // and some operations may have the same name but different body. + return samplingContext.operationName; + }, + sampler(_samplingContext) { + const hour = new Date().getHours(); + + if (hour >= 9 && hour <= 17) { + return 0.3; + } + + return 0.8; + } + }) + } +}) ``` #### Custom Integration