Dynamic sampling in GraphQL Hive client (#3331)

This commit is contained in:
Kamil Kisiela 2023-12-05 10:13:49 +01:00 committed by GitHub
parent c03a58c05e
commit dad92067ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 372 additions and 9 deletions

View file

@ -0,0 +1,5 @@
---
'@graphql-hive/client': minor
---
Add atLeastOnceSampler

View file

@ -0,0 +1,5 @@
---
'@graphql-hive/client': minor
---
Introduce sampler for dynamic sampling

View file

@ -319,6 +319,89 @@ const server = new ApolloServer({
})
```
### Sampling
#### Basic sampling
With the `sampleRate` option, you can control the sampling rate of usage reporting. Setting it to
`0.5` results in 50% of the operations being sent to Hive. There is no guarantee that every
operation will be reported at least once (see `atLeastOnceSampler`).
Default: `1` (100%)
```typescript
useHive({
/* ... other options ... */,
usage: {
sampleRate: 0.6 // 60% of the operations will be sent to Hive
}
})
```
#### Dynamic sampling
The GraphQL Hive client accepts a `sampler` function that returns a number between 0 and 1 or a
boolean. This allows you to implement dynamic sampling based on the operation's context.
If `sampler` is defined, `sampleRate` is ignored.
The returned value is interpreted as a sample rate:
- `0.0` = 0% chance of being sent
- `1.0` = 100% chance of being sent.
- `true` = 100%
- `false` = 0%
```typescript
useHive({
/* ... other options ... */,
usage: {
sampler(samplingContext) {
if (samplingContext.operationName === 'GetUser') {
return 0.5 // 50% of GetUser operations will be sent to Hive
}
return 0.7; // 70% of the other operations will be sent to Hive
}
}
})
```
#### At-least-once sampling
If you want to make sure that every operation is reported at least once, you can use the
`atLeastOnceSampler`. The first occurrence of each operation (as identified by `keyFn`) is always
reported; every subsequent occurrence is decided by the sampler.
```typescript
import { useHive, atLeastOnceSampler} from '@graphql-hive/client';
useHive({
/* ... other options ... */,
usage: {
sampler: atLeastOnceSampler({
// Produces a unique key for a given GraphQL request.
// This key is used to determine the uniqueness of a GraphQL operation.
keyFn(samplingContext) {
// Operation name is a good candidate for a key, but not perfect,
// as not all operations have names
// and some operations may have the same name but different body.
return samplingContext.operationName;
},
sampler(_samplingContext) {
const hour = new Date().getHours();
if (hour >= 9 && hour <= 17) {
return 0.3;
}
return 0.8;
}
})
}
})
```
## Self-Hosting
To align the client with your own instance of GraphQL Hive, you should use `selfHosting` options in

View file

@ -4,3 +4,4 @@ export { useHive as useYogaHive } from './yoga.js';
export { hiveApollo, createSupergraphSDLFetcher, createSupergraphManager } from './apollo.js';
export { createSchemaFetcher, createServicesFetcher } from './gateways.js';
export { createHive } from './client.js';
export { atLeastOnceSampler } from './samplers.js';

View file

@ -1,3 +1,5 @@
import type { SamplingContext } from './types.js';
export function randomSampling(sampleRate: number) {
if (sampleRate > 1 || sampleRate < 0) {
throw new Error(`Expected usage.sampleRate to be 0 <= x <= 1, received ${sampleRate}`);
@ -7,3 +9,21 @@ export function randomSampling(sampleRate: number) {
return Math.random() <= sampleRate;
};
}
/**
 * Builds a sampling predicate driven by a user-supplied `sampler` function.
 *
 * The sampler is invoked on every call with the sampling context and returns either
 * a rate between 0 and 1 or a boolean (`true` is treated as 1, `false` as 0).
 *
 * @param sampler - Computes the sample rate for a given context.
 * @returns A function deciding whether the operation described by the context is included.
 * @throws Error when the sampler yields `NaN` or a rate outside of the 0..1 range.
 */
export function dynamicSampling(sampler: (context: SamplingContext) => number | boolean) {
  return function shouldInclude(context: SamplingContext): boolean {
    const result = sampler(context);
    const sampleRate = typeof result === 'boolean' ? (result ? 1 : 0) : result;

    // `NaN` passes both range comparisons, so it has to be rejected explicitly;
    // otherwise it would silently exclude every operation.
    if (Number.isNaN(sampleRate) || sampleRate > 1 || sampleRate < 0) {
      throw new Error(`Expected usage.sampleRate to be 0 <= x <= 1, received ${sampleRate}`);
    }

    // `Math.random()` yields values in [0, 1). The strict comparison guarantees that
    // a rate of 0 never includes and a rate of 1 always includes.
    return Math.random() < sampleRate;
  };
}

View file

@ -78,6 +78,18 @@ export interface HiveUsagePluginOptions {
* Default: 1.0
*/
sampleRate?: number;
/**
* Compute sample rate dynamically.
*
* If `sampler` is defined, `sampleRate` is ignored.
*
* @returns A sample rate between 0 and 1.
* 0.0 = 0% chance of being sent
* 1.0 = 100% chance of being sent.
* true = 100%
* false = 0%
*/
sampler?: (context: SamplingContext) => number | boolean;
/**
* (Experimental) Enables collecting Input fields usage based on the variables passed to the operation.
*
@ -86,6 +98,11 @@ export interface HiveUsagePluginOptions {
processVariables?: boolean;
}
/**
 * Information about an operation that is handed to `sampler` functions.
 *
 * Carries `document`, `contextValue` and `variableValues` exactly as they are typed
 * on `ExecutionArgs`, plus the operation name.
 */
export interface SamplingContext
extends Pick<ExecutionArgs, 'document' | 'contextValue' | 'variableValues'> {
/** Name of the operation being sampled. */
operationName: string;
}
export interface HiveReportingPluginOptions {
/**
* Custom endpoint to collect schema reports

View file

@ -25,7 +25,7 @@ import LRU from 'tiny-lru';
import { normalizeOperation } from '@graphql-hive/core';
import { version } from '../version.js';
import { createAgent } from './agent.js';
import { randomSampling } from './sampling.js';
import { dynamicSampling, randomSampling } from './sampling.js';
import type {
AbortAction,
ClientInfo,
@ -145,7 +145,10 @@ export function createUsage(pluginOptions: HivePluginOptions): UsageCollector {
logger.error,
);
const shouldInclude = randomSampling(options.sampleRate ?? 1.0);
const shouldInclude =
options.sampler && typeof options.sampler === 'function'
? dynamicSampling(options.sampler)
: randomSampling(options.sampleRate ?? 1.0);
return {
dispose: agent.dispose,
@ -175,7 +178,15 @@ export function createUsage(pluginOptions: HivePluginOptions): UsageCollector {
providedOperationName = args.operationName || rootOperation.name?.value;
const operationName = providedOperationName || 'anonymous';
if (!excludeSet.has(operationName) && shouldInclude()) {
if (
!excludeSet.has(operationName) &&
shouldInclude({
operationName,
document,
variableValues: args.variableValues,
contextValue: args.contextValue,
})
) {
const errors =
result.errors?.map(error => ({
message: error.message,

View file

@ -0,0 +1,28 @@
import { dynamicSampling } from './internal/sampling.js';
import type { SamplingContext } from './internal/types.js';
/**
 * Creates a sampler that always lets the first occurrence of an operation through
 * and defers the decision for every later occurrence to the configured `sampler`.
 */
export function atLeastOnceSampler(config: {
  /**
   * Derives the key that identifies a GraphQL request.
   * Requests yielding the same key are treated as the same operation.
   */
  keyFn(context: SamplingContext): string;
  sampler(context: SamplingContext): number | boolean;
}) {
  // Keys of the operations that have already been let through once.
  const seenKeys = new Set<string>();
  const decide = dynamicSampling(config.sampler);

  return function shouldInclude(context: SamplingContext): boolean {
    const operationKey = config.keyFn(context);

    if (seenKeys.has(operationKey)) {
      // Already reported at least once: fall back to the regular sampling decision.
      return decide(context);
    }

    seenKeys.add(operationKey);
    return true;
  };
}

View file

@ -3,6 +3,7 @@ import { buildSchema, parse } from 'graphql';
import nock from 'nock';
import { createHive } from '../src/client';
import type { Report } from '../src/internal/usage';
import { atLeastOnceSampler } from '../src/samplers';
import { version } from '../src/version';
import { waitFor } from './test-utils';
@ -86,6 +87,21 @@ const op = parse(/* GraphQL */ `
}
`);
// Second operation document: a `getProject` query that spreads the `ProjectFields` fragment.
// Gives tests an operation that is distinct from `op`.
const op2 = parse(/* GraphQL */ `
query getProject($selector: ProjectSelectorInput!) {
project(selector: $selector) {
...ProjectFields
}
}
fragment ProjectFields on Project {
id
cleanId
name
type
}
`);
// Restore all mocks before every test.
beforeEach(() => {
vi.restoreAllMocks();
});
@ -433,3 +449,110 @@ test('sendImmediately should not stop the schedule', async () => {
await waitFor(1000);
http.done();
});
// Verifies that `atLeastOnceSampler` reports each distinct operation once even though
// the wrapped sampler always returns a rate of 0.
test('should send data to Hive at least once when using atLeastOnceSampler', async () => {
const logger = {
error: vi.fn(),
info: vi.fn(),
};
const token = 'Token';
// Overwritten with the request body received by the mocked usage endpoint below.
let report: Report = {
size: 0,
map: {},
operations: [],
};
// Mocked usage endpoint: expects exactly one POST and captures its body.
const http = nock('http://localhost')
.post('/200')
.matchHeader('Authorization', `Bearer ${token}`)
.matchHeader('Content-Type', headers['Content-Type'])
.matchHeader('graphql-client-name', headers['graphql-client-name'])
.matchHeader('graphql-client-version', headers['graphql-client-version'])
.once()
.reply((_, _body) => {
report = _body as any;
return [200];
});
const hive = createHive({
enabled: true,
debug: true,
agent: {
timeout: 500,
maxRetries: 0,
logger,
},
token,
selfHosting: {
graphqlEndpoint: 'http://localhost/graphql',
applicationUrl: 'http://localhost/',
usageEndpoint: 'http://localhost/200',
},
usage: {
sampler: atLeastOnceSampler({
// Operations are told apart by their name.
keyFn(ctx) {
return ctx.operationName;
},
sampler() {
// Rate of 0: only the guaranteed first occurrence of each key is expected to be reported.
return 0;
},
}),
},
});
const collect = hive.collectUsage();
await waitFor(2000);
// first occurrence of `deleteProject`
collect(
{
schema,
document: op,
operationName: 'deleteProject',
},
{},
);
// different query
collect(
{
schema,
document: op2,
operationName: 'getProject',
},
{},
);
// duplicated call
collect(
{
schema,
document: op,
operationName: 'deleteProject',
},
{},
);
await hive.dispose();
await waitFor(1000);
// Asserts that the mocked endpoint has been called.
http.done();
expect(logger.error).not.toHaveBeenCalled();
// Three calls were collected, but only two are expected in the queue.
expect(logger.info).toHaveBeenCalledWith(`[hive][usage] Sending (queue 2) (attempt 1)`);
expect(logger.info).toHaveBeenCalledWith(`[hive][usage] Sent!`);
// Map
expect(report.size).toEqual(2);
expect(Object.keys(report.map)).toHaveLength(2);
// Collect the operation names present in the report map.
const foundRecords: string[] = [];
for (const key in report.map) {
const record = report.map[key];
foundRecords.push(record.operationName ?? 'anonymous');
}
expect(foundRecords).toContainEqual('deleteProject');
expect(foundRecords).toContainEqual('getProject');
const operations = report.operations;
expect(operations).toHaveLength(2); // two operations
});

View file

@ -68,15 +68,85 @@ const config: HivePluginOptions = {
#### Sampling
You can pass a custom `sampleRate` value to the `HivePluginOptions` to sample a percentage of the
total operations reported. By default, the Hive agent reports 100% of the operations (`1.0`).
##### Basic sampling
```ts
const config: HivePluginOptions = {
With the `sampleRate` option, you can control the sampling rate of usage reporting. Setting it to
`0.5` results in 50% of the operations being sent to Hive. There is no guarantee that every
operation will be reported at least once (see `atLeastOnceSampler`).
Default: `1` (100%)
```typescript
useHive({
/* ... other options ... */,
usage: {
sampleRate: 0.1
sampleRate: 0.6 // 60% of the operations will be sent to Hive
}
}
})
```
##### Dynamic sampling
The GraphQL Hive client accepts a `sampler` function that returns a number between 0 and 1 or a
boolean. This allows you to implement dynamic sampling based on the operation's context.
If `sampler` is defined, `sampleRate` is ignored.
The returned value is interpreted as a sample rate:
- `0.0` = 0% chance of being sent
- `1.0` = 100% chance of being sent.
- `true` = 100%
- `false` = 0%
```typescript
useHive({
/* ... other options ... */,
usage: {
sampler(samplingContext) {
if (samplingContext.operationName === 'GetUser') {
return 0.5 // 50% of GetUser operations will be sent to Hive
}
return 0.7; // 70% of the other operations will be sent to Hive
}
}
})
```
##### At-least-once sampling
If you want to make sure that every operation is reported at least once, you can use the
`atLeastOnceSampler`. The first occurrence of each operation (as identified by `keyFn`) is always
reported; every subsequent occurrence is decided by the sampler.
```typescript
import { useHive, atLeastOnceSampler} from '@graphql-hive/client';
useHive({
/* ... other options ... */,
usage: {
sampler: atLeastOnceSampler({
// Produces a unique key for a given GraphQL request.
// This key is used to determine the uniqueness of a GraphQL operation.
keyFn(samplingContext) {
// Operation name is a good candidate for a key, but not perfect,
// as not all operations have names
// and some operations may have the same name but different body.
return samplingContext.operationName;
},
sampler(_samplingContext) {
const hour = new Date().getHours();
if (hour >= 9 && hour <= 17) {
return 0.3;
}
return 0.8;
}
})
}
})
```
#### Custom Integration