fix: fix histogram metric query (#737)

Fix for the histogram query, based on the late-night session at KubeCon.

Ref: HDX-1572
Dan Hable 2025-04-14 17:04:13 -05:00 committed by GitHub
parent 08009ac3b3
commit 4865ce7a62
5 changed files with 216 additions and 98 deletions

View file

@@ -0,0 +1,5 @@
---
"@hyperdx/common-utils": patch
---
Fixes the histogram query to perform quantile calculation across all data points
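As context for the fix, here is a minimal TypeScript sketch (illustrative only, not code from this commit) of the bucket-quantile interpolation the rewritten query performs: merge bucket counts across series, locate the bucket containing the target rank, and interpolate within its bounds. The helper name and signature are hypothetical.

```ts
// Illustrative sketch of the quantile-from-histogram math in the fixed query.
// `bounds` has one fewer element than `bucketCounts`, as in OTel
// explicit-bounds histograms.
function histogramQuantile(
  level: number,
  bucketCounts: number[],
  bounds: number[],
): number {
  const rank = level * bucketCounts.reduce((a, b) => a + b, 0);
  const cum: number[] = [];
  let acc = 0;
  for (const c of bucketCounts) cum.push((acc += c));
  // First bucket whose cumulative count exceeds the rank
  // (ClickHouse arrayFirstIndex is 1-based; findIndex is 0-based).
  const idx = cum.findIndex(c => c > rank);
  if (idx < 0) return 0; // mirrors CASE WHEN BucketLowIdx = 0 THEN 0
  if (idx === 0) return bounds[0];
  if (idx === bucketCounts.length - 1) return bounds[bounds.length - 1];
  const lower = bounds[idx - 1];
  const upper = bounds[idx];
  // Same form as the CASE arm in the new Bucketed CTE:
  // lower + ((Rank - lower) / upper) * (upper - lower)
  return lower + ((rank - lower) / upper) * (upper - lower);
}

// p50 over merged delta counts [10, 10, 10] with bounds [10, 30], as in the
// two_timestamps_bounded test below:
histogramQuantile(0.5, [10, 10, 10], [10, 30]); // ~13.3333, matching the snapshot
```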

View file

@@ -60,6 +60,15 @@ Array [
]
`;
exports[`renderChartConfig Query Metrics should include multiple data points in percentile computation (p50) 1`] = `
Array [
Object {
"__hdx_time_bucket": "2022-01-05T00:00:00Z",
"any(toFloat64OrNull(toString(Rate)))": 2.9,
},
]
`;
exports[`renderChartConfig Query Metrics single avg gauge with group-by 1`] = `
Array [
Object {
@@ -192,11 +201,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_bounded histogram (p25)
Array [
Object {
"__hdx_time_bucket": "2022-01-05T00:00:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 0,
"any(toFloat64OrNull(toString(Rate)))": 0,
},
Object {
"__hdx_time_bucket": "2022-01-05T00:01:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 10,
"any(toFloat64OrNull(toString(Rate)))": 10,
},
]
`;
@@ -205,11 +214,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_bounded histogram (p50)
Array [
Object {
"__hdx_time_bucket": "2022-01-05T00:00:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 0,
"any(toFloat64OrNull(toString(Rate)))": 0,
},
Object {
"__hdx_time_bucket": "2022-01-05T00:01:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 20,
"any(toFloat64OrNull(toString(Rate)))": 13.333333333333332,
},
]
`;
@@ -218,11 +227,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_bounded histogram (p90)
Array [
Object {
"__hdx_time_bucket": "2022-01-05T00:00:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 0,
"any(toFloat64OrNull(toString(Rate)))": 0,
},
Object {
"__hdx_time_bucket": "2022-01-05T00:01:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 30,
"any(toFloat64OrNull(toString(Rate)))": 30,
},
]
`;
@@ -231,11 +240,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_lower_bound_inf histogra
Array [
Object {
"__hdx_time_bucket": "2022-01-05T00:00:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 0,
"any(toFloat64OrNull(toString(Rate)))": 0,
},
Object {
"__hdx_time_bucket": "2022-01-05T00:01:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 1,
"any(toFloat64OrNull(toString(Rate)))": 1,
},
]
`;
@@ -244,11 +253,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_upper_bound_inf histogra
Array [
Object {
"__hdx_time_bucket": "2022-01-05T00:00:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 0,
"any(toFloat64OrNull(toString(Rate)))": 0,
},
Object {
"__hdx_time_bucket": "2022-01-05T00:01:00Z",
"sum(toFloat64OrNull(toString(Rate)))": 30,
"any(toFloat64OrNull(toString(Rate)))": 30,
},
]
`;

View file

@@ -446,6 +446,36 @@ describe('renderChartConfig', () => {
AggregationTemporality: 2, // Cumulative
...point,
}));
const histPointsE = [
{
BucketCounts: [1, 1, 1, 1, 1, 1],
ExplicitBounds: [1, 2, 5, 8, 13],
TimeUnix: new Date(now),
ResourceAttributes: { host: 'test-a' },
},
{
BucketCounts: [2, 2, 2, 2, 2, 2],
ExplicitBounds: [1, 2, 5, 8, 13],
TimeUnix: new Date(now + ms('5s')),
ResourceAttributes: { host: 'test-b' },
},
{
BucketCounts: [2, 1, 2, 1, 2, 1],
ExplicitBounds: [1, 2, 5, 8, 13],
TimeUnix: new Date(now + ms('1m')),
ResourceAttributes: { host: 'test-a' },
},
{
BucketCounts: [3, 3, 2, 2, 3, 3],
ExplicitBounds: [1, 2, 5, 8, 13],
TimeUnix: new Date(now + ms('65s')),
ResourceAttributes: { host: 'test-b' },
},
].map(point => ({
MetricName: 'test.multiple_series',
AggregationTemporality: 2, // Cumulative
...point,
}));
await Promise.all([
bulkInsertMetricsGauge([...gaugePointsA, ...gaugePointsB]),
@@ -462,6 +492,7 @@ describe('renderChartConfig', () => {
...histPointsB,
...histPointsC,
...histPointsD,
...histPointsE,
]),
]);
});
@@ -655,6 +686,10 @@ describe('renderChartConfig', () => {
});
it('calculates min_rate/max_rate correctly for sum metrics', async () => {
// Raw Data is
// MIN_VARIANT_0: [0, 1, 8, 0, 7, 7, 15, 17, 0, 42]
// MIN_VARIANT_1: [0, 2, 9, 0, 15, 25, 35, 57, 0, 92]
//
// Based on the data inserted in the fixture, the expected stream of values
// for each series after adjusting for the zero reset should be:
// MIN_VARIANT_0: [0, 1, 8, 8, 15, 15, 23, 25, 25, 67]
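A standalone sketch of that zero-reset adjustment (illustrative, not the query's actual SQL): when a cumulative counter drops below its predecessor, the new raw value itself is treated as the delta before re-accumulating.

```ts
// Sketch of the zero-reset adjustment described in the comment above:
// a drop in a cumulative counter signals a reset, so the new raw value
// becomes the delta for that step.
function adjustForResets(raw: number[]): number[] {
  const adjusted: number[] = [];
  let acc = 0;
  let prev = 0;
  for (const curr of raw) {
    acc += curr < prev ? curr : curr - prev;
    prev = curr;
    adjusted.push(acc);
  }
  return adjusted;
}

adjustForResets([0, 1, 8, 0, 7, 7, 15, 17, 0, 42]);
// => [0, 1, 8, 8, 15, 15, 23, 25, 25, 67]  (MIN_VARIANT_0 above)
```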
@@ -725,20 +760,10 @@
Since the AggregationTemporality is 2 (cumulative), we need to calculate the delta between the two points:
delta: [10, 10, 10] - [0, 0, 0] = [10, 10, 10]
Total observations: 10 + 10 + 10 = 30
Cumulative counts: [10, 20, 30]
p50 point:
Rank = 0.5 * 30 = 15
This falls in the second bucket (since 10 < 15 <= 20)
We need to interpolate between the lower and upper bounds of the second bucket:
Lower bound: 10
Upper bound: 30
Position in bucket: (15 - 10) / (20 - 10) = 0.5
Interpolated value: 10 + (30 - 10) * 0.5 = 10 + 10 = 20
Thus the first point value would be 0 since it's at the start of the bounds.
The second point value would be 20 since that is the median point value delta from the first point.
cum sum = [10, 20, 30]
rank = 0.5 * 30 = 15 (between bounds 10 - 30)
interpolate: 10 + ((15 - 10) / 30) * (30 - 10) = 13.3333
*/
const query = await renderChartConfig(
{
@@ -952,6 +977,57 @@ describe('renderChartConfig', () => {
expect(res).toMatchSnapshot();
});
it('should include multiple data points in percentile computation (p50)', async () => {
/*
bounds: [1, 2, 5, 8, 13]
host = test-a:
p1 = [1, 1, 1, 1, 1, 1]
p2 = [2, 1, 2, 1, 2, 1]
host = test-b:
p1 = [2, 2, 2, 2, 2, 2]
p2 = [3, 3, 2, 2, 3, 3]
Compute the diff between adjacent points for each unique host (deltaSumForEach)
host = test-a, diff = [1, 0, 1, 0, 1, 0]
host = test-b, diff = [1, 1, 0, 0, 1, 1]
Sum the diffs element-wise to obtain a combined count d across the series
d = [2, 1, 1, 0, 2, 1]
Now compute the p50 value:
sum(d) = 7
cum sum = [2, 3, 4, 4, 6, 7]
rank = 0.5 * 7 = 3.5 (between bounds 2 - 5)
interpolate: 2 + ((3.5 - 2) / 5) * (5 - 2) = 2.9
Since all the points fall within a single granularity interval the result should be a single row
with the value 2.9.
*/
const query = await renderChartConfig(
{
select: [
{
aggFn: 'quantile',
level: 0.5,
metricName: 'test.multiple_series',
metricType: MetricsDataType.Histogram,
valueExpression: 'Value',
},
],
from: metricSource.from,
where: '',
metricTables: TEST_METRIC_TABLES,
dateRange: [new Date(now), new Date(now + ms('2m'))],
granularity: '5 minute',
timestampValueExpression: metricSource.timestampValueExpression,
connection: connection.id,
},
metadata,
);
const res = await queryData(query);
expect(res).toMatchSnapshot();
});
// HDX-1515: Handle counter reset in histogram metric in the same way that the counter reset
// is handled for sum metrics.
it.skip('three_timestamps_bounded histogram with reset (p50)', async () => {
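The arithmetic in the multiple-series p50 test comment above can be checked with a short standalone sketch (not part of the commit); `histogramQuantile` refers to the illustrative helper sketched earlier.

```ts
// Per-host element-wise deltas (what deltaSumForEach produces per series),
// then an element-wise sum across hosts, mirroring the test comment.
const bounds = [1, 2, 5, 8, 13];
const hostA = { p1: [1, 1, 1, 1, 1, 1], p2: [2, 1, 2, 1, 2, 1] };
const hostB = { p1: [2, 2, 2, 2, 2, 2], p2: [3, 3, 2, 2, 3, 3] };

const diff = (prev: number[], curr: number[]) => curr.map((c, i) => c - prev[i]);
const dA = diff(hostA.p1, hostA.p2); // [1, 0, 1, 0, 1, 0]
const dB = diff(hostB.p1, hostB.p2); // [1, 1, 0, 0, 1, 1]
const merged = dA.map((d, i) => d + dB[i]); // [2, 1, 1, 0, 2, 1]

// Using the histogramQuantile sketch from earlier:
// cum = [2, 3, 4, 4, 6, 7], rank = 3.5, falling between bounds 2 and 5,
// so 2 + ((3.5 - 2) / 5) * (5 - 2) = 2.9, matching the snapshot.
```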

View file

@@ -36,39 +36,47 @@ exports[`renderChartConfig should generate sql for a single gauge metric 1`] = `
`;
exports[`renderChartConfig should generate sql for a single histogram metric 1`] = `
"WITH HistRate AS (
"WITH Source AS (
SELECT
*,
cityHash64(mapConcat(ScopeAttributes, ResourceAttributes, Attributes)) AS AttributesHash,
length(BucketCounts) as CountLength,
any(BucketCounts) OVER (ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS PrevBucketCounts,
any(CountLength) OVER (ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS PrevCountLength,
any(AttributesHash) OVER (ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS PrevAttributesHash,
IF(AggregationTemporality = 1,
BucketCounts,
IF(AttributesHash = PrevAttributesHash AND CountLength = PrevCountLength,
arrayMap((prev, curr) -> IF(curr < prev, curr, toUInt64(toInt64(curr) - toInt64(prev))), PrevBucketCounts, BucketCounts),
BucketCounts)) as BucketRates
sumForEach(BucketCounts) OVER (PARTITION BY AttributesHash ORDER BY AttributesHash, TimeUnix ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
deltaSumForEach(BucketCounts) OVER (PARTITION BY AttributesHash ORDER BY AttributesHash, TimeUnix ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
) AS MonoBucketCounts
FROM default.otel_metrics_histogram
WHERE (TimeUnix >= fromUnixTimestamp64Milli(1739318400000) AND TimeUnix <= fromUnixTimestamp64Milli(1765670400000)) AND ((MetricName = 'http.server.duration'))
ORDER BY Attributes, TimeUnix ASC
),RawHist AS (
ORDER BY AttributesHash ASC
),Bucketed AS (
SELECT
*,
toUInt64( 0.5 * arraySum(BucketRates)) AS Rank,
arrayCumSum(BucketRates) as CumRates,
arrayFirstIndex(x -> if(x > Rank, 1, 0), CumRates) AS BucketLowIdx,
IF(BucketLowIdx = length(BucketRates),
arrayElement(ExplicitBounds, length(ExplicitBounds)),
IF(BucketLowIdx > 1,
arrayElement(ExplicitBounds, BucketLowIdx - 1) + (arrayElement(ExplicitBounds, BucketLowIdx) - arrayElement(ExplicitBounds, BucketLowIdx - 1)) *
IF(arrayElement(CumRates, BucketLowIdx) > arrayElement(CumRates, BucketLowIdx - 1),
(Rank - arrayElement(CumRates, BucketLowIdx - 1)) / (arrayElement(CumRates, BucketLowIdx) - arrayElement(CumRates, BucketLowIdx - 1)), 0),
arrayElement(ExplicitBounds, BucketLowIdx)
)) as Rate
FROM HistRate) SELECT sum(
any(Source.ExplicitBounds) AS ExplicitBounds,
sumForEach(MonoBucketCounts) AS TotalBucketCounts,
arrayCumSum(TotalBucketCounts) AS BucketCumCounts,
0.5 * arraySum(TotalBucketCounts) AS Rank,
arrayFirstIndex(x -> if(x > Rank, 1, 0), BucketCumCounts) AS BucketLowIdx,
CASE
WHEN BucketLowIdx = 0 THEN 0
WHEN BucketLowIdx = 1 THEN ExplicitBounds[BucketLowIdx]
WHEN BucketLowIdx = length(TotalBucketCounts) THEN ExplicitBounds[length(ExplicitBounds)]
ELSE
ExplicitBounds[BucketLowIdx - 1] +
((Rank - ExplicitBounds[BucketLowIdx - 1]) / (ExplicitBounds[BucketLowIdx])) * (ExplicitBounds[BucketLowIdx] - ExplicitBounds[BucketLowIdx - 1])
END AS Value
FROM Source
GROUP BY
ORDER BY ASC
),Rates AS (
SELECT
\`__hdx_time_bucket\`,
any(Bucketed.Value) AS \`__hdx_value_high\`,
any(\`__hdx_value_high\`) OVER (ORDER BY \`__hdx_time_bucket\` ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS \`__hdx_prev_high\`,
\`__hdx_value_high\` - \`__hdx_prev_high\` AS Rate
FROM Bucketed
GROUP BY \`__hdx_time_bucket\`
) SELECT any(
toFloat64OrNull(toString(Rate))
) FROM RawHist WHERE (TimeUnix >= fromUnixTimestamp64Milli(1739318400000) AND TimeUnix <= fromUnixTimestamp64Milli(1765670400000)) LIMIT 10"
) FROM Rates WHERE (\`__hdx_time_bucket\` >= fromUnixTimestamp64Milli(1739318400000) AND \`__hdx_time_bucket\` <= fromUnixTimestamp64Milli(1765670400000)) LIMIT 10"
`;
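For orientation, a small sketch (not from the commit) of what the final Rates CTE computes: each time bucket's quantile is taken over monotonic cumulative counts, so the emitted Rate is the difference from the previous bucket's value, which is what the 1-PRECEDING window above expresses.

```ts
// Difference each bucket's cumulative quantile from its predecessor, as the
// Rates CTE does with __hdx_value_high / __hdx_prev_high. Assumes any() over
// an empty 1-PRECEDING frame yields the column type's default (0 for numbers),
// so the first bucket's Rate equals its own value.
function toRates(bucketedValues: number[]): number[] {
  return bucketedValues.map((v, i) => v - (i === 0 ? 0 : bucketedValues[i - 1]));
}

// Values consistent with the two_timestamps_bounded p50 snapshot:
toRates([0, 13.333333333333332]); // => [0, 13.333333333333332]
```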
exports[`renderChartConfig should generate sql for a single sum metric 1`] = `

View file

@@ -1061,87 +1061,107 @@ async function translateMetricChartConfig(
throw new Error('quantile must be specified for histogram metrics');
}
// Render the where clause to limit data selection on the source CTE but also search forward/back one
// bucket window to ensure that there is enough data to compute a reasonable value on the ends of the
// series.
const where = await renderWhere(
{
...chartConfig,
from: {
...from,
tableName: metricTables[MetricsDataType.Histogram],
},
filters: [
...(filters ?? []),
{
type: 'sql',
condition: `MetricName = '${metricName}'`,
},
],
includedDataInterval:
chartConfig.granularity === 'auto' &&
Array.isArray(chartConfig.dateRange)
? convertDateRangeToGranularityString(chartConfig.dateRange, 60)
: chartConfig.granularity,
// Render the various clauses from the user input so they can be woven into the CTE queries. The dateRange
// is manipulated to search forward/back one bucket window to ensure that there is enough data to compute
// a reasonable value on the ends of the series.
const cteChartConfig = {
...chartConfig,
from: {
...from,
tableName: metricTables[MetricsDataType.Histogram],
},
metadata,
);
filters: [
...(filters ?? []),
{
type: 'sql',
condition: `MetricName = '${metricName}'`,
},
],
includedDataInterval:
chartConfig.granularity === 'auto' &&
Array.isArray(chartConfig.dateRange)
? convertDateRangeToGranularityString(chartConfig.dateRange, 60)
: chartConfig.granularity,
} as ChartConfigWithOptDateRangeEx;
const timeBucketSelect = isUsingGranularity(cteChartConfig)
? timeBucketExpr({
interval: cteChartConfig.granularity,
timestampValueExpression: cteChartConfig.timestampValueExpression,
dateRange: cteChartConfig.dateRange,
})
: chSql``;
const where = await renderWhere(cteChartConfig, metadata);
const groupBy = (await renderGroupBy(chartConfig, metadata)) || chSql``;
return {
...restChartConfig,
with: [
{
name: 'HistRate',
name: 'Source',
sql: chSql`
SELECT
*,
cityHash64(mapConcat(ScopeAttributes, ResourceAttributes, Attributes)) AS AttributesHash,
length(BucketCounts) as CountLength,
any(BucketCounts) OVER (ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS PrevBucketCounts,
any(CountLength) OVER (ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS PrevCountLength,
any(AttributesHash) OVER (ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS PrevAttributesHash,
IF(AggregationTemporality = 1,
BucketCounts,
IF(AttributesHash = PrevAttributesHash AND CountLength = PrevCountLength,
arrayMap((prev, curr) -> IF(curr < prev, curr, toUInt64(toInt64(curr) - toInt64(prev))), PrevBucketCounts, BucketCounts),
BucketCounts)) as BucketRates
sumForEach(BucketCounts) OVER (PARTITION BY AttributesHash ORDER BY AttributesHash, TimeUnix ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
deltaSumForEach(BucketCounts) OVER (PARTITION BY AttributesHash ORDER BY AttributesHash, TimeUnix ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
) AS MonoBucketCounts
FROM ${renderFrom({ from: { ...from, tableName: metricTables[MetricsDataType.Histogram] } })}
WHERE ${where}
ORDER BY Attributes, TimeUnix ASC
ORDER BY AttributesHash ASC
`,
},
{
name: 'RawHist',
name: 'Bucketed',
sql: chSql`
SELECT
*,
toUInt64( ${{ Float64: level }} * arraySum(BucketRates)) AS Rank,
arrayCumSum(BucketRates) as CumRates,
arrayFirstIndex(x -> if(x > Rank, 1, 0), CumRates) AS BucketLowIdx,
IF(BucketLowIdx = length(BucketRates),
arrayElement(ExplicitBounds, length(ExplicitBounds)),
IF(BucketLowIdx > 1,
arrayElement(ExplicitBounds, BucketLowIdx - 1) + (arrayElement(ExplicitBounds, BucketLowIdx) - arrayElement(ExplicitBounds, BucketLowIdx - 1)) *
IF(arrayElement(CumRates, BucketLowIdx) > arrayElement(CumRates, BucketLowIdx - 1),
(Rank - arrayElement(CumRates, BucketLowIdx - 1)) / (arrayElement(CumRates, BucketLowIdx) - arrayElement(CumRates, BucketLowIdx - 1)), 0),
arrayElement(ExplicitBounds, BucketLowIdx)
)) as Rate
FROM HistRate`,
${timeBucketSelect.sql ? chSql`${timeBucketSelect},` : ''}
any(Source.ExplicitBounds) AS ExplicitBounds,
sumForEach(MonoBucketCounts) AS TotalBucketCounts,
arrayCumSum(TotalBucketCounts) AS BucketCumCounts,
${{ Float64: level }} * arraySum(TotalBucketCounts) AS Rank,
arrayFirstIndex(x -> if(x > Rank, 1, 0), BucketCumCounts) AS BucketLowIdx,
CASE
WHEN BucketLowIdx = 0 THEN 0
WHEN BucketLowIdx = 1 THEN ExplicitBounds[BucketLowIdx]
WHEN BucketLowIdx = length(TotalBucketCounts) THEN ExplicitBounds[length(ExplicitBounds)]
ELSE
ExplicitBounds[BucketLowIdx - 1] +
((Rank - ExplicitBounds[BucketLowIdx - 1]) / (ExplicitBounds[BucketLowIdx])) * (ExplicitBounds[BucketLowIdx] - ExplicitBounds[BucketLowIdx - 1])
END AS Value
FROM Source
GROUP BY ${groupBy}
ORDER BY ${groupBy} ASC
`,
},
{
name: 'Rates',
sql: chSql`
SELECT
\`__hdx_time_bucket\`,
any(Bucketed.Value) AS \`__hdx_value_high\`,
any(\`__hdx_value_high\`) OVER (ORDER BY \`__hdx_time_bucket\` ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS \`__hdx_prev_high\`,
\`__hdx_value_high\` - \`__hdx_prev_high\` AS Rate
FROM Bucketed
GROUP BY \`__hdx_time_bucket\`
`,
},
],
select: [
{
..._selectRest,
aggFn: 'sum',
aggFn: 'any', // groupings happen earlier in the CTE stack; we just need to grab the value
aggCondition: '', // clear out the condition since the where clause is already applied in the upstream CTE
valueExpression: 'Rate',
},
],
from: {
databaseName: '',
tableName: 'RawHist',
tableName: 'Rates',
},
where: '', // clear out the condition since the where clause is already applied in the upstream CTE
timestampValueExpression: '`__hdx_time_bucket`',
};
}