From 5c9a5ef9a65f8e04e729fbae54a9310c0a42f6c2 Mon Sep 17 00:00:00 2001
From: Nicolas Richard
Date: Fri, 20 Jun 2025 11:38:04 -0700
Subject: [PATCH] feat(health): Add healthCheck for CRD DatadogMetric (#23464)

Signed-off-by: Nicolas Richard
---
 .../datadoghq.com/DatadogMetric/health.lua    | 32 +++++++++++++++++++
 .../DatadogMetric/health_test.yaml            | 13 ++++++++
 .../testdata/degraded_error.yaml              | 24 ++++++++++++++
 .../testdata/degraded_invalid.yaml            | 19 +++++++++++
 .../DatadogMetric/testdata/healthy.yaml       | 25 +++++++++++++++
 5 files changed, 113 insertions(+)
 create mode 100644 resource_customizations/datadoghq.com/DatadogMetric/health.lua
 create mode 100644 resource_customizations/datadoghq.com/DatadogMetric/health_test.yaml
 create mode 100644 resource_customizations/datadoghq.com/DatadogMetric/testdata/degraded_error.yaml
 create mode 100644 resource_customizations/datadoghq.com/DatadogMetric/testdata/degraded_invalid.yaml
 create mode 100644 resource_customizations/datadoghq.com/DatadogMetric/testdata/healthy.yaml

diff --git a/resource_customizations/datadoghq.com/DatadogMetric/health.lua b/resource_customizations/datadoghq.com/DatadogMetric/health.lua
new file mode 100644
index 0000000000..d10e4b0277
--- /dev/null
+++ b/resource_customizations/datadoghq.com/DatadogMetric/health.lua
@@ -0,0 +1,32 @@
+-- Health check for the DatadogMetric CRD. The reference CRD can be found here:
+-- https://github.com/DataDog/helm-charts/blob/main/charts/datadog-crds/templates/datadoghq.com_datadogmetrics_v1.yaml
+
+local hs = {}
+if obj.status ~= nil and obj.status.conditions ~= nil then
+  for _, condition in ipairs(obj.status.conditions) do
+    -- An "Error: True" condition takes precedence over every other condition
+    if condition.type == "Error" and condition.status == "True" then
+      hs.status = "Degraded"
+      local reason = condition.reason or ""
+      local message = condition.message or "DatadogMetric reported an error"
+      if reason ~= "" then
+        hs.message = reason .. ": " .. message
+      else
+        hs.message = message
+      end
+      return hs
+    end
+  end
+  for _, condition in ipairs(obj.status.conditions) do
+    -- With no active error, a "Valid: False" condition still means Degraded
+    if condition.type == "Valid" and condition.status == "False" then
+      hs.status = "Degraded"
+      hs.message = condition.message or "DatadogMetric is not valid"
+      return hs
+    end
+  end
+end
+-- No "Degraded" condition found (or no status/conditions at all): default to Healthy
+hs.status = "Healthy"
+hs.message = "DatadogMetric is healthy"
+return hs
diff --git a/resource_customizations/datadoghq.com/DatadogMetric/health_test.yaml b/resource_customizations/datadoghq.com/DatadogMetric/health_test.yaml
new file mode 100644
index 0000000000..b76369aca5
--- /dev/null
+++ b/resource_customizations/datadoghq.com/DatadogMetric/health_test.yaml
@@ -0,0 +1,13 @@
+tests:
+  - healthStatus:
+      status: Degraded
+      message: "Unable to fetch data from Datadog: Processing data from API failed, reason: no serie was found for this query in API Response, check Cluster Agent logs for QueryIndex errors, query was: max:foo.bar.metric"
+    inputPath: testdata/degraded_error.yaml
+  - healthStatus:
+      status: Degraded
+      message: "The metric query is invalid"
+    inputPath: testdata/degraded_invalid.yaml
+  - healthStatus:
+      status: Healthy
+      message: "DatadogMetric is healthy"
+    inputPath: testdata/healthy.yaml
diff --git a/resource_customizations/datadoghq.com/DatadogMetric/testdata/degraded_error.yaml b/resource_customizations/datadoghq.com/DatadogMetric/testdata/degraded_error.yaml
new file mode 100644
index 0000000000..2299770bb3
--- /dev/null
+++ b/resource_customizations/datadoghq.com/DatadogMetric/testdata/degraded_error.yaml
@@ -0,0 +1,24 @@
+apiVersion: datadoghq.com/v1alpha1
+kind: DatadogMetric
+metadata:
+  name: foo-bar-metric
+  namespace: foo-namespace
+status:
+  autoscalerReferences:
+  - hpa:foo-namespace/foo-bar-hpa
+  conditions:
+  - lastTransitionTime: "2025-02-05T00:03:00Z"
+    lastUpdateTime: "2025-06-17T17:49:45Z"
+    status: "True"
+    type: Active
+  - lastTransitionTime: "2025-02-05T00:03:00Z"
+    lastUpdateTime: "2025-06-17T17:49:45Z"
+    status: "False"
+    type: Valid
+  - lastTransitionTime: "2025-02-05T00:03:30Z"
+    lastUpdateTime: "2025-06-17T17:49:45Z"
+    message: "Processing data from API failed, reason: no serie was found for this query in API Response, check Cluster Agent logs for QueryIndex errors, query was: max:foo.bar.metric"
+    reason: Unable to fetch data from Datadog
+    status: "True"
+    type: Error
+  currentValue: 0
diff --git a/resource_customizations/datadoghq.com/DatadogMetric/testdata/degraded_invalid.yaml b/resource_customizations/datadoghq.com/DatadogMetric/testdata/degraded_invalid.yaml
new file mode 100644
index 0000000000..28ca69e2d8
--- /dev/null
+++ b/resource_customizations/datadoghq.com/DatadogMetric/testdata/degraded_invalid.yaml
@@ -0,0 +1,19 @@
+apiVersion: datadoghq.com/v1alpha1
+kind: DatadogMetric
+metadata:
+  name: foo-bar-metric-invalid
+  namespace: foo-namespace
+status:
+  autoscalerReferences:
+  - hpa:foo-namespace/foo-bar-hpa
+  conditions:
+  - lastTransitionTime: "2025-02-05T00:03:00Z"
+    lastUpdateTime: "2025-06-17T17:49:45Z"
+    status: "True"
+    type: Active
+  - lastTransitionTime: "2025-02-05T00:03:00Z"
+    lastUpdateTime: "2025-06-17T17:49:45Z"
+    status: "False"
+    type: Valid
+    message: "The metric query is invalid"
+  currentValue: 0
\ No newline at end of file
diff --git a/resource_customizations/datadoghq.com/DatadogMetric/testdata/healthy.yaml b/resource_customizations/datadoghq.com/DatadogMetric/testdata/healthy.yaml
new file mode 100644
index 0000000000..de708e7c50
--- /dev/null
+++ b/resource_customizations/datadoghq.com/DatadogMetric/testdata/healthy.yaml
@@ -0,0 +1,25 @@
+apiVersion: datadoghq.com/v1alpha1
+kind: DatadogMetric
+metadata:
+  name: foo-bar-metric
+  namespace: foo-namespace
+status:
+  autoscalerReferences:
+  - hpa:foo-namespace/foo-bar-hpa
+  conditions:
+  - lastTransitionTime: "2025-04-23T18:40:58Z"
+    lastUpdateTime: "2025-06-17T20:45:05Z"
+    status: "True"
+    type: Active
+  - lastTransitionTime: "2025-06-16T14:07:12Z"
+    lastUpdateTime: "2025-06-17T20:45:05Z"
+    status: "True"
+    type: Valid
+  - lastUpdateTime: "2025-06-17T20:44:30Z"
+    status: "True"
+    type: Updated
+  - lastTransitionTime: "2025-06-16T14:07:12Z"
+    lastUpdateTime: "2025-06-17T20:45:05Z"
+    status: "False"
+    type: Error
+  currentValue: 0