From bd054088bfe11ab97c8c38ef53034d60f4b6ff6d Mon Sep 17 00:00:00 2001
From: stxkxs <stxkxs@users.noreply.github.com>
Date: Tue, 23 Jun 2026 19:01:06 -0700
Subject: [PATCH] feat(alerting): page when the kube-state-metrics CR-state
 projection goes dark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The seven agent persona dashboards read kube_customresource_* — projected by the
single customResourceState config in the kube-state-metrics addon. KSM parses that
config as one unit, so a malformed block, a CRD-list typo, an RBAC Forbidden on
agents.nanohype.dev, or a KSM rename/outage drops every kube_customresource_* series
at once and silently leaves all of those boards with no data.

Nothing alerted on that. The existing absent() rules (agent-operator, fleet-vend)
watch controller_runtime_* — which arrives via the operator/provider pod-annotation
scrape, a different path that stays green through a KSM-side break.

Adds the ksm-health GrafanaAlertRuleGroup: absent(kube_customresource_status_phase)
for 10m → page (folderRef slo-alerts, datasourceUid managed-prometheus, same
query+threshold model as the other absent() canaries). noDataState OK so a healthy
KSM (absent() returns empty) doesn't fire; the 10m for-duration rides out a KSM pod
rollout without flapping.
---
 .yamllint.yaml                           |  1 +
 dashboards/base/alerting/ksm-health.yaml | 70 ++++++++++++++++++++++++
 dashboards/base/kustomization.yaml       |  1 +
 3 files changed, 72 insertions(+)
 create mode 100644 dashboards/base/alerting/ksm-health.yaml

diff --git a/.yamllint.yaml b/.yamllint.yaml
index 7cdafb0..88c9af1 100644
--- a/.yamllint.yaml
+++ b/.yamllint.yaml
@@ -12,6 +12,7 @@ ignore: |
   dashboards/base/alerting/portal.yaml
   dashboards/base/alerting/agent-operator.yaml
   dashboards/base/alerting/fleet-vend.yaml
+  dashboards/base/alerting/ksm-health.yaml
 
 rules:
   line-length:
diff --git a/dashboards/base/alerting/ksm-health.yaml b/dashboards/base/alerting/ksm-health.yaml
new file mode 100644
index 0000000..84e9507
--- /dev/null
+++ b/dashboards/base/alerting/ksm-health.yaml
@@ -0,0 +1,70 @@
+# kube-state-metrics customResourceState health. The 7 agent persona dashboards
+# (agent-tenants/founder/finance/ops/kill-switch/eval-quality + the operator's
+# CR-state row) all read kube_customresource_* — projected by the single
+# customResourceState config in addons/observability/kube-state-metrics. KSM
+# parses that config as ONE unit: a malformed block, a CRD-list typo, an RBAC
+# Forbidden, or a KSM service rename drops EVERY kube_customresource_* series at
+# once, silently leaving all those boards with no data. The existing absent() alerts
+# watch controller_runtime_* (the operator/provider POD-annotation scrape path) — a
+# different path that stays green through a KSM-side break. This is the canary for
+# the KSM static-scrape path.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaAlertRuleGroup
+metadata:
+  name: ksm-health
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  folderRef: slo-alerts
+  interval: 1m
+  rules:
+    - uid: ksm-customresource-absent
+      title: KubeCustomResourceMetricsAbsent
+      condition: B  # the rule's condition is refId B, the threshold expression over query A
+      for: 10m  # long enough to ride out a KSM pod rollout without flapping
+      noDataState: OK  # a healthy KSM makes absent() return empty, which must not fire
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page  # this canary pages rather than only notifying
+        service: eks-agent-platform
+        component: kube-state-metrics
+      annotations:
+        summary: kube_customresource_* metrics absent — the CR-state projection broke
+        description: >
+          No kube_customresource_status_phase series are reaching Amazon Managed
+          Prometheus. The kube-state-metrics customResourceState config emits nothing —
+          a malformed block, a CRD-list typo, an RBAC Forbidden on agents.nanohype.dev,
+          or a KSM rename/outage drops ALL kube_customresource_* at once. Every agent
+          persona dashboard (tenants/founder/finance/ops/kill-switch/eval-quality) is
+          silently no-data until this clears. Check the kube-state-metrics pod logs for
+          a customResourceState parse error and its RBAC, then the static scrape target
+          kube-state-metrics.kube-system.svc:8080.
+      data:
+        - refId: A  # query A: the absent() probe against the Prometheus datasource
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 600, to: 0}  # presumably seconds (600 s = the 10m for-duration) — TODO confirm
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true  # evaluated as a single instant query, not a range (range: false below)
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "absent(kube_customresource_status_phase)"  # returns a series only while no such series exist
+        - refId: B  # expression B: threshold over A; this is what `condition: B` refers to
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A  # the threshold takes query A as its input
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}  # condition holds when A's value is greater than 0
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
diff --git a/dashboards/base/kustomization.yaml b/dashboards/base/kustomization.yaml
index 3a4c06f..d7eb468 100644
--- a/dashboards/base/kustomization.yaml
+++ b/dashboards/base/kustomization.yaml
@@ -20,6 +20,7 @@ resources:
   - alerting/portal.yaml
   - alerting/agent-operator.yaml
   - alerting/fleet-vend.yaml
+  - alerting/ksm-health.yaml
   - platform/kubernetes-cluster.yaml
   - platform/kubernetes-views-pods.yaml
   - platform/kubernetes-views-namespaces.yaml