From bd054088bfe11ab97c8c38ef53034d60f4b6ff6d Mon Sep 17 00:00:00 2001
From: stxkxs
Date: Tue, 23 Jun 2026 19:01:06 -0700
Subject: [PATCH] feat(alerting): page when the kube-state-metrics CR-state projection goes dark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The seven agent persona dashboards read kube_customresource_* — projected
by the single customResourceState config in the kube-state-metrics addon.
KSM parses that config as one unit, so a malformed block, a CRD-list typo,
an RBAC Forbidden on agents.nanohype.dev, or a KSM rename/outage drops
every kube_customresource_* series at once and silently no-data's all
those boards.

Nothing alerted on that. The existing absent() rules (agent-operator,
fleet-vend) watch controller_runtime_* — which arrives via the
operator/provider pod-annotation scrape, a different path that stays green
through a KSM-side break.

Adds the ksm-health GrafanaAlertRuleGroup:
absent(kube_customresource_status_phase) for 10m → page (folderRef
slo-alerts, datasourceUid managed-prometheus, same query+threshold model
as the other absent() canaries). noDataState OK so a healthy KSM (absent()
returns empty) doesn't fire; the 10m for-duration rides out a KSM pod
rollout without flapping.
---
Reviewer notes (git am drops everything between the "---" marker and the
first diff, so this text never reaches the commit message):
- dashboards/base/alerting/ksm-health.yaml is the only new content; the
  other two hunks register it in dashboards/base/kustomization.yaml and add
  it to the .yamllint.yaml ignore list beside the existing alerting files.
- Rule B applies a "gt 0" threshold to query A. absent() yields a value only
  while the series is missing; noDataState OK covers the empty result.

 .yamllint.yaml                           |  1 +
 dashboards/base/alerting/ksm-health.yaml | 70 ++++++++++++++++++++++++
 dashboards/base/kustomization.yaml       |  1 +
 3 files changed, 72 insertions(+)
 create mode 100644 dashboards/base/alerting/ksm-health.yaml

diff --git a/.yamllint.yaml b/.yamllint.yaml
index 7cdafb0..88c9af1 100644
--- a/.yamllint.yaml
+++ b/.yamllint.yaml
@@ -12,6 +12,7 @@ ignore: |
   dashboards/base/alerting/portal.yaml
   dashboards/base/alerting/agent-operator.yaml
   dashboards/base/alerting/fleet-vend.yaml
+  dashboards/base/alerting/ksm-health.yaml
 
 rules:
   line-length:
diff --git a/dashboards/base/alerting/ksm-health.yaml b/dashboards/base/alerting/ksm-health.yaml
new file mode 100644
index 0000000..84e9507
--- /dev/null
+++ b/dashboards/base/alerting/ksm-health.yaml
@@ -0,0 +1,70 @@
+# kube-state-metrics customResourceState health. The 7 agent persona dashboards
+# (agent-tenants/founder/finance/ops/kill-switch/eval-quality + the operator's
+# CR-state row) all read kube_customresource_* — projected by the single
+# customResourceState config in addons/observability/kube-state-metrics. KSM
+# parses that config as ONE unit: a malformed block, a CRD-list typo, an RBAC
+# Forbidden, or a KSM service rename drops EVERY kube_customresource_* series at
+# once, silently no-data-ing all those boards. The existing absent() alerts watch
+# controller_runtime_* (the operator/provider POD-annotation scrape path) — a
+# different path that stays green through a KSM-side break. This is the canary for
+# the KSM static-scrape path.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaAlertRuleGroup
+metadata:
+  name: ksm-health
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  folderRef: slo-alerts
+  interval: 1m
+  rules:
+    - uid: ksm-customresource-absent
+      title: KubeCustomResourceMetricsAbsent
+      condition: B
+      for: 10m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-agent-platform
+        component: kube-state-metrics
+      annotations:
+        summary: kube_customresource_* metrics absent — the CR-state projection broke
+        description: >
+          No kube_customresource_status_phase series are reaching Amazon Managed
+          Prometheus. The kube-state-metrics customResourceState config emits nothing —
+          a malformed block, a CRD-list typo, an RBAC Forbidden on agents.nanohype.dev,
+          or a KSM rename/outage drops ALL kube_customresource_* at once. Every agent
+          persona dashboard (tenants/founder/finance/ops/kill-switch/eval-quality) is
+          silently no-data until this clears. Check the kube-state-metrics pod logs for
+          a customResourceState parse error and its RBAC, then the static scrape target
+          kube-state-metrics.kube-system.svc:8080.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 600, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "absent(kube_customresource_status_phase)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
diff --git a/dashboards/base/kustomization.yaml b/dashboards/base/kustomization.yaml
index 3a4c06f..d7eb468 100644
--- a/dashboards/base/kustomization.yaml
+++ b/dashboards/base/kustomization.yaml
@@ -20,6 +20,7 @@ resources:
   - alerting/portal.yaml
   - alerting/agent-operator.yaml
   - alerting/fleet-vend.yaml
+  - alerting/ksm-health.yaml
   - platform/kubernetes-cluster.yaml
   - platform/kubernetes-views-pods.yaml
   - platform/kubernetes-views-namespaces.yaml