From 90b1d2462f0e6c3e99c25d4f40a90c8629116eaa Mon Sep 17 00:00:00 2001
From: stxkxs <stxkxs@users.noreply.github.com>
Date: Mon, 22 Jun 2026 19:15:58 -0700
Subject: [PATCH] feat(dashboards): operator reconcile RED + latency SLO
 dashboard & alerts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Charts the eks-agent-platform operator's own control loop and gives its latency
SLO teeth in prod. The audit found the operator's reconcile RED was alert-only
(never visualized) with no SLO/error-budget board; this adds both, self-contained
over the controller-runtime metrics that now reach AMP via the operator pod's
scrape annotation (eks-agent-platform#46).

dashboards/base/platform/agent-operator.yaml — GrafanaDashboard CR, four rows:
- Reconcile SLO & error budget (99% of reconciles <1s / 30d): inline fraction-
  under-1s SLI, budget remaining, and fast/slow burn (1h/6h) stats.
- Reconcile RED per controller: rate, error ratio, latency p50/p95/p99, and p99
  by controller with the 1s SLO line.
- Work queue & workers: depth, add rate, queue-wait p95, active workers.
- Reconciled fleet: Platforms Ready ratio + CRs by kind & phase (kube_customresource).

dashboards/base/alerting/agent-operator.yaml — GrafanaAlertRuleGroup (Grafana-
managed, evaluated by AMG): dual-window latency burn (14.4x fast / 6x slow,
page), reconcile error rate >5% (page), and operator-metrics-absent (page). Each
links its runbook. This is the prod path; the operator chart's PrometheusRule is
the kube-prometheus-stack mirror for kx — the header documents the split.

Registered in kustomization; .yamllint embedded-JSON ignore generalized to the
agent-* glob + the new alert file. kustomize build green.
---
 .yamllint.yaml                               |   1 +
 dashboards/base/alerting/agent-operator.yaml | 204 +++++++++++++++++++
 dashboards/base/kustomization.yaml           |   4 +
 dashboards/base/platform/agent-operator.yaml | 187 +++++++++++++++++
 4 files changed, 396 insertions(+)
 create mode 100644 dashboards/base/alerting/agent-operator.yaml
 create mode 100644 dashboards/base/platform/agent-operator.yaml

diff --git a/.yamllint.yaml b/.yamllint.yaml
index 5b8f014..c61f12e 100644
--- a/.yamllint.yaml
+++ b/.yamllint.yaml
@@ -9,6 +9,7 @@ ignore: |
   dashboards/base/platform/portal.yaml
   # Alert rule groups carry long PromQL exprs in their query model.
   dashboards/base/alerting/portal.yaml
+  dashboards/base/alerting/agent-operator.yaml
 
 rules:
   line-length:
diff --git a/dashboards/base/alerting/agent-operator.yaml b/dashboards/base/alerting/agent-operator.yaml
new file mode 100644
index 0000000..ddced97
--- /dev/null
+++ b/dashboards/base/alerting/agent-operator.yaml
@@ -0,0 +1,204 @@
+# eks-agent-platform operator — Grafana-managed SLO / health alert rules. The
+# latency SLO is "99% of reconciles complete in <1s over 30d" (budget 0.01); each
+# burn rule is a dual-window check encoded as a `> bool` product. Self-contained
+# over controller-runtime metrics, which reach AMP once the operator pod carries
+# the prometheus.io/scrape annotation (eks-agent-platform operator-prod-scrape).
+#
+# This is the PROD path (Grafana-managed, evaluated by Amazon Managed Grafana
+# against AMP). The operator chart's own PrometheusRule
+# (eks-agent-platform/charts/operator/files/slo/prometheusrule.yaml) is the
+# kube-prometheus-stack mirror — consumed only on the local kx cluster, which has
+# an in-cluster ruler. The two are the same SLO on different stacks; wire only one
+# of them to a given pager. The burn-rate model here is a more precise
+# expression of the 99%/30d objective than the PrometheusRule's coarser p99>1s.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaAlertRuleGroup
+metadata:
+  name: agent-operator-slo
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  folderRef: slo-alerts
+  interval: 1m
+  rules:
+    - uid: agent-operator-fast-burn
+      title: OperatorReconcileLatencyFastBurn
+      condition: B  # alert on the threshold expression, not on the raw query
+      for: 2m
+      noDataState: OK  # missing series are paged by OperatorMetricsAbsent instead
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-agent-platform
+        component: operator
+      annotations:
+        summary: operator reconcile latency budget burning fast (2% in 1h)
+        description: >
+          The fraction of reconciles over 1s exceeds 14.4x the 99% latency
+          objective over both the 1h and 5m windows. The control loop is slow —
+          check the operator's downstream calls (IAM/KMS/Athena/EventBridge).
+        runbook_url: https://github.com/nanohype/eks-agent-platform/blob/main/docs/runbooks/reconcile-latency.md
+      data:
+        - refId: A  # burn = ((total - good) / total) / 0.01 per window; result is 1 only when both 1h and 5m exceed 14.4x
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 3600, to: 0}
+          model:  # clamp_min(total, 1e-9) only guards 0/0, so an idle or low-traffic operator reads as its true (or zero) burn
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true  # both windows are encoded inside expr; one sample per evaluation
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "(((sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[1h])) - sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[1h]))) / clamp_min(sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[1h])), 1e-9)) / 0.01 > bool 14.4) * (((sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[5m])) - sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[5m]))) / clamp_min(sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[5m])), 1e-9)) / 0.01 > bool 14.4)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 3600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}  # A is a 0/1 product, so > 0 means both windows are burning
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: agent-operator-slow-burn
+      title: OperatorReconcileLatencySlowBurn
+      condition: B  # alert on the threshold expression, not on the raw query
+      for: 15m
+      noDataState: OK  # missing series are paged by OperatorMetricsAbsent instead
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-agent-platform
+        component: operator
+      annotations:
+        summary: operator reconcile latency budget burning (5% in 6h)
+        description: >
+          The fraction of reconciles over 1s exceeds 6x the 99% latency objective
+          over both the 6h and 30m windows — a sustained slow burn of the reconcile
+          latency budget. Investigate before it escalates.
+        runbook_url: https://github.com/nanohype/eks-agent-platform/blob/main/docs/runbooks/reconcile-latency.md
+      data:
+        - refId: A  # burn = ((total - good) / total) / 0.01 per window; result is 1 only when both 6h and 30m exceed 6x
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 21600, to: 0}
+          model:  # clamp_min(total, 1e-9) only guards 0/0, so an idle or low-traffic operator reads as its true (or zero) burn
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true  # both windows are encoded inside expr; one sample per evaluation
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "(((sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[6h])) - sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[6h]))) / clamp_min(sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[6h])), 1e-9)) / 0.01 > bool 6) * (((sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[30m])) - sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[30m]))) / clamp_min(sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[30m])), 1e-9)) / 0.01 > bool 6)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 21600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}  # A is a 0/1 product, so > 0 means both windows are burning
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: agent-operator-error-rate
+      title: OperatorReconcileErrorRateHigh
+      condition: B  # alert on the threshold expression, not on the raw query
+      for: 15m  # the 5m error ratio must stay above the threshold for 15m before paging
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-agent-platform
+        component: operator
+      annotations:
+        summary: operator reconcile error rate above 5%
+        description: >
+          More than 5% of reconciles are erroring over 15m. Probable cause: an
+          AWS-side outage or an operator IAM regression. Check CloudTrail for the
+          operator role and the controller logs.
+        runbook_url: https://github.com/nanohype/eks-agent-platform/blob/main/docs/runbooks/reconcile-errors.md
+      data:
+        - refId: A  # namespace-wide errors / total over 5m; the 0.001 floor avoids dividing by zero when idle
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 900, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "sum(rate(controller_runtime_reconcile_errors_total{namespace=\"eks-agent-platform\"}[5m])) / clamp_min(sum(rate(controller_runtime_reconcile_total{namespace=\"eks-agent-platform\"}[5m])), 0.001)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 900, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0.05]}  # 0.05 = the 5% error ratio named in the summary
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: agent-operator-down
+      title: OperatorMetricsAbsent
+      condition: B  # alert on the threshold expression, not on the raw query
+      for: 5m
+      noDataState: OK  # absent() returns no series while the metric exists, so "no data" is the healthy state here
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-agent-platform
+        component: operator
+      annotations:
+        summary: operator reconcile metrics absent — operator down or unscraped
+        description: >
+          No controller-runtime reconcile metrics are being reported for the
+          operator namespace. Either the deployment is down, leader election is
+          wedged, or the scrape has broken. Check the operator pods.
+        runbook_url: https://github.com/nanohype/eks-agent-platform/blob/main/docs/runbooks/operator-down.md
+      data:
+        - refId: A  # absent() yields a single 1 only when no reconcile_total series exists for the namespace
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 600, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "absent(controller_runtime_reconcile_total{namespace=\"eks-agent-platform\"})"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}  # fires on the 1 that absent() emits
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
diff --git a/dashboards/base/kustomization.yaml b/dashboards/base/kustomization.yaml
index 0d06f29..15ece60 100644
--- a/dashboards/base/kustomization.yaml
+++ b/dashboards/base/kustomization.yaml
@@ -17,6 +17,7 @@ resources:
   # reconciled onto the external Amazon Managed Grafana.
   - alerting/folder.yaml
   - alerting/portal.yaml
+  - alerting/agent-operator.yaml
   - platform/kubernetes-cluster.yaml
   - platform/kubernetes-views-pods.yaml
   - platform/kubernetes-views-namespaces.yaml
@@ -37,6 +38,9 @@ resources:
   - platform/agent-finance.yaml
   - platform/agent-ops.yaml
   - platform/agent-founder.yaml
+  # Operator reconcile RED + latency SLO/error-budget (controller-runtime metrics
+  # reach AMP via the operator pod's prometheus.io/scrape annotation).
+  - platform/agent-operator.yaml
   # Ops control-plane app (portal): API SLO/RED + tofu-run, River-job, watcher,
   # and pgxpool surfaces — self-contained PromQL over the portal_* metrics in AMP.
   - platform/portal.yaml
diff --git a/dashboards/base/platform/agent-operator.yaml b/dashboards/base/platform/agent-operator.yaml
new file mode 100644
index 0000000..169eb75
--- /dev/null
+++ b/dashboards/base/platform/agent-operator.yaml
@@ -0,0 +1,187 @@
+# eks-agent-platform — operator (reconcile RED + latency SLO)
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: agent-operator
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  resyncPeriod: 24h
+  json: |
+    {
+      "title": "eks-agent-platform — operator",
+      "uid": "agents-operator",
+      "tags": ["eks-agent-platform", "operator", "slo"],
+      "timezone": "browser",
+      "schemaVersion": 39,
+      "refresh": "30s",
+      "time": { "from": "now-6h", "to": "now" },
+      "panels": [
+        {
+          "type": "row",
+          "title": "Reconcile SLO & error budget (99% of reconciles < 1s / 30d)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
+        },
+        {
+          "type": "stat",
+          "title": "Reconciles under 1s (30d)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "decimals": 4, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "1 - ((sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[30d])) - sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[30d]))) / clamp_min(sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[30d])), 1e-9))" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 }
+        },
+        {
+          "type": "gauge",
+          "title": "Error budget remaining (30d)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.25 }, { "color": "green", "value": 0.5 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "clamp_min(1 - (((sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[30d])) - sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[30d]))) / clamp_min(sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[30d])), 1e-9)) / 0.01), 0)" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Fast burn (1h)",
+          "datasource": "prometheus",
+          "description": "Latency-budget burn over 1h. Pages at 14.4x (2% of budget in 1h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 14.4 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "((sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[1h])) - sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[1h]))) / clamp_min(sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[1h])), 1e-9)) / 0.01" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Slow burn (6h)",
+          "datasource": "prometheus",
+          "description": "Latency-budget burn over 6h. Pages at 6x (5% of budget in 6h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 6 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "((sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[6h])) - sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[6h]))) / clamp_min(sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[6h])), 1e-9)) / 0.01" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }
+        },
+        {
+          "type": "row",
+          "title": "Reconcile RED (per controller)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile rate by controller (rec/s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (controller) (rate(controller_runtime_reconcile_total{namespace=\"eks-agent-platform\"}[5m]))", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile error rate by controller",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "custom": { "thresholdsStyle": { "mode": "line" } }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.05 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (controller) (rate(controller_runtime_reconcile_errors_total{namespace=\"eks-agent-platform\"}[5m])) / clamp_min(sum by (controller) (rate(controller_runtime_reconcile_total{namespace=\"eks-agent-platform\"}[5m])), 0.001)", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile latency p50 / p95 / p99 (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.50, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "p50" },
+            { "expr": "histogram_quantile(0.95, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "p95" },
+            { "expr": "histogram_quantile(0.99, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "p99" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile p99 by controller (s) — SLO line at 1s",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s", "custom": { "thresholdsStyle": { "mode": "line" } }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.99, sum by (le, controller) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }
+        },
+        {
+          "type": "row",
+          "title": "Work queue & workers (saturation)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Workqueue depth by queue",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (name) (workqueue_depth{namespace=\"eks-agent-platform\"})", "legendFormat": "{{name}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 22 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Workqueue add rate by queue",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (name) (rate(workqueue_adds_total{namespace=\"eks-agent-platform\"}[5m]))", "legendFormat": "{{name}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 22 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Queue wait p95 by queue (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.95, sum by (le, name) (rate(workqueue_queue_duration_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "{{name}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 29 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Active workers by controller",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (controller) (controller_runtime_active_workers{namespace=\"eks-agent-platform\"})", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 29 }
+        },
+        {
+          "type": "row",
+          "title": "Reconciled fleet (CR state)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 36 }
+        },
+        {
+          "type": "stat",
+          "title": "Platforms Ready ratio",
+          "datasource": "prometheus",
+          "description": "Fraction of Platform CRs in the Ready phase — fed by the kube-state-metrics customResourceState config.",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.9 }, { "color": "green", "value": 1 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(count(kube_customresource_status_phase{customresource_kind=\"Platform\",customresource_phase=\"Ready\"} == 1) or vector(0)) / clamp_min(count(kube_customresource_status_phase{customresource_kind=\"Platform\"} == 1), 1)" }
+          ],
+          "gridPos": { "h": 7, "w": 8, "x": 0, "y": 37 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Custom resources by kind & phase",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (customresource_kind, customresource_phase) (kube_customresource_status_phase == 1)", "legendFormat": "{{customresource_kind}} / {{customresource_phase}}" }
+          ],
+          "gridPos": { "h": 7, "w": 16, "x": 8, "y": 37 }
+        }
+      ]
+    }