From 90b1d2462f0e6c3e99c25d4f40a90c8629116eaa Mon Sep 17 00:00:00 2001
From: stxkxs
Date: Mon, 22 Jun 2026 19:15:58 -0700
Subject: [PATCH] feat(dashboards): operator reconcile RED + latency SLO dashboard & alerts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Charts the eks-agent-platform operator's own control loop and gives its latency
SLO teeth in prod. The audit found the operator's reconcile RED was alert-only
(never visualized) with no SLO/error-budget board; this adds both, self-contained
over the controller-runtime metrics that now reach AMP via the operator pod's
scrape annotation (eks-agent-platform#46).

dashboards/base/platform/agent-operator.yaml — GrafanaDashboard CR, four rows:
- Reconcile SLO & error budget (99% of reconciles <1s / 30d): inline fraction-
  under-1s SLI, budget remaining, and fast/slow burn (1h/6h) stats.
- Reconcile RED per controller: rate, error ratio, latency p50/p95/p99, and p99
  by controller with the 1s SLO line.
- Work queue & workers: depth, add rate, queue-wait p95, active workers.
- Reconciled fleet: Platforms Ready ratio + CRs by kind & phase
  (kube_customresource).

dashboards/base/alerting/agent-operator.yaml — GrafanaAlertRuleGroup (Grafana-
managed, evaluated by AMG): dual-window latency burn (14.4x fast / 6x slow,
page), reconcile error rate >5% (page), and operator-metrics-absent (page). Each
links its runbook. This is the prod path; the operator chart's PrometheusRule is
the kube-prometheus-stack mirror for kx — the header documents the split.

Registered in kustomization; the .yamllint ignore list gains the new alert file
(the only .yamllint change in this patch; the new dashboard is presumably covered
by an agent-* ignore glob that is already present). kustomize build green.
---
 .yamllint.yaml                               |   1 +
 dashboards/base/alerting/agent-operator.yaml | 204 +++++++++++++++++++
 dashboards/base/kustomization.yaml           |   4 +
 dashboards/base/platform/agent-operator.yaml | 187 +++++++++++++++++
 4 files changed, 396 insertions(+)
 create mode 100644 dashboards/base/alerting/agent-operator.yaml
 create mode 100644 dashboards/base/platform/agent-operator.yaml

diff --git a/.yamllint.yaml b/.yamllint.yaml
index 5b8f014..c61f12e 100644
--- a/.yamllint.yaml
+++ b/.yamllint.yaml
@@ -9,6 +9,7 @@ ignore: |
   dashboards/base/platform/portal.yaml
   # Alert rule groups carry long PromQL exprs in their query model.
   dashboards/base/alerting/portal.yaml
+  dashboards/base/alerting/agent-operator.yaml
 
 rules:
   line-length:
diff --git a/dashboards/base/alerting/agent-operator.yaml b/dashboards/base/alerting/agent-operator.yaml
new file mode 100644
index 0000000..ddced97
--- /dev/null
+++ b/dashboards/base/alerting/agent-operator.yaml
@@ -0,0 +1,204 @@
+# eks-agent-platform operator — Grafana-managed SLO / health alert rules. The
+# latency SLO is "99% of reconciles complete in <1s over 30d" (budget 0.01); each
+# burn rule is a dual-window check encoded as a `> bool` product. Self-contained
+# over controller-runtime metrics, which reach AMP once the operator pod carries
+# the prometheus.io/scrape annotation (eks-agent-platform operator-prod-scrape).
+#
+# This is the PROD path (Grafana-managed, evaluated by Amazon Managed Grafana
+# against AMP). The operator chart's own PrometheusRule
+# (eks-agent-platform/charts/operator/files/slo/prometheusrule.yaml) is the
+# kube-prometheus-stack mirror — consumed only on the local kx cluster, which has
+# an in-cluster ruler. The two are the same SLO on different stacks; wire only one
+# of them to a given pager. The burn-rate model here is a more precise
+# expression of the 99%/30d objective than the PrometheusRule's coarser p99>1s.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaAlertRuleGroup
+metadata:
+  name: agent-operator-slo
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  folderRef: slo-alerts
+  interval: 1m
+  rules:
+    - uid: agent-operator-fast-burn  # fires when the 1h AND 5m burn rates both exceed 14.4x
+      title: OperatorReconcileLatencyFastBurn
+      condition: B  # B thresholds A's 0/1 output at > 0
+      for: 2m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-agent-platform
+        component: operator
+      annotations:
+        summary: operator reconcile latency budget burning fast (2% in 1h)
+        description: >
+          The fraction of reconciles over 1s exceeds 14.4x the 99% latency
+          objective over both the 1h and 5m windows. The control loop is slow —
+          check the operator's downstream calls (IAM/KMS/Athena/EventBridge).
+        runbook_url: https://github.com/nanohype/eks-agent-platform/blob/main/docs/runbooks/reconcile-latency.md
+      data:
+        - refId: A  # 0/1 product of both windows; the ratio divides by the raw reconcile rate, so an idle window (0/0 = NaN) compares false instead of reading as a 100x burn
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 3600, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "((1 - (sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[1h])) / sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[1h])))) / 0.01 > bool 14.4) * ((1 - (sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[5m])) / sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[5m])))) / 0.01 > bool 14.4)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 3600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: agent-operator-slow-burn  # fires when the 6h AND 30m burn rates both exceed 6x
+      title: OperatorReconcileLatencySlowBurn
+      condition: B  # B thresholds A's 0/1 output at > 0
+      for: 15m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-agent-platform
+        component: operator
+      annotations:
+        summary: operator reconcile latency budget burning (5% in 6h)
+        description: >
+          The fraction of reconciles over 1s exceeds 6x the 99% latency objective
+          over both the 6h and 30m windows — a sustained slow burn of the reconcile
+          latency budget. Investigate before it escalates.
+        runbook_url: https://github.com/nanohype/eks-agent-platform/blob/main/docs/runbooks/reconcile-latency.md
+      data:
+        - refId: A  # same shape as the fast-burn query: raw-rate denominator, idle windows evaluate to 0
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 21600, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "((1 - (sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[6h])) / sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[6h])))) / 0.01 > bool 6) * ((1 - (sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[30m])) / sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[30m])))) / 0.01 > bool 6)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 21600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: agent-operator-error-rate  # fires when the 5m error ratio stays above 5% for 15m
+      title: OperatorReconcileErrorRateHigh
+      condition: B
+      for: 15m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-agent-platform
+        component: operator
+      annotations:
+        summary: operator reconcile error rate above 5%
+        description: >
+          More than 5% of reconciles are erroring over 15m. Probable cause: an
+          AWS-side outage or an operator IAM regression. Check CloudTrail for the
+          operator role and the controller logs.
+        runbook_url: https://github.com/nanohype/eks-agent-platform/blob/main/docs/runbooks/reconcile-errors.md
+      data:
+        - refId: A  # errors / total; the 0.001 floor only guards the zero-traffic division (ratio reads 0 when idle)
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 900, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "sum(rate(controller_runtime_reconcile_errors_total{namespace=\"eks-agent-platform\"}[5m])) / clamp_min(sum(rate(controller_runtime_reconcile_total{namespace=\"eks-agent-platform\"}[5m])), 0.001)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 900, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0.05]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: agent-operator-down  # fires when no reconcile_total series exists for the namespace
+      title: OperatorMetricsAbsent
+      condition: B
+      for: 5m
+      noDataState: OK  # absent() returns nothing while the metric exists, so "no data" is the healthy state here
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-agent-platform
+        component: operator
+      annotations:
+        summary: operator reconcile metrics absent — operator down or unscraped
+        description: >
+          No controller-runtime reconcile metrics are being reported for the
+          operator namespace. Either the deployment is down, leader election is
+          wedged, or the scrape has broken. Check the operator pods.
+        runbook_url: https://github.com/nanohype/eks-agent-platform/blob/main/docs/runbooks/operator-down.md
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 600, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "absent(controller_runtime_reconcile_total{namespace=\"eks-agent-platform\"})"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
diff --git a/dashboards/base/kustomization.yaml b/dashboards/base/kustomization.yaml
index 0d06f29..15ece60 100644
--- a/dashboards/base/kustomization.yaml
+++ b/dashboards/base/kustomization.yaml
@@ -17,6 +17,7 @@ resources:
   # reconciled onto the external Amazon Managed Grafana.
   - alerting/folder.yaml
   - alerting/portal.yaml
+  - alerting/agent-operator.yaml
   - platform/kubernetes-cluster.yaml
   - platform/kubernetes-views-pods.yaml
   - platform/kubernetes-views-namespaces.yaml
@@ -37,6 +38,9 @@ resources:
   - platform/agent-finance.yaml
   - platform/agent-ops.yaml
   - platform/agent-founder.yaml
+  # Operator reconcile RED + latency SLO/error-budget (controller-runtime metrics
+  # reach AMP via the operator pod's prometheus.io/scrape annotation).
+  - platform/agent-operator.yaml
   # Ops control-plane app (portal): API SLO/RED + tofu-run, River-job, watcher,
   # and pgxpool surfaces — self-contained PromQL over the portal_* metrics in AMP.
   - platform/portal.yaml
diff --git a/dashboards/base/platform/agent-operator.yaml b/dashboards/base/platform/agent-operator.yaml
new file mode 100644
index 0000000..169eb75
--- /dev/null
+++ b/dashboards/base/platform/agent-operator.yaml
@@ -0,0 +1,187 @@
+# eks-agent-platform — operator (reconcile RED + latency SLO)
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: agent-operator
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  resyncPeriod: 24h
+  json: |  # SLO panels divide by the raw reconcile rate: an idle window reads NaN / no data rather than a fabricated burn
+    {
+      "title": "eks-agent-platform — operator",
+      "uid": "agents-operator",
+      "tags": ["eks-agent-platform", "operator", "slo"],
+      "timezone": "browser",
+      "schemaVersion": 39,
+      "refresh": "30s",
+      "time": { "from": "now-6h", "to": "now" },
+      "panels": [
+        {
+          "type": "row",
+          "title": "Reconcile SLO & error budget (99% of reconciles < 1s / 30d)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
+        },
+        {
+          "type": "stat",
+          "title": "Reconciles under 1s (30d)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "decimals": 4, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[30d])) / sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[30d]))" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 }
+        },
+        {
+          "type": "gauge",
+          "title": "Error budget remaining (30d)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.25 }, { "color": "green", "value": 0.5 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "clamp_min(1 - ((1 - (sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[30d])) / sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[30d])))) / 0.01), 0)" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Fast burn (1h)",
+          "datasource": "prometheus",
+          "description": "Latency-budget burn over 1h. Pages at 14.4x (2% of budget in 1h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 14.4 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(1 - (sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[1h])) / sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[1h])))) / 0.01" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Slow burn (6h)",
+          "datasource": "prometheus",
+          "description": "Latency-budget burn over 6h. Pages at 6x (5% of budget in 6h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 6 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(1 - (sum(rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\",le=\"1\"}[6h])) / sum(rate(controller_runtime_reconcile_time_seconds_count{namespace=\"eks-agent-platform\"}[6h])))) / 0.01" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }
+        },
+        {
+          "type": "row",
+          "title": "Reconcile RED (per controller)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile rate by controller (rec/s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (controller) (rate(controller_runtime_reconcile_total{namespace=\"eks-agent-platform\"}[5m]))", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile error rate by controller",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.05 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (controller) (rate(controller_runtime_reconcile_errors_total{namespace=\"eks-agent-platform\"}[5m])) / clamp_min(sum by (controller) (rate(controller_runtime_reconcile_total{namespace=\"eks-agent-platform\"}[5m])), 0.001)", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile latency p50 / p95 / p99 (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.50, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "p50" },
+            { "expr": "histogram_quantile(0.95, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "p95" },
+            { "expr": "histogram_quantile(0.99, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "p99" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile p99 by controller (s) — SLO line at 1s",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.99, sum by (le, controller) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }
+        },
+        {
+          "type": "row",
+          "title": "Work queue & workers (saturation)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Workqueue depth by queue",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (name) (workqueue_depth{namespace=\"eks-agent-platform\"})", "legendFormat": "{{name}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 22 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Workqueue add rate by queue",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (name) (rate(workqueue_adds_total{namespace=\"eks-agent-platform\"}[5m]))", "legendFormat": "{{name}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 22 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Queue wait p95 by queue (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.95, sum by (le, name) (rate(workqueue_queue_duration_seconds_bucket{namespace=\"eks-agent-platform\"}[5m])))", "legendFormat": "{{name}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 29 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Active workers by controller",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (controller) (controller_runtime_active_workers{namespace=\"eks-agent-platform\"})", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 29 }
+        },
+        {
+          "type": "row",
+          "title": "Reconciled fleet (CR state)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 36 }
+        },
+        {
+          "type": "stat",
+          "title": "Platforms Ready ratio",
+          "datasource": "prometheus",
+          "description": "Fraction of Platform CRs in the Ready phase — fed by the kube-state-metrics customResourceState config. Only active (== 1) phase series are counted, so each CR counts once; reads 0 when no Platform is Ready.",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.9 }, { "color": "green", "value": 1 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(count(kube_customresource_status_phase{customresource_kind=\"Platform\",customresource_phase=\"Ready\"} == 1) or vector(0)) / clamp_min(count(kube_customresource_status_phase{customresource_kind=\"Platform\"} == 1), 1)" }
+          ],
+          "gridPos": { "h": 7, "w": 8, "x": 0, "y": 37 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Custom resources by kind & phase",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (customresource_kind, customresource_phase) (kube_customresource_status_phase == 1)", "legendFormat": "{{customresource_kind}} / {{customresource_phase}}" }
+          ],
+          "gridPos": { "h": 7, "w": 16, "x": 8, "y": 37 }
+        }
+      ]
+    }