From 6081af102ef8a0985f1d903b51b6ea7166df7c12 Mon Sep 17 00:00:00 2001
From: stxkxs
Date: Mon, 22 Jun 2026 19:36:36 -0700
Subject: [PATCH] feat(dashboards): eks-fleet vend pipeline dashboard + alerts
 (blocked on hub observability)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vend-pipeline observability for eks-fleet, addressing the audit's F grade (no
dashboard, no instrumentation). Self-contained over the provider-opentofu
controller-runtime metrics that the eks-fleet vend-provider-scrape PR exposes.

dashboards/base/platform/fleet-vend.yaml — GrafanaDashboard CR, three rows:
- Vend reconcile SLO & error budget (99% reconcile success / 30d): inline
  success ratio, budget remaining, fast/slow burn.
- Vend provider RED: provider-opentofu reconcile rate/error-ratio/latency
  p50/p95/p99 by controller (each reconcile = a tofu plan/apply cycle).
- Work queue: depth, add rate, queue-wait p95, active workers (vend
  backpressure).

dashboards/base/alerting/fleet-vend.yaml — GrafanaAlertRuleGroup: dual-window
reconcile-error burn (fast/slow, page) + provider-absent (page).

DRAFT / BLOCKED: the eks-fleet hub registers as environment=hub, for which
eks-gitops has no env (no overlays/hub, no values-hub.yaml) and the fleet
account has no AMP/AMG — so the hub runs neither grafana-agent nor
grafana-operator and these would render nothing there. Blocked on #50 (wire the
hub into the observability fabric). Per-cluster vend inventory/readiness
(Cluster + Workspace CR state via KSM) is a follow-on in that issue.
---
 .yamllint.yaml                           |   2 +
 dashboards/base/alerting/fleet-vend.yaml | 159 +++++++++++++++++++++
 dashboards/base/kustomization.yaml       |   4 +
 dashboards/base/platform/fleet-vend.yaml | 163 +++++++++++++++++++++++
 4 files changed, 328 insertions(+)
 create mode 100644 dashboards/base/alerting/fleet-vend.yaml
 create mode 100644 dashboards/base/platform/fleet-vend.yaml

diff --git a/.yamllint.yaml b/.yamllint.yaml
index 5b8f014..16bc4f5 100644
--- a/.yamllint.yaml
+++ b/.yamllint.yaml
@@ -7,8 +7,10 @@ ignore: |
   # JSON is validated by kustomize build + the GrafanaDashboard schema, not yamllint.
   dashboards/base/platform/agent-*.yaml
   dashboards/base/platform/portal.yaml
+  dashboards/base/platform/fleet-vend.yaml
   # Alert rule groups carry long PromQL exprs in their query model.
   dashboards/base/alerting/portal.yaml
+  dashboards/base/alerting/fleet-vend.yaml
 
 rules:
   line-length:
diff --git a/dashboards/base/alerting/fleet-vend.yaml b/dashboards/base/alerting/fleet-vend.yaml
new file mode 100644
index 0000000..8b14262
--- /dev/null
+++ b/dashboards/base/alerting/fleet-vend.yaml
@@ -0,0 +1,159 @@
+# eks-fleet vend pipeline — Grafana-managed SLO / health alert rules. The SLO is
+# "99% of provider-opentofu reconciles succeed over 30d" (budget 0.01); a failing
+# reconcile is a tofu plan/apply error on a cluster Workspace. Self-contained over
+# the provider's controller-runtime metrics, which reach AMP once the
+# provider-opentofu pod carries the prometheus.io/scrape annotation (eks-fleet
+# vend-provider-scrape). Per-cluster vend inventory/readiness (Cluster + Workspace
+# CR state) is a follow-up that needs the kube-state-metrics customResourceState
+# config extended and validated on a live hub — tracked separately.
+#
+# Burn-rate exprs divide by `(reconcile rate > 0)`, not a clamped denominator:
+# clamp_min(rate, 1) understates the error ratio whenever reconciles run below
+# 1/s. A window with no reconciles yields no data, which noDataState maps to OK.
+# NOTE(review): selectors match all of crossplane-system, not just the provider —
+# confirm no other controller-runtime target is scraped there.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaAlertRuleGroup
+metadata:
+  name: fleet-vend-slo
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  folderRef: slo-alerts
+  interval: 1m
+  rules:
+    - uid: fleet-vend-fast-burn
+      title: FleetVendReconcileFastBurn
+      condition: B
+      for: 2m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-fleet
+        component: vend
+      annotations:
+        summary: fleet vend reconcile errors burning the budget fast (2% in 1h)
+        description: >
+          provider-opentofu reconcile errors exceed 14.4x the 99% success
+          objective over both the 1h and 5m windows. Vends are failing — inspect
+          the cluster Workspaces' Synced condition (kubectl get workspace) for the
+          tofu/apply error, and the provider-opentofu pod logs.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 3600, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "(sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[1h])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[1h])) > 0) / 0.01 > bool 14.4) * (sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[5m])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[5m])) > 0) / 0.01 > bool 14.4)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 3600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: fleet-vend-slow-burn
+      title: FleetVendReconcileSlowBurn
+      condition: B
+      for: 15m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-fleet
+        component: vend
+      annotations:
+        summary: fleet vend reconcile errors burning the budget (5% in 6h)
+        description: >
+          provider-opentofu reconcile errors exceed 6x the 99% success objective
+          over both the 6h and 30m windows — a sustained vend failure. Investigate
+          the failing Workspaces before the budget is exhausted.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 21600, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "(sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[6h])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[6h])) > 0) / 0.01 > bool 6) * (sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[30m])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[30m])) > 0) / 0.01 > bool 6)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 21600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: fleet-vend-provider-absent
+      title: FleetVendProviderAbsent
+      condition: B
+      for: 5m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: eks-fleet
+        component: vend
+      annotations:
+        summary: provider-opentofu reconcile metrics absent — the vend provider is down or unscraped
+        description: >
+          No controller-runtime reconcile metrics from crossplane-system. The
+          provider-opentofu deployment is down, crashlooping, or the scrape broke —
+          the hub cannot vend or reconcile clusters. Check the provider pod and the
+          provider's Healthy condition.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus
+          relativeTimeRange: {from: 600, to: 0}
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "absent(controller_runtime_reconcile_total{namespace=\"crossplane-system\"})"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange: {from: 600, to: 0}
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
diff --git a/dashboards/base/kustomization.yaml b/dashboards/base/kustomization.yaml
index 0d06f29..c3fe478 100644
--- a/dashboards/base/kustomization.yaml
+++ b/dashboards/base/kustomization.yaml
@@ -17,6 +17,7 @@ resources:
   # reconciled onto the external Amazon Managed Grafana.
   - alerting/folder.yaml
   - alerting/portal.yaml
+  - alerting/fleet-vend.yaml
   - platform/kubernetes-cluster.yaml
   - platform/kubernetes-views-pods.yaml
   - platform/kubernetes-views-namespaces.yaml
@@ -40,6 +41,9 @@ resources:
   # Ops control-plane app (portal): API SLO/RED + tofu-run, River-job, watcher,
   # and pgxpool surfaces — self-contained PromQL over the portal_* metrics in AMP.
   - platform/portal.yaml
+  # eks-fleet vend pipeline: provider-opentofu reconcile RED + work-queue +
+  # reconcile-success SLO (controller-runtime metrics scraped on the hub).
+  - platform/fleet-vend.yaml
   - addons/kyverno.yaml
   - addons/trivy-operator.yaml
   - addons/falco.yaml
diff --git a/dashboards/base/platform/fleet-vend.yaml b/dashboards/base/platform/fleet-vend.yaml
new file mode 100644
index 0000000..004e9b7
--- /dev/null
+++ b/dashboards/base/platform/fleet-vend.yaml
@@ -0,0 +1,163 @@
+# eks-fleet — cluster vend pipeline (provider-opentofu)
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: fleet-vend
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  resyncPeriod: 24h
+  json: |
+    {
+      "title": "eks-fleet — cluster vend pipeline",
+      "uid": "fleet-vend",
+      "tags": ["eks-fleet", "vend", "slo"],
+      "timezone": "browser",
+      "schemaVersion": 39,
+      "refresh": "30s",
+      "time": { "from": "now-6h", "to": "now" },
+      "panels": [
+        {
+          "type": "row",
+          "title": "Vend reconcile SLO & error budget (99% reconcile success / 30d)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
+        },
+        {
+          "type": "stat",
+          "title": "Reconcile success (30d)",
+          "datasource": "prometheus",
+          "description": "Fraction of provider-opentofu reconciles that did not error over 30d. Each reconcile is a tofu plan/apply cycle on a cluster Workspace. Shows no data when no reconciles were recorded in the window.",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "decimals": 4, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "1 - (sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[30d])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[30d])) > 0))" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 }
+        },
+        {
+          "type": "gauge",
+          "title": "Error budget remaining (30d)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.25 }, { "color": "green", "value": 0.5 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "clamp_min(1 - ((sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[30d])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[30d])) > 0)) / 0.01), 0)" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Fast burn (1h)",
+          "datasource": "prometheus",
+          "description": "Reconcile-error burn over 1h. Pages at 14.4x (2% of budget in 1h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 14.4 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[1h])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[1h])) > 0)) / 0.01" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Slow burn (6h)",
+          "datasource": "prometheus",
+          "description": "Reconcile-error burn over 6h. Pages at 6x (5% of budget in 6h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 6 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[6h])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[6h])) > 0)) / 0.01" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }
+        },
+        {
+          "type": "row",
+          "title": "Vend provider RED — provider-opentofu (each reconcile = a tofu plan/apply cycle)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile rate by controller (rec/s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (controller) (rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[5m]))", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile error rate by controller",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.05 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (controller) (rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[5m])) / (sum by (controller) (rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[5m])) > 0)", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile latency p50 / p95 / p99 (s) — vend step duration",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.50, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "p50" },
+            { "expr": "histogram_quantile(0.95, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "p95" },
+            { "expr": "histogram_quantile(0.99, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "p99" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Reconcile p99 by controller (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.99, sum by (le, controller) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }
+        },
+        {
+          "type": "row",
+          "title": "Work queue (pending vends / backpressure)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Workqueue depth by queue",
+          "datasource": "prometheus",
+          "description": "Reconcile requests waiting — sustained depth means vends are queuing faster than the provider drains them.",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (name) (workqueue_depth{namespace=\"crossplane-system\"})", "legendFormat": "{{name}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 22 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Workqueue add rate by queue",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (name) (rate(workqueue_adds_total{namespace=\"crossplane-system\"}[5m]))", "legendFormat": "{{name}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 22 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Queue wait p95 by queue (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.95, sum by (le, name) (rate(workqueue_queue_duration_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "{{name}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 29 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Active workers by controller",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (controller) (controller_runtime_active_workers{namespace=\"crossplane-system\"})", "legendFormat": "{{controller}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 29 }
+        }
+      ]
+    }