Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .yamllint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ ignore: |
# JSON is validated by kustomize build + the GrafanaDashboard schema, not yamllint.
dashboards/base/platform/agent-*.yaml
dashboards/base/platform/portal.yaml
dashboards/base/platform/fleet-vend.yaml
# Alert rule groups carry long PromQL exprs in their query model.
dashboards/base/alerting/portal.yaml
dashboards/base/alerting/fleet-vend.yaml

rules:
line-length:
Expand Down
153 changes: 153 additions & 0 deletions dashboards/base/alerting/fleet-vend.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# eks-fleet vend pipeline — Grafana-managed SLO / health alert rules. The SLO is
# "99% of provider-opentofu reconciles succeed over 30d" (budget 0.01); a failing
# reconcile is a tofu plan/apply error on a cluster Workspace. Self-contained over
# the provider's controller-runtime metrics, which reach AMP once the
# provider-opentofu pod carries the prometheus.io/scrape annotation (eks-fleet
# vend-provider-scrape). Per-cluster vend inventory/readiness (Cluster + Workspace
# CR state) is a follow-up that needs the kube-state-metrics customResourceState
# config extended and validated on a live hub — tracked separately.
#
# Burn rate = (error rate / reconcile rate) / 0.01. rate() is per-second, so the
# denominator is guarded with a "> 0" filter instead of clamp_min(..., 1): a floor
# of 1 rec/s would replace the real reconcile rate of any hub doing less than one
# reconcile per second and turn the ratio into raw errors/s, which can never reach
# the paging thresholds. With the filter, a window without reconciles returns no
# data, which noDataState: OK treats as healthy.
#
# Both burn rules are multi-window: the long window proves the burn is
# significant, the short window proves it is still happening. Each side is a
# "> bool" 0/1 value and the product is 1 only when both windows breach.
#
# NOTE(review): the selector is namespace="crossplane-system" only, so every
# controller-runtime series scraped from that namespace counts toward the SLO,
# not just provider-opentofu — confirm that is intended.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaAlertRuleGroup
metadata:
  name: fleet-vend-slo
spec:
  instanceSelector:
    matchLabels:
      dashboards: external
  folderRef: slo-alerts
  interval: 1m
  rules:
    # Fast burn: 14.4x over 1h AND 5m (2% of the 30d budget consumed in 1h).
    - uid: fleet-vend-fast-burn
      title: FleetVendReconcileFastBurn
      condition: B
      for: 2m
      noDataState: OK
      execErrState: Error
      isPaused: false
      labels:
        severity: page
        service: eks-fleet
        component: vend
      annotations:
        summary: fleet vend reconcile errors burning the budget fast (2% in 1h)
        description: >
          provider-opentofu reconcile errors exceed 14.4x the 99% success
          objective over both the 1h and 5m windows. Vends are failing — inspect
          the cluster Workspaces' Synced condition (kubectl get workspace) for the
          tofu/apply error, and the provider-opentofu pod logs.
      data:
        - refId: A
          datasourceUid: managed-prometheus
          relativeTimeRange: {from: 3600, to: 0}
          model:
            refId: A
            datasource: {type: prometheus, uid: managed-prometheus}
            editorMode: code
            instant: true
            range: false
            intervalMs: 1000
            maxDataPoints: 43200
            expr: "(sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[1h])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[1h])) > 0) / 0.01 > bool 14.4) * (sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[5m])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[5m])) > 0) / 0.01 > bool 14.4)"
        # B fires when the last value of A is > 0, i.e. both windows breached.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange: {from: 3600, to: 0}
          model:
            refId: B
            type: threshold
            datasource: {type: __expr__, uid: __expr__}
            expression: A
            conditions:
              - type: query
                evaluator: {type: gt, params: [0]}
                operator: {type: and}
                query: {params: [A]}
                reducer: {type: last, params: []}
    # Slow burn: 6x over 6h AND 30m (5% of the 30d budget consumed in 6h).
    - uid: fleet-vend-slow-burn
      title: FleetVendReconcileSlowBurn
      condition: B
      for: 15m
      noDataState: OK
      execErrState: Error
      isPaused: false
      labels:
        severity: page
        service: eks-fleet
        component: vend
      annotations:
        summary: fleet vend reconcile errors burning the budget (5% in 6h)
        description: >
          provider-opentofu reconcile errors exceed 6x the 99% success objective
          over both the 6h and 30m windows — a sustained vend failure. Investigate
          the failing Workspaces before the budget is exhausted.
      data:
        - refId: A
          datasourceUid: managed-prometheus
          relativeTimeRange: {from: 21600, to: 0}
          model:
            refId: A
            datasource: {type: prometheus, uid: managed-prometheus}
            editorMode: code
            instant: true
            range: false
            intervalMs: 1000
            maxDataPoints: 43200
            expr: "(sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[6h])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[6h])) > 0) / 0.01 > bool 6) * (sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[30m])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[30m])) > 0) / 0.01 > bool 6)"
        # B fires when the last value of A is > 0, i.e. both windows breached.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange: {from: 21600, to: 0}
          model:
            refId: B
            type: threshold
            datasource: {type: __expr__, uid: __expr__}
            expression: A
            conditions:
              - type: query
                evaluator: {type: gt, params: [0]}
                operator: {type: and}
                query: {params: [A]}
                reducer: {type: last, params: []}
    # Provider absent: absent() returns 1 only when no matching series exist.
    # While the metrics are present the query returns nothing, which
    # noDataState: OK maps to healthy.
    - uid: fleet-vend-provider-absent
      title: FleetVendProviderAbsent
      condition: B
      for: 5m
      noDataState: OK
      execErrState: Error
      isPaused: false
      labels:
        severity: page
        service: eks-fleet
        component: vend
      annotations:
        summary: provider-opentofu reconcile metrics absent — the vend provider is down or unscraped
        description: >
          No controller-runtime reconcile metrics from crossplane-system. The
          provider-opentofu deployment is down, crashlooping, or the scrape broke —
          the hub cannot vend or reconcile clusters. Check the provider pod and the
          provider's Healthy condition.
      data:
        - refId: A
          datasourceUid: managed-prometheus
          relativeTimeRange: {from: 600, to: 0}
          model:
            refId: A
            datasource: {type: prometheus, uid: managed-prometheus}
            editorMode: code
            instant: true
            range: false
            intervalMs: 1000
            maxDataPoints: 43200
            expr: "absent(controller_runtime_reconcile_total{namespace=\"crossplane-system\"})"
        # B fires when absent() produced its 1 sample (last value > 0).
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange: {from: 600, to: 0}
          model:
            refId: B
            type: threshold
            datasource: {type: __expr__, uid: __expr__}
            expression: A
            conditions:
              - type: query
                evaluator: {type: gt, params: [0]}
                operator: {type: and}
                query: {params: [A]}
                reducer: {type: last, params: []}
4 changes: 4 additions & 0 deletions dashboards/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ resources:
# reconciled onto the external Amazon Managed Grafana.
- alerting/folder.yaml
- alerting/portal.yaml
- alerting/fleet-vend.yaml
- platform/kubernetes-cluster.yaml
- platform/kubernetes-views-pods.yaml
- platform/kubernetes-views-namespaces.yaml
Expand All @@ -40,6 +41,9 @@ resources:
# Ops control-plane app (portal): API SLO/RED + tofu-run, River-job, watcher,
# and pgxpool surfaces — self-contained PromQL over the portal_* metrics in AMP.
- platform/portal.yaml
# eks-fleet vend pipeline: provider-opentofu reconcile RED + work-queue +
# reconcile-success SLO (controller-runtime metrics scraped on the hub).
- platform/fleet-vend.yaml
- addons/kyverno.yaml
- addons/trivy-operator.yaml
- addons/falco.yaml
Expand Down
163 changes: 163 additions & 0 deletions dashboards/base/platform/fleet-vend.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# eks-fleet — cluster vend pipeline (provider-opentofu)
#
# Three rows over the controller-runtime / workqueue metrics in the
# crossplane-system namespace: reconcile-success SLO and burn rates (budget
# 0.01), reconcile RED (rate / errors / latency), and work-queue backpressure.
#
# The SLO panels divide the per-second error rate by the per-second reconcile
# rate. The denominator is guarded with a "> 0" filter instead of
# clamp_min(..., 1): a floor of 1 rec/s would replace the real rate of any hub
# doing less than one reconcile per second and make the panels report raw
# errors/s as if it were an error ratio. With the filter, a window without any
# reconciles renders as "No data" rather than a misleading value.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: fleet-vend
spec:
  instanceSelector:
    matchLabels:
      dashboards: external
  resyncPeriod: 24h
  json: |
    {
      "title": "eks-fleet — cluster vend pipeline",
      "uid": "fleet-vend",
      "tags": ["eks-fleet", "vend", "slo"],
      "timezone": "browser",
      "schemaVersion": 39,
      "refresh": "30s",
      "time": { "from": "now-6h", "to": "now" },
      "panels": [
        {
          "type": "row",
          "title": "Vend reconcile SLO & error budget (99% reconcile success / 30d)",
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
        },
        {
          "type": "stat",
          "title": "Reconcile success (30d)",
          "datasource": "prometheus",
          "description": "Fraction of provider-opentofu reconciles that did not error over 30d. Each reconcile is a tofu plan/apply cycle on a cluster Workspace.",
          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "decimals": 4, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.99 } ] } }, "overrides": [] },
          "targets": [
            { "expr": "1 - (sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[30d])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[30d])) > 0))" }
          ],
          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 }
        },
        {
          "type": "gauge",
          "title": "Error budget remaining (30d)",
          "datasource": "prometheus",
          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.25 }, { "color": "green", "value": 0.5 } ] } }, "overrides": [] },
          "targets": [
            { "expr": "clamp_min(1 - ((sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[30d])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[30d])) > 0)) / 0.01), 0)" }
          ],
          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 }
        },
        {
          "type": "stat",
          "title": "Fast burn (1h)",
          "datasource": "prometheus",
          "description": "Reconcile-error burn over 1h. Pages at 14.4x (2% of budget in 1h).",
          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 14.4 } ] } }, "overrides": [] },
          "targets": [
            { "expr": "(sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[1h])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[1h])) > 0)) / 0.01" }
          ],
          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }
        },
        {
          "type": "stat",
          "title": "Slow burn (6h)",
          "datasource": "prometheus",
          "description": "Reconcile-error burn over 6h. Pages at 6x (5% of budget in 6h).",
          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 6 } ] } }, "overrides": [] },
          "targets": [
            { "expr": "(sum(rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[6h])) / (sum(rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[6h])) > 0)) / 0.01" }
          ],
          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }
        },
        {
          "type": "row",
          "title": "Vend provider RED — provider-opentofu (each reconcile = a tofu plan/apply cycle)",
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
        },
        {
          "type": "timeseries",
          "title": "Reconcile rate by controller (rec/s)",
          "datasource": "prometheus",
          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
          "targets": [
            { "expr": "sum by (controller) (rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[5m]))", "legendFormat": "{{controller}}" }
          ],
          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }
        },
        {
          "type": "timeseries",
          "title": "Reconcile error rate by controller",
          "datasource": "prometheus",
          "fieldConfig": { "defaults": { "unit": "percentunit", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.05 } ] } }, "overrides": [] },
          "targets": [
            { "expr": "sum by (controller) (rate(controller_runtime_reconcile_errors_total{namespace=\"crossplane-system\"}[5m])) / clamp_min(sum by (controller) (rate(controller_runtime_reconcile_total{namespace=\"crossplane-system\"}[5m])), 0.001)", "legendFormat": "{{controller}}" }
          ],
          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }
        },
        {
          "type": "timeseries",
          "title": "Reconcile latency p50 / p95 / p99 (s) — vend step duration",
          "datasource": "prometheus",
          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
          "targets": [
            { "expr": "histogram_quantile(0.50, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "p50" },
            { "expr": "histogram_quantile(0.95, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "p95" },
            { "expr": "histogram_quantile(0.99, sum by (le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "p99" }
          ],
          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }
        },
        {
          "type": "timeseries",
          "title": "Reconcile p99 by controller (s)",
          "datasource": "prometheus",
          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
          "targets": [
            { "expr": "histogram_quantile(0.99, sum by (le, controller) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "{{controller}}" }
          ],
          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }
        },
        {
          "type": "row",
          "title": "Work queue (pending vends / backpressure)",
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }
        },
        {
          "type": "timeseries",
          "title": "Workqueue depth by queue",
          "datasource": "prometheus",
          "description": "Reconcile requests waiting — sustained depth means vends are queuing faster than the provider drains them.",
          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
          "targets": [
            { "expr": "sum by (name) (workqueue_depth{namespace=\"crossplane-system\"})", "legendFormat": "{{name}}" }
          ],
          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 22 }
        },
        {
          "type": "timeseries",
          "title": "Workqueue add rate by queue",
          "datasource": "prometheus",
          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
          "targets": [
            { "expr": "sum by (name) (rate(workqueue_adds_total{namespace=\"crossplane-system\"}[5m]))", "legendFormat": "{{name}}" }
          ],
          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 22 }
        },
        {
          "type": "timeseries",
          "title": "Queue wait p95 by queue (s)",
          "datasource": "prometheus",
          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
          "targets": [
            { "expr": "histogram_quantile(0.95, sum by (le, name) (rate(workqueue_queue_duration_seconds_bucket{namespace=\"crossplane-system\"}[5m])))", "legendFormat": "{{name}}" }
          ],
          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 29 }
        },
        {
          "type": "timeseries",
          "title": "Active workers by controller",
          "datasource": "prometheus",
          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
          "targets": [
            { "expr": "sum by (controller) (controller_runtime_active_workers{namespace=\"crossplane-system\"})", "legendFormat": "{{controller}}" }
          ],
          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 29 }
        }
      ]
    }