diff --git a/.yamllint.yaml b/.yamllint.yaml
index 415ca9e..5b8f014 100644
--- a/.yamllint.yaml
+++ b/.yamllint.yaml
@@ -3,9 +3,12 @@ extends: default
 
 ignore: |
   catalog/druid/chart/templates/
-  # Persona dashboards embed Grafana JSON (PromQL exprs exceed line-length); the
+  # Authored dashboards embed Grafana JSON (PromQL exprs exceed line-length); the
   # JSON is validated by kustomize build + the GrafanaDashboard schema, not yamllint.
   dashboards/base/platform/agent-*.yaml
+  dashboards/base/platform/portal.yaml
+  # Alert rule groups carry long PromQL exprs in their query model.
+  dashboards/base/alerting/portal.yaml
 
 rules:
   line-length:
diff --git a/dashboards/base/alerting/folder.yaml b/dashboards/base/alerting/folder.yaml
new file mode 100644
index 0000000..86f6b7e
--- /dev/null
+++ b/dashboards/base/alerting/folder.yaml
@@ -0,0 +1,12 @@
+# Grafana folder that holds the authored SLO / burn-rate alert rules. The
+# grafana-operator creates it in the external Amazon Managed Grafana; the
+# GrafanaAlertRuleGroup CRs reference it by name via folderRef.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaFolder
+metadata:
+  name: slo-alerts
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  title: SLO & burn-rate alerts
diff --git a/dashboards/base/alerting/portal.yaml b/dashboards/base/alerting/portal.yaml
new file mode 100644
index 0000000..ebfc804
--- /dev/null
+++ b/dashboards/base/alerting/portal.yaml
@@ -0,0 +1,210 @@
+# portal — Grafana-managed SLO / burn-rate alert rules, reconciled onto the
+# external Amazon Managed Grafana by the grafana-operator. Mirrors the burn-rate
+# windows the observability-slo standard defines (nanohype#123); each burn rule is
+# a dual-window check (both the long and short window must exceed the factor),
+# encoded with PromQL `> bool` products so a single instant query yields 1/0.
+#
+# The error-ratio denominator is floored with clamp_min(..., 1e-9), which only
+# guards the no-traffic 0/0 case. A floor of 1 would mean 1 req/s and understate
+# the ratio (and so the burn rate) whenever real traffic is below that.
+# NOTE(review): with the true ratio, a few 5xx at very low volume can trip a
+# rule; confirm the `for:` windows are enough damping for portal's traffic.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaAlertRuleGroup
+metadata:
+  name: portal-slo
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  folderRef: slo-alerts
+  interval: 1m
+  rules:
+    - uid: portal-fast-burn
+      title: PortalErrorBudgetFastBurn
+      condition: B
+      for: 2m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: portal
+      annotations:
+        summary: portal is burning its error budget fast (2% in 1h)
+        description: >
+          API 5xx burn rate exceeds 14.4x the 99.9% availability objective over
+          both the 1h and 5m windows. At this rate the 30-day error budget is
+          exhausted within ~2 days. Check recent deploys and upstream health.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus
+          relativeTimeRange:
+            from: 3600
+            to: 0
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[1h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[1h])), 1e-9) / 0.001 > bool 14.4) * (sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[5m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[5m])), 1e-9) / 0.001 > bool 14.4)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 3600
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: portal-slow-burn
+      title: PortalErrorBudgetSlowBurn
+      condition: B
+      for: 15m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: portal
+      annotations:
+        summary: portal is burning its error budget (5% in 6h)
+        description: >
+          API 5xx burn rate exceeds 6x the 99.9% availability objective over both
+          the 6h and 30m windows — a sustained, slower budget burn. Investigate
+          before it escalates to a fast burn.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus
+          relativeTimeRange:
+            from: 21600
+            to: 0
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[6h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[6h])), 1e-9) / 0.001 > bool 6) * (sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[30m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[30m])), 1e-9) / 0.001 > bool 6)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 21600
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: portal-watcher-stall
+      title: PortalWatcherStalled
+      condition: B
+      for: 5m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: portal
+      annotations:
+        summary: a portal watcher loop has stalled
+        description: >
+          The most-stale watcher loop has not ticked in over 15 minutes — the
+          tenant/cluster inventory it reconciles is going stale. Check the worker
+          logs for a wedged or panicking loop.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus
+          relativeTimeRange:
+            from: 1800
+            to: 0
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "max(time() - portal_watcher_last_tick_timestamp_seconds)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 1800
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [900]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: portal-worker-job-errors
+      title: PortalWorkerJobErrorsHigh
+      condition: B
+      for: 15m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: ticket
+        service: portal
+      annotations:
+        summary: portal worker jobs are erroring
+        description: >
+          River job errors/panics are sustained above 0.1/s over 15m — a job kind
+          is failing or silently retrying. Check portal_worker_job_errors_total by
+          kind and the worker logs.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus
+          relativeTimeRange:
+            from: 900
+            to: 0
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "sum(rate(portal_worker_job_errors_total[15m]))"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 900
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0.1]}
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
diff --git a/dashboards/base/datasources/prometheus.yaml b/dashboards/base/datasources/prometheus.yaml
index 041b2d8..a654e88 100644
--- a/dashboards/base/datasources/prometheus.yaml
+++ b/dashboards/base/datasources/prometheus.yaml
@@ -16,6 +16,9 @@ spec:
       dashboards: external
   datasource:
     name: ManagedPrometheus
+    # Pinned UID so Grafana-managed alert rules (GrafanaAlertRuleGroup) can
+    # reference this datasource deterministically instead of relying on isDefault.
+    uid: managed-prometheus
     type: prometheus
     access: proxy
     url: https://aps-workspaces.us-west-2.amazonaws.com/workspaces/PLACEHOLDER  # patched per-env
diff --git a/dashboards/base/kustomization.yaml b/dashboards/base/kustomization.yaml
index 65dd503..0d06f29 100644
--- a/dashboards/base/kustomization.yaml
+++ b/dashboards/base/kustomization.yaml
@@ -13,6 +13,10 @@ resources:
   - datasources/prometheus.yaml
   - datasources/loki.yaml
   - datasources/tempo.yaml
+  # Grafana-managed SLO / burn-rate alert rules (folder + per-system rule groups),
+  # reconciled onto the external Amazon Managed Grafana.
+  - alerting/folder.yaml
+  - alerting/portal.yaml
   - platform/kubernetes-cluster.yaml
   - platform/kubernetes-views-pods.yaml
   - platform/kubernetes-views-namespaces.yaml
@@ -33,6 +37,9 @@ resources:
   - platform/agent-finance.yaml
   - platform/agent-ops.yaml
   - platform/agent-founder.yaml
+  # Ops control-plane app (portal): API SLO/RED + tofu-run, River-job, watcher,
+  # and pgxpool surfaces — self-contained PromQL over the portal_* metrics in AMP.
+  - platform/portal.yaml
   - addons/kyverno.yaml
   - addons/trivy-operator.yaml
   - addons/falco.yaml
diff --git a/dashboards/base/platform/portal.yaml b/dashboards/base/platform/portal.yaml
new file mode 100644
index 0000000..74cece2
--- /dev/null
+++ b/dashboards/base/platform/portal.yaml
@@ -0,0 +1,223 @@
+# portal — ops control-plane (server + worker)
+# Error-ratio denominators use clamp_min(..., 1e-9): a divide-by-zero guard only,
+# so traffic below 1 req/s is not understated in the SLO / burn-rate panels.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: portal
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  resyncPeriod: 24h
+  json: |
+    {
+      "title": "portal — ops control-plane",
+      "uid": "portal-overview",
+      "tags": ["portal", "slo", "nanohype"],
+      "timezone": "browser",
+      "schemaVersion": 39,
+      "refresh": "30s",
+      "time": { "from": "now-6h", "to": "now" },
+      "panels": [
+        {
+          "type": "row",
+          "title": "API — SLO & error budget (99.9% availability / 30d)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
+        },
+        {
+          "type": "stat",
+          "title": "API availability (30d)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "decimals": 4, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.999 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "1 - (sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[30d])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[30d])), 1e-9))" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 }
+        },
+        {
+          "type": "gauge",
+          "title": "Error budget remaining (30d)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.25 }, { "color": "green", "value": 0.5 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "clamp_min(1 - ((sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[30d])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[30d])), 1e-9)) / 0.001), 0)" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Fast burn (1h)",
+          "datasource": "prometheus",
+          "description": "Burn rate over 1h. Pages at 14.4x (2% of budget in 1h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 14.4 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[1h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[1h])), 1e-9)) / 0.001" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Slow burn (6h)",
+          "datasource": "prometheus",
+          "description": "Burn rate over 6h. Pages at 6x (5% of budget in 6h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 6 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[6h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[6h])), 1e-9)) / 0.001" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }
+        },
+        {
+          "type": "row",
+          "title": "API — golden signals (RED + saturation)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Request rate by status (req/s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (status) (rate(portal_http_request_duration_seconds_count[5m]))", "legendFormat": "{{status}}" },
+            { "expr": "sum(rate(portal_http_request_duration_seconds_count[5m]))", "legendFormat": "total" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "5xx error ratio (5m)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.001 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[5m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[5m])), 1e-9)", "legendFormat": "5xx ratio" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Request latency p50 / p95 / p99 (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.50, sum by (le) (rate(portal_http_request_duration_seconds_bucket[5m])))", "legendFormat": "p50" },
+            { "expr": "histogram_quantile(0.95, sum by (le) (rate(portal_http_request_duration_seconds_bucket[5m])))", "legendFormat": "p95" },
+            { "expr": "histogram_quantile(0.99, sum by (le) (rate(portal_http_request_duration_seconds_bucket[5m])))", "legendFormat": "p99" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Slowest routes (p99, s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "topk(10, histogram_quantile(0.99, sum by (le, route) (rate(portal_http_request_duration_seconds_bucket[5m]))))", "legendFormat": "{{route}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Saturation — in-flight + DB pool",
+          "datasource": "prometheus",
+          "description": "HTTP requests in flight and pgx pool acquired vs max.",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum(portal_http_requests_in_flight)", "legendFormat": "http in-flight" },
+            { "expr": "sum(portal_db_pool_connections_acquired)", "legendFormat": "db acquired" },
+            { "expr": "max(portal_db_pool_connections_max)", "legendFormat": "db max" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 21 }
+        },
+        {
+          "type": "timeseries",
+          "title": "DB pool acquire-wait rate",
+          "datasource": "prometheus",
+          "description": "Cumulative connection-acquire waits — climbing means pool contention.",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum(rate(portal_db_pool_acquire_wait_total[5m]))", "legendFormat": "acquire waits/s" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 21 }
+        },
+        {
+          "type": "row",
+          "title": "tofu / terragrunt runs",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Run rate by operation & status (runs/h)",
+          "datasource": "prometheus",
+          "description": "tofu/terragrunt run completions — the core infra-execution health signal.",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (operation, status) (rate(portal_tofu_run_duration_seconds_count[1h]) * 3600)", "legendFormat": "{{operation}} {{status}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Run duration p50 / p95 by operation (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.50, sum by (le, operation) (rate(portal_tofu_run_duration_seconds_bucket[30m])))", "legendFormat": "p50 {{operation}}" },
+            { "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(portal_tofu_run_duration_seconds_bucket[30m])))", "legendFormat": "p95 {{operation}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
+        },
+        {
+          "type": "row",
+          "title": "worker — River jobs",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Jobs by state (queue depth & backlog)",
+          "datasource": "prometheus",
+          "description": "available/pending/scheduled = backlog; running = active; retryable/discarded/cancelled = trouble.",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (state) (portal_worker_jobs)", "legendFormat": "{{state}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 38 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Job errors & panics (per 5m)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (kind, event) (rate(portal_worker_job_errors_total[5m]))", "legendFormat": "{{kind}} {{event}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 38 }
+        },
+        {
+          "type": "row",
+          "title": "watcher loops (liveness)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 46 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Time since last successful tick (s)",
+          "datasource": "prometheus",
+          "description": "A loop that stalls climbs without bound — the tenant/cluster watchers' staleness signal.",
+          "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 300 }, { "color": "red", "value": 900 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "time() - max by (loop) (portal_watcher_last_tick_timestamp_seconds)", "legendFormat": "{{loop}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 47 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Tick p95 (s) & panic rate",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.95, sum by (le, loop) (rate(portal_watcher_tick_duration_seconds_bucket[5m])))", "legendFormat": "p95 {{loop}}" },
+            { "expr": "sum by (loop) (rate(portal_watcher_panics_total[5m]))", "legendFormat": "panics/s {{loop}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 47 }
+        }
+      ]
+    }