nanohype · stxkxs · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/.yamllint.yaml b/.yamllint.yaml
@@ -3,9 +3,12 @@ extends: default
 
 ignore: |
   catalog/druid/chart/templates/
-  # Persona dashboards embed Grafana JSON (PromQL exprs exceed line-length); the
+  # Authored dashboards embed Grafana JSON (PromQL exprs exceed line-length); the
   # JSON is validated by kustomize build + the GrafanaDashboard schema, not yamllint.
   dashboards/base/platform/agent-*.yaml
+  dashboards/base/platform/portal.yaml
+  # Alert rule groups carry long PromQL exprs in their query model.
+  dashboards/base/alerting/portal.yaml
 
 rules:
   line-length:

diff --git a/dashboards/base/alerting/folder.yaml b/dashboards/base/alerting/folder.yaml
@@ -0,0 +1,12 @@
+# Grafana folder that holds the authored SLO / burn-rate alert rules. The
+# grafana-operator creates it in the external Amazon Managed Grafana; the
+# GrafanaAlertRuleGroup CRs reference it by name via folderRef.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaFolder
+metadata:
+  name: slo-alerts  # referenced by folderRef in alerting/portal.yaml
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external  # same selector the portal-slo rule group uses
+  title: SLO & burn-rate alerts  # '&' mid-scalar is literal, not an anchor
diff --git a/dashboards/base/alerting/portal.yaml b/dashboards/base/alerting/portal.yaml
@@ -0,0 +1,204 @@
+# portal — Grafana-managed SLO / burn-rate alert rules, reconciled onto the
+# external Amazon Managed Grafana by the grafana-operator. Mirrors the burn-rate
+# windows the observability-slo standard defines (nanohype#123); each burn rule is
+# a dual-window check (both the long and short window must exceed the factor),
+# encoded with PromQL `> bool` products so a single instant query yields 1/0.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaAlertRuleGroup
+metadata:
+  name: portal-slo
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external  # same selector as the slo-alerts GrafanaFolder
+  folderRef: slo-alerts  # metadata.name of the GrafanaFolder in alerting/folder.yaml
+  interval: 1m  # presumably the group's evaluation interval — confirm against the CRD
+  rules:  # every rule below: A = instant Prometheus query, B = threshold expression on A
+    - uid: portal-fast-burn
+      title: PortalErrorBudgetFastBurn
+      condition: B  # refId of the threshold expression that decides the alert
+      for: 2m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: portal
+      annotations:
+        summary: portal is burning its error budget fast (2% in 1h)
+        description: >
+          API 5xx burn rate exceeds 14.4x the 99.9% availability objective over
+          both the 1h and 5m windows. At this rate the 30-day error budget is
+          exhausted within ~2 days. Check recent deploys and upstream health.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus  # UID pinned in datasources/prometheus.yaml
+          relativeTimeRange:
+            from: 3600  # 3600 s = 1h, the long window used in expr
+            to: 0
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true  # single-sample query; expr already collapses to 1 or 0
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200  # NOTE(review): expr below floors the request rate at 1/s (clamp_min), damping the ratio under 1 req/s — confirm intended
+            expr: "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[1h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[1h])), 1) / 0.001 > bool 14.4) * (sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[5m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[5m])), 1) / 0.001 > bool 14.4)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 3600
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A  # input is query A's result
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}  # A is 1 only when both windows exceed 14.4x
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: portal-slow-burn
+      title: PortalErrorBudgetSlowBurn
+      condition: B  # refId of the threshold expression that decides the alert
+      for: 15m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: portal
+      annotations:
+        summary: portal is burning its error budget (5% in 6h)
+        description: >
+          API 5xx burn rate exceeds 6x the 99.9% availability objective over both
+          the 6h and 30m windows — a sustained, slower budget burn. Investigate
+          before it escalates to a fast burn.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus  # UID pinned in datasources/prometheus.yaml
+          relativeTimeRange:
+            from: 21600  # 21600 s = 6h, the long window used in expr
+            to: 0
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true  # single-sample query; expr already collapses to 1 or 0
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200  # NOTE(review): expr below floors the request rate at 1/s (clamp_min), damping the ratio under 1 req/s — confirm intended
+            expr: "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[6h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[6h])), 1) / 0.001 > bool 6) * (sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[30m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[30m])), 1) / 0.001 > bool 6)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 21600
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A  # input is query A's result
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}  # A is 1 only when both windows exceed 6x
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: portal-watcher-stall
+      title: PortalWatcherStalled
+      condition: B  # refId of the threshold expression that decides the alert
+      for: 5m  # NOTE(review): on top of the 900 s threshold, so presumably fires ~20 min after the last tick — confirm
+      noDataState: OK  # NOTE(review): if the metric is absent (worker down?) this reads as OK — confirm intended
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: portal
+      annotations:
+        summary: a portal watcher loop has stalled
+        description: >
+          The most-stale watcher loop has not ticked in over 15 minutes — the
+          tenant/cluster inventory it reconciles is going stale. Check the worker
+          logs for a wedged or panicking loop.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus  # UID pinned in datasources/prometheus.yaml
+          relativeTimeRange:
+            from: 1800  # 1800 s = 30 min
+            to: 0
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200  # expr below: seconds since the last tick, max across all watcher series
+            expr: "max(time() - portal_watcher_last_tick_timestamp_seconds)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 1800
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A  # input is query A's result
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [900]}  # 900 s = the 15 minutes named in the description
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: portal-worker-job-errors
+      title: PortalWorkerJobErrorsHigh
+      condition: B  # refId of the threshold expression that decides the alert
+      for: 15m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: ticket  # the only rule in this group that is not severity: page
+        service: portal
+      annotations:
+        summary: portal worker jobs are erroring
+        description: >
+          River job errors/panics are sustained above 0.1/s over 15m — a job kind
+          is failing or silently retrying. Check portal_worker_job_errors_total by
+          kind and the worker logs.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus  # UID pinned in datasources/prometheus.yaml
+          relativeTimeRange:
+            from: 900  # 900 s = 15 min, matching the [15m] rate window in expr
+            to: 0
+          model:
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "sum(rate(portal_worker_job_errors_total[15m]))"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 900
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A  # input is query A's result
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0.1]}  # errors per second, summed over all series
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
diff --git a/dashboards/base/datasources/prometheus.yaml b/dashboards/base/datasources/prometheus.yaml
@@ -16,6 +16,9 @@ spec:
       dashboards: external
   datasource:
     name: ManagedPrometheus
+    # Pinned UID so Grafana-managed alert rules (GrafanaAlertRuleGroup) can
+    # reference this datasource deterministically instead of relying on isDefault.
+    uid: managed-prometheus  # NOTE(review): newer operators prefer spec.uid — confirm
     type: prometheus
     access: proxy
     url: https://aps-workspaces.us-west-2.amazonaws.com/workspaces/PLACEHOLDER  # patched per-env

diff --git a/dashboards/base/kustomization.yaml b/dashboards/base/kustomization.yaml
@@ -13,6 +13,10 @@ resources:
   - datasources/prometheus.yaml
   - datasources/loki.yaml
   - datasources/tempo.yaml
+  # Grafana-managed SLO / burn-rate alert rules (folder + per-system rule groups),
+  # reconciled onto the external Amazon Managed Grafana.
+  - alerting/folder.yaml
+  - alerting/portal.yaml
   - platform/kubernetes-cluster.yaml
   - platform/kubernetes-views-pods.yaml
   - platform/kubernetes-views-namespaces.yaml
@@ -33,6 +37,9 @@ resources:
   - platform/agent-finance.yaml
   - platform/agent-ops.yaml
   - platform/agent-founder.yaml
+  # Ops control-plane app (portal): API SLO/RED + tofu-run, River-job, watcher,
+  # and pgxpool surfaces — self-contained PromQL over the portal_* metrics in AMP.
+  - platform/portal.yaml
   - addons/kyverno.yaml
   - addons/trivy-operator.yaml
   - addons/falco.yaml