diff --git a/.yamllint.yaml b/.yamllint.yaml
index 415ca9e..5b8f014 100644
--- a/.yamllint.yaml
+++ b/.yamllint.yaml
@@ -3,9 +3,12 @@ extends: default
 
 ignore: |
   catalog/druid/chart/templates/
-  # Persona dashboards embed Grafana JSON (PromQL exprs exceed line-length); the
+  # Authored dashboards embed Grafana JSON (PromQL exprs exceed line-length); the
   # JSON is validated by kustomize build + the GrafanaDashboard schema, not yamllint.
   dashboards/base/platform/agent-*.yaml
+  dashboards/base/platform/portal.yaml
+  # Alert rule groups carry long PromQL exprs in their query model.
+  dashboards/base/alerting/portal.yaml
 
 rules:
   line-length:
diff --git a/dashboards/base/alerting/folder.yaml b/dashboards/base/alerting/folder.yaml
new file mode 100644
index 0000000..86f6b7e
--- /dev/null
+++ b/dashboards/base/alerting/folder.yaml
@@ -0,0 +1,12 @@
+# GrafanaFolder holding the authored SLO / burn-rate alert rules. The
+# grafana-operator reconciles it onto the external Amazon Managed Grafana;
+# GrafanaAlertRuleGroup CRs select it by metadata.name through folderRef.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaFolder
+metadata:
+  name: slo-alerts
+spec:
+  title: 'SLO & burn-rate alerts'
+  instanceSelector:
+    matchLabels:
+      dashboards: external
diff --git a/dashboards/base/alerting/portal.yaml b/dashboards/base/alerting/portal.yaml
new file mode 100644
index 0000000..ebfc804
--- /dev/null
+++ b/dashboards/base/alerting/portal.yaml
@@ -0,0 +1,204 @@
+# portal — Grafana-managed SLO / burn-rate alert rules, reconciled onto the
+# external Amazon Managed Grafana by the grafana-operator. Mirrors the burn-rate
+# windows the observability-slo standard defines (nanohype#123); each burn rule is
+# a dual-window check (both the long and short window must exceed the factor),
+# encoded with PromQL `> bool` products so a single instant query yields 1/0.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaAlertRuleGroup
+metadata:
+  name: portal-slo
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external  # same selector as the GrafanaFolder in alerting/folder.yaml
+  folderRef: slo-alerts  # metadata.name of the GrafanaFolder in alerting/folder.yaml
+  interval: 1m  # NOTE(review): presumably the evaluation interval shared by all rules below — confirm against the CRD
+  rules:
+    - uid: portal-fast-burn
+      title: PortalErrorBudgetFastBurn
+      condition: B  # fire on the threshold expression (refId B), not on the raw query
+      for: 2m
+      noDataState: OK  # with no 5xx series query A returns nothing; that is treated as healthy
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: portal
+      annotations:
+        summary: portal is burning its error budget fast (2% in 1h)
+        description: >
+          API 5xx burn rate exceeds 14.4x the 99.9% availability objective over
+          both the 1h and 5m windows. At this rate the 30-day error budget is
+          exhausted within ~2 days. Check recent deploys and upstream health.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus  # UID pinned in datasources/prometheus.yaml
+          relativeTimeRange:
+            from: 3600  # seconds; same span as the 1h long window
+            to: 0
+          model:  # burn = (5xx rate / total rate) / 0.001; the 1e-9 floor only guards 0/0 and must stay far below real traffic
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true  # one sample per evaluation; the windows live in the rate() ranges
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[1h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[1h])), 1e-9) / 0.001 > bool 14.4) * (sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[5m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[5m])), 1e-9) / 0.001 > bool 14.4)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 3600
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}  # A is 0 or 1, so > 0 means both windows breached
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: portal-slow-burn
+      title: PortalErrorBudgetSlowBurn
+      condition: B  # fire on the threshold expression (refId B), not on the raw query
+      for: 15m
+      noDataState: OK  # with no 5xx series query A returns nothing; that is treated as healthy
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: portal
+      annotations:
+        summary: portal is burning its error budget (5% in 6h)
+        description: >
+          API 5xx burn rate exceeds 6x the 99.9% availability objective over both
+          the 6h and 30m windows — a sustained, slower budget burn. Investigate
+          before it escalates to a fast burn.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus  # UID pinned in datasources/prometheus.yaml
+          relativeTimeRange:
+            from: 21600  # seconds; same span as the 6h long window
+            to: 0
+          model:  # burn = (5xx rate / total rate) / 0.001; the 1e-9 floor only guards 0/0 and must stay far below real traffic
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true  # one sample per evaluation; the windows live in the rate() ranges
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[6h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[6h])), 1e-9) / 0.001 > bool 6) * (sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[30m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[30m])), 1e-9) / 0.001 > bool 6)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 21600
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0]}  # A is 0 or 1, so > 0 means both windows breached
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: portal-watcher-stall
+      title: PortalWatcherStalled
+      condition: B  # fire on the threshold expression (refId B), not on the raw query
+      for: 5m
+      noDataState: OK  # NOTE(review): if the worker stops exporting the metric, A returns no data and this rule stays OK — confirm that is intended
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: page
+        service: portal
+      annotations:
+        summary: a portal watcher loop has stalled
+        description: >
+          The most-stale watcher loop has not ticked in over 15 minutes — the
+          tenant/cluster inventory it reconciles is going stale. Check the worker
+          logs for a wedged or panicking loop.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus  # UID pinned in datasources/prometheus.yaml
+          relativeTimeRange:
+            from: 1800
+            to: 0
+          model:  # expr yields the age in seconds of the oldest last-tick timestamp across all series
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "max(time() - portal_watcher_last_tick_timestamp_seconds)"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 1800
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [900]}  # 900 s = the 15 minutes quoted in the description
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
+    - uid: portal-worker-job-errors
+      title: PortalWorkerJobErrorsHigh
+      condition: B  # fire on the threshold expression (refId B), not on the raw query
+      for: 15m
+      noDataState: OK
+      execErrState: Error
+      isPaused: false
+      labels:
+        severity: ticket  # the other three rules in this group use severity: page
+        service: portal
+      annotations:
+        summary: portal worker jobs are erroring
+        description: >
+          River job errors/panics are sustained above 0.1/s over 15m — a job kind
+          is failing or silently retrying. Check portal_worker_job_errors_total by
+          kind and the worker logs.
+      data:
+        - refId: A
+          datasourceUid: managed-prometheus  # UID pinned in datasources/prometheus.yaml
+          relativeTimeRange:
+            from: 900  # seconds; same span as the 15m rate window
+            to: 0
+          model:  # expr is the per-second error rate summed over every series of the counter
+            refId: A
+            datasource: {type: prometheus, uid: managed-prometheus}
+            editorMode: code
+            instant: true
+            range: false
+            intervalMs: 1000
+            maxDataPoints: 43200
+            expr: "sum(rate(portal_worker_job_errors_total[15m]))"
+        - refId: B
+          datasourceUid: __expr__
+          relativeTimeRange:
+            from: 900
+            to: 0
+          model:
+            refId: B
+            type: threshold
+            datasource: {type: __expr__, uid: __expr__}
+            expression: A
+            conditions:
+              - type: query
+                evaluator: {type: gt, params: [0.1]}  # errors per second, matching the 0.1/s in the description
+                operator: {type: and}
+                query: {params: [A]}
+                reducer: {type: last, params: []}
diff --git a/dashboards/base/datasources/prometheus.yaml b/dashboards/base/datasources/prometheus.yaml
index 041b2d8..a654e88 100644
--- a/dashboards/base/datasources/prometheus.yaml
+++ b/dashboards/base/datasources/prometheus.yaml
@@ -16,6 +16,9 @@ spec:
       dashboards: external
   datasource:
     name: ManagedPrometheus
+    # Pinned UID so Grafana-managed alert rules (GrafanaAlertRuleGroup) can
+    # reference this datasource deterministically instead of relying on isDefault.
+    uid: managed-prometheus
     type: prometheus
     access: proxy
     url: https://aps-workspaces.us-west-2.amazonaws.com/workspaces/PLACEHOLDER  # patched per-env
diff --git a/dashboards/base/kustomization.yaml b/dashboards/base/kustomization.yaml
index 65dd503..0d06f29 100644
--- a/dashboards/base/kustomization.yaml
+++ b/dashboards/base/kustomization.yaml
@@ -13,6 +13,10 @@ resources:
   - datasources/prometheus.yaml
   - datasources/loki.yaml
   - datasources/tempo.yaml
+  # Grafana-managed SLO / burn-rate alert rules (folder + per-system rule groups),
+  # reconciled onto the external Amazon Managed Grafana.
+  - alerting/folder.yaml
+  - alerting/portal.yaml
   - platform/kubernetes-cluster.yaml
   - platform/kubernetes-views-pods.yaml
   - platform/kubernetes-views-namespaces.yaml
@@ -33,6 +37,9 @@ resources:
   - platform/agent-finance.yaml
   - platform/agent-ops.yaml
   - platform/agent-founder.yaml
+  # Ops control-plane app (portal): API SLO/RED + tofu-run, River-job, watcher,
+  # and pgxpool surfaces — self-contained PromQL over the portal_* metrics in AMP.
+  - platform/portal.yaml
   - addons/kyverno.yaml
   - addons/trivy-operator.yaml
   - addons/falco.yaml
diff --git a/dashboards/base/platform/portal.yaml b/dashboards/base/platform/portal.yaml
new file mode 100644
index 0000000..74cece2
--- /dev/null
+++ b/dashboards/base/platform/portal.yaml
@@ -0,0 +1,221 @@
+# portal — ops control-plane (server + worker)
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: portal
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: external
+  resyncPeriod: 24h
+  json: |
+    {
+      "title": "portal — ops control-plane",
+      "uid": "portal-overview",
+      "tags": ["portal", "slo", "nanohype"],
+      "timezone": "browser",
+      "schemaVersion": 39,
+      "refresh": "30s",
+      "time": { "from": "now-6h", "to": "now" },
+      "panels": [
+        {
+          "type": "row",
+          "title": "API — SLO & error budget (99.9% availability / 30d)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
+        },
+        {
+          "type": "stat",
+          "title": "API availability (30d)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "decimals": 4, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.999 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "1 - (sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[30d])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[30d])), 1e-9))" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 }
+        },
+        {
+          "type": "gauge",
+          "title": "Error budget remaining (30d)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.25 }, { "color": "green", "value": 0.5 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "clamp_min(1 - ((sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[30d])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[30d])), 1e-9)) / 0.001), 0)" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Fast burn (1h)",
+          "datasource": "prometheus",
+          "description": "Burn rate over 1h. Pages at 14.4x (2% of budget in 1h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 14.4 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[1h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[1h])), 1e-9)) / 0.001" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }
+        },
+        {
+          "type": "stat",
+          "title": "Slow burn (6h)",
+          "datasource": "prometheus",
+          "description": "Burn rate over 6h. Pages at 6x (5% of budget in 6h).",
+          "fieldConfig": { "defaults": { "unit": "none", "decimals": 1, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 6 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "(sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[6h])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[6h])), 1e-9)) / 0.001" }
+          ],
+          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }
+        },
+        {
+          "type": "row",
+          "title": "API — golden signals (RED + saturation)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Request rate by status (req/s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (status) (rate(portal_http_request_duration_seconds_count[5m]))", "legendFormat": "{{status}}" },
+            { "expr": "sum(rate(portal_http_request_duration_seconds_count[5m]))", "legendFormat": "total" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "5xx error ratio (5m)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "percentunit", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.001 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "sum(rate(portal_http_request_duration_seconds_count{status=~\"5..\"}[5m])) / clamp_min(sum(rate(portal_http_request_duration_seconds_count[5m])), 1e-9)", "legendFormat": "5xx ratio" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Request latency p50 / p95 / p99 (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.50, sum by (le) (rate(portal_http_request_duration_seconds_bucket[5m])))", "legendFormat": "p50" },
+            { "expr": "histogram_quantile(0.95, sum by (le) (rate(portal_http_request_duration_seconds_bucket[5m])))", "legendFormat": "p95" },
+            { "expr": "histogram_quantile(0.99, sum by (le) (rate(portal_http_request_duration_seconds_bucket[5m])))", "legendFormat": "p99" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Slowest routes (p99, s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "topk(10, histogram_quantile(0.99, sum by (le, route) (rate(portal_http_request_duration_seconds_bucket[5m]))))", "legendFormat": "{{route}}" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Saturation — in-flight + DB pool",
+          "datasource": "prometheus",
+          "description": "HTTP requests in flight and pgx pool acquired vs max.",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum(portal_http_requests_in_flight)", "legendFormat": "http in-flight" },
+            { "expr": "sum(portal_db_pool_connections_acquired)", "legendFormat": "db acquired" },
+            { "expr": "max(portal_db_pool_connections_max)", "legendFormat": "db max" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 0, "y": 21 }
+        },
+        {
+          "type": "timeseries",
+          "title": "DB pool acquire-wait rate",
+          "datasource": "prometheus",
+          "description": "Cumulative connection-acquire waits — climbing means pool contention.",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum(rate(portal_db_pool_acquire_wait_total[5m]))", "legendFormat": "acquire waits/s" }
+          ],
+          "gridPos": { "h": 7, "w": 12, "x": 12, "y": 21 }
+        },
+        {
+          "type": "row",
+          "title": "tofu / terragrunt runs",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Run rate by operation & status (runs/h)",
+          "datasource": "prometheus",
+          "description": "tofu/terragrunt run completions — the core infra-execution health signal.",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (operation, status) (rate(portal_tofu_run_duration_seconds_count[1h]) * 3600)", "legendFormat": "{{operation}} {{status}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Run duration p50 / p95 by operation (s)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.50, sum by (le, operation) (rate(portal_tofu_run_duration_seconds_bucket[30m])))", "legendFormat": "p50 {{operation}}" },
+            { "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(portal_tofu_run_duration_seconds_bucket[30m])))", "legendFormat": "p95 {{operation}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
+        },
+        {
+          "type": "row",
+          "title": "worker — River jobs",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Jobs by state (queue depth & backlog)",
+          "datasource": "prometheus",
+          "description": "available/pending/scheduled = backlog; running = active; retryable/discarded/cancelled = trouble.",
+          "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (state) (portal_worker_jobs)", "legendFormat": "{{state}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 38 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Job errors & panics (per 5m)",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+          "targets": [
+            { "expr": "sum by (kind, event) (rate(portal_worker_job_errors_total[5m]))", "legendFormat": "{{kind}} {{event}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 38 }
+        },
+        {
+          "type": "row",
+          "title": "watcher loops (liveness)",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 46 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Time since last successful tick (s)",
+          "datasource": "prometheus",
+          "description": "A loop that stalls climbs without bound — the tenant/cluster watchers' staleness signal.",
+          "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 300 }, { "color": "red", "value": 900 } ] } }, "overrides": [] },
+          "targets": [
+            { "expr": "time() - max by (loop) (portal_watcher_last_tick_timestamp_seconds)", "legendFormat": "{{loop}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 47 }
+        },
+        {
+          "type": "timeseries",
+          "title": "Tick p95 (s) & panic rate",
+          "datasource": "prometheus",
+          "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+          "targets": [
+            { "expr": "histogram_quantile(0.95, sum by (le, loop) (rate(portal_watcher_tick_duration_seconds_bucket[5m])))", "legendFormat": "p95 {{loop}}" },
+            { "expr": "sum by (loop) (rate(portal_watcher_panics_total[5m]))", "legendFormat": "panics/s {{loop}}" }
+          ],
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 47 }
+        }
+      ]
+    }