diff --git a/addons/observability/kube-state-metrics/values.yaml b/addons/observability/kube-state-metrics/values.yaml
index 95af1d5..07f9041 100644
--- a/addons/observability/kube-state-metrics/values.yaml
+++ b/addons/observability/kube-state-metrics/values.yaml
@@ -31,7 +31,7 @@ rbac:
       resources: ["budgetpolicies", "evalsuites"]
       verbs: ["list", "watch"]
     - apiGroups: ["agents.nanohype.dev"]
-      resources: ["agentfleets"]
+      resources: ["agentfleets", "agentsandboxes", "sandboxpools", "modelgateways", "batchjobs"]
       verbs: ["list", "watch"]
 
 # Scraped by the grafana-agent static target, but annotate too so the
@@ -58,6 +58,9 @@ customResourceState:
             group: platform.nanohype.dev
             version: v1alpha1
             kind: Platform
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_phase
@@ -80,12 +83,37 @@ customResourceState:
               commonLabels:
                 customresource_kind: Platform
                 field: suspendedAt
+            - name: condition  # exported as kube_customresource_condition
+              help: "Platform.status.conditions by type + status (per-aspect reconcile/IRSA/ns truth)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]  # list path: one series per condition entry
+                  labelsFromPath:  # paths below are relative to each condition entry
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # "True" -> 1, "False" -> 0; NOTE(review): confirm how "Unknown" is exported
+              commonLabels:
+                customresource_kind: Platform
+            - name: status_field  # shared metric name; the "field" label selects the value
+              help: "Platform.status.observedGeneration (vs metadata.generation — stuck reconcile)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, observedGeneration]
+                  nilIsZero: true  # a missing observedGeneration is exported as 0
+              commonLabels:
+                customresource_kind: Platform
+                field: observedGeneration  # keeps this series distinct from other status_field gauges
 
         # Tenant
         - groupVersionKind:
             group: platform.nanohype.dev
             version: v1alpha1
             kind: Tenant
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_phase
@@ -107,6 +135,56 @@ customResourceState:
               commonLabels:
                 customresource_kind: Tenant
                 field: percentOfBudget
+            - name: status_field  # shared metric name; the "field" label selects the value
+              help: "Tenant.status.aggregateSpendUsd (USD) as a gauge"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, aggregateSpendUsd]
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: Tenant
+                field: aggregateSpendUsd
+            - name: status_field
+              help: "Tenant.status.platformCount (total Platforms — denominator for ready ratio)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, platformCount]
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: Tenant
+                field: platformCount
+            - name: status_field
+              help: "Tenant.status.readyPlatformCount (alert when < platformCount — partial outage)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, readyPlatformCount]
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: Tenant
+                field: readyPlatformCount
+            - name: status_field
+              help: "Tenant.status.suspendedPlatformCount (kill-switch hits — alert on non-zero)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, suspendedPlatformCount]
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: Tenant
+                field: suspendedPlatformCount
+            - name: status_field
+              help: "Tenant.status.lastReconciled (staleness watchdog)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, lastReconciled]  # presumably a timestamp exported as unix seconds — TODO confirm
+                  nilIsZero: true  # never-reconciled Tenants report 0
+              commonLabels:
+                customresource_kind: Tenant
+                field: lastReconciled
             - name: condition
               help: "Tenant.status.conditions by type + status"
               each:
@@ -125,10 +203,13 @@ customResourceState:
             group: governance.nanohype.dev
             version: v1alpha1
             kind: BudgetPolicy
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_field
-              help: "BudgetPolicy.status.{lastReconciled,killSwitchFiredAt,percentOfBudget} as gauges"
+              help: "BudgetPolicy.status.lastReconciled as a unix-ts gauge"
               each:
                 type: Gauge
                 gauge:
@@ -137,12 +218,67 @@ customResourceState:
               commonLabels:
                 customresource_kind: BudgetPolicy
                 field: lastReconciled
+            - name: status_field  # shared metric name; the "field" label selects the value
+              help: "BudgetPolicy.status.currentSpendUsd (USD) as a gauge"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, currentSpendUsd]
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: BudgetPolicy
+                field: currentSpendUsd
+            - name: status_field
+              help: "BudgetPolicy.status.killSwitchFiredAt as a unix-ts gauge (0 when unset)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, killSwitchFiredAt]
+                  nilIsZero: true  # yields the "0 when unset" behaviour described in help
+              commonLabels:
+                customresource_kind: BudgetPolicy
+                field: killSwitchFiredAt
+            - name: status_field
+              help: "BudgetPolicy.spec.monthlyUsd — the soft spend threshold (USD)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [spec, monthlyUsd]  # read from spec, not status, despite the status_field metric name
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: BudgetPolicy
+                field: monthlyUsd
+            - name: status_field
+              help: "BudgetPolicy.status.percentOfBudget (0..200+ — the natural alert threshold per policy)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, percentOfBudget]
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: BudgetPolicy
+                field: percentOfBudget
+            - name: condition  # exported as kube_customresource_condition
+              help: "BudgetPolicy.status.conditions (budget controller reconcile health — cap unenforced if stale)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]  # list path: one series per condition entry
+                  labelsFromPath:  # paths below are relative to each condition entry
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # "True" -> 1, "False" -> 0; NOTE(review): confirm how "Unknown" is exported
+              commonLabels:
+                customresource_kind: BudgetPolicy
 
         # AgentFleet
         - groupVersionKind:
             group: agents.nanohype.dev
             version: v1alpha1
             kind: AgentFleet
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_phase
@@ -164,12 +300,37 @@ customResourceState:
               commonLabels:
                 customresource_kind: AgentFleet
                 field: readyAgents
+            - name: condition  # exported as kube_customresource_condition
+              help: "AgentFleet.status.conditions (downstream kagent rollout truth; phase=Ready can mask degraded)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]  # list path: one series per condition entry
+                  labelsFromPath:  # paths below are relative to each condition entry
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # "True" -> 1, "False" -> 0; NOTE(review): confirm how "Unknown" is exported
+              commonLabels:
+                customresource_kind: AgentFleet
+            - name: status_field  # shared metric name; the "field" label selects the value
+              help: "AgentFleet.status.observedGeneration (vs metadata.generation — unapplied spec change)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, observedGeneration]
+                  nilIsZero: true  # a missing observedGeneration is exported as 0
+              commonLabels:
+                customresource_kind: AgentFleet
+                field: observedGeneration  # keeps this series distinct from other status_field gauges
 
         # EvalSuite
         - groupVersionKind:
             group: governance.nanohype.dev
             version: v1alpha1
             kind: EvalSuite
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_phase
@@ -191,3 +352,237 @@ customResourceState:
               commonLabels:
                 customresource_kind: EvalSuite
                 field: lastScore
+            - name: status_field  # shared metric name; the "field" label selects the value
+              help: "EvalSuite.spec.passThreshold (0..1) — the required mean; pair with lastScore to alert"
+              each:
+                type: Gauge
+                gauge:
+                  path: [spec, passThreshold]  # NOTE(review): no nilIsZero here, unlike sibling gauges — confirm the field is always set
+              commonLabels:
+                customresource_kind: EvalSuite
+                field: passThreshold
+            - name: status_field
+              help: "EvalSuite.status.lastRunAt — freshness watchdog (evals silently stopped)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, lastRunAt]  # presumably a timestamp exported as unix seconds — TODO confirm
+                  nilIsZero: true  # suites that have never run report 0
+              commonLabels:
+                customresource_kind: EvalSuite
+                field: lastRunAt
+            - name: condition  # exported as kube_customresource_condition
+              help: "EvalSuite.status.conditions by type + status (harness executed vs agents passed)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]  # list path: one series per condition entry
+                  labelsFromPath:  # paths below are relative to each condition entry
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # "True" -> 1, "False" -> 0; NOTE(review): confirm how "Unknown" is exported
+              commonLabels:
+                customresource_kind: EvalSuite
+
+        # ModelGateway — every agent's Bedrock egress flows through it; a broken
+        # Route can be condition=False while phase stays Ready.
+        - groupVersionKind:  # kube-state-metrics needs list/watch on modelgateways (see the rbac rules)
+            group: agents.nanohype.dev
+            version: v1alpha1
+            kind: ModelGateway
+          labelsFromPath:  # identity labels attached to every ModelGateway series
+            name: [metadata, name]
+            namespace: [metadata, namespace]
+          metricNamePrefix: kube_customresource  # metric name is <prefix>_<name>
+          metrics:
+            - name: status_phase
+              help: "ModelGateway.status.phase"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_phase  # label that carries the phase name
+                  path: [status, phase]
+                  list: [Pending, Provisioning, Ready, Failed]  # one 0/1 series per listed phase
+              commonLabels:
+                customresource_kind: ModelGateway
+            - name: condition
+              help: "ModelGateway.status.conditions by type + status"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]  # list path: one series per condition entry
+                  labelsFromPath:  # paths below are relative to each condition entry
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # "True" -> 1, "False" -> 0; NOTE(review): confirm how "Unknown" is exported
+              commonLabels:
+                customresource_kind: ModelGateway
+            - name: status_field  # shared metric name; the "field" label selects the value
+              help: "ModelGateway.status.observedGeneration (reconcile-lag vs metadata.generation)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, observedGeneration]
+                  nilIsZero: true  # a missing observedGeneration is exported as 0
+              commonLabels:
+                customresource_kind: ModelGateway
+                field: observedGeneration
+
+        # AgentSandbox — ephemeral attributable single-session runtime
+        - groupVersionKind:  # kube-state-metrics needs list/watch on agentsandboxes (see the rbac rules)
+            group: agents.nanohype.dev
+            version: v1alpha1
+            kind: AgentSandbox
+          labelsFromPath:  # identity labels attached to every AgentSandbox series
+            name: [metadata, name]
+            namespace: [metadata, namespace]
+          metricNamePrefix: kube_customresource  # metric name is <prefix>_<name>
+          metrics:
+            - name: status_phase
+              help: "AgentSandbox.status.phase (run-once session lifecycle)"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_phase  # label that carries the phase name
+                  path: [status, phase]
+                  list: [Pending, Running, Succeeded, Failed, Suspended]  # one 0/1 series per listed phase
+              commonLabels:
+                customresource_kind: AgentSandbox
+            - name: status_pod_phase  # separate metric from status_phase: tracks the pod, not the sandbox
+              help: "AgentSandbox.status.podPhase (the session pod's k8s phase — stuck Pending vs run failure)"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_pod_phase  # label that carries the pod phase name
+                  path: [status, podPhase]
+                  list: [Pending, Running, Succeeded, Failed, Unknown]  # one 0/1 series per listed phase
+              commonLabels:
+                customresource_kind: AgentSandbox
+            - name: condition
+              help: "AgentSandbox.status.conditions by type + status"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]  # list path: one series per condition entry
+                  labelsFromPath:  # paths below are relative to each condition entry
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # "True" -> 1, "False" -> 0; NOTE(review): confirm how "Unknown" is exported
+              commonLabels:
+                customresource_kind: AgentSandbox
+            - name: status_field  # shared metric name; the "field" label selects the value
+              help: "AgentSandbox.status.completedAt (terminal ts; pairs with ttl for leaked-pod alerts)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, completedAt]  # presumably a timestamp exported as unix seconds — TODO confirm
+                  nilIsZero: true  # sandboxes without completedAt report 0
+              commonLabels:
+                customresource_kind: AgentSandbox
+                field: completedAt
+
+        # SandboxPool — warm-pool of sandbox workers
+        - groupVersionKind:  # kube-state-metrics needs list/watch on sandboxpools (see the rbac rules)
+            group: agents.nanohype.dev
+            version: v1alpha1
+            kind: SandboxPool
+          labelsFromPath:  # identity labels attached to every SandboxPool series
+            name: [metadata, name]
+            namespace: [metadata, namespace]
+          metricNamePrefix: kube_customresource  # metric name is <prefix>_<name>
+          metrics:
+            - name: status_phase
+              help: "SandboxPool.status.phase"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_phase  # label that carries the phase name
+                  path: [status, phase]
+                  list: [Pending, Ready, Suspended, Failed]  # one 0/1 series per listed phase
+              commonLabels:
+                customresource_kind: SandboxPool
+            - name: status_field  # shared metric name; the "field" label selects the value
+              help: "SandboxPool.status.readyWorkers (warm-pool depth — 0 when unpopulated, so under-provision alerts still fire)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, readyWorkers]
+                  nilIsZero: true  # yields the "0 when unpopulated" behaviour described in help
+              commonLabels:
+                customresource_kind: SandboxPool
+                field: readyWorkers
+            - name: condition
+              help: "SandboxPool.status.conditions by type + status"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]  # list path: one series per condition entry
+                  labelsFromPath:  # paths below are relative to each condition entry
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # "True" -> 1, "False" -> 0; NOTE(review): confirm how "Unknown" is exported
+              commonLabels:
+                customresource_kind: SandboxPool
+
+        # BatchJob — bulk async agent runs
+        - groupVersionKind:  # kube-state-metrics needs list/watch on batchjobs (see the rbac rules)
+            group: agents.nanohype.dev
+            version: v1alpha1
+            kind: BatchJob
+          labelsFromPath:  # identity labels attached to every BatchJob series
+            name: [metadata, name]
+            namespace: [metadata, namespace]
+          metricNamePrefix: kube_customresource  # metric name is <prefix>_<name>
+          metrics:
+            - name: status_phase
+              help: "BatchJob.status.phase"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_phase  # label that carries the phase name
+                  path: [status, phase]
+                  list: [Pending, Provisioning, Running, Succeeded, Failed, Stopped]  # one 0/1 series per listed phase
+              commonLabels:
+                customresource_kind: BatchJob
+            - name: status_field  # shared metric name; the "field" label selects the value
+              help: "BatchJob.status.failedCount"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, failedCount]
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: BatchJob
+                field: failedCount
+            - name: status_field
+              help: "BatchJob.status.succeededCount"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, succeededCount]
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: BatchJob
+                field: succeededCount
+            - name: status_field
+              help: "BatchJob.status.recordCount (total records to process)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, recordCount]
+                  nilIsZero: true  # a missing value is exported as 0
+              commonLabels:
+                customresource_kind: BatchJob
+                field: recordCount
+            - name: condition
+              help: "BatchJob.status.conditions by type + status"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]  # list path: one series per condition entry
+                  labelsFromPath:  # paths below are relative to each condition entry
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # "True" -> 1, "False" -> 0; NOTE(review): confirm how "Unknown" is exported
+              commonLabels:
+                customresource_kind: BatchJob
diff --git a/dashboards/base/platform/agent-agentgateway.yaml b/dashboards/base/platform/agent-agentgateway.yaml
index 90bea95..590860f 100644
--- a/dashboards/base/platform/agent-agentgateway.yaml
+++ b/dashboards/base/platform/agent-agentgateway.yaml
@@ -30,7 +30,7 @@ spec:
             "name": "platform",
             "type": "query",
             "datasource": "prometheus",
-            "query": "label_values(agentgateway_invocation_duration_seconds_count, platform)",
+            "query": "label_values(agentgateway_llm_request_duration_seconds_count, platform)",
             "includeAll": true,
             "multi": true
           }
@@ -43,15 +43,15 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
+              "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
               "legendFormat": "p50 {{model_id}}"
             },
             {
-              "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
+              "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
               "legendFormat": "p95 {{model_id}}"
             },
             {
-              "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
+              "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
               "legendFormat": "p99 {{model_id}}"
             }
           ],
@@ -73,7 +73,7 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum by (model_id, status) (rate(agentgateway_invocation_total{platform=~\"$platform\",status!=\"200\"}[5m]))",
+              "expr": "sum by (model_id, status) (rate(agentgateway_llm_requests_total{platform=~\"$platform\",status!=\"200\"}[5m]))",
               "legendFormat": "{{model_id}} {{status}}"
             }
           ],
@@ -90,7 +90,7 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum by (route) (rate(agentgateway_invocation_total{platform=~\"$platform\"}[1m]))",
+              "expr": "sum by (route) (rate(agentgateway_llm_requests_total{platform=~\"$platform\"}[1m]))",
               "legendFormat": "{{route}}"
             }
           ],
diff --git a/dashboards/base/platform/agent-finance.yaml b/dashboards/base/platform/agent-finance.yaml
index 0acc0ab..5cce216 100644
--- a/dashboards/base/platform/agent-finance.yaml
+++ b/dashboards/base/platform/agent-finance.yaml
@@ -82,12 +82,12 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum by (platform_id) (agents_spend_report_current_usd)",
-              "legendFormat": "{{platform_id}} spend"
+              "expr": "sum by (namespace, name) (kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})",
+              "legendFormat": "{{namespace}}/{{name}} spend"
             },
             {
-              "expr": "sum by (platform_id) (agents_budget_policy_threshold_usd)",
-              "legendFormat": "{{platform_id}} threshold"
+              "expr": "sum by (namespace, name) (kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"monthlyUsd\"})",
+              "legendFormat": "{{namespace}}/{{name}} threshold"
             }
           ],
           "gridPos": {
diff --git a/dashboards/base/platform/agent-founder.yaml b/dashboards/base/platform/agent-founder.yaml
index 9339ee5..5b46535 100644
--- a/dashboards/base/platform/agent-founder.yaml
+++ b/dashboards/base/platform/agent-founder.yaml
@@ -31,7 +31,7 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "count(agents_platform_status_phase{phase=\"Ready\"})"
+              "expr": "count(kube_customresource_status_phase{customresource_kind=\"Platform\",customresource_phase=\"Ready\"} == 1) or vector(0)"
             }
           ],
           "gridPos": {
@@ -43,11 +43,11 @@ spec:
         },
         {
           "type": "stat",
-          "title": "Spend this week (USD)",
+          "title": "Spend month-to-date (USD)",
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum(increase(agents_spend_report_current_usd[7d]))"
+              "expr": "sum(kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})"
             }
           ],
           "gridPos": {
@@ -59,11 +59,11 @@ spec:
         },
         {
           "type": "timeseries",
-          "title": "Weekly spend trend",
+          "title": "Spend trend (USD, month-to-date)",
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum(agents_spend_report_current_usd)"
+              "expr": "sum(kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})"
             }
           ],
           "gridPos": {
diff --git a/dashboards/base/platform/agent-ops.yaml b/dashboards/base/platform/agent-ops.yaml
index 6490d33..643733b 100644
--- a/dashboards/base/platform/agent-ops.yaml
+++ b/dashboards/base/platform/agent-ops.yaml
@@ -27,11 +27,11 @@ spec:
       "panels": [
         {
           "type": "stat",
-          "title": "Active AgentRuntimes",
+          "title": "Ready agents (across fleets)",
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "count(agents_agent_runtime_replicas)"
+              "expr": "sum(kube_customresource_status_field{customresource_kind=\"AgentFleet\",field=\"readyAgents\"})"
             }
           ],
           "gridPos": {
@@ -68,15 +68,15 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))",
+              "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))",
               "legendFormat": "p50 {{model_id}}"
             },
             {
-              "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))",
+              "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))",
               "legendFormat": "p95 {{model_id}}"
             },
             {
-              "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))",
+              "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))",
               "legendFormat": "p99 {{model_id}}"
             }
           ],
@@ -89,11 +89,11 @@ spec:
         },
         {
           "type": "table",
-          "title": "Latest EvalRun scores",
+          "title": "Latest EvalSuite scores",
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "topk(50, agents_eval_run_score{phase=\"completed\"})"
+              "expr": "topk(50, kube_customresource_status_field{customresource_kind=\"EvalSuite\",field=\"lastScore\"})"
             }
           ],
           "gridPos": {
@@ -109,11 +109,11 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum by (model_id) (rate(agentgateway_invocations_total{status=~\"4..\"}[5m]))",
+              "expr": "sum by (model_id) (rate(agentgateway_llm_requests_total{status=~\"4..\"}[5m]))",
               "legendFormat": "4xx {{model_id}}"
             },
             {
-              "expr": "sum by (model_id) (rate(agentgateway_invocations_total{status=~\"5..\"}[5m]))",
+              "expr": "sum by (model_id) (rate(agentgateway_llm_requests_total{status=~\"5..\"}[5m]))",
               "legendFormat": "5xx {{model_id}}"
             }
           ],