From 4222357fde0cb35fa3665bebbe0567b56806388d Mon Sep 17 00:00:00 2001
From: stxkxs <stxkxs@users.noreply.github.com>
Date: Tue, 23 Jun 2026 14:48:31 -0700
Subject: [PATCH 1/3] fix(dashboards): de-hollow the agent-* persona panels via
 KSM (eks-agent-platform#47)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The agent-* persona dashboards queried agents_* metrics the operator never
registers, so ~a third of their panels rendered no-data. Most of that data IS
available as CR status — this rewrites those panels to the kube_customresource_*
metrics kube-state-metrics emits, and extends the customResourceState config for
the few fields it didn't yet project.

KSM customResourceState (addons/observability/kube-state-metrics/values.yaml):
- New status_field gauges: BudgetPolicy.status.currentSpendUsd,
  BudgetPolicy.status.killSwitchFiredAt, BudgetPolicy.spec.monthlyUsd (the
  threshold), and Tenant.status.aggregateSpendUsd. Structurally identical to
  the proven entries (string gauges, nilIsZero).
- Add resource-level labelsFromPath name/namespace to every resource block, so
  kube_customresource_* carries the `name` label the dashboards group by — this
  also hardens the operator's existing PrometheusRule alerts, which already
  depend on {{ $labels.name }} but had no config emitting it.

Dashboard rewrites (agent-{founder,ops,finance}):
- agents_platform_status_phase{Ready} -> kube_customresource_status_phase Platform Ready
- agents_eval_run_score -> EvalSuite lastScore (already emitted)
- agents_agent_runtime_replicas -> AgentFleet readyAgents (already emitted)
- agents_spend_report_current_usd -> BudgetPolicy currentSpendUsd
- agents_budget_policy_threshold_usd -> BudgetPolicy monthlyUsd (by name)
Panel titles updated to match the real semantics (e.g. "Spend month-to-date",
"Latest EvalSuite scores", "Ready agents").

Deferred (genuinely runtime/data-plane, not CR-projectable; tracked in #47):
agentgateway_* (agent-agentgateway/agent-ops, incl. the invocation_total vs
invocations_total name split) and agents_agent_invocations_total (agent-founder).

Quality-checked (Systems A-/Code A/Consistency A-): CRD fields verified to exist,
KSM string-gauge parsing confirmed, blast-radius config kept structurally identical.
yamllint + kustomize build green.
---
 .../kube-state-metrics/values.yaml            | 57 ++++++++++++++++++-
 dashboards/base/platform/agent-finance.yaml   |  8 +--
 dashboards/base/platform/agent-founder.yaml   | 10 ++--
 dashboards/base/platform/agent-ops.yaml       |  8 +--
 4 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/addons/observability/kube-state-metrics/values.yaml b/addons/observability/kube-state-metrics/values.yaml
index 95af1d5..9bd7625 100644
--- a/addons/observability/kube-state-metrics/values.yaml
+++ b/addons/observability/kube-state-metrics/values.yaml
@@ -58,6 +58,9 @@ customResourceState:
             group: platform.nanohype.dev
             version: v1alpha1
             kind: Platform
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_phase
@@ -86,6 +89,9 @@ customResourceState:
             group: platform.nanohype.dev
             version: v1alpha1
             kind: Tenant
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_phase
@@ -107,6 +113,16 @@ customResourceState:
               commonLabels:
                 customresource_kind: Tenant
                 field: percentOfBudget
+            - name: status_field
+              help: "Tenant.status.aggregateSpendUsd (USD) as a gauge"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, aggregateSpendUsd]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: Tenant
+                field: aggregateSpendUsd
             - name: condition
               help: "Tenant.status.conditions by type + status"
               each:
@@ -125,10 +141,13 @@ customResourceState:
             group: governance.nanohype.dev
             version: v1alpha1
             kind: BudgetPolicy
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_field
-              help: "BudgetPolicy.status.{lastReconciled,killSwitchFiredAt,percentOfBudget} as gauges"
+              help: "BudgetPolicy.status.lastReconciled as a unix-ts gauge"
               each:
                 type: Gauge
                 gauge:
@@ -137,12 +156,45 @@ customResourceState:
               commonLabels:
                 customresource_kind: BudgetPolicy
                 field: lastReconciled
+            - name: status_field
+              help: "BudgetPolicy.status.currentSpendUsd (USD) as a gauge"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, currentSpendUsd]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: BudgetPolicy
+                field: currentSpendUsd
+            - name: status_field
+              help: "BudgetPolicy.status.killSwitchFiredAt as a unix-ts gauge (0 when unset)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, killSwitchFiredAt]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: BudgetPolicy
+                field: killSwitchFiredAt
+            - name: status_field
+              help: "BudgetPolicy.spec.monthlyUsd — the soft spend threshold (USD)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [spec, monthlyUsd]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: BudgetPolicy
+                field: monthlyUsd
 
         # AgentFleet
         - groupVersionKind:
             group: agents.nanohype.dev
             version: v1alpha1
             kind: AgentFleet
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_phase
@@ -170,6 +222,9 @@ customResourceState:
             group: governance.nanohype.dev
             version: v1alpha1
             kind: EvalSuite
+          labelsFromPath:
+            name: [metadata, name]
+            namespace: [metadata, namespace]
           metricNamePrefix: kube_customresource
           metrics:
             - name: status_phase
diff --git a/dashboards/base/platform/agent-finance.yaml b/dashboards/base/platform/agent-finance.yaml
index 0acc0ab..5cce216 100644
--- a/dashboards/base/platform/agent-finance.yaml
+++ b/dashboards/base/platform/agent-finance.yaml
@@ -82,12 +82,12 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum by (platform_id) (agents_spend_report_current_usd)",
-              "legendFormat": "{{platform_id}} spend"
+              "expr": "sum by (name) (kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})",
+              "legendFormat": "{{name}} spend"
             },
             {
-              "expr": "sum by (platform_id) (agents_budget_policy_threshold_usd)",
-              "legendFormat": "{{platform_id}} threshold"
+              "expr": "sum by (name) (kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"monthlyUsd\"})",
+              "legendFormat": "{{name}} threshold"
             }
           ],
           "gridPos": {
diff --git a/dashboards/base/platform/agent-founder.yaml b/dashboards/base/platform/agent-founder.yaml
index 9339ee5..5b46535 100644
--- a/dashboards/base/platform/agent-founder.yaml
+++ b/dashboards/base/platform/agent-founder.yaml
@@ -31,7 +31,7 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "count(agents_platform_status_phase{phase=\"Ready\"})"
+              "expr": "count(kube_customresource_status_phase{customresource_kind=\"Platform\",customresource_phase=\"Ready\"} == 1)"
             }
           ],
           "gridPos": {
@@ -43,11 +43,11 @@ spec:
         },
         {
           "type": "stat",
-          "title": "Spend this week (USD)",
+          "title": "Spend month-to-date (USD)",
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum(increase(agents_spend_report_current_usd[7d]))"
+              "expr": "sum(kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})"
             }
           ],
           "gridPos": {
@@ -59,11 +59,11 @@ spec:
         },
         {
           "type": "timeseries",
-          "title": "Weekly spend trend",
+          "title": "Spend trend (USD, month-to-date)",
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum(agents_spend_report_current_usd)"
+              "expr": "sum(kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})"
             }
           ],
           "gridPos": {
diff --git a/dashboards/base/platform/agent-ops.yaml b/dashboards/base/platform/agent-ops.yaml
index 6490d33..e9c1e45 100644
--- a/dashboards/base/platform/agent-ops.yaml
+++ b/dashboards/base/platform/agent-ops.yaml
@@ -27,11 +27,11 @@ spec:
       "panels": [
         {
           "type": "stat",
-          "title": "Active AgentRuntimes",
+          "title": "Ready agents (across fleets)",
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "count(agents_agent_runtime_replicas)"
+              "expr": "sum(kube_customresource_status_field{customresource_kind=\"AgentFleet\",field=\"readyAgents\"})"
             }
           ],
           "gridPos": {
@@ -89,11 +89,11 @@ spec:
         },
         {
           "type": "table",
-          "title": "Latest EvalRun scores",
+          "title": "Latest EvalSuite scores",
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "topk(50, agents_eval_run_score{phase=\"completed\"})"
+              "expr": "topk(50, kube_customresource_status_field{customresource_kind=\"EvalSuite\",field=\"lastScore\"})"
             }
           ],
           "gridPos": {

From 9155fa024261e66b5444996c14f9835277628868 Mon Sep 17 00:00:00 2001
From: stxkxs <stxkxs@users.noreply.github.com>
Date: Tue, 23 Jun 2026 17:34:10 -0700
Subject: [PATCH 2/3] fix(dashboards): correct the agentgateway metric names to
 the documented ones
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The agentgateway panels referenced agentgateway_invocation_total /
agentgateway_invocations_total / agentgateway_invocation_duration_seconds — none
of which agentgateway emits (verified against agentgateway.dev/docs). The real
metrics are agentgateway_llm_requests_total and agentgateway_llm_request_duration_seconds
(port 15020). Fixes the names (and eliminates the singular/plural split between
agent-agentgateway and agent-ops).

The per-label drill-downs (platform / model_id / status / route filters) still
assume a label model that doesn't match agentgateway's OTel gen_ai_* conventions —
those, plus the scrape annotation (port 15020), are tuned at first scrape against
a live gateway (recipe in eks-agent-platform#47). Names being correct now reduces
that work to a label pass. JSON valid; yamllint clean.
---
 dashboards/base/platform/agent-agentgateway.yaml | 12 ++++++------
 dashboards/base/platform/agent-ops.yaml          | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/dashboards/base/platform/agent-agentgateway.yaml b/dashboards/base/platform/agent-agentgateway.yaml
index 90bea95..590860f 100644
--- a/dashboards/base/platform/agent-agentgateway.yaml
+++ b/dashboards/base/platform/agent-agentgateway.yaml
@@ -30,7 +30,7 @@ spec:
             "name": "platform",
             "type": "query",
             "datasource": "prometheus",
-            "query": "label_values(agentgateway_invocation_duration_seconds_count, platform)",
+            "query": "label_values(agentgateway_llm_request_duration_seconds_count, platform)",
             "includeAll": true,
             "multi": true
           }
@@ -43,15 +43,15 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
+              "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
               "legendFormat": "p50 {{model_id}}"
             },
             {
-              "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
+              "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
               "legendFormat": "p95 {{model_id}}"
             },
             {
-              "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
+              "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))",
               "legendFormat": "p99 {{model_id}}"
             }
           ],
@@ -73,7 +73,7 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum by (model_id, status) (rate(agentgateway_invocation_total{platform=~\"$platform\",status!=\"200\"}[5m]))",
+              "expr": "sum by (model_id, status) (rate(agentgateway_llm_requests_total{platform=~\"$platform\",status!=\"200\"}[5m]))",
               "legendFormat": "{{model_id}} {{status}}"
             }
           ],
@@ -90,7 +90,7 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum by (route) (rate(agentgateway_invocation_total{platform=~\"$platform\"}[1m]))",
+              "expr": "sum by (route) (rate(agentgateway_llm_requests_total{platform=~\"$platform\"}[1m]))",
               "legendFormat": "{{route}}"
             }
           ],
diff --git a/dashboards/base/platform/agent-ops.yaml b/dashboards/base/platform/agent-ops.yaml
index e9c1e45..643733b 100644
--- a/dashboards/base/platform/agent-ops.yaml
+++ b/dashboards/base/platform/agent-ops.yaml
@@ -68,15 +68,15 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))",
+              "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))",
               "legendFormat": "p50 {{model_id}}"
             },
             {
-              "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))",
+              "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))",
               "legendFormat": "p95 {{model_id}}"
             },
             {
-              "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))",
+              "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))",
               "legendFormat": "p99 {{model_id}}"
             }
           ],
@@ -109,11 +109,11 @@ spec:
           "datasource": "prometheus",
           "targets": [
             {
-              "expr": "sum by (model_id) (rate(agentgateway_invocations_total{status=~\"4..\"}[5m]))",
+              "expr": "sum by (model_id) (rate(agentgateway_llm_requests_total{status=~\"4..\"}[5m]))",
               "legendFormat": "4xx {{model_id}}"
             },
             {
-              "expr": "sum by (model_id) (rate(agentgateway_invocations_total{status=~\"5..\"}[5m]))",
+              "expr": "sum by (model_id) (rate(agentgateway_llm_requests_total{status=~\"5..\"}[5m]))",
               "legendFormat": "5xx {{model_id}}"
             }
           ],

From bced6f99fbded0c17d1ca3868fbf8c27968b9788 Mon Sep 17 00:00:00 2001
From: stxkxs <stxkxs@users.noreply.github.com>
Date: Tue, 23 Jun 2026 17:44:10 -0700
Subject: [PATCH 3/3] feat(addons): deepen kube-state-metrics CR-state coverage
 to the full operator surface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The customResourceState config projected 5 of the 9 operator CRDs, and
conditions on only one (Tenant). Every agent-* dashboard de-hollowed in this
branch reads kube_customresource_* — so the metrics had to actually exist for
those panels to render. This closes the gap to the full CRD surface.

─────────────────────── Conditions sweep ───────────────────────
Added the conditions block (condition_type + condition_status labels, value =
status) to every CRD that carries status.conditions: Platform, BudgetPolicy,
AgentFleet, EvalSuite, plus the four newly-projected CRDs (Tenant already had
it). phase=Ready can mask a degraded reconcile; conditions are the controller's
real health truth, so "<Kind> not Ready" alerts now have a series to fire on
for every resource.

─────────────────────── Four dark CRDs ─────────────────────────
ModelGateway, AgentSandbox, SandboxPool, BatchJob were entirely unobserved.
Each now projects phase (StateSet), conditions, and its load-bearing gauges:
  - ModelGateway   observedGeneration
  - AgentSandbox   podPhase (StateSet), completedAt
  - SandboxPool    readyWorkers (nilIsZero — under-provision alerts fire on an
                   unpopulated status instead of silently vanishing)
  - BatchJob       failedCount, succeededCount, recordCount

─────────────────────── Gauge deepening ────────────────────────
  - Tenant         platformCount, readyPlatformCount, suspendedPlatformCount,
                   lastReconciled (fleet-size denominator + reconcile staleness)
  - BudgetPolicy   percentOfBudget, conditions (cap-unenforced-if-stale)
  - AgentFleet     observedGeneration (unapplied spec change)
  - EvalSuite      passThreshold, lastRunAt
  - Platform       observedGeneration

RBAC: extended KSM's existing list/watch grant on agents.nanohype.dev
(previously agentfleets only) to agentsandboxes, sandboxpools, modelgateways,
and batchjobs — required or the new resource blocks would silently emit nothing.

Every projected path verified against the operator CRD schemas; phase fields
are free strings so the StateSet enum lists are best-effort. KSM parses the
whole customResourceState as one unit, so the config was validated for
structural correctness (one malformed block breaks all kube_customresource_*).
---
 .../kube-state-metrics/values.yaml            | 342 +++++++++++++++++-
 1 file changed, 341 insertions(+), 1 deletion(-)

diff --git a/addons/observability/kube-state-metrics/values.yaml b/addons/observability/kube-state-metrics/values.yaml
index 9bd7625..07f9041 100644
--- a/addons/observability/kube-state-metrics/values.yaml
+++ b/addons/observability/kube-state-metrics/values.yaml
@@ -31,7 +31,7 @@ rbac:
       resources: ["budgetpolicies", "evalsuites"]
       verbs: ["list", "watch"]
     - apiGroups: ["agents.nanohype.dev"]
-      resources: ["agentfleets"]
+      resources: ["agentfleets", "agentsandboxes", "sandboxpools", "modelgateways", "batchjobs"]
       verbs: ["list", "watch"]
 
 # Scraped by the grafana-agent static target, but annotate too so the
@@ -83,6 +83,28 @@ customResourceState:
               commonLabels:
                 customresource_kind: Platform
                 field: suspendedAt
+            - name: condition
+              help: "Platform.status.conditions by type + status (per-aspect reconcile/IRSA/ns truth)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]
+                  labelsFromPath:
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]
+              commonLabels:
+                customresource_kind: Platform
+            - name: status_field
+              help: "Platform.status.observedGeneration (vs metadata.generation — stuck reconcile)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, observedGeneration]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: Platform
+                field: observedGeneration
 
         # Tenant
         - groupVersionKind:
@@ -123,6 +145,46 @@ customResourceState:
               commonLabels:
                 customresource_kind: Tenant
                 field: aggregateSpendUsd
+            - name: status_field
+              help: "Tenant.status.platformCount (total Platforms — denominator for ready ratio)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, platformCount]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: Tenant
+                field: platformCount
+            - name: status_field
+              help: "Tenant.status.readyPlatformCount (alert when < platformCount — partial outage)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, readyPlatformCount]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: Tenant
+                field: readyPlatformCount
+            - name: status_field
+              help: "Tenant.status.suspendedPlatformCount (kill-switch hits — alert on non-zero)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, suspendedPlatformCount]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: Tenant
+                field: suspendedPlatformCount
+            - name: status_field
+              help: "Tenant.status.lastReconciled (staleness watchdog)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, lastReconciled]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: Tenant
+                field: lastReconciled
             - name: condition
               help: "Tenant.status.conditions by type + status"
               each:
@@ -186,6 +248,28 @@ customResourceState:
               commonLabels:
                 customresource_kind: BudgetPolicy
                 field: monthlyUsd
+            - name: status_field
+              help: "BudgetPolicy.status.percentOfBudget (0..200+ — the natural alert threshold per policy)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, percentOfBudget]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: BudgetPolicy
+                field: percentOfBudget
+            - name: condition
+              help: "BudgetPolicy.status.conditions (budget controller reconcile health — cap unenforced if stale)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]
+                  labelsFromPath:
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]
+              commonLabels:
+                customresource_kind: BudgetPolicy
 
         # AgentFleet
         - groupVersionKind:
@@ -216,6 +300,28 @@ customResourceState:
               commonLabels:
                 customresource_kind: AgentFleet
                 field: readyAgents
+            - name: condition
+              help: "AgentFleet.status.conditions (downstream kagent rollout truth; phase=Ready can mask degraded)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]
+                  labelsFromPath:
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]
+              commonLabels:
+                customresource_kind: AgentFleet
+            - name: status_field
+              help: "AgentFleet.status.observedGeneration (vs metadata.generation — unapplied spec change)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, observedGeneration]
+                  nilIsZero: true
+              commonLabels:
+                customresource_kind: AgentFleet
+                field: observedGeneration
 
         # EvalSuite
         - groupVersionKind:
@@ -246,3 +352,237 @@ customResourceState:
               commonLabels:
                 customresource_kind: EvalSuite
                 field: lastScore
+            - name: status_field  # spec-side value, exposed under the shared status_field name
+              help: "EvalSuite.spec.passThreshold (0..1) — the required mean; pair with lastScore to alert"
+              each:
+                type: Gauge
+                gauge:
+                  path: [spec, passThreshold]  # NOTE(review): no nilIsZero, unlike the sibling gauges — confirm intentional
+              commonLabels:
+                customresource_kind: EvalSuite
+                field: passThreshold  # selects this gauge within kube_customresource_status_field
+            - name: status_field
+              help: "EvalSuite.status.lastRunAt — freshness watchdog (evals silently stopped)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, lastRunAt]
+                  nilIsZero: true  # emit 0 when the field is unset
+              commonLabels:
+                customresource_kind: EvalSuite
+                field: lastRunAt
+            - name: condition  # one series per entry in status.conditions
+              help: "EvalSuite.status.conditions by type + status (harness executed vs agents passed)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]
+                  labelsFromPath:
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # sample value is taken from the condition's status
+              commonLabels:
+                customresource_kind: EvalSuite
+
+        # ModelGateway — every agent's Bedrock egress flows through it; a broken
+        # Route can be condition=False while phase stays Ready.
+        - groupVersionKind:
+            group: agents.nanohype.dev
+            version: v1alpha1
+            kind: ModelGateway
+          labelsFromPath:  # resource-level labels, carried by every metric of this kind
+            name: [metadata, name]
+            namespace: [metadata, namespace]
+          metricNamePrefix: kube_customresource  # series are kube_customresource_<metric name>
+          metrics:
+            - name: status_phase
+              help: "ModelGateway.status.phase"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_phase  # label that carries the phase name
+                  path: [status, phase]
+                  list: [Pending, Provisioning, Ready, Failed]  # best-effort: phase is a free string in the CRD
+              commonLabels:
+                customresource_kind: ModelGateway
+            - name: condition  # one series per entry in status.conditions
+              help: "ModelGateway.status.conditions by type + status"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]
+                  labelsFromPath:
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # sample value is taken from the condition's status
+              commonLabels:
+                customresource_kind: ModelGateway
+            - name: status_field
+              help: "ModelGateway.status.observedGeneration (reconcile-lag vs metadata.generation)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, observedGeneration]
+                  nilIsZero: true  # emit 0 when the field is unset
+              commonLabels:
+                customresource_kind: ModelGateway
+                field: observedGeneration  # selects this gauge within kube_customresource_status_field
+
+        # AgentSandbox — ephemeral attributable single-session runtime
+        - groupVersionKind:
+            group: agents.nanohype.dev
+            version: v1alpha1
+            kind: AgentSandbox
+          labelsFromPath:  # resource-level labels, carried by every metric of this kind
+            name: [metadata, name]
+            namespace: [metadata, namespace]
+          metricNamePrefix: kube_customresource  # series are kube_customresource_<metric name>
+          metrics:
+            - name: status_phase
+              help: "AgentSandbox.status.phase (run-once session lifecycle)"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_phase  # label that carries the phase name
+                  path: [status, phase]
+                  list: [Pending, Running, Succeeded, Failed, Suspended]  # best-effort: phase is a free string in the CRD
+              commonLabels:
+                customresource_kind: AgentSandbox
+            - name: status_pod_phase  # separate metric from status_phase, with its own label name
+              help: "AgentSandbox.status.podPhase (the session pod's k8s phase — stuck Pending vs run failure)"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_pod_phase
+                  path: [status, podPhase]
+                  list: [Pending, Running, Succeeded, Failed, Unknown]
+              commonLabels:
+                customresource_kind: AgentSandbox
+            - name: condition  # one series per entry in status.conditions
+              help: "AgentSandbox.status.conditions by type + status"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]
+                  labelsFromPath:
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # sample value is taken from the condition's status
+              commonLabels:
+                customresource_kind: AgentSandbox
+            - name: status_field
+              help: "AgentSandbox.status.completedAt (terminal ts; pairs with ttl for leaked-pod alerts)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, completedAt]
+                  nilIsZero: true  # emit 0 while the sandbox has no completedAt
+              commonLabels:
+                customresource_kind: AgentSandbox
+                field: completedAt  # selects this gauge within kube_customresource_status_field
+
+        # SandboxPool — warm-pool of sandbox workers
+        - groupVersionKind:
+            group: agents.nanohype.dev
+            version: v1alpha1
+            kind: SandboxPool
+          labelsFromPath:  # resource-level labels, carried by every metric of this kind
+            name: [metadata, name]
+            namespace: [metadata, namespace]
+          metricNamePrefix: kube_customresource  # series are kube_customresource_<metric name>
+          metrics:
+            - name: status_phase
+              help: "SandboxPool.status.phase"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_phase  # label that carries the phase name
+                  path: [status, phase]
+                  list: [Pending, Ready, Suspended, Failed]  # best-effort: phase is a free string in the CRD
+              commonLabels:
+                customresource_kind: SandboxPool
+            - name: status_field
+              help: "SandboxPool.status.readyWorkers (warm-pool depth — 0 when unpopulated, so under-provision alerts still fire)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, readyWorkers]
+                  nilIsZero: true  # keeps a 0-valued series when status is unpopulated (see help)
+              commonLabels:
+                customresource_kind: SandboxPool
+                field: readyWorkers  # selects this gauge within kube_customresource_status_field
+            - name: condition  # one series per entry in status.conditions
+              help: "SandboxPool.status.conditions by type + status"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]
+                  labelsFromPath:
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # sample value is taken from the condition's status
+              commonLabels:
+                customresource_kind: SandboxPool
+
+        # BatchJob — bulk async agent runs
+        - groupVersionKind:
+            group: agents.nanohype.dev
+            version: v1alpha1
+            kind: BatchJob
+          labelsFromPath:  # resource-level labels, carried by every metric of this kind
+            name: [metadata, name]
+            namespace: [metadata, namespace]
+          metricNamePrefix: kube_customresource  # series are kube_customresource_<metric name>
+          metrics:
+            - name: status_phase
+              help: "BatchJob.status.phase"
+              each:
+                type: StateSet
+                stateSet:
+                  labelName: customresource_phase  # label that carries the phase name
+                  path: [status, phase]
+                  list: [Pending, Provisioning, Running, Succeeded, Failed, Stopped]  # best-effort: phase is a free string in the CRD
+              commonLabels:
+                customresource_kind: BatchJob
+            - name: status_field  # the three count gauges share this name; `field` tells them apart
+              help: "BatchJob.status.failedCount"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, failedCount]
+                  nilIsZero: true  # emit 0 when the field is unset
+              commonLabels:
+                customresource_kind: BatchJob
+                field: failedCount
+            - name: status_field
+              help: "BatchJob.status.succeededCount"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, succeededCount]
+                  nilIsZero: true  # emit 0 when the field is unset
+              commonLabels:
+                customresource_kind: BatchJob
+                field: succeededCount
+            - name: status_field
+              help: "BatchJob.status.recordCount (total records to process)"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, recordCount]
+                  nilIsZero: true  # emit 0 when the field is unset
+              commonLabels:
+                customresource_kind: BatchJob
+                field: recordCount
+            - name: condition  # one series per entry in status.conditions
+              help: "BatchJob.status.conditions by type + status"
+              each:
+                type: Gauge
+                gauge:
+                  path: [status, conditions]
+                  labelsFromPath:
+                    condition_type: [type]
+                    condition_status: [status]
+                  valueFrom: [status]  # sample value is taken from the condition's status
+              commonLabels:
+                customresource_kind: BatchJob