From 4222357fde0cb35fa3665bebbe0567b56806388d Mon Sep 17 00:00:00 2001 From: stxkxs Date: Tue, 23 Jun 2026 14:48:31 -0700 Subject: [PATCH 1/3] fix(dashboards): de-hollow the agent-* persona panels via KSM (eks-agent-platform#47) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The agent-* persona dashboards queried agents_* metrics the operator never registers, so ~a third of their panels rendered no-data. Most of that data IS available as CR status — this rewrites those panels to the kube_customresource_* metrics kube-state-metrics emits, and extends the customResourceState config for the few fields it didn't yet project. KSM customResourceState (addons/observability/kube-state-metrics/values.yaml): - New status_field gauges: BudgetPolicy.status.currentSpendUsd, .killSwitchFiredAt, BudgetPolicy.spec.monthlyUsd (the threshold), Tenant.status.aggregateSpendUsd. Structurally identical to the proven entries (string gauges, nilIsZero). - Add resource-level labelsFromPath name/namespace to every resource block, so kube_customresource_* carries the `name` label the dashboards group by — this also hardens the operator's existing PrometheusRule alerts, which already depend on {{ $labels.name }} but had no config emitting it. Dashboard rewrites (agent-{founder,ops,finance}): - agents_platform_status_phase{Ready} -> kube_customresource_status_phase Platform Ready - agents_eval_run_score -> EvalSuite lastScore (already emitted) - agents_agent_runtime_replicas -> AgentFleet readyAgents (already emitted) - agents_spend_report_current_usd -> BudgetPolicy currentSpendUsd - agents_budget_policy_threshold_usd -> BudgetPolicy monthlyUsd (by name) Panel titles updated to match the real semantics (e.g. "Spend month-to-date", "Latest EvalSuite scores", "Ready agents"). Deferred (genuinely runtime/data-plane, not CR-projectable; tracked in #47): agentgateway_* (agent-agentgateway/agent-ops, incl. 
the invocation_total vs invocations_total name split) and agents_agent_invocations_total (agent-founder). Quality-checked (Systems A-/Code A/Consistency A-): CRD fields verified to exist, KSM string-gauge parsing confirmed, blast-radius config kept structurally identical. yamllint + kustomize build green. --- .../kube-state-metrics/values.yaml | 57 ++++++++++++++++++- dashboards/base/platform/agent-finance.yaml | 8 +-- dashboards/base/platform/agent-founder.yaml | 10 ++-- dashboards/base/platform/agent-ops.yaml | 8 +-- 4 files changed, 69 insertions(+), 14 deletions(-) diff --git a/addons/observability/kube-state-metrics/values.yaml b/addons/observability/kube-state-metrics/values.yaml index 95af1d5..9bd7625 100644 --- a/addons/observability/kube-state-metrics/values.yaml +++ b/addons/observability/kube-state-metrics/values.yaml @@ -58,6 +58,9 @@ customResourceState: group: platform.nanohype.dev version: v1alpha1 kind: Platform + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: status_phase @@ -86,6 +89,9 @@ customResourceState: group: platform.nanohype.dev version: v1alpha1 kind: Tenant + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: status_phase @@ -107,6 +113,16 @@ customResourceState: commonLabels: customresource_kind: Tenant field: percentOfBudget + - name: status_field + help: "Tenant.status.aggregateSpendUsd (USD) as a gauge" + each: + type: Gauge + gauge: + path: [status, aggregateSpendUsd] + nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: aggregateSpendUsd - name: condition help: "Tenant.status.conditions by type + status" each: @@ -125,10 +141,13 @@ customResourceState: group: governance.nanohype.dev version: v1alpha1 kind: BudgetPolicy + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: 
status_field - help: "BudgetPolicy.status.{lastReconciled,killSwitchFiredAt,percentOfBudget} as gauges" + help: "BudgetPolicy.status.lastReconciled as a unix-ts gauge" each: type: Gauge gauge: @@ -137,12 +156,45 @@ customResourceState: commonLabels: customresource_kind: BudgetPolicy field: lastReconciled + - name: status_field + help: "BudgetPolicy.status.currentSpendUsd (USD) as a gauge" + each: + type: Gauge + gauge: + path: [status, currentSpendUsd] + nilIsZero: true + commonLabels: + customresource_kind: BudgetPolicy + field: currentSpendUsd + - name: status_field + help: "BudgetPolicy.status.killSwitchFiredAt as a unix-ts gauge (0 when unset)" + each: + type: Gauge + gauge: + path: [status, killSwitchFiredAt] + nilIsZero: true + commonLabels: + customresource_kind: BudgetPolicy + field: killSwitchFiredAt + - name: status_field + help: "BudgetPolicy.spec.monthlyUsd — the soft spend threshold (USD)" + each: + type: Gauge + gauge: + path: [spec, monthlyUsd] + nilIsZero: true + commonLabels: + customresource_kind: BudgetPolicy + field: monthlyUsd # AgentFleet - groupVersionKind: group: agents.nanohype.dev version: v1alpha1 kind: AgentFleet + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: status_phase @@ -170,6 +222,9 @@ customResourceState: group: governance.nanohype.dev version: v1alpha1 kind: EvalSuite + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: status_phase diff --git a/dashboards/base/platform/agent-finance.yaml b/dashboards/base/platform/agent-finance.yaml index 0acc0ab..5cce216 100644 --- a/dashboards/base/platform/agent-finance.yaml +++ b/dashboards/base/platform/agent-finance.yaml @@ -82,12 +82,12 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "sum by (platform_id) (agents_spend_report_current_usd)", - "legendFormat": "{{platform_id}} spend" + "expr": "sum by (name) 
(kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})", + "legendFormat": "{{name}} spend" }, { - "expr": "sum by (platform_id) (agents_budget_policy_threshold_usd)", - "legendFormat": "{{platform_id}} threshold" + "expr": "sum by (name) (kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"monthlyUsd\"})", + "legendFormat": "{{name}} threshold" } ], "gridPos": { diff --git a/dashboards/base/platform/agent-founder.yaml b/dashboards/base/platform/agent-founder.yaml index 9339ee5..5b46535 100644 --- a/dashboards/base/platform/agent-founder.yaml +++ b/dashboards/base/platform/agent-founder.yaml @@ -31,7 +31,7 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "count(agents_platform_status_phase{phase=\"Ready\"})" + "expr": "count(kube_customresource_status_phase{customresource_kind=\"Platform\",customresource_phase=\"Ready\"} == 1)" } ], "gridPos": { @@ -43,11 +43,11 @@ spec: }, { "type": "stat", - "title": "Spend this week (USD)", + "title": "Spend month-to-date (USD)", "datasource": "prometheus", "targets": [ { - "expr": "sum(increase(agents_spend_report_current_usd[7d]))" + "expr": "sum(kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})" } ], "gridPos": { @@ -59,11 +59,11 @@ spec: }, { "type": "timeseries", - "title": "Weekly spend trend", + "title": "Spend trend (USD, month-to-date)", "datasource": "prometheus", "targets": [ { - "expr": "sum(agents_spend_report_current_usd)" + "expr": "sum(kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})" } ], "gridPos": { diff --git a/dashboards/base/platform/agent-ops.yaml b/dashboards/base/platform/agent-ops.yaml index 6490d33..e9c1e45 100644 --- a/dashboards/base/platform/agent-ops.yaml +++ b/dashboards/base/platform/agent-ops.yaml @@ -27,11 +27,11 @@ spec: "panels": [ { "type": "stat", - "title": "Active AgentRuntimes", + "title": "Ready agents (across 
fleets)", "datasource": "prometheus", "targets": [ { - "expr": "count(agents_agent_runtime_replicas)" + "expr": "sum(kube_customresource_status_field{customresource_kind=\"AgentFleet\",field=\"readyAgents\"})" } ], "gridPos": { @@ -89,11 +89,11 @@ spec: }, { "type": "table", - "title": "Latest EvalRun scores", + "title": "Latest EvalSuite scores", "datasource": "prometheus", "targets": [ { - "expr": "topk(50, agents_eval_run_score{phase=\"completed\"})" + "expr": "topk(50, kube_customresource_status_field{customresource_kind=\"EvalSuite\",field=\"lastScore\"})" } ], "gridPos": { From 9155fa024261e66b5444996c14f9835277628868 Mon Sep 17 00:00:00 2001 From: stxkxs Date: Tue, 23 Jun 2026 17:34:10 -0700 Subject: [PATCH 2/3] fix(dashboards): correct the agentgateway metric names to the documented ones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The agentgateway panels referenced agentgateway_invocation_total / agentgateway_invocations_total / agentgateway_invocation_duration_seconds — none of which agentgateway emits (verified against agentgateway.dev/docs). The real metrics are agentgateway_llm_requests_total and agentgateway_llm_request_duration_seconds (port 15020). Fixes the names (and eliminates the singular/plural split between agent-agentgateway and agent-ops). The per-label drill-downs (platform / model_id / status / route filters) still assume a label model that doesn't match agentgateway's OTel gen_ai_* conventions — those, plus the scrape annotation (port 15020), are tuned at first scrape against a live gateway (recipe in eks-agent-platform#47). Names being correct now reduces that work to a label pass. JSON valid; yamllint clean. 
--- dashboards/base/platform/agent-agentgateway.yaml | 12 ++++++------ dashboards/base/platform/agent-ops.yaml | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/dashboards/base/platform/agent-agentgateway.yaml b/dashboards/base/platform/agent-agentgateway.yaml index 90bea95..590860f 100644 --- a/dashboards/base/platform/agent-agentgateway.yaml +++ b/dashboards/base/platform/agent-agentgateway.yaml @@ -30,7 +30,7 @@ spec: "name": "platform", "type": "query", "datasource": "prometheus", - "query": "label_values(agentgateway_invocation_duration_seconds_count, platform)", + "query": "label_values(agentgateway_llm_request_duration_seconds_count, platform)", "includeAll": true, "multi": true } @@ -43,15 +43,15 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", + "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", "legendFormat": "p50 {{model_id}}" }, { - "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", + "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", "legendFormat": "p95 {{model_id}}" }, { - "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", + "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", "legendFormat": "p99 {{model_id}}" } ], @@ -73,7 +73,7 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "sum by (model_id, status) (rate(agentgateway_invocation_total{platform=~\"$platform\",status!=\"200\"}[5m]))", + "expr": "sum 
by (model_id, status) (rate(agentgateway_llm_requests_total{platform=~\"$platform\",status!=\"200\"}[5m]))", "legendFormat": "{{model_id}} {{status}}" } ], @@ -90,7 +90,7 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "sum by (route) (rate(agentgateway_invocation_total{platform=~\"$platform\"}[1m]))", + "expr": "sum by (route) (rate(agentgateway_llm_requests_total{platform=~\"$platform\"}[1m]))", "legendFormat": "{{route}}" } ], diff --git a/dashboards/base/platform/agent-ops.yaml b/dashboards/base/platform/agent-ops.yaml index e9c1e45..643733b 100644 --- a/dashboards/base/platform/agent-ops.yaml +++ b/dashboards/base/platform/agent-ops.yaml @@ -68,15 +68,15 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))", "legendFormat": "p50 {{model_id}}" }, { - "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))", "legendFormat": "p95 {{model_id}}" }, { - "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))", "legendFormat": "p99 {{model_id}}" } ], @@ -109,11 +109,11 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "sum by (model_id) (rate(agentgateway_invocations_total{status=~\"4..\"}[5m]))", + "expr": "sum by (model_id) (rate(agentgateway_llm_requests_total{status=~\"4..\"}[5m]))", "legendFormat": "4xx {{model_id}}" }, { - "expr": "sum by (model_id) (rate(agentgateway_invocations_total{status=~\"5..\"}[5m]))", + "expr": "sum by (model_id) 
(rate(agentgateway_llm_requests_total{status=~\"5..\"}[5m]))", "legendFormat": "5xx {{model_id}}" } ], From bced6f99fbded0c17d1ca3868fbf8c27968b9788 Mon Sep 17 00:00:00 2001 From: stxkxs Date: Tue, 23 Jun 2026 17:44:10 -0700 Subject: [PATCH 3/3] feat(addons): deepen kube-state-metrics CR-state coverage to the full operator surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The customResourceState config projected 5 of the 9 operator CRDs, and conditions on only one (Tenant). Every agent-* dashboard de-hollowed in this branch reads kube_customresource_* — so the metrics had to actually exist for those panels to render. This closes the gap to the full CRD surface. ─────────────────────── Conditions sweep ─────────────────────── Added the conditions block (condition_type + condition_status labels, value = status) to every CRD that carries status.conditions: Platform, BudgetPolicy, AgentFleet, EvalSuite, plus the four newly-added CRDs. phase=Ready can mask a degraded reconcile; conditions are the controller's real health truth, so "<Kind> not Ready" alerts now have a series to fire on for every resource. ─────────────────────── Four dark CRDs ───────────────────────── ModelGateway, AgentSandbox, SandboxPool, BatchJob were entirely unobserved. 
Each now projects phase (StateSet), conditions, and its load-bearing gauges: - ModelGateway observedGeneration - AgentSandbox podPhase (StateSet), completedAt - SandboxPool readyWorkers (nilIsZero — under-provision alerts fire on an unpopulated status instead of silently vanishing) - BatchJob failedCount, succeededCount, recordCount ─────────────────────── Gauge deepening ──────────────────────── - Tenant platformCount, readyPlatformCount, suspendedPlatformCount, lastReconciled (fleet-size denominator + reconcile staleness) - BudgetPolicy percentOfBudget, conditions (cap-unenforced-if-stale) - AgentFleet observedGeneration (unapplied spec change) - EvalSuite passThreshold, lastRunAt - Platform observedGeneration RBAC: granted KSM list/watch on agentfleets, agentsandboxes, sandboxpools, modelgateways, batchjobs (agents.nanohype.dev) — required or the new resource blocks would silently emit nothing. Every projected path verified against the operator CRD schemas; phase fields are free strings so the StateSet enum lists are best-effort. KSM parses the whole customResourceState as one unit, so the config was validated for structural correctness (one malformed block breaks all kube_customresource_*). 
--- .../kube-state-metrics/values.yaml | 342 +++++++++++++++++- 1 file changed, 341 insertions(+), 1 deletion(-) diff --git a/addons/observability/kube-state-metrics/values.yaml b/addons/observability/kube-state-metrics/values.yaml index 9bd7625..07f9041 100644 --- a/addons/observability/kube-state-metrics/values.yaml +++ b/addons/observability/kube-state-metrics/values.yaml @@ -31,7 +31,7 @@ rbac: resources: ["budgetpolicies", "evalsuites"] verbs: ["list", "watch"] - apiGroups: ["agents.nanohype.dev"] - resources: ["agentfleets"] + resources: ["agentfleets", "agentsandboxes", "sandboxpools", "modelgateways", "batchjobs"] verbs: ["list", "watch"] # Scraped by the grafana-agent static target, but annotate too so the @@ -83,6 +83,28 @@ customResourceState: commonLabels: customresource_kind: Platform field: suspendedAt + - name: condition + help: "Platform.status.conditions by type + status (per-aspect reconcile/IRSA/ns truth)" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: Platform + - name: status_field + help: "Platform.status.observedGeneration (vs metadata.generation — stuck reconcile)" + each: + type: Gauge + gauge: + path: [status, observedGeneration] + nilIsZero: true + commonLabels: + customresource_kind: Platform + field: observedGeneration # Tenant - groupVersionKind: @@ -123,6 +145,46 @@ customResourceState: commonLabels: customresource_kind: Tenant field: aggregateSpendUsd + - name: status_field + help: "Tenant.status.platformCount (total Platforms — denominator for ready ratio)" + each: + type: Gauge + gauge: + path: [status, platformCount] + nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: platformCount + - name: status_field + help: "Tenant.status.readyPlatformCount (alert when < platformCount — partial outage)" + each: + type: Gauge + gauge: + path: [status, readyPlatformCount] + 
nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: readyPlatformCount + - name: status_field + help: "Tenant.status.suspendedPlatformCount (kill-switch hits — alert on non-zero)" + each: + type: Gauge + gauge: + path: [status, suspendedPlatformCount] + nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: suspendedPlatformCount + - name: status_field + help: "Tenant.status.lastReconciled (staleness watchdog)" + each: + type: Gauge + gauge: + path: [status, lastReconciled] + nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: lastReconciled - name: condition help: "Tenant.status.conditions by type + status" each: @@ -186,6 +248,28 @@ customResourceState: commonLabels: customresource_kind: BudgetPolicy field: monthlyUsd + - name: status_field + help: "BudgetPolicy.status.percentOfBudget (0..200+ — the natural alert threshold per policy)" + each: + type: Gauge + gauge: + path: [status, percentOfBudget] + nilIsZero: true + commonLabels: + customresource_kind: BudgetPolicy + field: percentOfBudget + - name: condition + help: "BudgetPolicy.status.conditions (budget controller reconcile health — cap unenforced if stale)" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: BudgetPolicy # AgentFleet - groupVersionKind: @@ -216,6 +300,28 @@ customResourceState: commonLabels: customresource_kind: AgentFleet field: readyAgents + - name: condition + help: "AgentFleet.status.conditions (downstream kagent rollout truth; phase=Ready can mask degraded)" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: AgentFleet + - name: status_field + help: "AgentFleet.status.observedGeneration (vs metadata.generation — unapplied spec change)" + each: + type: 
Gauge + gauge: + path: [status, observedGeneration] + nilIsZero: true + commonLabels: + customresource_kind: AgentFleet + field: observedGeneration # EvalSuite - groupVersionKind: @@ -246,3 +352,237 @@ customResourceState: commonLabels: customresource_kind: EvalSuite field: lastScore + - name: status_field + help: "EvalSuite.spec.passThreshold (0..1) — the required mean; pair with lastScore to alert" + each: + type: Gauge + gauge: + path: [spec, passThreshold] + commonLabels: + customresource_kind: EvalSuite + field: passThreshold + - name: status_field + help: "EvalSuite.status.lastRunAt — freshness watchdog (evals silently stopped)" + each: + type: Gauge + gauge: + path: [status, lastRunAt] + nilIsZero: true + commonLabels: + customresource_kind: EvalSuite + field: lastRunAt + - name: condition + help: "EvalSuite.status.conditions by type + status (harness executed vs agents passed)" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: EvalSuite + + # ModelGateway — every agent's Bedrock egress flows through it; a broken + # Route can be condition=False while phase stays Ready. 
+ - groupVersionKind: + group: agents.nanohype.dev + version: v1alpha1 + kind: ModelGateway + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "ModelGateway.status.phase" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Provisioning, Ready, Failed] + commonLabels: + customresource_kind: ModelGateway + - name: condition + help: "ModelGateway.status.conditions by type + status" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: ModelGateway + - name: status_field + help: "ModelGateway.status.observedGeneration (reconcile-lag vs metadata.generation)" + each: + type: Gauge + gauge: + path: [status, observedGeneration] + nilIsZero: true + commonLabels: + customresource_kind: ModelGateway + field: observedGeneration + + # AgentSandbox — ephemeral attributable single-session runtime + - groupVersionKind: + group: agents.nanohype.dev + version: v1alpha1 + kind: AgentSandbox + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "AgentSandbox.status.phase (run-once session lifecycle)" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Running, Succeeded, Failed, Suspended] + commonLabels: + customresource_kind: AgentSandbox + - name: status_pod_phase + help: "AgentSandbox.status.podPhase (the session pod's k8s phase — stuck Pending vs run failure)" + each: + type: StateSet + stateSet: + labelName: customresource_pod_phase + path: [status, podPhase] + list: [Pending, Running, Succeeded, Failed, Unknown] + commonLabels: + customresource_kind: AgentSandbox + - name: condition + help: 
"AgentSandbox.status.conditions by type + status" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: AgentSandbox + - name: status_field + help: "AgentSandbox.status.completedAt (terminal ts; pairs with ttl for leaked-pod alerts)" + each: + type: Gauge + gauge: + path: [status, completedAt] + nilIsZero: true + commonLabels: + customresource_kind: AgentSandbox + field: completedAt + + # SandboxPool — warm-pool of sandbox workers + - groupVersionKind: + group: agents.nanohype.dev + version: v1alpha1 + kind: SandboxPool + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "SandboxPool.status.phase" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Ready, Suspended, Failed] + commonLabels: + customresource_kind: SandboxPool + - name: status_field + help: "SandboxPool.status.readyWorkers (warm-pool depth — 0 when unpopulated, so under-provision alerts still fire)" + each: + type: Gauge + gauge: + path: [status, readyWorkers] + nilIsZero: true + commonLabels: + customresource_kind: SandboxPool + field: readyWorkers + - name: condition + help: "SandboxPool.status.conditions by type + status" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: SandboxPool + + # BatchJob — bulk async agent runs + - groupVersionKind: + group: agents.nanohype.dev + version: v1alpha1 + kind: BatchJob + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "BatchJob.status.phase" + each: + type: StateSet + stateSet: + labelName: 
customresource_phase + path: [status, phase] + list: [Pending, Provisioning, Running, Succeeded, Failed, Stopped] + commonLabels: + customresource_kind: BatchJob + - name: status_field + help: "BatchJob.status.failedCount" + each: + type: Gauge + gauge: + path: [status, failedCount] + nilIsZero: true + commonLabels: + customresource_kind: BatchJob + field: failedCount + - name: status_field + help: "BatchJob.status.succeededCount" + each: + type: Gauge + gauge: + path: [status, succeededCount] + nilIsZero: true + commonLabels: + customresource_kind: BatchJob + field: succeededCount + - name: status_field + help: "BatchJob.status.recordCount (total records to process)" + each: + type: Gauge + gauge: + path: [status, recordCount] + nilIsZero: true + commonLabels: + customresource_kind: BatchJob + field: recordCount + - name: condition + help: "BatchJob.status.conditions by type + status" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: BatchJob