diff --git a/addons/observability/kube-state-metrics/values.yaml b/addons/observability/kube-state-metrics/values.yaml index 95af1d5..07f9041 100644 --- a/addons/observability/kube-state-metrics/values.yaml +++ b/addons/observability/kube-state-metrics/values.yaml @@ -31,7 +31,7 @@ rbac: resources: ["budgetpolicies", "evalsuites"] verbs: ["list", "watch"] - apiGroups: ["agents.nanohype.dev"] - resources: ["agentfleets"] + resources: ["agentfleets", "agentsandboxes", "sandboxpools", "modelgateways", "batchjobs"] verbs: ["list", "watch"] # Scraped by the grafana-agent static target, but annotate too so the @@ -58,6 +58,9 @@ customResourceState: group: platform.nanohype.dev version: v1alpha1 kind: Platform + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: status_phase @@ -80,12 +83,37 @@ customResourceState: commonLabels: customresource_kind: Platform field: suspendedAt + - name: condition + help: "Platform.status.conditions by type + status (per-aspect reconcile/IRSA/ns truth)" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: Platform + - name: status_field + help: "Platform.status.observedGeneration (vs metadata.generation — stuck reconcile)" + each: + type: Gauge + gauge: + path: [status, observedGeneration] + nilIsZero: true + commonLabels: + customresource_kind: Platform + field: observedGeneration # Tenant - groupVersionKind: group: platform.nanohype.dev version: v1alpha1 kind: Tenant + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: status_phase @@ -107,6 +135,56 @@ customResourceState: commonLabels: customresource_kind: Tenant field: percentOfBudget + - name: status_field + help: "Tenant.status.aggregateSpendUsd (USD) as a gauge" + each: + type: Gauge + gauge: + path: [status, aggregateSpendUsd] + nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: aggregateSpendUsd + - name: status_field + help: "Tenant.status.platformCount (total Platforms — denominator for ready ratio)" + each: + type: Gauge + gauge: + path: [status, platformCount] + nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: platformCount + - name: status_field + help: "Tenant.status.readyPlatformCount (alert when < platformCount — partial outage)" + each: + type: Gauge + gauge: + path: [status, readyPlatformCount] + nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: readyPlatformCount + - name: status_field + help: "Tenant.status.suspendedPlatformCount (kill-switch hits — alert on non-zero)" + each: + type: Gauge + gauge: + path: [status, suspendedPlatformCount] + nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: suspendedPlatformCount + - name: status_field + help: "Tenant.status.lastReconciled (staleness watchdog)" + each: + type: Gauge + gauge: + path: [status, lastReconciled] + nilIsZero: true + commonLabels: + customresource_kind: Tenant + field: lastReconciled - name: condition help: "Tenant.status.conditions by type + status" each: @@ -125,10 +203,13 @@ customResourceState: group: governance.nanohype.dev version: v1alpha1 kind: BudgetPolicy + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: status_field - help: "BudgetPolicy.status.{lastReconciled,killSwitchFiredAt,percentOfBudget} as gauges" + help: "BudgetPolicy.status.lastReconciled as a unix-ts gauge" each: type: Gauge gauge: @@ -137,12 +218,67 @@ customResourceState: commonLabels: customresource_kind: BudgetPolicy field: lastReconciled + - name: status_field + help: "BudgetPolicy.status.currentSpendUsd (USD) as a gauge" + each: + type: Gauge + gauge: + path: [status, currentSpendUsd] + nilIsZero: true + commonLabels: + customresource_kind: BudgetPolicy + field: currentSpendUsd + - name: status_field + help: "BudgetPolicy.status.killSwitchFiredAt as a unix-ts gauge (0 when unset)" + each: + type: Gauge + gauge: + path: [status, killSwitchFiredAt] + nilIsZero: true + commonLabels: + customresource_kind: BudgetPolicy + field: killSwitchFiredAt + - name: status_field + help: "BudgetPolicy.spec.monthlyUsd — the soft spend threshold (USD)" + each: + type: Gauge + gauge: + path: [spec, monthlyUsd] + nilIsZero: true + commonLabels: + customresource_kind: BudgetPolicy + field: monthlyUsd + - name: status_field + help: "BudgetPolicy.status.percentOfBudget (0..200+ — the natural alert threshold per policy)" + each: + type: Gauge + gauge: + path: [status, percentOfBudget] + nilIsZero: true + commonLabels: + customresource_kind: BudgetPolicy + field: percentOfBudget + - name: condition + help: "BudgetPolicy.status.conditions (budget controller reconcile health — cap unenforced if stale)" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: BudgetPolicy # AgentFleet - groupVersionKind: group: agents.nanohype.dev version: v1alpha1 kind: AgentFleet + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: status_phase @@ -164,12 +300,37 @@ customResourceState: commonLabels: customresource_kind: AgentFleet field: readyAgents + - name: condition + help: "AgentFleet.status.conditions (downstream kagent rollout truth; phase=Ready can mask degraded)" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: AgentFleet + - name: status_field + help: "AgentFleet.status.observedGeneration (vs metadata.generation — unapplied spec change)" + each: + type: Gauge + gauge: + path: [status, observedGeneration] + nilIsZero: true + commonLabels: + customresource_kind: AgentFleet + field: observedGeneration # EvalSuite - groupVersionKind: group: governance.nanohype.dev version: v1alpha1 kind: EvalSuite + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] metricNamePrefix: kube_customresource metrics: - name: status_phase @@ -191,3 +352,237 @@ customResourceState: commonLabels: customresource_kind: EvalSuite field: lastScore + - name: status_field + help: "EvalSuite.spec.passThreshold (0..1) — the required mean; pair with lastScore to alert" + each: + type: Gauge + gauge: + path: [spec, passThreshold] + commonLabels: + customresource_kind: EvalSuite + field: passThreshold + - name: status_field + help: "EvalSuite.status.lastRunAt — freshness watchdog (evals silently stopped)" + each: + type: Gauge + gauge: + path: [status, lastRunAt] + nilIsZero: true + commonLabels: + customresource_kind: EvalSuite + field: lastRunAt + - name: condition + help: "EvalSuite.status.conditions by type + status (harness executed vs agents passed)" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: EvalSuite + + # ModelGateway — every agent's Bedrock egress flows through it; a broken + # Route can be condition=False while phase stays Ready. + - groupVersionKind: + group: agents.nanohype.dev + version: v1alpha1 + kind: ModelGateway + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "ModelGateway.status.phase" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Provisioning, Ready, Failed] + commonLabels: + customresource_kind: ModelGateway + - name: condition + help: "ModelGateway.status.conditions by type + status" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: ModelGateway + - name: status_field + help: "ModelGateway.status.observedGeneration (reconcile-lag vs metadata.generation)" + each: + type: Gauge + gauge: + path: [status, observedGeneration] + nilIsZero: true + commonLabels: + customresource_kind: ModelGateway + field: observedGeneration + + # AgentSandbox — ephemeral attributable single-session runtime + - groupVersionKind: + group: agents.nanohype.dev + version: v1alpha1 + kind: AgentSandbox + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "AgentSandbox.status.phase (run-once session lifecycle)" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Running, Succeeded, Failed, Suspended] + commonLabels: + customresource_kind: AgentSandbox + - name: status_pod_phase + help: "AgentSandbox.status.podPhase (the session pod's k8s phase — stuck Pending vs run failure)" + each: + type: StateSet + stateSet: + labelName: customresource_pod_phase + path: [status, podPhase] + list: [Pending, Running, Succeeded, Failed, Unknown] + commonLabels: + customresource_kind: AgentSandbox + - name: condition + help: "AgentSandbox.status.conditions by type + status" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: AgentSandbox + - name: status_field + help: "AgentSandbox.status.completedAt (terminal ts; pairs with ttl for leaked-pod alerts)" + each: + type: Gauge + gauge: + path: [status, completedAt] + nilIsZero: true + commonLabels: + customresource_kind: AgentSandbox + field: completedAt + + # SandboxPool — warm-pool of sandbox workers + - groupVersionKind: + group: agents.nanohype.dev + version: v1alpha1 + kind: SandboxPool + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "SandboxPool.status.phase" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Ready, Suspended, Failed] + commonLabels: + customresource_kind: SandboxPool + - name: status_field + help: "SandboxPool.status.readyWorkers (warm-pool depth — 0 when unpopulated, so under-provision alerts still fire)" + each: + type: Gauge + gauge: + path: [status, readyWorkers] + nilIsZero: true + commonLabels: + customresource_kind: SandboxPool + field: readyWorkers + - name: condition + help: "SandboxPool.status.conditions by type + status" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: SandboxPool + + # BatchJob — bulk async agent runs + - groupVersionKind: + group: agents.nanohype.dev + version: v1alpha1 + kind: BatchJob + labelsFromPath: + name: [metadata, name] + namespace: [metadata, namespace] + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "BatchJob.status.phase" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Provisioning, Running, Succeeded, Failed, Stopped] + commonLabels: + customresource_kind: BatchJob + - name: status_field + help: "BatchJob.status.failedCount" + each: + type: Gauge + gauge: + path: [status, failedCount] + nilIsZero: true + commonLabels: + customresource_kind: BatchJob + field: failedCount + - name: status_field + help: "BatchJob.status.succeededCount" + each: + type: Gauge + gauge: + path: [status, succeededCount] + nilIsZero: true + commonLabels: + customresource_kind: BatchJob + field: succeededCount + - name: status_field + help: "BatchJob.status.recordCount (total records to process)" + each: + type: Gauge + gauge: + path: [status, recordCount] + nilIsZero: true + commonLabels: + customresource_kind: BatchJob + field: recordCount + - name: condition + help: "BatchJob.status.conditions by type + status" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: BatchJob diff --git a/dashboards/base/platform/agent-agentgateway.yaml b/dashboards/base/platform/agent-agentgateway.yaml index 90bea95..590860f 100644 --- a/dashboards/base/platform/agent-agentgateway.yaml +++ b/dashboards/base/platform/agent-agentgateway.yaml @@ -30,7 +30,7 @@ spec: "name": "platform", "type": "query", "datasource": "prometheus", - "query": "label_values(agentgateway_invocation_duration_seconds_count, platform)", + "query": "label_values(agentgateway_llm_request_duration_seconds_count, platform)", "includeAll": true, "multi": true } @@ -43,15 +43,15 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", + "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", "legendFormat": "p50 {{model_id}}" }, { - "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", + "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", "legendFormat": "p95 {{model_id}}" }, { - "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", + "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket{platform=~\"$platform\"}[5m])))", "legendFormat": "p99 {{model_id}}" } ], @@ -73,7 +73,7 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "sum by (model_id, status) (rate(agentgateway_invocation_total{platform=~\"$platform\",status!=\"200\"}[5m]))", + "expr": "sum by (model_id, status) (rate(agentgateway_llm_requests_total{platform=~\"$platform\",status!=\"200\"}[5m]))", "legendFormat": "{{model_id}} {{status}}" } ], @@ -90,7 +90,7 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "sum by (route) (rate(agentgateway_invocation_total{platform=~\"$platform\"}[1m]))", + "expr": "sum by (route) (rate(agentgateway_llm_requests_total{platform=~\"$platform\"}[1m]))", "legendFormat": "{{route}}" } ], diff --git a/dashboards/base/platform/agent-finance.yaml b/dashboards/base/platform/agent-finance.yaml index 0acc0ab..5cce216 100644 --- a/dashboards/base/platform/agent-finance.yaml +++ b/dashboards/base/platform/agent-finance.yaml @@ -82,12 +82,12 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "sum by (platform_id) (agents_spend_report_current_usd)", - "legendFormat": "{{platform_id}} spend" + "expr": "sum by (name) (kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})", + "legendFormat": "{{name}} spend" }, { - "expr": "sum by (platform_id) (agents_budget_policy_threshold_usd)", - "legendFormat": "{{platform_id}} threshold" + "expr": "sum by (name) (kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"monthlyUsd\"})", + "legendFormat": "{{name}} threshold" } ], "gridPos": { diff --git a/dashboards/base/platform/agent-founder.yaml b/dashboards/base/platform/agent-founder.yaml index 9339ee5..5b46535 100644 --- a/dashboards/base/platform/agent-founder.yaml +++ b/dashboards/base/platform/agent-founder.yaml @@ -31,7 +31,7 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "count(agents_platform_status_phase{phase=\"Ready\"})" + "expr": "count(kube_customresource_status_phase{customresource_kind=\"Platform\",customresource_phase=\"Ready\"} == 1)" } ], "gridPos": { @@ -43,11 +43,11 @@ spec: }, { "type": "stat", - "title": "Spend this week (USD)", + "title": "Spend month-to-date (USD)", "datasource": "prometheus", "targets": [ { - "expr": "sum(increase(agents_spend_report_current_usd[7d]))" + "expr": "sum(kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})" } ], "gridPos": { @@ -59,11 +59,11 @@ spec: }, { "type": "timeseries", - "title": "Weekly spend trend", + "title": "Spend trend (USD, month-to-date)", "datasource": "prometheus", "targets": [ { - "expr": "sum(agents_spend_report_current_usd)" + "expr": "sum(kube_customresource_status_field{customresource_kind=\"BudgetPolicy\",field=\"currentSpendUsd\"})" } ], "gridPos": { diff --git a/dashboards/base/platform/agent-ops.yaml b/dashboards/base/platform/agent-ops.yaml index 6490d33..643733b 100644 --- a/dashboards/base/platform/agent-ops.yaml +++ b/dashboards/base/platform/agent-ops.yaml @@ -27,11 +27,11 @@ spec: "panels": [ { "type": "stat", - "title": "Active AgentRuntimes", + "title": "Ready agents (across fleets)", "datasource": "prometheus", "targets": [ { - "expr": "count(agents_agent_runtime_replicas)" + "expr": "sum(kube_customresource_status_field{customresource_kind=\"AgentFleet\",field=\"readyAgents\"})" } ], "gridPos": { @@ -68,15 +68,15 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.50, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))", "legendFormat": "p50 {{model_id}}" }, { - "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.95, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))", "legendFormat": "p95 {{model_id}}" }, { - "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_invocation_duration_seconds_bucket[5m])))", + "expr": "histogram_quantile(0.99, sum by (le, model_id) (rate(agentgateway_llm_request_duration_seconds_bucket[5m])))", "legendFormat": "p99 {{model_id}}" } ], @@ -89,11 +89,11 @@ spec: }, { "type": "table", - "title": "Latest EvalRun scores", + "title": "Latest EvalSuite scores", "datasource": "prometheus", "targets": [ { - "expr": "topk(50, agents_eval_run_score{phase=\"completed\"})" + "expr": "topk(50, kube_customresource_status_field{customresource_kind=\"EvalSuite\",field=\"lastScore\"})" } ], "gridPos": { @@ -109,11 +109,11 @@ spec: "datasource": "prometheus", "targets": [ { - "expr": "sum by (model_id) (rate(agentgateway_invocations_total{status=~\"4..\"}[5m]))", + "expr": "sum by (model_id) (rate(agentgateway_llm_requests_total{status=~\"4..\"}[5m]))", "legendFormat": "4xx {{model_id}}" }, { - "expr": "sum by (model_id) (rate(agentgateway_invocations_total{status=~\"5..\"}[5m]))", + "expr": "sum by (model_id) (rate(agentgateway_llm_requests_total{status=~\"5..\"}[5m]))", "legendFormat": "5xx {{model_id}}" } ],