From 52bce612d51218342980817bff4fd9a20a5516db Mon Sep 17 00:00:00 2001 From: hyeokjun32 Date: Sun, 17 May 2026 00:49:00 +0900 Subject: [PATCH] feat: add agent runtime reliability report --- README.ko.md | 5 + README.md | 6 + .../agent_runtime_reliability_report.md | 77 +++ .../agent_3_orchestration_summary.json | 69 +++ .../aiguard_runtime_guard_analysis.json | 57 +++ inferedgelab/cli.py | 4 + inferedgelab/commands/agent_runtime_report.py | 70 +++ inferedgelab/services/agent_runtime_report.py | 459 ++++++++++++++++++ tests/test_agent_runtime_report.py | 161 ++++++ 9 files changed, 908 insertions(+) create mode 100644 docs/portfolio/agent_runtime_reliability_report.md create mode 100644 examples/agent_runtime/agent_3_orchestration_summary.json create mode 100644 examples/agent_runtime/aiguard_runtime_guard_analysis.json create mode 100644 inferedgelab/commands/agent_runtime_report.py create mode 100644 inferedgelab/services/agent_runtime_report.py create mode 100644 tests/test_agent_runtime_report.py diff --git a/README.ko.md b/README.ko.md index bdfad63..daf1c84 100644 --- a/README.ko.md +++ b/README.ko.md @@ -58,12 +58,17 @@ Recommended demo flow: ```bash poetry run inferedgelab demo-evidence-summary poetry run inferedgelab demo-evidence-summary --format json +poetry run inferedgelab agent-runtime-report \ + --orchestration-summary examples/agent_runtime/agent_3_orchestration_summary.json \ + --guard-analysis examples/agent_runtime/aiguard_runtime_guard_analysis.json poetry run inferedgelab export-demo-evidence --output reports/studio_demo_evidence.md ``` Load Demo Evidence는 bundled ONNX Runtime CPU / TensorRT Jetson result fixture를 불러오고, Run / Import / Jetson Helper는 기존 CLI/API workflow를 local UI로 확장하는 보조 기능입니다. Studio evidence와 jobs는 in-memory이며 local server process가 재시작되면 초기화됩니다. +`agent-runtime-report`는 Orchestrator scheduling evidence와 AIGuard runtime reliability `guard_analysis`를 Lab-owned agent deployment decision context로 묶는 additive report path입니다. 기존 Runtime result나 compare contract는 변경하지 않습니다. + ## 이 레포의 역할 - Runtime benchmark/result JSON을 읽어 compare/report를 생성합니다. diff --git a/README.md b/README.md index 458fc4c..b4befb2 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,9 @@ poetry run inferedgelab demo-evidence-summary poetry run inferedgelab demo-evidence-summary --format json poetry run inferedgelab portfolio-demo-check poetry run inferedgelab core4-conformance-check +poetry run inferedgelab agent-runtime-report \ + --orchestration-summary examples/agent_runtime/agent_3_orchestration_summary.json \ + --guard-analysis examples/agent_runtime/aiguard_runtime_guard_analysis.json poetry run inferedgelab export-demo-evidence --output reports/studio_demo_evidence.md ``` @@ -122,6 +125,9 @@ It validates the committed Studio fixtures, expected README/PPT metrics, portfol It validates the bundled Forge manifest/metadata fixture, Runtime result JSON, Lab compare/deployment decision surface, and AIGuard `guard_analysis` evidence without mutating existing schemas. The Lab decision surface now also exposes `policy_version`, `triggered_rules`, and `policy_summary` so reviewers can see which local policy rules produced deploy/review/block/unknown outcomes. +`agent-runtime-report` is an additive reliable edge agent runtime report path. +It bundles Orchestrator scheduling evidence and AIGuard runtime reliability `guard_analysis` into a Lab-owned agent deployment decision context without changing existing Runtime result or compare contracts. + ![InferEdge Local Studio demo evidence](assets/images/local-studio-demo-evidence.png) Verified demo fixture values: diff --git a/docs/portfolio/agent_runtime_reliability_report.md b/docs/portfolio/agent_runtime_reliability_report.md new file mode 100644 index 0000000..d74cf19 --- /dev/null +++ b/docs/portfolio/agent_runtime_reliability_report.md @@ -0,0 +1,77 @@ +# Agent Runtime Reliability Report + +## Scope + +This report is the first Lab-side bundle view for the reliable edge agent +runtime path. + +It connects: + +- Forge `agent_manifest.json` metadata +- Runtime `result.agent` metadata +- Orchestrator `inferedge-orchestration-summary-v1` +- AIGuard `inferedge-aiguard-diagnosis-v1` +- Lab-owned agent deployment decision context + +This is a local-first report path. It is not a production cloud orchestration +dashboard and does not add DB/queue/auth/billing behavior. + +## Demo Bundle + +Committed lightweight fixtures: + +- `examples/agent_runtime/agent_3_orchestration_summary.json` +- `examples/agent_runtime/aiguard_runtime_guard_analysis.json` + +Generate a Markdown report: + +```bash +poetry run inferedgelab agent-runtime-report \ + --orchestration-summary examples/agent_runtime/agent_3_orchestration_summary.json \ + --guard-analysis examples/agent_runtime/aiguard_runtime_guard_analysis.json \ + --format markdown \ + --output reports/agent_runtime_reliability_report.md +``` + +## Evidence Summary + +| Evidence | Value | +|---|---:| +| executed_count | 10 | +| dropped_count | 14 | +| deadline_missed_count | 1 | +| fallback_count | 14 | +| drop_rate | 0.583333 | +| fallback_rate | 0.583333 | +| deadline_miss_rate | 0.1 | +| queue_backlog_policy_decision_count | 1 | + +## Lab Decision Context + +Expected decision: + +```text +blocked +``` + +Primary reason: + +```text +Agent runtime reliability evidence indicates blocked deployment risk. +``` + +Triggered rules: + +- `guard_blocked_runtime_block` +- `drop_rate_block` +- `fallback_rate_block` +- `deadline_miss_review` +- `queue_backlog_review` + +## Boundary + +- Orchestrator records scheduling and policy evidence. +- AIGuard explains runtime reliability risk. +- Lab remains the final deployment decision owner. +- This report is an additive agent-runtime path and does not change existing + Runtime result, compare output, or classic deployment decision contracts. diff --git a/examples/agent_runtime/agent_3_orchestration_summary.json b/examples/agent_runtime/agent_3_orchestration_summary.json new file mode 100644 index 0000000..898891b --- /dev/null +++ b/examples/agent_runtime/agent_3_orchestration_summary.json @@ -0,0 +1,69 @@ +{ + "schema_version": "inferedge-orchestration-summary-v1", + "agent_runtime_summary": { + "schema_version": "inferedge-orchestration-summary-v1", + "source_contracts": { + "forge_agent_manifest": "inferedge-agent-manifest-v1", + "runtime_agent_result": "inferedge-runtime-agent-task-v1" + }, + "agents": { + "safety_monitor_agent": { + "agent_id": "safety_monitor_agent", + "agent_type": "safety", + "priority": 100, + "latency_budget_ms": 20.0, + "fallback_policy": "protect", + "task_id": "task_safety_monitor_agent" + }, + "vision_agent": { + "agent_id": "vision_agent", + "agent_type": "vision", + "priority": 90, + "latency_budget_ms": 33.0, + "fallback_policy": "drop_stale", + "task_id": "task_vision_agent" + }, + "voice_command_agent": { + "agent_id": "voice_command_agent", + "agent_type": "voice", + "priority": 50, + "latency_budget_ms": 120.0, + "fallback_policy": "defer", + "task_id": "task_voice_command_agent" + } + }, + "totals": { + "executed_count": 10, + "dropped_count": 14, + "deadline_missed_count": 1, + "fallback_count": 14, + "policy_decision_count": 14, + "overload_event_count": 14 + } + }, + "policy_decision_log": [ + { + "agent_id": "vision_agent", + "task_id": "task_vision_agent", + "decision": "load_shedding", + "reason": "queue_backlog_threshold_exceeded", + "fallback_used": true, + "protected_agent_id": "safety_monitor_agent" + } + ], + "drop_events": [ + { + "agent_id": "vision_agent", + "task_id": "task_vision_agent", + "reason": "load_shedding_backlog_threshold_exceeded" + } + ], + "overload_events": [ + { + "agent_id": "vision_agent", + "task_id": "task_vision_agent", + "fallback_used": true, + "reason": "queue_backlog_threshold_exceeded" + } + ] +} diff --git a/examples/agent_runtime/aiguard_runtime_guard_analysis.json b/examples/agent_runtime/aiguard_runtime_guard_analysis.json new file mode 100644 index 0000000..65de9b6 --- /dev/null +++ b/examples/agent_runtime/aiguard_runtime_guard_analysis.json @@ -0,0 +1,57 @@ +{ + "schema_version": "inferedge-aiguard-diagnosis-v1", + "source": { + "orchestration_summary_schema_version": "inferedge-orchestration-summary-v1" + }, + "guard_verdict": "blocked", + "severity": "high", + "confidence": 0.88, + "primary_reason": "drop_rate indicates runtime reliability risk under orchestrated multi-agent load.", + "evidence": [ + { + "type": "excessive_drop_rate", + "metric_name": "drop_rate", + "observed_value": 0.5833333333333334, + "baseline_value": null, + "threshold": 0.2, + "delta": null, + "delta_pct": null, + "increase_factor": null, + "severity": "high", + "status": "failed", + "explanation": "Drop rate crossed the configured review threshold under synthetic 3-agent load.", + "why_it_matters": "High drop rate can make camera or command workloads stale even if selected high-priority tasks are protected.", + "suspected_causes": [ + "queue_backlog", + "overload_load_shedding", + "producer_rate_exceeds_runtime_capacity" + ], + "recommendation": "Tune target FPS, queue size, drop policy, or fallback policy for affected agents.", + "raw_context": { + "executed_count": 10, + "dropped_count": 14 + } + } + ], + "suspected_causes": [ + "queue_backlog", + "overload_load_shedding", + "producer_rate_exceeds_runtime_capacity" + ], + "recommendations": [ + "Tune target FPS, queue size, drop policy, or fallback policy for affected agents." + ], + "thresholds": { + "drop_rate_review": 0.2, + "drop_rate_blocked": 0.5 + }, + "baseline_summary": {}, + "candidate_summary": { + "runtime_reliability": { + "drop_rate": 0.5833333333333334, + "fallback_rate": 0.5833333333333334, + "deadline_miss_rate": 0.1 + } + }, + "created_at": "2026-05-17T00:00:00Z" +} diff --git a/inferedgelab/cli.py b/inferedgelab/cli.py index 3ae1f06..4065487 100644 --- a/inferedgelab/cli.py +++ b/inferedgelab/cli.py @@ -16,6 +16,7 @@ from inferedgelab.commands.demo_evidence import export_demo_evidence_cmd from inferedgelab.commands.demo_evidence import portfolio_demo_check_cmd from inferedgelab.commands.core4_conformance import core4_conformance_check_cmd +from inferedgelab.commands.agent_runtime_report import agent_runtime_report_cmd from inferedgelab.commands.list_results import list_results_cmd from inferedgelab.commands.history_report import history_report_cmd from inferedgelab.commands.serve import serve_cmd @@ -52,6 +53,9 @@ def version_cmd() -> None: app.command("core4-conformance-check", help="Validate Forge/Runtime/Lab/AIGuard contract conformance")( core4_conformance_check_cmd ) +app.command("agent-runtime-report", help="Generate Agent Runtime Reliability report from Orchestrator/AIGuard evidence")( + agent_runtime_report_cmd +) app.command("list-results", help="List recent structured benchmark results")(list_results_cmd) app.command("history-report", help="Generate HTML history report from structured benchmark results")(history_report_cmd) app.command("serve", help="Run InferEdgeLab FastAPI server")(serve_cmd) diff --git a/inferedgelab/commands/agent_runtime_report.py b/inferedgelab/commands/agent_runtime_report.py new file mode 100644 index 0000000..844c7d3 --- /dev/null +++ b/inferedgelab/commands/agent_runtime_report.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +from pathlib import Path + +import typer +from rich import print as rprint + +from inferedgelab.services.agent_runtime_report import ( + agent_runtime_reliability_json, + build_agent_runtime_reliability_markdown, + load_agent_runtime_reliability_bundle, +) + + +def agent_runtime_report_cmd( + orchestration_summary: str = typer.Option( + ..., + "--orchestration-summary", + help="Path to InferEdgeOrchestrator orchestration_summary JSON", + ), + guard_analysis: str = typer.Option( + "", + "--guard-analysis", + help="Optional AIGuard runtime reliability guard_analysis JSON", + ), + format: str = typer.Option("text", "--format", "-f", help="text/json/markdown"), + output: str = typer.Option("", "--output", "-o", help="Optional output path"), +) -> None: + report = load_agent_runtime_reliability_bundle( + orchestration_summary_path=orchestration_summary, + guard_analysis_path=guard_analysis or None, + ) + normalized_format = format.strip().lower() + if normalized_format == "json": + text = agent_runtime_reliability_json(report) + elif normalized_format in {"markdown", "md"}: + text = build_agent_runtime_reliability_markdown(report) + elif normalized_format == "text": + text = _text_summary(report) + else: + raise typer.BadParameter("--format must be one of: text, json, markdown") + + if output: + path = Path(output) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + rprint(f"[green]Saved[/green]: {path}") + else: + print(text, end="") + + +def _text_summary(report: dict) -> str: + metrics = report["agent_runtime_summary"]["metrics"] + decision = report["agent_deployment_decision"] + guard = report["guard_summary"] + lines = [ + "InferEdge Agent Runtime Reliability Report", + f"schema_version: {report['schema_version']}", + f"decision: {decision['decision']}", + f"policy_version: {decision['policy_version']}", + f"reason: {decision['reason']}", + f"guard_verdict: {guard.get('guard_verdict')}", + f"drop_rate: {metrics['drop_rate']:.6g}", + f"fallback_rate: {metrics['fallback_rate']:.6g}", + f"deadline_miss_rate: {metrics['deadline_miss_rate']:.6g}", + "triggered_rules:", + ] + lines.extend(f"- {rule}" for rule in decision["triggered_rules"]) + lines.append("") + return "\n".join(lines) diff --git a/inferedgelab/services/agent_runtime_report.py b/inferedgelab/services/agent_runtime_report.py new file mode 100644 index 0000000..233cdcf --- /dev/null +++ b/inferedgelab/services/agent_runtime_report.py @@ -0,0 +1,459 @@ +from __future__ import annotations + +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from inferedgelab.services.guard_analysis import ( + guard_evidence_items, + guard_primary_reason, + guard_status, + guard_verdict, +) + + +AGENT_RUNTIME_REPORT_SCHEMA_VERSION = "inferedgelab-agent-runtime-reliability-report-v1" +AGENT_RUNTIME_POLICY_VERSION = "inferedge-lab-agent-runtime-policy-v1" +ORCHESTRATION_SCHEMA_VERSION = "inferedge-orchestration-summary-v1" +AIGUARD_DIAGNOSIS_SCHEMA_VERSION = "inferedge-aiguard-diagnosis-v1" + +DEFAULT_AGENT_RUNTIME_THRESHOLDS = { + "deadline_miss_rate_review": 0.05, + "deadline_miss_rate_blocked": 0.20, + "drop_rate_review": 0.20, + "drop_rate_blocked": 0.50, + "fallback_rate_review": 0.20, + "fallback_rate_blocked": 0.50, + "queue_backlog_policy_decision_count_review": 1, +} + +AGENT_RUNTIME_POLICY_RULES: dict[str, dict[str, str]] = { + "guard_blocked_runtime_block": { + "effect": "blocked", + "description": "AIGuard runtime reliability evidence reported blocked/error status.", + }, + "guard_warning_runtime_review": { + "effect": "review_required", + "description": "AIGuard runtime reliability evidence requires deployment review.", + }, + "guard_missing_unknown": { + "effect": "unknown", + "description": "AIGuard runtime reliability evidence is missing.", + }, + "deadline_miss_block": { + "effect": "blocked", + "description": "Deadline miss rate crossed the blocking threshold.", + }, + "deadline_miss_review": { + "effect": "review_required", + "description": "Deadline miss rate crossed the review threshold.", + }, + "drop_rate_block": { + "effect": "blocked", + "description": "Drop rate crossed the blocking threshold.", + }, + "drop_rate_review": { + "effect": "review_required", + "description": "Drop rate crossed the review threshold.", + }, + "fallback_rate_block": { + "effect": "blocked", + "description": "Fallback usage crossed the blocking threshold.", + }, + "fallback_rate_review": { + "effect": "review_required", + "description": "Fallback usage crossed the review threshold.", + }, + "queue_backlog_review": { + "effect": "review_required", + "description": "Queue backlog policy intervention was observed.", + }, + "runtime_reliability_pass_note": { + "effect": "deployable_with_note", + "description": "Runtime reliability evidence stayed within configured thresholds.", + }, +} + + +def build_agent_runtime_reliability_report( + *, + orchestration_summary: dict[str, Any], + guard_analysis: dict[str, Any] | None = None, + source: dict[str, Any] | None = None, + thresholds: dict[str, float] | None = None, +) -> dict[str, Any]: + """Build a Lab-owned report for an agent runtime reliability bundle.""" + + policy = {**DEFAULT_AGENT_RUNTIME_THRESHOLDS, **(thresholds or {})} + metrics = compute_agent_runtime_metrics(orchestration_summary) + runtime_summary = _agent_runtime_summary(orchestration_summary) + decision = build_agent_runtime_deployment_decision( + metrics=metrics, + guard_analysis=guard_analysis, + thresholds=policy, + ) + + return { + "schema_version": AGENT_RUNTIME_REPORT_SCHEMA_VERSION, + "generated_at": _utc_now_iso(), + "scope": "local-first agent runtime reliability report", + "source": dict(source or {}), + "contracts": { + "orchestration_summary": ( + orchestration_summary.get("schema_version") + or runtime_summary.get("schema_version") + ), + "aiguard_guard_analysis": ( + guard_analysis.get("schema_version") + if isinstance(guard_analysis, dict) + else None + ), + "source_contracts": runtime_summary.get("source_contracts", {}), + }, + "agent_runtime_summary": { + "agents": _agent_summaries(runtime_summary), + "totals": _totals(runtime_summary), + "metrics": metrics, + "policy_decision_log_count": len(_policy_log(orchestration_summary)), + }, + "guard_summary": _guard_summary(guard_analysis), + "agent_deployment_decision": decision, + "notes": [ + "This report is local-first runtime reliability evidence, not a production cloud orchestration dashboard.", + "InferEdgeLab remains the final deployment decision owner.", + "AIGuard and Orchestrator provide optional evidence; they do not overwrite Lab policy.", + ], + } + + +def build_agent_runtime_deployment_decision( + *, + metrics: dict[str, Any], + guard_analysis: dict[str, Any] | None, + thresholds: dict[str, float] | None = None, +) -> dict[str, Any]: + policy = {**DEFAULT_AGENT_RUNTIME_THRESHOLDS, **(thresholds or {})} + triggered_rules: list[str] = [] + + normalized_guard_status = guard_status(guard_analysis) + normalized_guard_verdict = guard_verdict(guard_analysis) + + if normalized_guard_status == "error" or normalized_guard_verdict == "blocked": + triggered_rules.append("guard_blocked_runtime_block") + elif normalized_guard_status == "warning" or normalized_guard_verdict in { + "suspicious", + "review_required", + }: + triggered_rules.append("guard_warning_runtime_review") + elif normalized_guard_status is None and normalized_guard_verdict is None: + triggered_rules.append("guard_missing_unknown") + + _append_metric_rules( + triggered_rules, + metric_value=metrics["deadline_miss_rate"], + review=policy["deadline_miss_rate_review"], + blocked=policy["deadline_miss_rate_blocked"], + review_rule="deadline_miss_review", + blocked_rule="deadline_miss_block", + ) + _append_metric_rules( + triggered_rules, + metric_value=metrics["drop_rate"], + review=policy["drop_rate_review"], + blocked=policy["drop_rate_blocked"], + review_rule="drop_rate_review", + blocked_rule="drop_rate_block", + ) + _append_metric_rules( + triggered_rules, + metric_value=metrics["fallback_rate"], + review=policy["fallback_rate_review"], + blocked=policy["fallback_rate_blocked"], + review_rule="fallback_rate_review", + blocked_rule="fallback_rate_block", + ) + if ( + metrics["queue_backlog_policy_decision_count"] + >= policy["queue_backlog_policy_decision_count_review"] + ): + triggered_rules.append("queue_backlog_review") + + if not triggered_rules: + triggered_rules.append("runtime_reliability_pass_note") + + if any(_rule_effect(rule) == "blocked" for rule in triggered_rules): + decision = "blocked" + reason = "Agent runtime reliability evidence indicates blocked deployment risk." + recommended_action = ( + "Do not deploy until deadline, drop, fallback, and guard evidence are reviewed." + ) + elif any(_rule_effect(rule) == "review_required" for rule in triggered_rules): + decision = "review_required" + reason = "Agent runtime reliability evidence requires deployment review." + recommended_action = ( + "Review Orchestrator policy decisions, AIGuard evidence, and agent priority budgets." + ) + elif "guard_missing_unknown" in triggered_rules: + decision = "unknown" + reason = "AIGuard runtime reliability evidence is unavailable." + recommended_action = ( + "Run AIGuard runtime reliability analysis before using this report for deployment." + ) + else: + decision = "deployable_with_note" + reason = "Agent runtime reliability evidence stayed within configured thresholds." + recommended_action = ( + "Deployment can proceed with runtime monitoring and the local evidence note retained." + ) + + return { + "policy_version": AGENT_RUNTIME_POLICY_VERSION, + "decision": decision, + "reason": reason, + "guard_status": normalized_guard_status, + "guard_verdict": normalized_guard_verdict, + "recommended_action": recommended_action, + "triggered_rules": triggered_rules, + "policy_summary": [ + { + "rule": rule, + "effect": _rule_effect(rule), + "description": AGENT_RUNTIME_POLICY_RULES[rule]["description"], + } + for rule in triggered_rules + ], + } + + +def compute_agent_runtime_metrics(orchestration_summary: dict[str, Any]) -> dict[str, Any]: + runtime_summary = _agent_runtime_summary(orchestration_summary) + totals = _totals(runtime_summary) + executed_count = _non_negative_number(totals.get("executed_count")) + dropped_count = _non_negative_number(totals.get("dropped_count")) + deadline_missed_count = _non_negative_number(totals.get("deadline_missed_count")) + fallback_count = _non_negative_number(totals.get("fallback_count")) + total_task_events = executed_count + dropped_count + policy_log = _policy_log(orchestration_summary) + queue_backlog_count = sum( + 1 + for item in policy_log + if "backlog" in str(item.get("reason", "")).lower() + or "backlog" in str(item.get("decision", "")).lower() + ) + return { + "executed_count": executed_count, + "dropped_count": dropped_count, + "deadline_missed_count": deadline_missed_count, + "fallback_count": fallback_count, + "policy_decision_count": _non_negative_number( + totals.get("policy_decision_count") + ), + "overload_event_count": _non_negative_number(totals.get("overload_event_count")), + "total_task_events": total_task_events, + "deadline_miss_rate": _ratio(deadline_missed_count, executed_count), + "drop_rate": _ratio(dropped_count, total_task_events), + "fallback_rate": _ratio(fallback_count, total_task_events), + "queue_backlog_policy_decision_count": queue_backlog_count, + } + + +def load_agent_runtime_reliability_bundle( + *, + orchestration_summary_path: str | Path, + guard_analysis_path: str | Path | None = None, +) -> dict[str, Any]: + orchestration_summary = _load_json_dict(orchestration_summary_path) + guard_analysis = _load_json_dict(guard_analysis_path) if guard_analysis_path else None + return build_agent_runtime_reliability_report( + orchestration_summary=orchestration_summary, + guard_analysis=guard_analysis, + source={ + "orchestration_summary_path": str(orchestration_summary_path), + "guard_analysis_path": str(guard_analysis_path) + if guard_analysis_path + else None, + }, + ) + + +def build_agent_runtime_reliability_markdown(report: dict[str, Any]) -> str: + runtime = report["agent_runtime_summary"] + metrics = runtime["metrics"] + decision = report["agent_deployment_decision"] + guard = report["guard_summary"] + + lines = [ + "# InferEdge Agent Runtime Reliability Report", + "", + "## Scope", + "", + f"- schema_version: `{report['schema_version']}`", + f"- generated_at: `{report['generated_at']}`", + f"- scope: {report['scope']}", + "- This is local-first report evidence, not a production cloud orchestration dashboard.", + "", + "## Agent Runtime Summary", + "", + "| Agent | Type | Priority | Latency Budget ms | Fallback Policy |", + "|---|---|---:|---:|---|", + ] + for agent in runtime["agents"]: + lines.append( + "| " + f"{agent.get('agent_id', '')} | " + f"{agent.get('agent_type', '')} | " + f"{_fmt_number(agent.get('priority'))} | " + f"{_fmt_number(agent.get('latency_budget_ms'))} | " + f"{agent.get('fallback_policy', '')} |" + ) + + lines.extend( + [ + "", + "## Runtime Reliability Metrics", + "", + "| Metric | Value |", + "|---|---:|", + f"| executed_count | {_fmt_number(metrics['executed_count'])} |", + f"| dropped_count | {_fmt_number(metrics['dropped_count'])} |", + f"| deadline_missed_count | {_fmt_number(metrics['deadline_missed_count'])} |", + f"| fallback_count | {_fmt_number(metrics['fallback_count'])} |", + f"| deadline_miss_rate | {_fmt_number(metrics['deadline_miss_rate'])} |", + f"| drop_rate | {_fmt_number(metrics['drop_rate'])} |", + f"| fallback_rate | {_fmt_number(metrics['fallback_rate'])} |", + f"| queue_backlog_policy_decision_count | {_fmt_number(metrics['queue_backlog_policy_decision_count'])} |", + "", + "## AIGuard Runtime Reliability Evidence", + "", + f"- guard_status: `{guard.get('status')}`", + f"- guard_verdict: `{guard.get('guard_verdict')}`", + f"- severity: `{guard.get('severity')}`", + f"- primary_reason: {guard.get('primary_reason')}", + f"- evidence_count: `{guard.get('evidence_count')}`", + "", + "## Lab Agent Deployment Decision", + "", + f"- policy_version: `{decision['policy_version']}`", + f"- decision: `{decision['decision']}`", + f"- reason: {decision['reason']}", + f"- recommended_action: {decision['recommended_action']}", + "- triggered_rules:", + *[f" - `{rule}`" for rule in decision["triggered_rules"]], + "", + "## Notes", + "", + ] + ) + lines.extend(f"- {note}" for note in report["notes"]) + return "\n".join(lines) + "\n" + + +def agent_runtime_reliability_json(report: dict[str, Any]) -> str: + return json.dumps(report, ensure_ascii=False, indent=2) + "\n" + + +def write_agent_runtime_reliability_markdown( + report: dict[str, Any], + output: str | Path, +) -> Path: + path = Path(output) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(build_agent_runtime_reliability_markdown(report), encoding="utf-8") + return path + + +def _agent_runtime_summary(orchestration_summary: dict[str, Any]) -> dict[str, Any]: + value = orchestration_summary.get("agent_runtime_summary") + return value if isinstance(value, dict) else {} + + +def _totals(runtime_summary: dict[str, Any]) -> dict[str, Any]: + value = runtime_summary.get("totals") + return value if isinstance(value, dict) else {} + + +def _agent_summaries(runtime_summary: dict[str, Any]) -> list[dict[str, Any]]: + agents = runtime_summary.get("agents") + if isinstance(agents, dict): + values = [value for value in agents.values() if isinstance(value, dict)] + elif isinstance(agents, list): + values = [value for value in agents if isinstance(value, dict)] + else: + values = [] + return sorted(values, key=lambda item: str(item.get("agent_id", ""))) + + +def _guard_summary(guard_analysis: dict[str, Any] | None) -> dict[str, Any]: + return { + "schema_version": guard_analysis.get("schema_version") + if isinstance(guard_analysis, dict) + else None, + "status": guard_status(guard_analysis), + "guard_verdict": guard_verdict(guard_analysis), + "severity": guard_analysis.get("severity") if isinstance(guard_analysis, dict) else None, + "primary_reason": guard_primary_reason(guard_analysis), + "evidence_count": len(guard_evidence_items(guard_analysis)), + } + + +def _append_metric_rules( + rules: list[str], + *, + metric_value: float, + review: float, + blocked: float, + review_rule: str, + blocked_rule: str, +) -> None: + if metric_value >= blocked: + rules.append(blocked_rule) + elif metric_value >= review: + rules.append(review_rule) + + +def _rule_effect(rule: str) -> str: + return AGENT_RUNTIME_POLICY_RULES.get(rule, {}).get("effect", "unknown") + + +def _policy_log(orchestration_summary: dict[str, Any]) -> list[dict[str, Any]]: + value = orchestration_summary.get("policy_decision_log") + if not isinstance(value, list): + value = orchestration_summary.get("policy_decisions") + if not isinstance(value, list): + return [] + return [item for item in value if isinstance(item, dict)] + + +def _load_json_dict(path: str | Path | None) -> dict[str, Any] | None: + if path is None: + return None + with Path(path).open("r", encoding="utf-8") as file: + data = json.load(file) + if not isinstance(data, dict): + raise ValueError(f"Expected JSON object: {path}") + return data + + +def _non_negative_number(value: Any) -> float: + if isinstance(value, (int, float)) and not isinstance(value, bool): + return max(float(value), 0.0) + return 0.0 + + +def _ratio(numerator: float, denominator: float) -> float: + if denominator <= 0: + return 0.0 + return numerator / denominator + + +def _utc_now_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def _fmt_number(value: Any) -> str: + if value is None: + return "-" + if isinstance(value, float): + return f"{value:.6g}" + return str(value) diff --git a/tests/test_agent_runtime_report.py b/tests/test_agent_runtime_report.py new file mode 100644 index 0000000..b4c2903 --- /dev/null +++ b/tests/test_agent_runtime_report.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +import json + +import pytest + +from inferedgelab.commands.agent_runtime_report import agent_runtime_report_cmd +from inferedgelab.services.agent_runtime_report import ( + AGENT_RUNTIME_POLICY_VERSION, + AGENT_RUNTIME_REPORT_SCHEMA_VERSION, + build_agent_runtime_reliability_markdown, + build_agent_runtime_reliability_report, + compute_agent_runtime_metrics, + load_agent_runtime_reliability_bundle, +) + + +def orchestration_summary() -> dict: + return { + "schema_version": "inferedge-orchestration-summary-v1", + "agent_runtime_summary": { + "schema_version": "inferedge-orchestration-summary-v1", + "source_contracts": { + "forge_agent_manifest": "inferedge-agent-manifest-v1", + "runtime_agent_result": "inferedge-runtime-agent-task-v1", + }, + "agents": { + "safety_monitor_agent": { + "agent_id": "safety_monitor_agent", + "agent_type": "safety", + "priority": 100, + "latency_budget_ms": 20.0, + "fallback_policy": "protect", + }, + "vision_agent": { + "agent_id": "vision_agent", + "agent_type": "vision", + "priority": 90, + "latency_budget_ms": 33.0, + "fallback_policy": "drop_stale", + }, + }, + "totals": { + "executed_count": 10, + "dropped_count": 14, + "deadline_missed_count": 1, + "fallback_count": 14, + "policy_decision_count": 14, + "overload_event_count": 14, + }, + }, + "policy_decision_log": [ + { + "agent_id": "vision_agent", + "decision": "load_shedding", + "reason": "queue_backlog_threshold_exceeded", + "protected_agent_id": "safety_monitor_agent", + } + ], + } + + +def guard_analysis() -> dict: + return { + "schema_version": "inferedge-aiguard-diagnosis-v1", + "guard_verdict": "blocked", + "severity": "high", + "confidence": 0.88, + "primary_reason": "drop_rate indicates runtime reliability risk.", + "evidence": [ + { + "type": "excessive_drop_rate", + "metric_name": "drop_rate", + "observed_value": 14 / 24, + "baseline_value": None, + "threshold": 0.2, + "delta": None, + "delta_pct": None, + "increase_factor": None, + "severity": "high", + "status": "failed", + "explanation": "Drop rate crossed threshold.", + "why_it_matters": "Dropped work may become stale.", + "suspected_causes": ["queue_backlog"], + "recommendation": "Tune scheduling policy.", + "raw_context": {}, + } + ], + "created_at": "2026-05-17T00:00:00Z", + } + + +def test_compute_agent_runtime_metrics_from_orchestrator_summary(): + metrics = compute_agent_runtime_metrics(orchestration_summary()) + + assert metrics["deadline_miss_rate"] == pytest.approx(0.1) + assert metrics["drop_rate"] == pytest.approx(14 / 24) + assert metrics["fallback_rate"] == pytest.approx(14 / 24) + assert metrics["queue_backlog_policy_decision_count"] == 1 + + +def test_agent_runtime_report_blocks_when_guard_blocks(): + report = build_agent_runtime_reliability_report( + orchestration_summary=orchestration_summary(), + guard_analysis=guard_analysis(), + ) + + decision = report["agent_deployment_decision"] + assert report["schema_version"] == AGENT_RUNTIME_REPORT_SCHEMA_VERSION + assert report["contracts"]["orchestration_summary"] == ( + "inferedge-orchestration-summary-v1" + ) + assert report["contracts"]["aiguard_guard_analysis"] == ( + "inferedge-aiguard-diagnosis-v1" + ) + assert decision["policy_version"] == AGENT_RUNTIME_POLICY_VERSION + assert decision["decision"] == "blocked" + assert "guard_blocked_runtime_block" in decision["triggered_rules"] + assert "drop_rate_block" in decision["triggered_rules"] + assert report["guard_summary"]["guard_verdict"] == "blocked" + + +def test_agent_runtime_report_markdown_contains_sections(): + report = build_agent_runtime_reliability_report( + orchestration_summary=orchestration_summary(), + guard_analysis=guard_analysis(), + ) + markdown = build_agent_runtime_reliability_markdown(report) + + assert "# InferEdge Agent Runtime Reliability Report" in markdown + assert "Agent Runtime Summary" in markdown + assert "Runtime Reliability Metrics" in markdown + assert "AIGuard Runtime Reliability Evidence" in markdown + assert "Lab Agent Deployment Decision" in markdown + assert "guard_blocked_runtime_block" in markdown + assert "not a production cloud orchestration dashboard" in markdown + + +def test_agent_runtime_report_loads_committed_fixtures(): + report = load_agent_runtime_reliability_bundle( + orchestration_summary_path="examples/agent_runtime/agent_3_orchestration_summary.json", + guard_analysis_path="examples/agent_runtime/aiguard_runtime_guard_analysis.json", + ) + + assert report["agent_deployment_decision"]["decision"] == "blocked" + assert report["agent_runtime_summary"]["metrics"]["drop_rate"] == pytest.approx(14 / 24) + assert len(report["agent_runtime_summary"]["agents"]) == 3 + + +def test_agent_runtime_report_command_outputs_json(capsys): + agent_runtime_report_cmd( + orchestration_summary="examples/agent_runtime/agent_3_orchestration_summary.json", + guard_analysis="examples/agent_runtime/aiguard_runtime_guard_analysis.json", + format="json", + output="", + ) + out = capsys.readouterr().out + report = json.loads(out) + + assert report["schema_version"] == AGENT_RUNTIME_REPORT_SCHEMA_VERSION + assert report["agent_deployment_decision"]["decision"] == "blocked"