From d6784395ca7ab13e4dc773b3a32011635322baa5 Mon Sep 17 00:00:00 2001 From: hyeokjun32 Date: Wed, 20 May 2026 12:21:20 +0900 Subject: [PATCH] feat: include remote dispatch context in agent runtime report --- README.md | 4 +- .../agent_runtime_reliability_report.md | 11 ++ inferedgelab/commands/agent_runtime_report.py | 9 ++ inferedgelab/services/agent_runtime_report.py | 110 ++++++++++++++++++ tests/test_agent_runtime_report.py | 91 +++++++++++++++ 5 files changed, 224 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c497b57..8572139 100644 --- a/README.md +++ b/README.md @@ -128,8 +128,10 @@ The Lab decision surface now also exposes `policy_version`, `triggered_rules`, a `agent-runtime-report` is an additive reliable edge agent runtime report path. It bundles Orchestrator scheduling evidence and AIGuard runtime reliability `guard_analysis` into a Lab-owned agent deployment decision context without changing existing Runtime result or compare contracts. The current bundled evidence is a synthetic/dummy sustained high-load 3-agent scenario. -The report preserves sustained queue-depth, worker health, Runtime result health/error/event evidence, runtime event summary/timeline, policy decision reason, and `sustained_overload_risk` evidence as local-first deployment review context. +The report preserves sustained queue-depth, worker health, Runtime result health/error/event evidence, optional remote dispatch worker-selection context, runtime event summary/timeline, policy decision reason, and `sustained_overload_risk` evidence as local-first deployment review context. When a Runtime result JSON with `runtime_health_snapshot` / `runtime_events` is available, add `--runtime-result <path>` to include Runtime-side operation context in the same Lab report.
+When an InferEdgeOrchestrator `inferedge-remote-dispatch-result-v1` JSON is available, add `--remote-dispatch <path>` to include file-based worker selection, retry/fallback plan, and plan-only remote execution context. +This is remote dispatch evidence for local-first review; it does not claim production remote execution. ![InferEdge Local Studio demo evidence](assets/images/local-studio-demo-evidence.png) diff --git a/docs/portfolio/agent_runtime_reliability_report.md b/docs/portfolio/agent_runtime_reliability_report.md index 7c7b189..9a9797c 100644 --- a/docs/portfolio/agent_runtime_reliability_report.md +++ b/docs/portfolio/agent_runtime_reliability_report.md @@ -34,6 +34,7 @@ Generate a Markdown report: poetry run inferedgelab agent-runtime-report \ --orchestration-summary examples/agent_runtime/agent_3_orchestration_summary.json \ --guard-analysis examples/agent_runtime/aiguard_runtime_guard_analysis.json \ + --remote-dispatch /tmp/inferedge_agent_runtime_e2e/06_remote_dispatch_result.json \ --format markdown \ --output reports/agent_runtime_reliability_report.md ``` @@ -70,6 +71,9 @@ runtime operation review: - Optional Runtime result operation evidence through `--runtime-result`, including `runtime_health_snapshot`, `runtime_error_classification`, and `runtime_events`. +- Optional Orchestrator remote dispatch evidence through `--remote-dispatch`, + including file-based worker selection, selected worker id, plan-only remote + execution context, and retry/fallback plan fields. - Runtime timeout observation context, including `timeout_policy`, `timeout_budget_ms`, and `runtime_timeout_observed`.
A timeout observation is treated as Lab `review_required` evidence because it means the configured @@ -80,10 +84,15 @@ These fields make the report path explicit: ```text Runtime result operation evidence + Orchestrator operation evidence +-> optional remote worker-selection context -> AIGuard reliability explanation -> Lab-owned deployment risk context ``` +Remote dispatch remains a starter contract. It records worker-selection and +fallback-plan evidence for review, but it does not claim production SSH/HTTP +execution, secure tunnel operation, or long-lived remote worker readiness. + ## Lab Decision Context Expected decision: @@ -113,6 +122,8 @@ Triggered rules: - Orchestrator records scheduling and policy evidence. - Orchestrator operation-health fields are displayed as local runtime evidence. +- Orchestrator remote dispatch result fields are displayed as plan-only worker + selection evidence when provided. - AIGuard explains runtime reliability risk. - Lab remains the final deployment decision owner. 
- This report is an additive agent-runtime path and does not change existing diff --git a/inferedgelab/commands/agent_runtime_report.py b/inferedgelab/commands/agent_runtime_report.py index d146b7a..de09067 100644 --- a/inferedgelab/commands/agent_runtime_report.py +++ b/inferedgelab/commands/agent_runtime_report.py @@ -28,6 +28,11 @@ def agent_runtime_report_cmd( "--runtime-result", help="Optional InferEdge-Runtime result JSON with runtime_health_snapshot/runtime_events", ), + remote_dispatch: str = typer.Option( + "", + "--remote-dispatch", + help="Optional InferEdgeOrchestrator remote dispatch result JSON", + ), format: str = typer.Option("text", "--format", "-f", help="text/json/markdown"), output: str = typer.Option("", "--output", "-o", help="Optional output path"), ) -> None: @@ -35,6 +40,7 @@ def agent_runtime_report_cmd( orchestration_summary_path=orchestration_summary, guard_analysis_path=guard_analysis or None, runtime_result_path=runtime_result or None, + remote_dispatch_path=remote_dispatch or None, ) normalized_format = format.strip().lower() if normalized_format == "json": @@ -60,6 +66,7 @@ def _text_summary(report: dict) -> str: decision = report["agent_deployment_decision"] guard = report["guard_summary"] runtime_context = report["agent_runtime_summary"].get("runtime_result_context") or {} + remote_context = report["agent_runtime_summary"].get("remote_dispatch_context") or {} health = runtime_context.get("runtime_health_snapshot") or {} error = runtime_context.get("runtime_error_classification") or {} lines = [ @@ -74,6 +81,8 @@ def _text_summary(report: dict) -> str: f"deadline_miss_rate: {metrics['deadline_miss_rate']:.6g}", f"runtime_health_status: {health.get('status')}", f"runtime_error_category: {error.get('category')}", + f"remote_dispatch_status: {remote_context.get('dispatch_status')}", + f"remote_selected_worker_id: {remote_context.get('selected_worker_id')}", "triggered_rules:", ] lines.extend(f"- {rule}" for rule in 
decision["triggered_rules"]) diff --git a/inferedgelab/services/agent_runtime_report.py b/inferedgelab/services/agent_runtime_report.py index 78350a7..c7583b6 100644 --- a/inferedgelab/services/agent_runtime_report.py +++ b/inferedgelab/services/agent_runtime_report.py @@ -17,6 +17,7 @@ AGENT_RUNTIME_POLICY_VERSION = "inferedge-lab-agent-runtime-policy-v1" ORCHESTRATION_SCHEMA_VERSION = "inferedge-orchestration-summary-v1" AIGUARD_DIAGNOSIS_SCHEMA_VERSION = "inferedge-aiguard-diagnosis-v1" +REMOTE_DISPATCH_SCHEMA_VERSION = "inferedge-remote-dispatch-result-v1" DEFAULT_AGENT_RUNTIME_THRESHOLDS = { "deadline_miss_rate_review": 0.05, @@ -95,6 +96,7 @@ def build_agent_runtime_reliability_report( orchestration_summary: dict[str, Any], guard_analysis: dict[str, Any] | None = None, runtime_result: dict[str, Any] | None = None, + remote_dispatch: dict[str, Any] | None = None, source: dict[str, Any] | None = None, thresholds: dict[str, float] | None = None, ) -> dict[str, Any]: @@ -104,6 +106,7 @@ def build_agent_runtime_reliability_report( metrics = compute_agent_runtime_metrics(orchestration_summary) runtime_summary = _agent_runtime_summary(orchestration_summary) runtime_result_context = _runtime_result_operation_context(runtime_result) + remote_dispatch_context = _remote_dispatch_context(remote_dispatch) decision = build_agent_runtime_deployment_decision( metrics=metrics, guard_analysis=guard_analysis, @@ -131,6 +134,11 @@ def build_agent_runtime_reliability_report( if isinstance(runtime_result, dict) else None ), + "remote_dispatch": ( + remote_dispatch.get("schema_version") + if isinstance(remote_dispatch, dict) + else None + ), "source_contracts": runtime_summary.get("source_contracts", {}), }, "agent_runtime_summary": { @@ -140,6 +148,7 @@ def build_agent_runtime_reliability_report( "timeline_summary": _timeline_summary(orchestration_summary, metrics), "operation_context": _operation_context(orchestration_summary, metrics), "runtime_result_context": 
runtime_result_context, + "remote_dispatch_context": remote_dispatch_context, "policy_decision_reasons": metrics["policy_decision_reasons"], "policy_decision_log_count": len(_policy_log(orchestration_summary)), }, @@ -349,14 +358,19 @@ def load_agent_runtime_reliability_bundle( orchestration_summary_path: str | Path, guard_analysis_path: str | Path | None = None, runtime_result_path: str | Path | None = None, + remote_dispatch_path: str | Path | None = None, ) -> dict[str, Any]: orchestration_summary = _load_json_dict(orchestration_summary_path) guard_analysis = _load_json_dict(guard_analysis_path) if guard_analysis_path else None runtime_result = _load_json_dict(runtime_result_path) if runtime_result_path else None + remote_dispatch = ( + _load_json_dict(remote_dispatch_path) if remote_dispatch_path else None + ) return build_agent_runtime_reliability_report( orchestration_summary=orchestration_summary, guard_analysis=guard_analysis, runtime_result=runtime_result, + remote_dispatch=remote_dispatch, source={ "orchestration_summary_path": str(orchestration_summary_path), "guard_analysis_path": str(guard_analysis_path) @@ -365,6 +379,9 @@ def load_agent_runtime_reliability_bundle( "runtime_result_path": str(runtime_result_path) if runtime_result_path else None, + "remote_dispatch_path": str(remote_dispatch_path) + if remote_dispatch_path + else None, }, ) @@ -375,9 +392,14 @@ def build_agent_runtime_reliability_markdown(report: dict[str, Any]) -> str: decision = report["agent_deployment_decision"] guard = report["guard_summary"] runtime_result_context = runtime.get("runtime_result_context") or {} + remote_dispatch_context = runtime.get("remote_dispatch_context") or {} runtime_health = runtime_result_context.get("runtime_health_snapshot") or {} runtime_error = runtime_result_context.get("runtime_error_classification") or {} runtime_event_summary = runtime_result_context.get("runtime_event_summary") or {} + remote_execution = 
remote_dispatch_context.get("remote_execution") or {} + remote_execution_plan = remote_dispatch_context.get("remote_execution_plan") or {} + retry_fallback_plan = remote_dispatch_context.get("retry_fallback_plan") or {} + worker_selection = remote_dispatch_context.get("worker_selection") or {} lines = [ "# InferEdge Agent Runtime Reliability Report", @@ -521,6 +543,37 @@ def build_agent_runtime_reliability_markdown(report: dict[str, Any]) -> str: ) ], "", + "## Remote Dispatch Context", + "", + "| Field | Value |", + "|---|---|", + f"| remote_dispatch_schema | {remote_dispatch_context.get('source_schema_version') or '-'} |", + f"| dispatch_status | {remote_dispatch_context.get('dispatch_status') or '-'} |", + f"| selected_worker_id | {remote_dispatch_context.get('selected_worker_id') or '-'} |", + f"| decision_reason | {remote_dispatch_context.get('decision_reason') or '-'} |", + f"| production_remote_execution | {remote_execution.get('production_remote_execution', '-')} |", + f"| execution_plan_mode | {remote_execution_plan.get('mode') or '-'} |", + f"| network_execution_performed | {remote_execution_plan.get('network_execution_performed', '-')} |", + f"| planned_transport | {remote_execution_plan.get('transport') or '-'} |", + f"| fallback_worker_ids | {', '.join(worker_selection.get('fallback_worker_ids') or []) or '-'} |", + f"| retry_max_attempts | {_fmt_number(retry_fallback_plan.get('max_attempts'))} |", + f"| retry_execution_performed | {retry_fallback_plan.get('execution_performed', '-')} |", + "", + "Remote worker selection sample:", + "", + "| Worker | Eligible | Status | Health | Endpoint | Reason |", + "|---|---|---|---|---|---|", + *[ + "| " + f"{item.get('worker_id') or '-'} | " + f"{item.get('eligible')} | " + f"{item.get('status') or '-'} | " + f"{item.get('health_state') or '-'} | " + f"{item.get('endpoint_type') or '-'} | " + f"{item.get('decision_reason') or '-'} |" + for item in remote_dispatch_context.get("worker_evaluations", []) + ], + "", 
"## AIGuard Runtime Reliability Evidence", "", f"- guard_status: `{guard.get('status')}`", @@ -819,6 +872,63 @@ def _runtime_result_operation_context( } +def _remote_dispatch_context( + remote_dispatch: dict[str, Any] | None, +) -> dict[str, Any]: + if not isinstance(remote_dispatch, dict): + return { + "source_schema_version": None, + "dispatch_status": None, + "selected_worker_id": None, + "decision_reason": None, + "remote_execution": {}, + "remote_execution_plan": {}, + "worker_selection": { + "schema_version": None, + "selected_worker_id": None, + "candidate_worker_ids": [], + "fallback_worker_ids": [], + "evaluations": [], + }, + "retry_fallback_plan": {}, + "worker_evaluations": [], + "runtime_event_sample": [], + } + + worker_selection = remote_dispatch.get("worker_selection") + if not isinstance(worker_selection, dict): + worker_selection = { + "schema_version": None, + "selected_worker_id": remote_dispatch.get("selected_worker_id"), + "candidate_worker_ids": [], + "fallback_worker_ids": [], + "evaluations": [], + } + retry_fallback_plan = remote_dispatch.get("retry_fallback_plan") + remote_execution_plan = remote_dispatch.get("remote_execution_plan") + remote_execution = remote_dispatch.get("remote_execution") + runtime_events = _dict_list(remote_dispatch.get("runtime_events")) + evaluations = _dict_list(worker_selection.get("evaluations")) + return { + "source_schema_version": remote_dispatch.get("schema_version"), + "dispatch_status": remote_dispatch.get("dispatch_status"), + "selected_worker_id": remote_dispatch.get("selected_worker_id"), + "decision_reason": remote_dispatch.get("decision_reason"), + "remote_execution": dict(remote_execution) + if isinstance(remote_execution, dict) + else {}, + "remote_execution_plan": dict(remote_execution_plan) + if isinstance(remote_execution_plan, dict) + else {}, + "worker_selection": dict(worker_selection), + "retry_fallback_plan": dict(retry_fallback_plan) + if isinstance(retry_fallback_plan, dict) + else {}, + 
"worker_evaluations": evaluations[:8], + "runtime_event_sample": runtime_events[:8], + } + + def _queue_state_summary(orchestration_summary: dict[str, Any]) -> dict[str, Any]: value = orchestration_summary.get("queue_state_summary") if isinstance(value, dict): diff --git a/tests/test_agent_runtime_report.py b/tests/test_agent_runtime_report.py index c468c1c..f606610 100644 --- a/tests/test_agent_runtime_report.py +++ b/tests/test_agent_runtime_report.py @@ -278,6 +278,68 @@ def runtime_result_with_operation_evidence() -> dict: } +def remote_dispatch_result() -> dict: + return { + "schema_version": "inferedge-remote-dispatch-result-v1", + "dispatch_status": "accepted", + "selected_worker_id": "jetson-nano-01", + "decision_reason": ( + "selected online worker matching backend/device requirements" + ), + "remote_execution": { + "mode": "file_contract_starter", + "production_remote_execution": False, + "registry_path": "examples/remote_worker_registry.json", + "request_path": "examples/remote_task_request.json", + }, + "remote_execution_plan": { + "schema_version": "inferedge-remote-execution-plan-v1", + "mode": "plan_only", + "network_execution_performed": False, + "transport": "file_contract", + "endpoint_type": "file_contract", + "selected_worker_id": "jetson-nano-01", + "task_id": "task_vision_001", + "agent_id": "vision_agent", + }, + "worker_selection": { + "schema_version": "inferedge-remote-worker-selection-v1", + "selected_worker_id": "jetson-nano-01", + "candidate_worker_ids": ["jetson-nano-01"], + "fallback_worker_ids": [], + "evaluations": [ + { + "worker_id": "jetson-nano-01", + "eligible": True, + "status": "online", + "health_state": "healthy", + "endpoint_type": "file_contract", + "decision_reason": "eligible", + } + ], + }, + "retry_fallback_plan": { + "schema_version": "inferedge-remote-retry-fallback-plan-v1", + "max_attempts": 1, + "fallback_on": ["timeout", "worker_unhealthy", "runtime_error"], + "primary_worker_id": "jetson-nano-01", + 
"fallback_worker_ids": [], + "execution_performed": False, + }, + "runtime_events": [ + { + "event": "remote_dispatch_selected", + "task_id": "task_vision_001", + "agent_id": "vision_agent", + "selected_worker_id": "jetson-nano-01", + "reason": ( + "selected online worker matching backend/device requirements" + ), + } + ], + } + + def runtime_result_with_timeout_observed() -> dict: data = runtime_result_with_operation_evidence() data["status"] = "completed" @@ -452,6 +514,7 @@ def test_agent_runtime_report_blocks_when_guard_blocks(): orchestration_summary=orchestration_summary(), guard_analysis=sustained_guard_analysis(), runtime_result=runtime_result_with_operation_evidence(), + remote_dispatch=remote_dispatch_result(), ) decision = report["agent_deployment_decision"] @@ -463,6 +526,9 @@ def test_agent_runtime_report_blocks_when_guard_blocks(): "inferedge-aiguard-diagnosis-v1" ) assert report["contracts"]["runtime_result"] == "inferedge-runtime-result-v1" + assert report["contracts"]["remote_dispatch"] == ( + "inferedge-remote-dispatch-result-v1" + ) assert decision["policy_version"] == AGENT_RUNTIME_POLICY_VERSION assert decision["decision"] == "blocked" assert "guard_blocked_runtime_block" in decision["triggered_rules"] @@ -506,6 +572,16 @@ def test_agent_runtime_report_blocks_when_guard_blocks(): "benchmark_completed": 1, "runtime_error_classified": 1, } + remote_context = report["agent_runtime_summary"]["remote_dispatch_context"] + assert remote_context["dispatch_status"] == "accepted" + assert remote_context["selected_worker_id"] == "jetson-nano-01" + assert remote_context["remote_execution"]["production_remote_execution"] is False + assert remote_context["remote_execution_plan"]["mode"] == "plan_only" + assert remote_context["remote_execution_plan"]["network_execution_performed"] is False + assert remote_context["worker_selection"]["schema_version"] == ( + "inferedge-remote-worker-selection-v1" + ) + assert 
remote_context["worker_evaluations"][0]["worker_id"] == "jetson-nano-01" def test_agent_runtime_report_marks_runtime_timeout_as_review(): @@ -556,6 +632,7 @@ def test_agent_runtime_report_markdown_contains_sections(): orchestration_summary=orchestration_summary(), guard_analysis=sustained_guard_analysis(), runtime_result=runtime_result_with_operation_evidence(), + remote_dispatch=remote_dispatch_result(), ) markdown = build_agent_runtime_reliability_markdown(report) @@ -567,6 +644,11 @@ def test_agent_runtime_report_markdown_contains_sections(): assert "Worker Health" in markdown assert "Runtime Event Summary" in markdown assert "Runtime Result Operation Evidence" in markdown + assert "Remote Dispatch Context" in markdown + assert "jetson-nano-01" in markdown + assert "plan_only" in markdown + assert "network_execution_performed" in markdown + assert "retry_max_attempts" in markdown assert "runtime_execution_skipped" in markdown assert "queue_pressure_state" in markdown assert "policy_decision" in markdown @@ -593,11 +675,15 @@ def test_agent_runtime_report_command_outputs_json(tmp_path, capsys): runtime_result_path = tmp_path / "runtime_operation_result.json" with runtime_result_path.open("w", encoding="utf-8") as file: json.dump(runtime_result_with_operation_evidence(), file) + remote_dispatch_path = tmp_path / "remote_dispatch_result.json" + with remote_dispatch_path.open("w", encoding="utf-8") as file: + json.dump(remote_dispatch_result(), file) agent_runtime_report_cmd( orchestration_summary="examples/agent_runtime/agent_3_orchestration_summary.json", guard_analysis="examples/agent_runtime/aiguard_runtime_guard_analysis.json", runtime_result=str(runtime_result_path), + remote_dispatch=str(remote_dispatch_path), format="json", output="", ) @@ -608,3 +694,8 @@ def test_agent_runtime_report_command_outputs_json(tmp_path, capsys): assert report["agent_deployment_decision"]["decision"] == "blocked" runtime_context = 
report["agent_runtime_summary"]["runtime_result_context"] assert runtime_context["runtime_health_snapshot"]["status"] == "degraded" + remote_context = report["agent_runtime_summary"]["remote_dispatch_context"] + assert remote_context["selected_worker_id"] == "jetson-nano-01" + assert remote_context["worker_selection"]["candidate_worker_ids"] == [ + "jetson-nano-01" + ]