diff --git a/docs/portfolio/agent_runtime_reliability_report.md b/docs/portfolio/agent_runtime_reliability_report.md index 9a9797c..a0b8baf 100644 --- a/docs/portfolio/agent_runtime_reliability_report.md +++ b/docs/portfolio/agent_runtime_reliability_report.md @@ -79,6 +79,11 @@ runtime operation review: treated as Lab `review_required` evidence because it means the configured latency threshold was breached; it does not claim production request cancellation. +- AIGuard Runtime operation evidence, including + `runtime_backend_unavailable`, `runtime_latency_budget_overrun`, + `runtime_error_classification`, and + `runtime_thermal_memory_evidence_missing` when Runtime health/error/event + fields are analyzed by AIGuard. These fields make the report path explicit: @@ -117,6 +122,11 @@ Triggered rules: - `sustained_overload_review` - `runtime_timeout_observed_review` when a Runtime result reports a latency timeout observation threshold breach. +- `runtime_operation_guard_block` when AIGuard reports failed high-severity + Runtime operation evidence such as backend unavailable or latency budget + overrun. +- `runtime_operation_guard_review` when AIGuard reports warning-level Runtime + operation evidence such as missing Jetson thermal/memory context. ## Boundary @@ -124,7 +134,8 @@ Triggered rules: - Orchestrator operation-health fields are displayed as local runtime evidence. - Orchestrator remote dispatch result fields are displayed as plan-only worker selection evidence when provided. -- AIGuard explains runtime reliability risk. +- AIGuard explains runtime reliability risk, including additive Runtime + health/error/event warning evidence when provided. - Lab remains the final deployment decision owner. - This report is an additive agent-runtime path and does not change existing Runtime result, compare output, or classic deployment decision contracts. 
diff --git a/inferedgelab/services/agent_runtime_report.py b/inferedgelab/services/agent_runtime_report.py index c7583b6..b6b972e 100644 --- a/inferedgelab/services/agent_runtime_report.py +++ b/inferedgelab/services/agent_runtime_report.py @@ -19,6 +19,14 @@ AIGUARD_DIAGNOSIS_SCHEMA_VERSION = "inferedge-aiguard-diagnosis-v1" REMOTE_DISPATCH_SCHEMA_VERSION = "inferedge-remote-dispatch-result-v1" +RUNTIME_OPERATION_GUARD_EVIDENCE_TYPES = { + "runtime_backend_unavailable", + "runtime_latency_budget_overrun", + "runtime_error_classification", + "runtime_thermal_memory_evidence_missing", + "runtime_operation_health", +} + DEFAULT_AGENT_RUNTIME_THRESHOLDS = { "deadline_miss_rate_review": 0.05, "deadline_miss_rate_blocked": 0.20, @@ -84,6 +92,14 @@ "effect": "review_required", "description": "Runtime result reported a latency timeout observation threshold breach.", }, + "runtime_operation_guard_block": { + "effect": "blocked", + "description": "AIGuard Runtime operation evidence reported failed backend, latency, or error-classification risk.", + }, + "runtime_operation_guard_review": { + "effect": "review_required", + "description": "AIGuard Runtime operation evidence reported warning-level runtime context risk.", + }, "runtime_reliability_pass_note": { "effect": "deployable_with_note", "description": "Runtime reliability evidence stayed within configured thresholds.", @@ -107,10 +123,12 @@ def build_agent_runtime_reliability_report( runtime_summary = _agent_runtime_summary(orchestration_summary) runtime_result_context = _runtime_result_operation_context(runtime_result) remote_dispatch_context = _remote_dispatch_context(remote_dispatch) + runtime_operation_guard_summary = _runtime_operation_guard_summary(guard_analysis) decision = build_agent_runtime_deployment_decision( metrics=metrics, guard_analysis=guard_analysis, runtime_result_context=runtime_result_context, + runtime_operation_guard_summary=runtime_operation_guard_summary, thresholds=policy, ) @@ -153,6 +171,7 @@ 
def build_agent_runtime_reliability_report( "policy_decision_log_count": len(_policy_log(orchestration_summary)), }, "guard_summary": _guard_summary(guard_analysis), + "runtime_operation_guard_summary": runtime_operation_guard_summary, "runtime_reliability_evidence": _runtime_reliability_evidence(guard_analysis), "agent_deployment_decision": decision, "notes": [ @@ -168,6 +187,7 @@ def build_agent_runtime_deployment_decision( metrics: dict[str, Any], guard_analysis: dict[str, Any] | None, runtime_result_context: dict[str, Any] | None = None, + runtime_operation_guard_summary: dict[str, Any] | None = None, thresholds: dict[str, float] | None = None, ) -> dict[str, Any]: policy = {**DEFAULT_AGENT_RUNTIME_THRESHOLDS, **(thresholds or {})} @@ -225,6 +245,13 @@ def build_agent_runtime_deployment_decision( ) if _runtime_timeout_observed(runtime_result_context): triggered_rules.append("runtime_timeout_observed_review") + runtime_guard = runtime_operation_guard_summary + if runtime_guard is None: + runtime_guard = _runtime_operation_guard_summary(guard_analysis) + if _runtime_operation_guard_blocking(runtime_guard): + triggered_rules.append("runtime_operation_guard_block") + elif _runtime_operation_guard_review(runtime_guard): + triggered_rules.append("runtime_operation_guard_review") if not triggered_rules: triggered_rules.append("runtime_reliability_pass_note") @@ -391,6 +418,7 @@ def build_agent_runtime_reliability_markdown(report: dict[str, Any]) -> str: metrics = runtime["metrics"] decision = report["agent_deployment_decision"] guard = report["guard_summary"] + runtime_guard = report.get("runtime_operation_guard_summary") or {} runtime_result_context = runtime.get("runtime_result_context") or {} remote_dispatch_context = runtime.get("remote_dispatch_context") or {} runtime_health = runtime_result_context.get("runtime_health_snapshot") or {} @@ -528,6 +556,31 @@ def build_agent_runtime_reliability_markdown(report: dict[str, Any]) -> str: f"| runtime_timeout_observed | 
{runtime_result_context.get('runtime_timeout_observed', False)} |", f"| runtime_event_count | {_fmt_number(runtime_event_summary.get('event_count'))} |", "", + "## AIGuard Runtime Operation Evidence", + "", + "| Field | Value |", + "|---|---|", + f"| evidence_count | {_fmt_number(runtime_guard.get('evidence_count'))} |", + f"| failed_count | {_fmt_number(runtime_guard.get('failed_count'))} |", + f"| warning_count | {_fmt_number(runtime_guard.get('warning_count'))} |", + f"| evidence_types | {', '.join(runtime_guard.get('evidence_types') or []) or '-'} |", + f"| retry_hints | {', '.join(runtime_guard.get('retry_hints') or []) or '-'} |", + "", + "Runtime operation guard evidence:", + "", + "| Type | Metric | Observed | Severity | Status | Recommendation |", + "|---|---|---:|---|---|---|", + *[ + "| " + f"{item.get('type') or '-'} | " + f"{item.get('metric_name') or '-'} | " + f"{_fmt_number(item.get('observed_value'))} | " + f"{item.get('severity') or '-'} | " + f"{item.get('status') or '-'} | " + f"{item.get('recommendation') or '-'} |" + for item in runtime_guard.get("evidence", []) + ], + "", "Runtime result event sample:", "", "| # | Type | Status | Detail |", @@ -788,6 +841,87 @@ def _runtime_reliability_evidence( ] +def _runtime_operation_guard_summary( + guard_analysis: dict[str, Any] | None, +) -> dict[str, Any]: + evidence = [ + item + for item in guard_evidence_items(guard_analysis) + if isinstance(item, dict) + and item.get("type") in RUNTIME_OPERATION_GUARD_EVIDENCE_TYPES + ] + failed = [item for item in evidence if item.get("status") == "failed"] + warnings = [item for item in evidence if item.get("status") == "warning"] + retry_hints = sorted( + { + retry_hint + for item in evidence + for retry_hint in [_runtime_operation_retry_hint(item)] + if isinstance(retry_hint, str) and retry_hint + } + ) + return { + "evidence_count": len(evidence), + "failed_count": len(failed), + "warning_count": len(warnings), + "evidence_types": [ + item.get("type") for item 
in evidence if isinstance(item.get("type"), str) + ], + "metric_names": [ + item.get("metric_name") + for item in evidence + if isinstance(item.get("metric_name"), str) + ], + "retry_hints": retry_hints, + "evidence": [ + { + "type": item.get("type"), + "metric_name": item.get("metric_name"), + "observed_value": item.get("observed_value"), + "threshold": item.get("threshold"), + "severity": item.get("severity"), + "status": item.get("status"), + "explanation": item.get("explanation"), + "recommendation": item.get("recommendation"), + "why_it_matters": item.get("why_it_matters"), + "retry_hint": _runtime_operation_retry_hint(item), + } + for item in evidence + ], + } + + +def _runtime_operation_guard_blocking(summary: dict[str, Any]) -> bool: + for item in _dict_list(summary.get("evidence")): + if item.get("status") != "failed": + continue + if item.get("severity") in {"high", "critical"}: + return True + if item.get("type") in { + "runtime_backend_unavailable", + "runtime_latency_budget_overrun", + }: + return True + return False + + +def _runtime_operation_guard_review(summary: dict[str, Any]) -> bool: + if _runtime_operation_guard_blocking(summary): + return False + return bool(summary.get("failed_count") or summary.get("warning_count")) + + +def _runtime_operation_retry_hint(evidence_item: dict[str, Any]) -> str | None: + raw_context = evidence_item.get("raw_context") + if not isinstance(raw_context, dict): + return None + runtime_operation = raw_context.get("runtime_operation") + if not isinstance(runtime_operation, dict): + return None + retry_hint = runtime_operation.get("retry_hint") + return retry_hint if isinstance(retry_hint, str) and retry_hint else None + + def _timeline_summary( orchestration_summary: dict[str, Any], metrics: dict[str, Any], diff --git a/tests/test_agent_runtime_report.py b/tests/test_agent_runtime_report.py index f606610..aaed06f 100644 --- a/tests/test_agent_runtime_report.py +++ b/tests/test_agent_runtime_report.py @@ -218,6 +218,78 
def runtime_operation_guard_analysis() -> dict:
    """Build a guard analysis fixture carrying failed Runtime operation evidence.

    Starts from ``guard_analysis()`` and appends two ``failed`` / ``high``
    evidence items (backend unavailable, latency budget overrun), then
    overrides the primary reason, suspected causes and recommendations.
    Only the backend-unavailable item carries a ``retry_hint``.
    """
    backend_unavailable = {
        "type": "runtime_backend_unavailable",
        "metric_name": "engine_available",
        "observed_value": 0,
        "baseline_value": None,
        "threshold": 1,
        "delta": None,
        "delta_pct": None,
        "increase_factor": None,
        "severity": "high",
        "status": "failed",
        "explanation": "Runtime could not confirm backend availability.",
        "why_it_matters": (
            "Runtime backend availability is required before using the "
            "result as deployment evidence."
        ),
        "suspected_causes": ["backend_runtime_unavailable"],
        "recommendation": "Check backend installation and engine load logs.",
        "raw_context": {
            "runtime_operation": {
                "engine_available": False,
                "retry_hint": "check_backend_availability",
            }
        },
    }
    latency_overrun = {
        "type": "runtime_latency_budget_overrun",
        "metric_name": "latency_budget_exceeded",
        "observed_value": 1,
        "baseline_value": None,
        "threshold": 50.0,
        "delta": 22.5,
        "delta_pct": 0.45,
        "increase_factor": None,
        "severity": "high",
        "status": "failed",
        "explanation": "Runtime latency exceeded the configured budget.",
        "why_it_matters": (
            "Latency budget overrun means the runtime result did not "
            "satisfy the expected timing contract."
        ),
        "suspected_causes": ["runtime_latency_spike"],
        "recommendation": "Review runtime event log and fallback policy.",
        "raw_context": {
            "runtime_operation": {
                "latency_budget_ms": 50.0,
                "observed_mean_ms": 72.5,
            }
        },
    }

    analysis = guard_analysis()
    analysis["primary_reason"] = (
        "runtime_error_severity indicates runtime reliability risk."
    )
    # Extend in place so the list object returned by guard_analysis() is kept.
    analysis["evidence"].extend((backend_unavailable, latency_overrun))
    analysis["suspected_causes"] = [
        "queue_backlog",
        "backend_runtime_unavailable",
        "runtime_latency_spike",
    ]
    analysis["recommendations"] = [
        "Tune scheduling policy.",
        "Check backend installation and engine load logs.",
        "Review runtime event log and fallback policy.",
    ]
    return analysis
"runtime_latency_budget_overrun", + } assert report["agent_runtime_summary"]["timeline_summary"] == { "scenario_mode": "sustained_high_load", "queue_depth_sample_count": 1, @@ -548,7 +631,11 @@ def test_agent_runtime_report_blocks_when_guard_blocks(): } assert { item["type"] for item in report["runtime_reliability_evidence"] - } == {"excessive_drop_rate", "sustained_overload_risk"} + } == { + "excessive_drop_rate", + "runtime_backend_unavailable", + "runtime_latency_budget_overrun", + } operation_context = report["agent_runtime_summary"]["operation_context"] assert operation_context["queue_state_summary"]["queue_pressure_state"] == "overloaded" assert operation_context["worker_health_counts"] == { @@ -630,7 +717,7 @@ def test_agent_runtime_report_keeps_legacy_orchestrator_summary_compatible(): def test_agent_runtime_report_markdown_contains_sections(): report = build_agent_runtime_reliability_report( orchestration_summary=orchestration_summary(), - guard_analysis=sustained_guard_analysis(), + guard_analysis=runtime_operation_guard_analysis(), runtime_result=runtime_result_with_operation_evidence(), remote_dispatch=remote_dispatch_result(), ) @@ -644,6 +731,10 @@ def test_agent_runtime_report_markdown_contains_sections(): assert "Worker Health" in markdown assert "Runtime Event Summary" in markdown assert "Runtime Result Operation Evidence" in markdown + assert "AIGuard Runtime Operation Evidence" in markdown + assert "runtime_backend_unavailable" in markdown + assert "runtime_latency_budget_overrun" in markdown + assert "check_backend_availability" in markdown assert "Remote Dispatch Context" in markdown assert "jetson-nano-01" in markdown assert "plan_only" in markdown @@ -655,7 +746,7 @@ def test_agent_runtime_report_markdown_contains_sections(): assert "AIGuard Runtime Reliability Evidence" in markdown assert "Lab Agent Deployment Decision" in markdown assert "guard_blocked_runtime_block" in markdown - assert "sustained_overload_risk" in markdown + assert 
"runtime_operation_guard_block" in markdown assert "max_total_queue_depth" in markdown assert "not a production cloud orchestration dashboard" in markdown