gwonxhj · hyeokjun32 · May 21, 2026 · May 21, 2026
diff --git a/docs/remote_dispatch_starter.ko.md b/docs/remote_dispatch_starter.ko.md
@@ -147,6 +147,13 @@ starter는 기본적으로 execution planning만 기록하고 network connection
 - fallback execution은 starter evidence로만 제한한다. production-grade retry,
   heartbeat, failover state, worker lifecycle management는 future hardening이다.
 
+이 recovery path의 작은 curated sample은
+[`examples/telemetry/remote_fallback_recovery_sample.json`](../examples/telemetry/remote_fallback_recovery_sample.json)에
+있다. 이 sample은 primary HTTP starter `connection_error`, 성공한 fallback
+starter attempt, retry/fallback plan field, AIGuard/Lab이 기대하는 downstream
+`remote_execution_recovered_by_fallback` signal을 기록한다. 이 sample은 문서용
+evidence이며 benchmark나 production retry claim이 아니다.
+
 ## Boundary
 
 정확한 표현을 유지한다.

diff --git a/docs/remote_dispatch_starter.md b/docs/remote_dispatch_starter.md
@@ -151,6 +151,14 @@ When execution is requested:
   retry, heartbeat, failover state, and worker lifecycle management remain
   future hardening.
 
+A small curated sample of this recovery path is available at
+[`examples/telemetry/remote_fallback_recovery_sample.json`](../examples/telemetry/remote_fallback_recovery_sample.json).
+It records a primary HTTP starter `connection_error`, a successful fallback
+starter attempt, the retry/fallback plan fields, and the downstream
+`remote_execution_recovered_by_fallback` signal expected by AIGuard/Lab. The
+sample is documentation evidence only, not a benchmark or production retry
+claim.
+
 ## Boundary
 
 Use precise wording:

diff --git a/examples/telemetry/README.ko.md b/examples/telemetry/README.ko.md
@@ -16,6 +16,7 @@ portfolio evidence를 확인할 수 있도록 제공한다.
 | --- | --- |
 | `phase3_overload_sample.json` | synthetic FIFO baseline과 scheduler/load-shedding 비교. detector p95 end-to-end latency가 `782.0ms`에서 `8.0ms`로 개선되고 low-priority classifier work가 drop된다. |
 | `agent_scheduler_delay_sample.json` | 3-agent sustained high-load config에서 추출한 curated excerpt. `scheduler_delay_event_count`, 지연된 execution event, policy/drop reason count, downstream AIGuard/Lab signal name을 보여준다. |
+| `remote_fallback_recovery_sample.json` | remote dispatch starter에서 primary HTTP starter `connection_error`, 제한된 fallback worker recovery, retry/fallback plan field, downstream AIGuard/Lab signal name을 보여주는 curated excerpt. |
 | `jetson_smoke_dummy_sample.json` | Jetson dummy smoke path의 telemetry schema. task count, drop event, result event, scheduler decision, resource snapshot을 보여준다. |
 | `jetson_onnx_smoke_sample.json` | ONNX Runtime worker smoke path의 telemetry schema. result event metadata, output shape `[1, 2]`, resource snapshot을 보여준다. |
 | `jetson_tensorrt_contention_sample.json` | Jetson의 TensorRT-backed scheduler/load-shedding evidence. `detector_trt`는 보호되고 `classifier_trt`는 제한되며 result event에 TensorRT backend metadata가 유지된다. |
@@ -31,6 +32,7 @@ sample은 다음 telemetry signal을 포함한다.
 - drop events
 - overload 또는 policy decisions
 - scheduler delay event count와 queue wait evidence
+- remote dispatch starter failure/fallback recovery evidence
 - result events
 - resource snapshots
 

diff --git a/examples/telemetry/README.md b/examples/telemetry/README.md
@@ -16,6 +16,7 @@ They are not benchmark artifacts. Raw runtime reports remain ignored under
 | --- | --- |
 | `phase3_overload_sample.json` | Synthetic FIFO baseline vs scheduler/load-shedding comparison. The detector p95 end-to-end latency improves from `782.0ms` to `8.0ms`, while low-priority classifier work is dropped. |
 | `agent_scheduler_delay_sample.json` | Curated excerpt from the 3-agent sustained high-load config showing `scheduler_delay_event_count`, a delayed execution event, policy/drop reason counts, and the downstream AIGuard/Lab signal names. |
+| `remote_fallback_recovery_sample.json` | Curated remote dispatch starter excerpt showing a primary HTTP starter `connection_error`, bounded fallback worker recovery, retry/fallback plan fields, and downstream AIGuard/Lab signal names. |
 | `jetson_smoke_dummy_sample.json` | Telemetry schema from the Jetson dummy smoke path: task counts, drop events, result events, scheduler decisions, and resource snapshots. |
 | `jetson_onnx_smoke_sample.json` | Telemetry schema from the ONNX Runtime worker smoke path: result event metadata, output shape `[1, 2]`, and resource snapshots. |
 | `jetson_tensorrt_contention_sample.json` | TensorRT-backed scheduler/load-shedding evidence from Jetson: `detector_trt` is protected, `classifier_trt` is shed, and result events keep TensorRT backend metadata. |
@@ -31,6 +32,7 @@ The samples cover these telemetry signals:
 - drop events
 - overload or policy decisions
 - scheduler delay event counts and queue wait evidence
+- remote dispatch starter failure/fallback recovery evidence
 - result events
 - resource snapshots
 

diff --git a/examples/telemetry/remote_fallback_recovery_sample.json b/examples/telemetry/remote_fallback_recovery_sample.json
@@ -0,0 +1,105 @@
+{
+  "schema_version": "inferedge-remote-fallback-recovery-sample-v1",
+  "not_a_benchmark": true,
+  "source_test": "tests/test_remote_dispatch.py::test_remote_dispatch_execute_plan_falls_back_after_primary_connection_error",
+  "source_contract": "inferedge-remote-dispatch-result-v1",
+  "task_request": {
+    "schema_version": "inferedge-remote-task-request-v1",
+    "task_id": "task_http_fallback_001",
+    "agent_id": "vision_agent",
+    "required_backend": "onnxruntime",
+    "device_target": "cpu",
+    "retry_policy": {
+      "max_attempts": 2,
+      "fallback_on": [
+        "connection_error",
+        "timeout"
+      ]
+    }
+  },
+  "dispatch_summary": {
+    "dispatch_status": "accepted",
+    "selected_worker_id": "primary-http-worker",
+    "candidate_worker_ids": [
+      "primary-http-worker",
+      "fallback-http-worker"
+    ],
+    "fallback_worker_ids": [
+      "fallback-http-worker"
+    ],
+    "decision_reason": "selected online worker matching backend/device requirements"
+  },
+  "remote_execution_result": {
+    "schema_version": "inferedge-remote-execution-result-v1",
+    "execution_requested": true,
+    "execution_performed": false,
+    "production_remote_execution": false,
+    "transport": "http",
+    "selected_worker_id": "primary-http-worker",
+    "status": "failed",
+    "error_category": "connection_error"
+  },
+  "fallback_execution_result": {
+    "schema_version": "inferedge-remote-fallback-execution-v1",
+    "primary_worker_id": "primary-http-worker",
+    "attempted_worker_ids": [
+      "fallback-http-worker"
+    ],
+    "final_status": "succeeded",
+    "attempts": [
+      {
+        "fallback_attempt": 1,
+        "selected_worker_id": "fallback-http-worker",
+        "transport": "http",
+        "status": "succeeded",
+        "production_remote_execution": false
+      }
+    ]
+  },
+  "retry_fallback_plan": {
+    "schema_version": "inferedge-remote-retry-fallback-plan-v1",
+    "max_attempts": 2,
+    "fallback_on": [
+      "connection_error",
+      "timeout"
+    ],
+    "primary_worker_id": "primary-http-worker",
+    "fallback_worker_ids": [
+      "fallback-http-worker"
+    ],
+    "execution_performed": true,
+    "fallback_execution_performed": true,
+    "fallback_attempted_worker_ids": [
+      "fallback-http-worker"
+    ],
+    "last_execution_status": "succeeded"
+  },
+  "runtime_event_sample": [
+    {
+      "event": "remote_dispatch_selected",
+      "selected_worker_id": "primary-http-worker",
+      "reason": "selected online worker matching backend/device requirements"
+    },
+    {
+      "event": "remote_execution_failed",
+      "selected_worker_id": "primary-http-worker",
+      "transport": "http",
+      "status": "failed",
+      "error_category": "connection_error"
+    },
+    {
+      "event": "remote_fallback_execution_completed",
+      "selected_worker_id": "fallback-http-worker",
+      "primary_worker_id": "primary-http-worker",
+      "transport": "http",
+      "status": "succeeded",
+      "fallback_attempt": 1
+    }
+  ],
+  "downstream_expectation": {
+    "aiguard_evidence_type": "remote_execution_recovered_by_fallback",
+    "lab_report_context": "Remote fallback starter evidence",
+    "entrypoint_registry_operation_path": "remote_dispatch_with_fallback",
+    "boundary": "starter evidence only; not production remote retry control"
+  }
+}
diff --git a/tests/test_sample_telemetry_artifacts.py b/tests/test_sample_telemetry_artifacts.py
@@ -56,6 +56,54 @@ def test_agent_scheduler_delay_sample_records_downstream_signal() -> None:
     )
 
 
+def test_remote_fallback_recovery_sample_records_starter_boundary() -> None:
+    sample = _load_sample("remote_fallback_recovery_sample.json")
+    # Top-level identity: schema version, source contract, not-a-benchmark flag.
+    assert sample["schema_version"] == (  # type: ignore[index]
+        "inferedge-remote-fallback-recovery-sample-v1"
+    )
+    assert sample["source_contract"] == "inferedge-remote-dispatch-result-v1"
+    assert sample["not_a_benchmark"] is True
+    # Dispatch summary: accepted, primary worker selected, one fallback listed.
+    dispatch = sample["dispatch_summary"]  # type: ignore[index]
+    assert dispatch["dispatch_status"] == "accepted"
+    assert dispatch["selected_worker_id"] == "primary-http-worker"
+    assert dispatch["fallback_worker_ids"] == ["fallback-http-worker"]
+    # Primary attempt: non-production HTTP execution failed with connection_error.
+    remote_execution = sample["remote_execution_result"]  # type: ignore[index]
+    assert remote_execution["production_remote_execution"] is False
+    assert remote_execution["transport"] == "http"
+    assert remote_execution["status"] == "failed"
+    assert remote_execution["error_category"] == "connection_error"
+    # Fallback result: one fallback worker attempted, succeeded, non-production.
+    fallback = sample["fallback_execution_result"]  # type: ignore[index]
+    assert fallback["schema_version"] == "inferedge-remote-fallback-execution-v1"
+    assert fallback["primary_worker_id"] == "primary-http-worker"
+    assert fallback["attempted_worker_ids"] == ["fallback-http-worker"]
+    assert fallback["final_status"] == "succeeded"
+    assert fallback["attempts"][0]["production_remote_execution"] is False
+    # Retry/fallback plan: fallback execution ran and the last status succeeded.
+    retry_plan = sample["retry_fallback_plan"]  # type: ignore[index]
+    assert retry_plan["fallback_execution_performed"] is True
+    assert retry_plan["last_execution_status"] == "succeeded"
+    # Runtime events are compared as an exact ordered list of event names.
+    events = sample["runtime_event_sample"]  # type: ignore[index]
+    assert [event["event"] for event in events] == [
+        "remote_dispatch_selected",
+        "remote_execution_failed",
+        "remote_fallback_execution_completed",
+    ]
+    # Downstream expectation: signal/path names and a "not production" boundary.
+    downstream = sample["downstream_expectation"]  # type: ignore[index]
+    assert downstream["aiguard_evidence_type"] == (
+        "remote_execution_recovered_by_fallback"
+    )
+    assert downstream["entrypoint_registry_operation_path"] == (
+        "remote_dispatch_with_fallback"
+    )
+    assert "not production" in downstream["boundary"]
+
+
 def test_jetson_dummy_sample_matches_runtime_telemetry_schema() -> None:
     sample = _load_sample("jetson_smoke_dummy_sample.json")