From cafc60e3a566bb920e24e80329c767a6e3906d21 Mon Sep 17 00:00:00 2001 From: hyeokjun32 Date: Sat, 23 May 2026 22:29:54 +0900 Subject: [PATCH] Preserve runtime telemetry coverage context --- README.md | 4 ++ docs/ko/README.md | 5 ++- docs/runtime-telemetry-history.md | 26 +++++++++++-- inferedge_env/cli.py | 36 ++++++++++++++++++ inferedge_env/compare/regression.py | 29 ++++++++++++++ inferedge_env/result/telemetry_history.py | 46 +++++++++++++++++++++++ tests/test_regression.py | 46 ++++++++++++++++++++++- tests/test_runtime_telemetry_history.py | 39 +++++++++++++++++++ 8 files changed, 225 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6888105..77b1fdb 100644 --- a/README.md +++ b/README.md @@ -313,6 +313,10 @@ Use `edgeenv runs telemetry inspect-history ` to validate and summarize that replay artifact before attaching it to a regression report. The intended local flow is export history, inspect the replay artifact, then pass it to `report regression --telemetry-history`. +If Runtime includes `runtime_telemetry.coverage`, EdgeEnv preserves it in the +history artifact and inspect summary as evidence quality metadata. Missing +coverage fields are visible as coverage gaps, but they do not fail the run or +change comparability. `report regression` reuses the same comparability gate. It only computes mean/p95/p99/FPS/resource deltas for `Comparable: Yes` with diff --git a/docs/ko/README.md b/docs/ko/README.md index 480ef60..6709865 100644 --- a/docs/ko/README.md +++ b/docs/ko/README.md @@ -80,7 +80,10 @@ runtime telemetry history artifact가 있으면 `--telemetry-history`로 연결 report에 telemetry coverage와 evidence gap을 보조 context로 첨부할 수 있다. 이 context는 same-condition comparability gate를 우회하지 않는다. `edgeenv runs telemetry inspect-history `로 history artifact의 schema, -replay run, telemetry field, evidence gap을 먼저 확인할 수 있다. +replay run, telemetry field, coverage metadata, evidence gap을 먼저 확인할 수 +있다. 
Runtime이 `runtime_telemetry.coverage`를 제공하면 EdgeEnv는 이를 +evidence quality metadata로 보존하지만, coverage 누락을 run 실패나 regression +judgement로 승격하지 않는다. ## EdgeEnv가 아닌 것 diff --git a/docs/runtime-telemetry-history.md b/docs/runtime-telemetry-history.md index de1039d..acda04d 100644 --- a/docs/runtime-telemetry-history.md +++ b/docs/runtime-telemetry-history.md @@ -37,6 +37,9 @@ The payload is intentionally additive and minimally validated: - it must be a JSON object - `schema_version`, when present, must be a string - unknown telemetry fields are preserved instead of normalized into registry columns +- Runtime telemetry `coverage`, when present, is preserved as evidence quality + metadata. Missing coverage fields are treated as an evidence gap, not as a + failed run or comparability failure. When present, EdgeEnv stores the payload in two places: @@ -95,7 +98,16 @@ The history artifact uses this top-level shape: { "run_id": "run-20260522-000000-12345678", "runtime_telemetry": { - "schema_version": "inferedge-runtime-telemetry-v1" + "schema_version": "inferedge-runtime-telemetry-v1", + "coverage": { + "schema_version": "inferedge-runtime-telemetry-coverage-v1", + "expected_fields": ["queue_depth", "gpu_temperature"], + "observed_fields": ["gpu_temperature"], + "missing_fields": ["queue_depth"], + "coverage_ratio": 0.5, + "comparability_owner": "edgeenv", + "missing_telemetry_is_failure": false + } }, "orchestrator_operation_context": { "schema_version": "inferedge-orchestrator-edgeenv-runtime-telemetry-feed-v1", @@ -117,8 +129,9 @@ This is a replay dataset seed. It records evidence gaps explicitly and does not `inspect-history` is a read-only validation step for that seed artifact. It checks the schema, summarizes replay run IDs, available telemetry fields, -execution sequence IDs, and missing telemetry evidence gaps. It does not mutate -the registry, change comparability judgement, or act as a monitoring alert. 
+execution sequence IDs, telemetry coverage metadata, and missing telemetry +evidence gaps. It does not mutate the registry, change comparability judgement, +or act as a monitoring alert. Regression reports can attach this artifact as supplemental context: @@ -146,6 +159,12 @@ The regression report records telemetry coverage and evidence gaps for the baseline/candidate pair. It still calculates regression deltas only after the normal same-condition comparability gate passes. +Runtime telemetry coverage context is copied into +`runtime_telemetry_context.{baseline,candidate}.telemetry_coverage` and, when +provided through the history artifact, `history_telemetry_coverage`. This makes +coverage gaps visible to Lab or AIGuard consumers without allowing coverage to +override EdgeEnv's comparability-first regression policy. + Replay edge cases are preserved as evidence context: - If the compared candidate is missing runtime telemetry, the regression report @@ -195,6 +214,7 @@ and Lab remains the deployment decision owner. - Do not make runtime telemetry required for a successful run. - Do not treat missing telemetry as a comparability failure. +- Do not treat missing telemetry coverage fields as a regression judgement. - Do not add telemetry columns to `runs.db` before a query/report requirement is proven. - Do not describe this as production observability, cloud monitoring, distributed tracing, or real-time data drift detection. - Do not use telemetry to bypass the existing comparability-first regression policy. 
# --- inferedge_env/cli.py -------------------------------------------------


def _print_runtime_telemetry_coverage(
    label: str,
    context: dict[str, Any],
) -> None:
    """Print a one-line telemetry coverage summary for one compared run.

    Args:
        label: Prefix for the printed line (the callers in this patch pass
            ``"baseline"`` or ``"candidate"``).
        context: Per-run telemetry context. ``telemetry_coverage`` is used
            when it is a dict; otherwise ``history_telemetry_coverage`` is
            tried. Nothing is printed when neither is a dict.
    """
    coverage = context.get("telemetry_coverage")
    if not isinstance(coverage, dict):
        coverage = context.get("history_telemetry_coverage")
    if not isinstance(coverage, dict):
        return
    missing_fields = coverage.get("missing_fields")
    if not isinstance(missing_fields, list):
        # A malformed or absent list is rendered the same as "nothing missing".
        missing_fields = []
    console.print(
        f"- {label} coverage: "
        f"observed={coverage.get('observed_field_count', '-')}/"
        f"{coverage.get('expected_field_count', '-')}, "
        f"missing={', '.join(str(item) for item in missing_fields) or '-'}, "
        f"missing_is_failure="
        f"{str(bool(coverage.get('missing_telemetry_is_failure'))).lower()}",
        soft_wrap=True,
    )


# --- inferedge_env/compare/regression.py ----------------------------------


def _telemetry_coverage_context(value: Any) -> dict[str, Any] | None:
    """Copy the recognised coverage keys out of a runtime coverage payload.

    Args:
        value: The ``coverage`` entry of a runtime telemetry payload; may be
            any type.

    Returns:
        ``None`` when ``value`` is not a dict. Otherwise a new dict holding
        only the recognised keys that are present in ``value``, in the fixed
        order listed below. Values are copied by reference, not deep-copied.
    """
    if not isinstance(value, dict):
        return None
    context: dict[str, Any] = {}
    for key in (
        "schema_version",
        "expected_fields",
        "observed_fields",
        "missing_fields",
        "expected_field_count",
        "observed_field_count",
        "missing_field_count",
        "coverage_ratio",
        "comparability_owner",
        "missing_telemetry_is_failure",
    ):
        if key in value:
            context[key] = value[key]
    return context


# --- inferedge_env/result/telemetry_history.py ----------------------------


def _telemetry_coverage_summary(entries: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-run ``runtime_telemetry.coverage`` blocks for inspection.

    Entries whose ``runtime_telemetry`` or ``coverage`` value is not a dict
    are ignored, so missing coverage never raises.

    Args:
        entries: History run entries (dicts) as stored in the artifact.

    Returns:
        A dict with ``runs_with_coverage`` (count of entries carrying a
        coverage dict), the sorted unions of ``expected_fields``,
        ``observed_fields`` and ``missing_fields`` (string items only),
        ``coverage_ratio_min`` / ``coverage_ratio_max`` (``None`` when no
        numeric ratio was seen), and the sorted distinct boolean values of
        ``missing_telemetry_is_failure``.
    """
    runs_with_coverage = 0
    expected_fields: set[str] = set()
    observed_fields: set[str] = set()
    missing_fields: set[str] = set()
    ratios: list[float] = []
    missing_telemetry_failure_values: set[bool] = set()

    for entry in entries:
        telemetry = entry.get("runtime_telemetry")
        if not isinstance(telemetry, dict):
            continue
        coverage = telemetry.get("coverage")
        if not isinstance(coverage, dict):
            continue
        runs_with_coverage += 1
        expected_fields.update(_string_items(coverage.get("expected_fields")))
        observed_fields.update(_string_items(coverage.get("observed_fields")))
        missing_fields.update(_string_items(coverage.get("missing_fields")))
        ratio = coverage.get("coverage_ratio")
        # bool is a subclass of int: without the explicit exclusion a stray
        # True/False would be reported as a 1.0/0.0 coverage ratio.
        if isinstance(ratio, (int, float)) and not isinstance(ratio, bool):
            ratios.append(float(ratio))
        missing_telemetry_is_failure = coverage.get("missing_telemetry_is_failure")
        if isinstance(missing_telemetry_is_failure, bool):
            missing_telemetry_failure_values.add(missing_telemetry_is_failure)

    return {
        "runs_with_coverage": runs_with_coverage,
        "expected_fields": sorted(expected_fields),
        "observed_fields": sorted(observed_fields),
        "missing_fields": sorted(missing_fields),
        "coverage_ratio_min": min(ratios) if ratios else None,
        "coverage_ratio_max": max(ratios) if ratios else None,
        "missing_telemetry_is_failure_values": sorted(
            missing_telemetry_failure_values
        ),
    }


def _string_items(value: Any) -> list[str]:
    """Return the ``str`` members of ``value``, or ``[]`` if it is not a list."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, str)]


def _is_monotonic(values: list[int | float]) -> bool:
    """Return True when ``values`` never decreases (equal neighbours allowed)."""
    return all(left <= right for left, right in zip(values, values[1:]))
"telemetry_timestamp"], + "missing_fields": ["queue_depth"], + "expected_field_count": 3, + "observed_field_count": 2, + "missing_field_count": 1, + "coverage_ratio": 0.666667, + "comparability_owner": "edgeenv", + "missing_telemetry_is_failure": False, + } + assert context["candidate"]["history_telemetry_coverage"]["missing_fields"] == [ + "queue_depth" + ] assert context["evidence_gaps"] == [] assert report.evidence["mean_delta_pct"] == 12.0 @@ -859,7 +882,14 @@ def _write_registered_run( RunRegistry(edgeenv_root / "runs.db").insert(result, run_dir / "result.json") -def _runtime_telemetry(sequence_id: int) -> dict: +def _runtime_telemetry( + sequence_id: int, + *, + missing_fields: list[str] | None = None, +) -> dict: + missing = missing_fields or [] + expected_fields = ["gpu_temperature", "queue_depth", "telemetry_timestamp"] + observed_fields = [field for field in expected_fields if field not in missing] return { "schema_version": "inferedge-runtime-telemetry-v1", "telemetry_timestamp": f"2026-05-22T00:00:0{sequence_id}Z", @@ -874,6 +904,18 @@ def _runtime_telemetry(sequence_id: int) -> dict: "operation": { "timeout_observed": False, }, + "coverage": { + "schema_version": "inferedge-runtime-telemetry-coverage-v1", + "expected_fields": expected_fields, + "observed_fields": observed_fields, + "missing_fields": missing, + "expected_field_count": len(expected_fields), + "observed_field_count": len(observed_fields), + "missing_field_count": len(missing), + "coverage_ratio": round(len(observed_fields) / len(expected_fields), 6), + "comparability_owner": "edgeenv", + "missing_telemetry_is_failure": False, + }, } diff --git a/tests/test_runtime_telemetry_history.py b/tests/test_runtime_telemetry_history.py index 68416bc..c7cea3f 100644 --- a/tests/test_runtime_telemetry_history.py +++ b/tests/test_runtime_telemetry_history.py @@ -65,6 +65,18 @@ def test_build_runtime_telemetry_history_records_entries_and_missing_gaps( assert 
payload["runs"][0]["runtime_telemetry"]["resource"] == { "telemetry_source": "runtime-result" } + assert payload["runs"][0]["runtime_telemetry"]["coverage"] == { + "schema_version": "inferedge-runtime-telemetry-coverage-v1", + "expected_fields": ["queue_depth", "gpu_temperature"], + "observed_fields": ["gpu_temperature"], + "missing_fields": ["queue_depth"], + "expected_field_count": 2, + "observed_field_count": 1, + "missing_field_count": 1, + "coverage_ratio": 0.5, + "comparability_owner": "edgeenv", + "missing_telemetry_is_failure": False, + } assert payload["runs"][0]["protocol"]["repeat_runs"] == 10 assert payload["missing_telemetry"] == [ { @@ -285,6 +297,15 @@ def test_inspect_runtime_telemetry_history_reports_replay_summary( assert summary["replay"]["missing_run_ids"] == ["run-without-telemetry"] assert "latency" in summary["replay"]["telemetry_fields"] assert "operation" in summary["replay"]["telemetry_fields"] + assert summary["replay"]["telemetry_coverage"] == { + "runs_with_coverage": 2, + "expected_fields": ["gpu_temperature", "queue_depth"], + "observed_fields": ["gpu_temperature"], + "missing_fields": ["queue_depth"], + "coverage_ratio_min": 0.5, + "coverage_ratio_max": 0.5, + "missing_telemetry_is_failure_values": [False], + } assert summary["replay"]["orchestrator_context_run_ids"] == [] assert "not production monitoring" in summary["notes"][2] @@ -398,6 +419,8 @@ def test_cli_runs_telemetry_inspect_history_validates_replay_artifact( assert "Runtime telemetry history valid" in result.output assert "Replay runs: 1" in result.output assert "Telemetry fields:" in result.output + assert "Telemetry coverage runs: 1" in result.output + assert "Telemetry coverage missing fields: queue_depth" in result.output assert "latency" in result.output assert "Evidence gaps: 1" in result.output assert "run-cli-without-telemetry" in result.output @@ -439,6 +462,10 @@ def test_cli_runs_telemetry_inspect_history_json_output( assert payload["valid"] is True assert 
payload["replay"]["run_ids"] == ["run-cli-json"] assert payload["replay"]["execution_sequence_ids"] == [3] + assert payload["replay"]["telemetry_coverage"]["runs_with_coverage"] == 1 + assert payload["replay"]["telemetry_coverage"]["missing_fields"] == [ + "queue_depth" + ] def _write_registered_run( @@ -487,6 +514,18 @@ def _runtime_telemetry_payload(sequence_id: int = 7) -> dict: "operation": { "timeout_observed": False, }, + "coverage": { + "schema_version": "inferedge-runtime-telemetry-coverage-v1", + "expected_fields": ["queue_depth", "gpu_temperature"], + "observed_fields": ["gpu_temperature"], + "missing_fields": ["queue_depth"], + "expected_field_count": 2, + "observed_field_count": 1, + "missing_field_count": 1, + "coverage_ratio": 0.5, + "comparability_owner": "edgeenv", + "missing_telemetry_is_failure": False, + }, "missing_fields": ["queue_depth"], "production_monitoring": False, }