Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ edgeenv report compare <run_id_a> <run_id_b>
```

For the full flow, see [Compare Workflow Guide](docs/compare-workflow-guide.md).
The guide also links to small, committed runtime-regression replay fixtures under
`examples/regression/` for downstream AIGuard/Lab handoff checks.

### 4. Optional Resource And Sampler Evidence

Expand Down
14 changes: 14 additions & 0 deletions docs/compare-workflow-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,20 @@ If a runtime telemetry history artifact is available, pass
regression report. Telemetry context remains supplemental evidence; it never
bypasses the same-condition comparability gate.

Committed replay-context examples are available when downstream tools need a
small EdgeEnv-owned fixture without running a benchmark:

- `examples/regression/edgeenv_candidate_telemetry_gap.json` shows a comparable
same-condition report where the candidate run is missing runtime telemetry in
both the result artifact and telemetry history.
- `examples/regression/edgeenv_sequence_inversion.json` shows a comparable
same-condition report where baseline/candidate `execution_sequence_id` order
is inverted in the replay context.

These examples intentionally do not include `guard_analysis` or a deployment
decision. EdgeEnv owns the registry, replay context, comparability judgement,
and regression evidence; AIGuard and Lab consume the artifact later.

Default starter thresholds:

| Signal | Threshold | Meaning |
Expand Down
92 changes: 92 additions & 0 deletions examples/regression/edgeenv_candidate_telemetry_gap.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{
"baseline_run_id": "baseline",
"candidate_run_id": "candidate",
"regression_detected": false,
"regression_type": "none",
"severity": "none",
"comparable": true,
"mode": "same-condition",
"evidence": {
"mean_delta_pct": 8.0,
"p95_delta_pct": 10.0,
"p99_delta_pct": 12.0,
"fps_delta_pct": -5.0,
"memory_peak_delta_pct": 4.0,
"baseline": {
"latency_mean_ms": 100.0,
"latency_p95_ms": 120.0,
"latency_p99_ms": 130.0,
"throughput_fps": 50.0,
"memory_peak_mb": 512.0
},
"candidate": {
"latency_mean_ms": 108.0,
"latency_p95_ms": 132.0,
"latency_p99_ms": 145.6,
"throughput_fps": 47.5,
"memory_peak_mb": 532.48
},
"triggered_thresholds": []
},
"recommendation": "no_action_required",
"comparability": {
"comparable": "Yes",
"mode": "same-condition",
"reasons": [
"Required comparison fields match.",
"Runtime/provider/target fields match."
]
},
"runtime_telemetry_context": {
"role": "supplemental_runtime_telemetry_context",
"source": "result_artifacts+runtime_telemetry_history",
"baseline": {
"run_id": "baseline",
"result_telemetry_present": true,
"history_entry_present": true,
"history_missing_recorded": false,
"schema_version": "edgeenv.runtime-telemetry.v1",
"telemetry_timestamp": "2026-05-22T00:00:01Z",
"execution_sequence_id": 1,
"telemetry_source": "synthetic_local_fixture",
"available_sections": [
"execution_sequence_id",
"latency",
"resource",
"telemetry_timestamp"
],
"history_telemetry_timestamp": "2026-05-22T00:00:01Z",
"history_execution_sequence_id": 1
},
"candidate": {
"run_id": "candidate",
"result_telemetry_present": false,
"history_entry_present": false,
"history_missing_recorded": true,
"history_missing_reason": "runtime_telemetry_missing"
},
"evidence_gaps": [
{
"run_id": "candidate",
"reason": "runtime_telemetry_missing_in_result"
},
{
"run_id": "candidate",
"reason": "runtime_telemetry_missing"
}
],
"notes": [
"Runtime telemetry context is supplemental evidence, not a comparability gate.",
"Missing telemetry is an evidence gap, not a failed benchmark run.",
"Regression deltas are still gated by same-condition comparability."
],
"history": {
"schema_version": "edgeenv.runtime-telemetry-history.v1",
"summary": {
"registered_runs": 3,
"telemetry_runs": 2,
"missing_telemetry_runs": 1
}
}
}
}
94 changes: 94 additions & 0 deletions examples/regression/edgeenv_sequence_inversion.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"baseline_run_id": "baseline",
"candidate_run_id": "candidate",
"regression_detected": false,
"regression_type": "none",
"severity": "none",
"comparable": true,
"mode": "same-condition",
"evidence": {
"mean_delta_pct": 12.0,
"p95_delta_pct": 4.166667,
"p99_delta_pct": 3.846154,
"fps_delta_pct": -4.0,
"memory_peak_delta_pct": 5.0,
"baseline": {
"latency_mean_ms": 100.0,
"latency_p95_ms": 120.0,
"latency_p99_ms": 130.0,
"throughput_fps": 50.0,
"memory_peak_mb": 512.0
},
"candidate": {
"latency_mean_ms": 112.0,
"latency_p95_ms": 125.0,
"latency_p99_ms": 135.0,
"throughput_fps": 48.0,
"memory_peak_mb": 537.6
},
"triggered_thresholds": []
},
"recommendation": "no_action_required",
"comparability": {
"comparable": "Yes",
"mode": "same-condition",
"reasons": [
"Required comparison fields match.",
"Runtime/provider/target fields match."
]
},
"runtime_telemetry_context": {
"role": "supplemental_runtime_telemetry_context",
"source": "result_artifacts+runtime_telemetry_history",
"baseline": {
"run_id": "baseline",
"result_telemetry_present": true,
"history_entry_present": true,
"history_missing_recorded": false,
"schema_version": "edgeenv.runtime-telemetry.v1",
"telemetry_timestamp": "2026-05-22T00:00:05Z",
"execution_sequence_id": 5,
"telemetry_source": "synthetic_local_fixture",
"available_sections": [
"execution_sequence_id",
"latency",
"resource",
"telemetry_timestamp"
],
"history_telemetry_timestamp": "2026-05-22T00:00:05Z",
"history_execution_sequence_id": 5
},
"candidate": {
"run_id": "candidate",
"result_telemetry_present": true,
"history_entry_present": true,
"history_missing_recorded": false,
"schema_version": "edgeenv.runtime-telemetry.v1",
"telemetry_timestamp": "2026-05-22T00:00:02Z",
"execution_sequence_id": 2,
"telemetry_source": "synthetic_local_fixture",
"available_sections": [
"execution_sequence_id",
"latency",
"resource",
"telemetry_timestamp"
],
"history_telemetry_timestamp": "2026-05-22T00:00:02Z",
"history_execution_sequence_id": 2
},
"evidence_gaps": [],
"notes": [
"Runtime telemetry context is supplemental evidence, not a comparability gate.",
"Missing telemetry is an evidence gap, not a failed benchmark run.",
"Regression deltas are still gated by same-condition comparability."
],
"history": {
"schema_version": "edgeenv.runtime-telemetry-history.v1",
"summary": {
"registered_runs": 2,
"telemetry_runs": 2,
"missing_telemetry_runs": 0
}
}
}
}
55 changes: 55 additions & 0 deletions tests/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
from helpers import make_result


# Anchor the fixture directory to this file's location (this module lives in
# tests/, one level below the repository root) instead of the current working
# directory, so the fixture test passes no matter where pytest is invoked.
EXAMPLE_REGRESSION_DIR = (
    Path(__file__).resolve().parents[1] / "examples" / "regression"
)


def test_regression_detects_same_condition_latency_and_resource_regression(
bench_config,
target_profile,
Expand Down Expand Up @@ -629,6 +632,58 @@ def test_cli_telemetry_replay_candidate_gap_to_regression_smoke(
assert "runtime_telemetry_missing" in markdown


def test_committed_replay_warning_fixtures_preserve_edgeenv_owned_context():
    """Verify the committed replay fixtures keep their EdgeEnv-owned context.

    Loads the two JSON reports under ``EXAMPLE_REGRESSION_DIR`` and checks:

    * invariants shared by both fixtures: no ``guard_analysis`` key, a
      comparable same-condition report with no regression, and the combined
      result-artifact + telemetry-history context source;
    * the candidate-telemetry-gap fixture records the missing candidate
      telemetry in the candidate entry, the evidence gaps, and the history
      summary;
    * the sequence-inversion fixture carries baseline/candidate
      ``execution_sequence_id`` values in inverted order with no evidence gaps.
    """
    fixtures = {
        name: json.loads(
            (EXAMPLE_REGRESSION_DIR / f"{name}.json").read_text(encoding="utf-8")
        )
        for name in (
            "edgeenv_candidate_telemetry_gap",
            "edgeenv_sequence_inversion",
        )
    }

    # Both fixtures are documented as comparable same-condition reports, so
    # every shared invariant is asserted on each of them rather than on one.
    for name, report in fixtures.items():
        assert "guard_analysis" not in report, name
        assert report["comparable"] is True, name
        assert report["mode"] == "same-condition", name
        assert report["regression_detected"] is False, name
        assert report["runtime_telemetry_context"]["source"] == (
            "result_artifacts+runtime_telemetry_history"
        ), name

    candidate_context = fixtures["edgeenv_candidate_telemetry_gap"][
        "runtime_telemetry_context"
    ]
    assert candidate_context["history"]["summary"]["missing_telemetry_runs"] == 1
    candidate_entry = candidate_context["candidate"]
    assert candidate_entry["result_telemetry_present"] is False
    assert candidate_entry["history_entry_present"] is False
    assert candidate_entry["history_missing_recorded"] is True
    assert candidate_entry["history_missing_reason"] == "runtime_telemetry_missing"
    # One gap for the result artifact, one for the telemetry history.
    assert candidate_context["evidence_gaps"] == [
        {
            "run_id": "candidate",
            "reason": "runtime_telemetry_missing_in_result",
        },
        {
            "run_id": "candidate",
            "reason": "runtime_telemetry_missing",
        },
    ]

    sequence_context = fixtures["edgeenv_sequence_inversion"][
        "runtime_telemetry_context"
    ]
    baseline_entry = sequence_context["baseline"]
    inverted_candidate = sequence_context["candidate"]
    assert sequence_context["history"]["summary"]["missing_telemetry_runs"] == 0
    assert baseline_entry["execution_sequence_id"] == 5
    assert baseline_entry["history_execution_sequence_id"] == 5
    assert inverted_candidate["execution_sequence_id"] == 2
    assert inverted_candidate["history_execution_sequence_id"] == 2
    # State the inversion explicitly: the baseline ran after the candidate.
    assert (
        baseline_entry["execution_sequence_id"]
        > inverted_candidate["execution_sequence_id"]
    )
    assert sequence_context["evidence_gaps"] == []
    assert any(
        "not a comparability gate" in note for note in sequence_context["notes"]
    )


def test_regression_cli_marks_runtime_comparison_not_evaluated(
tmp_path,
bench_config,
Expand Down
Loading