# pact/test_loop.py at main · qizwiz/pact · GitHub

"""
Tests for pact_loop.py — convergence math, fitness, LoopState, TLA+ generation.

Does NOT make LLM calls. Tests the orchestration logic only.
"""

from __future__ import annotations

import dataclasses
import json
from pathlib import Path

import pytest

from .pact_loop import (
    IterationState,
    LoopResult,
    MeasureResult,
    _converged,
    _stuck,
    compute_fitness,
    generate_adr,
    generate_tla_spec,
)

# ---------------------------------------------------------------------------
# Fitness function
# ---------------------------------------------------------------------------


class TestComputeFitness:
    """Exercise ``compute_fitness`` on hand-built ``IterationState`` objects."""

    # The six quality attributes that several tests below set together.
    _QUALITY_ATTRS = (
        "heal_accept_rate",
        "oracle_confirm_rate",
        "find_confirm_rate",
        "topo_score",
        "avg_prompt_score",
        "sheaf_score",
    )

    @staticmethod
    def _state(iteration=1, **attrs):
        """Create an ``IterationState`` and assign each keyword as an attribute."""
        state = IterationState(iteration=iteration)
        for name, value in attrs.items():
            setattr(state, name, value)
        return state

    def test_zero_violations_max_quality(self):
        """Zero violations with every quality attribute at 1.0 gives fitness 1.0."""
        perfect = dict.fromkeys(self._QUALITY_ATTRS, 1.0)
        state = self._state(total_violations=0, **perfect)
        score = compute_fitness(state, initial_violations=10)
        assert score == pytest.approx(1.0, abs=1e-6)

    def test_all_zero_quality(self):
        """No violation reduction and every quality attribute at 0.0 gives 0.0."""
        worst = dict.fromkeys(self._QUALITY_ATTRS, 0.0)
        state = self._state(total_violations=10, **worst)
        score = compute_fitness(state, initial_violations=10)
        assert score == pytest.approx(0.0, abs=1e-6)

    def test_zero_initial_violations_treated_as_one(self):
        """An initial violation count of zero must not cause a division by zero."""
        state = self._state(total_violations=0)
        score = compute_fitness(state, initial_violations=0)
        assert 0.0 <= score <= 1.0

    def test_violation_reduction_improves_fitness(self):
        """Fewer remaining violations (same baseline) yields a higher fitness."""
        baseline = 20
        earlier = self._state(iteration=1, total_violations=20)
        later = self._state(iteration=2, total_violations=5)
        fitness_earlier = compute_fitness(earlier, initial_violations=baseline)
        fitness_later = compute_fitness(later, initial_violations=baseline)
        assert fitness_later > fitness_earlier

    def test_oracle_trust_component(self):
        """Only the violation, heal and oracle terms contribute in this setup."""
        state = self._state(
            total_violations=5,
            oracle_confirm_rate=1.0,
            heal_accept_rate=1.0,
            find_confirm_rate=0.0,
            topo_score=0.0,
            avg_prompt_score=0.0,
            sheaf_score=0.0,
        )
        score = compute_fitness(state, initial_violations=10)
        # Expected contributions:
        #   violations: 0.25 * 0.5 = 0.125
        #   heal:       0.20 * 1.0 = 0.20
        #   oracle:     0.15 * 1.0 = 0.15
        expected = 0.125 + 0.20 + 0.15
        assert score == pytest.approx(expected, abs=1e-6)

    def test_fitness_in_unit_interval(self):
        """Fifty seeded pseudo-random states all score inside [0, 1]."""
        import random

        rng = random.Random(42)
        for _ in range(50):
            # Keyword arguments are evaluated left to right, so the draws are
            # taken from the generator in the order written here.
            state = self._state(
                total_violations=rng.randint(0, 100),
                heal_accept_rate=rng.random(),
                oracle_confirm_rate=rng.random(),
                find_confirm_rate=rng.random(),
                topo_score=rng.random(),
                avg_prompt_score=rng.random(),
                sheaf_score=rng.random(),
            )
            baseline = rng.randint(1, 100)
            score = compute_fitness(state, initial_violations=baseline)
            assert 0.0 <= score <= 1.0, f"fitness {score} out of [0,1]"


# ---------------------------------------------------------------------------
# Convergence detection
# ---------------------------------------------------------------------------


class TestConvergence:
    """Behaviour of ``_converged`` on short fitness histories."""

    def test_too_few_iterations(self):
        """A history shorter than the window is never reported as converged."""
        history = [0.5, 0.6]
        assert not _converged(history, epsilon=0.01, window=3)

    def test_flat_history_converges(self):
        """Identical values across the window count as converged."""
        history = [0.7] * 3
        assert _converged(history, epsilon=0.01, window=3)

    def test_small_variation_converges(self):
        """A spread well below epsilon counts as converged."""
        history = [0.700, 0.701, 0.700]
        assert _converged(history, epsilon=0.01, window=3)

    def test_large_variation_not_converged(self):
        """A spread far above epsilon is not converged."""
        history = [0.5, 0.7, 0.9]
        assert not _converged(history, epsilon=0.01, window=3)

    def test_uses_only_recent_window(self):
        """Oscillation before the trailing window must be ignored."""
        oscillating_prefix = [0.1, 0.9, 0.1, 0.9]
        settled_tail = [0.7, 0.7, 0.7]
        assert _converged(oscillating_prefix + settled_tail, epsilon=0.01, window=3)

    def test_epsilon_boundary(self):
        """A delta of epsilon is rejected (strict ``<``); just under it is accepted."""
        at_boundary = [0.700, 0.710]  # delta = 0.01
        just_inside = [0.700, 0.709]  # delta = 0.009
        assert not _converged(at_boundary, epsilon=0.01, window=2)
        assert _converged(just_inside, epsilon=0.01, window=2)


# ---------------------------------------------------------------------------
# Stuck detection
# ---------------------------------------------------------------------------


class TestStuck:
    """Behaviour of ``_stuck`` on per-iteration accept counts."""

    def test_too_few(self):
        """One round cannot fill a window of two."""
        accepts = [0]
        assert not _stuck(accepts, window=2)

    def test_all_zero_stuck(self):
        """Two zero-accept rounds with a window of two means stuck."""
        accepts = [0, 0]
        assert _stuck(accepts, window=2)

    def test_one_nonzero_not_stuck(self):
        """A non-zero entry inside the window prevents the stuck verdict."""
        accepts = [1, 0]
        assert not _stuck(accepts, window=2)

    def test_trailing_zeros_stuck(self):
        """Earlier progress does not matter once the trailing window is all zero."""
        accepts = [5, 3, 0, 0]
        assert _stuck(accepts, window=2)

    def test_trailing_nonzero_not_stuck(self):
        """A non-zero final round means the loop is not stuck."""
        accepts = [0, 0, 1]
        assert not _stuck(accepts, window=2)


# ---------------------------------------------------------------------------
# LoopState serialization
# ---------------------------------------------------------------------------


class TestIterationStateSerialization:
    """Serialization checks for ``IterationState`` and ``LoopResult.summary``."""

    def test_roundtrip(self):
        """Values assigned on the state are present in ``dataclasses.asdict``."""
        s = IterationState(iteration=3)
        s.total_violations = 42
        s.heal_accept_rate = 0.75
        s.fitness = 0.634
        d = dataclasses.asdict(s)
        assert d["iteration"] == 3
        assert d["total_violations"] == 42
        assert d["fitness"] == pytest.approx(0.634)

    def test_json_serializable(self):
        """A state carrying a ``MeasureResult`` survives a JSON dump/load cycle."""
        s = IterationState(iteration=1)
        s.measure = MeasureResult(checker_total=5, checker_by_mode={"bare_except": 3})
        # Any serialization or parse error propagates and fails the test by
        # itself, so the round trip needs no try/except wrapper (which would
        # also have enclosed the assertion below).
        raw = json.dumps(dataclasses.asdict(s))
        restored = json.loads(raw)
        assert restored["measure"]["checker_total"] == 5

    def test_loop_result_summary(self):
        """``summary()`` mentions the termination reason, violation delta and fitness."""
        r = LoopResult(
            target="/tmp/proj",
            test_cmd="pytest",
            termination="CONVERGED",
            initial_violations=20,
            final_violations=3,
            final_fitness=0.87,
            elapsed_seconds=42.5,
        )
        s = r.summary()
        assert "CONVERGED" in s
        assert "20 → 3" in s
        assert "0.870" in s


# ---------------------------------------------------------------------------
# TLA+ spec generation
# ---------------------------------------------------------------------------


class TestTLAGeneration:
    """Checks on generated TLA+ output and on the checked-in formal spec files."""

    # Directory holding the checked-in spec, resolved relative to this test file.
    _FORMAL_DIR = Path(__file__).parent / "docs" / "tla"

    def test_generates_tla_and_cfg(self, tmp_path):
        """``generate_tla_spec`` writes the returned .tla file and a PactLoop.cfg."""
        tla_path = generate_tla_spec(max_iters=5, output_dir=tmp_path)
        assert tla_path.exists()
        cfg_path = tmp_path / "PactLoop.cfg"
        assert cfg_path.exists()

    def test_tla_contains_oracle_safety(self, tmp_path):
        """The generated spec names the oracle-safety invariant and its variables."""
        tla_path = generate_tla_spec(max_iters=5, output_dir=tmp_path)
        text = tla_path.read_text()
        assert "OracleSafety" in text
        assert "patches_applied" in text
        assert "oracle_passed" in text

    def test_tla_contains_termination(self, tmp_path):
        """The generated spec names the termination property and MAX_ITERS."""
        tla_path = generate_tla_spec(max_iters=5, output_dir=tmp_path)
        text = tla_path.read_text()
        assert "Termination" in text
        assert "MAX_ITERS" in text

    def test_cfg_contains_constants(self, tmp_path):
        """The generated cfg carries the requested MAX_ITERS and both checks."""
        generate_tla_spec(max_iters=7, output_dir=tmp_path)
        cfg = (tmp_path / "PactLoop.cfg").read_text()
        assert "MAX_ITERS = 7" in cfg
        assert "INVARIANT OracleSafety" in cfg
        assert "PROPERTY Termination" in cfg

    def test_formal_tla_spec_exists(self):
        """The checked-in spec exists and mentions the expected names."""
        formal = self._FORMAL_DIR / "PactLoop.tla"
        assert formal.exists(), "docs/tla/PactLoop.tla not found"
        text = formal.read_text()
        assert "FitnessMonotone" in text
        # "Stuck" is a substring of "StuckDetection", so this single check is
        # equivalent to testing for either name.
        assert "Stuck" in text
        assert "WF_vars" in text
        assert "PhaseProgress" in text

    def test_formal_cfg_has_all_properties(self):
        """The checked-in cfg lists every invariant and property checked here."""
        cfg = self._FORMAL_DIR / "PactLoop.cfg"
        assert cfg.exists(), "docs/tla/PactLoop.cfg not found"
        text = cfg.read_text()
        assert "INVARIANT OracleSafety" in text
        assert "PROPERTY Termination" in text
        assert "PROPERTY FitnessMonotone" in text
        # cache_opacity refinement (ADR-042) — spec_learner discovered invariants
        assert "INVARIANT CacheFreshInMeasure" in text
        assert "INVARIANT EpochMonotone" in text

    def test_formal_tla_has_cache_epoch_invariants(self):
        """The checked-in spec mentions the cache-epoch variables and invariants."""
        tla = self._FORMAL_DIR / "PactLoop.tla"
        # Fail with a clear message rather than a FileNotFoundError from read_text.
        assert tla.exists(), "docs/tla/PactLoop.tla not found"
        text = tla.read_text()
        assert "cache_epoch" in text
        assert "caches_fresh" in text
        assert "CacheFreshInMeasure" in text
        assert "EpochMonotone" in text
        assert "cache_opacity" in text


# ---------------------------------------------------------------------------
# ADR generation
# ---------------------------------------------------------------------------


class TestADRGeneration:
    """Checks on ``generate_adr``: when a file is written, its name and its body."""

    def test_generates_file_on_heal_accepted(self, tmp_path):
        """Accepted heals produce an ADR file that names the iteration."""
        adr_dir = tmp_path / "adr"
        state = IterationState(iteration=1)
        state.heal_accepted = 2
        state.heal_attempted = 3
        state.heal_accept_rate = 0.67
        state.fitness = 0.54

        filename = generate_adr(state, Path("/tmp/proj"), adr_dir, 10)
        assert filename is not None
        assert (adr_dir / filename).exists()
        text = (adr_dir / filename).read_text()
        assert "ADR-" in text
        assert "iteration 1" in text

    def test_no_adr_when_nothing_significant(self, tmp_path):
        """No accepted heals and a zero sheaf rank yields no ADR."""
        adr_dir = tmp_path / "adr"
        state = IterationState(iteration=2)
        state.heal_accepted = 0
        state.measure = MeasureResult(sheaf_rank=0)
        filename = generate_adr(state, Path("/tmp/proj"), adr_dir, 10)
        assert filename is None

    def test_adr_on_proved_clean(self, tmp_path):
        """A PROVED_CLEAN termination yields an ADR."""
        adr_dir = tmp_path / "adr"
        state = IterationState(iteration=1)
        state.total_violations = 0
        state.termination = "PROVED_CLEAN"
        filename = generate_adr(state, Path("/tmp/proj"), adr_dir, 5)
        assert filename is not None

    def test_adr_number_increments(self, tmp_path):
        """The new ADR is numbered one past the highest existing ADR number."""
        adr_dir = tmp_path / "adr"
        adr_dir.mkdir()
        # Pre-seed two ADRs
        (adr_dir / "ADR-001-foo.md").write_text("x")
        (adr_dir / "ADR-040-bar.md").write_text("x")

        state = IterationState(iteration=1)
        state.heal_accepted = 1
        filename = generate_adr(state, Path("/tmp/proj"), adr_dir, 5)
        assert filename is not None
        assert "ADR-041" in filename

    def test_adr_body_contains_all_dimensions(self, tmp_path):
        """The ADR body mentions sheaf, oracle and the fitness value."""
        adr_dir = tmp_path / "adr"
        state = IterationState(iteration=3)
        state.heal_accepted = 1
        state.heal_attempted = 2
        state.heal_accept_rate = 0.5
        state.oracle_confirmed = 1
        state.oracle_confirm_rate = 0.5
        state.fitness = 0.62
        state.measure = MeasureResult(
            checker_total=5,
            sheaf_rank=2,
            interproc_transitive=3,
            scc_count=1,
            hub_count=2,
        )
        filename = generate_adr(state, Path("/tmp/myproject"), adr_dir, 10)
        # Guard first: a None return would otherwise surface as a confusing
        # TypeError from the path join below.
        assert filename is not None
        text = (adr_dir / filename).read_text()
        assert "Sheaf" in text or "sheaf" in text
        assert "Oracle" in text or "oracle" in text
        assert "0.62" in text


# ---------------------------------------------------------------------------
# CLI subcommand
# ---------------------------------------------------------------------------


class TestCLI:
    """Argument handling of the ``pact_loop`` command-line entry point."""

    def test_loop_help_exits_zero(self, capsys):
        """``--help`` exits with status 0 and the output mentions ``test-cmd``."""
        from .pact_loop import main as loop_main

        with pytest.raises(SystemExit) as raised:
            loop_main(["--help"])

        assert raised.value.code == 0
        stdout = capsys.readouterr().out
        assert "test-cmd" in stdout

    def test_loop_requires_target(self):
        """Calling the entry point with no arguments exits with a non-zero status."""
        from .pact_loop import main as loop_main

        no_args = []
        with pytest.raises(SystemExit) as raised:
            loop_main(no_args)

        assert raised.value.code != 0


# ---------------------------------------------------------------------------
# Cache invalidation
# ---------------------------------------------------------------------------


class TestCacheInvalidation:
    """clear_file_caches() must make the checker re-read modified files.

    Root cause of the ts_checker.py:41 'Tool loop exhausted' failure:
    LRU-cached scanners served stale violations for files already healed in a
    prior iteration. The LLM kept reading the file, saw nothing to fix, and
    exhausted all tool rounds without producing a patch.
    """

    @staticmethod
    def _mode(result):
        """Return the result's ``context``, else its ``mode_name``, else ``""``."""
        return getattr(result, "context", getattr(result, "mode_name", ""))

    @classmethod
    def _has_bare_except(cls, results):
        """Return True when any result's mode label is ``bare_except``."""
        return any(cls._mode(r) == "bare_except" for r in results)

    def test_clear_caches_exposes_fresh_violations(self, tmp_path):
        """A violation added after the first scan appears only after a cache clear."""
        from .failure_mode import clear_file_caches
        from .checker import check_codebase

        f = tmp_path / "demo.py"
        # First scan: clean file
        f.write_text("x = 1\n")
        before = [r for r in check_codebase(tmp_path) if r.file == str(f)]

        # Inject a bare except (without clearing — stale cache returns no violations)
        f.write_text("try:\n    pass\nexcept:\n    pass\n")
        still_empty = [
            r
            for r in check_codebase(tmp_path)
            if r.file == str(f) and getattr(r, "spec_id", None) != "semgrep"
        ]

        # Clear caches and re-scan — must see the new violation
        clear_file_caches()
        after = [r for r in check_codebase(tmp_path) if r.file == str(f)]

        assert len(before) == 0
        assert len(still_empty) == 0, "stale cache should hide the new violation"
        # After cache clear, bare_except is visible. The message uses the same
        # tolerant lookup as the check, so building it cannot raise
        # AttributeError on results without a ``context`` attribute.
        assert self._has_bare_except(after), (
            "bare_except not found after cache clear; "
            f"got: {[self._mode(r) for r in after]}"
        )

    def test_clear_caches_removes_stale_violations(self, tmp_path):
        """A healed violation keeps being reported until the caches are cleared."""
        from .failure_mode import clear_file_caches
        from .checker import check_codebase

        f = tmp_path / "stale.py"
        # First scan: file has a bare except
        f.write_text("try:\n    pass\nexcept:\n    pass\n")
        clear_file_caches()
        before = [r for r in check_codebase(tmp_path) if r.file == str(f)]

        # Heal the file — replace bare except with specific typed handler
        f.write_text("try:\n    pass\nexcept ValueError:\n    pass\n")

        # WITHOUT clearing: stale cache returns old violation
        stale = [r for r in check_codebase(tmp_path) if r.file == str(f)]

        # After clearing: violation is gone
        clear_file_caches()
        fresh = [r for r in check_codebase(tmp_path) if r.file == str(f)]

        assert self._has_bare_except(
            before
        ), "setup: should have detected bare_except in first scan"
        assert self._has_bare_except(
            stale
        ), "without cache clear, stale violation should persist"
        assert not self._has_bare_except(
            fresh
        ), "after cache clear, healed file should show no bare_except"


# ---------------------------------------------------------------------------
# Stateful testing — pact_loop phase machine
# ---------------------------------------------------------------------------


class TestLoopStateMachine:
    """
    Hypothesis property tests around the loop's termination helpers.

    Each test defines an inner ``@given`` function over generated histories and
    calls it once. The properties exercised are named after the spec:

      FitnessMonotone  — a history reported as converged has a trailing-window
                         spread below EPSILON
      StuckDetection   — ``_stuck`` returns True only when the last
                         STUCK_WINDOW accept counts are all zero
      OracleSafety     — accepted count is non-zero only when oracle_ok is True
                         (simulated inside the test, see the note there)
      Termination      — a MAX_ITERS-long run satisfies a termination condition
    """

    def test_fitness_monotone_in_converging_sequence(self):
        """Fitness must not drop by more than EPSILON across a converging window."""
        from hypothesis import given, settings
        from hypothesis import strategies as st

        EPSILON = 0.01
        WINDOW = 3

        # The previously generated ``initial_v`` argument was never read by the
        # property, so it is no longer requested from Hypothesis.
        @given(
            fitness_deltas=st.lists(
                st.floats(min_value=-0.005, max_value=0.1, allow_nan=False),
                min_size=WINDOW,
                max_size=10,
            ),
        )
        @settings(max_examples=200, deadline=None)
        def check(fitness_deltas):
            # Accumulate the deltas into a fitness history clamped to [0, 1].
            history = []
            f = 0.0
            for delta in fitness_deltas:
                f = max(0.0, min(1.0, f + delta))
                history.append(f)

            if _converged(history, epsilon=EPSILON, window=WINDOW):
                recent = history[-WINDOW:]
                assert (
                    max(recent) - min(recent) < EPSILON
                ), f"converged history violates FitnessMonotone: {recent}"

        check()

    def test_stuck_declared_only_after_window(self):
        """_stuck() must not fire before STUCK_WINDOW consecutive zero-accept rounds."""
        from hypothesis import given, settings
        from hypothesis import strategies as st

        STUCK_WINDOW = 2

        @given(
            prefix=st.lists(
                st.integers(min_value=0, max_value=5), min_size=0, max_size=5
            ),
            # Fewer than STUCK_WINDOW trailing zeros; the prefix may add more.
            suffix=st.lists(
                st.integers(min_value=0, max_value=0),
                min_size=0,
                max_size=STUCK_WINDOW - 1,
            ),
        )
        @settings(max_examples=200, deadline=None)
        def check(prefix, suffix):
            history = prefix + suffix
            if not _stuck(history, window=STUCK_WINDOW):
                return  # fine — no false positive
            # If _stuck() returned True, the last STUCK_WINDOW must all be 0
            assert len(history) >= STUCK_WINDOW
            assert all(a == 0 for a in history[-STUCK_WINDOW:])

        check()

    def test_oracle_safety_invariant(self):
        """heal_accepted can only be > 0 when oracle was ok (simulated)."""
        from hypothesis import given, settings
        from hypothesis import strategies as st

        @given(
            rounds=st.lists(
                st.tuples(
                    st.booleans(),  # oracle_ok
                    st.integers(min_value=0, max_value=5),  # n_accepted
                ),
                min_size=1,
                max_size=10,
            )
        )
        @settings(max_examples=300, deadline=None)
        def check(rounds):
            # NOTE(review): this property calls no production code; the
            # simulation below forces accepted to 0 whenever oracle_ok is
            # False, so the assertion holds by construction.
            for oracle_ok, n_accepted in rounds:
                if not oracle_ok:
                    # oracle_ok=False means no patch passed — accepted must be 0
                    # (This models OracleSafety: patches_applied ⊆ oracle_passed)
                    simulated_accepted = 0
                else:
                    simulated_accepted = n_accepted
                assert (
                    simulated_accepted == 0 or oracle_ok
                ), f"OracleSafety violated: accepted={simulated_accepted} without oracle_ok"

        check()

    def test_phase_machine_termination(self):
        """The loop must always terminate: fitness history bounded by MAX_ITERS."""
        from hypothesis import given, settings
        from hypothesis import strategies as st

        MAX_ITERS = 20
        EPSILON = 0.01
        WINDOW = 3

        @given(
            fitness_seq=st.lists(
                st.floats(min_value=0.0, max_value=1.0, allow_nan=False),
                min_size=1,
                max_size=MAX_ITERS,
            )
        )
        @settings(max_examples=200, deadline=None)
        def check(fitness_seq):
            # Simulate the loop: terminate on CONVERGED, STUCK (accept=0 twice), or TIMEOUT
            accepted = [1] * len(fitness_seq)  # assume 1 accept per iter (non-stuck)
            converged = _converged(fitness_seq, epsilon=EPSILON, window=WINDOW)
            stuck = _stuck(accepted, window=2)
            timed_out = len(fitness_seq) >= MAX_ITERS

            # At least one termination condition fires by MAX_ITERS.
            # NOTE(review): timed_out is always True inside this branch, so the
            # assertion cannot fail; the test mainly checks that the helpers
            # run without raising on generated input.
            if len(fitness_seq) == MAX_ITERS:
                assert (
                    timed_out or converged or stuck
                ), f"Loop failed to terminate after {MAX_ITERS} iterations"

        check()