-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinear_github_mapping.json
More file actions
1376 lines (1376 loc) · 167 KB
/
linear_github_mapping.json
File metadata and controls
1376 lines (1376 loc) · 167 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
{
"schema": "coordpy.linear_github_mapping.v1",
"team": "CoordPy",
"project": "CoordPy",
"linear_workspace_url": "https://linear.app/coordpy/",
"github_repo": "adotdong29/CoordPy",
"comment": "Canonical mapping of milestone → (commits, docs, Linear issues). Update this file at end-of-milestone; run scripts/sync_linear_github_v1.py to validate and emit Linear-ready summaries. Adding a milestone entry: append to milestones array, list commits + result/runbook docs + Linear issue IDs.",
"milestones": [
{
"id": "W89",
"title": "HumanEval-70B-Reflexion-K=5 same-budget multi-agent superiority retirement",
"outcome": "RETIRED — first confirmed same-budget multi-agent superiority retirement (+5.56 pp B over A1, 2-of-3 per-seed majority)",
"commits": [],
"docs": [
"docs/RESULTS_W89_HUMANEVAL_REFLEXION_V2.md"
],
"linear_issues": [],
"carry_forwards_retired": [
"W86-L-HUMANEVAL-V1-A1-SAME-BUDGET-NOT-BEATEN"
],
"carry_forwards_added": [
"W89-L-HUMANEVAL-REFLEXION-V2-HUMANEVAL-K5-SCALE-CAP"
]
},
{
"id": "W93",
"title": "Preflight-first empirical superiority wave (5-gate harness; 3 candidate kills; no expensive bench)",
"outcome": "DISCIPLINE — preflight infrastructure landed; 3 candidates killed in cheap evidence; no NIM spend",
"commits": [
"b79cd82"
],
"docs": [
"docs/RUNBOOK_W93.md",
"docs/RESULTS_W93_PREFLIGHT_DISCIPLINE_V1.md",
"docs/W93_FAILURE_DIAGNOSIS.md"
],
"linear_issues": [
"COO-13"
],
"carry_forwards_retired": [],
"carry_forwards_added": []
},
{
"id": "W94",
"title": "Preflight-earned K=10 pilot killed; cross-modal pivot to MathVista documented",
"outcome": "KILLED — K=10 pilot ceiling-saturated A1 to 100% on 15-problem slice; 3-of-6 pre-committed gates failed; HumanEval-Visual K=5 retired as cross-modal battlefield; MathVista selected as W95 line",
"commits": [
"9355b2f"
],
"docs": [
"docs/RUNBOOK_W94.md",
"docs/RESULTS_W94_K10_PILOT_V1.md",
"docs/W94_CROSS_MODAL_BATTLEFIELD_SCOUTING.md"
],
"linear_issues": [
"COO-7",
"COO-8",
"COO-10"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W94-L-K10-PILOT-CEILING-SATURATION-CAP"
]
},
{
"id": "W95",
"title": "MathVista cross-modal: preflight + Phase 2 pilot (9/9 gates PASS; +10pp) + Phase 3 retirement bench (5/6 W88 bars PASS; margin bar misses by 1.33 pp at +3.67 pp; NOT retirement)",
"outcome": "PHASE 3 NARROWLY FAILS RETIREMENT — A0=30.33% / A1=67.67% / B=71.33%; B-A1=+3.67pp (vs +5pp bar); B-A0=+41pp; per-seed B-A1=[+6,+10,-5]; B>A1 on 2/3 seeds; B≥A1 on 267/300 problems (89%); 13/14 audit re-derivations PASS offline. Single-seed +10pp pilot narrowed to +3.67pp at multi-seed scale; preflight-first discipline validated for the third time.",
"commits": [
"c14a8a1",
"ab022ce",
"1ebcdb5",
"90c7aca",
"70321d4",
"17ff0dc"
],
"docs": [
"docs/RUNBOOK_W95.md",
"docs/RESULTS_W95_MATHVISTA_PREFLIGHT_V1.md",
"docs/RESULTS_W95_MATHVISTA_PILOT_V1.md",
"docs/RESULTS_W95_MATHVISTA_PHASE3_V1.md",
"docs/LINEAR_GITHUB_SYNC.md",
"linear_github_mapping.json"
],
"linear_issues": [
"COO-11",
"COO-16"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W95-L-MATHVISTA-PILOT-SINGLE-SEED-CAP",
"W95-L-MATHVISTA-RETIREMENT-MARGIN-CAP"
]
},
{
"id": "W96-A",
"title": "MathVista at 90B-Vision: preflight + Phase 2 cheap pilot (9/9 gates PASS; +10pp, byte-equiv to 11B) + Phase 3 retirement bench (3/6 W88 bars FAIL; B-A1 = -5.00pp; scaling 11B->90B HURTS the team; NOT retirement)",
"outcome": "PHASE 3 DECISIVE NEGATIVE — A0=28.00% / A1=71.33% / B=66.33%; B-A1=-5.00pp (vs +5pp bar); B-A0=+38.33pp; per-seed B-A1=[+4,-7,-12]; B>A1 on 1/3 seeds; B>=A1 on 255/300 problems (85%); 11/14 audit re-derivations PASS offline; only 3 retirement-margin bars FAIL by sign/margin. Cross-scale shift on B-A1 = -8.67pp (11B +3.67pp -> 90B -5.00pp); H2-saturation empirically confirmed at multi-seed retirement scale; scaling the VLM weight class does NOT retire the cross-modal carry-forward.",
"commits": [
"8b5db54",
"1a2522d",
"b5778af",
"41f64e2",
"07ea9ed"
],
"docs": [
"docs/RUNBOOK_W96A.md",
"docs/RESULTS_W96A_MATHVISTA_90B_PILOT_V1.md",
"docs/RESULTS_W96A_MATHVISTA_90B_PHASE3_V1.md"
],
"linear_issues": [
"COO-17"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W96-L-MATHVISTA-90B-PILOT-SINGLE-SEED-CAP",
"W96-L-MATHVISTA-90B-RETIREMENT-MARGIN-CAP"
]
},
{
"id": "W96-C",
"title": "MathVista C1 (VLM-Verifier-Final-Turn) cross-scale Phase 2: 11B FAIL (+0pp; verifier 0/11) + 90B PASS (+13.33pp; verifier 1/7 = mechanism NOT load-bearing); NOT Phase 3 entitled",
"outcome": "CROSS-SCALE AMBIGUOUS — 11B Phase 2: A0=36.67% / A1=63.33% / B_v2=63.33%; B_v2-A1=+0.00pp; verifier rescue rate 0/11=0.0%; 2/9 gates FAIL; V2 LOST 13.34pp on B arm vs V1. 90B Phase 2: A0=36.67% / A1=66.67% / B_v2=80.00%; B_v2-A1=+13.33pp; verifier rescue rate 1/7=14.3%; 9/9 gates PASS but verifier mechanism not load-bearing (1 of 5 B-only rescues; rest from text-only chain sampling variance vs A1 K=5). Cross-scale collapse pattern (one-scale-good-one-scale-bad) triggers runbook's 'warning not green light' rule; NOT entitled to Phase 3 in any scenario. Adds two W96-L carry-forwards. Next move per Linear: COO-20 W96-D battlefield pivot.",
"commits": [
"63ccd29",
"080a716",
"0b20f19"
],
"docs": [
"docs/RUNBOOK_W96C.md",
"docs/RESULTS_W96C_ARSENAL_MINING_V1.md",
"docs/RESULTS_W96C_MATHVISTA_C1_11B_PHASE2_V1.md",
"docs/RESULTS_W96C_MATHVISTA_C1_CROSS_SCALE_PHASE2_V1.md"
],
"linear_issues": [
"COO-19"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W96-L-MATHVISTA-V2-C1-VERIFIER-FINAL-11B-PHASE2-CAP",
"W96-L-MATHVISTA-V2-C1-VERIFIER-FINAL-90B-PHASE2-SINGLE-SEED-NON-MECHANISM-DRIVEN-CAP"
]
},
{
"id": "W97",
"title": "RealWorldQA D2-B0 Phase 2 cheap pilot at 11B: FAIL (3 of 9 gates) — structurally informative; 6.67pp shortfall (B-A1=-6.67pp) driven by vision-bound yes/no rescue cluster; cross-scale 90B not entitled",
"outcome": "PHASE 2 FAIL — A0=36.67% / A1@K=5=90.00% / B=83.33%; B-A1=-6.67 pp (gate 4 FAIL); A1 saturated at 90% (gate 2 FAIL); B does not strictly beat A1 (gate 3 FAIL). Gates 1, 5-9 PASS, including B-A0=+46.67pp (image extraction is real signal) and B>=A1 on 25/30 problems (per-problem majority). Per-problem disagreement structure: 22/30 both pass; 5/30 unique-A1-rescues on vision-bound yes/no questions (existence detection, color/state, orientation, depth ordering); 3/30 unique-B-rescues on multi-choice spatial questions; 0/30 neither pass. Per the W96-C cross-scale rule, 90B Phase 2 is NOT auto-launched (estimated A1@K=5 >= 90% on this saturation-prone slice, almost certainly failing gate 2 again). Adds carry-forward W97-L-REALWORLDQA-D2-B0-PHASE2-11B-CAP. Discipline validation #7: W93/W94/W95/W96-A/W96-C/W96-D/W97.",
"commits": [
"4693467"
],
"docs": [
"docs/RUNBOOK_W97.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W97_V1.md",
"docs/RESULTS_W97_ARSENAL_MINING_V1.md",
"docs/RESULTS_W97_REALWORLDQA_D2_B0_PHASE2_V1.md"
],
"linear_issues": [
"COO-21"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W97-L-REALWORLDQA-D2-B0-PHASE2-11B-CAP"
]
},
{
"id": "W96-D",
"title": "Cross-modal battlefield pivot — ChartQA preflight FAIL at both 11B and 90B (P3 saturation); RealWorldQA preflight PASS at both 11B and 90B (all 9 composite gates); D2 preflight-earned for Phase 2; no NIM spend",
"outcome": "BATTLEFIELD PIVOT D1->D2 — ChartQA (lmms-lab/ChartQA, 2500 problems; parquet SHA 165263505f2998aba65d819b44be832edecd92d676fee2c030645f784cd55d06) FAILS P3 saturation at both Llama-3.2-11B-Vision (A1@K=5 91.69%; residual 8.31pp) and Llama-3.2-90B-Vision (A1@K=5 92.75%; residual 7.25pp) — both far below the W95 +20pp floor; D1 killed cheaply. RealWorldQA (lmms-lab/RealWorldQA, 765 problems; 2 shards SHA-anchored) PASSES all 4 composite probes + W93 G1-G5 at both scales — 11B A1@K=5 73.44% (residual 26.56pp; clean); 90B A1@K=5 79.49% (residual 20.51pp; narrow 0.51pp above floor). D2-B0 (W95-B0 scene-port) is preflight-earned for a NIM smoke test + 1-seed x 30-problem Phase 2 cheap pilot at one or both scales; Phase 2 launch out of scope for this milestone. Adds 1 carry-forward (CHARTQA-PREFLIGHT-D1-B0-P3-SATURATION-CAP); no retirements. Discipline: W93/W94/W95/W96-A/W96-C/W96-D preflight-first + cross-scale rule caught a battlefield-level saturation cap cheaply at $0 NIM spend.",
"commits": [
"d14e46d"
],
"docs": [
"docs/RUNBOOK_W96D.md",
"docs/RESULTS_W96D_ARSENAL_MINING_V1.md",
"docs/RESULTS_W96D_CHARTQA_PREFLIGHT_V1.md",
"docs/RESULTS_W96D_REALWORLDQA_PREFLIGHT_V1.md"
],
"linear_issues": [
"COO-20"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W96-L-CHARTQA-PREFLIGHT-D1-B0-P3-SATURATION-CAP"
]
},
{
"id": "W98",
"title": "RealWorldQA multi-candidate slate (B1 + B2 preflight earn BOTH at both scales; B1 cheap NIM pilot at 11B FAIL 2/9 gates) - structurally informative; B1 recovers 4/5 W97 unique-A1-rescues on yes/no perception but regresses 5 D2-B0 multi-choice wins; W95-B0 family capped at B-A1 ≈ -6.67pp through two distinct mechanisms; B2 deferred to W99",
"outcome": "PHASE 2 FAIL VIA DIFFERENT MECHANISM — A0=36.67% / A1@K=5=86.67% / B1=80.00%; B1-A1=-6.67pp (gate 4 FAIL, IDENTICAL margin to W97 D2-B0); B1 does NOT strictly beat A1 (gate 3 FAIL). Gates 1, 2, 5-9 PASS — notably gate 2 (A1<90% this run; W97's saturation was sampling noise) and gate 6 (B>=A1 on 27/30 = IMPROVED per-problem majority vs W97 D2-B0's 25/30). Per-problem diff vs W97 D2-B0 on same slice: 4 B1-new-rescues on yes/no perception (000135, 000403, 000555, 000718) - EXACTLY the W97 failure cluster the typed schema + question-typed solver was designed to attack (4/5 recovery rate matches AddrP1 + AddrP2 predictions) - BUT 5 B1-regressions on multi-choice/numeric (000013, 000155, 000204, 000225, 000713) via a NEW failure mode: typed solver becomes more confident in reader's (often wrong) direct_answer_hint and stops K=4 reflexion-cycling. Net 4-5 = -1 problem = same architectural cap. W95-B0-derived extract-then-text-reason architecture family empirically capped at B-A1 ≈ -6.67pp at 11B through TWO distinct mechanisms (D2-B0 free text + D2-B1 typed schema). Cross-scale 90B NOT entitled per W96-C rule. B2 (direct-vision final-turn answerer, structurally distinct, no reader-hint dependency) deferred to W99 per runbook. Preflight + addressability probes (AddrP1..AddrP7) earned BOTH candidates at both 11B and 90B at $0 NIM (single pilot per runbook decision logic; tie-break = lower expected NIM cost). Discipline validation #8: W93/W94/W95/W96-A/W96-C/W96-D/W97/W98.",
"commits": [
"4e77107"
],
"docs": [
"docs/RUNBOOK_W98.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W98_V1.md",
"docs/RESULTS_W98_ARSENAL_MINING_V1.md",
"docs/RESULTS_W98_PREFLIGHT_V1.md",
"docs/RESULTS_W98_REALWORLDQA_B1_PHASE2_V1.md"
],
"linear_issues": [
"COO-22"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W98-L-REALWORLDQA-B1-TYPED-SCHEMA-PHASE2-11B-CAP"
]
},
{
"id": "W99",
"title": "RealWorldQA candidate tournament (B2 + B4 + B5) at 11B: B2 + B5 PASS structural Phase 2 (100%/+6.67pp; Option A slice-saturation); B4 (typed schema sans hint) FAILS (76.67%/-16.67pp); typed-extract-then-text-reason sub-family of W95-B0 capped through THREE mechanisms; only B2 (image at decision boundary) clears Phase 2",
"outcome": "2 of 3 PASS STRUCTURAL Phase 2 — B2 (frontier) + B5 (switch baseline ceiling); B4 (close-cousin repair) FAILS empirically refuting hint-removal-only hypothesis. NIM-free preflight + addressability probes earned all three at both 11B+90B; NIM-free B5 oracle prediction (+10pp) and B2 realistic prediction (+6.67pp) matched empirical to within sampling variance. B2 final-VLM rescued 3/3 invocations including the residual viewer-pov problem (000615) NEITHER W97 D2-B0 NOR W98 B1 could solve. Discipline validation #9 (W93-W99 preflight-first + cross-scale + multi-candidate tournament). Stable boundary preserved.",
"commits": [
"5526b4e"
],
"docs": [
"docs/RUNBOOK_W99.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W99_V1.md",
"docs/RESULTS_W99_ARSENAL_MINING_V1.md",
"docs/RESULTS_W99_PREFLIGHT_V1.md",
"docs/RESULTS_W99_REALWORLDQA_B5_PHASE2_V1.md",
"docs/RESULTS_W99_REALWORLDQA_B2_PHASE2_V1.md",
"docs/RESULTS_W99_REALWORLDQA_B4_PHASE2_V1.md",
"docs/RESULTS_W99_MILESTONE_SUMMARY_V1.md"
],
"linear_issues": [
"COO-23"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W99-L-REALWORLDQA-B5-SWITCH-BASELINE-PASS-11B-SLICE-SATURATION-CAP",
"W99-L-REALWORLDQA-B5-SWITCH-IS-BASELINE-NOT-FRONTIER-CAP",
"W99-L-REALWORLDQA-B2-DIRECT-VISION-FINAL-TURN-PHASE2-11B-STRUCTURAL-PASS-SLICE-SATURATION-CAP",
"W99-L-REALWORLDQA-W95-B0-FAMILY-NOT-STRUCTURALLY-CAPPED-AT-IMAGE-AT-DECISION-BOUNDARY-CAP",
"W99-L-REALWORLDQA-B4-TYPED-SCHEMA-WITHOUT-HINT-PHASE2-11B-FAIL-CAP",
"W99-L-REALWORLDQA-TYPED-EXTRACT-THEN-REASON-SUBFAMILY-EMPIRICALLY-CAPPED-AT-11B-CAP"
]
},
{
"id": "W100",
"title": "RealWorldQA cross-scale 90B Phase 2 confirmation: B2 (frontier lead) FAILS at B-A1=-3.33pp with mechanism-load-bearingness sub-gate MLB-2 also FAILING (clean cross-scale collapse matching W96-C C1); B5 (baseline-only ceiling reference) narrowly MISSES gate 4 (B-A1=+3.33pp, 1.67pp short of bar); pre-committed Part H code-pivot contingency triggers COO-9 (second code benchmark) promotion to lead path; cross-modal RealWorldQA arc FROZEN AT 11B; Phase 3 NOT launched",
"outcome": "BOTH 90B Phase 2 pilots FAIL. B2 90B Phase 2: A0=46.67/A1@K=5=76.67/B2=73.33 / B-A1=-3.33pp / gates 3+4 FAIL / MLB-2 FAILS (final-VLM rescue rate 1/9=11.11% vs 11B 3/3=100%); MLB-1 PASS (invocation 30%); cross-scale shift -10pp; 22/30 both-pass with W99 11B, 0 new wins, 8 new losses; clean cross-scale collapse pattern matching W96-C C1 verifier. B5 90B Phase 2: A0=46.67/A1@K=5=80/B5=83.33 / B-A1=+3.33pp / 8/9 gates PASS (gate 4 alone fails); per-route vlm_team_b0(D2-B0) 14/18=77.8% (down from 18/18 at 11B; W95-B0 cross-scale fragility on multi-choice), a1_vlm_k5 11/12=91.7%; cross-scale shift -3.33pp; B5 stays classified baseline-only. NIM-free preflight reused W99 90B verdict cid 0bacd989... at $0 NIM; AddrW100-B2-P5 + AddrW100-B5-P4 cross-scale-stability probes both PASSed before any 90B NIM call. COO-9 PROMOTED to lead path; W101 charter = COO-9 runbook + corpus selection from {MBPP+, HumanEval+, APPS, LiveCodeBench, SWE-bench-lite}. Discipline validation #10 (W93-W100 preflight-first + cross-scale + new MLB sub-gates + multi-candidate-tournament-then-confirm). Stable boundary preserved. ZERO new coordpy.* modules introduced.",
"commits": [
"2383e6e"
],
"docs": [
"docs/RUNBOOK_W100.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W100_V1.md",
"docs/RESULTS_W100_REALWORLDQA_B2_PHASE2_90B_V1.md",
"docs/RESULTS_W100_REALWORLDQA_B5_PHASE2_90B_V1.md",
"docs/RESULTS_W100_MILESTONE_SUMMARY_V1.md"
],
"linear_issues": [
"COO-24"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W100-L-REALWORLDQA-B2-DIRECT-VISION-FINAL-TURN-PHASE2-90B-CAP",
"W100-L-REALWORLDQA-B2-CROSS-SCALE-COLLAPSE-MECHANISM-NON-LOAD-BEARING-AT-90B-CAP",
"W100-L-REALWORLDQA-B5-SWITCH-BASELINE-90B-NARROW-MISS-CAP",
"W100-L-REALWORLDQA-W95-B0-D2-B0-MULTI-CHOICE-EXTRACTION-DEGRADES-CROSS-SCALE-CAP"
]
},
{
"id": "W101",
"title": "Second-code-benchmark battlefield tournament + MBPP+ lead selection + cheap NIM-free preflight: 5-candidate x 8-criterion battlefield ranking (MBPP+ LEAD + HumanEval+ BACKUP); arsenal mining of W88+W91 sidecars via 2640 offline subprocess re-executions; 4 new explicit-import-only coordpy.* modules (loader+executor+reflexion-bench+preflight) + 3 driver scripts + 25 unit tests (all PASS); NIM-free preflight verdict 6/8 PASS with 2 DEFERRED on operator MBPP+ fetch; cheap NIM pilot NOT YET earned (conditional on operator MBPP+ download + SHA pin + preflight re-run clean)",
"outcome": "INFRASTRUCTURE + PREFLIGHT — COO-9 charter executed end-to-end with NO NIM call. Battlefield selection (locked BEFORE any code built): MBPP+ chosen as LEAD (EvalPlus's hardened MBPP; ~35x more hidden tests per problem; surgical attack on W91-L-MBPP-REFLEXION-V2-5SEED-PARTIAL-CAP); HumanEval+ BACKUP (built only if MBPP+ preflight FAILs); APPS/LiveCodeBench/SWE-bench-lite explicitly out of scope. Arsenal mining: 2640 W88+W91 candidate responses re-executed offline; per-seed numbers match published W89+W91 byte-for-byte; reflexion-rescue surface 2.5x richer on HumanEval (9.76%) than on base MBPP (3.97%) = empirical symptom of ceiling-saturation cap MBPP+ relieves; MBPP shared-fails cluster (14.0%) 2.5x larger than HumanEval (5.6%). Preflight 6/8 PASS: P3 predicts MBPP+ A1@K=5=69.97% (saturation margin 20.03pp); P4 decomposition argument 1727 chars; AddrW101-P1 W89 rescue 9.76% >= 5% threshold; AddrW101-P2 both partitions well-formed; AddrW101-P3 cross-bench margin 20.03pp >= 10pp floor; AddrW101-P4 no anti-pattern tokens. 2 DEFERRED: P1 corpus integrity + P2 executor self-test (both gated on operator MBPP+ fetch + SHA pin). Cheap pilot NOT YET earned. Pre-committed Phase 2 gates: W95 9-gate + MLB-1 (reflexion-cycle invocation >=33%) + MLB-2 (reflexion rescue rate >=33%) per W100 lesson. NO carry-forwards retired (W89 70B HumanEval K=5 remains the only confirmed multi-seed same-budget multi-agent superiority retirement); NO carry-forwards added (infrastructure + preflight only; no empirical NIM result yet). Discipline validation #11 (W93-W101). Stable boundary preserved. 4 new modules explicit-import only; 25 unit tests all PASS.",
"commits": [
"e811893"
],
"docs": [
"docs/RUNBOOK_W101.md",
"docs/RESULTS_W101_BATTLEFIELD_SELECTION_V1.md",
"docs/RESULTS_W101_ARSENAL_MINING_V1.md",
"docs/RESULTS_W101_PREFLIGHT_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W101_V1.md",
"docs/RESULTS_W101_MILESTONE_SUMMARY_V1.md"
],
"linear_issues": [
"COO-25"
],
"carry_forwards_retired": [],
"carry_forwards_added": []
},
{
"id": "W102",
"title": "MBPP+ V2 schema-fix lead lane + HumanEval+ backup lane + COO-14 helper lane + MBPP+ V2 cheap pilot at 70B FAIL (B-A1 = -6.67pp; MLB-1 + MLB-2 both FAIL; carry-forward W102-L-MBPP-PLUS-V2-REFLEXION-PHASE2-70B-CAP added; W103 pivots to HumanEval+ via the W102-built backup-lane infrastructure per pre-committed runbook branch-3 decision logic)",
"outcome": "PHASE 2 FAIL — A0=73.33% / A1@K=5=83.33% / B=76.67% / B-A1=-6.67pp / 6 of 9 Phase 2 gates PASS / MLB-1 30% FAIL / MLB-2 22.22% FAIL on the W102-locked seed-101_001 / 30-problem slice at meta/llama-3.3-70b-instruct. Critical W102 finding (lead lane): during the W101 deferred operator step, W102 discovered the W101 V1 loader's schema assumption is wrong (parallel plus_input/plus_output arrays do NOT exist in the actual EvalPlus release; the extra-test surface lives in a single `test` Python program). V1 would have silently degenerated the cheap pilot to a base-MBPP run. W102 fixes this BEFORE any NIM spend by building V2 infrastructure (4 new explicit-import-only coordpy.* modules: mbpp_plus_loader_v2 + executor_v2 + reflexion_bench_v2 + preflight_v2 with NEW P5 silent-degeneration guard + P6 V1-vs-V2 sanity); V2 preflight 10 of 10 PASS (verdict cid aab6d9ab...). HumanEval+ backup lane (4 new explicit-import-only modules): humaneval_plus_loader_v1 + executor_v1 + reflexion_bench_v1 + preflight_v1; HumanEval+ preflight 7 of 7 PASS (verdict cid 4f57a2cf...); cheap pilot NOT launched in W102 — reserved as the W103 immediate pivot. COO-14 helper lane (1 new explicit-import-only module): code_slice_selector_v1 implements the 4-item COO-14 DoD verbatim (rank_candidate_benches + propose_cheap_pilot_slice + format_*_markdown + anti-pattern guard); 15 unit tests PASS. W102 arsenal-mining extension (scripts/run_w102_arsenal_mining.py) re-executes 2640 W88+W91 candidate responses offline against HumanEval+ + MBPP+ V2 surfaces; key empirical finding: MBPP+ V2 cluster surface on the SAME W91 70B responses lifts B-A1 from +1.33pp (base-MBPP cap) to +5.28pp — BUT the empirical W102 cheap pilot at seed 101_001 with FRESH K=5 sampling landed at -6.67pp (11.95pp swing below the arsenal-mining prior). This is added as a NEW W102 anti-pattern: cross-bench arsenal-mining priors are an UPPER BOUND, not cheap-pilot earning evidence. Per-problem cluster on the cheap-pilot slice: 3 a1_only_wins (B regression on problems where i.i.d. K=5 found a passing candidate but reflexion chain didn't — same W97/W98/W99 RealWorldQA-B0 structural pattern), 1 b_only_win (B rescue at attempt index 1), 22 shared_wins, 4 shared_fails. B's first-pass-attempt distribution: 21/23 PASSes came at attempt 0 (reflexion never invoked); 1/23 at attempt 1; 1/23 at attempt 3. MLB-2 rescue rate 22.22% on MBPP+ V2 is well below W89 HumanEval rescue rate of 47% — reflexion-mechanism load-bearingness on MBPP-family at 70B is empirically weaker than on HumanEval-family. Decision applied per pre-committed runbook branch 3: 2 carry-forwards added (W102-L-MBPP-PLUS-V2-REFLEXION-PHASE2-70B-CAP + W102-L-MBPP-PLUS-V2-MECHANISM-LOAD-BEARINGNESS-WEAK-AT-70B-CAP); W103 = HumanEval+ cheap pilot via W102-built backup; COO-9 REMAINS lead path; NO cross-scale MBPP+ V2; NO Phase 3. W101 V1 loader/executor/reflexion-bench DEMOTED to historical artifact + anti-pattern (silent-degeneration failure mode); V1 stays in-repo for the W101 audit trail. Discipline validation #12: W93/W94/W95/W96-A/W96-C/W96-D/W97/W98/W99/W100/W101/W102 with W102 extending the discipline with (a) silent-degeneration-via-schema-assumption anti-pattern guard (P5+P6 probes), (b) cross-bench-arsenal-mining-priors-as-cheap-pilot-earning-evidence anti-pattern guard, (c) mechanism-load-bearingness-varies-by-benchmark-family-at-70B structural lesson. Stable boundary preserved: coordpy.__version__=0.5.20; SDK_VERSION=coordpy.sdk.v3.43; no PyPI publish; coordpy/__init__.py untouched. 9 new explicit-import-only coordpy.* modules + 5 new driver scripts + 3 new unit-test files (67 tests total; all PASS).",
"commits": [
"55f40cf",
"7419a6b"
],
"docs": [
"docs/RUNBOOK_W102.md",
"docs/RESULTS_W102_MBPP_PLUS_LOADER_V2_FIX_V1.md",
"docs/RESULTS_W102_HUMANEVAL_PLUS_PREFLIGHT_V1.md",
"docs/RESULTS_W102_CODE_SLICE_SELECTOR_V1.md",
"docs/RESULTS_W102_ARSENAL_MINING_V1.md",
"docs/RESULTS_W102_MBPP_PLUS_V2_PHASE2_70B_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W102_V1.md",
"docs/RESULTS_W102_MILESTONE_SUMMARY_V1.md"
],
"linear_issues": [
"COO-26"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W102-L-MBPP-PLUS-V2-REFLEXION-PHASE2-70B-CAP",
"W102-L-MBPP-PLUS-V2-MECHANISM-LOAD-BEARINGNESS-WEAK-AT-70B-CAP"
]
},
{
"id": "W103",
"title": "HumanEval+ lead pilot at 70B PASS_MECHANISM_DRIVEN (B-A1 = +20.00pp; 9/9 Phase 2 gates + MLB-1 56.67% + MLB-2 47.06% all PASS on the helper-anchored slice; first NIM-spending pilot to consume coordpy.code_slice_selector_v1 as a load-bearing input; W104 cross-scale confirmation pre-committed; COO-9 remains lead path; W89 sequential-reflexion mechanism extends to a SECOND published code benchmark family (HumanEval+ EvalPlus-hardened) at the cheap-pilot scale; carry-forward W103-L-HUMANEVAL-PLUS-REFLEXION-PHASE2-70B-PASS registered; ZERO new coordpy.* modules)",
"outcome": "PHASE 2 PASS_MECHANISM_DRIVEN — A0=50.00% / A1@K=5=50.00% / B (sequential reflexion K=5)=70.00% / B-A1=+20.00pp (4x larger than the W89 retirement's +5.56pp) / B-A0=+20.00pp / per-problem majority B ≥ A1 on 29 of 30 (only 1 a1_only regression) / 7 b_only_wins (mechanism rescues) / 14 shared_wins / 8 shared_fails / MLB-1=56.67% (17/30) PASS / MLB-2=47.06% (8/17) PASS (matches W89 base-HumanEval retirement template's 47% rescue rate byte-for-byte) / 9 of 9 Phase 2 gates PASS. Pilot 124 min wall (7424.3 s; heavy 429 throttling + early-launch socket hang; killed + relaunched at 12 min; hardened retry budget kept the run grinding without exhausting). Bench Merkle root 68f4a9669f1bd03e6b3cb393a436e4f04aca034a0bad9c4b5ea8a002faabfd6d. Slice CID c35155956ece605c0169b0cf35a6b69267bee04f5f68cf5a5de466dcc01dd8d2 (helper-priority order; 28 unique HumanEval+ task_ids + 2 base-HumanEval top-ups; cluster mix 7 humaneval_plus:b_only_wins + 10 humaneval_plus:shared_fails + 2 humaneval_plus:a1_only_wins + 9 humaneval_plus:shared_wins + 2 humaneval(top-up):shared_wins = 63.3% historically-hard). Notable empirical finding: A1@K=5=50.00% is EXACTLY EQUAL to A0=50.00% on this helper-anchored slice (K=5 i.i.d. sampling produced zero improvement over single-shot); sequential reflexion (B) recovered 8 of the 17 problems where A0 failed via cumulative-stderr conditioning. This is a stronger mechanism-load-bearing signal than the W89 retirement showed (where A1 ≈ 85.56% so the i.i.d. surface was structurally smaller); reflexion is doing real work that i.i.d. sampling cannot substitute for on this slice. Decision applied per pre-committed Branch A: W103-L-HUMANEVAL-PLUS-REFLEXION-PHASE2-70B-PASS registered (single-seed cheap-pilot PASS; NOT retirement); COO-9 REMAINS lead path; W104 = HumanEval+ cross-scale confirmation at a SECOND model class (W104 RUNBOOK locks the exact target); W105+ = Phase 3 retirement bench (3 seeds × 100 problems × K=5) IF W104 cross-scale PASSes. W102 / W103 cross-bench contrast: reflexion-mechanism load-bearingness is benchmark-family-dependent at 70B — MBPP-family rescue rate 22.22% (W102 FAIL) vs HumanEval-family rescue rate 47.06% (W103 PASS, identical to W89 47%). This is now an empirical fact, not a conjecture. Hardening lane: 9 unit tests codifying W102 + W103 lessons (silent-degeneration anti-pattern + arsenal-mining-prior anti-pattern + sidecar-flush hardening + helper-anchored-slice integrity + anti-pattern token absence + production slice CID determinism pin); driver now flushes the sidecar after every write (W102 buffered all 330 entries until pilot exit, blinding mid-run operator audits; W103 codifies the fix via test_w103_pilot_driver_flushes_sidecar_after_each_write). Planning lane: W104 pre-committed by outcome BEFORE pilot launched (Branch A applied; Branch B `PASS_NON_MECHANISM_DRIVEN` and Branch C FAIL decision logic remain in docs/RUNBOOK_W103.md as durable artifacts of pre-commit-then-execute discipline). Helper-consumption attestation (W102 COO-14 downstream-consumption deliverable): coordpy.code_slice_selector_v1 transitions from \"shipped helper with worked example\" to \"shipped helper as load-bearing pilot input\". Stable boundary preserved: coordpy.__version__=0.5.20; SDK_VERSION=coordpy.sdk.v3.43; no PyPI publish; coordpy/__init__.py untouched. ZERO new coordpy.* modules in W103. 2 new driver scripts (run_w103_humaneval_plus_pilot.py + _w103_emit_pilot_result_doc.py); 1 modified driver (+5 lines for sidecar flush); 1 modified test file (+21 lines for sidecar-flush regression); 6 new docs (RUNBOOK + 4 RESULTS + FRONTIER_AUDIT). 75 tests across W101+W102+W103 code line; all PASS. Discipline validation #13: W93/W94/W95/W96-A/W96-C/W96-D/W97/W98/W99/W100/W101/W102/W103.",
"commits": [
"b7a4c3d",
"1702e28"
],
"docs": [
"docs/RUNBOOK_W103.md",
"docs/RESULTS_W103_HELPER_CONSUMPTION_V1.md",
"docs/RESULTS_W103_HUMANEVAL_PLUS_PREFLIGHT_RECONFIRM_V1.md",
"docs/RESULTS_W103_HUMANEVAL_PLUS_PHASE2_70B_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W103_V1.md",
"docs/RESULTS_W103_MILESTONE_SUMMARY_V1.md"
],
"linear_issues": [
"COO-27"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W103-L-HUMANEVAL-PLUS-REFLEXION-PHASE2-70B-PASS"
]
},
{
"id": "W104",
"title": "HumanEval+ cross-scale Phase 2 cheap pilot at the pre-locked-backup target (Llama-3.1-70B-Instruct, cross-generation) PASS_MECHANISM_DRIVEN + cross-scale-discipline hardening + W105 entitlement pre-built",
"outcome": "PHASE 2 PASS_MECHANISM_DRIVEN AT CROSS-GENERATION FORM — A0=46.67% / A1@K=5=53.33% / B=63.33% / B-A1=+10.00pp / B-A0=+16.66pp / 9 of 9 Phase 2 gates PASS / MLB-1=56.67% (17/30) PASS / MLB-2=35.29% (6/17) PASS. Pre-locked primary `meta/llama-3.1-405b-instruct` returned HTTP 404 on NIM (not hosted); pre-locked backup `meta/llama-3.1-70b-instruct` was applied deterministically per the W104 RUNBOOK § Target-model selection rule criterion 2(b). Cross-scale form actually achieved: cross-GENERATION (Llama 3.1 vs Llama 3.3 at 70B), NOT cross-scale-UP (70B → 405B). Pilot ran 1 seed × 30 problems × K=5 = 330 NIM calls on the BYTE-EQUAL W103 helper-anchored slice (slice CID c35155956ece605c0169b0cf35a6b69267bee04f5f68cf5a5de466dcc01dd8d2 verified at run start; corpus SHA 908377f1daf28dcb36846db73a5662b2e05a9907407c2696c89ad9d3b0b04492 verified at run start; pilot refuses to run on either mismatch). 7506.9 s wall (~125 min). Cross-scale comparator emitted (after a brief permission-outage retry): 8 stayed + 11 improved + 11 regressed + 0 flipped on the byte-equal slice; cross-generation shift on B-A1 = -10.00 pp (W103 +20 → W104 +10; mechanism keeps half its W103 margin); cross-generation shift on MLB-2 = -11.77 pp (47.06 → 35.29, still load-bearing above the 33% floor). Hardening lane: 4 durable guardrails landed (cross-scale comparator V1 schema/provenance refuse-to-run; byte-equal W103 slice reuse with run-start CID verification; reachability smoke probe BEFORE NIM spend with pre-locked backup; resume-from-sidecar capability for socket hangs / 429 storms). 14 PASSing unit tests codify all four guardrails. Planning lane: W105 Phase 3 retirement bench slice pack pre-built (pack CID 8be55f3bf1650df397cb875543c69a48473483de8089dc3c40be45cc635a1314; 100 problems with W103 30-problem inner kernel preserved + 45 mid-shell helper extension + 25 corpus-fill; 3 seeds 105_001 / 105_002 / 105_003; cross-scale axis Llama-3.3-70B + Llama-3.1-70B; 6 600 NIM calls total). Branch C fallback dispatch table (machine-readable JSON) pre-committed: LiveCodeBench preflight on mechanism-distribution shift; APPS preflight on G2 saturation; HumanEval+ multi-seed at cross-scale target on per-seed-sampling variance; cross-scale-collapse audit + 70B Phase 3 on W96-A regression pattern; SWE-bench-lite stays unconditionally out of scope. Helper consumption: coordpy.code_slice_selector_v1 consumed as a real downstream input for the second time (first was W103; COO-14 deliverable extended from `load-bearing pilot input` to `load-bearing Phase 3 slice-pack input`). Decision applied: Branch A. Carry-forwards added: W104-L-HUMANEVAL-PLUS-REFLEXION-PHASE2-CROSS-GENERATION-70B-LLAMA31-PASS + W104-L-HUMANEVAL-PLUS-CROSS-SCALE-UP-PRIMARY-TARGET-405B-UNREACHABLE-ON-NIM-CAP. Carry-forwards retired: NONE; W89 70B-HumanEval K=5 remains the only confirmed multi-seed same-budget multi-agent superiority retirement. `COO-9` REMAINS the lead path. Discipline validation #14: W93/W94/W95/W96-A/W96-C/W96-D/W97/W98/W99/W100/W101/W102/W103/W104. Stable boundary preserved: coordpy.__version__=0.5.20; SDK_VERSION=coordpy.sdk.v3.43; no PyPI publish; coordpy/__init__.py untouched. Exactly ONE new coordpy.* module added explicit-import only: coordpy.cross_scale_comparator_v1.",
"commits": [
"35e7c3d"
],
"docs": [
"docs/RUNBOOK_W104.md",
"docs/RESULTS_W104_HUMANEVAL_PLUS_PHASE2_405B_V1.md",
"docs/RESULTS_W104_CROSS_SCALE_COMPARATOR_V1.md",
"docs/RESULTS_W104_HELPER_W105_PLANNING_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W104_V1.md",
"docs/RESULTS_W104_MILESTONE_SUMMARY_V1.md"
],
"linear_issues": [
"COO-28"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W104-L-HUMANEVAL-PLUS-REFLEXION-PHASE2-CROSS-GENERATION-70B-LLAMA31-PASS",
"W104-L-HUMANEVAL-PLUS-CROSS-SCALE-UP-PRIMARY-TARGET-405B-UNREACHABLE-ON-NIM-CAP"
]
},
{
"id": "W105",
"title": "HumanEval+ Phase 3 retirement bench (the earned 6,600-call run) + run-hardening + W106 contingency — SPLIT outcome: meta/llama-3.3-70b-instruct RETIRED (second confirmed multi-seed same-budget multi-agent superiority retirement after W89); meta/llama-3.1-70b-instruct FAIL_MARGIN; cross-class retirement NOT entitled",
"outcome": "SPLIT — meta/llama-3.3-70b-instruct RETIRED (6/6 W88/W89/W95 retirement bars; 3 seeds x 100 problems x K=5; mean B-A1=+7.00pp; per-cell +5.00/+9.00/+7.00; per-seed majority 3/3; per-problem majority 295/300; A1 84/82/82% all <90%; audit chain 3/3; executor clean 100%; MLB-2=55.62% load-bearing) = the SECOND confirmed multi-seed same-budget multi-agent superiority retirement after W89, on a different benchmark family (EvalPlus-hardened HumanEval+) at +7.00pp vs W89's base-HumanEval +5.56pp. meta/llama-3.1-70b-instruct FAIL_MARGIN (5/6 bars; mean B-A1=+2.33pp; per-cell +5.00/+1.00/+1.00; only the margin bar fails; per-seed majority 3/3; MLB-2=50.54% still load-bearing). The W104 cross-generation cheap-pilot +10.00pp on the 30-problem rescue-concentrated slice did NOT survive scale-up to the broad 100-problem Phase 3 slice for Llama-3.1 (A1 rose to 86.33%, compressing reflexion headroom) — confirms the W102 cheap-pilot-margin-is-an-upper-bound anti-pattern + the W96-A/W96-C/W100 cross-scale-collapse pattern. CROSS-CLASS RETIREMENT NOT ENTITLED: the W105 RUNBOOK rule requires BOTH classes to clear all 6 bars; only Llama-3.3-70B did (cross-class B-A1 diff 4.67pp is within the +-5pp envelope but condition (i) both-RETIRED fails); bounded claim is single-class on Llama-3.3-70B. Executed the pre-built slice pack (pack CID 8be55f3bf1650df397cb875543c69a48473483de8089dc3c40be45cc635a1314) BYTE-FOR-BYTE unchanged; 6600 NIM calls; canary smoke (66 calls) PASSed both classes before launch; 405B re-probed HTTP 404 (still unreachable; W104 cap stands; core two-class matrix unaffected); per-seed-aligned cross-class comparator clean (242 stayed / 27 improved / 28 regressed / 3 flipped across 300 cells). 
Three lanes: lead (Phase 3 execution) + hardening (phase3_retirement_evaluator_v1 + cross_class_comparator_v1, both explicit-import-only; 18 unit tests; 79 PASS across W101-W105) + planning (W106 pre-committed under all 5 verdict shapes; empirical SPLIT maps to Verdict C sub-case C1). COO-9 REMAINS lead path. 15th consecutive preflight-discipline validation (W93-W105). Stable boundary preserved: coordpy.__version__=0.5.20; SDK_VERSION=coordpy.sdk.v3.43; no PyPI publish; coordpy/__init__.py untouched; 2 new explicit-import-only coordpy.* modules.",
"commits": [
"d5f9133"
],
"docs": [
"docs/RUNBOOK_W105.md",
"docs/RESULTS_W105_HUMANEVAL_PLUS_PHASE3_LLAMA33_V1.md",
"docs/RESULTS_W105_HUMANEVAL_PLUS_PHASE3_LLAMA31_V1.md",
"docs/RESULTS_W105_CROSS_CLASS_COMPARATOR_V1.md",
"docs/RESULTS_W105_W106_PLANNING_V1.md",
"docs/RESULTS_W105_MILESTONE_SUMMARY_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W105_V1.md"
],
"linear_issues": [
"COO-29"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W105-T-HUMANEVAL-PLUS-RETIREMENT-LLAMA33-70B",
"W105-L-HUMANEVAL-PLUS-RETIREMENT-LLAMA31-70B-MARGIN-CAP",
"W105-L-HUMANEVAL-PLUS-CROSS-CLASS-RETIREMENT-NOT-ENTITLED-CAP"
]
},
{
"id": "W106",
"title": "Bounded second-retirement registration + Llama-3.1 margin-cap dispatch NO-GO + graphify truth-sync",
"outcome": "REGISTRATION + DISPATCH milestone (NOT a new benchmark tournament; NO expensive run; $0 NIM on the Llama-3.1 branch). Executes the pre-committed RESULTS_W105_W106_PLANNING_V1 Verdict C / sub-case C1 (retired class=Llama-3.3-70B, failed class=Llama-3.1-70B), per docs/RUNBOOK_W106.md locked BEFORE any NIM call. Lane 1 (claim): REGISTERS the W105 meta/llama-3.3-70b-instruct HumanEval+ Phase 3 retirement (+7.00pp; 6/6 bars; MLB-2=55.62%) as the SECOND confirmed multi-seed same-budget multi-agent superiority retirement after W89, on a different benchmark family; BOUNDED to ONE model class / ONE benchmark family (HumanEval+) / ONE parameter scale (70B). Lane 2 (margin-cap dispatch): a pre-committed TWO-GATE rule (coordpy.margin_cap_dispatch_v1; 20 unit tests) returned NO-GO on the Llama-3.1-70B FAIL_MARGIN branch -- GATE 1 (entitlement) ENTITLED (margin +2.33 in [0,+5); MLB-2 50.54%>=33%; A1 86.33%<90%; Branch-C cheap-confirmation row ~990 NIM calls) but GATE 2 (verdict-changing power) FAIL on all three sub-conditions (2a rescue-concentrated slice is an UPPER BOUND = the W102 anti-pattern; 2b the authoritative fair broad-slice multi-seed Phase 3 verdict already ran at W105 at +2.33pp; 2c the miss is a CLEAN true magnitude miss -- executor clean 100%, byte-exact budget, per-seed majority 3/3, MLB-2 50.54% healthy -- no confound). Decision CID de3dfb02...; the Llama-3.1 branch is CLOSED, not deferred. Lane 3 (graphify): graph refreshed from HEAD at start AND close (73,026 nodes / 237,304 edges / 2,336 communities; dated backup graphify-out/2026-05-28/); query/affected/explain used to confirm the new module is a sibling of the W104/W105 evaluators/comparators. COO-9 REMAINS lead path. W107 left obvious: 405B reachability gate -> alpha cross-scale-UP cheap pilot (if reachable) / beta next-code-benchmark NIM-free preflight LiveCodeBench/APPS (if 405B HTTP 404) / gamma W89->W106 consolidation in parallel. 
No empirical retirement added or retired; W89 + W105 remain the two confirmed retirements (both Llama-3.3-70B). 16th consecutive preflight-discipline validation (W93-W106). Stable boundary preserved: coordpy.__version__=0.5.20; SDK_VERSION=coordpy.sdk.v3.43; no PyPI publish; coordpy/__init__.py untouched; ONE new explicit-import-only coordpy.* module.",
"commits": [
"9ba018e"
],
"docs": [
"docs/RUNBOOK_W106.md",
"docs/RESULTS_W106_BOUNDED_RETIREMENT_REGISTRATION_V1.md",
"docs/RESULTS_W106_MARGIN_CAP_DISPATCH_V1.md",
"docs/RESULTS_W106_MILESTONE_SUMMARY_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W106_V1.md"
],
"linear_issues": [
"COO-30"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W106-T-BOUNDED-SECOND-RETIREMENT-REGISTERED",
"W106-T-MARGIN-CAP-DISPATCH-V1-SHIPS",
"W106-L-HUMANEVAL-PLUS-LLAMA31-70B-MARGIN-CAP-CHEAP-CONFIRMATION-NOT-EARNED-CAP"
]
},
{
"id": "W107",
"title": "405B reachability gate (4th HTTP 404 -> CLOSED) + next-code-battlefield β preflight (LiveCodeBench PRIMARY / APPS pivot backup) + publication-grade W89→W106 consolidation",
"outcome": "Gated branch milestone with THREE lanes, NOT a new broad benchmark tournament; /bin/zsh expensive NIM (only the sub-second free 405B side-probe). docs/RUNBOOK_W107.md locked BEFORE any NIM call. Lane α (405B gate): re-probed meta/llama-3.1-405b-instruct on NIM -> HTTP 404 (183ms), the FOURTH consecutive 404 (W104/W105/W106/W107); GATE=CLOSED (decision CID 332d4ef983313f7faf724c7d8b2ad96e8e5964a125dfd30d177bc49ba6b2111e); no 405B cheap pilot earned or launched; § 3 cheap-pilot rule NOT exercised; W104-L-...-405B-UNREACHABLE-ON-NIM-CAP refreshed. Lane β (main empirical lane): applied W101 C1-C8 rubric + W107 S1∧S2∧S3 structural-soundness pivot test -> LiveCodeBench is the structurally-sound PRIMARY (time-anchored contamination resistance is the decisive publication-grade property; functional starter_code subset has a clean deterministic subprocess executor proven offline: gold PASS / wrong FAIL / infinite-loop TIMEOUT; W89 decomposition fits), APPS the structural-pivot backup (no pivot triggered); shipped coordpy.livecodebench_loader_v1 (SHA-pinnable functional-subset loader; refuses unpinned/mismatched corpus = W102 silent-degeneration guard) + coordpy.livecodebench_executor_v1 (clean functional-form subprocess executor) + scripts/run_w107_livecodebench_preflight.py (verdict CID 55910d11e210c323fb1a393bbf8be1c3ffa2d19dd22f9e9f40e52ffc9746c6b6; 16 PASSing tests); A1 residual honestly recorded as published-baseline-grade pending operator corpus-fetch. Lane γ (consolidation): docs/CONSOLIDATED_CODE_RETIREMENT_NARRATIVE_V1.md (publication-grade W89→W106 arc: TWO confirmed retirements, both Llama-3.3-70B @ 70B, bounded on three axes) + harmonized claim surface across THEOREM_REGISTRY / RESEARCH_STATUS / HOW_NOT_TO_OVERSTATE. W107 adds NO empirical retirement and retires NO cap; W89 + W105 remain the two confirmed retirements. 
COO-9 stays lead path; W108 = LiveCodeBench functional-subset Phase 2 cheap pilot after operator corpus-fetch (or APPS pivot, or honest no-go). 17th consecutive preflight-discipline validation (W93-W107). Stable boundary preserved (no version bump; no PyPI; coordpy/__init__.py untouched; 2 new explicit-import-only modules).",
"commits": [
"9caa0d2"
],
"docs": [
"docs/RUNBOOK_W107.md",
"docs/RESULTS_W107_405B_GATE_V1.md",
"docs/RESULTS_W107_NEXT_BATTLEFIELD_PREFLIGHT_V1.md",
"docs/RESULTS_W107_MILESTONE_SUMMARY_V1.md",
"docs/CONSOLIDATED_CODE_RETIREMENT_NARRATIVE_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W107_V1.md"
],
"linear_issues": [
"COO-31"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W107-T-405B-GATE-FOURTH-404-CLOSED",
"W107-T-LIVECODEBENCH-PREFLIGHT-V1-SHIPS",
"W107-T-CODE-RETIREMENT-NARRATIVE-CONSOLIDATED",
"W107-L-LIVECODEBENCH-LOADER-V1-SCHEMA-CONFIRM-AT-FETCH-CAP",
"W107-L-LIVECODEBENCH-RESIDUAL-PUBLISHED-BASELINE-GRADE-CAP",
"W107-L-LIVECODEBENCH-FUNCTIONAL-SUBSET-ONLY-CAP"
]
},
{
"id": "W108",
"title": "LiveCodeBench real-data bug-fix + cheap-pilot FAIL (B-A1=-3.33pp; MLB-2=25%) + APPS backup readiness + 405B 5th-404",
"outcome": "Gated branch milestone with THREE lanes, NOT a new broad benchmark tournament. docs/RUNBOOK_W108.md locked BEFORE any expensive NIM call; the one expensive run was the EARNED 330-call cheap pilot ($0 expensive 405B). Lane alpha (405B gate): re-probed meta/llama-3.1-405b-instruct -> HTTP 404 (163ms), the FIFTH consecutive 404 (W104-W108); GATE=CLOSED (decision CID e1af4451e4d15e7038016f7acd50d72560a77fc874470157c88d1426ee370e8d); W104-L-...-405B-UNREACHABLE-ON-NIM-CAP refreshed. Lane beta (LiveCodeBench, main): diagnosed + fixed the partial-scaffold gold-path smoke (A0=A1=B=0.0) -> root cause was the real release_v6 corpus storing metadata as a JSON STRING, so the W107 loader left func_name='' -> executor ENTRY_NOT_FOUND on every arm; livecodebench_loader_v1._resolve_func_name now handles both encodings (+starter_code fallback); 19 regression tests lock it. Real-data preflight PASS (SHA pin bb4c364f...; 63 functional, all func_name resolved, all plain-arg, all dated 2025 post-cutoff; slice CID 2afc318c...; verdict CID 61b9961c...) -> pilot EARNED. Cheap pilot (meta/llama-3.3-70b-instruct; 1 seed x 30 x K=5 = 330 NIM calls; ~77min; 444 HTTP-429 retries survived): A0=43.33% / A1@K5=63.33% / B=60.00% / B-A1=-3.33pp; 7/9 gates (G3+G4 fail); MLB-1=53.33% PASS, MLB-2=25% FAIL -> FAIL (NON-mechanism-driven). FIRST contamination-resistant test of the W89 mechanism, and it FAILed. Lane gamma (APPS backup): real coordpy.apps_loader_v1 + apps_executor_v1 + preflight + 7 tests + codified in-milestone pivot conditions; pivot NOT triggered (LiveCodeBench passed real-data soundness); APPS is 2021 contamination-exposed (C7=C), backup/control only. Truth surface: the TWO confirmed retirements (W89 +5.56pp; W105 +7.00pp, both Llama-3.3-70B @ 70B) STAND unchanged; W108 adds NO retirement and retires NO research carry-forward; both retirements are on contamination-EXPOSED HumanEval-family (2021) problems and the contamination-confound is an OPEN hypothesis (not a finding). 
The two carry_forwards_retired below are W107 INFRASTRUCTURE caps discharged by the real-data fetch (schema confirmed; live A1 measured), NOT research retirements. graphify refreshed from HEAD at start + close. 18th consecutive preflight-discipline validation (W93-W108). COO-9 stays lead; W109 = APPS contaminated-control contrast or multi-seed LCB de-noise. Stable boundary preserved (no version bump; no PyPI; coordpy/__init__.py untouched; 2 new explicit-import-only APPS modules + slice selector; loader fixed in place).",
"commits": [
"9b1b29a",
"e1c0ea7"
],
"docs": [
"docs/RUNBOOK_W108.md",
"docs/RESULTS_W108_LIVECODEBENCH_PHASE2_70B_V1.md",
"docs/RESULTS_W108_MILESTONE_SUMMARY_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W108_V1.md"
],
"linear_issues": [
"COO-32"
],
"carry_forwards_retired": [
"W107-L-LIVECODEBENCH-LOADER-V1-SCHEMA-CONFIRM-AT-FETCH-CAP",
"W107-L-LIVECODEBENCH-RESIDUAL-PUBLISHED-BASELINE-GRADE-CAP"
],
"carry_forwards_added": [
"W108-T-LIVECODEBENCH-REAL-DATA-BUGFIX-METADATA-JSON-STRING",
"W108-T-LIVECODEBENCH-REAL-DATA-PREFLIGHT-EARNED",
"W108-T-405B-GATE-FIFTH-404-CLOSED",
"W108-T-APPS-BACKUP-SCAFFOLDING-REAL-PIVOT-READY",
"W108-L-LIVECODEBENCH-REFLEXION-PHASE2-70B-CAP",
"W108-L-REFLEXION-NOT-DEMONSTRATED-ON-CONTAMINATION-RESISTANT-BENCH-CAP"
]
},
{
"id": "W109",
"title": "APPS contaminated-control contrast (B-A1=+16.67pp; PASS_NON_MECHANISM_DRIVEN) + LiveCodeBench de-noise NOT-WARRANTED + claim tightening (contamination-confound now SUPPORTED, not proven)",
"outcome": "Gated branch milestone with THREE lanes, NOT a new broad benchmark tournament. docs/RUNBOOK_W109.md locked BEFORE any expensive NIM call; the one expensive run was the EARNED 330-call APPS cheap pilot ($0 on LCB de-noise; $0 on 405B, not re-probed). Lane alpha (APPS contaminated-control, MAIN): fetched the REAL codeparrot/apps corpus via refs/convert/parquet @ 0f10e424 (config all/test, 743MB SHA-verified shards, 5000 problems), materialized the SHA-pinned call-based subset (38 problems: 28 interview/10 introductory; apps-test.jsonl SHA f6c44d76); confirmed real schema (input_output JSON string -> {fn_name,inputs,outputs}; heterogeneous output-wrapper faithfully matched by output==expected OR output==expected[0]) -- DISCHARGES the two W108 APPS confirm-at-fetch caps. Built coordpy.apps_reflexion_bench_v1 (A0/A1/B byte-identical in shape to W89/W105/W108). Real-data preflight P1^P2^P3^P4 PASS (slice CID 783687d6; verdict CID 0cf1a8e2) -> pilot EARNED. Cheap pilot (meta/llama-3.3-70b-instruct, 1 seed x 30 x K=5 = 330 calls, ~75min, bench Merkle a571c08b): A0=73.33% / A1@K5=73.33% / B=90.00% / B-A1=+16.67pp; 9/9 gates; MLB-2=57.14% PASS, MLB-1=23.33% FAIL => PASS_NON_MECHANISM_DRIVEN (0 regressions; 4 of 5 B-wins are reflexion rescues; MLB-1 fails only because A0 is high). The W89 mechanism RECOVERED a large same-budget win on contamination-EXPOSED APPS (+16.67pp) where it FAILed on contamination-RESISTANT LiveCodeBench (-3.33pp; W108) -- a double dissociation by vintage reinforced by an A0 single-shot gap (73.33% exposed vs 43.33% resistant). Moves the contamination-confound from OPEN to SUPPORTED but NOT established (one single-seed control pair; APPS PASS non-mechanism-driven; APPS contamination-EXPOSED => CONTROL evidence only, NOT a third retirement, NOT publication-grade). 
Lane beta (LiveCodeBench de-noise DECISION): coordpy.livecodebench_denoise_decision_v1 two-gate rule returns NOT WARRANTED on the W108 result (negative margin + weak MLB-2; +8.33pp mean shift multi-seed cannot supply; decision CID 290afa46); $0 further LCB NIM; does NOT re-open the closed Llama-3.1 branch. Lane gamma (claim/graphify/truth): graphify refreshed from HEAD at start+close; query/explain/path/affected used; added docs/CONTAMINATION_CONTROL_FRAMING_W109_V1.md (exposed-vs-resistant 2x2 + honesty rules) and tightened RESEARCH_STATUS/THEOREM_REGISTRY/HOW_NOT_TO_OVERSTATE/CONSOLIDATED narrative/CHANGELOG. Adds NO retirement and retires NO research carry-forward (W89+W105 STAND; boundary now SHARPER). COO-9 stays lead; W110 = a SECOND contamination-RESISTANT benchmark. 19th consecutive preflight-discipline validation (W93-W109). Stable boundary preserved (no version bump; no PyPI; coordpy/__init__.py untouched; 2 new explicit-import-only modules + 3 scripts; APPS loader/executor reused unchanged).",
"commits": [
"2ebb15ce785009c1b83afeccbf23463c643e8c1e",
"7954129baf7bee8130480a8bfcc85adfe1a67006"
],
"docs": [
"docs/RUNBOOK_W109.md",
"docs/RESULTS_W109_APPS_CONTROL_PHASE2_70B_V1.md",
"docs/RESULTS_W109_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W109_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W109_V1.md"
],
"linear_issues": [
"COO-33",
"COO-9",
"COO-6"
],
"carry_forwards_retired": [],
"carry_forwards_added": [
"W109-T-APPS-REAL-DATA-FETCH-PINNED",
"W109-T-APPS-CONTROL-PREFLIGHT-EARNED",
"W109-L-APPS-CONTROL-PHASE2-70B-PASS-NON-MECHANISM-DRIVEN-CAP",
"W109-L-APPS-CONTROL-MLB1-INVOCATION-CAP",
"W109-T-CONTAMINATION-CONFOUND-SUPPORTED-NOT-PROVEN",
"W109-T-LIVECODEBENCH-DENOISE-NOT-WARRANTED"
]
},
{
"id": "W110",
"title": "Second contamination-RESISTANT benchmark = BigCodeBench; Phase-2 FAIL (B-A1=+0.00pp; MLB-2=25%) ⇒ W108 LCB FAIL is NOT LCB-specific; resistant superiority 0/2; contamination-confound STRENGTHENED-not-proven",
"outcome": "Gated branch milestone with THREE lanes, NOT a new broad benchmark tournament. docs/RUNBOOK_W110.md locked BEFORE any expensive NIM call; the one expensive run was the EARNED 330-call BigCodeBench cheap pilot ($0 on SWE-bench-lite, LiveBench-coding, APPS, LCB de-noise, 405B). Lane alpha (MAIN): locked the S1^S2^S3^S4 + feasibility selection rule, then on real-data HF probes REJECTED SWE-bench-lite (synthetic in-repo swe_* MiniSWEBank; real instances need Docker/per-repo-env; multi-file patches break the K=5 single-artifact byte-exact budget) and LiveBench-coding (it IS LiveCodeBench repackaged: task=LCB_generation, citation via livecodebench), and SELECTED BigCodeBench v0.1.4 (2024-06 post-cutoff release-date resistance; clean unittest oracle; single task_func K=5; novel library-composition). Fetched + SHA-pinned the corpus (refs/convert/parquet shard SHA d9a49658; materialized JSONL SHA ca4f352e; 1140 problems). Built 4 explicit-import-only modules (bigcodebench_loader_v1 + bigcodebench_executor_v1 + bigcodebench_reflexion_bench_v1 + contamination_resistant_interpretation_v1). Caught + fixed an in-milestone executor bug: the macOS interactive matplotlib backend (360/1140 BigCodeBench tasks plot) popped GUI windows AND risked a blocking plt.show() falsely TIMING-OUT correct chart solutions; forced headless Agg -> recovered +32 gold-green (936->968); added a wall-stability guard (drop golds >=20s) so the slice CID is reproducible (b69bf3a0, reproduced across runs). Real-data preflight P1^P2^P3^P4 PASS (gold-green 968/1140; verdict CID 6be9fc8e) -> pilot EARNED; canary validated the live path. Cheap pilot (meta/llama-3.3-70b-instruct, 1 seed x 30 x K=5 = 330 calls, ~106min, bench Merkle 128dfb19): A0=63.33 / A1@K5=70.00 / B=70.00 %; B-A1=+0.00pp; 7/9 gates; MLB-1=40% PASS, MLB-2=25% FAIL -> FAIL (NON-mechanism-driven); B rescued 1 (/51), regressed 1 (/26), net 0. 
This is the SECOND contamination-RESISTANT FAIL -> the W108 LiveCodeBench FAIL is NOT LCB-specific; the W89 mechanism fails on contamination-resistant code GENERALLY at 70B (resistant superiority 0/2 vs exposed 3/3). Lane beta (pre-committed interpretation rule, locked before the verdict): confound_direction=STRENGTHENS, earns_phase3=False; contamination-confound SUPPORTED->STRENGTHENED toward a finding, NOT proven (two single-seed resistant points; orthogonal difficulty not excluded). Lane gamma: graphify refreshed from HEAD at start (1e8f131) + mid (73a0212) + end; explain/path/affected/query run (BigCodeBench bench = 4-hop sibling of the APPS + LiveCodeBench benches). Two confirmed retirements (W89, W105) STAND, boundary now contamination-EXPOSED-specific at 70B. W110 adds NO retirement and retires NO research carry-forward. 20th consecutive preflight-discipline validation (W93-W110). Stable boundary preserved (no version bump; no PyPI; coordpy/__init__.py untouched; 4 new modules + 3 scripts + 3 test files / 17 tests). COO-9 REMAINS lead; W111 = register the tightened boundary + decide a DIFFERENT mechanism vs accept the bounded two-retirement contamination-EXPOSED-HumanEval-family claim.",
"commits": [
"73a0212",
"ed3ba63",
"88ed03b"
],
"docs": [
"docs/RUNBOOK_W110.md",
"docs/RESULTS_W110_BIGCODEBENCH_PHASE2_70B_V1.md",
"docs/RESULTS_W110_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W110_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W110_V1.md",
"results/w110/bigcodebench_preflight/preflight_verdict.json"
],
"linear_issues": [
"COO-34",
"COO-9",
"COO-6"
],
"carry_forwards_added": [
"W110-T-BIGCODEBENCH-REAL-DATA-FETCH-PINNED",
"W110-T-BIGCODEBENCH-SECOND-RESISTANT-PREFLIGHT-EARNED",
"W110-T-CONTAMINATION-CONFOUND-STRENGTHENED-NOT-PROVEN",
"W110-T-BIGCODEBENCH-EXECUTOR-V1-HEADLESS-AGG-FIX",
"W110-L-BIGCODEBENCH-REFLEXION-PHASE2-70B-CAP",
"W110-L-REFLEXION-FAILS-ON-CONTAMINATION-RESISTANT-CODE-GENERALLY-CAP",
"W110-L-BIGCODEBENCH-GOLD-GREEN-WALL-STABILITY-GUARD-CAP",
"W110-L-BIGCODEBENCH-RELEASE-DATE-RESISTANCE-NOT-CONTEST-DATE-CAP",
"W110-L-BIGCODEBENCH-EXECUTOR-V1-EXEC-NAMESPACE-NOT-FILE-MODULE-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W111",
"title": "Different-MECHANISM tournament on contamination-resistant code + bounded-claim fallback EARNED (M3 executor-grounded patcher sub-reflexion; resistant ceiling NOT reflexion-specific)",
"outcome": "Gated branch milestone with THREE lanes, NOT a new benchmark tournament and NOT a reflexion rerun. docs/RUNBOOK_W111.md locked + committed (65274c4) BEFORE the only NIM call. After W110's resistant FAIL (reflexion 0/2), W111 asked the sharper question: is the resistant ceiling reflexion-SPECIFIC, or does it hold for ANY same-budget mechanism at 70B? Lane alpha (MAIN): a $0-NIM mechanism-mining census re-executed all 300 W110 BigCodeBench A1+B candidates (scripts/mine_w111_resistant_failure_modes_v1.py) -> resistant failure distribution = 81.6% SEMANTIC hidden-test-coupling / 1.8% API-grounding (both API failures on /51, already rescued); hard-core (8 both-A1+B-fail) = 6/8 mock-coupling (fix needs hidden test source = unreachable in a fair regime) + 2/8 output-value. This KILLED M2 (tool-augmented local symbol/doc introspection; attacks 1.8%) and M1 (library/spec planner; attacks comprehension, sacrifices a self-consistency sample, no executor grounding) AT $0 NIM, and admitted M3 (executor-grounded structured-failure patcher: typed expected/actual contract + minimal-patch on the latest candidate, materially different from prose reflexion, NEVER the hidden test source; coordpy.executor_grounded_patcher_v1; 9 tests incl. a test-source-non-leak fairness guard). The one earned NIM run was a smallest-decisive 143-call M3 probe on the pinned rescue-CONCENTRATED 13-problem hard-core slice (CID b611fae0; seed 111001; bench Merkle 70353e77; ~69min, heavy 429 throttling survived): A0=30.77 / A1=30.77 / M3=46.15 %; M3-A1=+15.38pp (rescue-concentrated UPPER BOUND); MLB-1=61.5%, MLB-2=12.5%. M3's patch loop rescued ONE hard-core problem reflexion B FAILED (/13) -- a genuine mechanism win -- but at 12.5% (1/8), BELOW reflexion's 25% and the 33% floor; the other M3 win (/20) is an attempt-0 SAMPLING win, not the mechanism; M3 did not hold reflexion's /51 rescue; 0 regressions vs A1. 
Literal pre-committed verdict = AMBIGUOUS (rescued 1 OUTPUT_VALUE but didn't hold /51); resolved: M3 did NOT earn the fair 30-slice pilot (EARN bar unmet + margin NON-mechanism-driven [MLB-2 12.5% < floor] + W104->W105 rescue-concentrated erosion + W106 margin-cap discipline => fair-pilot verdict-changing power LOW => NOT WARRANTED). Lane beta: the bounded-claim fallback rule was pre-committed (RUNBOOK section 6, LAST resort) and is now EARNED; no NIM on M1/M2/fair-pilot/APPS/reflexion/405B/Llama-3.1; BigCodeBench-primary / LiveCodeBench-secondary cross-check rule held (secondary not triggered -- gated on an M3 fair-slice PASS that did not happen). Lane gamma: graphify refreshed from HEAD at start (d41265d5) + re-ingested the M3 module mid-milestone (bench node degree 23) + refreshed at end; explain/path/affected/query run (M3 = 3-hop structural sibling of the W110 reflexion bench). BOUNDED-CLAIM FALLBACK EARNED: the resistant ceiling is NOT reflexion-specific (two mechanisms now fail to beat A1 on resistant code at 70B -- reflexion 0/2 + M3 sub-reflexion); the bounded two-retirement contamination-EXPOSED-HumanEval-family-at-70B claim is the honest code ceiling. W89 + W105 STAND. W111 adds NO retirement, retires NO carry-forward, and proves NOTHING about the confound (it tests a mechanism, not the confound); honest positive registered: M3 is not vacuous (it rescued /13), just sub-reflexion. 21st consecutive preflight-discipline validation (W93-W111). Stable boundary preserved (no version bump; no PyPI; coordpy/__init__.py untouched; 1 new explicit-import-only module + 2 scripts + 9 tests). COO-9 REMAINS lead; W112 = cross-scale-UP probe on a stronger code model if reachable / a NIM-free-earned M3 strengthening that clears the 33% floor / acceptance of the bounded claim.",
"commits": [
"65274c4",
"866528b"
],
"docs": [
"docs/RUNBOOK_W111.md",
"docs/RESULTS_W111_M3_PATCHER_PROBE_70B_V1.md",
"docs/RESULTS_W111_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W111_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W111_V1.md",
"results/w111/mechanism_mining/w110_bcb_failure_census.json"
],
"linear_issues": [
"COO-35",
"COO-9",
"COO-6"
],
"carry_forwards_added": [
"W111-T-RESISTANT-FAILURE-IS-SEMANTIC-HIDDEN-TEST-COUPLING",
"W111-T-EXECUTOR-GROUNDED-PATCHER-V1-SHIPS",
"W111-L-NO-DIFFERENT-MECHANISM-BEATS-A1-ON-RESISTANT-CODE-AT-70B-CHEAP-CAP",
"W111-L-M3-PATCHER-SUB-REFLEXION-ON-RESISTANT-HARD-CORE-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W112",
"title": "Stronger-model resistant-code gate + NIM-free M3-strengthening structural kill (Llama-4-Maverick BigCodeBench +10pp but model-EXPOSED, NOT a clean resistant reopening; contamination-resistance is model-cutoff-relative)",
"outcome": "Gated branch milestone with THREE lanes, NOT a new benchmark tournament and NOT a 70B rerun. docs/RUNBOOK_W112.md locked BEFORE any NIM call (incl. the sub-second reachability sweep); the section 1-alpha target-selection rule was locked BEFORE probing so the catalogue could not be target-shopped. COO-9 stays lead; ultracode OFF. Lane alpha (stronger-model gate, LIVE): an honest NIM reachability sweep over the live 118-model catalogue (scripts/run_w112_stronger_model_reachability_sweep_v1.py; decision CID a654956b) found meta/llama-3.1-405b-instruct HTTP 404 for the 6th consecutive time (W104-W108, W112) but meta/llama-4-maverick-17b-128e-instruct HTTP 200 -- the first strictly-stronger, same-budget-comparable, NON-reasoning code model reachable since the cross-scale-UP axis opened (3 more eligible tier-2: Qwen3-Coder-480B, DeepSeek-V4-pro, Mistral-Small-4-119B). Selected Maverick tier-1 per the locked Llama-family rule. The §1alpha-earn canary (2 problems, 22 calls) confirmed the plain code path => the smallest honest BigCodeBench pilot was EARNED + RUN on the EXACT W110 fair 30-slice (CID b69bf3a0 re-derived + matched; only the model changed; 1 seed x 30 x K=5 = 330 calls; reflexion B): A0 73.33 / A1 73.33 / B 83.33 %; B-A1 = +10.00 pp; 9/9 core Phase-2 gates; MLB-2 = 37.5% PASS; MLB-1 = 26.67% FAIL => PASS_NON_MECHANISM_DRIVEN (3 clean rescues /15,/26,/51; 0 regressions); the reflexion margin REOPENED vs 70B's +0.00 pp (W110). 
DECISIVE CAVEAT (grounded): contamination-resistance is MODEL-CUTOFF-RELATIVE -- BigCodeBench 2024-06 is resistant for Llama-3.3-70B (~2024-01 cutoff) but EXPOSED for Llama-4-Maverick (Aug-2024 pretraining cutoff > release); the result is a structural twin of the W109 APPS contamination-EXPOSED control (A0 73.33% IDENTICAL; same MLB-1-fail/MLB-2-pass/PASS_NON_MECHANISM_DRIVEN shape), and the IDENTICAL slice flips +0.00pp -> +10.00pp as the model cutoff crosses the release date (first WITHIN-benchmark resistant->exposed flip) => EXPOSED-column result, NOT a clean reopening of contamination-RESISTANT superiority. Lane beta (NIM-free M3 strengthening, killed at $0): a harder fair-reachability re-mining (scripts/mine_w112_fair_reachability_v1.py re-executed the W110 transcripts) showed that on the MLB-2 denominator (12 invoked) the reliably fair-reachable ceiling is 8.3% (1/12) and the best-conceivable bound 33.3% (4/12; 58% mock/fixture-coupled) merely TOUCHES the floor; all four fair strengthenings (richer typed digest / multi-candidate aggregation / patch-rejection / doctest invariants) killed at $0 -- none expands the reliably-reachable set => NO_FAIR_STRENGTHENING_CAN_CLEAR_FLOOR (structurally strengthens the W111 empirical sub-floor finding). Lane gamma: graphify refreshed start (HEAD 2985b55) -> after adding the 2 W112 scripts (75,728 nodes / 241,040 edges) -> at end; explain/path/affected on run_executor_grounded_patcher_bench_v1; explain on the new mining script confirmed fair-regime reuse. Claim surfaces tightened (registry / status / honesty / consolidated narrative / CHANGELOG / contamination framing). NET: TWO confirmed retirements (W89 +5.56pp, W105 +7.00pp) STAND, contamination-EXPOSED-HumanEval-family-at-70B; W112 adds NONE and retires NONE; resistant superiority still unproven; the contamination-confound is STRENGTHENED a 3rd time (3 exposed margins incl. a within-benchmark flip vs 0/2 resistant) but NOT proven. 
22nd consecutive preflight/earn-discipline validation (W93-W112). Stable boundary preserved (no version bump; no PyPI; coordpy/__init__.py untouched; ZERO new coordpy.* modules -- 2 new scripts only, the pilot reused the W110 driver verbatim; 26 reused-module tests pass). COO-9 REMAINS lead; W113 = a benchmark verifiably contamination-resistant FOR Llama-4 (problem dates > Aug-2024; date-filtered LiveCodeBench) to separate capability from exposure.",
"commits": [
"a1b7d78"
],
"docs": [
"docs/RUNBOOK_W112.md",
"docs/RESULTS_W112_STRONGER_MODEL_BIGCODEBENCH_PILOT_V1.md",
"docs/RESULTS_W112_STRONGER_MODEL_GATE_AND_SELECTION_V1.md",
"docs/RESULTS_W112_FAIR_REACHABILITY_M3_STRENGTHENING_V1.md",
"docs/RESULTS_W112_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W112_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W112_V1.md",
"results/w112/fair_reachability/w110_bcb_fair_reachability.json"
],
"linear_issues": [
"COO-36",
"COO-9",
"COO-6"
],
"carry_forwards_added": [
"W112-T-405B-GATE-SIXTH-404-CLOSED",
"W112-T-STRONGER-CODE-MODEL-REACHABLE-LLAMA4-MAVERICK",
"W112-T-STRONGER-MODEL-BIGCODEBENCH-MARGIN-REOPENS-BUT-MODEL-EXPOSED",
"W112-T-CONTAMINATION-RESISTANCE-IS-MODEL-CUTOFF-RELATIVE",
"W112-T-FAIR-M3-STRENGTHENING-CEILING-SUB-FLOOR",
"W112-L-STRONGER-MODEL-RESISTANT-SUPERIORITY-NOT-CLEANLY-DEMONSTRATED-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W113",
"title": "Clean resistant-FOR-Llama-4 benchmark construction + earned Maverick pilot => exposure confirmed (the W112 +10pp was contamination exposure; B-A1 collapses +10.00 exposed -> +0.00 resistant at the same Maverick scale; resistant superiority 0 clean across BOTH scales; W89+W105 stand; no retirement added)",
"outcome": "Gated branch milestone with THREE lanes, NOT a new tournament and NOT a confounded exposed rerun. docs/RUNBOOK_W113.md locked BEFORE any NIM call (incl. the canary). ultracode OFF; COO-9 stays lead; $0 expensive run except the one earned pilot. Lane alpha (clean resistant main lane, LIVE): built the machine-checkable resistance rule FIRST (coordpy.livecodebench_resistant_slice_v1: RESISTANT-for-Maverick iff contest_date > 2024-08-31, the entire ambiguous August-2024 window EXCLUDED; a KNOWN/ESTIMATED/UNKNOWN cutoff registry certifies resistance only against a KNOWN cutoff); proved date integrity NIM-free (scripts/run_w113_resistant_slice_preflight.py; verdict CID 6f30990c) -- the SHA-pinned release_v6 functional subset is 63/63 resistant (dates 2025-01-11..2025-04-05; 0 missing / 0 unparseable / 0 in-August) and the deterministic resistant 30-slice CID 2afc318c == the EXACT W108 slice (the date filter did not perturb the set => clean cross-scale; only model scale varies vs W108's 70B run); EARNED + RAN the cheapest honest Maverick pilot (scripts/run_w113_resistant_pilot.py; meta/llama-4-maverick-17b-128e-instruct; 1 seed x 30 x K=5 = 330 calls; ~55 min) => A0 30.00 / A1 50.00 / B 50.00 %; B-A1 = +0.00 pp; 7/9 gates; MLB-1 = 63.33 % PASS; MLB-2 = 21.05 % FAIL => FAIL => EXPOSURE_CONFIRMED (reflexion genuinely invoked, MORE than 70B; rescued 4 / regressed 2 / net 0; interp CID aa324208). The W112 +10.00pp on EXPOSED BigCodeBench COLLAPSED to +0.00pp on resistant LiveCodeBench at the SAME scale => the +10pp was contamination EXPOSURE, not a capability reopening -- a within-MODEL exposed->resistant flip (sharpest contamination dissociation yet; corroborated by Maverick resistant A1 50 % < 70B 63.33 %). The clean 2x2 is complete; the RESISTANT column is 0 clean across BOTH scales (70B -3.33 / +0.00; Maverick +0.00). 
Lane beta (tier-2 readiness, NIM-free, $0): coordpy.tier2_readiness_v1 + scripts/run_w113_tier2_readiness_v1.py lock the ranking + same-filtered-slice applicability + spend rule; all three reachable tier-2 stronger models (Qwen3-Coder-480B, DeepSeek-V4-pro, Mistral-Small-4-119B) have UNKNOWN 2025-2026 cutoffs => NONE certifiably resistant on the pinned 2025 slice => tier-2 spend BLOCKED on a missing instrument under EVERY outcome (next = release_v7+). Lane gamma (graphify): refreshed from HEAD at start (0 token cost) + close (76,435 nodes); explain/path/affected/query used for file selection + dependency checks + claim-surface location; claim surfaces tightened. NET: the two retirements (W89, W105) STAND, contamination-EXPOSED-HumanEval-family-at-70B; W113 adds NO retirement; resistant superiority 0 clean across BOTH scales; contamination-confound STRENGTHENED a fourth time (within-model flip) but NOT proven; W114 = accept the bounded contamination-EXPOSED claim + pursue a GENUINELY DIFFERENT axis; 3 new explicit-import-only modules + 3 scripts + 31 tests; pilot reused the canonical evaluator + W108 NIM generator (namespace import); no version bump; no PyPI; coordpy/__init__.py untouched; 23rd consecutive preflight/earn-discipline validation (W93-W113).",
"commits": [
"e98ffb6"
],
"docs": [
"docs/RUNBOOK_W113.md",
"docs/RESULTS_W113_RESISTANT_PILOT_V1.md",
"docs/RESULTS_W113_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W113_V1.md",
"results/w113/resistant_slice_preflight/preflight_verdict.json",
"results/w113/tier2_readiness/tier2_readiness.json"
],
"linear_issues": [
"COO-37"
],
"carry_forwards_added": [
"W113-T-RESISTANT-FOR-LLAMA4-SLICE-CONSTRUCTED",
"W113-T-STRONGER-MODEL-RESISTANT-CODE-EXPOSURE-CONFIRMED",
"W113-T-CONTAMINATION-CONFOUND-STRENGTHENED-WITHIN-MODEL",
"W113-L-STRONGER-MODEL-RESISTANT-SUPERIORITY-FAIL-CAP",
"W113-L-TIER2-NO-CERTIFIABLE-RESISTANT-SLICE-ON-PINNED-CORPUS-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W114",
"title": "Bounded-ceiling registration + newer-certifiable-post-cutoff-instrument attempt + per-model certification layer => NO stronger-than-Maverick model certifiable on the latest real data ($0 NIM); W89+W105 stand; no retirement added",
"outcome": "Gated branch milestone with THREE lanes, NOT another exposed rerun and NOT another same-scale resistant reflexion rerun. docs/RUNBOOK_W114.md locked BEFORE any NIM (the no-go branch is pre-committed by the rule => $0 spend is discipline, not omission). ultracode OFF; COO-9 stays lead. Lane alpha (bounded-ceiling registration, NIM-free): registered the W113 outcome as the honest code-superiority FLOOR across the canonical truth surfaces (RESEARCH_STATUS Last-touched; THEOREM_REGISTRY Last-touched/claim-row/new W114 section; CONSOLIDATED narrative; HOW_NOT_TO_OVERSTATE; new CONTAMINATION_CONTROL_FRAMING_W114; CHANGELOG) -- exactly TWO retirements (W89 +5.56pp, W105 +7.00pp), both meta/llama-3.3-70b-instruct @ 70B contamination-EXPOSED-HumanEval-family; resistant superiority 0 clean across BOTH 70B and Maverick scales; the W112 +10pp was EXPOSURE (W114-T-BOUNDED-EXPOSED-CODE-CEILING-REGISTERED); the truth floor a new axis must beat, NOT surrender (README untouched -- no per-milestone research claims). 
Lane beta (genuinely different axis = certification-supply analysis): verified from PRIMARY sources (2026-05-29) the latest LiveCodeBench release (HF file tree: test6.jsonl is the highest-numbered => release_v6 latest, no test7+; FUNCTIONAL subset 63 problems, all 2025-01-11..2025-04-05, frontier 2025-04-05) AND official model cutoffs (Llama-4-Maverick = Aug-2024 KNOWN per the official Llama-4 model card; Qwen3-Coder-480B = NO CUTOFF STATED on the official HF card + Qwen blog; DeepSeek-V4-pro = none disclosed; Mistral-Small-4-2603 = none in official Mistral docs, 2026-03 release post-dates the whole window) => verdict NO_CERTIFIABLE_STRONGER_MODEL, $0 NIM: a >=30 functional resistant slice requires a KNOWN cutoff <= ~Jan-2025; Maverick (the only reachable KNOWN cutoff) is already SETTLED (W113 resistant FAIL => redundant); every stronger-than-Maverick frontier model has an OFFICIALLY UNDISCLOSED cutoff (C1) and, where estimable, a cutoff at/after the Apr-2025 frontier (C2) => the resistant-instrument frontier has AGED OUT relative to the reachable model frontier (W114-L-RESISTANT-INSTRUMENT-FRONTIER-LAGS-MODEL-FRONTIER-CAP + W114-T-STRONGER-MODEL-CUTOFFS-OFFICIALLY-UNDISCLOSED). Lane gamma (per-model certification layer + graphify): built coordpy.stronger_model_cutoff_certification_v1 (explicit-import-only; imports the W113 registry + partition_resistant_v1 + ranking, no duplication; C1^C2^C3^C4 gate; verified instrument-frontier record + per-model official-source provenance + W113<->W114 confidence-consistency guard) + scripts/run_w114_stronger_model_certification_v1.py (re-verifies the month histogram against the SHA-pinned bb4c364f corpus: sha_ok + histogram_match; decision CID 258b6ed7); graphify refreshed from HEAD at start + close (76,573 nodes); explain/path/affected/query used for file selection + dependency checks. 
NET: W89 + W105 STAND, W114 adds NO retirement + retires NO carry-forward; contamination-confound UNCHANGED (W114 tests certification supply, not the confound); reachability NOT re-probed (not the binding gate; W112 facts carried); COO-9 stays lead; W115 fires only when a resistant FUNCTIONAL instrument with >=30 problems dated strictly after a reachable frontier model's KNOWN cutoff exists. 1 new explicit-import-only module + 1 script + 12 new tests (+ 30 W113 regression tests pass); ZERO additions to coordpy/__init__.py; no version bump; no PyPI; 24th consecutive preflight/earn-discipline validation (W93-W114).",
"commits": [
"76f8b0a"
],
"docs": [
"docs/RUNBOOK_W114.md",
"docs/RESULTS_W114_STRONGER_MODEL_CERTIFICATION_V1.md",
"docs/RESULTS_W114_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W114_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W114_V1.md",
"results/w114/certification/certification_verdict.json"
],
"linear_issues": [
"COO-38"
],
"carry_forwards_added": [
"W114-T-BOUNDED-EXPOSED-CODE-CEILING-REGISTERED",
"W114-T-STRONGER-MODEL-CUTOFFS-OFFICIALLY-UNDISCLOSED",
"W114-T-STRONGER-MODEL-CERTIFICATION-V1-SHIPS",
"W114-L-RESISTANT-INSTRUMENT-FRONTIER-LAGS-MODEL-FRONTIER-CAP",
"W114-L-NO-NEW-STRONGER-MODEL-RESISTANT-PILOT-EARNED-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W115",
"title": "External-frontier refresh (LIVE) + durable future-fire certification/instrument-supply pipeline => external frontier UNCHANGED, NO certifiable stronger model ($0 NIM); DeepSeek V4 card now exists (2026-04-27) but discloses no cutoff; W89+W105 stand",
"outcome": "Gated branch milestone with THREE lanes, NOT another exposed rerun and NOT another same-scale resistant reflexion rerun on the same release_v6 slice. Executes the W114 RUNBOOK_W114.md s9 NO_CERTIFIABLE_STRONGER_MODEL branch (COO-38 close). docs/RUNBOOK_W115.md locked BEFORE any NIM (the no-go branch is pre-committed by the rule => $0 spend is discipline, not omission). ultracode OFF; COO-9 stays lead. Lane alpha (LIVE external-frontier refresh): re-verified from PRIMARY sources (2026-05-29; WebSearch/WebFetch against official HF dataset/model cards + vendor PDFs) that the latest LiveCodeBench release is STILL release_v6 (HF file tree highest = test6.jsonl, 'add v6' ~1yr ago; NO test7+; functional frontier 2025-04-05 UNCHANGED) and that NO reachable model stronger than Maverick has a KNOWN cutoff <= ~Jan-2025 (Qwen3-Coder-480B official HF card states NO cutoff; the DeepSeek V4 official model card NOW EXISTS [published 2026-04-27, Pro=1.6T/49B] but contains NO 'cutoff' string and no training-data date => still UNKNOWN; Mistral-Small-4 UNKNOWN; Maverick Aug-2024 KNOWN but already SETTLED at W113). Both binding conditions still fail => verdict re-derives NO_CERTIFIABLE_STRONGER_MODEL (decision CID 258b6ed7, byte-identical to W114), $0 NIM; the one external change since W114 (DeepSeek V4 card publication) does NOT move the verdict (W115-L-EXTERNAL-FRONTIER-UNCHANGED-NO-CERTIFIABLE-SLICE-REVERIFIED-CAP). Reachability NOT re-probed (not the binding gate; W112 facts carried). 
Lane beta (future-fire certification/instrument-supply pipeline): shipped coordpy.frontier_certification_pipeline_v1 (explicit-import-only; reuses the W113 registry + the W114 certify_model_v1/decide_certification_v1 gate + the loader's LIVECODEBENCH_KNOWN_RELEASES, no duplication -- graphify-confirmed 2-hop reuse chain run_frontier_certification_v1 -> decide_certification_v1 -> certify_model_v1) + scripts/run_w115_frontier_certification_v1.py: a latest-official-release detector (newer_release_available=False), a generalised frontier-date summary + threshold table (max KNOWN cutoff month admitting a >=30 slice = 2025-01), a per-model go/no-go matrix, a disclosure-consistency guard (live disclosures vs the encoded registry -- a divergence is the W116 update signal), and a structured W116FireConditionV1 -- all driven by a FrontierSnapshotV1 (external state as DATA) so W116 is push-button (W115-T-FUTURE-FIRE-CERTIFICATION-PIPELINE-SHIPS); the script re-verifies the histogram against the SHA-pinned bb4c364f corpus (sha_ok + histogram_match); result CID 6890419c; artifact results/w115/frontier_certification/frontier_certification_verdict.json. Lane gamma (claim/graphify/readiness): graphify refreshed from HEAD at start (f8b085d) + re-ingested the new module/script (76,727 nodes) + close; explain/path/affected/query used for file selection + dependency checks; truth surfaces tightened (RESEARCH_STATUS Last-touched; THEOREM_REGISTRY Last-touched/claim-row/new W115 section; CONSOLIDATED narrative; HOW_NOT_TO_OVERSTATE; new CONTAMINATION_CONTROL_FRAMING_W115; new FRONTIER_RELEVANCE_AUDIT_W115; CHANGELOG). 
NET: W89 + W105 STAND, W115 adds NO retirement + retires NO carry-forward; contamination-confound UNCHANGED (W115 tests certification supply, not the confound); COO-9 stays lead; COO-39 is the W115 issue; W116 fires when the pipeline trigger flips (a newer admitted release_v7+ with >=30 post-frontier functional problems for a KNOWN-cutoff stronger model, OR a reachable stronger-than-Maverick model disclosing a KNOWN cutoff month <= 2025-01). 1 new explicit-import-only module + 1 script + 10 new tests (+ 42 W113/W114 regression tests pass); ZERO additions to coordpy/__init__.py; no version bump; no PyPI; 25th consecutive preflight/earn-discipline validation (W93-W115).",
"commits": [
"84a9009"
],
"docs": [
"docs/RUNBOOK_W115.md",
"docs/RESULTS_W115_FRONTIER_CERTIFICATION_V1.md",
"docs/RESULTS_W115_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W115_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W115_V1.md",
"results/w115/frontier_certification/frontier_certification_verdict.json"
],
"linear_issues": [
"COO-39"
],
"carry_forwards_added": [
"W115-T-FUTURE-FIRE-CERTIFICATION-PIPELINE-SHIPS",
"W115-L-EXTERNAL-FRONTIER-UNCHANGED-NO-CERTIFIABLE-SLICE-REVERIFIED-CAP",
"W115-L-NO-NEW-PILOT-EARNED-FRONTIER-UNCHANGED-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W116",
"title": "Upstream instrument-supply ATTACK (LIVE, four upstream surfaces) + primary-source model-cutoff ATTACK + durable upstream-ADMISSION pipeline => NO admissible new instrument beyond release_v6, NO certifiable stronger model ($0 NIM); Mistral-Small-4-119B-2603 CONFIRMED REAL with a primary card disclosing no cutoff; W89+W105 stand; no retirement added",
"outcome": "Gated branch milestone with THREE lanes, NOT another exposed rerun and NOT another same-scale resistant reflexion rerun on the same release_v6 slice. Executes the W115 RUNBOOK_W115.md s9 NO_CERTIFIABLE_STRONGER_MODEL branch (COO-39 close): instead of passively waiting for release_v7, ATTACK the upstream instrument supply + the model-cutoff disclosure side, operationalise the next clean shot, and run a pilot only if certification is genuinely clean. docs/RUNBOOK_W116.md locked BEFORE any NIM (the no-go branch is pre-committed by the rule => $0 spend is discipline, not omission). ultracode OFF; COO-9 stays lead. Lane alpha (LIVE upstream instrument-supply attack): went one level UPSTREAM of release_v6 and re-verified the LiveCodeBench frontier at FOUR authoritative surfaces -- (1) HF code_generation_lite file tree (highest test6.jsonl, no test7+, lastModified 2025-06-05); (2) loader code_generation_lite.py ALLOWED_FILES (v_list=[v1..v6], release_latest -> release_v6 files, DEFAULT_CONFIG_NAME=release_latest); (3) full code_generation dataset README (release_v6 = May 2023..Apr 2025, 1055 problems, frontier 2025-04-05); (4) GitHub repo README (tops out at release_v6, no v7 tag) => NO admissible new instrument beyond release_v6; the functional frontier is conclusively 2025-04-05. The 'planned v7' mentioned only by a non-primary search summary is INADMISSIBLE under the pre-committed A1..A5 rule (no artifact, no SHA) and is recorded only as the W117 watch signal (W116-L-UPSTREAM-SUPPLY-NO-ADMISSIBLE-NEW-INSTRUMENT-FOUR-SURFACE-CAP). 
Lane beta (primary-source model-cutoff attack): re-checked official cutoffs from PRIMARY sources => no reachable stronger-than-Maverick model has a primary-KNOWN cutoff <= 2025-01: Qwen3-Coder-480B + DeepSeek-V4-pro UNKNOWN (official cards: NO CUTOFF STATED); mistralai/mistral-small-4-119b-2603 now CONFIRMED REAL (119B MoE, 128 experts/4 active, released 2026-03-16) with its official Mistral docs model card + official announcement (mistral.ai/news/mistral-small-4) disclosing NO cutoff -- the only figure is a non-primary aggregator (OpenRouter '2025-06') that is itself C2-exposed (post-dates the Apr-2025 frontier); Mistral-Small-3.2-24B KNOWN ~Oct-2023 but sub-70B (C3) (W116-T-MISTRAL-SMALL-4-CONFIRMED-REAL-PRIMARY-NO-CUTOFF). Lane gamma (upstream-admission pipeline + graphify): shipped coordpy.upstream_instrument_admission_v1 (explicit-import-only; reuses the W113 registry + the W114 certify_model_v1 gate + the W115 run_frontier_certification_v1 pipeline + the loader's LIVECODEBENCH_KNOWN_RELEASES, no duplication -- graphify-confirmed reuse chain run_upstream_admission_v1 -> run_frontier_certification_v1) + scripts/run_w116_upstream_admission_v1.py: a pre-committed A1..A5 admissibility rule (REFUSES aggregator/mirror/website-intro/rumor instruments), a multi-surface upstream-change detector (detect_upstream_change_v1; richer than W115's single boolean), a certifiable-slice builder, a four-way disclosure-status matrix (KNOWN/ESTIMATED-but-unusable/UNKNOWN/contradictory-or-stale), and a structured W117FireConditionV1 -- all driven by a UpstreamSupplySnapshotV1 so W117 is push-button (W116-T-UPSTREAM-ADMISSION-PIPELINE-SHIPS); the script re-verifies the histogram against the SHA-pinned bb4c364f corpus (sha_ok + histogram_match), asserts the decision-CID-drift guard, and emits results/w116/upstream_admission/upstream_admission_verdict.json (result CID 193164c4; decision CID 258b6ed7 byte-identical to W114/W115). 
graphify refreshed from HEAD at start (5b3f75d) + re-ingested the new module/script (77,010 nodes) + close; explain/path/affected/query used for file selection + dependency checks. NET: verdict re-derives NO_CERTIFIABLE_STRONGER_MODEL, $0 NIM, no pilot earned (a pilot needs BOTH an admissible new instrument AND a non-redundant primary-KNOWN-cutoff stronger model -- neither exists; n_admissible_new_instruments=0; W117 fires_now=False); W89 + W105 STAND, W116 adds NO retirement + retires NO carry-forward; contamination-confound UNCHANGED (W116 tests certification supply, not the confound); reachability NOT re-probed (not the binding gate; W112 facts carried); COO-9 stays lead; COO-40; W117 fires the moment detect_upstream_change_v1 flags an admissible change (a newer admitted release_v7+ / release_latest re-point / new upstream functional dataset with >=30 post-frontier problems for a KNOWN-cutoff stronger model, OR a reachable stronger-than-Maverick model disclosing a primary-KNOWN cutoff month <= 2025-01). 1 new explicit-import-only module + 1 script + 16 new tests (+ 52 W113/W114/W115 regression tests pass = 68 total); ZERO additions to coordpy/__init__.py; no version bump; no PyPI; 26th consecutive preflight/earn-discipline validation (W93-W116).",
"commits": [
"14ad122"
],
"docs": [
"docs/RUNBOOK_W116.md",
"docs/RESULTS_W116_UPSTREAM_ADMISSION_V1.md",
"docs/RESULTS_W116_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W116_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W116_V1.md",
"results/w116/upstream_admission/upstream_admission_verdict.json"
],
"linear_issues": [
"COO-40"
],
"carry_forwards_added": [
"W116-T-UPSTREAM-ADMISSION-PIPELINE-SHIPS",
"W116-T-MISTRAL-SMALL-4-CONFIRMED-REAL-PRIMARY-NO-CUTOFF",
"W116-L-UPSTREAM-SUPPLY-NO-ADMISSIBLE-NEW-INSTRUMENT-FOUR-SURFACE-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W117",
"title": "Upstream-DERIVED instrument CONSTRUCTION attack (LIVE, eight provenance surfaces) + deeper primary-source model-cutoff attack + durable upstream-DERIVED construction/admission pipeline => NO construction-admissible instrument can be built from authoritative provenance beyond release_v6, NO certifiable stronger model ($0 NIM); W89+W105 stand; no retirement added",
"outcome": "Gated branch milestone with THREE lanes, NOT another exposed rerun, NOT a packaged-release watch-and-wait check, NOT another same-scale resistant reflexion rerun. Executes the W116 RUNBOOK_W116.md s9 branch (COO-40 close). W116 answered 'is there an admissible PACKAGED release_v7?' (No); W117 escalates to the CONSTRUCTION question: even before release_v7 is packaged, can a post-cutoff functional instrument be CONSTRUCTED from official upstream PROVENANCE? docs/RUNBOOK_W117.md locked BEFORE any NIM (the no-go branch is pre-committed by the rule => $0 spend is discipline, not omission). ultracode OFF; COO-9 stays lead. Lane alpha (LIVE upstream-derived construction attack): attacked the upstream CONSTRUCTION provenance at EIGHT authoritative surfaces (the revision history + collection mechanism, not just the release label) -- (1) HF dataset commit/revision log (latest data commit 'add v6' 2025-04-21, HEAD 'fix typos' 2025-06-05); (2) HF refs (1 branch/0 tags); (3) HF discussions/PRs (newest = 'LCB pull request' #14 + a v6-size clarification #13, no v7 thread); (4) GitHub commits (newest 2025-07-16 runner-maintenance, no data commit); (5) GitHub tags (0); (6) GitHub repo pipeline structure (lcb_runner/+assets/ only, NO scraper/collection dir); (7) dataset README provenance (LeetCode/AtCoder/Codeforces, documents only loading published releases, NO generation tool/manifest); (8) runner loader code_generation.py (loads HF-only via load_dataset(version_tag=...), no local scraping) => the authoritative construction provenance IS the packaged HF release; LCB publishes no collection pipeline or forward problem-id manifest; no post-v6 LCB-published artifact exists at any surface; the only post-v6 path (raw-contest hand-assembly) is CONSTRUCTION-INADMISSIBLE (refused by the pre-committed B1 authoritative-LCB-provenance + B2 no-operator-curation criteria) => 0 construction-admissible NEW instruments -- sharper than 'no packaged v7': a post-v6 instrument cannot be 
CONSTRUCTED from authoritative provenance, only hand-curated, which is refused (W117-L-NO-CONSTRUCTION-ADMISSIBLE-POST-V6-INSTRUMENT-EIGHT-SURFACE-CAP + W117-T-LCB-CONSTRUCTION-PROVENANCE-IS-PACKAGED-RELEASE). Lane beta (deeper primary-source model-cutoff attack): no reachable stronger-than-Maverick model has a primary-KNOWN cutoff <= 2025-01 and nothing newly-disclosed since W116 -- Maverick 'August 2024' re-confirmed VERBATIM (Meta MODEL_CARD.md) but C4-settled; Qwen3-Coder-480B UNKNOWN (official HF card raw README: no cutoff); DeepSeek-V4-pro UNKNOWN from primary -- official V4 model-card PDF re-checked still states NO cutoff, the only figure is a non-primary aggregator 'Apr 2026' that is itself C2-exposed, now matching Mistral-Small-4-119B-2603's UNKNOWN-primary + C2-exposed-aggregator pattern (W117-T-DEEPSEEK-V4-PRIMARY-PDF-RECONFIRMED-NO-CUTOFF). Lane gamma (construction/admission pipeline + graphify): shipped coordpy.upstream_derived_instrument_construction_v1 (explicit-import-only; reuses + WRAPS the W116 run_upstream_admission_v1 -> W115 run_frontier_certification_v1 -> W114 certify_model_v1 chain, no duplication; graphify-confirmed run_upstream_construction_v1 --calls--> run_upstream_admission_v1) + scripts/run_w117_upstream_construction_v1.py: the construction rule (A1..A5 reused + B1 + B2), an eight-surface provenance snapshot, a candidate-instrument constructor (raw-contest assembly triply refused; LCB-pipeline template construction-admissible-in-principle but artifact-absent => constructed=False + the EXACT missing artifact named), a provenance validator, a sharpened disclosure matrix, and a structured W118FireConditionV1 (packaged / construction-provenance / cutoff); the script re-verifies the histogram against the SHA-pinned bb4c364f corpus (sha_ok + histogram_match), asserts the decision-CID-drift guard, and emits results/w117/upstream_construction/upstream_construction_verdict.json (result CID c3c60483; decision CID 258b6ed7 byte-identical to 
W114/W115/W116). graphify refreshed from HEAD at start (dcec243) + re-ingested the new module/script (77,252 nodes) + close; explain/path/affected/query used for file selection + dependency checks. NET: verdict re-derives NO_CERTIFIABLE_STRONGER_MODEL, $0 NIM, no pilot earned (needs BOTH a construction-admissible new instrument AND a non-redundant primary-KNOWN-cutoff stronger model -- neither exists; constructed=False; W118 fires_now=False); W89 + W105 STAND, W117 adds NO retirement + retires NO carry-forward; contamination-confound UNCHANGED (W117 tests construction supply, not the confound); reachability NOT re-probed (not the binding gate; W112 facts carried); COO-9 stays lead; COO-41; W118 fires the moment a packaged release_v7+ is admitted, OR an LCB-PUBLISHED post-v6 construction provenance (a dataset revision/commit/PR with post-2025-04 functional problems, OR a published collection pipeline + problem-id manifest) enables a B1+B2 reproducible >=30-problem post-cutoff slice, OR a reachable stronger-than-Maverick model discloses a primary-KNOWN cutoff month <= 2025-01. 1 new explicit-import-only module + 1 script + 13 new tests (+ 68 W113/W114/W115/W116 regression tests pass = 81 total); ZERO additions to coordpy/__init__.py; no version bump; no PyPI; 27th consecutive preflight/earn-discipline validation (W93-W117).",
"commits": [
"113c456"
],
"docs": [
"docs/RUNBOOK_W117.md",
"docs/RESULTS_W117_UPSTREAM_CONSTRUCTION_V1.md",
"docs/RESULTS_W117_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W117_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W117_V1.md",
"results/w117/upstream_construction/upstream_construction_verdict.json"
],
"linear_issues": [
"COO-41"
],
"carry_forwards_added": [
"W117-T-UPSTREAM-DERIVED-CONSTRUCTION-PIPELINE-SHIPS",
"W117-L-NO-CONSTRUCTION-ADMISSIBLE-POST-V6-INSTRUMENT-EIGHT-SURFACE-CAP",
"W117-T-LCB-CONSTRUCTION-PROVENANCE-IS-PACKAGED-RELEASE",
"W117-T-DEEPSEEK-V4-PRIMARY-PDF-RECONFIRMED-NO-CUTOFF"
],
"carry_forwards_retired": []
},
{
"id": "W118",
"title": "CoordPy-OWNED post-v6 functional-instrument CONSTRUCTION (LIVE, official Codeforces API) + deeper primary-source model-cutoff attack + durable construction/admission/pilot-readiness pipeline => 894 official post-v6 functional problem IDENTITIES constructed (identity axis SOLVED at scale), but the executable GRADER is ABSENT family-wide (Codeforces/AtCoder/LeetCode), NO certifiable stronger model ($0 NIM); W89+W105 stand; no retirement added",
"outcome": "Gated branch milestone with THREE lanes, NOT another exposed rerun, NOT a 'check if v7 exists yet' watch-and-wait, NOT another same-scale resistant reflexion rerun, NOT a bounded-context/compaction job. Executes docs/RUNBOOK_W118.md (locked BEFORE any NIM; the no-go branch is pre-committed by the rule => $0 spend is discipline). ultracode OFF; COO-9 stays lead; COO-42 is the W118 issue. W118 escalated W117 from proving no post-v6 instrument can be inherited to BUILDING one from official sources. Lane alpha (LIVE CoordPy-owned construction): a real constructor (coordpy.coordpy_frontier_functional_v1) ran LIVE against the official Codeforces API (contest.list + problemset.problems; raw-fetch SHA b6342fd1...), applying a total deterministic inclusion rule (PROGRAMMING ^ FINISHED ^ contest_date strictly after the release_v6 frontier 2025-04-05) => 894 admitted post-v6 functional problems from 11223 candidates, 2025-04-07..2026-05-30, 130 contests; manifest CID fb4185a6...; the IDENTITY axis (O1..O6) is SOLVED at scale (894 >> MIN_SLICE 30) (W118-T-COORDPY-OWNED-POST-V6-FUNCTIONAL-IDENTITY-CONSTRUCTIBLE). BUT the executable functional GRADER (O7) is ABSENT family-wide: Codeforces API exposes problem metadata only (no test field/endpoint); LeetCode hidden tests deliberately private; AtCoder system tests Dropbox-only (no official API); sample-only grading is non-credible and operator-synthesised tests are operator curation (refused by O5) => identity-admissible but NOT pilot-admissible (W118-L-OFFICIAL-SOURCE-FAMILY-NO-EXECUTABLE-GRADER-CAP). Maverick (KNOWN Aug-2024) has all 894 problems resistant on a genuinely-new instrument it never ran => C1^C2^C3^C4 identity-CERTIFIABLE, blocked ONLY by the missing grader (W118-T-MAVERICK-IDENTITY-CERTIFIABLE-GRADER-BLOCKED). 
Lane beta (deeper primary-source cutoff attack): DeepSeek V4 PDF re-fetched directly => NO CUTOFF STATED; Maverick 'August 2024' verbatim; Qwen3-Coder-480B + Mistral-Small-4 v26.03 NO CUTOFF STATED; GLM-5 NEWLY NOTED but UNKNOWN-from-primary + C2-exposed + reachability-unverified => nothing newly primary-disclosed since W117 (W118-T-GLM5-NEWLY-NOTED-UNCERTIFIABLE). Lane gamma (construction/admission/pilot-readiness pipeline): shipped coordpy.coordpy_frontier_functional_v1 (explicit-import-only; reuses the W113 registry + W114 certify_model_v1/LatestResistantInstrumentV1/STRONGER_MODEL_CANDIDATES + the W117 run_upstream_construction_v1 + the W116 disclosure types, no duplication; graphify-confirmed reuse edge run_frontier_functional_construction_v1 --calls--> run_upstream_construction_v1) + scripts/run_w118_frontier_functional_construction_v1.py: the O1..O7 instrument rule, the official-source-family grader registry, the deterministic manifest constructor + thin live fetch, the reused C1..C4 + O7 certification, the W118 disclosure matrix, and a structured W119FireConditionV1 with a falsifiability test (W118-T-FRONTIER-FUNCTIONAL-CONSTRUCTION-PIPELINE-SHIPS); the LCB-inherited verdict re-derives NO_CERTIFIABLE_STRONGER_MODEL (decision CID 258b6ed7..., byte-identical to W114/W115/W116/W117; result CID 3ab0d186...; artifacts results/w118/frontier_functional/frontier_functional_verdict.json + coordpy_frontier_functional_v1_manifest.json). graphify refreshed from HEAD at start (32d3498) + re-ingested the new module/script (77572 nodes) + close. 
NET: the blocker MOVED from W117's 'no post-v6 identities can be constructed' to W118's 'abundant official post-v6 identities (894), no official executable grader'; verdict NO_CERTIFIABLE_STRONGER_MODEL, $0 NIM, no pilot; W89 + W105 STAND, W118 adds NO retirement, retires NO carry-forward; contamination-confound UNCHANGED (W118 tests construction + grader supply, not the confound); reachability NOT re-probed (not the binding gate; W112 facts carried); COO-9 stays lead; W119 fires the moment an OFFICIAL executable per-problem test suite for >=30 post-v6 functional problems appears on a clean official surface (Maverick is already identity-certifiable => a grader alone unlocks the cheapest honest verdict-changing pilot), OR a packaged release_v7+ / LCB-published post-v6 construction provenance appears, OR a reachable stronger-than-Maverick model discloses a primary-KNOWN cutoff <= the manifest frontier. 1 new explicit-import-only module + 1 script + 15 new tests (+ 81 W113/W114/W115/W116/W117 regression tests pass = 96 total); ZERO additions to coordpy/__init__.py; no version bump; no PyPI; 28th consecutive preflight/earn-discipline validation (W93-W118).",
"commits": [
"1dbce7f"
],
"docs": [
"docs/RUNBOOK_W118.md",
"docs/RESULTS_W118_FRONTIER_FUNCTIONAL_CONSTRUCTION_V1.md",
"docs/RESULTS_W118_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W118_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W118_V1.md",
"results/w118/frontier_functional/frontier_functional_verdict.json",
"results/w118/frontier_functional/coordpy_frontier_functional_v1_manifest.json"
],
"linear_issues": [
"COO-42"
],
"carry_forwards_added": [
"W118-T-COORDPY-OWNED-POST-V6-FUNCTIONAL-IDENTITY-CONSTRUCTIBLE",
"W118-L-OFFICIAL-SOURCE-FAMILY-NO-EXECUTABLE-GRADER-CAP",
"W118-T-MAVERICK-IDENTITY-CERTIFIABLE-GRADER-BLOCKED",
"W118-T-FRONTIER-FUNCTIONAL-CONSTRUCTION-PIPELINE-SHIPS",
"W118-T-GLM5-NEWLY-NOTED-UNCERTIFIABLE"
],
"carry_forwards_retired": []
},
{
"id": "W119",
"title": "official ICPC public-package pivot DISSOLVES the W118 grader blocker (real non-LFS data/secret + output validators + 119 accepted refs; NIM-free grader self-test 16/16 PASS) => the executable functional grader W118 proved ABSENT family-wide is now PRESENT + verified on a genuinely-new post-Maverick-cutoff resistant ICPC battlefield; but the post-cutoff resistant pass-fail slice = 24 < MIN_RESISTANT_SLICE 30 blocks both slice-admissibility AND the reused C2 cert gate => 0 certifiable models, NO pilot ($0 NIM); W89+W105 stand; no retirement added",
"outcome": "Gated branch milestone with THREE lanes (alpha official ICPC public-package construction, beta primary-source stronger-model cutoff attack, gamma package-to-pilot pipeline/graphify/truth), NOT another LCB-packaging/provenance blocker memo, NOT another exposed rerun, NOT a bounded-context/compaction job. Executes docs/RUNBOOK_W119.md; no NIM spent (pilot not earned: 24<30), so the no-go is discipline by the rule. ultracode OFF; COO-9 stays lead; COO-43 is the W119 issue. W119 made the aggressive pivot W118 set up: off the grader-LESS LiveCodeBench source family (Codeforces/AtCoder/LeetCode) onto the official ICPC problem-package family (github.com/icpc) that already ships the executable grader. LANE alpha (LIVE official ICPC public-package construction): a real constructor (coordpy.coordpy_icpc_public_functional_v1; fetch_icpc_package_listing_v1 -> build_icpc_manifest_v1) enumerated the official ICPC GitHub org (33 repos) and applied a total deterministic inclusion rule (official ICPC package ^ dated ^ contest date strictly after Maverick's Aug-2024 cutoff ^ NOT interactive ^ ships a usable grader) => 24 admitted post-cutoff resistant pass-fail problems from 26 candidates across 2 official repos (Rocky Mountain Regional 2025-2026 created 2025-11-13 + 2024-2025 created 2024-12-03; excluded interactive 1 + custom-without-validator 1 + pre-cutoff 0); months 2024-12 (12) + 2025-11 (12); manifest CID 2b337377... The executable GRADER is PRESENT + EXECUTABLE (the W118 family-wide blocker DISSOLVED): the official ICPC packages ship real non-Git-LFS data/secret/*.in+*.ans (verified by direct content fetch) + 8 output validators + 119 accepted reference solutions; a NIM-free grader self-test ran official accepted Python solutions against the official secret cases = 16/16 cases PASS (videogames 8/8 + whattimeisitmrfox 8/8) => a real executable oracle (W119-T-OFFICIAL-ICPC-PACKAGE-FAMILY-SHIPS-EXECUTABLE-GRADER + W119-T-ICPC-GRADER-SELF-TEST-PASSES). 
The SLICE COUNT is the binding blocker, load-bearing at BOTH levels: 24 < 30 blocks slice-admissibility AND the reused W114 C2 cert gate (needs >=30 resistant after cutoff), so even KNOWN-cutoff Maverick is NOT identity-certifiable on the 24-slice, tier-2 C1-blocked (UNKNOWN cutoffs) => 0 certifiable models; the NWERC-2024 official static-package second surface 404s (W119-L-OFFICIAL-ICPC-RESISTANT-PASSFAIL-SLICE-COUNT-CAP). LANE beta (primary-source cutoff attack, reused W118 matrix): Maverick KNOWN Aug-2024; Qwen3-Coder-480B / DeepSeek-V4-pro / Mistral-Small-4 v26.03 / GLM-5 all NO CUTOFF STATED (primary); {KNOWN: 1, UNKNOWN: 4}; nothing newly primary-disclosed since W118; NVIDIA_API_KEY present => the no-pilot is a clean count-gate no-go, not reachability. LANE gamma (package-to-pilot pipeline): shipped coordpy.coordpy_icpc_public_functional_v1 (explicit-import-only; reuses the W113 registry + W114 certify_model_v1/LatestResistantInstrumentV1/STRONGER_MODEL_CANDIDATES + the W117 run_upstream_construction_v1 + the W116/W118 disclosure types, no duplication; graphify-confirmed reuse edges to run_upstream_construction_v1 + certify_model_v1) + scripts/run_w119_icpc_public_construction_v1.py: the P1..P8 rule, the official ICPC source-family grader registry, the deterministic manifest constructor + thin live GitHub-API fetch, a real fresh-subprocess stdin/stdout executor (run_icpc_stdin_executor_v1), the grader self-test, the reused C1..C4 + grader + slice certification, and a structured W120FireConditionV1 with 2 falsifiability tests; the LCB-inherited verdict re-derives NO_CERTIFIABLE_STRONGER_MODEL (decision CID 258b6ed7..., byte-identical to W114/W115/W116/W117/W118; result CID 577f7633...; artifacts results/w119/icpc_public/icpc_public_verdict.json + coordpy_icpc_public_functional_v1_manifest.json). graphify refreshed start + close (graph content includes the W119 module: coordpy_icpc_public_functional_v1 120 nodes). 
NET: the blocker MOVED from W118's 'abundant official identities, no official grader' to W119's 'official grader present + self-test-passing, +6 post-cutoff resistant pass-fail tasks short of a clean pilot'; verdict NO_CERTIFIABLE_STRONGER_MODEL, $0 NIM, no pilot; W89 + W105 STAND, W119 adds NO retirement + retires NO carry-forward; contamination-confound UNCHANGED; COO-9 stays lead; W120 fires the moment the post-cutoff resistant pass-fail count reaches 30 on a clean official surface (next official ICPC regional drop / clean official second-surface aggregation - grader already present, so a slice alone unlocks the cheapest honest verdict-changing Maverick pilot), OR a reachable stronger-than-Maverick model discloses a primary-KNOWN cutoff. 1 new explicit-import-only module + 1 script + 18 new tests (110 across W113-W119); ZERO additions to coordpy/__init__.py; no version bump; no PyPI; 29th consecutive preflight/earn-discipline validation (W93-W119).",
"commits": [
"73bf3e1"
],
"docs": [
"docs/RUNBOOK_W119.md",
"docs/RESULTS_W119_ICPC_PUBLIC_CONSTRUCTION_V1.md",
"docs/RESULTS_W119_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W119_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W119_V1.md",
"results/w119/icpc_public/icpc_public_verdict.json",
"results/w119/icpc_public/coordpy_icpc_public_functional_v1_manifest.json"
],
"linear_issues": [
"COO-43"
],
"carry_forwards_added": [
"W119-T-OFFICIAL-ICPC-PACKAGE-FAMILY-SHIPS-EXECUTABLE-GRADER",
"W119-T-ICPC-GRADER-SELF-TEST-PASSES",
"W119-L-OFFICIAL-ICPC-RESISTANT-PASSFAIL-SLICE-COUNT-CAP",
"W119-T-ICPC-PUBLIC-FUNCTIONAL-CONSTRUCTION-PIPELINE-SHIPS"
],
"carry_forwards_retired": []
},
{
"id": "W120",
"title": "closed the official-ICPC count gap (RMRC exclusion audit [draftlottery->float] + NEW official surface icpc/na-ecna-archive) => coordpy_icpc_battlefield_v1 = 45 tier-1 pure pass-fail >= MIN_RESISTANT_SLICE 30; grader self-test 165/165 each surface; certified Maverick (KNOWN Aug-2024, reachable; C2 flips 24->45); LOCKED the runbook + ran a clean canary, then RAN the EARNED pilot (Maverick x 30-slice, 330 NIM calls): A0 20.00 / A1 23.33 / B 23.33 %; B-A1 = +0.00 pp; MLB-1 83.33% PASS / MLB-2 8.00% FAIL; 6/9; verdict FAIL => bounded resistant ceiling HOLDS; resistant superiority 0 clean across FOUR settings; W114-W119 'no certifiable resistant instrument' escape CLOSED; NO third retirement (W89+W105 stand); contamination-confound STRENGTHENED not proven",
"outcome": "Gated branch milestone with THREE lanes (alpha official-ICPC multi-surface count-gap closure, beta primary-source stronger-model certification, gamma executable battlefield-to-pilot infra), executing docs/RUNBOOK_W120.md (LOCKED before any NIM). ultracode OFF; COO-9 stays lead; COO-44 is the W120 issue. NOT another '24<30' memo and NOT an exposed rerun. LANE alpha: Route alpha1 re-derived all 26 RMRC problems from each official problem.yaml+tree (clean correction: W119's draftlottery is float_relative_tolerance 1e-6 = tier-2 float, not pure pass-fail; W119 headline-24 holds, composition sharpened to 22 pure + 1 float + 1 custom-with-validator; exclusions confirmed load-bearing: poetictournament interactive, alwaysknowwhereyourtowelis custom-without-validator). Route alpha2 admitted a NEW official surface icpc/na-ecna-archive (NA East Division 2024-11-11 + 2025-11-10, 25 post-cutoff Kattis packages; grader self-test 6/6 Python-self-testable problems, 149/149 cases PASS incl. valleygulls 40/40 under a deterministic float oracle). Combined coordpy_icpc_battlefield_v1: 51 seen -> 49 admitted -> 45 tier-1 PURE pass-fail >=30 with margin, NO loosening (+3 float +1 custom = 49 gradeable; 2 typed exclusions; snapshot SHA b212866f; manifest CID bf55bb6c; 30-slice CID 01bf9ef8; combined grader self-test 165/165 each surface; dates 2024-11-11..2025-11-13). LANE beta: Maverick KNOWN Aug-2024 re-verified verbatim (Meta llama4 MODEL_CARD.md), reachable in the 118-model NIM catalogue => C2 flips 24->45 => first certifiable on this family; Qwen3-Coder-480B/DeepSeek-V4-pro/Mistral-Small-4/GLM-5 all UNKNOWN-from-primary; {KNOWN:1, UNKNOWN:4}; nothing newly disclosed since W119. 
LANE gamma: shipped coordpy.coordpy_icpc_battlefield_v1 (multi-surface aggregator + R1..R8 tiered admission + exclusion audit + combined manifest + deterministic float oracle + per-model certification + core-slice selector + W121FireConditionV1) + coordpy.icpc_reflexion_bench_v1 (stdin/stdout A0/A1/B; report-shape compatible with the VERBATIM W108 gate evaluator) + 2 scripts + 13 tests (2 falsifiability); reuses W119 executor + W114 certify_model_v1 + W117 run_upstream_construction_v1 => decision CID 258b6ed7 re-derives byte-identically. PILOT (earned + ran; runbook locked first; canary clean): Maverick x the deterministic tier-1 core 30-slice, 1x30xK5 = 330 NIM calls, ~54 min; grader = official secret cases (token-diff, NO LLM judge); reflexion feedback = public samples + judge verdict + stderr only. A0 20.00 / A1 23.33 / B 23.33 %; B-A1 = +0.00 pp; MLB-1 83.33% (25/30) PASS / MLB-2 8.00% (2/25) FAIL; Phase-2 6/9; verdict FAIL -> BOUNDED_CEILING_HOLDS_ON_RESISTANT_ICPC (reflexion invoked but net-zero: 2 rescues - 1 regression). NET: NO third retirement (W89 +5.56pp + W105 +7.00pp STAND, the only two, contamination-EXPOSED-HumanEval-family @ 70B); resistant superiority now 0 CLEAN across FOUR settings (W108 -3.33 / W110 +0.00 / W113 +0.00 / W120 +0.00); the W114-W119 'no certifiable resistant instrument' escape is CLOSED (built it, certified Maverick, ran it, mechanism did not transfer); contamination-confound STRENGTHENED (4th + cleanest resistant null on a >=30 grader-clean OFFICIAL battlefield) but NOT proven (single seed 120001; ICPC-vs-HumanEval difficulty + Python-TLE floor unexcluded). W121 = accept the bounded resistant ceiling / genuinely different axis (optional cheap multi-seed) OR a reachable stronger-than-Maverick model with a primary-KNOWN cutoff (prefer strongest honest target on this same >=30 battlefield). 30th consecutive preflight/earn-discipline validation (W93-W120). No version bump; no PyPI; coordpy/__init__.py untouched.",
"commits": [
"0e146c4"
],
"docs": [
"docs/RUNBOOK_W120.md",
"docs/RESULTS_W120_ICPC_BATTLEFIELD_V1.md",
"docs/RESULTS_W120_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W120_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W120_V1.md",
"results/w120/icpc_battlefield/battlefield_verdict.json",
"results/w120/icpc_battlefield/battlefield_snapshot.json",
"coordpy/coordpy_icpc_battlefield_v1.py",
"coordpy/icpc_reflexion_bench_v1.py",
"scripts/run_w120_icpc_battlefield_v1.py",
"scripts/run_w120_icpc_pilot.py",
"tests/test_w120_icpc_battlefield_v1.py"
],
"linear_issues": [
"COO-44"
],
"carry_forwards_added": [
"W120-T-OFFICIAL-ICPC-MULTI-SURFACE->=30-BATTLEFIELD-CONSTRUCTIBLE",
"W120-T-MAVERICK-CERTIFIABLE-ON->=30-OFFICIAL-ICPC",
"W120-L-RESISTANT-SUPERIORITY-0-CLEAN-ON-OFFICIAL-ICPC->=30-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W121",
"title": "matched EXPOSED official-ICPC control on the SAME package family as W120 (RMRC 2021 + ECNA 2022-2023 + RMRC 2022-2023 + ECNA 2023-2024, pre-Aug-2024) + Maverick same-model same-mechanism contrast RAN => coordpy_icpc_exposed_control_v1 = 42 tier-1 pure pass-fail >= 30; grader self-test 30 all-pass / 637 official secret cases each surface; Maverick EXPOSED-certifiable (W114 gate C2->C2e); LOCKED runbook + clean canary, then RAN the EARNED pilot (Maverick x exposed 30-slice, 330 NIM calls): A0 6.67 / A1 26.67 / B 30.00 %; B-A1 = +3.33 pp; MLB-1 93.33% PASS / MLB-2 25.00% FAIL; 8/9; verdict FAIL => the matched-family exposure flip did NOT reproduce the retirement-grade HumanEval-family margins (+5.56/+7.00) => contamination-confound WEAKENED, difficulty/family-ease implicated, bounded ceiling HARDENS to HumanEval-family-(ease/structure)-specific @ 70B; NOT refuted (faint sub-floor exposure-consistent gradient; single seed); NO third retirement (W89+W105 stand); paired seed NOT earned",
"outcome": "Gated branch milestone with THREE lanes executing docs/RUNBOOK_W121.md (LOCKED before any NIM; the W120 section-10 FAIL-branch 'genuinely different axis'). Removed the last loophole on the W120 resistant null by holding family+difficulty FIXED and flipping ONLY exposure. LANE alpha: SAME two official github.com/icpc families W120 used (na-ecna-archive + na-rocky-mountain-*-public), pre-cutoff editions of the same regionals; byte-identical R1/R2/R4..R8 with the date rule FLIPPED (E3: date <= 2024-08-31 = EXPOSED) => 50 seen, 48 admitted, 42 tier-1 pure pass-fail >= 30 (+5 float +1 custom; 2 custom-no-validator excluded); SHA 653e3682; manifest CID 8acbc7cc; 30-slice CID 32d15db5; grader self-test 30 all-pass / 637 cases each surface; RMRC 2023-2024 excluded (typed: minimal package, no shipped problem_statement). LANE beta: W114 gate with C2->C2e (>=30 problems at/before cutoff) certifies Maverick on the EXPOSED side; tier-2 all UNKNOWN; EARNED pilot RAN (Maverick x exposed 30-slice, 1 seed x 30 x K=5 = 330 NIM calls, ~70 min): A0 6.67 / A1 26.67 / B 30.00 %; B-A1 = +3.33 pp; MLB-1 93.33% PASS / MLB-2 25.00% FAIL; 8/9; FAIL (net +1 problem: rescues rsamistake+isbnconversion, regression icouldhavewon; bench merkle 618e270a). LANE gamma: shipped coordpy.coordpy_icpc_exposed_control_v1 (reuses W120 classifier/oracle/slice-selector + W114 cutoff registry + W117 run_upstream_construction_v1 => decision CID 258b6ed7 invariant) + MatchedFamilyComparisonV1 (differs_only_in_cutoff_side=True; same org/format/grader/tiers/difficulty/model) + the pre-committed three-branch interpret_exposed_vs_resistant_v1 + W122FireConditionV1 + 3 scripts + 17 tests. 
THE ANSWER: EXPOSED +3.33pp vs RESISTANT +0.00pp (W120) both within the +-3.34pp null band => the contamination hypothesis predicted a clean exposed margin (absent), difficulty/family-ease predicted a null even when exposed (matched) => contamination-confound WEAKENED (first within-family within-model exposure control), difficulty/family-ease implicated, bounded ceiling HARDENS; NOT refuted (faint sub-floor gradient: exposed +3.33 > resistant +0.00, exposed rescue 25% > resistant 8%; single seed); difficulty comparability supported (exposed A0 6.67% <= resistant A0 20.00%). NO third retirement (W89+W105 STAND); paired seed NOT earned (null-side of band; W106 discipline); COO-9 lead; COO-45; W122 = accept hardened bounded ceiling / different axis OR primary-KNOWN stronger-than-Maverick on BOTH battlefields OR (optional) one paired seed on BOTH. 31st consecutive preflight/earn-discipline validation (W93-W121). No version bump; no PyPI; coordpy/__init__.py untouched.",
"commits": [
"d49237a"
],
"docs": [
"docs/RUNBOOK_W121.md",
"docs/RESULTS_W121_EXPOSED_CONTROL_V1.md",
"docs/RESULTS_W121_MILESTONE_SUMMARY_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W121_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W121_V1.md",
"results/w121/exposed_control/exposed_control_verdict.json",
"results/w121/exposed_control/exposed_listing_live.json",
"coordpy/coordpy_icpc_exposed_control_v1.py",
"scripts/build_w121_exposed_listing_v1.py",
"scripts/run_w121_exposed_control_v1.py",
"scripts/run_w121_exposed_pilot.py",
"tests/test_w121_exposed_control_v1.py"
],
"linear_issues": [
"COO-45"
],
"carry_forwards_added": [
"W121-T-MATCHED-EXPOSED-ICPC-CONTROL-CONSTRUCTIBLE",
"W121-T-MAVERICK-EXPOSED-CERTIFIABLE",
"W121-L-MATCHED-FAMILY-EXPOSURE-DOES-NOT-REPRODUCE-MARGIN-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W122",
"title": "matched-family 3-seed paired-seed CLOSURE on official ICPC (seeds 120002 + earned tiebreaker 120003 on BOTH fields; SAME W120 resistant 30-slice CID 01bf9ef8 + W121 exposed 30-slice CID 32d15db5; Maverick; only the seed changed; 1320 NIM calls) => FINAL 3-seed aggregate AMBIGUOUS_THIRD_PAIRED_SEED_EARNED (B4): seed 120003 spiked +10.00pp on BOTH fields (PASS_NON_MECHANISM_DRIVEN) => 3-seed means RESISTANT +4.44pp (OUT of +-3.34 band, in the 3.34-5.00 gap) / EXPOSED +8.89pp (OUT of band); both all_seeds_clean_pass=false, neither shows_margin => B1/B2/B3 all off; closure NOT achieved (the 2-seed 'resistant null vs exposed popped' asymmetry DISSOLVED -- resistant ALSO spiked -- so both fields show rescue-concentrated non-mechanism variance and the matched contrast is unresolvable at n=30); the single-seed caveat is retired (3 seeds/side) but REPLACED by a small-n-variance limitation; M3 KILLED NIM-free (m3_exclusive_signal_fraction=0.000; ICPC secret token-diff => ceiling mechanism-robust); stronger-model gate STRUCTURALLY CLOSED; NO third retirement (W89+W105 STAND); no 4th seed",
"outcome": "Gated branch milestone with THREE lanes executing docs/RUNBOOK_W122.md (LOCKED before any NIM; the W121 section-9 CONFOUND_WEAKENS branch's optional paired-seed tightening, PROMOTED to the main empirical lane). ultracode OFF; COO-9 stays lead; COO-46 is the W122 issue. LANE alpha (matched-family multi-seed closure): reused the EXACT W120 resistant 30-slice + W121 exposed 30-slice (re-derived NIM-free == provenance == pilot guard); Maverick x BOTH x seeds 120002+120003 (1320 calls). Per-seed B-A1: RESISTANT 120001 +0.00 FAIL / 120002 +3.33 FAIL / 120003 +10.00 PASS_NON_MECHANISM_DRIVEN (9/9, MLB-2 16.7%, 3 rescues/0 regr, merkle adf55ff9); EXPOSED 120001 +3.33 FAIL / 120002 +13.33 PASS_NON_MECHANISM_DRIVEN / 120003 +10.00 PASS_NON_MECHANISM_DRIVEN (9/9, MLB-2 18.5%, 3 rescues/0 regr, merkle d88d025f). FINAL 3-seed means RESISTANT +4.44 / EXPOSED +8.89, both out of band, neither all-clean-per-seed => interpret_paired_closure_v1 emits AMBIGUOUS_THIRD_PAIRED_SEED_EARNED (B4), caveat_closed=false; B4-after-the-3rd-seed is terminal (no 4th). The driver was fixed first so the 3-seed aggregate gathers ALL prior seeds keyed by seed (NIM-free verified). CLOSURE NOT achieved: the 3rd seed dissolved the 2-seed asymmetry (resistant also spiked +10.00); at n=30/K=5 the per-field B-A1 swings +-10pp on ~3 rescues => the matched contrast is unresolvable at n=30; the W121 'weakened' read is NOT multi-seed-confirmed and contamination is NOT established (the 3-seed data UNDERCUTS a clean contamination read: contamination-RESISTANT code spikes just like exposed => sampling variance, not exposure; faint exposed>resistant +8.89>+4.44 is exposure-consistent but non-mechanism-driven/noisy). 
LANE beta (same-family different-mechanism M3): NIM-free audit (audit_icpc_mechanism_signal_v1) over the real W120+W121 sidecars (240 reflexion turns) => m3_exclusive_signal_fraction=0.000 < 0.33 floor => KILL_M3_LANE_NIM_FREE (ICPC hidden oracle is SECRET token-diff, denying M3 its expected/actual differentiator => same-family ceiling mechanism-robust, not merely reflexion-specific); $0. LANE gamma (stronger-model gate): resistant field anchored to Maverick Aug-2024 => only a primary-KNOWN cutoff <= ~Aug-2024 could be resistant-certified, Maverick the unique reachable such model => gate STRUCTURALLY CLOSED; {KNOWN:1, UNKNOWN:4}; decision CID 258b6ed7 invariant; $0. NO third retirement (W89+W105 STAND, the only two); W122 adds none. 1 new explicit-import-only module + 2 scripts + 17 tests (falsifiability-first); fixed a W121 doc CID typo (f7cdc917->01bf9ef8); graphify refreshed START (6b05ccd2) + END; no version bump; no PyPI; coordpy/__init__.py untouched; 32nd consecutive preflight/earn-discipline validation (W93-W122). W123 = accept the bounded HumanEval-family ceiling / escalate to larger n PER FIELD (>=100/field, not more n=30 seeds, no 4th) OR a primary-KNOWN stronger-than-Maverick model on BOTH matched ICPC battlefields. (2026-05-31)",
"commits": [
"45e08ea"
],
"docs": [
"docs/RUNBOOK_W122.md",
"docs/RESULTS_W122_PAIRED_SEED_CLOSURE_V1.md",
"docs/RESULTS_W122_MILESTONE_SUMMARY_V1.md",
"docs/RESULTS_W122_MECHANISM_AUDIT_V1.md",
"docs/CONTAMINATION_CONTROL_FRAMING_W122_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W122_V1.md",
"results/w122/paired_seed/w122_paired_seed_120003_20260531T184401Z/paired_seed_closure_verdict.json",
"results/w122/mechanism_audit/mechanism_audit_verdict.json",
"coordpy/coordpy_icpc_paired_seed_closure_v1.py",
"scripts/run_w122_paired_seed_pilot.py",
"scripts/run_w122_mechanism_signal_audit_v1.py",
"tests/test_w122_paired_seed_closure_v1.py"
],
"linear_issues": [
"COO-46"
],
"carry_forwards_added": [
"W122-T-PAIRED-SEED-CLOSURE-RULE-PRECOMMITTED",
"W122-T-PAIRED-SEED-3SEED-AMBIGUOUS-B4",
"W122-T-BOTH-FIELDS-NONMECHANISM-SPIKE",
"W122-T-THIRD-PAIRED-SEED-RAN-RESOLVED-B4",
"W122-T-ICPC-CEILING-MECHANISM-ROBUST",
"W122-T-STRONGER-MODEL-GATE-STRUCTURALLY-CLOSED",
"W122-L-NO-THIRD-RETIREMENT-CAP",
"W122-L-MATCHED-CONTRAST-UNRESOLVABLE-AT-N30-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W123",
"title": "large-n matched ICPC closure ATTEMPT (>=100/field): official-family supply census",
"outcome": "alpha-CAP — the >=100/field matched battlefield is UNREACHABLE from the official github.com/icpc family, blocked SOLELY on the post-cutoff axis. Lane-alpha (NIM-free, live gh-API-verified all_match=true, census_cid bf7b2efbc724): RESISTANT 4 post-cutoff surfaces 51 raw/~45 tier-1 (51==W120 n_seen; all mined by W120; no 5th; <100 even at 100% yield = HARD CAP) vs EXPOSED 11 pre-cutoff surfaces 135 raw/~113 tier-1 (REACHES >=100). Lane-beta NOT earned (needs BOTH>=100) => $0 NIM, no pilot, no n=30 seeds. Lane-gamma stronger-model gate CLOSED+MOOT {KNOWN:1,UNKNOWN:4}; decision CID 258b6ed7 invariant. n=30 caveat now POST-CUTOFF-SUPPLY-BOUND (W123-L). W89+W105 STAND (only two retirements). No version bump; no PyPI; coordpy/__init__.py untouched. Self-caught + corrected a phantom-output census-count error mid-milestone.",
"commits": [
"c023599",
"388032b"
],
"docs": [
"docs/RUNBOOK_W123.md",
"docs/RESULTS_W123_LARGEN_SUPPLY_CENSUS_V1.md"
],
"linear_issues": [
"COO-48"
],
"carry_forwards_added": [
"W123-L-LARGEN-MATCHED-BATTLEFIELD-SUPPLY-UNREACHABLE",
"W123-T-STRONGER-MODEL-GATE-CLOSED-AND-MOOT"
],
"carry_forwards_retired": []
},
{
"id": "W124",
"title": "transformer-native code-intervention line on matched ICPC (M4 AST-boundary hidden-state probe on distilgpt2) + learned-memory/controller line + hosted gate",
"outcome": "M4_CLOSE_BLIP_NOT_A_GAIN — distilgpt2 AST-boundary hidden state adds nothing over surface (AUC_hidden 0.6345 ≈ AUC_surface 0.6343, Δ=+0.0001 ≪ +0.05); transformer-native hidden-state line BLOCKED AT THE PRECURSOR on local code-model-encoder supply (no code-competent local model; tf 4.28.1 too old + repo transformers_runtime_v1 needs ≥~4.36); learned-memory controller at chance 0.502 (TOO_SYNTHETIC_NOT_WARRANTED); stronger-model gate CLOSED, decision CID 258b6ed7 invariant {KNOWN:1,UNKNOWN:4}; M6 tool-substrate controller contract shipped (text-translatable, contract-only); NO hosted Maverick probe earned ($0 NIM); W89+W105 STAND (no third retirement); ultracode OFF; no version bump; no PyPI; coordpy/__init__.py untouched",
"commits": [
"e3348fd"
],
"docs": [
"docs/RUNBOOK_W124.md",
"docs/RESULTS_W124_TRANSFORMER_NATIVE_CODE_INTERVENTION_V1.md"
],
"linear_issues": [
"COO-49"
],
"carry_forwards_added": [
"W124-L-TRANSFORMER-NATIVE-HIDDEN-STATE-ADDS-NOTHING-OVER-SURFACE",
"W124-L-TRANSFORMER-NATIVE-LINE-BLOCKED-AT-PRECURSOR-LOCAL-ENCODER-SUPPLY",
"W124-L-LEARNED-MEMORY-CONTROLLER-AT-CHANCE-ON-ICPC-RESCUE",
"W124-T-M6-TOOL-SUBSTRATE-CONTROLLER-CONTRACT-SHIPS",
"W124-T-STRONGER-MODEL-GATE-CLOSED-AND-HOSTED-MOOT",
"W124-L-NO-THIRD-RETIREMENT-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W125",
"title": "hosted controller-native code mechanism on the official RESISTANT ICPC family (router/logprob-router/cache-aware-planner v12 + tool_call_substrate_v1 + executor_grounded_patcher_v1) + resistant-first $0 headroom replay + stronger-model gate",
"outcome": "controller arsenal is REAL + contract-clean (mechanism question YES) but the resistant field is GENERATION-CAPPED for $0 re-routing (spend question NOT earned). Lane alpha: controller_native_code_mechanism_v1 (first bridge of the hosted-controller stack to the audited tool-call substrate; graphify-confirmed 6 hops apart, no edge); structural fake-different test BITES (reflexion B + C0 control => FAKE_DIFFERENT; C1/C2/C3 => REAL; lead C3 cid 7989655f); all 4 NIM-free contract checks PASS. Lane beta ($0 replay over 330 already-paid Maverick gens on the W120 resistant 30-slice CID 01bf9ef8): A1 7/30, pool-union 8/30, oracle_headroom +1 (in null band), C2 7/30 + C3 7/30 => blind_selection_headroom=0; reflexion_divergence 23/30 but looks_right_fails_hidden 10 (public-sample signal non-discriminating) => FRESH_RESISTANT_PILOT_NOT_EARNED_HEADROOM_CAP; $0 NIM; exposed control NOT bought (resistant-first). Lane gamma: NO_CERTIFIABLE_STRONGER_MODEL, decision CID 258b6ed7 invariant, {KNOWN:1,UNKNOWN:4}; local transformer-native line CLOSED. W89+W105 STAND (no third retirement). ultracode OFF; no version bump; no PyPI; coordpy/__init__.py untouched",
"commits": [
"97232eb"
],
"docs": [
"docs/RUNBOOK_W125.md",
"docs/RESULTS_W125_CONTROLLER_NATIVE_CODE_MECHANISM_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W125_V1.md"
],
"linear_issues": [
"COO-50"
],
"carry_forwards_added": [
"W125-T-CONTROLLER-NATIVE-MECHANISM-REAL-NOT-FAKE-DIFFERENT",
"W125-L-RESISTANT-GENERATION-CAP",
"W125-T-RESISTANT-PUBLIC-SAMPLE-SIGNAL-NON-DISCRIMINATING",
"W125-L-NO-THIRD-RETIREMENT-CAP"
],
"carry_forwards_retired": []
},
{
"id": "W126",
"title": "family-adapted repair SYNTHESIS on the official RESISTANT ICPC family (adversarial_consensus_repair_v1.trust_weighted_consensus_v1 + executor_grounded_patcher_v1 digest bridged onto the official-ICPC code path) + resistant-first $0 synthesis precursor + stronger-model gate",
"outcome": "synthesis slate REAL + arsenal-native + contract/leakage-clean (mechanism question YES) but the resistant field is SYNTHESIS-CAPPED for $0 NEW-trajectory creation (oracle ceiling 0/22; spend question NOT earned); FRESH_RESISTANT_PILOT_NOT_EARNED_SYNTHESIS_DEAD; $0 NIM; W89+W105 STAND (no third retirement)",
"commits": [
"01b456a"
],
"docs": [
"docs/RUNBOOK_W126.md",
"docs/RESULTS_W126_FAMILY_ADAPTED_REPAIR_SYNTHESIS_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W126_V1.md"
],
"linear_issues": [
"COO-51"
],
"carry_forwards_added": [
"W126-L-RESISTANT-SYNTHESIS-CAP",
"W126-T-SYNTHESIS-OF-CAPABILITY-FAILURES-IS-DEAD-NOT-BLIND-CAPPED"
],
"carry_forwards_retired": []
},
{
"id": "W127",
"title": "resistant capability atlas + family-specific algorithm-scaffold GENERATION + targeted resistant fresh-generation probe (operator-greenlit dev spend; coordpy.resistant_capability_atlas_v1 + coordpy.family_scaffold_generation_v1 wired onto the official-ICPC path under a no-leakage rule)",
"outcome": "atlas (22 resistant failures = 95% wrong-algorithm + algorithmically DIVERSE, no dominant cluster, surface labels 47% concordant) + scaffold line REAL; EXPOSED dev bench EARNED (scaffold 3/8 vs baseline 1/8, net +2 / 2 families / 0 regressions / leakage-clean after a documented boilerplate-FP recalibration to a contiguous-block tripwire, positive control preserved) but the earned R1 and R2 targeted resistant probe = 0/6 new solves => RESISTANT_SCAFFOLD_FRESH_GEN_CAP (the exposed +2 did NOT transfer to the non-memorizable resistant field; most parsimoniously memorization/variance/framing, not scaffold-taught algorithm); stronger-model gate CLOSED 258b6ed7 {KNOWN:1,UNKNOWN:4}; exposed control NOT bought; $0 beyond authorized dev(80)+probe(30)=110 NIM; W89+W105 STAND (no third retirement); COO-9 lead; no version bump; no PyPI; coordpy/__init__.py untouched",
"commits": [
"95f55a8"
],
"docs": [
"docs/RUNBOOK_W127.md",
"docs/RESULTS_W127_CAPABILITY_ATLAS_AND_SCAFFOLD_GENERATION_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W127_V1.md"
],
"linear_issues": [
"COO-52"
],
"carry_forwards_added": [
"W127-L-RESISTANT-SCAFFOLD-FRESH-GEN-CAP",
"W127-T-RESISTANT-FAILURES-ARE-DIVERSE-WRONG-ALGORITHM-NOT-SURFACE",
"W127-T-EXPOSED-SCAFFOLD-DEV-BENCH-EARNS-WEAKLY-CONFOUNDED"
],
"carry_forwards_retired": []
},
{
"id": "W128",
"title": "role-diverse algorithm SEARCH on the non-scaffoldable resistant ICPC clusters + same-family hard-cluster dev bench + targeted resistant probe (operator-greenlit EXPOSED dev spend; coordpy.role_diverse_algorithm_search_v1 bridging the W41/W42 synthesis decisions + the executor digest onto the official-ICPC path under a no-leakage rule)",
"outcome": "mechanism REAL (all 11 EXPOSED hard-cluster dev runs classify genuine REAL-diversity after a documented sketch-parser-bug recalibration + clean re-run) and the diverse SEARCH LIFTS the generation ceiling (pool 3/11 > plain baseline 2/11, reaching a simulation_grid program i.i.d. sampling missed) BUT NOT EARNED: RDA4 committed 2/11, net +0 (=+1 unique blueberrywaffle - 1 regression pawnshop), ties baseline + the W127 scaffold line => ROLE_DIVERSE_HARD_CLUSTER_DEV_BENCH_NOT_EARNED; the cap is localized to the verification-based SELECTION layer (abstains 7/11, mis-commits the one pool-only win), NOT generation (W128-T-ROLE-DIVERSE-SEARCH-LIFTS-GENERATION-CEILING-BUT-SELECTION-CAPPED). T1 FALSE => targeted resistant probe NOT launched (T2 technically matches via the simulation_grid unique solve but T1 fails the net bar) => $0 resistant NIM; exposed control NOT bought; stronger-model gate CLOSED 258b6ed7 {KNOWN:1,UNKNOWN:4}; W79 substrate-controller literal bridge examined + KILLED as fake (W128-T-SUBSTRATE-CONTROLLERS-NOT-CODE-CONSENSUS-APPLICABLE); graph_flow EXPOSED supply 0 (W128-L-GRAPH-FLOW-EXPOSED-SUPPLY-CAP); 360 EXPOSED dev calls (30 validation + 165 parser-bug-invalidated + 165 valid), $0 resistant; W89+W105 STAND (no third retirement); COO-9 lead; no version bump; no PyPI; coordpy/__init__.py untouched",
"commits": [
"03501ef"
],
"docs": [
"docs/RUNBOOK_W128.md",
"docs/RESULTS_W128_ROLE_DIVERSE_ALGORITHM_SEARCH_V1.md",
"docs/FRONTIER_RELEVANCE_AUDIT_W128_V1.md"
],
"linear_issues": [
"COO-53"