diff --git a/.github/scripts/test_ci_workflow.py b/.github/scripts/test_ci_workflow.py index 4f1971e..46de6f0 100644 --- a/.github/scripts/test_ci_workflow.py +++ b/.github/scripts/test_ci_workflow.py @@ -51,6 +51,7 @@ def test_ci_workflow_guard_is_run_by_ci(self) -> None: self.assertIn("python3 .github/scripts/test_execution_status.py", text) self.assertIn("python3 .github/scripts/test_roadmap_status.py", text) self.assertIn("python3 .github/scripts/test_milestone_b_closeout_record.py", text) + self.assertIn("python3 .github/scripts/test_milestone_b_exit_checklist.py", text) if __name__ == "__main__": diff --git a/.github/scripts/test_milestone_b_exit_checklist.py b/.github/scripts/test_milestone_b_exit_checklist.py new file mode 100644 index 0000000..07bc3f9 --- /dev/null +++ b/.github/scripts/test_milestone_b_exit_checklist.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# +# Copyright 2026 The Ethos maintainers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
# Guard tests for the Milestone B internal exit checklist.
#
# The tests pin the wording of docs/milestone-b-exit-checklist.md and the link
# to it from docs/roadmap.md, so the closeout document cannot silently lose its
# validation commands, lane rows, or "remains blocked" boundary statements.

from __future__ import annotations

import re
import unittest
from pathlib import Path


# parents[0] is the directory holding this script; parents[2] is two levels
# above it, which is the repository root when the script lives in
# .github/scripts/.
ROOT = Path(__file__).resolve().parents[2]
CHECKLIST = ROOT / "docs/milestone-b-exit-checklist.md"
ROADMAP = ROOT / "docs/roadmap.md"


def checklist_text() -> str:
    """Return the checklist markdown exactly as stored, decoded as UTF-8.

    Raises:
        OSError: if the checklist file cannot be read (e.g. it is missing).
    """
    return CHECKLIST.read_text(encoding="utf-8")


def normalized_checklist_text() -> str:
    """Return the checklist with every whitespace run collapsed to one space.

    Collapsing newlines lets a test match a sentence even when the markdown
    source wraps it across several lines.
    """
    return re.sub(r"\s+", " ", checklist_text())


class MilestoneBExitChecklistTests(unittest.TestCase):
    """Static content guards for the exit checklist and its roadmap link."""

    def _assert_each_in(self, phrases: tuple[str, ...], text: str) -> None:
        """Assert every phrase occurs in ``text``, one sub-test per phrase.

        Each phrase is checked inside its own ``subTest`` so a single run
        reports all missing phrases instead of stopping at the first one.
        """
        for phrase in phrases:
            with self.subTest(phrase=phrase):
                self.assertIn(phrase, text)

    def test_roadmap_links_to_checklist(self) -> None:
        """The roadmap must contain the markdown link to the checklist."""
        text = ROADMAP.read_text(encoding="utf-8")

        self.assertIn("[13-B exit checklist](milestone-b-exit-checklist.md)", text)

    def test_checklist_names_current_validation_commands(self) -> None:
        """The checklist must spell out each validation command verbatim."""
        # Commands are matched against the raw text: they are expected to
        # appear unbroken, exactly as they would be typed.
        self._assert_each_in(
            (
                "make milestone-b-internal-checks PYTHON=/bin/python",
                "make verify-alpha",
                "make layout-evaluator-alpha",
                "make python-surface-test",
            ),
            checklist_text(),
        )

    def test_checklist_covers_internal_b_lanes(self) -> None:
        """Every internal Milestone B lane name must appear in the checklist."""
        self._assert_each_in(
            (
                "WS-VERIFY-ALPHA",
                "WS-LAYOUT",
                "WS-SURFACES",
                "WS-HARNESS",
                "DETERMINISM",
            ),
            checklist_text(),
        )

    def test_checklist_keeps_public_boundaries_explicit(self) -> None:
        """The checklist must keep its public-boundary statements."""
        # Prose phrases are matched against whitespace-normalized text so that
        # re-wrapping a paragraph does not break the guard.
        self._assert_each_in(
            (
                "does not approve public benchmark reports",
                "release artifacts",
                "package publication",
                "production positioning",
                "Performance/quality/footprint claims remain blocked",
                "Table-quality and parser-quality claims remain blocked",
            ),
            normalized_checklist_text(),
        )

    def test_checklist_does_not_claim_broader_scope(self) -> None:
        """The checklist must keep its explicit "not claimed" sentences."""
        # These are full prose sentences, so they are matched against the
        # normalized text as well, consistent with the boundary test above;
        # any phrase found in the raw text is also found in the normalized one.
        self._assert_each_in(
            (
                "No semantic/arithmetic verification expansion is claimed.",
                "No broader parser/table/OCR completion is claimed.",
            ),
            normalized_checklist_text(),
        )


if __name__ == "__main__":
    unittest.main()
a/.github/scripts/test_milestone_b_internal_checks.py b/.github/scripts/test_milestone_b_internal_checks.py index c91a6cd..776913b 100644 --- a/.github/scripts/test_milestone_b_internal_checks.py +++ b/.github/scripts/test_milestone_b_internal_checks.py @@ -63,6 +63,7 @@ def test_target_composes_current_internal_gates(self) -> None: "$(PYTHON) .github/scripts/test_execution_status.py", "$(PYTHON) .github/scripts/test_roadmap_status.py", "$(PYTHON) .github/scripts/test_milestone_b_closeout_record.py", + "$(PYTHON) .github/scripts/test_milestone_b_exit_checklist.py", "$(MAKE) verify-alpha PYTHON=$(PYTHON)", "$(MAKE) layout-evaluator-alpha PYTHON=$(PYTHON)", "$(MAKE) python-surface-test PYTHON=$(PYTHON)", diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ba22e3b..54a883c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,6 +64,8 @@ jobs: run: python3 .github/scripts/test_roadmap_status.py - name: Milestone B closeout validation record tests run: python3 .github/scripts/test_milestone_b_closeout_record.py + - name: Milestone B exit checklist tests + run: python3 .github/scripts/test_milestone_b_exit_checklist.py - name: Gate Zero harness tests run: python3 benchmarks/harness/test_run_gate_zero.py - name: same-platform double-parse byte-diff diff --git a/Makefile b/Makefile index 6c1b324..3788e4d 100644 --- a/Makefile +++ b/Makefile @@ -54,6 +54,7 @@ milestone-b-internal-checks: $(PYTHON) .github/scripts/test_execution_status.py $(PYTHON) .github/scripts/test_roadmap_status.py $(PYTHON) .github/scripts/test_milestone_b_closeout_record.py + $(PYTHON) .github/scripts/test_milestone_b_exit_checklist.py $(MAKE) verify-alpha PYTHON=$(PYTHON) $(MAKE) layout-evaluator-alpha PYTHON=$(PYTHON) $(MAKE) python-surface-test PYTHON=$(PYTHON) diff --git a/docs/milestone-b-exit-checklist.md b/docs/milestone-b-exit-checklist.md new file mode 100644 index 0000000..f896427 --- /dev/null +++ b/docs/milestone-b-exit-checklist.md @@ -0,0 +1,48 @@ 
+# Milestone B Internal Exit Checklist + +This checklist maps the current committed source tree to the Milestone B lanes in +`docs/IMPLEMENTATION_PLAN.md` and the PRD's Milestone B section. + +Status: Internal Milestone B closeout validation is green for the current committed scope. +Milestone exit remains a decider call. This checklist does not approve public benchmark reports, +release artifacts, package publication, production positioning, or performance/quality/footprint +claims. + +## Required Internal Validation + +Run the aggregate closeout target from a clean tree: + +```sh +make milestone-b-internal-checks PYTHON=/bin/python +``` + +The target includes fixture validation, font-policy validation, status/roadmap/closeout-record +guards, `make verify-alpha`, `make layout-evaluator-alpha`, `make python-surface-test`, claim +language guardrails, public-readiness guardrails, and `git diff --check`. + +## Exit Evidence + +| Lane | Internal B criterion | Current evidence | Closeout status | Still outside scope | +| --- | --- | --- | --- | --- | +| WS-VERIFY-ALPHA | Alpha verification over native Ethos JSON and OpenDataLoader-style grounding sources, with deterministic evidence matching and capability-aware reports | `make verify-alpha`; native, synthetic OpenDataLoader-style, and pinned real OpenDataLoader fixtures; split-quote, stale-fingerprint, non-v1, capability-limited, malformed-input, and summary diagnostics coverage | Present for current v1 alpha policy | Future claim-kind expansion, `verify_citations` v1 hardening, broader adapter shapes, semantic/arithmetic verification | +| WS-LAYOUT | Reading order, block grouping, heading/list alpha behavior, and Markdown/text export fixtures | `make layout-evaluator-alpha`; fixture metadata and committed extraction/layout/text/Markdown goldens | Present for current fixture-backed alpha scope | Broader table, nested-list, richer heading, OCR/image-only, and wider layout semantics | +| WS-SURFACES | Internal Python 
surface scaffold for local CLI-backed parsing calls | `make python-surface-test`; stdlib tests with a fake caller-provided `ethos` command | Present as internal scaffold | Native bindings, wheel publication, package setup, public API stability | +| WS-HARNESS | Internal validation path composes fixture, trust-loop, layout, surface, and policy guardrails | `make milestone-b-internal-checks`; `docs/validation/milestone-b-closeout-validation-2026-06-17.md` | Present for current source-tree validation | Public comparison report flow, claim-wording approval, release/package approval | +| DETERMINISM | PR and nightly workflow guardrails cover current deterministic contracts, with Windows x64 preflight for core contracts | CI workflow static guards; `test_determinism_workflow.py`; same-platform checks in current internal validation paths | Present for current configured contracts | Windows PDFium runtime provisioning and broader cross-platform corpus validation | + +## Boundaries + +- This checklist is internal closeout evidence only. +- Public benchmark reports remain blocked. +- Release artifacts and package publication remain blocked. +- Production positioning remains blocked. +- Performance/quality/footprint claims remain blocked. +- Table-quality and parser-quality claims remain blocked. +- No semantic/arithmetic verification expansion is claimed. +- No broader parser/table/OCR completion is claimed. + +## Next Milestone Hand-off + +Milestone C should start from the trust-loop and fixture/evaluator contracts already guarded here. +The first C slice should choose a single lane, add fixture-backed behavior first, and update this +checklist only if the Milestone B closeout contract itself changes. diff --git a/docs/roadmap.md b/docs/roadmap.md index f355f68..de46daf 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -19,7 +19,7 @@ performance/quality/footprint claims. 
| --- | --- | --- | --- | | Week 0 | pre-kickoff | ADRs, governance, corpus freeze, CI bootstrap, competitor pins | All 11 rows done; clock starts | | A | weeks 1-8 | Contracts (5 schemas, c14n, deterministic profile), trust-boundary artifacts (`GroundingSource`, verification schemas, OpenDataLoader adapter stub, `ethos verify` CLI stub), PDFium Phase 1 spike, harness + competitor adapters, CLI skeleton | **Gate Zero**: ADR-0005 is accepted as `PROCEED` for internal Milestone B continuation. This is not public benchmark, release, package, production, or claim approval. | -| B | weeks 9-14 | **`ethos verify` alpha first**: native Ethos JSON + synthetic and pinned real OpenDataLoader verification demos, stale fingerprint checks, capability-limited reports, deterministic evidence matching including split-quote coverage, explicit unsupported non-v1 claim reporting, adapter structure diagnostics; then reading order, blocks, headings, lists, Markdown/text exporters, Python wheel scaffold, quality dashboard, Windows x64 nightly determinism | 13-B exit checklist | +| B | weeks 9-14 | **`ethos verify` alpha first**: native Ethos JSON + synthetic and pinned real OpenDataLoader verification demos, stale fingerprint checks, capability-limited reports, deterministic evidence matching including split-quote coverage, explicit unsupported non-v1 claim reporting, adapter structure diagnostics; then reading order, blocks, headings, lists, Markdown/text exporters, Python wheel scaffold, quality dashboard, Windows x64 nightly determinism | [13-B exit checklist](milestone-b-exit-checklist.md) | | C | weeks 15-22 | Simple/bordered tables; RAG chunker + citations; non-text region coordinates; security report + default-chunk exclusion; debug overlay; internal benchmark snapshot | 13-C exit + first checkpoint | | D | weeks 23-30 | `verify_citations` v1; crop API; sandbox/subprocess backend; Node beta and MCP experimental only if staffed or accepted by release-scope ADR | 13-D exit | | E | 
weeks 31-40 | Public benchmark report (reproducible, labeled tiers); PDFium Phase 2 project-maintained builds; stable CLI/Python docs; proof-of-trust demos; **Public Beta** | Release 1 claim audit + public-beta checkpoint |