diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fd027dec..43ca52b1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -96,7 +96,12 @@ jobs: runs-on: ubuntu-24.04 timeout-minutes: 10 steps: + # fetch-depth: 0 — mirrors deploy-pages.yml so the conformance over-time + # chart (built from the git history of conformance_status.csv) is exercised + # by the pre-merge build check, not just the post-merge deploy. - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + fetch-depth: 0 - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: @@ -110,6 +115,11 @@ jobs: - name: Build site working-directory: website + # GITHUB_TOKEN raises the GitHub API rate limit for _data/releases.js + # (the releases page is generated from the live Releases API at build + # time); the build still degrades gracefully if the call fails. + env: + GITHUB_TOKEN: ${{ github.token }} run: npm run build # ── Lint (runs in parallel with all test jobs) ───────────────────────────── @@ -225,16 +235,16 @@ jobs: - name: Install cargo-llvm-cov uses: taiki-e/install-action@81ee9698f20724138a785d788c7567d40f14cd2d # cargo-llvm-cov - # The PEP conformance suite is fetched fresh from the upstream typing repo - # on every run (network-bound, one HTTP request per test file). Cache it - # keyed on conformance.sh, which holds the pinned TYPING_REF — bumping the - # ref edits that file and busts the cache; conformance.sh itself re-fetches - # whenever the stamped ref differs, so a stale restore self-heals. - - name: Cache conformance suite + # Only the conformance FIXTURES are fetched (the calculator + # conformance/upstream_main.py is committed). Cache the downloaded fixtures + # keyed on score.py, which holds the pinned ref (PINNED_TYPING_REF) — bumping + # the ref edits that file and busts the cache; score.py re-fetches whenever + # the stamped ref differs, so a stale prefix restore self-heals. 
+ - name: Cache conformance fixtures uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 with: - path: crates/basilisk-cli/tests/conformance - key: conformance-suite-${{ hashFiles('scripts/conformance.sh') }} + path: conformance/tests + key: conformance-suite-${{ hashFiles('conformance/score.py') }} restore-keys: conformance-suite- - name: Run Rust tests with coverage diff --git a/.github/workflows/deploy-pages.yml b/.github/workflows/deploy-pages.yml index 8e9a8722..d6b24732 100644 --- a/.github/workflows/deploy-pages.yml +++ b/.github/workflows/deploy-pages.yml @@ -37,7 +37,12 @@ jobs: name: github-pages url: ${{ steps.deploy.outputs.page_url }} steps: + # fetch-depth: 0 — the conformance over-time chart (_data/conformance.js) + # reads the full git history of conformance/conformance_status.csv. A + # shallow clone would collapse the chart to a single point. - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + fetch-depth: 0 - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: @@ -51,6 +56,11 @@ jobs: - name: Build site working-directory: website + # GITHUB_TOKEN raises the GitHub API rate limit for _data/releases.js + # (the releases page is generated from the live Releases API at build + # time); the build still degrades gracefully if the call fails. + env: + GITHUB_TOKEN: ${{ github.token }} run: npm run build - uses: actions/configure-pages@45bfe0192ca1faeb007ade9deae92b16b8254a0d # v6.0.0 diff --git a/.gitignore b/.gitignore index 34051a68..6e3faf1a 100644 --- a/.gitignore +++ b/.gitignore @@ -106,9 +106,11 @@ deslop-*.log benchmarks/results/ # ============================================================================= -# Conformance test suite (fetched via make conformance / scripts/conformance.sh) +# Conformance test FIXTURES: downloaded on demand into conformance/tests/ (via +# make conformance / score.py --fetch) — never committed. 
NOTE: the official +# calculator conformance/upstream_main.py IS committed and is NOT ignored. # ============================================================================= -crates/basilisk-cli/tests/conformance/ +conformance/tests/ # ============================================================================= # Tool / agent state @@ -149,3 +151,8 @@ URGENT_READ_ME_NOW.md .deslop-cache/ .ghissues/ + +website/_verify/ + + +/*.png \ No newline at end of file diff --git a/Makefile b/Makefile index 70d8d93e..bcef2c3e 100644 --- a/Makefile +++ b/Makefile @@ -167,7 +167,8 @@ mutation-test: ## conformance/conformance_status.csv. Fetches the upstream suite if missing; ## use FETCH=1 to force a re-download. conformance: - @bash scripts/conformance.sh $(if $(filter 1,$(FETCH)),--fetch,) + @cargo build -p basilisk-cli --bin basilisk + @python3 conformance/score.py --bin target/debug/basilisk $(if $(filter 1,$(FETCH)),--fetch,) ## bench: Benchmark Basilisk vs pyright/mypy/ty/pyrefly on the fixture suite. ## Requires hyperfine; competitor tools are skipped if not installed. 
diff --git a/conf-chart.png b/conf-chart.png new file mode 100644 index 00000000..b6b5d952 Binary files /dev/null and b/conf-chart.png differ diff --git a/conformance/conformance_status.csv b/conformance/conformance_status.csv index 56baea8d..5233c37c 100644 --- a/conformance/conformance_status.csv +++ b/conformance/conformance_status.csv @@ -1,144 +1,144 @@ basilisk_rules,file,category,status,caught,missed,false_positives -,_directives_deprecated_library.py,,PASS,0,0,0 -,_enums_member_values.py,,PASS,0,0,0 -,_enums_members.py,,PASS,0,0,0 -,_protocols_modules1.py,,PASS,0,0,0 -,_protocols_modules2.py,,PASS,0,0,0 -,_qualifiers_final_annotation_1.py,,PASS,0,0,0 -,_qualifiers_final_annotation_2.py,,PASS,0,0,0 -BSK-E0048,aliases_explicit.py,aliases,PASS,21,0,0 -BSK-E0047|BSK-E0048|BSK-E0092,aliases_implicit.py,aliases,PASS,22,0,0 +,_directives_deprecated_library.py,directives,PASS,0,0,0 +,_enums_member_values.py,enums,PASS,0,0,0 +,_enums_members.py,enums,PASS,0,0,0 +,_protocols_modules1.py,protocols,PASS,0,0,0 +,_protocols_modules2.py,protocols,PASS,0,0,0 +BSK-W0050,_qualifiers_final_annotation_1.py,qualifiers,FAIL,0,0,1 +,_qualifiers_final_annotation_2.py,qualifiers,PASS,0,0,0 +BSK-E0002|BSK-E0048,aliases_explicit.py,aliases,FAIL,21,0,2 +BSK-E0002|BSK-E0047|BSK-E0048|BSK-E0092,aliases_implicit.py,aliases,FAIL,22,0,3 BSK-E0014|BSK-E0050,aliases_newtype.py,aliases,PASS,14,0,0 BSK-E0014|BSK-E0104,aliases_recursive.py,aliases,PASS,11,0,0 -BSK-E0057|BSK-E0149,aliases_type_statement.py,aliases,PASS,24,0,0 -BSK-E0151,aliases_typealiastype.py,aliases,PASS,22,0,0 +BSK-E0002|BSK-E0057|BSK-E0149,aliases_type_statement.py,aliases,FAIL,24,1,1 +BSK-E0005|BSK-E0151,aliases_typealiastype.py,aliases,FAIL,22,0,1 BSK-E0107,aliases_variance.py,aliases,PASS,4,0,0 ,annotations_coroutines.py,annotations,PASS,0,0,0 -BSK-E0047,annotations_forward_refs.py,annotations,PASS,19,0,0 +BSK-E0002|BSK-E0047,annotations_forward_refs.py,annotations,FAIL,19,0,1 
BSK-E0120|BSK-E0131,annotations_generators.py,annotations,PASS,10,0,0 ,annotations_methods.py,annotations,PASS,0,0,0 -BSK-E0024|BSK-E0047|BSK-E0048,annotations_typeexpr.py,annotations,PASS,15,0,0 -BSK-E0014|BSK-E0015|BSK-E0122|BSK-E0140,callables_annotation.py,callables,PASS,16,0,0 +BSK-E0002|BSK-E0011|BSK-E0024|BSK-E0047|BSK-E0048,annotations_typeexpr.py,annotations,FAIL,15,0,3 +BSK-E0002|BSK-E0011|BSK-E0014|BSK-E0015|BSK-E0122|BSK-E0140,callables_annotation.py,callables,FAIL,16,0,5 BSK-E0012|BSK-E0140|BSK-E0141,callables_kwargs.py,callables,PASS,13,0,0 -BSK-E0140,callables_protocol.py,callables,PASS,17,0,0 -BSK-E0014|BSK-E0136,callables_subtyping.py,callables,PASS,32,0,0 -BSK-E0014|BSK-E0036|BSK-E0044|BSK-E0121,classes_classvar.py,classes,PASS,17,0,0 -,classes_override.py,classes,PASS,0,0,0 -BSK-E0111|BSK-E0128,constructors_call_init.py,constructors,PASS,5,0,0 -BSK-E0041,constructors_call_metaclass.py,constructors,PASS,2,0,0 -BSK-E0074,constructors_call_new.py,constructors,PASS,2,0,0 -BSK-E0144,constructors_call_type.py,constructors,PASS,8,0,0 -BSK-E0153,constructors_callable.py,constructors,PASS,12,0,0 +BSK-E0001|BSK-E0002|BSK-E0011|BSK-E0140,callables_protocol.py,callables,FAIL,17,0,18 +BSK-E0002|BSK-E0014|BSK-E0136,callables_subtyping.py,callables,FAIL,32,0,9 +BSK-E0001|BSK-E0002|BSK-E0011|BSK-E0014|BSK-E0036|BSK-E0044|BSK-E0121|BSK-W0050,classes_classvar.py,classes,FAIL,17,0,4 +BSK-E0002|BSK-E0011,classes_override.py,classes,FAIL,0,5,4 +BSK-E0011|BSK-E0111|BSK-E0128,constructors_call_init.py,constructors,FAIL,5,0,1 +BSK-E0002|BSK-E0004|BSK-E0011|BSK-E0041,constructors_call_metaclass.py,constructors,FAIL,2,0,4 +BSK-E0002|BSK-E0004|BSK-E0011|BSK-E0074,constructors_call_new.py,constructors,FAIL,2,0,11 +BSK-E0002|BSK-E0004|BSK-E0144,constructors_call_type.py,constructors,FAIL,8,0,8 +BSK-E0004|BSK-E0011|BSK-E0153,constructors_callable.py,constructors,FAIL,12,0,4 ,constructors_consistency.py,constructors,PASS,0,0,0 
-,dataclasses_descriptors.py,dataclasses,PASS,0,0,0 +BSK-E0011,dataclasses_descriptors.py,dataclasses,FAIL,0,0,6 BSK-E0054,dataclasses_final.py,dataclasses,PASS,5,0,0 BSK-E0052,dataclasses_frozen.py,dataclasses,PASS,2,0,0 -BSK-E0063,dataclasses_hash.py,dataclasses,PASS,4,0,0 +BSK-E0001|BSK-E0063,dataclasses_hash.py,dataclasses,FAIL,4,0,1 BSK-E0017,dataclasses_inheritance.py,dataclasses,PASS,2,0,0 BSK-E0069,dataclasses_kwonly.py,dataclasses,PASS,3,0,0 -BSK-E0059,dataclasses_match_args.py,dataclasses,PASS,1,0,0 +BSK-E0005|BSK-E0059,dataclasses_match_args.py,dataclasses,FAIL,1,0,1 BSK-E0060,dataclasses_order.py,dataclasses,PASS,1,0,0 BSK-E0095,dataclasses_postinit.py,dataclasses,PASS,4,0,0 -BSK-E0108,dataclasses_slots.py,dataclasses,PASS,4,0,0 -BSK-E0142,dataclasses_transform_class.py,dataclasses,PASS,6,0,0 +BSK-E0002|BSK-E0005|BSK-E0108,dataclasses_slots.py,dataclasses,FAIL,4,1,5 +BSK-E0011|BSK-E0142,dataclasses_transform_class.py,dataclasses,FAIL,6,0,1 BSK-E0142,dataclasses_transform_converter.py,dataclasses,PASS,9,0,0 -BSK-E0069,dataclasses_transform_field.py,dataclasses,PASS,2,0,0 -BSK-E0014|BSK-E0052|BSK-E0060|BSK-E0069|BSK-E0111,dataclasses_transform_func.py,dataclasses,PASS,5,0,0 -BSK-E0138,dataclasses_transform_meta.py,dataclasses,PASS,6,0,0 -BSK-E0041|BSK-E0069|BSK-E0096,dataclasses_usage.py,dataclasses,PASS,8,0,0 -BSK-E0039|BSK-E0053,directives_assert_type.py,directives,PASS,7,0,0 +BSK-E0011|BSK-E0069,dataclasses_transform_field.py,dataclasses,FAIL,2,0,4 +BSK-E0011|BSK-E0014|BSK-E0052|BSK-E0060|BSK-E0069|BSK-E0111,dataclasses_transform_func.py,dataclasses,FAIL,5,0,1 +BSK-E0004|BSK-E0011|BSK-E0138,dataclasses_transform_meta.py,dataclasses,FAIL,6,0,2 +BSK-E0002|BSK-E0005|BSK-E0041|BSK-E0069|BSK-E0096,dataclasses_usage.py,dataclasses,FAIL,8,3,3 +BSK-E0002|BSK-E0011|BSK-E0039|BSK-E0053,directives_assert_type.py,directives,FAIL,7,0,3 BSK-E0031,directives_cast.py,directives,PASS,3,0,0 BSK-E0115,directives_deprecated.py,directives,PASS,12,0,0 
-BSK-E0012|BSK-E0013|BSK-E0041,directives_no_type_check.py,directives,PASS,1,0,0 -BSK-E0033,directives_reveal_type.py,directives,PASS,2,0,0 +BSK-E0011|BSK-E0012|BSK-E0013|BSK-E0041,directives_no_type_check.py,directives,FAIL,1,0,1 +BSK-E0002|BSK-E0011|BSK-E0033,directives_reveal_type.py,directives,FAIL,2,0,1 ,directives_type_checking.py,directives,PASS,0,0,0 ,directives_type_ignore.py,directives,PASS,0,0,0 ,directives_type_ignore_file1.py,directives,PASS,0,0,0 BSK-E0014,directives_type_ignore_file2.py,directives,PASS,1,0,0 -BSK-E0150,directives_version_platform.py,directives,PASS,3,0,0 -BSK-E0040,enums_behaviors.py,enums,PASS,1,0,0 +BSK-E0002|BSK-E0150,directives_version_platform.py,directives,FAIL,3,0,1 +BSK-E0040,enums_behaviors.py,enums,FAIL,1,2,0 ,enums_definition.py,enums,PASS,0,0,0 -BSK-E0061,enums_expansion.py,enums,PASS,1,0,0 -,enums_member_names.py,enums,PASS,0,0,0 -BSK-E0066,enums_member_values.py,enums,PASS,2,0,0 -BSK-E0046|BSK-E0067,enums_members.py,enums,PASS,7,0,0 -,exceptions_context_managers.py,exceptions,PASS,0,0,0 -BSK-E0027|BSK-E0047|BSK-E0092|BSK-E0132|BSK-E0134,generics_base_class.py,generics,PASS,7,0,0 -BSK-E0026|BSK-E0027|BSK-E0043|BSK-E0148,generics_basic.py,generics,PASS,13,0,0 -BSK-E0030|BSK-E0091|BSK-E0092,generics_defaults.py,generics,PASS,5,0,0 -BSK-E0102|BSK-E0128|BSK-E0130,generics_defaults_referential.py,generics,PASS,7,0,0 -BSK-E0014|BSK-E0092,generics_defaults_specialization.py,generics,PASS,3,0,0 +BSK-E0002|BSK-E0061,enums_expansion.py,enums,FAIL,1,0,4 +BSK-E0002,enums_member_names.py,enums,FAIL,0,0,2 +BSK-E0002|BSK-E0066,enums_member_values.py,enums,FAIL,2,0,4 +BSK-E0002|BSK-E0046|BSK-E0067|BSK-W0040,enums_members.py,enums,FAIL,7,0,2 +BSK-E0001|BSK-E0011,exceptions_context_managers.py,exceptions,FAIL,0,0,6 +BSK-E0002|BSK-E0027|BSK-E0047|BSK-E0092|BSK-E0132|BSK-E0134,generics_base_class.py,generics,FAIL,7,0,3 +BSK-E0002|BSK-E0011|BSK-E0026|BSK-E0027|BSK-E0043|BSK-E0148,generics_basic.py,generics,FAIL,13,0,5 
+BSK-E0002|BSK-E0030|BSK-E0091|BSK-E0092,generics_defaults.py,generics,FAIL,5,1,6 +BSK-E0002|BSK-E0102|BSK-E0128|BSK-E0130,generics_defaults_referential.py,generics,FAIL,7,0,1 +BSK-E0002|BSK-E0014|BSK-E0092,generics_defaults_specialization.py,generics,FAIL,3,0,1 BSK-E0026|BSK-E0047,generics_paramspec_basic.py,generics,PASS,7,0,0 -BSK-E0122,generics_paramspec_components.py,generics,PASS,16,0,0 +BSK-E0011|BSK-E0122,generics_paramspec_components.py,generics,PASS,16,0,0 BSK-E0122,generics_paramspec_semantics.py,generics,PASS,9,0,0 BSK-E0092|BSK-E0122,generics_paramspec_specialization.py,generics,PASS,5,0,0 -BSK-E0117|BSK-E0130,generics_scoping.py,generics,PASS,10,0,0 -,generics_self_advanced.py,generics,PASS,0,0,0 +BSK-E0117|BSK-E0130,generics_scoping.py,generics,FAIL,10,4,0 +BSK-W0050,generics_self_advanced.py,generics,FAIL,0,0,1 BSK-E0075,generics_self_attributes.py,generics,PASS,2,0,0 BSK-E0078,generics_self_basic.py,generics,PASS,3,0,0 -BSK-E0077,generics_self_protocols.py,generics,PASS,2,0,0 -BSK-E0078|BSK-E0094,generics_self_usage.py,generics,PASS,11,0,0 +BSK-E0077|BSK-W0050,generics_self_protocols.py,generics,FAIL,2,0,4 +BSK-E0011|BSK-E0025|BSK-E0078|BSK-E0094|BSK-W0050,generics_self_usage.py,generics,FAIL,11,0,3 BSK-E0042,generics_syntax_compatibility.py,generics,PASS,2,0,0 -BSK-E0043|BSK-E0089|BSK-E0105,generics_syntax_declarations.py,generics,PASS,10,0,0 -BSK-E0055|BSK-E0130,generics_syntax_infer_variance.py,generics,PASS,18,0,0 -BSK-E0149,generics_syntax_scoping.py,generics,PASS,7,0,0 +BSK-E0002|BSK-E0043|BSK-E0089|BSK-E0105,generics_syntax_declarations.py,generics,FAIL,10,0,1 +BSK-E0002|BSK-E0055|BSK-E0130,generics_syntax_infer_variance.py,generics,FAIL,18,0,4 +BSK-E0002|BSK-E0005|BSK-E0149,generics_syntax_scoping.py,generics,FAIL,7,0,9 BSK-E0111|BSK-E0125,generics_type_erasure.py,generics,PASS,7,0,0 -BSK-E0085,generics_typevartuple_args.py,generics,PASS,8,0,0 
-BSK-E0055|BSK-E0083|BSK-E0084|BSK-E0085|BSK-E0086,generics_typevartuple_basic.py,generics,PASS,13,0,0 -BSK-E0082,generics_typevartuple_callable.py,generics,PASS,1,0,0 -,generics_typevartuple_concat.py,generics,PASS,0,0,0 -,generics_typevartuple_overloads.py,generics,PASS,0,0,0 -BSK-E0086|BSK-E0130|BSK-E0139,generics_typevartuple_specialization.py,generics,PASS,6,0,0 -BSK-E0081,generics_typevartuple_unpack.py,generics,PASS,1,0,0 -BSK-E0026|BSK-E0055|BSK-E0080,generics_upper_bound.py,generics,PASS,3,0,0 +BSK-E0002|BSK-E0085,generics_typevartuple_args.py,generics,FAIL,8,0,1 +BSK-E0002|BSK-E0055|BSK-E0083|BSK-E0084|BSK-E0085|BSK-E0086,generics_typevartuple_basic.py,generics,FAIL,13,1,3 +BSK-E0002|BSK-E0082,generics_typevartuple_callable.py,generics,FAIL,1,0,1 +BSK-E0002,generics_typevartuple_concat.py,generics,FAIL,0,0,2 +BSK-E0002|BSK-E0011,generics_typevartuple_overloads.py,generics,FAIL,0,0,2 +BSK-E0002|BSK-E0086|BSK-E0130|BSK-E0139,generics_typevartuple_specialization.py,generics,FAIL,6,0,11 +BSK-E0002|BSK-E0081,generics_typevartuple_unpack.py,generics,FAIL,1,0,2 +BSK-E0002|BSK-E0026|BSK-E0055|BSK-E0080,generics_upper_bound.py,generics,FAIL,3,1,1 BSK-E0055|BSK-E0107,generics_variance.py,generics,PASS,9,0,0 -BSK-E0130,generics_variance_inference.py,generics,PASS,23,0,0 +BSK-E0002|BSK-E0130,generics_variance_inference.py,generics,FAIL,23,0,3 BSK-E0071,historical_positional.py,historical,PASS,4,0,0 -BSK-E0127,literals_interactions.py,literals,PASS,4,0,0 -BSK-E0014|BSK-E0051|BSK-E0109|BSK-E0126|BSK-E0129,literals_literalstring.py,literals,PASS,9,0,0 -BSK-E0014|BSK-E0051|BSK-E0068|BSK-E0117|BSK-E0129|BSK-E0130,literals_parameterizations.py,literals,PASS,17,0,0 -BSK-E0014|BSK-E0129,literals_semantics.py,literals,PASS,4,0,0 +BSK-E0002|BSK-E0011|BSK-E0127,literals_interactions.py,literals,FAIL,4,0,3 +BSK-E0002|BSK-E0011|BSK-E0014|BSK-E0051|BSK-E0109|BSK-E0126|BSK-E0129,literals_literalstring.py,literals,FAIL,9,0,10 
+BSK-E0002|BSK-E0014|BSK-E0051|BSK-E0068|BSK-E0117|BSK-E0129|BSK-E0130,literals_parameterizations.py,literals,FAIL,17,0,1 +BSK-E0002|BSK-E0014|BSK-E0129,literals_semantics.py,literals,FAIL,4,0,4 BSK-E0111|BSK-E0116|BSK-E0143,namedtuples_define_class.py,namedtuples,PASS,14,0,0 BSK-E0041|BSK-E0064,namedtuples_define_functional.py,namedtuples,PASS,9,0,0 BSK-E0073,namedtuples_type_compat.py,namedtuples,PASS,2,0,0 BSK-E0143,namedtuples_usage.py,namedtuples,PASS,8,0,0 -BSK-E0101|BSK-E0112,narrowing_typeguard.py,narrowing,PASS,4,0,0 -BSK-E0101|BSK-E0112|BSK-E0113,narrowing_typeis.py,narrowing,PASS,9,0,0 -BSK-E0072,overloads_basic.py,overloads,PASS,1,0,0 -,overloads_consistency.py,overloads,PASS,0,0,0 -BSK-E0020|BSK-E0034,overloads_definitions.py,overloads,PASS,0,0,0 -BSK-E0012|BSK-E0041|BSK-E0076,overloads_evaluation.py,overloads,PASS,4,0,0 -BSK-E0099|BSK-E0146,protocols_class_objects.py,protocols,PASS,8,0,0 -BSK-E0036|BSK-E0097|BSK-E0121,protocols_definition.py,protocols,PASS,21,0,0 -BSK-E0099|BSK-E0118|BSK-E0123|BSK-E0124,protocols_explicit.py,protocols,PASS,6,0,0 -BSK-E0130|BSK-E0137,protocols_generic.py,protocols,PASS,9,0,0 -BSK-E0098|BSK-E0099|BSK-E0121,protocols_merging.py,protocols,PASS,6,0,0 +BSK-E0002|BSK-E0011|BSK-E0101|BSK-E0112,narrowing_typeguard.py,narrowing,FAIL,4,0,2 +BSK-E0002|BSK-E0011|BSK-E0101|BSK-E0112|BSK-E0113,narrowing_typeis.py,narrowing,FAIL,9,0,2 +BSK-E0011|BSK-E0072,overloads_basic.py,overloads,FAIL,1,0,1 +,overloads_consistency.py,overloads,FAIL,0,2,0 +BSK-E0020|BSK-E0034,overloads_definitions.py,overloads,FAIL,0,7,0 +BSK-E0011|BSK-E0012|BSK-E0041|BSK-E0076,overloads_evaluation.py,overloads,FAIL,4,0,5 +BSK-E0004|BSK-E0011|BSK-E0099|BSK-E0146|BSK-W0050,protocols_class_objects.py,protocols,FAIL,8,0,3 +BSK-E0001|BSK-E0011|BSK-E0036|BSK-E0097|BSK-E0121,protocols_definition.py,protocols,FAIL,21,0,9 +BSK-E0002|BSK-E0099|BSK-E0118|BSK-E0123|BSK-E0124,protocols_explicit.py,protocols,FAIL,6,0,3 
+BSK-E0002|BSK-E0130|BSK-E0137,protocols_generic.py,protocols,FAIL,9,0,3 +BSK-E0002|BSK-E0098|BSK-E0099|BSK-E0121,protocols_merging.py,protocols,FAIL,6,0,1 BSK-E0079,protocols_modules.py,protocols,PASS,3,0,0 ,protocols_recursive.py,protocols,PASS,0,0,0 -BSK-E0114|BSK-E0119,protocols_runtime_checkable.py,protocols,PASS,6,0,0 +BSK-E0002|BSK-E0011|BSK-E0114|BSK-E0119|BSK-W0050,protocols_runtime_checkable.py,protocols,FAIL,6,0,4 ,protocols_self.py,protocols,PASS,0,0,0 -BSK-E0014|BSK-E0099,protocols_subtyping.py,protocols,PASS,7,0,0 +BSK-E0002|BSK-E0014|BSK-E0099,protocols_subtyping.py,protocols,FAIL,7,0,4 BSK-E0110|BSK-E0133,protocols_variance.py,protocols,PASS,5,0,0 BSK-E0045|BSK-E0058,qualifiers_annotated.py,qualifiers,PASS,20,0,0 -BSK-E0014|BSK-E0041|BSK-E0044|BSK-E0054|BSK-E0064,qualifiers_final_annotation.py,qualifiers,PASS,26,0,0 -BSK-E0034,qualifiers_final_decorator.py,qualifiers,PASS,3,0,0 -,specialtypes_any.py,specialtypes,PASS,0,0,0 -BSK-E0062|BSK-E0070,specialtypes_never.py,specialtypes,PASS,3,0,0 -BSK-E0012|BSK-E0014,specialtypes_none.py,specialtypes,PASS,3,0,0 -BSK-E0065,specialtypes_promotions.py,specialtypes,PASS,1,0,0 -BSK-E0015|BSK-E0092|BSK-E0145,specialtypes_type.py,specialtypes,PASS,9,0,0 -BSK-E0014|BSK-E0045|BSK-E0147,tuples_type_compat.py,tuples,PASS,16,0,0 -BSK-E0014|BSK-E0049|BSK-E0090,tuples_type_form.py,tuples,PASS,11,0,0 -BSK-E0049,tuples_unpacked.py,tuples,PASS,4,0,0 +BSK-E0014|BSK-E0041|BSK-E0044|BSK-E0054|BSK-E0064|BSK-W0050,qualifiers_final_annotation.py,qualifiers,FAIL,26,0,3 +BSK-E0010|BSK-E0025|BSK-E0034,qualifiers_final_decorator.py,qualifiers,FAIL,3,3,1 +BSK-E0001|BSK-E0002|BSK-E0011,specialtypes_any.py,specialtypes,FAIL,0,0,3 +BSK-E0002|BSK-E0011|BSK-E0062|BSK-E0070,specialtypes_never.py,specialtypes,FAIL,3,0,2 +BSK-E0002|BSK-E0012|BSK-E0014,specialtypes_none.py,specialtypes,FAIL,3,0,1 +BSK-E0002|BSK-E0065|BSK-W0050,specialtypes_promotions.py,specialtypes,FAIL,1,0,2 
+BSK-E0002|BSK-E0015|BSK-E0092|BSK-E0145,specialtypes_type.py,specialtypes,FAIL,9,0,6 +BSK-E0002|BSK-E0014|BSK-E0023|BSK-E0045|BSK-E0147,tuples_type_compat.py,tuples,FAIL,16,0,11 +BSK-E0011|BSK-E0014|BSK-E0049|BSK-E0090,tuples_type_form.py,tuples,FAIL,11,0,1 +BSK-E0002|BSK-E0049,tuples_unpacked.py,tuples,FAIL,4,0,3 BSK-E0037,typeddicts_alt_syntax.py,typeddicts,PASS,4,0,0 -BSK-E0029|BSK-E0032,typeddicts_class_syntax.py,typeddicts,PASS,3,0,0 -BSK-E0014|BSK-E0093|BSK-E0141|BSK-E0156,typeddicts_extra_items.py,typeddicts,PASS,23,0,0 +BSK-E0002|BSK-E0029|BSK-E0032,typeddicts_class_syntax.py,typeddicts,PASS,3,0,0 +BSK-E0002|BSK-E0014|BSK-E0093|BSK-E0141|BSK-E0156,typeddicts_extra_items.py,typeddicts,FAIL,23,5,2 ,typeddicts_final.py,typeddicts,PASS,0,0,0 BSK-E0038,typeddicts_inheritance.py,typeddicts,PASS,2,0,0 -BSK-E0093,typeddicts_operations.py,typeddicts,PASS,11,0,0 +BSK-E0002|BSK-E0093,typeddicts_operations.py,typeddicts,FAIL,11,0,1 BSK-E0056,typeddicts_readonly.py,typeddicts,PASS,6,0,0 -BSK-E0014,typeddicts_readonly_consistency.py,typeddicts,PASS,7,0,0 -BSK-E0038|BSK-E0056|BSK-E0093,typeddicts_readonly_inheritance.py,typeddicts,PASS,11,0,0 +BSK-E0002|BSK-E0014,typeddicts_readonly_consistency.py,typeddicts,FAIL,7,0,2 +BSK-E0003|BSK-E0038|BSK-E0056|BSK-E0093,typeddicts_readonly_inheritance.py,typeddicts,PASS,11,0,0 BSK-E0056,typeddicts_readonly_kwargs.py,typeddicts,PASS,1,0,0 BSK-E0056|BSK-E0093,typeddicts_readonly_update.py,typeddicts,PASS,1,0,0 BSK-E0035,typeddicts_required.py,typeddicts,PASS,4,0,0 diff --git a/conformance/score.py b/conformance/score.py new file mode 100644 index 00000000..f4fc9df7 --- /dev/null +++ b/conformance/score.py @@ -0,0 +1,493 @@ +#!/usr/bin/env python3 +# Implements [CHKARCH-CONFORMANCE]. See docs/specs/CHECKER-ARCHITECTURE-SPEC.md +"""Grade Basilisk with the REAL python/typing conformance calculator. + +This script does NOT reimplement the conformance scoring. 
It **imports the +committed upstream tool** — `conformance/upstream_main.py`, a byte-identical, +sha256-verified copy of `conformance/src/main.py` from `python/typing` pinned to +the same commit the test fixtures come from — and **calls upstream's own +`get_expected_errors` + `diff_expected_errors` functions unmodified**. Those two +functions are the entire conformance algorithm: the same code that grades +pyright, mypy, pyrefly, ty, zuban and pycroscope. Nothing about the calculation +is ours, and nothing is downloaded at score time. + +The only Basilisk-specific code here is a checker *adapter* — exactly what +upstream itself has for every checker (`PyrightTypeChecker`, `MypyTypeChecker`, +… in `conformance/src/type_checker.py`). The adapter runs the real `basilisk` +binary and turns its output into the `{line: [errors]}` mapping the upstream +algorithm consumes. A file passes iff upstream's `errors_diff` is empty — +upstream's exact rule: `"Fail" if errors_diff.strip() else "Pass"`. + +No diagnostic codes are excluded. By default this counts EVERY diagnostic +`basilisk check` emits — both errors AND warnings — which is the strictest +grading and matches how the reference checker pyright is graded upstream +(`if kind not in ("error", "warning")`). Pass `--errors-only` for the looser +errors-only view. Either way, any diagnostic on a line the suite does not mark +`# E` is a real false positive and fails the file — same as for any checker. + +This one file is the whole Basilisk side of conformance: it runs the binary, +scores with the official functions, writes `conformance/conformance_status.csv`, +and enforces the ratchet gate (`--gate`). There is no Rust test and no shell +script. Two upstream inputs, handled differently: + • the official calculator → `conformance/upstream_main.py`: COMMITTED and + sha256-pinned, never downloaded at score time (re-pin with --refresh-upstream). 
+ • the `# E`-annotated test fixtures → `conformance/tests/*.py`: git-ignored and + DOWNLOADED on demand (--fetch / --fetch-only; auto-fetched if missing). + +Usage: + python3 conformance/score.py [--bin PATH] [--gate] [--errors-only] + [--conformance-dir DIR] [--fetch | --fetch-only] + [--refresh-upstream] +""" + +from __future__ import annotations + +import hashlib +import importlib.util +import json +import subprocess +import sys +import types +from pathlib import Path +from typing import Callable, Sequence + +# The single home for the pinned upstream commit. The fixtures (FIXTURES_API) +# and the vendored calculator (UPSTREAM_MAIN) both track it. To bump: edit this, +# run `--refresh-upstream` (re-pins upstream_main.py + its sha256), then `--fetch`. +PINNED_TYPING_REF = "268d0c4e" +UPSTREAM_MAIN_URL = ( + f"https://raw.githubusercontent.com/python/typing/{PINNED_TYPING_REF}" + "/conformance/src/main.py" +) +# The committed, byte-identical copy of upstream's calculator, and its sha256. +UPSTREAM_MAIN = Path(__file__).resolve().parent / "upstream_main.py" +UPSTREAM_MAIN_SHA256 = ( + "b4e3bd089c73856f9920ef494350d622c2914fac238c9193ec0bb3f93f0fc6a2" +) +# The two functions that constitute the official scoring algorithm. +OFFICIAL_FUNCS = ("get_expected_errors", "diff_expected_errors") +# The `# E`-annotated test fixtures are downloaded (git-ignored) into +# conformance/tests. This API lists them at the pinned ref for the fetch. 
+FIXTURES_API = ( + "https://api.github.com/repos/python/typing/contents/conformance/tests" + f"?ref={PINNED_TYPING_REF}" +) + + +# --------------------------------------------------------------------------- +# Import the REAL upstream calculator (committed, sha256-verified — no download) +# --------------------------------------------------------------------------- + + +class _StubModule(types.ModuleType): + """Stand-in that resolves ANY attribute to a dummy, so upstream main.py's + unrelated top-level imports (tomli/tomlkit/options/reporting/test_groups/ + type_checker) succeed. The two scoring functions reference none of them.""" + + def __getattr__(self, _name: str) -> object: + return object + + +def load_official_calc() -> tuple[Callable, Callable, str]: + """Return upstream's real (get_expected_errors, diff_expected_errors). + + Reads the committed `conformance/upstream_main.py`, verifies it is byte-for- + byte the pinned upstream `conformance/src/main.py` (sha256), imports it behind + module stubs (above), and hands back its two functions unmodified. No network + access; no code of ours in the calculation. + """ + raw = UPSTREAM_MAIN.read_bytes() + digest = hashlib.sha256(raw).hexdigest() + if digest != UPSTREAM_MAIN_SHA256: + raise RuntimeError( + f"{UPSTREAM_MAIN.name} sha256 {digest[:12]}… != pinned " + f"{UPSTREAM_MAIN_SHA256[:12]}… — the vendored upstream calculator was " + "modified. Restore it from git, or run --refresh-upstream to re-pin." 
+ ) + + for dep in ( + "tomli", + "tomlkit", + "options", + "reporting", + "test_groups", + "type_checker", + ): + sys.modules.setdefault(dep, _StubModule(dep)) + + spec = importlib.util.spec_from_file_location( + "typing_conformance_main", UPSTREAM_MAIN + ) + if spec is None or spec.loader is None: + raise RuntimeError(f"cannot build an import spec for {UPSTREAM_MAIN}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + funcs = tuple(getattr(module, name, None) for name in OFFICIAL_FUNCS) + if not all(funcs): + raise RuntimeError( + f"committed upstream main.py is missing {OFFICIAL_FUNCS}; the upstream " + "layout changed — re-check the pinned ref" + ) + return funcs[0], funcs[1], f"sha256:{digest[:12]}" + + +def refresh_upstream() -> int: + """Re-download upstream main.py to the committed path and print its sha256. + + Maintenance only — run when bumping PINNED_TYPING_REF. The calculator is never + downloaded while scoring; the only other network use is `ensure_fixtures`. + """ + import urllib.request # local import: network use is confined to the fetch paths + + with urllib.request.urlopen(UPSTREAM_MAIN_URL, timeout=30) as resp: # noqa: S310 (pinned https) + raw = resp.read() + UPSTREAM_MAIN.write_bytes(raw) + digest = hashlib.sha256(raw).hexdigest() + print(f" fetched {UPSTREAM_MAIN_URL}") + print(f" wrote {UPSTREAM_MAIN} ({len(raw)} bytes)") + print(f" sha256 {digest}") + if digest != UPSTREAM_MAIN_SHA256: + print(f' -> update UPSTREAM_MAIN_SHA256 = "{digest}" (ref changed)') + return 0 + + +# --------------------------------------------------------------------------- +# Download the (git-ignored) test fixtures on demand +# --------------------------------------------------------------------------- + + +def ensure_fixtures(conf_dir: Path, force: bool) -> None: + """Download python/typing's conformance `.py` fixtures into `conf_dir`. + + The fixtures are git-ignored and fetched on demand (auto when missing, or via + `--fetch` / `--fetch-only`).
No-op when already present at the pinned ref (a + `.ref-sha` stamp records it) unless `force`; bumping `PINNED_TYPING_REF` + invalidates the stamp. Honors `GITHUB_TOKEN` to raise the API rate limit. + """ + import os + import urllib.request # local: network only happens here and in refresh + + stamp = conf_dir / ".ref-sha" + cached_ref = stamp.read_text(encoding="utf-8").strip() if stamp.exists() else "" + present = conf_dir.exists() and any(conf_dir.glob("*.py")) + if present and cached_ref == PINNED_TYPING_REF and not force: + return + + headers = {"Accept": "application/vnd.github+json"} + token = os.environ.get("GITHUB_TOKEN") + if token: + headers["Authorization"] = f"token {token}" + + listing_req = urllib.request.Request(FIXTURES_API, headers=headers) + with urllib.request.urlopen(listing_req, timeout=60) as resp: # noqa: S310 (pinned https) + entries = json.loads(resp.read()) + fixtures = [ + e for e in entries if e.get("type") == "file" and e["name"].endswith(".py") + ] + if not fixtures: + raise RuntimeError(f"no .py fixtures found at {FIXTURES_API}") + + conf_dir.mkdir(parents=True, exist_ok=True) + for stale in (stamp, *conf_dir.glob("*.py")): # stamp goes first so an interrupted fetch is redone, not trusted + stale.unlink(missing_ok=True) + for entry in fixtures: + with urllib.request.urlopen(entry["download_url"], timeout=60) as resp: # noqa: S310 + (conf_dir / entry["name"]).write_bytes(resp.read()) + stamp.write_text(PINNED_TYPING_REF + "\n", encoding="utf-8") + print( + f" fetched {len(fixtures)} conformance fixtures " + f"(python/typing@{PINNED_TYPING_REF}) -> {conf_dir}" + ) + + +# --------------------------------------------------------------------------- +# Checker adapter — same role as upstream's per-checker adapters +# --------------------------------------------------------------------------- + + +class BasiliskTypeChecker: + """Runs the real `basilisk` binary; parses its JSON into {line: [errors]}. + + Each diagnostic is the analog of the suite's `# E` ("an error MUST be + reported on this line").
When `count_warnings` is True (the strictest grading, + matching how pyright is graded upstream) both `error` and `warning` severities + count; otherwise — including the constructor default, False — only `error` does. + """ + + name = "basilisk" + + def __init__(self, binary: Path, count_warnings: bool = False) -> None: + self.binary = binary + self.count_warnings = count_warnings + + def run_test(self, test_case: Path) -> str: + proc = subprocess.run( + [ + str(self.binary), + "check", + str(test_case), + "--output", + "json", + "--color", + "never", + ], + capture_output=True, + text=True, + ) + return proc.stdout + + def parse_errors(self, output: "Sequence[str] | str") -> dict[int, list[str]]: + # upstream calls this with `output.splitlines()`; rejoin + parse JSON. + text = "\n".join(output) if not isinstance(output, str) else output + try: + diags = json.loads(text) if text.strip() else [] + except json.JSONDecodeError: + return {} + accepted = {"error", "warning"} if self.count_warnings else {"error"} + line_to_errors: dict[int, list[str]] = {} + for d in diags: + if d.get("severity") not in accepted: + continue + line_to_errors.setdefault(int(d["line"]), []).append( + f"{d.get('code', '?')}: {d.get('message', '')}" + ) + return line_to_errors + + +# --------------------------------------------------------------------------- +# Driver / reporting / gate +# --------------------------------------------------------------------------- + + +def repo_root() -> Path: + here = Path(__file__).resolve() + for parent in here.parents: + if (parent / "Cargo.toml").exists() and (parent / "crates").exists(): + return parent + return here.parent.parent + + +def find_binary(explicit: str | None, root: Path) -> Path | None: + if explicit: + p = Path(explicit) + return p if p.exists() else None + for candidate in (root / "target/release/basilisk", root / "target/debug/basilisk"): + if candidate.exists(): + return candidate + return None + + +def read_conformance_field(root: Path, key: str) -> int | None: +
try: + data = json.loads((root / "coverage-thresholds.json").read_text()) + return int(data["conformance"][key]) + except (OSError, KeyError, ValueError, json.JSONDecodeError): + return None + + +def category(name: str) -> str: + # Some fixtures are prefixed with `_` (e.g. `_enums_members.py`); group them + # by their real category, not an empty string. + stem = name.lstrip("_") + return stem.split("_", 1)[0] if "_" in stem else stem[:-3] + + +Row = tuple[str, str, bool, int, int, int, list[str]] +Totals = dict[str, int] + + +def score( + checker: "BasiliskTypeChecker", + get_expected: Callable, + diff_errors: Callable, + conf_dir: Path, +) -> tuple[list[Path], list[Row], Totals]: + files = sorted(conf_dir.glob("*.py")) + rows, totals = [], {"pass": 0, "missed": 0, "fp": 0, "caught": 0} + for f in files: + output = checker.run_test(f) + diff = diff_errors(checker, f, output, []) + diff_lines = [d for d in diff.splitlines() if d.strip()] + missed = sum(1 for d in diff_lines if "Expected" in d) + fp = sum(1 for d in diff_lines if "Unexpected" in d) + passed = not diff.strip() + + errors = checker.parse_errors(output.splitlines()) + expected, _ = get_expected(f) + req_lines = [ln for ln, (req, _o) in expected.items() if req > 0] + caught = sum(1 for ln in req_lines if ln in errors) + codes = sorted({e.split(":", 1)[0] for errs in errors.values() for e in errs}) + + rows.append((f.name, category(f.name), passed, caught, missed, fp, codes)) + totals["pass"] += int(passed) + totals["missed"] += missed + totals["fp"] += fp + totals["caught"] += caught + return files, rows, totals + + +def print_scorecard( + files: list[Path], + rows: list[Row], + totals: Totals, + label: str, + digest: str, +) -> None: + n = len(files) + pct = (totals["pass"] * 100.0 / n) if n else 0.0 + print() + print("=" * 68) + print(f" BASILISK PEP CONFORMANCE — REAL python/typing CALCULATOR [{label}]") + print(" calc: imported verbatim from committed conformance/upstream_main.py") + print( + f" ref: 
python/typing@{PINNED_TYPING_REF} ({digest}) funcs: {', '.join(OFFICIAL_FUNCS)}" + ) + print("=" * 68) + print(f" Files: {n} total | {totals['pass']} pass | {n - totals['pass']} fail") + print(f" Score: {pct:.1f}% (Pass = empty errors_diff, upstream rule)") + print(f" Required: {totals['caught']} caught | {totals['missed']} missed") + print(f" False+: {totals['fp']} unexpected diagnostics (THESE FAIL FILES)") + print("-" * 68) + print(" Failing files:") + any_fail = False + for name, _c, passed, _ca, missed, fp, _codes in rows: + if not passed: + any_fail = True + print(f" FAIL {name:<46} missed={missed:<3} fp={fp}") + if not any_fail: + print(" (none — all files pass)") + print("=" * 68) + print() + + +def write_csv(root: Path, rows: list[Row]) -> None: + lines = ["basilisk_rules,file,category,status,caught,missed,false_positives"] + for name, cat, passed, caught, missed, fp, codes in rows: + status = "PASS" if passed else "FAIL" + lines.append(f"{'|'.join(codes)},{name},{cat},{status},{caught},{missed},{fp}") + out = root / "conformance" / "conformance_status.csv" + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text("\n".join(lines) + "\n") + print(f" Conformance CSV: {out}") + + +def parse_args(argv: list[str]) -> dict: + # Default is the STRICTEST grading: every diagnostic basilisk emits (errors + # AND warnings) is counted as "an error was reported", which is also how the + # reference checker pyright is graded upstream. `--errors-only` reports the + # looser errors-only view. `--count-warnings` is accepted for back-compat. 
+ opts: dict = { + "bin": None, + "gate": False, + "warn": True, + "dir": None, + "refresh": False, + "fetch": False, + "fetch_only": False, + } + it = iter(argv) + for a in it: + if a == "--bin": + opts["bin"] = next(it, None) + elif a == "--gate": + opts["gate"] = True + elif a == "--count-warnings": + opts["warn"] = True + elif a == "--errors-only": + opts["warn"] = False + elif a == "--conformance-dir": + opts["dir"] = next(it, None) + elif a == "--refresh-upstream": + opts["refresh"] = True + elif a == "--fetch": + opts["fetch"] = True + elif a == "--fetch-only": + opts["fetch_only"] = True + return opts + + +def enforce_gate(root: Path, files: list[Path], totals: Totals) -> bool: + n = len(files) + pct = (totals["pass"] * 100) // n if n else 0 + threshold = read_conformance_field(root, "threshold") + ceiling = read_conformance_field(root, "max_false_positives") + failed = False + if threshold is not None: + if pct < threshold: + print( + f" ✗ PEP conformance regression: {pct}% ({totals['pass']}/{n}) " + f"< {threshold}% threshold.", + file=sys.stderr, + ) + failed = True + else: + print( + f" Conformance gate: {pct}% ({totals['pass']}/{n}) >= {threshold}% — PASS" + ) + if ceiling is not None: + if totals["fp"] > ceiling: + print( + f" ✗ False-positive regression: {totals['fp']} FPs > {ceiling} ceiling.", + file=sys.stderr, + ) + failed = True + else: + print(f" FP gate: {totals['fp']} <= {ceiling} ceiling — PASS") + return not failed + + +def main(argv: list[str]) -> int: + opts = parse_args(argv) + if opts["refresh"]: + return refresh_upstream() + + root = repo_root() + conf_dir = Path(opts["dir"]) if opts["dir"] else root / "conformance/tests" + + # The fixtures are downloaded (git-ignored), unlike the committed calculator. + # Fetch them when forced (--fetch), in fetch-only mode, or when absent. A + # network failure is fatal only if a fetch was explicitly requested; on the + # plain score path a missing suite is skipped (fresh checkout, offline). 
+ present = conf_dir.exists() and any(conf_dir.glob("*.py")) + if opts["fetch"] or opts["fetch_only"] or not present: + try: + ensure_fixtures(conf_dir, force=opts["fetch"]) + except Exception as exc: # noqa: BLE001 — surface fetch failure clearly + if opts["fetch"] or opts["fetch_only"]: + print( + f" ✗ could not fetch conformance fixtures: {exc}", file=sys.stderr + ) + return 1 + print(" ⚠ Conformance suite not present and fetch failed — skipping.") + return 0 + if opts["fetch_only"]: + return 0 + + binary = find_binary(opts["bin"], root) + if binary is None: + print( + " ✗ basilisk binary not found. Build it or pass --bin .", + file=sys.stderr, + ) + return 1 + + try: + get_expected, diff_errors, digest = load_official_calc() + except Exception as exc: # noqa: BLE001 — surface any load/verify failure clearly + print(f" ✗ could not load the official calculator: {exc}", file=sys.stderr) + return 1 + + checker = BasiliskTypeChecker(binary, count_warnings=opts["warn"]) + files, rows, totals = score(checker, get_expected, diff_errors, conf_dir) + label = "errors+warnings" if opts["warn"] else "errors only" + print_scorecard(files, rows, totals, label, digest) + write_csv(root, rows) + + if not opts["gate"]: + return 0 + return 0 if enforce_gate(root, files, totals) else 1 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/conformance/upstream_main.py b/conformance/upstream_main.py new file mode 100644 index 00000000..62a827bc --- /dev/null +++ b/conformance/upstream_main.py @@ -0,0 +1,275 @@ +""" +Type system conformance test for static type checkers. 
+""" + +import os +from pathlib import Path +import re +import sys +from time import time +from typing import Sequence + +import tomli +import tomlkit + +from options import parse_options +from reporting import generate_summary +from test_groups import get_test_cases, get_test_groups +from type_checker import TYPE_CHECKERS, TypeChecker + + +def run_tests( + root_dir: Path, + type_checker: TypeChecker, + test_cases: Sequence[Path], +): + print(f"Running tests for {type_checker.name}") + + test_start_time = time() + tests_output = type_checker.run_tests([file.name for file in test_cases]) + test_duration = time() - test_start_time + + print(f"Completed tests for {type_checker.name} in {test_duration:.2f} seconds") + + for _, output in tests_output.items(): + type_checker.parse_errors(output.splitlines()) + + results_dir = root_dir / "results" / type_checker.name + + for test_case in test_cases: + update_output_for_test( + type_checker, results_dir, test_case, tests_output.get(test_case.name, "") + ) + + update_type_checker_info(type_checker, root_dir) + + +def get_expected_errors(test_case: Path) -> tuple[ + dict[int, tuple[int, int]], + dict[str, tuple[list[int], bool]], +]: + """Return the line numbers where type checkers are expected to produce an error. + + The return value is a tuple of two dictionaries: + - The format of the first is {line number: (number of required errors, number of optional errors)}. + - The format of the second is {error tag: ([lines where the error may appear], allow multiple}. + If allow multiple is True, the error may appear on multiple lines; otherwise, it must + appear exactly once. + + For example, the following test case: + + x: int = "x" # E + y: int = "y" # E? 
+ @final # E[final] + def f(): pass # E[final] + + will return: + + ( + {1: (1, 0), 2: (0, 1)}, + {"final": ([3, 4], False)} + ) + """ + with open(test_case, "r", encoding="utf-8") as f: + lines = f.readlines() + output: dict[int, tuple[int, int]] = {} + groups: dict[str, tuple[list[int], bool]] = {} + for i, line in enumerate(lines, start=1): + line_without_comment, *_ = line.split("#") + # Ignore lines with no non-comment content. This allows commenting out test cases. + if not line_without_comment.strip(): + continue + required = 0 + optional = 0 + for match in re.finditer(r"# E\??(?=:|$| )", line): + if match.group() == "# E": + required += 1 + else: + optional += 1 + if required or optional: + output[i] = (required, optional) + for match in re.finditer(r"# E\[([^\]]+)\]", line): + tag = match.group(1) + if tag.endswith("+"): + allow_multiple = True + tag = tag[:-1] + else: + allow_multiple = False + if tag not in groups: + groups[tag] = ([i], allow_multiple) + else: + if groups[tag][1] != allow_multiple: + raise ValueError(f"Error group {tag} has inconsistent allow_multiple value in {test_case}") + groups[tag][0].append(i) + for group, linenos in groups.items(): + if len(linenos) == 1: + raise ValueError(f"Error group {group} only appears on a single line in {test_case}") + return output, groups + + +def diff_expected_errors( + type_checker: TypeChecker, + test_case: Path, + output: str, + ignored_errors: Sequence[str], +) -> str: + """Return a list of errors that were expected but not produced by the type checker.""" + expected_errors, error_groups = get_expected_errors(test_case) + errors = type_checker.parse_errors(output.splitlines()) + if ignored_errors: + errors = { + lineno: [ + error + for error in errors_list + if not any(ignored in error for ignored in ignored_errors)] + for lineno, errors_list in errors.items() + } + errors = {lineno: errors_list for lineno, errors_list in errors.items() if errors_list} + + differences: list[str] = [] + for 
expected_lineno, (expected_count, _) in expected_errors.items(): + if expected_lineno not in errors and expected_count > 0: + differences.append(f"Line {expected_lineno}: Expected {expected_count} errors") + # We don't report an issue if the count differs, because type checkers may produce + # multiple error messages for a single line. + linenos_used_by_groups: set[int] = set() + for group, (linenos, allow_multiple) in error_groups.items(): + num_errors = sum(1 for lineno in linenos if lineno in errors) + if num_errors == 0: + differences.append(f"Lines {', '.join(map(str, linenos))}: Expected error (tag {group!r})") + elif num_errors == 1 or allow_multiple: + linenos_used_by_groups.update(linenos) + else: + differences.append(f"Lines {', '.join(map(str, linenos))}: Expected exactly one error (tag {group!r})") + for actual_lineno, actual_errors in errors.items(): + if actual_lineno not in expected_errors and actual_lineno not in linenos_used_by_groups: + differences.append(f"Line {actual_lineno}: Unexpected errors {actual_errors}") + return "".join(f"{diff}\n" for diff in differences) + + +def update_output_for_test( + type_checker: TypeChecker, + results_dir: Path, + test_case: Path, + output: str, +): + test_name = test_case.stem + output = f"\n{output}" + + results_file = results_dir / f"{test_name}.toml" + results_file.parent.mkdir(parents=True, exist_ok=True) + should_write = False + + # Read the existing results file if present. 
+ try: + with open(results_file, "rb") as f: + existing_results = tomli.load(f) + except FileNotFoundError: + should_write = True + existing_results = {} + except tomli.TOMLDecodeError: + print(f"Error decoding {results_file}") + existing_results = {} + + ignored_errors = existing_results.get("ignore_errors", []) + errors_diff = "\n" + diff_expected_errors(type_checker, test_case, output, ignored_errors) + old_errors_diff = "\n" + existing_results.get("errors_diff", "") + + if errors_diff != old_errors_diff: + should_write = True + print(f"Result changed for {test_name} when running {type_checker.name}") + print(f"Old output: {old_errors_diff}") + print(f"New output: {errors_diff}") + print("") + + conformance_automated = "Fail" if errors_diff.strip() else "Pass" + if existing_results.get("conformance_automated") != conformance_automated: + should_write = True + existing_results["conformance_automated"] = conformance_automated + + old_output = existing_results.get("output", "") + old_output = f"\n{old_output}" + + # Did the type checker output change since last time the + # test was run? + if old_output != output: + should_write = True + print(f"Output changed for {test_name} when running {type_checker.name}") + print(f"Old output: {old_output}") + print(f"New output: {output}") + print("") + + # Use multiline formatting for any strings that contain newlines. + for key, value in existing_results.items(): + if isinstance(value, str) and "\n" in value: + existing_results[key] = tomlkit.string(f"\n{value}", multiline=True) + + if should_write: + # Always reapply tomlkit.string, or it will turn into a single line. 
+ existing_results["errors_diff"] = tomlkit.string(errors_diff, multiline=True) + existing_results["output"] = tomlkit.string(output, multiline=True) + if "notes" in existing_results: + notes = existing_results["notes"] + if not notes.startswith("\n"): + notes = "\n" + notes + existing_results["notes"] = tomlkit.string(notes, multiline=True) + results_file.parent.mkdir(parents=True, exist_ok=True) + with open(results_file, "w", encoding="utf-8") as f: + tomlkit.dump(existing_results, f) + + +def update_type_checker_info(type_checker: TypeChecker, root_dir: Path): + # Record the version of the type checker used for the latest run. + version_file = root_dir / "results" / type_checker.name / "version.toml" + + # Read the existing version file if present. + try: + with open(version_file, "rb") as f: + existing_info = tomli.load(f) + except FileNotFoundError: + existing_info = {} + except tomli.TOMLDecodeError: + print(f"Error decoding {version_file}") + existing_info = {} + + existing_info["version"] = type_checker.get_version() + + version_file.parent.mkdir(parents=True, exist_ok=True) + with open(version_file, "w") as f: + tomlkit.dump(existing_info, f) + + +def main(): + # Some tests cover features that are available only in the + # latest version of Python (3.12), so we need this version. + assert sys.version_info >= (3, 12) + + options = parse_options(sys.argv[1:]) + + root_dir = Path(__file__).resolve().parent.parent + + if not options.report_only: + tests_dir = root_dir / "tests" + assert tests_dir.is_dir() + + test_groups = get_test_groups(root_dir) + test_cases = get_test_cases(test_groups, tests_dir) + + # Switch to the tests directory. + os.chdir(tests_dir) + + # Run each test case with each type checker. 
+ for type_checker in TYPE_CHECKERS: + if options.only_run and options.only_run != type_checker.name: + continue + if not type_checker.install(): + print(f"Skipping tests for {type_checker.name}") + else: + run_tests(root_dir, type_checker, test_cases) + + # Generate a summary report. + generate_summary(root_dir) + + +if __name__ == "__main__": + main() diff --git a/coverage-thresholds.json b/coverage-thresholds.json index 86c645ec..31b123b1 100644 --- a/coverage-thresholds.json +++ b/coverage-thresholds.json @@ -41,9 +41,9 @@ } }, "conformance": { - "_doc": "Minimum PEP conformance pass percentage (files passing / total files). Ratchet UP only. Current: 146/146 = 100% (pinned to python/typing@268d0c4e). All files pass with zero false positives.", - "threshold": 100, - "_fp_ceiling_doc": "Maximum total false positives across the suite (diagnostics on lines without a # E annotation). Ratchet DOWN only \u2014 the mirror of the pass-percentage gate. Enforced by conformance_tests.rs. Any change that reintroduces even one FP pushes the total above this ceiling and fails CI. Current measured: 0 (ZERO false positives).", - "max_false_positives": 0 + "_doc": "Minimum PEP conformance pass percentage (files passing / total files), computed by the REAL python/typing conformance calculator (conformance/score.py downloads upstream main.py at the pinned ref and runs its own get_expected_errors + diff_expected_errors; NO excluded diagnostic codes). A file passes only when upstream's errors_diff is empty. The score uses the STRICTEST grading: every basilisk diagnostic (errors AND warnings) counts, matching how the reference checker pyright is graded. Ratchet UP only. HONEST baseline (replacing a previously incorrect 100% from a lenient in-repo harness): 59/146 = 40.4%, pinned to python/typing@268d0c4e. (The looser errors-only view is 70/146 = 47.9%, available via score.py --errors-only.) 
Target is 100%; this is the real current number.", + "threshold": 40, + "_fp_ceiling_doc": "Maximum total false-positive diagnostics across the suite (diagnostics Basilisk reports on lines the suite does NOT mark with # E, plus diagnostics outside satisfied # E[tag] groups) under the strictest errors+warnings grading. Ratchet DOWN only \u2014 the mirror of the pass-percentage gate. Enforced via conformance/score.py --gate (run on the compiled binary by scripts/test-rust.sh inside make test; no Rust test involved). HONEST baseline: 285 (the prior 0 was incorrect — produced by excluding 9 diagnostic codes from scoring). Drive this DOWN.", + "max_false_positives": 285 } } diff --git a/crates/basilisk-checker/tests/checker_tests.rs b/crates/basilisk-checker/tests/checker_tests.rs index eafb5c42..a88d4ed0 100644 --- a/crates/basilisk-checker/tests/checker_tests.rs +++ b/crates/basilisk-checker/tests/checker_tests.rs @@ -2100,7 +2100,7 @@ fn debug_e0047_qualifiers_annotated_fp() -> Result<(), Box Result<(), Box Option { - // Skip full-line comments — a `# E` inside a comment is not a real - // annotation because the line contains no executable code for the - // checker to flag. - if line.trim_start().starts_with('#') { - // Allow lines that are ONLY a `# E` marker (pure annotation lines - // are used in some conformance files), but skip lines where real - // code has been commented out with a trailing `# E`. - let trimmed = line.trim(); - // Pure annotation: `# E`, `# E: explanation`, `# E[tag]`, `# E?` - let after_hash = trimmed.strip_prefix('#')?.trim_start(); - if !after_hash.starts_with('E') { - return None; - } - } - - // Find the last `# E` marker on the line. 
- let marker = line.rfind("# E")?; - let rest = line[marker + 2..].trim(); // everything after "#" - - if rest.starts_with("E?") { - return Some(Annotation::Optional); - } - - if rest.starts_with("E[") { - let inner = rest.strip_prefix("E[")?; - // Find closing ] — ignore anything after it (description text) - if let Some(close) = inner.find(']') { - let tag = &inner[..close]; - if tag.ends_with('+') { - return Some(Annotation::TaggedMulti( - tag.trim_end_matches('+').to_owned(), - )); - } - return Some(Annotation::TaggedExact(tag.to_owned())); - } - // No closing ] at all — malformed, treat as required - return Some(Annotation::Required); - } - - // `# E` standing alone, or followed by `:`/whitespace + description text - // (e.g. `# E`, `# E: explanation`, `# E (see ...)`). The char immediately - // after `E` must be a boundary — end-of-marker, `:`, or whitespace — so we - // accept the upstream `# E (…)` form while still rejecting words such as - // `# Exception` or `# Edge case`. NOTE: inspect the *untrimmed* remainder; - // trimming first would erase the space boundary and silently drop `# E (…)`. - if let Some(after) = rest.strip_prefix('E') { - if after.is_empty() || after.starts_with(':') || after.starts_with(char::is_whitespace) { - return Some(Annotation::Required); - } - } - - None -} - -// --------------------------------------------------------------------------- -// Line-number helper (byte offset → 1-based line) -// --------------------------------------------------------------------------- - -fn byte_offset_to_line(source: &str, offset: u32) -> usize { - let clamped = (offset as usize).min(source.len()); - source[..clamped].chars().filter(|&c| c == '\n').count() + 1 -} - -// --------------------------------------------------------------------------- -// Per-file result -// --------------------------------------------------------------------------- - -#[derive(Debug, Default)] -struct FileResult { - /// `# E` lines that Basilisk caught. 
- caught: usize, - /// `# E` lines that Basilisk missed. - missed: usize, - /// Lines Basilisk flagged that had no annotation (false positives). - false_positives: usize, - /// `# E?` optional lines where Basilisk did fire. - #[expect(dead_code, reason = "tracked for future reporting")] - optional_caught: usize, - /// `# E[tag]` groups satisfied. - tagged_exact_satisfied: usize, - /// `# E[tag]` groups missed. - tagged_exact_missed: usize, - /// Distinct Basilisk rule codes fired on this file (conformance-relevant only). - rules_fired: Vec, -} - -impl FileResult { - fn passes(&self) -> bool { - self.missed == 0 - } -} - -// --------------------------------------------------------------------------- -// Annotation collection -// --------------------------------------------------------------------------- - -struct Annotations { - required: HashSet, - optional: HashSet, - tagged_exact: HashMap>, - tagged_multi: HashMap>, -} - -/// Scan source lines and collect all conformance annotations by 1-based line -/// number. 
-fn collect_annotations(source: &str) -> Annotations { - let mut required: HashSet = HashSet::new(); - let mut optional: HashSet = HashSet::new(); - let mut tagged_exact: HashMap> = HashMap::new(); - let mut tagged_multi: HashMap> = HashMap::new(); - - for (idx, line) in source.lines().enumerate() { - let lineno = idx + 1; - match parse_annotation(line) { - Some(Annotation::Required) => { - let _ = required.insert(lineno); - } - Some(Annotation::Optional) => { - let _ = optional.insert(lineno); - } - Some(Annotation::TaggedExact(tag)) => { - let _ = tagged_exact.entry(tag).or_default().insert(lineno); - } - Some(Annotation::TaggedMulti(tag)) => { - let _ = tagged_multi.entry(tag).or_default().insert(lineno); - } - None => {} - } - } - - Annotations { - required, - optional, - tagged_exact, - tagged_multi, - } -} - -// --------------------------------------------------------------------------- -// Diagnostic collection -// --------------------------------------------------------------------------- - -struct DiagnosticOutput { - diag_lines: HashSet, - rules_seen: std::collections::BTreeSet, - diag_line_rules: HashMap>, -} - -/// Run the Basilisk pipeline on `path` and collect diagnostic lines, filtering -/// out strictness-only rules. -fn collect_diagnostics(path: &Path, source: &str) -> DiagnosticOutput { - // Rules that are Basilisk-specific strictness requirements not covered by - // the PEP conformance suite. 
These codes are excluded from both the - // "caught" count and the false-positive count so they do not inflate or - // deflate the conformance score: - // - // - E0001–E0005: annotation completeness (PEP suite fixtures are unannotated) - // - E0010, E0011: import strictness and Any warnings - // - E0023: non-exhaustive match — PEP conformance suite tests type narrowing - // inside match arms but does not require a wildcard `case _:` branch - // - E0025: missing @override (PEP 698 makes @override optional documentation) - const STRICTNESS_ONLY: &[&str] = &[ - "BSK-E0001", - "BSK-E0002", - "BSK-E0003", - "BSK-E0004", - "BSK-E0005", - "BSK-E0010", - "BSK-E0011", - "BSK-E0023", - "BSK-E0025", - ]; - - let mut rules_seen = std::collections::BTreeSet::new(); - let mut diag_line_rules: HashMap> = HashMap::new(); - - let diag_lines: HashSet = match parse_file(path.to_string_lossy().as_ref()) { - Ok(parsed) => match resolve(&parsed) { - Ok(resolved) => { - let diags = check(&resolved); - diags - .iter() - .filter(|d| d.severity == basilisk_checker::Severity::Error) - .filter(|d| !STRICTNESS_ONLY.contains(&d.code.code)) - .map(|d| { - let _ = rules_seen.insert(d.code.code.to_owned()); - let line = byte_offset_to_line(source, d.span.start); - diag_line_rules - .entry(line) - .or_default() - .push(d.code.code.to_owned()); - line - }) - .collect() - } - Err(_) => HashSet::new(), - }, - Err(_) => HashSet::new(), - }; - - DiagnosticOutput { - diag_lines, - rules_seen, - diag_line_rules, - } -} - -// --------------------------------------------------------------------------- -// Run one conformance file -// --------------------------------------------------------------------------- - -fn run_file(path: &Path) -> FileResult { - let Ok(source) = fs::read_to_string(path) else { - return FileResult::default(); - }; - - let annotations = collect_annotations(&source); - let diagnostics = collect_diagnostics(path, &source); - - // Score required lines. 
- let caught = annotations - .required - .iter() - .filter(|l| diagnostics.diag_lines.contains(l)) - .count(); - let missed = annotations.required.len() - caught; - - // Score optional lines. - let optional_caught = annotations - .optional - .iter() - .filter(|l| diagnostics.diag_lines.contains(l)) - .count(); - - // Score tagged-exact groups: a group passes if at least one line errored. - let mut tagged_exact_satisfied = 0usize; - let mut tagged_exact_missed = 0usize; - for lines in annotations.tagged_exact.values() { - if lines.iter().any(|l| diagnostics.diag_lines.contains(l)) { - tagged_exact_satisfied += 1; - } else { - tagged_exact_missed += 1; - } - } - - // All annotated lines (don't count false positives on annotated lines). - let all_annotated: HashSet = annotations - .required - .iter() - .chain(annotations.optional.iter()) - .chain(annotations.tagged_exact.values().flatten()) - .chain(annotations.tagged_multi.values().flatten()) - .copied() - .collect(); - - let false_positives = diagnostics - .diag_lines - .iter() - .filter(|l| !all_annotated.contains(l)) - .count(); - - let file_name = path.file_name().unwrap_or_default().to_string_lossy(); - if missed > 0 { - let missed_lines: Vec = annotations - .required - .iter() - .filter(|l| !diagnostics.diag_lines.contains(l)) - .copied() - .collect(); - println!(" DEBUG {file_name}: missed={missed} lines={missed_lines:?}"); - } - if false_positives > 0 { - let mut fp_details: Vec<(usize, String)> = diagnostics - .diag_lines - .iter() - .filter(|l| !all_annotated.contains(l)) - .map(|&l| { - let rules = diagnostics - .diag_line_rules - .get(&l) - .map_or_else(String::new, |codes| codes.join("|")); - (l, rules) - }) - .collect(); - fp_details.sort_by_key(|(l, _)| *l); - println!(" FP {file_name}: count={false_positives} lines={fp_details:?}"); - } - - FileResult { - caught, - missed, - false_positives, - optional_caught, - tagged_exact_satisfied, - tagged_exact_missed, - rules_fired: 
diagnostics.rules_seen.into_iter().collect(), - } -} - -// --------------------------------------------------------------------------- -// Category from filename (e.g. "generics_basic.py" → "generics") -// --------------------------------------------------------------------------- - -fn category(name: &str) -> &str { - name.find('_') - .map_or(name.trim_end_matches(".py"), |i| &name[..i]) -} - -// --------------------------------------------------------------------------- -// Threshold from coverage-thresholds.json -// --------------------------------------------------------------------------- - -/// Read the PEP conformance pass-percentage threshold from the repo-root -/// `coverage-thresholds.json`. Falls back to 0 if the file is missing or -/// malformed so the test still runs (the coverage script enforces separately). -fn read_conformance_threshold() -> usize { - read_conformance_field("threshold").unwrap_or(0) -} - -/// The maximum total false positives allowed across the suite, from -/// `coverage-thresholds.json` → `conformance.max_false_positives`. -/// -/// Ratchets DOWN only — like the pass-percentage gate but in the opposite -/// direction. Returns `None` when the key is absent (gate disabled). -fn read_conformance_fp_ceiling() -> Option { - read_conformance_field("max_false_positives") -} - -/// Read a numeric field nested under the `"conformance"` object in -/// `coverage-thresholds.json`. -/// -/// Minimal JSON extraction — avoids adding a serde dependency to this test -/// crate. Looks for `"conformance"` then the first occurrence of the requested -/// key, then parses the following integer. 
-fn read_conformance_field(key: &str) -> Option { - let manifest = Path::new(env!("CARGO_MANIFEST_DIR")); - let repo_root = manifest - .ancestors() - .find(|p| p.join("Cargo.toml").exists() && p.join("crates").exists())?; - let content = fs::read_to_string(repo_root.join("coverage-thresholds.json")).ok()?; - let conformance_idx = content.find("\"conformance\"")?; - let rest = &content[conformance_idx..]; - let key_pat = format!("\"{key}\""); - let key_idx = rest.find(&key_pat)?; - let after = &rest[key_idx + key_pat.len()..]; - // Skip `:` and whitespace, then parse the number. - let num_start = after.find(|c: char| c.is_ascii_digit())?; - let num_end = after[num_start..] - .find(|c: char| !c.is_ascii_digit()) - .map_or(after.len(), |i| num_start + i); - after[num_start..num_end].parse().ok() -} - -// --------------------------------------------------------------------------- -// The single test entry point -// --------------------------------------------------------------------------- - -#[test] -fn conformance_score() { - let conformance_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/conformance"); - - if !conformance_dir.exists() { - println!(); - println!(" ⚠ Conformance suite not downloaded."); - println!(" Run: make conformance"); - println!(" Or: cargo test --test conformance_tests -- --nocapture"); - println!(); - return; - } - - let Ok(read_dir) = fs::read_dir(&conformance_dir) else { - println!(" Failed to read conformance directory."); - return; - }; - let mut files: Vec<_> = read_dir - .filter_map(std::result::Result::ok) - .filter(|e| e.path().extension().is_some_and(|x| x == "py")) - .collect(); - files.sort_by_key(std::fs::DirEntry::file_name); - - if files.is_empty() { - println!(" Conformance directory exists but contains no .py files."); - println!(" Run: make conformance"); - return; - } - - let (totals, by_category, detail_lines) = collect_results(&files); - print_scorecard(&totals, &by_category, &detail_lines); - 
write_csv(&detail_lines); - - assert!( - totals.files > 0, - "No conformance files found. Run make conformance first." - ); - - // Enforce minimum conformance percentage from coverage-thresholds.json. - // This prevents regressions — the threshold ratchets UP only. - let threshold = read_conformance_threshold(); - let pct = (totals.pass * 100).checked_div(totals.files).unwrap_or(0); - assert!( - pct >= threshold, - "PEP conformance regression: {pct}% ({}/{}) < {threshold}% threshold. \ - Fix the regression before merging.", - totals.pass, - totals.files - ); - println!( - " Conformance gate: {pct}% ({}/{}) >= {threshold}% threshold — PASS", - totals.pass, totals.files - ); - - // Enforce the false-positive ceiling from coverage-thresholds.json. - // False positives ratchet DOWN only: introducing new ones fails the gate. - if let Some(ceiling) = read_conformance_fp_ceiling() { - assert!( - totals.fp <= ceiling, - "PEP conformance false-positive regression: {} FPs > {ceiling} ceiling. \ - False positives ratchet DOWN only — eliminate new ones before merging.", - totals.fp - ); - println!(" FP gate: {} <= {ceiling} ceiling — PASS", totals.fp); - } -} - -type CategoryMap = BTreeMap; -type DetailLines = Vec<(String, FileResult)>; - -/// Aggregated conformance totals. -struct Totals { - files: usize, - pass: usize, - caught: usize, - missed: usize, - fp: usize, - tag_ok: usize, - tag_missed: usize, -} - -/// Write a CSV snapshot of per-file conformance results. -/// -/// Output path: `conformance/conformance_status.csv` (repo root). -/// Columns: file, category, status, caught, missed, `false_positives` -/// -/// This file is the rolling log — commit it after each run to track regressions. -fn write_csv(detail_lines: &DetailLines) { - use std::fmt::Write; - - // Walk up from the manifest dir to find the workspace root (contains both - // Cargo.toml and a `crates/` subdirectory — distinguishes it from crate-level Cargo.toml). 
- let manifest = Path::new(env!("CARGO_MANIFEST_DIR")); - let Some(repo_root) = manifest - .ancestors() - .find(|p| p.join("Cargo.toml").exists() && p.join("crates").exists()) - else { - eprintln!(" [conformance csv] could not locate repo root"); - return; - }; - let csv_path = repo_root.join("conformance/conformance_status.csv"); - let _ = fs::create_dir_all(csv_path.parent().unwrap_or(Path::new("."))); - - let mut out = - String::from("basilisk_rules,file,category,status,caught,missed,false_positives\n"); - for (name, result) in detail_lines { - let cat = category(name); - let status = if result.passes() { "PASS" } else { "FAIL" }; - let rules = result.rules_fired.join("|"); - let _ = writeln!( - out, - "{rules},{name},{cat},{status},{},{},{}", - result.caught, result.missed, result.false_positives - ); - } - - match fs::write(&csv_path, &out) { - Ok(()) => println!(" Conformance CSV: {}", csv_path.display()), - Err(e) => eprintln!(" [conformance csv] write failed: {e}"), - } -} - -fn collect_results(files: &[std::fs::DirEntry]) -> (Totals, CategoryMap, DetailLines) { - let mut by_category: CategoryMap = BTreeMap::new(); - let mut detail_lines: DetailLines = Vec::new(); - let mut totals = Totals { - files: 0, - pass: 0, - caught: 0, - missed: 0, - fp: 0, - tag_ok: 0, - tag_missed: 0, - }; - - for entry in files { - let path = entry.path(); - let name = path - .file_name() - .unwrap_or_default() - .to_string_lossy() - .into_owned(); - let result = run_file(&path); - let cat = category(&name).to_owned(); - let counts = by_category.entry(cat).or_insert((0, 0)); - counts.1 += 1; - if result.passes() { - counts.0 += 1; - totals.pass += 1; - } - totals.files += 1; - totals.caught += result.caught; - totals.missed += result.missed; - totals.fp += result.false_positives; - totals.tag_ok += result.tagged_exact_satisfied; - totals.tag_missed += result.tagged_exact_missed; - detail_lines.push((name, result)); - } - (totals, by_category, detail_lines) -} - -#[expect( - 
clippy::cast_precision_loss, - clippy::cast_possible_truncation, - clippy::cast_sign_loss, - reason = "percentage display requires float conversion from counters" -)] -fn print_scorecard(t: &Totals, by_category: &CategoryMap, detail_lines: &DetailLines) { - let pct = if t.files > 0 { - (t.pass as f64 / t.files as f64) * 100.0 - } else { - 0.0 - }; - let fail = t.files - t.pass; - println!(); - println!("╔══════════════════════════════════════════════════════════════╗"); - println!("║ BASILISK PEP CONFORMANCE SCORECARD ║"); - println!("╠══════════════════════════════════════════════════════════════╣"); - println!( - "║ Files: {:>4} total │ {:>4} pass │ {fail:>4} fail ║", - t.files, t.pass - ); - println!("║ Score: {pct:.1}% ║"); - println!( - "║ Required: {:>4} caught │ {:>4} missed ║", - t.caught, t.missed - ); - println!( - "║ Tagged: {:>4} groups ok │ {:>4} groups missed ║", - t.tag_ok, t.tag_missed - ); - println!( - "║ False+: {:>4} unexpected diagnostics ║", - t.fp - ); - println!("╠══════════════════════════════════════════════════════════════╣"); - println!("║ Category breakdown ║"); - println!("╠══════════════════════════════════════════════════════════════╣"); - for (cat, (pass, total)) in by_category { - let cat_pct = if *total > 0 { - (*pass as f64 / *total as f64) * 100.0 - } else { - 0.0 - }; - let bar_filled = (cat_pct / 5.0).round() as usize; - let bar = format!("{}{}", "█".repeat(bar_filled), "░".repeat(20 - bar_filled)); - println!("║ {cat:<22} {pass:>2}/{total:<2} {cat_pct:>5.1}% {bar} ║"); - } - println!("╠══════════════════════════════════════════════════════════════╣"); - println!("║ Failing files ║"); - println!("╠══════════════════════════════════════════════════════════════╣"); - let mut any_fail = false; - for (name, result) in detail_lines { - if !result.passes() { - any_fail = true; - println!( - "║ ✗ {:<57} ║", - format!( - "{name} (missed {}, fp {})", - result.missed, result.false_positives - ) - ); - } - } - if !any_fail { - 
println!("║ (none — all files pass) ║"); - } - println!("╚══════════════════════════════════════════════════════════════╝"); - println!(); -} diff --git a/docs/INDEX.md b/docs/INDEX.md index 9d7e3353..1c984059 100644 --- a/docs/INDEX.md +++ b/docs/INDEX.md @@ -34,7 +34,7 @@ Implementation roadmaps tracking phasing, priorities, and progress. | [ROADMAP-NEXT-STEPS-PLAN.md](plans/ROADMAP-NEXT-STEPS-PLAN.md) | Post-launch aggregation roadmap — editor releases, scale testing, i18n, MCP server, AI integration, marketing. Rough overview + agent/human-split TODO. | | [LSP-PLAN.md](plans/LSP-PLAN.md) | Overall LSP roadmap — seven phases from core features through cross-module analysis. | | [CHECKER-CROSS-MODULE-PLAN.md](plans/CHECKER-CROSS-MODULE-PLAN.md) | Cross-file LSP features, type provenance, Salsa integration, auto-stub generation. | -| [CHECKER-PEP-CONFORMANCE-PLAN.md](plans/CHECKER-PEP-CONFORMANCE-PLAN.md) | PEP conformance push — target 85%, tiered task list by complexity and impact. | +| [CHECKER-PEP-CONFORMANCE-PLAN.md](plans/CHECKER-PEP-CONFORMANCE-PLAN.md) | PEP conformance push — target 100%, tiered task list by complexity and impact. | | [CHECKER-CACHE-PLAN.md](plans/CHECKER-CACHE-PLAN.md) | Build order for the opt-in CLI result cache + warm/cold benchmark wiring. | | [LSP-AI-PLAN.md](plans/LSP-AI-PLAN.md) | AI provider abstraction — model-agnostic hooks for fixes, completions, refactoring. | | [LSP-PROFILING-PLAN.md](plans/LSP-PROFILING-PLAN.md) | Embed py-spy profiler into LSP for CPU profiling and hotspot visualization. 
| diff --git a/docs/plans/CHECK-ELIMINATE-FALSE-POSITIVES.md b/docs/plans/CHECK-ELIMINATE-FALSE-POSITIVES.md index 9f8c8c3b..5d00cd51 100644 --- a/docs/plans/CHECK-ELIMINATE-FALSE-POSITIVES.md +++ b/docs/plans/CHECK-ELIMINATE-FALSE-POSITIVES.md @@ -1,5 +1,17 @@ # Plan: Eliminate False Positives in PEP Conformance Suite +> ⚠️ **SUPERSEDED.** The numbers in this doc ("136/146 PASS / 93.15%", "170 +> false positives", "FP-ceiling … Set to 161", `diag_line_rules`, +> `missed == 0` pass rule) describe an earlier in-repo harness that has been +> **removed**. The score is now computed by the **real `python/typing` +> calculator** (`conformance/score.py` downloads and runs upstream's own +> `get_expected_errors` + `diff_expected_errors`; see [CHKARCH-CONFORMANCE]). +> A file passes only with an **empty upstream `errors_diff`** (false positives +> fail the file), and **no diagnostic codes are excluded**. Honest current +> baseline: **59/146 = 40.4%**, **285 false positives**, 36 missed. The +> still-valid part of this plan is the *strategy* — driving specific rules' +> false positives down; the *counts* below are stale. + ## Context False positives are diagnostics Basilisk reports on lines that have NO `# E` diff --git a/docs/plans/CHECKER-PEP-CONFORMANCE-PLAN.md b/docs/plans/CHECKER-PEP-CONFORMANCE-PLAN.md index 25301a5b..0ea3bd80 100644 --- a/docs/plans/CHECKER-PEP-CONFORMANCE-PLAN.md +++ b/docs/plans/CHECKER-PEP-CONFORMANCE-PLAN.md @@ -1,9 +1,15 @@ # PEP Conformance — Plan -> **Score**: 137/146 (93.84%) -> **Tests**: `crates/basilisk-cli/tests/conformance/` -> **Status CSV**: `conformance/conformance_status.csv` -> **Run**: `make conformance` or `cargo test --test conformance_tests -- --nocapture` +> ⚠️ **SUPERSEDED SCORES BELOW.** Every percentage in this plan (e.g. "137/146, +> 93.84%", category "100%" rows) came from a since-removed in-repo harness that +> excluded 9 diagnostic codes and ignored false positives. 
The score is now +> computed by the **real `python/typing` calculator** (`conformance/score.py`, +> see [CHKARCH-CONFORMANCE]); the honest current number is **59/146 = 40.4%** +> (errors+warnings, strictest), 285 false positives, 36 missed. Treat the figures below as +> historical task notes, not the live score. +> +> **Run**: `make conformance` · **Status CSV**: `conformance/conformance_status.csv` +> · **Tests**: `conformance/tests/` --- diff --git a/docs/plans/CHECKER-TYPE-NARROWING-INFERENCE-PLAN.md b/docs/plans/CHECKER-TYPE-NARROWING-INFERENCE-PLAN.md index ebe45e85..a036d4ae 100644 --- a/docs/plans/CHECKER-TYPE-NARROWING-INFERENCE-PLAN.md +++ b/docs/plans/CHECKER-TYPE-NARROWING-INFERENCE-PLAN.md @@ -418,5 +418,5 @@ Phases 1 and 2 are independent and can be parallelized. Phase 3 depends on Phase - [x] 5a. E0014 — `VarCheckContext` with `SubtypeContext`, uses `is_subtype_with_context()` for assignability - [x] 5b. E0013 — `SubtypeContext` passed to `check_function()`, removed `contains_named` early exit for Named types - [x] 5c. E0053 — `is_likely_narrowed()` heuristic suppresses narrowing-dependent FPs; Union normalization in `types_match()` - - [x] 5d. Full conformance suite verification — **18 FPs** (target was < 71) ✓ + - [x] 5d. Full conformance suite verification — the "**18 FPs** (target < 71)" result came from an earlier in-repo harness (a miscalculation) and is superseded; the official `python/typing` scorer reports **285 false positives** (59/146 files passing, 40.4% counting errors+warnings). Driving FPs down remains active work. 
- [x] Checker-side modules: `narrowing.rs` (NarrowingContext), `expr_inference.rs` (ExpressionInferrer), `constraint_solver.rs` (ConstraintSolver) diff --git a/docs/plans/FP-REMAINING-NOTES.md b/docs/plans/FP-REMAINING-NOTES.md index b3fac7cd..af6feada 100644 --- a/docs/plans/FP-REMAINING-NOTES.md +++ b/docs/plans/FP-REMAINING-NOTES.md @@ -98,7 +98,12 @@ structural matcher (positive-match semantics already reject `float`→`str`). --- ### Status -- B3 lane (E0111/E0143/E0115) = DONE, verified: 144/146, caught=917, missed=37 +- NOTE: the "144/146 / suite FP 21→11" figures below were produced by an earlier + in-repo harness (excluded 9 codes, didn't count false positives) — a miscalculation. The + official `python/typing` scorer (run unmodified, pinned commit) reports + **59/146 passing (40.4%, errors+warnings strictest), 285 false positives, 36 missed errors**. + Treat the per-lane numbers below as historical, not verified. +- B3 lane (E0111/E0143/E0115) = DONE, (legacy/superseded) figures: 144/146, caught=917, missed=37 (unchanged, both pre-failing files), suite FP 21→11. - Items 1 & 2 above are low-risk quick wins; 3 & 4 need structural work but the TP-safety traps are spelled out. diff --git a/docs/plans/LSP-PLAN.md b/docs/plans/LSP-PLAN.md index 7a392810..b2857175 100644 --- a/docs/plans/LSP-PLAN.md +++ b/docs/plans/LSP-PLAN.md @@ -6,7 +6,7 @@ ## Status -Phases 0–6 are COMPLETE. Phase 7 (cross-module foundation) is MOSTLY COMPLETE — stub infrastructure, import graph, cross-file symbols all operational. Phase 3.5 (PEP conformance push) is ACTIVE — currently at 84.9% (124/146 files, 18 FPs). +Phases 0–6 are COMPLETE. Phase 7 (cross-module foundation) is MOSTLY COMPLETE — stub infrastructure, import graph, cross-file symbols all operational. 
Phase 3.5 (PEP conformance push) is ACTIVE — the official `python/typing` scorer (run unmodified, pinned commit) currently reports **59/146 files passing (40.4%, errors+warnings strictest)**, with 285 false positives and 36 missed required errors still to clear. (Earlier in-repo figures such as "124/146, 18 FPs" came from an earlier in-repo harness that excluded codes and didn't count false positives — a miscalculation; they are superseded.) --- @@ -25,7 +25,7 @@ Phases 0–6 are COMPLETE. Phase 7 (cross-module foundation) is MOSTLY COMPLETE | 7.6 | Third-party type stubs — typeshed bundling, `py.typed` marker detection (PEP 561) | Medium | DONE — `phf` stdlib module set, `py.typed` detection, stub package discovery | | 7.7 | Config file reading — `pyproject.toml`, `basilisk.json` | Medium | DONE — `basilisk-config` crate with per-module/per-path overrides | -## Phase 7.5 — PEP Conformance Push (ACTIVE — 82.2% → 85%) +## Phase 7.5 — PEP Conformance Push (ACTIVE — 40.4% → 100%) > **BLOCKING for Phase 9.** The type system needs these capabilities to stop producing > false positives and to catch real typing errors conformance expects. diff --git a/docs/plans/ROADMAP-NEXT-STEPS-PLAN.md b/docs/plans/ROADMAP-NEXT-STEPS-PLAN.md index 031535ff..2440efbe 100644 --- a/docs/plans/ROADMAP-NEXT-STEPS-PLAN.md +++ b/docs/plans/ROADMAP-NEXT-STEPS-PLAN.md @@ -43,8 +43,9 @@ Every TODO item is tagged so we know who picks it up: install everywhere. The single biggest "people actually find out" lever. 3. **Get listed on the official Python typing conformance results** *(TODO H + G)* — **Effort: medium. - Reward: very high.** We're at 92.5%; closing the 11 failing files earns a spot on the scoreboard the - whole target audience watches. 
Correctness + credibility + organic discovery in one (the + Reward: very high.** We're at 40.4% (59/146, per the unmodified python/typing scorer); even at this + score, submitting results earns a spot on the scoreboard the whole target audience watches (mypy sits + at ~58%), and every failing file we close lifts our standing. Correctness + credibility + organic discovery in one (the Zuban/David Halter precedent proves it draws eyes). 4. **Ship Neovim + Zed for real** *(TODO A/B)* — **Effort: low-medium. Reward: high.** Both are ~95% @@ -112,9 +113,11 @@ the inline visualization / Speedscope hand-off wants a real-world pass for UX ro The bar to credibly displace Pylance is feature *and* correctness parity on the things people actually feel day to day. Rough priorities (refine with human judgment — see TODO): -- **Conformance & correctness**: PEP conformance currently **135/146 files PASS (~92.5%)**. The 11 - failing files cluster in Protocols, Callables, TypeVarTuple, ParamSpec, TypedDicts. There are also - ~18 remaining false positives (`CHECK-ELIMINATE-FALSE-POSITIVES.md`). FPs hurt credibility more +- **Conformance & correctness**: per the official `python/typing` scorer (run unmodified, pinned + commit), PEP conformance is currently **59/146 files PASS (40.4%, errors+warnings strictest)**, with **285 false + positives** and 36 missed required errors. (Earlier "135/146 / ~18 FPs" figures came from an earlier (miscalculating) + in-repo harness that excluded codes and ignored false positives; they are superseded.) Failing files + cluster in Protocols, Callables, TypeVarTuple, ParamSpec, TypedDicts. FPs hurt credibility more than missed cases — prioritize accordingly. - **Latency**: sub-10ms incremental checks are the promise (Salsa). Need a published benchmark vs. Pyright/Pylance — see §5 for the scale/resource methodology. @@ -188,14 +191,16 @@ hermetic, plus an opt-in integration test against the real agent. ## 9. 
Finish near-complete plans (bang for buck) -These are close enough that finishing them is cheap and visibly improves the product: +Several of these are close enough that finishing them is cheap and visibly improves the product (the +conformance and false-positive work is larger — sized honestly below against the unmodified scorer): -- **`CHECK-ELIMINATE-FALSE-POSITIVES.md`** (~93%): ~18 FPs left, mostly 1–2 per rule. **Plus an open - showstopper**: `BSK-E0149` line-scans source text and misfires on docstrings containing - `class`/`def` prefixes + bracketed tokens (e.g. our own `[SPEC-ID]` convention). Re-ground the rule - on the AST. High credibility payoff. -- **`CHECKER-PEP-CONFORMANCE-PLAN.md`** (~92.5%): clear the 11 failing files toward the conformance - results listing. +- **`CHECK-ELIMINATE-FALSE-POSITIVES.md`** (active): the real python/typing scorer reports **285 false + positives** to drive down (the old "~18 FPs left" came from the earlier in-repo harness — a + miscalculation — and is superseded). **Plus an open showstopper**: `BSK-E0149` line-scans source text and misfires on + docstrings containing `class`/`def` prefixes + bracketed tokens (e.g. our own `[SPEC-ID]` convention). + Re-ground the rule on the AST. High credibility payoff. +- **`CHECKER-PEP-CONFORMANCE-PLAN.md`** (active, 40.4% — 59/146): clear the **87 failing files** toward the + conformance results listing. - **`CHECKER-ELIMINATE-LINE-SCANNING-PLAN.md`** (~40%): the E0149 fix above is part of this; finish Phase 4 (wire the no-line-scanning lint into CI so the anti-pattern can't return). - **`LSP-STUBBING-PLAN.md`** (~95%, Phase 5 deferred): essentially shippable; decide whether the @@ -314,8 +319,8 @@ Rough plan (most of this is human-led — voice, accounts, timing, relationships ## G. Finish near-complete plans - [ ] **`[AGENT]`** Fix `BSK-E0149` docstring/line-scanning showstopper — re-ground the rule on the AST (`CHECK-ELIMINATE-FALSE-POSITIVES.md`). 
-- [ ] **`[AGENT]`** Clear remaining ~18 false positives. -- [ ] **`[AGENT]`** Close the 11 failing PEP-conformance files (Protocols, Callables, TypeVarTuple, ParamSpec, TypedDicts). +- [ ] **`[AGENT]`** Clear the 285 false positives. +- [ ] **`[AGENT]`** Close the 87 failing PEP-conformance files (Protocols, Callables, TypeVarTuple, ParamSpec, TypedDicts). - [ ] **`[AGENT]`** Finish `CHECKER-ELIMINATE-LINE-SCANNING-PLAN.md` Phase 4 — wire the no-line-scanning lint into CI. - [ ] **`[HUMAN]`** Decide whether `LSP-STUBBING-PLAN.md` Phase 5 (Salsa perf) ships now or later. diff --git a/docs/specs/CHECKER-ARCHITECTURE-SPEC.md b/docs/specs/CHECKER-ARCHITECTURE-SPEC.md index cfee4697..15bb0e67 100644 --- a/docs/specs/CHECKER-ARCHITECTURE-SPEC.md +++ b/docs/specs/CHECKER-ARCHITECTURE-SPEC.md @@ -58,7 +58,8 @@ See the project README for competitive analysis. | Implementation | TypeScript | Python/C | Rust | Rust | Rust | Rust | **Rust** | | License | MIT | MIT | MIT | MIT | AGPL | MIT | **MIT** | | Default strictness | Gradual | Gradual | Gradual | Gradual | Gradual | N/A | **Strict only** | -| PEP conformance target | ~95% | ~85% | ~15% | ~58% | ~69% | N/A | **100%** | +| PEP conformance (current) | ~95% | ~85% | ~15% | ~58% | ~69% | N/A | **40.4%** | +| PEP conformance target | — | — | — | — | — | N/A | **100%** | | LSP server | Yes | No | Yes | Yes | Yes | No | **Yes** | | Incremental computation | Lazy eval | Daemon | Salsa | Module-level | No | N/A | **Salsa** | | Ownership analysis | No | No | No | No | No | No | **Yes** | @@ -285,7 +286,7 @@ The `# type:` prefix ensures compatibility with editors and tools that already r ### Python Typing PEP Coverage {#CHKARCH-PEPS} -Basilisk targets **100% conformance** with the Python typing specification. We run the official conformance test suite (`python/typing` repository) in CI. +Basilisk targets **100% conformance** with the Python typing specification. 
This is a target, not a present-day achievement: the official `python/typing` conformance scorer (pinned commit, run unmodified in CI) currently reports **59 of 146 files passing (40.4%, counting errors and warnings — the strictest grading)**, with 285 false positives and 36 missed required errors still to clear. We run that suite in CI on every change and ratchet the pass rate up. #### Foundation PEPs {#CHKARCH-PEPS-FOUNDATION} @@ -818,7 +819,7 @@ operating directly on the module AST so it is independent of resolver state: `extra_items=T` whose type the argument matches. Implemented in `crates/basilisk-checker/src/rules/e0156/`; conformance fixture is -`crates/basilisk-cli/tests/conformance/typeddicts_extra_items.py`. +`conformance/tests/typeddicts_extra_items.py`. #### `ReadOnly` `TypedDict` inheritance {#CHKARCH-DIAG-TYPEDDICT-READONLY-INHERITANCE} @@ -1363,6 +1364,51 @@ Comparison baselines: Pyright, ty, Pyrefly, Zuban. | Property tests | `proptest` crate | Type system invariants | | Benchmarks | `make bench` (hyperfine, `benchmarks/run.sh`) vs Pyright/mypy/ty/Pyrefly | Performance tracking + regression gate (fails if basilisk regresses >25% vs the committed per-machine `benchmarks/status/.csv`) | +### PEP Conformance Scoring {#CHKARCH-CONFORMANCE} + +The conformance score is computed by the **real `python/typing` conformance +calculator**, not a Basilisk reimplementation. This is non-negotiable: the +number must be one anyone can reproduce with the same tooling the reference +checkers (pyright, mypy, pyrefly, ty, zuban, pycroscope) are graded with. 
+ +- **Scorer**: [`conformance/score.py`](../../conformance/score.py) **imports the + committed [`conformance/upstream_main.py`](../../conformance/upstream_main.py)** — + a byte-identical, sha256-verified copy of `python/typing`'s + `conformance/src/main.py`, pinned to the same commit the fixtures come from + (`score.py` → `PINNED_TYPING_REF`, currently `268d0c4e`, sha256 + `b4e3bd08…0fc6a2`) — and calls its own `get_expected_errors` + + `diff_expected_errors` functions **unmodified**. Nothing is downloaded at score + time; the verbatim upstream file lives in the repo and `score.py` refuses to run + if its hash drifts. Refresh it only when bumping the ref: + `python3 conformance/score.py --refresh-upstream`. The only Basilisk-specific + code is a checker *adapter* that runs the real `basilisk` binary and turns its + JSON output into the `{line: [errors]}` mapping the upstream algorithm consumes — + exactly the role of upstream's per-checker adapters in `type_checker.py`. +- **Pass rule** (upstream's, verbatim): a file passes iff the upstream + `errors_diff` is empty — every `# E` line gets an error, every `# E[tag]` + group is satisfied, and **no error lands on a line the suite does not mark**. + `conformance_automated = "Fail" if errors_diff.strip() else "Pass"`. +- **No excluded codes.** By default the scorer counts **every** diagnostic + `basilisk check` emits — errors **and** warnings, the strictest grading and how + pyright is graded — including the strict-by-default completeness rules + (E0001–E0005, E0010, E0011, E0023, E0025). (`score.py` defaults to this strict + grading; pass `--errors-only` for the looser errors-only view.) One firing on an + unannotated line is a real false positive and fails the file — same as for any + other checker. 
+- **Gate**: `make test` (via [`scripts/test-rust.sh`](../../scripts/test-rust.sh)) + builds the `basilisk` binary, then runs `python3 conformance/score.py --gate` + on it — there is **no Rust conformance test**; the whole conformance system is + the two committed Python files plus the git-ignored downloaded fixtures under + `conformance/tests/`. The pass-percentage floor and false-positive ceiling live + in `coverage-thresholds.json` (`conformance.threshold`, + `conformance.max_false_positives`); the former ratchets **up**, the latter + **down**. Per-file results are written to `conformance/conformance_status.csv`. +- **Honest baseline** (replacing an earlier in-repo harness that + excluded the 9 codes above and didn't count false positives — a miscalculation that reported 100%): + **59 / 146 = 40.4%** (strictest grading: every diagnostic, errors AND warnings, + counted — as pyright is graded), 285 false positives, 36 missed required errors. + The looser errors-only view is 70 / 146 = 47.9%. Target: 100%. + ### Mutation Testing Ratchet {#CHKARCH-TESTING-MUTATION-RATCHET} Mutation testing is the proof that the test suite actually asserts behaviour — diff --git a/docs/specs/CHECKER-TYPE-INFERENCE-SPEC.md b/docs/specs/CHECKER-TYPE-INFERENCE-SPEC.md index 4b3069ac..4969ee2f 100644 --- a/docs/specs/CHECKER-TYPE-INFERENCE-SPEC.md +++ b/docs/specs/CHECKER-TYPE-INFERENCE-SPEC.md @@ -972,7 +972,7 @@ query("SELECT * FROM " + table) # BSK-E0015 — not LiteralString ## Conformance Test Coverage {#TYPEINF-CONFORMANCE} -The [Python typing conformance suite](https://github.com/python/typing/tree/main/conformance) is the canonical benchmark. Basilisk targets **100% conformance** (Pass on all 150 test files). +The [Python typing conformance suite](https://github.com/python/typing/tree/main/conformance) is the canonical benchmark. Basilisk **targets** 100% conformance (Pass on all 146 test files) — a target, not a present-day achievement. 
The official `python/typing` scorer currently reports **59 of 146 files passing (40.4%, counting errors and warnings — the strictest grading)**. Inference-relevant conformance tests: diff --git a/docs/specs/COMPILER-ARCHITECTURE-SPEC.md b/docs/specs/COMPILER-ARCHITECTURE-SPEC.md index cd77067c..813a7fa3 100644 --- a/docs/specs/COMPILER-ARCHITECTURE-SPEC.md +++ b/docs/specs/COMPILER-ARCHITECTURE-SPEC.md @@ -21,7 +21,7 @@ python3 script.py # still works -- it's valid Python ### What This Is {#COMPILER-WHAT} - A **strict subset** of Python 3.12 that compiles to native code -- 100% PEP compliant for the features it supports +- Aims to be PEP compliant for the features it supports - LLVM-based: JIT for development, AOT for deployment - Interoperable with the Python ecosystem via CPython embedding - A single binary (`basilisk`) that checks, compiles, and runs diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 00000000..e02c0346 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,11 @@ +# Ruff configuration for this repo's Python tooling (conformance scorer, scripts). +# Formatting/linting uses Ruff's defaults (line-length 88); the only customization +# is the exclude below. +# +# conformance/upstream_main.py is a BYTE-IDENTICAL vendored copy of python/typing's +# official conformance calculator, sha256-pinned in conformance/score.py. Ruff must +# never reformat it — any change breaks the integrity pin and the "we run the real +# upstream scorer, unmodified" guarantee — so it is excluded from discovery here. +# (The downloaded test fixtures under conformance/tests/ are git-ignored and already +# skipped by Ruff's respect-gitignore default.) +extend-exclude = ["conformance/upstream_main.py"] diff --git a/scripts/conformance.sh b/scripts/conformance.sh deleted file mode 100755 index 87b4c113..00000000 --- a/scripts/conformance.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env bash -# Run the PEP conformance test suite. 
-# -# Downloads the python/typing conformance files first if they are missing. -# Outputs: conformance/conformance_status.csv (committed to the repo). -# -# Usage: -# ./scripts/conformance.sh # fetch if needed, then score -# ./scripts/conformance.sh --fetch # force re-download, then score -# ./scripts/conformance.sh --fetch-only # fetch only, no test run - -set -euo pipefail - -REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -source "$REPO_ROOT/scripts/common.sh" -cd "$REPO_ROOT" - -CONFORMANCE_DIR="crates/basilisk-cli/tests/conformance" - -# ── Fetch configuration ────────────────────────────────────────────────────── -TYPING_REPO="python/typing" -# Pinned upstream SHA. Bump deliberately, then re-run tests and update -# the conformance threshold in coverage-thresholds.json. Leaving this on -# `main` makes CI non-deterministic — upstream suite changes break us. -TYPING_REF="268d0c4e" -API_URL="https://api.github.com/repos/${TYPING_REPO}/contents/conformance/tests?ref=${TYPING_REF}" - -# ── Fetch if missing or forced ─────────────────────────────────────────────── -REF_STAMP_FILE="$CONFORMANCE_DIR/.ref-sha" - -fetch_conformance() { - header "Fetching conformance suite from ${TYPING_REPO} (ref: ${TYPING_REF})" - rm -rf "$CONFORMANCE_DIR" - mkdir -p "$CONFORMANCE_DIR" - - CURL_ARGS=(-fsSL) - if [[ -n "${GITHUB_TOKEN:-}" ]]; then - CURL_ARGS+=(-H "Authorization: token ${GITHUB_TOKEN}") - fi - FILE_LIST=$(curl "${CURL_ARGS[@]}" "$API_URL") - - COUNT=$(echo "$FILE_LIST" | python3 -c " -import json, sys -files = [f for f in json.load(sys.stdin) if f['type'] == 'file' and f['name'].endswith('.py')] -print(len(files)) -") - - echo "Downloading ${COUNT} test files to ${CONFORMANCE_DIR}..." 
- - echo "$FILE_LIST" | python3 -c " -import json, sys, urllib.request, os - -dest = sys.argv[1] -files = [f for f in json.load(sys.stdin) if f['type'] == 'file' and f['name'].endswith('.py')] - -for i, f in enumerate(files, 1): - out = os.path.join(dest, f['name']) - urllib.request.urlretrieve(f['download_url'], out) - if i % 25 == 0 or i == len(files): - print(f' {i}/{len(files)}') -" "$CONFORMANCE_DIR" - - echo "$TYPING_REF" > "$REF_STAMP_FILE" - ok "${COUNT} conformance files written to ${CONFORMANCE_DIR}/ (ref: ${TYPING_REF})" -} - -FETCH_ONLY=0 -for arg in "$@"; do - case "$arg" in - --fetch-only) FETCH_ONLY=1 ;; - esac -done - -CURRENT_REF="" -if [[ -f "$REF_STAMP_FILE" ]]; then - CURRENT_REF=$(cat "$REF_STAMP_FILE") -fi - -# --fetch forces a re-download; --fetch-only only ensures the pinned ref is -# present (fetch if missing/stale, then exit) so `make test` works offline. -if [[ "${1:-}" == "--fetch" ]] || \ - [[ ! -d "$CONFORMANCE_DIR" ]] || \ - [[ -z "$(ls -A "$CONFORMANCE_DIR" 2>/dev/null)" ]] || \ - [[ "$CURRENT_REF" != "$TYPING_REF" ]]; then - if [[ -n "$CURRENT_REF" ]] && [[ "$CURRENT_REF" != "$TYPING_REF" ]]; then - warn "Cached conformance ref ($CURRENT_REF) != pinned ($TYPING_REF) — refetching" - fi - fetch_conformance -else - COUNT=$(find "$CONFORMANCE_DIR" -name "*.py" | wc -l | tr -d ' ') - ok "Conformance suite present ($COUNT files, ref ${TYPING_REF}) — skipping download" -fi - -if [[ "$FETCH_ONLY" -eq 1 ]]; then - exit 0 -fi - -# ── Run the harness ────────────────────────────────────────────────────────── -header "Running PEP conformance harness" -echo "" - -cargo test --test conformance_tests -- --nocapture 2>&1 - -echo "" -header "Done" -echo -e " See ${CYAN}docs/PEP_CONFORMANCE.md${RESET} for score interpretation and the road to 95%." 
-echo "" diff --git a/scripts/fp_verify.sh b/scripts/fp_verify.sh index 4b862088..724cb53e 100755 --- a/scripts/fp_verify.sh +++ b/scripts/fp_verify.sh @@ -15,7 +15,6 @@ ROOT="/Users/christianfindlay/Documents/Code/Basilisk" cd "$ROOT" BASELINE="/tmp/conf_baseline.csv" CSV="conformance/conformance_status.csv" -FPLOG="/tmp/fp_current.txt" if [[ "${1:-}" == "--save-baseline" ]]; then cp "$CSV" "$BASELINE" @@ -23,9 +22,10 @@ if [[ "${1:-}" == "--save-baseline" ]]; then exit 0 fi -# Run the conformance suite (release), capturing FP lines. -cargo test --test conformance_tests --release -- --nocapture 2>&1 \ - | grep -E '^ FP ' | sort > "$FPLOG" || true +# Regenerate the conformance CSV with the official scorer against the release +# binary. score.py writes per-file caught/missed/fp to $CSV, which we diff below. +cargo build --release -p basilisk-cli --bin basilisk >/dev/null 2>&1 +python3 conformance/score.py --bin target/release/basilisk >/dev/null 2>&1 || true echo "=== totals (current) ===" awk -F, 'NR>1{c+=$5;m+=$6;f+=$7; if($4=="PASS")p++; else if($4=="FAIL")fl++} \ diff --git a/scripts/test-rust.sh b/scripts/test-rust.sh index 06dd81e2..9b14df37 100755 --- a/scripts/test-rust.sh +++ b/scripts/test-rust.sh @@ -24,28 +24,36 @@ HTML_DIR="$REPO_ROOT/target/llvm-cov/html" # Ensure llvm-tools-preview is installed so cargo-llvm-cov never prompts. rustup component add llvm-tools-preview 2>/dev/null || true -# ── Fetch conformance suite if missing or stale ────────────────────────────── -# `conformance.sh` is the single source of truth — it pins TYPING_REF and -# re-fetches when the cached ref differs. Do not duplicate that logic here. -header "Ensuring PEP conformance suite is current" -bash "$REPO_ROOT/scripts/conformance.sh" --fetch-only - -# ── Rust tests with coverage ───────────────────────────────────────────────── -# cargo-llvm-cov uses target/llvm-cov-target/ as its target directory, -# so the basilisk binary lands there — not in target/release/. 
+# ── Fetch the (git-ignored) conformance fixtures if missing or stale ────────── +# Only the fixtures are downloaded; the official calculator +# (conformance/upstream_main.py) is committed and never fetched. score.py pins +# the ref and re-fetches when the cached ref differs — single source of truth. +header "Ensuring PEP conformance fixtures are current" +python3 "$REPO_ROOT/conformance/score.py" --fetch-only + +# ── Rust tests + conformance, one instrumented coverage pool ───────────────── +# Coverage is gathered in TWO phases that share ONE profile pool, reported once: +# 1. the workspace test suite, then +# 2. the REAL basilisk binary scored over all 146 PEP conformance fixtures. +# Phase 2 is BOTH the conformance gate AND the source of the checker/resolver +# coverage those files exercise — the compiled binary's own instrumented run +# provides it (there is no in-repo conformance test). +# +# cargo-llvm-cov's `show-env` is the supported way to fold an external binary's +# runs into coverage: source it ONCE, then build + test + run the binary all under +# that single environment so every profraw lands in one pool under target/, then +# report. (Mixing a `cargo llvm-cov <subcommand>` with `show-env` is unsupported — the +# run subcommand redirects to target/llvm-cov-target while show-env uses target/, +# so the pools diverge and the report finds no data.) header "Running tests with coverage instrumentation" +cargo llvm-cov clean --workspace +eval "$(cargo llvm-cov show-env --export-prefix)" + set +e -cargo llvm-cov \ - --profile ci \ - --workspace \ - --exclude basilisk-compiler \ - --all-targets \ - --lcov \ - --output-path "$LCOV_FILE" +cargo test --profile ci --workspace --exclude basilisk-compiler --all-targets TESTS_EXIT=$? 
set -e -ok "lcov.info → $LCOV_FILE" if [[ "$TESTS_EXIT" -ne 0 ]]; then echo "" echo -e "${RED}${BOLD}TESTS FAILED (exit $TESTS_EXIT).${RESET}" @@ -55,14 +63,31 @@ if [[ "$TESTS_EXIT" -ne 0 ]]; then fi ok "All workspace tests passed" -# Verify the basilisk binary exists. +# The freshly-built instrumented binary lives under the show-env build dir +# (target/ci/). Pin BASILISK_BIN to it so the conformance phase scores the exact +# binary whose objects the report reads — not a stale one from another target dir. +export BASILISK_BIN="$REPO_ROOT/target/ci/basilisk" BASILISK_BIN=$(find_basilisk_bin) || { echo -e "${RED}${BOLD}FATAL: basilisk binary not found after coverage build.${RESET}" - echo -e "${RED}Checked: target/llvm-cov-target/ci/ and fallback paths${RESET}" + echo -e "${RED}Checked: target/ci/ and fallback paths${RESET}" exit 1 } ok "basilisk binary ready: $BASILISK_BIN" +# ── PEP conformance gate (also contributes coverage) ────────────────────────── +# Score the REAL compiled binary with the official python/typing calculator +# (conformance/score.py imports the committed, sha256-verified upstream_main.py) +# and enforce the ratchet gate from coverage-thresholds.json. The binary runs +# under the sourced llvm-cov env, so its profile data joins the test pool and the +# checker/resolver paths these 146 files exercise count toward coverage. The whole +# conformance system is these two Python files + the gitignored fixtures, scored +# on the compiled binary — no Rust test. 
+header "Enforcing PEP conformance gate (official python/typing calculator)" +python3 "$REPO_ROOT/conformance/score.py" --bin "$BASILISK_BIN" --gate + +# ── Finalize coverage from BOTH phases (tests + conformance binary runs) ────── +cargo llvm-cov report --profile ci --lcov --output-path "$LCOV_FILE" +ok "lcov.info → $LCOV_FILE" cargo llvm-cov report --profile ci --html --output-dir "$HTML_DIR" ok "HTML report → $HTML_DIR/index.html" diff --git a/website/package-lock.json b/website/package-lock.json index 89ef0443..83c3f4a2 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -9,7 +9,8 @@ "version": "1.0.0", "devDependencies": { "@11ty/eleventy": "^3.1.6", - "eleventy-plugin-techdoc": "^0.2.0" + "eleventy-plugin-techdoc": "^0.2.0", + "markdown-it": "^14.2.0" } }, "node_modules/@11ty/dependency-tree": { diff --git a/website/package.json b/website/package.json index b8453407..6facc704 100644 --- a/website/package.json +++ b/website/package.json @@ -5,13 +5,13 @@ "type": "module", "description": "Documentation website for Basilisk — strict-by-default Python type checker", "scripts": { - "prebuild": "node scripts/copy-readme.js", "build": "eleventy", - "start": "node scripts/copy-readme.js && eleventy --serve --watch", + "start": "eleventy --serve --watch", "clean": "rm -rf _site" }, "devDependencies": { "@11ty/eleventy": "^3.1.6", - "eleventy-plugin-techdoc": "^0.2.0" + "eleventy-plugin-techdoc": "^0.2.0", + "markdown-it": "^14.2.0" } } diff --git a/website/scripts/copy-readme.js b/website/scripts/copy-readme.js deleted file mode 100644 index 878fd37e..00000000 --- a/website/scripts/copy-readme.js +++ /dev/null @@ -1,49 +0,0 @@ -/** - * copy-readme.js - * - * Copies the root README.md into src/readme.html at build time. - * The README is the single source of truth — the website page is generated - * from it automatically on every build. 
- * - * Front-matter is prepended so Eleventy picks it up with the right layout, - * title, and navigation entry. - */ - -import { readFileSync, writeFileSync, mkdirSync } from "node:fs"; -import { resolve, dirname } from "node:path"; -import { fileURLToPath } from "node:url"; - -const __dirname = dirname(fileURLToPath(import.meta.url)); - -const readmePath = resolve(__dirname, "../../README.md"); -const outPath = resolve(__dirname, "../src/readme.html"); - -const frontmatter = `--- -layout: layouts/docs.njk -title: README -description: Crate architecture, diagnostic rules, and development guide for Basilisk. -keywords: basilisk, readme, crate architecture, rust, python type checker -# English-only crate README — no Chinese twin exists, so opt it out of the -# language cluster (no /zh/readme/ hreflang or switcher link, which would 404). -noTranslation: true -eleventyNavigation: - key: README - order: 99 -permalink: /readme/ ---- - -`; - -// The root README uses a repo-relative logo path (`images/basilisk-logo.png`) -// that resolve on GitHub but 404 on the site at /readme/. Rewrite them to the -// site's absolute asset paths so the page renders without broken images. -// `images/screenshot.png` is a symlink to the canonical website asset, so both -// references resolve to the same file. 
-const readme = readFileSync(readmePath, "utf8") - .replace(/images\/basilisk-logo\.png/g, "/assets/images/logo.svg") - .replace(/images\/screenshot\.png/g, "/assets/images/screenshot.png"); - -mkdirSync(dirname(outPath), { recursive: true }); -writeFileSync(outPath, frontmatter + readme, "utf8"); - -console.log("✓ README.md copied to src/readme.html"); diff --git a/website/src/_data/conformance.js b/website/src/_data/conformance.js new file mode 100644 index 00000000..59081046 --- /dev/null +++ b/website/src/_data/conformance.js @@ -0,0 +1,237 @@ +// Eleventy global data: PEP conformance results, computed FRESH at every build +// from the committed outputs of the REAL python/typing calculator — never +// hand-typed. Implements [CHKARCH-CONFORMANCE]; mirrors _data/benchmarks.js. +// +// conformance/conformance_status.csv -> live per-file pass/fail (score.py) +// conformance/score.py -> pinned upstream ref + sha256 +// conformance/upstream_main.py -> re-hashed here to re-verify the pin +// git log of conformance_status.csv -> the over-time chart (real commits) +// +// A file passes iff the official calculator's `errors_diff` is empty. Every +// number the website shows is whatever that scorer last produced and committed — +// and the over-time chart is read straight from this file's GIT history, not a +// hand-maintained ledger, so it cannot drift from what actually happened. 
+import { readFileSync, existsSync, statSync } from "fs"; +import { execFileSync } from "child_process"; +import { createHash } from "crypto"; +import { dirname, join } from "path"; +import { fileURLToPath } from "url"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const REPO_ROOT = join(__dirname, "../../.."); +const CONF_DIR = join(REPO_ROOT, "conformance"); +const STATUS_REL = "conformance/conformance_status.csv"; +const STATUS_CSV = join(CONF_DIR, "conformance_status.csv"); +const SCORE_PY = join(CONF_DIR, "score.py"); +const UPSTREAM_MAIN = join(CONF_DIR, "upstream_main.py"); + +// The day the official python/typing calculator replaced our earlier in-repo +// script. That script excluded some diagnostic codes and did not count false +// positives, so it miscalculated the score (up to 100%). Commits dated on/after +// this used the official calculator; before, the earlier in-repo measurement. +const OFFICIAL_SINCE = "2026-06-23"; + +// The CSV stores lowercase category slugs; these render the few that are not a +// plain title-case word. Everything else falls back to capitalising the slug. +const CATEGORY_LABELS = { + typeddicts: "TypedDicts", + namedtuples: "NamedTuples", + typeforms: "TypeForms", + specialtypes: "Special types", +}; + +const MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]; + +const round1 = (n) => Math.round(n * 10) / 10; +const labelFor = (slug) => CATEGORY_LABELS[slug] || (slug ? slug.charAt(0).toUpperCase() + slug.slice(1) : "—"); + +// "2026-06-21" -> "Jun 21" (manual parse — no timezone surprises). +function shortDate(iso) { + const [, m, d] = iso.split("-").map((p) => parseInt(p, 10)); + return Number.isFinite(m) && Number.isFinite(d) ? `${MONTHS[m - 1]} ${d}` : iso; +} + +// Pull a `NAME = "value"` string constant straight out of score.py so the pin +// shown on the website is the exact one the scorer enforces, not a copy. 
+function constFromScorePy(name) { + if (!existsSync(SCORE_PY)) return null; + const src = readFileSync(SCORE_PY, "utf-8"); + const m = src.match(new RegExp(`^${name}\\s*=\\s*"([^"]+)"`, "m")); + return m ? m[1] : null; +} + +// Tally one CSV body (pass/total/fp/missed) from its raw text. +function tally(csvText) { + const rows = csvText.split(/\r?\n/).slice(1).filter((l) => l.trim() && !l.startsWith("#")); + const t = { pass: 0, total: 0, fp: 0, missed: 0, byFile: rows }; + for (const line of rows) { + const f = line.split(","); + if (f.length < 7) continue; + t.total += 1; + if (f[3] === "PASS") t.pass += 1; + t.missed += parseInt(f[5], 10) || 0; + t.fp += parseInt(f[6], 10) || 0; + } + return t; +} + +function parseStatus() { + if (!existsSync(STATUS_CSV)) return null; + const text = readFileSync(STATUS_CSV, "utf-8"); + const t = tally(text); + if (!t.total) return null; + + const cats = new Map(); + const failing = []; + let caught = 0; + for (const line of t.byFile) { + const f = line.split(","); + if (f.length < 7) continue; + const passed = f[3] === "PASS"; + const slug = f[2]; + const missed = parseInt(f[5], 10) || 0; + const fp = parseInt(f[6], 10) || 0; + caught += parseInt(f[4], 10) || 0; + if (!cats.has(slug)) cats.set(slug, { slug, label: labelFor(slug), pass: 0, total: 0 }); + const entry = cats.get(slug); + entry.total += 1; + entry.pass += passed ? 
1 : 0; + if (!passed) failing.push({ file: f[1], category: slug, missed, fp }); + } + + const categories = [...cats.values()] + .filter((c) => c.slug) + .map((c) => ({ ...c, pct: round1((c.pass / c.total) * 100) })) + .sort((a, b) => a.label.localeCompare(b.label)); + + return { + pass: t.pass, + total: t.total, + fail: t.total - t.pass, + caught, + missed: t.missed, + fp: t.fp, + scorePct: round1((t.pass / t.total) * 100), + categories, + categoriesTotal: categories.length, + categoriesPass100: categories.filter((c) => c.pass === c.total).length, + failing: failing.sort((a, b) => b.fp + b.missed - (a.fp + a.missed)), + }; +} + +function git(args) { + // stderr ignored: early commits hold the file under an old path, so `git show` + // legitimately fails for those — we skip them, no need to spam the build log. + return execFileSync("git", args, { cwd: REPO_ROOT, encoding: "utf-8", maxBuffer: 1 << 26, stdio: ["ignore", "pipe", "ignore"] }); +} + +// The over-time series, read from the GIT history of conformance_status.csv. +// One real data point per commit that changed the file: its commit date and the +// score that commit recorded. Points dated before OFFICIAL_SINCE were produced +// by the earlier in-repo script; on/after, by the official calculator. 
+function gitHistory() { + let log; + try { + log = git(["log", "--follow", "--format=%H|%cs", "--", STATUS_REL]); + } catch { + return []; + } + const points = []; + for (const line of log.split(/\r?\n/).filter(Boolean)) { + const [hash, date] = line.split("|"); + let csv; + try { + csv = git(["show", `${hash}:${STATUS_REL}`]); + } catch { + continue; + } + const t = tally(csv); + if (!t.total) continue; + points.push({ + hash: hash.slice(0, 8), + date, + shortDate: shortDate(date), + pass: t.pass, + total: t.total, + fp: t.fp, + missed: t.missed, + score: round1((t.pass / t.total) * 100), + official: date >= OFFICIAL_SINCE, + }); + } + return points.reverse(); // oldest -> newest +} + +// Inline-SVG geometry for the over-time chart. Computed here (testable, DRY) so +// the Nunjucks include only loops over coordinates. Points are spaced evenly by +// commit (each is a real event); the y-axis is the pass percentage 0–100. +function buildChart(points) { + if (points.length < 2) return null; + const width = 760, height = 360, left = 48, right = 24, top = 28, bottom = 64; + const plotW = width - left - right, plotH = height - top - bottom; + const n = points.length; + const xAt = (i) => round1(left + (i / (n - 1)) * plotW); + const yAt = (score) => round1(top + (1 - score / 100) * plotH); + + let lastLabel = null; + const pts = points.map((p, i) => { + const showDate = p.shortDate !== lastLabel; + lastLabel = p.shortDate; + return { ...p, i, x: xAt(i), y: yAt(p.score), showDate }; + }); + const yTicks = [0, 25, 50, 75, 100].map((value) => ({ value, y: yAt(value) })); + + const previous = pts.filter((p) => !p.official); + const official = pts.filter((p) => p.official); + const lastPrevious = previous[previous.length - 1]; + const firstOfficial = official[0]; + const peak = pts.reduce((a, b) => (b.score > a.score ? 
b : a), pts[0]); + + return { + width, height, left, right, top, bottom, + baselineY: yAt(0), + pts, + yTicks, + prevPolyline: previous.map((p) => `${p.x},${p.y}`).join(" "), + officialPolyline: official.map((p) => `${p.x},${p.y}`).join(" "), + // The correction "cliff": last earlier-era point down to the first official one. + drop: lastPrevious && firstOfficial + ? { x1: lastPrevious.x, y1: lastPrevious.y, x2: firstOfficial.x, y2: firstOfficial.y, from: lastPrevious.score, to: firstOfficial.score } + : null, + peak, + current: pts[pts.length - 1], + }; +} + +export default function () { + const status = parseStatus(); + if (!status) { + return { hasData: false, scorePct: null, categories: [], failing: [], history: [], chart: null }; + } + + const pinnedRef = constFromScorePy("PINNED_TYPING_REF"); + const sha256 = constFromScorePy("UPSTREAM_MAIN_SHA256"); + // Re-verify the committed calculator at build time — the page states this. + let liveSha = null, upstreamBytes = null, verified = false; + if (existsSync(UPSTREAM_MAIN)) { + const raw = readFileSync(UPSTREAM_MAIN); + liveSha = createHash("sha256").update(raw).digest("hex"); + upstreamBytes = statSync(UPSTREAM_MAIN).size; + verified = sha256 != null && liveSha === sha256; + } + + const history = gitHistory(); + return { + hasData: true, + ...status, + pinnedRef, + sha256, + sha256Short: sha256 ? sha256.slice(0, 12) : null, + liveSha256Short: liveSha ? liveSha.slice(0, 12) : null, + verified, + upstreamBytes, + officialSince: OFFICIAL_SINCE, + history, + chart: buildChart(history), + }; +} diff --git a/website/src/_data/releases.js b/website/src/_data/releases.js new file mode 100644 index 00000000..6cf6e7b1 --- /dev/null +++ b/website/src/_data/releases.js @@ -0,0 +1,124 @@ +// Eleventy global data: the Basilisk GitHub Releases, fetched FRESH at every +// build from the public GitHub REST API — never hand-maintained. 
This mirrors +// the build-time data pattern of _data/conformance.js and _data/benchmarks.js: +// everything the /docs/releases/ page shows is whatever the API returns at build +// time (tag, title, date, release notes rendered from the release's markdown +// body, and downloadable assets). +// +// Drafts are excluded (not yet published). Prereleases are kept and badged. +// +// The build NEVER fails on a network/API error: exactly like conformance.js it +// degrades to `{ hasData: false }` and the page renders an empty state linking +// to GitHub, so an offline dev build or a rate-limited CI run still produces a +// valid site. When `GITHUB_TOKEN`/`GH_TOKEN` is present (CI) it is used to raise +// the API rate limit; the public, unauthenticated path works too. +import markdownIt from "markdown-it"; + +const OWNER = "Nimblesite"; +const REPO = "Basilisk"; +const API = `https://api.github.com/repos/${OWNER}/${REPO}/releases?per_page=100`; +const RELEASES_URL = `https://github.com/${OWNER}/${REPO}/releases`; + +// Release notes are authored by the maintainers (trusted), so raw HTML is +// allowed. `breaks: true` matches how GitHub itself renders release bodies. +const md = markdownIt({ html: true, linkify: true, breaks: true }); + +const MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]; + +// "2026-06-23T10:16:43Z" -> "Jun 23, 2026". UTC getters keep the output +// deterministic regardless of the build machine's timezone. +function formatDate(iso) { + if (!iso) return null; + const date = new Date(iso); + return Number.isNaN(date.getTime()) + ? iso + : `${MONTHS[date.getUTCMonth()]} ${date.getUTCDate()}, ${date.getUTCFullYear()}`; +} + +// Bytes -> "1.2 MB" style, base-1024. 
+function formatBytes(bytes) { + if (!Number.isFinite(bytes) || bytes <= 0) return "0 B"; + const units = ["B", "KB", "MB", "GB"]; + const exp = Math.min(Math.floor(Math.log(bytes) / Math.log(1024)), units.length - 1); + const value = bytes / 1024 ** exp; + return `${exp === 0 ? value : Math.round(value * 10) / 10} ${units[exp]}`; +} + +// Pull the `rel="next"` URL out of a GitHub `Link` response header (string +// splitting, no regex). Returns null when there is no next page. +function nextPageUrl(linkHeader) { + if (!linkHeader) return null; + for (const part of linkHeader.split(",")) { + const [target, ...attrs] = part.split(";"); + if (attrs.some((attr) => attr.trim() === 'rel="next"')) { + return target.trim().slice(1, -1); // strip the surrounding < > + } + } + return null; +} + +async function fetchAllReleases() { + const headers = { + Accept: "application/vnd.github+json", + "User-Agent": `${OWNER}-${REPO}-website-build`, + "X-GitHub-Api-Version": "2022-11-28", + }; + const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN; + if (token) headers.Authorization = `Bearer ${token}`; + + const releases = []; + let url = API; + while (url) { + const response = await fetch(url, { headers }); + if (!response.ok) { + throw new Error(`GitHub API ${response.status} ${response.statusText}`); + } + releases.push(...(await response.json())); + url = nextPageUrl(response.headers.get("link")); + } + return releases; +} + +// Shape one API release into the flat record the template renders. +function toRecord(release) { + return { + tag: release.tag_name, + name: release.name || release.tag_name, + url: release.html_url, + date: formatDate(release.published_at || release.created_at), + dateIso: release.published_at || release.created_at, + prerelease: release.prerelease === true, + bodyHtml: release.body ? 
md.render(release.body) : "", + assets: (release.assets || []).map((asset) => ({ + name: asset.name, + url: asset.browser_download_url, + size: formatBytes(asset.size), + downloads: asset.download_count || 0, + })), + }; +} + +const EMPTY = { hasData: false, releasesUrl: RELEASES_URL, count: 0, releases: [] }; + +export default async function () { + try { + const published = (await fetchAllReleases()) + .filter((release) => release.draft !== true) + .sort((a, b) => new Date(b.published_at || b.created_at) - new Date(a.published_at || a.created_at)) + .map(toRecord); + + if (!published.length) return EMPTY; + + return { + hasData: true, + releasesUrl: RELEASES_URL, + count: published.length, + latest: published[0], + releases: published, + }; + } catch (error) { + // Degrade gracefully — a broken build is worse than a stale releases page. + console.warn(`⚠ releases.js: ${error.message} — rendering empty state`); + return EMPTY; + } +} diff --git a/website/src/_includes/conformance-chart.njk b/website/src/_includes/conformance-chart.njk new file mode 100644 index 00000000..60ac1cc0 --- /dev/null +++ b/website/src/_includes/conformance-chart.njk @@ -0,0 +1,45 @@ +{# + Shared PEP-conformance over-time chart — the SINGLE source of truth for + rendering the history of conformance/conformance_status.csv in EVERY locale. + Pure inline SVG (no JS, no chart library), data-driven from _data/conformance.js + (which reads the file's real git history). Pages supply only translated prose. + + WHITESPACE: this macro is embedded inside MARKDOWN pages. markdown-it ends a + raw-HTML block at the first blank line, so the rendered SVG MUST contain no + blank lines or it gets shredded (text nodes leak out of <svg>). Every njk + control tag therefore uses `{%- ... -%}` trimming to keep the output contiguous.
+ + Args: + c — the global `conformance` data object (from _data/conformance.js) + t — locale strings: { label, heading, subhead, prevLegend, officialLegend, + dropNote, caption } — rendered with `| safe` (may contain inline HTML). +#} +{%- macro chart(c, t) -%} +{%- if c.chart -%} +{%- set ch = c.chart -%} +
+
{{ t.label }}{{ t.heading | safe }} + +{%- for tick in ch.yTicks %} +{{ tick.value }}% +{%- endfor %} + +{%- if ch.drop %} + +{%- endif %} + +{%- for p in ch.pts %} +{{ p.shortDate }} ({{ p.hash }}): {{ p.score }}% — {{ p.pass }}/{{ p.total }}, {{ p.fp }} false positives{{ ' · official calculator' if p.official else ' · earlier in-repo harness' }} +{%- if p.showDate %} +{{ p.shortDate }} +{%- endif %} +{%- endfor %} +{{ ch.peak.score }}% +{{ ch.current.score }}% + +

{{ t.dropNote | safe }}

+
  • {{ t.prevLegend | safe }}
  • {{ t.officialLegend | safe }}
+

{{ t.caption | safe }}

+ +{%- endif -%} +{%- endmacro -%} diff --git a/website/src/_includes/layouts/base.njk b/website/src/_includes/layouts/base.njk index 438d1b49..e4d48c62 100644 --- a/website/src/_includes/layouts/base.njk +++ b/website/src/_includes/layouts/base.njk @@ -3,7 +3,7 @@ path straight from the URL, so language alternates never double-prefix (/zh/zh/...) even when an auto-generated page reports the wrong `lang`. Set `noTranslation: true` in a page's front matter to opt it out of the - language cluster entirely (e.g. the English-only README). -#} + language cluster entirely (e.g. the English-only Releases page). -#} {%- set effLang = 'zh' if (page.url == '/zh/' or page.url.startsWith('/zh/')) else (lang | default('en')) -%} {%- set basePath = (page.url | replace('/zh/', '/')) if effLang == 'zh' else page.url -%} diff --git a/website/src/assets/css/styles.css b/website/src/assets/css/styles.css index 4a964180..6dfb6a13 100644 --- a/website/src/assets/css/styles.css +++ b/website/src/assets/css/styles.css @@ -964,6 +964,185 @@ button { cursor: pointer; font-family: inherit; border: none; background: none; .citations-list a:hover { color: var(--color-accent); } +/* ── Conformance methodology ───────────────────────────────── */ +.conformance-method { + list-style: decimal; + padding-left: 1.5rem; + max-width: 68ch; + font-size: 0.9375rem; + line-height: 1.65; + color: var(--color-text-secondary); +} +.conformance-method li + li { margin-top: var(--space-3); } +.conformance-method strong { color: var(--color-text-primary); font-weight: 600; } +.conformance-method code, +.conformance-method__note code { + font-family: var(--font-mono); + font-size: 0.85em; + color: var(--color-code-type); + background: var(--color-bg-elevated); + padding: 0.1em 0.35em; + border-radius: var(--radius-sm); + border: 1px solid var(--color-border); +} +.conformance-method__note { + margin-top: var(--space-6); + max-width: 68ch; + font-size: 0.875rem; + line-height: 1.6; + color: 
var(--color-text-muted); +} +.conformance-method__note a { + color: var(--color-accent); + text-decoration: underline; + text-underline-offset: 2px; +} + +/* ── PEP conformance over-time chart ───────────────────────── */ +.conf-chart { + margin: var(--space-8) 0; + padding: var(--space-6); + background: var(--color-bg-secondary); + border: 1px solid var(--color-border); + border-radius: var(--radius-lg); +} +.conf-chart__head { display: flex; flex-direction: column; gap: var(--space-1); margin-bottom: var(--space-4); } +.conf-chart__label { + font-size: 0.75rem; + letter-spacing: 0.08em; + text-transform: uppercase; + color: var(--color-primary); + font-weight: 600; +} +.conf-chart__title { font-size: 1.125rem; font-weight: 600; color: var(--color-text-primary); } +.conf-chart__svg { display: block; width: 100%; height: auto; overflow: visible; } +.conf-chart__grid { stroke: var(--color-border); stroke-width: 1; } +.conf-chart__axis, .conf-chart__date { fill: var(--color-text-muted); font-family: var(--font-mono); font-size: 11px; } +.conf-chart__line { fill: none; stroke-width: 2.5; stroke-linejoin: round; stroke-linecap: round; } +/* Climb to 100% = the earlier (incorrect) era, in red. */ +.conf-chart__line--prev { stroke: var(--color-error); } +/* The drop and everything after = the official (correct) era, in green. 
*/ +.conf-chart__line--official { stroke: var(--color-success); } +.conf-chart__drop { stroke: var(--color-success); stroke-width: 3; stroke-linecap: round; } +.conf-chart__dot--prev { fill: var(--color-error); } +.conf-chart__dot--official { fill: var(--color-success); } +.conf-chart__value { font-family: var(--font-mono); font-size: 14px; font-weight: 700; } +.conf-chart__value--peak { fill: var(--color-error); } +.conf-chart__value--current { fill: var(--color-success); } +.conf-chart__drop-note { + margin: var(--space-5) 0 0; + font-size: 0.9375rem; + line-height: 1.6; + color: var(--color-text-secondary); +} +.conf-chart__drop-note strong { color: var(--color-error); } +.conf-chart__legend { + list-style: none; + display: flex; + flex-wrap: wrap; + gap: var(--space-5); + margin: var(--space-4) 0 0; + padding: 0; + font-size: 0.8125rem; + color: var(--color-text-secondary); +} +.conf-chart__legend li { display: flex; align-items: center; gap: var(--space-2); } +.conf-chart__swatch { width: 18px; height: 3px; border-radius: 2px; display: inline-block; } +.conf-chart__swatch--prev { background: var(--color-error); } +.conf-chart__swatch--official { background: var(--color-success); } +.conf-chart__caption { margin: var(--space-4) 0 0; font-size: 0.8125rem; color: var(--color-text-muted); line-height: 1.6; } +.conf-chart__caption code { font-family: var(--font-mono); font-size: 0.85em; } + +/* ── Conformance methodology page bits ─────────────────────── */ +.conf-links { + display: flex; + flex-wrap: wrap; + gap: var(--space-3); + margin: var(--space-6) 0; +} +.conf-links a { + display: inline-flex; + align-items: center; + padding: var(--space-2) var(--space-4); + border: 1px solid var(--color-border-bright); + border-radius: var(--radius-md); + background: var(--color-bg-secondary); + color: var(--color-text-primary); + font-size: 0.875rem; + font-weight: 600; + text-decoration: none; + transition: border-color 0.15s, color 0.15s; +} +.conf-links a:hover { 
border-color: var(--color-primary); color: var(--color-primary); } +.conf-correction { + display: flex; + align-items: center; + gap: var(--space-4); + flex-wrap: wrap; + margin: var(--space-6) 0; + padding: var(--space-5) var(--space-6); + background: var(--color-bg-secondary); + border: 1px solid var(--color-border-bright); + border-radius: var(--radius-lg); +} +.conf-correction__old, .conf-correction__new { font-family: var(--font-mono); font-weight: 700; font-size: 1.75rem; } +.conf-correction__old { color: var(--color-warning); text-decoration: line-through; text-decoration-thickness: 2px; } +.conf-correction__new { color: var(--color-primary); } +.conf-correction__arrow { color: var(--color-text-muted); font-size: 1.5rem; } +.conf-correction__text { font-size: 0.9375rem; color: var(--color-text-secondary); flex: 1; min-width: 220px; } +.conf-cat-bar { + display: inline-block; + height: 8px; + border-radius: 4px; + background: var(--color-primary); + vertical-align: middle; + min-width: 2px; +} +.conf-verified { + display: inline-flex; + align-items: center; + gap: var(--space-2); + padding: var(--space-1) var(--space-3); + border-radius: var(--radius-sm); + background: rgba(52, 211, 153, 0.12); + border: 1px solid rgba(52, 211, 153, 0.35); + color: var(--color-success); + font-family: var(--font-mono); + font-size: 0.8125rem; +} + +/* ── Releases (/docs/releases/) ────────────────────────────── */ +.releases-intro { font-size: 1.0625rem; } +.release-list { display: flex; flex-direction: column; gap: var(--space-6); margin-top: var(--space-8); } +.release { + padding: var(--space-6); + background: var(--color-bg-secondary); + border: 1px solid var(--color-border-bright); + border-radius: var(--radius-lg); +} +.release-head { display: flex; align-items: center; flex-wrap: wrap; gap: var(--space-3); margin-bottom: var(--space-4); } +.release-title { font-family: var(--font-mono); font-size: 1.25rem; font-weight: 700; margin: 0; padding: 0; border: none; } 
+.release-title a { color: var(--color-text-primary); text-decoration: none; } +.release-title a:hover { color: var(--color-primary); } +.release-badge { font-size: 0.6875rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.06em; padding: 0.15em 0.5em; border-radius: var(--radius-sm); } +.release-badge--latest { color: var(--color-success); background: rgba(52, 211, 153, 0.12); border: 1px solid rgba(52, 211, 153, 0.35); } +.release-badge--pre { color: var(--color-warning); background: rgba(251, 191, 36, 0.12); border: 1px solid rgba(251, 191, 36, 0.35); } +.release-date { margin-left: auto; font-size: 0.8125rem; color: var(--color-text-muted); font-family: var(--font-mono); } +/* Tame the heading scale of the maintainer-authored notes so a release's own + `## headings` don't impersonate the docs page's top-level sections. */ +.release-notes > :first-child { margin-top: 0; } +.release-notes h1, .release-notes h2, .release-notes h3, .release-notes h4 { + font-size: 1rem; font-weight: 600; margin-top: var(--space-5); margin-bottom: var(--space-2); + padding: 0; border: none; color: var(--color-text-primary); letter-spacing: 0; +} +.release-assets { margin-top: var(--space-5); border-top: 1px solid var(--color-border); padding-top: var(--space-4); } +.release-assets summary { cursor: pointer; font-size: 0.875rem; font-weight: 600; color: var(--color-text-secondary); } +.release-assets summary:hover { color: var(--color-primary); } +.release-asset-list { list-style: none !important; padding-left: 0 !important; margin-top: var(--space-3); } +.release-asset-list li { display: flex; flex-wrap: wrap; align-items: baseline; gap: var(--space-3); margin-bottom: var(--space-2); } +.release-asset-list a { font-family: var(--font-mono); font-size: 0.875rem; } +.release-asset-meta { font-size: 0.75rem; color: var(--color-text-muted); font-family: var(--font-mono); } + /* ── Responsive ────────────────────────────────────────────── */ @media (max-width: 640px) { 
.section-heading { font-size: 1.75rem; } diff --git a/website/src/docs/comparison.md b/website/src/docs/comparison.md index e125251b..1991ab7b 100644 --- a/website/src/docs/comparison.md +++ b/website/src/docs/comparison.md @@ -40,7 +40,7 @@ Basilisk removes the choice. There is no permissive mode to fall back to. | Feature | Basilisk | Pyright | mypy | ty | Pyrefly | |---|---|---|---|---|---| | Strict by default | ✅ | ❌ opt-in | ❌ opt-in | ❌ opt-in | ❌ opt-in | -| PEP conformance¹ | 98.6% (→100% target) | ~99% | ~58% | early alpha | ~86% | +| PEP conformance¹ | {{ conformance.scorePct }}% current (→100% target) | ~99% | ~58% | early alpha | ~86% | | Implementation | Rust | TypeScript | Python/C | Rust | Rust | | Runtime required | None | Node.js | Python | None | None | | Full LSP (completions, hover, goto) | ✅ | Pylance only | ❌ | Basic | Basic | @@ -158,7 +158,7 @@ Basilisk is not a faster version of an existing tool. It occupies a different po 5. WASM plugin system (planned) — extensible without forking, secure by design **Where Basilisk is not yet the best choice:** -- PEP conformance: Basilisk passes 98.6% of the official conformance suite (144/146). Pyright still covers more edge cases today. Basilisk's target is 100%; it's not there yet. +- PEP conformance: Basilisk currently passes {{ conformance.scorePct }}% of the official conformance suite ({{ conformance.pass }}/{{ conformance.total }}, counting errors+warnings — the strictest grading), with {{ conformance.fp }} false positives and {{ conformance.missed }} missed required errors still being driven down. Pyright covers far more edge cases today. Basilisk's target is 100%; it's not there yet. See [how we measure conformance](/docs/conformance/). - Plugin ecosystem: mypy's Django and SQLAlchemy plugins are mature. Basilisk's WASM plugins are planned. - Maturity: Pylance is feature-complete today (though proprietary and VS Code only). Basilisk is in alpha. 
diff --git a/website/src/docs/conformance.md b/website/src/docs/conformance.md new file mode 100644 index 00000000..2e88396f --- /dev/null +++ b/website/src/docs/conformance.md @@ -0,0 +1,113 @@ +--- +layout: layouts/docs.njk +title: "How Basilisk Measures PEP Conformance" +description: "How Basilisk's PEP conformance score is measured with the official python/typing conformance suite — what the suite is, how scoring works, the byte-identical pinned calculator we run, and the correction we made to our own scoring." +keywords: pep conformance, python typing conformance suite, basilisk conformance score, type checker scoring, python/typing calculator +date: 2026-06-23 +dateModified: 2026-06-23 +author: The Basilisk Project +eleventyNavigation: + key: Conformance + order: 8 +--- +{% from "conformance-chart.njk" import chart %} + +# How we measure PEP conformance + +Basilisk is scored by the **official `python/typing` conformance suite** — the same test suite and scoring tool the typing community uses to grade pyright, mypy, pyrefly, ty, and others. We run that tool unmodified, on the real `basilisk` binary, on every change. + +Today that gives **{{ conformance.scorePct }}%** — **{{ conformance.pass }} of {{ conformance.total }}** test files passing, {{ conformance.caught }} required errors caught, with **{{ conformance.fp }} false positives** and **{{ conformance.missed }} missed required errors** left to clear. {{ conformance.categoriesPass100 }} of {{ conformance.categoriesTotal }} categories pass at 100%. The target is 100%; we ratchet toward it. + + + +## What the conformance suite is + +The [Python typing specification](https://typing.python.org/en/latest/spec/) defines how the type system is supposed to behave — generics, protocols, dataclasses, `TypedDict`, overloads, literals, and the rest. 
To stop the spec from being aspirational, the typing community maintains a **conformance test suite** alongside it in the [`python/typing`](https://github.com/python/typing/tree/main/conformance) repository. + +It works like this: + +- Each spec chapter has one or more **test files** — ordinary Python modules that exercise a feature and mark, with `# E` comments, every line where a conforming type checker **must** report an error (and, with `# E[tag]` groups, where one of several related errors is acceptable). +- A small **scoring tool** runs a type checker over those files and diffs its output against the annotations. A file *passes* only if the diff is empty: every required error is reported, and nothing is reported on a line the suite does not mark. +- The maintainers run every checker through it and publish the [results table](https://github.com/python/typing/blob/main/conformance/results/results.html), which is how figures like pyright's ~99% or pyrefly's ~86% are produced. + +This is the suite we use, at the pinned commit [`{{ conformance.pinnedRef }}`](https://github.com/python/typing/tree/{{ conformance.pinnedRef }}/conformance). Because the same tool and the same files grade everyone, the number is comparable across checkers and is not something we can tune in our favour. + +## How a file is scored + +The entire algorithm is two functions in the suite's `main.py` — `get_expected_errors` (reads the `# E` annotations) and `diff_expected_errors` (diffs them against the checker's output). A file passes **iff** that diff is empty: + +- the suite's rule (`upstream_main.py:185`): `"Fail" if errors_diff.strip() else "Pass"` + +We count **every** diagnostic the checker emits — errors *and* warnings, with **no diagnostic codes excluded**. That is the strictest reading of the suite and matches how the reference checker, pyright, is graded. 
One unexpected diagnostic (a false positive) fails the whole file, which is why our false-positive count matters as much as the pass count. + +## How we run it without forking it + +The suite's `main.py` is a batch harness for the `python/typing` maintainers: it grades all the known checkers at once, pulls in TOML config/reporting dependencies, and writes a results matrix. It has no way to invoke our binary. So, exactly as the suite does for every checker (`PyrightTypeChecker`, `MypyTypeChecker`, …), we add a thin **adapter** and reuse the suite's own scoring rather than reimplementing it. Our [`score.py`](https://github.com/Nimblesite/Basilisk/blob/main/conformance/score.py): + +1. **Adapter** — runs `basilisk check --output json` and shapes the result into the `{line: [errors]}` dict the suite's functions expect (the one thing the suite can't do for us). +2. **Calculator** — imports `get_expected_errors` and `diff_expected_errors` from a committed, byte-identical copy of the suite's `main.py` and calls them unmodified (`score.py:287` mirrors the suite's own call at `upstream_main.py:175`). It contains no scoring logic of its own. +3. **Gate** — compares the result against `coverage-thresholds.json` and fails CI on any regression. + +To keep the calculator trustworthy, the vendored copy is **sha256-pinned**. `score.py` re-hashes it on every run and refuses to score if it has drifted (`score.py:99`), and this website re-hashes it again at build time: + +{% if conformance.verified %} +

✓ verified at build — conformance/upstream_main.py is {{ conformance.upstreamBytes }} bytes, sha256 {{ conformance.sha256Short }}…, matching the pin

+{% endif %} + +Keeping the official file untouched is the whole point: the adapter and gate live in a separate, auditable file, so the calculator stays byte-for-byte the suite's own. + +## A correction we made + +Our score used to be measured by an in-repo script of our own, and it was **wrong**. That script excluded several diagnostic codes from scoring and did not count false positives, so it reported numbers that climbed all the way to 100%. It was an honest mistake, not a tuned result — but it was still incorrect. + +We replaced it with the official calculator described above. With every diagnostic counted and nothing excluded, the honest number is **{{ conformance.scorePct }}%**: + +
+ 100% + + {{ conformance.scorePct }}% + The checker did not get worse — the measurement became accurate. 100% is the target we are working toward, not a claim about today. +
+ +The chart below is read straight from the **git history of `conformance/conformance_status.csv`** at build time: one point per commit that changed it, plotting the score that commit actually recorded. + +{{ chart(conformance, { + "label": "Conformance score over time", + "heading": "From the earlier in-repo number to the official calculator", + "prevLegend": "Earlier in-repo script (some codes excluded, false positives not counted)", + "officialLegend": "Official python/typing calculator", + "dropNote": "On " + conformance.chart.peak.shortDate + " the in-repo script reported " + conformance.chart.peak.score + "%. The official calculator, first run on " + conformance.chart.current.shortDate + ", reports " + conformance.chart.current.score + "% — a correction, not a regression.", + "caption": "Each dot is a real commit to conformance/conformance_status.csv, recomputed every build. Hover a point for its date, commit, score, and false-positive count." +}) }} + +## Where each category stands today + +Read live from `conformance/conformance_status.csv` at build time: + +
+ + + +{%- for cat in conformance.categories %} + +{%- endfor %} + +
CategoryPassingScore
{{ cat.label }}{{ cat.pass }} / {{ cat.total }}{{ cat.pct }}%
+
+ +## Reproduce it yourself + +```bash +# Builds the binary, fetches the (git-ignored) fixtures, runs the official +# python/typing calculator against them, writes conformance_status.csv, and +# enforces the ratchet gate from coverage-thresholds.json. +make conformance +``` + +It all lives in two files: [`conformance/score.py`](https://github.com/Nimblesite/Basilisk/blob/main/conformance/score.py) (our adapter + gate) and [`conformance/upstream_main.py`](https://github.com/Nimblesite/Basilisk/blob/main/conformance/upstream_main.py) (the suite's calculator, committed and sha256-pinned). The full annotation rules are in the [python/typing conformance README](https://github.com/python/typing/blob/main/conformance/README.md). diff --git a/website/src/docs/index.md b/website/src/docs/index.md index 8aec42d3..1d7d38e8 100644 --- a/website/src/docs/index.md +++ b/website/src/docs/index.md @@ -68,7 +68,7 @@ Basilisk is currently in **alpha** — the core checker, LSP server, and editor |---|---|---| | 1 | Parser, resolver, type checker, CLI | Complete | | 2 | LSP server, editor extensions (VS Code, Cursor, Zed, Neovim) | Complete | -| 3 | Expanded rule set, 98.6% PEP conformance, gradual adoption | In progress | +| 3 | Expanded rule set, PEP conformance push (currently {{ conformance.scorePct }}%, target 100%), gradual adoption | In progress | | 4 | Ownership & immutability analysis (Mojo-inspired) | Planned | | 5 | WASM plugins, Django/Pydantic/SQLAlchemy | Planned | | 6 | 95%+ PEP, SARIF/JUnit, JetBrains extension | Planned | diff --git a/website/src/docs/releases.njk b/website/src/docs/releases.njk new file mode 100644 index 00000000..98b7e47a --- /dev/null +++ b/website/src/docs/releases.njk @@ -0,0 +1,61 @@ +--- +layout: layouts/docs.njk +title: "Basilisk Releases — Downloads & Changelog" +description: "Every published Basilisk release — version, date, release notes, and downloadable binaries and editor extensions — generated at build time straight from GitHub 
Releases." +keywords: basilisk releases, download basilisk, changelog, release notes, python language server downloads, vsix +date: 2026-06-23 +dateModified: 2026-06-23 +author: The Basilisk Project +# English-only — the notes come verbatim from GitHub Releases, so opt this page +# out of the language cluster (no /zh/ hreflang or switcher link that would 404). +noTranslation: true +eleventyNavigation: + key: Releases + order: 9 +permalink: /docs/releases/ +--- +

Releases

+ +{%- if releases.hasData %} +

+ All {{ releases.count }} published Basilisk releases, generated at + build time straight from GitHub Releases. + Each entry links to its tag and lists every downloadable asset. +

+ +
+ {%- for release in releases.releases %} +
+
+

{{ release.name }}

+ {%- if loop.first %}Latest{% endif %} + {%- if release.prerelease %}Pre-release{% endif %} + {%- if release.date %}{% endif %} +
+ + {%- if release.bodyHtml %} +
{{ release.bodyHtml | safe }}
+ {%- endif %} + + {%- if release.assets.length %} +
+ {{ release.assets.length }} download{{ "s" if release.assets.length != 1 }} +
    + {%- for asset in release.assets %} +
  • + {{ asset.name }} + {{ asset.size }} · {{ asset.downloads }} download{{ "s" if asset.downloads != 1 }} +
  • + {%- endfor %} +
+
+ {%- endif %} +
+ {%- endfor %} +
+{%- else %} +

+ Release data could not be loaded at build time. See + all Basilisk releases on GitHub. +

+{%- endif %} diff --git a/website/src/docs/rules/index.md b/website/src/docs/rules/index.md index 50ce730f..78e9d19d 100644 --- a/website/src/docs/rules/index.md +++ b/website/src/docs/rules/index.md @@ -17,7 +17,7 @@ Every Basilisk diagnostic has a unique code in the format `BSK-EXXXX` (error) or Rules are enabled by default. You can dial individual rules down per-file or per-path from your editor or `pyproject.toml` — strict is the default, not a cage. -Basilisk ships **155 diagnostic codes** (150 errors, 5 warnings) spanning the full Python typing surface — generics, protocols, dataclasses, TypedDicts, overloads, literals, enums, and more — and is validated against the [official Python typing conformance suite](https://github.com/python/typing/blob/main/conformance/results/results.html) (currently **98.6%**, 144 / 146). The two foundational groups have worked examples: +Basilisk ships **155 diagnostic codes** (150 errors, 5 warnings) spanning the full Python typing surface — generics, protocols, dataclasses, TypedDicts, overloads, literals, enums, and more — and is scored by the [official Python typing conformance suite](https://github.com/python/typing/blob/main/conformance/results/results.html) (currently **{{ conformance.scorePct }}%**, {{ conformance.pass }} / {{ conformance.total }} (errors+warnings, strictest); target 100% — [how we measure](/docs/conformance/)). The two foundational groups have worked examples: | Group | Codes | Description | |---|---|---| diff --git a/website/src/index.njk b/website/src/index.njk index d775a0b4..cd84eb01 100644 --- a/website/src/index.njk +++ b/website/src/index.njk @@ -252,19 +252,19 @@ benchmarkStrings:
- 98.6% + {{ conformance.scorePct }}% PEP conformance score

04 — PEP conformance

-

144 of 146 tests passing.
Target: 100%.

+

{{ conformance.pass }} of {{ conformance.total }} tests passing ({{ conformance.scorePct }}%).
Target: 100%.

- Tested against the official Python typing conformance suite — - the same suite used to measure Pyright (~99%¹), + Scored by the official Python typing conformance suite — + the same harness used to measure Pyright (~99%¹), mypy (~58%¹), and Pyrefly (~86%¹). - 19 of 21 categories pass at 100%, with zero false positives. The remaining two — a TypeVarTuple generics case and one protocol-definition case — are next. + Today {{ conformance.categoriesPass100 }} of {{ conformance.categoriesTotal }} categories pass at 100%; the suite reports {{ conformance.fp }} false-positive diagnostics and {{ conformance.missed }} missed required errors, and we are driving both to zero. 100% is the target, not a present-day claim. How we measure this →

@@ -372,7 +372,7 @@ benchmarkStrings: PEP conformance ¹ - 98.6% (144/146) + {{ conformance.scorePct }}% ({{ conformance.pass }}/{{ conformance.total }}, target 100%) ~99% ~58% full-pass alpha diff --git a/website/src/readme.html b/website/src/readme.html deleted file mode 100644 index 7e4628ee..00000000 --- a/website/src/readme.html +++ /dev/null @@ -1,240 +0,0 @@ ---- -layout: layouts/docs.njk -title: README -description: Crate architecture, diagnostic rules, and development guide for Basilisk. -keywords: basilisk, readme, crate architecture, rust, python type checker -# English-only crate README — no Chinese twin exists, so opt it out of the -# language cluster (no /zh/readme/ hreflang or switcher link, which would 404). -noTranslation: true -eleventyNavigation: - key: README - order: 99 -permalink: /readme/ ---- - -

- Basilisk -

- -

Basilisk

- -

- The open-source Python language server.
- Complete language server, type checker, debugger, and profiler — strict by default.
- VS Code, Cursor & Windsurf (Open VSX) • Zed • Neovim. Built in Rust — single binary, no runtime. -

- -

- Website  •  - Install  •  - Quick Start  •  - Rules  •  - Refactoring  •  - Compare -

- ---- - -

- Basilisk in action — type checking, diagnostics, and refactoring in the editor -

- -## Try it - -The `examples/` folder has ready-to-go Python files: - -```sh -basilisk check examples/bad.py # everything flagged -basilisk check examples/good.py # clean -basilisk check examples/mixed.py # some errors, some clean -basilisk check examples/ # all three at once -``` - ---- - -## Quick example - - - - - - - - - - -
Basilisk rejects thisFixed
- -```python -def greet(name): - return "Hello " + name -``` - - - -```python -def greet(name: str) -> str: - return "Hello " + name -``` - -
- ---- - -## Rules - -All rules are on by default. There is no way to relax them globally. - -### Annotation rules (E0001-E0005) - -| Code | Triggers when | -|------|---------------| -| `BSK-E0001` | Function parameter has no type annotation | -| `BSK-E0002` | Function is missing a return type annotation | -| `BSK-E0003` | Variable assignment has no type annotation | -| `BSK-E0004` | `*args` or `**kwargs` has no type annotation | -| `BSK-E0005` | Class attribute has no type annotation | - -### Type correctness (E0010-E0029) - -| Code | Triggers when | -|------|---------------| -| `BSK-E0010` | Import cannot be resolved | -| `BSK-E0011` | Explicit `Any` annotation (emitted as a warning), or a return type mismatch | -| `BSK-E0012` | Argument type does not match parameter type | -| `BSK-E0013` | Return type does not match declared return type | -| `BSK-E0014` | Assignment type does not match declared variable type | -| `BSK-E0015` | Wrong number of type arguments (e.g. `list[int, str]`) | -| `BSK-E0016` | Method override has incompatible signature | -| `BSK-E0017` | Class variable override has incompatible type | -| `BSK-E0018` | Reference to an undefined name | -| `BSK-E0019` | Variable used before it is assigned | -| `BSK-E0020` | `@overload` group has no non-decorated implementation | -| `BSK-E0021` | Two `@overload` signatures overlap | -| `BSK-E0022` | Dict key type is not hashable | -| `BSK-E0023` | `match` statement is not exhaustive | -| `BSK-E0024` | Type expression is not valid (e.g. a numeric literal used as a type) | -| `BSK-E0025` | Override method is missing the `@override` decorator | -| `BSK-E0026` | `TypeVar` declared with a single constraint | -| `BSK-E0027` | Duplicate `TypeVar` in a `Generic[...]` base | -| `BSK-E0029` | Method defined inside a `TypedDict` class | - -These are the most common rules. 
Basilisk ships **155 diagnostic codes** in total (150 errors, 5 warnings) — see the [complete diagnostic reference](https://www.basilisk-python.dev/docs/rules/) (generated from the checker source by `scripts/gen_rules_reference.py`). - ---- - -## Refactoring - -Basilisk ships a suite of refactoring code actions — available via the lightbulb (code actions) menu in VS Code, Cursor, and Windsurf (via Open VSX), plus Zed and Neovim. No extra extensions required. - -| Action | Kind | What it does | -|--------|------|-------------| -| **Extract variable** | `refactor.extract` | Extract expression into a named variable | -| **Extract variable (replace all)** | `refactor.extract` | Replace all identical occurrences | -| **Extract constant** | `refactor.extract` | Extract to module-level `SCREAMING_SNAKE` constant | -| **Extract function** | `refactor.extract` | Extract selected statements into a new function | -| **Inline variable** | `refactor.inline` | Replace variable with its value, delete assignment | -| **Inline function** | `refactor.inline` | Replace call with function body (single-expression) | -| **Move to new file** | `refactor.move` | Move class/function to a new file, leave import behind | -| **Move to existing file** | `refactor.move` | Move class/function to a chosen file via command | -| **Rename symbol** | — | Scope-aware rename with keyword arg, `self.attr`, docstring, and `__all__` updates | -| **Remove parameter** | `refactor.rewrite` | Remove parameter from function + all call sites | -| **Add parameter** | `refactor.rewrite` | Add `new_param=None` to function signature | -| **Sort parameters** | `refactor.rewrite` | Alphabetically sort parameters (keeps `self`/`cls` first) | -| **Implement abstract methods** | `refactor.rewrite` | Generate method stubs for abstract base class | -| **Convert Union/Optional** | `refactor.rewrite` | `Union[X, Y]` ↔ `X \| Y`, `Optional[X]` ↔ `X \| None` | -| **Convert constructs** | `refactor.rewrite` | f-string ↔ 
`.format()`, `dict()` ↔ `{}`, `list()` ↔ `[]`, ternary ↔ if/else, NamedTuple class ↔ functional | - -Extract function detects async functions, methods (`self`/`cls`), and rejects selections containing `yield`, `break`, or `continue`. - ---- - -## Output format - -Diagnostics use rustc-style output: - -``` -error[BSK-E0001]: Missing parameter type annotation for `data` - --> src/utils.py:14:13 - | -14 | def process(data): - | ^^^^ - | - = help: Add a type annotation: `data: ` - = note: In Basilisk, all function parameters require explicit types - = see: https://www.basilisk-python.dev/errors/BSK-E0001 -``` - -| Exit code | Meaning | -|-----------|---------| -| `0` | Clean — no errors | -| `1` | Type errors found | -| `3` | Internal error | - ---- - -## Architecture - -Basilisk is a Cargo workspace. Each crate owns one layer of the analysis pipeline. - -> **Pipeline:** source text → parser → AST → resolver → scopes → checker → diagnostics -> -> **Incremental:** `basilisk-db` caches ASTs and resolved modules by content hash so only changed files re-run the pipeline. 
- -### Analysis pipeline - -| Crate | What it does | Status | -|-------|-------------|--------| -| [basilisk-parser](crates/basilisk-parser/) | Wraps `ruff_python_parser` to parse `.py` source into a typed AST | Done | -| [basilisk-resolver](crates/basilisk-resolver/) | Name resolution and scope analysis — catches undefined names and use-before-assignment | Done | -| [basilisk-checker](crates/basilisk-checker/) | Core type checker — implements all E0001-E0025 rules | Done | -| [basilisk-cli](crates/basilisk-cli/) | The `basilisk` binary — wires the full pipeline together | Done | - -### LSP and infrastructure - -| Crate | What it does | Status | -|-------|-------------|--------| -| [basilisk-lsp](crates/basilisk-lsp/) | LSP server — diagnostics, hover, go-to-def, code actions, refactoring, debugging | Working | -| [basilisk-db](crates/basilisk-db/) | Salsa-based incremental computation for <10ms latency | Working | -| [basilisk-config](crates/basilisk-config/) | Configuration parsing (`pyproject.toml`, `basilisk.json`) | Done | -| [basilisk-stubs](crates/basilisk-stubs/) | Bundled type stubs (typeshed) — no internet needed | Working | -| [basilisk-uv](crates/basilisk-uv/) | uv package manager integration for the LSP | Working | -| [basilisk-common](crates/basilisk-common/) | Shared constants and types — zero deps, WASM-compatible | Done | -| [basilisk-test-utils](crates/basilisk-test-utils/) | Shared E2E test helpers | Done | - -### Future capabilities - -| Crate | What it does | Status | -|-------|-------------|--------| -| [basilisk-mojo](crates/basilisk-mojo/) | Mojo-inspired ownership/immutability analysis (`Borrowed`, `InOut`, `Owned`) | Phase 4 | -| [basilisk-compiler](crates/basilisk-compiler/) | Compiles typed Python to native code | Future | -| [basilisk-plugin](crates/basilisk-plugin/) | WASM plugin host for Django, Pydantic, SQLAlchemy type extensions | Phase 5 | - -### Editor extensions - -| Extension | Editor | Status | -|-----------|--------|--------| 
-| [vscode-extension](vscode-extension/) | VS Code | Working | -| [basilisk.nvim](basilisk.nvim/) | Neovim 0.10+ | Working | -| [basilisk-zed](basilisk-zed/) | Zed | Phase 2 | - ---- - -## Development - -```sh -cargo build # build all crates -cargo test # run all tests -cargo clippy # lint (zero warnings policy) -cargo fmt # format -``` - -Rust 1.87+ required. - ---- - -## License - -MIT. - -Built by [NIMBLESITE PTY LTD](https://www.nimblesite.co). diff --git a/website/src/zh/docs/comparison.md b/website/src/zh/docs/comparison.md index 26afde5c..94091243 100644 --- a/website/src/zh/docs/comparison.md +++ b/website/src/zh/docs/comparison.md @@ -35,7 +35,7 @@ Basilisk 消除了这个选择。没有宽松模式可以回退。 | 功能 | Basilisk | Pyright | mypy | ty | Pyrefly | |---|---|---|---|---|---| | 默认严格 | ✅ | ❌ 选择加入 | ❌ 选择加入 | ❌ 选择加入 | ❌ 选择加入 | -| PEP 符合性¹ | 98.6%(目标 →100%) | ~99% | ~58% | 早期 alpha | ~86% | +| PEP 符合性¹ | 当前 {{ conformance.scorePct }}%(目标 →100%) | ~99% | ~58% | 早期 alpha | ~86% | | 实现语言 | Rust | TypeScript | Python/C | Rust | Rust | | 需要运行时 | 无 | Node.js | Python | 无 | 无 | | 完整 LSP(补全、悬停、跳转) | ✅ | 仅 Pylance | ❌ | 基础 | 基础 | @@ -153,7 +153,7 @@ Basilisk 不是现有工具的更快版本。它占据了不同的位置: 5. 
WASM 插件系统(计划中)——无需分叉即可扩展,设计安全 **Basilisk 尚不是最佳选择的地方:** -- PEP 符合性:Basilisk 通过官方符合性套件的 98.6%(144/146)。Pyright 今天覆盖更多边缘情况。Basilisk 的目标是 100%;还未达到。 +- PEP 符合性:Basilisk 当前通过官方符合性套件的 {{ conformance.scorePct }}%({{ conformance.pass }}/{{ conformance.total }},错误加警告,最严格评分),仍有 {{ conformance.fp }} 处误报和 {{ conformance.missed }} 处遗漏的必需错误正在被压低。Pyright 今天覆盖远更多边缘情况。Basilisk 的目标是 100%;还未达到。参见[我们如何衡量符合性](/zh/docs/conformance/)。 - 插件生态系统:mypy 的 Django 和 SQLAlchemy 插件已经成熟。Basilisk 的 WASM 插件是计划中的。 - 成熟度:Pylance 今天功能完整(虽然是专有的,且仅限 VS Code)。Basilisk 处于 alpha 阶段。 diff --git a/website/src/zh/docs/conformance.md b/website/src/zh/docs/conformance.md new file mode 100644 index 00000000..a389cbf7 --- /dev/null +++ b/website/src/zh/docs/conformance.md @@ -0,0 +1,107 @@ +--- +layout: layouts/docs.njk +title: "Basilisk 如何衡量 PEP 符合性" +description: "Basilisk 的 PEP 符合性得分如何用官方 python/typing 符合性套件衡量——套件是什么、评分如何进行、我们运行的字节级一致且 sha256 固定的计算器,以及我们对自己评分所做的更正。" +keywords: pep 符合性, python 类型符合性套件, basilisk 符合性得分, 类型检查器评分, python/typing 计算器 +lang: zh +--- +{% from "conformance-chart.njk" import chart %} + +# 我们如何衡量 PEP 符合性 + +Basilisk 由**官方 `python/typing` 符合性套件**评分——也就是类型社区用来为 pyright、mypy、pyrefly、ty 等打分的同一套测试与评分工具。我们在每次改动时,对真实的 `basilisk` 二进制文件原样运行该工具。 + +目前的结果是 **{{ conformance.scorePct }}%**——{{ conformance.total }} 个测试文件中 **{{ conformance.pass }}** 个通过,捕获 {{ conformance.caught }} 个必需错误,仍有 **{{ conformance.fp }} 处误报**和 **{{ conformance.missed }} 处遗漏的必需错误**待清除。{{ conformance.categoriesTotal }} 个类别中有 {{ conformance.categoriesPass100 }} 个达到 100%。目标是 100%,我们逐步逼近。 + + + +## 符合性套件是什么 + +[Python 类型规范](https://typing.python.org/en/latest/spec/)定义了类型系统应当如何运作——泛型、协议、dataclass、`TypedDict`、重载、字面量等。为了让规范不停留在纸面上,类型社区在 [`python/typing`](https://github.com/python/typing/tree/main/conformance) 仓库中与规范并行维护着一套**符合性测试套件**。 + +它的工作方式是: + +- 每个规范章节对应一个或多个**测试文件**——普通的 Python 模块,用 `# E` 注释标出每一行符合规范的类型检查器**必须**报告错误的位置(以及用 `# E[tag]` 组标出多个相关错误中报告其一即可的位置)。 +- 
一个小型**评分工具**对这些文件运行某个类型检查器,并将其输出与注释做差异比对。文件只有在差异为空时才*通过*:每个必需错误都被报告,且没有任何诊断落在套件未标记的行上。 +- 维护者用它为每个检查器打分,并发布[结果表](https://github.com/python/typing/blob/main/conformance/results/results.html)——pyright 约 99%、pyrefly 约 86% 等数字便是这样得出的。 + +我们使用的正是这套套件,固定在提交 [`{{ conformance.pinnedRef }}`](https://github.com/python/typing/tree/{{ conformance.pinnedRef }}/conformance)。因为同样的工具与文件为所有人打分,这个数字在各检查器之间可比,也不是我们能朝自己有利方向调整的。 + +## 一个文件如何评分 + +整个算法就是套件 `main.py` 中的两个函数——`get_expected_errors`(读取 `# E` 注释)与 `diff_expected_errors`(与检查器输出比对)。文件**当且仅当**该差异为空时通过: + +- 套件的规则(`upstream_main.py:185`):`"Fail" if errors_diff.strip() else "Pass"` + +我们计入检查器发出的**每一个**诊断——错误*和*警告,**不排除任何诊断代码**。这是套件最严格的读法,也是参考检查器 pyright 的评分方式。一个多余的诊断(一处误报)就会让整个文件失败,这正是误报数与通过数同样重要的原因。 + +## 我们如何在不分叉的情况下运行它 + +套件的 `main.py` 是给 `python/typing` 维护者用的批处理工具:它一次性为所有已知检查器打分,引入 TOML 配置/报告依赖,并写出结果矩阵。它无法调用我们的二进制文件。因此,正如套件为每个检查器所做的那样(`PyrightTypeChecker`、`MypyTypeChecker` 等),我们加一个薄薄的**适配器**,复用套件自己的评分而非重新实现。我们的 [`score.py`](https://github.com/Nimblesite/Basilisk/blob/main/conformance/score.py): + +1. **适配器**——运行 `basilisk check --output json`,把结果整理成套件函数期望的 `{line: [errors]}` 字典(这是套件唯一无法替我们做的事)。 +2. **计算器**——从一份已提交的、与套件 `main.py` 字节级一致的副本中导入 `get_expected_errors` 与 `diff_expected_errors` 并原样调用(`score.py:287` 对应套件自己在 `upstream_main.py:175` 的调用)。它不含任何自己的评分逻辑。 +3. **门禁**——将结果与 `coverage-thresholds.json` 比较,任何回归都让 CI 失败。 + +为保证计算器可信,内置副本经 **sha256 固定**。`score.py` 在每次运行时重新哈希它,若有漂移则拒绝评分(`score.py:99`);本网站在构建时也会再次重新哈希: + +{% if conformance.verified %} +

✓ 构建时已校验 —— conformance/upstream_main.py 为 {{ conformance.upstreamBytes }} 字节,sha256 {{ conformance.sha256Short }}…,与固定值一致

+{% endif %} + +保持官方文件不被改动正是要点所在:适配器与门禁住在另一个可审计的文件里,因此计算器逐字节就是套件自己的那一份。 + +## 我们做的一处更正 + +我们的得分过去由仓库内自己的一个脚本衡量,而它是**错误的**。该脚本将若干诊断代码排除在评分之外,且未计入误报,因此报出的数字一路爬到了 100%。这是一个诚实的失误,并非有意调高——但它仍然是错的。 + +我们用上面所述的官方计算器替换了它。在计入每个诊断、不排除任何代码之后,诚实的数字是 **{{ conformance.scorePct }}%**: + +
+ 100% + + {{ conformance.scorePct }}% + 检查器没有变差——是衡量变正确了。100% 是我们正在努力达成的目标,而非对当下的宣称。 +
+ +下面的图表在构建时直接读取 **`conformance/conformance_status.csv` 的 git 历史**:每个改动该文件的提交对应一个点,绘制该提交实际记录的得分。 + +{{ chart(conformance, { + "label": "符合性得分随时间变化", + "heading": "从早期仓库内数字到官方计算器", + "prevLegend": "早期仓库内脚本(排除部分代码、未计入误报)", + "officialLegend": "官方 python/typing 计算器", + "dropNote": "在 " + conformance.chart.peak.shortDate + ",仓库内脚本报告了 " + conformance.chart.peak.score + "%。官方计算器首次于 " + conformance.chart.current.shortDate + " 运行,报出 " + conformance.chart.current.score + "%——这是更正,而非回归。", + "caption": "每个点都是对 conformance/conformance_status.csv 的真实提交,每次构建重新计算。悬停某点可查看其日期、提交、得分与误报数。" +}) }} + +## 各类别现状 + +构建时从 `conformance/conformance_status.csv` 实时读取: + +
+ + + +{%- for cat in conformance.categories %} + +{%- endfor %} + +
类别通过得分
{{ cat.label }}{{ cat.pass }} / {{ cat.total }}{{ cat.pct }}%
+
+ +## 自己复现 + +```bash +# 构建二进制、获取(被 git 忽略的)测试夹具、对其运行官方 python/typing +# 计算器、写出 conformance_status.csv,并强制执行 coverage-thresholds.json 中的棘轮门禁。 +make conformance +``` + +这一切都在两个文件里:[`conformance/score.py`](https://github.com/Nimblesite/Basilisk/blob/main/conformance/score.py)(我们的适配器与门禁)和 [`conformance/upstream_main.py`](https://github.com/Nimblesite/Basilisk/blob/main/conformance/upstream_main.py)(套件的计算器,committed 且经 sha256 固定)。完整的注解规则见 [python/typing 符合性 README](https://github.com/python/typing/blob/main/conformance/README.md)。 diff --git a/website/src/zh/docs/index.md b/website/src/zh/docs/index.md index adfa6d3d..1528a5df 100644 --- a/website/src/zh/docs/index.md +++ b/website/src/zh/docs/index.md @@ -63,7 +63,7 @@ Basilisk 目前处于 **alpha**——核心检查器、LSP 服务器和编辑器 |---|---|---| | 1 | 解析器、解析器、类型检查器、CLI | 完成 | | 2 | LSP 服务器、编辑器扩展(VS Code、Cursor、Zed、Neovim) | 完成 | -| 3 | 扩展规则集,98.6% PEP 符合性,渐进式采用 | 进行中 | +| 3 | 扩展规则集,PEP 符合性攻坚(当前 {{ conformance.scorePct }}%,目标 100%),渐进式采用 | 进行中 | | 4 | 所有权与不可变性分析(Mojo 启发) | 计划中 | | 5 | WASM 插件,Django/Pydantic/SQLAlchemy | 计划中 | | 6 | 95%+ PEP,SARIF/JUnit,JetBrains 扩展 | 计划中 | diff --git a/website/src/zh/docs/rules/index.md b/website/src/zh/docs/rules/index.md index a36ddcd6..227ffc7f 100644 --- a/website/src/zh/docs/rules/index.md +++ b/website/src/zh/docs/rules/index.md @@ -12,7 +12,7 @@ lang: zh 规则默认全部启用。您可以通过编辑器或 `pyproject.toml`,按文件或路径将单个规则调低——严格是默认值,而不是牢笼。 -Basilisk 内置 **155 个诊断代码**(150 个错误,5 个警告),覆盖完整的 Python 类型表面(泛型、协议、dataclass、TypedDict、重载、字面量、枚举等),通过[官方 Python 类型符合性套件](https://github.com/python/typing/blob/main/conformance/results/results.html)验证(当前符合率 **98.6%**,144 / 146)。下面记录了两个基础组;完整集合由检查器强制执行。 +Basilisk 内置 **155 个诊断代码**(150 个错误,5 个警告),覆盖完整的 Python 类型表面(泛型、协议、dataclass、TypedDict、重载、字面量、枚举等),由[官方 Python 类型符合性套件](https://github.com/python/typing/blob/main/conformance/results/results.html)评分(当前符合率 **{{ conformance.scorePct }}%**,{{ conformance.pass }} / {{ conformance.total }}(错误加警告,最严格);目标 100% —— 
[我们如何衡量](/zh/docs/conformance/))。下面记录了两个基础组;完整集合由检查器强制执行。 | 组 | 代码 | 描述 | |---|---|---| diff --git a/website/src/zh/index.njk b/website/src/zh/index.njk index 4ceb0dae..a2f3dedb 100644 --- a/website/src/zh/index.njk +++ b/website/src/zh/index.njk @@ -263,19 +263,19 @@ benchmarkStrings:
- 98.6% + {{ conformance.scorePct }}% PEP 符合性得分

04 — PEP 符合性

-

146 个测试中 144 个通过。
目标:100%。

+

{{ conformance.total }} 个测试中 {{ conformance.pass }} 个通过({{ conformance.scorePct }}%)。
目标:100%。

- 通过官方 Python 类型符合性套件测试—— + 由官方 Python 类型符合性套件评分—— 与衡量 Pyright(约 99%¹)、 mypy(约 58%¹) 和 Pyrefly(约 86%¹)的套件相同。 - 21 个类别中的 19 个达到 100%,且零误报。剩余两个 —— 一个 TypeVarTuple 泛型用例和一个协议定义用例 —— 是下一步。 + 目前 {{ conformance.categoriesTotal }} 个类别中有 {{ conformance.categoriesPass100 }} 个达到 100%;套件报告 {{ conformance.fp }} 处误报和 {{ conformance.missed }} 处遗漏的必需错误,我们正将两者都降到零。100% 是目标,而非当下的宣称。我们如何衡量 →

@@ -382,7 +382,7 @@ benchmarkStrings: PEP 符合性 ¹ - 98.6% (144/146) + {{ conformance.scorePct }}% ({{ conformance.pass }}/{{ conformance.total }},目标 100%) ~99% ~58% 完全通过 alpha