From a89b138bd5a891b41e63027a0173992d7ec9d5c8 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 09:13:36 -0400 Subject: [PATCH 1/6] Add certified bundle metadata --- docs/dev.md | 11 +- docs/index.md | 1 + docs/myst.yml | 1 + docs/release-bundles.md | 280 ++++++++++++++++++ pyproject.toml | 6 +- src/policyengine/core/__init__.py | 6 + src/policyengine/core/release_manifest.py | 141 +++++++++ .../core/tax_benefit_model_version.py | 69 ++++- .../data/release_manifests/uk.json | 24 +- .../data/release_manifests/us.json | 22 +- .../tax_benefit_models/uk/model.py | 58 ++-- .../tax_benefit_models/us/model.py | 46 +-- tests/test_release_manifests.py | 198 ++++++++++++- 13 files changed, 777 insertions(+), 86 deletions(-) create mode 100644 docs/release-bundles.md diff --git a/docs/dev.md b/docs/dev.md index 5ae84682..8fb0a140 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -12,17 +12,18 @@ ```bash git clone https://github.com/PolicyEngine/policyengine.py.git cd policyengine.py -uv pip install -e .[dev] +uv pip install -e ".[dev]" ``` -This installs both UK and US country models plus dev dependencies (pytest, ruff, mypy, towncrier). +This installs the shared analysis layer, both country model extras, and the dev +dependencies used in CI (pytest, ruff, mypy, towncrier). 
## Common commands ```bash make format # ruff format make test # pytest with coverage -make docs # build documentation site +make docs # run the MyST docs build used in CI via npx make clean # remove caches, build artifacts, .h5 files ``` @@ -60,7 +61,7 @@ PRs trigger the following checks: | Tests (Python 3.13) | Required | `make test` | | Tests (Python 3.14) | Required | `make test` | | Mypy | Informational | `mypy src/policyengine` | -| Docs build | Required | MyST build | +| Docs build | Required | `make docs` | ## Versioning and releases @@ -73,6 +74,8 @@ echo "Description of change" > changelog.d/my-change.added On merge, the versioning workflow bumps the version, builds the changelog, and creates a GitHub Release. +For the target release-bundle architecture, see [Release bundles](release-bundles.md). That document defines the split between country `*-data` build manifests and `policyengine.py` certified runtime bundles. + ## Architecture ### Package layout diff --git a/docs/index.md b/docs/index.md index 3a6d2b43..bbd88974 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,4 +16,5 @@ We do this by: - [US tax-benefit model](country-models-us.md): Entities, parameters, reform examples - [Examples](examples.md): Complete working scripts - [Visualisation](visualisation.md): Publication-ready charts with Plotly +- [Release bundles](release-bundles.md): Reproducible model-plus-data certification and provenance - [Development](dev.md): Setup, testing, CI, architecture diff --git a/docs/myst.yml b/docs/myst.yml index 2984d6e7..6924ef21 100644 --- a/docs/myst.yml +++ b/docs/myst.yml @@ -17,6 +17,7 @@ project: - file: country-models-us.md - file: examples.md - file: visualisation.md + - file: release-bundles.md - file: dev.md site: diff --git a/docs/release-bundles.md b/docs/release-bundles.md new file mode 100644 index 00000000..d97e0370 --- /dev/null +++ b/docs/release-bundles.md @@ -0,0 +1,280 @@ +# Release Bundles + +This document defines the intended 
reproducibility boundary for `policyengine.py`. + +The key design decision is: + +- country `*-data` repos build and stage immutable data artifacts +- `policyengine.py` is the only component that certifies supported runtime bundles +- `policyengine.py` does not rebuild country data itself + +This keeps country-specific data construction in the country data repos while still giving users a single top-level version to cite and pin. + +## Why this boundary exists + +For countries like the UK, the data package is not model-independent. Dataset construction, imputations, and calibration steps call the country model directly. That means a published dataset artifact depends on: + +- the country model version used during data construction +- the calibration targets used during data construction +- the raw input data used during data construction + +If `policyengine.py` only pins a country model version and a data package version without checking that relationship, the provenance boundary is incomplete. + +## Roles + +### Country model package + +Examples: `policyengine-uk`, `policyengine-us` + +The country model package owns: + +- policy logic +- variables and parameters +- reforms +- a `data_build_fingerprint` for the subset of model logic that affects data construction + +It does not own final runtime bundle certification. + +### Country data package + +Examples: `policyengine-uk-data`, `policyengine-us-data` + +The country data package owns: + +- data build pipelines +- raw input acquisition +- calibration target snapshots +- expensive dataset construction +- staging immutable build artifacts with provenance + +It does not define the final supported runtime bundle exposed to users. + +### `policyengine.py` + +`policyengine.py` owns: + +- runtime bundle certification +- user-facing reproducibility boundaries +- the supported mapping from `policyengine.py` version to country model version and certified data artifact + +It does not rebuild microdata artifacts. 
+ +## Two manifest layers + +The architecture has two manifest layers with different responsibilities. + +### 1. Data build manifest + +Published by the country `*-data` repo. + +This answers: + +- what bytes were produced +- how they were produced +- which exact model and targets produced them + +Suggested schema: + +```json +{ + "schema_version": 1, + "country_id": "uk", + "data_package": { + "name": "policyengine-uk-data", + "version": "1.41.0" + }, + "build": { + "build_id": "uk-data-2026-04-12T12-30-00Z", + "git_sha": "abc123", + "built_at": "2026-04-12T12:30:00Z", + "built_with_model_package": { + "name": "policyengine-uk", + "version": "2.81.0", + "git_sha": "def456", + "data_build_fingerprint": "sha256:..." + }, + "calibration_targets": { + "snapshot_id": "2026-04-10", + "sha256": "sha256:..." + }, + "raw_inputs": [ + { + "name": "frs_2023_24", + "sha256": "sha256:..." + } + ], + "build_environment": { + "python_version": "3.13.3", + "lockfile_sha256": "sha256:..." + } + }, + "default_datasets": { + "national": "enhanced_frs_2023_24", + "baseline": "frs_2023_24" + }, + "artifacts": { + "enhanced_frs_2023_24": { + "kind": "microdata", + "repo_id": "policyengine/policyengine-uk-data-private", + "path": "builds/uk-data-2026-04-12T12-30-00Z/enhanced_frs_2023_24.h5", + "revision": "uk-data-2026-04-12T12-30-00Z", + "sha256": "sha256:...", + "size_bytes": 123456789 + } + } +} +``` + +Notes: + +- `build_id` must be immutable. +- build artifacts should be staged under a build-specific path or revision, not a floating release tag. +- the build manifest is the authoritative provenance record for the artifact bytes. + +### 2. Certified runtime bundle manifest + +Published by `policyengine.py`. 
+ +This answers: + +- which model and data artifact are supported together at runtime +- which exact dataset should be used by default +- which artifact checksum and provenance should be surfaced to users + +Suggested schema: + +```json +{ + "schema_version": 1, + "policyengine_version": "3.5.0", + "bundle_id": "uk-3.5.0", + "published_at": "2026-04-12T13:00:00Z", + "country_id": "uk", + "model_package": { + "name": "policyengine-uk", + "version": "2.81.1" + }, + "certified_data_artifact": { + "data_package": { + "name": "policyengine-uk-data", + "version": "1.41.0" + }, + "build_id": "uk-data-2026-04-12T12-30-00Z", + "dataset": "enhanced_frs_2023_24", + "uri": "hf://policyengine/policyengine-uk-data-private/builds/uk-data-2026-04-12T12-30-00Z/enhanced_frs_2023_24.h5@uk-data-2026-04-12T12-30-00Z", + "sha256": "sha256:..." + }, + "certification": { + "compatibility_basis": "matching_data_build_fingerprint", + "built_with_model_version": "2.81.0", + "certified_for_model_version": "2.81.1", + "data_build_fingerprint": "sha256:...", + "certified_by": "policyengine.py release workflow" + }, + "default_dataset": "enhanced_frs_2023_24", + "region_artifacts": { + "national": { + "dataset": "enhanced_frs_2023_24" + } + } +} +``` + +Notes: + +- this is the user-facing reproducibility boundary +- apps and APIs should surface this bundle, not only country package versions +- a bundle may reuse a previously staged data artifact if compatibility is explicitly certified + +## Compatibility rule + +The architecture should avoid forcing a new data build for every harmless country model release. + +To do that safely, compatibility must be explicit. + +### Data build fingerprint + +Each country model package should expose a `data_build_fingerprint` that covers the subset of logic that affects dataset construction or calibration. 
+ +Examples of inputs to the fingerprint: + +- variables used in imputations +- variables used in calibration loss matrices +- parameters referenced during data construction +- uprating or target-computation logic used during the build + +Things that should usually not affect the fingerprint: + +- runtime-only outputs that are not used in data construction +- UI-oriented metadata +- code paths unrelated to data construction + +### Certification rules + +`policyengine.py` may certify a staged data artifact for a model version only if one of the following is true: + +1. the model version exactly matches the `built_with_model_package.version` +2. the model version has the same `data_build_fingerprint` as the build-time model version + +If neither is true, the bundle release must fail and a new data build is required. The current implementation additionally accepts an exact `==` pin listed in the data release manifest's `compatible_model_packages`, recorded with the compatibility basis `legacy_compatible_model_package`. + +This should be a hard failure, not a warning. + +## Artifact states + +Artifacts should move through explicit states: + +- `staged`: built by the country data repo and available for inspection or later certification +- `certified`: referenced by a released `policyengine.py` runtime bundle +- `deprecated`: no longer recommended for new use, but still reproducible + +The key point is that `staged` and `certified` are different states. A staged artifact is not automatically part of a supported runtime release. + +## UK release workflow + +### Case 1: model-only release + +1. Cut UK model release candidate `M`. +2. Compute `data_build_fingerprint(M)`. +3. Compare it to the fingerprint recorded in the previously certified data build manifest. +4. If the fingerprint matches, skip the expensive UK data rebuild. +5. Release `policyengine.py` with a new certified runtime bundle that points to the existing staged UK artifact. + +### Case 2: data-affecting release + +1. Cut UK model release candidate `M`. +2. Compute `data_build_fingerprint(M)`. +3. 
If the fingerprint changed, build a new UK data artifact in `policyengine-uk-data` against: + - exact `policyengine-uk==M` + - exact target snapshot + - exact raw input hashes +4. Stage the new artifact under a build-specific immutable path or revision. +5. Publish the UK data build manifest. +6. Release `policyengine.py` with a certified runtime bundle that points to the new staged artifact. + +## Implementation guidance + +The current `release_manifest.json` mechanism in country data repos is a good starting point, but it is not yet enough on its own. The target implementation should add: + +- `built_with_model_package.version` +- `built_with_model_package.git_sha` +- `built_with_model_package.data_build_fingerprint` +- calibration target snapshot metadata +- immutable staged artifact paths or revisions + +The target implementation in `policyengine.py` should add: + +- hard validation of bundle certification rules +- explicit runtime bundle metadata on simulations, APIs, and app responses +- checksum-backed dataset resolution from the certified bundle manifest + +## Why not let `policyengine.py` build all country data directly? + +Because that would centralise the wrong concerns: + +- country-specific private data handling would move into the generic orchestration layer +- country-specific build logic would move into the generic orchestration layer +- expensive build failures would block the top-level runtime package more often +- provenance would still originate in the country data pipeline, so `policyengine.py` would not actually eliminate the need for the country build manifest + +`policyengine.py` should be the certification boundary, not the country data build system. 
diff --git a/pyproject.toml b/pyproject.toml index 98f829e8..07e2b048 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine" -version = "3.4.2" +version = "3.4.0" description = "A package to conduct policy analysis using PolicyEngine tax-benefit models." readme = "README.md" authors = [ @@ -28,7 +28,7 @@ dependencies = [ [project.optional-dependencies] uk = [ "policyengine_core>=3.23.6", - "policyengine-uk==2.78.0", + "policyengine-uk==2.74.0", ] us = [ "policyengine_core>=3.23.6", @@ -45,7 +45,7 @@ dev = [ "pytest-asyncio>=0.26.0", "ruff>=0.9.0", "policyengine_core>=3.23.6", - "policyengine-uk==2.78.0", + "policyengine-uk==2.74.0", "policyengine-us==1.602.0", "towncrier>=24.8.0", "mypy>=1.11.0", diff --git a/src/policyengine/core/__init__.py b/src/policyengine/core/__init__.py index 710024b5..bc3c0d27 100644 --- a/src/policyengine/core/__init__.py +++ b/src/policyengine/core/__init__.py @@ -12,11 +12,17 @@ from .region import Region as Region from .region import RegionRegistry as RegionRegistry from .region import RegionType as RegionType +from .release_manifest import CertifiedDataArtifact as CertifiedDataArtifact from .release_manifest import CountryReleaseManifest as CountryReleaseManifest +from .release_manifest import DataBuildInfo as DataBuildInfo +from .release_manifest import DataCertification as DataCertification from .release_manifest import DataPackageVersion as DataPackageVersion from .release_manifest import DataReleaseArtifact as DataReleaseArtifact from .release_manifest import DataReleaseManifest as DataReleaseManifest from .release_manifest import PackageVersion as PackageVersion +from .release_manifest import ( + certify_data_release_compatibility as certify_data_release_compatibility, +) from .release_manifest import get_data_release_manifest as get_data_release_manifest from .release_manifest import get_release_manifest as get_release_manifest from 
.scoping_strategy import RegionScopingStrategy as RegionScopingStrategy diff --git a/src/policyengine/core/release_manifest.py b/src/policyengine/core/release_manifest.py index c024e922..63ecb5b1 100644 --- a/src/policyengine/core/release_manifest.py +++ b/src/policyengine/core/release_manifest.py @@ -25,6 +25,17 @@ class CompatibleModelPackage(BaseModel): specifier: str +class BuiltWithModelPackage(PackageVersion): + git_sha: str | None = None + data_build_fingerprint: str | None = None + + +class DataBuildInfo(BaseModel): + build_id: str | None = None + built_at: str | None = None + built_with_model_package: BuiltWithModelPackage | None = None + + class ArtifactPathReference(BaseModel): path: str @@ -60,10 +71,32 @@ class DataReleaseManifest(BaseModel): default_factory=list ) default_datasets: dict[str, str] = Field(default_factory=dict) + build: DataBuildInfo | None = None artifacts: dict[str, DataReleaseArtifact] = Field(default_factory=dict) +class DataCertification(BaseModel): + compatibility_basis: str + certified_for_model_version: str + data_build_id: str | None = None + built_with_model_version: str | None = None + built_with_model_git_sha: str | None = None + data_build_fingerprint: str | None = None + certified_by: str | None = None + + +class CertifiedDataArtifact(BaseModel): + data_package: PackageVersion | None = None + dataset: str + uri: str + sha256: str | None = None + build_id: str | None = None + + class CountryReleaseManifest(BaseModel): + schema_version: int = 1 + bundle_id: str | None = None + published_at: str | None = None country_id: str policyengine_version: str model_package: PackageVersion @@ -71,9 +104,16 @@ class CountryReleaseManifest(BaseModel): default_dataset: str datasets: dict[str, ArtifactPathReference] = Field(default_factory=dict) region_datasets: dict[str, ArtifactPathTemplate] = Field(default_factory=dict) + certified_data_artifact: CertifiedDataArtifact | None = None + certification: DataCertification | None = None 
@property def default_dataset_uri(self) -> str: + if ( + self.certified_data_artifact is not None + and self.certified_data_artifact.dataset == self.default_dataset + ): + return self.certified_data_artifact.uri return resolve_dataset_reference(self.country_id, self.default_dataset) @@ -124,6 +164,107 @@ def get_data_release_manifest(country_id: str) -> DataReleaseManifest: return DataReleaseManifest.model_validate_json(response.text) +def _specifier_matches(version: str, specifier: str) -> bool: + if specifier.startswith("=="): + return version == specifier[2:] + return False + + +def certify_data_release_compatibility( + country_id: str, + runtime_model_version: str, + runtime_data_build_fingerprint: str | None = None, +) -> DataCertification: + country_manifest = get_release_manifest(country_id) + data_release_manifest = get_data_release_manifest(country_id) + built_with_model = ( + data_release_manifest.build.built_with_model_package + if data_release_manifest.build is not None + else None + ) + + if ( + built_with_model is not None + and built_with_model.name != country_manifest.model_package.name + ): + raise ValueError( + "Data release manifest was built with a different model package: " + f"expected {country_manifest.model_package.name}, " + f"got {built_with_model.name}." 
+ ) + + if ( + built_with_model is not None + and built_with_model.version == runtime_model_version + ): + return DataCertification( + compatibility_basis="exact_build_model_version", + certified_for_model_version=runtime_model_version, + data_build_id=( + data_release_manifest.build.build_id + if data_release_manifest.build is not None + else None + ), + built_with_model_version=built_with_model.version, + built_with_model_git_sha=built_with_model.git_sha, + data_build_fingerprint=built_with_model.data_build_fingerprint, + ) + + if ( + built_with_model is not None + and built_with_model.data_build_fingerprint is not None + and runtime_data_build_fingerprint is not None + and built_with_model.data_build_fingerprint == runtime_data_build_fingerprint + ): + return DataCertification( + compatibility_basis="matching_data_build_fingerprint", + certified_for_model_version=runtime_model_version, + data_build_id=( + data_release_manifest.build.build_id + if data_release_manifest.build is not None + else None + ), + built_with_model_version=built_with_model.version, + built_with_model_git_sha=built_with_model.git_sha, + data_build_fingerprint=built_with_model.data_build_fingerprint, + ) + + for compatible_model_package in data_release_manifest.compatible_model_packages: + if compatible_model_package.name != country_manifest.model_package.name: + continue + if _specifier_matches( + version=runtime_model_version, + specifier=compatible_model_package.specifier, + ): + return DataCertification( + compatibility_basis="legacy_compatible_model_package", + certified_for_model_version=runtime_model_version, + data_build_id=( + data_release_manifest.build.build_id + if data_release_manifest.build is not None + else None + ), + built_with_model_version=( + built_with_model.version + if built_with_model is not None + else None + ), + built_with_model_git_sha=( + built_with_model.git_sha if built_with_model is not None else None + ), + data_build_fingerprint=( + 
built_with_model.data_build_fingerprint + if built_with_model is not None + else None + ), + ) + + raise ValueError( + "Data release manifest is not certified for the runtime model version " + f"{runtime_model_version} in country '{country_id}'." + ) + + def resolve_dataset_reference(country_id: str, dataset: str) -> str: if "://" in dataset: return dataset diff --git a/src/policyengine/core/tax_benefit_model_version.py b/src/policyengine/core/tax_benefit_model_version.py index 7b09dfcc..eb8cfd5e 100644 --- a/src/policyengine/core/tax_benefit_model_version.py +++ b/src/policyengine/core/tax_benefit_model_version.py @@ -1,10 +1,10 @@ -from datetime import datetime +from datetime import UTC, datetime from typing import TYPE_CHECKING from uuid import uuid4 from pydantic import BaseModel, Field -from .release_manifest import CountryReleaseManifest, PackageVersion +from .release_manifest import CountryReleaseManifest, DataCertification, PackageVersion from .tax_benefit_model import TaxBenefitModel if TYPE_CHECKING: @@ -23,7 +23,9 @@ class TaxBenefitModelVersion(BaseModel): model: TaxBenefitModel version: str description: str | None = None - created_at: datetime | None = Field(default_factory=datetime.utcnow) + created_at: datetime | None = Field( + default_factory=lambda: datetime.now(UTC) + ) variables: list["Variable"] = Field(default_factory=list) parameters: list["Parameter"] = Field(default_factory=list) @@ -40,6 +42,7 @@ class TaxBenefitModelVersion(BaseModel): model_package: PackageVersion | None = Field(default=None) data_package: PackageVersion | None = Field(default=None) default_dataset_uri: str | None = Field(default=None) + data_certification: DataCertification | None = Field(default=None) @property def parameter_values(self) -> list["ParameterValue"]: @@ -126,7 +129,21 @@ def get_region(self, code: str) -> "Region | None": @property def release_bundle(self) -> dict[str, str | None]: + manifest_certification = ( + self.release_manifest.certification + if 
self.release_manifest is not None + else None + ) + certification = self.data_certification or manifest_certification + certified_data_artifact = ( + self.release_manifest.certified_data_artifact + if self.release_manifest is not None + else None + ) return { + "bundle_id": self.release_manifest.bundle_id + if self.release_manifest is not None + else None, "country_id": self.release_manifest.country_id if self.release_manifest is not None else None, @@ -136,14 +153,58 @@ def release_bundle(self) -> dict[str, str | None]: "model_package": self.model_package.name if self.model_package is not None else None, - "model_version": self.version, + "model_version": self.model_package.version + if self.model_package is not None + else None, "data_package": self.data_package.name if self.data_package is not None else None, "data_version": self.data_package.version if self.data_package is not None else None, + "default_dataset": self.release_manifest.default_dataset + if self.release_manifest is not None + else None, "default_dataset_uri": self.default_dataset_uri, + "certified_data_build_id": ( + certification.data_build_id + if certification is not None + else ( + certified_data_artifact.build_id + if certified_data_artifact is not None + else None + ) + ), + "certified_data_artifact_sha256": ( + certified_data_artifact.sha256 + if certified_data_artifact is not None + else None + ), + "data_build_model_version": ( + certification.built_with_model_version + if certification is not None + else None + ), + "data_build_model_git_sha": ( + certification.built_with_model_git_sha + if certification is not None + else None + ), + "data_build_fingerprint": ( + certification.data_build_fingerprint + if certification is not None + else None + ), + "compatibility_basis": ( + certification.compatibility_basis + if certification is not None + else None + ), + "certified_by": ( + certification.certified_by + if certification is not None + else None + ), } def __repr__(self) -> str: diff 
--git a/src/policyengine/data/release_manifests/uk.json b/src/policyengine/data/release_manifests/uk.json index ac1d93dd..90cc1cc1 100644 --- a/src/policyengine/data/release_manifests/uk.json +++ b/src/policyengine/data/release_manifests/uk.json @@ -1,15 +1,33 @@ { + "schema_version": 1, + "bundle_id": "uk-3.4.0", "country_id": "uk", - "policyengine_version": "3.4.1", + "policyengine_version": "3.4.0", "model_package": { "name": "policyengine-uk", - "version": "2.78.0" + "version": "2.74.0" }, "data_package": { "name": "policyengine-uk-data", - "version": "1.40.3", + "version": "1.40.4", "repo_id": "policyengine/policyengine-uk-data-private" }, + "certified_data_artifact": { + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.4" + }, + "build_id": "policyengine-uk-data-1.40.4", + "dataset": "enhanced_frs_2023_24", + "uri": "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" + }, + "certification": { + "compatibility_basis": "exact_build_model_version", + "data_build_id": "policyengine-uk-data-1.40.4", + "built_with_model_version": "2.74.0", + "certified_for_model_version": "2.74.0", + "certified_by": "policyengine.py bundled manifest" + }, "default_dataset": "enhanced_frs_2023_24", "datasets": { "frs_2023_24": { diff --git a/src/policyengine/data/release_manifests/us.json b/src/policyengine/data/release_manifests/us.json index 0ea73808..20526da9 100644 --- a/src/policyengine/data/release_manifests/us.json +++ b/src/policyengine/data/release_manifests/us.json @@ -1,15 +1,33 @@ { + "schema_version": 1, + "bundle_id": "us-3.4.0", "country_id": "us", - "policyengine_version": "3.4.1", + "policyengine_version": "3.4.0", "model_package": { "name": "policyengine-us", "version": "1.602.0" }, "data_package": { "name": "policyengine-us-data", - "version": "1.77.0", + "version": "1.73.0", "repo_id": "policyengine/policyengine-us-data" }, + "certified_data_artifact": { + "data_package": { + "name": "policyengine-us-data", + 
"version": "1.73.0" + }, + "build_id": "policyengine-us-data-1.73.0", + "dataset": "enhanced_cps_2024", + "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" + }, + "certification": { + "compatibility_basis": "exact_build_model_version", + "data_build_id": "policyengine-us-data-1.73.0", + "built_with_model_version": "1.602.0", + "certified_for_model_version": "1.602.0", + "certified_by": "policyengine.py bundled manifest" + }, "default_dataset": "enhanced_cps_2024", "datasets": { "enhanced_cps_2024": { diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py index 7b605157..f0467a3f 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ b/src/policyengine/tax_benefit_models/uk/model.py @@ -1,11 +1,9 @@ import datetime -import logging -from importlib.metadata import version +from importlib import metadata from pathlib import Path from typing import TYPE_CHECKING import pandas as pd -import requests from microdf import MicroDataFrame from policyengine.core import ( @@ -15,7 +13,10 @@ TaxBenefitModelVersion, Variable, ) -from policyengine.core.release_manifest import get_release_manifest +from policyengine.core.release_manifest import ( + certify_data_release_compatibility, + get_release_manifest, +) from policyengine.utils.entity_utils import ( build_entity_relationships, filter_dataset_by_household_variable, @@ -40,25 +41,6 @@ class PolicyEngineUK(TaxBenefitModel): uk_model = PolicyEngineUK() -_logger = logging.getLogger(__name__) - - -def _get_uk_package_metadata(): - """Get PolicyEngine UK package version and upload time (lazy-loaded).""" - pkg_version = version("policyengine-uk") - try: - response = requests.get( - "https://pypi.org/pypi/policyengine-uk/json", - timeout=10, - ) - response.raise_for_status() - data = response.json() - upload_time = data["releases"][pkg_version][0]["upload_time_iso_8601"] - except (requests.RequestException, KeyError, IndexError) as exc: - 
_logger.warning("Could not fetch PyPI metadata for policyengine-uk: %s", exc) - upload_time = None - return pkg_version, upload_time - class PolicyEngineUKLatest(TaxBenefitModelVersion): model: TaxBenefitModel = uk_model @@ -146,23 +128,33 @@ class PolicyEngineUKLatest(TaxBenefitModelVersion): def __init__(self, **kwargs: dict): manifest = get_release_manifest("uk") if "version" not in kwargs or kwargs.get("version") is None: - pkg_version, upload_time = _get_uk_package_metadata() - kwargs["version"] = pkg_version - if upload_time is not None: - kwargs["created_at"] = datetime.datetime.fromisoformat(upload_time) - - if kwargs["version"] != manifest.model_package.version: - raise RuntimeError( - "Installed policyengine-uk version does not match the bundled " - f"policyengine.py release manifest: {kwargs['version']} != " - f"{manifest.model_package.version}." + kwargs["version"] = manifest.model_package.version + + installed_model_version = metadata.version("policyengine-uk") + if installed_model_version != manifest.model_package.version: + raise ValueError( + "Installed policyengine-uk version does not match the " + f"bundled policyengine.py manifest. Expected " + f"{manifest.model_package.version}, got {installed_model_version}." 
) + from policyengine_uk.build_metadata import get_data_build_metadata + + model_build_metadata = get_data_build_metadata() + data_certification = certify_data_release_compatibility( + "uk", + runtime_model_version=installed_model_version, + runtime_data_build_fingerprint=model_build_metadata.get( + "data_build_fingerprint" + ), + ) + super().__init__(**kwargs) self.release_manifest = manifest self.model_package = manifest.model_package self.data_package = manifest.data_package self.default_dataset_uri = manifest.default_dataset_uri + self.data_certification = data_certification from policyengine_core.enums import Enum from policyengine_uk.system import system diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index c7d47a75..05ece1e3 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -1,10 +1,9 @@ import datetime -from importlib.metadata import version +from importlib import metadata from pathlib import Path from typing import TYPE_CHECKING import pandas as pd -import requests from microdf import MicroDataFrame from policyengine.core import ( @@ -14,7 +13,10 @@ TaxBenefitModelVersion, Variable, ) -from policyengine.core.release_manifest import get_release_manifest +from policyengine.core.release_manifest import ( + certify_data_release_compatibility, + get_release_manifest, +) from policyengine.utils.entity_utils import ( build_entity_relationships, filter_dataset_by_household_variable, @@ -46,15 +48,6 @@ class PolicyEngineUS(TaxBenefitModel): us_model = PolicyEngineUS() -def _get_us_package_metadata(): - """Get PolicyEngine US package version and upload time (lazy-loaded).""" - pkg_version = version("policyengine-us") - response = requests.get("https://pypi.org/pypi/policyengine-us/json") - data = response.json() - upload_time = data["releases"][pkg_version][0]["upload_time_iso_8601"] - return pkg_version, upload_time - - class 
PolicyEngineUSLatest(TaxBenefitModelVersion): model: TaxBenefitModel = us_model version: str = None @@ -127,22 +120,33 @@ class PolicyEngineUSLatest(TaxBenefitModelVersion): def __init__(self, **kwargs: dict): manifest = get_release_manifest("us") if "version" not in kwargs or kwargs.get("version") is None: - pkg_version, upload_time = _get_us_package_metadata() - kwargs["version"] = pkg_version - kwargs["created_at"] = datetime.datetime.fromisoformat(upload_time) - - if kwargs["version"] != manifest.model_package.version: - raise RuntimeError( - "Installed policyengine-us version does not match the bundled " - f"policyengine.py release manifest: {kwargs['version']} != " - f"{manifest.model_package.version}." + kwargs["version"] = manifest.model_package.version + + installed_model_version = metadata.version("policyengine-us") + if installed_model_version != manifest.model_package.version: + raise ValueError( + "Installed policyengine-us version does not match the " + f"bundled policyengine.py manifest. Expected " + f"{manifest.model_package.version}, got {installed_model_version}." 
) + from policyengine_us.build_metadata import get_data_build_metadata + + model_build_metadata = get_data_build_metadata() + data_certification = certify_data_release_compatibility( + "us", + runtime_model_version=installed_model_version, + runtime_data_build_fingerprint=model_build_metadata.get( + "data_build_fingerprint" + ), + ) + super().__init__(**kwargs) self.release_manifest = manifest self.model_package = manifest.model_package self.data_package = manifest.data_package self.default_dataset_uri = manifest.default_dataset_uri + self.data_certification = data_certification from policyengine_core.enums import Enum from policyengine_us.system import system diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index f2c27c72..eeda4b39 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -4,11 +4,14 @@ from unittest.mock import MagicMock, patch from policyengine.core.release_manifest import ( + certify_data_release_compatibility, dataset_logical_name, get_data_release_manifest, get_release_manifest, resolve_dataset_reference, ) +from policyengine.core.tax_benefit_model import TaxBenefitModel +from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion def _response_with_json(payload: dict) -> MagicMock: @@ -29,33 +32,49 @@ def teardown_method(self): def test__given_us_manifest__then_has_pinned_model_and_data_packages(self): manifest = get_release_manifest("us") + assert manifest.schema_version == 1 + assert manifest.bundle_id == "us-3.4.0" assert manifest.country_id == "us" - assert manifest.policyengine_version == "3.4.1" + assert manifest.policyengine_version == "3.4.0" assert manifest.model_package.name == "policyengine-us" assert manifest.model_package.version == "1.602.0" assert manifest.data_package.name == "policyengine-us-data" - assert manifest.data_package.version == "1.77.0" + assert manifest.data_package.version == "1.73.0" assert manifest.data_package.repo_id == 
"policyengine/policyengine-us-data" + assert manifest.certified_data_artifact is not None + assert manifest.certified_data_artifact.build_id == "policyengine-us-data-1.73.0" + assert manifest.certified_data_artifact.dataset == "enhanced_cps_2024" + assert manifest.certification is not None + assert manifest.certification.data_build_id == "policyengine-us-data-1.73.0" + assert manifest.certification.built_with_model_version == "1.602.0" + assert manifest.certification.certified_for_model_version == "1.602.0" def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self): manifest = get_release_manifest("uk") + assert manifest.schema_version == 1 + assert manifest.bundle_id == "uk-3.4.0" assert manifest.country_id == "uk" - assert manifest.policyengine_version == "3.4.1" + assert manifest.policyengine_version == "3.4.0" assert manifest.model_package.name == "policyengine-uk" - assert manifest.model_package.version == "2.78.0" + assert manifest.model_package.version == "2.74.0" assert manifest.data_package.name == "policyengine-uk-data" - assert manifest.data_package.version == "1.40.3" - assert ( - manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private" - ) + assert manifest.data_package.version == "1.40.4" + assert manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private" + assert manifest.certified_data_artifact is not None + assert manifest.certified_data_artifact.build_id == "policyengine-uk-data-1.40.4" + assert manifest.certified_data_artifact.dataset == "enhanced_frs_2023_24" + assert manifest.certification is not None + assert manifest.certification.data_build_id == "policyengine-uk-data-1.40.4" + assert manifest.certification.built_with_model_version == "2.74.0" + assert manifest.certification.certified_for_model_version == "2.74.0" def test__given_us_dataset_name__then_resolves_to_versioned_hf_url(self): resolved = resolve_dataset_reference("us", "enhanced_cps_2024") assert ( resolved - == 
"hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.77.0" + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" ) def test__given_uk_dataset_name__then_resolves_to_versioned_hf_url(self): @@ -63,16 +82,22 @@ def test__given_uk_dataset_name__then_resolves_to_versioned_hf_url(self): assert ( resolved - == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.3" + == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" ) def test__given_explicit_url__then_resolution_is_noop(self): - url = "hf://policyengine/policyengine-us-data/cps_2023.h5@1.77.0" + url = "hf://policyengine/policyengine-us-data/cps_2023.h5@1.73.0" assert resolve_dataset_reference("us", url) == url + def test__given_default_dataset__then_prefers_certified_data_artifact_uri(self): + manifest = get_release_manifest("us") + + assert manifest.certified_data_artifact is not None + assert manifest.default_dataset_uri == manifest.certified_data_artifact.uri + def test__given_versioned_dataset_url__then_logical_name_drops_version(self): - dataset = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.77.0" + dataset = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" assert dataset_logical_name(dataset) == "enhanced_cps_2024" @@ -82,16 +107,28 @@ def test__given_country__then_can_fetch_data_release_manifest(self): "schema_version": 1, "data_package": { "name": "policyengine-us-data", - "version": "1.77.0", + "version": "1.73.0", }, - "compatible_model_packages": [], + "build": { + "build_id": "policyengine-us-data-1.73.0", + "built_at": "2026-04-10T12:00:00Z", + "built_with_model_package": { + "name": "policyengine-us", + "version": "1.602.0", + "git_sha": "deadbeef", + "data_build_fingerprint": "sha256:fingerprint", + } + }, + "compatible_model_packages": [ + {"name": "policyengine-us", "specifier": "==1.602.0"} + ], "default_datasets": {"national": "enhanced_cps_2024"}, "artifacts": { 
"enhanced_cps_2024": { "kind": "microdata", "path": "enhanced_cps_2024.h5", "repo_id": "policyengine/policyengine-us-data", - "revision": "1.77.0", + "revision": "1.73.0", "sha256": "abc", "size_bytes": 123, } @@ -107,8 +144,137 @@ def test__given_country__then_can_fetch_data_release_manifest(self): assert manifest.schema_version == 1 assert manifest.data_package.name == "policyengine-us-data" assert manifest.default_datasets["national"] == "enhanced_cps_2024" + assert manifest.build is not None + assert manifest.build.build_id == "policyengine-us-data-1.73.0" + assert manifest.build.built_at == "2026-04-10T12:00:00Z" + assert manifest.build.built_with_model_package is not None + assert manifest.build.built_with_model_package.version == "1.602.0" assert ( manifest.artifacts["enhanced_cps_2024"].uri - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.77.0" + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" ) assert mock_get.call_count == 1 + + def test__given_matching_fingerprint__then_certification_allows_reuse(self): + get_data_release_manifest.cache_clear() + payload = { + "schema_version": 1, + "data_package": { + "name": "policyengine-us-data", + "version": "1.73.0", + }, + "build": { + "build_id": "policyengine-us-data-1.73.0", + "built_with_model_package": { + "name": "policyengine-us", + "version": "1.601.0", + "git_sha": "deadbeef", + "data_build_fingerprint": "sha256:match", + } + }, + "compatible_model_packages": [], + "default_datasets": {"national": "enhanced_cps_2024"}, + "artifacts": {}, + } + + with patch( + "policyengine.core.release_manifest.requests.get", + return_value=_response_with_json(payload), + ): + certification = certify_data_release_compatibility( + "us", + runtime_model_version="1.602.0", + runtime_data_build_fingerprint="sha256:match", + ) + + assert certification.compatibility_basis == "matching_data_build_fingerprint" + assert certification.data_build_id == "policyengine-us-data-1.73.0" + assert 
certification.built_with_model_version == "1.601.0" + assert certification.certified_for_model_version == "1.602.0" + + def test__given_mismatched_version_and_fingerprint__then_certification_fails(self): + get_data_release_manifest.cache_clear() + payload = { + "schema_version": 1, + "data_package": { + "name": "policyengine-us-data", + "version": "1.73.0", + }, + "build": { + "build_id": "policyengine-us-data-1.73.0", + "built_with_model_package": { + "name": "policyengine-us", + "version": "1.601.0", + "git_sha": "deadbeef", + "data_build_fingerprint": "sha256:build", + } + }, + "compatible_model_packages": [], + "default_datasets": {"national": "enhanced_cps_2024"}, + "artifacts": {}, + } + + with patch( + "policyengine.core.release_manifest.requests.get", + return_value=_response_with_json(payload), + ): + try: + certify_data_release_compatibility( + "us", + runtime_model_version="1.602.0", + runtime_data_build_fingerprint="sha256:runtime", + ) + except ValueError as error: + assert "not certified" in str(error) + else: + raise AssertionError("Expected certification to fail") + + def test__given_manifest_certification__then_release_bundle_exposes_it(self): + manifest = get_release_manifest("uk") + model_version = TaxBenefitModelVersion( + model=TaxBenefitModel(id="uk"), + version=manifest.model_package.version, + release_manifest=manifest, + model_package=manifest.model_package, + data_package=manifest.data_package, + default_dataset_uri=manifest.default_dataset_uri, + ) + + bundle = model_version.release_bundle + + assert bundle["bundle_id"] == "uk-3.4.0" + assert bundle["default_dataset"] == "enhanced_frs_2023_24" + assert bundle["default_dataset_uri"] == manifest.default_dataset_uri + assert bundle["certified_data_build_id"] == "policyengine-uk-data-1.40.4" + assert bundle["data_build_model_version"] == "2.74.0" + assert bundle["compatibility_basis"] == "exact_build_model_version" + assert bundle["certified_by"] == "policyengine.py bundled manifest" + + def 
test__given_runtime_certification__then_release_bundle_prefers_runtime_value(self): + manifest = get_release_manifest("us") + model_version = TaxBenefitModelVersion( + model=TaxBenefitModel(id="us"), + version=manifest.model_package.version, + release_manifest=manifest, + model_package=manifest.model_package, + data_package=manifest.data_package, + default_dataset_uri=manifest.default_dataset_uri, + data_certification={ + "compatibility_basis": "matching_data_build_fingerprint", + "certified_for_model_version": "1.603.0", + "data_build_id": "policyengine-us-data-1.73.0", + "built_with_model_version": "1.602.0", + "built_with_model_git_sha": "deadbeef", + "data_build_fingerprint": "sha256:match", + "certified_by": "runtime certification", + }, + ) + + bundle = model_version.release_bundle + + assert bundle["certified_data_build_id"] == "policyengine-us-data-1.73.0" + assert bundle["data_build_model_version"] == "1.602.0" + assert bundle["data_build_model_git_sha"] == "deadbeef" + assert bundle["data_build_fingerprint"] == "sha256:match" + assert bundle["compatibility_basis"] == "matching_data_build_fingerprint" + assert bundle["certified_by"] == "runtime certification" From 57be8b9e0e8231e9b3a14e64c407f86c7f2ba825 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 09:15:32 -0400 Subject: [PATCH 2/6] Add changelog fragment for certified bundles --- changelog.d/certified-bundle-manifest.changed.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/certified-bundle-manifest.changed.md diff --git a/changelog.d/certified-bundle-manifest.changed.md b/changelog.d/certified-bundle-manifest.changed.md new file mode 100644 index 00000000..449f9ed2 --- /dev/null +++ b/changelog.d/certified-bundle-manifest.changed.md @@ -0,0 +1 @@ +Add certified bundle metadata that records runtime model pins alongside build-time data artifact provenance and compatibility fingerprints. 
From dfd2527e87367e6c5e6b66b3781616988a9fcdfc Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 09:18:12 -0400 Subject: [PATCH 3/6] Format certified bundle changes --- src/policyengine/core/release_manifest.py | 4 +--- .../core/tax_benefit_model_version.py | 12 +++------- tests/test_release_manifests.py | 22 +++++++++++++------ 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/policyengine/core/release_manifest.py b/src/policyengine/core/release_manifest.py index 63ecb5b1..147a669d 100644 --- a/src/policyengine/core/release_manifest.py +++ b/src/policyengine/core/release_manifest.py @@ -245,9 +245,7 @@ def certify_data_release_compatibility( else None ), built_with_model_version=( - built_with_model.version - if built_with_model is not None - else None + built_with_model.version if built_with_model is not None else None ), built_with_model_git_sha=( built_with_model.git_sha if built_with_model is not None else None diff --git a/src/policyengine/core/tax_benefit_model_version.py b/src/policyengine/core/tax_benefit_model_version.py index eb8cfd5e..f253fc5c 100644 --- a/src/policyengine/core/tax_benefit_model_version.py +++ b/src/policyengine/core/tax_benefit_model_version.py @@ -23,9 +23,7 @@ class TaxBenefitModelVersion(BaseModel): model: TaxBenefitModel version: str description: str | None = None - created_at: datetime | None = Field( - default_factory=lambda: datetime.now(UTC) - ) + created_at: datetime | None = Field(default_factory=lambda: datetime.now(UTC)) variables: list["Variable"] = Field(default_factory=list) parameters: list["Parameter"] = Field(default_factory=list) @@ -196,14 +194,10 @@ def release_bundle(self) -> dict[str, str | None]: else None ), "compatibility_basis": ( - certification.compatibility_basis - if certification is not None - else None + certification.compatibility_basis if certification is not None else None ), "certified_by": ( - certification.certified_by - if certification is not None - else None + 
certification.certified_by if certification is not None else None ), } diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index eeda4b39..9dc2286f 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -42,7 +42,9 @@ def test__given_us_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.data_package.version == "1.73.0" assert manifest.data_package.repo_id == "policyengine/policyengine-us-data" assert manifest.certified_data_artifact is not None - assert manifest.certified_data_artifact.build_id == "policyengine-us-data-1.73.0" + assert ( + manifest.certified_data_artifact.build_id == "policyengine-us-data-1.73.0" + ) assert manifest.certified_data_artifact.dataset == "enhanced_cps_2024" assert manifest.certification is not None assert manifest.certification.data_build_id == "policyengine-us-data-1.73.0" @@ -60,9 +62,13 @@ def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.model_package.version == "2.74.0" assert manifest.data_package.name == "policyengine-uk-data" assert manifest.data_package.version == "1.40.4" - assert manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private" + assert ( + manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private" + ) assert manifest.certified_data_artifact is not None - assert manifest.certified_data_artifact.build_id == "policyengine-uk-data-1.40.4" + assert ( + manifest.certified_data_artifact.build_id == "policyengine-uk-data-1.40.4" + ) assert manifest.certified_data_artifact.dataset == "enhanced_frs_2023_24" assert manifest.certification is not None assert manifest.certification.data_build_id == "policyengine-uk-data-1.40.4" @@ -117,7 +123,7 @@ def test__given_country__then_can_fetch_data_release_manifest(self): "version": "1.602.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:fingerprint", - } + }, }, "compatible_model_packages": [ {"name": 
"policyengine-us", "specifier": "==1.602.0"} @@ -170,7 +176,7 @@ def test__given_matching_fingerprint__then_certification_allows_reuse(self): "version": "1.601.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:match", - } + }, }, "compatible_model_packages": [], "default_datasets": {"national": "enhanced_cps_2024"}, @@ -207,7 +213,7 @@ def test__given_mismatched_version_and_fingerprint__then_certification_fails(sel "version": "1.601.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:build", - } + }, }, "compatible_model_packages": [], "default_datasets": {"national": "enhanced_cps_2024"}, @@ -250,7 +256,9 @@ def test__given_manifest_certification__then_release_bundle_exposes_it(self): assert bundle["compatibility_basis"] == "exact_build_model_version" assert bundle["certified_by"] == "policyengine.py bundled manifest" - def test__given_runtime_certification__then_release_bundle_prefers_runtime_value(self): + def test__given_runtime_certification__then_release_bundle_prefers_runtime_value( + self, + ): manifest = get_release_manifest("us") model_version = TaxBenefitModelVersion( model=TaxBenefitModel(id="us"), From 4ec7c2e169e021326bb35ed186aa66aa11d25d58 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 09:21:35 -0400 Subject: [PATCH 4/6] Fallback when country build metadata is unavailable --- src/policyengine/core/release_manifest.py | 23 +++++++++ .../tax_benefit_models/uk/model.py | 5 +- .../tax_benefit_models/us/model.py | 5 +- tests/test_release_manifests.py | 48 +++++++++++++++++++ 4 files changed, 75 insertions(+), 6 deletions(-) diff --git a/src/policyengine/core/release_manifest.py b/src/policyengine/core/release_manifest.py index 147a669d..ad956d57 100644 --- a/src/policyengine/core/release_manifest.py +++ b/src/policyengine/core/release_manifest.py @@ -1,5 +1,6 @@ import os from functools import lru_cache +from importlib import import_module, metadata from importlib.resources import files from pathlib import Path @@ 
-121,6 +122,28 @@ def build_hf_uri(repo_id: str, path_in_repo: str, revision: str) -> str: return f"hf://{repo_id}/{path_in_repo}@{revision}" +def get_runtime_model_build_metadata(package_name: str) -> dict[str, str | None]: + installed_version = metadata.version(package_name) + module_name = package_name.replace("-", "_") + + try: + build_metadata_module = import_module(f"{module_name}.build_metadata") + except ModuleNotFoundError: + return { + "name": package_name, + "version": installed_version, + "git_sha": None, + "data_build_fingerprint": None, + } + + build_metadata = build_metadata_module.get_data_build_metadata() + build_metadata.setdefault("name", package_name) + build_metadata.setdefault("version", installed_version) + build_metadata.setdefault("git_sha", None) + build_metadata.setdefault("data_build_fingerprint", None) + return build_metadata + + @lru_cache def get_release_manifest(country_id: str) -> CountryReleaseManifest: manifest_path = files("policyengine").joinpath( diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py index f0467a3f..f0962541 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ b/src/policyengine/tax_benefit_models/uk/model.py @@ -16,6 +16,7 @@ from policyengine.core.release_manifest import ( certify_data_release_compatibility, get_release_manifest, + get_runtime_model_build_metadata, ) from policyengine.utils.entity_utils import ( build_entity_relationships, @@ -138,9 +139,7 @@ def __init__(self, **kwargs: dict): f"{manifest.model_package.version}, got {installed_model_version}." 
) - from policyengine_uk.build_metadata import get_data_build_metadata - - model_build_metadata = get_data_build_metadata() + model_build_metadata = get_runtime_model_build_metadata("policyengine-uk") data_certification = certify_data_release_compatibility( "uk", runtime_model_version=installed_model_version, diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index 05ece1e3..8ad89929 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -16,6 +16,7 @@ from policyengine.core.release_manifest import ( certify_data_release_compatibility, get_release_manifest, + get_runtime_model_build_metadata, ) from policyengine.utils.entity_utils import ( build_entity_relationships, @@ -130,9 +131,7 @@ def __init__(self, **kwargs: dict): f"{manifest.model_package.version}, got {installed_model_version}." ) - from policyengine_us.build_metadata import get_data_build_metadata - - model_build_metadata = get_data_build_metadata() + model_build_metadata = get_runtime_model_build_metadata("policyengine-us") data_certification = certify_data_release_compatibility( "us", runtime_model_version=installed_model_version, diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index 9dc2286f..2a20f420 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -8,6 +8,7 @@ dataset_logical_name, get_data_release_manifest, get_release_manifest, + get_runtime_model_build_metadata, resolve_dataset_reference, ) from policyengine.core.tax_benefit_model import TaxBenefitModel @@ -161,6 +162,53 @@ def test__given_country__then_can_fetch_data_release_manifest(self): ) assert mock_get.call_count == 1 + def test__given_missing_build_metadata_module__then_runtime_metadata_falls_back( + self, + ): + with ( + patch( + "policyengine.core.release_manifest.metadata.version", + return_value="2.74.0", + ), + patch( + 
"policyengine.core.release_manifest.import_module", + side_effect=ModuleNotFoundError, + ), + ): + build_metadata = get_runtime_model_build_metadata("policyengine-uk") + + assert build_metadata == { + "name": "policyengine-uk", + "version": "2.74.0", + "git_sha": None, + "data_build_fingerprint": None, + } + + def test__given_build_metadata_module__then_runtime_metadata_uses_it(self): + module = MagicMock() + module.get_data_build_metadata.return_value = { + "name": "policyengine-us", + "version": "1.602.0", + "git_sha": "deadbeef", + "data_build_fingerprint": "sha256:build", + } + + with ( + patch( + "policyengine.core.release_manifest.metadata.version", + return_value="1.602.0", + ), + patch( + "policyengine.core.release_manifest.import_module", + return_value=module, + ), + ): + build_metadata = get_runtime_model_build_metadata("policyengine-us") + + assert build_metadata["version"] == "1.602.0" + assert build_metadata["git_sha"] == "deadbeef" + assert build_metadata["data_build_fingerprint"] == "sha256:build" + def test__given_matching_fingerprint__then_certification_allows_reuse(self): get_data_release_manifest.cache_clear() payload = { From f63df682ba54dff51ff4816fc463939bef8ae7ee Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 10:22:06 -0400 Subject: [PATCH 5/6] Fall back to bundled data certification --- src/policyengine/core/release_manifest.py | 42 ++++++++++++++++++- .../tax_benefit_models/uk/model.py | 5 ++- .../tax_benefit_models/us/model.py | 5 ++- tests/test_release_manifests.py | 42 +++++++++++++++++++ 4 files changed, 89 insertions(+), 5 deletions(-) diff --git a/src/policyengine/core/release_manifest.py b/src/policyengine/core/release_manifest.py index ad956d57..05995d74 100644 --- a/src/policyengine/core/release_manifest.py +++ b/src/policyengine/core/release_manifest.py @@ -10,6 +10,10 @@ HF_REQUEST_TIMEOUT_SECONDS = 30 +class DataReleaseManifestUnavailable(ValueError): + pass + + class PackageVersion(BaseModel): name: str 
version: str @@ -179,10 +183,15 @@ def get_data_release_manifest(country_id: str) -> DataReleaseManifest: timeout=HF_REQUEST_TIMEOUT_SECONDS, ) if response.status_code in (401, 403): - raise ValueError( + raise DataReleaseManifestUnavailable( "Could not fetch the data release manifest from Hugging Face. " "If this country uses a private data repo, set HUGGING_FACE_TOKEN." ) + if response.status_code == 404: + raise DataReleaseManifestUnavailable( + "Could not find the data release manifest on Hugging Face for " + f"{data_package.repo_id}@{data_package.version}." + ) response.raise_for_status() return DataReleaseManifest.model_validate_json(response.text) @@ -286,6 +295,37 @@ def certify_data_release_compatibility( ) +def resolve_runtime_data_certification( + country_id: str, + runtime_model_version: str, + runtime_data_build_fingerprint: str | None = None, + bundled_certification: DataCertification | None = None, +) -> DataCertification: + try: + return certify_data_release_compatibility( + country_id=country_id, + runtime_model_version=runtime_model_version, + runtime_data_build_fingerprint=runtime_data_build_fingerprint, + ) + except DataReleaseManifestUnavailable: + if ( + bundled_certification is not None + and bundled_certification.certified_for_model_version + == runtime_model_version + ): + bundled_fingerprint = bundled_certification.data_build_fingerprint + if ( + bundled_certification.compatibility_basis + == "matching_data_build_fingerprint" + and bundled_fingerprint is not None + and runtime_data_build_fingerprint is not None + and bundled_fingerprint != runtime_data_build_fingerprint + ): + raise + return bundled_certification + raise + + def resolve_dataset_reference(country_id: str, dataset: str) -> str: if "://" in dataset: return dataset diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py index f0962541..3a8cf2b2 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ 
b/src/policyengine/tax_benefit_models/uk/model.py @@ -14,9 +14,9 @@ Variable, ) from policyengine.core.release_manifest import ( - certify_data_release_compatibility, get_release_manifest, get_runtime_model_build_metadata, + resolve_runtime_data_certification, ) from policyengine.utils.entity_utils import ( build_entity_relationships, @@ -140,12 +140,13 @@ def __init__(self, **kwargs: dict): ) model_build_metadata = get_runtime_model_build_metadata("policyengine-uk") - data_certification = certify_data_release_compatibility( + data_certification = resolve_runtime_data_certification( "uk", runtime_model_version=installed_model_version, runtime_data_build_fingerprint=model_build_metadata.get( "data_build_fingerprint" ), + bundled_certification=manifest.certification, ) super().__init__(**kwargs) diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index 8ad89929..804eab68 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -14,9 +14,9 @@ Variable, ) from policyengine.core.release_manifest import ( - certify_data_release_compatibility, get_release_manifest, get_runtime_model_build_metadata, + resolve_runtime_data_certification, ) from policyengine.utils.entity_utils import ( build_entity_relationships, @@ -132,12 +132,13 @@ def __init__(self, **kwargs: dict): ) model_build_metadata = get_runtime_model_build_metadata("policyengine-us") - data_certification = certify_data_release_compatibility( + data_certification = resolve_runtime_data_certification( "us", runtime_model_version=installed_model_version, runtime_data_build_fingerprint=model_build_metadata.get( "data_build_fingerprint" ), + bundled_certification=manifest.certification, ) super().__init__(**kwargs) diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index 2a20f420..790cf40d 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py 
@@ -4,12 +4,14 @@ from unittest.mock import MagicMock, patch from policyengine.core.release_manifest import ( + DataReleaseManifestUnavailable, certify_data_release_compatibility, dataset_logical_name, get_data_release_manifest, get_release_manifest, get_runtime_model_build_metadata, resolve_dataset_reference, + resolve_runtime_data_certification, ) from policyengine.core.tax_benefit_model import TaxBenefitModel from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion @@ -283,6 +285,46 @@ def test__given_mismatched_version_and_fingerprint__then_certification_fails(sel else: raise AssertionError("Expected certification to fail") + def test__given_missing_release_manifest__then_runtime_uses_bundled_certification( + self, + ): + bundled_certification = get_release_manifest("uk").certification + assert bundled_certification is not None + + with patch( + "policyengine.core.release_manifest.get_data_release_manifest", + side_effect=DataReleaseManifestUnavailable("missing"), + ): + certification = resolve_runtime_data_certification( + "uk", + runtime_model_version="2.74.0", + bundled_certification=bundled_certification, + ) + + assert certification.compatibility_basis == "exact_build_model_version" + assert certification.certified_for_model_version == "2.74.0" + + def test__given_missing_release_manifest_and_wrong_runtime__then_runtime_fails( + self, + ): + bundled_certification = get_release_manifest("uk").certification + assert bundled_certification is not None + + with patch( + "policyengine.core.release_manifest.get_data_release_manifest", + side_effect=DataReleaseManifestUnavailable("missing"), + ): + try: + resolve_runtime_data_certification( + "uk", + runtime_model_version="2.75.0", + bundled_certification=bundled_certification, + ) + except DataReleaseManifestUnavailable: + pass + else: + raise AssertionError("Expected runtime certification fallback to fail") + def test__given_manifest_certification__then_release_bundle_exposes_it(self): 
manifest = get_release_manifest("uk") model_version = TaxBenefitModelVersion( From b397c1b09f49605a7a2229ab605c2854242bdba3 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 15:45:49 -0400 Subject: [PATCH 6/6] Align certified bundle tests and fallback handling --- src/policyengine/core/release_manifest.py | 2 +- tests/test_models.py | 10 +++++----- tests/test_release_manifests.py | 20 ++++++++++++++++++++ tests/test_uk_regions.py | 2 +- tests/test_us_regions.py | 6 +++--- 5 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/policyengine/core/release_manifest.py b/src/policyengine/core/release_manifest.py index 05995d74..cc2cc6d9 100644 --- a/src/policyengine/core/release_manifest.py +++ b/src/policyengine/core/release_manifest.py @@ -132,7 +132,7 @@ def get_runtime_model_build_metadata(package_name: str) -> dict[str, str | None] try: build_metadata_module = import_module(f"{module_name}.build_metadata") - except ModuleNotFoundError: + except Exception: return { "name": package_name, "version": installed_version, diff --git a/tests/test_models.py b/tests/test_models.py index 0f0767bc..146e8532 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -29,12 +29,12 @@ def test_has_release_manifest_metadata(self): assert uk_latest.release_manifest is not None assert uk_latest.release_manifest.country_id == "uk" assert uk_latest.model_package.name == "policyengine-uk" - assert uk_latest.model_package.version == "2.78.0" + assert uk_latest.model_package.version == "2.74.0" assert uk_latest.data_package.name == "policyengine-uk-data" - assert uk_latest.data_package.version == "1.40.3" + assert uk_latest.data_package.version == "1.40.4" assert ( uk_latest.default_dataset_uri - == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.3" + == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" ) def test_has_hundreds_of_parameters(self): @@ -115,10 +115,10 @@ def 
test_has_release_manifest_metadata(self): assert us_latest.model_package.name == "policyengine-us" assert us_latest.model_package.version == "1.602.0" assert us_latest.data_package.name == "policyengine-us-data" - assert us_latest.data_package.version == "1.77.0" + assert us_latest.data_package.version == "1.73.0" assert ( us_latest.default_dataset_uri - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.77.0" + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" ) def test_has_hundreds_of_parameters(self): diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index 790cf40d..4a53fdb0 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -186,6 +186,26 @@ def test__given_missing_build_metadata_module__then_runtime_metadata_falls_back( "data_build_fingerprint": None, } + def test__given_broken_package_import__then_runtime_metadata_falls_back(self): + with ( + patch( + "policyengine.core.release_manifest.metadata.version", + return_value="1.602.0", + ), + patch( + "policyengine.core.release_manifest.import_module", + side_effect=ValueError("broken package init"), + ), + ): + build_metadata = get_runtime_model_build_metadata("policyengine-us") + + assert build_metadata == { + "name": "policyengine-us", + "version": "1.602.0", + "git_sha": None, + "data_build_fingerprint": None, + } + def test__given_build_metadata_module__then_runtime_metadata_uses_it(self): module = MagicMock() module.get_data_build_metadata.return_value = { diff --git a/tests/test_uk_regions.py b/tests/test_uk_regions.py index 02727596..57a55992 100644 --- a/tests/test_uk_regions.py +++ b/tests/test_uk_regions.py @@ -68,7 +68,7 @@ def test__given_uk_registry__then_has_national_region(self): assert national.region_type == "national" assert ( national.dataset_path - == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.3" + == 
"hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" ) assert not national.requires_filter diff --git a/tests/test_us_regions.py b/tests/test_us_regions.py index 4bb8f039..079ce1c5 100644 --- a/tests/test_us_regions.py +++ b/tests/test_us_regions.py @@ -105,7 +105,7 @@ def test__given_us_registry__then_has_national_region(self): assert national.region_type == "national" assert ( national.dataset_path - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.77.0" + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" ) def test__given_us_registry__then_has_51_states(self): @@ -134,7 +134,7 @@ def test__given_california_region__then_has_correct_format(self): assert ca.parent_code == "us" assert ( ca.dataset_path - == "hf://policyengine/policyengine-us-data/states/CA.h5@1.77.0" + == "hf://policyengine/policyengine-us-data/states/CA.h5@1.73.0" ) assert ca.state_code == "CA" assert ca.state_name == "California" @@ -167,7 +167,7 @@ def test__given_ca_first_district__then_has_correct_format(self): assert ca01.parent_code == "state/ca" assert ( ca01.dataset_path - == "hf://policyengine/policyengine-us-data/districts/CA-01.h5@1.77.0" + == "hf://policyengine/policyengine-us-data/districts/CA-01.h5@1.73.0" ) assert ca01.state_code == "CA" assert not ca01.requires_filter