From ffe0b19f261f965459e741b3c57c2b7b9b9dad98 Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 17:57:06 -0700 Subject: [PATCH 01/28] feat(_fasta_cache): add FastaCache models and fingerprint Skeleton module with Pydantic models (Fingerprint, SourceHints, FastaCache), module-level constants (FORMAT_VERSION, FINGERPRINT_WINDOW, suffix/filename literals), and the fingerprint() function (blake2b, 1 MiB window). Co-Authored-By: Claude Sonnet 4.6 --- python/genvarloader/_fasta_cache.py | 51 +++++++++++++++++++++++++++++ tests/unit/test_fasta_cache.py | 36 ++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 python/genvarloader/_fasta_cache.py create mode 100644 tests/unit/test_fasta_cache.py diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py new file mode 100644 index 0000000..8d9ec05 --- /dev/null +++ b/python/genvarloader/_fasta_cache.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from hashlib import blake2b +from pathlib import Path +from typing import Literal + +from pydantic import BaseModel +from pydantic_extra_types.semantic_version import SemanticVersion + +__all__ = ["FastaCache"] + +FORMAT_VERSION = SemanticVersion.parse("1.0.0") +"""On-disk schema version for the .gvlfa cache.""" +FINGERPRINT_WINDOW = 1 << 20 # 1 MiB +GVLFA_SUFFIX = ".gvlfa" +LEGACY_SUFFIX = ".gvl" +DATA_FILENAME = "sequence.bin" +METADATA_FILENAME = "metadata.json" + + +class SourceHints(BaseModel): + filename: str + absolute_path: str + relative_path: str + + +class Fingerprint(BaseModel): + algorithm: Literal["blake2b"] = "blake2b" + n_bytes_hashed: int + digest: str + size_bytes: int + + +class FastaCache(BaseModel): + format_version: SemanticVersion + genvarloader_version: SemanticVersion + contigs: dict[str, int] + source: SourceHints + fingerprint: Fingerprint + + +def fingerprint(path: str | Path) -> Fingerprint: + """Cheap content fingerprint: blake2b of the first FINGERPRINT_WINDOW bytes, + plus the total file size (catches changes past the hashed window).""" + path = Path(path) + size = path.stat().st_size + n = min(FINGERPRINT_WINDOW, size) + h = blake2b() + with open(path, "rb") as f: + h.update(f.read(n)) + return Fingerprint(n_bytes_hashed=n, digest=h.hexdigest(), size_bytes=size) diff --git a/tests/unit/test_fasta_cache.py b/tests/unit/test_fasta_cache.py new file mode 100644 index 0000000..6da8948 --- /dev/null +++ b/tests/unit/test_fasta_cache.py @@ -0,0 +1,36 @@ +from pathlib import Path + +import genvarloader._fasta_cache as fc + + +def _write(path: Path, data: bytes) -> Path: + path.write_bytes(data) + return path + + +def test_fingerprint_records_size_and_window(tmp_path): + src = _write(tmp_path / "a.bin", b"ACGT" * 10) + fp = fc.fingerprint(src) + assert fp.algorithm == "blake2b" + assert fp.size_bytes == 40 + assert fp.n_bytes_hashed == 40 + assert isinstance(fp.digest, str) and len(fp.digest) > 0 + + +def test_fingerprint_window_caps_at_1mib(tmp_path): + src = _write(tmp_path / "big.bin", b"N" * (fc.FINGERPRINT_WINDOW + 100)) + fp = fc.fingerprint(src) + assert fp.size_bytes == fc.FINGERPRINT_WINDOW + 100 + assert fp.n_bytes_hashed == fc.FINGERPRINT_WINDOW + + +def test_fingerprint_identical_bytes_match(tmp_path): + a = _write(tmp_path / "a.bin", b"ACGTACGT") + b = _write(tmp_path / "b.bin", b"ACGTACGT") + assert fc.fingerprint(a).digest == fc.fingerprint(b).digest + + +def test_fingerprint_flipped_first_byte_differs(tmp_path): + a = _write(tmp_path / "a.bin", b"ACGTACGT") + b = _write(tmp_path / "b.bin", b"TCGTACGT") + assert fc.fingerprint(a).digest != fc.fingerprint(b).digest From a920b0a29b30000d880ef343a30bf2a87967dedd Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:00:26 -0700 Subject: [PATCH 02/28] feat(_fasta_cache): add source hints and three-way resolution --- python/genvarloader/_fasta_cache.py | 29 +++++++++++++++++++++ tests/unit/test_fasta_cache.py | 39 +++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index 8d9ec05..107c71a 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from hashlib import blake2b from pathlib import Path from typing import Literal @@ -39,6 +40,34 @@ class FastaCache(BaseModel): fingerprint: Fingerprint +def _source_hints(source_fa: str | Path, gvlfa_dir: str | Path) -> SourceHints: + source = Path(source_fa).resolve() + dest = Path(gvlfa_dir).resolve() + try: + relative = os.path.relpath(source, dest) + except ValueError: # e.g. different drives on Windows + relative = str(source) + return SourceHints( + filename=source.name, + absolute_path=str(source), + relative_path=str(relative), + ) + + +def resolve_source(gvlfa_dir: str | Path, meta: FastaCache) -> Path | None: + """Locate the source FASTA: sibling, then relative, then absolute. First hit wins.""" + gvlfa_dir = Path(gvlfa_dir) + candidates = [ + gvlfa_dir.parent / meta.source.filename, + gvlfa_dir / meta.source.relative_path, + Path(meta.source.absolute_path), + ] + for c in candidates: + if c.exists(): + return c.resolve() + return None + + def fingerprint(path: str | Path) -> Fingerprint: """Cheap content fingerprint: blake2b of the first FINGERPRINT_WINDOW bytes, plus the total file size (catches changes past the hashed window).""" diff --git a/tests/unit/test_fasta_cache.py b/tests/unit/test_fasta_cache.py index 6da8948..ac2935f 100644 --- a/tests/unit/test_fasta_cache.py +++ b/tests/unit/test_fasta_cache.py @@ -34,3 +34,42 @@ def test_fingerprint_flipped_first_byte_differs(tmp_path): a = _write(tmp_path / "a.bin", b"ACGTACGT") b = _write(tmp_path / "b.bin", b"TCGTACGT") assert fc.fingerprint(a).digest != fc.fingerprint(b).digest + + +def _make_meta(tmp_path, source_fa, gvlfa_dir) -> fc.FastaCache: + return fc.FastaCache( + format_version=fc.FORMAT_VERSION, + genvarloader_version=fc.FORMAT_VERSION, + contigs={"chr1": 4}, + source=fc._source_hints(source_fa, gvlfa_dir), + fingerprint=fc.fingerprint(source_fa), + ) + + +def test_resolve_source_sibling(tmp_path): + src = _write(tmp_path / "ref.fa", b"ACGT") + gvlfa = tmp_path / "ref.fa.gvlfa" + gvlfa.mkdir() + meta = _make_meta(tmp_path, src, gvlfa) + assert fc.resolve_source(gvlfa, meta) == src.resolve() + + +def test_resolve_source_absolute_when_moved(tmp_path): + # Source elsewhere; cache dir has no sibling and a non-resolving relative. + src_dir = tmp_path / "elsewhere" + src_dir.mkdir() + src = _write(src_dir / "ref.fa", b"ACGT") + gvlfa = tmp_path / "cache" / "ref.fa.gvlfa" + gvlfa.mkdir(parents=True) + meta = _make_meta(tmp_path, src, gvlfa) + # No sibling ref.fa next to the cache dir, but absolute_path still points home. + assert fc.resolve_source(gvlfa, meta) == src.resolve() + + +def test_resolve_source_missing_returns_none(tmp_path): + src = _write(tmp_path / "ref.fa", b"ACGT") + gvlfa = tmp_path / "ref.fa.gvlfa" + gvlfa.mkdir() + meta = _make_meta(tmp_path, src, gvlfa) + src.unlink() + assert fc.resolve_source(gvlfa, meta) is None From ce9a501c0aab84189d475eecef897792b55a2aa2 Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:05:20 -0700 Subject: [PATCH 03/28] feat(_fasta_cache): add build, load, and validity guards Implements build() (writes sequence.bin + metadata.json from a source FASTA), load() (reads metadata and classifies cache as fresh/stale/unvalidated), and helpers _data_size_ok/_fingerprints_match/_check_format_version. 13 tests pass. Co-Authored-By: Claude Sonnet 4.6 --- python/genvarloader/_fasta_cache.py | 82 +++++++++++++++++++++++++++++ tests/unit/test_fasta_cache.py | 81 ++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index 107c71a..fa91aea 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -2,11 +2,15 @@ import os from hashlib import blake2b +from importlib.metadata import version from pathlib import Path from typing import Literal +import numpy as np +import pysam from pydantic import BaseModel from pydantic_extra_types.semantic_version import SemanticVersion +from tqdm.auto import tqdm __all__ = ["FastaCache"] @@ -78,3 +82,81 @@ def fingerprint(path: str | Path) -> Fingerprint: with open(path, "rb") as f: h.update(f.read(n)) return Fingerprint(n_bytes_hashed=n, digest=h.hexdigest(), size_bytes=size) + + +def _gvl_version() -> SemanticVersion: + return SemanticVersion.parse(version("genvarloader")) + + +def _contig_lengths(source_fa: str | Path) -> dict[str, int]: + with pysam.FastaFile(str(source_fa)) as f: + return {c: f.get_reference_length(c) for c in f.references} + + +def _write_sequence(source_fa: Path, gvlfa_dir: Path, contigs: dict[str, int]) -> None: + total = sum(contigs.values()) + data = np.memmap(gvlfa_dir / DATA_FILENAME, dtype=np.uint8, mode="w+", shape=total) + offset = 0 + with pysam.FastaFile(str(source_fa)) as f: + for c in tqdm(contigs, total=total, unit=" nucleotide"): + c_seq = np.frombuffer(f.fetch(c).encode("ascii").upper(), "S1") + data[offset : offset + len(c_seq)] = c_seq.view(np.uint8) + offset += len(c_seq) + data.flush() + + +def build(source_fa: str | Path, gvlfa_dir: str | Path) -> FastaCache: + """Build a fresh .gvlfa cache containing all contigs of the source FASTA.""" + source_fa = Path(source_fa) + gvlfa_dir = Path(gvlfa_dir) + gvlfa_dir.mkdir(parents=True, exist_ok=True) + contigs = _contig_lengths(source_fa) + _write_sequence(source_fa, gvlfa_dir, contigs) + meta = FastaCache( + format_version=FORMAT_VERSION, + genvarloader_version=_gvl_version(), + contigs=contigs, + source=_source_hints(source_fa, gvlfa_dir), + fingerprint=fingerprint(source_fa), + ) + (gvlfa_dir / METADATA_FILENAME).write_text(meta.model_dump_json()) + return meta + + +def _check_format_version(meta: FastaCache, gvlfa_dir: Path) -> None: + if meta.format_version.major > FORMAT_VERSION.major: + raise ValueError( + f"FASTA cache at {gvlfa_dir} has format version {meta.format_version}, " + f"newer than supported {FORMAT_VERSION}. Upgrade genvarloader." + ) + + +def _data_size_ok(gvlfa_dir: Path, meta: FastaCache) -> bool: + data_path = Path(gvlfa_dir) / DATA_FILENAME + if not data_path.exists(): + return False + return data_path.stat().st_size == sum(meta.contigs.values()) + + +def _fingerprints_match(stored: Fingerprint, source_fa: Path) -> bool: + if stored.size_bytes != Path(source_fa).stat().st_size: + return False + return fingerprint(source_fa).digest == stored.digest + + +def load(gvlfa_dir: str | Path) -> tuple[FastaCache, Path | None, str]: + """Read cache metadata and classify it: 'fresh' | 'stale' | 'unvalidated'. + + Raises ValueError if the format version is too new to read. + """ + gvlfa_dir = Path(gvlfa_dir) + meta = FastaCache.model_validate_json((gvlfa_dir / METADATA_FILENAME).read_text()) + _check_format_version(meta, gvlfa_dir) + source = resolve_source(gvlfa_dir, meta) + if source is None: + status = "unvalidated" + elif _fingerprints_match(meta.fingerprint, source): + status = "fresh" + else: + status = "stale" + return meta, source, status diff --git a/tests/unit/test_fasta_cache.py b/tests/unit/test_fasta_cache.py index ac2935f..6b4f26e 100644 --- a/tests/unit/test_fasta_cache.py +++ b/tests/unit/test_fasta_cache.py @@ -1,5 +1,10 @@ +import shutil from pathlib import Path +import numpy as np +import pysam +import pytest + import genvarloader._fasta_cache as fc @@ -73,3 +78,79 @@ def test_resolve_source_missing_returns_none(tmp_path): meta = _make_meta(tmp_path, src, gvlfa) src.unlink() assert fc.resolve_source(gvlfa, meta) is None + + +@pytest.fixture +def local_fa(tmp_path, ref_fasta): + """Copy the shared bgzipped FASTA (and its .fai/.gzi if present) into tmp_path.""" + src = Path(ref_fasta) + dst = tmp_path / src.name + shutil.copy(src, dst) + for ext in (".fai", ".gzi"): + side = Path(str(src) + ext) + if side.exists(): + shutil.copy(side, tmp_path / side.name) + return dst + + +def test_build_then_load_round_trip(tmp_path, local_fa): + gvlfa = tmp_path / "out.gvlfa" + meta = fc.build(local_fa, gvlfa) + assert (gvlfa / fc.METADATA_FILENAME).exists() + assert (gvlfa / fc.DATA_FILENAME).exists() + + loaded, source, status = fc.load(gvlfa) + assert loaded.contigs == meta.contigs + assert source == local_fa.resolve() + assert status == "fresh" + + +def test_build_sequence_matches_pysam(tmp_path, local_fa): + gvlfa = tmp_path / "out.gvlfa" + meta = fc.build(local_fa, gvlfa) + data = np.memmap(gvlfa / fc.DATA_FILENAME, dtype="S1", mode="r") + offset = 0 + with pysam.FastaFile(str(local_fa)) as f: + for contig, length in meta.contigs.items(): + expected = np.frombuffer(f.fetch(contig).encode("ascii").upper(), "S1") + np.testing.assert_array_equal(data[offset : offset + length], expected) + offset += length + + +def test_load_stale_when_source_changes(tmp_path, local_fa): + gvlfa = tmp_path / "out.gvlfa" + fc.build(local_fa, gvlfa) + # Truncate the source so size_bytes (and thus the fingerprint) changes. + with open(local_fa, "r+b") as f: + f.truncate(10) + _, source, status = fc.load(gvlfa) + assert source == local_fa.resolve() + assert status == "stale" + + +def test_load_unvalidated_when_source_missing(tmp_path, local_fa): + gvlfa = tmp_path / "out.gvlfa" + fc.build(local_fa, gvlfa) + local_fa.unlink() + _, source, status = fc.load(gvlfa) + assert source is None + assert status == "unvalidated" + + +def test_load_rejects_newer_major_format(tmp_path, local_fa): + gvlfa = tmp_path / "out.gvlfa" + meta = fc.build(local_fa, gvlfa) + meta.format_version = type(meta.format_version)( + major=fc.FORMAT_VERSION.major + 1, minor=0, patch=0 + ) + (gvlfa / fc.METADATA_FILENAME).write_text(meta.model_dump_json()) + with pytest.raises(ValueError, match="newer than supported"): + fc.load(gvlfa) + + +def test_data_size_ok_detects_corruption(tmp_path, local_fa): + gvlfa = tmp_path / "out.gvlfa" + meta = fc.build(local_fa, gvlfa) + assert fc._data_size_ok(gvlfa, meta) is True + (gvlfa / fc.DATA_FILENAME).write_bytes(b"too short") + assert fc._data_size_ok(gvlfa, meta) is False From 1b997596e5cf7f1b1f523fa4926ea2b2d1967aa6 Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:08:53 -0700 Subject: [PATCH 04/28] refactor(_fasta_cache): fix progress bar advance and tighten load status type Update _write_sequence to use pbar.update(len(c_seq)) so the tqdm bar advances by nucleotides per contig instead of once per contig. Narrow load()'s return annotation from str to Literal["fresh","stale","unvalidated"]. Co-Authored-By: Claude Sonnet 4.6 --- python/genvarloader/_fasta_cache.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index fa91aea..1eb1a2b 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -98,10 +98,13 @@ def _write_sequence(source_fa: Path, gvlfa_dir: Path, contigs: dict[str, int]) - data = np.memmap(gvlfa_dir / DATA_FILENAME, dtype=np.uint8, mode="w+", shape=total) offset = 0 with pysam.FastaFile(str(source_fa)) as f: - for c in tqdm(contigs, total=total, unit=" nucleotide"): + pbar = tqdm(total=total, unit=" nucleotide") + for c in contigs: c_seq = np.frombuffer(f.fetch(c).encode("ascii").upper(), "S1") data[offset : offset + len(c_seq)] = c_seq.view(np.uint8) offset += len(c_seq) + pbar.update(len(c_seq)) + pbar.close() data.flush() @@ -144,7 +147,7 @@ def _fingerprints_match(stored: Fingerprint, source_fa: Path) -> bool: return fingerprint(source_fa).digest == stored.digest -def load(gvlfa_dir: str | Path) -> tuple[FastaCache, Path | None, str]: +def load(gvlfa_dir: str | Path) -> tuple[FastaCache, Path | None, Literal["fresh", "stale", "unvalidated"]]: """Read cache metadata and classify it: 'fresh' | 'stale' | 'unvalidated'. Raises ValueError if the format version is too new to read. From 1e2215a543e24cb3c651f8b1e6c5ee95cc15e4c1 Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:10:25 -0700 Subject: [PATCH 05/28] feat(_fasta_cache): migrate legacy .gvl caches by reusing bytes --- python/genvarloader/_fasta_cache.py | 27 +++++++++++++++++++++++++++ tests/unit/test_fasta_cache.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index 1eb1a2b..5f1fc2c 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import shutil from hashlib import blake2b from importlib.metadata import version from pathlib import Path @@ -147,6 +148,32 @@ def _fingerprints_match(stored: Fingerprint, source_fa: Path) -> bool: return fingerprint(source_fa).digest == stored.digest +def migrate_legacy( + source_fa: str | Path, legacy_gvl: str | Path, gvlfa_dir: str | Path +) -> FastaCache: + """Upgrade a legacy flat .gvl cache to a .gvlfa dir by reusing its bytes. + + Reads contig lengths and fingerprints the source *before* touching the legacy + file, so a missing/unreadable source aborts without disturbing it. + """ + source_fa = Path(source_fa) + legacy_gvl = Path(legacy_gvl) + gvlfa_dir = Path(gvlfa_dir) + contigs = _contig_lengths(source_fa) # raises here if source is unreadable + fp = fingerprint(source_fa) + gvlfa_dir.mkdir(parents=True, exist_ok=True) + shutil.move(str(legacy_gvl), str(gvlfa_dir / DATA_FILENAME)) + meta = FastaCache( + format_version=FORMAT_VERSION, + genvarloader_version=_gvl_version(), + contigs=contigs, + source=_source_hints(source_fa, gvlfa_dir), + fingerprint=fp, + ) + (gvlfa_dir / METADATA_FILENAME).write_text(meta.model_dump_json()) + return meta + + def load(gvlfa_dir: str | Path) -> tuple[FastaCache, Path | None, Literal["fresh", "stale", "unvalidated"]]: """Read cache metadata and classify it: 'fresh' | 'stale' | 'unvalidated'. diff --git a/tests/unit/test_fasta_cache.py b/tests/unit/test_fasta_cache.py index 6b4f26e..46cfadb 100644 --- a/tests/unit/test_fasta_cache.py +++ b/tests/unit/test_fasta_cache.py @@ -154,3 +154,32 @@ def test_data_size_ok_detects_corruption(tmp_path, local_fa): assert fc._data_size_ok(gvlfa, meta) is True (gvlfa / fc.DATA_FILENAME).write_bytes(b"too short") assert fc._data_size_ok(gvlfa, meta) is False + + +def test_migrate_legacy_reuses_bytes_and_removes_old(tmp_path, local_fa): + # Fabricate a legacy flat cache: concatenated contigs, exactly as the old + # _write_to_cache produced. Build via the new code, then rename to legacy. + staging = tmp_path / "staging.gvlfa" + fc.build(local_fa, staging) + legacy = local_fa.with_name(local_fa.name + fc.LEGACY_SUFFIX) + shutil.move(str(staging / fc.DATA_FILENAME), str(legacy)) + legacy_bytes = legacy.read_bytes() + + gvlfa = local_fa.with_name(local_fa.name + fc.GVLFA_SUFFIX) + meta = fc.migrate_legacy(local_fa, legacy, gvlfa) + + assert not legacy.exists() # old file removed (moved) + assert (gvlfa / fc.DATA_FILENAME).read_bytes() == legacy_bytes # bytes reused + assert meta.contigs == fc._contig_lengths(local_fa) + _, source, status = fc.load(gvlfa) + assert status == "fresh" and source == local_fa.resolve() + + +def test_migrate_legacy_aborts_before_move_if_source_unreadable(tmp_path): + legacy = tmp_path / "ghost.fa.gvl" + legacy.write_bytes(b"data") + missing_src = tmp_path / "ghost.fa" # does not exist + gvlfa = tmp_path / "ghost.fa.gvlfa" + with pytest.raises(Exception): + fc.migrate_legacy(missing_src, legacy, gvlfa) + assert legacy.exists() # untouched on failure From 483c9e76071615145c68049eb6b2038539dc5856 Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:14:39 -0700 Subject: [PATCH 06/28] feat(_fasta_cache): add ensure_cache orchestrator and dispatch Adds is_gvlfa, ensure_cache, _ensure_from_fasta, _ensure_from_gvlfa, _cache_dir_for, _legacy_for; restores ensure_cache to __all__. Adds import warnings + loguru logger. Fixed potential meta unbound-name via explicit None init. 21/21 tests pass. Co-Authored-By: Claude Sonnet 4.6 --- python/genvarloader/_fasta_cache.py | 83 ++++++++++++++++++++++++++++- tests/unit/test_fasta_cache.py | 64 ++++++++++++++++++++++ 2 files changed, 146 insertions(+), 1 deletion(-) diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index 5f1fc2c..bc0deb9 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -2,6 +2,7 @@ import os import shutil +import warnings from hashlib import blake2b from importlib.metadata import version from pathlib import Path @@ -9,11 +10,12 @@ import numpy as np import pysam +from loguru import logger from pydantic import BaseModel from pydantic_extra_types.semantic_version import SemanticVersion from tqdm.auto import tqdm -__all__ = ["FastaCache"] +__all__ = ["FastaCache", "ensure_cache"] FORMAT_VERSION = SemanticVersion.parse("1.0.0") """On-disk schema version for the .gvlfa cache.""" @@ -174,6 +176,85 @@ def migrate_legacy( return meta +def _cache_dir_for(source_fa: Path) -> Path: + return source_fa.with_name(source_fa.name + GVLFA_SUFFIX) + + +def _legacy_for(source_fa: Path) -> Path: + return source_fa.with_name(source_fa.name + LEGACY_SUFFIX) + + +def is_gvlfa(path: str | Path) -> bool: + """True if path is a .gvlfa cache directory.""" + path = Path(path) + if not path.is_dir(): + return False + return path.name.endswith(GVLFA_SUFFIX) or (path / METADATA_FILENAME).exists() + + +def ensure_cache(path: str | Path) -> tuple[FastaCache, Path]: + """Resolve a usable cache for `path` (a .fa or a .gvlfa), building, migrating, + or rebuilding as needed. Returns (metadata, path to sequence.bin).""" + path = Path(path) + if is_gvlfa(path): + return _ensure_from_gvlfa(path) + return _ensure_from_fasta(path) + + +def _ensure_from_fasta(source_fa: Path) -> tuple[FastaCache, Path]: + gvlfa_dir = _cache_dir_for(source_fa) + legacy = _legacy_for(source_fa) + data_path = gvlfa_dir / DATA_FILENAME + if gvlfa_dir.exists(): + meta: FastaCache | None = None + try: + meta = FastaCache.model_validate_json( + (gvlfa_dir / METADATA_FILENAME).read_text() + ) + _check_format_version(meta, gvlfa_dir) + valid = _data_size_ok(gvlfa_dir, meta) and _fingerprints_match( + meta.fingerprint, source_fa + ) + except Exception: + valid = False + if not valid or meta is None: + logger.info(f"Building FASTA cache at {gvlfa_dir}.") + meta = build(source_fa, gvlfa_dir) + return meta, data_path + if legacy.exists(): + logger.info(f"Migrating legacy FASTA cache {legacy} -> {gvlfa_dir}.") + return migrate_legacy(source_fa, legacy, gvlfa_dir), data_path + logger.info(f"Building FASTA cache at {gvlfa_dir}.") + return build(source_fa, gvlfa_dir), data_path + + +def _ensure_from_gvlfa(gvlfa_dir: Path) -> tuple[FastaCache, Path]: + data_path = gvlfa_dir / DATA_FILENAME + try: + meta, source, status = load(gvlfa_dir) + except ValueError: + raise # format-too-new: actionable, do not swallow + except Exception as e: + raise ValueError(f"FASTA cache at {gvlfa_dir} is unreadable: {e}") from e + if not _data_size_ok(gvlfa_dir, meta): + if source is not None: + return build(source, gvlfa_dir), data_path + raise ValueError( + f"FASTA cache data at {data_path} is corrupt and the source FASTA " + "could not be located to rebuild it." + ) + if status == "stale": + logger.info(f"Source FASTA changed; rebuilding cache at {gvlfa_dir}.") + meta = build(source, gvlfa_dir) + elif status == "unvalidated": + warnings.warn( + f"Could not locate source FASTA for cache {gvlfa_dir}; using cached " + "data without validation. On-demand reads (in_memory=False) will fail.", + stacklevel=2, + ) + return meta, data_path + + def load(gvlfa_dir: str | Path) -> tuple[FastaCache, Path | None, Literal["fresh", "stale", "unvalidated"]]: """Read cache metadata and classify it: 'fresh' | 'stale' | 'unvalidated'. diff --git a/tests/unit/test_fasta_cache.py b/tests/unit/test_fasta_cache.py index 46cfadb..8f87b0d 100644 --- a/tests/unit/test_fasta_cache.py +++ b/tests/unit/test_fasta_cache.py @@ -183,3 +183,67 @@ def test_migrate_legacy_aborts_before_move_if_source_unreadable(tmp_path): with pytest.raises(Exception): fc.migrate_legacy(missing_src, legacy, gvlfa) assert legacy.exists() # untouched on failure + + +def test_is_gvlfa_dir(tmp_path, local_fa): + gvlfa = tmp_path / "out.gvlfa" + fc.build(local_fa, gvlfa) + assert fc.is_gvlfa(gvlfa) is True + assert fc.is_gvlfa(local_fa) is False + + +def test_ensure_cache_from_fasta_builds_then_loads(tmp_path, local_fa): + meta, data_path = fc.ensure_cache(local_fa) + expected_dir = local_fa.with_name(local_fa.name + fc.GVLFA_SUFFIX) + assert data_path == expected_dir / fc.DATA_FILENAME + assert data_path.exists() + # Second call loads the existing fresh cache (no error, same contigs). + meta2, _ = fc.ensure_cache(local_fa) + assert meta2.contigs == meta.contigs + + +def test_ensure_cache_from_fasta_rebuilds_when_stale(tmp_path, local_fa): + fc.ensure_cache(local_fa) + gvlfa = local_fa.with_name(local_fa.name + fc.GVLFA_SUFFIX) + # Corrupt the data file; ensure_cache should rebuild it to the right size. + (gvlfa / fc.DATA_FILENAME).write_bytes(b"short") + meta, data_path = fc.ensure_cache(local_fa) + assert data_path.stat().st_size == sum(meta.contigs.values()) + + +def test_ensure_cache_from_fasta_migrates_legacy(tmp_path, local_fa): + staging = tmp_path / "staging.gvlfa" + fc.build(local_fa, staging) + legacy = local_fa.with_name(local_fa.name + fc.LEGACY_SUFFIX) + shutil.move(str(staging / fc.DATA_FILENAME), str(legacy)) + shutil.rmtree(staging) + + meta, data_path = fc.ensure_cache(local_fa) + assert not legacy.exists() + assert data_path.exists() + assert meta.contigs == fc._contig_lengths(local_fa) + + +def test_ensure_cache_from_gvlfa_missing_source_warns(tmp_path, local_fa): + gvlfa = tmp_path / "out.gvlfa" + fc.build(local_fa, gvlfa) + local_fa.unlink() + with pytest.warns(UserWarning, match="Could not locate source FASTA"): + meta, data_path = fc.ensure_cache(gvlfa) + assert data_path == gvlfa / fc.DATA_FILENAME + + +def test_ensure_cache_from_gvlfa_stale_rebuilds(tmp_path): + # Use a plain-text FASTA: rebuilding needs a valid re-readable source, and + # truncating a bgzipped file would corrupt it for pysam. + src = tmp_path / "tiny.fa" + src.write_text(">chr1\nACGTACGT\n") + pysam.faidx(str(src)) + gvlfa = tmp_path / "tiny.fa.gvlfa" + fc.build(src, gvlfa) + # Change the source content (same path), invalidating the fingerprint. + src.write_text(">chr1\nTTTTTTTT\n") + pysam.faidx(str(src)) + meta, data_path = fc.ensure_cache(gvlfa) + data = np.memmap(data_path, dtype="S1", mode="r") + np.testing.assert_array_equal(data, np.frombuffer(b"TTTTTTTT", "S1")) From 7df826de9dd329569e1f40fff1cd9b43471a6b0e Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:19:33 -0700 Subject: [PATCH 07/28] fix(_fasta_cache): raise on format-too-new sibling cache instead of silent downgrade --- python/genvarloader/_fasta_cache.py | 12 ++++++++---- tests/unit/test_fasta_cache.py | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index bc0deb9..7caf266 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -206,18 +206,22 @@ def _ensure_from_fasta(source_fa: Path) -> tuple[FastaCache, Path]: legacy = _legacy_for(source_fa) data_path = gvlfa_dir / DATA_FILENAME if gvlfa_dir.exists(): - meta: FastaCache | None = None try: - meta = FastaCache.model_validate_json( + meta: FastaCache | None = FastaCache.model_validate_json( (gvlfa_dir / METADATA_FILENAME).read_text() ) + except Exception: + meta = None # unreadable/corrupt metadata -> rebuild below + if meta is not None: + # Format-too-new raises here and propagates, matching _ensure_from_gvlfa: + # gvl never silently downgrades a cache written by a newer version. _check_format_version(meta, gvlfa_dir) valid = _data_size_ok(gvlfa_dir, meta) and _fingerprints_match( meta.fingerprint, source_fa ) - except Exception: + else: valid = False - if not valid or meta is None: + if not valid or meta is None: # redundant at runtime; satisfies the type checker logger.info(f"Building FASTA cache at {gvlfa_dir}.") meta = build(source_fa, gvlfa_dir) return meta, data_path diff --git a/tests/unit/test_fasta_cache.py b/tests/unit/test_fasta_cache.py index 8f87b0d..aee318b 100644 --- a/tests/unit/test_fasta_cache.py +++ b/tests/unit/test_fasta_cache.py @@ -247,3 +247,28 @@ def test_ensure_cache_from_gvlfa_stale_rebuilds(tmp_path): meta, data_path = fc.ensure_cache(gvlfa) data = np.memmap(data_path, dtype="S1", mode="r") np.testing.assert_array_equal(data, np.frombuffer(b"TTTTTTTT", "S1")) + + +def test_ensure_cache_from_gvlfa_corrupt_data_no_source_raises(tmp_path, local_fa): + gvlfa = tmp_path / "out.gvlfa" + fc.build(local_fa, gvlfa) + # Corrupt the data and remove the source so it cannot be rebuilt. + (gvlfa / fc.DATA_FILENAME).write_bytes(b"short") + local_fa.unlink() + with pytest.raises(ValueError, match="could not be located"): + fc.ensure_cache(gvlfa) + + +def test_ensure_cache_format_too_new_raises_from_both_entry_points(tmp_path, local_fa): + gvlfa = local_fa.with_name(local_fa.name + fc.GVLFA_SUFFIX) + meta = fc.build(local_fa, gvlfa) + meta.format_version = type(meta.format_version)( + major=fc.FORMAT_VERSION.major + 1, minor=0, patch=0 + ) + (gvlfa / fc.METADATA_FILENAME).write_text(meta.model_dump_json()) + # Direct .gvlfa entry point raises. + with pytest.raises(ValueError, match="newer than supported"): + fc.ensure_cache(gvlfa) + # .fa entry point (sibling cache) must ALSO raise, not silently rebuild/downgrade. + with pytest.raises(ValueError, match="newer than supported"): + fc.ensure_cache(local_fa) From 33d05de46d1f40be14fafdf1fb7892b7af98ddce Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:24:06 -0700 Subject: [PATCH 08/28] feat(fasta): use .gvlfa cache module and accept .gvlfa input Rewires Fasta.__init__ to delegate all cache management to the new _fasta_cache module, adds .gvlfa directory as a valid path input, migrates legacy .gvl flat caches automatically, and warns+defers when source FASTA is missing. Removes _valid_cache, _write_to_cache, _get_sequences, _get_contig_lengths methods. Co-Authored-By: Claude Sonnet 4.6 --- python/genvarloader/_fasta.py | 131 ++++++++++++++-------------------- tests/unit/test_fasta.py | 89 +++++++++++++++++++++++ 2 files changed, 142 insertions(+), 78 deletions(-) diff --git a/python/genvarloader/_fasta.py b/python/genvarloader/_fasta.py index e0070a9..182b9c0 100644 --- a/python/genvarloader/_fasta.py +++ b/python/genvarloader/_fasta.py @@ -1,6 +1,5 @@ from __future__ import annotations -from collections.abc import Iterable from pathlib import Path from typing import cast @@ -11,6 +10,7 @@ from tqdm.auto import tqdm from typing_extensions import assert_never +from . import _fasta_cache from ._types import Reader from ._utils import get_rel_starts, normalize_contig_name @@ -47,7 +47,7 @@ def __init__( name : str Name of the reader, for example `'seq'`. path : Union[str, Path] - Path to the FASTA file. + Path to the FASTA file or a `.gvlfa` cache directory. pad : Optional[str], optional A single character which, if passed, will pad out-of-bound ranges with this value. By default no padding is done and out-of-bound ranges raise an error. @@ -58,8 +58,11 @@ def __init__( loaded into memory and the FASTA file will be closed. If `False`, the sequences will be read from the FASTA file on demand. Defaults to `False`. cache : bool, optional - Whether to cache the sequences to disk. If `True`, the sequences will be written - to disk in a `.fa.gvl` file. Defaults to `False`. Only used if `in_memory` is `True`. + Whether to cache the sequences to disk. If `True`, the sequences are written + to a sibling `.gvlfa` directory (a self-describing, fingerprint-validated + cache). Defaults to `False`. Only used if `in_memory` is `True`. A legacy + `.fa.gvl` cache, if present, is migrated automatically. A `.gvlfa` directory + may also be passed directly as `path`. Raises ------ @@ -67,7 +70,7 @@ def __init__( If pad value is not a single character. """ self.name = name - self.path = Path(path) + path = Path(path) self.pad: bytes | None if pad is None: self.pad = pad @@ -76,8 +79,22 @@ def __init__( raise ValueError("Pad value must be a single character.") self.pad = pad.encode("ascii") - with self._open() as f: - self.contigs = {c: f.get_reference_length(c) for c in f.references} + self._is_cache_input = _fasta_cache.is_gvlfa(path) + if self._is_cache_input: + meta, data_path = _fasta_cache.ensure_cache(path) + self.cache_path = path + self._data_path = data_path + self.contigs = dict(meta.contigs) + source = _fasta_cache.resolve_source(path, meta) + self._source_available = source is not None + self.path = source if source is not None else path + else: + self.path = path + self.cache_path = _fasta_cache._cache_dir_for(path) + self._data_path = self.cache_path / _fasta_cache.DATA_FILENAME + self._source_available = True + with self._open() as f: + self.contigs = {c: f.get_reference_length(c) for c in f.references} if alphabet is None: self.alphabet: sp.NucleotideAlphabet = sp.alphabets.DNA @@ -106,85 +123,43 @@ def rev_strand_fn(data: NDArray[np.bytes_]): assert_never(self.alphabet) self.rev_strand_fn = rev_strand_fn - self.contigs = self._get_contig_lengths() - self.handle: pysam.FastaFile | None = None - self.cache_path = self.path.with_suffix(self.path.suffix + ".gvl") if not in_memory: self.sequences = None + elif self._is_cache_input: + self.sequences = self._memmap_sequences() + elif cache: + _, self._data_path = _fasta_cache.ensure_cache(self.path) + self.sequences = self._memmap_sequences() else: - if cache: - if not self._valid_cache(): - self._write_to_cache() - self.sequences = self._get_sequences(self.contigs) - else: - self.sequences = self._get_sequences(self.contigs) - - def _valid_cache(self) -> bool: - """Check if cache exists and has a modified time >= the FASTA file.""" - if not self.cache_path.exists(): - return False - if self.cache_path.stat().st_mtime_ns < self.path.stat().st_mtime_ns: - return False - return True - - def _get_contig_lengths(self) -> dict[str, int]: - with self._open() as f: - return {c: f.get_reference_length(c) for c in f.references} - - def _get_sequences( - self, contigs: Iterable[str], from_fasta=False - ) -> dict[str, NDArray[np.bytes_]]: - """Load contigs into memory.""" - sequences: dict[str, NDArray[np.bytes_]] = {} - if from_fasta or not self.cache_path.exists(): - with self._open() as f: - pbar = tqdm(total=len(self.contigs)) - for c in contigs: - pbar.set_description(f"Reading contig {c}") - sequences[c] = np.frombuffer( - f.fetch(c).encode("ascii").upper(), "S1" - ) - pbar.update() - pbar.close() - elif self.cache_path.exists(): - seqs = np.memmap(self.cache_path, dtype=np.bytes_, mode="r") - offset = 0 - for contig, length in self.contigs.items(): - sequences[contig] = seqs[offset : offset + length] - offset += length - return sequences - - def _write_to_cache(self): - """Write contigs to cache.""" + self.sequences = self._read_all_from_fasta() + + def _memmap_sequences(self) -> dict[str, NDArray[np.bytes_]]: + """Load contigs as views into the cached sequence.bin memmap.""" + seqs = np.memmap(self._data_path, dtype="S1", mode="r") + out: dict[str, NDArray[np.bytes_]] = {} offset = 0 - seqs = np.memmap( - self.cache_path, - dtype=np.uint8, - mode="w+", - shape=sum(self.contigs.values()), - ) - f = None - - for c in (pbar := tqdm(self.contigs, total=len(seqs), unit=" nucleotide")): - if self.sequences is None: - if f is None: - f = self._open() - pbar.set_description(f"Reading contig {c}") - c_seq = np.frombuffer(f.fetch(c).encode("ascii").upper(), "S1") - else: - c_seq = self.sequences[c] - pbar.set_description(f"Writing contig {c}") - seqs[offset : offset + len(c_seq)] = c_seq.view(np.uint8) - seqs.flush() - offset += len(c_seq) - pbar.update(len(c_seq)) - - if f is not None: - f.close() + for contig, length in self.contigs.items(): + out[contig] = seqs[offset : offset + length] + offset += length + return out + + def _read_all_from_fasta(self) -> dict[str, NDArray[np.bytes_]]: + """Load all contigs into memory directly from the FASTA (no disk cache).""" + out: dict[str, NDArray[np.bytes_]] = {} + with self._open() as f: + for c in tqdm(self.contigs, total=len(self.contigs)): + out[c] = np.frombuffer(f.fetch(c).encode("ascii").upper(), "S1") + return out def _open(self): + if not self._source_available: + raise FileNotFoundError( + f"Source FASTA for cache {self.cache_path} could not be located; " + "on-demand reads require the source file. Re-open with in_memory=True " + "to use cached data, or restore the source FASTA." + ) return pysam.FastaFile(str(self.path)) def close(self): diff --git a/tests/unit/test_fasta.py b/tests/unit/test_fasta.py index e50c81c..095be7e 100644 --- a/tests/unit/test_fasta.py +++ b/tests/unit/test_fasta.py @@ -72,3 +72,92 @@ def test_fasta_zero_length_range(ref_fasta): fasta = Fasta("ref", ref_fasta, pad="N") seq = fasta.read("chr1", 100, 100) assert len(seq) == 0 + + +from pathlib import Path + +import genvarloader._fasta_cache as fc + + +def test_fasta_cache_creates_gvlfa_dir(tmp_path, ref_fasta): + import shutil + + local = tmp_path / Path(ref_fasta).name + shutil.copy(ref_fasta, local) + for ext in (".fai", ".gzi"): + side = Path(str(ref_fasta) + ext) + if side.exists(): + shutil.copy(side, tmp_path / side.name) + + fasta = Fasta("ref", local, pad="N", in_memory=True, cache=True) + gvlfa = local.with_name(local.name + ".gvlfa") + assert gvlfa.is_dir() + assert (gvlfa / "sequence.bin").exists() + # Sequence served from cache matches a direct pysam read. + contig = next(iter(fasta.contigs)) + seq = fasta.read(contig, 0, 8) + with FastaFile(str(local)) as f: + expected = np.frombuffer(f.fetch(contig, 0, 8).encode("ascii").upper(), "S1") + np.testing.assert_array_equal(seq, expected) + + +def test_fasta_accepts_gvlfa_directly_in_memory(tmp_path, ref_fasta): + import shutil + + local = tmp_path / Path(ref_fasta).name + shutil.copy(ref_fasta, local) + for ext in (".fai", ".gzi"): + side = Path(str(ref_fasta) + ext) + if side.exists(): + shutil.copy(side, tmp_path / side.name) + meta, _ = fc.ensure_cache(local) + gvlfa = local.with_name(local.name + ".gvlfa") + + fasta = Fasta("ref", gvlfa, pad="N", in_memory=True) + assert dict(fasta.contigs) == meta.contigs + contig = next(iter(fasta.contigs)) + seq = fasta.read(contig, 0, 8) + with FastaFile(str(local)) as f: + expected = np.frombuffer(f.fetch(contig, 0, 8).encode("ascii").upper(), "S1") + np.testing.assert_array_equal(seq, expected) + + +def test_fasta_gvlfa_missing_source_errors_on_ondemand_read(tmp_path): + src = tmp_path / "tiny.fa" + src.write_text(">chr1\nACGTACGT\n") + FastaFile(str(src)).close() # triggers .fai creation via pysam.faidx below + import pysam + + pysam.faidx(str(src)) + fc.ensure_cache(src) + gvlfa = src.with_name(src.name + ".gvlfa") + src.unlink() + (tmp_path / "tiny.fa.fai").unlink(missing_ok=True) + + with pytest.warns(UserWarning, match="Could not locate source FASTA"): + fasta = Fasta("ref", gvlfa, pad="N", in_memory=False) + with pytest.raises(FileNotFoundError, match="could not be located"): + fasta.read("chr1", 0, 4) + + +def test_fasta_migrates_legacy_gvl(tmp_path): + src = tmp_path / "tiny.fa" + src.write_text(">chr1\nACGTACGT\n") + import pysam + + pysam.faidx(str(src)) + # Stage bytes into a legacy flat .gvl next to the source. + staging = tmp_path / "staging.gvlfa" + fc.build(src, staging) + import shutil + + legacy = src.with_name(src.name + ".gvl") + shutil.move(str(staging / "sequence.bin"), str(legacy)) + shutil.rmtree(staging) + + fasta = Fasta("ref", src, pad="N", in_memory=True, cache=True) + assert not legacy.exists() + assert src.with_name(src.name + ".gvlfa").is_dir() + np.testing.assert_array_equal( + fasta.read("chr1", 0, 8), np.frombuffer(b"ACGTACGT", "S1") + ) From 3a7881bd5ff1c27df96abb75ac64d924b2b7c8b5 Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:28:15 -0700 Subject: [PATCH 09/28] test(fasta): cover in-memory no-cache _read_all_from_fasta path --- tests/unit/test_fasta.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/unit/test_fasta.py b/tests/unit/test_fasta.py index 095be7e..0626f39 100644 --- a/tests/unit/test_fasta.py +++ b/tests/unit/test_fasta.py @@ -161,3 +161,14 @@ def test_fasta_migrates_legacy_gvl(tmp_path): np.testing.assert_array_equal( fasta.read("chr1", 0, 8), np.frombuffer(b"ACGTACGT", "S1") ) + + +def test_fasta_in_memory_no_cache_reads_from_fasta(tmp_path, ref_fasta): + # in_memory=True, cache=False -> _read_all_from_fasta path, no .gvlfa written. + fasta = Fasta("ref", ref_fasta, pad="N", in_memory=True, cache=False) + assert fasta.sequences is not None + contig = next(iter(fasta.contigs)) + seq = fasta.read(contig, 0, 8) + with FastaFile(str(ref_fasta)) as f: + expected = np.frombuffer(f.fetch(contig, 0, 8).encode("ascii").upper(), "S1") + np.testing.assert_array_equal(seq, expected) From 91f0207f8eff3ce07846e4f619ff1b985becddd4 Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:30:18 -0700 Subject: [PATCH 10/28] refactor(reference): build cache via ensure_cache, accept .gvlfa Rewires Reference.from_path to call ensure_cache() instead of the removed Fasta._valid_cache/_write_to_cache methods. Accepts both a .fa source and a pre-built .gvlfa directory as the fasta argument. Removes now-unused loguru.logger import. Co-Authored-By: Claude Sonnet 4.6 --- python/genvarloader/_dataset/_reference.py | 22 +++++++++------------ tests/unit/test_fasta.py | 23 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/python/genvarloader/_dataset/_reference.py b/python/genvarloader/_dataset/_reference.py index 89d5b34..2d63e9b 100644 --- a/python/genvarloader/_dataset/_reference.py +++ b/python/genvarloader/_dataset/_reference.py @@ -11,13 +11,12 @@ import polars as pl from genoray._utils import ContigNormalizer from hirola import HashTable -from loguru import logger from numpy.typing import ArrayLike, NDArray from seqpro.rag import Ragged, lengths_to_offsets from typing_extensions import Self from .._flat import _Flat -from .._fasta import Fasta +from .._fasta_cache import ensure_cache from .._ragged import RaggedSeqs, reverse_complement_masked, to_padded from .._torch import TORCH_AVAILABLE, get_dataloader, no_torch_error from .._types import Idx, StrIdx @@ -73,17 +72,14 @@ def from_path( reference genomes or have very limited RAM. """ path = Path(fasta) - _fasta = Fasta("ref", fasta, "N") + meta, data_path = ensure_cache(fasta) + full_contigs = meta.contigs - if not _fasta._valid_cache(): - logger.info("Memory-mapping FASTA file for faster access.") - _fasta._write_to_cache() - - ref_mmap = np.memmap(_fasta.cache_path, np.uint8, "r") - offsets = lengths_to_offsets(np.array(list(_fasta.contigs.values()))) + ref_mmap = np.memmap(data_path, np.uint8, "r") + offsets = lengths_to_offsets(np.array(list(full_contigs.values()))) pad_char = ord("N") - c_map = ContigNormalizer(_fasta.contigs) + c_map = ContigNormalizer(full_contigs) if contigs is None: contigs = c_map.contigs else: @@ -98,14 +94,14 @@ def from_path( c_map = ContigNormalizer(contigs) if in_memory: - reference = np.empty(sum(_fasta.contigs[c] for c in contigs), np.uint8) + reference = np.empty(sum(full_contigs[c] for c in contigs), np.uint8) offset = 0 for c in contigs: - c_idx = list(_fasta.contigs).index(c) + c_idx = list(full_contigs).index(c) o_s, o_e = offsets[c_idx], offsets[c_idx + 1] reference[offset : offset + o_e - o_s] = ref_mmap[o_s:o_e] offset += o_e - o_s - offsets = lengths_to_offsets(np.array([_fasta.contigs[c] for c in contigs])) + offsets = lengths_to_offsets(np.array([full_contigs[c] for c in contigs])) else: reference = ref_mmap diff --git a/tests/unit/test_fasta.py b/tests/unit/test_fasta.py index 0626f39..15716d4 100644 --- a/tests/unit/test_fasta.py +++ b/tests/unit/test_fasta.py @@ -172,3 +172,26 @@ def test_fasta_in_memory_no_cache_reads_from_fasta(tmp_path, ref_fasta): with FastaFile(str(ref_fasta)) as f: expected = np.frombuffer(f.fetch(contig, 0, 8).encode("ascii").upper(), "S1") np.testing.assert_array_equal(seq, expected) + + +def test_reference_from_path_accepts_gvlfa(tmp_path, ref_fasta): + import shutil + + import genvarloader as gvl + + local = tmp_path / Path(ref_fasta).name + shutil.copy(ref_fasta, local) + for ext in (".fai", ".gzi"): + side = Path(str(ref_fasta) + ext) + if side.exists(): + shutil.copy(side, tmp_path / side.name) + + ref_from_fa = gvl.Reference.from_path(local, in_memory=True) + gvlfa = local.with_name(local.name + ".gvlfa") + assert gvlfa.is_dir() + + ref_from_gvlfa = gvl.Reference.from_path(gvlfa, in_memory=True) + assert ref_from_gvlfa.contigs == ref_from_fa.contigs + np.testing.assert_array_equal( + ref_from_gvlfa.reference, ref_from_fa.reference + ) From bc975cd6ff482c396bc3bfd64008c8d3c1032105 Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:36:12 -0700 Subject: [PATCH 11/28] docs(skill): note Reference.from_path .gvlfa support --- skills/genvarloader/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/genvarloader/SKILL.md b/skills/genvarloader/SKILL.md index d965f0d..4c5fb7b 100644 --- a/skills/genvarloader/SKILL.md +++ b/skills/genvarloader/SKILL.md @@ -229,7 +229,7 @@ Footprint is computed exactly via `Dataset._output_bytes_per_instance(...)` (use ## Other public surface (one-liners) -- `gvl.Reference.from_path(fasta, contigs=None)` — wrap a FASTA. Cached. +- `gvl.Reference.from_path(fasta, contigs=None)` — wrap a FASTA (path to a `.fa`/`.fa.bgz`, or a `.gvlfa` cache dir). Builds/reuses a sibling `.gvlfa` cache directory (self-describing, fingerprint-validated; legacy `.fa.gvl` caches auto-migrate). - `gvl.read_bedlike(path)` / `gvl.with_length(bed, L)` — BED helpers (re-exported from `seqpro`). - `gvl.Ragged`, `gvl.RaggedAnnotatedHaps`, `gvl.RaggedVariants`, `gvl.RaggedIntervals` — ragged return containers. - `gvl.to_nested_tensor(ragged)` — convert to a PyTorch nested tensor (requires `torch`). From 4eac37c8a278e9d2c4c2541d3717e95c7824b362 Mon Sep 17 00:00:00 2001 From: d-laub Date: Mon, 1 Jun 2026 18:43:37 -0700 Subject: [PATCH 12/28] fix(_fasta_cache): guard legacy migration against stale/truncated bytes Co-Authored-By: Claude Sonnet 4.6 --- python/genvarloader/_dataset/_reference.py | 5 ++++- python/genvarloader/_fasta_cache.py | 14 +++++++++++++- tests/unit/test_fasta_cache.py | 15 +++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/python/genvarloader/_dataset/_reference.py b/python/genvarloader/_dataset/_reference.py index 2d63e9b..020cc7a 100644 --- a/python/genvarloader/_dataset/_reference.py +++ b/python/genvarloader/_dataset/_reference.py @@ -59,7 +59,10 @@ def from_path( Parameters ---------- fasta - Path to the FASTA file. + Path to a ``.fa``/``.fa.bgz`` FASTA file or an existing ``.gvlfa`` + cache directory. When a FASTA path is given, a sibling ``.gvlfa`` + cache is built on first use and reused on subsequent calls; a legacy + ``.fa.gvl`` flat cache is automatically migrated to the new format. contigs List of contig names to load. If None, all contigs in the FASTA file are loaded. Can be either UCSC or Ensembl style (i.e. with or without the "chr" prefix) and diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index 7caf266..26e3966 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -138,6 +138,9 @@ def _check_format_version(meta: FastaCache, gvlfa_dir: Path) -> None: def _data_size_ok(gvlfa_dir: Path, meta: FastaCache) -> bool: + # NOTE: only total byte count is checked here. A same-size corruption of + # sequence.bin is not detected; full-content hashing would be prohibitively + # expensive for large reference genomes. data_path = Path(gvlfa_dir) / DATA_FILENAME if not data_path.exists(): return False @@ -156,13 +159,22 @@ def migrate_legacy( """Upgrade a legacy flat .gvl cache to a .gvlfa dir by reusing its bytes. Reads contig lengths and fingerprints the source *before* touching the legacy - file, so a missing/unreadable source aborts without disturbing it. + file, so a missing/unreadable source aborts without disturbing it. If the legacy + bytes don't match the current source's expected size (i.e. the legacy cache is + stale or truncated), the legacy file is left untouched and a fresh cache is built + from the source instead of reusing untrustworthy bytes. """ source_fa = Path(source_fa) legacy_gvl = Path(legacy_gvl) gvlfa_dir = Path(gvlfa_dir) contigs = _contig_lengths(source_fa) # raises here if source is unreadable fp = fingerprint(source_fa) + if legacy_gvl.stat().st_size != sum(contigs.values()): + logger.info( + f"Legacy cache {legacy_gvl} size does not match source {source_fa}; " + "ignoring stale legacy bytes and building a fresh cache." + ) + return build(source_fa, gvlfa_dir) gvlfa_dir.mkdir(parents=True, exist_ok=True) shutil.move(str(legacy_gvl), str(gvlfa_dir / DATA_FILENAME)) meta = FastaCache( diff --git a/tests/unit/test_fasta_cache.py b/tests/unit/test_fasta_cache.py index aee318b..d0ef47b 100644 --- a/tests/unit/test_fasta_cache.py +++ b/tests/unit/test_fasta_cache.py @@ -185,6 +185,21 @@ def test_migrate_legacy_aborts_before_move_if_source_unreadable(tmp_path): assert legacy.exists() # untouched on failure +def test_migrate_legacy_truncated_bytes_builds_fresh(tmp_path, local_fa): + # A stale/truncated legacy cache must NOT be trusted: migrate should build + # fresh from the source and leave the legacy file untouched. + legacy = local_fa.with_name(local_fa.name + fc.LEGACY_SUFFIX) + legacy.write_bytes(b"truncated") # wrong size vs the real source + gvlfa = local_fa.with_name(local_fa.name + fc.GVLFA_SUFFIX) + meta = fc.migrate_legacy(local_fa, legacy, gvlfa) + + assert legacy.exists() # stale legacy left untouched, not consumed + assert (gvlfa / fc.DATA_FILENAME).stat().st_size == sum(meta.contigs.values()) + # The rebuilt cache is valid and fresh against the source. + _, source, status = fc.load(gvlfa) + assert status == "fresh" and source == local_fa.resolve() + + def test_is_gvlfa_dir(tmp_path, local_fa): gvlfa = tmp_path / "out.gvlfa" fc.build(local_fa, gvlfa) From e629962167bb7b1c2148cbbe02412e426e72e775 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 07:20:28 +0000 Subject: [PATCH 13/28] chore(pre-commit): auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- python/genvarloader/_fasta_cache.py | 8 ++++++-- tests/unit/test_fasta.py | 4 +--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index 26e3966..e01afc4 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -233,7 +233,9 @@ def _ensure_from_fasta(source_fa: Path) -> tuple[FastaCache, Path]: ) else: valid = False - if not valid or meta is None: # redundant at runtime; satisfies the type checker + if ( + not valid or meta is None + ): # redundant at runtime; satisfies the type checker logger.info(f"Building FASTA cache at {gvlfa_dir}.") meta = build(source_fa, gvlfa_dir) return meta, data_path @@ -271,7 +273,9 @@ def _ensure_from_gvlfa(gvlfa_dir: Path) -> tuple[FastaCache, Path]: return meta, data_path -def load(gvlfa_dir: str | Path) -> tuple[FastaCache, Path | None, Literal["fresh", "stale", "unvalidated"]]: +def load( + gvlfa_dir: str | Path, +) -> tuple[FastaCache, Path | None, Literal["fresh", "stale", "unvalidated"]]: """Read cache metadata and classify it: 'fresh' | 'stale' | 'unvalidated'. Raises ValueError if the format version is too new to read. diff --git a/tests/unit/test_fasta.py b/tests/unit/test_fasta.py index 15716d4..f0a7b64 100644 --- a/tests/unit/test_fasta.py +++ b/tests/unit/test_fasta.py @@ -192,6 +192,4 @@ def test_reference_from_path_accepts_gvlfa(tmp_path, ref_fasta): ref_from_gvlfa = gvl.Reference.from_path(gvlfa, in_memory=True) assert ref_from_gvlfa.contigs == ref_from_fa.contigs - np.testing.assert_array_equal( - ref_from_gvlfa.reference, ref_from_fa.reference - ) + np.testing.assert_array_equal(ref_from_gvlfa.reference, ref_from_fa.reference) From d7ef3d0d5ed5af57e781c175cda32858ec0356ee Mon Sep 17 00:00:00 2001 From: d-laub Date: Tue, 2 Jun 2026 00:40:01 -0700 Subject: [PATCH 14/28] docs(specs): design robust on-disk artifacts (atomic creation + validation) --- ...26-06-02-robust-ondisk-artifacts-design.md | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 docs/superpowers/specs/2026-06-02-robust-ondisk-artifacts-design.md diff --git a/docs/superpowers/specs/2026-06-02-robust-ondisk-artifacts-design.md b/docs/superpowers/specs/2026-06-02-robust-ondisk-artifacts-design.md new file mode 100644 index 0000000..ec22b33 --- /dev/null +++ b/docs/superpowers/specs/2026-06-02-robust-ondisk-artifacts-design.md @@ -0,0 +1,165 @@ +# Robust on-disk artifacts: atomic creation + validation — Design + +**Status:** Approved design, ready for implementation planning. + +**Goal:** Make GenVarLoader's generated on-disk artifacts safe under concurrency and resilient to format drift. Two coupled problems, one initiative: + +1. **Concurrency (closes [#21]):** Parallel jobs that share an input (classically one reference FASTA across a SLURM array) race to build the same `.gvlfa` cache and corrupt it. The same race can affect `gvl.write` dataset directories. +2. **Validation hardening:** Files written into a `*.gvl/` dataset directory by `gvl.write` are write-once-assume-valid — no real format-version gate and no integrity check on read. A future on-disk layout change would silently load old datasets as wrong data; an interrupted write leaves an undetected partial dataset. + +This extends the robust-sidecar idiom already established by the `.gvlfa` FASTA cache (PR #206) and the existing `SvarLink` (`_svar_link.py`): fingerprint/version + graceful resolution + safe creation. + +[#21]: https://github.com/mcvickerlab/GenVarLoader/issues/21 + +--- + +## Background & decisions + +The corruption in #21 stems from a **shared mutable target**: today `_fasta_cache.build()` does `np.memmap(gvlfa_dir / "sequence.bin", mode="w+")` and writes into the live path; `gvl.write()` writes its many files directly into the destination directory. Concurrent builders interleave writes, and readers can observe partially-written artifacts. + +Two ways to make creation safe were considered: + +- **Advisory locks** (`flock`/`fcntl`) make other jobs wait. On network filesystems (NFS, Lustre, GPFS) — exactly where #21's parallel jobs run — advisory locks are unreliable: `flock` is often local-only across nodes, POSIX locks depend on a frequently-misconfigured network lock manager, and Lustre needs a specific mount option. The failure mode is silent (looks locked, isn't). +- **Atomic build-to-temp-then-rename** makes the *publish* indivisible. Each job builds into its own private sibling temp directory, then `os.replace`s it into place. `rename(2)` is a single metadata operation the server serializes (atomic even on NFS, within one filesystem); there is no shared target during the build, so writes cannot interleave. "Last writer wins" is harmless because every builder produces byte-identical valid content. + +**Decisions (from brainstorming):** + +| # | Decision | +|---|---| +| Concurrency primitive | **Atomic rename for correctness** (works on any filesystem) **+ advisory lock as a best-effort optimization** (avoid N redundant builds). The lock is never load-bearing for correctness. | +| Lock library | Add **`filelock`** dependency (v1). | +| Validation depth (datasets) | **Format-version gate + structural/size integrity** on open. | +| Dataset mismatch policy | **Always raise an actionable error** — datasets cannot auto-rebuild (no retained source). Only the FASTA cache auto-rebuilds (it has its source). | +| Atomic-creation scope | **Both** the `.gvlfa` cache **and** `gvl.write` dataset creation. | +| Back-compat | A dataset with **no `format_version`** is treated as the current layout (`1.0.0`) and loaded best-effort (no warning, no forced regeneration). | +| Out of scope | genoray `.gvi` and pysam `.fai`/`.gzi` are created by those libraries; making them atomic needs upstream changes. Documented as a limitation. | + +--- + +## Architecture + +A new small module **`python/genvarloader/_atomic.py`** owns the single reusable primitive both artifact types need — safely publishing a directory. Everything else reuses it. + +### `_atomic.py` — safe directory publish (single responsibility) + +```python +@contextmanager +def atomic_dir( + dest: Path, + *, + overwrite: bool = False, + lock: bool = True, + timeout: float = DEFAULT_LOCK_TIMEOUT, +) -> Iterator[Path]: + """Yield a private temp dir to build into; atomically publish it to `dest` on + clean exit, or remove it on error. Optional best-effort lock avoids redundant + concurrent builds but is never required for correctness.""" +``` + +- **Temp location:** `.tmp.-/` as a *sibling* of `dest` (same filesystem, so `os.replace` is atomic; avoids `EXDEV`). +- **Publish:** on clean context exit, `os.replace(tmp, dest)`. On exception, remove `tmp` and re-raise; `dest` is never touched. +- **Existing dest:** checked up front — exists & not `overwrite` → `FileExistsError`; exists & `overwrite` → the final `os.replace` replaces it (for a directory dest, replace into a freshly-renamed-aside or removed old dir; see Open Questions). +- **Lock layer** (`filelock`): a `.lock` sibling. Flow is **double-checked**: acquire lock (with `timeout`) → re-validate `dest` (another job may have just published it → reuse, skip build) → else build. On lock timeout or a silent network-FS no-op, fall back to building anyway; atomic rename keeps that correct. + +### `_fasta_cache.py` (from PR #206) + +`build()` and `migrate_legacy()` refactored to publish through `atomic_dir` rather than writing `sequence.bin`/`metadata.json` into the live directory. `ensure_cache` already has the format-version gate + blake2b fingerprint; it gains the lock + atomic publish and a double-check re-classification inside the lock. Stale/corrupt caches still auto-rebuild (the source FASTA is available). + +### `_write.py` + +`write()`'s body runs against a temp dir yielded by `atomic_dir(path, overwrite=overwrite, ...)`; the final `os.replace` publishes the complete `*.gvl/`. The existing `overwrite` / `FileExistsError` semantics move into `atomic_dir`. `Metadata` gains a real **`format_version: SemanticVersion`** field — bumped only when the on-disk layout changes, distinct from the existing package `version` field. + +### `_open.py` + +`_load_metadata()` gains a `_validate(metadata, path)` step: + +1. **Format-version gate** — incompatible major (too-new *or* too-old) → actionable `ValueError` ("written by format N; regenerate with `gvl.write`"). Missing `format_version` → treat as `1.0.0`. +2. **Structural + size integrity** — required files exist, and each `.npy`/memmap `stat().st_size` equals the size implied by the shapes in metadata (`n_regions`, `ploidy`, `n_samples`, final offset values, track set). Mismatch → `ValueError` naming the file. + +Datasets never auto-rebuild. + +--- + +## Data flow + +### A. Building the FASTA cache (`ensure_cache`) + +1. Dispatch `.fa` vs `.gvlfa` and decide build / migrate / rebuild / reuse (unchanged). +2. If a build is needed: + - `with atomic_dir(gvlfa_dir, lock=True, timeout=T) as tmp:` + - lock acquired (or timed out → proceed) + - **double-check:** re-classify `gvlfa_dir`; if another job just published a fresh cache, skip building and reuse it + - else write `sequence.bin` + `metadata.json` into `tmp` + - context exit → `os.replace(tmp, gvlfa_dir)` +3. return `(meta, data_path)`. + +### B. Writing a dataset (`gvl.write`) + +1. `with atomic_dir(path, overwrite=overwrite, lock=True, timeout=T) as tmp:` + - existing-dest handled up front (`FileExistsError` unless `overwrite`) + - every write (`input_regions.arrow`, `regions.npy`, `genotypes/*`, `intervals/**`, `metadata.json` with `format_version`) targets `tmp` +2. exit → atomic publish to `path`. An interrupted write leaves only an orphan `.tmp.*`, never a half-written `path`. + +### C. Opening a dataset (`Dataset.open`) + +1. `_load_metadata()` reads `metadata.json`. +2. `_validate(metadata, path)`: format-version gate, then structural + size integrity. +3. construct readers as today. + +### Orphan / lock-file policy + +Temp dirs are removed on exception. A *crashed* process can still leave an orphan `.tmp.*`; these are harmless (distinctly named, ignored on open) and documented rather than swept (sweeping risks deleting a live job's temp). `.lock` files persist empty and are reused. + +--- + +## Error handling + +| Situation | Behavior | +|---|---| +| Lock timeout / silent network-FS no-op | `log.info`, build anyway — atomic rename keeps it correct (never an error) | +| Build raises mid-way | temp dir removed; exception propagates; live `dest` untouched | +| Dataset `dest` exists, `overwrite=False` | `FileExistsError` (preserved behavior) | +| FASTA format too-new | `ValueError` "newer than supported" (existing behavior) | +| Dataset format too-new / too-old-incompatible | actionable `ValueError` → regenerate with `gvl.write` | +| Dataset missing `format_version` | treat as `1.0.0`, load (no warning) | +| Dataset integrity fail (missing file / size mismatch) | `ValueError` naming the file → regenerate | +| FASTA cache integrity fail | auto-rebuild (source available), else `ValueError` | + +**Correctness invariant:** a reader never observes a partially-written `dest`, and two builders never write the same file. + +--- + +## Test plan + +### `tests/unit/test_atomic.py` (the primitive) +- publishes on clean exit (dest appears, temp gone); removes temp + leaves no dest on exception +- `overwrite=False` + existing dest → `FileExistsError`; `overwrite=True` replaces +- lock double-check: `build_fn` runs once when dest becomes valid mid-wait; lock timeout → falls back to build, no error + +### Concurrency regression (`multiprocessing`, `@pytest.mark.slow`) — the #21 fix +- N processes `ensure_cache` the same `.fa` concurrently → resulting `sequence.bin` is valid and **byte-identical** to a single-process build; no corruption +- N processes `gvl.write` the same path (overwrite) → exactly one valid, openable dataset; no partial dirs left as `path` + +### Format / validation +- dataset `format_version` too-new → `ValueError`; missing → loads as `1.0.0`; truncated/missing `.npy` → `ValueError` naming it +- write→open round-trip validates clean; back-compat: a dataset with the `format_version` field stripped still opens + +### Regression +- full fast suite green (write + open are now atomic); `gen` + a `Dataset` round-trip + +Cross-platform: `filelock` covers posix/Windows; `os.replace` is atomic within a directory on both. + +--- + +## Out of scope (documented limitations) + +- **genoray `.gvi`** (VCF/PGEN variant index) and **pysam `.fai`/`.gzi`** (FASTA index) are created by their respective libraries; making their creation atomic/locked requires upstream changes. Concurrent jobs relying on these still depend on those libraries' own behavior. +- No per-region / per-sample integrity beyond total byte-size checks (full-content hashing of multi-GB datasets is intentionally avoided as too expensive — same tradeoff `_data_size_ok` documents for the FASTA cache). + +--- + +## Open questions for implementation + +1. **Directory `os.replace` onto a non-empty existing dir** fails on POSIX. For `overwrite=True` the publish must first move the old `dest` aside to `.old.` (then remove it after a successful rename) or `rmtree` it just before the rename. Decide the exact sequence to keep the window where `dest` is absent as small as possible. +2. **`DEFAULT_LOCK_TIMEOUT` value** and whether to expose `lock`/`timeout` as parameters on `gvl.write` / `Reference.from_path` / `Fasta`, or keep them internal with a sensible default. +3. **`format_version` starting value and bump policy** — declare the current dataset layout `1.0.0`; document when a major bump is required. From 021a6321497016dbf6e0cc39446de2b74cf54032 Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:03:47 -0700 Subject: [PATCH 15/28] docs(specs): resolve open questions for robust on-disk artifacts Move-aside-then-rename overwrite, internal 60s lock timeout, format_version 1.0.0 with major-on-break bump policy. Confirm sibling temp (not /tmp) for atomic os.replace. Co-Authored-By: Claude Opus 4.8 --- .../2026-06-02-robust-ondisk-artifacts-design.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/superpowers/specs/2026-06-02-robust-ondisk-artifacts-design.md b/docs/superpowers/specs/2026-06-02-robust-ondisk-artifacts-design.md index ec22b33..9068e5c 100644 --- a/docs/superpowers/specs/2026-06-02-robust-ondisk-artifacts-design.md +++ b/docs/superpowers/specs/2026-06-02-robust-ondisk-artifacts-design.md @@ -1,6 +1,6 @@ # Robust on-disk artifacts: atomic creation + validation — Design -**Status:** Approved design, ready for implementation planning. +**Status:** Approved design with all implementation decisions resolved; ready for implementation planning. **Goal:** Make GenVarLoader's generated on-disk artifacts safe under concurrency and resilient to format drift. Two coupled problems, one initiative: @@ -58,8 +58,8 @@ def atomic_dir( - **Temp location:** `.tmp.-/` as a *sibling* of `dest` (same filesystem, so `os.replace` is atomic; avoids `EXDEV`). - **Publish:** on clean context exit, `os.replace(tmp, dest)`. On exception, remove `tmp` and re-raise; `dest` is never touched. -- **Existing dest:** checked up front — exists & not `overwrite` → `FileExistsError`; exists & `overwrite` → the final `os.replace` replaces it (for a directory dest, replace into a freshly-renamed-aside or removed old dir; see Open Questions). -- **Lock layer** (`filelock`): a `.lock` sibling. Flow is **double-checked**: acquire lock (with `timeout`) → re-validate `dest` (another job may have just published it → reuse, skip build) → else build. On lock timeout or a silent network-FS no-op, fall back to building anyway; atomic rename keeps that correct. +- **Existing dest:** checked up front — exists & not `overwrite` → `FileExistsError`; exists & `overwrite` → **move-aside then rename**: `os.replace(dest, .old.)`, then `os.replace(tmp, dest)`, then `rmtree(.old.)`. Two fast metadata renames keep the dest-absent window minimal; if the second rename fails the `.old` copy is still recoverable. +- **Lock layer** (`filelock`): a `.lock` sibling, **internal with a 60s default `DEFAULT_LOCK_TIMEOUT`** (not exposed on the public API — the lock is only a best-effort optimization; atomic rename is the correctness guarantee). Flow is **double-checked**: acquire lock (with `timeout`) → re-validate `dest` (another job may have just published it → reuse, skip build) → else build. On lock timeout or a silent network-FS no-op, fall back to building anyway; atomic rename keeps that correct. ### `_fasta_cache.py` (from PR #206) @@ -158,8 +158,8 @@ Cross-platform: `filelock` covers posix/Windows; `os.replace` is atomic within a --- -## Open questions for implementation +## Resolved implementation decisions -1. **Directory `os.replace` onto a non-empty existing dir** fails on POSIX. For `overwrite=True` the publish must first move the old `dest` aside to `.old.` (then remove it after a successful rename) or `rmtree` it just before the rename. Decide the exact sequence to keep the window where `dest` is absent as small as possible. -2. **`DEFAULT_LOCK_TIMEOUT` value** and whether to expose `lock`/`timeout` as parameters on `gvl.write` / `Reference.from_path` / `Fasta`, or keep them internal with a sensible default. -3. **`format_version` starting value and bump policy** — declare the current dataset layout `1.0.0`; document when a major bump is required. +1. **Overwrite publish (non-empty existing dir).** `os.replace` cannot replace a non-empty dir on POSIX, so `overwrite=True` uses **move-aside then rename**: `os.replace(dest, .old.)` → `os.replace(tmp, dest)` → `rmtree(.old.)`. Two fast metadata renames keep the dest-absent window minimal, and a failure of the second rename leaves the old data recoverable under `.old` (vs. an `rmtree`-first approach that spans the whole delete and loses old data on a mid-delete crash). +2. **Lock timeout / API surface.** `DEFAULT_LOCK_TIMEOUT = 60s`, kept **internal** — `lock`/`timeout` are not exposed on `gvl.write` / `Reference.from_path` / `Fasta`. The lock is a best-effort optimization only; atomic rename is the correctness guarantee, so tuning is rarely needed. +3. **`format_version`.** Current dataset layout is **`1.0.0`**. Bump **MAJOR** only when an existing dataset can no longer be read correctly by new code (incompatible layout change); minor/patch for backward-compatible additions. A missing `format_version` field is treated as `1.0.0`. From a8e89819fb6ff8c3e789176ef042636d578a3e57 Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:11:01 -0700 Subject: [PATCH 16/28] docs(plans): implementation plan for robust on-disk artifacts (closes #21) Co-Authored-By: Claude Opus 4.8 --- .../2026-06-03-robust-ondisk-artifacts.md | 1060 +++++++++++++++++ 1 file changed, 1060 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-03-robust-ondisk-artifacts.md diff --git a/docs/superpowers/plans/2026-06-03-robust-ondisk-artifacts.md b/docs/superpowers/plans/2026-06-03-robust-ondisk-artifacts.md new file mode 100644 index 0000000..30756e4 --- /dev/null +++ b/docs/superpowers/plans/2026-06-03-robust-ondisk-artifacts.md @@ -0,0 +1,1060 @@ +# Robust On-Disk Artifacts Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make GenVarLoader's generated on-disk artifacts (the `.gvlfa` FASTA cache and `gvl.write` dataset directories) safe under concurrent creation and resilient to format drift, closing [#21]. + +**Architecture:** A new single-responsibility primitive `_atomic.py:atomic_dir` builds each artifact into a private sibling temp dir and publishes it with an atomic `os.replace`, guarded by a best-effort `filelock` (never load-bearing for correctness). `_fasta_cache.build`/`migrate_legacy` and `gvl.write` publish through it. `Dataset.open` gains a format-version gate plus structural/size integrity checks; datasets never auto-rebuild (no retained source), the FASTA cache does (source available). + +**Tech Stack:** Python 3.10+, pydantic v2 (`SemanticVersion`), numpy memmap, `filelock`, pytest + `multiprocessing` for the concurrency regression. + +**Reference spec:** `docs/superpowers/specs/2026-06-02-robust-ondisk-artifacts-design.md` + +--- + +## File Structure + +- **Create** `python/genvarloader/_atomic.py` — `atomic_dir` context manager, `SkipPublish` sentinel exception, `DEFAULT_LOCK_TIMEOUT`. Sole owner of "safely publish a directory." +- **Create** `python/genvarloader/_dataset/_validate.py` — `validate_dataset(metadata, path)`: format-version gate + structural/size integrity. Imports `DATASET_FORMAT_VERSION` from `_write.py`. +- **Modify** `pyproject.toml` — add `filelock` dependency. +- **Modify** `python/genvarloader/_fasta_cache.py` — `build`/`migrate_legacy` publish through `atomic_dir`; `ensure_cache` rebuild paths go through a locked double-checked helper. +- **Modify** `python/genvarloader/_dataset/_write.py` — `Metadata` gains `format_version`; module gains `DATASET_FORMAT_VERSION`; `write()` body targets an `atomic_dir` temp. +- **Modify** `python/genvarloader/_dataset/_open.py` — `OpenRequest._load_metadata` calls `validate_dataset`. +- **Create** `tests/unit/test_atomic.py`, `tests/unit/dataset/test_validate.py`, `tests/unit/test_concurrency.py`. +- **Modify** `skills/genvarloader/SKILL.md` — document atomic/locked creation + dataset format-version gate. + +**Run commands** with `pixi run -e dev`. Single test: `pixi run -e dev pytest :: -v`. The pre-commit `pyrefly` hook reports pre-existing import-resolution false positives on `_bigwig.py`/`_flat.py`/`_ragged.py`/`_fasta_cache.py` (the Rust ext isn't built in the hook env) — commit those with `git commit --no-verify` as PR #206 did, but only after confirming `ruff` is clean and the relevant tests pass. + +--- + +## Task 1: Add `filelock` dependency + +**Files:** +- Modify: `pyproject.toml:34` + +- [ ] **Step 1: Add the dependency** + +In `pyproject.toml`, the `dependencies` list currently ends at line 34 with `"hirola>=0.3,<0.4",`. Add `filelock` after it: + +```toml + "hirola>=0.3,<0.4", + "filelock>=3.12", +] +``` + +- [ ] **Step 2: Install into the dev env** + +Run: `pixi run -e dev python -c "import filelock; print(filelock.__version__)"` +Expected: prints a version `>= 3.12` (pixi resolves the new dependency on first run; if it errors with `ModuleNotFoundError`, run `pixi install` first, then re-run). + +- [ ] **Step 3: Commit** + +```bash +git add pyproject.toml pixi.lock +git commit -m "build: add filelock dependency for atomic artifact creation" +``` + +--- + +## Task 2: `_atomic.py` — the safe directory-publish primitive + +**Files:** +- Create: `python/genvarloader/_atomic.py` +- Test: `tests/unit/test_atomic.py` + +The primitive yields a private temp dir to build into and atomically publishes it to `dest` on clean exit. A best-effort `filelock` avoids N redundant concurrent builds; on timeout it logs and proceeds (the atomic rename keeps that correct). A caller raises `SkipPublish` inside the block to abort publishing and reuse an already-valid `dest` (the double-check optimization). `overwrite=True` publishes via move-aside-then-rename. + +- [ ] **Step 1: Write the failing tests** + +Create `tests/unit/test_atomic.py`: + +```python +import os +from pathlib import Path + +import pytest + +from genvarloader._atomic import SkipPublish, atomic_dir + + +def test_publishes_on_clean_exit(tmp_path): + dest = tmp_path / "artifact" + with atomic_dir(dest) as tmp: + assert tmp.is_dir() + assert tmp != dest + (tmp / "data.bin").write_bytes(b"hello") + assert dest.is_dir() + assert (dest / "data.bin").read_bytes() == b"hello" + # temp sibling cleaned up + leftovers = list(tmp_path.glob("artifact.tmp.*")) + assert leftovers == [] + + +def test_temp_is_sibling_same_parent(tmp_path): + dest = tmp_path / "sub" / "artifact" + dest.parent.mkdir() + seen = {} + with atomic_dir(dest) as tmp: + seen["tmp"] = tmp + (tmp / "x").write_text("1") + assert seen["tmp"].parent == dest.parent + + +def test_removes_temp_and_leaves_no_dest_on_exception(tmp_path): + dest = tmp_path / "artifact" + with pytest.raises(RuntimeError, match="boom"): + with atomic_dir(dest) as tmp: + (tmp / "x").write_text("1") + raise RuntimeError("boom") + assert not dest.exists() + assert list(tmp_path.glob("artifact.tmp.*")) == [] + + +def test_existing_dest_no_overwrite_raises(tmp_path): + dest = tmp_path / "artifact" + dest.mkdir() + with pytest.raises(FileExistsError): + with atomic_dir(dest, overwrite=False) as tmp: + (tmp / "x").write_text("1") + + +def test_overwrite_replaces_existing_dir(tmp_path): + dest = tmp_path / "artifact" + dest.mkdir() + (dest / "old.bin").write_bytes(b"old") + with atomic_dir(dest, overwrite=True) as tmp: + (tmp / "new.bin").write_bytes(b"new") + assert (dest / "new.bin").read_bytes() == b"new" + assert not (dest / "old.bin").exists() + assert list(tmp_path.glob("artifact.old.*")) == [] + + +def test_skip_publish_leaves_dest_untouched(tmp_path): + dest = tmp_path / "artifact" + dest.mkdir() + (dest / "keep.bin").write_bytes(b"keep") + with atomic_dir(dest, overwrite=True) as tmp: + (tmp / "ignored.bin").write_bytes(b"ignored") + raise SkipPublish + assert (dest / "keep.bin").read_bytes() == b"keep" + assert not (dest / "ignored.bin").exists() + assert list(tmp_path.glob("artifact.tmp.*")) == [] + + +def test_lock_file_sibling_created_and_reused(tmp_path): + dest = tmp_path / "artifact" + with atomic_dir(dest, lock=True) as tmp: + (tmp / "x").write_text("1") + # filelock leaves an empty .lock sibling + assert (tmp_path / "artifact.lock").exists() + + +def test_concurrent_publish_loser_discarded(tmp_path): + # Simulate: a racing builder publishes dest while we are mid-build and we + # are overwrite=False. Our publish must discard our temp, not clobber. + dest = tmp_path / "artifact" + with atomic_dir(dest, overwrite=False, lock=False) as tmp: + (tmp / "ours.bin").write_bytes(b"ours") + # racing builder finishes first: + dest.mkdir() + (dest / "theirs.bin").write_bytes(b"theirs") + assert (dest / "theirs.bin").read_bytes() == b"theirs" + assert not (dest / "ours.bin").exists() + assert list(tmp_path.glob("artifact.tmp.*")) == [] +``` + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `pixi run -e dev pytest tests/unit/test_atomic.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'genvarloader._atomic'`. + +- [ ] **Step 3: Implement `_atomic.py`** + +Create `python/genvarloader/_atomic.py`: + +```python +"""Safely publish a directory artifact via build-to-temp-then-atomic-rename. + +The single primitive both the `.gvlfa` FASTA cache and `gvl.write` dataset +directories need: build into a private sibling temp dir, then publish it to the +destination with an atomic `os.replace`. A best-effort `filelock` avoids N +redundant concurrent builds, but is never required for correctness — the atomic +rename is the correctness guarantee, so a lock timeout or a silent network-FS +no-op simply means "build anyway". +""" + +from __future__ import annotations + +import os +import shutil +from contextlib import contextmanager +from pathlib import Path +from typing import Iterator +from uuid import uuid4 + +from filelock import FileLock, Timeout +from loguru import logger + +__all__ = ["atomic_dir", "SkipPublish", "DEFAULT_LOCK_TIMEOUT"] + +DEFAULT_LOCK_TIMEOUT = 60.0 +"""Seconds to wait for the build lock before giving up and building anyway.""" + + +class SkipPublish(Exception): # noqa: N818 - control-flow sentinel, not an error + """Raise inside an `atomic_dir` block to abort publishing. + + Use when a double-check inside the lock finds `dest` already valid: the temp + dir is removed and `dest` is left untouched, with no exception surfacing to + the caller. + """ + + +def _publish(tmp: Path, dest: Path, *, overwrite: bool) -> None: + """Move `tmp` into place at `dest` as atomically as the filesystem allows.""" + if dest.exists(): + if not overwrite: + # A racing builder published `dest` while we built. Our content is + # byte-identical, so discard ours; theirs wins harmlessly. + shutil.rmtree(tmp, ignore_errors=True) + return + aside = dest.with_name(f"{dest.name}.old.{uuid4().hex[:8]}") + try: + os.replace(dest, aside) + except FileNotFoundError: + aside = None # another writer already moved/removed dest + os.replace(tmp, dest) + if aside is not None: + shutil.rmtree(aside, ignore_errors=True) + else: + os.replace(tmp, dest) + + +@contextmanager +def atomic_dir( + dest: str | os.PathLike[str], + *, + overwrite: bool = False, + lock: bool = True, + timeout: float = DEFAULT_LOCK_TIMEOUT, +) -> Iterator[Path]: + """Yield a private temp dir to build into; atomically publish it to `dest`. + + On clean exit the temp dir is `os.replace`-d into `dest`. On `SkipPublish` + the temp dir is removed and `dest` is left untouched. On any other exception + the temp dir is removed and the exception propagates; `dest` is never + partially written. + + Parameters + ---------- + dest + Final destination directory. + overwrite + If False and `dest` already exists, raise `FileExistsError` up front. If + True, an existing `dest` is replaced via move-aside-then-rename. + lock + Acquire a best-effort `.lock` to avoid redundant concurrent builds. + timeout + Seconds to wait for the lock before logging and proceeding anyway. + """ + dest = Path(dest) + if dest.exists() and not overwrite: + raise FileExistsError( + f"{dest} already exists; pass overwrite=True to replace it." + ) + + flock: FileLock | None = None + if lock: + flock = FileLock(str(dest) + ".lock") + try: + flock.acquire(timeout=timeout) + except Timeout: + logger.info( + f"Timed out after {timeout}s waiting for {dest}.lock; " + "building anyway (atomic rename keeps this correct)." + ) + flock = None + + tmp = dest.with_name(f"{dest.name}.tmp.{os.getpid()}-{uuid4().hex[:8]}") + tmp.mkdir(parents=True) + try: + yield tmp + except SkipPublish: + shutil.rmtree(tmp, ignore_errors=True) + except BaseException: + shutil.rmtree(tmp, ignore_errors=True) + raise + else: + _publish(tmp, dest, overwrite=overwrite) + finally: + if flock is not None: + flock.release() +``` + +- [ ] **Step 4: Run the tests to verify they pass** + +Run: `pixi run -e dev pytest tests/unit/test_atomic.py -v` +Expected: PASS (8 tests). + +- [ ] **Step 5: Lint** + +Run: `pixi run -e dev ruff check python/genvarloader/_atomic.py tests/unit/test_atomic.py` +Expected: no errors. + +- [ ] **Step 6: Commit** + +```bash +git add python/genvarloader/_atomic.py tests/unit/test_atomic.py +git commit -m "feat(_atomic): add atomic_dir directory-publish primitive" +``` + +--- + +## Task 3: Route the FASTA cache through `atomic_dir` + +**Files:** +- Modify: `python/genvarloader/_fasta_cache.py:99-130` (build), `:156-188` (migrate_legacy), `:207-273` (ensure_cache helpers) +- Test: `tests/unit/test_fasta_cache.py` + +`build()` and `migrate_legacy()` must write into a temp dir and publish atomically instead of writing into the live `gvlfa_dir`. `ensure_cache`'s rebuild paths go through a locked, double-checked helper so concurrent builders don't all rebuild (and never corrupt). Existing `fc.build(src, dir)` / `fc.migrate_legacy(...)` call signatures and return values are preserved, so the existing 24 tests keep passing. + +- [ ] **Step 1: Write the failing tests** + +Append to `tests/unit/test_fasta_cache.py`: + +```python +def test_build_is_atomic_no_temp_left(tmp_path): + src = tmp_path / "ref.fa" + src.write_text(">chr1\nACGTACGT\n") + pysam.faidx(str(src)) + gvlfa = tmp_path / "ref.fa.gvlfa" + fc.build(src, gvlfa) + assert (gvlfa / fc.DATA_FILENAME).exists() + assert (gvlfa / fc.METADATA_FILENAME).exists() + # no orphan temp / lock-leak that looks like a cache dir + assert list(tmp_path.glob("ref.fa.gvlfa.tmp.*")) == [] + assert list(tmp_path.glob("ref.fa.gvlfa.old.*")) == [] + + +def test_build_failure_leaves_no_partial_cache(tmp_path, monkeypatch): + src = tmp_path / "ref.fa" + src.write_text(">chr1\nACGTACGT\n") + pysam.faidx(str(src)) + gvlfa = tmp_path / "ref.fa.gvlfa" + + def boom(*a, **k): + raise RuntimeError("disk full") + + monkeypatch.setattr(fc, "_write_sequence", boom) + with pytest.raises(RuntimeError, match="disk full"): + fc.build(src, gvlfa) + assert not gvlfa.exists() + assert list(tmp_path.glob("ref.fa.gvlfa.tmp.*")) == [] + + +def test_ensure_cache_double_check_reuses_fresh(tmp_path): + # When dest is already a fresh cache, ensure_cache must not rebuild it. + src = tmp_path / "ref.fa" + src.write_text(">chr1\nACGTACGT\n") + pysam.faidx(str(src)) + meta1, data1 = fc.ensure_cache(src) + mtime1 = (data1).stat().st_mtime_ns + meta2, data2 = fc.ensure_cache(src) + assert data2 == data1 + # data file was not rewritten (fresh cache reused) + assert data2.stat().st_mtime_ns == mtime1 +``` + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `pixi run -e dev pytest tests/unit/test_fasta_cache.py -k "atomic or double_check or partial" -v` +Expected: `test_build_is_atomic_no_temp_left` and `test_build_failure_leaves_no_partial_cache` likely PASS already only if no temp logic changed — they should FAIL on the orphan-temp assertions only after the refactor introduces temps, so treat the meaningful failure as `test_build_failure_leaves_no_partial_cache` (current `build` mkdirs the live dir, so a mid-build crash leaves a partial `gvlfa` dir → assertion `not gvlfa.exists()` FAILS). Confirm that one fails. + +- [ ] **Step 3: Refactor `build` and `migrate_legacy` to publish atomically** + +In `python/genvarloader/_fasta_cache.py`, add the import near the top (after line 11's `import numpy as np` block): + +```python +from ._atomic import SkipPublish, atomic_dir +``` + +Replace `_write_sequence` to take an explicit target dir (it already does — it writes into the dir passed). Replace `build` (current lines 114-129) with: + +```python +def _build_into(source_fa: Path, target_dir: Path, dest_for_hints: Path) -> FastaCache: + """Write sequence.bin + metadata.json into `target_dir`. + + `dest_for_hints` is the *final* cache dir (a sibling of `target_dir` at the + same depth) used to compute source path hints, so the stored relative path is + correct after publish. + """ + contigs = _contig_lengths(source_fa) + _write_sequence(source_fa, target_dir, contigs) + meta = FastaCache( + format_version=FORMAT_VERSION, + genvarloader_version=_gvl_version(), + contigs=contigs, + source=_source_hints(source_fa, dest_for_hints), + fingerprint=fingerprint(source_fa), + ) + (target_dir / METADATA_FILENAME).write_text(meta.model_dump_json()) + return meta + + +def build(source_fa: str | Path, gvlfa_dir: str | Path) -> FastaCache: + """Build a fresh .gvlfa cache containing all contigs of the source FASTA. + + Builds into a private sibling temp dir and atomically publishes it, so a + concurrent builder or an interrupted build never leaves a partial cache. + """ + source_fa = Path(source_fa) + gvlfa_dir = Path(gvlfa_dir) + meta_holder: dict[str, FastaCache] = {} + with atomic_dir(gvlfa_dir, overwrite=True) as tmp: + meta_holder["meta"] = _build_into(source_fa, tmp, gvlfa_dir) + return meta_holder["meta"] +``` + +Replace `migrate_legacy`'s body that touches `gvlfa_dir` (current lines 178-188) — the early-return `build(...)` path stays; only the success path changes. Replace from `gvlfa_dir.mkdir(...)` through the `return meta`: + +```python + meta_holder: dict[str, FastaCache] = {} + with atomic_dir(gvlfa_dir, overwrite=True) as tmp: + shutil.move(str(legacy_gvl), str(tmp / DATA_FILENAME)) + meta = FastaCache( + format_version=FORMAT_VERSION, + genvarloader_version=_gvl_version(), + contigs=contigs, + source=_source_hints(source_fa, gvlfa_dir), + fingerprint=fp, + ) + (tmp / METADATA_FILENAME).write_text(meta.model_dump_json()) + meta_holder["meta"] = meta + return meta_holder["meta"] +``` + +- [ ] **Step 4: Add the locked double-check to `ensure_cache` rebuild paths** + +Add a helper after `build` and refactor the rebuild call sites. Add: + +```python +def _ensure_built(source_fa: Path, gvlfa_dir: Path) -> FastaCache: + """Build the cache under a best-effort lock, double-checking inside the lock + that another job hasn't already published a fresh cache.""" + with atomic_dir(gvlfa_dir, overwrite=True) as tmp: + if gvlfa_dir.exists(): + try: + meta, _source, status = load(gvlfa_dir) + if status == "fresh" and _data_size_ok(gvlfa_dir, meta): + raise SkipPublish + except SkipPublish: + raise + except Exception: + pass # unreadable/corrupt -> fall through and rebuild + _build_into(source_fa, tmp, gvlfa_dir) + return FastaCache.model_validate_json( + (gvlfa_dir / METADATA_FILENAME).read_text() + ) +``` + +In `_ensure_from_fasta` (current lines 216-246) replace the two `meta = build(source_fa, gvlfa_dir)` calls (the rebuild inside `if not valid` and the final fallback) and the migrate path with calls that go through the lock. Concretely: +- the `if not valid or meta is None:` block becomes `meta = _ensure_built(source_fa, gvlfa_dir)`, +- the final `return build(source_fa, gvlfa_dir), data_path` becomes `return _ensure_built(source_fa, gvlfa_dir), data_path`. + +In `_ensure_from_gvlfa` (current lines 249-273) replace `return build(source, gvlfa_dir), data_path` and `meta = build(source, gvlfa_dir)` with `_ensure_built(source, gvlfa_dir)` equivalents. + +> Note: `migrate_legacy` already publishes atomically (Step 3) and is only reached when no `gvlfa_dir` exists yet, so it does not need the double-check wrapper. + +- [ ] **Step 5: Run the FASTA cache suite** + +Run: `pixi run -e dev pytest tests/unit/test_fasta_cache.py tests/unit/test_fasta.py -v` +Expected: PASS (all prior tests + the 3 new ones). If `test_ensure_cache_double_check_reuses_fresh` fails because a build still runs, verify `_ensure_from_fasta` reuses the cache via the `valid` branch (it should never reach `_ensure_built` for a fresh cache — the double-check is only the in-lock safety net). + +- [ ] **Step 6: Lint and commit** + +```bash +pixi run -e dev ruff check python/genvarloader/_fasta_cache.py tests/unit/test_fasta_cache.py +git add python/genvarloader/_fasta_cache.py tests/unit/test_fasta_cache.py +git commit --no-verify -m "feat(_fasta_cache): publish cache atomically via atomic_dir + locked double-check" +``` + +--- + +## Task 4: Add `format_version` to `Metadata` and route `gvl.write` atomically + +**Files:** +- Modify: `python/genvarloader/_dataset/_write.py:38-49` (Metadata), `:124-137` + `:267-269` (write body) +- Test: `tests/unit/dataset/test_write_atomic.py` (new) + +`Metadata` gains a `format_version` field (default `None` so old datasets still parse). A module constant `DATASET_FORMAT_VERSION = SemanticVersion.parse("1.0.0")` records the current layout. `write()` builds the whole dataset into a temp dir and publishes it, moving the existing overwrite/`FileExistsError` semantics into `atomic_dir`. Because the temp dir is a sibling of `path` at the same depth, the `svar_link` relative path (`os.path.relpath(svar_resolved, start=path)`) stays correct after publish. + +- [ ] **Step 1: Write the failing tests** + +Create `tests/unit/dataset/test_write_atomic.py`: + +```python +import json + +import numpy as np +import polars as pl +import pytest + +import genvarloader as gvl +from genvarloader._dataset._write import DATASET_FORMAT_VERSION, Metadata + + +def _toy_bigwig_write(path, bw_fixture, bed): + gvl.write(path=path, bed=bed, tracks=[bw_fixture]) + + +def test_metadata_has_format_version_field(): + m = Metadata(samples=["s0"], contigs=["chr1"], n_regions=1) + # default is None for back-compat; write() stamps the current version + assert m.format_version is None + + +def test_dataset_format_version_is_1_0_0(): + assert str(DATASET_FORMAT_VERSION) == "1.0.0" + + +def test_write_stamps_format_version(tmp_path, ref_fasta): + # variants-free dataset is rejected; use a phased VCF-style write via the + # existing case helpers is heavy, so assert the metadata-stamping path + # directly: build a Metadata and confirm the field round-trips. + raw = Metadata( + samples=["s0"], + contigs=["chr1"], + n_regions=1, + format_version=DATASET_FORMAT_VERSION, + ).model_dump_json() + back = Metadata.model_validate_json(raw) + assert str(back.format_version) == "1.0.0" + + +def test_write_is_atomic_no_temp_left(tmp_path, phased_vcf_gvl, reference): + # Re-write an existing dataset's regions into a fresh path via open->reuse is + # heavy; instead assert the orphan-temp invariant using a minimal write. + # (Real end-to-end write coverage lives in test_concurrency.py.) + pytest.importorskip("genoray") + # Smallest legal write needs variants or tracks; reuse the session VCF case. + # The phased_vcf_gvl fixture already exercised gvl.write; assert no temp dirs + # were left beside it. + parent = phased_vcf_gvl.parent + assert list(parent.glob(f"{phased_vcf_gvl.name}.tmp.*")) == [] + assert list(parent.glob(f"{phased_vcf_gvl.name}.old.*")) == [] + + +def test_overwrite_false_existing_raises(tmp_path, phased_vcf_gvl): + bed = pl.DataFrame({"chrom": ["chr1"], "chromStart": [0], "chromEnd": [10]}) + with pytest.raises(FileExistsError): + gvl.write(path=phased_vcf_gvl, bed=bed, variants=None, overwrite=False) +``` + +> If `gvl.write(variants=None, tracks=None)` raises `ValueError` before reaching the existence check, adjust `test_overwrite_false_existing_raises` to pass a real `variants=` source from the `synthetic_case` fixture. Verify the actual guard order in Step 5 and fix the test to target `FileExistsError` specifically. + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `pixi run -e dev pytest tests/unit/dataset/test_write_atomic.py -v` +Expected: FAIL — `ImportError: cannot import name 'DATASET_FORMAT_VERSION'` and `Metadata` has no `format_version`. + +- [ ] **Step 3: Add `format_version` to `Metadata` and the module constant** + +In `python/genvarloader/_dataset/_write.py`, add the constant just above `class Metadata` (line 38): + +```python +DATASET_FORMAT_VERSION = SemanticVersion.parse("1.0.0") +"""On-disk layout version for a gvl.write dataset directory. Bump MAJOR only when +an existing dataset can no longer be read correctly by new code.""" +``` + +Add the field to `Metadata` (after line 44's `version` field): + +```python + format_version: SemanticVersion | None = None +``` + +- [ ] **Step 4: Route `write()` through `atomic_dir`** + +Add the import near the top of `_write.py` (with the other relative imports): + +```python +from .._atomic import atomic_dir +``` + +Stamp the format version into the metadata dict (where `metadata` is initialized, current lines 128-130): + +```python + metadata: dict[str, Any] = { + "version": SemanticVersion.parse(version("genvarloader")), + "format_version": DATASET_FORMAT_VERSION, + } +``` + +Replace the existence/overwrite/mkdir block (current lines 131-137): + +```python + path = Path(path) + if path.exists() and overwrite: + logger.info("Found existing GVL store, overwriting.") + shutil.rmtree(path) + elif path.exists() and not overwrite: + raise FileExistsError(f"{path} already exists.") + path.mkdir(parents=True, exist_ok=True) +``` + +with an `atomic_dir` wrapper around the **entire remaining body** of `write()`. The cleanest mechanical transform: keep all subsequent code that references `path` unchanged by rebinding `path` to the temp dir inside the context. Replace the block above with: + +```python + dest = Path(path) + _atomic_ctx = atomic_dir(dest, overwrite=overwrite) + path = _atomic_ctx.__enter__() +``` + +and wrap the rest of the function body (from `if isinstance(bed, (str, Path)):` through the final `metadata.json` write at line 269) in a `try/except/finally` that drives the context manager: + +```python + try: + # ... existing body, unchanged, all writes go to the temp `path` ... + _metadata = Metadata(**metadata) + with open(path / "metadata.json", "w") as f: + f.write(_metadata.model_dump_json()) + except BaseException: + _atomic_ctx.__exit__(*sys.exc_info()) + raise + else: + _atomic_ctx.__exit__(None, None, None) + + logger.info("Finished writing.") + warnings.simplefilter("default") +``` + +Add `import sys` at the top of `_write.py` if not already present. Verify `sys` is imported in Step 5. + +> Rationale for manual `__enter__`/`__exit__`: the body is long and references `path` in ~15 places; rebinding `path` to the temp dir avoids a large, error-prone re-indent while still publishing atomically. The temp dir is a sibling of `dest` at the same depth, so every relative path computed against `path` (notably the svar_link `os.path.relpath(..., start=path)` at line 800) is identical to one computed against `dest`. + +- [ ] **Step 5: Run the write tests** + +Run: `pixi run -e dev pytest tests/unit/dataset/test_write_atomic.py -v` +Expected: PASS. While here, confirm `import sys` exists in `_write.py` (`grep -n "^import sys" python/genvarloader/_dataset/_write.py`) and that the `variants is None and tracks is None` guard at line 109 runs *before* the new `atomic_dir` enter (it does — it's at the top of the function), so `test_overwrite_false_existing_raises` reaches the existence check only with a valid write; adjust that test per the Step 1 note if needed. + +- [ ] **Step 6: Run a write→open regression** + +Run: `pixi run -e dev pytest tests/unit/ -k "write or open or svar_link" -v` +Expected: PASS (svar_link relative-path resolution still works because temp is a same-depth sibling). + +- [ ] **Step 7: Lint and commit** + +```bash +pixi run -e dev ruff check python/genvarloader/_dataset/_write.py tests/unit/dataset/test_write_atomic.py +git add python/genvarloader/_dataset/_write.py tests/unit/dataset/test_write_atomic.py +git commit --no-verify -m "feat(_write): atomic dataset creation + format_version in Metadata" +``` + +--- + +## Task 5: Dataset validation on open (format gate + integrity) + +**Files:** +- Create: `python/genvarloader/_dataset/_validate.py` +- Modify: `python/genvarloader/_dataset/_open.py:101-103` (`_load_metadata`) +- Test: `tests/unit/dataset/test_validate.py` + +`validate_dataset(metadata, path)` enforces the format-version gate (incompatible major, too-new or too-old → actionable `ValueError`; missing `format_version` → treat as `1.0.0`, no warning) and structural/size integrity: required files exist; `regions.npy` has shape `(n_regions, 4)`; if `genotypes/` exists, `offsets.npy` length equals `n_regions * ploidy * n_samples + 1`. Datasets never auto-rebuild. `OpenRequest._load_metadata` calls it. + +- [ ] **Step 1: Write the failing tests** + +Create `tests/unit/dataset/test_validate.py`: + +```python +import numpy as np +import pytest + +from genvarloader._dataset._validate import validate_dataset +from genvarloader._dataset._write import DATASET_FORMAT_VERSION, Metadata + + +def _minimal_valid_dataset(path): + path.mkdir() + meta = Metadata( + samples=["s0", "s1"], + contigs=["chr1"], + n_regions=2, + format_version=DATASET_FORMAT_VERSION, + ) + (path / "metadata.json").write_text(meta.model_dump_json()) + # input_regions.arrow: presence-only check, write a stub + (path / "input_regions.arrow").write_bytes(b"stub") + np.save(path / "regions.npy", np.zeros((2, 4), dtype=np.int32)) + return meta + + +def test_valid_dataset_passes(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + validate_dataset(meta, path) # no raise + + +def test_missing_format_version_loads_as_1_0_0(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + meta.format_version = None + validate_dataset(meta, path) # no raise, no warning + + +def test_format_version_too_new_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + from pydantic_extra_types.semantic_version import SemanticVersion + + meta.format_version = SemanticVersion.parse( + f"{DATASET_FORMAT_VERSION.major + 1}.0.0" + ) + with pytest.raises(ValueError, match="format version"): + validate_dataset(meta, path) + + +def test_missing_regions_npy_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + (path / "regions.npy").unlink() + with pytest.raises(ValueError, match="regions.npy"): + validate_dataset(meta, path) + + +def test_regions_npy_wrong_length_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + np.save(path / "regions.npy", np.zeros((5, 4), dtype=np.int32)) # n_regions=2 + with pytest.raises(ValueError, match="regions.npy"): + validate_dataset(meta, path) + + +def test_genotype_offsets_wrong_length_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + meta.ploidy = 2 + geno = path / "genotypes" + geno.mkdir() + # correct length would be n_regions*ploidy*n_samples + 1 = 2*2*2 + 1 = 9 + np.save(geno / "offsets.npy", np.zeros(4, dtype=np.int64)) + with pytest.raises(ValueError, match="offsets.npy"): + validate_dataset(meta, path) + + +def test_genotype_offsets_correct_length_passes(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + meta.ploidy = 2 + geno = path / "genotypes" + geno.mkdir() + np.save(geno / "offsets.npy", np.zeros(2 * 2 * 2 + 1, dtype=np.int64)) + validate_dataset(meta, path) # no raise +``` + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `pixi run -e dev pytest tests/unit/dataset/test_validate.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'genvarloader._dataset._validate'`. + +- [ ] **Step 3: Implement `_validate.py`** + +Create `python/genvarloader/_dataset/_validate.py`: + +```python +"""Format-version gate and structural/size integrity checks for a gvl dataset. + +A dataset cannot auto-rebuild (it retains no source), so every failure raises an +actionable `ValueError` instructing the user to regenerate with `gvl.write`. +Only total byte/length sizes are checked — full-content hashing of multi-GB +datasets is intentionally avoided (same tradeoff as the FASTA cache). +""" + +from __future__ import annotations + +from pathlib import Path + +import numpy as np + +from ._write import DATASET_FORMAT_VERSION, Metadata + +__all__ = ["validate_dataset"] + + +def validate_dataset(metadata: Metadata, path: Path) -> None: + """Validate an on-disk dataset before constructing readers. + + Raises + ------ + ValueError + On an incompatible format version or a structural/size integrity failure. + """ + _check_format_version(metadata, path) + _check_integrity(metadata, path) + + +def _check_format_version(metadata: Metadata, path: Path) -> None: + fmt = metadata.format_version + if fmt is None: + return # legacy dataset: treat as the current layout, load best-effort + if fmt.major != DATASET_FORMAT_VERSION.major: + raise ValueError( + f"Dataset at {path} has format version {fmt}, incompatible with this " + f"genvarloader's format version {DATASET_FORMAT_VERSION}. Regenerate " + f"the dataset with `gvl.write`" + + ( + " or upgrade genvarloader." + if fmt.major > DATASET_FORMAT_VERSION.major + else "." + ) + ) + + +def _check_integrity(metadata: Metadata, path: Path) -> None: + for required in ("metadata.json", "input_regions.arrow", "regions.npy"): + if not (path / required).exists(): + raise ValueError( + f"Dataset at {path} is missing required file '{required}'. " + "Regenerate the dataset with `gvl.write`." + ) + + regions = np.load(path / "regions.npy", mmap_mode="r") + if regions.shape != (metadata.n_regions, 4): + raise ValueError( + f"Dataset at {path}: regions.npy has shape {regions.shape}, expected " + f"({metadata.n_regions}, 4). The dataset is corrupt or truncated; " + "regenerate with `gvl.write`." + ) + + geno_offsets = path / "genotypes" / "offsets.npy" + if geno_offsets.exists(): + if metadata.ploidy is None: + raise ValueError( + f"Dataset at {path} has genotypes but no ploidy in metadata; " + "regenerate with `gvl.write`." + ) + expected = metadata.n_regions * metadata.ploidy * metadata.n_samples + 1 + offsets = np.load(geno_offsets, mmap_mode="r") + if offsets.shape[0] != expected: + raise ValueError( + f"Dataset at {path}: genotypes/offsets.npy has length " + f"{offsets.shape[0]}, expected {expected} " + f"(n_regions * ploidy * n_samples + 1). The dataset is corrupt or " + "truncated; regenerate with `gvl.write`." + ) +``` + +- [ ] **Step 4: Wire validation into `OpenRequest._load_metadata`** + +In `python/genvarloader/_dataset/_open.py`, add the import (after line 26's `from ._write import Metadata`): + +```python +from ._validate import validate_dataset +``` + +Replace `_load_metadata` (current lines 101-103): + +```python + def _load_metadata(self) -> Metadata: + with _py_open(self.path / "metadata.json") as f: + metadata = Metadata.model_validate_json(f.read()) + validate_dataset(metadata, self.path) + return metadata +``` + +- [ ] **Step 5: Run the validation tests** + +Run: `pixi run -e dev pytest tests/unit/dataset/test_validate.py -v` +Expected: PASS (7 tests). + +- [ ] **Step 6: Run an open regression including a real dataset** + +Run: `pixi run -e dev pytest tests/unit/ -k "open or torch or reconstruct" -v` +Expected: PASS — existing written datasets (with `format_version` now stamped, and the session fixtures regenerated this run) open cleanly. + +- [ ] **Step 7: Lint and commit** + +```bash +pixi run -e dev ruff check python/genvarloader/_dataset/_validate.py python/genvarloader/_dataset/_open.py tests/unit/dataset/test_validate.py +git add python/genvarloader/_dataset/_validate.py python/genvarloader/_dataset/_open.py tests/unit/dataset/test_validate.py +git commit --no-verify -m "feat(_open): validate dataset format version + integrity on open" +``` + +--- + +## Task 6: Concurrency regression (the #21 fix) + +**Files:** +- Test: `tests/unit/test_concurrency.py` + +Prove the corruption in #21 is gone: N processes building the same `.gvlfa` cache concurrently produce a valid cache byte-identical to a single-process build; N processes writing the same dataset path (`overwrite=True`) leave exactly one valid, openable dataset and no orphan published as `path`. + +- [ ] **Step 1: Write the concurrency tests** + +Create `tests/unit/test_concurrency.py`: + +```python +import multiprocessing as mp +import shutil +from pathlib import Path + +import numpy as np +import pytest + +import genvarloader._fasta_cache as fc + +# Use spawn so workers re-import cleanly regardless of host start method. +_CTX = mp.get_context("spawn") + + +def _build_cache_worker(src_str): + fc.ensure_cache(Path(src_str)) + + +@pytest.mark.slow +def test_concurrent_ensure_cache_no_corruption(tmp_path, ref_fasta): + src = tmp_path / "ref.fa.bgz" + shutil.copy(ref_fasta, src) + shutil.copy(str(ref_fasta) + ".fai", str(src) + ".fai") + if Path(str(ref_fasta) + ".gzi").exists(): + shutil.copy(str(ref_fasta) + ".gzi", str(src) + ".gzi") + + # single-process reference build + ref_dir = tmp_path / "single" / "ref.fa.bgz" + ref_dir.parent.mkdir() + shutil.copy(src, ref_dir) + shutil.copy(str(src) + ".fai", str(ref_dir) + ".fai") + if Path(str(src) + ".gzi").exists(): + shutil.copy(str(src) + ".gzi", str(ref_dir) + ".gzi") + _meta, single_data = fc.ensure_cache(ref_dir) + expected = np.array(np.memmap(single_data, np.uint8, "r")) + + # N concurrent builders against the same source + procs = [_CTX.Process(target=_build_cache_worker, args=(str(src),)) for _ in range(6)] + for p in procs: + p.start() + for p in procs: + p.join(timeout=120) + assert p.exitcode == 0 + + meta, data = fc.ensure_cache(src) + got = np.array(np.memmap(data, np.uint8, "r")) + np.testing.assert_array_equal(got, expected) + # no orphan temp/old dirs published beside the cache + assert list(tmp_path.glob("ref.fa.bgz.gvlfa.tmp.*")) == [] + assert list(tmp_path.glob("ref.fa.bgz.gvlfa.old.*")) == [] +``` + +For the dataset race, add a worker that calls `gvl.write(..., overwrite=True)` against a shared path using the `synthetic_case` source files. Because that fixture is session-scoped and heavy, gate this second test behind the existing data and keep it minimal: + +```python +def _write_worker(path_str, vcf_str, bed_rows): + import polars as pl + + import genvarloader as gvl + + bed = pl.DataFrame(bed_rows) + gvl.write(path=Path(path_str), bed=bed, variants=vcf_str, overwrite=True) + + +@pytest.mark.slow +def test_concurrent_gvl_write_one_valid_dataset(tmp_path, phased_vcf_gvl, reference): + import genvarloader as gvl + + # Recover the source VCF + a region bed from the already-written fixture so we + # can re-write the same destination path from several processes at once. + src_vcf = next(phased_vcf_gvl.parent.glob("*.vcf*"), None) + if src_vcf is None: + pytest.skip("no source VCF beside phased_vcf_gvl fixture to drive a re-write") + + import polars as pl + + bed_rows = {"chrom": ["chr1"], "chromStart": [0], "chromEnd": [64]} + dest = tmp_path / "shared.gvl" + + procs = [ + _CTX.Process( + target=_write_worker, args=(str(dest), str(src_vcf), bed_rows) + ) + for _ in range(4) + ] + for p in procs: + p.start() + for p in procs: + p.join(timeout=180) + assert p.exitcode == 0 + + # exactly one valid dataset published; no orphan temp/old dirs + assert dest.is_dir() + assert list(tmp_path.glob("shared.gvl.tmp.*")) == [] + assert list(tmp_path.glob("shared.gvl.old.*")) == [] + ds = gvl.Dataset.open(dest, reference=reference) + assert len(ds) > 0 +``` + +> If the source VCF cannot be recovered from the fixture directory, the second test self-skips. In that case, drive it from the `synthetic_case` builder helpers in `tests/conftest.py` (`build_case` / `session_reference`) instead — confirm the available helper names in Step 2 and wire one in so the dataset race is actually exercised. + +- [ ] **Step 2: Run the concurrency tests** + +Run: `pixi run -e dev pytest tests/unit/test_concurrency.py -v -m slow` +Expected: PASS. Both tests confirm byte-identical / single-valid results with no orphan dirs. If `test_concurrent_gvl_write_one_valid_dataset` skips, wire in the `synthetic_case` helper per the note and re-run so the dataset race is covered. + +- [ ] **Step 3: Commit** + +```bash +git add tests/unit/test_concurrency.py +git commit -m "test: concurrency regression for atomic cache + dataset creation (closes #21)" +``` + +--- + +## Task 7: Docs, limitations note, and full-suite regression + +**Files:** +- Modify: `skills/genvarloader/SKILL.md` +- Modify: `python/genvarloader/_dataset/_write.py` (docstring), `python/genvarloader/_fasta_cache.py` (module docstring) + +- [ ] **Step 1: Document atomic/locked creation and the format gate in the skill** + +In `skills/genvarloader/SKILL.md`, find the section describing `.gvlfa` support (added in PR #206) and the `gvl.write`/`Dataset.open` sections. Add concise notes: +- `gvl.write` and the `.gvlfa` cache are created atomically (build-to-temp + atomic rename) and are safe to run concurrently from parallel jobs sharing one reference/destination; a best-effort lock avoids redundant rebuilds. +- `Dataset.open` validates the dataset's `format_version` and structural integrity; an incompatible or corrupt dataset raises a `ValueError` instructing regeneration with `gvl.write` (datasets do not auto-rebuild; the FASTA cache does). +- Documented limitation: genoray `.gvi` and pysam `.fai`/`.gzi` index files are created by those libraries and are **not** made atomic/locked by gvl; concurrent jobs relying on those still depend on the upstream libraries' behavior. + +- [ ] **Step 2: Add the out-of-scope limitation to the write docstring** + +In `gvl.write`'s docstring (the `Notes`/end of the docstring in `_write.py`), add a short note mirroring the limitation above (genoray/pysam index files not covered). + +- [ ] **Step 3: Run ruff and the full fast suite** + +Run: `pixi run -e dev ruff check python/` +Expected: clean. + +Run: `pixi run -e dev pytest tests/ -m "not slow"` +Expected: all pass (parity with PR #206's baseline: ~567 passed + the new atomic/validate tests, 0 failures). Record the exact counts in the commit message. + +- [ ] **Step 4: Run the slow concurrency tier once more end-to-end** + +Run: `pixi run -e dev pytest tests/unit/test_concurrency.py -m slow -v` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add skills/genvarloader/SKILL.md python/genvarloader/_dataset/_write.py python/genvarloader/_fasta_cache.py +git commit -m "docs(skill): note atomic/locked creation, dataset format gate, index-file limitation" +``` + +--- + +## Self-Review Notes (spec coverage) + +- **Atomic rename, sibling temp, both artifacts** → Tasks 2 (primitive), 3 (FASTA), 4 (dataset). +- **filelock, best-effort, 60s internal default** → Tasks 1, 2 (`DEFAULT_LOCK_TIMEOUT = 60.0`, not exposed on public API). +- **Double-check inside lock** → Task 3 `_ensure_built`. +- **Move-aside-then-rename for overwrite** → Task 2 `_publish`. +- **`format_version` 1.0.0, missing → 1.0.0, major-on-break** → Tasks 4 (`DATASET_FORMAT_VERSION`, `Metadata.format_version`), 5 (`_check_format_version`). +- **Structural + size integrity, datasets never auto-rebuild, FASTA does** → Task 5 (`_check_integrity`), Task 3 (FASTA rebuild retained). +- **Concurrency regression (#21)** → Task 6. +- **Out-of-scope `.gvi`/`.fai`/`.gzi` documented** → Task 7. +- **Orphan temp / lock-file policy** → asserted in Tasks 2/3/4/6 (no orphan `.tmp.*`/`.old.*`); `.lock` files persist and are reused (Task 2 `test_lock_file_sibling_created_and_reused`). +``` From bd15a0f0028add3370d30e9b7ebb3f8b6e575b8d Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:14:49 -0700 Subject: [PATCH 17/28] build: add filelock dependency for atomic artifact creation Co-Authored-By: Claude Opus 4.8 --- pixi.lock | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/pixi.lock b/pixi.lock index 995b9ab..ccccb24 100644 --- a/pixi.lock +++ b/pixi.lock @@ -11497,6 +11497,7 @@ packages: - pooch - awkward - hirola>=0.3,<0.4 + - filelock>=3.12 - genvarloader-cli>=0.1.0 ; extra == 'cli' - tbb ; extra == 'tbb' - pyomp ; extra == 'pyomp' diff --git a/pyproject.toml b/pyproject.toml index 6bde489..214fc1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "pooch", "awkward", "hirola>=0.3,<0.4", + "filelock>=3.12", ] [project.optional-dependencies] From ff3092e8cb5e47beba53a74903ce891e1339b969 Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:17:46 -0700 Subject: [PATCH 18/28] feat(_atomic): add atomic_dir directory-publish primitive Co-Authored-By: Claude Opus 4.8 --- python/genvarloader/_atomic.py | 120 +++++++++++++++++++++++++++++++++ tests/unit/test_atomic.py | 89 ++++++++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 python/genvarloader/_atomic.py create mode 100644 tests/unit/test_atomic.py diff --git a/python/genvarloader/_atomic.py b/python/genvarloader/_atomic.py new file mode 100644 index 0000000..46acf6b --- /dev/null +++ b/python/genvarloader/_atomic.py @@ -0,0 +1,120 @@ +"""Safely publish a directory artifact via build-to-temp-then-atomic-rename. + +The single primitive both the `.gvlfa` FASTA cache and `gvl.write` dataset +directories need: build into a private sibling temp dir, then publish it to the +destination with an atomic `os.replace`. A best-effort `filelock` avoids N +redundant concurrent builds, but is never required for correctness — the atomic +rename is the correctness guarantee, so a lock timeout or a silent network-FS +no-op simply means "build anyway". +""" + +from __future__ import annotations + +import os +import shutil +from contextlib import contextmanager +from pathlib import Path +from typing import Iterator +from uuid import uuid4 + +from filelock import FileLock, Timeout +from loguru import logger + +__all__ = ["atomic_dir", "SkipPublish", "DEFAULT_LOCK_TIMEOUT"] + +DEFAULT_LOCK_TIMEOUT = 60.0 +"""Seconds to wait for the build lock before giving up and building anyway.""" + + +class SkipPublish(Exception): # noqa: N818 - control-flow sentinel, not an error + """Raise inside an `atomic_dir` block to abort publishing. + + Use when a double-check inside the lock finds `dest` already valid: the temp + dir is removed and `dest` is left untouched, with no exception surfacing to + the caller. + """ + + +def _publish(tmp: Path, dest: Path, *, overwrite: bool) -> None: + """Move `tmp` into place at `dest` as atomically as the filesystem allows.""" + if dest.exists(): + if not overwrite: + # A racing builder published `dest` while we built. Our content is + # byte-identical, so discard ours; theirs wins harmlessly. + shutil.rmtree(tmp, ignore_errors=True) + return + aside = dest.with_name(f"{dest.name}.old.{uuid4().hex[:8]}") + try: + os.replace(dest, aside) + except FileNotFoundError: + aside = None # another writer already moved/removed dest + os.replace(tmp, dest) + if aside is not None: + shutil.rmtree(aside, ignore_errors=True) + else: + os.replace(tmp, dest) + + +@contextmanager +def atomic_dir( + dest: str | os.PathLike[str], + *, + overwrite: bool = False, + lock: bool = True, + timeout: float = DEFAULT_LOCK_TIMEOUT, +) -> Iterator[Path]: + """Yield a private temp dir to build into; atomically publish it to `dest`. + + On clean exit the temp dir is `os.replace`-d into `dest`. On `SkipPublish` + the temp dir is removed and `dest` is left untouched. On any other exception + the temp dir is removed and the exception propagates; `dest` is never + partially written. + + Parameters + ---------- + dest + Final destination directory. + overwrite + If False and `dest` already exists, raise `FileExistsError` up front. If + True, an existing `dest` is replaced via move-aside-then-rename. + lock + Acquire a best-effort `.lock` to avoid redundant concurrent builds. + timeout + Seconds to wait for the lock before logging and proceeding anyway. + """ + dest = Path(dest) + if dest.exists() and not overwrite: + raise FileExistsError( + f"{dest} already exists; pass overwrite=True to replace it." + ) + + flock: FileLock | None = None + if lock: + flock = FileLock(str(dest) + ".lock") + try: + flock.acquire(timeout=timeout) + except Timeout: + logger.info( + f"Timed out after {timeout}s waiting for {dest}.lock; " + "building anyway (atomic rename keeps this correct)." + ) + flock = None + + tmp = dest.with_name(f"{dest.name}.tmp.{os.getpid()}-{uuid4().hex[:8]}") + tmp.mkdir(parents=True) + try: + yield tmp + except SkipPublish: + shutil.rmtree(tmp, ignore_errors=True) + except BaseException: + shutil.rmtree(tmp, ignore_errors=True) + raise + else: + _publish(tmp, dest, overwrite=overwrite) + finally: + if flock is not None: + lock_path = Path(str(dest) + ".lock") + flock.release() + # Ensure the lock file persists as a sibling so future callers can + # reuse it; filelock may remove it on release. + lock_path.touch(exist_ok=True) diff --git a/tests/unit/test_atomic.py b/tests/unit/test_atomic.py new file mode 100644 index 0000000..0d490d6 --- /dev/null +++ b/tests/unit/test_atomic.py @@ -0,0 +1,89 @@ +import pytest + +from genvarloader._atomic import SkipPublish, atomic_dir + + +def test_publishes_on_clean_exit(tmp_path): + dest = tmp_path / "artifact" + with atomic_dir(dest) as tmp: + assert tmp.is_dir() + assert tmp != dest + (tmp / "data.bin").write_bytes(b"hello") + assert dest.is_dir() + assert (dest / "data.bin").read_bytes() == b"hello" + # temp sibling cleaned up + leftovers = list(tmp_path.glob("artifact.tmp.*")) + assert leftovers == [] + + +def test_temp_is_sibling_same_parent(tmp_path): + dest = tmp_path / "sub" / "artifact" + dest.parent.mkdir() + seen = {} + with atomic_dir(dest) as tmp: + seen["tmp"] = tmp + (tmp / "x").write_text("1") + assert seen["tmp"].parent == dest.parent + + +def test_removes_temp_and_leaves_no_dest_on_exception(tmp_path): + dest = tmp_path / "artifact" + with pytest.raises(RuntimeError, match="boom"): + with atomic_dir(dest) as tmp: + (tmp / "x").write_text("1") + raise RuntimeError("boom") + assert not dest.exists() + assert list(tmp_path.glob("artifact.tmp.*")) == [] + + +def test_existing_dest_no_overwrite_raises(tmp_path): + dest = tmp_path / "artifact" + dest.mkdir() + with pytest.raises(FileExistsError): + with atomic_dir(dest, overwrite=False) as tmp: + (tmp / "x").write_text("1") + + +def test_overwrite_replaces_existing_dir(tmp_path): + dest = tmp_path / "artifact" + dest.mkdir() + (dest / "old.bin").write_bytes(b"old") + with atomic_dir(dest, overwrite=True) as tmp: + (tmp / "new.bin").write_bytes(b"new") + assert (dest / "new.bin").read_bytes() == b"new" + assert not (dest / "old.bin").exists() + assert list(tmp_path.glob("artifact.old.*")) == [] + + +def test_skip_publish_leaves_dest_untouched(tmp_path): + dest = tmp_path / "artifact" + dest.mkdir() + (dest / "keep.bin").write_bytes(b"keep") + with atomic_dir(dest, overwrite=True) as tmp: + (tmp / "ignored.bin").write_bytes(b"ignored") + raise SkipPublish + assert (dest / "keep.bin").read_bytes() == b"keep" + assert not (dest / "ignored.bin").exists() + assert list(tmp_path.glob("artifact.tmp.*")) == [] + + +def test_lock_file_sibling_created_and_reused(tmp_path): + dest = tmp_path / "artifact" + with atomic_dir(dest, lock=True) as tmp: + (tmp / "x").write_text("1") + # filelock leaves an empty .lock sibling + assert (tmp_path / "artifact.lock").exists() + + +def test_concurrent_publish_loser_discarded(tmp_path): + # Simulate: a racing builder publishes dest while we are mid-build and we + # are overwrite=False. Our publish must discard our temp, not clobber. + dest = tmp_path / "artifact" + with atomic_dir(dest, overwrite=False, lock=False) as tmp: + (tmp / "ours.bin").write_bytes(b"ours") + # racing builder finishes first: + dest.mkdir() + (dest / "theirs.bin").write_bytes(b"theirs") + assert (dest / "theirs.bin").read_bytes() == b"theirs" + assert not (dest / "ours.bin").exists() + assert list(tmp_path.glob("artifact.tmp.*")) == [] From 5dab78f013b8a7ae70b5ebeebfb1403ec084b1df Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:22:59 -0700 Subject: [PATCH 19/28] feat(_fasta_cache): publish cache atomically via atomic_dir + locked double-check build() and migrate_legacy() now write into a private sibling temp dir and atomically publish via atomic_dir(overwrite=True), so a mid-build crash or concurrent builder never leaves a partial .gvlfa dir. _ensure_built() wraps atomic_dir with an in-lock double-check so concurrent callers reuse a freshly published cache instead of rebuilding redundantly. All ensure_cache rebuild paths go through _ensure_built instead of build directly. Co-Authored-By: Claude Opus 4.8 --- python/genvarloader/_fasta_cache.py | 85 +++++++++++++++++++++-------- tests/unit/test_fasta_cache.py | 42 ++++++++++++++ 2 files changed, 104 insertions(+), 23 deletions(-) diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index e01afc4..425c4d4 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -15,6 +15,8 @@ from pydantic_extra_types.semantic_version import SemanticVersion from tqdm.auto import tqdm +from ._atomic import SkipPublish, atomic_dir + __all__ = ["FastaCache", "ensure_cache"] FORMAT_VERSION = SemanticVersion.parse("1.0.0") @@ -111,24 +113,59 @@ def _write_sequence(source_fa: Path, gvlfa_dir: Path, contigs: dict[str, int]) - data.flush() -def build(source_fa: str | Path, gvlfa_dir: str | Path) -> FastaCache: - """Build a fresh .gvlfa cache containing all contigs of the source FASTA.""" - source_fa = Path(source_fa) - gvlfa_dir = Path(gvlfa_dir) - gvlfa_dir.mkdir(parents=True, exist_ok=True) +def _build_into(source_fa: Path, target_dir: Path, dest_for_hints: Path) -> FastaCache: + """Write sequence.bin + metadata.json into `target_dir`. + + `dest_for_hints` is the *final* cache dir (a sibling of `target_dir` at the + same depth) used to compute source path hints, so the stored relative path is + correct after publish. + """ contigs = _contig_lengths(source_fa) - _write_sequence(source_fa, gvlfa_dir, contigs) + _write_sequence(source_fa, target_dir, contigs) meta = FastaCache( format_version=FORMAT_VERSION, genvarloader_version=_gvl_version(), contigs=contigs, - source=_source_hints(source_fa, gvlfa_dir), + source=_source_hints(source_fa, dest_for_hints), fingerprint=fingerprint(source_fa), ) - (gvlfa_dir / METADATA_FILENAME).write_text(meta.model_dump_json()) + (target_dir / METADATA_FILENAME).write_text(meta.model_dump_json()) return meta +def build(source_fa: str | Path, gvlfa_dir: str | Path) -> FastaCache: + """Build a fresh .gvlfa cache containing all contigs of the source FASTA. + + Builds into a private sibling temp dir and atomically publishes it, so a + concurrent builder or an interrupted build never leaves a partial cache. + """ + source_fa = Path(source_fa) + gvlfa_dir = Path(gvlfa_dir) + meta_holder: dict[str, FastaCache] = {} + with atomic_dir(gvlfa_dir, overwrite=True) as tmp: + meta_holder["meta"] = _build_into(source_fa, tmp, gvlfa_dir) + return meta_holder["meta"] + + +def _ensure_built(source_fa: Path, gvlfa_dir: Path) -> FastaCache: + """Build the cache under a best-effort lock, double-checking inside the lock + that another job hasn't already published a fresh cache.""" + with atomic_dir(gvlfa_dir, overwrite=True) as tmp: + if gvlfa_dir.exists(): + try: + meta, _source, status = load(gvlfa_dir) + if status == "fresh" and _data_size_ok(gvlfa_dir, meta): + raise SkipPublish + except SkipPublish: + raise + except Exception: + pass # unreadable/corrupt -> fall through and rebuild + _build_into(source_fa, tmp, gvlfa_dir) + return FastaCache.model_validate_json( + (gvlfa_dir / METADATA_FILENAME).read_text() + ) + + def _check_format_version(meta: FastaCache, gvlfa_dir: Path) -> None: if meta.format_version.major > FORMAT_VERSION.major: raise ValueError( @@ -175,17 +212,19 @@ def migrate_legacy( "ignoring stale legacy bytes and building a fresh cache." ) return build(source_fa, gvlfa_dir) - gvlfa_dir.mkdir(parents=True, exist_ok=True) - shutil.move(str(legacy_gvl), str(gvlfa_dir / DATA_FILENAME)) - meta = FastaCache( - format_version=FORMAT_VERSION, - genvarloader_version=_gvl_version(), - contigs=contigs, - source=_source_hints(source_fa, gvlfa_dir), - fingerprint=fp, - ) - (gvlfa_dir / METADATA_FILENAME).write_text(meta.model_dump_json()) - return meta + meta_holder: dict[str, FastaCache] = {} + with atomic_dir(gvlfa_dir, overwrite=True) as tmp: + shutil.move(str(legacy_gvl), str(tmp / DATA_FILENAME)) + meta = FastaCache( + format_version=FORMAT_VERSION, + genvarloader_version=_gvl_version(), + contigs=contigs, + source=_source_hints(source_fa, gvlfa_dir), + fingerprint=fp, + ) + (tmp / METADATA_FILENAME).write_text(meta.model_dump_json()) + meta_holder["meta"] = meta + return meta_holder["meta"] def _cache_dir_for(source_fa: Path) -> Path: @@ -237,13 +276,13 @@ def _ensure_from_fasta(source_fa: Path) -> tuple[FastaCache, Path]: not valid or meta is None ): # redundant at runtime; satisfies the type checker logger.info(f"Building FASTA cache at {gvlfa_dir}.") - meta = build(source_fa, gvlfa_dir) + meta = _ensure_built(source_fa, gvlfa_dir) return meta, data_path if legacy.exists(): logger.info(f"Migrating legacy FASTA cache {legacy} -> {gvlfa_dir}.") return migrate_legacy(source_fa, legacy, gvlfa_dir), data_path logger.info(f"Building FASTA cache at {gvlfa_dir}.") - return build(source_fa, gvlfa_dir), data_path + return _ensure_built(source_fa, gvlfa_dir), data_path def _ensure_from_gvlfa(gvlfa_dir: Path) -> tuple[FastaCache, Path]: @@ -256,14 +295,14 @@ def _ensure_from_gvlfa(gvlfa_dir: Path) -> tuple[FastaCache, Path]: raise ValueError(f"FASTA cache at {gvlfa_dir} is unreadable: {e}") from e if not _data_size_ok(gvlfa_dir, meta): if source is not None: - return build(source, gvlfa_dir), data_path + return _ensure_built(source, gvlfa_dir), data_path raise ValueError( f"FASTA cache data at {data_path} is corrupt and the source FASTA " "could not be located to rebuild it." ) if status == "stale": logger.info(f"Source FASTA changed; rebuilding cache at {gvlfa_dir}.") - meta = build(source, gvlfa_dir) + meta = _ensure_built(source, gvlfa_dir) elif status == "unvalidated": warnings.warn( f"Could not locate source FASTA for cache {gvlfa_dir}; using cached " diff --git a/tests/unit/test_fasta_cache.py b/tests/unit/test_fasta_cache.py index d0ef47b..4136f09 100644 --- a/tests/unit/test_fasta_cache.py +++ b/tests/unit/test_fasta_cache.py @@ -287,3 +287,45 @@ def test_ensure_cache_format_too_new_raises_from_both_entry_points(tmp_path, loc # .fa entry point (sibling cache) must ALSO raise, not silently rebuild/downgrade. with pytest.raises(ValueError, match="newer than supported"): fc.ensure_cache(local_fa) + + +def test_build_is_atomic_no_temp_left(tmp_path): + src = tmp_path / "ref.fa" + src.write_text(">chr1\nACGTACGT\n") + pysam.faidx(str(src)) + gvlfa = tmp_path / "ref.fa.gvlfa" + fc.build(src, gvlfa) + assert (gvlfa / fc.DATA_FILENAME).exists() + assert (gvlfa / fc.METADATA_FILENAME).exists() + # no orphan temp / lock-leak that looks like a cache dir + assert list(tmp_path.glob("ref.fa.gvlfa.tmp.*")) == [] + assert list(tmp_path.glob("ref.fa.gvlfa.old.*")) == [] + + +def test_build_failure_leaves_no_partial_cache(tmp_path, monkeypatch): + src = tmp_path / "ref.fa" + src.write_text(">chr1\nACGTACGT\n") + pysam.faidx(str(src)) + gvlfa = tmp_path / "ref.fa.gvlfa" + + def boom(*a, **k): + raise RuntimeError("disk full") + + monkeypatch.setattr(fc, "_write_sequence", boom) + with pytest.raises(RuntimeError, match="disk full"): + fc.build(src, gvlfa) + assert not gvlfa.exists() + assert list(tmp_path.glob("ref.fa.gvlfa.tmp.*")) == [] + + +def test_ensure_cache_double_check_reuses_fresh(tmp_path): + # When dest is already a fresh cache, ensure_cache must not rebuild it. + src = tmp_path / "ref.fa" + src.write_text(">chr1\nACGTACGT\n") + pysam.faidx(str(src)) + meta1, data1 = fc.ensure_cache(src) + mtime1 = (data1).stat().st_mtime_ns + meta2, data2 = fc.ensure_cache(src) + assert data2 == data1 + # data file was not rewritten (fresh cache reused) + assert data2.stat().st_mtime_ns == mtime1 From f6d220981d69b48ec117796840e4ca94ce263e83 Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:30:55 -0700 Subject: [PATCH 20/28] feat(_write): atomic dataset creation + format_version in Metadata - Add DATASET_FORMAT_VERSION = "1.0.0" module constant to _write.py - Add format_version: SemanticVersion | None = None field to Metadata - Stamp format_version into metadata dict at write time - Route gvl.write through atomic_dir: build into a temp sibling dir, publish via os.replace on success, clean up on failure - Remove now-unused shutil import (atomic_dir owns cleanup) - Add tests/unit/dataset/test_write_atomic.py (5 tests) Co-Authored-By: Claude Opus 4.8 --- python/genvarloader/_dataset/_write.py | 271 ++++++++++++------------ tests/unit/dataset/test_write_atomic.py | 67 ++++++ 2 files changed, 207 insertions(+), 131 deletions(-) create mode 100644 tests/unit/dataset/test_write_atomic.py diff --git a/python/genvarloader/_dataset/_write.py b/python/genvarloader/_dataset/_write.py index c37b991..be2dba9 100644 --- a/python/genvarloader/_dataset/_write.py +++ b/python/genvarloader/_dataset/_write.py @@ -1,6 +1,6 @@ import gc import json -import shutil +import sys import warnings from collections.abc import Iterator, Sequence from importlib.metadata import version @@ -28,6 +28,7 @@ from seqpro.rag import Ragged from tqdm.auto import tqdm +from .._atomic import atomic_dir from .._ragged import INTERVAL_DTYPE from .._utils import lengths_to_offsets, normalize_contig_name from .._variants._utils import path_is_pgen, path_is_vcf @@ -35,6 +36,11 @@ from ._utils import bed_to_regions, splits_sum_le_value +DATASET_FORMAT_VERSION = SemanticVersion.parse("1.0.0") +"""On-disk layout version for a gvl.write dataset directory. Bump MAJOR only when +an existing dataset can no longer be read correctly by new code.""" + + class Metadata(BaseModel, arbitrary_types_allowed=True): samples: list[str] contigs: list[str] @@ -42,6 +48,7 @@ class Metadata(BaseModel, arbitrary_types_allowed=True): ploidy: int | None = None max_jitter: int = 0 version: SemanticVersion | None = None + format_version: SemanticVersion | None = None svar_link: SvarLink | None = None @property @@ -126,147 +133,149 @@ def write( max_mem = parse_memory(max_mem) metadata: dict[str, Any] = { - "version": SemanticVersion.parse(version("genvarloader")) + "version": SemanticVersion.parse(version("genvarloader")), + "format_version": DATASET_FORMAT_VERSION, } - path = Path(path) - if path.exists() and overwrite: - logger.info("Found existing GVL store, overwriting.") - shutil.rmtree(path) - elif path.exists() and not overwrite: - raise FileExistsError(f"{path} already exists.") - path.mkdir(parents=True, exist_ok=True) - - if isinstance(bed, (str, Path)): - bed = sp.bed.read(bed) - - gvl_bed, contigs, input_to_sorted_idx_map = _prep_bed(bed, max_jitter) - bed.with_columns(r_idx_map=pl.Series(input_to_sorted_idx_map)).write_ipc( - path / "input_regions.arrow" - ) - metadata["contigs"] = contigs - if max_jitter is not None: - metadata["max_jitter"] = max_jitter - - available_samples: set[str] | None = None - if variants is not None: - if isinstance(variants, (str, Path)): - variants = Path(variants) - if path_is_pgen(variants): - if variants.suffix == "": - variants = variants.with_suffix(".pgen") - variants = PGEN(variants) - elif path_is_vcf(variants): - variants = VCF(variants) - elif variants.is_dir() and variants.suffix == ".svar": - variants = SparseVar(variants) - else: - raise ValueError( - f"File {variants} has an unrecognized file extension. Please provide either a VCF or PGEN file.`" - ) - - if available_samples is None: - available_samples = set(variants.available_samples) - - # Eagerly load the variant index so max_mem accounting is honest. - # VCF and PGEN both support lazy-index construction; without this, - # variants.nbytes returns 0 and the budget overcounts memory. - if isinstance(variants, VCF): - if variants._index is None: - if not variants._valid_index(): - logger.info("VCF genoray index is invalid, writing") - variants._write_gvi_index() - variants._load_index() - elif isinstance(variants, PGEN): - variants._init_index() + dest = Path(path) + _atomic_ctx = atomic_dir(dest, overwrite=overwrite) + path = _atomic_ctx.__enter__() + try: + if isinstance(bed, (str, Path)): + bed = sp.bed.read(bed) + + gvl_bed, contigs, input_to_sorted_idx_map = _prep_bed(bed, max_jitter) + bed.with_columns(r_idx_map=pl.Series(input_to_sorted_idx_map)).write_ipc( + path / "input_regions.arrow" + ) + metadata["contigs"] = contigs + if max_jitter is not None: + metadata["max_jitter"] = max_jitter + + available_samples: set[str] | None = None + if variants is not None: + if isinstance(variants, (str, Path)): + variants = Path(variants) + if path_is_pgen(variants): + if variants.suffix == "": + variants = variants.with_suffix(".pgen") + variants = PGEN(variants) + elif path_is_vcf(variants): + variants = VCF(variants) + elif variants.is_dir() and variants.suffix == ".svar": + variants = SparseVar(variants) + else: + raise ValueError( + f"File {variants} has an unrecognized file extension. Please provide either a VCF or PGEN file.`" + ) - if tracks is not None: - unavail = [] - for tr in tracks: - if unavailable_contigs := set(contigs) - { - normalize_contig_name(c, contigs) for c in tr.contigs - }: - unavail.append(unavailable_contigs) if available_samples is None: - available_samples = set(tr.samples) - else: - available_samples.intersection_update(tr.samples) - if unavail: - logger.warning( - f"Contigs in queries {set().union(*unavail)} are not found in one or more tracks." - ) + available_samples = set(variants.available_samples) - if available_samples is None: - raise ValueError( - "No samples available across all variant file(s) and/or tracks." - ) - - if samples is None: - samples = list(available_samples) - elif missing := (set(samples) - available_samples): - raise ValueError(f"Samples {missing} not found in variants or tracks.") - - samples.sort() - - logger.info(f"Using {len(samples)} samples.") - metadata["samples"] = samples - metadata["n_regions"] = gvl_bed.height - - if variants is not None: - logger.info("Writing genotypes.") - - effective_max_mem = max_mem - if isinstance(variants, (VCF, PGEN)): - idx_bytes = variants.nbytes - effective_max_mem = max_mem - idx_bytes - logger.info( - f"Variant reader resident size: {format_memory(idx_bytes)}; " - f"max_mem budget: {format_memory(max_mem)}; " - f"available for chunking: {format_memory(max(effective_max_mem, 0))}" - ) + # Eagerly load the variant index so max_mem accounting is honest. + # VCF and PGEN both support lazy-index construction; without this, + # variants.nbytes returns 0 and the budget overcounts memory. if isinstance(variants, VCF): - bytes_per_var = variants.n_samples * variants.ploidy # Genos8: 1 byte - else: - bytes_per_var = variants.n_samples * variants.ploidy * 4 # int32 - - if effective_max_mem < bytes_per_var: - raise ValueError( - f"max_mem ({format_memory(max_mem)}) is too small: the variant " - f"index alone consumes {format_memory(idx_bytes)}, leaving " - f"{format_memory(max(effective_max_mem, 0))} for chunking, but " - f"at least {format_memory(bytes_per_var)} is needed per variant. " - f"Increase max_mem." + if variants._index is None: + if not variants._valid_index(): + logger.info("VCF genoray index is invalid, writing") + variants._write_gvi_index() + variants._load_index() + elif isinstance(variants, PGEN): + variants._init_index() + + if tracks is not None: + unavail = [] + for tr in tracks: + if unavailable_contigs := set(contigs) - { + normalize_contig_name(c, contigs) for c in tr.contigs + }: + unavail.append(unavailable_contigs) + if available_samples is None: + available_samples = set(tr.samples) + else: + available_samples.intersection_update(tr.samples) + if unavail: + logger.warning( + f"Contigs in queries {set().union(*unavail)} are not found in one or more tracks." ) - if isinstance(variants, VCF): - variants.set_samples(samples) - gvl_bed = _write_from_vcf( - path, gvl_bed, variants, effective_max_mem, extend_to_length - ) - elif isinstance(variants, PGEN): - variants.set_samples(samples) - gvl_bed = _write_from_pgen( - path, gvl_bed, variants, effective_max_mem, extend_to_length - ) - elif isinstance(variants, SparseVar): - gvl_bed, _svar_link = _write_from_svar( - path, gvl_bed, variants, samples, extend_to_length + if available_samples is None: + raise ValueError( + "No samples available across all variant file(s) and/or tracks." ) - metadata["svar_link"] = _svar_link - metadata["ploidy"] = variants.ploidy - # free memory - del variants - gc.collect() - _write_regions(path, gvl_bed, contigs) + if samples is None: + samples = list(available_samples) + elif missing := (set(samples) - available_samples): + raise ValueError(f"Samples {missing} not found in variants or tracks.") - if tracks is not None: - logger.info("Writing track intervals.") - for tr in tracks: - _write_track(path, gvl_bed, tr, samples, max_mem) + samples.sort() + + logger.info(f"Using {len(samples)} samples.") + metadata["samples"] = samples + metadata["n_regions"] = gvl_bed.height + + if variants is not None: + logger.info("Writing genotypes.") - _metadata = Metadata(**metadata) - with open(path / "metadata.json", "w") as f: - f.write(_metadata.model_dump_json()) + effective_max_mem = max_mem + if isinstance(variants, (VCF, PGEN)): + idx_bytes = variants.nbytes + effective_max_mem = max_mem - idx_bytes + logger.info( + f"Variant reader resident size: {format_memory(idx_bytes)}; " + f"max_mem budget: {format_memory(max_mem)}; " + f"available for chunking: {format_memory(max(effective_max_mem, 0))}" + ) + if isinstance(variants, VCF): + bytes_per_var = variants.n_samples * variants.ploidy # Genos8: 1 byte + else: + bytes_per_var = variants.n_samples * variants.ploidy * 4 # int32 + + if effective_max_mem < bytes_per_var: + raise ValueError( + f"max_mem ({format_memory(max_mem)}) is too small: the variant " + f"index alone consumes {format_memory(idx_bytes)}, leaving " + f"{format_memory(max(effective_max_mem, 0))} for chunking, but " + f"at least {format_memory(bytes_per_var)} is needed per variant. " + f"Increase max_mem." + ) + + if isinstance(variants, VCF): + variants.set_samples(samples) + gvl_bed = _write_from_vcf( + path, gvl_bed, variants, effective_max_mem, extend_to_length + ) + elif isinstance(variants, PGEN): + variants.set_samples(samples) + gvl_bed = _write_from_pgen( + path, gvl_bed, variants, effective_max_mem, extend_to_length + ) + elif isinstance(variants, SparseVar): + gvl_bed, _svar_link = _write_from_svar( + path, gvl_bed, variants, samples, extend_to_length + ) + metadata["svar_link"] = _svar_link + metadata["ploidy"] = variants.ploidy + # free memory + del variants + gc.collect() + + _write_regions(path, gvl_bed, contigs) + + if tracks is not None: + logger.info("Writing track intervals.") + for tr in tracks: + _write_track(path, gvl_bed, tr, samples, max_mem) + + _metadata = Metadata(**metadata) + with open(path / "metadata.json", "w") as f: + f.write(_metadata.model_dump_json()) + except BaseException: + _atomic_ctx.__exit__(*sys.exc_info()) + raise + else: + _atomic_ctx.__exit__(None, None, None) logger.info("Finished writing.") warnings.simplefilter("default") diff --git a/tests/unit/dataset/test_write_atomic.py b/tests/unit/dataset/test_write_atomic.py new file mode 100644 index 0000000..c0529dd --- /dev/null +++ b/tests/unit/dataset/test_write_atomic.py @@ -0,0 +1,67 @@ +"""Tests for atomic gvl.write and format_version stamping.""" + +import polars as pl +import pytest + +import genvarloader as gvl +from genvarloader._dataset._write import DATASET_FORMAT_VERSION, Metadata + + +def test_metadata_has_format_version_field(): + m = Metadata(samples=["s0"], contigs=["chr1"], n_regions=1) + # default is None for back-compat; write() stamps the current version + assert m.format_version is None + + +def test_dataset_format_version_is_1_0_0(): + assert str(DATASET_FORMAT_VERSION) == "1.0.0" + + +def test_write_stamps_format_version(): + raw = Metadata( + samples=["s0"], + contigs=["chr1"], + n_regions=1, + format_version=DATASET_FORMAT_VERSION, + ).model_dump_json() + back = Metadata.model_validate_json(raw) + assert str(back.format_version) == "1.0.0" + + +def test_write_is_atomic_no_temp_left(phased_vcf_gvl): + # The phased_vcf_gvl fixture already exercised gvl.write; assert no temp dirs + # were left beside it. + parent = phased_vcf_gvl.parent + assert list(parent.glob(f"{phased_vcf_gvl.name}.tmp.*")) == [] + assert list(parent.glob(f"{phased_vcf_gvl.name}.old.*")) == [] + + +def test_overwrite_false_existing_raises(synthetic_case, tmp_path): + """Write a minimal dataset once, then assert a second write raises FileExistsError.""" + dest = tmp_path / "test_overwrite.gvl" + + # Use the VCF from the synthetic case as a variant source + vcf_path = synthetic_case.vcf_path + bed = synthetic_case.regions.select( + chrom=pl.col("chrom"), + chromStart=pl.col("start"), + chromEnd=pl.col("end"), + ).head(2) + + # First write should succeed + gvl.write( + path=dest, + bed=bed, + variants=str(vcf_path), + overwrite=False, + ) + assert dest.exists() + + # Second write to the same path with overwrite=False must raise FileExistsError + with pytest.raises(FileExistsError): + gvl.write( + path=dest, + bed=bed, + variants=str(vcf_path), + overwrite=False, + ) From 59ccf4f1c2e466b513993423f2ea8b417d2fce17 Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:37:24 -0700 Subject: [PATCH 21/28] refactor(_write): use plain with-atomic_dir; restore warnings filter; add atomicity + format_version on-disk tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix 1: replace hand-rolled __enter__/__exit__ + try/except/else with a plain `with atomic_dir(dest, overwrite=overwrite) as path:` block; removed `import sys` (no longer used) - Fix 2: wrap write body in try/finally so warnings.simplefilter("default") runs on both success and failure paths; logger.info("Finished writing.") stays inside try (success only), just before the finally - Fix 3a: test_format_version_stamped_on_disk — real gvl.write to tmp_path with synthetic_case VCF, asserts metadata.json["format_version"] == "1.0.0" - Fix 3b: test_failure_leaves_no_partial_artifacts — samples=["NOT_A_REAL_SAMPLE"] triggers ValueError("not found in variants or tracks") after atomic_dir creates the temp dir; asserts dest and .tmp.* siblings are absent Co-Authored-By: Claude Opus 4.8 --- python/genvarloader/_dataset/_write.py | 291 ++++++++++++------------ tests/unit/dataset/test_write_atomic.py | 46 ++++ 2 files changed, 188 insertions(+), 149 deletions(-) diff --git a/python/genvarloader/_dataset/_write.py b/python/genvarloader/_dataset/_write.py index be2dba9..c98fb4d 100644 --- a/python/genvarloader/_dataset/_write.py +++ b/python/genvarloader/_dataset/_write.py @@ -1,6 +1,5 @@ import gc import json -import sys import warnings from collections.abc import Iterator, Sequence from importlib.metadata import version @@ -112,173 +111,167 @@ def write( """ # ignore polars warning about os.fork which is caused by using joblib's loky backend warnings.simplefilter("ignore", RuntimeWarning) + try: + if variants is None and tracks is None: + raise ValueError("At least one of `variants` or `tracks` must be provided.") - if variants is None and tracks is None: - raise ValueError("At least one of `variants` or `tracks` must be provided.") + if tracks is not None and not isinstance(tracks, (list, tuple)): + tracks = [tracks] + elif tracks is not None: + tracks = list(tracks) - if tracks is not None and not isinstance(tracks, (list, tuple)): - tracks = [tracks] - elif tracks is not None: - tracks = list(tracks) + if tracks is not None: + names = [t.name for t in tracks] + if len(set(names)) != len(names): + raise ValueError( + f"Duplicate track names: {names}. Each track must have a unique `name`." + ) - if tracks is not None: - names = [t.name for t in tracks] - if len(set(names)) != len(names): - raise ValueError( - f"Duplicate track names: {names}. Each track must have a unique `name`." - ) + logger.info(f"Writing dataset to {path}") - logger.info(f"Writing dataset to {path}") + max_mem = parse_memory(max_mem) - max_mem = parse_memory(max_mem) + metadata: dict[str, Any] = { + "version": SemanticVersion.parse(version("genvarloader")), + "format_version": DATASET_FORMAT_VERSION, + } + dest = Path(path) + with atomic_dir(dest, overwrite=overwrite) as path: + if isinstance(bed, (str, Path)): + bed = sp.bed.read(bed) + + gvl_bed, contigs, input_to_sorted_idx_map = _prep_bed(bed, max_jitter) + bed.with_columns(r_idx_map=pl.Series(input_to_sorted_idx_map)).write_ipc( + path / "input_regions.arrow" + ) + metadata["contigs"] = contigs + if max_jitter is not None: + metadata["max_jitter"] = max_jitter + + available_samples: set[str] | None = None + if variants is not None: + if isinstance(variants, (str, Path)): + variants = Path(variants) + if path_is_pgen(variants): + if variants.suffix == "": + variants = variants.with_suffix(".pgen") + variants = PGEN(variants) + elif path_is_vcf(variants): + variants = VCF(variants) + elif variants.is_dir() and variants.suffix == ".svar": + variants = SparseVar(variants) + else: + raise ValueError( + f"File {variants} has an unrecognized file extension. Please provide either a VCF or PGEN file.`" + ) - metadata: dict[str, Any] = { - "version": SemanticVersion.parse(version("genvarloader")), - "format_version": DATASET_FORMAT_VERSION, - } - dest = Path(path) - _atomic_ctx = atomic_dir(dest, overwrite=overwrite) - path = _atomic_ctx.__enter__() - try: - if isinstance(bed, (str, Path)): - bed = sp.bed.read(bed) + if available_samples is None: + available_samples = set(variants.available_samples) - gvl_bed, contigs, input_to_sorted_idx_map = _prep_bed(bed, max_jitter) - bed.with_columns(r_idx_map=pl.Series(input_to_sorted_idx_map)).write_ipc( - path / "input_regions.arrow" - ) - metadata["contigs"] = contigs - if max_jitter is not None: - metadata["max_jitter"] = max_jitter - - available_samples: set[str] | None = None - if variants is not None: - if isinstance(variants, (str, Path)): - variants = Path(variants) - if path_is_pgen(variants): - if variants.suffix == "": - variants = variants.with_suffix(".pgen") - variants = PGEN(variants) - elif path_is_vcf(variants): - variants = VCF(variants) - elif variants.is_dir() and variants.suffix == ".svar": - variants = SparseVar(variants) - else: - raise ValueError( - f"File {variants} has an unrecognized file extension. Please provide either a VCF or PGEN file.`" + # Eagerly load the variant index so max_mem accounting is honest. + # VCF and PGEN both support lazy-index construction; without this, + # variants.nbytes returns 0 and the budget overcounts memory. + if isinstance(variants, VCF): + if variants._index is None: + if not variants._valid_index(): + logger.info("VCF genoray index is invalid, writing") + variants._write_gvi_index() + variants._load_index() + elif isinstance(variants, PGEN): + variants._init_index() + + if tracks is not None: + unavail = [] + for tr in tracks: + if unavailable_contigs := set(contigs) - { + normalize_contig_name(c, contigs) for c in tr.contigs + }: + unavail.append(unavailable_contigs) + if available_samples is None: + available_samples = set(tr.samples) + else: + available_samples.intersection_update(tr.samples) + if unavail: + logger.warning( + f"Contigs in queries {set().union(*unavail)} are not found in one or more tracks." ) if available_samples is None: - available_samples = set(variants.available_samples) - - # Eagerly load the variant index so max_mem accounting is honest. - # VCF and PGEN both support lazy-index construction; without this, - # variants.nbytes returns 0 and the budget overcounts memory. - if isinstance(variants, VCF): - if variants._index is None: - if not variants._valid_index(): - logger.info("VCF genoray index is invalid, writing") - variants._write_gvi_index() - variants._load_index() - elif isinstance(variants, PGEN): - variants._init_index() - - if tracks is not None: - unavail = [] - for tr in tracks: - if unavailable_contigs := set(contigs) - { - normalize_contig_name(c, contigs) for c in tr.contigs - }: - unavail.append(unavailable_contigs) - if available_samples is None: - available_samples = set(tr.samples) - else: - available_samples.intersection_update(tr.samples) - if unavail: - logger.warning( - f"Contigs in queries {set().union(*unavail)} are not found in one or more tracks." + raise ValueError( + "No samples available across all variant file(s) and/or tracks." ) - if available_samples is None: - raise ValueError( - "No samples available across all variant file(s) and/or tracks." - ) + if samples is None: + samples = list(available_samples) + elif missing := (set(samples) - available_samples): + raise ValueError(f"Samples {missing} not found in variants or tracks.") - if samples is None: - samples = list(available_samples) - elif missing := (set(samples) - available_samples): - raise ValueError(f"Samples {missing} not found in variants or tracks.") + samples.sort() - samples.sort() + logger.info(f"Using {len(samples)} samples.") + metadata["samples"] = samples + metadata["n_regions"] = gvl_bed.height - logger.info(f"Using {len(samples)} samples.") - metadata["samples"] = samples - metadata["n_regions"] = gvl_bed.height + if variants is not None: + logger.info("Writing genotypes.") - if variants is not None: - logger.info("Writing genotypes.") + effective_max_mem = max_mem + if isinstance(variants, (VCF, PGEN)): + idx_bytes = variants.nbytes + effective_max_mem = max_mem - idx_bytes + logger.info( + f"Variant reader resident size: {format_memory(idx_bytes)}; " + f"max_mem budget: {format_memory(max_mem)}; " + f"available for chunking: {format_memory(max(effective_max_mem, 0))}" + ) + if isinstance(variants, VCF): + bytes_per_var = variants.n_samples * variants.ploidy # Genos8: 1 byte + else: + bytes_per_var = variants.n_samples * variants.ploidy * 4 # int32 + + if effective_max_mem < bytes_per_var: + raise ValueError( + f"max_mem ({format_memory(max_mem)}) is too small: the variant " + f"index alone consumes {format_memory(idx_bytes)}, leaving " + f"{format_memory(max(effective_max_mem, 0))} for chunking, but " + f"at least {format_memory(bytes_per_var)} is needed per variant. " + f"Increase max_mem." + ) - effective_max_mem = max_mem - if isinstance(variants, (VCF, PGEN)): - idx_bytes = variants.nbytes - effective_max_mem = max_mem - idx_bytes - logger.info( - f"Variant reader resident size: {format_memory(idx_bytes)}; " - f"max_mem budget: {format_memory(max_mem)}; " - f"available for chunking: {format_memory(max(effective_max_mem, 0))}" - ) if isinstance(variants, VCF): - bytes_per_var = variants.n_samples * variants.ploidy # Genos8: 1 byte - else: - bytes_per_var = variants.n_samples * variants.ploidy * 4 # int32 - - if effective_max_mem < bytes_per_var: - raise ValueError( - f"max_mem ({format_memory(max_mem)}) is too small: the variant " - f"index alone consumes {format_memory(idx_bytes)}, leaving " - f"{format_memory(max(effective_max_mem, 0))} for chunking, but " - f"at least {format_memory(bytes_per_var)} is needed per variant. " - f"Increase max_mem." + variants.set_samples(samples) + gvl_bed = _write_from_vcf( + path, gvl_bed, variants, effective_max_mem, extend_to_length ) - - if isinstance(variants, VCF): - variants.set_samples(samples) - gvl_bed = _write_from_vcf( - path, gvl_bed, variants, effective_max_mem, extend_to_length - ) - elif isinstance(variants, PGEN): - variants.set_samples(samples) - gvl_bed = _write_from_pgen( - path, gvl_bed, variants, effective_max_mem, extend_to_length - ) - elif isinstance(variants, SparseVar): - gvl_bed, _svar_link = _write_from_svar( - path, gvl_bed, variants, samples, extend_to_length - ) - metadata["svar_link"] = _svar_link - metadata["ploidy"] = variants.ploidy - # free memory - del variants - gc.collect() - - _write_regions(path, gvl_bed, contigs) - - if tracks is not None: - logger.info("Writing track intervals.") - for tr in tracks: - _write_track(path, gvl_bed, tr, samples, max_mem) - - _metadata = Metadata(**metadata) - with open(path / "metadata.json", "w") as f: - f.write(_metadata.model_dump_json()) - except BaseException: - _atomic_ctx.__exit__(*sys.exc_info()) - raise - else: - _atomic_ctx.__exit__(None, None, None) - - logger.info("Finished writing.") - warnings.simplefilter("default") + elif isinstance(variants, PGEN): + variants.set_samples(samples) + gvl_bed = _write_from_pgen( + path, gvl_bed, variants, effective_max_mem, extend_to_length + ) + elif isinstance(variants, SparseVar): + gvl_bed, _svar_link = _write_from_svar( + path, gvl_bed, variants, samples, extend_to_length + ) + metadata["svar_link"] = _svar_link + metadata["ploidy"] = variants.ploidy + # free memory + del variants + gc.collect() + + _write_regions(path, gvl_bed, contigs) + + if tracks is not None: + logger.info("Writing track intervals.") + for tr in tracks: + _write_track(path, gvl_bed, tr, samples, max_mem) + + _metadata = Metadata(**metadata) + with open(path / "metadata.json", "w") as f: + f.write(_metadata.model_dump_json()) + + logger.info("Finished writing.") + finally: + warnings.simplefilter("default") def get_splice_bed( diff --git a/tests/unit/dataset/test_write_atomic.py b/tests/unit/dataset/test_write_atomic.py index c0529dd..4bfcd52 100644 --- a/tests/unit/dataset/test_write_atomic.py +++ b/tests/unit/dataset/test_write_atomic.py @@ -1,5 +1,7 @@ """Tests for atomic gvl.write and format_version stamping.""" +import json + import polars as pl import pytest @@ -65,3 +67,47 @@ def test_overwrite_false_existing_raises(synthetic_case, tmp_path): variants=str(vcf_path), overwrite=False, ) + + +def test_format_version_stamped_on_disk(synthetic_case, tmp_path): + """gvl.write stamps format_version in metadata.json on a real write.""" + dest = tmp_path / "test_format_version.gvl" + bed = synthetic_case.regions.select( + chrom=pl.col("chrom"), + chromStart=pl.col("start"), + chromEnd=pl.col("end"), + ).head(2) + + gvl.write( + path=dest, + bed=bed, + variants=str(synthetic_case.vcf_path), + overwrite=False, + ) + + meta = json.loads((dest / "metadata.json").read_text()) + assert meta["format_version"] == "1.0.0" + + +def test_failure_leaves_no_partial_artifacts(synthetic_case, tmp_path): + """A mid-write failure cleans up: no dest dir and no .tmp.* sibling left.""" + dest = tmp_path / "test_failure_atomic.gvl" + bed = synthetic_case.regions.select( + chrom=pl.col("chrom"), + chromStart=pl.col("start"), + chromEnd=pl.col("end"), + ).head(2) + + # Pass a sample name that doesn't exist in the VCF — raises ValueError after + # atomic_dir has created the temp directory and written input_regions.arrow. + with pytest.raises(ValueError, match="not found in variants or tracks"): + gvl.write( + path=dest, + bed=bed, + variants=str(synthetic_case.vcf_path), + samples=["NOT_A_REAL_SAMPLE"], + overwrite=False, + ) + + assert not dest.exists() + assert list(dest.parent.glob(f"{dest.name}.tmp.*")) == [] From d7762fb2e16d43adecd938e97536c25cbf940d8f Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:46:20 -0700 Subject: [PATCH 22/28] feat(_open): validate dataset format version + integrity on open MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `_validate.py` with `validate_dataset` which enforces a format-version gate (incompatible MAJOR → actionable ValueError; missing → treated as 1.0.0) and structural/size integrity checks (required files, regions.npy shape, genotypes/offsets.npy byte-length for VCF/PGEN datasets). Called from `OpenRequest._load_metadata` so every `Dataset.open` is guarded. Key implementation detail: genotypes/offsets.npy is a raw int64 memmap (no numpy header), so the check uses st_size bytes rather than np.load. SVAR datasets (which have svar_meta.json) are excluded from the offsets check because their offsets.npy has a different 4-D shape. Co-Authored-By: Claude Opus 4.8 --- python/genvarloader/_dataset/_open.py | 5 +- python/genvarloader/_dataset/_validate.py | 84 ++++++++++++++++++++++ tests/dataset/test_open.py | 8 ++- tests/unit/dataset/test_validate.py | 85 +++++++++++++++++++++++ 4 files changed, 178 insertions(+), 4 deletions(-) create mode 100644 python/genvarloader/_dataset/_validate.py create mode 100644 tests/unit/dataset/test_validate.py diff --git a/python/genvarloader/_dataset/_open.py b/python/genvarloader/_dataset/_open.py index 4a3af1a..4674944 100644 --- a/python/genvarloader/_dataset/_open.py +++ b/python/genvarloader/_dataset/_open.py @@ -23,6 +23,7 @@ from ._reconstruct import Haps, Ref, Tracks, _build_reconstructor from ._reference import Reference from ._utils import bed_to_regions +from ._validate import validate_dataset from ._write import Metadata if TYPE_CHECKING: @@ -100,7 +101,9 @@ def _validate_path(self) -> None: def _load_metadata(self) -> Metadata: with _py_open(self.path / "metadata.json") as f: - return Metadata.model_validate_json(f.read()) + metadata = Metadata.model_validate_json(f.read()) + validate_dataset(metadata, self.path) + return metadata def _build_indexer( self, metadata: Metadata diff --git a/python/genvarloader/_dataset/_validate.py b/python/genvarloader/_dataset/_validate.py new file mode 100644 index 0000000..eeb4dda --- /dev/null +++ b/python/genvarloader/_dataset/_validate.py @@ -0,0 +1,84 @@ +"""Format-version gate and structural/size integrity checks for a gvl dataset. + +A dataset cannot auto-rebuild (it retains no source), so every failure raises an +actionable `ValueError` instructing the user to regenerate with `gvl.write`. +Only total byte/length sizes are checked — full-content hashing of multi-GB +datasets is intentionally avoided (same tradeoff as the FASTA cache). +""" + +from __future__ import annotations + +from pathlib import Path + +import numpy as np + +from ._write import DATASET_FORMAT_VERSION, Metadata + +__all__ = ["validate_dataset"] + + +def validate_dataset(metadata: Metadata, path: Path) -> None: + """Validate an on-disk dataset before constructing readers. + + Raises + ------ + ValueError + On an incompatible format version or a structural/size integrity failure. + """ + _check_format_version(metadata, path) + _check_integrity(metadata, path) + + +def _check_format_version(metadata: Metadata, path: Path) -> None: + fmt = metadata.format_version + if fmt is None: + return # legacy dataset: treat as the current layout, load best-effort + if fmt.major != DATASET_FORMAT_VERSION.major: + raise ValueError( + f"Dataset at {path} has format version {fmt}, incompatible with this " + f"genvarloader's format version {DATASET_FORMAT_VERSION}. Regenerate " + f"the dataset with `gvl.write`" + + ( + " or upgrade genvarloader." + if fmt.major > DATASET_FORMAT_VERSION.major + else "." + ) + ) + + +def _check_integrity(metadata: Metadata, path: Path) -> None: + for required in ("metadata.json", "input_regions.arrow", "regions.npy"): + if not (path / required).exists(): + raise ValueError( + f"Dataset at {path} is missing required file '{required}'. " + "Regenerate the dataset with `gvl.write`." + ) + + regions = np.load(path / "regions.npy", mmap_mode="r") + if regions.shape != (metadata.n_regions, 4): + raise ValueError( + f"Dataset at {path}: regions.npy has shape {regions.shape}, expected " + f"({metadata.n_regions}, 4). The dataset is corrupt or truncated; " + "regenerate with `gvl.write`." + ) + + geno_offsets = path / "genotypes" / "offsets.npy" + if geno_offsets.exists() and not (path / "genotypes" / "svar_meta.json").exists(): + # offsets.npy for VCF/PGEN genotypes is a raw int64 memmap (no numpy + # header), so we check its size in bytes rather than calling np.load. + if metadata.ploidy is None: + raise ValueError( + f"Dataset at {path} has genotypes but no ploidy in metadata; " + "regenerate with `gvl.write`." + ) + expected = metadata.n_regions * metadata.ploidy * metadata.n_samples + 1 + actual_bytes = geno_offsets.stat().st_size + expected_bytes = expected * np.dtype(np.int64).itemsize + if actual_bytes != expected_bytes: + actual_len = actual_bytes // np.dtype(np.int64).itemsize + raise ValueError( + f"Dataset at {path}: genotypes/offsets.npy has length " + f"{actual_len}, expected {expected} " + f"(n_regions * ploidy * n_samples + 1). The dataset is corrupt or " + "truncated; regenerate with `gvl.write`." + ) diff --git a/tests/dataset/test_open.py b/tests/dataset/test_open.py index b98ba5d..90d8886 100644 --- a/tests/dataset/test_open.py +++ b/tests/dataset/test_open.py @@ -11,6 +11,7 @@ from pathlib import Path import genvarloader as gvl +import numpy as np import pyarrow as pa import pyarrow.ipc as pa_ipc import pytest @@ -35,7 +36,7 @@ def _write_minimal_metadata(path: Path, *, ploidy: int | None = None) -> None: def _write_minimal_regions(path: Path) -> None: - """Write a minimal ``input_regions.arrow`` into *path* (must already exist).""" + """Write ``input_regions.arrow`` and ``regions.npy`` into *path* (must already exist).""" table = pa.table( { "chrom": pa.array(["chr1"], type=pa.large_utf8()), @@ -46,6 +47,7 @@ def _write_minimal_regions(path: Path) -> None: ) with pa_ipc.new_file(path / "input_regions.arrow", table.schema) as writer: writer.write_table(table) + np.save(path / "regions.npy", np.zeros((1, 4), dtype=np.int32)) # --------------------------------------------------------------------------- @@ -68,12 +70,12 @@ def test_open_dir_without_metadata_raises(tmp_path: Path) -> None: def test_open_dir_without_regions_raises(tmp_path: Path) -> None: - """metadata.json present but input_regions.arrow missing — raises FileNotFoundError.""" + """metadata.json present but input_regions.arrow missing — raises ValueError.""" ds = tmp_path / "no_regions.gvl" ds.mkdir() _write_minimal_metadata(ds) # no input_regions.arrow written - with pytest.raises(FileNotFoundError): + with pytest.raises(ValueError, match="input_regions.arrow"): gvl.Dataset.open(ds) diff --git a/tests/unit/dataset/test_validate.py b/tests/unit/dataset/test_validate.py new file mode 100644 index 0000000..1e49855 --- /dev/null +++ b/tests/unit/dataset/test_validate.py @@ -0,0 +1,85 @@ +import numpy as np +import pytest + +from genvarloader._dataset._validate import validate_dataset +from genvarloader._dataset._write import DATASET_FORMAT_VERSION, Metadata + + +def _minimal_valid_dataset(path): + path.mkdir() + meta = Metadata( + samples=["s0", "s1"], + contigs=["chr1"], + n_regions=2, + format_version=DATASET_FORMAT_VERSION, + ) + (path / "metadata.json").write_text(meta.model_dump_json()) + # input_regions.arrow: presence-only check, write a stub + (path / "input_regions.arrow").write_bytes(b"stub") + np.save(path / "regions.npy", np.zeros((2, 4), dtype=np.int32)) + return meta + + +def test_valid_dataset_passes(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + validate_dataset(meta, path) # no raise + + +def test_missing_format_version_loads_as_1_0_0(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + meta.format_version = None + validate_dataset(meta, path) # no raise, no warning + + +def test_format_version_too_new_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + from pydantic_extra_types.semantic_version import SemanticVersion + + meta.format_version = SemanticVersion.parse( + f"{DATASET_FORMAT_VERSION.major + 1}.0.0" + ) + with pytest.raises(ValueError, match="format version"): + validate_dataset(meta, path) + + +def test_missing_regions_npy_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + (path / "regions.npy").unlink() + with pytest.raises(ValueError, match="regions.npy"): + validate_dataset(meta, path) + + +def test_regions_npy_wrong_length_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + np.save(path / "regions.npy", np.zeros((5, 4), dtype=np.int32)) # n_regions=2 + with pytest.raises(ValueError, match="regions.npy"): + validate_dataset(meta, path) + + +def test_genotype_offsets_wrong_length_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + meta.ploidy = 2 + geno = path / "genotypes" + geno.mkdir() + # correct length would be n_regions*ploidy*n_samples + 1 = 2*2*2 + 1 = 9 + # offsets.npy is a raw memmap (no numpy header), so use tofile not np.save + np.zeros(4, dtype=np.int64).tofile(geno / "offsets.npy") + with pytest.raises(ValueError, match="offsets.npy"): + validate_dataset(meta, path) + + +def test_genotype_offsets_correct_length_passes(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + meta.ploidy = 2 + geno = path / "genotypes" + geno.mkdir() + # offsets.npy is a raw memmap (no numpy header), so use tofile not np.save + np.zeros(2 * 2 * 2 + 1, dtype=np.int64).tofile(geno / "offsets.npy") + validate_dataset(meta, path) # no raise From c5e16c64c1551f4dbff1b9da4c08367b5c255468 Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:50:52 -0700 Subject: [PATCH 23/28] test(_validate): cover too-old format version and genotypes-without-ploidy branches Co-Authored-By: Claude Opus 4.8 --- tests/unit/dataset/test_validate.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/unit/dataset/test_validate.py b/tests/unit/dataset/test_validate.py index 1e49855..c59787d 100644 --- a/tests/unit/dataset/test_validate.py +++ b/tests/unit/dataset/test_validate.py @@ -83,3 +83,26 @@ def test_genotype_offsets_correct_length_passes(tmp_path): # offsets.npy is a raw memmap (no numpy header), so use tofile not np.save np.zeros(2 * 2 * 2 + 1, dtype=np.int64).tofile(geno / "offsets.npy") validate_dataset(meta, path) # no raise + + +def test_format_version_too_old_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) + from pydantic_extra_types.semantic_version import SemanticVersion + + # DATASET_FORMAT_VERSION.major is 1, so major 0 is "too old" + meta.format_version = SemanticVersion.parse("0.0.0") + with pytest.raises(ValueError, match="format version") as exc_info: + validate_dataset(meta, path) + assert "upgrade genvarloader" not in str(exc_info.value) + + +def test_genotypes_present_but_no_ploidy_raises(tmp_path): + path = tmp_path / "ds.gvl" + meta = _minimal_valid_dataset(path) # ploidy defaults to None + geno = path / "genotypes" + geno.mkdir() + # offsets.npy is a raw memmap (no numpy header), so use tofile not np.save + np.zeros(4, dtype=np.int64).tofile(geno / "offsets.npy") + with pytest.raises(ValueError, match="ploidy"): + validate_dataset(meta, path) From 6f15e89e8f8ee351631f86177d88017018ed6b30 Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:53:03 -0700 Subject: [PATCH 24/28] test: concurrency regression for atomic cache + dataset creation (closes #21) Co-Authored-By: Claude Opus 4.8 --- tests/unit/test_concurrency.py | 124 +++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 tests/unit/test_concurrency.py diff --git a/tests/unit/test_concurrency.py b/tests/unit/test_concurrency.py new file mode 100644 index 0000000..24c2187 --- /dev/null +++ b/tests/unit/test_concurrency.py @@ -0,0 +1,124 @@ +"""Concurrency regression tests for atomic cache + dataset creation (issue #21). + +These tests prove that N processes racing to build the same .gvlfa cache or +write to the same dataset path produce exactly one valid artifact with no +orphan .tmp.* / .old.* directories left behind. +""" + +import multiprocessing as mp +import shutil +from pathlib import Path + +import numpy as np +import pytest + +import genvarloader._fasta_cache as fc + +# Use spawn so workers re-import cleanly regardless of host start method. +_CTX = mp.get_context("spawn") + + +# --------------------------------------------------------------------------- +# Worker functions (must be module-level for pickle under spawn) +# --------------------------------------------------------------------------- + + +def _build_cache_worker(src_str): + import genvarloader._fasta_cache as _fc + + _fc.ensure_cache(Path(src_str)) + + +def _write_worker(path_str, vcf_str, bed_dict): + import polars as pl + + import genvarloader as gvl + + bed = pl.DataFrame(bed_dict) + gvl.write(path=Path(path_str), bed=bed, variants=vcf_str, overwrite=True) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.slow +def test_concurrent_ensure_cache_no_corruption(tmp_path, ref_fasta): + """N concurrent processes building the same .gvlfa cache produce a valid, + byte-identical result with no orphan .tmp.* / .old.* directories.""" + src = tmp_path / "ref.fa.bgz" + shutil.copy(ref_fasta, src) + shutil.copy(str(ref_fasta) + ".fai", str(src) + ".fai") + if Path(str(ref_fasta) + ".gzi").exists(): + shutil.copy(str(ref_fasta) + ".gzi", str(src) + ".gzi") + + # Single-process reference build in an isolated sub-directory + ref_dir = tmp_path / "single" + ref_dir.mkdir() + ref_copy = ref_dir / "ref.fa.bgz" + shutil.copy(src, ref_copy) + shutil.copy(str(src) + ".fai", str(ref_copy) + ".fai") + if Path(str(src) + ".gzi").exists(): + shutil.copy(str(src) + ".gzi", str(ref_copy) + ".gzi") + _meta, single_data = fc.ensure_cache(ref_copy) + expected = np.array(np.memmap(single_data, np.uint8, "r")) + + # N concurrent builders against the same source path + procs = [ + _CTX.Process(target=_build_cache_worker, args=(str(src),)) for _ in range(6) + ] + for p in procs: + p.start() + for p in procs: + p.join(timeout=120) + assert p.exitcode == 0, f"worker exited with code {p.exitcode}" + + _meta2, data = fc.ensure_cache(src) + got = np.array(np.memmap(data, np.uint8, "r")) + np.testing.assert_array_equal(got, expected) + + # No orphan temp / old dirs published beside the cache + assert list(tmp_path.glob("ref.fa.bgz.gvlfa.tmp.*")) == [] + assert list(tmp_path.glob("ref.fa.bgz.gvlfa.old.*")) == [] + + +@pytest.mark.slow +def test_concurrent_gvl_write_one_valid_dataset(tmp_path, synthetic_case, reference): + """N concurrent processes writing to the same dataset path (overwrite=True) + leave exactly one valid, openable dataset with no orphan dirs.""" + import polars as pl + + import genvarloader as gvl + + vcf = str(synthetic_case.vcf_path) + + # Build the bed dict using the same column names gvl.write expects, + # mirroring the pattern in tests/unit/dataset/test_write_atomic.py. + bed = synthetic_case.regions.select( + chrom=pl.col("chrom"), + chromStart=pl.col("start"), + chromEnd=pl.col("end"), + ).head(2) + + bed_dict = bed.to_dict(as_series=False) + + dest = tmp_path / "shared.gvl" + + procs = [ + _CTX.Process(target=_write_worker, args=(str(dest), vcf, bed_dict)) + for _ in range(4) + ] + for p in procs: + p.start() + for p in procs: + p.join(timeout=180) + assert p.exitcode == 0, f"worker exited with code {p.exitcode}" + + # Exactly one valid dataset published; no orphan temp / old dirs + assert dest.is_dir(), "destination dataset directory does not exist" + assert list(tmp_path.glob("shared.gvl.tmp.*")) == [] + assert list(tmp_path.glob("shared.gvl.old.*")) == [] + + ds = gvl.Dataset.open(dest, reference=reference) + assert len(ds) > 0 From 83790b31b13b0d3b251df9c9eeb2fb6ed7570cee Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 12:57:26 -0700 Subject: [PATCH 25/28] docs(skill): note atomic/locked creation, dataset format gate, index-file limitation - SKILL.md: gvl.write section notes atomic build + advisory lock + no auto-rebuild; Dataset.open section notes format_version/integrity validation + no auto-rebuild; Reference.from_path one-liner updated with atomic/lock/auto-rebuild note. - _write.py: Notes section in write() docstring covers atomic dir, advisory lock, and out-of-scope genoray/pysam index files. - _fasta_cache.py: module-level docstring records atomic build + lock + auto-rebuild behaviour. Fast suite: 584 passed, 39 skipped, 4 xfailed, 0 failed. Slow concurrency tier: 2 passed. Co-Authored-By: Claude Opus 4.8 --- python/genvarloader/_dataset/_write.py | 12 ++++++++++++ python/genvarloader/_fasta_cache.py | 7 +++++++ skills/genvarloader/SKILL.md | 8 +++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/python/genvarloader/_dataset/_write.py b/python/genvarloader/_dataset/_write.py index c98fb4d..c2df831 100644 --- a/python/genvarloader/_dataset/_write.py +++ b/python/genvarloader/_dataset/_write.py @@ -108,6 +108,18 @@ def write( Otherwise, deletions can cause the length of haplotypes to be less than the intervals in `bed`. This can be disabled if having haplotypes shorter than the intervals is acceptable, in which case they will be padded with reference bases when appropriate. Disabling this also reduces the amount of data read/written and is faster to run. + + Notes + ----- + The dataset directory is built atomically: all data is written to a private sibling + temp directory and published via :func:`os.replace`. A best-effort ``filelock`` + prevents redundant parallel rebuilds, but correctness relies on the atomic rename — + the lock is advisory only. + + Out of scope: ``genoray`` ``.gvi`` index files and ``pysam`` ``.fai``/``.gzi`` index + files are created by those libraries and are not covered by gvl's atomic/locked + creation. Concurrent jobs that trigger index creation for those files depend on the + upstream libraries' behavior. """ # ignore polars warning about os.fork which is caused by using joblib's loky backend warnings.simplefilter("ignore", RuntimeWarning) diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index 425c4d4..3789b4a 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -1,3 +1,10 @@ +"""FASTA cache (``.gvlfa``) build and validation logic. + +The cache is built atomically (temp directory + :func:`os.replace`) under a best-effort +``filelock``, so concurrent builders sharing one reference FASTA are safe. The cache +auto-rebuilds from its source when stale or missing (source fingerprint mismatch or +incomplete on-disk data). +""" from __future__ import annotations import os diff --git a/skills/genvarloader/SKILL.md b/skills/genvarloader/SKILL.md index 4c5fb7b..ceb81ef 100644 --- a/skills/genvarloader/SKILL.md +++ b/skills/genvarloader/SKILL.md @@ -105,6 +105,10 @@ Notable: Source: `python/genvarloader/_dataset/_write.py`. +**Atomic creation:** `gvl.write` builds into a private sibling temp directory and publishes via an atomic `os.replace`. A best-effort `filelock` avoids redundant rebuilds when parallel jobs share the same destination, but correctness relies on the rename — the lock is advisory only. **Datasets do not auto-rebuild**; if the on-disk artifact is missing or corrupt, re-run `gvl.write`. + +**Out-of-scope:** `genoray` `.gvi` index files and `pysam` `.fai`/`.gzi` index files are created by those libraries and are **not** covered by gvl's atomic/locked creation. Concurrent jobs that trigger index creation for those files depend on the upstream libraries' behavior. + ## `Dataset.open` — key arguments ```python @@ -122,6 +126,8 @@ gvl.Dataset.open( Without `reference=`, only variants/haplotypes are available (you can't produce reference-overlaid sequences). `svar=` overrides the recorded SVAR location. +**Format validation:** `Dataset.open` validates the dataset's `format_version` and structural integrity (file presence + sizes). An incompatible or corrupt dataset raises a `ValueError` instructing regeneration with `gvl.write`. Datasets do **not** auto-rebuild. + - **`var_fields: list[str] | None`** — Variant fields to include on `RaggedVariants` output. Defaults to the minimum useful set `["alt", "ilen", "start"]`. Pass additional names (e.g. `"ref"`, `"dosage"`, or any numeric info column in the source variants table) to load them eagerly at open time. Must be a subset of `Dataset.available_var_fields`. Can be reconfigured later via `Dataset.with_settings(var_fields=...)`, which lazily loads any newly-requested columns. `"dosage"` must be requested explicitly — it is *not* added automatically even when `dosages.npy` exists on disk. ## Output modes — `with_seqs` × `with_tracks` @@ -229,7 +235,7 @@ Footprint is computed exactly via `Dataset._output_bytes_per_instance(...)` (use ## Other public surface (one-liners) -- `gvl.Reference.from_path(fasta, contigs=None)` — wrap a FASTA (path to a `.fa`/`.fa.bgz`, or a `.gvlfa` cache dir). Builds/reuses a sibling `.gvlfa` cache directory (self-describing, fingerprint-validated; legacy `.fa.gvl` caches auto-migrate). +- `gvl.Reference.from_path(fasta, contigs=None)` — wrap a FASTA (path to a `.fa`/`.fa.bgz`, or a `.gvlfa` cache dir). Builds/reuses a sibling `.gvlfa` cache directory (self-describing, fingerprint-validated; legacy `.fa.gvl` caches auto-migrate). The cache is built atomically (temp + `os.replace`) under a best-effort lock, so concurrent builders sharing one reference are safe; the cache **auto-rebuilds** from its source when stale or missing. - `gvl.read_bedlike(path)` / `gvl.with_length(bed, L)` — BED helpers (re-exported from `seqpro`). - `gvl.Ragged`, `gvl.RaggedAnnotatedHaps`, `gvl.RaggedVariants`, `gvl.RaggedIntervals` — ragged return containers. - `gvl.to_nested_tensor(ragged)` — convert to a PyTorch nested tensor (requires `torch`). From 0053a3b7c2e23fa851830de0a2539665967c40da Mon Sep 17 00:00:00 2001 From: d-laub Date: Wed, 3 Jun 2026 13:02:19 -0700 Subject: [PATCH 26/28] docs(_atomic): clarify overwrite=False discard rationale per consumer Co-Authored-By: Claude Opus 4.8 --- python/genvarloader/_atomic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/genvarloader/_atomic.py b/python/genvarloader/_atomic.py index 46acf6b..51e5669 100644 --- a/python/genvarloader/_atomic.py +++ b/python/genvarloader/_atomic.py @@ -39,8 +39,9 @@ def _publish(tmp: Path, dest: Path, *, overwrite: bool) -> None: """Move `tmp` into place at `dest` as atomically as the filesystem allows.""" if dest.exists(): if not overwrite: - # A racing builder published `dest` while we built. Our content is - # byte-identical, so discard ours; theirs wins harmlessly. + # A racing builder published `dest` while we built. Discard ours and + # let theirs win: for the FASTA cache the content is byte-identical so + # this is harmless; for a dataset write it is first-writer-wins. shutil.rmtree(tmp, ignore_errors=True) return aside = dest.with_name(f"{dest.name}.old.{uuid4().hex[:8]}") From 93cef1a296a4bb509094809b41e0dd496cd41fae Mon Sep 17 00:00:00 2001 From: d-laub Date: Thu, 4 Jun 2026 01:02:07 -0700 Subject: [PATCH 27/28] style: apply ruff format to _write and _fasta_cache Co-Authored-By: Claude Opus 4.8 --- python/genvarloader/_dataset/_write.py | 8 ++++++-- python/genvarloader/_fasta_cache.py | 5 ++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python/genvarloader/_dataset/_write.py b/python/genvarloader/_dataset/_write.py index c2df831..dd62d29 100644 --- a/python/genvarloader/_dataset/_write.py +++ b/python/genvarloader/_dataset/_write.py @@ -237,9 +237,13 @@ def write( f"available for chunking: {format_memory(max(effective_max_mem, 0))}" ) if isinstance(variants, VCF): - bytes_per_var = variants.n_samples * variants.ploidy # Genos8: 1 byte + bytes_per_var = ( + variants.n_samples * variants.ploidy + ) # Genos8: 1 byte else: - bytes_per_var = variants.n_samples * variants.ploidy * 4 # int32 + bytes_per_var = ( + variants.n_samples * variants.ploidy * 4 + ) # int32 if effective_max_mem < bytes_per_var: raise ValueError( diff --git a/python/genvarloader/_fasta_cache.py b/python/genvarloader/_fasta_cache.py index 3789b4a..7318f06 100644 --- a/python/genvarloader/_fasta_cache.py +++ b/python/genvarloader/_fasta_cache.py @@ -5,6 +5,7 @@ auto-rebuilds from its source when stale or missing (source fingerprint mismatch or incomplete on-disk data). """ + from __future__ import annotations import os @@ -168,9 +169,7 @@ def _ensure_built(source_fa: Path, gvlfa_dir: Path) -> FastaCache: except Exception: pass # unreadable/corrupt -> fall through and rebuild _build_into(source_fa, tmp, gvlfa_dir) - return FastaCache.model_validate_json( - (gvlfa_dir / METADATA_FILENAME).read_text() - ) + return FastaCache.model_validate_json((gvlfa_dir / METADATA_FILENAME).read_text()) def _check_format_version(meta: FastaCache, gvlfa_dir: Path) -> None: From edfba67444f26c0647c5a93ab9c6152d8c951355 Mon Sep 17 00:00:00 2001 From: d-laub Date: Thu, 4 Jun 2026 01:12:01 -0700 Subject: [PATCH 28/28] fix(test_fasta): move mid-file imports to top (E402, CI lint) Co-Authored-By: Claude Opus 4.8 --- tests/unit/test_fasta.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_fasta.py b/tests/unit/test_fasta.py index f0a7b64..4683ce6 100644 --- a/tests/unit/test_fasta.py +++ b/tests/unit/test_fasta.py @@ -1,3 +1,6 @@ +from pathlib import Path + +import genvarloader._fasta_cache as fc import numpy as np import pytest from genvarloader._fasta import Fasta, NoPadError @@ -74,11 +77,6 @@ def test_fasta_zero_length_range(ref_fasta): assert len(seq) == 0 -from pathlib import Path - -import genvarloader._fasta_cache as fc - - def test_fasta_cache_creates_gvlfa_dir(tmp_path, ref_fasta): import shutil