ONEcampaign · jm-rivera · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,30 @@
 # Changelog for oda_reader
 
+## 1.6.0 (2026-04-28)
+- Adds a `use_raw_cache` parameter to `bulk_download_crs`, `download_crs_file`,
+  `bulk_download_dac2a` and `bulk_download_multisystem`. Pass `use_raw_cache=False` when
+  you want to bypass the bulk cache and re-download fresh on every call. Caching remains
+  on by default.
+- Adds a typed `BulkPayloadCorruptError` (importable from `oda_reader`) for the rare case
+  where a freshly downloaded zip arrives corrupt. The corrupt entry is removed before the
+  exception is raised, so the next call cleanly re-downloads. The error message tells you
+  what to do next.
+- Strengthens corruption detection by validating freshly downloaded zips end-to-end (full
+  member CRC check), not just the central directory. Cached files are trusted on hit so
+  this doesn't slow normal use.
+- The bulk cache now self-maintains: it keeps the two most recent downloads per dataset
+  and evicts older ones, and sweeps stale temp files left behind by interrupted downloads
+  (older than 24h) on startup. No more manually clearing the cache directory to free space.
+- Temp-file naming now includes the hostname alongside the PID, preventing collisions when
+  the cache directory lives on a shared / NFS mount used by multiple machines.
+- `clear_cache`, `set_cache_dir`, `enable_cache` and `disable_cache` now emit a
+  `DeprecationWarning` for users who also import `oda_data`, pointing at the umbrella
+  `oda_data.cache.*` API. Standalone `oda_reader` users see no warning. The shims continue
+  to work through the `1.x` series and will be removed in `2.0`.
+- Cache directory is now versioned by the installed package version (via `importlib.metadata`) rather than a hardcoded string, so upgrades automatically invalidate old caches that may contain partial or corrupt downloads from prior versions.
+- Bulk-download cache writes are now atomic: downloads stream into a sibling temp file and are only renamed over the destination on success, so partial downloads no longer pollute the cache on interruption or error.
+- On `BadZipFile`, the corrupt cached archive is removed so the next call cleanly re-downloads instead of looping on the same poisoned entry.
+- Cached archives are validated with `zipfile.is_zipfile` before reuse; a corrupt entry is removed and re-downloaded transparently.
+
 ## 1.5.1 (2026-04-15)
 - Adds support for Deflate64-compressed ZIP files in bulk downloads. The OECD switched the full CRS bulk file to Deflate64 compression, which Python's standard library does not support. This release patches `zipfile` at runtime using the `inflate64` library to handle Deflate64 transparently.
 - Adds `inflate64` as a dependency.

diff --git a/docs/docs/bulk-downloads.md b/docs/docs/bulk-downloads.md
@@ -110,6 +110,24 @@ print(f"Education projects: {education_count}")
 print(f"Total commitments: ${education_amount/1e9:.1f}B")
 ```
 
+## Forcing a Fresh Download
+
+By default, bulk downloads are cached on disk so a second call returns
+instantly. If you need to bypass that cache (for example, in a CI job that
+should always pull the latest file), pass `use_raw_cache=False`:
+
+```python
+# Always download fresh; the zip is extracted to a temp dir and discarded
+crs = bulk_download_crs(use_raw_cache=False)
+```
+
+The integrity check on the freshly downloaded zip still runs; only the
+on-disk caching is skipped. This flag is available on `bulk_download_crs`,
+`download_crs_file`, `bulk_download_dac2a` and `bulk_download_multisystem`.
+
+See [Caching & Performance](caching.md#bulk-file-cache) for how the bulk
+cache is managed (LRU eviction, TTL, integrity validation).
+
 ## Year-Specific CRS Files
 
 OECD also provides individual files for specific years:
@@ -215,6 +233,11 @@ See [Schema Translation](schema-translation.md) for detailed comparison.
 
 **File not found errors**: Older CRS year-specific files use grouped years (e.g., "1995-99"). Check which grouping includes your target year.
 
+**`BulkPayloadCorruptError`**: The OECD's bulk endpoint occasionally serves a
+truncated or malformed zip. The corrupt entry is removed automatically before
+the exception is raised, so the next call cleanly re-downloads. Retry the
+call, or pass `use_raw_cache=False` to skip the cache for that invocation.
+
 ## Next Steps
 
 - **[Caching & Performance](caching.md)** - Understand how bulk downloads are cached

diff --git a/docs/docs/caching.md b/docs/docs/caching.md
@@ -4,12 +4,14 @@ ODA Reader uses caching to make repeated queries fast and reduce dependency on O
 
 ## How Caching Works
 
-ODA Reader caches two types of data:
+ODA Reader caches three types of data:
 
 1. **HTTP responses**: Raw API responses before processing
 2. **DataFrames**: Processed pandas DataFrames after schema translation
+3. **Bulk files**: Large parquet/zip files downloaded by `bulk_download_crs`,
+   `download_crs_file`, `bulk_download_dac2a` and `bulk_download_multisystem`
 
-Both caches are automatic and transparent - you don't need to change your code to benefit from caching.
+All three caches are automatic and transparent - you don't need to change your code to benefit from caching.
 
 **Example of caching in action**:
 
@@ -103,9 +105,16 @@ This removes all cached API responses and DataFrames. Your next query will hit t
 - Cache has grown too large
 - You're troubleshooting unexpected results
 
+**Using `oda_reader` alongside `oda_data`?** `clear_cache`, `set_cache_dir`,
+`enable_cache` and `disable_cache` are deprecated under the umbrella package
+and emit a `DeprecationWarning` pointing at the `oda_data.cache.*` API
+(e.g. `oda_data.cache.clear("all")`). Standalone `oda_reader` users see no
+warning. The shims continue to work through the `1.x` series and will be
+removed in `2.0`.
+
 ### Automatic Cache Cleanup
 
-ODA Reader automatically enforces cache limits:
+ODA Reader automatically enforces cache limits across the cache root:
 
 - **Max size**: 2.5 GB
 - **Max age**: 7 days
@@ -116,6 +125,58 @@ When you import oda_reader, it checks cache limits:
 
 This happens automatically - you don't need to do anything.
 
+### Bulk File Cache
+
+The bulk file cache (used by `bulk_download_crs`, `download_crs_file`,
+`bulk_download_dac2a` and `bulk_download_multisystem`) is governed separately
+because the files are large (~1 GB each):
+
+- **LRU eviction**: only the two most recent bulk files per dataset are kept;
+  older entries are removed automatically the next time you import oda_reader.
+- **Per-entry TTL**: an entry is considered stale after 30 days and refetched
+  on next use.
+- **Integrity validation**: every freshly downloaded zip is end-to-end checked
+  before being trusted. A corrupt download is removed from the cache and
+  raises `BulkPayloadCorruptError` so you can simply retry. Cached files are
+  trusted on hit (no recheck on every call).
+- **Self-healing**: temp files left behind by interrupted downloads (older
+  than 24 hours) are swept on startup, so an aborted download can't pollute
+  the cache directory indefinitely.
+
+#### Bypassing the Bulk File Cache
+
+If you need a fresh download every call (e.g. for a CI job that should always
+hit the source), pass `use_raw_cache=False`:
+
+```python
+from oda_reader import bulk_download_crs
+
+# Download to a temp directory and discard the zip after extraction
+crs = bulk_download_crs(use_raw_cache=False)
+```
+
+Validation still runs in this mode; only the on-disk caching is skipped. The
+flag is available on `bulk_download_crs`, `download_crs_file`,
+`bulk_download_dac2a` and `bulk_download_multisystem`. `download_aiddata`
+takes a different code path and is not affected.
+
+#### Handling Corrupt Downloads
+
+The OECD's bulk endpoint occasionally serves a truncated or malformed file.
+When that happens, a `BulkPayloadCorruptError` is raised and the bad entry is
+already removed from disk by the time you see it, so the next call cleanly
+re-downloads:
+
+```python
+from oda_reader import bulk_download_crs, BulkPayloadCorruptError
+
+try:
+    crs = bulk_download_crs()
+except BulkPayloadCorruptError:
+    # Bad entry already removed — just retry
+    crs = bulk_download_crs()
+```
+
 ## HTTP Caching (Separate from DataFrame Cache)
 
 ODA Reader also caches raw HTTP responses using `requests-cache`:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "oda_reader"
-version = "1.5.1"
+version = "1.6.0"
 description = "A simple package to import ODA data from the OECD's API and AidData's database"
 readme = "README.md"
 license = "MIT"
@@ -53,6 +53,7 @@ docs = [
     "mkdocstrings[python]>=0.24.0",
 ]
 test = [
+    "freezegun>=1.4.0",
     "pytest>=9.0.3",
     "pytest-mock>=3.12",
     "pytest-cov>=4.1",
@@ -120,9 +121,10 @@ unfixable = []
 # Allow longer lines in tests and examples
 "tests/**/*.py" = ["E501"]
 "docs/examples/**/*.py" = ["E501"]
-# Allow global variable usage in cache and common modules (architectural decision)
+# Allow global variable usage in singleton-managing modules (architectural decision)
 "src/oda_reader/_cache/*.py" = ["PLW0603"]
 "src/oda_reader/common.py" = ["PLW0603"]
+"src/oda_reader/_http_primitives.py" = ["PLW0603"]
 # Allow try-except without from (external libraries may not support cause chaining)
 "src/oda_reader/download/download_tools.py" = ["B904"]
 # Allow camelcase import in tools (QueryBuilder is the standard name)
@@ -149,6 +151,7 @@ python_functions = ["test_*"]
 markers = [
     "unit: Fast unit tests (no external dependencies)",
     "integration: Tests that call real OECD API",
+    "network: Tests that require live network access (opt-in via RUN_NETWORK_TESTS=1)",
     "slow: Long-running tests (bulk downloads)",
     "cache: Tests that verify cache behavior",
 ]

diff --git a/src/oda_reader/__init__.py b/src/oda_reader/__init__.py
@@ -3,21 +3,23 @@
 specifically designed to work with OECD DAC data.
 """
 
-# Core data download functions
-# Cache management (new system)
+import sys
+import warnings
+from collections.abc import Callable
+from typing import Any
+
 from oda_reader._cache import (
     bulk_cache_manager,
     cache_dir,  # Deprecated alias
-    clear_cache,
     dataframe_cache,
-    disable_cache,
-    # Legacy functions (for backward compatibility)
-    enable_cache,
     enforce_cache_limits,
     get_cache_dir,
     reset_cache_dir,
-    set_cache_dir,
 )
+from oda_reader._cache.config import set_cache_dir as _impl_set_cache_dir
+from oda_reader._cache.legacy import clear_cache as _impl_clear_cache
+from oda_reader._cache.legacy import disable_cache as _impl_disable_cache
+from oda_reader._cache.legacy import enable_cache as _impl_enable_cache
 from oda_reader.aiddata import download_aiddata
 from oda_reader.common import (
     API_RATE_LIMITER,
@@ -26,15 +28,79 @@
     enable_http_cache,
     get_http_cache_info,
 )
-from oda_reader.download.version_discovery import clear_version_cache
 from oda_reader.crs import bulk_download_crs, download_crs, download_crs_file
 from oda_reader.dac1 import download_dac1
 from oda_reader.dac2a import bulk_download_dac2a, download_dac2a
 from oda_reader.download.query_builder import QueryBuilder
+from oda_reader.download.version_discovery import clear_version_cache
+from oda_reader.exceptions import BulkDownloadHTTPError, BulkPayloadCorruptError
 from oda_reader.multisystem import bulk_download_multisystem, download_multisystem
 from oda_reader.tools import get_available_filters
 
+# Each shim emits a one-time-per-session DeprecationWarning when oda_data is
+# also imported (umbrella users should migrate to oda_data.cache.*); standalone
+# oda_reader users see no warning.
+# _WARNED_SHIMS records the shim names for which the warning has already been
+# issued; the helper below checks it before warning and adds to it afterwards.
+_WARNED_SHIMS: set[str] = set()
+
+
+def _warn_once_if_oda_data_imported(name: str, replacement: str) -> None:
+    """Emit a DeprecationWarning for a legacy cache shim, once per shim name.
+
+    The call is a no-op when ``name`` is already in ``_WARNED_SHIMS`` or when
+    ``"oda_data"`` is not a key of ``sys.modules``.
+
+    Args:
+        name: Public name of the shim being called; interpolated into the
+            message as ``oda_reader.{name}`` and used as the once-only key.
+        replacement: Text naming what to use instead; interpolated verbatim
+            into the warning message.
+    """
+    if name in _WARNED_SHIMS or "oda_data" not in sys.modules:
+        return
+    warnings.warn(
+        f"oda_reader.{name} is deprecated for users who also import oda_data; "
+        f"use {replacement} for the umbrella API. This shim is preserved for "
+        "standalone oda_reader users through 1.x and removed in 2.0.",
+        DeprecationWarning,
+        # When reached through a shim built by _make_deprecation_shim the
+        # frames are: this helper (1) -> shim (2) -> caller (3), so the
+        # warning is attributed to the code that called the shim.
+        stacklevel=3,
+    )
+    # Only reached if warn() returned normally; the name is marked afterwards.
+    _WARNED_SHIMS.add(name)
+
+
+def _make_deprecation_shim(
+    name: str, replacement: str, impl: Callable[..., Any], one_liner: str
+) -> Callable[..., Any]:
+    """Build a wrapper that runs the deprecation check, then delegates to ``impl``.
+
+    Args:
+        name: Name given to the wrapper (``__name__`` and ``__qualname__``) and
+            passed to ``_warn_once_if_oda_data_imported``.
+        replacement: Text naming what to use instead; used in both the warning
+            and the generated docstring.
+        impl: The callable that does the real work.
+        one_liner: Leading sentence of the generated docstring.
+
+    Returns:
+        A callable that forwards all positional and keyword arguments to
+        ``impl`` and returns whatever ``impl`` returns.
+    """
+    def shim(*args: Any, **kwargs: Any) -> Any:
+        # Warn (at most once per name) before delegating.
+        _warn_once_if_oda_data_imported(name, replacement)
+        return impl(*args, **kwargs)
+
+    # Present the wrapper under the public name rather than as "shim".
+    shim.__name__ = name
+    shim.__qualname__ = name
+    shim.__doc__ = (
+        f"{one_liner} Deprecated under the oda_data umbrella; use {replacement}."
+    )
+    return shim
+
+
+# Rebind the public legacy names to shims that delegate to the private
+# `_impl_*` functions imported above; arguments and return values pass through.
+clear_cache = _make_deprecation_shim(
+    "clear_cache",
+    "oda_data.cache.clear('all')",
+    _impl_clear_cache,
+    "Clear the cache directory.",
+)
+set_cache_dir = _make_deprecation_shim(
+    "set_cache_dir",
+    "oda_data.set_cache_root() or the ODA_DATA_CACHE_DIR env var",
+    _impl_set_cache_dir,
+    "Set a custom cache directory path.",
+)
+enable_cache = _make_deprecation_shim(
+    "enable_cache",
+    "oda_data.cache.enable_cache('all')",
+    _impl_enable_cache,
+    "Enable caching globally.",
+)
+disable_cache = _make_deprecation_shim(
+    "disable_cache",
+    "oda_data.cache.disable_cache('all')",
+    _impl_disable_cache,
+    "Disable caching globally.",
+)
+
+
 __all__ = [
+    # Boundary contract
+    "BulkPayloadCorruptError",
+    "BulkDownloadHTTPError",
     # Data download
     "QueryBuilder",
     "download_dac1",
@@ -63,7 +129,7 @@
     "bulk_cache_manager",
     # Rate limiting
     "API_RATE_LIMITER",
-    # Legacy (backward compatibility)
+    # Legacy (backward compatibility - deprecated for oda_data users)
     "enable_cache",
     "disable_cache",
     "clear_cache",