68 commits
20b1580
feat: synthetic pathways
tristan-f-r Jul 1, 2025
fc12b4e
Merge branch 'main' into synthetic
tristan-f-r Jul 28, 2025
8ff381f
Merge branch 'main' into synthetic
tristan-f-r Jan 6, 2026
f7c0c2d
fix: use full protein links to unify synthetic with databases
tristan-f-r Jan 6, 2026
73b6d93
Merge branch 'main' into synthetic
tristan-f-r Jan 24, 2026
2ce621a
re-correct links
tristan-f-r Jan 24, 2026
280b92a
fix: interactome fetching
tristan-f-r Jan 24, 2026
db30556
fix(diseases): fetch correct string links
tristan-f-r Jan 24, 2026
0658528
chore: mv to scripts
tristan-f-r Jan 30, 2026
e024e2c
chore: move to scripts, Pathify
tristan-f-r Jan 30, 2026
7b09381
style: fmt
tristan-f-r Jan 30, 2026
2a5feec
drop old thresholding
tristan-f-r Jan 30, 2026
e389b32
begin sampling
tristan-f-r Jan 30, 2026
af0ac30
chore: mv
tristan-f-r Jan 30, 2026
d1ade54
rename
tristan-f-r Jan 30, 2026
7483eea
fix: compute weight counts normally
tristan-f-r Jan 30, 2026
05cf6d6
feat: weight-preserving sampling
tristan-f-r Feb 1, 2026
58e9717
feat: sampling
tristan-f-r Feb 2, 2026
a0f7079
feat: scripted sampling
tristan-f-r Feb 2, 2026
3bb00e8
chore: del some raw
tristan-f-r Feb 3, 2026
775d144
drop all raw interactome files
tristan-f-r Feb 3, 2026
5771bc7
feat: finish up tf mapping again
tristan-f-r Feb 3, 2026
813235d
feat: sampling on a pathway
tristan-f-r Feb 3, 2026
7fb4642
style: fmt
tristan-f-r Feb 3, 2026
83fee81
chore: drop p38 mapk, add notes
tristan-f-r Feb 3, 2026
d7da699
init candidates explorer
tristan-f-r Feb 4, 2026
d45ec82
chore: update directory urls
tristan-f-r Feb 4, 2026
0d3b77e
chore: drop all downloaded pathways
tristan-f-r Feb 4, 2026
751a8f2
fix: file extensions and such
tristan-f-r Feb 4, 2026
2fceaa9
chore: explore and such
tristan-f-r Feb 4, 2026
ac5b93c
feat: base thresholding workflow
tristan-f-r Feb 9, 2026
5cb7352
chore: add paxtools
tristan-f-r Feb 11, 2026
9d3e194
feat: trimming
tristan-f-r Feb 12, 2026
81a4e4e
style: fmt
tristan-f-r Feb 12, 2026
d2cc7e4
feat: full interactome parsing
tristan-f-r Feb 18, 2026
38aef2c
refactor: isolate argparse parser
tristan-f-r Feb 18, 2026
a881afd
docs: suggestion
tristan-f-r Feb 18, 2026
5e9653a
Merge branch 'main' into synthetic
tristan-f-r Feb 24, 2026
db5a09e
reorganize, begin using owl file
tristan-f-r Feb 25, 2026
310db00
add pathways.txt.gz
tristan-f-r Feb 25, 2026
3a0b7df
feat: automatically identify pathway ids
tristan-f-r Feb 26, 2026
bcc9db8
mv
tristan-f-r Feb 26, 2026
df30f24
chore: mv to synthetic_data
tristan-f-r Feb 26, 2026
3adc508
feat: pc owl artifact gen!!
tristan-f-r Feb 26, 2026
009a00b
feat: all the fetching we will ever need
tristan-f-r Feb 26, 2026
2ff30da
docs: on readme
tristan-f-r Feb 26, 2026
e804da2
style: fmt
tristan-f-r Feb 27, 2026
88e02ab
fix: pathway names
tristan-f-r Mar 1, 2026
f04de0f
refactor: mv around files
tristan-f-r Mar 1, 2026
b4e37dd
refactor: update file refs
tristan-f-r Mar 1, 2026
a7ed939
feat: unify workflow
tristan-f-r Mar 1, 2026
7138dfd
feat: more refactors
tristan-f-r Mar 1, 2026
cf95067
fix: add typos
tristan-f-r Mar 2, 2026
91f60f6
fix: add explicit interactome dependency on thresholding
tristan-f-r Mar 2, 2026
ce75794
fix: correct threshold amt
tristan-f-r Mar 2, 2026
0f5539f
Merge branch 'synthetic' of https://github.com/Reed-CompBio/spras-ben…
tristan-f-r Mar 2, 2026
cfe311d
fix: correct /0 errors
tristan-f-r Mar 2, 2026
ede59e3
chore: address some cmts
tristan-f-r Mar 2, 2026
52e4ea9
chore: disable more pathways
tristan-f-r Mar 2, 2026
09384f8
refactor: lift files away from directory.py
tristan-f-r Mar 2, 2026
0d8ec7d
style: fmt, rm old cmt
tristan-f-r Mar 2, 2026
a2d5271
chore: disable other 0 source 0 target pathways
tristan-f-r Mar 2, 2026
66f4c65
chore(synthetic_data): use tuple type
tristan-f-r Mar 2, 2026
3b54dea
fix(synthetic_data/sampling): correctly specify seed argument
tristan-f-r Mar 2, 2026
4122721
fix: lower percentage thresholding multiplier
tristan-f-r Mar 3, 2026
b65d3a9
fix: correctly specify percentage_thresholding_multiplier
tristan-f-r Mar 3, 2026
3a6ea7c
refactor: nicer folder names, pathway statistics, interactome trimming
tristan-f-r Mar 3, 2026
a413a03
chore: cmt out unusable pathways
tristan-f-r Mar 3, 2026
4 changes: 3 additions & 1 deletion .devcontainer/devcontainer.json
@@ -16,6 +16,8 @@
// For web display
"ghcr.io/devcontainers/features/node:1": {},
// For scripting
"ghcr.io/va-h/devcontainers-features/uv:1": {}
"ghcr.io/va-h/devcontainers-features/uv:1": {},
// For paxtools
"ghcr.io/devcontainers/features/java:1": {}
}
}
13 changes: 8 additions & 5 deletions .pre-commit-config.yaml
@@ -1,6 +1,5 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
# See https://pre-commit.com/ for documentation
default_language_version:
# Match this to the version specified in environment.yml
python: python3.11
@@ -10,21 +9,25 @@ repos:
hooks:
# Attempts to load all yaml files to verify syntax.
- id: check-yaml
# Attempts to load all TOML files to verify syntax.
# Attempts to load all TOML files to verify syntax.
- id: check-toml
# Trims trailing whitespace.
# Trims trailing whitespace.
- id: trailing-whitespace
# Preserves Markdown hard linebreaks.
args: [--markdown-linebreak-ext=md]
# Do not trim whitespace from all files, input files may need trailing whitespace for empty values in columns.
types_or: [markdown, python, yaml]
# Skip this Markdown file, which has an example of an input text file within it.
exclude: input/README.md
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: 'v0.0.269'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: 'v0.15.4'
hooks:
- id: ruff
- repo: https://github.com/google/yamlfmt
rev: v0.17.0
hooks:
- id: yamlfmt
- repo: https://github.com/crate-ci/typos
rev: v1.34.0
hooks:
- id: typos
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -2,7 +2,7 @@

## Helping Out

There are `TODOs` that better enhance the reproducability and accuracy of datasets or analysis of algorithm outputs, as well as
There are `TODOs` that better enhance the reproducibility and accuracy of datasets or analysis of algorithm outputs, as well as
[open resolvable issues](https://github.com/Reed-CompBio/spras-benchmarking/).

## Adding a dataset
5 changes: 3 additions & 2 deletions README.md
@@ -39,21 +39,22 @@ uv run snakemake --cores 1

## Organization

There are five primary folders in this repository:
There are six primary folders in this repository:

```
.
├── cache
├── configs
├── datasets
├── spras
├── tools
└── web
```

`spras` is the cloned submodule of [SPRAS](https://github.com/reed-compbio/spras), `web` is an
[astro](https://astro.build/) app which generates the `spras-benchmarking` [output](https://reed-compbio.github.io/spras-benchmarking/),
`configs` is the YAML file used to talk to SPRAS, and `datasets` contains the raw data. `cache` is utility for `datasets` which provides a convenient
way to fetch online files for further processing.
way to fetch online files for further processing. `tools` holds miscellaneous utilities for dataset-processing tasks common across datasets.

The workflow runs as so:

12 changes: 12 additions & 0 deletions _typos.toml
@@ -0,0 +1,12 @@
[type.txt]
# Ignore data files
extend-glob = ["*.jsonc", "*.json"]
check-file = false

[files]
extend-exclude = [
# PANTHER SPRAS formatting contains an intentional typo
"datasets/synthetic_data/scripts/panther_spras_formatting.py",
# Bad variable names in this file that may be removed later
"datasets/diseases/viz/viz.ipynb"
]
21 changes: 10 additions & 11 deletions cache/Snakefile
@@ -1,34 +1,33 @@
from cache import link
from cache import FetchConfig, link
from cache.directory import CacheItem # for exposing to Snakefiles that import this Snakefile.
from cache.util import uncompress
import urllib.parse
from dataclasses import dataclass
from typing import Union
from pathlib import Path

@dataclass
class FetchConfig:
directive: list[str]
uncompress: bool = False
def stringify_directive(directive: Union[CacheItem, FetchConfig]) -> str:
return urllib.parse.quote_plus(directive.name if isinstance(directive, CacheItem) else '/'.join(directive))

def produce_fetch_rules(input_dict: dict[str, Union[FetchConfig, list[str]]]):
"""
Produces fetch rules based on a dictionary mapping
output files to their directory.py-based directive.
"""
# Map inputs to be wrapped with FetchConfig if list[str]
input_dict = {k: FetchConfig(v) if isinstance(v, list) else v for k, v in input_dict.items()}
# Map inputs to be wrapped with FetchConfig if list[str] or CacheItem
input_dict = {k: FetchConfig(v) if isinstance(v, tuple) or isinstance(v, CacheItem) else v for k, v in input_dict.items()}

directives = [urllib.parse.quote_plus("/".join(directive.directive)) for directive in input_dict.values()]
directives = list(input_dict.values())
assert len(directives) == len(set(directives)), "Directives aren't unique!"

for output_file, config in input_dict.items():
# Since placeholders are evaluated when the job is actually ran,
# we pass data using params and output.
rule:
name: f"fetch_{urllib.parse.quote_plus('/'.join(config.directive))}_to_{urllib.parse.quote_plus(output_file)}"
name:
f"fetch_{stringify_directive(config.directive)}_to_{urllib.parse.quote_plus(output_file)}"
output: file=output_file
params:
config=config
run:
Path(output.file).parent.mkdir(exist_ok=True)
link(Path(output.file), params.config.directive, uncompress=params.config.uncompress)
link(Path(output.file), params.config)
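The Snakefile above derives a unique rule name per fetch target by percent-encoding the directive. A minimal sketch of that naming scheme (covering only the tuple case, not `CacheItem`, which in the diff uses its `.name` instead):

```python
import urllib.parse

def stringify_directive(directive: tuple[str, ...]) -> str:
    # Join path segments, then percent-encode so the result is a single
    # token that is safe to embed in a Snakemake rule name.
    return urllib.parse.quote_plus("/".join(directive))

print(stringify_directive(("BioMart", "ensg-ensp.tsv")))
# BioMart%2Fensg-ensp.tsv
```

Because `quote_plus` escapes `/` (and spaces), two distinct directives can never collide into the same rule name, which is what the uniqueness assertion in `produce_fetch_rules` relies on.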
109 changes: 77 additions & 32 deletions cache/__init__.py
@@ -2,81 +2,126 @@
This is how spras-benchmarking handles artifact caching. `cache` should be used specifically inside `Snakefile`
"""

from dataclasses import dataclass
from typing import Union
from cache.util import uncompress as uncompress_file
from cache.directory import get_cache_item
from cache.directory import CacheItem, get_cache_item
from pathlib import Path
import os
from urllib.parse import quote_plus
import pickle

__all__ = ["link"]
__all__ = ["FetchConfig", "link"]

dir_path = Path(os.path.dirname(os.path.realpath(__file__)))
artifacts_dir = dir_path / "artifacts"

def get_artifact_name(directive: list[str]) -> str:
@dataclass(frozen=True)
class FetchConfig:
directive: Union[CacheItem, tuple[str, ...]]
uncompress: bool = False

def get_artifact_name(directive: tuple[str, ...]) -> str:
return quote_plus("/".join(directive))

def has_expired(directive: list[str]) -> bool:
def add_suffix(path: Path, suffix: str):
return path.with_suffix(path.suffix + suffix)

def has_expired(
cache_item: CacheItem,
output: Path
) -> bool:
"""
Check if the artifact metadata associated with a directive has expired.
Avoids re-downloading the artifact if nothing has changed.
"""
artifact_name = get_artifact_name(directive)
cache_item = get_cache_item(directive)

metadata_dir = artifacts_dir / 'metadata'
metadata_dir.mkdir(exist_ok=True)
metadata_file = (artifacts_dir / 'metadata' / artifact_name).with_suffix((artifacts_dir / artifact_name).suffix + '.metadata')
metadata_file = add_suffix(output, ".metadata")

# metadata never existed: we need to retrieve the new file
if not metadata_file.exists():
with open(metadata_file, 'wb') as f:
with open(metadata_file, "wb") as f:
pickle.dump(cache_item, f)
return True

old_cache_item = None
with open(metadata_file, 'rb') as f:
with open(metadata_file, "rb") as f:
old_cache_item = pickle.load(f)

# metadata expired: re-retrieve the item
if old_cache_item != cache_item:
with open(metadata_file, 'wb') as f:
with open(metadata_file, "wb") as f:
pickle.dump(cache_item, f)
return True

# metadata hasn't changed and already existed: this hasn't expired
return False

def link(output: str, directive: list[str], uncompress=False):
def link_with_cache_item(
output: Path,
cache_item: CacheItem,
uncompress: bool = False
):
"""
Intermediary function for `link`.
This does almost all of what `link` is characterized to do in its documentation,
except for doing symlinking.
"""
# If `uncompress` is `True`, we make
# `output` our 'compressed output.'
uncompressed_output = output
if uncompress:
output = add_suffix(output, ".compressed")

# Re-download if the file doesn't exist or the directive has expired.
# Note that we check for expiration first to trigger metadata creation.
if has_expired(cache_item, output) or not output.exists():
output.unlink(missing_ok=True)
cache_item.download(output)

if uncompress:
uncompressed_artifact_path = add_suffix(output, ".uncompressed")
uncompressed_artifact_path.unlink(missing_ok=True)
uncompress_file(output, uncompressed_output)

def link(
output: str,
config: FetchConfig
):
"""
Links output files from cache.directory directives.
For example,

```py
link("output/ensg-ensp.tsv", ["BioMart", "ensg-ensp.tsv"])
link("output/ensg-ensp.tsv", FetchConfig(["BioMart", "ensg-ensp.tsv"]))
```

would download and check BioMart's cache for ENSG-ENSP mapping, then symlink the cached output
(lying somewhere in the cache folder) with the desired `output`.
would download and check BioMart's cache for ENSG-ENSP mapping, then:
- If `config.directive` is a `CacheItem`, we write the file directly to `output`.
- Otherwise, we symlink the cached output (lying somewhere in the cache folder) with the desired `output`
to avoid file duplication.

This function wraps around link_with_cache_item and handles symlinking
depending on the type of config.directive.
TODO: most likely a nicer way to design this.
"""

artifacts_dir.mkdir(exist_ok=True)
if isinstance(config.directive, CacheItem):
link_with_cache_item(
Path(output),
config.directive,
config.uncompress
)
else:
artifacts_dir.mkdir(exist_ok=True)
artifact_name = get_artifact_name(config.directive)
artifact_output = artifacts_dir / artifact_name

artifact_name = get_artifact_name(directive)
link_with_cache_item(
artifact_output,
get_cache_item(config.directive),
config.uncompress
)

Path(output).unlink(missing_ok=True)
Path(output).symlink_to(artifact_output)

# Re-download if the file doesn't exist or the directive has expired.
cache_item = get_cache_item(directive)
if not (artifacts_dir / artifact_name).exists() or has_expired(directive):
(artifacts_dir / artifact_name).unlink(missing_ok=True)
cache_item.download(artifacts_dir / artifact_name)

if uncompress:
uncompressed_artifact_path = Path(str(artifacts_dir / artifact_name) + '.uncompressed')
uncompressed_artifact_path.unlink(missing_ok=True)
uncompress_file(artifacts_dir / artifact_name, uncompressed_artifact_path)
Path(output).symlink_to(uncompressed_artifact_path)
else:
Path(output).symlink_to(artifacts_dir / artifact_name)
11 changes: 6 additions & 5 deletions cache/cli.py
@@ -10,20 +10,21 @@
import argparse
from cache.directory import get_cache_item


def parse_args():
parser = argparse.ArgumentParser(
prog='Cache',
description='CLI utility for directory.py')
parser.add_argument('path')
parser.add_argument('output')
parser = argparse.ArgumentParser(prog="Cache", description="CLI utility for directory.py")
parser.add_argument("path")
parser.add_argument("output")

return parser.parse_args()


def main():
args = parse_args()
cache_item = get_cache_item(args.path.split("/"))

cache_item.download(args.output)


if __name__ == "__main__":
main()
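The CLI takes a slash-separated directive path and splits it before handing it to `get_cache_item`. A minimal reproduction of just the argument-parsing step (the directive and output paths here are made-up examples):

```python
import argparse

parser = argparse.ArgumentParser(prog="Cache", description="CLI utility for directory.py")
parser.add_argument("path")
parser.add_argument("output")

# e.g. `python -m cache.cli BioMart/ensg-ensp.tsv out/ensg-ensp.tsv`
args = parser.parse_args(["BioMart/ensg-ensp.tsv", "out/ensg-ensp.tsv"])
print(args.path.split("/"))  # ['BioMart', 'ensg-ensp.tsv']
```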