From 478b6d1688f65f0c16eadbd35244db204cb1fd5d Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 19 Nov 2025 13:57:47 -0700
Subject: [PATCH 01/23] first pass at matbench running locally and on a remote
 HPC via globus compute, multi-gpu support

---
 garden_ai/benchmarks/__init__.py              |  34 ++
 .../benchmarks/matbench_discovery/README.md   | 285 +++++++++++
 .../benchmarks/matbench_discovery/__init__.py | 156 ++++++
 .../benchmarks/matbench_discovery/enums.py    |  15 +
 .../examples/matbench_1000_structures.py      |  83 ++++
 .../examples/matbench_test.py                 |  31 ++
 .../examples/matbench_test_local_mps.py       |  46 ++
 .../examples/matbench_test_remote.py          |  59 +++
 .../matbench_discovery/remote_runner.py       | 470 ++++++++++++++++++
 .../benchmarks/matbench_discovery/tasks.py    | 321 ++++++++++++
 10 files changed, 1500 insertions(+)
 create mode 100644 garden_ai/benchmarks/__init__.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/README.md
 create mode 100644 garden_ai/benchmarks/matbench_discovery/__init__.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/enums.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/remote_runner.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/tasks.py

diff --git a/garden_ai/benchmarks/__init__.py b/garden_ai/benchmarks/__init__.py
new file mode 100644
index 00000000..329de6cc
--- /dev/null
+++ b/garden_ai/benchmarks/__init__.py
@@ -0,0 +1,34 @@
+"""Garden AI benchmarking framework.
+
+This module provides interfaces for running standardized benchmarks on
+models hosted in Garden AI or developed locally.
+
+Available benchmarks:
+    - MatbenchDiscovery: Materials discovery benchmark suite
+"""
+
+from .matbench_discovery import IS2RETask, MatbenchDiscovery, MatbenchTask
+
+__all__ = [
+    "MatbenchDiscovery",
+    "MatbenchTask",
+    "IS2RETask",
+]
+
+
+def publish_benchmark_result(benchmark, model, results):
+    """Publish benchmark results to Garden AI backend.
+
+    This is a placeholder for future functionality to store benchmark
+    results alongside published models.
+
+    Args:
+        benchmark: Benchmark adapter instance
+        model: Model that was benchmarked
+        results: Dictionary of benchmark metrics
+    """
+    # TODO: Implement when backend API is ready
+    raise NotImplementedError(
+        "Publishing benchmark results is not yet implemented. "
+        "For now, save results locally or to your own storage."
+    )
diff --git a/garden_ai/benchmarks/matbench_discovery/README.md b/garden_ai/benchmarks/matbench_discovery/README.md
new file mode 100644
index 00000000..4273cd02
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/README.md
@@ -0,0 +1,285 @@
+# Matbench Discovery Benchmark Adapter
+
+Minimal viable implementation for running [Matbench Discovery](https://matbench-discovery.materialsproject.org/) benchmarks on remote HPC systems via Globus Compute.
+
+## Overview
+
+This adapter enables Garden AI users to benchmark their materials models against the Matbench Discovery test suite without manually managing HPC jobs, environment setup, or data transfers.
+
+### Current Status: MVP
+
+**Implemented:**
+- ✅ IS2RE (Initial Structure to Relaxed Energy) task
+- ✅ Remote environment setup with UV
+- ✅ Automatic dependency installation
+- ✅ Basic metric calculation
+- ✅ Multi-GPU parallelization (automatic GPU detection and work distribution)
+
+**Future Work:**
+- ⏳ Additional tasks (RS2RE, S2EFS, thermal conductivity)
+- ⏳ Globus Transfer for model weights and large datasets
+- ⏳ Checkpointing and failure recovery
+- ⏳ Full metric calculation against DFT ground truth
+- ⏳ Backend integration for result publishing
+
+## Architecture
+
+```
+User's Machine                    Remote HPC Endpoint
+├─ MatbenchDiscovery             ├─ Clone matbench-discovery repo
+│  ├─ tasks.IS2RE                │  ├─ Set up UV virtual environment
+│  └─ Globus Compute Executor ───┼─>├─ Install dependencies
+                                 │  │  ├─ matbench-discovery
+                                 │  │  └─ model package (e.g., mace-torch)
+                                 │  ├─ Load test structures via DataFiles
+                                 │  ├─ Run structure relaxations
+                                 │  ├─ Calculate metrics
+                                 │  └─ Return results
+```
+
+## File Structure
+
+```
+matbench_discovery/
+├── __init__.py         # Main adapter class (MatbenchDiscovery)
+├── tasks.py            # Task implementations (IS2RETask)
+├── remote_runner.py    # Remote execution functions
+├── enums.py            # Task enumerations
+├── example.py          # Usage example
+└── README.md           # This file
+```
+
+## Usage
+
+### Basic Example
+
+```python
+from garden_ai.benchmarks import MatbenchDiscovery
+from my_model import MyModel
+
+# Configure endpoint
+endpoint_id = "your-endpoint-uuid"
+endpoint_config = {
+    "account": "project-account",
+    "partition": "gpu-debug",
+    "scheduler_options": "#SBATCH --gpus-per-node=1"
+}
+
+# Run benchmark
+with MatbenchDiscovery(endpoint_id, endpoint_config) as bench:
+    model = MyModel()
+    task = bench.tasks.IS2RE
+
+    # Submit job (returns immediately)
+    future = task.submit(model, num_structures=100)
+
+    # Wait for completion
+    results = future.result()
+
+    # Calculate metrics
+    metrics = task.calculate_metrics(results)
+    print(metrics)
+```
+
+### Multi-GPU Parallelization
+
+The adapter automatically detects and uses all available GPUs on the compute node for parallel processing. This significantly improves throughput for large-scale benchmarks.
+
+**Example: 4-GPU Configuration on Anvil**
+
+```python
+from garden_ai.benchmarks import MatbenchDiscovery
+
+endpoint_id = "your-endpoint-uuid"
+endpoint_config = {
+    "account": "your-account",
+    "qos": "gpu",
+    "partition": "gpu",
+    "scheduler_options": "#SBATCH --gpus-per-node=4\n#SBATCH --time=4:00:00\n#SBATCH --mem=64G",
+    "worker_init": "pip install --user uv",
+}
+
+with MatbenchDiscovery(endpoint_id, endpoint_config) as bench:
+    task = bench.tasks.IS2RE
+
+    # Multi-GPU is enabled by default
+    future = task.submit(
+        model_package="mace-torch",
+        model_factory="mace_mp",
+        model_kwargs={"model": "medium", "device": "cuda"},
+        num_structures=1000,
+        use_multi_gpu=True,  # Default: True
+    )
+
+    results = future.result()
+    metrics = task.calculate_metrics(results)
+```
+
+**How it works:**
+1. Automatically detects available GPUs using `torch.cuda.device_count()`
+2. Splits structures into equal batches (one per GPU)
+3. Processes batches in parallel using multiprocessing
+4. Aggregates results from all workers
+
+**Performance expectations:**
+- **Single GPU**: ~10-20 structures/hour (baseline)
+- **4 GPUs**: ~3-4x speedup (~40-80 structures/hour)
+- Actual performance depends on model complexity and structure size
+
+**Disabling multi-GPU:**
+```python
+future = task.submit(
+    model_package="mace-torch",
+    model_factory="mace_mp",
+    model_kwargs={"model": "medium", "device": "cuda"},
+    num_structures=100,
+    use_multi_gpu=False,  # Use single GPU/CPU
+)
+```
+
+### Scaling Guide
+
+**Recommended test progression:**
+
+1. **Small test (10-100 structures)**: Verify setup and model compatibility
+   - Partition: `gpu-debug`
+   - Time: 30 minutes
+   - GPUs: 1-4
+
+2. **Medium test (1000 structures)**: Test multi-GPU parallelization
+   - Partition: `gpu`
+   - Time: 4 hours
+   - GPUs: 4
+   - Expected throughput: ~250-300 structures/hour with 4 GPUs
+
+3. **Full dataset (~257k structures)**: Production run
+   - Partition: `gpu`
+   - Time: 48+ hours
+   - GPUs: 4
+   - Consider implementing checkpointing for runs >24 hours
+
+### Model Requirements
+
+For the MVP, models must:
+
+1. **Be pip-installable** (or provide package name)
+2. **Implement ASE calculator interface** (or be convertible to one)
+3. **Have a checkpoint file** (optional, can be None for models with default weights)
+
+Example model:
+
+```python
+class MyModel:
+    def __init__(self):
+        self.checkpoint_path = "/path/to/checkpoint.pt"
+
+    # ASE calculator interface
+    def calculate(self, atoms, properties, system_changes):
+        # Calculate energy, forces, stress
+        ...
+```
+
+### Workflow Details
+
+When you call `task.submit(model)`:
+
+1. **Model introspection**: Extracts model class name, module, and checkpoint path
+2. **Remote submission**: Sends job to Globus Compute endpoint
+3. **Environment setup** (on remote):
+   - Clones matbench-discovery repository
+   - Creates Python 3.11 virtual environment with UV
+   - Installs matbench-discovery package
+   - Installs model package (e.g., `pip install mace-torch`)
+4. **Benchmark execution**:
+   - Loads test structures using `DataFiles.wbm_initial_structures`
+   - Instantiates model and loads checkpoint
+   - Runs geometry optimizations (ASE FIRE optimizer)
+   - Collects results
+5. **Result return**: Returns energies, convergence stats, and failures
+
+## Configuration Options
+
+### MatbenchDiscovery
+
+```python
+MatbenchDiscovery(
+    endpoint_id="uuid",           # Required: Globus Compute endpoint
+    user_endpoint_config=dict,     # Optional: HPC scheduler config
+    repo_ref="main",               # Optional: Git ref to use
+    model_package="mace-torch"     # Optional: Default model package
+)
+```
+
+### IS2RETask.submit()
+
+```python
+task.submit(
+    model,                         # Required: Model instance
+    num_structures=100,            # Optional: Number of structures to test
+    model_package="mace-torch",    # Optional: Override default package
+    use_multi_gpu=True,            # Optional: Enable multi-GPU (default: True)
+)
+```
+
+## Design Decisions
+
+### Why UV?
+- Fast, deterministic installs
+- Handles both `pyproject.toml` and `requirements.txt`
+- Built-in venv creation with specific Python versions
+
+### Why DataFiles auto-download?
+- Avoids manual Globus Transfer setup for MVP
+- Matbench's DataFiles handles caching automatically
+- Can optimize with explicit transfer later
+
+### Why ASE calculator interface?
+- Standard in materials modeling community
+- Most interatomic potentials support it (MACE, M3GNet, CHGNet, etc.)
+- Simple adaptation layer if needed
+
+### Why multiprocessing for multi-GPU?
+- Simple and effective for within-node parallelization
+- Avoids CUDA initialization issues with fork
+- Each GPU gets isolated process with dedicated memory
+- Easy to debug and monitor per-GPU progress
+
+## Limitations
+
+1. **No weight transfer**: Model checkpoints must be accessible from remote (URL or shared filesystem)
+2. **Basic metrics**: Only reports convergence stats, not comparison to DFT ground truth
+3. **IS2RE only**: Other tasks not yet implemented
+4. **No checkpointing**: If job fails, must restart from scratch (recommended for runs >24 hours)
+5. **No result publishing**: Backend integration not yet implemented
+6. **Single-node parallelization**: Multi-GPU works within a node; SLURM array jobs for multi-node not yet implemented
+
+## Next Steps
+
+To generalize beyond Matbench:
+
+1. **Extract base classes**: `BenchmarkAdapter`, `BenchmarkTask`, `RemoteRunner`
+2. **Add data staging**: Implement Globus Transfer for weights/datasets
+3. **Define model interface**: Standard protocol for model serialization
+4. **Add checkpointing**: Save intermediate results for failure recovery
+5. **Implement batching**: Distribute work across SLURM array jobs
+
+## Testing
+
+```bash
+# Install dependencies
+cd garden_ai/benchmarks/matbench_discovery
+pip install -e .
+
+# Update example.py with your endpoint details
+vim example.py
+
+# Run example
+python example.py
+```
+
+## References
+
+- [Matbench Discovery](https://matbench-discovery.materialsproject.org/)
+- [Matbench Discovery GitHub](https://github.com/janosh/matbench-discovery)
+- [Globus Compute](https://globus-compute.readthedocs.io/)
+- [ASE Calculator Interface](https://wiki.fysik.dtu.dk/ase/ase/calculators/calculators.html)
diff --git a/garden_ai/benchmarks/matbench_discovery/__init__.py b/garden_ai/benchmarks/matbench_discovery/__init__.py
new file mode 100644
index 00000000..3f687a6c
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/__init__.py
@@ -0,0 +1,156 @@
+"""Matbench Discovery benchmark adapter for Garden AI.
+
+This module provides a clean interface for running Matbench Discovery benchmarks
+on remote HPC systems via Globus Compute. It handles environment setup,
+dependency installation, and benchmark execution.
+
+Example usage:
+    >>> from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
+    >>> from my_model import MyModel
+    >>>
+    >>> # Configure for your HPC endpoint
+    >>> endpoint_id = "your-endpoint-uuid"
+    >>> endpoint_config = {
+    ...     "account": "project-account",
+    ...     "partition": "gpu",
+    ...     "scheduler_options": "#SBATCH --gpus-per-node=1"
+    ... }
+    >>>
+    >>> # Run benchmark
+    >>> with MatbenchDiscovery(endpoint_id, endpoint_config) as bench:
+    ...     model = MyModel()
+    ...     task = bench.tasks.IS2RE
+    ...     future = task.submit(model, num_structures=100)
+    ...     results = future.result()
+    ...     metrics = task.calculate_metrics(results)
+    ...     print(metrics)
+"""
+
+from typing import Any
+
+from globus_compute_sdk import Executor
+from globus_compute_sdk.serialize import CombinedCode, ComputeSerializer
+
+from .enums import MatbenchTask
+from .tasks import IS2RETask
+
+__all__ = [
+    "MatbenchDiscovery",
+    "MatbenchTask",
+    "IS2RETask",
+]
+
+
+class MatbenchDiscovery:
+    """Adapter for running Matbench Discovery benchmarks locally or remotely.
+
+    This class manages the lifecycle of benchmark execution:
+    - Provides access to benchmark tasks (IS2RE, etc.)
+    - For remote execution: creates and manages Globus Compute executor
+    - For local execution: runs in ephemeral UV environment
+
+    Use as a context manager to ensure proper cleanup:
+        # Local execution
+        with MatbenchDiscovery() as bench:
+            result = bench.tasks.IS2RE.local(...)
+
+        # Remote execution
+        with MatbenchDiscovery(endpoint_id="uuid", endpoint_config={...}) as bench:
+            future = bench.tasks.IS2RE.submit(...)
+
+    Attributes:
+        tasks: Namespace containing available benchmark tasks
+            - tasks.IS2RE: Initial Structure to Relaxed Energy task
+    """
+
+    # Matbench Discovery repository configuration
+    REPO_URL = "https://github.com/janosh/matbench-discovery"
+    REPO_REF = "main"
+    PYTHON_VERSION = "3.11"
+
+    def __init__(
+        self,
+        endpoint_id: str | None = None,
+        user_endpoint_config: dict[str, Any] | None = None,
+        repo_ref: str | None = None,
+        model_package: str | None = None,
+    ):
+        """Initialize Matbench Discovery adapter.
+
+        Args:
+            endpoint_id: Globus Compute endpoint UUID for remote execution.
+                        If None, only local execution (.local()) is available.
+            user_endpoint_config: Optional HPC configuration for remote endpoint.
+                                 Example for SLURM:
+                                 {
+                                     "account": "project-account",
+                                     "partition": "gpu-debug",
+                                     "scheduler_options": "#SBATCH --gpus-per-node=1"
+                                 }
+            repo_ref: Git branch/tag/commit to use (default: "main")
+            model_package: Default model package to install for all tasks
+                          (can be overridden per task)
+        """
+        self.endpoint_id = endpoint_id
+        self.user_endpoint_config = user_endpoint_config
+        self.repo_ref = repo_ref or self.REPO_REF
+        self.model_package = model_package
+
+        # Executor is created lazily on first submit() call
+        self._executor: Executor | None = None
+        self.tasks: Any = None
+
+    def _get_executor(self) -> Executor:
+        """Get or create the Globus Compute executor (lazy initialization).
+
+        Returns:
+            Executor instance
+
+        Raises:
+            ValueError: If endpoint_id was not provided during initialization
+        """
+        if self._executor is None:
+            if self.endpoint_id is None:
+                raise ValueError(
+                    "endpoint_id is required for remote execution. "
+                    "Either provide endpoint_id during initialization or use .local() method."
+                )
+
+            executor_kwargs = {"endpoint_id": self.endpoint_id}
+            if self.user_endpoint_config:
+                executor_kwargs["user_endpoint_config"] = self.user_endpoint_config
+
+            # Use CombinedCode serialization to send actual function code
+            # rather than module references (avoids needing garden_ai installed remotely)
+            executor_kwargs["serializer"] = ComputeSerializer(
+                strategy_code=CombinedCode()
+            )
+
+            self._executor = Executor(**executor_kwargs)
+
+        return self._executor
+
+    def __enter__(self):
+        """Set up tasks when entering context."""
+        # Initialize tasks - executor will be created lazily when needed
+        # Using a simple namespace object for dot access
+        self.tasks = type(
+            "Tasks",
+            (),
+            {
+                "IS2RE": IS2RETask(
+                    adapter=self,  # Pass adapter instead of executor
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                )
+            },
+        )()
+
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Clean up executor when exiting context."""
+        if self._executor:
+            self._executor.shutdown(wait=True)
+        return False  # Don't suppress exceptions
diff --git a/garden_ai/benchmarks/matbench_discovery/enums.py b/garden_ai/benchmarks/matbench_discovery/enums.py
new file mode 100644
index 00000000..8cc2f99b
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/enums.py
@@ -0,0 +1,15 @@
+"""Enums for Matbench Discovery benchmark tasks."""
+
+from enum import Enum
+
+
+class MatbenchTask(Enum):
+    """Available Matbench Discovery benchmark tasks.
+
+    Currently only IS2RE is implemented for the MVP.
+    Future tasks could include:
+    - RS2RE: Relaxed Structure to Relaxed Energy
+    - S2EFS: Structure to Energy, Forces, and Stress
+    """
+
+    IS2RE = "is2re"  # Initial Structure to Relaxed Energy
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py
new file mode 100644
index 00000000..151b043e
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py
@@ -0,0 +1,83 @@
+"""Test Matbench Discovery benchmark on Anvil HPC with 1000 structures.
+
+This script demonstrates scaling to 1000 structures using 4 GPUs in parallel.
+It's designed to test the multi-GPU parallelization implementation and measure
+throughput before attempting the full dataset.
+"""
+
+from garden_ai.benchmarks import MatbenchDiscovery
+
+ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+
+ENDPOINT_CONFIG = {
+    "account": "cis250461-gpu",  # HPC allocation/account
+    "qos": "gpu",
+    "partition": "gpu-debug",  # Use full partition (not debug) for longer run
+    "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --time=00:30:00\n#SBATCH --mem=32G",
+    "worker_init": "pip install --user uv",  # Install uv on worker startup
+}
+
+MODEL_PACKAGE = "mace-torch"
+MODEL_FACTORY = "mace_mp"
+MODEL_KWARGS = {
+    "model": "medium",
+    "device": "cuda",  # Use GPU on HPC
+    "default_dtype": "float64",
+}
+
+NUM_STRUCTURES = 1000
+
+print("=" * 80)
+print("Matbench Discovery IS2RE Benchmark - 1000 Structures")
+print("=" * 80)
+print(f"Endpoint: {ENDPOINT_ID}")
+print(f"Model: {MODEL_PACKAGE} / {MODEL_FACTORY}")
+print(f"Structures: {NUM_STRUCTURES}")
+print("Multi-GPU: Enabled (2 GPUs)")
+print("=" * 80)
+
+with MatbenchDiscovery(
+    endpoint_id=ENDPOINT_ID,
+    user_endpoint_config=ENDPOINT_CONFIG,
+) as bench:
+    task = bench.tasks.IS2RE
+
+    future = task.submit(
+        model_package=MODEL_PACKAGE,
+        model_factory=MODEL_FACTORY,
+        model_kwargs=MODEL_KWARGS,
+        num_structures=NUM_STRUCTURES,
+        use_multi_gpu=True,  # Enable multi-GPU parallelization
+    )
+
+    print("\nJob submitted! Waiting for results...")
+    print("This may take a while. You can monitor progress in the Globus Compute logs.")
+    print()
+
+    try:
+        result = future.result()
+        metrics = task.calculate_metrics(result)
+
+        print("\nResults:")
+        print("=" * 80)
+        for key, value in metrics.items():
+            print(f"  {key}: {value}")
+
+        print("=" * 80)
+        print("\nRaw Results:")
+        print(f"  Converged: {result['num_converged']}")
+        print(f"  Failed: {len(result.get('failed_indices', []))}")
+        if result.get("energies"):
+            valid_energies = [e for e in result["energies"] if e is not None]
+            if valid_energies:
+                print(f"  Sample energies: {valid_energies[:3]}")
+
+        # Calculate and display throughput
+        if "num_converged" in result and result["num_converged"] > 0:
+            print("\nPerformance:")
+            print(f"  Success rate: {metrics.get('success_rate', 0):.1%}")
+            print("  Note: Check job logs for detailed throughput (structures/hour)")
+
+    except Exception as e:
+        print(f"\n[ERROR] Benchmark failed: {e}")
+        raise
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py
new file mode 100644
index 00000000..3b2912f8
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py
@@ -0,0 +1,31 @@
+"""Test Matbench Discovery benchmark locally."""
+
+from garden_ai.benchmarks import MatbenchDiscovery
+
+print("Matbench Discovery IS2RE Benchmark")
+print("=" * 80)
+
+with MatbenchDiscovery() as bench:
+    task = bench.tasks.IS2RE
+
+    # Run benchmark locally
+    result = task.local(
+        model_package="mace-torch",
+        model_factory="mace_mp",
+        model_kwargs={
+            "model": "medium",
+            "device": "cpu",
+            "default_dtype": "float32",
+        },
+        num_structures=10,
+    )
+
+    # Calculate metrics
+    metrics = task.calculate_metrics(result)
+
+    # Display results
+    print("\nResults:")
+    print("=" * 80)
+    for key, value in metrics.items():
+        print(f"  {key}: {value}")
+    print("=" * 80)
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py
new file mode 100644
index 00000000..9f9bfd8c
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py
@@ -0,0 +1,46 @@
+"""Test Matbench Discovery benchmark locally on Mac.
+
+This script tests the benchmark implementation locally. Note that MPS (Apple Silicon
+GPU) is not compatible with MACE model checkpoints which use float64, so this runs
+on CPU. This is still useful for verifying the workflow works before using Anvil.
+"""
+
+from garden_ai.benchmarks import MatbenchDiscovery
+
+print("=" * 80)
+print("Matbench Discovery Local Test")
+print("=" * 80)
+
+# Run benchmark locally with MPS acceleration
+with MatbenchDiscovery() as bench:
+    task = bench.tasks.IS2RE
+
+    print("\nRunning local benchmark...")
+    print("Note: Using CPU because MACE model checkpoints use float64,")
+    print("which is not supported by MPS. This is still useful for testing")
+    print("the workflow before running on Anvil with CUDA.\n")
+
+    result = task.local(
+        model_package="mace-torch",
+        model_factory="mace_mp",
+        model_kwargs={
+            "model": "medium",
+            "device": "cpu",  # MPS doesn't support float64 used by MACE checkpoints
+            "default_dtype": "float32",
+        },
+        num_structures=10,  # Small test to verify workflow
+        use_multi_gpu=False,
+    )
+
+    # Calculate metrics
+    metrics = task.calculate_metrics(result)
+
+    # Display results
+    print("\nResults:")
+    print("=" * 80)
+    for key, value in metrics.items():
+        print(f"  {key}: {value}")
+    print("=" * 80)
+
+    print("\nLocal test complete!")
+    print("If this works, you can proceed with confidence to run on Anvil.")
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py
new file mode 100644
index 00000000..08be5dca
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py
@@ -0,0 +1,59 @@
+"""Test Matbench Discovery benchmark on remote a HPC endpoint."""
+
+from garden_ai.benchmarks import MatbenchDiscovery
+
+ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+
+ENDPOINT_CONFIG = {
+    "account": "cis250461-gpu",  # HPC allocation/account
+    "qos": "gpu",
+    "partition": "gpu-debug",  # SLURM partition
+    "scheduler_options": "#SBATCH --gpus-per-node=4",  # Request 4 GPUs
+    "worker_init": "pip install --user uv",  # Install uv on worker startup
+}
+
+MODEL_PACKAGE = "mace-torch"
+MODEL_FACTORY = "mace_mp"
+MODEL_KWARGS = {
+    "model": "medium",
+    "device": "cuda",  # Use GPU on HPC
+    "default_dtype": "float32",
+}
+
+NUM_STRUCTURES = 100  # Increased from 10 to test multi-GPU parallelization
+
+with MatbenchDiscovery(
+    endpoint_id=ENDPOINT_ID,
+    user_endpoint_config=ENDPOINT_CONFIG,
+) as bench:
+    task = bench.tasks.IS2RE
+
+    future = task.submit(
+        model_package=MODEL_PACKAGE,
+        model_factory=MODEL_FACTORY,
+        model_kwargs=MODEL_KWARGS,
+        num_structures=NUM_STRUCTURES,
+        use_multi_gpu=True,  # Enable multi-GPU parallelization
+    )
+
+    try:
+        result = future.result()
+        metrics = task.calculate_metrics(result)
+
+        print("\nResults:")
+        print("=" * 80)
+        for key, value in metrics.items():
+            print(f"  {key}: {value}")
+
+        print("=" * 80)
+        print("\nRaw Results:")
+        print(f"  Converged: {result['num_converged']}")
+        print(f"  Failed: {len(result.get('failed_indices', []))}")
+        if result.get("energies"):
+            valid_energies = [e for e in result["energies"] if e is not None]
+            if valid_energies:
+                print(f"  Sample energies: {valid_energies[:3]}")
+
+    except Exception as e:
+        print(f"\n[ERROR] Benchmark failed: {e}")
+        raise
diff --git a/garden_ai/benchmarks/matbench_discovery/remote_runner.py b/garden_ai/benchmarks/matbench_discovery/remote_runner.py
new file mode 100644
index 00000000..109c9944
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/remote_runner.py
@@ -0,0 +1,470 @@
+"""Remote execution functions for Matbench Discovery benchmarks.
+
+These functions are serialized and executed on Globus Compute endpoints.
+They handle environment setup, dependency installation, and benchmark execution.
+"""
+
+
+def run_matbench_is2re(
+    repo_url: str,
+    repo_ref: str,
+    model_package: str,
+    model_factory: str,
+    model_kwargs: dict,
+    model_checkpoint: str | None,
+    num_structures: int,
+    use_multi_gpu: bool = True,
+) -> dict:
+    """Run Matbench IS2RE benchmark on remote Globus Compute endpoint.
+
+    This function performs the complete benchmark workflow:
+    1. Set up Python environment with UV
+    2. Install dependencies (matbench-discovery + model package)
+    3. Execute benchmark runner script in the environment
+    4. Return results
+
+    Args:
+        repo_url: GitHub URL for matbench-discovery repo
+        repo_ref: Git branch/tag/commit to checkout
+        model_package: Python package name to install (e.g., "mace-torch")
+        model_factory: Function or class name to create model (e.g., "mace_mp", "MACE")
+        model_kwargs: Dictionary of kwargs to pass when creating model
+        model_checkpoint: Path/URL to model checkpoint file (optional)
+        num_structures: Number of test structures to run (subset for MVP)
+        use_multi_gpu: If True, automatically detect and use all available GPUs
+                      in parallel. If False, use single GPU/CPU. (default: True)
+
+    Returns:
+        Dictionary with benchmark results:
+            - energies: List of final energies (None for failed relaxations)
+            - num_converged: Count of successful relaxations
+            - failed_indices: List of structure indices that failed
+
+    Raises:
+        RuntimeError: If benchmark execution fails
+    """
+    # All imports must be inside the function for CombinedCode serialization
+    import json
+    import logging
+    import os
+    import subprocess
+    import sys
+    import tempfile
+    from pathlib import Path
+
+    # Configure logging
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        stream=sys.stdout,
+        force=True,
+    )
+    # Ensure stdout is unbuffered
+    if hasattr(sys.stdout, "reconfigure"):
+        sys.stdout.reconfigure(line_buffering=True)
+
+    logger = logging.getLogger(__name__)
+
+    # Create isolated working directory
+    work_dir = Path(tempfile.mkdtemp(prefix="matbench_benchmark_"))
+
+    # This script runs INSIDE the virtual environment
+    BENCHMARK_RUNNER_SCRIPT = '''
+import json
+import sys
+import time
+import logging
+import os
+import concurrent.futures
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] [%(name)s] [PID:%(process)d] %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    stream=sys.stdout,
+    force=True
+)
+logger = logging.getLogger("benchmark_runner")
+
+def setup_device(gpu_id: Optional[int] = None) -> str:
+    """Setup compute device for this process."""
+    import torch
+
+    if gpu_id is not None and torch.cuda.is_available():
+        # Set visible devices to just this GPU to avoid contention
+        # and ensure model uses the correct device
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+        return "cuda:0"
+    elif torch.cuda.is_available():
+        return "cuda"
+    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        return "mps"
+    else:
+        return "cpu"
+
+def process_batch(
+    batch_id: int,
+    structures: List[Any],
+    start_idx: int,
+    model_config: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Process a batch of structures on a specific device."""
+
+    # Setup logging for this worker
+    worker_logger = logging.getLogger(f"worker_{batch_id}")
+    worker_logger.setLevel(logging.INFO)
+
+    gpu_id = model_config.get("gpu_id")
+    device = setup_device(gpu_id)
+    worker_logger.info(f"Worker {batch_id} started on {device} with {len(structures)} structures")
+
+    # Initialize model
+    try:
+        import importlib
+
+        package_name = model_config["package"]
+        factory_name = model_config["factory"]
+        kwargs = model_config["kwargs"].copy()
+        checkpoint = model_config.get("checkpoint")
+
+        # Update device in kwargs
+        if "device" in kwargs:
+            kwargs["device"] = device
+
+        # Import factory
+        module_parts = package_name.split(".")
+        if len(module_parts) > 1:
+            module = importlib.import_module(package_name)
+            factory = getattr(module, factory_name)
+        else:
+            base_module = module_parts[0].split("-")[0]
+            try:
+                module = importlib.import_module(f"{base_module}.calculators")
+                factory = getattr(module, factory_name)
+            except (ImportError, AttributeError):
+                module = importlib.import_module(base_module)
+                factory = getattr(module, factory_name)
+
+        # Create model
+        model = factory(**kwargs)
+
+        # Load checkpoint
+        if checkpoint and checkpoint != "None":
+            if hasattr(model, "load_checkpoint"):
+                model.load_checkpoint(checkpoint)
+            elif hasattr(model, "load_state_dict"):
+                import torch
+                model.load_state_dict(torch.load(checkpoint))
+
+    except Exception as e:
+        worker_logger.error(f"Failed to initialize model: {e}")
+        return {
+            "energies": [None] * len(structures),
+            "num_converged": 0,
+            "failed_indices": [start_idx + i for i in range(len(structures))],
+            "error": str(e)
+        }
+
+    # Run relaxations
+    from ase.optimize import FIRE
+
+    energies = []
+    failed_indices = []
+    num_converged = 0
+
+    batch_start = time.time()
+
+    for i, atoms in enumerate(structures):
+        global_idx = start_idx + i
+        try:
+            atoms.calc = model
+            opt = FIRE(atoms, logfile=None)
+            opt.run(fmax=0.05, steps=500)
+
+            energy = atoms.get_potential_energy()
+            energies.append(energy)
+            num_converged += 1
+
+            # Log progress occasionally
+            if (i + 1) % 10 == 0:
+                elapsed = time.time() - batch_start
+                rate = (i + 1) / elapsed if elapsed > 0 else 0
+                eta = (len(structures) - i - 1) / rate if rate > 0 else 0
+                worker_logger.info(
+                    f"Progress: {i+1}/{len(structures)} "
+                    f"({rate:.2f} struct/s, ETA: {eta/60:.1f}m)"
+                )
+
+        except Exception as e:
+            worker_logger.warning(f"Structure {global_idx} failed: {e}")
+            energies.append(None)
+            failed_indices.append(global_idx)
+
+    return {
+        "energies": energies,
+        "num_converged": num_converged,
+        "failed_indices": failed_indices
+    }
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python benchmark_runner.py <config_file>")
+        sys.exit(1)
+
+    config_path = sys.argv[1]
+    with open(config_path) as f:
+        config = json.load(f)
+
+    logger.info("Starting benchmark runner...")
+
+    # Load structures
+    logger.info("Loading structures...")
+    try:
+        from matbench_discovery.data import DataFiles
+        from zipfile import ZipFile
+        from ase.io import read
+        from io import TextIOWrapper
+
+        structures = []
+        zip_path = DataFiles.wbm_initial_atoms.path
+        num_structures = config["num_structures"]
+
+        with ZipFile(zip_path, 'r') as zf:
+            file_list = sorted(
+                zf.namelist(),
+                key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else float('inf')
+            )
+            for i, filename in enumerate(file_list[:num_structures]):
+                with zf.open(filename) as f:
+                    text_stream = TextIOWrapper(f, encoding='utf-8')
+                    atoms = read(text_stream, format='extxyz')
+                    structures.append(atoms)
+
+        logger.info(f"Loaded {len(structures)} structures")
+
+    except Exception as e:
+        logger.error(f"Failed to load structures: {e}")
+        sys.exit(1)
+
+    # Determine parallelization strategy
+    import torch
+    num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
+
+    results = {
+        "energies": [],
+        "num_converged": 0,
+        "failed_indices": []
+    }
+
+    start_time = time.time()
+
+    if use_multi_gpu:
+        logger.info(f"Running on {num_gpus} GPUs in parallel")
+
+        # Split structures
+        batch_size = len(structures) // num_gpus
+        futures = []
+
+        # Use 'spawn' start method for CUDA compatibility
+        import multiprocessing
+        ctx = multiprocessing.get_context('spawn')
+
+        with concurrent.futures.ProcessPoolExecutor(max_workers=num_gpus, mp_context=ctx) as executor:
+            for i in range(num_gpus):
+                start_idx = i * batch_size
+                end_idx = len(structures) if i == num_gpus - 1 else (i + 1) * batch_size
+                batch_structures = structures[start_idx:end_idx]
+
+                model_config = {
+                    "package": config["model_package"],
+                    "factory": config["model_factory"],
+                    "kwargs": config["model_kwargs"],
+                    "checkpoint": config["model_checkpoint"],
+                    "gpu_id": i
+                }
+
+                futures.append(
+                    executor.submit(
+                        process_batch,
+                        i,
+                        batch_structures,
+                        start_idx,
+                        model_config
+                    )
+                )
+
+            # Collect results
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    batch_res = future.result()
+                    results["energies"].extend(batch_res["energies"])
+                    results["num_converged"] += batch_res["num_converged"]
+                    results["failed_indices"].extend(batch_res["failed_indices"])
+                except Exception as e:
+                    logger.error(f"Worker failed: {e}")
+
+    else:
+        logger.info("Running in single process")
+        model_config = {
+            "package": config["model_package"],
+            "factory": config["model_factory"],
+            "kwargs": config["model_kwargs"],
+            "checkpoint": config["model_checkpoint"],
+            # No gpu_id means let model decide or use default
+        }
+
+        batch_res = process_batch(0, structures, 0, model_config)
+        results = batch_res
+
+    elapsed = time.time() - start_time
+    logger.info(f"Benchmark complete in {elapsed:.1f}s")
+    logger.info(f"Converged: {results['num_converged']}/{len(structures)}")
+
+    # Save results
+    with open("results.json", "w") as f:
+        json.dump(results, f, indent=2)
+
+if __name__ == "__main__":
+    main()
+'''
+
+    try:
+        # ----------------------------------------------------------------------
+        # 1. ENVIRONMENT SETUP
+        # ----------------------------------------------------------------------
+        logger.info("Step 1/4: Setting up environment...")
+
+        uv_bin = (
+            subprocess.run(
+                ["python", "-c", "import uv; print(uv.find_uv_bin())"],
+                capture_output=True,
+            )
+            .stdout.decode("utf-8")
+            .strip()
+        )
+
+        # Create UV virtual environment
+        subprocess.run(
+            [uv_bin, "venv", "--python", "3.11"],
+            cwd=work_dir,
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+
+        venv_python = work_dir / ".venv/bin/python"
+        if not venv_python.exists():
+            # Windows path
+            venv_python = work_dir / ".venv/Scripts/python.exe"
+
+        if not venv_python.exists():
+            raise RuntimeError(f"Virtual environment python not found at {venv_python}")
+
+        # Install matbench-discovery and model package
+        logger.info("Installing dependencies...")
+        subprocess.run(
+            [
+                uv_bin,
+                "pip",
+                "install",
+                "--python",
+                str(venv_python),
+                "matbench-discovery",
+            ],
+            cwd=work_dir,
+            check=True,
+        )
+        subprocess.run(
+            [uv_bin, "pip", "install", "--python", str(venv_python), model_package],
+            cwd=work_dir,
+            check=True,
+        )
+
+        # Set SSL cert file to certifi's CA bundle to fix HPC SSL verification issues
+        env = dict(os.environ)
+        env["MBD_AUTO_DOWNLOAD_FILES"] = "true"
+
+        try:
+            certifi_path = subprocess.run(
+                [str(venv_python), "-c", "import certifi; print(certifi.where())"],
+                capture_output=True,
+                text=True,
+                check=True,
+            ).stdout.strip()
+            env["SSL_CERT_FILE"] = certifi_path
+        except Exception as e:
+            logger.warning(f"Failed to set SSL_CERT_FILE: {e}")
+
+        # ----------------------------------------------------------------------
+        # 2. PREPARE BENCHMARK SCRIPT
+        # ----------------------------------------------------------------------
+        logger.info("Step 2/4: Preparing benchmark script...")
+
+        # Write runner script
+        runner_path = work_dir / "benchmark_runner.py"
+        runner_path.write_text(BENCHMARK_RUNNER_SCRIPT)
+
+        # Write config
+        config = {
+            "repo_url": repo_url,
+            "repo_ref": repo_ref,
+            "model_package": model_package,
+            "model_factory": model_factory,
+            "model_kwargs": model_kwargs,
+            "model_checkpoint": model_checkpoint,
+            "num_structures": num_structures,
+            "use_multi_gpu": use_multi_gpu,
+        }
+
+        config_path = work_dir / "config.json"
+        with open(config_path, "w") as f:
+            json.dump(config, f, indent=2)
+
+        # ----------------------------------------------------------------------
+        # 3. EXECUTE BENCHMARK
+        # ----------------------------------------------------------------------
+        logger.info("Step 3/4: Executing benchmark...")
+
+        # Run the runner script inside the venv
+        # We stream output directly to stdout so the user sees progress
+        proc = subprocess.run(
+            [str(venv_python), str(runner_path), str(config_path)],
+            cwd=work_dir,
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+            check=False,  # We check return code manually
+        )
+
+        if proc.returncode != 0:
+            raise RuntimeError(
+                f"Benchmark runner failed with return code {proc.returncode}"
+            )
+
+        # ----------------------------------------------------------------------
+        # 4. COLLECT RESULTS
+        # ----------------------------------------------------------------------
+        logger.info("Step 4/4: Collecting results...")
+
+        results_path = work_dir / "results.json"
+        if not results_path.exists():
+            raise RuntimeError(
+                "Results file not found - benchmark may have crashed silently"
+            )
+
+        with open(results_path) as f:
+            results = json.load(f)
+
+        logger.info("Benchmark completed successfully.")
+        return results
+
+    finally:
+        # Cleanup working directory
+        import shutil
+
+        shutil.rmtree(work_dir, ignore_errors=True)
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
new file mode 100644
index 00000000..f35b1306
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -0,0 +1,321 @@
+"""Matbench Discovery benchmark task implementations."""
+
+from typing import TYPE_CHECKING, Any
+
+from .remote_runner import run_matbench_is2re
+
+if TYPE_CHECKING:
+    from . import MatbenchDiscovery
+
+
+class IS2RETask:
+    """Initial Structure to Relaxed Energy benchmark task.
+
+    This task evaluates a model's ability to predict the relaxed energy
+    and geometry of crystal structures starting from unrelaxed initial
+    configurations.
+
+    The task:
+    1. Loads initial (unrelaxed) structures from the WBM test set
+    2. Uses the model to perform geometry optimization
+    3. Records final energies and relaxed structures
+    4. Calculates metrics comparing to DFT ground truth
+    """
+
+    def __init__(
+        self,
+        adapter: "MatbenchDiscovery",
+        repo_url: str,
+        repo_ref: str,
+        model_package: str | None = None,
+    ):
+        """Initialize IS2RE task.
+
+        Args:
+            adapter: MatbenchDiscovery adapter instance
+            repo_url: Matbench Discovery repository URL
+            repo_ref: Git ref (branch/tag/commit) to use
+            model_package: Default model package to install (can override in submit)
+        """
+        self.adapter = adapter
+        self.repo_url = repo_url
+        self.repo_ref = repo_ref
+        self.model_package = model_package
+        self.name = "IS2RE"
+
+    def submit(
+        self,
+        model=None,
+        num_structures: int = 100,
+        model_package: str | None = None,
+        model_factory: str | None = None,
+        model_kwargs: dict | None = None,
+        use_multi_gpu: bool = True,
+    ):
+        """Submit IS2RE benchmark job to remote executor.
+
+        You can specify the model in two ways:
+        1. Pass a local model instance (will introspect to get remote construction info)
+        2. Explicitly specify model_package and model_factory
+
+        Args:
+            model: (Optional) Local model instance. If provided, will extract
+                   package, class, and checkpoint information from it.
+            num_structures: Number of test structures to evaluate (default: 100).
+                           Full test set has ~257k structures. Use smaller values
+                           for quick testing.
+            model_package: Python package name to install (e.g., "mace-torch").
+                          Required if model is None.
+            model_factory: How to instantiate the model on remote. Can be:
+                          - Function name: "mace_mp" (will call as function)
+                          - Class name: "MACE" (will instantiate as class)
+                          Required if model is None.
+            model_kwargs: Dictionary of kwargs to pass when creating model remotely.
+                         Example: {"model": "medium", "device": "cuda"}
+            use_multi_gpu: If True, automatically detect and use all available GPUs
+                          in parallel for faster processing. If False, use single
+                          GPU/CPU. (default: True)
+
+        Returns:
+            Future object that will contain benchmark results when complete.
+            Call .result() to block and wait for completion.
+
+        Examples:
+            Using local model instance:
+            >>> from mace.calculators import mace_mp
+            >>> model = mace_mp(model="medium")
+            >>> future = task.submit(model, num_structures=50)
+
+            Specifying remote construction explicitly:
+            >>> future = task.submit(
+            ...     model_package="mace-torch",
+            ...     model_factory="mace_mp",
+            ...     model_kwargs={"model": "medium", "device": "cuda"},
+            ...     num_structures=50,
+            ...     use_multi_gpu=True
+            ... )
+        """
+        # Determine how to construct model remotely
+        if model is not None:
+            # Extract info from local model instance
+            if model_package is None:
+                if self.model_package is not None:
+                    model_package = self.model_package
+                else:
+                    # Infer from model's module
+                    model_package = model.__class__.__module__.split(".")[0]
+
+            if model_factory is None:
+                model_factory = model.__class__.__name__
+
+            # Get checkpoint path if model has one
+            model_checkpoint = None
+            if hasattr(model, "checkpoint_path"):
+                model_checkpoint = model.checkpoint_path
+            elif hasattr(model, "checkpoint"):
+                model_checkpoint = model.checkpoint
+
+            # Try to extract initialization kwargs if available
+            if model_kwargs is None and hasattr(model, "_init_kwargs"):
+                model_kwargs = model._init_kwargs
+
+        else:
+            # Must provide explicit construction info
+            if model_package is None or model_factory is None:
+                raise ValueError(
+                    "If model is not provided, must specify both "
+                    "model_package and model_factory"
+                )
+            model_checkpoint = None
+
+        if model_kwargs is None:
+            model_kwargs = {}
+
+        # Get executor (will create if needed) and submit remote execution
+        executor = self.adapter._get_executor()
+        future = executor.submit(
+            run_matbench_is2re,
+            repo_url=self.repo_url,
+            repo_ref=self.repo_ref,
+            model_package=model_package,
+            model_factory=model_factory,
+            model_kwargs=model_kwargs,
+            model_checkpoint=model_checkpoint,
+            num_structures=num_structures,
+            use_multi_gpu=use_multi_gpu,
+        )
+
+        return future
+
+    def local(
+        self,
+        model=None,
+        num_structures: int = 100,
+        model_package: str | None = None,
+        model_factory: str | None = None,
+        model_kwargs: dict | None = None,
+        use_multi_gpu: bool = True,
+    ) -> dict:
+        """Run benchmark locally in ephemeral UV environment.
+
+        This executes the same benchmark workflow locally instead of submitting
+        to a remote Globus Compute endpoint. Useful for testing and development.
+
+        Args:
+            model: Optional local model instance to extract metadata from
+            num_structures: Number of test structures to evaluate
+            model_package: Python package name to install (e.g., "mace-torch")
+            model_factory: Function or class name to create model
+            model_kwargs: Dictionary of kwargs for model creation
+            use_multi_gpu: If True, automatically detect and use all available GPUs
+                          in parallel. If False, use single GPU/CPU. (default: True)
+
+        Returns:
+            Dictionary with benchmark results (same format as remote execution)
+
+        Example:
+            >>> results = task.local(
+            ...     model_package="mace-torch",
+            ...     model_factory="mace_mp",
+            ...     model_kwargs={"model": "medium", "device": "cpu"},
+            ...     num_structures=10,
+            ...     use_multi_gpu=False
+            ... )
+        """
+        import json
+        import subprocess
+        import tempfile
+        from pathlib import Path
+
+        # Extract model metadata if model instance provided
+        if model is not None:
+            if model_package is None:
+                if self.model_package is not None:
+                    model_package = self.model_package
+                else:
+                    model_package = model.__class__.__module__.split(".")[0]
+
+            if model_factory is None:
+                model_factory = model.__class__.__name__
+
+            model_checkpoint = None
+            if hasattr(model, "checkpoint_path"):
+                model_checkpoint = model.checkpoint_path
+            elif hasattr(model, "checkpoint"):
+                model_checkpoint = model.checkpoint
+
+            if model_kwargs is None and hasattr(model, "_init_kwargs"):
+                model_kwargs = model._init_kwargs
+        else:
+            if model_package is None or model_factory is None:
+                raise ValueError(
+                    "If model is not provided, must specify both "
+                    "model_package and model_factory"
+                )
+            model_checkpoint = None
+
+        if model_kwargs is None:
+            model_kwargs = {}
+
+        # Run benchmark in subprocess with isolated environment
+        import sys
+
+        config = {
+            "repo_url": self.repo_url,
+            "repo_ref": self.repo_ref,
+            "model_package": model_package,
+            "model_factory": model_factory,
+            "model_kwargs": model_kwargs,
+            "model_checkpoint": model_checkpoint,
+            "num_structures": num_structures,
+            "use_multi_gpu": use_multi_gpu,
+        }
+
+        results_file_path = (
+            Path(tempfile.gettempdir()) / f"benchmark_results_{id(config)}.json"
+        )
+
+        wrapper_script = f'''
+import json
+from garden_ai.benchmarks.matbench_discovery.remote_runner import run_matbench_is2re
+
+config = {repr(config)}
+results = run_matbench_is2re(**config)
+
+with open("{results_file_path}", "w") as f:
+    json.dump(results, f, indent=2)
+'''
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(wrapper_script)
+            wrapper_path = f.name
+
+        try:
+            # Run without capturing output so logs stream to console in real-time
+            result = subprocess.run(
+                [sys.executable, wrapper_path],
+                timeout=3600,
+                # Don't capture output - let it stream to console
+                stdout=None,
+                stderr=None,
+            )
+
+            if result.returncode != 0:
+                raise RuntimeError(
+                    f"Local benchmark failed with return code {result.returncode}"
+                )
+
+            if not results_file_path.exists():
+                raise RuntimeError(
+                    f"Benchmark results file not found at {results_file_path}"
+                )
+
+            with open(results_file_path) as f:
+                return json.load(f)
+
+        finally:
+            Path(wrapper_path).unlink(missing_ok=True)
+            results_file_path.unlink(missing_ok=True)
+
+    def calculate_metrics(self, outputs: dict) -> dict[str, Any]:
+        """Calculate benchmark metrics from raw outputs.
+
+        For MVP, this returns basic statistics. Future versions will compare
+        against DFT ground truth and calculate proper benchmark metrics like
+        F1 score, discovery yield, etc.
+
+        Args:
+            outputs: Dictionary from remote execution containing:
+                - energies: List of relaxed energies
+                - num_converged: Number of successful relaxations
+                - failed_indices: Indices of failed structures
+
+        Returns:
+            Dictionary of calculated metrics:
+                - num_attempted: Total structures attempted
+                - num_converged: Number of successful relaxations
+                - success_rate: Fraction of successful relaxations
+                - mean_energy: Average final energy (eV/atom, if available)
+                - num_failed: Count of failed relaxations
+        """
+        energies = outputs.get("energies", [])
+        num_converged = outputs.get("num_converged", 0)
+        failed_indices = outputs.get("failed_indices", [])
+
+        # Filter out None values (failed relaxations)
+        valid_energies = [e for e in energies if e is not None]
+
+        metrics = {
+            "num_attempted": len(energies),
+            "num_converged": num_converged,
+            "num_failed": len(failed_indices),
+            "success_rate": num_converged / len(energies) if energies else 0.0,
+        }
+
+        # Calculate energy statistics if we have valid results
+        if valid_energies:
+            metrics["mean_energy"] = sum(valid_energies) / len(valid_energies)
+            metrics["min_energy"] = min(valid_energies)
+            metrics["max_energy"] = max(valid_energies)
+
+        return metrics

From 5653925e9ab77ac16d7599ba9a9d6ffee9260a8f Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Fri, 21 Nov 2025 09:46:20 -0700
Subject: [PATCH 02/23] multi-gpu setup working nicely

---
 .../examples/matbench_1000_structures.py      |  83 ------
 .../examples/matbench_mace_multi_gpu.py       |  97 +++++++
 .../matbench_discovery/remote_runner.py       | 262 ++++++++----------
 .../benchmarks/matbench_discovery/tasks.py    |   2 +
 4 files changed, 214 insertions(+), 230 deletions(-)
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py
deleted file mode 100644
index 151b043e..00000000
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""Test Matbench Discovery benchmark on Anvil HPC with 1000 structures.
-
-This script demonstrates scaling to 1000 structures using 4 GPUs in parallel.
-It's designed to test the multi-GPU parallelization implementation and measure
-throughput before attempting the full dataset.
-"""
-
-from garden_ai.benchmarks import MatbenchDiscovery
-
-ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
-
-ENDPOINT_CONFIG = {
-    "account": "cis250461-gpu",  # HPC allocation/account
-    "qos": "gpu",
-    "partition": "gpu-debug",  # Use full partition (not debug) for longer run
-    "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --time=00:30:00\n#SBATCH --mem=32G",
-    "worker_init": "pip install --user uv",  # Install uv on worker startup
-}
-
-MODEL_PACKAGE = "mace-torch"
-MODEL_FACTORY = "mace_mp"
-MODEL_KWARGS = {
-    "model": "medium",
-    "device": "cuda",  # Use GPU on HPC
-    "default_dtype": "float64",
-}
-
-NUM_STRUCTURES = 1000
-
-print("=" * 80)
-print("Matbench Discovery IS2RE Benchmark - 1000 Structures")
-print("=" * 80)
-print(f"Endpoint: {ENDPOINT_ID}")
-print(f"Model: {MODEL_PACKAGE} / {MODEL_FACTORY}")
-print(f"Structures: {NUM_STRUCTURES}")
-print("Multi-GPU: Enabled (2 GPUs)")
-print("=" * 80)
-
-with MatbenchDiscovery(
-    endpoint_id=ENDPOINT_ID,
-    user_endpoint_config=ENDPOINT_CONFIG,
-) as bench:
-    task = bench.tasks.IS2RE
-
-    future = task.submit(
-        model_package=MODEL_PACKAGE,
-        model_factory=MODEL_FACTORY,
-        model_kwargs=MODEL_KWARGS,
-        num_structures=NUM_STRUCTURES,
-        use_multi_gpu=True,  # Enable multi-GPU parallelization
-    )
-
-    print("\nJob submitted! Waiting for results...")
-    print("This may take a while. You can monitor progress in the Globus Compute logs.")
-    print()
-
-    try:
-        result = future.result()
-        metrics = task.calculate_metrics(result)
-
-        print("\nResults:")
-        print("=" * 80)
-        for key, value in metrics.items():
-            print(f"  {key}: {value}")
-
-        print("=" * 80)
-        print("\nRaw Results:")
-        print(f"  Converged: {result['num_converged']}")
-        print(f"  Failed: {len(result.get('failed_indices', []))}")
-        if result.get("energies"):
-            valid_energies = [e for e in result["energies"] if e is not None]
-            if valid_energies:
-                print(f"  Sample energies: {valid_energies[:3]}")
-
-        # Calculate and display throughput
-        if "num_converged" in result and result["num_converged"] > 0:
-            print("\nPerformance:")
-            print(f"  Success rate: {metrics.get('success_rate', 0):.1%}")
-            print("  Note: Check job logs for detailed throughput (structures/hour)")
-
-    except Exception as e:
-        print(f"\n[ERROR] Benchmark failed: {e}")
-        raise
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
new file mode 100644
index 00000000..475ef079
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
@@ -0,0 +1,97 @@
+"""Test Matbench Discovery benchmark on Anvil HPC.
+
+This script demonstrates running the IS2RE benchmark with a subset of structures
+using multi-GPU parallelization on a Globus Compute endpoint.
+"""
+
+from garden_ai.benchmarks import MatbenchDiscovery
+
+# ------------------------------------------------------------------------------
+# Configuration
+# ------------------------------------------------------------------------------
+
+# Globus Compute Endpoint ID (Anvil HPC)
+ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+
+# Job Configuration
+NUM_GPUS = 2
+ENDPOINT_CONFIG = {
+    "account": "cis250461-gpu",
+    "qos": "gpu",
+    "partition": "gpu",
+    "scheduler_options": f"#SBATCH --gpus-per-node={NUM_GPUS}\n#SBATCH --time=00:30:00\n",
+    "cores_per_node": 32,
+    "mem_per_node": 32,  # GB
+    "worker_init": "pip install --user uv",  # Ensure uv is available
+}
+
+# Model Configuration
+MODEL_PACKAGE = "mace-torch"
+MODEL_FACTORY = "mace_mp"
+MODEL_KWARGS = {
+    "model": "medium",
+    "device": "cuda",
+    "default_dtype": "float64",
+}
+
+# Benchmark Configuration
+NUM_STRUCTURES = 500
+
+
+def main():
+    print("=" * 80)
+    print("Matbench Discovery IS2RE Benchmark")
+    print("=" * 80)
+    print(f"Endpoint:   {ENDPOINT_ID}")
+    print(f"Model:      {MODEL_PACKAGE} / {MODEL_FACTORY}")
+    print(f"Structures: {NUM_STRUCTURES}")
+    print(f"Resources:  {NUM_GPUS} GPUs (Multi-GPU Enabled)")
+    print("=" * 80)
+
+    with MatbenchDiscovery(
+        endpoint_id=ENDPOINT_ID,
+        user_endpoint_config=ENDPOINT_CONFIG,
+    ) as bench:
+        task = bench.tasks.IS2RE
+
+        print("\nSubmitting task to endpoint...")
+        future = task.submit(
+            model_package=MODEL_PACKAGE,
+            model_factory=MODEL_FACTORY,
+            model_kwargs=MODEL_KWARGS,
+            num_structures=NUM_STRUCTURES,
+            use_multi_gpu=True,
+        )
+
+        print("Job submitted! Waiting for results (this may take a while)...")
+
+        try:
+            result = future.result()
+            metrics = task.calculate_metrics(result)
+
+            print("\n" + "=" * 80)
+            print("Benchmark Results")
+            print("=" * 80)
+
+            # Print primary metrics
+            for key, value in metrics.items():
+                print(f"{key:<20}: {value}")
+
+            print("-" * 80)
+            print(f"Converged: {result['num_converged']} / {NUM_STRUCTURES}")
+            print(f"Failed:    {len(result.get('failed_indices', []))}")
+
+            if result.get("energies"):
+                valid_energies = [e for e in result["energies"] if e is not None]
+                if valid_energies:
+                    print(f"Sample energies:   {valid_energies[:3]} ...")
+
+            print("=" * 80)
+
+        except Exception as e:
+            print(f"\n[ERROR] Benchmark failed: {e}")
+            raise
+
+
+if __name__ == "__main__":
+    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/remote_runner.py b/garden_ai/benchmarks/matbench_discovery/remote_runner.py
index 109c9944..432d43cc 100644
--- a/garden_ai/benchmarks/matbench_discovery/remote_runner.py
+++ b/garden_ai/benchmarks/matbench_discovery/remote_runner.py
@@ -55,12 +55,10 @@ def run_matbench_is2re(
     # Configure logging
     logging.basicConfig(
         level=logging.INFO,
-        format="%(asctime)s [%(levelname)s] %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
         stream=sys.stdout,
         force=True,
+        format="%(asctime)s [%(levelname)s] %(message)s",
     )
-    # Ensure stdout is unbuffered
     if hasattr(sys.stdout, "reconfigure"):
         sys.stdout.reconfigure(line_buffering=True)
 
@@ -77,13 +75,21 @@ def run_matbench_is2re(
 import logging
 import os
 import concurrent.futures
+import importlib
 from pathlib import Path
 from typing import List, Dict, Any, Optional
+from zipfile import ZipFile
+from io import TextIOWrapper
+
+import torch
+from ase.io import read
+from ase.optimize import FIRE
+from matbench_discovery.data import DataFiles
 
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
-    format='%(asctime)s [%(levelname)s] [%(name)s] [PID:%(process)d] %(message)s',
+    format='%(asctime)s [%(levelname)s] [PID:%(process)d] %(message)s',
     datefmt='%Y-%m-%d %H:%M:%S',
     stream=sys.stdout,
     force=True
@@ -92,74 +98,72 @@ def run_matbench_is2re(
 
 def setup_device(gpu_id: Optional[int] = None) -> str:
     """Setup compute device for this process."""
-    import torch
-
-    if gpu_id is not None and torch.cuda.is_available():
-        # Set visible devices to just this GPU to avoid contention
-        # and ensure model uses the correct device
-        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
-        return "cuda:0"
-    elif torch.cuda.is_available():
-        return "cuda"
+    if torch.cuda.is_available():
+        return f"cuda:{gpu_id}" if gpu_id is not None else "cuda"
     elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
         return "mps"
-    else:
-        return "cpu"
+    return "cpu"
 
-def process_batch(
-    batch_id: int,
-    structures: List[Any],
-    start_idx: int,
-    model_config: Dict[str, Any]
-) -> Dict[str, Any]:
-    """Process a batch of structures on a specific device."""
+def load_model(config: Dict[str, Any], device: str):
+    """Initialize the model from configuration."""
+    package_name = config["package"]
+    factory_name = config["factory"]
+    kwargs = config["kwargs"].copy()
+    checkpoint = config.get("checkpoint")
 
-    # Setup logging for this worker
-    worker_logger = logging.getLogger(f"worker_{batch_id}")
-    worker_logger.setLevel(logging.INFO)
-
-    gpu_id = model_config.get("gpu_id")
-    device = setup_device(gpu_id)
-    worker_logger.info(f"Worker {batch_id} started on {device} with {len(structures)} structures")
+    if "device" in kwargs:
+        kwargs["device"] = device
 
-    # Initialize model
+    # Import factory function
+    module_parts = package_name.split(".")
     try:
-        import importlib
-
-        package_name = model_config["package"]
-        factory_name = model_config["factory"]
-        kwargs = model_config["kwargs"].copy()
-        checkpoint = model_config.get("checkpoint")
-
-        # Update device in kwargs
-        if "device" in kwargs:
-            kwargs["device"] = device
-
-        # Import factory
-        module_parts = package_name.split(".")
         if len(module_parts) > 1:
             module = importlib.import_module(package_name)
-            factory = getattr(module, factory_name)
         else:
+            # Try common patterns for model packages
             base_module = module_parts[0].split("-")[0]
             try:
                 module = importlib.import_module(f"{base_module}.calculators")
-                factory = getattr(module, factory_name)
-            except (ImportError, AttributeError):
+            except ImportError:
                 module = importlib.import_module(base_module)
-                factory = getattr(module, factory_name)
 
-        # Create model
-        model = factory(**kwargs)
+        factory = getattr(module, factory_name)
+    except (ImportError, AttributeError) as e:
+        raise ImportError(f"Could not load model factory {factory_name} from {package_name}: {e}")
+
+    # Create model
+    model = factory(**kwargs)
+
+    # Load checkpoint if provided
+    if checkpoint and checkpoint != "None":
+        if hasattr(model, "load_checkpoint"):
+            model.load_checkpoint(checkpoint)
+        elif hasattr(model, "load_state_dict"):
+            model.load_state_dict(torch.load(checkpoint))
+
+    return model
+
+def process_batch(
+    batch_id: int,
+    structures: List[Any],
+    start_idx: int,
+    model_config: Dict[str, Any],
+    num_threads: int
+) -> Dict[str, Any]:
+    """Process a batch of structures on a specific device."""
+
+    # Configure thread limits to avoid contention
+    os.environ["OMP_NUM_THREADS"] = str(num_threads)
+    torch.set_num_threads(num_threads)
 
-        # Load checkpoint
-        if checkpoint and checkpoint != "None":
-            if hasattr(model, "load_checkpoint"):
-                model.load_checkpoint(checkpoint)
-            elif hasattr(model, "load_state_dict"):
-                import torch
-                model.load_state_dict(torch.load(checkpoint))
+    gpu_id = model_config.get("gpu_id")
+    device = setup_device(gpu_id)
 
+    worker_logger = logging.getLogger(f"worker_{batch_id}")
+    worker_logger.info(f"Started on {device} with {len(structures)} structures. Threads: {num_threads}")
+
+    try:
+        model = load_model(model_config, device)
     except Exception as e:
         worker_logger.error(f"Failed to initialize model: {e}")
         return {
@@ -169,13 +173,9 @@ def process_batch(
             "error": str(e)
         }
 
-    # Run relaxations
-    from ase.optimize import FIRE
-
     energies = []
     failed_indices = []
     num_converged = 0
-
     batch_start = time.time()
 
     for i, atoms in enumerate(structures):
@@ -185,19 +185,13 @@ def process_batch(
             opt = FIRE(atoms, logfile=None)
             opt.run(fmax=0.05, steps=500)
 
-            energy = atoms.get_potential_energy()
-            energies.append(energy)
+            energies.append(atoms.get_potential_energy())
             num_converged += 1
 
-            # Log progress occasionally
             if (i + 1) % 10 == 0:
                 elapsed = time.time() - batch_start
                 rate = (i + 1) / elapsed if elapsed > 0 else 0
-                eta = (len(structures) - i - 1) / rate if rate > 0 else 0
-                worker_logger.info(
-                    f"Progress: {i+1}/{len(structures)} "
-                    f"({rate:.2f} struct/s, ETA: {eta/60:.1f}m)"
-                )
+                worker_logger.info(f"Progress: {i+1}/{len(structures)} ({rate:.2f} struct/s)")
 
         except Exception as e:
             worker_logger.warning(f"Structure {global_idx} failed: {e}")
@@ -210,75 +204,69 @@ def process_batch(
         "failed_indices": failed_indices
     }
 
+def load_structures(num_structures: int) -> List[Any]:
+    """Load structures from the Matbench Discovery dataset."""
+    structures = []
+    zip_path = DataFiles.wbm_initial_atoms.path
+
+    with ZipFile(zip_path, 'r') as zf:
+        # Sort files numerically
+        file_list = sorted(
+            zf.namelist(),
+            key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else float('inf')
+        )
+        for filename in file_list[:num_structures]:
+            with zf.open(filename) as f:
+                text_stream = TextIOWrapper(f, encoding='utf-8')
+                structures.append(read(text_stream, format='extxyz'))
+    return structures
+
 def main():
     if len(sys.argv) != 2:
-        print("Usage: python benchmark_runner.py <config_file>")
-        sys.exit(1)
+        sys.exit("Usage: python benchmark_runner.py <config_file>")
 
-    config_path = sys.argv[1]
-    with open(config_path) as f:
+    with open(sys.argv[1]) as f:
         config = json.load(f)
 
     logger.info("Starting benchmark runner...")
 
-    # Load structures
-    logger.info("Loading structures...")
     try:
-        from matbench_discovery.data import DataFiles
-        from zipfile import ZipFile
-        from ase.io import read
-        from io import TextIOWrapper
-
-        structures = []
-        zip_path = DataFiles.wbm_initial_atoms.path
-        num_structures = config["num_structures"]
-
-        with ZipFile(zip_path, 'r') as zf:
-            file_list = sorted(
-                zf.namelist(),
-                key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else float('inf')
-            )
-            for i, filename in enumerate(file_list[:num_structures]):
-                with zf.open(filename) as f:
-                    text_stream = TextIOWrapper(f, encoding='utf-8')
-                    atoms = read(text_stream, format='extxyz')
-                    structures.append(atoms)
-
+        structures = load_structures(config["num_structures"])
         logger.info(f"Loaded {len(structures)} structures")
-
     except Exception as e:
         logger.error(f"Failed to load structures: {e}")
         sys.exit(1)
 
-    # Determine parallelization strategy
-    import torch
+    # Shuffle for load balancing
+    import random
+    random.seed(42)
+    random.shuffle(structures)
+
+    # Resource detection
     num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
     use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
 
-    results = {
-        "energies": [],
-        "num_converged": 0,
-        "failed_indices": []
-    }
+    total_cores = os.cpu_count() or 1
+    num_workers = num_gpus if use_multi_gpu else 1
+    # Reserve cores for overhead if possible
+    available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores
+    threads_per_worker = max(1, available_cores // num_workers)
+
+    logger.info(f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)")
 
+    results = {"energies": [], "num_converged": 0, "failed_indices": []}
     start_time = time.time()
 
     if use_multi_gpu:
-        logger.info(f"Running on {num_gpus} GPUs in parallel")
-
-        # Split structures
+        logger.info(f"Parallel execution on {num_gpus} GPUs")
         batch_size = len(structures) // num_gpus
         futures = []
 
-        # Use 'spawn' start method for CUDA compatibility
-        import multiprocessing
         ctx = multiprocessing.get_context('spawn')
-
         with concurrent.futures.ProcessPoolExecutor(max_workers=num_gpus, mp_context=ctx) as executor:
             for i in range(num_gpus):
                 start_idx = i * batch_size
                 end_idx = len(structures) if i == num_gpus - 1 else (i + 1) * batch_size
-                batch_structures = structures[start_idx:end_idx]
 
                 model_config = {
                     "package": config["model_package"],
@@ -288,17 +276,10 @@ def main():
                     "gpu_id": i
                 }
 
-                futures.append(
-                    executor.submit(
-                        process_batch,
-                        i,
-                        batch_structures,
-                        start_idx,
-                        model_config
-                    )
-                )
-
-            # Collect results
+                futures.append(executor.submit(
+                    process_batch, i, structures[start_idx:end_idx], start_idx, model_config, threads_per_worker
+                ))
+
             for future in concurrent.futures.as_completed(futures):
                 try:
                     batch_res = future.result()
@@ -307,29 +288,24 @@ def main():
                     results["failed_indices"].extend(batch_res["failed_indices"])
                 except Exception as e:
                     logger.error(f"Worker failed: {e}")
-
     else:
-        logger.info("Running in single process")
+        logger.info("Single process execution")
         model_config = {
             "package": config["model_package"],
             "factory": config["model_factory"],
             "kwargs": config["model_kwargs"],
-            "checkpoint": config["model_checkpoint"],
-            # No gpu_id means let model decide or use default
+            "checkpoint": config["model_checkpoint"]
         }
-
-        batch_res = process_batch(0, structures, 0, model_config)
-        results = batch_res
+        results = process_batch(0, structures, 0, model_config, threads_per_worker)
 
     elapsed = time.time() - start_time
-    logger.info(f"Benchmark complete in {elapsed:.1f}s")
-    logger.info(f"Converged: {results['num_converged']}/{len(structures)}")
+    logger.info(f"Benchmark complete in {elapsed:.1f}s. Converged: {results['num_converged']}/{len(structures)}")
 
-    # Save results
     with open("results.json", "w") as f:
         json.dump(results, f, indent=2)
 
 if __name__ == "__main__":
+    import multiprocessing
     main()
 '''
 
@@ -339,14 +315,10 @@ def main():
         # ----------------------------------------------------------------------
         logger.info("Step 1/4: Setting up environment...")
 
-        uv_bin = (
-            subprocess.run(
-                ["python", "-c", "import uv; print(uv.find_uv_bin())"],
-                capture_output=True,
-            )
-            .stdout.decode("utf-8")
-            .strip()
-        )
+        # Find UV binary
+        uv_bin = subprocess.check_output(
+            [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True
+        ).strip()
 
         # Create UV virtual environment
         subprocess.run(
@@ -354,18 +326,16 @@ def main():
             cwd=work_dir,
             check=True,
             capture_output=True,
-            text=True,
         )
 
         venv_python = work_dir / ".venv/bin/python"
         if not venv_python.exists():
-            # Windows path
-            venv_python = work_dir / ".venv/Scripts/python.exe"
+            venv_python = work_dir / ".venv/Scripts/python.exe"  # Windows fallback
 
         if not venv_python.exists():
             raise RuntimeError(f"Virtual environment python not found at {venv_python}")
 
-        # Install matbench-discovery and model package
+        # Install dependencies
         logger.info("Installing dependencies...")
         subprocess.run(
             [
@@ -385,17 +355,15 @@ def main():
             check=True,
         )
 
-        # Set SSL cert file to certifi's CA bundle to fix HPC SSL verification issues
+        # Set SSL cert file for HPC
         env = dict(os.environ)
         env["MBD_AUTO_DOWNLOAD_FILES"] = "true"
 
         try:
-            certifi_path = subprocess.run(
+            certifi_path = subprocess.check_output(
                 [str(venv_python), "-c", "import certifi; print(certifi.where())"],
-                capture_output=True,
                 text=True,
-                check=True,
-            ).stdout.strip()
+            ).strip()
             env["SSL_CERT_FILE"] = certifi_path
         except Exception as e:
             logger.warning(f"Failed to set SSL_CERT_FILE: {e}")
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index f35b1306..c8f95e5d 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -278,6 +278,8 @@ def local(
             results_file_path.unlink(missing_ok=True)
 
     def calculate_metrics(self, outputs: dict) -> dict[str, Any]:
+        # TODO: implement the full metrics calculation,
+        # this is just a placeholder for now
         """Calculate benchmark metrics from raw outputs.
 
         For MVP, this returns basic statistics. Future versions will compare

From e2c2a44216a73bbb9bc8e53b164efae2df544eb9 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Tue, 2 Dec 2025 10:39:02 -0700
Subject: [PATCH 03/23] checkpoint/resume, more examples

---
 .../benchmarks/matbench_discovery/__init__.py |   90 +-
 .../benchmarks/matbench_discovery/enums.py    |   46 +-
 .../examples/matbench_equiformerv2.py         |  129 ++
 .../examples/matbench_mace_multi_gpu.py       |  112 +-
 .../examples/matbench_mattersim.py            |  108 ++
 .../examples/matbench_sevennet.py             |  111 ++
 .../examples/matbench_test.py                 |   31 -
 .../examples/matbench_test_local_mps.py       |   46 -
 .../examples/matbench_test_remote.py          |   59 -
 .../examples/run_random_10k_benchmark.py      |  223 ++++
 .../benchmarks/matbench_discovery/metrics.py  |  193 +++
 .../matbench_discovery/remote_runner.py       |  438 -------
 .../benchmarks/matbench_discovery/tasks.py    | 1126 +++++++++++++----
 13 files changed, 1806 insertions(+), 906 deletions(-)
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/metrics.py
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/remote_runner.py

diff --git a/garden_ai/benchmarks/matbench_discovery/__init__.py b/garden_ai/benchmarks/matbench_discovery/__init__.py
index 3f687a6c..3256522e 100644
--- a/garden_ai/benchmarks/matbench_discovery/__init__.py
+++ b/garden_ai/benchmarks/matbench_discovery/__init__.py
@@ -31,13 +31,34 @@
 from globus_compute_sdk import Executor
 from globus_compute_sdk.serialize import CombinedCode, ComputeSerializer
 
-from .enums import MatbenchTask
-from .tasks import IS2RETask
+from .enums import DatasetSize, MatbenchTask
+from .tasks import (
+    IP2ETask,
+    IS2ETask,
+    IS2RETask,
+    RP2RETask,
+    RS2RETask,
+    S2EFSMTask,
+    S2EFSTask,
+    S2EFTask,
+    S2ETask,
+    S2RETask,
+)
 
 __all__ = [
     "MatbenchDiscovery",
     "MatbenchTask",
+    "DatasetSize",
     "IS2RETask",
+    "RS2RETask",
+    "S2EFSTask",
+    "S2EFTask",
+    "S2EFSMTask",
+    "IS2ETask",
+    "S2ETask",
+    "S2RETask",
+    "RP2RETask",
+    "IP2ETask",
 ]
 
 
@@ -92,7 +113,12 @@ def __init__(
                           (can be overridden per task)
         """
         self.endpoint_id = endpoint_id
-        self.user_endpoint_config = user_endpoint_config
+        self.user_endpoint_config = user_endpoint_config or {}
+
+        # Ensure 'requirements' is present to avoid endpoint template errors
+        if "requirements" not in self.user_endpoint_config:
+            self.user_endpoint_config["requirements"] = ""
+
         self.repo_ref = repo_ref or self.REPO_REF
         self.model_package = model_package
 
@@ -139,11 +165,65 @@ def __enter__(self):
             (),
             {
                 "IS2RE": IS2RETask(
-                    adapter=self,  # Pass adapter instead of executor
+                    adapter=self,
                     repo_url=self.REPO_URL,
                     repo_ref=self.repo_ref,
                     model_package=self.model_package,
-                )
+                ),
+                "RS2RE": RS2RETask(
+                    adapter=self,
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                ),
+                "S2EFS": S2EFSTask(
+                    adapter=self,
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                ),
+                "S2EF": S2EFTask(
+                    adapter=self,
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                ),
+                "S2EFSM": S2EFSMTask(
+                    adapter=self,
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                ),
+                "IS2E": IS2ETask(
+                    adapter=self,
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                ),
+                "S2E": S2ETask(
+                    adapter=self,
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                ),
+                "S2RE": S2RETask(
+                    adapter=self,
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                ),
+                "RP2RE": RP2RETask(
+                    adapter=self,
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                ),
+                "IP2E": IP2ETask(
+                    adapter=self,
+                    repo_url=self.REPO_URL,
+                    repo_ref=self.repo_ref,
+                    model_package=self.model_package,
+                ),
             },
         )()
 
diff --git a/garden_ai/benchmarks/matbench_discovery/enums.py b/garden_ai/benchmarks/matbench_discovery/enums.py
index 8cc2f99b..5c34cb6b 100644
--- a/garden_ai/benchmarks/matbench_discovery/enums.py
+++ b/garden_ai/benchmarks/matbench_discovery/enums.py
@@ -12,4 +12,48 @@ class MatbenchTask(Enum):
     - S2EFS: Structure to Energy, Forces, and Stress
     """
 
-    IS2RE = "is2re"  # Initial Structure to Relaxed Energy
+    IS2RE = "IS2RE"  # Initial Structure to Relaxed Energy
+    RS2RE = "RS2RE"  # Relaxed Structure to Relaxed Energy
+    S2EFS = "S2EFS"  # Structure to Energy, Forces, Stress
+    S2EF = "S2EF"  # Structure to Energy, Force
+    S2EFSM = "S2EFSM"  # Structure to Energy, Force, Stress, Magmoms
+    IS2E = "IS2E"  # Initial Structure to Energy
+    S2E = "S2E"  # Structure to Energy
+    S2RE = "S2RE"  # Structure to Relaxed Energy
+    RP2RE = "RP2RE"  # Relaxed Prototype to Relaxed Energy
+    IP2E = "IP2E"  # Initial Prototype to Energy
+
+
+class DatasetSize(str, Enum):
+    """Predefined dataset sizes for Matbench Discovery benchmarks.
+
+    These correspond to different subsets of the WBM test set that are commonly
+    used for evaluating materials discovery models.
+    """
+
+    FULL = "full"
+    """Full WBM test set (~257k structures)"""
+
+    UNIQUE_PROTOS = "unique_protos"
+    """Unique prototypes subset (~215k structures) - removes duplicate prototypes"""
+
+    RANDOM_10K = "random_10k"
+    """Random 10k structures from the unique prototypes subset (fixed seed)"""
+
+    RANDOM_100 = "random_100"
+    """Random 100 structures for quick testing (fixed seed)"""
+
+    def seed(self, seed: int) -> "DatasetConfig":
+        """Return a configuration with a custom random seed."""
+        return DatasetConfig(self, seed)
+
+
+class DatasetConfig:
+    """Configuration for a dataset subset with a specific random seed."""
+
+    def __init__(self, subset: DatasetSize, seed: int):
+        self.subset = subset
+        self.seed = seed
+
+    def __repr__(self):
+        return f"{self.subset.name}(seed={self.seed})"
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
new file mode 100644
index 00000000..ec3afe91
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+"""
+Matbench Discovery Benchmark - EquiformerV2 Example
+
+EquiformerV2 is an improved equivariant transformer from FAIR-Chem (formerly OCP).
+Paper: https://arxiv.org/abs/2306.12059
+GitHub: https://github.com/Open-Catalyst-Project/ocp
+
+Note: This example uses the S2EFS task (Structure to Energy, Forces, Stress)
+instead of IS2RE because EquiformerV2 doesn't support geometry relaxation.
+"""
+
+from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+# Globus Compute endpoint
+ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+
+# HPC endpoint configuration
+ENDPOINT_CONFIG = {
+    "account": "cis250461-gpu",
+    "partition": "gpu-debug",
+    "qos": "gpu",
+    "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8",
+}
+
+
+# Model factory function for EquiformerV2
+def create_equiformerv2_model(device):
+    """Create EquiformerV2 model calculator.
+
+    Args:
+        device: Device to load model on ("cuda" or "cpu")
+
+    Returns:
+        ASE calculator for EquiformerV2
+    """
+    from fairchem.core.calculate.ase_calculator import Calculator
+
+    # Use pre-trained checkpoint - will auto-download from HuggingFace
+    return Calculator(
+        model_name="EquiformerV2-31M-S2EF-OC20-All+MD", cpu=(device == "cpu")
+    )
+
+
+# Benchmark parameters
+NUM_STRUCTURES = 1000
+USE_MULTI_GPU = True
+
+# =============================================================================
+# Run Benchmark
+# =============================================================================
+
+
+def main():
+    """Run Matbench Discovery S2EFS benchmark with EquiformerV2."""
+
+    print("=" * 80)
+    print("Matbench Discovery S2EFS Benchmark")
+    print("=" * 80)
+    print(f"Endpoint:   {ENDPOINT_ID}")
+    print("Model:      EquiformerV2-31M")
+    print("Task:       S2EFS (Structure to Energy, Forces, Stress)")
+    print(f"Structures: {NUM_STRUCTURES}")
+    print(f"Resources:  {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}")
+    print("=" * 80)
+    print()
+
+    with MatbenchDiscovery(
+        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
+    ) as bench:
+        # Run S2EFS task (uses relaxed structures, no geometry optimization)
+        # This is suitable for EquiformerV2 which doesn't support relaxation
+        print("Submitting S2EFS task...")
+        future = bench.tasks.S2EFS.submit(
+            model_factory=create_equiformerv2_model,
+            model_package="fairchem-core",
+            num_structures=NUM_STRUCTURES,
+            use_multi_gpu=USE_MULTI_GPU,
+        )
+
+        print("Waiting for results (this may take a while)...")
+        output = future.result()
+
+        # Display metrics
+        print()
+        print("=" * 80)
+        print("Benchmark Results")
+        print("=" * 80)
+
+        metrics = output.get("metrics", {})
+        if "error" in metrics:
+            print(f"Error: {metrics['error']}")
+        else:
+            # Energy metrics
+            if "energy_mae" in metrics:
+                print("Energy Metrics:")
+                print(f"  MAE (eV/atom):  {metrics.get('energy_mae', 'N/A'):.6f}")
+                print(f"  RMSE (eV/atom): {metrics.get('energy_rmse', 'N/A'):.6f}")
+                print(f"  R²:             {metrics.get('energy_r2', 'N/A'):.6f}")
+                print()
+
+            # Force metrics
+            if "force_mae" in metrics:
+                print("Force Metrics:")
+                print(f"  MAE (eV/Å):     {metrics.get('force_mae', 'N/A'):.6f}")
+                print(f"  RMSE (eV/Å):    {metrics.get('force_rmse', 'N/A'):.6f}")
+                print(f"  R²:             {metrics.get('force_r2', 'N/A'):.6f}")
+                print()
+
+            # Stress metrics
+            if "stress_mae" in metrics:
+                print("Stress Metrics:")
+                print(f"  MAE (GPa):      {metrics.get('stress_mae', 'N/A'):.6f}")
+                print(f"  RMSE (GPa):     {metrics.get('stress_rmse', 'N/A'):.6f}")
+                print(f"  R²:             {metrics.get('stress_r2', 'N/A'):.6f}")
+                print()
+
+            if "num_evaluated" in metrics:
+                print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
+
+        print("=" * 80)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
index 475ef079..9f971086 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
@@ -4,89 +4,77 @@
 using multi-GPU parallelization on a Globus Compute endpoint.
 """
 
-from garden_ai.benchmarks import MatbenchDiscovery
+from garden_ai.benchmarks.matbench_discovery import DatasetSize, MatbenchDiscovery
 
-# ------------------------------------------------------------------------------
-# Configuration
-# ------------------------------------------------------------------------------
-
-# Globus Compute Endpoint ID (Anvil HPC)
+# Globus Compute endpoint
 ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
-# Job Configuration
-NUM_GPUS = 2
+# HPC endpoint configuration
 ENDPOINT_CONFIG = {
     "account": "cis250461-gpu",
+    "partition": "gpu-debug",
     "qos": "gpu",
-    "partition": "gpu",
-    "scheduler_options": f"#SBATCH --gpus-per-node={NUM_GPUS}\n#SBATCH --time=00:30:00\n",
-    "cores_per_node": 32,
-    "mem_per_node": 32,  # GB
-    "worker_init": "pip install --user uv",  # Ensure uv is available
+    "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8",
 }
 
-# Model Configuration
-MODEL_PACKAGE = "mace-torch"
-MODEL_FACTORY = "mace_mp"
-MODEL_KWARGS = {
-    "model": "medium",
-    "device": "cuda",
-    "default_dtype": "float64",
-}
 
-# Benchmark Configuration
-NUM_STRUCTURES = 500
+# Model factory function for MACE
+def create_mace_model(device):
+    from mace.calculators import mace_mp
+
+    return mace_mp(model="medium", device=device, default_dtype="float64")
+
+
+NUM_STRUCTURES = DatasetSize.RANDOM_100
 
 
 def main():
-    print("=" * 80)
-    print("Matbench Discovery IS2RE Benchmark")
-    print("=" * 80)
-    print(f"Endpoint:   {ENDPOINT_ID}")
-    print(f"Model:      {MODEL_PACKAGE} / {MODEL_FACTORY}")
-    print(f"Structures: {NUM_STRUCTURES}")
-    print(f"Resources:  {NUM_GPUS} GPUs (Multi-GPU Enabled)")
-    print("=" * 80)
+    """Run Matbench Discovery IS2RE benchmark with MACE."""
 
     with MatbenchDiscovery(
-        endpoint_id=ENDPOINT_ID,
-        user_endpoint_config=ENDPOINT_CONFIG,
+        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
     ) as bench:
-        task = bench.tasks.IS2RE
-
-        print("\nSubmitting task to endpoint...")
-        future = task.submit(
-            model_package=MODEL_PACKAGE,
-            model_factory=MODEL_FACTORY,
-            model_kwargs=MODEL_KWARGS,
+        # Run IS2RE task (Initial Structure to Relaxed Energy)
+        future = bench.tasks.IS2RE.submit(
+            model_factory=create_mace_model,
+            model_packages="mace-torch",
             num_structures=NUM_STRUCTURES,
-            use_multi_gpu=True,
         )
 
         print("Job submitted! Waiting for results (this may take a while)...")
 
         try:
-            result = future.result()
-            metrics = task.calculate_metrics(result)
-
-            print("\n" + "=" * 80)
-            print("Benchmark Results")
-            print("=" * 80)
-
-            # Print primary metrics
-            for key, value in metrics.items():
-                print(f"{key:<20}: {value}")
-
-            print("-" * 80)
-            print(f"Converged: {result['num_converged']} / {NUM_STRUCTURES}")
-            print(f"Failed:    {len(result.get('failed_indices', []))}")
-
-            if result.get("energies"):
-                valid_energies = [e for e in result["energies"] if e is not None]
-                if valid_energies:
-                    print(f"Sample energies:   {valid_energies[:3]} ...")
-
-            print("=" * 80)
+            output = future.result()
+            metrics = output.get("metrics", {})
+
+            if "error" in metrics:
+                print(f"error               : {metrics['error']}")
+            else:
+                # Discovery metrics (stability classification)
+                if "F1" in metrics:
+                    print(f"F1                  : {metrics['F1']:.6f}")
+                    print(f"DAF                 : {metrics['DAF']:.2f}x")
+                    print(f"Precision           : {metrics['Precision']:.6f}")
+                    print(f"Recall              : {metrics['Recall']:.6f}")
+                    print(f"Accuracy            : {metrics['Accuracy']:.6f}")
+
+                # Regression metrics
+                if "MAE" in metrics:
+                    print(f"MAE (eV/atom)       : {metrics['MAE']:.6f}")
+                    print(f"RMSE (eV/atom)      : {metrics['RMSE']:.6f}")
+                    print(f"R2                  : {metrics['R2']:.6f}")
+
+                # Force metrics (if S2EFS task)
+                if "force_mae" in metrics:
+                    print(f"force_mae           : {metrics['force_mae']:.6f}")
+                    print(f"force_rmse          : {metrics['force_rmse']:.6f}")
+                    print(f"force_r2            : {metrics['force_r2']:.6f}")
+                    print(f"stress_mae          : {metrics['stress_mae']:.6f}")
+                    print(f"stress_rmse         : {metrics['stress_rmse']:.6f}")
+                    print(f"stress_r2           : {metrics['stress_r2']:.6f}")
+
+                if "num_evaluated" in metrics:
+                    print(f"num_evaluated       : {metrics['num_evaluated']}")
 
         except Exception as e:
             print(f"\n[ERROR] Benchmark failed: {e}")
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
new file mode 100644
index 00000000..fcf77a1c
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+Matbench Discovery Benchmark - MatterSim Example
+
+MatterSim is a deep learning atomistic model for general material simulations.
+Paper: https://arxiv.org/abs/2405.04967
+GitHub: https://github.com/microsoft/mattersim
+"""
+
+from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+# Globus Compute endpoint
+ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+
+# HPC endpoint configuration
+ENDPOINT_CONFIG = {
+    "account": "cis250461-gpu",
+    "partition": "gpu-debug",
+    "qos": "gpu",
+    "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8",
+}
+
+
+# Model factory function for MatterSim
+def create_mattersim_model(device):
+    """Create MatterSim model calculator.
+
+    Args:
+        device: Device to load model on ("cuda" or "cpu")
+
+    Returns:
+        ASE calculator for MatterSim
+    """
+    from mattersim.forcefield import MatterSimCalculator
+
+    return MatterSimCalculator(device=device)
+
+
+# Benchmark parameters
+NUM_STRUCTURES = 1000
+USE_MULTI_GPU = True
+
+# =============================================================================
+# Run Benchmark
+# =============================================================================
+
+
+def main():
+    """Run Matbench Discovery IS2RE benchmark with MatterSim."""
+
+    print("=" * 80)
+    print("Matbench Discovery IS2RE Benchmark")
+    print("=" * 80)
+    print(f"Endpoint:   {ENDPOINT_ID}")
+    print("Model:      MatterSim")
+    print(f"Structures: {NUM_STRUCTURES}")
+    print(f"Resources:  {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}")
+    print("=" * 80)
+    print()
+
+    with MatbenchDiscovery(
+        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
+    ) as bench:
+        # Run IS2RE task
+        print("Submitting IS2RE task...")
+        future = bench.tasks.IS2RE.submit(
+            model_factory=create_mattersim_model,
+            model_package="mattersim",
+            num_structures=NUM_STRUCTURES,
+            use_multi_gpu=USE_MULTI_GPU,
+        )
+
+        print("Waiting for results (this may take a while)...")
+        output = future.result()
+
+        # Display metrics
+        print()
+        print("=" * 80)
+        print("Benchmark Results")
+        print("=" * 80)
+
+        metrics = output.get("metrics", {})
+        if "error" in metrics:
+            print(f"Error: {metrics['error']}")
+        else:
+            # Discovery metrics
+            print(f"F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
+            print(f"DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
+            print(f"Precision:      {metrics.get('Precision', 'N/A'):.6f}")
+            print(f"Recall:         {metrics.get('Recall', 'N/A'):.6f}")
+            print(f"Accuracy:       {metrics.get('Accuracy', 'N/A'):.6f}")
+            print()
+            # Regression metrics
+            print(f"MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
+            print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
+            print(f"R²:             {metrics.get('R2', 'N/A'):.6f}")
+            print()
+            print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
+
+        print("=" * 80)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
new file mode 100644
index 00000000..d028b740
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""
+Matbench Discovery Benchmark - SevenNet Example
+
+This script demonstrates running the Matbench Discovery IS2RE benchmark
+using SevenNet as the MLIP model on a remote Globus Compute endpoint.
+
+SevenNet is a graph neural network potential with good transferability.
+"""
+
+from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+# Globus Compute endpoint (replace with your endpoint UUID)
+ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+
+# HPC endpoint configuration (adjust for your cluster)
+ENDPOINT_CONFIG = {
+    "account": "cis250461-gpu",
+    "partition": "gpu-debug",
+    "qos": "gpu",
+    "scheduler_options": "#SBATCH --gpus-per-node=2\n",
+    "cores_per_node": 16,
+    "mem_per_node": 32,  # GB
+}
+
+
+# Model factory function for SevenNet
+def create_sevennet_model(device):
+    """Create SevenNet model calculator.
+
+    Args:
+        device: Device to load model on ("cuda" or "cpu")
+
+    Returns:
+        ASE calculator for SevenNet
+    """
+    from sevenn.calculator import SevenNetCalculator
+
+    return SevenNetCalculator(model="7net-0", device=device)
+
+
+# Benchmark parameters
+NUM_STRUCTURES = 1000  # Number of structures to evaluate
+USE_MULTI_GPU = True  # Enable multi-GPU parallelization
+
+# =============================================================================
+# Run Benchmark
+# =============================================================================
+
+
+def main():
+    """Run Matbench Discovery IS2RE benchmark with SevenNet."""
+
+    print("=" * 80)
+    print("Matbench Discovery IS2RE Benchmark")
+    print("=" * 80)
+    print(f"Endpoint:   {ENDPOINT_ID}")
+    print("Model:      SevenNet (7net-0)")
+    print(f"Structures: {NUM_STRUCTURES}")
+    print(f"Resources:  {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}")
+    print("=" * 80)
+    print()
+
+    with MatbenchDiscovery(
+        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
+    ) as bench:
+        # Run IS2RE task (Initial Structure to Relaxed Energy)
+        print("Submitting IS2RE task...")
+        future = bench.tasks.IS2RE.submit(
+            model_factory=create_sevennet_model,
+            model_package="sevenn",
+            num_structures=NUM_STRUCTURES,
+            use_multi_gpu=USE_MULTI_GPU,
+        )
+
+        print("Waiting for results (this may take a while)...")
+        output = future.result()
+
+        # Display metrics
+        print()
+        print("=" * 80)
+        print("Benchmark Results")
+        print("=" * 80)
+
+        metrics = output.get("metrics", {})
+        if "error" in metrics:
+            print(f"Error: {metrics['error']}")
+        else:
+            # Discovery metrics
+            print(f"F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
+            print(f"DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
+            print(f"Precision:      {metrics.get('Precision', 'N/A'):.6f}")
+            print(f"Recall:         {metrics.get('Recall', 'N/A'):.6f}")
+            print(f"Accuracy:       {metrics.get('Accuracy', 'N/A'):.6f}")
+            print()
+            # Regression metrics
+            print(f"MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
+            print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
+            print(f"R²:             {metrics.get('R2', 'N/A'):.6f}")
+            print()
+            print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
+
+        print("=" * 80)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py
deleted file mode 100644
index 3b2912f8..00000000
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""Test Matbench Discovery benchmark locally."""
-
-from garden_ai.benchmarks import MatbenchDiscovery
-
-print("Matbench Discovery IS2RE Benchmark")
-print("=" * 80)
-
-with MatbenchDiscovery() as bench:
-    task = bench.tasks.IS2RE
-
-    # Run benchmark locally
-    result = task.local(
-        model_package="mace-torch",
-        model_factory="mace_mp",
-        model_kwargs={
-            "model": "medium",
-            "device": "cpu",
-            "default_dtype": "float32",
-        },
-        num_structures=10,
-    )
-
-    # Calculate metrics
-    metrics = task.calculate_metrics(result)
-
-    # Display results
-    print("\nResults:")
-    print("=" * 80)
-    for key, value in metrics.items():
-        print(f"  {key}: {value}")
-    print("=" * 80)
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py
deleted file mode 100644
index 9f9bfd8c..00000000
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py
+++ /dev/null
@@ -1,46 +0,0 @@
-"""Test Matbench Discovery benchmark locally on Mac.
-
-This script tests the benchmark implementation locally. Note that MPS (Apple Silicon
-GPU) is not compatible with MACE model checkpoints which use float64, so this runs
-on CPU. This is still useful for verifying the workflow works before using Anvil.
-"""
-
-from garden_ai.benchmarks import MatbenchDiscovery
-
-print("=" * 80)
-print("Matbench Discovery Local Test")
-print("=" * 80)
-
-# Run benchmark locally with MPS acceleration
-with MatbenchDiscovery() as bench:
-    task = bench.tasks.IS2RE
-
-    print("\nRunning local benchmark...")
-    print("Note: Using CPU because MACE model checkpoints use float64,")
-    print("which is not supported by MPS. This is still useful for testing")
-    print("the workflow before running on Anvil with CUDA.\n")
-
-    result = task.local(
-        model_package="mace-torch",
-        model_factory="mace_mp",
-        model_kwargs={
-            "model": "medium",
-            "device": "cpu",  # MPS doesn't support float64 used by MACE checkpoints
-            "default_dtype": "float32",
-        },
-        num_structures=10,  # Small test to verify workflow
-        use_multi_gpu=False,
-    )
-
-    # Calculate metrics
-    metrics = task.calculate_metrics(result)
-
-    # Display results
-    print("\nResults:")
-    print("=" * 80)
-    for key, value in metrics.items():
-        print(f"  {key}: {value}")
-    print("=" * 80)
-
-    print("\nLocal test complete!")
-    print("If this works, you can proceed with confidence to run on Anvil.")
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py
deleted file mode 100644
index 08be5dca..00000000
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py
+++ /dev/null
@@ -1,59 +0,0 @@
-"""Test Matbench Discovery benchmark on remote a HPC endpoint."""
-
-from garden_ai.benchmarks import MatbenchDiscovery
-
-ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
-
-ENDPOINT_CONFIG = {
-    "account": "cis250461-gpu",  # HPC allocation/account
-    "qos": "gpu",
-    "partition": "gpu-debug",  # SLURM partition
-    "scheduler_options": "#SBATCH --gpus-per-node=4",  # Request 4 GPUs
-    "worker_init": "pip install --user uv",  # Install uv on worker startup
-}
-
-MODEL_PACKAGE = "mace-torch"
-MODEL_FACTORY = "mace_mp"
-MODEL_KWARGS = {
-    "model": "medium",
-    "device": "cuda",  # Use GPU on HPC
-    "default_dtype": "float32",
-}
-
-NUM_STRUCTURES = 100  # Increased from 10 to test multi-GPU parallelization
-
-with MatbenchDiscovery(
-    endpoint_id=ENDPOINT_ID,
-    user_endpoint_config=ENDPOINT_CONFIG,
-) as bench:
-    task = bench.tasks.IS2RE
-
-    future = task.submit(
-        model_package=MODEL_PACKAGE,
-        model_factory=MODEL_FACTORY,
-        model_kwargs=MODEL_KWARGS,
-        num_structures=NUM_STRUCTURES,
-        use_multi_gpu=True,  # Enable multi-GPU parallelization
-    )
-
-    try:
-        result = future.result()
-        metrics = task.calculate_metrics(result)
-
-        print("\nResults:")
-        print("=" * 80)
-        for key, value in metrics.items():
-            print(f"  {key}: {value}")
-
-        print("=" * 80)
-        print("\nRaw Results:")
-        print(f"  Converged: {result['num_converged']}")
-        print(f"  Failed: {len(result.get('failed_indices', []))}")
-        if result.get("energies"):
-            valid_energies = [e for e in result["energies"] if e is not None]
-            if valid_energies:
-                print(f"  Sample energies: {valid_energies[:3]}")
-
-    except Exception as e:
-        print(f"\n[ERROR] Benchmark failed: {e}")
-        raise
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
new file mode 100644
index 00000000..3bdfd6ca
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+"""
+Run Matbench Discovery benchmarks on 10k most stable structures.
+
+This script benchmarks MACE, MatterSim, and SevenNet on the 10k most stable
+materials from the unique prototypes subset and saves comprehensive metrics to JSON.
+"""
+
+import json
+from datetime import datetime
+from pathlib import Path
+
+from garden_ai.benchmarks.matbench_discovery import DatasetSize, MatbenchDiscovery
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+# Globus Compute endpoint
+ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+
+# HPC endpoint configuration
+ENDPOINT_CONFIG = {
+    "account": "cis250461-gpu",
+    "partition": "gpu",
+    "qos": "gpu",
+    "scheduler_options": "#SBATCH --gpus-per-node=4\n",
+    "cores_per_node": 8,
+    "mem_per_node": 32,
+}
+
+# Output file for metrics
+OUTPUT_FILE = "stable_10k_benchmark_results.json"
+
+# =============================================================================
+# Model Factory Functions
+# =============================================================================
+
+
+def create_mace_model(device):
+    """Create MACE model calculator."""
+    from mace.calculators import mace_mp
+
+    return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64")
+
+
+def create_mattersim_model(device):
+    """Create MatterSim model calculator."""
+    from mattersim.forcefield import MatterSimCalculator
+
+    return MatterSimCalculator(device=device)
+
+
+def create_sevennet_model(device):
+    """Create SevenNet model calculator."""
+    from sevenn.calculator import SevenNetCalculator
+
+    return SevenNetCalculator(model="7net-0", device=device)
+
+
+# Model configurations
+MODELS = {
+    "MACE": {
+        "package": "mace-torch",
+        "factory": create_mace_model,
+    },
+    "MatterSim": {
+        "package": "mattersim",
+        "factory": create_mattersim_model,
+    },
+    "SevenNet": {
+        "package": "sevenn",
+        "factory": create_sevennet_model,
+    },
+}
+
+# =============================================================================
+# Run Benchmarks
+# =============================================================================
+
+
+def main():
+    """Run benchmarks on all models and save results."""
+
+    print("=" * 80)
+    print("Matbench Discovery Benchmark - Stable 10k")
+    print("=" * 80)
+    print("Dataset: 10k Most Stable Structures")
+    print(f"Models: {', '.join(MODELS.keys())}")
+    print(f"Endpoint: {ENDPOINT_ID}")
+    print("=" * 80)
+    print()
+
+    results = {
+        "metadata": {
+            "timestamp": datetime.now().isoformat(),
+            "dataset": "stable_10k",
+            "dataset_size": 10000,
+            "endpoint_id": ENDPOINT_ID,
+        },
+        "models": {},
+    }
+
+    with MatbenchDiscovery(
+        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
+    ) as bench:
+        for model_name, config in MODELS.items():
+            print(f"\n{'=' * 80}")
+            print(f"Running {model_name}...")
+            print(f"{'=' * 80}\n")
+
+            try:
+                # Submit job
+                future = bench.tasks.IS2RE.submit(
+                    model_factory=config["factory"],
+                    model_packages=[
+                        config["package"],
+                        "cuequivariance",
+                        "cuequivariance-torch",
+                        "cuequivariance-ops-torch-cu12",
+                    ],
+                    num_structures=DatasetSize.RANDOM_10K,
+                )
+
+                print(f"Job submitted for {model_name}. Waiting for results...")
+
+                try:
+                    output = future.result()
+                except Exception as e:
+                    print(f"⚠️ {model_name} failed first attempt: {e}")
+                    print(f"   Resuming from checkpoint: {future.checkpoint_path}")
+
+                    # Extract checkpoint name from path
+                    checkpoint_name = Path(future.checkpoint_path).name
+
+                    # Resubmit with same checkpoint name to resume
+                    retry_future = bench.tasks.IS2RE.submit(
+                        model_factory=config["factory"],
+                        model_packages=[
+                            config["package"],
+                            "cuequivariance",
+                            "cuequivariance-torch",
+                            "cuequivariance-ops-torch-cu12",
+                        ],
+                        num_structures=DatasetSize.RANDOM_10K,
+                        checkpoint_name=checkpoint_name,
+                    )
+
+                    try:
+                        print("   Retry job submitted. Waiting for results...")
+                        output = retry_future.result()
+                        print("   ✅ Retry successful!")
+                    except Exception as retry_e:
+                        print(f"❌ {model_name} failed retry: {retry_e}")
+                        results["models"][model_name] = {
+                            "status": "error",
+                            "error": str(retry_e),
+                        }
+                        continue  # Skip to next model
+
+                # Store complete output (contains both metrics and per-structure results)
+                results["models"][model_name] = {
+                    "status": "success",
+                    **output,  # Unpack entire output dict (metrics + results)
+                }
+
+                # Display metrics
+                metrics = output.get("metrics", {})
+                if "error" in metrics:
+                    print(f"❌ {model_name} failed: {metrics['error']}")
+                    results["models"][model_name]["status"] = "failed"
+                    results["models"][model_name]["error"] = metrics["error"]
+                else:
+                    print(f"✅ {model_name} completed successfully!")
+                    print(f"   F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
+                    print(f"   DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
+                    print(f"   MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
+                    print(f"   RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
+                    print(f"   Structures:     {metrics.get('num_evaluated', 'N/A')}")
+
+            except Exception as e:
+                print(f"❌ {model_name} error: {e}")
+                results["models"][model_name] = {
+                    "status": "error",
+                    "error": str(e),
+                }
+
+    # Save results to JSON
+    output_path = Path(OUTPUT_FILE)
+    with open(output_path, "w") as f:
+        json.dump(results, f, indent=2)
+
+    print(f"\n{'=' * 80}")
+    print("Benchmark Complete!")
+    print(f"{'=' * 80}")
+    print(f"\nResults saved to: {output_path.absolute()}")
+
+    # Print summary table
+    print(f"\n{'=' * 80}")
+    print("Summary")
+    print(f"{'=' * 80}\n")
+    print(f"{'Model':<15} {'Status':<10} {'F1':<10} {'DAF':<10} {'MAE':<10}")
+    print("-" * 80)
+
+    for model_name, data in results["models"].items():
+        if data["status"] == "success":
+            metrics = data["metrics"]
+            print(
+                f"{model_name:<15} {data['status']:<10} "
+                f"{metrics.get('F1', 0):<10.6f} "
+                f"{metrics.get('DAF', 0):<10.2f} "
+                f"{metrics.get('MAE', 0):<10.6f}"
+            )
+        else:
+            print(
+                f"{model_name:<15} {data['status']:<10} {'N/A':<10} {'N/A':<10} {'N/A':<10}"
+            )
+
+    print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/metrics.py b/garden_ai/benchmarks/matbench_discovery/metrics.py
new file mode 100644
index 00000000..c08bad2d
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/metrics.py
@@ -0,0 +1,193 @@
+"""Functions to classify energy above convex hull predictions as true/false
+positive/negative and compute performance metrics.
+
+Adapted from matbench-discovery to avoid import issues.
+Original source: https://github.com/janosh/matbench-discovery/blob/main/matbench_discovery/metrics/discovery.py
+"""
+
+from collections.abc import Sequence
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import r2_score
+
+# Default stability threshold from matbench-discovery
+# STABILITY_THRESHOLD = 0.0
+
+
+def classify_stable(
+    each_true: Sequence[float] | pd.Series | np.ndarray,
+    each_pred: Sequence[float] | pd.Series | np.ndarray,
+    *,
+    stability_threshold: float = 0.0,
+    fillna: bool = True,
+) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
+    """Classify model stability predictions as true/false positive/negatives (usually
+    w.r.t DFT-ground truth labels). All energies are assumed to be in eV/atom
+    (but shouldn't really matter as long as they're consistent).
+
+    Args:
+        each_true (Sequence[float] | pd.Series): Ground truth energy above convex hull
+            values.
+        each_pred (Sequence[float] | pd.Series): Model-predicted energy above convex
+            hull values.
+        stability_threshold (float, optional): Maximum energy above convex hull
+            for a material to still be considered stable. Usually 0, 0.05 or 0.1.
+            Defaults to 0.0, meaning a material has to be directly on
+            the hull to be called stable. Negative values mean a material has to pull
+            the known hull down by that amount to count as stable. Few materials lie
+            below the known hull, so only negative values very close to 0 make sense.
+        fillna (bool): Whether to fill NaNs as the model predicting unstable. Defaults
+            to True.
+
+    Returns:
+        tuple[TP, FN, FP, TN]: Indices as pd.Series for true positives,
+            false negatives, false positives and true negatives (in this order).
+
+    Raises:
+        ValueError: If sum of positive + negative preds doesn't add up to the total.
+    """
+    if len(each_true) != len(each_pred):
+        raise ValueError(f"{len(each_true)=} != {len(each_pred)=}")
+
+    each_true_arr, each_pred_arr = pd.Series(each_true), pd.Series(each_pred)
+
+    if stability_threshold is None or np.isnan(stability_threshold):
+        raise ValueError("stability_threshold must be a real number")
+    actual_pos = each_true_arr <= (stability_threshold or 0)
+    actual_neg = each_true_arr > (stability_threshold or 0)
+
+    model_pos = each_pred_arr <= (stability_threshold or 0)
+    model_neg = each_pred_arr > (stability_threshold or 0)
+
+    if fillna:
+        nan_mask = np.isnan(each_pred)
+        # for in both the model's stable and unstable preds, fill NaNs as unstable
+        model_pos[nan_mask] = False
+        model_neg[nan_mask] = True
+
+        n_pos, n_neg, total = model_pos.sum(), model_neg.sum(), len(each_pred)
+        if n_pos + n_neg != total:
+            raise ValueError(
+                f"after filling NaNs, the sum of positive ({n_pos}) and negative "
+                f"({n_neg}) predictions should add up to {total=}"
+            )
+
+    true_pos = actual_pos & model_pos
+    false_neg = actual_pos & model_neg
+    false_pos = actual_neg & model_pos
+    true_neg = actual_neg & model_neg
+
+    return true_pos, false_neg, false_pos, true_neg
+
+
+def stable_metrics(
+    each_true: Sequence[float] | pd.Series | np.ndarray,
+    each_pred: Sequence[float] | pd.Series | np.ndarray,
+    *,
+    stability_threshold: float = 0.0,
+    fillna: bool = True,
+    prevalence: float | None = None,
+) -> dict[str, float]:
+    """Get a dictionary of stability prediction metrics. Mostly binary classification
+    metrics, but also MAE, RMSE and R2.
+
+    Args:
+        each_true (Sequence[float] | pd.Series): true energy above convex hull
+        each_pred (Sequence[float] | pd.Series): predicted energy above convex hull
+        stability_threshold (float): Where to place stability threshold relative to
+            convex hull in eV/atom, usually 0 or 0.1 eV. Default = 0.0.
+        fillna (bool): Whether to fill NaNs as the model predicting unstable. Defaults
+            to True.
+        prevalence (float, optional): Prevalence of stable materials in the dataset.
+            If None, calculated from the input data. Defaults to None.
+
+    Note: Should give equivalent classification metrics to
+        sklearn.metrics.classification_report(
+            each_true > stability_threshold,
+            each_pred > stability_threshold,
+            output_dict=True,
+        )
+        when using the same stability_threshold.
+
+    Returns:
+        dict[str, float]: dictionary of classification metrics with keys DAF, Precision,
+            Recall, Accuracy, F1, TPR, FPR, TNR, FNR, MAE, RMSE, R2.
+
+    Raises:
+        ValueError: If FPR + TNR don't add up to 1.
+        ValueError: If TPR + FNR don't add up to 1.
+    """
+    n_true_pos, n_false_neg, n_false_pos, n_true_neg = map(
+        sum,
+        classify_stable(
+            each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna
+        ),
+    )
+
+    n_total_pos = n_true_pos + n_false_neg
+    n_total_neg = n_true_neg + n_false_pos
+    # prevalence: dummy discovery rate of stable crystals by selecting randomly from
+    # all materials
+    if prevalence is None:
+        prevalence = (
+            n_total_pos / (n_total_pos + n_total_neg)
+            if (n_total_pos + n_total_neg) > 0
+            else float("nan")
+        )
+    # Calculate ratios with guards against division by zero
+    precision = (
+        n_true_pos / (n_true_pos + n_false_pos)
+        if (n_true_pos + n_false_pos) > 0
+        else float("nan")
+    )
+    recall = n_true_pos / n_total_pos if n_total_pos > 0 else float("nan")
+
+    TPR = recall
+    FPR = n_false_pos / n_total_neg if n_total_neg > 0 else float("nan")
+    TNR = n_true_neg / n_total_neg if n_total_neg > 0 else float("nan")
+    FNR = n_false_neg / n_total_pos if n_total_pos > 0 else float("nan")
+
+    # sanity check: false positives + true negatives = all negatives
+    if FPR > 0 and TNR > 0 and FPR + TNR != 1:
+        # Floating point tolerance
+        if abs(FPR + TNR - 1) > 1e-6:
+            raise ValueError(f"{FPR=} {TNR=} don't add up to 1")
+
+    # sanity check: true positives + false negatives = all positives
+    if TPR > 0 and FNR > 0 and TPR + FNR != 1:
+        # Floating point tolerance
+        if abs(TPR + FNR - 1) > 1e-6:
+            raise ValueError(f"{TPR=} {FNR=} don't add up to 1")
+
+    # Drop NaNs to calculate regression metrics
+    is_nan = np.isnan(each_true) | np.isnan(each_pred)
+    each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan]
+
+    if precision + recall == 0:  # Calculate F1 score, handling division by zero
+        f1_score = float("nan")
+    else:
+        f1_score = 2 * (precision * recall) / (precision + recall)
+
+    return dict(
+        F1=f1_score,
+        DAF=precision / prevalence if prevalence > 0 else float("nan"),
+        Precision=precision,
+        Recall=recall,
+        Accuracy=(
+            (n_true_pos + n_true_neg) / (n_total_pos + n_total_neg)
+            if (n_total_pos + n_total_neg > 0)
+            else float("nan")
+        ),
+        TPR=TPR,
+        FPR=FPR,
+        TNR=TNR,
+        FNR=FNR,
+        TP=n_true_pos,
+        FP=n_false_pos,
+        TN=n_true_neg,
+        FN=n_false_neg,
+        MAE=np.abs(each_true - each_pred).mean(),
+        RMSE=((each_true - each_pred) ** 2).mean() ** 0.5,
+        R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"),
+    )
diff --git a/garden_ai/benchmarks/matbench_discovery/remote_runner.py b/garden_ai/benchmarks/matbench_discovery/remote_runner.py
deleted file mode 100644
index 432d43cc..00000000
--- a/garden_ai/benchmarks/matbench_discovery/remote_runner.py
+++ /dev/null
@@ -1,438 +0,0 @@
-"""Remote execution functions for Matbench Discovery benchmarks.
-
-These functions are serialized and executed on Globus Compute endpoints.
-They handle environment setup, dependency installation, and benchmark execution.
-"""
-
-
-def run_matbench_is2re(
-    repo_url: str,
-    repo_ref: str,
-    model_package: str,
-    model_factory: str,
-    model_kwargs: dict,
-    model_checkpoint: str | None,
-    num_structures: int,
-    use_multi_gpu: bool = True,
-) -> dict:
-    """Run Matbench IS2RE benchmark on remote Globus Compute endpoint.
-
-    This function performs the complete benchmark workflow:
-    1. Set up Python environment with UV
-    2. Install dependencies (matbench-discovery + model package)
-    3. Execute benchmark runner script in the environment
-    4. Return results
-
-    Args:
-        repo_url: GitHub URL for matbench-discovery repo
-        repo_ref: Git branch/tag/commit to checkout
-        model_package: Python package name to install (e.g., "mace-torch")
-        model_factory: Function or class name to create model (e.g., "mace_mp", "MACE")
-        model_kwargs: Dictionary of kwargs to pass when creating model
-        model_checkpoint: Path/URL to model checkpoint file (optional)
-        num_structures: Number of test structures to run (subset for MVP)
-        use_multi_gpu: If True, automatically detect and use all available GPUs
-                      in parallel. If False, use single GPU/CPU. (default: True)
-
-    Returns:
-        Dictionary with benchmark results:
-            - energies: List of final energies (None for failed relaxations)
-            - num_converged: Count of successful relaxations
-            - failed_indices: List of structure indices that failed
-
-    Raises:
-        RuntimeError: If benchmark execution fails
-    """
-    # All imports must be inside the function for CombinedCode serialization
-    import json
-    import logging
-    import os
-    import subprocess
-    import sys
-    import tempfile
-    from pathlib import Path
-
-    # Configure logging
-    logging.basicConfig(
-        level=logging.INFO,
-        stream=sys.stdout,
-        force=True,
-        format="%(asctime)s [%(levelname)s] %(message)s",
-    )
-    if hasattr(sys.stdout, "reconfigure"):
-        sys.stdout.reconfigure(line_buffering=True)
-
-    logger = logging.getLogger(__name__)
-
-    # Create isolated working directory
-    work_dir = Path(tempfile.mkdtemp(prefix="matbench_benchmark_"))
-
-    # This script runs INSIDE the virtual environment
-    BENCHMARK_RUNNER_SCRIPT = '''
-import json
-import sys
-import time
-import logging
-import os
-import concurrent.futures
-import importlib
-from pathlib import Path
-from typing import List, Dict, Any, Optional
-from zipfile import ZipFile
-from io import TextIOWrapper
-
-import torch
-from ase.io import read
-from ase.optimize import FIRE
-from matbench_discovery.data import DataFiles
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s [%(levelname)s] [PID:%(process)d] %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S',
-    stream=sys.stdout,
-    force=True
-)
-logger = logging.getLogger("benchmark_runner")
-
-def setup_device(gpu_id: Optional[int] = None) -> str:
-    """Setup compute device for this process."""
-    if torch.cuda.is_available():
-        return f"cuda:{gpu_id}" if gpu_id is not None else "cuda"
-    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-        return "mps"
-    return "cpu"
-
-def load_model(config: Dict[str, Any], device: str):
-    """Initialize the model from configuration."""
-    package_name = config["package"]
-    factory_name = config["factory"]
-    kwargs = config["kwargs"].copy()
-    checkpoint = config.get("checkpoint")
-
-    if "device" in kwargs:
-        kwargs["device"] = device
-
-    # Import factory function
-    module_parts = package_name.split(".")
-    try:
-        if len(module_parts) > 1:
-            module = importlib.import_module(package_name)
-        else:
-            # Try common patterns for model packages
-            base_module = module_parts[0].split("-")[0]
-            try:
-                module = importlib.import_module(f"{base_module}.calculators")
-            except ImportError:
-                module = importlib.import_module(base_module)
-
-        factory = getattr(module, factory_name)
-    except (ImportError, AttributeError) as e:
-        raise ImportError(f"Could not load model factory {factory_name} from {package_name}: {e}")
-
-    # Create model
-    model = factory(**kwargs)
-
-    # Load checkpoint if provided
-    if checkpoint and checkpoint != "None":
-        if hasattr(model, "load_checkpoint"):
-            model.load_checkpoint(checkpoint)
-        elif hasattr(model, "load_state_dict"):
-            model.load_state_dict(torch.load(checkpoint))
-
-    return model
-
-def process_batch(
-    batch_id: int,
-    structures: List[Any],
-    start_idx: int,
-    model_config: Dict[str, Any],
-    num_threads: int
-) -> Dict[str, Any]:
-    """Process a batch of structures on a specific device."""
-
-    # Configure thread limits to avoid contention
-    os.environ["OMP_NUM_THREADS"] = str(num_threads)
-    torch.set_num_threads(num_threads)
-
-    gpu_id = model_config.get("gpu_id")
-    device = setup_device(gpu_id)
-
-    worker_logger = logging.getLogger(f"worker_{batch_id}")
-    worker_logger.info(f"Started on {device} with {len(structures)} structures. Threads: {num_threads}")
-
-    try:
-        model = load_model(model_config, device)
-    except Exception as e:
-        worker_logger.error(f"Failed to initialize model: {e}")
-        return {
-            "energies": [None] * len(structures),
-            "num_converged": 0,
-            "failed_indices": [start_idx + i for i in range(len(structures))],
-            "error": str(e)
-        }
-
-    energies = []
-    failed_indices = []
-    num_converged = 0
-    batch_start = time.time()
-
-    for i, atoms in enumerate(structures):
-        global_idx = start_idx + i
-        try:
-            atoms.calc = model
-            opt = FIRE(atoms, logfile=None)
-            opt.run(fmax=0.05, steps=500)
-
-            energies.append(atoms.get_potential_energy())
-            num_converged += 1
-
-            if (i + 1) % 10 == 0:
-                elapsed = time.time() - batch_start
-                rate = (i + 1) / elapsed if elapsed > 0 else 0
-                worker_logger.info(f"Progress: {i+1}/{len(structures)} ({rate:.2f} struct/s)")
-
-        except Exception as e:
-            worker_logger.warning(f"Structure {global_idx} failed: {e}")
-            energies.append(None)
-            failed_indices.append(global_idx)
-
-    return {
-        "energies": energies,
-        "num_converged": num_converged,
-        "failed_indices": failed_indices
-    }
-
-def load_structures(num_structures: int) -> List[Any]:
-    """Load structures from the Matbench Discovery dataset."""
-    structures = []
-    zip_path = DataFiles.wbm_initial_atoms.path
-
-    with ZipFile(zip_path, 'r') as zf:
-        # Sort files numerically
-        file_list = sorted(
-            zf.namelist(),
-            key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else float('inf')
-        )
-        for filename in file_list[:num_structures]:
-            with zf.open(filename) as f:
-                text_stream = TextIOWrapper(f, encoding='utf-8')
-                structures.append(read(text_stream, format='extxyz'))
-    return structures
-
-def main():
-    if len(sys.argv) != 2:
-        sys.exit("Usage: python benchmark_runner.py <config_file>")
-
-    with open(sys.argv[1]) as f:
-        config = json.load(f)
-
-    logger.info("Starting benchmark runner...")
-
-    try:
-        structures = load_structures(config["num_structures"])
-        logger.info(f"Loaded {len(structures)} structures")
-    except Exception as e:
-        logger.error(f"Failed to load structures: {e}")
-        sys.exit(1)
-
-    # Shuffle for load balancing
-    import random
-    random.seed(42)
-    random.shuffle(structures)
-
-    # Resource detection
-    num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
-    use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
-
-    total_cores = os.cpu_count() or 1
-    num_workers = num_gpus if use_multi_gpu else 1
-    # Reserve cores for overhead if possible
-    available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores
-    threads_per_worker = max(1, available_cores // num_workers)
-
-    logger.info(f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)")
-
-    results = {"energies": [], "num_converged": 0, "failed_indices": []}
-    start_time = time.time()
-
-    if use_multi_gpu:
-        logger.info(f"Parallel execution on {num_gpus} GPUs")
-        batch_size = len(structures) // num_gpus
-        futures = []
-
-        ctx = multiprocessing.get_context('spawn')
-        with concurrent.futures.ProcessPoolExecutor(max_workers=num_gpus, mp_context=ctx) as executor:
-            for i in range(num_gpus):
-                start_idx = i * batch_size
-                end_idx = len(structures) if i == num_gpus - 1 else (i + 1) * batch_size
-
-                model_config = {
-                    "package": config["model_package"],
-                    "factory": config["model_factory"],
-                    "kwargs": config["model_kwargs"],
-                    "checkpoint": config["model_checkpoint"],
-                    "gpu_id": i
-                }
-
-                futures.append(executor.submit(
-                    process_batch, i, structures[start_idx:end_idx], start_idx, model_config, threads_per_worker
-                ))
-
-            for future in concurrent.futures.as_completed(futures):
-                try:
-                    batch_res = future.result()
-                    results["energies"].extend(batch_res["energies"])
-                    results["num_converged"] += batch_res["num_converged"]
-                    results["failed_indices"].extend(batch_res["failed_indices"])
-                except Exception as e:
-                    logger.error(f"Worker failed: {e}")
-    else:
-        logger.info("Single process execution")
-        model_config = {
-            "package": config["model_package"],
-            "factory": config["model_factory"],
-            "kwargs": config["model_kwargs"],
-            "checkpoint": config["model_checkpoint"]
-        }
-        results = process_batch(0, structures, 0, model_config, threads_per_worker)
-
-    elapsed = time.time() - start_time
-    logger.info(f"Benchmark complete in {elapsed:.1f}s. Converged: {results['num_converged']}/{len(structures)}")
-
-    with open("results.json", "w") as f:
-        json.dump(results, f, indent=2)
-
-if __name__ == "__main__":
-    import multiprocessing
-    main()
-'''
-
-    try:
-        # ----------------------------------------------------------------------
-        # 1. ENVIRONMENT SETUP
-        # ----------------------------------------------------------------------
-        logger.info("Step 1/4: Setting up environment...")
-
-        # Find UV binary
-        uv_bin = subprocess.check_output(
-            [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True
-        ).strip()
-
-        # Create UV virtual environment
-        subprocess.run(
-            [uv_bin, "venv", "--python", "3.11"],
-            cwd=work_dir,
-            check=True,
-            capture_output=True,
-        )
-
-        venv_python = work_dir / ".venv/bin/python"
-        if not venv_python.exists():
-            venv_python = work_dir / ".venv/Scripts/python.exe"  # Windows fallback
-
-        if not venv_python.exists():
-            raise RuntimeError(f"Virtual environment python not found at {venv_python}")
-
-        # Install dependencies
-        logger.info("Installing dependencies...")
-        subprocess.run(
-            [
-                uv_bin,
-                "pip",
-                "install",
-                "--python",
-                str(venv_python),
-                "matbench-discovery",
-            ],
-            cwd=work_dir,
-            check=True,
-        )
-        subprocess.run(
-            [uv_bin, "pip", "install", "--python", str(venv_python), model_package],
-            cwd=work_dir,
-            check=True,
-        )
-
-        # Set SSL cert file for HPC
-        env = dict(os.environ)
-        env["MBD_AUTO_DOWNLOAD_FILES"] = "true"
-
-        try:
-            certifi_path = subprocess.check_output(
-                [str(venv_python), "-c", "import certifi; print(certifi.where())"],
-                text=True,
-            ).strip()
-            env["SSL_CERT_FILE"] = certifi_path
-        except Exception as e:
-            logger.warning(f"Failed to set SSL_CERT_FILE: {e}")
-
-        # ----------------------------------------------------------------------
-        # 2. PREPARE BENCHMARK SCRIPT
-        # ----------------------------------------------------------------------
-        logger.info("Step 2/4: Preparing benchmark script...")
-
-        # Write runner script
-        runner_path = work_dir / "benchmark_runner.py"
-        runner_path.write_text(BENCHMARK_RUNNER_SCRIPT)
-
-        # Write config
-        config = {
-            "repo_url": repo_url,
-            "repo_ref": repo_ref,
-            "model_package": model_package,
-            "model_factory": model_factory,
-            "model_kwargs": model_kwargs,
-            "model_checkpoint": model_checkpoint,
-            "num_structures": num_structures,
-            "use_multi_gpu": use_multi_gpu,
-        }
-
-        config_path = work_dir / "config.json"
-        with open(config_path, "w") as f:
-            json.dump(config, f, indent=2)
-
-        # ----------------------------------------------------------------------
-        # 3. EXECUTE BENCHMARK
-        # ----------------------------------------------------------------------
-        logger.info("Step 3/4: Executing benchmark...")
-
-        # Run the runner script inside the venv
-        # We stream output directly to stdout so the user sees progress
-        proc = subprocess.run(
-            [str(venv_python), str(runner_path), str(config_path)],
-            cwd=work_dir,
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-            check=False,  # We check return code manually
-        )
-
-        if proc.returncode != 0:
-            raise RuntimeError(
-                f"Benchmark runner failed with return code {proc.returncode}"
-            )
-
-        # ----------------------------------------------------------------------
-        # 4. COLLECT RESULTS
-        # ----------------------------------------------------------------------
-        logger.info("Step 4/4: Collecting results...")
-
-        results_path = work_dir / "results.json"
-        if not results_path.exists():
-            raise RuntimeError(
-                "Results file not found - benchmark may have crashed silently"
-            )
-
-        with open(results_path) as f:
-            results = json.load(f)
-
-        logger.info("Benchmark completed successfully.")
-        return results
-
-    finally:
-        # Cleanup working directory
-        import shutil
-
-        shutil.rmtree(work_dir, ignore_errors=True)
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index c8f95e5d..b4348d27 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -1,26 +1,599 @@
 """Matbench Discovery benchmark task implementations."""
 
-from typing import TYPE_CHECKING, Any
+from __future__ import annotations
 
-from .remote_runner import run_matbench_is2re
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+from ..utils.remote_execution import run_remote_benchmark
+from ..utils.script_builder import BenchmarkScriptBuilder
+from ..utils.task import BaseBenchmarkTask
 
 if TYPE_CHECKING:
     from . import MatbenchDiscovery
+    from .enums import DatasetConfig, DatasetSize
+
+from .metrics import classify_stable, stable_metrics
+
+# ------------------------------------------------------------------------------
+# REMOTE FUNCTIONS
+# These functions are injected into the remote script.
+# They must be self-contained (imports inside or provided by builder).
+# ------------------------------------------------------------------------------
+
+
+def load_model(device: str):
+    """Initialize the model using the user-provided factory function.
+
+    The factory function is injected into this script by the benchmark framework.
+    """
+    # Call the user's factory function (injected as load_model_user)
+    model = load_model_user(device)  # noqa: F821
+    return model
+
+
+def get_material_ids_for_subset(
+    subset_type: str, seed: int = 42
+) -> Optional[List[str]]:
+    """Get material IDs for a specific dataset subset.
+
+    Args:
+        subset_type: One of 'full', 'unique_protos', 'random_10k', 'random_100'
+        seed: Random seed for sampling (default: 42)
+
+    Returns:
+        List of material IDs, or None for 'full' (load all)
+    """
+    if subset_type == "full":
+        return None  # Load all materials
+
+    import pandas as pd
+    from matbench_discovery.data import DataFiles
+
+    # Load wbm_summary
+    df = pd.read_csv(DataFiles.wbm_summary.path)
+
+    if subset_type == "unique_protos":
+        # Filter to unique prototypes (removes duplicates and MP overlaps)
+        df_filtered = df.query("unique_prototype")
+        return df_filtered["material_id"].tolist()
+
+    elif subset_type == "random_10k":
+        # Random sample of 10k unique prototypes (fixed seed for reproducibility)
+        df_filtered = df.query("unique_prototype")
+        df_sampled = df_filtered.sample(n=10000, random_state=seed)
+        return df_sampled["material_id"].tolist()
+
+    elif subset_type == "random_100":
+        # Random sample of 100 unique prototypes (fixed seed for reproducibility)
+        # Useful for quick end-to-end testing
+        df_filtered = df.query("unique_prototype")
+        df_sampled = df_filtered.sample(n=100, random_state=seed)
+        return df_sampled["material_id"].tolist()
+
+    else:
+        raise ValueError(f"Unknown subset_type: {subset_type}")
+
+
+# --- Reusable Process Functions ---
+
+
+def process_batch_relaxation(
+    batch_id: int,
+    structures: List[Tuple[str, Any]],
+    model_config: Dict[str, Any],
+    num_threads: int,
+) -> Dict[str, Any]:
+    """Process a batch of structures for IS2RE (Relaxation)."""
+    import logging
+    import os
+    import time
+
+    import torch
+    from ase.optimize import FIRE
+
+    # Configure thread limits to avoid contention
+    os.environ["OMP_NUM_THREADS"] = str(num_threads)
+    torch.set_num_threads(num_threads)
+
+    gpu_id = model_config.get("gpu_id")
+    device = setup_device(gpu_id)  # noqa: F821
+
+    worker_logger = logging.getLogger(f"worker_{batch_id}")
+    worker_logger.info(
+        f"Started relaxation on {device} with {len(structures)} structures. Threads: {num_threads}"
+    )
+
+    global _MODEL_CACHE
+    try:
+        if _MODEL_CACHE is None:
+            model = load_model(device)
+            _MODEL_CACHE = model
+        else:
+            model = _MODEL_CACHE
+    except Exception as e:
+        worker_logger.error(f"Failed to initialize model: {e}")
+        worker_logger.error(
+            "Model initialization is critical - cannot continue benchmark"
+        )
+        raise RuntimeError(f"Model initialization failed: {e}") from e
+
+    results = {}
+    batch_start = time.time()
+
+    for i, (struct_id, atoms) in enumerate(structures):
+        try:
+            atoms.calc = model
+            opt = FIRE(atoms, logfile=None)
+            opt.run(fmax=0.05, steps=500)
+
+            energy = atoms.get_potential_energy()
+            results[struct_id] = {"energy": energy}
+
+            if (i + 1) % 10 == 0:
+                elapsed = time.time() - batch_start
+                rate = (i + 1) / elapsed if elapsed > 0 else 0
+                worker_logger.info(
+                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
+                )
+
+        except Exception as e:
+            worker_logger.warning(f"Structure {struct_id} failed: {e}")
+            results[struct_id] = {"energy": None, "error": str(e)}
+
+    return results
+
+
+def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
+    """Load initial structures for IS2RE."""
+    from io import TextIOWrapper
+    from zipfile import ZipFile
+
+    from ase.io import read
+    from matbench_discovery.data import DataFiles
+
+    dataset_subset = config.get("dataset_subset", "full")
+    dataset_seed = config.get("dataset_seed", 42)
+    mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed)
+
+    structures = []
+    zip_path = DataFiles.wbm_initial_atoms.path
+
+    with ZipFile(zip_path, "r") as zf:
+        if mat_ids is None:
+            # Load all files (full dataset)
+            file_list = sorted(
+                zf.namelist(),
+                key=lambda x: int(x.split(".")[0])
+                if x.split(".")[0].isdigit()
+                else float("inf"),
+            )
+            num_structures = config.get("num_structures", 100)
+            file_list = file_list[:num_structures]
+        else:
+            # Filter to specific material IDs
+            mat_id_set = set(mat_ids)
+            file_list = [
+                f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set
+            ]
+
+        for filename in file_list:
+            with zf.open(filename) as f:
+                text_stream = TextIOWrapper(f, encoding="utf-8")
+                structures.append((filename, read(text_stream, format="extxyz")))
+    return structures
+
+
+def calculate_metrics_energy(
+    results: Dict[str, Any], config: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Calculate energy metrics using matbench-discovery's stable_metrics algorithm.
+
+    Uses the injected stable_metrics function.
+    Returns: F1, DAF, Precision, Recall, Accuracy, TPR, FPR, TNR, FNR, TP, FP, TN, FN, MAE, RMSE, R2
+    """
+    import logging
+
+    import numpy as np
+
+    logger = logging.getLogger("metrics")
+
+    # Results format: {id: {"energy": float, "error": str}}
+    if len(results) == 0:
+        return {"error": "No results to evaluate"}
+
+    try:
+        # Import matbench-discovery data
+        from matbench_discovery.data import df_wbm
+    except Exception as e:
+        return {"error": f"Failed to import matbench-discovery: {e}"}
+
+    # Extract model energies
+    model_energies = {}
+    for sid, res in results.items():
+        if isinstance(res, dict) and res.get("energy") is not None:
+            mat_id = sid.replace(".extxyz", "")
+            model_energies[mat_id] = res["energy"]
+
+    if not model_energies:
+        return {"error": "No valid energies found in results"}
+
+    # Get common IDs between predictions and ground truth
+    # Use direct string column names instead of MbdKey enum to avoid issues
+    df_wbm_indexed = df_wbm.set_index("material_id")
+    common_ids = list(set(model_energies.keys()) & set(df_wbm_indexed.index))
+
+    if not common_ids:
+        return {"error": "No matching IDs between results and ground truth"}
+
+    # Get subset of data
+    df_subset = df_wbm_indexed.loc[common_ids]
+
+    # Calculate predicted formation energies
+    y_pred = np.array([model_energies[mid] for mid in common_ids])
+    y_true = df_subset["uncorrected_energy"].values  # Uncorrected total energy
+    n_atoms = df_subset["n_sites"].values
+
+    # Predicted formation energy ERROR per atom (from total energy difference)
+    # This is the ERROR: (E_pred - E_dft) / n_atoms
+    e_form_error = (y_pred - y_true) / n_atoms
+
+    # Get ground truth e_above_hull for stability classification
+    each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values
+
+    # Calculate predicted e_above_hull
+    # Since e_form_error is already (E_pred - E_dft)/n_atoms, we just add it to each_true
+    each_pred = each_true + e_form_error
+
+    # Debug logging to understand the distribution
+    logger.info("Energy statistics:")
+    logger.info(
+        f"  each_true: min={each_true.min():.4f}, max={each_true.max():.4f}, mean={each_true.mean():.4f}"
+    )
+    logger.info(
+        f"  each_pred: min={each_pred.min():.4f}, max={each_pred.max():.4f}, mean={each_pred.mean():.4f}"
+    )
+
+    # Calculate global prevalence for DAF normalization (matches official leaderboard)
+    # Filter to unique prototypes
+    df_unique = df_wbm.query("unique_prototype")
+    # Calculate prevalence: (stable count) / (total count)
+    # Stability threshold is 0.0
+    stable_count = (df_unique["e_above_hull_mp2020_corrected_ppd_mp"] <= 0).sum()
+    global_prevalence = stable_count / len(df_unique)
+
+    logger.info(
+        f"Using global prevalence for DAF: {global_prevalence:.6f} ({stable_count}/{len(df_unique)})"
+    )
+
+    # Calculate metrics using the injected function
+    # stable_metrics is injected into the script scope
+    metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence)
+
+    # Add num_evaluated
+    metrics["num_evaluated"] = len(common_ids)
+
+    return metrics
+
+
+def process_batch_static(
+    batch_id: int,
+    structures: List[Tuple[str, Any]],
+    model_config: Dict[str, Any],
+    num_threads: int,
+) -> Dict[str, Any]:
+    """Process a batch of structures for RS2RE (Static Calculation)."""
+    import logging
+    import os
+    import time
+
+    import torch
+
+    os.environ["OMP_NUM_THREADS"] = str(num_threads)
+    torch.set_num_threads(num_threads)
+
+    gpu_id = model_config.get("gpu_id")
+    device = setup_device(gpu_id)  # noqa: F821
+
+    worker_logger = logging.getLogger(f"worker_{batch_id}")
+    worker_logger.info(
+        f"Started static calculation on {device} with {len(structures)} structures."
+    )
+
+    global _MODEL_CACHE
+    try:
+        if _MODEL_CACHE is None:
+            model = load_model(device)
+            _MODEL_CACHE = model
+        else:
+            model = _MODEL_CACHE
+    except Exception as e:
+        return {sid: {"energy": None, "error": str(e)} for sid, _ in structures}
+
+    results = {}
+    batch_start = time.time()
+
+    for i, (struct_id, atoms) in enumerate(structures):
+        try:
+            atoms.calc = model
+            # No relaxation, just static energy
+            energy = atoms.get_potential_energy()
+            results[struct_id] = {"energy": energy}
+
+            if (i + 1) % 50 == 0:
+                elapsed = time.time() - batch_start
+                rate = (i + 1) / elapsed if elapsed > 0 else 0
+                worker_logger.info(
+                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
+                )
+
+        except Exception as e:
+            worker_logger.warning(f"Structure {struct_id} failed: {e}")
+            results[struct_id] = {"energy": None, "error": str(e)}
+
+    return results
+
+
+def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
+    """Load relaxed structures for RS2RE."""
+    from io import TextIOWrapper
+    from zipfile import ZipFile
+
+    from ase.io import read
+    from matbench_discovery.data import DataFiles
+
+    dataset_subset = config.get("dataset_subset", "full")
+    dataset_seed = config.get("dataset_seed", 42)
+    mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed)
+
+    structures = []
+    # Use relaxed atoms
+    zip_path = DataFiles.wbm_relaxed_atoms.path
+
+    with ZipFile(zip_path, "r") as zf:
+        if mat_ids is None:
+            # Load all files (full dataset)
+            file_list = sorted(
+                zf.namelist(),
+                key=lambda x: int(x.split(".")[0])
+                if x.split(".")[0].isdigit()
+                else float("inf"),
+            )
+            num_structures = config.get("num_structures", 100)
+            file_list = file_list[:num_structures]
+        else:
+            # Filter to specific material IDs
+            mat_id_set = set(mat_ids)
+            file_list = [
+                f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set
+            ]
+
+        for filename in file_list:
+            with zf.open(filename) as f:
+                text_stream = TextIOWrapper(f, encoding="utf-8")
+                structures.append((filename, read(text_stream, format="extxyz")))
+    return structures
+
+
+# Reuse calculate_metrics_energy for all energy-only tasks
+
+
+def process_batch_forces(
+    batch_id: int,
+    structures: List[Tuple[str, Any]],
+    model_config: Dict[str, Any],
+    num_threads: int,
+) -> Dict[str, Any]:
+    """Process a batch of structures for S2EFS (Energy, Forces, Stress)."""
+    import logging
+    import os
+    import time
+
+    import torch
+
+    os.environ["OMP_NUM_THREADS"] = str(num_threads)
+    torch.set_num_threads(num_threads)
+
+    gpu_id = model_config.get("gpu_id")
+    device = setup_device(gpu_id)  # noqa: F821
+
+    worker_logger = logging.getLogger(f"worker_{batch_id}")
+    worker_logger.info(
+        f"Started forces calculation on {device} with {len(structures)} structures."
+    )
+
+    global _MODEL_CACHE
+    try:
+        if _MODEL_CACHE is None:
+            model = load_model(device)
+            _MODEL_CACHE = model
+        else:
+            model = _MODEL_CACHE
+    except Exception as e:
+        return {sid: {"error": str(e)} for sid, _ in structures}
 
+    results = {}
+    batch_start = time.time()
 
-class IS2RETask:
-    """Initial Structure to Relaxed Energy benchmark task.
+    for i, (struct_id, atoms) in enumerate(structures):
+        try:
+            atoms.calc = model
 
-    This task evaluates a model's ability to predict the relaxed energy
-    and geometry of crystal structures starting from unrelaxed initial
-    configurations.
+            energy = atoms.get_potential_energy()
+            forces = atoms.get_forces().tolist()
+            stress = atoms.get_stress().tolist()
+
+            results[struct_id] = {"energy": energy, "forces": forces, "stress": stress}
+
+            if (i + 1) % 50 == 0:
+                elapsed = time.time() - batch_start
+                rate = (i + 1) / elapsed if elapsed > 0 else 0
+                worker_logger.info(
+                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
+                )
 
-    The task:
-    1. Loads initial (unrelaxed) structures from the WBM test set
-    2. Uses the model to perform geometry optimization
-    3. Records final energies and relaxed structures
-    4. Calculates metrics comparing to DFT ground truth
+        except Exception as e:
+            worker_logger.warning(f"Structure {struct_id} failed: {e}")
+            results[struct_id] = {"error": str(e)}
+
+    return results
+
+
+def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
+    """Load MP trajectories for S2EFS."""
+    from io import TextIOWrapper
+    from zipfile import ZipFile
+
+    from ase.io import read
+    from matbench_discovery.data import DataFiles
+
+    num_structures = config.get("num_structures", 100)
+    structures = []
+    # Use MP trajectories
+    zip_path = DataFiles.mp_trj_extxyz.path
+
+    with ZipFile(zip_path, "r") as zf:
+        file_list = sorted(zf.namelist())
+        for filename in file_list[:num_structures]:
+            with zf.open(filename) as f:
+                text_stream = TextIOWrapper(f, encoding="utf-8")
+                # Read all frames? Or just one? Usually S2EFS is on frames.
+                # Let's assume we evaluate on the last frame or all frames.
+                # For simplicity, let's take the last frame (relaxed?) or random?
+                # Actually, MP trj contains relaxation steps.
+                # Let's read the last frame for now as a proxy for "a structure".
+                # Or better, read all frames and treat them as separate tasks?
+                # For this benchmark, let's just treat the file as containing one structure per file if possible,
+                # or just take the last one.
+                atoms_list = read(text_stream, format="extxyz", index=":")
+                if atoms_list:
+                    # Just take the last one for now
+                    structures.append((filename, atoms_list[-1]))
+    return structures
+
+
+def calculate_metrics_forces(
+    results: Dict[str, Any], config: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Calculate comprehensive S2EFS metrics (Energy, Forces, Stress).
+
+    Returns MAE, RMSE, and R2 for each component.
     """
+    from io import TextIOWrapper
+    from zipfile import ZipFile
+
+    import numpy as np
+    from ase.io import read
+    from matbench_discovery.data import DataFiles
+    from sklearn.metrics import r2_score
+
+    # We need to load ground truth from the dataset itself because MP trj has E/F/S in the extxyz
+    # This is expensive to re-read. Ideally we should have passed GT in results or loaded it efficiently.
+    # For now, let's re-read the GT for the processed IDs.
+
+    metrics = {
+        "energy_mae": [],
+        "energy_rmse": [],
+        "force_mae": [],
+        "force_rmse": [],
+        "stress_mae": [],
+        "stress_rmse": [],
+    }
+
+    # Collect all predictions and ground truth for R2 calculation
+    all_e_pred, all_e_true = [], []
+    all_f_pred, all_f_true = [], []
+    all_s_pred, all_s_true = [], []
+
+    zip_path = DataFiles.mp_trj_extxyz.path
+
+    with ZipFile(zip_path, "r") as zf:
+        for sid, res in results.items():
+            if "error" in res:
+                continue
+
+            try:
+                with zf.open(sid) as f:
+                    text_stream = TextIOWrapper(f, encoding="utf-8")
+                    atoms_list = read(text_stream, format="extxyz", index=":")
+                    gt_atoms = atoms_list[-1]  # Matching load_dataset logic
+
+                    # Energy (per atom)
+                    e_pred = res["energy"]
+                    e_true = gt_atoms.get_potential_energy()
+                    n_atoms = len(gt_atoms)
+
+                    energy_error = abs(e_pred - e_true) / n_atoms
+                    metrics["energy_mae"].append(energy_error)
+                    metrics["energy_rmse"].append(energy_error**2)
+
+                    all_e_pred.append(e_pred / n_atoms)
+                    all_e_true.append(e_true / n_atoms)
+
+                    # Forces
+                    f_pred = np.array(res["forces"])
+                    f_true = gt_atoms.get_forces()
+                    force_error = np.abs(f_pred - f_true)
+                    metrics["force_mae"].append(force_error.mean())
+                    metrics["force_rmse"].append((force_error**2).mean())
+
+                    all_f_pred.extend(f_pred.flatten())
+                    all_f_true.extend(f_true.flatten())
+
+                    # Stress
+                    s_pred = np.array(res["stress"])
+                    s_true = gt_atoms.get_stress()
+                    stress_error = np.abs(s_pred - s_true)
+                    metrics["stress_mae"].append(stress_error.mean())
+                    metrics["stress_rmse"].append((stress_error**2).mean())
+
+                    all_s_pred.extend(s_pred.flatten())
+                    all_s_true.extend(s_true.flatten())
+
+            except Exception:
+                pass
+
+    # Calculate final metrics
+    result_metrics = {}
+
+    if metrics["energy_mae"]:
+        result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"]))
+        result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"])))
+        result_metrics["energy_r2"] = (
+            float(r2_score(all_e_true, all_e_pred))
+            if len(all_e_true) > 1
+            else float("nan")
+        )
+
+    if metrics["force_mae"]:
+        result_metrics["force_mae"] = float(np.mean(metrics["force_mae"]))
+        result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"])))
+        result_metrics["force_r2"] = (
+            float(r2_score(all_f_true, all_f_pred))
+            if len(all_f_true) > 1
+            else float("nan")
+        )
+
+    if metrics["stress_mae"]:
+        result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"]))
+        result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"])))
+        result_metrics["stress_r2"] = (
+            float(r2_score(all_s_true, all_s_pred))
+            if len(all_s_true) > 1
+            else float("nan")
+        )
+
+    result_metrics["num_evaluated"] = len(metrics["energy_mae"])
+
+    return result_metrics
+
+
+# ------------------------------------------------------------------------------
+# Task Classes
+# ------------------------------------------------------------------------------
+
+
+class MatbenchTask(BaseBenchmarkTask):
+    """Base class for Matbench Discovery tasks."""
 
     def __init__(
         self,
@@ -28,296 +601,321 @@ def __init__(
         repo_url: str,
         repo_ref: str,
         model_package: str | None = None,
+        task_name: str = "unknown",
     ):
-        """Initialize IS2RE task.
+        super().__init__(adapter, repo_url, repo_ref, model_package)
+        self.name = task_name
+
+    def calculate_metrics(self, output: Dict[str, Any]) -> Dict[str, Any]:
+        """Retrieve metrics from the remote output."""
+        return output.get("metrics", {})
+
+    def _build_script(
+        self, process_fn, load_dataset_fn, calc_metrics_fn, model_factory
+    ) -> str:
+        """Build the remote execution script with specific functions.
 
         Args:
-            adapter: MatbenchDiscovery adapter instance
-            repo_url: Matbench Discovery repository URL
-            repo_ref: Git ref (branch/tag/commit) to use
-            model_package: Default model package to install (can override in submit)
+            process_fn: Task-specific process_batch function
+            load_dataset_fn: Task-specific load_dataset function
+            calc_metrics_fn: Task-specific calculate_metrics function
+            model_factory: User-provided function that creates the model
         """
-        self.adapter = adapter
-        self.repo_url = repo_url
-        self.repo_ref = repo_ref
-        self.model_package = model_package
-        self.name = "IS2RE"
+        builder = BenchmarkScriptBuilder()
+
+        # Add global model cache
+        builder.add_preamble("_MODEL_CACHE = None")
+
+        # Common imports
+        builder.add_import("from typing import List, Dict, Any, Tuple, Optional")
+        builder.add_import("import torch")
+        builder.add_import("from ase.optimize import FIRE")
+        builder.add_import("from ase.io import read")
+        builder.add_import("from matbench_discovery.data import DataFiles")
+        builder.add_import("from zipfile import ZipFile")
+        builder.add_import("from io import TextIOWrapper")
+        builder.add_import("import pandas as pd")
+        builder.add_import("import numpy as np")
+        builder.add_import("from collections.abc import Sequence")
+        builder.add_import("from sklearn.metrics import r2_score")
+
+        # Add user's model factory (renamed to load_model_user so load_model can call it)
+        builder.add_function(model_factory, name="load_model_user")
+
+        # Add our load_model wrapper that calls load_model_user
+        builder.add_function(load_model)
+
+        # Add helper function for dataset subset filtering
+        builder.add_function(get_material_ids_for_subset)
+
+        # Add task-specific functions with standard names expected by runner
+        builder.add_function(process_fn, name="process_batch")
+        builder.add_function(load_dataset_fn, name="load_dataset")
+        builder.add_function(calc_metrics_fn, name="calculate_metrics_remote")
+
+        # Inject metrics helper functions
+        builder.add_function(classify_stable)
+        builder.add_function(stable_metrics)
+
+        return builder.build()
 
     def submit(
         self,
-        model=None,
-        num_structures: int = 100,
-        model_package: str | None = None,
-        model_factory: str | None = None,
-        model_kwargs: dict | None = None,
-        use_multi_gpu: bool = True,
+        model_factory: callable,
+        model_packages: str | List[str],
+        num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
+        checkpoint_name: str | None = None,
     ):
-        """Submit IS2RE benchmark job to remote executor.
-
-        You can specify the model in two ways:
-        1. Pass a local model instance (will introspect to get remote construction info)
-        2. Explicitly specify model_package and model_factory
+        """Submit benchmark job to remote executor.
 
         Args:
-            model: (Optional) Local model instance. If provided, will extract
-                   package, class, and checkpoint information from it.
-            num_structures: Number of test structures to evaluate (default: 100).
-                           Full test set has ~257k structures. Use smaller values
-                           for quick testing.
-            model_package: Python package name to install (e.g., "mace-torch").
-                          Required if model is None.
-            model_factory: How to instantiate the model on remote. Can be:
-                          - Function name: "mace_mp" (will call as function)
-                          - Class name: "MACE" (will instantiate as class)
-                          Required if model is None.
-            model_kwargs: Dictionary of kwargs to pass when creating model remotely.
-                         Example: {"model": "medium", "device": "cuda"}
-            use_multi_gpu: If True, automatically detect and use all available GPUs
-                          in parallel for faster processing. If False, use single
-                          GPU/CPU. (default: True)
-
-        Returns:
-            Future object that will contain benchmark results when complete.
-            Call .result() to block and wait for completion.
-
-        Examples:
-            Using local model instance:
-            >>> from mace.calculators import mace_mp
-            >>> model = mace_mp(model="medium")
-            >>> future = task.submit(model, num_structures=50)
-
-            Specifying remote construction explicitly:
-            >>> future = task.submit(
-            ...     model_package="mace-torch",
-            ...     model_factory="mace_mp",
-            ...     model_kwargs={"model": "medium", "device": "cuda"},
-            ...     num_structures=50,
-            ...     use_multi_gpu=True
-            ... )
+            model_factory: User-provided function that takes device and returns an ASE calculator.
+                          Example: lambda device: mace_mp(model="medium", device=device)
+            model_packages: Python package(s) to install. Can be a single package string
+                          (e.g., "mace-torch") or a list (e.g., ["torch", "mace-torch"])
+            num_structures: Number of structures to evaluate, or DatasetSize enum, or DatasetConfig
+                          (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10))
+            checkpoint_name: Optional name for the checkpoint file (e.g. "my_checkpoint.json").
+                             If not provided, one will be generated.
         """
-        # Determine how to construct model remotely
-        if model is not None:
-            # Extract info from local model instance
-            if model_package is None:
-                if self.model_package is not None:
-                    model_package = self.model_package
-                else:
-                    # Infer from model's module
-                    model_package = model.__class__.__module__.split(".")[0]
-
-            if model_factory is None:
-                model_factory = model.__class__.__name__
-
-            # Get checkpoint path if model has one
-            model_checkpoint = None
-            if hasattr(model, "checkpoint_path"):
-                model_checkpoint = model.checkpoint_path
-            elif hasattr(model, "checkpoint"):
-                model_checkpoint = model.checkpoint
-
-            # Try to extract initialization kwargs if available
-            if model_kwargs is None and hasattr(model, "_init_kwargs"):
-                model_kwargs = model._init_kwargs
+        import time
+        import uuid
 
+        from .enums import DatasetConfig, DatasetSize
+
+        # Build script with task-specific functions AND user's factory
+        script_content = self._build_script(
+            self.process_fn,
+            self.load_dataset_fn,
+            self.calc_metrics_fn,
+            model_factory,  # Inject user's factory function
+        )
+
+        # Handle single package string or list of packages
+        packages = (
+            [model_packages] if isinstance(model_packages, str) else model_packages
+        )
+        dependencies = ["matbench-discovery>=1.3.0"] + packages
+
+        # Handle DatasetSize enum, DatasetConfig, or integer
+        if isinstance(num_structures, DatasetSize):
+            runner_config = {
+                "repo_url": self.repo_url,
+                "repo_ref": self.repo_ref,
+                "dataset_subset": num_structures.value,
+                "dataset_seed": 42,  # Default seed
+            }
+        elif isinstance(num_structures, DatasetConfig):
+            runner_config = {
+                "repo_url": self.repo_url,
+                "repo_ref": self.repo_ref,
+                "dataset_subset": num_structures.subset.value,
+                "dataset_seed": num_structures.seed,
+            }
         else:
-            # Must provide explicit construction info
-            if model_package is None or model_factory is None:
-                raise ValueError(
-                    "If model is not provided, must specify both "
-                    "model_package and model_factory"
-                )
-            model_checkpoint = None
+            # Integer - use traditional num_structures approach
+            runner_config = {
+                "repo_url": self.repo_url,
+                "repo_ref": self.repo_ref,
+                "num_structures": num_structures,
+                "dataset_subset": "full",
+            }
+
+        # Generate checkpoint name if not provided
+        if not checkpoint_name:
+            # Format: matbench_{model}_{subset}_{timestamp}_{uuid}.json
+            # Clean up model name for filename
+            model_str = (
+                str(model_packages)
+                .replace("[", "")
+                .replace("]", "")
+                .replace("'", "")
+                .replace('"', "")
+                .replace(",", "_")
+                .replace(" ", "")
+            )
+            subset_str = runner_config.get("dataset_subset", "custom")
+            timestamp = int(time.time())
+            short_uuid = str(uuid.uuid4())[:8]
+            checkpoint_name = (
+                f"matbench_{model_str}_{subset_str}_{timestamp}_{short_uuid}.json"
+            )
 
-        if model_kwargs is None:
-            model_kwargs = {}
+        print(f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}")
 
-        # Get executor (will create if needed) and submit remote execution
         executor = self.adapter._get_executor()
         future = executor.submit(
-            run_matbench_is2re,
-            repo_url=self.repo_url,
-            repo_ref=self.repo_ref,
-            model_package=model_package,
-            model_factory=model_factory,
-            model_kwargs=model_kwargs,
-            model_checkpoint=model_checkpoint,
-            num_structures=num_structures,
-            use_multi_gpu=use_multi_gpu,
+            run_remote_benchmark,
+            script_content=script_content,
+            dependencies=dependencies,
+            config=runner_config,
+            checkpoint_name=checkpoint_name,
         )
 
+        # Attach checkpoint path to future for programmatic access
+        future.checkpoint_path = f"~/.garden/benchmarks/{checkpoint_name}"
+
         return future
 
     def local(
         self,
-        model=None,
-        num_structures: int = 100,
-        model_package: str | None = None,
-        model_factory: str | None = None,
-        model_kwargs: dict | None = None,
-        use_multi_gpu: bool = True,
+        model_factory: callable,
+        model_packages: str | List[str],
+        num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
+        checkpoint_path: str | None = None,
     ) -> dict:
-        """Run benchmark locally in ephemeral UV environment.
-
-        This executes the same benchmark workflow locally instead of submitting
-        to a remote Globus Compute endpoint. Useful for testing and development.
+        """Run benchmark locally.
 
         Args:
-            model: Optional local model instance to extract metadata from
-            num_structures: Number of test structures to evaluate
-            model_package: Python package name to install (e.g., "mace-torch")
-            model_factory: Function or class name to create model
-            model_kwargs: Dictionary of kwargs for model creation
-            use_multi_gpu: If True, automatically detect and use all available GPUs
-                          in parallel. If False, use single GPU/CPU. (default: True)
-
-        Returns:
-            Dictionary with benchmark results (same format as remote execution)
-
-        Example:
-            >>> results = task.local(
-            ...     model_package="mace-torch",
-            ...     model_factory="mace_mp",
-            ...     model_kwargs={"model": "medium", "device": "cpu"},
-            ...     num_structures=10,
-            ...     use_multi_gpu=False
-            ... )
+            model_factory: User-provided function that takes device and returns an ASE calculator
+            model_packages: Python package(s) to install. Can be a single package string
+                          (e.g., "mace-torch") or a list (e.g., ["torch", "mace-torch"])
+            num_structures: Number of structures to evaluate, or DatasetSize enum, or DatasetConfig
+                          (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10))
+            checkpoint_path: Optional path to resume from checkpoint
         """
-        import json
-        import subprocess
-        import tempfile
-        from pathlib import Path
-
-        # Extract model metadata if model instance provided
-        if model is not None:
-            if model_package is None:
-                if self.model_package is not None:
-                    model_package = self.model_package
-                else:
-                    model_package = model.__class__.__module__.split(".")[0]
-
-            if model_factory is None:
-                model_factory = model.__class__.__name__
-
-            model_checkpoint = None
-            if hasattr(model, "checkpoint_path"):
-                model_checkpoint = model.checkpoint_path
-            elif hasattr(model, "checkpoint"):
-                model_checkpoint = model.checkpoint
-
-            if model_kwargs is None and hasattr(model, "_init_kwargs"):
-                model_kwargs = model._init_kwargs
+        from ..utils.remote_execution import run_remote_benchmark
+        from .enums import DatasetConfig, DatasetSize
+
+        # Build script with task-specific functions AND user's factory
+        script_content = self._build_script(
+            self.process_fn, self.load_dataset_fn, self.calc_metrics_fn, model_factory
+        )
+
+        # Handle single package string or list of packages
+        packages = (
+            [model_packages] if isinstance(model_packages, str) else model_packages
+        )
+        dependencies = ["matbench-discovery>=1.3.0"] + packages
+
+        # Handle DatasetSize enum, DatasetConfig, or integer
+        if isinstance(num_structures, DatasetSize):
+            runner_config = {
+                "repo_url": self.repo_url,
+                "repo_ref": self.repo_ref,
+                "dataset_subset": num_structures.value,
+                "dataset_seed": 42,  # Default seed
+            }
+        elif isinstance(num_structures, DatasetConfig):
+            runner_config = {
+                "repo_url": self.repo_url,
+                "repo_ref": self.repo_ref,
+                "dataset_subset": num_structures.subset.value,
+                "dataset_seed": num_structures.seed,
+            }
         else:
-            if model_package is None or model_factory is None:
-                raise ValueError(
-                    "If model is not provided, must specify both "
-                    "model_package and model_factory"
-                )
-            model_checkpoint = None
-
-        if model_kwargs is None:
-            model_kwargs = {}
-
-        # Run benchmark in subprocess with isolated environment
-        import sys
-
-        config = {
-            "repo_url": self.repo_url,
-            "repo_ref": self.repo_ref,
-            "model_package": model_package,
-            "model_factory": model_factory,
-            "model_kwargs": model_kwargs,
-            "model_checkpoint": model_checkpoint,
-            "num_structures": num_structures,
-            "use_multi_gpu": use_multi_gpu,
-        }
-
-        results_file_path = (
-            Path(tempfile.gettempdir()) / f"benchmark_results_{id(config)}.json"
+            # Integer - use traditional num_structures approach
+            runner_config = {
+                "repo_url": self.repo_url,
+                "repo_ref": self.repo_ref,
+                "num_structures": num_structures,
+                "dataset_subset": "full",
+            }
+
+        # Run locally (no Globus Compute)
+        return run_remote_benchmark(
+            script_content=script_content,
+            dependencies=dependencies,
+            config=runner_config,
+            checkpoint_path=checkpoint_path,
         )
 
-        wrapper_script = f'''
-import json
-from garden_ai.benchmarks.matbench_discovery.remote_runner import run_matbench_is2re
 
-config = {repr(config)}
-results = run_matbench_is2re(**config)
+class IS2RETask(MatbenchTask):
+    """Initial Structure to Relaxed Energy."""
 
-with open("{results_file_path}", "w") as f:
-    json.dump(results, f, indent=2)
-'''
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="IS2RE", **kwargs)
+        self.process_fn = process_batch_relaxation
+        self.load_dataset_fn = load_dataset_wbm_initial
+        self.calc_metrics_fn = calculate_metrics_energy
 
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
-            f.write(wrapper_script)
-            wrapper_path = f.name
 
-        try:
-            # Run without capturing output so logs stream to console in real-time
-            result = subprocess.run(
-                [sys.executable, wrapper_path],
-                timeout=3600,
-                # Don't capture output - let it stream to console
-                stdout=None,
-                stderr=None,
-            )
+class RS2RETask(MatbenchTask):
+    """Relaxed Structure to Relaxed Energy."""
 
-            if result.returncode != 0:
-                raise RuntimeError(
-                    f"Local benchmark failed with return code {result.returncode}"
-                )
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="RS2RE", **kwargs)
+        self.process_fn = process_batch_static
+        self.load_dataset_fn = load_dataset_wbm_relaxed
+        self.calc_metrics_fn = calculate_metrics_energy
 
-            if not results_file_path.exists():
-                raise RuntimeError(
-                    f"Benchmark results file not found at {results_file_path}"
-                )
 
-            with open(results_file_path) as f:
-                return json.load(f)
+class S2EFSTask(MatbenchTask):
+    """Structure to Energy, Forces, Stress."""
 
-        finally:
-            Path(wrapper_path).unlink(missing_ok=True)
-            results_file_path.unlink(missing_ok=True)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="S2EFS", **kwargs)
+        self.process_fn = process_batch_forces
+        self.load_dataset_fn = load_dataset_mp_trj
+        self.calc_metrics_fn = calculate_metrics_forces
 
-    def calculate_metrics(self, outputs: dict) -> dict[str, Any]:
-        # TODO: implement the full metrics calculation,
-        # this is just a placeholder for now
-        """Calculate benchmark metrics from raw outputs.
 
-        For MVP, this returns basic statistics. Future versions will compare
-        against DFT ground truth and calculate proper benchmark metrics like
-        F1 score, discovery yield, etc.
+class S2EFTask(MatbenchTask):
+    """Structure to Energy, Force."""
 
-        Args:
-            outputs: Dictionary from remote execution containing:
-                - energies: List of relaxed energies
-                - num_converged: Number of successful relaxations
-                - failed_indices: Indices of failed structures
-
-        Returns:
-            Dictionary of calculated metrics:
-                - num_attempted: Total structures attempted
-                - num_converged: Number of successful relaxations
-                - success_rate: Fraction of successful relaxations
-                - mean_energy: Average final energy (eV/atom, if available)
-                - num_failed: Count of failed relaxations
-        """
-        energies = outputs.get("energies", [])
-        num_converged = outputs.get("num_converged", 0)
-        failed_indices = outputs.get("failed_indices", [])
-
-        # Filter out None values (failed relaxations)
-        valid_energies = [e for e in energies if e is not None]
-
-        metrics = {
-            "num_attempted": len(energies),
-            "num_converged": num_converged,
-            "num_failed": len(failed_indices),
-            "success_rate": num_converged / len(energies) if energies else 0.0,
-        }
-
-        # Calculate energy statistics if we have valid results
-        if valid_energies:
-            metrics["mean_energy"] = sum(valid_energies) / len(valid_energies)
-            metrics["min_energy"] = min(valid_energies)
-            metrics["max_energy"] = max(valid_energies)
-
-        return metrics
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="S2EF", **kwargs)
+        self.process_fn = process_batch_forces
+        self.load_dataset_fn = load_dataset_mp_trj
+        self.calc_metrics_fn = calculate_metrics_forces
+
+
+class S2EFSMTask(MatbenchTask):
+    """Structure to Energy, Force, Stress, Magmoms."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="S2EFSM", **kwargs)
+        self.process_fn = process_batch_forces
+        self.load_dataset_fn = load_dataset_mp_trj
+        self.calc_metrics_fn = calculate_metrics_forces
+
+
+class IS2ETask(MatbenchTask):
+    """Initial Structure to Energy."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="IS2E", **kwargs)
+        self.process_fn = process_batch_static
+        self.load_dataset_fn = load_dataset_wbm_initial
+        self.calc_metrics_fn = calculate_metrics_energy
+
+
+class S2ETask(MatbenchTask):
+    """Structure to Energy."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="S2E", **kwargs)
+        self.process_fn = process_batch_static
+        self.load_dataset_fn = load_dataset_wbm_relaxed
+        self.calc_metrics_fn = calculate_metrics_energy
+
+
+class S2RETask(MatbenchTask):
+    """Structure to Relaxed Energy."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="S2RE", **kwargs)
+        self.process_fn = process_batch_relaxation
+        self.load_dataset_fn = load_dataset_wbm_initial
+        self.calc_metrics_fn = calculate_metrics_energy
+
+
+class RP2RETask(MatbenchTask):
+    """Relaxed Prototype to Relaxed Energy."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="RP2RE", **kwargs)
+        self.process_fn = process_batch_relaxation
+        self.load_dataset_fn = load_dataset_wbm_initial  # Placeholder
+        self.calc_metrics_fn = calculate_metrics_energy
+
+
+class IP2ETask(MatbenchTask):
+    """Initial Prototype to Energy."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, task_name="IP2E", **kwargs)
+        self.process_fn = process_batch_static
+        self.load_dataset_fn = load_dataset_wbm_initial  # Placeholder
+        self.calc_metrics_fn = calculate_metrics_energy

From 77a4a01675f21d74664c0824629c7a90f74d77e7 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Tue, 2 Dec 2025 11:34:27 -0700
Subject: [PATCH 04/23] WIP cleanup

---
 .../examples/run_random_10k_benchmark.py      |   2 +-
 .../benchmarks/matbench_discovery/remote.py   | 485 +++++++++++
 .../benchmarks/matbench_discovery/tasks.py    | 757 +++---------------
 garden_ai/benchmarks/templates/base_runner.py | 248 ++++++
 garden_ai/benchmarks/utils/remote.py          | 176 ++++
 .../benchmarks/utils/remote_execution.py      | 202 +++++
 garden_ai/benchmarks/utils/script_builder.py  |  96 +++
 garden_ai/benchmarks/utils/task.py            | 132 +++
 8 files changed, 1444 insertions(+), 654 deletions(-)
 create mode 100644 garden_ai/benchmarks/matbench_discovery/remote.py
 create mode 100644 garden_ai/benchmarks/templates/base_runner.py
 create mode 100644 garden_ai/benchmarks/utils/remote.py
 create mode 100644 garden_ai/benchmarks/utils/remote_execution.py
 create mode 100644 garden_ai/benchmarks/utils/script_builder.py
 create mode 100644 garden_ai/benchmarks/utils/task.py

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
index 3bdfd6ca..96c8208f 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
@@ -55,7 +55,7 @@ def create_sevennet_model(device):
     """Create SevenNet model calculator."""
     from sevenn.calculator import SevenNetCalculator
 
-    return SevenNetCalculator(model="7net-0", device=device)
+    return SevenNetCalculator(model="7net-l3i5", device=device)
 
 
 # Model configurations
diff --git a/garden_ai/benchmarks/matbench_discovery/remote.py b/garden_ai/benchmarks/matbench_discovery/remote.py
new file mode 100644
index 00000000..247d2ba2
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/remote.py
@@ -0,0 +1,485 @@
+"""Remote functions for Matbench Discovery benchmark.
+
+These functions are injected into the remote script.
+They must be self-contained (imports inside or provided by builder).
+"""
+
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+# ------------------------------------------------------------------------------
+# Common Helpers
+# ------------------------------------------------------------------------------
+
+
+def _process_batch_common(
+    batch_id: int,
+    structures: List[Tuple[str, Any]],
+    model_config: Dict[str, Any],
+    num_threads: int,
+    compute_fn: Callable[[Any, Any], Dict[str, Any]],
+    task_name: str,
+) -> Dict[str, Any]:
+    """Common logic for processing a batch of structures.
+
+    Args:
+        batch_id: ID of the current batch
+        structures: List of (id, atoms) tuples
+        model_config: Configuration for the model
+        num_threads: Number of threads to use
+        compute_fn: Function taking (model, atoms) and returning a result dict
+        task_name: Name of the task for logging
+    """
+    import logging
+    import os
+    import time
+
+    import torch
+
+    # Configure thread limits to avoid contention
+    os.environ["OMP_NUM_THREADS"] = str(num_threads)
+    torch.set_num_threads(num_threads)
+
+    gpu_id = model_config.get("gpu_id")
+    device = setup_device(gpu_id)  # noqa: F821
+
+    worker_logger = logging.getLogger(f"worker_{batch_id}")
+    worker_logger.info(
+        f"Started {task_name} on {device} with {len(structures)} structures. Threads: {num_threads}"
+    )
+
+    global _MODEL_CACHE
+    try:
+        if _MODEL_CACHE is None:
+            model = load_model(device)  # noqa: F821
+            _MODEL_CACHE = model
+        else:
+            model = _MODEL_CACHE
+    except Exception as e:
+        worker_logger.error(f"Failed to initialize model: {e}")
+        worker_logger.error(
+            "Model initialization is critical - cannot continue benchmark"
+        )
+        raise RuntimeError(f"Model initialization failed: {e}") from e
+
+    results = {}
+    batch_start = time.time()
+
+    for i, (struct_id, atoms) in enumerate(structures):
+        try:
+            # Run the specific computation
+            result = compute_fn(model, atoms)
+            results[struct_id] = result
+
+            if (i + 1) % 10 == 0:
+                elapsed = time.time() - batch_start
+                rate = (i + 1) / elapsed if elapsed > 0 else 0
+                worker_logger.info(
+                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
+                )
+
+        except Exception as e:
+            worker_logger.warning(f"Structure {struct_id} failed: {e}")
+            results[struct_id] = {"error": str(e)}
+
+    return results
+
+
+def _load_dataset_common(
+    config: Dict[str, Any],
+    zip_path: str,
+    read_format: str = "extxyz",
+    read_index: str | slice = None,
+) -> List[Tuple[str, Any]]:
+    """Common logic for loading datasets from a zip file."""
+    from io import TextIOWrapper
+    from zipfile import ZipFile
+
+    from ase.io import read
+
+    # get_material_ids_for_subset is injected
+    dataset_subset = config.get("dataset_subset", "full")
+    dataset_seed = config.get("dataset_seed", 42)
+    mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed)  # noqa: F821
+
+    structures = []
+
+    with ZipFile(zip_path, "r") as zf:
+        if mat_ids is None:
+            # Load all files (full dataset)
+            # Sort by numeric ID if possible
+            file_list = sorted(
+                zf.namelist(),
+                key=lambda x: int(x.split(".")[0])
+                if x.split(".")[0].isdigit()
+                else float("inf"),
+            )
+            num_structures = config.get("num_structures", 100)
+            file_list = file_list[:num_structures]
+        else:
+            # Filter to specific material IDs
+            mat_id_set = set(mat_ids)
+            file_list = [
+                f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set
+            ]
+
+        for filename in file_list:
+            with zf.open(filename) as f:
+                text_stream = TextIOWrapper(f, encoding="utf-8")
+                if read_index is not None:
+                    atoms_list = read(text_stream, format=read_format, index=read_index)
+                    # If we got a list and need one item, take the last one (common for trajectories)
+                    if isinstance(atoms_list, list) and atoms_list:
+                        structures.append((filename, atoms_list[-1]))
+                    elif not isinstance(atoms_list, list):
+                        structures.append((filename, atoms_list))
+                else:
+                    structures.append((filename, read(text_stream, format=read_format)))
+
+    return structures
+
+
+# ------------------------------------------------------------------------------
+# Injected Functions
+# ------------------------------------------------------------------------------
+
+
+def load_model(device: str):
+    """Initialize the model using the user-provided factory function.
+
+    The factory function is injected into this script by the benchmark framework.
+    """
+    # Call the user's factory function (injected as load_model_user)
+    model = load_model_user(device)  # noqa: F821
+    return model
+
+
+def get_material_ids_for_subset(
+    subset_type: str, seed: int = 42
+) -> Optional[List[str]]:
+    """Get material IDs for a specific dataset subset.
+
+    Args:
+        subset_type: One of 'full', 'unique_protos', 'random_10k', 'random_100'
+        seed: Random seed for sampling (default: 42)
+
+    Returns:
+        List of material IDs, or None for 'full' (load all)
+    """
+    if subset_type == "full":
+        return None  # Load all materials
+
+    import pandas as pd
+    from matbench_discovery.data import DataFiles
+
+    # Load wbm_summary
+    df = pd.read_csv(DataFiles.wbm_summary.path)
+
+    if subset_type == "unique_protos":
+        # Filter to unique prototypes (removes duplicates and MP overlaps)
+        df_filtered = df.query("unique_prototype")
+        return df_filtered["material_id"].tolist()
+
+    elif subset_type == "random_10k":
+        # Random sample of 10k unique prototypes (fixed seed for reproducibility)
+        df_filtered = df.query("unique_prototype")
+        df_sampled = df_filtered.sample(n=10000, random_state=seed)
+        return df_sampled["material_id"].tolist()
+
+    elif subset_type == "random_100":
+        # Random sample of 100 unique prototypes (fixed seed for reproducibility)
+        # Useful for quick end-to-end testing
+        df_filtered = df.query("unique_prototype")
+        df_sampled = df_filtered.sample(n=100, random_state=seed)
+        return df_sampled["material_id"].tolist()
+
+    else:
+        raise ValueError(f"Unknown subset_type: {subset_type}")
+
+
+def process_batch_relaxation(
+    batch_id: int,
+    structures: List[Tuple[str, Any]],
+    model_config: Dict[str, Any],
+    num_threads: int,
+) -> Dict[str, Any]:
+    """Process a batch of structures for IS2RE (Relaxation)."""
+    from ase.optimize import FIRE
+
+    def compute(model, atoms):
+        atoms.calc = model
+        opt = FIRE(atoms, logfile=None)
+        opt.run(fmax=0.05, steps=500)
+        energy = atoms.get_potential_energy()
+        return {"energy": energy}
+
+    return _process_batch_common(
+        batch_id, structures, model_config, num_threads, compute, "relaxation"
+    )
+
+
+def process_batch_static(
+    batch_id: int,
+    structures: List[Tuple[str, Any]],
+    model_config: Dict[str, Any],
+    num_threads: int,
+) -> Dict[str, Any]:
+    """Process a batch of structures for RS2RE (Static Calculation)."""
+
+    def compute(model, atoms):
+        atoms.calc = model
+        energy = atoms.get_potential_energy()
+        return {"energy": energy}
+
+    return _process_batch_common(
+        batch_id, structures, model_config, num_threads, compute, "static calculation"
+    )
+
+
+def process_batch_forces(
+    batch_id: int,
+    structures: List[Tuple[str, Any]],
+    model_config: Dict[str, Any],
+    num_threads: int,
+) -> Dict[str, Any]:
+    """Process a batch of structures for S2EFS (Energy, Forces, Stress)."""
+
+    def compute(model, atoms):
+        atoms.calc = model
+        energy = atoms.get_potential_energy()
+        forces = atoms.get_forces().tolist()
+        stress = atoms.get_stress().tolist()
+        return {"energy": energy, "forces": forces, "stress": stress}
+
+    return _process_batch_common(
+        batch_id, structures, model_config, num_threads, compute, "forces calculation"
+    )
+
+
+def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
+    """Load initial structures for IS2RE."""
+    from matbench_discovery.data import DataFiles
+
+    return _load_dataset_common(config, DataFiles.wbm_initial_atoms.path)
+
+
+def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
+    """Load relaxed structures for RS2RE."""
+    from matbench_discovery.data import DataFiles
+
+    return _load_dataset_common(config, DataFiles.wbm_relaxed_atoms.path)
+
+
+def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
+    """Load MP trajectories for S2EFS."""
+    from matbench_discovery.data import DataFiles
+
+    # Use index=":" to read all frames, but _load_dataset_common handles taking the last one
+    return _load_dataset_common(config, DataFiles.mp_trj_extxyz.path, read_index=":")
+
+
+def calculate_metrics_energy(
+    results: Dict[str, Any], config: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Calculate energy metrics using matbench-discovery's stable_metrics algorithm.
+
+    Uses the injected stable_metrics function.
+    Returns: F1, DAF, Precision, Recall, Accuracy, TPR, FPR, TNR, FNR, TP, FP, TN, FN, MAE, RMSE, R2
+    """
+    import logging
+
+    import numpy as np
+
+    logger = logging.getLogger("metrics")
+
+    # Results format: {id: {"energy": float, "error": str}}
+    if len(results) == 0:
+        return {"error": "No results to evaluate"}
+
+    try:
+        # Import matbench-discovery data
+        from matbench_discovery.data import df_wbm
+    except Exception as e:
+        return {"error": f"Failed to import matbench-discovery: {e}"}
+
+    # Extract model energies
+    model_energies = {}
+    for sid, res in results.items():
+        if isinstance(res, dict) and res.get("energy") is not None:
+            mat_id = sid.replace(".extxyz", "")
+            model_energies[mat_id] = res["energy"]
+
+    if not model_energies:
+        return {"error": "No valid energies found in results"}
+
+    # Get common IDs between predictions and ground truth
+    # Use direct string column names instead of MbdKey enum to avoid issues
+    df_wbm_indexed = df_wbm.set_index("material_id")
+    common_ids = list(set(model_energies.keys()) & set(df_wbm_indexed.index))
+
+    if not common_ids:
+        return {"error": "No matching IDs between results and ground truth"}
+
+    # Get subset of data
+    df_subset = df_wbm_indexed.loc[common_ids]
+
+    # Calculate predicted formation energies
+    y_pred = np.array([model_energies[mid] for mid in common_ids])
+    y_true = df_subset["uncorrected_energy"].values  # Uncorrected total energy
+    n_atoms = df_subset["n_sites"].values
+
+    # Predicted formation energy ERROR per atom (from total energy difference)
+    # This is the ERROR: (E_pred - E_dft) / n_atoms
+    e_form_error = (y_pred - y_true) / n_atoms
+
+    # Get ground truth e_above_hull for stability classification
+    each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values
+
+    # Calculate predicted e_above_hull
+    # Since e_form_error is already (E_pred - E_dft)/n_atoms, we just add it to each_true
+    each_pred = each_true + e_form_error
+
+    # Debug logging to understand the distribution
+    logger.info("Energy statistics:")
+    logger.info(
+        f"  each_true: min={each_true.min():.4f}, max={each_true.max():.4f}, mean={each_true.mean():.4f}"
+    )
+    logger.info(
+        f"  each_pred: min={each_pred.min():.4f}, max={each_pred.max():.4f}, mean={each_pred.mean():.4f}"
+    )
+
+    # Calculate global prevalence for DAF normalization (matches official leaderboard)
+    # Filter to unique prototypes
+    df_unique = df_wbm.query("unique_prototype")
+    # Calculate prevalence: (stable count) / (total count)
+    # Stability threshold is 0.0
+    stable_count = (df_unique["e_above_hull_mp2020_corrected_ppd_mp"] <= 0).sum()
+    global_prevalence = stable_count / len(df_unique)
+
+    logger.info(
+        f"Using global prevalence for DAF: {global_prevalence:.6f} ({stable_count}/{len(df_unique)})"
+    )
+
+    # Calculate metrics using the injected function
+    # stable_metrics is injected into the script scope
+    metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence)  # noqa: F821
+
+    # Add num_evaluated
+    metrics["num_evaluated"] = len(common_ids)
+
+    return metrics
+
+
+def calculate_metrics_forces(
+    results: Dict[str, Any], config: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Calculate comprehensive S2EFS metrics (Energy, Forces, Stress).
+
+    Returns MAE, RMSE, and R2 for each component.
+    """
+    from io import TextIOWrapper
+    from zipfile import ZipFile
+
+    import numpy as np
+    from ase.io import read
+    from matbench_discovery.data import DataFiles
+    from sklearn.metrics import r2_score
+
+    # We need to load ground truth from the dataset itself because MP trj has E/F/S in the extxyz
+    # This is expensive to re-read. Ideally we should have passed GT in results or loaded it efficiently.
+    # For now, let's re-read the GT for the processed IDs.
+
+    metrics = {
+        "energy_mae": [],
+        "energy_rmse": [],
+        "force_mae": [],
+        "force_rmse": [],
+        "stress_mae": [],
+        "stress_rmse": [],
+    }
+
+    # Collect all predictions and ground truth for R2 calculation
+    all_e_pred, all_e_true = [], []
+    all_f_pred, all_f_true = [], []
+    all_s_pred, all_s_true = [], []
+
+    zip_path = DataFiles.mp_trj_extxyz.path
+
+    with ZipFile(zip_path, "r") as zf:
+        for sid, res in results.items():
+            if "error" in res:
+                continue
+
+            try:
+                with zf.open(sid) as f:
+                    text_stream = TextIOWrapper(f, encoding="utf-8")
+                    atoms_list = read(text_stream, format="extxyz", index=":")
+                    gt_atoms = atoms_list[-1]  # Matching load_dataset logic
+
+                    # Energy (per atom)
+                    e_pred = res["energy"]
+                    e_true = gt_atoms.get_potential_energy()
+                    n_atoms = len(gt_atoms)
+
+                    energy_error = abs(e_pred - e_true) / n_atoms
+                    metrics["energy_mae"].append(energy_error)
+                    metrics["energy_rmse"].append(energy_error**2)
+
+                    all_e_pred.append(e_pred / n_atoms)
+                    all_e_true.append(e_true / n_atoms)
+
+                    # Forces
+                    f_pred = np.array(res["forces"])
+                    f_true = gt_atoms.get_forces()
+                    force_error = np.abs(f_pred - f_true)
+                    metrics["force_mae"].append(force_error.mean())
+                    metrics["force_rmse"].append((force_error**2).mean())
+
+                    all_f_pred.extend(f_pred.flatten())
+                    all_f_true.extend(f_true.flatten())
+
+                    # Stress
+                    s_pred = np.array(res["stress"])
+                    s_true = gt_atoms.get_stress()
+                    stress_error = np.abs(s_pred - s_true)
+                    metrics["stress_mae"].append(stress_error.mean())
+                    metrics["stress_rmse"].append((stress_error**2).mean())
+
+                    all_s_pred.extend(s_pred.flatten())
+                    all_s_true.extend(s_true.flatten())
+
+            except Exception:
+                pass
+
+    # Calculate final metrics
+    result_metrics = {}
+
+    if metrics["energy_mae"]:
+        result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"]))
+        result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"])))
+        result_metrics["energy_r2"] = (
+            float(r2_score(all_e_true, all_e_pred))
+            if len(all_e_true) > 1
+            else float("nan")
+        )
+
+    if metrics["force_mae"]:
+        result_metrics["force_mae"] = float(np.mean(metrics["force_mae"]))
+        result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"])))
+        result_metrics["force_r2"] = (
+            float(r2_score(all_f_true, all_f_pred))
+            if len(all_f_true) > 1
+            else float("nan")
+        )
+
+    if metrics["stress_mae"]:
+        result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"]))
+        result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"])))
+        result_metrics["stress_r2"] = (
+            float(r2_score(all_s_true, all_s_pred))
+            if len(all_s_true) > 1
+            else float("nan")
+        )
+
+    result_metrics["num_evaluated"] = len(metrics["energy_mae"])
+
+    return result_metrics
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index b4348d27..e26045d6 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List
 
 from ..utils.remote_execution import run_remote_benchmark
 from ..utils.script_builder import BenchmarkScriptBuilder
@@ -19,573 +19,25 @@
 # These functions are injected into the remote script.
 # They must be self-contained (imports inside or provided by builder).
 # ------------------------------------------------------------------------------
-
-
-def load_model(device: str):
-    """Initialize the model using the user-provided factory function.
-
-    The factory function is injected into this script by the benchmark framework.
-    """
-    # Call the user's factory function (injected as load_model_user)
-    model = load_model_user(device)  # noqa: F821
-    return model
-
-
-def get_material_ids_for_subset(
-    subset_type: str, seed: int = 42
-) -> Optional[List[str]]:
-    """Get material IDs for a specific dataset subset.
-
-    Args:
-        subset_type: One of 'full', 'unique_protos', 'random_10k', 'random_100'
-        seed: Random seed for sampling (default: 42)
-
-    Returns:
-        List of material IDs, or None for 'full' (load all)
-    """
-    if subset_type == "full":
-        return None  # Load all materials
-
-    import pandas as pd
-    from matbench_discovery.data import DataFiles
-
-    # Load wbm_summary
-    df = pd.read_csv(DataFiles.wbm_summary.path)
-
-    if subset_type == "unique_protos":
-        # Filter to unique prototypes (removes duplicates and MP overlaps)
-        df_filtered = df.query("unique_prototype")
-        return df_filtered["material_id"].tolist()
-
-    elif subset_type == "random_10k":
-        # Random sample of 10k unique prototypes (fixed seed for reproducibility)
-        df_filtered = df.query("unique_prototype")
-        df_sampled = df_filtered.sample(n=10000, random_state=seed)
-        return df_sampled["material_id"].tolist()
-
-    elif subset_type == "random_100":
-        # Random sample of 100 unique prototypes (fixed seed for reproducibility)
-        # Useful for quick end-to-end testing
-        df_filtered = df.query("unique_prototype")
-        df_sampled = df_filtered.sample(n=100, random_state=seed)
-        return df_sampled["material_id"].tolist()
-
-    else:
-        raise ValueError(f"Unknown subset_type: {subset_type}")
-
-
-# --- Reusable Process Functions ---
-
-
-def process_batch_relaxation(
-    batch_id: int,
-    structures: List[Tuple[str, Any]],
-    model_config: Dict[str, Any],
-    num_threads: int,
-) -> Dict[str, Any]:
-    """Process a batch of structures for IS2RE (Relaxation)."""
-    import logging
-    import os
-    import time
-
-    import torch
-    from ase.optimize import FIRE
-
-    # Configure thread limits to avoid contention
-    os.environ["OMP_NUM_THREADS"] = str(num_threads)
-    torch.set_num_threads(num_threads)
-
-    gpu_id = model_config.get("gpu_id")
-    device = setup_device(gpu_id)  # noqa: F821
-
-    worker_logger = logging.getLogger(f"worker_{batch_id}")
-    worker_logger.info(
-        f"Started relaxation on {device} with {len(structures)} structures. Threads: {num_threads}"
-    )
-
-    global _MODEL_CACHE
-    try:
-        if _MODEL_CACHE is None:
-            model = load_model(device)
-            _MODEL_CACHE = model
-        else:
-            model = _MODEL_CACHE
-    except Exception as e:
-        worker_logger.error(f"Failed to initialize model: {e}")
-        worker_logger.error(
-            "Model initialization is critical - cannot continue benchmark"
-        )
-        raise RuntimeError(f"Model initialization failed: {e}") from e
-
-    results = {}
-    batch_start = time.time()
-
-    for i, (struct_id, atoms) in enumerate(structures):
-        try:
-            atoms.calc = model
-            opt = FIRE(atoms, logfile=None)
-            opt.run(fmax=0.05, steps=500)
-
-            energy = atoms.get_potential_energy()
-            results[struct_id] = {"energy": energy}
-
-            if (i + 1) % 10 == 0:
-                elapsed = time.time() - batch_start
-                rate = (i + 1) / elapsed if elapsed > 0 else 0
-                worker_logger.info(
-                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
-                )
-
-        except Exception as e:
-            worker_logger.warning(f"Structure {struct_id} failed: {e}")
-            results[struct_id] = {"energy": None, "error": str(e)}
-
-    return results
-
-
-def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
-    """Load initial structures for IS2RE."""
-    from io import TextIOWrapper
-    from zipfile import ZipFile
-
-    from ase.io import read
-    from matbench_discovery.data import DataFiles
-
-    dataset_subset = config.get("dataset_subset", "full")
-    dataset_seed = config.get("dataset_seed", 42)
-    mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed)
-
-    structures = []
-    zip_path = DataFiles.wbm_initial_atoms.path
-
-    with ZipFile(zip_path, "r") as zf:
-        if mat_ids is None:
-            # Load all files (full dataset)
-            file_list = sorted(
-                zf.namelist(),
-                key=lambda x: int(x.split(".")[0])
-                if x.split(".")[0].isdigit()
-                else float("inf"),
-            )
-            num_structures = config.get("num_structures", 100)
-            file_list = file_list[:num_structures]
-        else:
-            # Filter to specific material IDs
-            mat_id_set = set(mat_ids)
-            file_list = [
-                f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set
-            ]
-
-        for filename in file_list:
-            with zf.open(filename) as f:
-                text_stream = TextIOWrapper(f, encoding="utf-8")
-                structures.append((filename, read(text_stream, format="extxyz")))
-    return structures
-
-
-def calculate_metrics_energy(
-    results: Dict[str, Any], config: Dict[str, Any]
-) -> Dict[str, Any]:
-    """Calculate energy metrics using matbench-discovery's stable_metrics algorithm.
-
-    Uses the injected stable_metrics function.
-    Returns: F1, DAF, Precision, Recall, Accuracy, TPR, FPR, TNR, FNR, TP, FP, TN, FN, MAE, RMSE, R2
-    """
-    import logging
-
-    import numpy as np
-
-    logger = logging.getLogger("metrics")
-
-    # Results format: {id: {"energy": float, "error": str}}
-    if len(results) == 0:
-        return {"error": "No results to evaluate"}
-
-    try:
-        # Import matbench-discovery data
-        from matbench_discovery.data import df_wbm
-    except Exception as e:
-        return {"error": f"Failed to import matbench-discovery: {e}"}
-
-    # Extract model energies
-    model_energies = {}
-    for sid, res in results.items():
-        if isinstance(res, dict) and res.get("energy") is not None:
-            mat_id = sid.replace(".extxyz", "")
-            model_energies[mat_id] = res["energy"]
-
-    if not model_energies:
-        return {"error": "No valid energies found in results"}
-
-    # Get common IDs between predictions and ground truth
-    # Use direct string column names instead of MbdKey enum to avoid issues
-    df_wbm_indexed = df_wbm.set_index("material_id")
-    common_ids = list(set(model_energies.keys()) & set(df_wbm_indexed.index))
-
-    if not common_ids:
-        return {"error": "No matching IDs between results and ground truth"}
-
-    # Get subset of data
-    df_subset = df_wbm_indexed.loc[common_ids]
-
-    # Calculate predicted formation energies
-    y_pred = np.array([model_energies[mid] for mid in common_ids])
-    y_true = df_subset["uncorrected_energy"].values  # Uncorrected total energy
-    n_atoms = df_subset["n_sites"].values
-
-    # Predicted formation energy ERROR per atom (from total energy difference)
-    # This is the ERROR: (E_pred - E_dft) / n_atoms
-    e_form_error = (y_pred - y_true) / n_atoms
-
-    # Get ground truth e_above_hull for stability classification
-    each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values
-
-    # Calculate predicted e_above_hull
-    # Since e_form_error is already (E_pred - E_dft)/n_atoms, we just add it to each_true
-    each_pred = each_true + e_form_error
-
-    # Debug logging to understand the distribution
-    logger.info("Energy statistics:")
-    logger.info(
-        f"  each_true: min={each_true.min():.4f}, max={each_true.max():.4f}, mean={each_true.mean():.4f}"
-    )
-    logger.info(
-        f"  each_pred: min={each_pred.min():.4f}, max={each_pred.max():.4f}, mean={each_pred.mean():.4f}"
-    )
-
-    # Calculate global prevalence for DAF normalization (matches official leaderboard)
-    # Filter to unique prototypes
-    df_unique = df_wbm.query("unique_prototype")
-    # Calculate prevalence: (stable count) / (total count)
-    # Stability threshold is 0.0
-    stable_count = (df_unique["e_above_hull_mp2020_corrected_ppd_mp"] <= 0).sum()
-    global_prevalence = stable_count / len(df_unique)
-
-    logger.info(
-        f"Using global prevalence for DAF: {global_prevalence:.6f} ({stable_count}/{len(df_unique)})"
-    )
-
-    # Calculate metrics using the injected function
-    # stable_metrics is injected into the script scope
-    metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence)
-
-    # Add num_evaluated
-    metrics["num_evaluated"] = len(common_ids)
-
-    return metrics
-
-
-def process_batch_static(
-    batch_id: int,
-    structures: List[Tuple[str, Any]],
-    model_config: Dict[str, Any],
-    num_threads: int,
-) -> Dict[str, Any]:
-    """Process a batch of structures for RS2RE (Static Calculation)."""
-    import logging
-    import os
-    import time
-
-    import torch
-
-    os.environ["OMP_NUM_THREADS"] = str(num_threads)
-    torch.set_num_threads(num_threads)
-
-    gpu_id = model_config.get("gpu_id")
-    device = setup_device(gpu_id)  # noqa: F821
-
-    worker_logger = logging.getLogger(f"worker_{batch_id}")
-    worker_logger.info(
-        f"Started static calculation on {device} with {len(structures)} structures."
-    )
-
-    global _MODEL_CACHE
-    try:
-        if _MODEL_CACHE is None:
-            model = load_model(device)
-            _MODEL_CACHE = model
-        else:
-            model = _MODEL_CACHE
-    except Exception as e:
-        return {sid: {"energy": None, "error": str(e)} for sid, _ in structures}
-
-    results = {}
-    batch_start = time.time()
-
-    for i, (struct_id, atoms) in enumerate(structures):
-        try:
-            atoms.calc = model
-            # No relaxation, just static energy
-            energy = atoms.get_potential_energy()
-            results[struct_id] = {"energy": energy}
-
-            if (i + 1) % 50 == 0:
-                elapsed = time.time() - batch_start
-                rate = (i + 1) / elapsed if elapsed > 0 else 0
-                worker_logger.info(
-                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
-                )
-
-        except Exception as e:
-            worker_logger.warning(f"Structure {struct_id} failed: {e}")
-            results[struct_id] = {"energy": None, "error": str(e)}
-
-    return results
-
-
-def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
-    """Load relaxed structures for RS2RE."""
-    from io import TextIOWrapper
-    from zipfile import ZipFile
-
-    from ase.io import read
-    from matbench_discovery.data import DataFiles
-
-    dataset_subset = config.get("dataset_subset", "full")
-    dataset_seed = config.get("dataset_seed", 42)
-    mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed)
-
-    structures = []
-    # Use relaxed atoms
-    zip_path = DataFiles.wbm_relaxed_atoms.path
-
-    with ZipFile(zip_path, "r") as zf:
-        if mat_ids is None:
-            # Load all files (full dataset)
-            file_list = sorted(
-                zf.namelist(),
-                key=lambda x: int(x.split(".")[0])
-                if x.split(".")[0].isdigit()
-                else float("inf"),
-            )
-            num_structures = config.get("num_structures", 100)
-            file_list = file_list[:num_structures]
-        else:
-            # Filter to specific material IDs
-            mat_id_set = set(mat_ids)
-            file_list = [
-                f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set
-            ]
-
-        for filename in file_list:
-            with zf.open(filename) as f:
-                text_stream = TextIOWrapper(f, encoding="utf-8")
-                structures.append((filename, read(text_stream, format="extxyz")))
-    return structures
-
-
-# Reuse calculate_metrics_energy for all energy-only tasks
-
-
-def process_batch_forces(
-    batch_id: int,
-    structures: List[Tuple[str, Any]],
-    model_config: Dict[str, Any],
-    num_threads: int,
-) -> Dict[str, Any]:
-    """Process a batch of structures for S2EFS (Energy, Forces, Stress)."""
-    import logging
-    import os
-    import time
-
-    import torch
-
-    os.environ["OMP_NUM_THREADS"] = str(num_threads)
-    torch.set_num_threads(num_threads)
-
-    gpu_id = model_config.get("gpu_id")
-    device = setup_device(gpu_id)  # noqa: F821
-
-    worker_logger = logging.getLogger(f"worker_{batch_id}")
-    worker_logger.info(
-        f"Started forces calculation on {device} with {len(structures)} structures."
-    )
-
-    global _MODEL_CACHE
-    try:
-        if _MODEL_CACHE is None:
-            model = load_model(device)
-            _MODEL_CACHE = model
-        else:
-            model = _MODEL_CACHE
-    except Exception as e:
-        return {sid: {"error": str(e)} for sid, _ in structures}
-
-    results = {}
-    batch_start = time.time()
-
-    for i, (struct_id, atoms) in enumerate(structures):
-        try:
-            atoms.calc = model
-
-            energy = atoms.get_potential_energy()
-            forces = atoms.get_forces().tolist()
-            stress = atoms.get_stress().tolist()
-
-            results[struct_id] = {"energy": energy, "forces": forces, "stress": stress}
-
-            if (i + 1) % 50 == 0:
-                elapsed = time.time() - batch_start
-                rate = (i + 1) / elapsed if elapsed > 0 else 0
-                worker_logger.info(
-                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
-                )
-
-        except Exception as e:
-            worker_logger.warning(f"Structure {struct_id} failed: {e}")
-            results[struct_id] = {"error": str(e)}
-
-    return results
-
-
-def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
-    """Load MP trajectories for S2EFS."""
-    from io import TextIOWrapper
-    from zipfile import ZipFile
-
-    from ase.io import read
-    from matbench_discovery.data import DataFiles
-
-    num_structures = config.get("num_structures", 100)
-    structures = []
-    # Use MP trajectories
-    zip_path = DataFiles.mp_trj_extxyz.path
-
-    with ZipFile(zip_path, "r") as zf:
-        file_list = sorted(zf.namelist())
-        for filename in file_list[:num_structures]:
-            with zf.open(filename) as f:
-                text_stream = TextIOWrapper(f, encoding="utf-8")
-                # Read all frames? Or just one? Usually S2EFS is on frames.
-                # Let's assume we evaluate on the last frame or all frames.
-                # For simplicity, let's take the last frame (relaxed?) or random?
-                # Actually, MP trj contains relaxation steps.
-                # Let's read the last frame for now as a proxy for "a structure".
-                # Or better, read all frames and treat them as separate tasks?
-                # For this benchmark, let's just treat the file as containing one structure per file if possible,
-                # or just take the last one.
-                atoms_list = read(text_stream, format="extxyz", index=":")
-                if atoms_list:
-                    # Just take the last one for now
-                    structures.append((filename, atoms_list[-1]))
-    return structures
-
-
-def calculate_metrics_forces(
-    results: Dict[str, Any], config: Dict[str, Any]
-) -> Dict[str, Any]:
-    """Calculate comprehensive S2EFS metrics (Energy, Forces, Stress).
-
-    Returns MAE, RMSE, and R2 for each component.
-    """
-    from io import TextIOWrapper
-    from zipfile import ZipFile
-
-    import numpy as np
-    from ase.io import read
-    from matbench_discovery.data import DataFiles
-    from sklearn.metrics import r2_score
-
-    # We need to load ground truth from the dataset itself because MP trj has E/F/S in the extxyz
-    # This is expensive to re-read. Ideally we should have passed GT in results or loaded it efficiently.
-    # For now, let's re-read the GT for the processed IDs.
-
-    metrics = {
-        "energy_mae": [],
-        "energy_rmse": [],
-        "force_mae": [],
-        "force_rmse": [],
-        "stress_mae": [],
-        "stress_rmse": [],
-    }
-
-    # Collect all predictions and ground truth for R2 calculation
-    all_e_pred, all_e_true = [], []
-    all_f_pred, all_f_true = [], []
-    all_s_pred, all_s_true = [], []
-
-    zip_path = DataFiles.mp_trj_extxyz.path
-
-    with ZipFile(zip_path, "r") as zf:
-        for sid, res in results.items():
-            if "error" in res:
-                continue
-
-            try:
-                with zf.open(sid) as f:
-                    text_stream = TextIOWrapper(f, encoding="utf-8")
-                    atoms_list = read(text_stream, format="extxyz", index=":")
-                    gt_atoms = atoms_list[-1]  # Matching load_dataset logic
-
-                    # Energy (per atom)
-                    e_pred = res["energy"]
-                    e_true = gt_atoms.get_potential_energy()
-                    n_atoms = len(gt_atoms)
-
-                    energy_error = abs(e_pred - e_true) / n_atoms
-                    metrics["energy_mae"].append(energy_error)
-                    metrics["energy_rmse"].append(energy_error**2)
-
-                    all_e_pred.append(e_pred / n_atoms)
-                    all_e_true.append(e_true / n_atoms)
-
-                    # Forces
-                    f_pred = np.array(res["forces"])
-                    f_true = gt_atoms.get_forces()
-                    force_error = np.abs(f_pred - f_true)
-                    metrics["force_mae"].append(force_error.mean())
-                    metrics["force_rmse"].append((force_error**2).mean())
-
-                    all_f_pred.extend(f_pred.flatten())
-                    all_f_true.extend(f_true.flatten())
-
-                    # Stress
-                    s_pred = np.array(res["stress"])
-                    s_true = gt_atoms.get_stress()
-                    stress_error = np.abs(s_pred - s_true)
-                    metrics["stress_mae"].append(stress_error.mean())
-                    metrics["stress_rmse"].append((stress_error**2).mean())
-
-                    all_s_pred.extend(s_pred.flatten())
-                    all_s_true.extend(s_true.flatten())
-
-            except Exception:
-                pass
-
-    # Calculate final metrics
-    result_metrics = {}
-
-    if metrics["energy_mae"]:
-        result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"]))
-        result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"])))
-        result_metrics["energy_r2"] = (
-            float(r2_score(all_e_true, all_e_pred))
-            if len(all_e_true) > 1
-            else float("nan")
-        )
-
-    if metrics["force_mae"]:
-        result_metrics["force_mae"] = float(np.mean(metrics["force_mae"]))
-        result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"])))
-        result_metrics["force_r2"] = (
-            float(r2_score(all_f_true, all_f_pred))
-            if len(all_f_true) > 1
-            else float("nan")
-        )
-
-    if metrics["stress_mae"]:
-        result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"]))
-        result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"])))
-        result_metrics["stress_r2"] = (
-            float(r2_score(all_s_true, all_s_pred))
-            if len(all_s_true) > 1
-            else float("nan")
-        )
-
-    result_metrics["num_evaluated"] = len(metrics["energy_mae"])
-
-    return result_metrics
-
+# ------------------------------------------------------------------------------
+# REMOTE FUNCTIONS
+# These functions are injected into the remote script.
+# They are now imported from remote.py to keep this file clean.
+# ------------------------------------------------------------------------------
+from .remote import (
+    _load_dataset_common,
+    _process_batch_common,
+    calculate_metrics_energy,
+    calculate_metrics_forces,
+    get_material_ids_for_subset,
+    load_dataset_mp_trj,
+    load_dataset_wbm_initial,
+    load_dataset_wbm_relaxed,
+    load_model,
+    process_batch_forces,
+    process_batch_relaxation,
+    process_batch_static,
+)
 
 # ------------------------------------------------------------------------------
 # Task Classes
@@ -627,7 +79,9 @@ def _build_script(
         builder.add_preamble("_MODEL_CACHE = None")
 
         # Common imports
-        builder.add_import("from typing import List, Dict, Any, Tuple, Optional")
+        builder.add_import(
+            "from typing import List, Dict, Any, Tuple, Optional, Callable"
+        )
         builder.add_import("import torch")
         builder.add_import("from ase.optimize import FIRE")
         builder.add_import("from ase.io import read")
@@ -648,6 +102,10 @@ def _build_script(
         # Add helper function for dataset subset filtering
         builder.add_function(get_material_ids_for_subset)
 
+        # Add common helpers
+        builder.add_function(_process_batch_common)
+        builder.add_function(_load_dataset_common)
+
         # Add task-specific functions with standard names expected by runner
         builder.add_function(process_fn, name="process_batch")
         builder.add_function(load_dataset_fn, name="load_dataset")
@@ -659,12 +117,69 @@ def _build_script(
 
         return builder.build()
 
+    def _prepare_runner_config(
+        self, num_structures: int | "DatasetSize" | "DatasetConfig"
+    ) -> Dict[str, Any]:
+        """Prepare the runner configuration based on num_structures."""
+        from .enums import DatasetConfig, DatasetSize
+
+        if isinstance(num_structures, DatasetSize):
+            return {
+                "repo_url": self.repo_url,
+                "repo_ref": self.repo_ref,
+                "dataset_subset": num_structures.value,
+                "dataset_seed": 42,
+            }
+        elif isinstance(num_structures, DatasetConfig):
+            return {
+                "repo_url": self.repo_url,
+                "repo_ref": self.repo_ref,
+                "dataset_subset": num_structures.subset.value,
+                "dataset_seed": num_structures.seed,
+            }
+        else:
+            return {
+                "repo_url": self.repo_url,
+                "repo_ref": self.repo_ref,
+                "num_structures": num_structures,
+                "dataset_subset": "full",
+            }
+
+    def _prepare_dependencies(self, model_packages: str | List[str]) -> List[str]:
+        """Prepare the list of dependencies."""
+        packages = (
+            [model_packages] if isinstance(model_packages, str) else model_packages
+        )
+        return ["matbench-discovery>=1.3.0"] + packages
+
+    def _generate_checkpoint_name(
+        self, model_packages: str | List[str], runner_config: Dict[str, Any]
+    ) -> str:
+        """Generate a unique checkpoint name."""
+        import time
+        import uuid
+
+        model_str = (
+            str(model_packages)
+            .replace("[", "")
+            .replace("]", "")
+            .replace("'", "")
+            .replace('"', "")
+            .replace(",", "_")
+            .replace(" ", "")
+        )
+        subset_str = runner_config.get("dataset_subset", "custom")
+        timestamp = int(time.time())
+        short_uuid = str(uuid.uuid4())[:8]
+        return f"matbench_{model_str}_{subset_str}_{timestamp}_{short_uuid}.json"
+
     def submit(
         self,
         model_factory: callable,
         model_packages: str | List[str],
         num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
         checkpoint_name: str | None = None,
+        checkpoint_path: str | None = None,
     ):
         """Submit benchmark job to remote executor.
 
@@ -677,12 +192,9 @@ def submit(
                           (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10))
             checkpoint_name: Optional name for the checkpoint file (e.g. "my_checkpoint.json").
                              If not provided, one will be generated.
+            checkpoint_path: Optional path to an existing checkpoint file to resume from.
+                             If provided, checkpoint_name is ignored and no new checkpoint is created.
         """
-        import time
-        import uuid
-
-        from .enums import DatasetConfig, DatasetSize
-
         # Build script with task-specific functions AND user's factory
         script_content = self._build_script(
             self.process_fn,
@@ -691,57 +203,23 @@ def submit(
             model_factory,  # Inject user's factory function
         )
 
-        # Handle single package string or list of packages
-        packages = (
-            [model_packages] if isinstance(model_packages, str) else model_packages
-        )
-        dependencies = ["matbench-discovery>=1.3.0"] + packages
+        dependencies = self._prepare_dependencies(model_packages)
+        runner_config = self._prepare_runner_config(num_structures)
 
-        # Handle DatasetSize enum, DatasetConfig, or integer
-        if isinstance(num_structures, DatasetSize):
-            runner_config = {
-                "repo_url": self.repo_url,
-                "repo_ref": self.repo_ref,
-                "dataset_subset": num_structures.value,
-                "dataset_seed": 42,  # Default seed
-            }
-        elif isinstance(num_structures, DatasetConfig):
-            runner_config = {
-                "repo_url": self.repo_url,
-                "repo_ref": self.repo_ref,
-                "dataset_subset": num_structures.subset.value,
-                "dataset_seed": num_structures.seed,
-            }
-        else:
-            # Integer - use traditional num_structures approach
-            runner_config = {
-                "repo_url": self.repo_url,
-                "repo_ref": self.repo_ref,
-                "num_structures": num_structures,
-                "dataset_subset": "full",
-            }
-
-        # Generate checkpoint name if not provided
-        if not checkpoint_name:
-            # Format: matbench_{model}_{subset}_{timestamp}_{uuid}.json
-            # Clean up model name for filename
-            model_str = (
-                str(model_packages)
-                .replace("[", "")
-                .replace("]", "")
-                .replace("'", "")
-                .replace('"', "")
-                .replace(",", "_")
-                .replace(" ", "")
-            )
-            subset_str = runner_config.get("dataset_subset", "custom")
-            timestamp = int(time.time())
-            short_uuid = str(uuid.uuid4())[:8]
-            checkpoint_name = (
-                f"matbench_{model_str}_{subset_str}_{timestamp}_{short_uuid}.json"
+        # Generate checkpoint name if not provided AND no checkpoint_path is provided
+        if not checkpoint_name and not checkpoint_path:
+            checkpoint_name = self._generate_checkpoint_name(
+                model_packages, runner_config
             )
 
-        print(f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}")
+        if checkpoint_path:
+            print(f"Resuming from checkpoint: {checkpoint_path}")
+            final_checkpoint_path = checkpoint_path
+        else:
+            print(
+                f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}"
+            )
+            final_checkpoint_path = f"~/.garden/benchmarks/{checkpoint_name}"
 
         executor = self.adapter._get_executor()
         future = executor.submit(
@@ -750,10 +228,11 @@ def submit(
             dependencies=dependencies,
             config=runner_config,
             checkpoint_name=checkpoint_name,
+            checkpoint_path=checkpoint_path,
         )
 
         # Attach checkpoint path to future for programmatic access
-        future.checkpoint_path = f"~/.garden/benchmarks/{checkpoint_name}"
+        future.checkpoint_path = final_checkpoint_path
 
         return future
 
@@ -775,42 +254,14 @@ def local(
             checkpoint_path: Optional path to resume from checkpoint
         """
         from ..utils.remote_execution import run_remote_benchmark
-        from .enums import DatasetConfig, DatasetSize
 
         # Build script with task-specific functions AND user's factory
         script_content = self._build_script(
             self.process_fn, self.load_dataset_fn, self.calc_metrics_fn, model_factory
         )
 
-        # Handle single package string or list of packages
-        packages = (
-            [model_packages] if isinstance(model_packages, str) else model_packages
-        )
-        dependencies = ["matbench-discovery>=1.3.0"] + packages
-
-        # Handle DatasetSize enum, DatasetConfig, or integer
-        if isinstance(num_structures, DatasetSize):
-            runner_config = {
-                "repo_url": self.repo_url,
-                "repo_ref": self.repo_ref,
-                "dataset_subset": num_structures.value,
-                "dataset_seed": 42,  # Default seed
-            }
-        elif isinstance(num_structures, DatasetConfig):
-            runner_config = {
-                "repo_url": self.repo_url,
-                "repo_ref": self.repo_ref,
-                "dataset_subset": num_structures.subset.value,
-                "dataset_seed": num_structures.seed,
-            }
-        else:
-            # Integer - use traditional num_structures approach
-            runner_config = {
-                "repo_url": self.repo_url,
-                "repo_ref": self.repo_ref,
-                "num_structures": num_structures,
-                "dataset_subset": "full",
-            }
+        dependencies = self._prepare_dependencies(model_packages)
+        runner_config = self._prepare_runner_config(num_structures)
 
         # Run locally (no Globus Compute)
         return run_remote_benchmark(
diff --git a/garden_ai/benchmarks/templates/base_runner.py b/garden_ai/benchmarks/templates/base_runner.py
new file mode 100644
index 00000000..60ed80d6
--- /dev/null
+++ b/garden_ai/benchmarks/templates/base_runner.py
@@ -0,0 +1,248 @@
+import concurrent.futures
+import json
+import logging
+import multiprocessing
+import os
+import sys
+import time
+from typing import Optional
+
+# ------------------------------------------------------------------------------
+# BOILERPLATE: Logging & Device Setup
+# ------------------------------------------------------------------------------
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] [PID:%(process)d] %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    stream=sys.stdout,
+    force=True,
+)
+logger = logging.getLogger("benchmark_runner")
+
+
+def setup_device(gpu_id: Optional[int] = None) -> str:
+    """Setup compute device for this process."""
+    try:
+        import torch
+
+        if torch.cuda.is_available():
+            return f"cuda:{gpu_id}" if gpu_id is not None else "cuda"
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            return "mps"
+    except ImportError:
+        pass
+    return "cpu"
+
+
+def convert_numpy_types(obj):
+    """Convert numpy types to Python native types for JSON serialization."""
+    import numpy as np
+
+    if isinstance(obj, (np.integer, np.floating)):
+        return obj.item()
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {k: convert_numpy_types(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_numpy_types(item) for item in obj]
+    return obj
+
+
+# ------------------------------------------------------------------------------
+# USER DEFINED FUNCTIONS (Injected)
+# ------------------------------------------------------------------------------
+# - load_model(config, device)
+# - process_batch(batch_id, batch_data, model_config, num_threads)
+# - load_dataset(config) -> List[Any]
+# ------------------------------------------------------------------------------
+
+# ------------------------------------------------------------------------------
+# MAIN EXECUTION LOOP
+# ------------------------------------------------------------------------------
+
+
+def main():
+    if len(sys.argv) != 2:
+        sys.exit("Usage: python benchmark_runner.py <config_file>")
+
+    with open(sys.argv[1]) as f:
+        config = json.load(f)
+
+    logger.info("Starting benchmark runner...")
+
+    checkpoint_path = config.get("checkpoint_path")
+    results = {}
+
+    # Load existing checkpoint if available
+    if checkpoint_path and os.path.exists(checkpoint_path):
+        logger.info(f"Loading checkpoint from {checkpoint_path}")
+        try:
+            with open(checkpoint_path) as f:
+                results = json.load(f)
+            logger.info(f"Found {len(results)} processed items in checkpoint")
+        except Exception as e:
+            logger.warning(f"Failed to load checkpoint: {e}. Starting fresh.")
+
+    # Load Dataset
+    try:
+        all_items = load_dataset(config)  # noqa: F821
+        logger.info(f"Loaded {len(all_items)} total items")
+    except Exception as e:
+        logger.error(f"Failed to load dataset: {e}")
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+    # Filter out already processed items
+    # Assuming items are (id, data) tuples
+    items_to_process = [
+        (item_id, item) for item_id, item in all_items if str(item_id) not in results
+    ]
+
+    if not items_to_process:
+        logger.info("All items already processed!")
+        with open("results.json", "w") as f:
+            json.dump(results, f, indent=2)
+        return
+
+    logger.info(f"Processing {len(items_to_process)} remaining items")
+
+    # Shuffle for load balancing
+    import random
+
+    random.seed(42)
+    random.shuffle(items_to_process)
+
+    # Resource detection
+    try:
+        import torch
+
+        num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    except ImportError:
+        num_gpus = 0
+
+    use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
+
+    total_cores = os.cpu_count() or 1
+    num_workers = num_gpus if use_multi_gpu else 1
+    # Reserve some cores for system/overhead if possible
+    available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores
+    threads_per_worker = max(1, available_cores // num_workers)
+
+    logger.info(
+        f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)"
+    )
+
+    start_time = time.time()
+
+    # Chunk items into smaller batches to allow frequent checkpointing
+    chunk_size = 1000 * num_workers
+    chunks = [
+        items_to_process[i : i + chunk_size]
+        for i in range(0, len(items_to_process), chunk_size)
+    ]
+
+    logger.info(f"Split into {len(chunks)} chunks for processing")
+
+    ctx = multiprocessing.get_context("spawn")
+
+    with concurrent.futures.ProcessPoolExecutor(
+        max_workers=num_workers, mp_context=ctx
+    ) as executor:
+        for chunk_idx, chunk in enumerate(chunks):
+            chunk_start = time.time()
+            logger.info(
+                f"Starting chunk {chunk_idx + 1}/{len(chunks)} ({len(chunk)} items)"
+            )
+
+            # Split chunk among workers
+            futures = []
+            batch_size = (len(chunk) + num_workers - 1) // num_workers
+
+            for i in range(num_workers):
+                start = i * batch_size
+                end = min((i + 1) * batch_size, len(chunk))
+                if start < end:
+                    batch = chunk[start:end]
+
+                    # Inject worker specific config
+                    worker_config = config.copy()
+                    worker_config["gpu_id"] = i if use_multi_gpu else None
+
+                    futures.append(
+                        executor.submit(
+                            process_batch,  # noqa: F821
+                            i,
+                            batch,
+                            worker_config,
+                            threads_per_worker,
+                        )
+                    )
+
+            # Collect results for this chunk
+            chunk_results = {}
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    batch_res = future.result()
+                    chunk_results.update(batch_res)
+                except Exception as e:
+                    logger.error(f"Worker failed in chunk {chunk_idx}: {e}")
+                    import traceback
+
+                    traceback.print_exc()
+                    # Critical failure - abort benchmark immediately
+                    logger.error("Aborting benchmark due to worker failure")
+                    sys.exit(1)
+
+            # Update main results and save checkpoint
+            results.update(chunk_results)
+
+            if checkpoint_path:
+                try:
+                    tmp_path = checkpoint_path + ".tmp"
+                    with open(tmp_path, "w") as f:
+                        # Convert numpy types before saving checkpoint
+                        clean_results = convert_numpy_types(results)
+                        json.dump(clean_results, f, indent=2)
+                    os.replace(tmp_path, checkpoint_path)
+                    logger.info(f"Checkpoint saved to {checkpoint_path}")
+                except Exception as e:
+                    logger.error(f"Failed to save checkpoint: {e}")
+
+            elapsed = time.time() - chunk_start
+            logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s")
+
+    total_elapsed = time.time() - start_time
+    logger.info(f"Benchmark complete in {total_elapsed:.1f}s.")
+
+    # Calculate metrics from results
+    logger.info("Calculating metrics...")
+    try:
+        metrics = calculate_metrics_remote(results, config)  # noqa: F821
+        logger.info(f"Metrics calculated: {metrics}")
+    except Exception as e:
+        logger.error(f"Failed to calculate metrics: {e}")
+        import traceback
+
+        traceback.print_exc()
+        metrics = {"error": f"Metrics calculation failed: {e}"}
+
+    # Write both results and metrics
+    output = {"results": results, "metrics": metrics}
+
+    # Custom JSON encoder to handle numpy types
+    # convert_numpy_types moved to global scope
+
+    # Convert numpy types before serialization
+    output = convert_numpy_types(output)
+
+    with open("results.json", "w") as f:
+        json.dump(output, f, indent=2)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/garden_ai/benchmarks/utils/remote.py b/garden_ai/benchmarks/utils/remote.py
new file mode 100644
index 00000000..b9a4780e
--- /dev/null
+++ b/garden_ai/benchmarks/utils/remote.py
@@ -0,0 +1,176 @@
+import json
+import logging
+import os
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List
+
+logger = logging.getLogger(__name__)
+
+
+class RemoteBenchmarkRunner:
+    """
+    Handles the setup and execution of benchmarks on remote Globus Compute endpoints.
+
+    This class manages:
+    1. Creating an isolated working directory
+    2. Setting up a Python environment using `uv`
+    3. Installing dependencies
+    4. Executing the benchmark script
+    5. Collecting results
+    """
+
+    def __init__(self, work_dir_prefix: str = "garden_benchmark_"):
+        self.work_dir = Path(tempfile.mkdtemp(prefix=work_dir_prefix))
+        self.uv_bin = None
+        self.venv_python = None
+        self.env = dict(os.environ)
+
+        # Configure logging if not already configured
+        if not logging.getLogger().handlers:
+            logging.basicConfig(
+                level=logging.INFO,
+                stream=sys.stdout,
+                force=True,
+                format="%(asctime)s [%(levelname)s] %(message)s",
+            )
+
+    def setup_environment(self, python_version: str = "3.11"):
+        """Find uv and create virtual environment."""
+        logger.info("Setting up environment...")
+
+        # Find UV binary
+        try:
+            self.uv_bin = subprocess.check_output(
+                [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True
+            ).strip()
+        except subprocess.CalledProcessError:
+            import shutil
+
+            self.uv_bin = shutil.which("uv")
+            if not self.uv_bin:
+                raise RuntimeError("Could not find uv binary. Please install uv.")
+
+        # Create UV virtual environment
+        subprocess.run(
+            [self.uv_bin, "venv", "--python", python_version],
+            cwd=self.work_dir,
+            check=True,
+            capture_output=True,
+        )
+
+        self.venv_python = self.work_dir / ".venv/bin/python"
+        if not self.venv_python.exists():
+            self.venv_python = (
+                self.work_dir / ".venv/Scripts/python.exe"
+            )  # Windows fallback
+
+        if not self.venv_python.exists():
+            raise RuntimeError(
+                f"Virtual environment python not found at {self.venv_python}"
+            )
+
+        # Set SSL cert file for HPC if needed
+        self._setup_ssl_cert()
+
+    def _setup_ssl_cert(self):
+        """Set SSL_CERT_FILE environment variable if certifi is available."""
+        try:
+            certifi_path = subprocess.check_output(
+                [str(self.venv_python), "-c", "import certifi; print(certifi.where())"],
+                text=True,
+            ).strip()
+            self.env["SSL_CERT_FILE"] = certifi_path
+        except Exception as e:
+            logger.warning(f"Failed to set SSL_CERT_FILE: {e}")
+
+    def install_dependencies(self, packages: List[str]):
+        """Install Python packages into the virtual environment."""
+        logger.info(f"Installing dependencies: {packages}")
+        if not self.uv_bin or not self.venv_python:
+            raise RuntimeError("Environment not setup. Call setup_environment() first.")
+
+        cmd = [
+            self.uv_bin,
+            "pip",
+            "install",
+            "--python",
+            str(self.venv_python),
+        ] + packages
+
+        subprocess.run(cmd, cwd=self.work_dir, check=True)
+
+    def run_benchmark(
+        self,
+        script_content: str,
+        config: Dict[str, Any],
+        script_name: str = "benchmark_runner.py",
+    ) -> Dict[str, Any]:
+        """
+        Execute the benchmark script.
+
+        Args:
+            script_content: The Python script to run.
+            config: Configuration dictionary to pass to the script (saved as config.json).
+            script_name: Filename for the script.
+
+        Returns:
+            Dictionary containing the results loaded from results.json.
+        """
+        if not self.venv_python:
+            raise RuntimeError("Environment not setup. Call setup_environment() first.")
+
+        logger.info("Preparing benchmark script...")
+
+        # Write runner script
+        runner_path = self.work_dir / script_name
+        runner_path.write_text(script_content)
+
+        # Write config
+        config_path = self.work_dir / "config.json"
+        with open(config_path, "w") as f:
+            json.dump(config, f, indent=2)
+
+        logger.info("Executing benchmark...")
+
+        # Run the runner script inside the venv
+        proc = subprocess.run(
+            [str(self.venv_python), str(runner_path), str(config_path)],
+            cwd=self.work_dir,
+            env=self.env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+            check=False,
+        )
+
+        if proc.returncode != 0:
+            raise RuntimeError(
+                f"Benchmark runner failed with return code {proc.returncode}"
+            )
+
+        logger.info("Collecting results...")
+        results_path = self.work_dir / "results.json"
+        if not results_path.exists():
+            raise RuntimeError(
+                "Results file not found - benchmark may have crashed silently"
+            )
+
+        with open(results_path) as f:
+            results = json.load(f)
+
+        logger.info("Benchmark completed successfully.")
+        return results
+
+    def cleanup(self):
+        """Remove the working directory."""
+        import shutil
+
+        shutil.rmtree(self.work_dir, ignore_errors=True)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.cleanup()
diff --git a/garden_ai/benchmarks/utils/remote_execution.py b/garden_ai/benchmarks/utils/remote_execution.py
new file mode 100644
index 00000000..d6541695
--- /dev/null
+++ b/garden_ai/benchmarks/utils/remote_execution.py
@@ -0,0 +1,202 @@
+"""Generic remote execution utility for benchmarks.
+
+This module contains the `run_remote_benchmark` function which is designed to be
+serialized and executed on Globus Compute endpoints. It handles the boilerplate
+of setting up a Python environment, installing dependencies, and running a
+provided benchmark script.
+"""
+
+
+def run_remote_benchmark(
+    script_content: str,
+    dependencies: list[str],
+    config: dict,
+    checkpoint_name: str | None = None,
+    checkpoint_path: str | None = None,
+) -> dict:
+    """Run a generic benchmark script on a remote Globus Compute endpoint.
+
+    This function:
+    1. Creates a temporary working directory.
+    2. Sets up a Python environment using `uv`.
+    3. Installs the specified dependencies.
+    4. Writes the `script_content` to a file.
+    5. Writes the `config` to a JSON file.
+    6. Executes the script in the environment.
+    7. Returns the results from `results.json`.
+
+    Args:
+        script_content: The full Python script to execute.
+        dependencies: List of Python packages to install (e.g. ["numpy", "torch"]).
+        config: Dictionary of configuration parameters to pass to the script.
+                Written to `config.json`.
+        checkpoint_name: Name of the checkpoint file (e.g. "checkpoint_123.json").
+                         Saved to ~/.garden/benchmarks/.
+        checkpoint_path: Optional path to an existing checkpoint file to resume from.
+                         If provided, this path is used directly.
+
+    Returns:
+        The content of `results.json` produced by the script.
+    """
+    # All imports must be inside the function for serialization
+    import json
+    import logging
+    import os
+    import subprocess
+    import sys
+    import tempfile
+    from pathlib import Path
+
+    # Configure logging
+    logging.basicConfig(
+        level=logging.INFO,
+        stream=sys.stdout,
+        force=True,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+    )
+    if hasattr(sys.stdout, "reconfigure"):
+        sys.stdout.reconfigure(line_buffering=True)
+
+    logger = logging.getLogger(__name__)
+
+    # Create isolated working directory
+    work_dir = Path(tempfile.mkdtemp(prefix="garden_benchmark_"))
+
+    try:
+        # ----------------------------------------------------------------------
+        # 1. ENVIRONMENT SETUP
+        # ----------------------------------------------------------------------
+        logger.info("Step 1/4: Setting up environment...")
+
+        # Find UV binary
+        try:
+            uv_bin = subprocess.check_output(
+                [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True
+            ).strip()
+        except subprocess.CalledProcessError:
+            import shutil
+
+            uv_bin = shutil.which("uv")
+            if not uv_bin:
+                raise RuntimeError("Could not find uv binary. Please install uv.")
+
+        # Create UV virtual environment
+        subprocess.run(
+            [uv_bin, "venv", "--python", "3.11"],
+            cwd=work_dir,
+            check=True,
+            capture_output=True,
+        )
+
+        venv_python = work_dir / ".venv/bin/python"
+        if not venv_python.exists():
+            venv_python = work_dir / ".venv/Scripts/python.exe"  # Windows fallback
+
+        if not venv_python.exists():
+            raise RuntimeError(f"Virtual environment python not found at {venv_python}")
+
+        # Install dependencies
+        logger.info(f"Installing dependencies: {dependencies}")
+        # Install in one go for better resolution
+        cmd = [uv_bin, "pip", "install", "--python", str(venv_python)] + dependencies
+        subprocess.run(
+            cmd,
+            cwd=work_dir,
+            check=True,
+        )
+
+        # Set SSL cert file for HPC if needed
+        env = dict(os.environ)
+
+        # Propagate common useful env vars if present
+        for key in ["MBD_AUTO_DOWNLOAD_FILES", "HF_TOKEN", "WANDB_API_KEY"]:
+            if key in os.environ:
+                env[key] = os.environ[key]
+
+        try:
+            certifi_path = subprocess.check_output(
+                [str(venv_python), "-c", "import certifi; print(certifi.where())"],
+                text=True,
+            ).strip()
+            env["SSL_CERT_FILE"] = certifi_path
+        except Exception as e:
+            logger.warning(f"Failed to set SSL_CERT_FILE: {e}")
+
+        # ----------------------------------------------------------------------
+        # 2. PREPARE BENCHMARK SCRIPT
+        # ----------------------------------------------------------------------
+        logger.info("Step 2/4: Preparing benchmark script...")
+
+        # Write runner script
+        runner_path = work_dir / "benchmark_runner.py"
+        runner_path.write_text(script_content)
+
+        # Determine checkpoint path
+        if checkpoint_path:
+            # User provided a specific path to resume from
+            final_checkpoint_path = checkpoint_path
+        elif checkpoint_name:
+            # Use persistent location in user home
+            checkpoint_dir = Path.home() / ".garden" / "benchmarks"
+            checkpoint_dir.mkdir(parents=True, exist_ok=True)
+            final_checkpoint_path = str(checkpoint_dir / checkpoint_name)
+        else:
+            # Fallback to tmp dir if no name provided (legacy behavior)
+            final_checkpoint_path = str(work_dir / "checkpoint.json")
+
+        config["checkpoint_path"] = final_checkpoint_path
+
+        # Log checkpoint path prominently for user reference
+        print(f"{'=' * 80}")
+        print(f"Checkpoint will be saved to: {final_checkpoint_path}")
+        print("To resume this job if it fails, use:")
+        print(f'  checkpoint_path="{final_checkpoint_path}"')
+        print(f"{'=' * 80}")
+
+        # Write config
+        config_path = work_dir / "config.json"
+        with open(config_path, "w") as f:
+            json.dump(config, f, indent=2)
+
+        # ----------------------------------------------------------------------
+        # 3. EXECUTE BENCHMARK
+        # ----------------------------------------------------------------------
+        logger.info("Step 3/4: Executing benchmark...")
+
+        # Run the runner script inside the venv
+        # DO NOT capture output - let it stream to stdout/stderr in real-time
+        # so we can see errors immediately
+        proc = subprocess.run(
+            [str(venv_python), str(runner_path), str(config_path)],
+            cwd=work_dir,
+            env=env,
+            check=False,  # Don't raise immediately, we'll check returncode
+        )
+
+        if proc.returncode != 0:
+            raise RuntimeError(
+                f"Benchmark runner failed with return code {proc.returncode}."
+            )
+
+        # ----------------------------------------------------------------------
+        # 4. COLLECT RESULTS
+        # ----------------------------------------------------------------------
+        logger.info("Step 4/4: Collecting results...")
+
+        results_path = work_dir / "results.json"
+        if not results_path.exists():
+            raise RuntimeError(
+                "Results file not found - benchmark may have crashed silently"
+            )
+
+        with open(results_path) as f:
+            results = json.load(f)
+
+        logger.info("Benchmark completed successfully.")
+        return results
+
+    finally:
+        # Cleanup working directory
+        import shutil
+
+        shutil.rmtree(work_dir, ignore_errors=True)
diff --git a/garden_ai/benchmarks/utils/script_builder.py b/garden_ai/benchmarks/utils/script_builder.py
new file mode 100644
index 00000000..7987bdc7
--- /dev/null
+++ b/garden_ai/benchmarks/utils/script_builder.py
@@ -0,0 +1,96 @@
+import inspect
+from pathlib import Path
+from typing import Callable
+
+
+class BenchmarkScriptBuilder:
+    """Helper to build a self-contained benchmark script from a template."""
+
+    def __init__(self, template_path: str | Path = None):
+        if template_path is None:
+            # Default to the base_runner.py in templates
+            template_path = (
+                Path(__file__).parent.parent / "templates" / "base_runner.py"
+            )
+
+        self.template_path = Path(template_path)
+        self.imports = set()
+        self.functions = []
+        self.preamble = []
+
+    def add_import(self, import_stmt: str):
+        """Add an import statement (e.g. 'import numpy as np')."""
+        self.imports.add(import_stmt)
+        return self
+
+    def add_preamble(self, code: str):
+        """Add arbitrary code to the top of the script (after imports)."""
+        self.preamble.append(code)
+        return self
+
+    def add_function(self, func: Callable, name: str = None):
+        """Add a function definition to the script.
+
+        The function source code is inspected and appended.
+        If name is provided, the function definition is renamed.
+        """
+        source = inspect.getsource(func)
+
+        if name:
+            import re
+
+            # Replace 'def old_name(' with 'def new_name('
+            # This is a simple regex replacement, assuming standard formatting
+            pattern = r"def\s+" + func.__name__ + r"\s*\("
+            replacement = f"def {name}("
+            source = re.sub(pattern, replacement, source, count=1)
+
+        self.functions.append(source)
+        return self
+
+    def build(self) -> str:
+        """Assemble the final script."""
+        if not self.template_path.exists():
+            raise FileNotFoundError(f"Template not found at {self.template_path}")
+
+        template_content = self.template_path.read_text()
+
+        # Construct sections
+        imports_block = "\n".join(sorted(self.imports))
+        preamble_block = "\n".join(self.preamble)
+        functions_block = "\n\n".join(self.functions)
+
+        # We inject our custom code BEFORE the template's main execution logic
+        # but AFTER the template's own imports (which are inside the file).
+        # Actually, the template has imports at the top. We should probably prepend ours.
+
+        # Simple strategy: Prepend everything to the template, but the template
+        # has "USER DEFINED FUNCTIONS" placeholders. We can just append our functions
+        # before the main block?
+
+        # Better strategy: The template is designed to have functions injected.
+        # Let's just put imports at the top, then functions, then the template content.
+        # But we need to be careful about imports in the template.
+
+        final_script = f"""
+# ------------------------------------------------------------------------------
+# INJECTED IMPORTS
+# ------------------------------------------------------------------------------
+{imports_block}
+
+# ------------------------------------------------------------------------------
+# INJECTED PREAMBLE
+# ------------------------------------------------------------------------------
+{preamble_block}
+
+# ------------------------------------------------------------------------------
+# INJECTED FUNCTIONS
+# ------------------------------------------------------------------------------
+{functions_block}
+
+# ------------------------------------------------------------------------------
+# BASE RUNNER TEMPLATE
+# ------------------------------------------------------------------------------
+{template_content}
+"""
+        return final_script
diff --git a/garden_ai/benchmarks/utils/task.py b/garden_ai/benchmarks/utils/task.py
new file mode 100644
index 00000000..4ca2fd60
--- /dev/null
+++ b/garden_ai/benchmarks/utils/task.py
@@ -0,0 +1,132 @@
+import json
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+
+class BaseBenchmarkTask:
+    """
+    Base class for benchmark tasks.
+
+    Provides common utilities for:
+    - Extracting model metadata (package, factory, kwargs)
+    - Running benchmarks locally for testing
+    """
+
+    def __init__(
+        self, adapter, repo_url: str, repo_ref: str, model_package: Optional[str] = None
+    ):
+        self.adapter = adapter
+        self.repo_url = repo_url
+        self.repo_ref = repo_ref
+        self.model_package = model_package
+
+    def _extract_model_config(
+        self,
+        model: Any = None,
+        model_package: Optional[str] = None,
+        model_factory: Optional[str] = None,
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Helper to resolve model configuration from either a local instance or explicit arguments.
+        """
+        model_checkpoint = None
+
+        if model is not None:
+            # Extract info from local model instance
+            if model_package is None:
+                if self.model_package is not None:
+                    model_package = self.model_package
+                else:
+                    # Infer from model's module
+                    model_package = model.__class__.__module__.split(".")[0]
+
+            if model_factory is None:
+                model_factory = model.__class__.__name__
+
+            # Get checkpoint path if model has one
+            if hasattr(model, "checkpoint_path"):
+                model_checkpoint = model.checkpoint_path
+            elif hasattr(model, "checkpoint"):
+                model_checkpoint = model.checkpoint
+
+            # Try to extract initialization kwargs if available
+            if model_kwargs is None and hasattr(model, "_init_kwargs"):
+                model_kwargs = model._init_kwargs
+
+        else:
+            # Must provide explicit construction info
+            if model_package is None or model_factory is None:
+                raise ValueError(
+                    "If model is not provided, must specify both "
+                    "model_package and model_factory"
+                )
+
+        if model_kwargs is None:
+            model_kwargs = {}
+
+        return {
+            "model_package": model_package,
+            "model_factory": model_factory,
+            "model_kwargs": model_kwargs,
+            "model_checkpoint": model_checkpoint,
+        }
+
+    def _run_local_wrapper(
+        self, runner_func_path: str, runner_func_name: str, config: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        Execute a benchmark runner function locally in a subprocess.
+
+        Args:
+            runner_func_path: Import path to the runner function (e.g. 'garden_ai.benchmarks.matbench_discovery.remote_runner')
+            runner_func_name: Name of the runner function (e.g. 'run_matbench_is2re')
+            config: Configuration dictionary to pass to the runner function.
+        """
+        results_file_path = (
+            Path(tempfile.gettempdir()) / f"benchmark_results_{id(config)}.json"
+        )
+
+        wrapper_script = f'''
+import json
+from {runner_func_path} import {runner_func_name}
+
+config = {repr(config)}
+results = {runner_func_name}(**config)
+
+with open("{results_file_path}", "w") as f:
+    json.dump(results, f, indent=2)
+'''
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+            f.write(wrapper_script)
+            wrapper_path = f.name
+
+        try:
+            # Run without capturing output so logs stream to console in real-time
+            result = subprocess.run(
+                [sys.executable, wrapper_path],
+                timeout=3600,
+                stdout=None,
+                stderr=None,
+            )
+
+            if result.returncode != 0:
+                raise RuntimeError(
+                    f"Local benchmark failed with return code {result.returncode}"
+                )
+
+            if not results_file_path.exists():
+                raise RuntimeError(
+                    f"Benchmark results file not found at {results_file_path}"
+                )
+
+            with open(results_file_path) as f:
+                return json.load(f)
+
+        finally:
+            Path(wrapper_path).unlink(missing_ok=True)
+            results_file_path.unlink(missing_ok=True)

From f5b888e789615a3019b4a1b8e41f89a5531754bb Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Thu, 4 Dec 2025 11:15:42 -0700
Subject: [PATCH 05/23] refactor to use groundhog functions

---
 garden_ai/benchmarks/__init__.py              |    5 +-
 .../benchmarks/matbench_discovery/__init__.py |  230 +---
 .../examples/dummy_model.py                   |   17 +
 .../examples/matbench_mace_multi_gpu.py       |   80 +-
 .../examples/test_hog_refactor.py             |   70 +
 .../benchmarks/matbench_discovery/remote.py   |  485 -------
 .../benchmarks/matbench_discovery/tasks.py    | 1163 +++++++++++++----
 garden_ai/benchmarks/templates/base_runner.py |  248 ----
 garden_ai/benchmarks/utils/script_builder.py  |   23 +-
 9 files changed, 1003 insertions(+), 1318 deletions(-)
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/remote.py
 delete mode 100644 garden_ai/benchmarks/templates/base_runner.py

diff --git a/garden_ai/benchmarks/__init__.py b/garden_ai/benchmarks/__init__.py
index 329de6cc..5d40ae5a 100644
--- a/garden_ai/benchmarks/__init__.py
+++ b/garden_ai/benchmarks/__init__.py
@@ -7,12 +7,13 @@
     - MatbenchDiscovery: Materials discovery benchmark suite
 """
 
-from .matbench_discovery import IS2RETask, MatbenchDiscovery, MatbenchTask
+from .matbench_discovery.enums import DatasetSize, MatbenchTask
+from .matbench_discovery.tasks import MatbenchDiscovery
 
 __all__ = [
     "MatbenchDiscovery",
     "MatbenchTask",
-    "IS2RETask",
+    "DatasetSize",
 ]
 
 
diff --git a/garden_ai/benchmarks/matbench_discovery/__init__.py b/garden_ai/benchmarks/matbench_discovery/__init__.py
index 3256522e..1a5b9516 100644
--- a/garden_ai/benchmarks/matbench_discovery/__init__.py
+++ b/garden_ai/benchmarks/matbench_discovery/__init__.py
@@ -1,236 +1,10 @@
-"""Matbench Discovery benchmark adapter for Garden AI.
-
-This module provides a clean interface for running Matbench Discovery benchmarks
-on remote HPC systems via Globus Compute. It handles environment setup,
-dependency installation, and benchmark execution.
-
-Example usage:
-    >>> from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
-    >>> from my_model import MyModel
-    >>>
-    >>> # Configure for your HPC endpoint
-    >>> endpoint_id = "your-endpoint-uuid"
-    >>> endpoint_config = {
-    ...     "account": "project-account",
-    ...     "partition": "gpu",
-    ...     "scheduler_options": "#SBATCH --gpus-per-node=1"
-    ... }
-    >>>
-    >>> # Run benchmark
-    >>> with MatbenchDiscovery(endpoint_id, endpoint_config) as bench:
-    ...     model = MyModel()
-    ...     task = bench.tasks.IS2RE
-    ...     future = task.submit(model, num_structures=100)
-    ...     results = future.result()
-    ...     metrics = task.calculate_metrics(results)
-    ...     print(metrics)
-"""
-
-from typing import Any
-
-from globus_compute_sdk import Executor
-from globus_compute_sdk.serialize import CombinedCode, ComputeSerializer
+"""Matbench Discovery benchmark adapter for Garden AI."""
 
 from .enums import DatasetSize, MatbenchTask
-from .tasks import (
-    IP2ETask,
-    IS2ETask,
-    IS2RETask,
-    RP2RETask,
-    RS2RETask,
-    S2EFSMTask,
-    S2EFSTask,
-    S2EFTask,
-    S2ETask,
-    S2RETask,
-)
+from .tasks import MatbenchDiscovery
 
 __all__ = [
     "MatbenchDiscovery",
     "MatbenchTask",
     "DatasetSize",
-    "IS2RETask",
-    "RS2RETask",
-    "S2EFSTask",
-    "S2EFTask",
-    "S2EFSMTask",
-    "IS2ETask",
-    "S2ETask",
-    "S2RETask",
-    "RP2RETask",
-    "IP2ETask",
 ]
-
-
-class MatbenchDiscovery:
-    """Adapter for running Matbench Discovery benchmarks locally or remotely.
-
-    This class manages the lifecycle of benchmark execution:
-    - Provides access to benchmark tasks (IS2RE, etc.)
-    - For remote execution: creates and manages Globus Compute executor
-    - For local execution: runs in ephemeral UV environment
-
-    Use as a context manager to ensure proper cleanup:
-        # Local execution
-        with MatbenchDiscovery() as bench:
-            result = bench.tasks.IS2RE.local(...)
-
-        # Remote execution
-        with MatbenchDiscovery(endpoint_id="uuid", endpoint_config={...}) as bench:
-            future = bench.tasks.IS2RE.submit(...)
-
-    Attributes:
-        tasks: Namespace containing available benchmark tasks
-            - tasks.IS2RE: Initial Structure to Relaxed Energy task
-    """
-
-    # Matbench Discovery repository configuration
-    REPO_URL = "https://github.com/janosh/matbench-discovery"
-    REPO_REF = "main"
-    PYTHON_VERSION = "3.11"
-
-    def __init__(
-        self,
-        endpoint_id: str | None = None,
-        user_endpoint_config: dict[str, Any] | None = None,
-        repo_ref: str | None = None,
-        model_package: str | None = None,
-    ):
-        """Initialize Matbench Discovery adapter.
-
-        Args:
-            endpoint_id: Globus Compute endpoint UUID for remote execution.
-                        If None, only local execution (.local()) is available.
-            user_endpoint_config: Optional HPC configuration for remote endpoint.
-                                 Example for SLURM:
-                                 {
-                                     "account": "project-account",
-                                     "partition": "gpu-debug",
-                                     "scheduler_options": "#SBATCH --gpus-per-node=1"
-                                 }
-            repo_ref: Git branch/tag/commit to use (default: "main")
-            model_package: Default model package to install for all tasks
-                          (can be overridden per task)
-        """
-        self.endpoint_id = endpoint_id
-        self.user_endpoint_config = user_endpoint_config or {}
-
-        # Ensure 'requirements' is present to avoid endpoint template errors
-        if "requirements" not in self.user_endpoint_config:
-            self.user_endpoint_config["requirements"] = ""
-
-        self.repo_ref = repo_ref or self.REPO_REF
-        self.model_package = model_package
-
-        # Executor is created lazily on first submit() call
-        self._executor: Executor | None = None
-        self.tasks: Any = None
-
-    def _get_executor(self) -> Executor:
-        """Get or create the Globus Compute executor (lazy initialization).
-
-        Returns:
-            Executor instance
-
-        Raises:
-            ValueError: If endpoint_id was not provided during initialization
-        """
-        if self._executor is None:
-            if self.endpoint_id is None:
-                raise ValueError(
-                    "endpoint_id is required for remote execution. "
-                    "Either provide endpoint_id during initialization or use .local() method."
-                )
-
-            executor_kwargs = {"endpoint_id": self.endpoint_id}
-            if self.user_endpoint_config:
-                executor_kwargs["user_endpoint_config"] = self.user_endpoint_config
-
-            # Use CombinedCode serialization to send actual function code
-            # rather than module references (avoids needing garden_ai installed remotely)
-            executor_kwargs["serializer"] = ComputeSerializer(
-                strategy_code=CombinedCode()
-            )
-
-            self._executor = Executor(**executor_kwargs)
-
-        return self._executor
-
-    def __enter__(self):
-        """Set up tasks when entering context."""
-        # Initialize tasks - executor will be created lazily when needed
-        # Using a simple namespace object for dot access
-        self.tasks = type(
-            "Tasks",
-            (),
-            {
-                "IS2RE": IS2RETask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-                "RS2RE": RS2RETask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-                "S2EFS": S2EFSTask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-                "S2EF": S2EFTask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-                "S2EFSM": S2EFSMTask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-                "IS2E": IS2ETask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-                "S2E": S2ETask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-                "S2RE": S2RETask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-                "RP2RE": RP2RETask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-                "IP2E": IP2ETask(
-                    adapter=self,
-                    repo_url=self.REPO_URL,
-                    repo_ref=self.repo_ref,
-                    model_package=self.model_package,
-                ),
-            },
-        )()
-
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Clean up executor when exiting context."""
-        if self._executor:
-            self._executor.shutdown(wait=True)
-        return False  # Don't suppress exceptions
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py b/garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py
new file mode 100644
index 00000000..745eb1b1
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py
@@ -0,0 +1,17 @@
+def create_dummy_model(device):
+    """Create a dummy calculator for testing."""
+    import numpy as np
+    from ase.calculators.calculator import Calculator, all_changes
+
+    class DummyCalc(Calculator):
+        implemented_properties = ["energy", "forces", "stress"]
+
+        def calculate(
+            self, atoms=None, properties=["energy"], system_changes=all_changes
+        ):
+            super().calculate(atoms, properties, system_changes)
+            self.results["energy"] = -1.0 * len(self.atoms)
+            self.results["forces"] = np.zeros((len(self.atoms), 3))
+            self.results["stress"] = np.zeros(6)
+
+    return DummyCalc()
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
index 9f971086..3707f57f 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
@@ -6,17 +6,6 @@
 
 from garden_ai.benchmarks.matbench_discovery import DatasetSize, MatbenchDiscovery
 
-# Globus Compute endpoint
-ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
-
-# HPC endpoint configuration
-ENDPOINT_CONFIG = {
-    "account": "cis250461-gpu",
-    "partition": "gpu-debug",
-    "qos": "gpu",
-    "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8",
-}
-
 
 # Model factory function for MACE
 def create_mace_model(device):
@@ -25,61 +14,16 @@ def create_mace_model(device):
     return mace_mp(model="medium", device=device, default_dtype="float64")
 
 
-NUM_STRUCTURES = DatasetSize.RANDOM_100
-
-
-def main():
-    """Run Matbench Discovery IS2RE benchmark with MACE."""
-
-    with MatbenchDiscovery(
-        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
-    ) as bench:
-        # Run IS2RE task (Initial Structure to Relaxed Energy)
-        future = bench.tasks.IS2RE.submit(
-            model_factory=create_mace_model,
-            model_packages="mace-torch",
-            num_structures=NUM_STRUCTURES,
-        )
-
-        print("Job submitted! Waiting for results (this may take a while)...")
-
-        try:
-            output = future.result()
-            metrics = output.get("metrics", {})
-
-            if "error" in metrics:
-                print(f"error               : {metrics['error']}")
-            else:
-                # Discovery metrics (stability classification)
-                if "F1" in metrics:
-                    print(f"F1                  : {metrics['F1']:.6f}")
-                    print(f"DAF                 : {metrics['DAF']:.2f}x")
-                    print(f"Precision           : {metrics['Precision']:.6f}")
-                    print(f"Recall              : {metrics['Recall']:.6f}")
-                    print(f"Accuracy            : {metrics['Accuracy']:.6f}")
-
-                # Regression metrics
-                if "MAE" in metrics:
-                    print(f"MAE (eV/atom)       : {metrics['MAE']:.6f}")
-                    print(f"RMSE (eV/atom)      : {metrics['RMSE']:.6f}")
-                    print(f"R2                  : {metrics['R2']:.6f}")
-
-                # Force metrics (if S2EFS task)
-                if "force_mae" in metrics:
-                    print(f"force_mae           : {metrics['force_mae']:.6f}")
-                    print(f"force_rmse          : {metrics['force_rmse']:.6f}")
-                    print(f"force_r2            : {metrics['force_r2']:.6f}")
-                    print(f"stress_mae          : {metrics['stress_mae']:.6f}")
-                    print(f"stress_rmse         : {metrics['stress_rmse']:.6f}")
-                    print(f"stress_r2           : {metrics['stress_r2']:.6f}")
-
-                if "num_evaluated" in metrics:
-                    print(f"num_evaluated       : {metrics['num_evaluated']}")
-
-        except Exception as e:
-            print(f"\n[ERROR] Benchmark failed: {e}")
-            raise
-
+results = MatbenchDiscovery.IS2RE.remote(
+    endpoint="5aafb4c1-27b2-40d8-a038-a0277611868f",
+    walltime="01:00:00",
+    scheduler_options={"gpus-per-node": 2, "cores-per-node": 16},
+    account="youraccount",
+    partition="gpu-debug",
+    qos="gpu",
+    model_factory=create_mace_model,
+    model_packages="mace-torch",
+    num_structures=DatasetSize.RANDOM_100,
+)
 
-if __name__ == "__main__":
-    main()
+print(results["metrics"])
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py b/garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py
new file mode 100644
index 00000000..eccf0489
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""
+Test Matbench Discovery refactor with Groundhog HPC.
+"""
+
+import os
+
+from dummy_model import create_dummy_model
+
+from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+# Globus Compute endpoint (use local if possible, or the one from example)
+ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+
+# HPC endpoint configuration
+ENDPOINT_CONFIG = {
+    "account": "cis250461-gpu",
+    "partition": "gpu",
+    "qos": "gpu",
+    "scheduler_options": "#SBATCH --gpus-per-node=1\n",
+    "cores_per_node": 4,
+    "mem_per_node": 16,
+}
+
+# =============================================================================
+# Model Factory Functions
+# =============================================================================
+
+
+def main():
+    """Run benchmarks on all models and save results."""
+
+    print("=" * 80)
+    print("Matbench Discovery Test - Groundhog Refactor")
+    print("=" * 80)
+
+    print("Running LOCAL test...")
+
+    # Ensure subprocess can find dummy_model
+    cwd = os.getcwd()
+    os.environ["PYTHONPATH"] = cwd + os.pathsep + os.environ.get("PYTHONPATH", "")
+
+    try:
+        # Run locally using the new static method API
+        output = MatbenchDiscovery.IS2RE.local(
+            model_factory=create_dummy_model,
+            model_packages=["numpy", "ase"],  # Minimal deps
+            num_structures=1,
+            sys_path=[os.getcwd()],
+        )
+        print("Local run output keys:", output.keys())
+        if "error" in output.get("metrics", {}):
+            print("Local metrics error:", output["metrics"]["error"])
+        else:
+            print("Local run successful!")
+            print("Metrics:", output.get("metrics"))
+
+    except Exception as e:
+        print(f"Local run failed: {e}")
+        import traceback
+
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/remote.py b/garden_ai/benchmarks/matbench_discovery/remote.py
deleted file mode 100644
index 247d2ba2..00000000
--- a/garden_ai/benchmarks/matbench_discovery/remote.py
+++ /dev/null
@@ -1,485 +0,0 @@
-"""Remote functions for Matbench Discovery benchmark.
-
-These functions are injected into the remote script.
-They must be self-contained (imports inside or provided by builder).
-"""
-
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
-# ------------------------------------------------------------------------------
-# Common Helpers
-# ------------------------------------------------------------------------------
-
-
-def _process_batch_common(
-    batch_id: int,
-    structures: List[Tuple[str, Any]],
-    model_config: Dict[str, Any],
-    num_threads: int,
-    compute_fn: Callable[[Any, Any], Dict[str, Any]],
-    task_name: str,
-) -> Dict[str, Any]:
-    """Common logic for processing a batch of structures.
-
-    Args:
-        batch_id: ID of the current batch
-        structures: List of (id, atoms) tuples
-        model_config: Configuration for the model
-        num_threads: Number of threads to use
-        compute_fn: Function taking (model, atoms) and returning a result dict
-        task_name: Name of the task for logging
-    """
-    import logging
-    import os
-    import time
-
-    import torch
-
-    # Configure thread limits to avoid contention
-    os.environ["OMP_NUM_THREADS"] = str(num_threads)
-    torch.set_num_threads(num_threads)
-
-    gpu_id = model_config.get("gpu_id")
-    device = setup_device(gpu_id)  # noqa: F821
-
-    worker_logger = logging.getLogger(f"worker_{batch_id}")
-    worker_logger.info(
-        f"Started {task_name} on {device} with {len(structures)} structures. Threads: {num_threads}"
-    )
-
-    global _MODEL_CACHE
-    try:
-        if _MODEL_CACHE is None:
-            model = load_model(device)  # noqa: F821
-            _MODEL_CACHE = model
-        else:
-            model = _MODEL_CACHE
-    except Exception as e:
-        worker_logger.error(f"Failed to initialize model: {e}")
-        worker_logger.error(
-            "Model initialization is critical - cannot continue benchmark"
-        )
-        raise RuntimeError(f"Model initialization failed: {e}") from e
-
-    results = {}
-    batch_start = time.time()
-
-    for i, (struct_id, atoms) in enumerate(structures):
-        try:
-            # Run the specific computation
-            result = compute_fn(model, atoms)
-            results[struct_id] = result
-
-            if (i + 1) % 10 == 0:
-                elapsed = time.time() - batch_start
-                rate = (i + 1) / elapsed if elapsed > 0 else 0
-                worker_logger.info(
-                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
-                )
-
-        except Exception as e:
-            worker_logger.warning(f"Structure {struct_id} failed: {e}")
-            results[struct_id] = {"error": str(e)}
-
-    return results
-
-
-def _load_dataset_common(
-    config: Dict[str, Any],
-    zip_path: str,
-    read_format: str = "extxyz",
-    read_index: str | slice = None,
-) -> List[Tuple[str, Any]]:
-    """Common logic for loading datasets from a zip file."""
-    from io import TextIOWrapper
-    from zipfile import ZipFile
-
-    from ase.io import read
-
-    # get_material_ids_for_subset is injected
-    dataset_subset = config.get("dataset_subset", "full")
-    dataset_seed = config.get("dataset_seed", 42)
-    mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed)  # noqa: F821
-
-    structures = []
-
-    with ZipFile(zip_path, "r") as zf:
-        if mat_ids is None:
-            # Load all files (full dataset)
-            # Sort by numeric ID if possible
-            file_list = sorted(
-                zf.namelist(),
-                key=lambda x: int(x.split(".")[0])
-                if x.split(".")[0].isdigit()
-                else float("inf"),
-            )
-            num_structures = config.get("num_structures", 100)
-            file_list = file_list[:num_structures]
-        else:
-            # Filter to specific material IDs
-            mat_id_set = set(mat_ids)
-            file_list = [
-                f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set
-            ]
-
-        for filename in file_list:
-            with zf.open(filename) as f:
-                text_stream = TextIOWrapper(f, encoding="utf-8")
-                if read_index is not None:
-                    atoms_list = read(text_stream, format=read_format, index=read_index)
-                    # If we got a list and need one item, take the last one (common for trajectories)
-                    if isinstance(atoms_list, list) and atoms_list:
-                        structures.append((filename, atoms_list[-1]))
-                    elif not isinstance(atoms_list, list):
-                        structures.append((filename, atoms_list))
-                else:
-                    structures.append((filename, read(text_stream, format=read_format)))
-
-    return structures
-
-
-# ------------------------------------------------------------------------------
-# Injected Functions
-# ------------------------------------------------------------------------------
-
-
-def load_model(device: str):
-    """Initialize the model using the user-provided factory function.
-
-    The factory function is injected into this script by the benchmark framework.
-    """
-    # Call the user's factory function (injected as load_model_user)
-    model = load_model_user(device)  # noqa: F821
-    return model
-
-
-def get_material_ids_for_subset(
-    subset_type: str, seed: int = 42
-) -> Optional[List[str]]:
-    """Get material IDs for a specific dataset subset.
-
-    Args:
-        subset_type: One of 'full', 'unique_protos', 'random_10k', 'random_100'
-        seed: Random seed for sampling (default: 42)
-
-    Returns:
-        List of material IDs, or None for 'full' (load all)
-    """
-    if subset_type == "full":
-        return None  # Load all materials
-
-    import pandas as pd
-    from matbench_discovery.data import DataFiles
-
-    # Load wbm_summary
-    df = pd.read_csv(DataFiles.wbm_summary.path)
-
-    if subset_type == "unique_protos":
-        # Filter to unique prototypes (removes duplicates and MP overlaps)
-        df_filtered = df.query("unique_prototype")
-        return df_filtered["material_id"].tolist()
-
-    elif subset_type == "random_10k":
-        # Random sample of 10k unique prototypes (fixed seed for reproducibility)
-        df_filtered = df.query("unique_prototype")
-        df_sampled = df_filtered.sample(n=10000, random_state=seed)
-        return df_sampled["material_id"].tolist()
-
-    elif subset_type == "random_100":
-        # Random sample of 100 unique prototypes (fixed seed for reproducibility)
-        # Useful for quick end-to-end testing
-        df_filtered = df.query("unique_prototype")
-        df_sampled = df_filtered.sample(n=100, random_state=seed)
-        return df_sampled["material_id"].tolist()
-
-    else:
-        raise ValueError(f"Unknown subset_type: {subset_type}")
-
-
-def process_batch_relaxation(
-    batch_id: int,
-    structures: List[Tuple[str, Any]],
-    model_config: Dict[str, Any],
-    num_threads: int,
-) -> Dict[str, Any]:
-    """Process a batch of structures for IS2RE (Relaxation)."""
-    from ase.optimize import FIRE
-
-    def compute(model, atoms):
-        atoms.calc = model
-        opt = FIRE(atoms, logfile=None)
-        opt.run(fmax=0.05, steps=500)
-        energy = atoms.get_potential_energy()
-        return {"energy": energy}
-
-    return _process_batch_common(
-        batch_id, structures, model_config, num_threads, compute, "relaxation"
-    )
-
-
-def process_batch_static(
-    batch_id: int,
-    structures: List[Tuple[str, Any]],
-    model_config: Dict[str, Any],
-    num_threads: int,
-) -> Dict[str, Any]:
-    """Process a batch of structures for RS2RE (Static Calculation)."""
-
-    def compute(model, atoms):
-        atoms.calc = model
-        energy = atoms.get_potential_energy()
-        return {"energy": energy}
-
-    return _process_batch_common(
-        batch_id, structures, model_config, num_threads, compute, "static calculation"
-    )
-
-
-def process_batch_forces(
-    batch_id: int,
-    structures: List[Tuple[str, Any]],
-    model_config: Dict[str, Any],
-    num_threads: int,
-) -> Dict[str, Any]:
-    """Process a batch of structures for S2EFS (Energy, Forces, Stress)."""
-
-    def compute(model, atoms):
-        atoms.calc = model
-        energy = atoms.get_potential_energy()
-        forces = atoms.get_forces().tolist()
-        stress = atoms.get_stress().tolist()
-        return {"energy": energy, "forces": forces, "stress": stress}
-
-    return _process_batch_common(
-        batch_id, structures, model_config, num_threads, compute, "forces calculation"
-    )
-
-
-def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
-    """Load initial structures for IS2RE."""
-    from matbench_discovery.data import DataFiles
-
-    return _load_dataset_common(config, DataFiles.wbm_initial_atoms.path)
-
-
-def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
-    """Load relaxed structures for RS2RE."""
-    from matbench_discovery.data import DataFiles
-
-    return _load_dataset_common(config, DataFiles.wbm_relaxed_atoms.path)
-
-
-def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
-    """Load MP trajectories for S2EFS."""
-    from matbench_discovery.data import DataFiles
-
-    # Use index=":" to read all frames, but _load_dataset_common handles taking the last one
-    return _load_dataset_common(config, DataFiles.mp_trj_extxyz.path, read_index=":")
-
-
-def calculate_metrics_energy(
-    results: Dict[str, Any], config: Dict[str, Any]
-) -> Dict[str, Any]:
-    """Calculate energy metrics using matbench-discovery's stable_metrics algorithm.
-
-    Uses the injected stable_metrics function.
-    Returns: F1, DAF, Precision, Recall, Accuracy, TPR, FPR, TNR, FNR, TP, FP, TN, FN, MAE, RMSE, R2
-    """
-    import logging
-
-    import numpy as np
-
-    logger = logging.getLogger("metrics")
-
-    # Results format: {id: {"energy": float, "error": str}}
-    if len(results) == 0:
-        return {"error": "No results to evaluate"}
-
-    try:
-        # Import matbench-discovery data
-        from matbench_discovery.data import df_wbm
-    except Exception as e:
-        return {"error": f"Failed to import matbench-discovery: {e}"}
-
-    # Extract model energies
-    model_energies = {}
-    for sid, res in results.items():
-        if isinstance(res, dict) and res.get("energy") is not None:
-            mat_id = sid.replace(".extxyz", "")
-            model_energies[mat_id] = res["energy"]
-
-    if not model_energies:
-        return {"error": "No valid energies found in results"}
-
-    # Get common IDs between predictions and ground truth
-    # Use direct string column names instead of MbdKey enum to avoid issues
-    df_wbm_indexed = df_wbm.set_index("material_id")
-    common_ids = list(set(model_energies.keys()) & set(df_wbm_indexed.index))
-
-    if not common_ids:
-        return {"error": "No matching IDs between results and ground truth"}
-
-    # Get subset of data
-    df_subset = df_wbm_indexed.loc[common_ids]
-
-    # Calculate predicted formation energies
-    y_pred = np.array([model_energies[mid] for mid in common_ids])
-    y_true = df_subset["uncorrected_energy"].values  # Uncorrected total energy
-    n_atoms = df_subset["n_sites"].values
-
-    # Predicted formation energy ERROR per atom (from total energy difference)
-    # This is the ERROR: (E_pred - E_dft) / n_atoms
-    e_form_error = (y_pred - y_true) / n_atoms
-
-    # Get ground truth e_above_hull for stability classification
-    each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values
-
-    # Calculate predicted e_above_hull
-    # Since e_form_error is already (E_pred - E_dft)/n_atoms, we just add it to each_true
-    each_pred = each_true + e_form_error
-
-    # Debug logging to understand the distribution
-    logger.info("Energy statistics:")
-    logger.info(
-        f"  each_true: min={each_true.min():.4f}, max={each_true.max():.4f}, mean={each_true.mean():.4f}"
-    )
-    logger.info(
-        f"  each_pred: min={each_pred.min():.4f}, max={each_pred.max():.4f}, mean={each_pred.mean():.4f}"
-    )
-
-    # Calculate global prevalence for DAF normalization (matches official leaderboard)
-    # Filter to unique prototypes
-    df_unique = df_wbm.query("unique_prototype")
-    # Calculate prevalence: (stable count) / (total count)
-    # Stability threshold is 0.0
-    stable_count = (df_unique["e_above_hull_mp2020_corrected_ppd_mp"] <= 0).sum()
-    global_prevalence = stable_count / len(df_unique)
-
-    logger.info(
-        f"Using global prevalence for DAF: {global_prevalence:.6f} ({stable_count}/{len(df_unique)})"
-    )
-
-    # Calculate metrics using the injected function
-    # stable_metrics is injected into the script scope
-    metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence)  # noqa: F821
-
-    # Add num_evaluated
-    metrics["num_evaluated"] = len(common_ids)
-
-    return metrics
-
-
-def calculate_metrics_forces(
-    results: Dict[str, Any], config: Dict[str, Any]
-) -> Dict[str, Any]:
-    """Calculate comprehensive S2EFS metrics (Energy, Forces, Stress).
-
-    Returns MAE, RMSE, and R2 for each component.
-    """
-    from io import TextIOWrapper
-    from zipfile import ZipFile
-
-    import numpy as np
-    from ase.io import read
-    from matbench_discovery.data import DataFiles
-    from sklearn.metrics import r2_score
-
-    # We need to load ground truth from the dataset itself because MP trj has E/F/S in the extxyz
-    # This is expensive to re-read. Ideally we should have passed GT in results or loaded it efficiently.
-    # For now, let's re-read the GT for the processed IDs.
-
-    metrics = {
-        "energy_mae": [],
-        "energy_rmse": [],
-        "force_mae": [],
-        "force_rmse": [],
-        "stress_mae": [],
-        "stress_rmse": [],
-    }
-
-    # Collect all predictions and ground truth for R2 calculation
-    all_e_pred, all_e_true = [], []
-    all_f_pred, all_f_true = [], []
-    all_s_pred, all_s_true = [], []
-
-    zip_path = DataFiles.mp_trj_extxyz.path
-
-    with ZipFile(zip_path, "r") as zf:
-        for sid, res in results.items():
-            if "error" in res:
-                continue
-
-            try:
-                with zf.open(sid) as f:
-                    text_stream = TextIOWrapper(f, encoding="utf-8")
-                    atoms_list = read(text_stream, format="extxyz", index=":")
-                    gt_atoms = atoms_list[-1]  # Matching load_dataset logic
-
-                    # Energy (per atom)
-                    e_pred = res["energy"]
-                    e_true = gt_atoms.get_potential_energy()
-                    n_atoms = len(gt_atoms)
-
-                    energy_error = abs(e_pred - e_true) / n_atoms
-                    metrics["energy_mae"].append(energy_error)
-                    metrics["energy_rmse"].append(energy_error**2)
-
-                    all_e_pred.append(e_pred / n_atoms)
-                    all_e_true.append(e_true / n_atoms)
-
-                    # Forces
-                    f_pred = np.array(res["forces"])
-                    f_true = gt_atoms.get_forces()
-                    force_error = np.abs(f_pred - f_true)
-                    metrics["force_mae"].append(force_error.mean())
-                    metrics["force_rmse"].append((force_error**2).mean())
-
-                    all_f_pred.extend(f_pred.flatten())
-                    all_f_true.extend(f_true.flatten())
-
-                    # Stress
-                    s_pred = np.array(res["stress"])
-                    s_true = gt_atoms.get_stress()
-                    stress_error = np.abs(s_pred - s_true)
-                    metrics["stress_mae"].append(stress_error.mean())
-                    metrics["stress_rmse"].append((stress_error**2).mean())
-
-                    all_s_pred.extend(s_pred.flatten())
-                    all_s_true.extend(s_true.flatten())
-
-            except Exception:
-                pass
-
-    # Calculate final metrics
-    result_metrics = {}
-
-    if metrics["energy_mae"]:
-        result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"]))
-        result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"])))
-        result_metrics["energy_r2"] = (
-            float(r2_score(all_e_true, all_e_pred))
-            if len(all_e_true) > 1
-            else float("nan")
-        )
-
-    if metrics["force_mae"]:
-        result_metrics["force_mae"] = float(np.mean(metrics["force_mae"]))
-        result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"])))
-        result_metrics["force_r2"] = (
-            float(r2_score(all_f_true, all_f_pred))
-            if len(all_f_true) > 1
-            else float("nan")
-        )
-
-    if metrics["stress_mae"]:
-        result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"]))
-        result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"])))
-        result_metrics["stress_r2"] = (
-            float(r2_score(all_s_true, all_s_pred))
-            if len(all_s_true) > 1
-            else float("nan")
-        )
-
-    result_metrics["num_evaluated"] = len(metrics["energy_mae"])
-
-    return result_metrics
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index e26045d6..b130912b 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -1,161 +1,794 @@
-"""Matbench Discovery benchmark task implementations."""
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "groundhog-hpc",
+#     "garden-ai",
+#     "ase",
+#     "numpy",
+#     "pandas",
+#     "scikit-learn",
+#     "torch",
+#     "matbench-discovery",
+# ]
+# ///
+"""Matbench Discovery benchmark task implementations using Groundhog HPC."""
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Dict, List
+import concurrent.futures
+import json
+import logging
+import multiprocessing
+import os
+import sys
+import time
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence
 
-from ..utils.remote_execution import run_remote_benchmark
-from ..utils.script_builder import BenchmarkScriptBuilder
-from ..utils.task import BaseBenchmarkTask
+import groundhog_hpc as hog
+import numpy as np
+import pandas as pd
+from sklearn.metrics import r2_score
+
+# Ensure local modules can be imported during local execution
+sys.path.append(os.getcwd())
 
 if TYPE_CHECKING:
-    from . import MatbenchDiscovery
     from .enums import DatasetConfig, DatasetSize
 
-from .metrics import classify_stable, stable_metrics
-
 # ------------------------------------------------------------------------------
-# REMOTE FUNCTIONS
-# These functions are injected into the remote script.
-# They must be self-contained (imports inside or provided by builder).
+# BOILERPLATE: Logging & Device Setup
 # ------------------------------------------------------------------------------
+
+
+def setup_logging():
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] [PID:%(process)d] %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        stream=sys.stdout,
+        force=True,
+    )
+    return logging.getLogger("benchmark_runner")
+
+
+def setup_device(gpu_id: Optional[int] = None) -> str:
+    """Setup compute device for this process."""
+    try:
+        import torch
+
+        if torch.cuda.is_available():
+            return f"cuda:{gpu_id}" if gpu_id is not None else "cuda"
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            return "mps"
+    except ImportError:
+        pass
+    return "cpu"
+
+
+def convert_numpy_types(obj):
+    """Convert numpy types to Python native types for JSON serialization."""
+    import numpy as np
+
+    if isinstance(obj, (np.integer, np.floating)):
+        return obj.item()
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {k: convert_numpy_types(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_numpy_types(item) for item in obj]
+    return obj
+
+
 # ------------------------------------------------------------------------------
-# REMOTE FUNCTIONS
-# These functions are injected into the remote script.
-# They are now imported from remote.py to keep this file clean.
+# METRICS HELPERS (Inlined from metrics.py)
 # ------------------------------------------------------------------------------
-from .remote import (
-    _load_dataset_common,
-    _process_batch_common,
-    calculate_metrics_energy,
-    calculate_metrics_forces,
-    get_material_ids_for_subset,
-    load_dataset_mp_trj,
-    load_dataset_wbm_initial,
-    load_dataset_wbm_relaxed,
-    load_model,
-    process_batch_forces,
-    process_batch_relaxation,
-    process_batch_static,
-)
+
+
+def classify_stable(
+    each_true: Sequence[float] | pd.Series | np.ndarray,
+    each_pred: Sequence[float] | pd.Series | np.ndarray,
+    *,
+    stability_threshold: float = 0.0,
+    fillna: bool = True,
+) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
+    if len(each_true) != len(each_pred):
+        raise ValueError(f"{len(each_true)=} != {len(each_pred)=}")
+
+    each_true_arr, each_pred_arr = pd.Series(each_true), pd.Series(each_pred)
+
+    if stability_threshold is None or np.isnan(stability_threshold):
+        raise ValueError("stability_threshold must be a real number")
+    actual_pos = each_true_arr <= (stability_threshold or 0)
+    actual_neg = each_true_arr > (stability_threshold or 0)
+
+    model_pos = each_pred_arr <= (stability_threshold or 0)
+    model_neg = each_pred_arr > (stability_threshold or 0)
+
+    if fillna:
+        nan_mask = np.isnan(each_pred)
+        model_pos[nan_mask] = False
+        model_neg[nan_mask] = True
+
+        n_pos, n_neg, total = model_pos.sum(), model_neg.sum(), len(each_pred)
+        if n_pos + n_neg != total:
+            raise ValueError(
+                f"after filling NaNs, the sum of positive ({n_pos}) and negative "
+                f"({n_neg}) predictions should add up to {total=}"
+            )
+
+    true_pos = actual_pos & model_pos
+    false_neg = actual_pos & model_neg
+    false_pos = actual_neg & model_pos
+    true_neg = actual_neg & model_neg
+
+    return true_pos, false_neg, false_pos, true_neg
+
+
+def stable_metrics(
+    each_true: Sequence[float] | pd.Series | np.ndarray,
+    each_pred: Sequence[float] | pd.Series | np.ndarray,
+    *,
+    stability_threshold: float = 0.0,
+    fillna: bool = True,
+    prevalence: float | None = None,
+) -> dict[str, float]:
+    n_true_pos, n_false_neg, n_false_pos, n_true_neg = map(
+        sum,
+        classify_stable(
+            each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna
+        ),
+    )
+
+    n_total_pos = n_true_pos + n_false_neg
+    n_total_neg = n_true_neg + n_false_pos
+    if prevalence is None:
+        prevalence = (
+            n_total_pos / (n_total_pos + n_total_neg)
+            if (n_total_pos + n_total_neg) > 0
+            else float("nan")
+        )
+    precision = (
+        n_true_pos / (n_true_pos + n_false_pos)
+        if (n_true_pos + n_false_pos) > 0
+        else float("nan")
+    )
+    recall = n_true_pos / n_total_pos if n_total_pos > 0 else float("nan")
+
+    TPR = recall
+    FPR = n_false_pos / n_total_neg if n_total_neg > 0 else float("nan")
+    TNR = n_true_neg / n_total_neg if n_total_neg > 0 else float("nan")
+    FNR = n_false_neg / n_total_pos if n_total_pos > 0 else float("nan")
+
+    if FPR > 0 and TNR > 0 and FPR + TNR != 1:
+        if abs(FPR + TNR - 1) > 1e-6:
+            raise ValueError(f"{FPR=} {TNR=} don't add up to 1")
+
+    if TPR > 0 and FNR > 0 and TPR + FNR != 1:
+        if abs(TPR + FNR - 1) > 1e-6:
+            raise ValueError(f"{TPR=} {FNR=} don't add up to 1")
+
+    is_nan = np.isnan(each_true) | np.isnan(each_pred)
+    each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan]
+
+    if precision + recall == 0:
+        f1_score = float("nan")
+    else:
+        f1_score = 2 * (precision * recall) / (precision + recall)
+
+    return dict(
+        F1=f1_score,
+        DAF=precision / prevalence if prevalence > 0 else float("nan"),
+        Precision=precision,
+        Recall=recall,
+        Accuracy=(
+            (n_true_pos + n_true_neg) / (n_total_pos + n_total_neg)
+            if (n_total_pos + n_total_neg > 0)
+            else float("nan")
+        ),
+        TPR=TPR,
+        FPR=FPR,
+        TNR=TNR,
+        FNR=FNR,
+        TP=n_true_pos,
+        FP=n_false_pos,
+        TN=n_true_neg,
+        FN=n_false_neg,
+        MAE=np.abs(each_true - each_pred).mean(),
+        RMSE=((each_true - each_pred) ** 2).mean() ** 0.5,
+        R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"),
+    )
+
 
 # ------------------------------------------------------------------------------
-# Task Classes
+# REMOTE HELPERS (Inlined from remote.py)
 # ------------------------------------------------------------------------------
 
+_MODEL_CACHE = None
 
-class MatbenchTask(BaseBenchmarkTask):
-    """Base class for Matbench Discovery tasks."""
 
-    def __init__(
-        self,
-        adapter: "MatbenchDiscovery",
-        repo_url: str,
-        repo_ref: str,
-        model_package: str | None = None,
-        task_name: str = "unknown",
-    ):
-        super().__init__(adapter, repo_url, repo_ref, model_package)
-        self.name = task_name
+def _process_batch_common(
+    batch_id: int,
+    structures: List[Any],
+    model_config: Dict[str, Any],
+    num_threads: int,
+    compute_fn: Callable[[Any, Any], Dict[str, Any]],
+    task_name: str,
+    model_factory: Callable[[str], Any],
+) -> Dict[str, Any]:
+    import logging
+    import os
+    import time
 
-    def calculate_metrics(self, output: Dict[str, Any]) -> Dict[str, Any]:
-        """Retrieve metrics from the remote output."""
-        return output.get("metrics", {})
+    import torch
 
-    def _build_script(
-        self, process_fn, load_dataset_fn, calc_metrics_fn, model_factory
-    ) -> str:
-        """Build the remote execution script with specific functions.
-
-        Args:
-            process_fn: Task-specific process_batch function
-            load_dataset_fn: Task-specific load_dataset function
-            calc_metrics_fn: Task-specific calculate_metrics function
-            model_factory: User-provided function that creates the model
-        """
-        builder = BenchmarkScriptBuilder()
-
-        # Add global model cache
-        builder.add_preamble("_MODEL_CACHE = None")
-
-        # Common imports
-        builder.add_import(
-            "from typing import List, Dict, Any, Tuple, Optional, Callable"
+    os.environ["OMP_NUM_THREADS"] = str(num_threads)
+    torch.set_num_threads(num_threads)
+
+    gpu_id = model_config.get("gpu_id")
+    device = setup_device(gpu_id)
+
+    worker_logger = logging.getLogger(f"worker_{batch_id}")
+    worker_logger.info(
+        f"Started {task_name} on {device} with {len(structures)} structures. Threads: {num_threads}"
+    )
+
+    global _MODEL_CACHE
+    try:
+        if _MODEL_CACHE is None:
+            model = model_factory(device)
+            _MODEL_CACHE = model
+        else:
+            model = _MODEL_CACHE
+    except Exception as e:
+        worker_logger.error(f"Failed to initialize model: {e}")
+        raise RuntimeError(f"Model initialization failed: {e}") from e
+
+    results = {}
+    batch_start = time.time()
+
+    for i, (struct_id, atoms) in enumerate(structures):
+        try:
+            result = compute_fn(model, atoms)
+            results[struct_id] = result
+
+            if (i + 1) % 10 == 0:
+                elapsed = time.time() - batch_start
+                rate = (i + 1) / elapsed if elapsed > 0 else 0
+                worker_logger.info(
+                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
+                )
+
+        except Exception as e:
+            worker_logger.warning(f"Structure {struct_id} failed: {e}")
+            results[struct_id] = {"error": str(e)}
+
+    return results
+
+
+def get_material_ids_for_subset(
+    subset_type: str, seed: int = 42
+) -> Optional[List[str]]:
+    if subset_type == "full":
+        return None
+
+    import pandas as pd
+    from matbench_discovery.data import DataFiles
+
+    df = pd.read_csv(DataFiles.wbm_summary.path)
+
+    if subset_type == "unique_protos":
+        df_filtered = df.query("unique_prototype")
+        return df_filtered["material_id"].tolist()
+
+    elif subset_type == "random_10k":
+        df_filtered = df.query("unique_prototype")
+        df_sampled = df_filtered.sample(n=10000, random_state=seed)
+        return df_sampled["material_id"].tolist()
+
+    elif subset_type == "random_100":
+        df_filtered = df.query("unique_prototype")
+        df_sampled = df_filtered.sample(n=100, random_state=seed)
+        return df_sampled["material_id"].tolist()
+
+    else:
+        raise ValueError(f"Unknown subset_type: {subset_type}")
+
+
+def _load_dataset_common(
+    config: Dict[str, Any],
+    zip_path: str,
+    read_format: str = "extxyz",
+    read_index: str | slice = None,
+) -> List[Any]:
+    from io import TextIOWrapper
+    from zipfile import ZipFile
+
+    from ase.io import read
+
+    dataset_subset = config.get("dataset_subset", "full")
+    dataset_seed = config.get("dataset_seed", 42)
+    mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed)
+
+    structures = []
+
+    with ZipFile(zip_path, "r") as zf:
+        if mat_ids is None:
+            file_list = sorted(
+                zf.namelist(),
+                key=lambda x: int(x.split(".")[0])
+                if x.split(".")[0].isdigit()
+                else float("inf"),
+            )
+            num_structures = config.get("num_structures", 100)
+            if isinstance(num_structures, int):
+                file_list = file_list[:num_structures]
+        else:
+            mat_id_set = set(mat_ids)
+            file_list = [
+                f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set
+            ]
+
+        for filename in file_list:
+            with zf.open(filename) as f:
+                text_stream = TextIOWrapper(f, encoding="utf-8")
+                if read_index is not None:
+                    atoms_list = read(text_stream, format=read_format, index=read_index)
+                    if isinstance(atoms_list, list) and atoms_list:
+                        structures.append((filename, atoms_list[-1]))
+                    elif not isinstance(atoms_list, list):
+                        structures.append((filename, atoms_list))
+                else:
+                    structures.append((filename, read(text_stream, format=read_format)))
+
+    return structures
+
+
+# Task-specific helpers
+def process_batch_relaxation(
+    batch_id: int,
+    structures: List[Any],
+    model_config: Dict[str, Any],
+    num_threads: int,
+    model_factory: Callable[[str], Any],
+) -> Dict[str, Any]:
+    from ase.optimize import FIRE
+
+    def compute(model, atoms):
+        atoms.calc = model
+        opt = FIRE(atoms, logfile=None)
+        opt.run(fmax=0.05, steps=500)
+        energy = atoms.get_potential_energy()
+        return {"energy": energy}
+
+    return _process_batch_common(
+        batch_id,
+        structures,
+        model_config,
+        num_threads,
+        compute,
+        "relaxation",
+        model_factory,
+    )
+
+
+def process_batch_static(
+    batch_id: int,
+    structures: List[Any],
+    model_config: Dict[str, Any],
+    num_threads: int,
+    model_factory: Callable[[str], Any],
+) -> Dict[str, Any]:
+    def compute(model, atoms):
+        atoms.calc = model
+        energy = atoms.get_potential_energy()
+        return {"energy": energy}
+
+    return _process_batch_common(
+        batch_id,
+        structures,
+        model_config,
+        num_threads,
+        compute,
+        "static calculation",
+        model_factory,
+    )
+
+
+def process_batch_forces(
+    batch_id: int,
+    structures: List[Any],
+    model_config: Dict[str, Any],
+    num_threads: int,
+    model_factory: Callable[[str], Any],
+) -> Dict[str, Any]:
+    def compute(model, atoms):
+        atoms.calc = model
+        energy = atoms.get_potential_energy()
+        forces = atoms.get_forces().tolist()
+        stress = atoms.get_stress().tolist()
+        return {"energy": energy, "forces": forces, "stress": stress}
+
+    return _process_batch_common(
+        batch_id,
+        structures,
+        model_config,
+        num_threads,
+        compute,
+        "forces calculation",
+        model_factory,
+    )
+
+
+def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Any]:
+    from matbench_discovery.data import DataFiles
+
+    return _load_dataset_common(config, DataFiles.wbm_initial_atoms.path)
+
+
+def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Any]:
+    from matbench_discovery.data import DataFiles
+
+    return _load_dataset_common(config, DataFiles.wbm_relaxed_atoms.path)
+
+
+def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]:
+    from matbench_discovery.data import DataFiles
+
+    return _load_dataset_common(config, DataFiles.mp_trj_extxyz.path, read_index=":")
+
+
+def calculate_metrics_energy(
+    results: Dict[str, Any], config: Dict[str, Any]
+) -> Dict[str, Any]:
+    import numpy as np
+    from matbench_discovery.data import df_wbm
+
+    if len(results) == 0:
+        return {"error": "No results to evaluate"}
+
+    model_energies = {}
+    for sid, res in results.items():
+        if isinstance(res, dict) and res.get("energy") is not None:
+            mat_id = sid.replace(".extxyz", "")
+            model_energies[mat_id] = res["energy"]
+
+    if not model_energies:
+        return {"error": "No valid energies found in results"}
+
+    df_wbm_indexed = df_wbm.set_index("material_id")
+    common_ids = list(set(model_energies.keys()) & set(df_wbm_indexed.index))
+
+    if not common_ids:
+        return {"error": "No matching IDs between results and ground truth"}
+
+    df_subset = df_wbm_indexed.loc[common_ids]
+    y_pred = np.array([model_energies[mid] for mid in common_ids])
+    y_true = df_subset["uncorrected_energy"].values
+    n_atoms = df_subset["n_sites"].values
+
+    e_form_error = (y_pred - y_true) / n_atoms
+    each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values
+    each_pred = each_true + e_form_error
+
+    df_unique = df_wbm.query("unique_prototype")
+    stable_count = (df_unique["e_above_hull_mp2020_corrected_ppd_mp"] <= 0).sum()
+    global_prevalence = stable_count / len(df_unique)
+
+    metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence)
+    metrics["num_evaluated"] = len(common_ids)
+    return metrics
+
+
+def calculate_metrics_forces(
+    results: Dict[str, Any], config: Dict[str, Any]
+) -> Dict[str, Any]:
+    from io import TextIOWrapper
+    from zipfile import ZipFile
+
+    import numpy as np
+    from ase.io import read
+    from matbench_discovery.data import DataFiles
+    from sklearn.metrics import r2_score
+
+    metrics = {
+        "energy_mae": [],
+        "energy_rmse": [],
+        "force_mae": [],
+        "force_rmse": [],
+        "stress_mae": [],
+        "stress_rmse": [],
+    }
+    all_e_pred, all_e_true = [], []
+    all_f_pred, all_f_true = [], []
+    all_s_pred, all_s_true = [], []
+
+    zip_path = DataFiles.mp_trj_extxyz.path
+
+    with ZipFile(zip_path, "r") as zf:
+        for sid, res in results.items():
+            if "error" in res:
+                continue
+            try:
+                with zf.open(sid) as f:
+                    text_stream = TextIOWrapper(f, encoding="utf-8")
+                    atoms_list = read(text_stream, format="extxyz", index=":")
+                    gt_atoms = atoms_list[-1]
+
+                    e_pred = res["energy"]
+                    e_true = gt_atoms.get_potential_energy()
+                    n_atoms = len(gt_atoms)
+                    energy_error = abs(e_pred - e_true) / n_atoms
+                    metrics["energy_mae"].append(energy_error)
+                    metrics["energy_rmse"].append(energy_error**2)
+                    all_e_pred.append(e_pred / n_atoms)
+                    all_e_true.append(e_true / n_atoms)
+
+                    f_pred = np.array(res["forces"])
+                    f_true = gt_atoms.get_forces()
+                    force_error = np.abs(f_pred - f_true)
+                    metrics["force_mae"].append(force_error.mean())
+                    metrics["force_rmse"].append((force_error**2).mean())
+                    all_f_pred.extend(f_pred.flatten())
+                    all_f_true.extend(f_true.flatten())
+
+                    s_pred = np.array(res["stress"])
+                    s_true = gt_atoms.get_stress()
+                    stress_error = np.abs(s_pred - s_true)
+                    metrics["stress_mae"].append(stress_error.mean())
+                    metrics["stress_rmse"].append((stress_error**2).mean())
+                    all_s_pred.extend(s_pred.flatten())
+                    all_s_true.extend(s_true.flatten())
+
+            except Exception:
+                pass
+
+    result_metrics = {}
+    if metrics["energy_mae"]:
+        result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"]))
+        result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"])))
+        result_metrics["energy_r2"] = (
+            float(r2_score(all_e_true, all_e_pred))
+            if len(all_e_true) > 1
+            else float("nan")
+        )
+
+    if metrics["force_mae"]:
+        result_metrics["force_mae"] = float(np.mean(metrics["force_mae"]))
+        result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"])))
+        result_metrics["force_r2"] = (
+            float(r2_score(all_f_true, all_f_pred))
+            if len(all_f_true) > 1
+            else float("nan")
         )
-        builder.add_import("import torch")
-        builder.add_import("from ase.optimize import FIRE")
-        builder.add_import("from ase.io import read")
-        builder.add_import("from matbench_discovery.data import DataFiles")
-        builder.add_import("from zipfile import ZipFile")
-        builder.add_import("from io import TextIOWrapper")
-        builder.add_import("import pandas as pd")
-        builder.add_import("import numpy as np")
-        builder.add_import("from collections.abc import Sequence")
-        builder.add_import("from sklearn.metrics import r2_score")
 
-        # Add user's model factory (renamed to load_model_user so load_model can call it)
-        builder.add_function(model_factory, name="load_model_user")
+    if metrics["stress_mae"]:
+        result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"]))
+        result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"])))
+        result_metrics["stress_r2"] = (
+            float(r2_score(all_s_true, all_s_pred))
+            if len(all_s_true) > 1
+            else float("nan")
+        )
+
+    result_metrics["num_evaluated"] = len(metrics["energy_mae"])
+    return result_metrics
+
+
+# ------------------------------------------------------------------------------
+# MAIN RUNNER (Inlined from runners.py)
+# ------------------------------------------------------------------------------
 
-        # Add our load_model wrapper that calls load_model_user
-        builder.add_function(load_model)
 
-        # Add helper function for dataset subset filtering
-        builder.add_function(get_material_ids_for_subset)
+def run_benchmark_hog(
+    config: Dict[str, Any],
+    model_factory: Any,
+    load_dataset_fn: Any,
+    process_fn: Any,
+    calc_metrics_fn: Any,
+) -> Dict[str, Any]:
+    logger = setup_logging()
+    logger.info("Starting benchmark runner...")
+
+    checkpoint_path = config.get("checkpoint_path")
+    results = {}
+
+    if checkpoint_path and os.path.exists(checkpoint_path):
+        logger.info(f"Loading checkpoint from {checkpoint_path}")
+        try:
+            with open(checkpoint_path) as f:
+                results = json.load(f)
+            logger.info(f"Found {len(results)} processed items in checkpoint")
+        except Exception as e:
+            logger.warning(f"Failed to load checkpoint: {e}. Starting fresh.")
+
+    try:
+        all_items = load_dataset_fn(config)
+        logger.info(f"Loaded {len(all_items)} total items")
+    except Exception as e:
+        logger.error(f"Failed to load dataset: {e}")
+        import traceback
+
+        traceback.print_exc()
+        raise
+
+    items_to_process = [
+        (item_id, item) for item_id, item in all_items if str(item_id) not in results
+    ]
+
+    if not items_to_process:
+        logger.info("All items already processed!")
+        return {"results": results, "metrics": {}}
+
+    logger.info(f"Processing {len(items_to_process)} remaining items")
+
+    import random
+
+    random.seed(42)
+    random.shuffle(items_to_process)
+
+    try:
+        import torch
+
+        num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    except ImportError:
+        num_gpus = 0
+
+    use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
+    total_cores = os.cpu_count() or 1
+    num_workers = num_gpus if use_multi_gpu else 1
+    available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores
+    threads_per_worker = max(1, available_cores // num_workers)
+
+    logger.info(
+        f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)"
+    )
+
+    start_time = time.time()
+    chunk_size = 1000 * num_workers
+    chunks = [
+        items_to_process[i : i + chunk_size]
+        for i in range(0, len(items_to_process), chunk_size)
+    ]
+
+    ctx = multiprocessing.get_context("spawn")
+
+    with concurrent.futures.ProcessPoolExecutor(
+        max_workers=num_workers, mp_context=ctx
+    ) as executor:
+        for chunk_idx, chunk in enumerate(chunks):
+            chunk_start = time.time()
+            logger.info(
+                f"Starting chunk {chunk_idx + 1}/{len(chunks)} ({len(chunk)} items)"
+            )
 
-        # Add common helpers
-        builder.add_function(_process_batch_common)
-        builder.add_function(_load_dataset_common)
+            futures = []
+            batch_size = (len(chunk) + num_workers - 1) // num_workers
+
+            for i in range(num_workers):
+                start = i * batch_size
+                end = min((i + 1) * batch_size, len(chunk))
+                if start < end:
+                    batch = chunk[start:end]
+                    worker_config = config.copy()
+                    worker_config["gpu_id"] = i if use_multi_gpu else None
+                    futures.append(
+                        executor.submit(
+                            process_fn,
+                            i,
+                            batch,
+                            worker_config,
+                            threads_per_worker,
+                            model_factory,
+                        )
+                    )
+
+            chunk_results = {}
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    batch_res = future.result()
+                    chunk_results.update(batch_res)
+                except Exception as e:
+                    logger.error(f"Worker failed in chunk {chunk_idx}: {e}")
+                    raise RuntimeError(
+                        "Aborting benchmark due to worker failure"
+                    ) from e
+
+            results.update(chunk_results)
+
+            if checkpoint_path:
+                try:
+                    tmp_path = checkpoint_path + ".tmp"
+                    with open(tmp_path, "w") as f:
+                        clean_results = convert_numpy_types(results)
+                        json.dump(clean_results, f, indent=2)
+                    os.replace(tmp_path, checkpoint_path)
+                    logger.info(f"Checkpoint saved to {checkpoint_path}")
+                except Exception as e:
+                    logger.error(f"Failed to save checkpoint: {e}")
+
+            elapsed = time.time() - chunk_start
+            logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s")
+
+    total_elapsed = time.time() - start_time
+    logger.info(f"Benchmark complete in {total_elapsed:.1f}s.")
+
+    logger.info("Calculating metrics...")
+    try:
+        metrics = calc_metrics_fn(results, config)
+        logger.info(f"Metrics calculated: {metrics}")
+    except Exception as e:
+        logger.error(f"Failed to calculate metrics: {e}")
+        import traceback
+
+        traceback.print_exc()
+        metrics = {"error": f"Metrics calculation failed: {e}"}
+
+    output = {"results": results, "metrics": metrics}
+    output = convert_numpy_types(output)
+    return output
 
-        # Add task-specific functions with standard names expected by runner
-        builder.add_function(process_fn, name="process_batch")
-        builder.add_function(load_dataset_fn, name="load_dataset")
-        builder.add_function(calc_metrics_fn, name="calculate_metrics_remote")
 
-        # Inject metrics helper functions
-        builder.add_function(classify_stable)
-        builder.add_function(stable_metrics)
+# ------------------------------------------------------------------------------
+# CLASS DEFINITION
+# ------------------------------------------------------------------------------
 
-        return builder.build()
 
+class MatbenchDiscovery:
+    """Matbench Discovery tasks using Groundhog HPC."""
+
+    REPO_URL = "https://github.com/janosh/matbench-discovery"
+    REPO_REF = "main"
+
+    @staticmethod
     def _prepare_runner_config(
-        self, num_structures: int | "DatasetSize" | "DatasetConfig"
+        num_structures: int | "DatasetSize" | "DatasetConfig",
+        repo_url: str = REPO_URL,
+        repo_ref: str = REPO_REF,
     ) -> Dict[str, Any]:
         """Prepare the runner configuration based on num_structures."""
-        from .enums import DatasetConfig, DatasetSize
-
-        if isinstance(num_structures, DatasetSize):
+        # Need to handle DatasetSize/Config which might be passed as objects or values
+        # Since we are in the script, we might not have the enums imported if they are not in this file.
+        # But the user passes them.
+        # If they are passed as arguments, they are serialized.
+        # We need to extract value.
+
+        # Simple heuristic: if it has 'value' attr, use it.
+        subset = "full"
+        seed = 42
+
+        if hasattr(num_structures, "value"):  # Enum
+            subset = num_structures.value
+            # Check for seed method/attr if it's our custom Config
+            if hasattr(num_structures, "seed"):
+                if callable(num_structures.seed):
+                    pass  # It's the method
+                else:
+                    seed = num_structures.seed
+        elif hasattr(num_structures, "subset"):  # DatasetConfig
+            subset = num_structures.subset.value
+            seed = num_structures.seed
+        elif isinstance(num_structures, int):
+            subset = "full"
+            # We handle int as limit in load_dataset
             return {
-                "repo_url": self.repo_url,
-                "repo_ref": self.repo_ref,
-                "dataset_subset": num_structures.value,
-                "dataset_seed": 42,
-            }
-        elif isinstance(num_structures, DatasetConfig):
-            return {
-                "repo_url": self.repo_url,
-                "repo_ref": self.repo_ref,
-                "dataset_subset": num_structures.subset.value,
-                "dataset_seed": num_structures.seed,
-            }
-        else:
-            return {
-                "repo_url": self.repo_url,
-                "repo_ref": self.repo_ref,
+                "repo_url": repo_url,
+                "repo_ref": repo_ref,
                 "num_structures": num_structures,
                 "dataset_subset": "full",
             }
 
-    def _prepare_dependencies(self, model_packages: str | List[str]) -> List[str]:
-        """Prepare the list of dependencies."""
-        packages = (
-            [model_packages] if isinstance(model_packages, str) else model_packages
-        )
-        return ["matbench-discovery>=1.3.0"] + packages
+        return {
+            "repo_url": repo_url,
+            "repo_ref": repo_ref,
+            "dataset_subset": subset,
+            "dataset_seed": seed,
+        }
 
+    @staticmethod
     def _generate_checkpoint_name(
-        self, model_packages: str | List[str], runner_config: Dict[str, Any]
+        model_packages: str | List[str], runner_config: Dict[str, Any]
     ) -> str:
-        """Generate a unique checkpoint name."""
         import time
         import uuid
 
@@ -173,42 +806,30 @@ def _generate_checkpoint_name(
         short_uuid = str(uuid.uuid4())[:8]
         return f"matbench_{model_str}_{subset_str}_{timestamp}_{short_uuid}.json"
 
-    def submit(
-        self,
-        model_factory: callable,
+    @staticmethod
+    def _run_task(
+        model_factory: Any,
         model_packages: str | List[str],
-        num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
-        checkpoint_name: str | None = None,
-        checkpoint_path: str | None = None,
-    ):
-        """Submit benchmark job to remote executor.
-
-        Args:
-            model_factory: User-provided function that takes device and returns an ASE calculator.
-                          Example: lambda device: mace_mp(model="medium", device=device)
-            model_packages: Python package(s) to install. Can be a single package string
-                          (e.g., "mace-torch") or a list (e.g., ["torch", "mace-torch"])
-            num_structures: Number of structures to evaluate, or DatasetSize enum, or DatasetConfig
-                          (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10))
-            checkpoint_name: Optional name for the checkpoint file (e.g. "my_checkpoint.json").
-                             If not provided, one will be generated.
-            checkpoint_path: Optional path to an existing checkpoint file to resume from.
-                             If provided, checkpoint_name is ignored and no new checkpoint is created.
-        """
-        # Build script with task-specific functions AND user's factory
-        script_content = self._build_script(
-            self.process_fn,
-            self.load_dataset_fn,
-            self.calc_metrics_fn,
-            model_factory,  # Inject user's factory function
-        )
+        num_structures: int | "DatasetSize" | "DatasetConfig",
+        checkpoint_name: str | None,
+        checkpoint_path: str | None,
+        process_fn: Any,
+        load_dataset_fn: Any,
+        calc_metrics_fn: Any,
+        sys_path: List[str] | None = None,
+    ) -> Dict[str, Any]:
+        # Add custom sys.path if provided (useful for local execution/testing)
+        if sys_path:
+            import sys
 
-        dependencies = self._prepare_dependencies(model_packages)
-        runner_config = self._prepare_runner_config(num_structures)
+            for p in sys_path:
+                if p not in sys.path:
+                    sys.path.append(p)
+
+        runner_config = MatbenchDiscovery._prepare_runner_config(num_structures)
 
-        # Generate checkpoint name if not provided AND no checkpoint_path is provided
         if not checkpoint_name and not checkpoint_path:
-            checkpoint_name = self._generate_checkpoint_name(
+            checkpoint_name = MatbenchDiscovery._generate_checkpoint_name(
                 model_packages, runner_config
             )
 
@@ -219,154 +840,124 @@ def submit(
             print(
                 f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}"
             )
-            final_checkpoint_path = f"~/.garden/benchmarks/{checkpoint_name}"
-
-        executor = self.adapter._get_executor()
-        future = executor.submit(
-            run_remote_benchmark,
-            script_content=script_content,
-            dependencies=dependencies,
-            config=runner_config,
-            checkpoint_name=checkpoint_name,
-            checkpoint_path=checkpoint_path,
-        )
+            final_checkpoint_path = os.path.expanduser(
+                f"~/.garden/benchmarks/{checkpoint_name}"
+            )
+            # Ensure directory exists
+            os.makedirs(os.path.dirname(final_checkpoint_path), exist_ok=True)
 
-        # Attach checkpoint path to future for programmatic access
-        future.checkpoint_path = final_checkpoint_path
+        runner_config["checkpoint_path"] = final_checkpoint_path
 
-        return future
+        return run_benchmark_hog(
+            runner_config,
+            model_factory,
+            load_dataset_fn,
+            process_fn,
+            calc_metrics_fn,
+        )
 
-    def local(
-        self,
-        model_factory: callable,
+    @hog.method()
+    def IS2RE(
+        model_factory: Any,
         model_packages: str | List[str],
         num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
+        checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
-    ) -> dict:
-        """Run benchmark locally.
-
-        Args:
-            model_factory: User-provided function that takes device and returns an ASE calculator
-            model_packages: Python package(s) to install. Can be a single package string
-                          (e.g., "mace-torch") or a list (e.g., ["torch", "mace-torch"])
-            num_structures: Number of structures to evaluate, or DatasetSize enum, or DatasetConfig
-                          (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10))
-            checkpoint_path: Optional path to resume from checkpoint
-        """
-        from ..utils.remote_execution import run_remote_benchmark
-
-        # Build script with task-specific functions AND user's factory
-        script_content = self._build_script(
-            self.process_fn, self.load_dataset_fn, self.calc_metrics_fn, model_factory
+        sys_path: List[str] | None = None,
+    ) -> Dict[str, Any]:
+        """Initial Structure to Relaxed Energy."""
+        return MatbenchDiscovery._run_task(
+            model_factory,
+            model_packages,
+            num_structures,
+            checkpoint_name,
+            checkpoint_path,
+            process_batch_relaxation,
+            load_dataset_wbm_initial,
+            calculate_metrics_energy,
+            sys_path=sys_path,
         )
 
-        dependencies = self._prepare_dependencies(model_packages)
-        runner_config = self._prepare_runner_config(num_structures)
-
-        # Run locally (no Globus Compute)
-        return run_remote_benchmark(
-            script_content=script_content,
-            dependencies=dependencies,
-            config=runner_config,
-            checkpoint_path=checkpoint_path,
+    @hog.method()
+    def RS2RE(
+        model_factory: Any,
+        model_packages: str | List[str],
+        num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
+        checkpoint_name: str | None = None,
+        checkpoint_path: str | None = None,
+        sys_path: List[str] | None = None,
+    ) -> Dict[str, Any]:
+        """Relaxed Structure to Relaxed Energy."""
+        return MatbenchDiscovery._run_task(
+            model_factory,
+            model_packages,
+            num_structures,
+            checkpoint_name,
+            checkpoint_path,
+            process_batch_static,
+            load_dataset_wbm_relaxed,
+            calculate_metrics_energy,
+            sys_path=sys_path,
         )
 
+    @hog.method()
+    def S2EFS(
+        model_factory: Any,
+        model_packages: str | List[str],
+        num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
+        checkpoint_name: str | None = None,
+        checkpoint_path: str | None = None,
+        sys_path: List[str] | None = None,
+    ) -> Dict[str, Any]:
+        """Structure to Energy, Forces, Stress."""
+        return MatbenchDiscovery._run_task(
+            model_factory,
+            model_packages,
+            num_structures,
+            checkpoint_name,
+            checkpoint_path,
+            process_batch_forces,
+            load_dataset_mp_trj,
+            calculate_metrics_forces,
+            sys_path=sys_path,
+        )
 
-class IS2RETask(MatbenchTask):
-    """Initial Structure to Relaxed Energy."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="IS2RE", **kwargs)
-        self.process_fn = process_batch_relaxation
-        self.load_dataset_fn = load_dataset_wbm_initial
-        self.calc_metrics_fn = calculate_metrics_energy
-
-
-class RS2RETask(MatbenchTask):
-    """Relaxed Structure to Relaxed Energy."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="RS2RE", **kwargs)
-        self.process_fn = process_batch_static
-        self.load_dataset_fn = load_dataset_wbm_relaxed
-        self.calc_metrics_fn = calculate_metrics_energy
-
-
-class S2EFSTask(MatbenchTask):
-    """Structure to Energy, Forces, Stress."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="S2EFS", **kwargs)
-        self.process_fn = process_batch_forces
-        self.load_dataset_fn = load_dataset_mp_trj
-        self.calc_metrics_fn = calculate_metrics_forces
-
-
-class S2EFTask(MatbenchTask):
-    """Structure to Energy, Force."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="S2EF", **kwargs)
-        self.process_fn = process_batch_forces
-        self.load_dataset_fn = load_dataset_mp_trj
-        self.calc_metrics_fn = calculate_metrics_forces
-
-
-class S2EFSMTask(MatbenchTask):
-    """Structure to Energy, Force, Stress, Magmoms."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="S2EFSM", **kwargs)
-        self.process_fn = process_batch_forces
-        self.load_dataset_fn = load_dataset_mp_trj
-        self.calc_metrics_fn = calculate_metrics_forces
-
-
-class IS2ETask(MatbenchTask):
-    """Initial Structure to Energy."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="IS2E", **kwargs)
-        self.process_fn = process_batch_static
-        self.load_dataset_fn = load_dataset_wbm_initial
-        self.calc_metrics_fn = calculate_metrics_energy
-
-
-class S2ETask(MatbenchTask):
-    """Structure to Energy."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="S2E", **kwargs)
-        self.process_fn = process_batch_static
-        self.load_dataset_fn = load_dataset_wbm_relaxed
-        self.calc_metrics_fn = calculate_metrics_energy
-
-
-class S2RETask(MatbenchTask):
-    """Structure to Relaxed Energy."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="S2RE", **kwargs)
-        self.process_fn = process_batch_relaxation
-        self.load_dataset_fn = load_dataset_wbm_initial
-        self.calc_metrics_fn = calculate_metrics_energy
-
-
-class RP2RETask(MatbenchTask):
-    """Relaxed Prototype to Relaxed Energy."""
+    # Aliases
+    @hog.method()
+    def S2EF(*args, **kwargs):
+        return MatbenchDiscovery.S2EFS(*args, **kwargs)
+
+    @hog.method()
+    def S2EFSM(*args, **kwargs):
+        return MatbenchDiscovery.S2EFS(*args, **kwargs)
+
+    @hog.method()
+    def IS2E(*args, **kwargs):
+        # Same as IS2RE but static? No, IS2E is Initial Structure to Energy (Static).
+        # IS2RE is Relaxation.
+        # IS2E logic:
+        return MatbenchDiscovery._run_task(
+            *args,
+            **kwargs,
+            process_fn=process_batch_static,
+            load_dataset_fn=load_dataset_wbm_initial,
+            calc_metrics_fn=calculate_metrics_energy,
+        )
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="RP2RE", **kwargs)
-        self.process_fn = process_batch_relaxation
-        self.load_dataset_fn = load_dataset_wbm_initial  # Placeholder
-        self.calc_metrics_fn = calculate_metrics_energy
+    @hog.method()
+    def S2E(*args, **kwargs):
+        # Structure to Energy (Relaxed Structure to Energy) -> RS2RE
+        return MatbenchDiscovery.RS2RE(*args, **kwargs)
 
+    @hog.method()
+    def S2RE(*args, **kwargs):
+        # Structure to Relaxed Energy -> IS2RE
+        return MatbenchDiscovery.IS2RE(*args, **kwargs)
 
-class IP2ETask(MatbenchTask):
-    """Initial Prototype to Energy."""
+    @hog.method()
+    def RP2RE(*args, **kwargs):
+        return MatbenchDiscovery.IS2RE(*args, **kwargs)
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, task_name="IP2E", **kwargs)
-        self.process_fn = process_batch_static
-        self.load_dataset_fn = load_dataset_wbm_initial  # Placeholder
-        self.calc_metrics_fn = calculate_metrics_energy
+    @hog.method()
+    def IP2E(*args, **kwargs):
+        return MatbenchDiscovery.IS2E(*args, **kwargs)
diff --git a/garden_ai/benchmarks/templates/base_runner.py b/garden_ai/benchmarks/templates/base_runner.py
deleted file mode 100644
index 60ed80d6..00000000
--- a/garden_ai/benchmarks/templates/base_runner.py
+++ /dev/null
@@ -1,248 +0,0 @@
-import concurrent.futures
-import json
-import logging
-import multiprocessing
-import os
-import sys
-import time
-from typing import Optional
-
-# ------------------------------------------------------------------------------
-# BOILERPLATE: Logging & Device Setup
-# ------------------------------------------------------------------------------
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] [PID:%(process)d] %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-    stream=sys.stdout,
-    force=True,
-)
-logger = logging.getLogger("benchmark_runner")
-
-
-def setup_device(gpu_id: Optional[int] = None) -> str:
-    """Setup compute device for this process."""
-    try:
-        import torch
-
-        if torch.cuda.is_available():
-            return f"cuda:{gpu_id}" if gpu_id is not None else "cuda"
-        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            return "mps"
-    except ImportError:
-        pass
-    return "cpu"
-
-
-def convert_numpy_types(obj):
-    """Convert numpy types to Python native types for JSON serialization."""
-    import numpy as np
-
-    if isinstance(obj, (np.integer, np.floating)):
-        return obj.item()
-    elif isinstance(obj, np.ndarray):
-        return obj.tolist()
-    elif isinstance(obj, dict):
-        return {k: convert_numpy_types(v) for k, v in obj.items()}
-    elif isinstance(obj, list):
-        return [convert_numpy_types(item) for item in obj]
-    return obj
-
-
-# ------------------------------------------------------------------------------
-# USER DEFINED FUNCTIONS (Injected)
-# ------------------------------------------------------------------------------
-# - load_model(config, device)
-# - process_batch(batch_id, batch_data, model_config, num_threads)
-# - load_dataset(config) -> List[Any]
-# ------------------------------------------------------------------------------
-
-# ------------------------------------------------------------------------------
-# MAIN EXECUTION LOOP
-# ------------------------------------------------------------------------------
-
-
-def main():
-    if len(sys.argv) != 2:
-        sys.exit("Usage: python benchmark_runner.py <config_file>")
-
-    with open(sys.argv[1]) as f:
-        config = json.load(f)
-
-    logger.info("Starting benchmark runner...")
-
-    checkpoint_path = config.get("checkpoint_path")
-    results = {}
-
-    # Load existing checkpoint if available
-    if checkpoint_path and os.path.exists(checkpoint_path):
-        logger.info(f"Loading checkpoint from {checkpoint_path}")
-        try:
-            with open(checkpoint_path) as f:
-                results = json.load(f)
-            logger.info(f"Found {len(results)} processed items in checkpoint")
-        except Exception as e:
-            logger.warning(f"Failed to load checkpoint: {e}. Starting fresh.")
-
-    # Load Dataset
-    try:
-        all_items = load_dataset(config)  # noqa: F821
-        logger.info(f"Loaded {len(all_items)} total items")
-    except Exception as e:
-        logger.error(f"Failed to load dataset: {e}")
-        import traceback
-
-        traceback.print_exc()
-        sys.exit(1)
-
-    # Filter out already processed items
-    # Assuming items are (id, data) tuples
-    items_to_process = [
-        (item_id, item) for item_id, item in all_items if str(item_id) not in results
-    ]
-
-    if not items_to_process:
-        logger.info("All items already processed!")
-        with open("results.json", "w") as f:
-            json.dump(results, f, indent=2)
-        return
-
-    logger.info(f"Processing {len(items_to_process)} remaining items")
-
-    # Shuffle for load balancing
-    import random
-
-    random.seed(42)
-    random.shuffle(items_to_process)
-
-    # Resource detection
-    try:
-        import torch
-
-        num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
-    except ImportError:
-        num_gpus = 0
-
-    use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
-
-    total_cores = os.cpu_count() or 1
-    num_workers = num_gpus if use_multi_gpu else 1
-    # Reserve some cores for system/overhead if possible
-    available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores
-    threads_per_worker = max(1, available_cores // num_workers)
-
-    logger.info(
-        f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)"
-    )
-
-    start_time = time.time()
-
-    # Chunk items into smaller batches to allow frequent checkpointing
-    chunk_size = 1000 * num_workers
-    chunks = [
-        items_to_process[i : i + chunk_size]
-        for i in range(0, len(items_to_process), chunk_size)
-    ]
-
-    logger.info(f"Split into {len(chunks)} chunks for processing")
-
-    ctx = multiprocessing.get_context("spawn")
-
-    with concurrent.futures.ProcessPoolExecutor(
-        max_workers=num_workers, mp_context=ctx
-    ) as executor:
-        for chunk_idx, chunk in enumerate(chunks):
-            chunk_start = time.time()
-            logger.info(
-                f"Starting chunk {chunk_idx + 1}/{len(chunks)} ({len(chunk)} items)"
-            )
-
-            # Split chunk among workers
-            futures = []
-            batch_size = (len(chunk) + num_workers - 1) // num_workers
-
-            for i in range(num_workers):
-                start = i * batch_size
-                end = min((i + 1) * batch_size, len(chunk))
-                if start < end:
-                    batch = chunk[start:end]
-
-                    # Inject worker specific config
-                    worker_config = config.copy()
-                    worker_config["gpu_id"] = i if use_multi_gpu else None
-
-                    futures.append(
-                        executor.submit(
-                            process_batch,  # noqa: F821
-                            i,
-                            batch,
-                            worker_config,
-                            threads_per_worker,
-                        )
-                    )
-
-            # Collect results for this chunk
-            chunk_results = {}
-            for future in concurrent.futures.as_completed(futures):
-                try:
-                    batch_res = future.result()
-                    chunk_results.update(batch_res)
-                except Exception as e:
-                    logger.error(f"Worker failed in chunk {chunk_idx}: {e}")
-                    import traceback
-
-                    traceback.print_exc()
-                    # Critical failure - abort benchmark immediately
-                    logger.error("Aborting benchmark due to worker failure")
-                    sys.exit(1)
-
-            # Update main results and save checkpoint
-            results.update(chunk_results)
-
-            if checkpoint_path:
-                try:
-                    tmp_path = checkpoint_path + ".tmp"
-                    with open(tmp_path, "w") as f:
-                        # Convert numpy types before saving checkpoint
-                        clean_results = convert_numpy_types(results)
-                        json.dump(clean_results, f, indent=2)
-                    os.replace(tmp_path, checkpoint_path)
-                    logger.info(f"Checkpoint saved to {checkpoint_path}")
-                except Exception as e:
-                    logger.error(f"Failed to save checkpoint: {e}")
-
-            elapsed = time.time() - chunk_start
-            logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s")
-
-    total_elapsed = time.time() - start_time
-    logger.info(f"Benchmark complete in {total_elapsed:.1f}s.")
-
-    # Calculate metrics from results
-    logger.info("Calculating metrics...")
-    try:
-        metrics = calculate_metrics_remote(results, config)  # noqa: F821
-        logger.info(f"Metrics calculated: {metrics}")
-    except Exception as e:
-        logger.error(f"Failed to calculate metrics: {e}")
-        import traceback
-
-        traceback.print_exc()
-        metrics = {"error": f"Metrics calculation failed: {e}"}
-
-    # Write both results and metrics
-    output = {"results": results, "metrics": metrics}
-
-    # Custom JSON encoder to handle numpy types
-    # convert_numpy_types moved to global scope
-
-    # Convert numpy types before serialization
-    output = convert_numpy_types(output)
-
-    with open("results.json", "w") as f:
-        json.dump(output, f, indent=2)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/garden_ai/benchmarks/utils/script_builder.py b/garden_ai/benchmarks/utils/script_builder.py
index 7987bdc7..9613923c 100644
--- a/garden_ai/benchmarks/utils/script_builder.py
+++ b/garden_ai/benchmarks/utils/script_builder.py
@@ -17,6 +17,8 @@ def __init__(self, template_path: str | Path = None):
         self.imports = set()
         self.functions = []
         self.preamble = []
+        self.pep723_dependencies = []
+        self.pep723_requires_python = None
 
     def add_import(self, import_stmt: str):
         """Add an import statement (e.g. 'import numpy as np')."""
@@ -28,6 +30,14 @@ def add_preamble(self, code: str):
         self.preamble.append(code)
         return self
 
+    def add_pep723_metadata(
+        self, dependencies: list[str], requires_python: str = ">=3.10"
+    ):
+        """Add PEP 723 script metadata."""
+        self.pep723_dependencies.extend(dependencies)
+        self.pep723_requires_python = requires_python
+        return self
+
     def add_function(self, func: Callable, name: str = None):
         """Add a function definition to the script.
 
@@ -72,7 +82,18 @@ def build(self) -> str:
         # Let's just put imports at the top, then functions, then the template content.
         # But we need to be careful about imports in the template.
 
-        final_script = f"""
+        # Construct PEP 723 block
+        pep723_block = ""
+        if self.pep723_dependencies or self.pep723_requires_python:
+            pep723_block = "# /// script\n"
+            if self.pep723_requires_python:
+                pep723_block += f'# requires-python = "{self.pep723_requires_python}"\n'
+            if self.pep723_dependencies:
+                deps_list = '",\n#     "'.join(self.pep723_dependencies)
+                pep723_block += f'# dependencies = [\n#     "{deps_list}",\n# ]\n'
+            pep723_block += "# ///\n"
+
+        final_script = f"""{pep723_block}
 # ------------------------------------------------------------------------------
 # INJECTED IMPORTS
 # ------------------------------------------------------------------------------

From a9bc8cf3110d973bc6008e24c4d5a08ab1ff39b0 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Fri, 5 Dec 2025 11:18:46 -0700
Subject: [PATCH 06/23] update examples scripts, tweak task setup

---
 .../examples/matbench_equiformerv2.py         | 134 +++++------
 .../examples/matbench_mace_multi_gpu.py       |  30 ++-
 .../examples/matbench_mattersim.py            | 103 ++++----
 .../examples/matbench_sevennet.py             | 109 ++++-----
 .../examples/run_random_10k_benchmark.py      | 167 ++++++-------
 .../benchmarks/matbench_discovery/tasks.py    | 226 ++++++++++++++++--
 6 files changed, 449 insertions(+), 320 deletions(-)

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
index ec3afe91..7855f825 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
@@ -19,16 +19,12 @@
 # Globus Compute endpoint
 ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
-# HPC endpoint configuration
-ENDPOINT_CONFIG = {
-    "account": "cis250461-gpu",
-    "partition": "gpu-debug",
-    "qos": "gpu",
-    "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8",
-}
+
+# =============================================================================
+# Model Factory
+# =============================================================================
 
 
-# Model factory function for EquiformerV2
 def create_equiformerv2_model(device):
     """Create EquiformerV2 model calculator.
 
@@ -46,10 +42,6 @@ def create_equiformerv2_model(device):
     )
 
 
-# Benchmark parameters
-NUM_STRUCTURES = 1000
-USE_MULTI_GPU = True
-
 # =============================================================================
 # Run Benchmark
 # =============================================================================
@@ -59,70 +51,66 @@ def main():
     """Run Matbench Discovery S2EFS benchmark with EquiformerV2."""
 
     print("=" * 80)
-    print("Matbench Discovery S2EFS Benchmark")
-    print("=" * 80)
-    print(f"Endpoint:   {ENDPOINT_ID}")
-    print("Model:      EquiformerV2-31M")
-    print("Task:       S2EFS (Structure to Energy, Forces, Stress)")
-    print(f"Structures: {NUM_STRUCTURES}")
-    print(f"Resources:  {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}")
+    print("Matbench Discovery S2EFS Benchmark - EquiformerV2")
     print("=" * 80)
+
+    # Run S2EFS task using the new groundhog API
+    # S2EFS is suitable for EquiformerV2 which doesn't support relaxation
+    output = MatbenchDiscovery.S2EFS.remote(
+        endpoint=ENDPOINT_ID,
+        user_endpoint_config={
+            "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n",
+            "walltime": 7200,  # 2 hours in seconds
+            "qos": "gpu",
+            "partition": "gpu-debug",
+            "account": "cis250461-gpu",
+            "cores_per_node": 16,
+            "mem_per_node": 32,
+            "requirements": "",
+        },
+        model_factory=create_equiformerv2_model,
+        model_packages="fairchem-core",
+        num_structures="random_100",
+    )
+
+    # Display metrics
     print()
+    print("=" * 80)
+    print("Benchmark Results")
+    print("=" * 80)
+
+    metrics = output.get("metrics", {})
+    if "error" in metrics:
+        print(f"Error: {metrics['error']}")
+    else:
+        # Energy metrics
+        if "energy_mae" in metrics:
+            print("Energy Metrics:")
+            print(f"  MAE (eV/atom):  {metrics.get('energy_mae', 'N/A'):.6f}")
+            print(f"  RMSE (eV/atom): {metrics.get('energy_rmse', 'N/A'):.6f}")
+            print(f"  R²:             {metrics.get('energy_r2', 'N/A'):.6f}")
+            print()
+
+        # Force metrics
+        if "force_mae" in metrics:
+            print("Force Metrics:")
+            print(f"  MAE (eV/Å):     {metrics.get('force_mae', 'N/A'):.6f}")
+            print(f"  RMSE (eV/Å):    {metrics.get('force_rmse', 'N/A'):.6f}")
+            print(f"  R²:             {metrics.get('force_r2', 'N/A'):.6f}")
+            print()
+
+        # Stress metrics
+        if "stress_mae" in metrics:
+            print("Stress Metrics:")
+            print(f"  MAE (GPa):      {metrics.get('stress_mae', 'N/A'):.6f}")
+            print(f"  RMSE (GPa):     {metrics.get('stress_rmse', 'N/A'):.6f}")
+            print(f"  R²:             {metrics.get('stress_r2', 'N/A'):.6f}")
+            print()
+
+        if "num_evaluated" in metrics:
+            print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
 
-    with MatbenchDiscovery(
-        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
-    ) as bench:
-        # Run S2EFS task (uses relaxed structures, no geometry optimization)
-        # This is suitable for EquiformerV2 which doesn't support relaxation
-        print("Submitting S2EFS task...")
-        future = bench.tasks.S2EFS.submit(
-            model_factory=create_equiformerv2_model,
-            model_package="fairchem-core",
-            num_structures=NUM_STRUCTURES,
-            use_multi_gpu=USE_MULTI_GPU,
-        )
-
-        print("Waiting for results (this may take a while)...")
-        output = future.result()
-
-        # Display metrics
-        print()
-        print("=" * 80)
-        print("Benchmark Results")
-        print("=" * 80)
-
-        metrics = output.get("metrics", {})
-        if "error" in metrics:
-            print(f"Error: {metrics['error']}")
-        else:
-            # Energy metrics
-            if "energy_mae" in metrics:
-                print("Energy Metrics:")
-                print(f"  MAE (eV/atom):  {metrics.get('energy_mae', 'N/A'):.6f}")
-                print(f"  RMSE (eV/atom): {metrics.get('energy_rmse', 'N/A'):.6f}")
-                print(f"  R²:             {metrics.get('energy_r2', 'N/A'):.6f}")
-                print()
-
-            # Force metrics
-            if "force_mae" in metrics:
-                print("Force Metrics:")
-                print(f"  MAE (eV/Å):     {metrics.get('force_mae', 'N/A'):.6f}")
-                print(f"  RMSE (eV/Å):    {metrics.get('force_rmse', 'N/A'):.6f}")
-                print(f"  R²:             {metrics.get('force_r2', 'N/A'):.6f}")
-                print()
-
-            # Stress metrics
-            if "stress_mae" in metrics:
-                print("Stress Metrics:")
-                print(f"  MAE (GPa):      {metrics.get('stress_mae', 'N/A'):.6f}")
-                print(f"  RMSE (GPa):     {metrics.get('stress_rmse', 'N/A'):.6f}")
-                print(f"  R²:             {metrics.get('stress_r2', 'N/A'):.6f}")
-                print()
-
-            if "num_evaluated" in metrics:
-                print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
-
-        print("=" * 80)
+    print("=" * 80)
 
 
 if __name__ == "__main__":
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
index 3707f57f..7b3783c2 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
@@ -4,7 +4,9 @@
 using multi-GPU parallelization on a Globus Compute endpoint.
 """
 
-from garden_ai.benchmarks.matbench_discovery import DatasetSize, MatbenchDiscovery
+from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
+
+ANVIL = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
 
 # Model factory function for MACE
@@ -15,15 +17,25 @@ def create_mace_model(device):
 
 
 results = MatbenchDiscovery.IS2RE.remote(
-    endpoint="5aafb4c1-27b2-40d8-a038-a0277611868f",
-    walltime="01:00:00",
-    scheduler_options={"gpus-per-node": 2, "cores-per-node": 16},
-    account="youraccount",
-    partition="gpu-debug",
-    qos="gpu",
+    endpoint=ANVIL,
+    user_endpoint_config={
+        "scheduler_options": "#SBATCH --gpus-per-node=2\n",
+        "walltime": 3600,
+        "qos": "gpu",
+        "partition": "gpu-debug",
+        "account": "cis250461-gpu",
+        "cores_per_node": 16,
+        "mem_per_node": 32,
+        "requirements": "",  # 'requirements' is required for Anvil endpoint
+    },
     model_factory=create_mace_model,
-    model_packages="mace-torch",
-    num_structures=DatasetSize.RANDOM_100,
+    model_packages=[
+        "mace-torch",
+        "cuequivariance",
+        "cuequivariance-torch",
+        "cuequivariance-ops-torch-cu12",
+    ],
+    num_structures="random_100",
 )
 
 print(results["metrics"])
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
index fcf77a1c..22099e9e 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
@@ -16,16 +16,12 @@
 # Globus Compute endpoint
 ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
-# HPC endpoint configuration
-ENDPOINT_CONFIG = {
-    "account": "cis250461-gpu",
-    "partition": "gpu-debug",
-    "qos": "gpu",
-    "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8",
-}
+
+# =============================================================================
+# Model Factory
+# =============================================================================
 
 
-# Model factory function for MatterSim
 def create_mattersim_model(device):
     """Create MatterSim model calculator.
 
@@ -40,10 +36,6 @@ def create_mattersim_model(device):
     return MatterSimCalculator(device=device)
 
 
-# Benchmark parameters
-NUM_STRUCTURES = 1000
-USE_MULTI_GPU = True
-
 # =============================================================================
 # Run Benchmark
 # =============================================================================
@@ -53,55 +45,52 @@ def main():
     """Run Matbench Discovery IS2RE benchmark with MatterSim."""
 
     print("=" * 80)
-    print("Matbench Discovery IS2RE Benchmark")
-    print("=" * 80)
-    print(f"Endpoint:   {ENDPOINT_ID}")
-    print("Model:      MatterSim")
-    print(f"Structures: {NUM_STRUCTURES}")
-    print(f"Resources:  {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}")
+    print("Matbench Discovery IS2RE Benchmark - MatterSim")
     print("=" * 80)
+
+    # Run IS2RE task using the new groundhog API
+    output = MatbenchDiscovery.IS2RE.remote(
+        endpoint=ENDPOINT_ID,
+        user_endpoint_config={
+            "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n",
+            "walltime": 7200,  # 2 hours in seconds
+            "qos": "gpu",
+            "partition": "gpu-debug",
+            "account": "cis250461-gpu",
+            "cores_per_node": 16,
+            "mem_per_node": 32,
+            "requirements": "",
+        },
+        model_factory=create_mattersim_model,
+        model_packages="mattersim",
+        num_structures="random_100",
+    )
+
+    # Display metrics
     print()
+    print("=" * 80)
+    print("Benchmark Results")
+    print("=" * 80)
 
-    with MatbenchDiscovery(
-        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
-    ) as bench:
-        # Run IS2RE task
-        print("Submitting IS2RE task...")
-        future = bench.tasks.IS2RE.submit(
-            model_factory=create_mattersim_model,
-            model_package="mattersim",
-            num_structures=NUM_STRUCTURES,
-            use_multi_gpu=USE_MULTI_GPU,
-        )
-
-        print("Waiting for results (this may take a while)...")
-        output = future.result()
-
-        # Display metrics
+    metrics = output.get("metrics", {})
+    if "error" in metrics:
+        print(f"Error: {metrics['error']}")
+    else:
+        # Discovery metrics
+        print(f"F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
+        print(f"DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
+        print(f"Precision:      {metrics.get('Precision', 'N/A'):.6f}")
+        print(f"Recall:         {metrics.get('Recall', 'N/A'):.6f}")
+        print(f"Accuracy:       {metrics.get('Accuracy', 'N/A'):.6f}")
+        print()
+        # Regression metrics
+        print(f"MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
+        print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
+        print(f"R²:             {metrics.get('R2', 'N/A'):.6f}")
         print()
-        print("=" * 80)
-        print("Benchmark Results")
-        print("=" * 80)
-
-        metrics = output.get("metrics", {})
-        if "error" in metrics:
-            print(f"Error: {metrics['error']}")
-        else:
-            # Discovery metrics
-            print(f"F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
-            print(f"DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
-            print(f"Precision:      {metrics.get('Precision', 'N/A'):.6f}")
-            print(f"Recall:         {metrics.get('Recall', 'N/A'):.6f}")
-            print(f"Accuracy:       {metrics.get('Accuracy', 'N/A'):.6f}")
-            print()
-            # Regression metrics
-            print(f"MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
-            print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
-            print(f"R²:             {metrics.get('R2', 'N/A'):.6f}")
-            print()
-            print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
-
-        print("=" * 80)
+        print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
+
+    print("=" * 80)
 
 
 if __name__ == "__main__":
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
index d028b740..e24d0d69 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
@@ -17,18 +17,12 @@
 # Globus Compute endpoint (replace with your endpoint UUID)
 ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
-# HPC endpoint configuration (adjust for your cluster)
-ENDPOINT_CONFIG = {
-    "account": "cis250461-gpu",
-    "partition": "gpu-debug",
-    "qos": "gpu",
-    "scheduler_options": "#SBATCH --gpus-per-node=2\n",
-    "cores_per_node": 16,
-    "mem_per_node": 32,  # GB
-}
-
-
-# Model factory function for SevenNet
+
+# =============================================================================
+# Model Factory
+# =============================================================================
+
+
 def create_sevennet_model(device):
     """Create SevenNet model calculator.
 
@@ -43,10 +37,6 @@ def create_sevennet_model(device):
     return SevenNetCalculator(model="7net-0", device=device)
 
 
-# Benchmark parameters
-NUM_STRUCTURES = 1000  # Number of structures to evaluate
-USE_MULTI_GPU = True  # Enable multi-GPU parallelization
-
 # =============================================================================
 # Run Benchmark
 # =============================================================================
@@ -56,55 +46,52 @@ def main():
     """Run Matbench Discovery IS2RE benchmark with SevenNet."""
 
     print("=" * 80)
-    print("Matbench Discovery IS2RE Benchmark")
-    print("=" * 80)
-    print(f"Endpoint:   {ENDPOINT_ID}")
-    print("Model:      SevenNet (7net-0)")
-    print(f"Structures: {NUM_STRUCTURES}")
-    print(f"Resources:  {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}")
+    print("Matbench Discovery IS2RE Benchmark - SevenNet")
     print("=" * 80)
+
+    # Run IS2RE task using the new groundhog API
+    output = MatbenchDiscovery.IS2RE.remote(
+        endpoint=ENDPOINT_ID,
+        user_endpoint_config={
+            "scheduler_options": "#SBATCH --gpus-per-node=2\n",
+            "walltime": 7200,  # 2 hours in seconds
+            "qos": "gpu",
+            "partition": "gpu-debug",
+            "account": "cis250461-gpu",
+            "cores_per_node": 16,
+            "mem_per_node": 32,
+            "requirements": "",
+        },
+        model_factory=create_sevennet_model,
+        model_packages="sevenn",
+        num_structures="random_100",
+    )
+
+    # Display metrics
     print()
+    print("=" * 80)
+    print("Benchmark Results")
+    print("=" * 80)
 
-    with MatbenchDiscovery(
-        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
-    ) as bench:
-        # Run IS2RE task (Initial Structure to Relaxed Energy)
-        print("Submitting IS2RE task...")
-        future = bench.tasks.IS2RE.submit(
-            model_factory=create_sevennet_model,
-            model_package="sevenn",
-            num_structures=NUM_STRUCTURES,
-            use_multi_gpu=USE_MULTI_GPU,
-        )
-
-        print("Waiting for results (this may take a while)...")
-        output = future.result()
-
-        # Display metrics
+    metrics = output.get("metrics", {})
+    if "error" in metrics:
+        print(f"Error: {metrics['error']}")
+    else:
+        # Discovery metrics
+        print(f"F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
+        print(f"DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
+        print(f"Precision:      {metrics.get('Precision', 'N/A'):.6f}")
+        print(f"Recall:         {metrics.get('Recall', 'N/A'):.6f}")
+        print(f"Accuracy:       {metrics.get('Accuracy', 'N/A'):.6f}")
         print()
-        print("=" * 80)
-        print("Benchmark Results")
-        print("=" * 80)
-
-        metrics = output.get("metrics", {})
-        if "error" in metrics:
-            print(f"Error: {metrics['error']}")
-        else:
-            # Discovery metrics
-            print(f"F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
-            print(f"DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
-            print(f"Precision:      {metrics.get('Precision', 'N/A'):.6f}")
-            print(f"Recall:         {metrics.get('Recall', 'N/A'):.6f}")
-            print(f"Accuracy:       {metrics.get('Accuracy', 'N/A'):.6f}")
-            print()
-            # Regression metrics
-            print(f"MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
-            print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
-            print(f"R²:             {metrics.get('R2', 'N/A'):.6f}")
-            print()
-            print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
-
-        print("=" * 80)
+        # Regression metrics
+        print(f"MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
+        print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
+        print(f"R²:             {metrics.get('R2', 'N/A'):.6f}")
+        print()
+        print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
+
+    print("=" * 80)
 
 
 if __name__ == "__main__":
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
index 96c8208f..c171239e 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
@@ -1,16 +1,16 @@
 #!/usr/bin/env python3
 """
-Run Matbench Discovery benchmarks on 10k most stable structures.
+Run Matbench Discovery benchmarks on 10k random structures.
 
-This script benchmarks MACE, MatterSim, and SevenNet on the 10k most stable
-materials from the unique prototypes subset and saves comprehensive metrics to JSON.
+This script benchmarks MACE, MatterSim, and SevenNet on a random 10k
+sample from the unique prototypes subset and saves comprehensive metrics to JSON.
 """
 
 import json
 from datetime import datetime
 from pathlib import Path
 
-from garden_ai.benchmarks.matbench_discovery import DatasetSize, MatbenchDiscovery
+from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
 # =============================================================================
 # Configuration
@@ -19,18 +19,21 @@
 # Globus Compute endpoint
 ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
-# HPC endpoint configuration
+# Common endpoint configuration
 ENDPOINT_CONFIG = {
-    "account": "cis250461-gpu",
-    "partition": "gpu",
-    "qos": "gpu",
     "scheduler_options": "#SBATCH --gpus-per-node=4\n",
-    "cores_per_node": 8,
-    "mem_per_node": 32,
+    "walltime": 14400,  # 4 hours in seconds
+    "qos": "gpu",
+    "partition": "gpu",
+    "account": "cis250461-gpu",
+    "cores_per_node": 16,
+    "mem_per_node": 64,
+    "requirements": "",
 }
 
 # Output file for metrics
-OUTPUT_FILE = "stable_10k_benchmark_results.json"
+OUTPUT_FILE = "random_10k_benchmark_results.json"
+
 
 # =============================================================================
 # Model Factory Functions
@@ -61,19 +64,25 @@ def create_sevennet_model(device):
 # Model configurations
 MODELS = {
     "MACE": {
-        "package": "mace-torch",
+        "packages": [
+            "mace-torch",
+            "cuequivariance",
+            "cuequivariance-torch",
+            "cuequivariance-ops-torch-cu12",
+        ],
         "factory": create_mace_model,
     },
     "MatterSim": {
-        "package": "mattersim",
+        "packages": ["mattersim"],
         "factory": create_mattersim_model,
     },
     "SevenNet": {
-        "package": "sevenn",
+        "packages": ["sevenn"],
         "factory": create_sevennet_model,
     },
 }
 
+
 # =============================================================================
 # Run Benchmarks
 # =============================================================================
@@ -83,9 +92,9 @@ def main():
     """Run benchmarks on all models and save results."""
 
     print("=" * 80)
-    print("Matbench Discovery Benchmark - Stable 10k")
+    print("Matbench Discovery Benchmark - Random 10k")
     print("=" * 80)
-    print("Dataset: 10k Most Stable Structures")
+    print("Dataset: Random 10k from Unique Prototypes")
     print(f"Models: {', '.join(MODELS.keys())}")
     print(f"Endpoint: {ENDPOINT_ID}")
     print("=" * 80)
@@ -94,96 +103,54 @@ def main():
     results = {
         "metadata": {
             "timestamp": datetime.now().isoformat(),
-            "dataset": "stable_10k",
+            "dataset": "random_10k",
             "dataset_size": 10000,
             "endpoint_id": ENDPOINT_ID,
         },
         "models": {},
     }
 
-    with MatbenchDiscovery(
-        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
-    ) as bench:
-        for model_name, config in MODELS.items():
-            print(f"\n{'=' * 80}")
-            print(f"Running {model_name}...")
-            print(f"{'=' * 80}\n")
-
-            try:
-                # Submit job
-                future = bench.tasks.IS2RE.submit(
-                    model_factory=config["factory"],
-                    model_packages=[
-                        config["package"],
-                        "cuequivariance",
-                        "cuequivariance-torch",
-                        "cuequivariance-ops-torch-cu12",
-                    ],
-                    num_structures=DatasetSize.RANDOM_10K,
-                )
-
-                print(f"Job submitted for {model_name}. Waiting for results...")
-
-                try:
-                    output = future.result()
-                except Exception as e:
-                    print(f"⚠️ {model_name} failed first attempt: {e}")
-                    print(f"   Resuming from checkpoint: {future.checkpoint_path}")
-
-                    # Extract checkpoint name from path
-                    checkpoint_name = Path(future.checkpoint_path).name
-
-                    # Resubmit with same checkpoint name to resume
-                    retry_future = bench.tasks.IS2RE.submit(
-                        model_factory=config["factory"],
-                        model_packages=[
-                            config["package"],
-                            "cuequivariance",
-                            "cuequivariance-torch",
-                            "cuequivariance-ops-torch-cu12",
-                        ],
-                        num_structures=DatasetSize.RANDOM_10K,
-                        checkpoint_name=checkpoint_name,
-                    )
-
-                    try:
-                        print("   Retry job submitted. Waiting for results...")
-                        output = retry_future.result()
-                        print("   ✅ Retry successful!")
-                    except Exception as retry_e:
-                        print(f"❌ {model_name} failed retry: {retry_e}")
-                        results["models"][model_name] = {
-                            "status": "error",
-                            "error": str(retry_e),
-                        }
-                        continue  # Skip to next model
-
-                # Store complete output (contains both metrics and per-structure results)
-                results["models"][model_name] = {
-                    "status": "success",
-                    **output,  # Unpack entire output dict (metrics + results)
-                }
-
-                # Display metrics
-                metrics = output.get("metrics", {})
-                if "error" in metrics:
-                    print(f"❌ {model_name} failed: {metrics['error']}")
-                    results["models"][model_name]["status"] = "failed"
-                    results["models"][model_name]["error"] = metrics["error"]
-                else:
-                    print(f"✅ {model_name} completed successfully!")
-                    print(f"   F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
-                    print(f"   DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
-                    print(f"   MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
-                    print(f"   RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
-                    print(f"   Structures:     {metrics.get('num_evaluated', 'N/A')}")
-
-            except Exception as e:
-                print(f"❌ {model_name} error: {e}")
-                results["models"][model_name] = {
-                    "status": "error",
-                    "error": str(e),
-                }
+    for model_name, config in MODELS.items():
+        print(f"\n{'=' * 80}")
+        print(f"Running {model_name}...")
+        print(f"{'=' * 80}\n")
+
+        try:
+            # Run benchmark using the new groundhog API
+            output = MatbenchDiscovery.IS2RE.remote(
+                endpoint=ENDPOINT_ID,
+                user_endpoint_config=ENDPOINT_CONFIG,
+                model_factory=config["factory"],
+                model_packages=config["packages"],
+                num_structures="random_10k",
+            )
+
+            # Store complete output (contains both metrics and per-structure results)
+            results["models"][model_name] = {
+                "status": "success",
+                **output,
+            }
+
+            # Display metrics
+            metrics = output.get("metrics", {})
+            if "error" in metrics:
+                print(f"❌ {model_name} failed: {metrics['error']}")
+                results["models"][model_name]["status"] = "failed"
+                results["models"][model_name]["error"] = metrics["error"]
+            else:
+                print(f"✅ {model_name} completed successfully!")
+                print(f"   F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
+                print(f"   DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
+                print(f"   MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
+                print(f"   RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
+                print(f"   Structures:     {metrics.get('num_evaluated', 'N/A')}")
+
+        except Exception as e:
+            print(f"❌ {model_name} error: {e}")
+            results["models"][model_name] = {
+                "status": "error",
+                "error": str(e),
+            }
 
     # Save results to JSON
     output_path = Path(OUTPUT_FILE)
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index b130912b..7bb9592d 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -214,10 +214,11 @@ def _process_batch_common(
     num_threads: int,
     compute_fn: Callable[[Any, Any], Dict[str, Any]],
     task_name: str,
-    model_factory: Callable[[str], Any],
+    model_factory_source: str,
 ) -> Dict[str, Any]:
     import logging
     import os
+    import re
     import time
 
     import torch
@@ -236,6 +237,19 @@ def _process_batch_common(
     global _MODEL_CACHE
     try:
         if _MODEL_CACHE is None:
+            # Reconstruct model_factory from source code
+            func_name_match = re.search(r"def\s+(\w+)\s*\(", model_factory_source)
+            if not func_name_match:
+                raise ValueError(
+                    "Could not extract function name from model_factory source"
+                )
+            func_name = func_name_match.group(1)
+
+            # Execute the source to define the function
+            local_namespace = {}
+            exec(model_factory_source, local_namespace)
+            model_factory = local_namespace[func_name]
+
             model = model_factory(device)
             _MODEL_CACHE = model
         else:
@@ -350,7 +364,7 @@ def process_batch_relaxation(
     structures: List[Any],
     model_config: Dict[str, Any],
     num_threads: int,
-    model_factory: Callable[[str], Any],
+    model_factory_source: str,
 ) -> Dict[str, Any]:
     from ase.optimize import FIRE
 
@@ -368,7 +382,7 @@ def compute(model, atoms):
         num_threads,
         compute,
         "relaxation",
-        model_factory,
+        model_factory_source,
     )
 
 
@@ -377,7 +391,7 @@ def process_batch_static(
     structures: List[Any],
     model_config: Dict[str, Any],
     num_threads: int,
-    model_factory: Callable[[str], Any],
+    model_factory_source: str,
 ) -> Dict[str, Any]:
     def compute(model, atoms):
         atoms.calc = model
@@ -391,7 +405,7 @@ def compute(model, atoms):
         num_threads,
         compute,
         "static calculation",
-        model_factory,
+        model_factory_source,
     )
 
 
@@ -400,7 +414,7 @@ def process_batch_forces(
     structures: List[Any],
     model_config: Dict[str, Any],
     num_threads: int,
-    model_factory: Callable[[str], Any],
+    model_factory_source: str,
 ) -> Dict[str, Any]:
     def compute(model, atoms):
         atoms.calc = model
@@ -416,7 +430,7 @@ def compute(model, atoms):
         num_threads,
         compute,
         "forces calculation",
-        model_factory,
+        model_factory_source,
     )
 
 
@@ -582,7 +596,8 @@ def calculate_metrics_forces(
 
 def run_benchmark_hog(
     config: Dict[str, Any],
-    model_factory: Any,
+    model_packages: str | List[str],
+    model_factory_source: str,
     load_dataset_fn: Any,
     process_fn: Any,
     calc_metrics_fn: Any,
@@ -590,6 +605,54 @@ def run_benchmark_hog(
     logger = setup_logging()
     logger.info("Starting benchmark runner...")
 
+    # Install model packages if specified
+    if model_packages:
+        import subprocess
+
+        packages = (
+            model_packages if isinstance(model_packages, list) else [model_packages]
+        )
+        logger.info(f"Installing model packages: {packages}")
+        try:
+            result = subprocess.run(
+                ["uv", "pip", "install"] + packages,
+                capture_output=True,
+                text=True,
+                timeout=300,  # 5 minute timeout
+            )
+            if result.returncode != 0:
+                error_msg = (
+                    f"Failed to install model packages: {packages}\n"
+                    f"stdout: {result.stdout}\n"
+                    f"stderr: {result.stderr}"
+                )
+                logger.error(error_msg)
+                raise RuntimeError(error_msg)
+            logger.info("Model packages installed successfully")
+        except subprocess.TimeoutExpired:
+            error_msg = f"Model package installation timed out after 300s: {packages}"
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+        except Exception as e:
+            if isinstance(e, RuntimeError):
+                raise  # Re-raise our own errors
+            error_msg = f"Could not install model packages: {e}"
+            logger.error(error_msg)
+            raise RuntimeError(error_msg) from e
+
+    # Fix SSL certificate issues on HPC nodes using certifi
+    try:
+        import ssl
+
+        import certifi
+
+        os.environ["SSL_CERT_FILE"] = certifi.where()
+        os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
+        ssl._create_default_https_context = ssl.create_default_context
+        logger.info(f"SSL certificates configured: {certifi.where()}")
+    except ImportError:
+        logger.warning("certifi not available, SSL issues may occur")
+
     checkpoint_path = config.get("checkpoint_path")
     results = {}
 
@@ -679,7 +742,7 @@ def run_benchmark_hog(
                             batch,
                             worker_config,
                             threads_per_worker,
-                            model_factory,
+                            model_factory_source,
                         )
                     )
 
@@ -729,12 +792,66 @@ def run_benchmark_hog(
     return output
 
 
+# ------------------------------------------------------------------------------
+# BENCHMARK METHOD WRAPPER
+# ------------------------------------------------------------------------------
+
+
+class BenchmarkMethod:
+    """Wrapper around groundhog Method that handles model_factory source extraction.
+
+    This wrapper intercepts .remote(), .local(), and .submit() calls to automatically
+    extract source code from the model_factory callable before passing to groundhog.
+    This avoids pickle serialization issues with functions defined in __main__.
+    """
+
+    def __init__(self, hog_method):
+        """Initialize wrapper with the underlying groundhog Method."""
+        self._hog_method = hog_method
+
+    def _extract_factory_source(self, kwargs):
+        """Extract source code from model_factory if it's a callable."""
+        import inspect
+
+        if "model_factory" in kwargs:
+            factory = kwargs["model_factory"]
+            if callable(factory) and not isinstance(factory, str):
+                try:
+                    kwargs["model_factory"] = inspect.getsource(factory)
+                except (OSError, TypeError) as e:
+                    raise ValueError(
+                        f"Could not extract source code from model_factory. "
+                        f"Ensure the function is defined in a file (not interactive/lambda). "
+                        f"Error: {e}"
+                    )
+        return kwargs
+
+    def remote(self, *args, **kwargs):
+        """Execute remotely with automatic model_factory source extraction."""
+        kwargs = self._extract_factory_source(kwargs)
+        return self._hog_method.remote(*args, **kwargs)
+
+    def local(self, *args, **kwargs):
+        """Execute locally with automatic model_factory source extraction."""
+        kwargs = self._extract_factory_source(kwargs)
+        return self._hog_method.local(*args, **kwargs)
+
+    def submit(self, *args, **kwargs):
+        """Submit for async execution with automatic model_factory source extraction."""
+        kwargs = self._extract_factory_source(kwargs)
+        return self._hog_method.submit(*args, **kwargs)
+
+    def __call__(self, *args, **kwargs):
+        """Direct call (for local execution within groundhog)."""
+        return self._hog_method(*args, **kwargs)
+
+
 # ------------------------------------------------------------------------------
 # CLASS DEFINITION
 # ------------------------------------------------------------------------------
 
 
-class MatbenchDiscovery:
+class _MatbenchDiscoveryBase:
     """Matbench Discovery tasks using Groundhog HPC."""
 
     REPO_URL = "https://github.com/janosh/matbench-discovery"
@@ -742,7 +859,7 @@ class MatbenchDiscovery:
 
     @staticmethod
     def _prepare_runner_config(
-        num_structures: int | "DatasetSize" | "DatasetConfig",
+        num_structures: int | "DatasetSize" | "DatasetConfig" | str,
         repo_url: str = REPO_URL,
         repo_ref: str = REPO_REF,
     ) -> Dict[str, Any]:
@@ -757,7 +874,10 @@ def _prepare_runner_config(
         subset = "full"
         seed = 42
 
-        if hasattr(num_structures, "value"):  # Enum
+        if isinstance(num_structures, str):
+            # String value like "random_100" - use directly as subset
+            subset = num_structures
+        elif hasattr(num_structures, "value"):  # Enum
             subset = num_structures.value
             # Check for seed method/attr if it's our custom Config
             if hasattr(num_structures, "seed"):
@@ -818,6 +938,23 @@ def _run_task(
         calc_metrics_fn: Any,
         sys_path: List[str] | None = None,
     ) -> Dict[str, Any]:
+        import inspect
+
+        # Handle model_factory as either a callable or source string
+        # For remote execution, user should pass inspect.getsource(factory)
+        # For local execution, can pass the function directly
+        if isinstance(model_factory, str):
+            model_factory_source = model_factory
+        else:
+            try:
+                model_factory_source = inspect.getsource(model_factory)
+            except (OSError, TypeError) as e:
+                raise ValueError(
+                    f"Could not extract source code from model_factory. "
+                    f"For remote execution, use: inspect.getsource(your_factory). "
+                    f"Error: {e}"
+                )
+
         # Add custom sys.path if provided (useful for local execution/testing)
         if sys_path:
             import sys
@@ -850,7 +987,8 @@ def _run_task(
 
         return run_benchmark_hog(
             runner_config,
-            model_factory,
+            model_packages,
+            model_factory_source,
             load_dataset_fn,
             process_fn,
             calc_metrics_fn,
@@ -925,18 +1063,18 @@ def S2EFS(
     # Aliases
     @hog.method()
     def S2EF(*args, **kwargs):
-        return MatbenchDiscovery.S2EFS(*args, **kwargs)
+        return _MatbenchDiscoveryBase.S2EFS(*args, **kwargs)
 
     @hog.method()
     def S2EFSM(*args, **kwargs):
-        return MatbenchDiscovery.S2EFS(*args, **kwargs)
+        return _MatbenchDiscoveryBase.S2EFS(*args, **kwargs)
 
     @hog.method()
     def IS2E(*args, **kwargs):
         # Same as IS2RE but static? No, IS2E is Initial Structure to Energy (Static).
         # IS2RE is Relaxation.
         # IS2E logic:
-        return MatbenchDiscovery._run_task(
+        return _MatbenchDiscoveryBase._run_task(
             *args,
             **kwargs,
             process_fn=process_batch_static,
@@ -947,17 +1085,65 @@ def IS2E(*args, **kwargs):
     @hog.method()
     def S2E(*args, **kwargs):
         # Structure to Energy (Relaxed Structure to Energy) -> RS2RE
-        return MatbenchDiscovery.RS2RE(*args, **kwargs)
+        return _MatbenchDiscoveryBase.RS2RE(*args, **kwargs)
 
     @hog.method()
     def S2RE(*args, **kwargs):
         # Structure to Relaxed Energy -> IS2RE
-        return MatbenchDiscovery.IS2RE(*args, **kwargs)
+        return _MatbenchDiscoveryBase.IS2RE(*args, **kwargs)
 
     @hog.method()
     def RP2RE(*args, **kwargs):
-        return MatbenchDiscovery.IS2RE(*args, **kwargs)
+        return _MatbenchDiscoveryBase.IS2RE(*args, **kwargs)
 
     @hog.method()
     def IP2E(*args, **kwargs):
-        return MatbenchDiscovery.IS2E(*args, **kwargs)
+        return _MatbenchDiscoveryBase.IS2E(*args, **kwargs)
+
+
+# ------------------------------------------------------------------------------
+# PUBLIC API - Wrapped methods with automatic source extraction
+# ------------------------------------------------------------------------------
+
+
+class MatbenchDiscovery:
+    """Matbench Discovery benchmark tasks.
+
+    This class provides wrapped methods that automatically handle model_factory
+    source extraction for remote execution. Users can pass callable functions
+    directly without needing to call inspect.getsource() themselves.
+
+    Example:
+        def create_mace_model(device):
+            from mace.calculators import mace_mp
+            return mace_mp(model="medium", device=device)
+
+        # Just pass the function - source extraction is automatic
+        results = MatbenchDiscovery.IS2RE.remote(
+            endpoint="your-endpoint-id",
+            model_factory=create_mace_model,
+            model_packages="mace-torch",
+        )
+    """
+
+    REPO_URL = _MatbenchDiscoveryBase.REPO_URL
+    REPO_REF = _MatbenchDiscoveryBase.REPO_REF
+
+    # Internal methods (needed for remote execution compatibility)
+    _prepare_runner_config = _MatbenchDiscoveryBase._prepare_runner_config
+    _generate_checkpoint_name = _MatbenchDiscoveryBase._generate_checkpoint_name
+    _run_task = _MatbenchDiscoveryBase._run_task
+
+    # Main benchmark tasks - wrapped for automatic model_factory source extraction
+    IS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.IS2RE)
+    RS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RS2RE)
+    S2EFS = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFS)
+
+    # Aliases
+    S2EF = BenchmarkMethod(_MatbenchDiscoveryBase.S2EF)
+    S2EFSM = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFSM)
+    IS2E = BenchmarkMethod(_MatbenchDiscoveryBase.IS2E)
+    S2E = BenchmarkMethod(_MatbenchDiscoveryBase.S2E)
+    S2RE = BenchmarkMethod(_MatbenchDiscoveryBase.S2RE)
+    RP2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RP2RE)
+    IP2E = BenchmarkMethod(_MatbenchDiscoveryBase.IP2E)

From 9abea84be87f55be630d81b78cade75b409a7ca2 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Mon, 8 Dec 2025 11:20:26 -0700
Subject: [PATCH 07/23] tweak metrics, print checkpoint path for sync calls

---
 .../examples/matbench_mace_multi_gpu.py       |  12 +-
 .../benchmarks/matbench_discovery/tasks.py    | 207 +++++++++++++-----
 garden_ai/benchmarks/utils/meta_metrics.py    | 161 ++++++++++++++
 3 files changed, 320 insertions(+), 60 deletions(-)
 create mode 100644 garden_ai/benchmarks/utils/meta_metrics.py

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
index 7b3783c2..787587c3 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
@@ -4,12 +4,13 @@
 using multi-GPU parallelization on a Globus Compute endpoint.
 """
 
+from rich import print
+
 from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
 ANVIL = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
 
-# Model factory function for MACE
 def create_mace_model(device):
     from mace.calculators import mace_mp
 
@@ -19,13 +20,12 @@ def create_mace_model(device):
 results = MatbenchDiscovery.IS2RE.remote(
     endpoint=ANVIL,
     user_endpoint_config={
-        "scheduler_options": "#SBATCH --gpus-per-node=2\n",
+        "scheduler_options": "#SBATCH --gpus-per-node=4\n",
         "walltime": 3600,
         "qos": "gpu",
-        "partition": "gpu-debug",
+        "partition": "gpu",
         "account": "cis250461-gpu",
         "cores_per_node": 16,
-        "mem_per_node": 32,
         "requirements": "",  # 'requirements' is required for Anvil endpoint
     },
     model_factory=create_mace_model,
@@ -35,7 +35,7 @@ def create_mace_model(device):
         "cuequivariance-torch",
         "cuequivariance-ops-torch-cu12",
     ],
-    num_structures="random_100",
+    num_structures=100,
 )
 
-print(results["metrics"])
+print(results)
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index 7bb9592d..1b27b79a 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -11,7 +11,6 @@
 #     "matbench-discovery",
 # ]
 # ///
-"""Matbench Discovery benchmark task implementations using Groundhog HPC."""
 
 from __future__ import annotations
 
@@ -35,10 +34,6 @@
 if TYPE_CHECKING:
     from .enums import DatasetConfig, DatasetSize
 
-# ------------------------------------------------------------------------------
-# BOILERPLATE: Logging & Device Setup
-# ------------------------------------------------------------------------------
-
 
 def setup_logging():
     logging.basicConfig(
@@ -80,11 +75,33 @@ def convert_numpy_types(obj):
     return obj
 
 
-# ------------------------------------------------------------------------------
-# METRICS HELPERS (Inlined from metrics.py)
-# ------------------------------------------------------------------------------
+# Meta metrics functions - will be injected from source for remote execution
+get_hardware_info = None
+extract_model_info = None
+calculate_run_metadata = None
+
+
+def _inject_meta_metrics(source: str) -> None:
+    """Inject meta_metrics functions from source code for remote execution."""
+    global get_hardware_info, extract_model_info, calculate_run_metadata
+    namespace = {}
+    exec(source, namespace)
+    get_hardware_info = namespace["get_hardware_info"]
+    extract_model_info = namespace["extract_model_info"]
+    calculate_run_metadata = namespace["calculate_run_metadata"]
+
+
+def _get_meta_metrics_source() -> str:
+    """Get source code of meta_metrics module (called locally)."""
+    import inspect
 
+    from garden_ai.benchmarks.utils import meta_metrics
 
+    return inspect.getsource(meta_metrics)
+
+
+# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/tree/main/matbench_discovery/metrics
+# Since they aren't setup to be easily imported, we just copy them here
 def classify_stable(
     each_true: Sequence[float] | pd.Series | np.ndarray,
     each_pred: Sequence[float] | pd.Series | np.ndarray,
@@ -125,6 +142,7 @@ def classify_stable(
     return true_pos, false_neg, false_pos, true_neg
 
 
+# This is also coptied from the matbench-discovery repo
 def stable_metrics(
     each_true: Sequence[float] | pd.Series | np.ndarray,
     each_pred: Sequence[float] | pd.Series | np.ndarray,
@@ -200,10 +218,6 @@ def stable_metrics(
     )
 
 
-# ------------------------------------------------------------------------------
-# REMOTE HELPERS (Inlined from remote.py)
-# ------------------------------------------------------------------------------
-
 _MODEL_CACHE = None
 
 
@@ -589,15 +603,11 @@ def calculate_metrics_forces(
     return result_metrics
 
 
-# ------------------------------------------------------------------------------
-# MAIN RUNNER (Inlined from runners.py)
-# ------------------------------------------------------------------------------
-
-
 def run_benchmark_hog(
     config: Dict[str, Any],
     model_packages: str | List[str],
     model_factory_source: str,
+    meta_metrics_source: str,
     load_dataset_fn: Any,
     process_fn: Any,
     calc_metrics_fn: Any,
@@ -605,6 +615,15 @@ def run_benchmark_hog(
     logger = setup_logging()
     logger.info("Starting benchmark runner...")
 
+    # Inject meta_metrics functions from source
+    _inject_meta_metrics(meta_metrics_source)
+
+    # Collect hardware and model info
+    hardware_info = get_hardware_info()
+    model_info = extract_model_info(model_packages)
+    logger.info(f"Hardware: {hardware_info}")
+    logger.info(f"Model: {model_info}")
+
     # Install model packages if specified
     if model_packages:
         import subprocess
@@ -681,7 +700,15 @@ def run_benchmark_hog(
 
     if not items_to_process:
         logger.info("All items already processed!")
-        return {"results": results, "metrics": {}}
+        run_metadata = calculate_run_metadata(
+            hardware_info=hardware_info,
+            model_info=model_info,
+            total_elapsed=0,
+            num_workers=0,
+            num_structures_total=len(all_items),
+            num_structures_processed=0,
+        )
+        return {"metrics": {}, "run_metadata": run_metadata}
 
     logger.info(f"Processing {len(items_to_process)} remaining items")
 
@@ -698,7 +725,12 @@ def run_benchmark_hog(
         num_gpus = 0
 
     use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
-    total_cores = os.cpu_count() or 1
+    # Use sched_getaffinity to get cores available to this job, not total cores on node
+    try:
+        total_cores = len(os.sched_getaffinity(0))
+    except AttributeError:
+        # Fallback for systems without sched_getaffinity (e.g., macOS)
+        total_cores = os.cpu_count() or 1
     num_workers = num_gpus if use_multi_gpu else 1
     available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores
     threads_per_worker = max(1, available_cores // num_workers)
@@ -787,32 +819,34 @@ def run_benchmark_hog(
         traceback.print_exc()
         metrics = {"error": f"Metrics calculation failed: {e}"}
 
-    output = {"results": results, "metrics": metrics}
+    # Calculate run metadata
+    run_metadata = calculate_run_metadata(
+        hardware_info=hardware_info,
+        model_info=model_info,
+        total_elapsed=total_elapsed,
+        num_workers=num_workers,
+        num_structures_total=len(all_items),
+        num_structures_processed=len(items_to_process),
+    )
+    logger.info(f"Run metadata: {run_metadata}")
+
+    output = {"metrics": metrics, "run_metadata": run_metadata}
     output = convert_numpy_types(output)
     return output
 
 
-# ------------------------------------------------------------------------------
-# BENCHMARK METHOD WRAPPER
-# ------------------------------------------------------------------------------
-
-
 class BenchmarkMethod:
-    """Wrapper around groundhog Method that handles model_factory source extraction.
-
-    This wrapper intercepts .remote(), .local(), and .submit() calls to automatically
-    extract source code from the model_factory callable before passing to groundhog.
-    This avoids pickle serialization issues with functions defined in __main__.
-    """
+    """Wrapper around groundhog Method that handles source extraction for remote execution."""
 
     def __init__(self, hog_method):
         """Initialize wrapper with the underlying groundhog Method."""
         self._hog_method = hog_method
 
-    def _extract_factory_source(self, kwargs):
-        """Extract source code from model_factory if it's a callable."""
+    def _extract_sources(self, kwargs):
+        """Extract source code from model_factory and meta_metrics for remote execution."""
         import inspect
 
+        # Extract model_factory source
         if "model_factory" in kwargs:
             factory = kwargs["model_factory"]
             if callable(factory) and not isinstance(factory, str):
@@ -824,21 +858,88 @@ def _extract_factory_source(self, kwargs):
                         f"Ensure the function is defined in a file (not interactive/lambda). "
                         f"Error: {e}"
                     )
+
+        # Extract meta_metrics source (runs locally where garden_ai is available)
+        kwargs["meta_metrics_source"] = _get_meta_metrics_source()
+
         return kwargs
 
+    def _get_checkpoint_path_info(self, kwargs):
+        """Determine and return checkpoint path information from kwargs."""
+        checkpoint_path = kwargs.get("checkpoint_path")
+        checkpoint_name = kwargs.get("checkpoint_name")
+        model_packages = kwargs.get("model_packages", "")
+
+        if checkpoint_path:
+            return checkpoint_path, "resuming"
+        elif checkpoint_name:
+            final_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}")
+            return final_path, "new"
+        else:
+            # Generate checkpoint name using same logic as _run_task
+            num_structures = kwargs.get("num_structures", 100)
+
+            # Determine subset string for checkpoint name
+            subset = "full"
+            if isinstance(num_structures, str):
+                subset = num_structures
+            elif hasattr(num_structures, "value"):  # DatasetSize enum
+                subset = num_structures.value
+            elif hasattr(num_structures, "subset"):  # DatasetConfig
+                subset = num_structures.subset.value
+            elif isinstance(num_structures, int):
+                subset = "full" if num_structures >= 200000 else f"num_{num_structures}"
+
+            # Extract model name from packages
+            model_str = "unknown"
+            if isinstance(model_packages, list):
+                model_str = "_".join(
+                    pkg.split("/")[-1].split("@")[0] for pkg in model_packages[:2]
+                )
+            elif isinstance(model_packages, str):
+                model_str = model_packages.split("/")[-1].split("@")[0]
+
+            # Generate timestamp and uuid like in _generate_checkpoint_name
+            import time
+            import uuid
+
+            timestamp = time.strftime("%Y%m%d_%H%M%S")
+            short_uuid = str(uuid.uuid4())[:8]
+            checkpoint_name = (
+                f"matbench_{model_str}_{subset}_{timestamp}_{short_uuid}.json"
+            )
+            final_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}")
+            return final_path, "new"
+
+    def _print_checkpoint_info(self, kwargs):
+        """Print checkpoint information before execution."""
+        checkpoint_path, checkpoint_type = self._get_checkpoint_path_info(kwargs)
+
+        print("=" * 80)
+        if checkpoint_type == "resuming":
+            print(f"📂 Resuming from checkpoint: {checkpoint_path}")
+        else:
+            print(f"💾 Checkpoint will be saved to: {checkpoint_path}")
+        print("   To resume this benchmark if it fails, use:")
+        print(f'   checkpoint_path="{checkpoint_path}"')
+        print("=" * 80)
+
     def remote(self, *args, **kwargs):
-        """Execute remotely with automatic model_factory source extraction."""
-        kwargs = self._extract_factory_source(kwargs)
+        """Execute remotely with automatic source extraction."""
+        kwargs = self._extract_sources(kwargs)
+        self._print_checkpoint_info(kwargs)
         return self._hog_method.remote(*args, **kwargs)
 
     def local(self, *args, **kwargs):
-        """Execute locally with automatic model_factory source extraction."""
-        kwargs = self._extract_factory_source(kwargs)
+        """Execute locally with automatic source extraction."""
+        kwargs = self._extract_sources(kwargs)
+        self._print_checkpoint_info(kwargs)
         return self._hog_method.local(*args, **kwargs)
 
     def submit(self, *args, **kwargs):
-        """Submit for async execution with automatic model_factory source extraction."""
-        kwargs = self._extract_factory_source(kwargs)
+        """Submit for async execution with automatic source extraction."""
+        kwargs = self._extract_sources(kwargs)
+        self._print_checkpoint_info(kwargs)
         return self._hog_method.submit(*args, **kwargs)
 
     def __call__(self, *args, **kwargs):
@@ -846,11 +947,6 @@ def __call__(self, *args, **kwargs):
         return self._hog_method(*args, **kwargs)
 
 
-# ------------------------------------------------------------------------------
-# CLASS DEFINITION
-# ------------------------------------------------------------------------------
-
-
 class _MatbenchDiscoveryBase:
     """Matbench Discovery tasks using Groundhog HPC."""
 
@@ -937,12 +1033,11 @@ def _run_task(
         load_dataset_fn: Any,
         calc_metrics_fn: Any,
         sys_path: List[str] | None = None,
+        meta_metrics_source: str | None = None,
     ) -> Dict[str, Any]:
         import inspect
 
         # Handle model_factory as either a callable or source string
-        # For remote execution, user should pass inspect.getsource(factory)
-        # For local execution, can pass the function directly
         if isinstance(model_factory, str):
             model_factory_source = model_factory
         else:
@@ -955,7 +1050,7 @@ def _run_task(
                     f"Error: {e}"
                 )
 
-        # Add custom sys.path if provided (useful for local execution/testing)
+        # Add custom sys.path if provided
         if sys_path:
             import sys
 
@@ -980,15 +1075,19 @@ def _run_task(
             final_checkpoint_path = os.path.expanduser(
                 f"~/.garden/benchmarks/{checkpoint_name}"
             )
-            # Ensure directory exists
             os.makedirs(os.path.dirname(final_checkpoint_path), exist_ok=True)
 
         runner_config["checkpoint_path"] = final_checkpoint_path
 
+        # meta_metrics_source is injected by BenchmarkMethod wrapper
+        if meta_metrics_source is None:
+            raise ValueError("meta_metrics_source required for benchmark execution")
+
         return run_benchmark_hog(
             runner_config,
             model_packages,
             model_factory_source,
+            meta_metrics_source,
             load_dataset_fn,
             process_fn,
             calc_metrics_fn,
@@ -1002,6 +1101,7 @@ def IS2RE(
         checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
         sys_path: List[str] | None = None,
+        meta_metrics_source: str | None = None,
     ) -> Dict[str, Any]:
         """Initial Structure to Relaxed Energy."""
         return MatbenchDiscovery._run_task(
@@ -1014,6 +1114,7 @@ def IS2RE(
             load_dataset_wbm_initial,
             calculate_metrics_energy,
             sys_path=sys_path,
+            meta_metrics_source=meta_metrics_source,
         )
 
     @hog.method()
@@ -1024,6 +1125,7 @@ def RS2RE(
         checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
         sys_path: List[str] | None = None,
+        meta_metrics_source: str | None = None,
     ) -> Dict[str, Any]:
         """Relaxed Structure to Relaxed Energy."""
         return MatbenchDiscovery._run_task(
@@ -1036,6 +1138,7 @@ def RS2RE(
             load_dataset_wbm_relaxed,
             calculate_metrics_energy,
             sys_path=sys_path,
+            meta_metrics_source=meta_metrics_source,
         )
 
     @hog.method()
@@ -1046,6 +1149,7 @@ def S2EFS(
         checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
         sys_path: List[str] | None = None,
+        meta_metrics_source: str | None = None,
     ) -> Dict[str, Any]:
         """Structure to Energy, Forces, Stress."""
         return MatbenchDiscovery._run_task(
@@ -1058,6 +1162,7 @@ def S2EFS(
             load_dataset_mp_trj,
             calculate_metrics_forces,
             sys_path=sys_path,
+            meta_metrics_source=meta_metrics_source,
         )
 
     # Aliases
@@ -1101,11 +1206,6 @@ def IP2E(*args, **kwargs):
         return _MatbenchDiscoveryBase.IS2E(*args, **kwargs)
 
 
-# ------------------------------------------------------------------------------
-# PUBLIC API - Wrapped methods with automatic source extraction
-# ------------------------------------------------------------------------------
-
-
 class MatbenchDiscovery:
     """Matbench Discovery benchmark tasks.
 
@@ -1118,7 +1218,6 @@ def create_mace_model(device):
             from mace.calculators import mace_mp
             return mace_mp(model="medium", device=device)
 
-        # Just pass the function - source extraction is automatic
         results = MatbenchDiscovery.IS2RE.remote(
             endpoint="your-endpoint-id",
             model_factory=create_mace_model,
diff --git a/garden_ai/benchmarks/utils/meta_metrics.py b/garden_ai/benchmarks/utils/meta_metrics.py
new file mode 100644
index 00000000..e18120d4
--- /dev/null
+++ b/garden_ai/benchmarks/utils/meta_metrics.py
@@ -0,0 +1,161 @@
+"""Meta-level benchmark metrics utilities.
+
+Shared utilities for collecting hardware info, estimating costs, and extracting
+model metadata that can be reused across different benchmark implementations.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+# GPU hourly cost estimates (USD) - Modal pricing (https://modal.com/pricing)
+GPU_HOURLY_COSTS = {
+    "B200": 6.25,  # $0.001736/sec
+    "H200": 4.54,  # $0.001261/sec
+    "H100": 3.95,  # $0.001097/sec
+    "A100-80GB": 2.50,  # $0.000694/sec (80GB variant)
+    "A100": 2.10,  # $0.000583/sec (40GB variant)
+    "L40S": 1.95,  # $0.000542/sec
+    "A10": 1.10,  # $0.000306/sec
+    "L4": 0.80,  # $0.000222/sec
+    "T4": 0.59,  # $0.000164/sec
+    "default": 2.00,  # Fallback for unknown GPUs
+}
+
+# Model name inference from package names
+MODEL_PACKAGE_NAMES = {
+    "mace": "MACE",
+    "mattersim": "MatterSim",
+    "sevennet": "SevenNet",
+    "chgnet": "CHGNet",
+    "equiformer": "EquiformerV2",
+    "orb": "Orb",
+    "m3gnet": "M3GNet",
+    "alignn": "ALIGNN",
+}
+
+
+def get_hardware_info() -> Dict[str, Any]:
+    """Collect hardware information about the execution environment.
+
+    Returns:
+        Dictionary containing:
+        - device_type: "cuda", "mps", or "cpu"
+        - num_gpus: Number of GPUs available
+        - gpu_names: List of GPU names
+        - gpu_memory_gb: Memory of first GPU in GB (if available)
+    """
+    info = {"device_type": "cpu", "num_gpus": 0, "gpu_names": [], "gpu_memory_gb": None}
+    try:
+        import torch
+
+        if torch.cuda.is_available():
+            info["device_type"] = "cuda"
+            info["num_gpus"] = torch.cuda.device_count()
+            info["gpu_names"] = [
+                torch.cuda.get_device_name(i) for i in range(info["num_gpus"])
+            ]
+            if info["num_gpus"] > 0:
+                props = torch.cuda.get_device_properties(0)
+                info["gpu_memory_gb"] = round(props.total_memory / (1024**3), 1)
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            info["device_type"] = "mps"
+    except ImportError:
+        pass
+    return info
+
+
+def get_gpu_hourly_cost(gpu_name: str) -> float:
+    """Estimate hourly cost for a GPU based on its name.
+
+    Args:
+        gpu_name: GPU name string (e.g., "NVIDIA A100-SXM4-40GB")
+
+    Returns:
+        Estimated hourly cost in USD
+    """
+    gpu_name_upper = gpu_name.upper()
+    for key in GPU_HOURLY_COSTS:
+        if key != "default" and key.upper() in gpu_name_upper:
+            return GPU_HOURLY_COSTS[key]
+    return GPU_HOURLY_COSTS["default"]
+
+
+def extract_model_info(model_packages: str | List[str]) -> Dict[str, Any]:
+    """Extract model info from package specification.
+
+    Args:
+        model_packages: Package name(s) used to install the model
+
+    Returns:
+        Dictionary containing:
+        - model_name: Inferred model name or "unknown"
+        - model_packages: List of package names
+    """
+    packages = model_packages if isinstance(model_packages, list) else [model_packages]
+    model_name = "unknown"
+    for pkg in packages:
+        pkg_lower = pkg.lower()
+        for key, name in MODEL_PACKAGE_NAMES.items():
+            if key in pkg_lower:
+                model_name = name
+                break
+        if model_name != "unknown":
+            break
+    return {"model_name": model_name, "model_packages": packages}
+
+
+def calculate_run_metadata(
+    hardware_info: Dict[str, Any],
+    model_info: Dict[str, Any],
+    total_elapsed: float,
+    num_workers: int,
+    num_structures_total: int,
+    num_structures_processed: int,
+) -> Dict[str, Any]:
+    """Calculate run metadata including timing, cost, and hardware info.
+
+    Args:
+        hardware_info: Output from get_hardware_info()
+        model_info: Output from extract_model_info()
+        total_elapsed: Total benchmark runtime in seconds
+        num_workers: Number of worker processes used
+        num_structures_total: Total structures in dataset
+        num_structures_processed: Structures processed in this run
+
+    Returns:
+        Complete run_metadata dictionary
+    """
+    throughput = num_structures_total / total_elapsed if total_elapsed > 0 else 0
+
+    # Calculate cost estimate
+    gpu_hourly_cost = (
+        get_gpu_hourly_cost(hardware_info["gpu_names"][0])
+        if hardware_info["gpu_names"]
+        else 0
+    )
+    total_gpu_hours = (total_elapsed / 3600) * num_workers
+    total_cost = total_gpu_hours * gpu_hourly_cost
+    cost_per_1k = (
+        (total_cost / num_structures_total) * 1000 if num_structures_total > 0 else 0
+    )
+
+    return {
+        "model": model_info,
+        "hardware": hardware_info,
+        "timing": {
+            "total_seconds": round(total_elapsed, 2),
+            "throughput_per_second": round(throughput, 3),
+            "num_workers": num_workers,
+        },
+        "cost": {
+            "gpu_hourly_rate_usd": gpu_hourly_cost,
+            "total_gpu_hours": round(total_gpu_hours, 4),
+            "estimated_cost_usd": round(total_cost, 4),
+            "estimated_cost_per_1000_structures_usd": round(cost_per_1k, 4),
+        },
+        "dataset": {
+            "num_structures_total": num_structures_total,
+            "num_structures_processed": num_structures_processed,
+        },
+    }

From 018bb369835c93edb9572cd4eb78c64efb3d76f2 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Mon, 8 Dec 2025 11:29:39 -0700
Subject: [PATCH 08/23] fix checkpoint path for remote calls

---
 .../examples/matbench_mace_multi_gpu.py       |  6 +-
 .../benchmarks/matbench_discovery/tasks.py    | 94 +++++++++++++------
 2 files changed, 67 insertions(+), 33 deletions(-)

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
index 787587c3..354ca456 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
@@ -14,14 +14,14 @@
 def create_mace_model(device):
     from mace.calculators import mace_mp
 
-    return mace_mp(model="medium", device=device, default_dtype="float64")
+    return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64")
 
 
 results = MatbenchDiscovery.IS2RE.remote(
     endpoint=ANVIL,
     user_endpoint_config={
         "scheduler_options": "#SBATCH --gpus-per-node=4\n",
-        "walltime": 3600,
+        "walltime": "05:00:00",
         "qos": "gpu",
         "partition": "gpu",
         "account": "cis250461-gpu",
@@ -35,7 +35,7 @@ def create_mace_model(device):
         "cuequivariance-torch",
         "cuequivariance-ops-torch-cu12",
     ],
-    num_structures=100,
+    checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json",
 )
 
 print(results)
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index 1b27b79a..7471b3bd 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -2,7 +2,6 @@
 # requires-python = ">=3.10"
 # dependencies = [
 #     "groundhog-hpc",
-#     "garden-ai",
 #     "ase",
 #     "numpy",
 #     "pandas",
@@ -348,9 +347,11 @@ def _load_dataset_common(
                 if x.split(".")[0].isdigit()
                 else float("inf"),
             )
-            num_structures = config.get("num_structures", 100)
-            if isinstance(num_structures, int):
-                file_list = file_list[:num_structures]
+            # Only limit structures if explicitly specified (not when using full dataset)
+            if "num_structures" in config:
+                num_structures = config["num_structures"]
+                if isinstance(num_structures, int):
+                    file_list = file_list[:num_structures]
         else:
             mat_id_set = set(mat_ids)
             file_list = [
@@ -864,20 +865,31 @@ def _extract_sources(self, kwargs):
 
         return kwargs
 
-    def _get_checkpoint_path_info(self, kwargs):
-        """Determine and return checkpoint path information from kwargs."""
+    def _get_checkpoint_info_for_display(self, kwargs, is_remote: bool):
+        """Get checkpoint information to display to the user.
+
+        Args:
+            kwargs: Method keyword arguments
+            is_remote: True if this is a remote/submit call, False for local
+
+        Returns:
+            Tuple of (display_message, checkpoint_identifier, is_resuming)
+        """
         checkpoint_path = kwargs.get("checkpoint_path")
         checkpoint_name = kwargs.get("checkpoint_name")
-        model_packages = kwargs.get("model_packages", "")
 
         if checkpoint_path:
-            return checkpoint_path, "resuming"
-        elif checkpoint_name:
-            final_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}")
-            return final_path, "new"
-        else:
-            # Generate checkpoint name using same logic as _run_task
-            num_structures = kwargs.get("num_structures", 100)
+            # User provided explicit path
+            if is_remote:
+                msg = f"Resuming from checkpoint on remote system: {checkpoint_path}"
+            else:
+                msg = f"Resuming from checkpoint: {checkpoint_path}"
+            return msg, checkpoint_path, True
+
+        # Generate checkpoint name
+        if not checkpoint_name:
+            model_packages = kwargs.get("model_packages", "")
+            num_structures = kwargs.get("num_structures", "full")
 
             # Determine subset string for checkpoint name
             subset = "full"
@@ -908,38 +920,60 @@ def _get_checkpoint_path_info(self, kwargs):
             checkpoint_name = (
                 f"matbench_{model_str}_{subset}_{timestamp}_{short_uuid}.json"
             )
-            final_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}")
-            return final_path, "new"
 
-    def _print_checkpoint_info(self, kwargs):
-        """Print checkpoint information before execution."""
-        checkpoint_path, checkpoint_type = self._get_checkpoint_path_info(kwargs)
+        # Construct display message
+        if is_remote:
+            msg = f"Checkpoint will be saved on remote system: ~/.garden/benchmarks/{checkpoint_name}"
+            identifier = f"~/.garden/benchmarks/{checkpoint_name}"
+        else:
+            local_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}")
+            msg = f"Checkpoint will be saved locally: {local_path}"
+            identifier = local_path
+
+        return msg, identifier, False
+
+    def _print_checkpoint_info(self, kwargs, is_remote: bool):
+        """Print checkpoint information before execution.
+
+        Args:
+            kwargs: Method keyword arguments
+            is_remote: True if this is a remote/submit call, False for local
+        """
+        msg, identifier, is_resuming = self._get_checkpoint_info_for_display(
+            kwargs, is_remote
+        )
 
         print("=" * 80)
-        if checkpoint_type == "resuming":
-            print(f"📂 Resuming from checkpoint: {checkpoint_path}")
+        if is_resuming:
+            print(f"📂 {msg}")
+        else:
+            print(f"💾 {msg}")
+
+        if is_remote:
+            print("   To resume this benchmark if it fails, use:")
+            print(f'   checkpoint_path="{identifier}"')
+            print("   Note: Checkpoint is on the remote system, not your local machine")
         else:
-            print(f"💾 Checkpoint will be saved to: {checkpoint_path}")
-        print("   To resume this benchmark if it fails, use:")
-        print(f'   checkpoint_path="{checkpoint_path}"')
+            print("   To resume this benchmark if it fails, use:")
+            print(f'   checkpoint_path="{identifier}"')
         print("=" * 80)
 
     def remote(self, *args, **kwargs):
         """Execute remotely with automatic source extraction."""
         kwargs = self._extract_sources(kwargs)
-        self._print_checkpoint_info(kwargs)
+        self._print_checkpoint_info(kwargs, is_remote=True)
         return self._hog_method.remote(*args, **kwargs)
 
     def local(self, *args, **kwargs):
         """Execute locally with automatic source extraction."""
         kwargs = self._extract_sources(kwargs)
-        self._print_checkpoint_info(kwargs)
+        self._print_checkpoint_info(kwargs, is_remote=False)
         return self._hog_method.local(*args, **kwargs)
 
     def submit(self, *args, **kwargs):
         """Submit for async execution with automatic source extraction."""
         kwargs = self._extract_sources(kwargs)
-        self._print_checkpoint_info(kwargs)
+        self._print_checkpoint_info(kwargs, is_remote=True)
         return self._hog_method.submit(*args, **kwargs)
 
     def __call__(self, *args, **kwargs):
@@ -1097,7 +1131,7 @@ def _run_task(
     def IS2RE(
         model_factory: Any,
         model_packages: str | List[str],
-        num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
+        num_structures: int | "DatasetSize" | "DatasetConfig" = "full",
         checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
         sys_path: List[str] | None = None,
@@ -1121,7 +1155,7 @@ def IS2RE(
     def RS2RE(
         model_factory: Any,
         model_packages: str | List[str],
-        num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
+        num_structures: int | "DatasetSize" | "DatasetConfig" = "full",
         checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
         sys_path: List[str] | None = None,
@@ -1145,7 +1179,7 @@ def RS2RE(
     def S2EFS(
         model_factory: Any,
         model_packages: str | List[str],
-        num_structures: int | "DatasetSize" | "DatasetConfig" = 100,
+        num_structures: int | "DatasetSize" | "DatasetConfig" = "full",
         checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
         sys_path: List[str] | None = None,

From cd62506e86cb1a0d31aff668afaa74d8a26d348d Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Tue, 9 Dec 2025 14:11:47 -0700
Subject: [PATCH 09/23] cleanup, remove old examples

---
 .../examples/dummy_model.py                   |  17 --
 .../examples/run_random_10k_benchmark.py      | 190 ----------------
 .../examples/test_hog_refactor.py             |  70 ------
 .../benchmarks/matbench_discovery/tasks.py    | 141 +-----------
 garden_ai/benchmarks/utils/remote.py          | 176 ---------------
 .../benchmarks/utils/remote_execution.py      | 202 ------------------
 garden_ai/benchmarks/utils/script_builder.py  | 117 ----------
 garden_ai/benchmarks/utils/task.py            | 132 ------------
 8 files changed, 2 insertions(+), 1043 deletions(-)
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py
 delete mode 100644 garden_ai/benchmarks/utils/remote.py
 delete mode 100644 garden_ai/benchmarks/utils/remote_execution.py
 delete mode 100644 garden_ai/benchmarks/utils/script_builder.py
 delete mode 100644 garden_ai/benchmarks/utils/task.py

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py b/garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py
deleted file mode 100644
index 745eb1b1..00000000
--- a/garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py
+++ /dev/null
@@ -1,17 +0,0 @@
-def create_dummy_model(device):
-    """Create a dummy calculator for testing."""
-    import numpy as np
-    from ase.calculators.calculator import Calculator, all_changes
-
-    class DummyCalc(Calculator):
-        implemented_properties = ["energy", "forces", "stress"]
-
-        def calculate(
-            self, atoms=None, properties=["energy"], system_changes=all_changes
-        ):
-            super().calculate(atoms, properties, system_changes)
-            self.results["energy"] = -1.0 * len(self.atoms)
-            self.results["forces"] = np.zeros((len(self.atoms), 3))
-            self.results["stress"] = np.zeros(6)
-
-    return DummyCalc()
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
deleted file mode 100644
index c171239e..00000000
--- a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py
+++ /dev/null
@@ -1,190 +0,0 @@
-#!/usr/bin/env python3
-"""
-Run Matbench Discovery benchmarks on 10k random structures.
-
-This script benchmarks MACE, MatterSim, and SevenNet on a random 10k
-sample from the unique prototypes subset and saves comprehensive metrics to JSON.
-"""
-
-import json
-from datetime import datetime
-from pathlib import Path
-
-from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
-
-# =============================================================================
-# Configuration
-# =============================================================================
-
-# Globus Compute endpoint
-ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
-
-# Common endpoint configuration
-ENDPOINT_CONFIG = {
-    "scheduler_options": "#SBATCH --gpus-per-node=4\n",
-    "walltime": 14400,  # 4 hours in seconds
-    "qos": "gpu",
-    "partition": "gpu",
-    "account": "cis250461-gpu",
-    "cores_per_node": 16,
-    "mem_per_node": 64,
-    "requirements": "",
-}
-
-# Output file for metrics
-OUTPUT_FILE = "random_10k_benchmark_results.json"
-
-
-# =============================================================================
-# Model Factory Functions
-# =============================================================================
-
-
-def create_mace_model(device):
-    """Create MACE model calculator."""
-    from mace.calculators import mace_mp
-
-    return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64")
-
-
-def create_mattersim_model(device):
-    """Create MatterSim model calculator."""
-    from mattersim.forcefield import MatterSimCalculator
-
-    return MatterSimCalculator(device=device)
-
-
-def create_sevennet_model(device):
-    """Create SevenNet model calculator."""
-    from sevenn.calculator import SevenNetCalculator
-
-    return SevenNetCalculator(model="7net-l3i5", device=device)
-
-
-# Model configurations
-MODELS = {
-    "MACE": {
-        "packages": [
-            "mace-torch",
-            "cuequivariance",
-            "cuequivariance-torch",
-            "cuequivariance-ops-torch-cu12",
-        ],
-        "factory": create_mace_model,
-    },
-    "MatterSim": {
-        "packages": ["mattersim"],
-        "factory": create_mattersim_model,
-    },
-    "SevenNet": {
-        "packages": ["sevenn"],
-        "factory": create_sevennet_model,
-    },
-}
-
-
-# =============================================================================
-# Run Benchmarks
-# =============================================================================
-
-
-def main():
-    """Run benchmarks on all models and save results."""
-
-    print("=" * 80)
-    print("Matbench Discovery Benchmark - Random 10k")
-    print("=" * 80)
-    print("Dataset: Random 10k from Unique Prototypes")
-    print(f"Models: {', '.join(MODELS.keys())}")
-    print(f"Endpoint: {ENDPOINT_ID}")
-    print("=" * 80)
-    print()
-
-    results = {
-        "metadata": {
-            "timestamp": datetime.now().isoformat(),
-            "dataset": "random_10k",
-            "dataset_size": 10000,
-            "endpoint_id": ENDPOINT_ID,
-        },
-        "models": {},
-    }
-
-    for model_name, config in MODELS.items():
-        print(f"\n{'=' * 80}")
-        print(f"Running {model_name}...")
-        print(f"{'=' * 80}\n")
-
-        try:
-            # Run benchmark using the new groundhog API
-            output = MatbenchDiscovery.IS2RE.remote(
-                endpoint=ENDPOINT_ID,
-                user_endpoint_config=ENDPOINT_CONFIG,
-                model_factory=config["factory"],
-                model_packages=config["packages"],
-                num_structures="random_10k",
-            )
-
-            # Store complete output (contains both metrics and per-structure results)
-            results["models"][model_name] = {
-                "status": "success",
-                **output,
-            }
-
-            # Display metrics
-            metrics = output.get("metrics", {})
-            if "error" in metrics:
-                print(f"❌ {model_name} failed: {metrics['error']}")
-                results["models"][model_name]["status"] = "failed"
-                results["models"][model_name]["error"] = metrics["error"]
-            else:
-                print(f"✅ {model_name} completed successfully!")
-                print(f"   F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
-                print(f"   DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
-                print(f"   MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
-                print(f"   RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
-                print(f"   Structures:     {metrics.get('num_evaluated', 'N/A')}")
-
-        except Exception as e:
-            print(f"❌ {model_name} error: {e}")
-            results["models"][model_name] = {
-                "status": "error",
-                "error": str(e),
-            }
-
-    # Save results to JSON
-    output_path = Path(OUTPUT_FILE)
-    with open(output_path, "w") as f:
-        json.dump(results, f, indent=2)
-
-    print(f"\n{'=' * 80}")
-    print("Benchmark Complete!")
-    print(f"{'=' * 80}")
-    print(f"\nResults saved to: {output_path.absolute()}")
-
-    # Print summary table
-    print(f"\n{'=' * 80}")
-    print("Summary")
-    print(f"{'=' * 80}\n")
-    print(f"{'Model':<15} {'Status':<10} {'F1':<10} {'DAF':<10} {'MAE':<10}")
-    print("-" * 80)
-
-    for model_name, data in results["models"].items():
-        if data["status"] == "success":
-            metrics = data["metrics"]
-            print(
-                f"{model_name:<15} {data['status']:<10} "
-                f"{metrics.get('F1', 0):<10.6f} "
-                f"{metrics.get('DAF', 0):<10.2f} "
-                f"{metrics.get('MAE', 0):<10.6f}"
-            )
-        else:
-            print(
-                f"{model_name:<15} {data['status']:<10} {'N/A':<10} {'N/A':<10} {'N/A':<10}"
-            )
-
-    print()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py b/garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py
deleted file mode 100644
index eccf0489..00000000
--- a/garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test Matbench Discovery refactor with Groundhog HPC.
-"""
-
-import os
-
-from dummy_model import create_dummy_model
-
-from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
-
-# =============================================================================
-# Configuration
-# =============================================================================
-
-# Globus Compute endpoint (use local if possible, or the one from example)
-ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
-
-# HPC endpoint configuration
-ENDPOINT_CONFIG = {
-    "account": "cis250461-gpu",
-    "partition": "gpu",
-    "qos": "gpu",
-    "scheduler_options": "#SBATCH --gpus-per-node=1\n",
-    "cores_per_node": 4,
-    "mem_per_node": 16,
-}
-
-# =============================================================================
-# Model Factory Functions
-# =============================================================================
-
-
-def main():
-    """Run benchmarks on all models and save results."""
-
-    print("=" * 80)
-    print("Matbench Discovery Test - Groundhog Refactor")
-    print("=" * 80)
-
-    print("Running LOCAL test...")
-
-    # Ensure subprocess can find dummy_model
-    cwd = os.getcwd()
-    os.environ["PYTHONPATH"] = cwd + os.pathsep + os.environ.get("PYTHONPATH", "")
-
-    try:
-        # Run locally using the new static method API
-        output = MatbenchDiscovery.IS2RE.local(
-            model_factory=create_dummy_model,
-            model_packages=["numpy", "ase"],  # Minimal deps
-            num_structures=1,
-            sys_path=[os.getcwd()],
-        )
-        print("Local run output keys:", output.keys())
-        if "error" in output.get("metrics", {}):
-            print("Local metrics error:", output["metrics"]["error"])
-        else:
-            print("Local run successful!")
-            print("Metrics:", output.get("metrics"))
-
-    except Exception as e:
-        print(f"Local run failed: {e}")
-        import traceback
-
-        traceback.print_exc()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index 7471b3bd..d087c06a 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -1,16 +1,3 @@
-# /// script
-# requires-python = ">=3.10"
-# dependencies = [
-#     "groundhog-hpc",
-#     "ase",
-#     "numpy",
-#     "pandas",
-#     "scikit-learn",
-#     "torch",
-#     "matbench-discovery",
-# ]
-# ///
-
 from __future__ import annotations
 
 import concurrent.futures
@@ -20,15 +7,14 @@
 import os
 import sys
 import time
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
 
 import groundhog_hpc as hog
 import numpy as np
 import pandas as pd
 from sklearn.metrics import r2_score
 
-# Ensure local modules can be imported during local execution
-sys.path.append(os.getcwd())
+from .metrics import stable_metrics
 
 if TYPE_CHECKING:
     from .enums import DatasetConfig, DatasetSize
@@ -61,7 +47,6 @@ def setup_device(gpu_id: Optional[int] = None) -> str:
 
 def convert_numpy_types(obj):
     """Convert numpy types to Python native types for JSON serialization."""
-    import numpy as np
 
     if isinstance(obj, (np.integer, np.floating)):
         return obj.item()
@@ -99,124 +84,6 @@ def _get_meta_metrics_source() -> str:
     return inspect.getsource(meta_metrics)
 
 
-# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/tree/main/matbench_discovery/metrics
-# Since they aren't setup to be easily imported, we just copy them here
-def classify_stable(
-    each_true: Sequence[float] | pd.Series | np.ndarray,
-    each_pred: Sequence[float] | pd.Series | np.ndarray,
-    *,
-    stability_threshold: float = 0.0,
-    fillna: bool = True,
-) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
-    if len(each_true) != len(each_pred):
-        raise ValueError(f"{len(each_true)=} != {len(each_pred)=}")
-
-    each_true_arr, each_pred_arr = pd.Series(each_true), pd.Series(each_pred)
-
-    if stability_threshold is None or np.isnan(stability_threshold):
-        raise ValueError("stability_threshold must be a real number")
-    actual_pos = each_true_arr <= (stability_threshold or 0)
-    actual_neg = each_true_arr > (stability_threshold or 0)
-
-    model_pos = each_pred_arr <= (stability_threshold or 0)
-    model_neg = each_pred_arr > (stability_threshold or 0)
-
-    if fillna:
-        nan_mask = np.isnan(each_pred)
-        model_pos[nan_mask] = False
-        model_neg[nan_mask] = True
-
-        n_pos, n_neg, total = model_pos.sum(), model_neg.sum(), len(each_pred)
-        if n_pos + n_neg != total:
-            raise ValueError(
-                f"after filling NaNs, the sum of positive ({n_pos}) and negative "
-                f"({n_neg}) predictions should add up to {total=}"
-            )
-
-    true_pos = actual_pos & model_pos
-    false_neg = actual_pos & model_neg
-    false_pos = actual_neg & model_pos
-    true_neg = actual_neg & model_neg
-
-    return true_pos, false_neg, false_pos, true_neg
-
-
-# This is also coptied from the matbench-discovery repo
-def stable_metrics(
-    each_true: Sequence[float] | pd.Series | np.ndarray,
-    each_pred: Sequence[float] | pd.Series | np.ndarray,
-    *,
-    stability_threshold: float = 0.0,
-    fillna: bool = True,
-    prevalence: float | None = None,
-) -> dict[str, float]:
-    n_true_pos, n_false_neg, n_false_pos, n_true_neg = map(
-        sum,
-        classify_stable(
-            each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna
-        ),
-    )
-
-    n_total_pos = n_true_pos + n_false_neg
-    n_total_neg = n_true_neg + n_false_pos
-    if prevalence is None:
-        prevalence = (
-            n_total_pos / (n_total_pos + n_total_neg)
-            if (n_total_pos + n_total_neg) > 0
-            else float("nan")
-        )
-    precision = (
-        n_true_pos / (n_true_pos + n_false_pos)
-        if (n_true_pos + n_false_pos) > 0
-        else float("nan")
-    )
-    recall = n_true_pos / n_total_pos if n_total_pos > 0 else float("nan")
-
-    TPR = recall
-    FPR = n_false_pos / n_total_neg if n_total_neg > 0 else float("nan")
-    TNR = n_true_neg / n_total_neg if n_total_neg > 0 else float("nan")
-    FNR = n_false_neg / n_total_pos if n_total_pos > 0 else float("nan")
-
-    if FPR > 0 and TNR > 0 and FPR + TNR != 1:
-        if abs(FPR + TNR - 1) > 1e-6:
-            raise ValueError(f"{FPR=} {TNR=} don't add up to 1")
-
-    if TPR > 0 and FNR > 0 and TPR + FNR != 1:
-        if abs(TPR + FNR - 1) > 1e-6:
-            raise ValueError(f"{TPR=} {FNR=} don't add up to 1")
-
-    is_nan = np.isnan(each_true) | np.isnan(each_pred)
-    each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan]
-
-    if precision + recall == 0:
-        f1_score = float("nan")
-    else:
-        f1_score = 2 * (precision * recall) / (precision + recall)
-
-    return dict(
-        F1=f1_score,
-        DAF=precision / prevalence if prevalence > 0 else float("nan"),
-        Precision=precision,
-        Recall=recall,
-        Accuracy=(
-            (n_true_pos + n_true_neg) / (n_total_pos + n_total_neg)
-            if (n_total_pos + n_total_neg > 0)
-            else float("nan")
-        ),
-        TPR=TPR,
-        FPR=FPR,
-        TNR=TNR,
-        FNR=FNR,
-        TP=n_true_pos,
-        FP=n_false_pos,
-        TN=n_true_neg,
-        FN=n_false_neg,
-        MAE=np.abs(each_true - each_pred).mean(),
-        RMSE=((each_true - each_pred) ** 2).mean() ** 0.5,
-        R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"),
-    )
-
-
 _MODEL_CACHE = None
 
 
@@ -299,7 +166,6 @@ def get_material_ids_for_subset(
     if subset_type == "full":
         return None
 
-    import pandas as pd
     from matbench_discovery.data import DataFiles
 
     df = pd.read_csv(DataFiles.wbm_summary.path)
@@ -470,7 +336,6 @@ def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]:
 def calculate_metrics_energy(
     results: Dict[str, Any], config: Dict[str, Any]
 ) -> Dict[str, Any]:
-    import numpy as np
     from matbench_discovery.data import df_wbm
 
     if len(results) == 0:
@@ -515,10 +380,8 @@ def calculate_metrics_forces(
     from io import TextIOWrapper
     from zipfile import ZipFile
 
-    import numpy as np
     from ase.io import read
     from matbench_discovery.data import DataFiles
-    from sklearn.metrics import r2_score
 
     metrics = {
         "energy_mae": [],
diff --git a/garden_ai/benchmarks/utils/remote.py b/garden_ai/benchmarks/utils/remote.py
deleted file mode 100644
index b9a4780e..00000000
--- a/garden_ai/benchmarks/utils/remote.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import json
-import logging
-import os
-import subprocess
-import sys
-import tempfile
-from pathlib import Path
-from typing import Any, Dict, List
-
-logger = logging.getLogger(__name__)
-
-
-class RemoteBenchmarkRunner:
-    """
-    Handles the setup and execution of benchmarks on remote Globus Compute endpoints.
-
-    This class manages:
-    1. Creating an isolated working directory
-    2. Setting up a Python environment using `uv`
-    3. Installing dependencies
-    4. Executing the benchmark script
-    5. Collecting results
-    """
-
-    def __init__(self, work_dir_prefix: str = "garden_benchmark_"):
-        self.work_dir = Path(tempfile.mkdtemp(prefix=work_dir_prefix))
-        self.uv_bin = None
-        self.venv_python = None
-        self.env = dict(os.environ)
-
-        # Configure logging if not already configured
-        if not logging.getLogger().handlers:
-            logging.basicConfig(
-                level=logging.INFO,
-                stream=sys.stdout,
-                force=True,
-                format="%(asctime)s [%(levelname)s] %(message)s",
-            )
-
-    def setup_environment(self, python_version: str = "3.11"):
-        """Find uv and create virtual environment."""
-        logger.info("Setting up environment...")
-
-        # Find UV binary
-        try:
-            self.uv_bin = subprocess.check_output(
-                [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True
-            ).strip()
-        except subprocess.CalledProcessError:
-            import shutil
-
-            self.uv_bin = shutil.which("uv")
-            if not self.uv_bin:
-                raise RuntimeError("Could not find uv binary. Please install uv.")
-
-        # Create UV virtual environment
-        subprocess.run(
-            [self.uv_bin, "venv", "--python", python_version],
-            cwd=self.work_dir,
-            check=True,
-            capture_output=True,
-        )
-
-        self.venv_python = self.work_dir / ".venv/bin/python"
-        if not self.venv_python.exists():
-            self.venv_python = (
-                self.work_dir / ".venv/Scripts/python.exe"
-            )  # Windows fallback
-
-        if not self.venv_python.exists():
-            raise RuntimeError(
-                f"Virtual environment python not found at {self.venv_python}"
-            )
-
-        # Set SSL cert file for HPC if needed
-        self._setup_ssl_cert()
-
-    def _setup_ssl_cert(self):
-        """Set SSL_CERT_FILE environment variable if certifi is available."""
-        try:
-            certifi_path = subprocess.check_output(
-                [str(self.venv_python), "-c", "import certifi; print(certifi.where())"],
-                text=True,
-            ).strip()
-            self.env["SSL_CERT_FILE"] = certifi_path
-        except Exception as e:
-            logger.warning(f"Failed to set SSL_CERT_FILE: {e}")
-
-    def install_dependencies(self, packages: List[str]):
-        """Install Python packages into the virtual environment."""
-        logger.info(f"Installing dependencies: {packages}")
-        if not self.uv_bin or not self.venv_python:
-            raise RuntimeError("Environment not setup. Call setup_environment() first.")
-
-        cmd = [
-            self.uv_bin,
-            "pip",
-            "install",
-            "--python",
-            str(self.venv_python),
-        ] + packages
-
-        subprocess.run(cmd, cwd=self.work_dir, check=True)
-
-    def run_benchmark(
-        self,
-        script_content: str,
-        config: Dict[str, Any],
-        script_name: str = "benchmark_runner.py",
-    ) -> Dict[str, Any]:
-        """
-        Execute the benchmark script.
-
-        Args:
-            script_content: The Python script to run.
-            config: Configuration dictionary to pass to the script (saved as config.json).
-            script_name: Filename for the script.
-
-        Returns:
-            Dictionary containing the results loaded from results.json.
-        """
-        if not self.venv_python:
-            raise RuntimeError("Environment not setup. Call setup_environment() first.")
-
-        logger.info("Preparing benchmark script...")
-
-        # Write runner script
-        runner_path = self.work_dir / script_name
-        runner_path.write_text(script_content)
-
-        # Write config
-        config_path = self.work_dir / "config.json"
-        with open(config_path, "w") as f:
-            json.dump(config, f, indent=2)
-
-        logger.info("Executing benchmark...")
-
-        # Run the runner script inside the venv
-        proc = subprocess.run(
-            [str(self.venv_python), str(runner_path), str(config_path)],
-            cwd=self.work_dir,
-            env=self.env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-            check=False,
-        )
-
-        if proc.returncode != 0:
-            raise RuntimeError(
-                f"Benchmark runner failed with return code {proc.returncode}"
-            )
-
-        logger.info("Collecting results...")
-        results_path = self.work_dir / "results.json"
-        if not results_path.exists():
-            raise RuntimeError(
-                "Results file not found - benchmark may have crashed silently"
-            )
-
-        with open(results_path) as f:
-            results = json.load(f)
-
-        logger.info("Benchmark completed successfully.")
-        return results
-
-    def cleanup(self):
-        """Remove the working directory."""
-        import shutil
-
-        shutil.rmtree(self.work_dir, ignore_errors=True)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.cleanup()
diff --git a/garden_ai/benchmarks/utils/remote_execution.py b/garden_ai/benchmarks/utils/remote_execution.py
deleted file mode 100644
index d6541695..00000000
--- a/garden_ai/benchmarks/utils/remote_execution.py
+++ /dev/null
@@ -1,202 +0,0 @@
-"""Generic remote execution utility for benchmarks.
-
-This module contains the `run_remote_benchmark` function which is designed to be
-serialized and executed on Globus Compute endpoints. It handles the boilerplate
-of setting up a Python environment, installing dependencies, and running a
-provided benchmark script.
-"""
-
-
-def run_remote_benchmark(
-    script_content: str,
-    dependencies: list[str],
-    config: dict,
-    checkpoint_name: str | None = None,
-    checkpoint_path: str | None = None,
-) -> dict:
-    """Run a generic benchmark script on a remote Globus Compute endpoint.
-
-    This function:
-    1. Creates a temporary working directory.
-    2. Sets up a Python environment using `uv`.
-    3. Installs the specified dependencies.
-    4. Writes the `script_content` to a file.
-    5. Writes the `config` to a JSON file.
-    6. Executes the script in the environment.
-    7. Returns the results from `results.json`.
-
-    Args:
-        script_content: The full Python script to execute.
-        dependencies: List of Python packages to install (e.g. ["numpy", "torch"]).
-        config: Dictionary of configuration parameters to pass to the script.
-                Written to `config.json`.
-        checkpoint_name: Name of the checkpoint file (e.g. "checkpoint_123.json").
-                         Saved to ~/.garden/benchmarks/.
-        checkpoint_path: Optional path to an existing checkpoint file to resume from.
-                         If provided, this path is used directly.
-
-    Returns:
-        The content of `results.json` produced by the script.
-    """
-    # All imports must be inside the function for serialization
-    import json
-    import logging
-    import os
-    import subprocess
-    import sys
-    import tempfile
-    from pathlib import Path
-
-    # Configure logging
-    logging.basicConfig(
-        level=logging.INFO,
-        stream=sys.stdout,
-        force=True,
-        format="%(asctime)s [%(levelname)s] %(message)s",
-    )
-    if hasattr(sys.stdout, "reconfigure"):
-        sys.stdout.reconfigure(line_buffering=True)
-
-    logger = logging.getLogger(__name__)
-
-    # Create isolated working directory
-    work_dir = Path(tempfile.mkdtemp(prefix="garden_benchmark_"))
-
-    try:
-        # ----------------------------------------------------------------------
-        # 1. ENVIRONMENT SETUP
-        # ----------------------------------------------------------------------
-        logger.info("Step 1/4: Setting up environment...")
-
-        # Find UV binary
-        try:
-            uv_bin = subprocess.check_output(
-                [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True
-            ).strip()
-        except subprocess.CalledProcessError:
-            import shutil
-
-            uv_bin = shutil.which("uv")
-            if not uv_bin:
-                raise RuntimeError("Could not find uv binary. Please install uv.")
-
-        # Create UV virtual environment
-        subprocess.run(
-            [uv_bin, "venv", "--python", "3.11"],
-            cwd=work_dir,
-            check=True,
-            capture_output=True,
-        )
-
-        venv_python = work_dir / ".venv/bin/python"
-        if not venv_python.exists():
-            venv_python = work_dir / ".venv/Scripts/python.exe"  # Windows fallback
-
-        if not venv_python.exists():
-            raise RuntimeError(f"Virtual environment python not found at {venv_python}")
-
-        # Install dependencies
-        logger.info(f"Installing dependencies: {dependencies}")
-        # Install in one go for better resolution
-        cmd = [uv_bin, "pip", "install", "--python", str(venv_python)] + dependencies
-        subprocess.run(
-            cmd,
-            cwd=work_dir,
-            check=True,
-        )
-
-        # Set SSL cert file for HPC if needed
-        env = dict(os.environ)
-
-        # Propagate common useful env vars if present
-        for key in ["MBD_AUTO_DOWNLOAD_FILES", "HF_TOKEN", "WANDB_API_KEY"]:
-            if key in os.environ:
-                env[key] = os.environ[key]
-
-        try:
-            certifi_path = subprocess.check_output(
-                [str(venv_python), "-c", "import certifi; print(certifi.where())"],
-                text=True,
-            ).strip()
-            env["SSL_CERT_FILE"] = certifi_path
-        except Exception as e:
-            logger.warning(f"Failed to set SSL_CERT_FILE: {e}")
-
-        # ----------------------------------------------------------------------
-        # 2. PREPARE BENCHMARK SCRIPT
-        # ----------------------------------------------------------------------
-        logger.info("Step 2/4: Preparing benchmark script...")
-
-        # Write runner script
-        runner_path = work_dir / "benchmark_runner.py"
-        runner_path.write_text(script_content)
-
-        # Determine checkpoint path
-        if checkpoint_path:
-            # User provided a specific path to resume from
-            final_checkpoint_path = checkpoint_path
-        elif checkpoint_name:
-            # Use persistent location in user home
-            checkpoint_dir = Path.home() / ".garden" / "benchmarks"
-            checkpoint_dir.mkdir(parents=True, exist_ok=True)
-            final_checkpoint_path = str(checkpoint_dir / checkpoint_name)
-        else:
-            # Fallback to tmp dir if no name provided (legacy behavior)
-            final_checkpoint_path = str(work_dir / "checkpoint.json")
-
-        config["checkpoint_path"] = final_checkpoint_path
-
-        # Log checkpoint path prominently for user reference
-        print(f"{'=' * 80}")
-        print(f"Checkpoint will be saved to: {final_checkpoint_path}")
-        print("To resume this job if it fails, use:")
-        print(f'  checkpoint_path="{final_checkpoint_path}"')
-        print(f"{'=' * 80}")
-
-        # Write config
-        config_path = work_dir / "config.json"
-        with open(config_path, "w") as f:
-            json.dump(config, f, indent=2)
-
-        # ----------------------------------------------------------------------
-        # 3. EXECUTE BENCHMARK
-        # ----------------------------------------------------------------------
-        logger.info("Step 3/4: Executing benchmark...")
-
-        # Run the runner script inside the venv
-        # DO NOT capture output - let it stream to stdout/stderr in real-time
-        # so we can see errors immediately
-        proc = subprocess.run(
-            [str(venv_python), str(runner_path), str(config_path)],
-            cwd=work_dir,
-            env=env,
-            check=False,  # Don't raise immediately, we'll check returncode
-        )
-
-        if proc.returncode != 0:
-            raise RuntimeError(
-                f"Benchmark runner failed with return code {proc.returncode}."
-            )
-
-        # ----------------------------------------------------------------------
-        # 4. COLLECT RESULTS
-        # ----------------------------------------------------------------------
-        logger.info("Step 4/4: Collecting results...")
-
-        results_path = work_dir / "results.json"
-        if not results_path.exists():
-            raise RuntimeError(
-                "Results file not found - benchmark may have crashed silently"
-            )
-
-        with open(results_path) as f:
-            results = json.load(f)
-
-        logger.info("Benchmark completed successfully.")
-        return results
-
-    finally:
-        # Cleanup working directory
-        import shutil
-
-        shutil.rmtree(work_dir, ignore_errors=True)
diff --git a/garden_ai/benchmarks/utils/script_builder.py b/garden_ai/benchmarks/utils/script_builder.py
deleted file mode 100644
index 9613923c..00000000
--- a/garden_ai/benchmarks/utils/script_builder.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import inspect
-from pathlib import Path
-from typing import Callable
-
-
-class BenchmarkScriptBuilder:
-    """Helper to build a self-contained benchmark script from a template."""
-
-    def __init__(self, template_path: str | Path = None):
-        if template_path is None:
-            # Default to the base_runner.py in templates
-            template_path = (
-                Path(__file__).parent.parent / "templates" / "base_runner.py"
-            )
-
-        self.template_path = Path(template_path)
-        self.imports = set()
-        self.functions = []
-        self.preamble = []
-        self.pep723_dependencies = []
-        self.pep723_requires_python = None
-
-    def add_import(self, import_stmt: str):
-        """Add an import statement (e.g. 'import numpy as np')."""
-        self.imports.add(import_stmt)
-        return self
-
-    def add_preamble(self, code: str):
-        """Add arbitrary code to the top of the script (after imports)."""
-        self.preamble.append(code)
-        return self
-
-    def add_pep723_metadata(
-        self, dependencies: list[str], requires_python: str = ">=3.10"
-    ):
-        """Add PEP 723 script metadata."""
-        self.pep723_dependencies.extend(dependencies)
-        self.pep723_requires_python = requires_python
-        return self
-
-    def add_function(self, func: Callable, name: str = None):
-        """Add a function definition to the script.
-
-        The function source code is inspected and appended.
-        If name is provided, the function definition is renamed.
-        """
-        source = inspect.getsource(func)
-
-        if name:
-            import re
-
-            # Replace 'def old_name(' with 'def new_name('
-            # This is a simple regex replacement, assuming standard formatting
-            pattern = r"def\s+" + func.__name__ + r"\s*\("
-            replacement = f"def {name}("
-            source = re.sub(pattern, replacement, source, count=1)
-
-        self.functions.append(source)
-        return self
-
-    def build(self) -> str:
-        """Assemble the final script."""
-        if not self.template_path.exists():
-            raise FileNotFoundError(f"Template not found at {self.template_path}")
-
-        template_content = self.template_path.read_text()
-
-        # Construct sections
-        imports_block = "\n".join(sorted(self.imports))
-        preamble_block = "\n".join(self.preamble)
-        functions_block = "\n\n".join(self.functions)
-
-        # We inject our custom code BEFORE the template's main execution logic
-        # but AFTER the template's own imports (which are inside the file).
-        # Actually, the template has imports at the top. We should probably prepend ours.
-
-        # Simple strategy: Prepend everything to the template, but the template
-        # has "USER DEFINED FUNCTIONS" placeholders. We can just append our functions
-        # before the main block?
-
-        # Better strategy: The template is designed to have functions injected.
-        # Let's just put imports at the top, then functions, then the template content.
-        # But we need to be careful about imports in the template.
-
-        # Construct PEP 723 block
-        pep723_block = ""
-        if self.pep723_dependencies or self.pep723_requires_python:
-            pep723_block = "# /// script\n"
-            if self.pep723_requires_python:
-                pep723_block += f'# requires-python = "{self.pep723_requires_python}"\n'
-            if self.pep723_dependencies:
-                deps_list = '",\n#     "'.join(self.pep723_dependencies)
-                pep723_block += f'# dependencies = [\n#     "{deps_list}",\n# ]\n'
-            pep723_block += "# ///\n"
-
-        final_script = f"""{pep723_block}
-# ------------------------------------------------------------------------------
-# INJECTED IMPORTS
-# ------------------------------------------------------------------------------
-{imports_block}
-
-# ------------------------------------------------------------------------------
-# INJECTED PREAMBLE
-# ------------------------------------------------------------------------------
-{preamble_block}
-
-# ------------------------------------------------------------------------------
-# INJECTED FUNCTIONS
-# ------------------------------------------------------------------------------
-{functions_block}
-
-# ------------------------------------------------------------------------------
-# BASE RUNNER TEMPLATE
-# ------------------------------------------------------------------------------
-{template_content}
-"""
-        return final_script
diff --git a/garden_ai/benchmarks/utils/task.py b/garden_ai/benchmarks/utils/task.py
deleted file mode 100644
index 4ca2fd60..00000000
--- a/garden_ai/benchmarks/utils/task.py
+++ /dev/null
@@ -1,132 +0,0 @@
-import json
-import subprocess
-import sys
-import tempfile
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-
-class BaseBenchmarkTask:
-    """
-    Base class for benchmark tasks.
-
-    Provides common utilities for:
-    - Extracting model metadata (package, factory, kwargs)
-    - Running benchmarks locally for testing
-    """
-
-    def __init__(
-        self, adapter, repo_url: str, repo_ref: str, model_package: Optional[str] = None
-    ):
-        self.adapter = adapter
-        self.repo_url = repo_url
-        self.repo_ref = repo_ref
-        self.model_package = model_package
-
-    def _extract_model_config(
-        self,
-        model: Any = None,
-        model_package: Optional[str] = None,
-        model_factory: Optional[str] = None,
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Dict[str, Any]:
-        """
-        Helper to resolve model configuration from either a local instance or explicit arguments.
-        """
-        model_checkpoint = None
-
-        if model is not None:
-            # Extract info from local model instance
-            if model_package is None:
-                if self.model_package is not None:
-                    model_package = self.model_package
-                else:
-                    # Infer from model's module
-                    model_package = model.__class__.__module__.split(".")[0]
-
-            if model_factory is None:
-                model_factory = model.__class__.__name__
-
-            # Get checkpoint path if model has one
-            if hasattr(model, "checkpoint_path"):
-                model_checkpoint = model.checkpoint_path
-            elif hasattr(model, "checkpoint"):
-                model_checkpoint = model.checkpoint
-
-            # Try to extract initialization kwargs if available
-            if model_kwargs is None and hasattr(model, "_init_kwargs"):
-                model_kwargs = model._init_kwargs
-
-        else:
-            # Must provide explicit construction info
-            if model_package is None or model_factory is None:
-                raise ValueError(
-                    "If model is not provided, must specify both "
-                    "model_package and model_factory"
-                )
-
-        if model_kwargs is None:
-            model_kwargs = {}
-
-        return {
-            "model_package": model_package,
-            "model_factory": model_factory,
-            "model_kwargs": model_kwargs,
-            "model_checkpoint": model_checkpoint,
-        }
-
-    def _run_local_wrapper(
-        self, runner_func_path: str, runner_func_name: str, config: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        """
-        Execute a benchmark runner function locally in a subprocess.
-
-        Args:
-            runner_func_path: Import path to the runner function (e.g. 'garden_ai.benchmarks.matbench_discovery.remote_runner')
-            runner_func_name: Name of the runner function (e.g. 'run_matbench_is2re')
-            config: Configuration dictionary to pass to the runner function.
-        """
-        results_file_path = (
-            Path(tempfile.gettempdir()) / f"benchmark_results_{id(config)}.json"
-        )
-
-        wrapper_script = f'''
-import json
-from {runner_func_path} import {runner_func_name}
-
-config = {repr(config)}
-results = {runner_func_name}(**config)
-
-with open("{results_file_path}", "w") as f:
-    json.dump(results, f, indent=2)
-'''
-
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
-            f.write(wrapper_script)
-            wrapper_path = f.name
-
-        try:
-            # Run without capturing output so logs stream to console in real-time
-            result = subprocess.run(
-                [sys.executable, wrapper_path],
-                timeout=3600,
-                stdout=None,
-                stderr=None,
-            )
-
-            if result.returncode != 0:
-                raise RuntimeError(
-                    f"Local benchmark failed with return code {result.returncode}"
-                )
-
-            if not results_file_path.exists():
-                raise RuntimeError(
-                    f"Benchmark results file not found at {results_file_path}"
-                )
-
-            with open(results_file_path) as f:
-                return json.load(f)
-
-        finally:
-            Path(wrapper_path).unlink(missing_ok=True)
-            results_file_path.unlink(missing_ok=True)

From b12f922551ab173b139463b21072106d5f97ecbe Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 09:15:08 -0700
Subject: [PATCH 10/23] fix checkpoint bug, clean up examples

---
 .../examples/local_execution.py               |  31 +++
 .../examples/matbench_equiformerv2.py         |  84 +------
 .../examples/matbench_mace_multi_gpu.py       |  64 +++---
 .../examples/matbench_mattersim.py            |  67 +-----
 .../examples/matbench_sevennet.py             |  68 +-----
 .../benchmarks/matbench_discovery/tasks.py    | 211 +++++++++++++++++-
 garden_ai/benchmarks/utils/meta_metrics.py    |   2 +
 7 files changed, 292 insertions(+), 235 deletions(-)
 create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/local_execution.py

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py b/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py
new file mode 100644
index 00000000..6414f5cf
--- /dev/null
+++ b/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+"""Matbench Discovery Benchmark - Local Execution Example"""
+
+from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
+
+
+def create_mattersim_model(device):
+    from mattersim.forcefield import MatterSimCalculator
+
+    return MatterSimCalculator(device=device)
+
+
+def main():
+    print("Running MatterSim benchmark locally...")
+
+    # Run IS2RE task locally
+    # Note: Requires a GPU or MPS if using MatterSim, or CPU if specified/supported
+    output = MatbenchDiscovery.IS2RE.local(
+        model_factory=create_mattersim_model,
+        model_packages="mattersim",
+        num_structures="random_100",
+    )
+
+    if "error" in output.get("metrics", {}):
+        print(f"Error: {output['metrics']['error']}")
+    else:
+        print("Benchmark Results:", output.get("metrics"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
index 7855f825..e877f230 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
@@ -1,39 +1,13 @@
 #!/usr/bin/env python3
-"""
-Matbench Discovery Benchmark - EquiformerV2 Example
-
-EquiformerV2 is an improved equivariant transformer from FAIR-Chem (formerly OCP).
-Paper: https://arxiv.org/abs/2306.12059
-GitHub: https://github.com/Open-Catalyst-Project/ocp
-
-Note: This example uses the S2EFS task (Structure to Energy, Forces, Stress)
-instead of IS2RE because EquiformerV2 doesn't support geometry relaxation.
-"""
+"""Matbench Discovery Benchmark - EquiformerV2 Example"""
 
 from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
-# =============================================================================
-# Configuration
-# =============================================================================
-
 # Globus Compute endpoint
 ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
 
-# =============================================================================
-# Model Factory
-# =============================================================================
-
-
 def create_equiformerv2_model(device):
-    """Create EquiformerV2 model calculator.
-
-    Args:
-        device: Device to load model on ("cuda" or "cpu")
-
-    Returns:
-        ASE calculator for EquiformerV2
-    """
     from fairchem.core.calculate.ase_calculator import Calculator
 
     # Use pre-trained checkpoint - will auto-download from HuggingFace
@@ -42,25 +16,15 @@ def create_equiformerv2_model(device):
     )
 
 
-# =============================================================================
-# Run Benchmark
-# =============================================================================
-
-
 def main():
-    """Run Matbench Discovery S2EFS benchmark with EquiformerV2."""
-
-    print("=" * 80)
-    print("Matbench Discovery S2EFS Benchmark - EquiformerV2")
-    print("=" * 80)
+    print(f"Running EquiformerV2 benchmark on endpoint {ENDPOINT_ID}...")
 
-    # Run S2EFS task using the new groundhog API
-    # S2EFS is suitable for EquiformerV2 which doesn't support relaxation
+    # Run S2EFS task (structure to energy/forces/stress)
     output = MatbenchDiscovery.S2EFS.remote(
         endpoint=ENDPOINT_ID,
         user_endpoint_config={
             "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n",
-            "walltime": 7200,  # 2 hours in seconds
+            "walltime": 7200,
             "qos": "gpu",
             "partition": "gpu-debug",
             "account": "cis250461-gpu",
@@ -73,44 +37,10 @@ def main():
         num_structures="random_100",
     )
 
-    # Display metrics
-    print()
-    print("=" * 80)
-    print("Benchmark Results")
-    print("=" * 80)
-
-    metrics = output.get("metrics", {})
-    if "error" in metrics:
-        print(f"Error: {metrics['error']}")
+    if "error" in output.get("metrics", {}):
+        print(f"Error: {output['metrics']['error']}")
     else:
-        # Energy metrics
-        if "energy_mae" in metrics:
-            print("Energy Metrics:")
-            print(f"  MAE (eV/atom):  {metrics.get('energy_mae', 'N/A'):.6f}")
-            print(f"  RMSE (eV/atom): {metrics.get('energy_rmse', 'N/A'):.6f}")
-            print(f"  R²:             {metrics.get('energy_r2', 'N/A'):.6f}")
-            print()
-
-        # Force metrics
-        if "force_mae" in metrics:
-            print("Force Metrics:")
-            print(f"  MAE (eV/Å):     {metrics.get('force_mae', 'N/A'):.6f}")
-            print(f"  RMSE (eV/Å):    {metrics.get('force_rmse', 'N/A'):.6f}")
-            print(f"  R²:             {metrics.get('force_r2', 'N/A'):.6f}")
-            print()
-
-        # Stress metrics
-        if "stress_mae" in metrics:
-            print("Stress Metrics:")
-            print(f"  MAE (GPa):      {metrics.get('stress_mae', 'N/A'):.6f}")
-            print(f"  RMSE (GPa):     {metrics.get('stress_rmse', 'N/A'):.6f}")
-            print(f"  R²:             {metrics.get('stress_r2', 'N/A'):.6f}")
-            print()
-
-        if "num_evaluated" in metrics:
-            print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
-
-    print("=" * 80)
+        print("Benchmark Results:", output.get("metrics"))
 
 
 if __name__ == "__main__":
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
index 354ca456..4fadc7f5 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
@@ -1,10 +1,5 @@
-"""Test Matbench Discovery benchmark on Anvil HPC.
-
-This script demonstrates running the IS2RE benchmark with a subset of structures
-using multi-GPU parallelization on a Globus Compute endpoint.
-"""
-
-from rich import print
+#!/usr/bin/env python3
+"""Matbench Discovery Benchmark - MACE Multi-GPU Example"""
 
 from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
@@ -17,25 +12,36 @@ def create_mace_model(device):
     return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64")
 
 
-results = MatbenchDiscovery.IS2RE.remote(
-    endpoint=ANVIL,
-    user_endpoint_config={
-        "scheduler_options": "#SBATCH --gpus-per-node=4\n",
-        "walltime": "05:00:00",
-        "qos": "gpu",
-        "partition": "gpu",
-        "account": "cis250461-gpu",
-        "cores_per_node": 16,
-        "requirements": "",  # 'requirements' is required for Anvil endpoint
-    },
-    model_factory=create_mace_model,
-    model_packages=[
-        "mace-torch",
-        "cuequivariance",
-        "cuequivariance-torch",
-        "cuequivariance-ops-torch-cu12",
-    ],
-    checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json",
-)
-
-print(results)
+def main():
+    print(f"Running MACE benchmark on endpoint {ANVIL}...")
+
+    results = MatbenchDiscovery.IS2RE.remote(
+        endpoint=ANVIL,
+        user_endpoint_config={
+            "scheduler_options": "#SBATCH --gpus-per-node=4\n",
+            "walltime": "05:00:00",
+            "qos": "gpu",
+            "partition": "gpu",
+            "account": "cis250461-gpu",
+            "cores_per_node": 16,
+            "requirements": "",  # 'requirements' is required for Anvil endpoint
+        },
+        model_factory=create_mace_model,
+        model_packages=[
+            "mace-torch",
+            "cuequivariance",
+            "cuequivariance-torch",
+            "cuequivariance-ops-torch-cu12",
+        ],
+        checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json",
+        num_structures="random_100",
+    )
+
+    if "error" in results.get("metrics", {}):
+        print(f"Error: {results['metrics']['error']}")
+    else:
+        print("Benchmark Results:", results.get("metrics"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
index 22099e9e..8a7636ba 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
@@ -1,59 +1,27 @@
 #!/usr/bin/env python3
-"""
-Matbench Discovery Benchmark - MatterSim Example
-
-MatterSim is a deep learning atomistic model for general material simulations.
-Paper: https://arxiv.org/abs/2405.04967
-GitHub: https://github.com/microsoft/mattersim
-"""
+"""Matbench Discovery Benchmark - MatterSim Example"""
 
 from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
-# =============================================================================
-# Configuration
-# =============================================================================
-
 # Globus Compute endpoint
 ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
 
-# =============================================================================
-# Model Factory
-# =============================================================================
-
-
 def create_mattersim_model(device):
-    """Create MatterSim model calculator.
-
-    Args:
-        device: Device to load model on ("cuda" or "cpu")
-
-    Returns:
-        ASE calculator for MatterSim
-    """
     from mattersim.forcefield import MatterSimCalculator
 
     return MatterSimCalculator(device=device)
 
 
-# =============================================================================
-# Run Benchmark
-# =============================================================================
-
-
 def main():
-    """Run Matbench Discovery IS2RE benchmark with MatterSim."""
-
-    print("=" * 80)
-    print("Matbench Discovery IS2RE Benchmark - MatterSim")
-    print("=" * 80)
+    print(f"Running MatterSim benchmark on endpoint {ENDPOINT_ID}...")
 
-    # Run IS2RE task using the new groundhog API
+    # Run IS2RE task
     output = MatbenchDiscovery.IS2RE.remote(
         endpoint=ENDPOINT_ID,
         user_endpoint_config={
             "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n",
-            "walltime": 7200,  # 2 hours in seconds
+            "walltime": 7200,
             "qos": "gpu",
             "partition": "gpu-debug",
             "account": "cis250461-gpu",
@@ -66,31 +34,10 @@ def main():
         num_structures="random_100",
     )
 
-    # Display metrics
-    print()
-    print("=" * 80)
-    print("Benchmark Results")
-    print("=" * 80)
-
-    metrics = output.get("metrics", {})
-    if "error" in metrics:
-        print(f"Error: {metrics['error']}")
+    if "error" in output.get("metrics", {}):
+        print(f"Error: {output['metrics']['error']}")
     else:
-        # Discovery metrics
-        print(f"F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
-        print(f"DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
-        print(f"Precision:      {metrics.get('Precision', 'N/A'):.6f}")
-        print(f"Recall:         {metrics.get('Recall', 'N/A'):.6f}")
-        print(f"Accuracy:       {metrics.get('Accuracy', 'N/A'):.6f}")
-        print()
-        # Regression metrics
-        print(f"MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
-        print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
-        print(f"R²:             {metrics.get('R2', 'N/A'):.6f}")
-        print()
-        print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
-
-    print("=" * 80)
+        print("Benchmark Results:", output.get("metrics"))
 
 
 if __name__ == "__main__":
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
index e24d0d69..411c64e1 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
@@ -1,60 +1,27 @@
 #!/usr/bin/env python3
-"""
-Matbench Discovery Benchmark - SevenNet Example
-
-This script demonstrates running the Matbench Discovery IS2RE benchmark
-using SevenNet as the MLIP model on a remote Globus Compute endpoint.
-
-SevenNet is a graph neural network potential with good transferability.
-"""
+"""Matbench Discovery Benchmark - SevenNet Example"""
 
 from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
-# =============================================================================
-# Configuration
-# =============================================================================
-
 # Globus Compute endpoint (replace with your endpoint UUID)
 ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
 
 
-# =============================================================================
-# Model Factory
-# =============================================================================
-
-
 def create_sevennet_model(device):
-    """Create SevenNet model calculator.
-
-    Args:
-        device: Device to load model on ("cuda" or "cpu")
-
-    Returns:
-        ASE calculator for SevenNet
-    """
     from sevenn.calculator import SevenNetCalculator
 
     return SevenNetCalculator(model="7net-0", device=device)
 
 
-# =============================================================================
-# Run Benchmark
-# =============================================================================
-
-
 def main():
-    """Run Matbench Discovery IS2RE benchmark with SevenNet."""
+    print(f"Running SevenNet benchmark on endpoint {ENDPOINT_ID}...")
 
-    print("=" * 80)
-    print("Matbench Discovery IS2RE Benchmark - SevenNet")
-    print("=" * 80)
-
-    # Run IS2RE task using the new groundhog API
+    # Run IS2RE task
     output = MatbenchDiscovery.IS2RE.remote(
         endpoint=ENDPOINT_ID,
         user_endpoint_config={
             "scheduler_options": "#SBATCH --gpus-per-node=2\n",
-            "walltime": 7200,  # 2 hours in seconds
+            "walltime": 7200,
             "qos": "gpu",
             "partition": "gpu-debug",
             "account": "cis250461-gpu",
@@ -67,31 +34,10 @@ def main():
         num_structures="random_100",
     )
 
-    # Display metrics
-    print()
-    print("=" * 80)
-    print("Benchmark Results")
-    print("=" * 80)
-
-    metrics = output.get("metrics", {})
-    if "error" in metrics:
-        print(f"Error: {metrics['error']}")
+    if "error" in output.get("metrics", {}):
+        print(f"Error: {output['metrics']['error']}")
     else:
-        # Discovery metrics
-        print(f"F1 Score:       {metrics.get('F1', 'N/A'):.6f}")
-        print(f"DAF:            {metrics.get('DAF', 'N/A'):.2f}x")
-        print(f"Precision:      {metrics.get('Precision', 'N/A'):.6f}")
-        print(f"Recall:         {metrics.get('Recall', 'N/A'):.6f}")
-        print(f"Accuracy:       {metrics.get('Accuracy', 'N/A'):.6f}")
-        print()
-        # Regression metrics
-        print(f"MAE (eV/atom):  {metrics.get('MAE', 'N/A'):.6f}")
-        print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}")
-        print(f"R²:             {metrics.get('R2', 'N/A'):.6f}")
-        print()
-        print(f"Structures:     {metrics.get('num_evaluated', 'N/A')}")
-
-    print("=" * 80)
+        print("Benchmark Results:", output.get("metrics"))
 
 
 if __name__ == "__main__":
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index d087c06a..9d59b708 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -1,3 +1,16 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "groundhog-hpc",
+#     "ase",
+#     "numpy",
+#     "pandas",
+#     "scikit-learn",
+#     "torch",
+#     "matbench-discovery",
+# ]
+# ///
+
 from __future__ import annotations
 
 import concurrent.futures
@@ -7,17 +20,48 @@
 import os
 import sys
 import time
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Sequence
 
 import groundhog_hpc as hog
 import numpy as np
 import pandas as pd
 from sklearn.metrics import r2_score
 
-from .metrics import stable_metrics
 
-if TYPE_CHECKING:
-    from .enums import DatasetConfig, DatasetSize
+class DatasetSize(str, Enum):
+    """Predefined dataset sizes for Matbench Discovery benchmarks.
+
+    These correspond to different subsets of the WBM test set that are commonly
+    used for evaluating materials discovery models.
+    """
+
+    FULL = "full"
+    """Full WBM test set (~257k structures)"""
+
+    UNIQUE_PROTOS = "unique_protos"
+    """Unique prototypes subset (~215k structures) - removes duplicate prototypes"""
+
+    RANDOM_10K = "random_10k"
+    """Random 10k structures from the unique prototypes subset (fixed seed)"""
+
+    RANDOM_100 = "random_100"
+    """Random 100 structures for quick testing (fixed seed)"""
+
+    def seed(self, seed: int) -> "DatasetConfig":
+        """Return a configuration with a custom random seed."""
+        return DatasetConfig(self, seed)
+
+
+class DatasetConfig:
+    """Configuration for a dataset subset with a specific random seed."""
+
+    def __init__(self, subset: DatasetSize, seed: int):
+        self.subset = subset
+        self.seed = seed
+
+    def __repr__(self):
+        return f"{self.subset.name}(seed={self.seed})"
 
 
 def setup_logging():
@@ -47,7 +91,6 @@ def setup_device(gpu_id: Optional[int] = None) -> str:
 
 def convert_numpy_types(obj):
     """Convert numpy types to Python native types for JSON serialization."""
-
     if isinstance(obj, (np.integer, np.floating)):
         return obj.item()
     elif isinstance(obj, np.ndarray):
@@ -87,6 +130,124 @@ def _get_meta_metrics_source() -> str:
 _MODEL_CACHE = None
 
 
+# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/tree/main/matbench_discovery/metrics
+# Since they aren't setup to be easily imported, we just copy them here
+def classify_stable(
+    each_true: Sequence[float] | pd.Series | np.ndarray,
+    each_pred: Sequence[float] | pd.Series | np.ndarray,
+    *,
+    stability_threshold: float = 0.0,
+    fillna: bool = True,
+) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
+    if len(each_true) != len(each_pred):
+        raise ValueError(f"{len(each_true)=} != {len(each_pred)=}")
+
+    each_true_arr, each_pred_arr = pd.Series(each_true), pd.Series(each_pred)
+
+    if stability_threshold is None or np.isnan(stability_threshold):
+        raise ValueError("stability_threshold must be a real number")
+    actual_pos = each_true_arr <= (stability_threshold or 0)
+    actual_neg = each_true_arr > (stability_threshold or 0)
+
+    model_pos = each_pred_arr <= (stability_threshold or 0)
+    model_neg = each_pred_arr > (stability_threshold or 0)
+
+    if fillna:
+        nan_mask = np.isnan(each_pred)
+        model_pos[nan_mask] = False
+        model_neg[nan_mask] = True
+
+        n_pos, n_neg, total = model_pos.sum(), model_neg.sum(), len(each_pred)
+        if n_pos + n_neg != total:
+            raise ValueError(
+                f"after filling NaNs, the sum of positive ({n_pos}) and negative "
+                f"({n_neg}) predictions should add up to {total=}"
+            )
+
+    true_pos = actual_pos & model_pos
+    false_neg = actual_pos & model_neg
+    false_pos = actual_neg & model_pos
+    true_neg = actual_neg & model_neg
+
+    return true_pos, false_neg, false_pos, true_neg
+
+
+# This is also coptied from the matbench-discovery repo
+def stable_metrics(
+    each_true: Sequence[float] | pd.Series | np.ndarray,
+    each_pred: Sequence[float] | pd.Series | np.ndarray,
+    *,
+    stability_threshold: float = 0.0,
+    fillna: bool = True,
+    prevalence: float | None = None,
+) -> dict[str, float]:
+    n_true_pos, n_false_neg, n_false_pos, n_true_neg = map(
+        sum,
+        classify_stable(
+            each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna
+        ),
+    )
+
+    n_total_pos = n_true_pos + n_false_neg
+    n_total_neg = n_true_neg + n_false_pos
+    if prevalence is None:
+        prevalence = (
+            n_total_pos / (n_total_pos + n_total_neg)
+            if (n_total_pos + n_total_neg) > 0
+            else float("nan")
+        )
+    precision = (
+        n_true_pos / (n_true_pos + n_false_pos)
+        if (n_true_pos + n_false_pos) > 0
+        else float("nan")
+    )
+    recall = n_true_pos / n_total_pos if n_total_pos > 0 else float("nan")
+
+    TPR = recall
+    FPR = n_false_pos / n_total_neg if n_total_neg > 0 else float("nan")
+    TNR = n_true_neg / n_total_neg if n_total_neg > 0 else float("nan")
+    FNR = n_false_neg / n_total_pos if n_total_pos > 0 else float("nan")
+
+    if FPR > 0 and TNR > 0 and FPR + TNR != 1:
+        if abs(FPR + TNR - 1) > 1e-6:
+            raise ValueError(f"{FPR=} {TNR=} don't add up to 1")
+
+    if TPR > 0 and FNR > 0 and TPR + FNR != 1:
+        if abs(TPR + FNR - 1) > 1e-6:
+            raise ValueError(f"{TPR=} {FNR=} don't add up to 1")
+
+    is_nan = np.isnan(each_true) | np.isnan(each_pred)
+    each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan]
+
+    if precision + recall == 0:
+        f1_score = float("nan")
+    else:
+        f1_score = 2 * (precision * recall) / (precision + recall)
+
+    return dict(
+        F1=f1_score,
+        DAF=precision / prevalence if prevalence > 0 else float("nan"),
+        Precision=precision,
+        Recall=recall,
+        Accuracy=(
+            (n_true_pos + n_true_neg) / (n_total_pos + n_total_neg)
+            if (n_total_pos + n_total_neg > 0)
+            else float("nan")
+        ),
+        TPR=TPR,
+        FPR=FPR,
+        TNR=TNR,
+        FNR=FNR,
+        TP=n_true_pos,
+        FP=n_false_pos,
+        TN=n_true_neg,
+        FN=n_false_neg,
+        MAE=np.abs(each_true - each_pred).mean(),
+        RMSE=((each_true - each_pred) ** 2).mean() ** 0.5,
+        R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"),
+    )
+
+
 def _process_batch_common(
     batch_id: int,
     structures: List[Any],
@@ -584,7 +745,12 @@ def run_benchmark_hog(
     try:
         import torch
 
-        num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+        if torch.cuda.is_available():
+            num_gpus = torch.cuda.device_count()
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            num_gpus = 1
+        else:
+            num_gpus = 0
     except ImportError:
         num_gpus = 0
 
@@ -599,6 +765,10 @@ def run_benchmark_hog(
     available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores
     threads_per_worker = max(1, available_cores // num_workers)
 
+    # MPS (Apple Silicon) performance degrades with high thread counts due to contention
+    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        threads_per_worker = 1
+
     logger.info(
         f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)"
     )
@@ -665,6 +835,10 @@ def run_benchmark_hog(
                     logger.info(f"Checkpoint saved to {checkpoint_path}")
                 except Exception as e:
                     logger.error(f"Failed to save checkpoint: {e}")
+                    raise RuntimeError(
+                        f"Critical: Failed to save checkpoint to {checkpoint_path}. "
+                        f"Aborting to prevent loss of progress. Error: {e}"
+                    ) from e
 
             elapsed = time.time() - chunk_start
             logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s")
@@ -963,8 +1137,17 @@ def _run_task(
             )
 
         if checkpoint_path:
-            print(f"Resuming from checkpoint: {checkpoint_path}")
-            final_checkpoint_path = checkpoint_path
+            # Always expand tilde to home directory
+            final_checkpoint_path = os.path.expanduser(checkpoint_path)
+            if os.path.exists(final_checkpoint_path):
+                print(f"Resuming from checkpoint: {final_checkpoint_path}")
+            else:
+                print(
+                    f"WARNING: Checkpoint file not found at {final_checkpoint_path}. "
+                    f"Starting fresh and will save checkpoints to this path."
+                )
+                # Ensure directory exists for new checkpoint
+                os.makedirs(os.path.dirname(final_checkpoint_path), exist_ok=True)
         else:
             print(
                 f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}"
@@ -974,6 +1157,18 @@ def _run_task(
             )
             os.makedirs(os.path.dirname(final_checkpoint_path), exist_ok=True)
 
+        # Validate we can write to the checkpoint path early to fail fast
+        try:
+            test_file = final_checkpoint_path + ".write_test"
+            with open(test_file, "w") as f:
+                f.write("test")
+            os.remove(test_file)
+        except Exception as e:
+            raise RuntimeError(
+                f"Cannot write to checkpoint path: {final_checkpoint_path}. "
+                f"Check permissions and disk space. Error: {e}"
+            ) from e
+
         runner_config["checkpoint_path"] = final_checkpoint_path
 
         # meta_metrics_source is injected by BenchmarkMethod wrapper
diff --git a/garden_ai/benchmarks/utils/meta_metrics.py b/garden_ai/benchmarks/utils/meta_metrics.py
index e18120d4..481a2b35 100644
--- a/garden_ai/benchmarks/utils/meta_metrics.py
+++ b/garden_ai/benchmarks/utils/meta_metrics.py
@@ -60,6 +60,8 @@ def get_hardware_info() -> Dict[str, Any]:
                 info["gpu_memory_gb"] = round(props.total_memory / (1024**3), 1)
         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
             info["device_type"] = "mps"
+            info["num_gpus"] = 1
+            info["gpu_names"] = ["Apple Metal Performance Shaders"]
     except ImportError:
         pass
     return info

From 5192800ed0c91158c84812c7e098d36400e31f36 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 09:24:41 -0700
Subject: [PATCH 11/23] bump groundhog version

---
 pyproject.toml | 2 +-
 uv.lock        | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6b3d1706..dec7f17a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,7 @@ dependencies = [
     # used transitively by modal -> grpclib, force 4.3.0 to reslove CVE-2025-57804
     # Can remove once we upgrade to more current modal sdk version
     "h2>=4.3.0",
-    "groundhog-hpc>=0.5.0",
+    "groundhog-hpc>=0.5.6",
 ]
 
 [project.optional-dependencies]
diff --git a/uv.lock b/uv.lock
index 0d176456..6a2e9a3e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1098,7 +1098,7 @@ requires-dist = [
     { name = "gitpython", specifier = ">=3.1.35,<4.0.0" },
     { name = "globus-compute-sdk", specifier = ">=4.0.0" },
     { name = "globus-sdk", specifier = ">=3.34.0,<4.0.0" },
-    { name = "groundhog-hpc", specifier = ">=0.5.0" },
+    { name = "groundhog-hpc", specifier = ">=0.5.6" },
     { name = "h2", specifier = ">=4.3.0" },
     { name = "huggingface-hub", specifier = "==0.18.0" },
     { name = "ipython", specifier = "<8.13" },
@@ -1251,7 +1251,7 @@ wheels = [
 
 [[package]]
 name = "groundhog-hpc"
-version = "0.5.4"
+version = "0.5.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "globus-compute-sdk" },
@@ -1265,9 +1265,9 @@ dependencies = [
     { name = "typer" },
     { name = "uv" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ea/4a/79c3bef59e0e4e538875949cec290d26cf5cefd601135e190723f9fc89de/groundhog_hpc-0.5.4.tar.gz", hash = "sha256:1f9ef486a6b62a3f28168689425b9b838c1abe92d76291a392299c70a4f5a0ec", size = 31705, upload-time = "2025-11-06T23:31:08.795Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/5a/e7/adf855aaded946d2cff12851320c7b53114fed40d4b833efaf0081bc3aea/groundhog_hpc-0.5.6.tar.gz", hash = "sha256:cc5a25c0dfc6a0ddc641e631cc7dae1466e81b8f24984f102eb8300cf6340b42", size = 32346, upload-time = "2025-12-09T18:49:49.554Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ea/c4/abece517b27357edc102891244233e68a12de57c0cee7bd4a404ad86bb74/groundhog_hpc-0.5.4-py3-none-any.whl", hash = "sha256:287c91211f2d64fb89b84b3be0cd26611cd3c1a29c76b83efe1219f3ad8fc53f", size = 44364, upload-time = "2025-11-06T23:31:07.512Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/13/702590a7f6064c01609379225c678b42e6c1a56e72e85a8d14a23ec9213a/groundhog_hpc-0.5.6-py3-none-any.whl", hash = "sha256:d6347031c1f779e24379fd9619ca59dc2dce8df521fcd0c6cb51b00a7e807eab", size = 45086, upload-time = "2025-12-09T18:49:50.346Z" },
 ]
 
 [[package]]

From 060865ca8ddc7b0b80d9c50992731385b43ba880 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 13:17:24 -0700
Subject: [PATCH 12/23] calculate metrics if the given checkpoint file has
 finished all of the materials

---
 .../benchmarks/matbench_discovery/tasks.py     | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index 9d59b708..c4d5e0e3 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -724,7 +724,21 @@ def run_benchmark_hog(
     ]
 
     if not items_to_process:
-        logger.info("All items already processed!")
+        logger.info(
+            "All items already processed! Calculating metrics from checkpoint..."
+        )
+
+        # Calculate metrics from checkpoint results
+        try:
+            metrics = calc_metrics_fn(results, config)
+            logger.info(f"Metrics calculated: {metrics}")
+        except Exception as e:
+            logger.error(f"Failed to calculate metrics: {e}")
+            import traceback
+
+            traceback.print_exc()
+            metrics = {"error": f"Metrics calculation failed: {e}"}
+
         run_metadata = calculate_run_metadata(
             hardware_info=hardware_info,
             model_info=model_info,
@@ -733,7 +747,7 @@ def run_benchmark_hog(
             num_structures_total=len(all_items),
             num_structures_processed=0,
         )
-        return {"metrics": {}, "run_metadata": run_metadata}
+        return {"metrics": metrics, "run_metadata": run_metadata}
 
     logger.info(f"Processing {len(items_to_process)} remaining items")
 

From 8eeefb297aca329a8ffb566107499d7f3ecdae82 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 13:43:08 -0700
Subject: [PATCH 13/23] remote unused files

---
 .../benchmarks/matbench_discovery/README.md   | 285 ------------------
 .../examples/matbench_mace_multi_gpu.py       |   1 -
 .../benchmarks/matbench_discovery/metrics.py  | 193 ------------
 3 files changed, 479 deletions(-)
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/README.md
 delete mode 100644 garden_ai/benchmarks/matbench_discovery/metrics.py

diff --git a/garden_ai/benchmarks/matbench_discovery/README.md b/garden_ai/benchmarks/matbench_discovery/README.md
deleted file mode 100644
index 4273cd02..00000000
--- a/garden_ai/benchmarks/matbench_discovery/README.md
+++ /dev/null
@@ -1,285 +0,0 @@
-# Matbench Discovery Benchmark Adapter
-
-Minimal viable implementation for running [Matbench Discovery](https://matbench-discovery.materialsproject.org/) benchmarks on remote HPC systems via Globus Compute.
-
-## Overview
-
-This adapter enables Garden AI users to benchmark their materials models against the Matbench Discovery test suite without manually managing HPC jobs, environment setup, or data transfers.
-
-### Current Status: MVP
-
-**Implemented:**
-- ✅ IS2RE (Initial Structure to Relaxed Energy) task
-- ✅ Remote environment setup with UV
-- ✅ Automatic dependency installation
-- ✅ Basic metric calculation
-- ✅ Multi-GPU parallelization (automatic GPU detection and work distribution)
-
-**Future Work:**
-- ⏳ Additional tasks (RS2RE, S2EFS, thermal conductivity)
-- ⏳ Globus Transfer for model weights and large datasets
-- ⏳ Checkpointing and failure recovery
-- ⏳ Full metric calculation against DFT ground truth
-- ⏳ Backend integration for result publishing
-
-## Architecture
-
-```
-User's Machine                    Remote HPC Endpoint
-├─ MatbenchDiscovery             ├─ Clone matbench-discovery repo
-│  ├─ tasks.IS2RE                │  ├─ Set up UV virtual environment
-│  └─ Globus Compute Executor ───┼─>├─ Install dependencies
-                                 │  │  ├─ matbench-discovery
-                                 │  │  └─ model package (e.g., mace-torch)
-                                 │  ├─ Load test structures via DataFiles
-                                 │  ├─ Run structure relaxations
-                                 │  ├─ Calculate metrics
-                                 │  └─ Return results
-```
-
-## File Structure
-
-```
-matbench_discovery/
-├── __init__.py         # Main adapter class (MatbenchDiscovery)
-├── tasks.py            # Task implementations (IS2RETask)
-├── remote_runner.py    # Remote execution functions
-├── enums.py            # Task enumerations
-├── example.py          # Usage example
-└── README.md           # This file
-```
-
-## Usage
-
-### Basic Example
-
-```python
-from garden_ai.benchmarks import MatbenchDiscovery
-from my_model import MyModel
-
-# Configure endpoint
-endpoint_id = "your-endpoint-uuid"
-endpoint_config = {
-    "account": "project-account",
-    "partition": "gpu-debug",
-    "scheduler_options": "#SBATCH --gpus-per-node=1"
-}
-
-# Run benchmark
-with MatbenchDiscovery(endpoint_id, endpoint_config) as bench:
-    model = MyModel()
-    task = bench.tasks.IS2RE
-
-    # Submit job (returns immediately)
-    future = task.submit(model, num_structures=100)
-
-    # Wait for completion
-    results = future.result()
-
-    # Calculate metrics
-    metrics = task.calculate_metrics(results)
-    print(metrics)
-```
-
-### Multi-GPU Parallelization
-
-The adapter automatically detects and uses all available GPUs on the compute node for parallel processing. This significantly improves throughput for large-scale benchmarks.
-
-**Example: 4-GPU Configuration on Anvil**
-
-```python
-from garden_ai.benchmarks import MatbenchDiscovery
-
-endpoint_id = "your-endpoint-uuid"
-endpoint_config = {
-    "account": "your-account",
-    "qos": "gpu",
-    "partition": "gpu",
-    "scheduler_options": "#SBATCH --gpus-per-node=4\n#SBATCH --time=4:00:00\n#SBATCH --mem=64G",
-    "worker_init": "pip install --user uv",
-}
-
-with MatbenchDiscovery(endpoint_id, endpoint_config) as bench:
-    task = bench.tasks.IS2RE
-
-    # Multi-GPU is enabled by default
-    future = task.submit(
-        model_package="mace-torch",
-        model_factory="mace_mp",
-        model_kwargs={"model": "medium", "device": "cuda"},
-        num_structures=1000,
-        use_multi_gpu=True,  # Default: True
-    )
-
-    results = future.result()
-    metrics = task.calculate_metrics(results)
-```
-
-**How it works:**
-1. Automatically detects available GPUs using `torch.cuda.device_count()`
-2. Splits structures into equal batches (one per GPU)
-3. Processes batches in parallel using multiprocessing
-4. Aggregates results from all workers
-
-**Performance expectations:**
-- **Single GPU**: ~10-20 structures/hour (baseline)
-- **4 GPUs**: ~3-4x speedup (~40-80 structures/hour)
-- Actual performance depends on model complexity and structure size
-
-**Disabling multi-GPU:**
-```python
-future = task.submit(
-    model_package="mace-torch",
-    model_factory="mace_mp",
-    model_kwargs={"model": "medium", "device": "cuda"},
-    num_structures=100,
-    use_multi_gpu=False,  # Use single GPU/CPU
-)
-```
-
-### Scaling Guide
-
-**Recommended test progression:**
-
-1. **Small test (10-100 structures)**: Verify setup and model compatibility
-   - Partition: `gpu-debug`
-   - Time: 30 minutes
-   - GPUs: 1-4
-
-2. **Medium test (1000 structures)**: Test multi-GPU parallelization
-   - Partition: `gpu`
-   - Time: 4 hours
-   - GPUs: 4
-   - Expected throughput: ~250-300 structures/hour with 4 GPUs
-
-3. **Full dataset (~257k structures)**: Production run
-   - Partition: `gpu`
-   - Time: 48+ hours
-   - GPUs: 4
-   - Consider implementing checkpointing for runs >24 hours
-
-### Model Requirements
-
-For the MVP, models must:
-
-1. **Be pip-installable** (or provide package name)
-2. **Implement ASE calculator interface** (or be convertible to one)
-3. **Have a checkpoint file** (optional, can be None for models with default weights)
-
-Example model:
-
-```python
-class MyModel:
-    def __init__(self):
-        self.checkpoint_path = "/path/to/checkpoint.pt"
-
-    # ASE calculator interface
-    def calculate(self, atoms, properties, system_changes):
-        # Calculate energy, forces, stress
-        ...
-```
-
-### Workflow Details
-
-When you call `task.submit(model)`:
-
-1. **Model introspection**: Extracts model class name, module, and checkpoint path
-2. **Remote submission**: Sends job to Globus Compute endpoint
-3. **Environment setup** (on remote):
-   - Clones matbench-discovery repository
-   - Creates Python 3.11 virtual environment with UV
-   - Installs matbench-discovery package
-   - Installs model package (e.g., `pip install mace-torch`)
-4. **Benchmark execution**:
-   - Loads test structures using `DataFiles.wbm_initial_structures`
-   - Instantiates model and loads checkpoint
-   - Runs geometry optimizations (ASE FIRE optimizer)
-   - Collects results
-5. **Result return**: Returns energies, convergence stats, and failures
-
-## Configuration Options
-
-### MatbenchDiscovery
-
-```python
-MatbenchDiscovery(
-    endpoint_id="uuid",           # Required: Globus Compute endpoint
-    user_endpoint_config=dict,     # Optional: HPC scheduler config
-    repo_ref="main",               # Optional: Git ref to use
-    model_package="mace-torch"     # Optional: Default model package
-)
-```
-
-### IS2RETask.submit()
-
-```python
-task.submit(
-    model,                         # Required: Model instance
-    num_structures=100,            # Optional: Number of structures to test
-    model_package="mace-torch",    # Optional: Override default package
-    use_multi_gpu=True,            # Optional: Enable multi-GPU (default: True)
-)
-```
-
-## Design Decisions
-
-### Why UV?
-- Fast, deterministic installs
-- Handles both `pyproject.toml` and `requirements.txt`
-- Built-in venv creation with specific Python versions
-
-### Why DataFiles auto-download?
-- Avoids manual Globus Transfer setup for MVP
-- Matbench's DataFiles handles caching automatically
-- Can optimize with explicit transfer later
-
-### Why ASE calculator interface?
-- Standard in materials modeling community
-- Most interatomic potentials support it (MACE, M3GNet, CHGNet, etc.)
-- Simple adaptation layer if needed
-
-### Why multiprocessing for multi-GPU?
-- Simple and effective for within-node parallelization
-- Avoids CUDA initialization issues with fork
-- Each GPU gets isolated process with dedicated memory
-- Easy to debug and monitor per-GPU progress
-
-## Limitations
-
-1. **No weight transfer**: Model checkpoints must be accessible from remote (URL or shared filesystem)
-2. **Basic metrics**: Only reports convergence stats, not comparison to DFT ground truth
-3. **IS2RE only**: Other tasks not yet implemented
-4. **No checkpointing**: If job fails, must restart from scratch (recommended for runs >24 hours)
-5. **No result publishing**: Backend integration not yet implemented
-6. **Single-node parallelization**: Multi-GPU works within a node; SLURM array jobs for multi-node not yet implemented
-
-## Next Steps
-
-To generalize beyond Matbench:
-
-1. **Extract base classes**: `BenchmarkAdapter`, `BenchmarkTask`, `RemoteRunner`
-2. **Add data staging**: Implement Globus Transfer for weights/datasets
-3. **Define model interface**: Standard protocol for model serialization
-4. **Add checkpointing**: Save intermediate results for failure recovery
-5. **Implement batching**: Distribute work across SLURM array jobs
-
-## Testing
-
-```bash
-# Install dependencies
-cd garden_ai/benchmarks/matbench_discovery
-pip install -e .
-
-# Update example.py with your endpoint details
-vim example.py
-
-# Run example
-python example.py
-```
-
-## References
-
-- [Matbench Discovery](https://matbench-discovery.materialsproject.org/)
-- [Matbench Discovery GitHub](https://github.com/janosh/matbench-discovery)
-- [Globus Compute](https://globus-compute.readthedocs.io/)
-- [ASE Calculator Interface](https://wiki.fysik.dtu.dk/ase/ase/calculators/calculators.html)
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
index 4fadc7f5..e0fb0003 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
@@ -34,7 +34,6 @@ def main():
             "cuequivariance-ops-torch-cu12",
         ],
         checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json",
-        num_structures="random_100",
     )
 
     if "error" in results.get("metrics", {}):
diff --git a/garden_ai/benchmarks/matbench_discovery/metrics.py b/garden_ai/benchmarks/matbench_discovery/metrics.py
deleted file mode 100644
index c08bad2d..00000000
--- a/garden_ai/benchmarks/matbench_discovery/metrics.py
+++ /dev/null
@@ -1,193 +0,0 @@
-"""Functions to classify energy above convex hull predictions as true/false
-positive/negative and compute performance metrics.
-
-Adapted from matbench-discovery to avoid import issues.
-Original source: https://github.com/janosh/matbench-discovery/blob/main/matbench_discovery/metrics/discovery.py
-"""
-
-from collections.abc import Sequence
-
-import numpy as np
-import pandas as pd
-from sklearn.metrics import r2_score
-
-# Default stability threshold from matbench-discovery
-# STABILITY_THRESHOLD = 0.0
-
-
-def classify_stable(
-    each_true: Sequence[float] | pd.Series | np.ndarray,
-    each_pred: Sequence[float] | pd.Series | np.ndarray,
-    *,
-    stability_threshold: float = 0.0,
-    fillna: bool = True,
-) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
-    """Classify model stability predictions as true/false positive/negatives (usually
-    w.r.t DFT-ground truth labels). All energies are assumed to be in eV/atom
-    (but shouldn't really matter as long as they're consistent).
-
-    Args:
-        each_true (Sequence[float] | pd.Series): Ground truth energy above convex hull
-            values.
-        each_pred (Sequence[float] | pd.Series): Model-predicted energy above convex
-            hull values.
-        stability_threshold (float, optional): Maximum energy above convex hull
-            for a material to still be considered stable. Usually 0, 0.05 or 0.1.
-            Defaults to 0.0, meaning a material has to be directly on
-            the hull to be called stable. Negative values mean a material has to pull
-            the known hull down by that amount to count as stable. Few materials lie
-            below the known hull, so only negative values very close to 0 make sense.
-        fillna (bool): Whether to fill NaNs as the model predicting unstable. Defaults
-            to True.
-
-    Returns:
-        tuple[TP, FN, FP, TN]: Indices as pd.Series for true positives,
-            false negatives, false positives and true negatives (in this order).
-
-    Raises:
-        ValueError: If sum of positive + negative preds doesn't add up to the total.
-    """
-    if len(each_true) != len(each_pred):
-        raise ValueError(f"{len(each_true)=} != {len(each_pred)=}")
-
-    each_true_arr, each_pred_arr = pd.Series(each_true), pd.Series(each_pred)
-
-    if stability_threshold is None or np.isnan(stability_threshold):
-        raise ValueError("stability_threshold must be a real number")
-    actual_pos = each_true_arr <= (stability_threshold or 0)
-    actual_neg = each_true_arr > (stability_threshold or 0)
-
-    model_pos = each_pred_arr <= (stability_threshold or 0)
-    model_neg = each_pred_arr > (stability_threshold or 0)
-
-    if fillna:
-        nan_mask = np.isnan(each_pred)
-        # for in both the model's stable and unstable preds, fill NaNs as unstable
-        model_pos[nan_mask] = False
-        model_neg[nan_mask] = True
-
-        n_pos, n_neg, total = model_pos.sum(), model_neg.sum(), len(each_pred)
-        if n_pos + n_neg != total:
-            raise ValueError(
-                f"after filling NaNs, the sum of positive ({n_pos}) and negative "
-                f"({n_neg}) predictions should add up to {total=}"
-            )
-
-    true_pos = actual_pos & model_pos
-    false_neg = actual_pos & model_neg
-    false_pos = actual_neg & model_pos
-    true_neg = actual_neg & model_neg
-
-    return true_pos, false_neg, false_pos, true_neg
-
-
-def stable_metrics(
-    each_true: Sequence[float] | pd.Series | np.ndarray,
-    each_pred: Sequence[float] | pd.Series | np.ndarray,
-    *,
-    stability_threshold: float = 0.0,
-    fillna: bool = True,
-    prevalence: float | None = None,
-) -> dict[str, float]:
-    """Get a dictionary of stability prediction metrics. Mostly binary classification
-    metrics, but also MAE, RMSE and R2.
-
-    Args:
-        each_true (Sequence[float] | pd.Series): true energy above convex hull
-        each_pred (Sequence[float] | pd.Series): predicted energy above convex hull
-        stability_threshold (float): Where to place stability threshold relative to
-            convex hull in eV/atom, usually 0 or 0.1 eV. Default = 0.0.
-        fillna (bool): Whether to fill NaNs as the model predicting unstable. Defaults
-            to True.
-        prevalence (float, optional): Prevalence of stable materials in the dataset.
-            If None, calculated from the input data. Defaults to None.
-
-    Note: Should give equivalent classification metrics to
-        sklearn.metrics.classification_report(
-            each_true > stability_threshold,
-            each_pred > stability_threshold,
-            output_dict=True,
-        )
-        when using the same stability_threshold.
-
-    Returns:
-        dict[str, float]: dictionary of classification metrics with keys DAF, Precision,
-            Recall, Accuracy, F1, TPR, FPR, TNR, FNR, MAE, RMSE, R2.
-
-    Raises:
-        ValueError: If FPR + TNR don't add up to 1.
-        ValueError: If TPR + FNR don't add up to 1.
-    """
-    n_true_pos, n_false_neg, n_false_pos, n_true_neg = map(
-        sum,
-        classify_stable(
-            each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna
-        ),
-    )
-
-    n_total_pos = n_true_pos + n_false_neg
-    n_total_neg = n_true_neg + n_false_pos
-    # prevalence: dummy discovery rate of stable crystals by selecting randomly from
-    # all materials
-    if prevalence is None:
-        prevalence = (
-            n_total_pos / (n_total_pos + n_total_neg)
-            if (n_total_pos + n_total_neg) > 0
-            else float("nan")
-        )
-    # Calculate ratios with guards against division by zero
-    precision = (
-        n_true_pos / (n_true_pos + n_false_pos)
-        if (n_true_pos + n_false_pos) > 0
-        else float("nan")
-    )
-    recall = n_true_pos / n_total_pos if n_total_pos > 0 else float("nan")
-
-    TPR = recall
-    FPR = n_false_pos / n_total_neg if n_total_neg > 0 else float("nan")
-    TNR = n_true_neg / n_total_neg if n_total_neg > 0 else float("nan")
-    FNR = n_false_neg / n_total_pos if n_total_pos > 0 else float("nan")
-
-    # sanity check: false positives + true negatives = all negatives
-    if FPR > 0 and TNR > 0 and FPR + TNR != 1:
-        # Floating point tolerance
-        if abs(FPR + TNR - 1) > 1e-6:
-            raise ValueError(f"{FPR=} {TNR=} don't add up to 1")
-
-    # sanity check: true positives + false negatives = all positives
-    if TPR > 0 and FNR > 0 and TPR + FNR != 1:
-        # Floating point tolerance
-        if abs(TPR + FNR - 1) > 1e-6:
-            raise ValueError(f"{TPR=} {FNR=} don't add up to 1")
-
-    # Drop NaNs to calculate regression metrics
-    is_nan = np.isnan(each_true) | np.isnan(each_pred)
-    each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan]
-
-    if precision + recall == 0:  # Calculate F1 score, handling division by zero
-        f1_score = float("nan")
-    else:
-        f1_score = 2 * (precision * recall) / (precision + recall)
-
-    return dict(
-        F1=f1_score,
-        DAF=precision / prevalence if prevalence > 0 else float("nan"),
-        Precision=precision,
-        Recall=recall,
-        Accuracy=(
-            (n_true_pos + n_true_neg) / (n_total_pos + n_total_neg)
-            if (n_total_pos + n_total_neg > 0)
-            else float("nan")
-        ),
-        TPR=TPR,
-        FPR=FPR,
-        TNR=TNR,
-        FNR=FNR,
-        TP=n_true_pos,
-        FP=n_false_pos,
-        TN=n_true_neg,
-        FN=n_false_neg,
-        MAE=np.abs(each_true - each_pred).mean(),
-        RMSE=((each_true - each_pred) ** 2).mean() ** 0.5,
-        R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"),
-    )

From 97d7291e0b78cc3341295a9b0d174012433c9d1d Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 14:05:25 -0700
Subject: [PATCH 14/23] fix type errors

---
 .../examples/matbench_equiformerv2.py         |  4 +-
 .../benchmarks/matbench_discovery/tasks.py    | 63 +++++++++++--------
 garden_ai/benchmarks/utils/meta_metrics.py    |  9 ++-
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
index e877f230..afd3b2d5 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
@@ -8,7 +8,9 @@
 
 
 def create_equiformerv2_model(device):
-    from fairchem.core.calculate.ase_calculator import Calculator
+    from fairchem.core.calculate.ase_calculator import (
+        Calculator,  # type: ignore[import-not-found]
+    )
 
     # Use pre-trained checkpoint - will auto-download from HuggingFace
     return Calculator(
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index c4d5e0e3..25c045a0 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -25,8 +25,8 @@
 
 import groundhog_hpc as hog
 import numpy as np
-import pandas as pd
-from sklearn.metrics import r2_score
+import pandas as pd  # type: ignore[import-untyped]
+from sklearn.metrics import r2_score  # type: ignore[import-untyped]
 
 
 class DatasetSize(str, Enum):
@@ -111,7 +111,7 @@ def convert_numpy_types(obj):
 def _inject_meta_metrics(source: str) -> None:
     """Inject meta_metrics functions from source code for remote execution."""
     global get_hardware_info, extract_model_info, calculate_run_metadata
-    namespace = {}
+    namespace: Dict[str, Any] = {}
     exec(source, namespace)
     get_hardware_info = namespace["get_hardware_info"]
     extract_model_info = namespace["extract_model_info"]
@@ -287,7 +287,7 @@ def _process_batch_common(
             func_name = func_name_match.group(1)
 
             # Execute the source to define the function
-            local_namespace = {}
+            local_namespace: Dict[str, Any] = {}
             exec(model_factory_source, local_namespace)
             model_factory = local_namespace[func_name]
 
@@ -327,7 +327,7 @@ def get_material_ids_for_subset(
     if subset_type == "full":
         return None
 
-    from matbench_discovery.data import DataFiles
+    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
 
     df = pd.read_csv(DataFiles.wbm_summary.path)
 
@@ -353,7 +353,7 @@ def _load_dataset_common(
     config: Dict[str, Any],
     zip_path: str,
     read_format: str = "extxyz",
-    read_index: str | slice = None,
+    read_index: Optional[str | slice] = None,
 ) -> List[Any]:
     from io import TextIOWrapper
     from zipfile import ZipFile
@@ -395,7 +395,9 @@ def _load_dataset_common(
                     elif not isinstance(atoms_list, list):
                         structures.append((filename, atoms_list))
                 else:
-                    structures.append((filename, read(text_stream, format=read_format)))
+                    structures.append(
+                        (filename, read(text_stream, format=read_format))  # type: ignore[arg-type]
+                    )
 
     return structures
 
@@ -477,19 +479,19 @@ def compute(model, atoms):
 
 
 def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Any]:
-    from matbench_discovery.data import DataFiles
+    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
 
     return _load_dataset_common(config, DataFiles.wbm_initial_atoms.path)
 
 
 def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Any]:
-    from matbench_discovery.data import DataFiles
+    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
 
     return _load_dataset_common(config, DataFiles.wbm_relaxed_atoms.path)
 
 
 def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]:
-    from matbench_discovery.data import DataFiles
+    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
 
     return _load_dataset_common(config, DataFiles.mp_trj_extxyz.path, read_index=":")
 
@@ -497,7 +499,7 @@ def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]:
 def calculate_metrics_energy(
     results: Dict[str, Any], config: Dict[str, Any]
 ) -> Dict[str, Any]:
-    from matbench_discovery.data import df_wbm
+    from matbench_discovery.data import df_wbm  # type: ignore[import-untyped]
 
     if len(results) == 0:
         return {"error": "No results to evaluate"}
@@ -542,9 +544,9 @@ def calculate_metrics_forces(
     from zipfile import ZipFile
 
     from ase.io import read
-    from matbench_discovery.data import DataFiles
+    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
 
-    metrics = {
+    metrics: Dict[str, List[float]] = {
         "energy_mae": [],
         "energy_rmse": [],
         "force_mae": [],
@@ -552,9 +554,12 @@ def calculate_metrics_forces(
         "stress_mae": [],
         "stress_rmse": [],
     }
-    all_e_pred, all_e_true = [], []
-    all_f_pred, all_f_true = [], []
-    all_s_pred, all_s_true = [], []
+    all_e_pred: List[float] = []
+    all_e_true: List[float] = []
+    all_f_pred: List[float] = []
+    all_f_true: List[float] = []
+    all_s_pred: List[float] = []
+    all_s_true: List[float] = []
 
     zip_path = DataFiles.mp_trj_extxyz.path
 
@@ -569,8 +574,8 @@ def calculate_metrics_forces(
                     gt_atoms = atoms_list[-1]
 
                     e_pred = res["energy"]
-                    e_true = gt_atoms.get_potential_energy()
-                    n_atoms = len(gt_atoms)
+                    e_true = gt_atoms.get_potential_energy()  # type: ignore[union-attr]
+                    n_atoms = len(gt_atoms)  # type: ignore[arg-type]
                     energy_error = abs(e_pred - e_true) / n_atoms
                     metrics["energy_mae"].append(energy_error)
                     metrics["energy_rmse"].append(energy_error**2)
@@ -578,7 +583,7 @@ def calculate_metrics_forces(
                     all_e_true.append(e_true / n_atoms)
 
                     f_pred = np.array(res["forces"])
-                    f_true = gt_atoms.get_forces()
+                    f_true = gt_atoms.get_forces()  # type: ignore[union-attr]
                     force_error = np.abs(f_pred - f_true)
                     metrics["force_mae"].append(force_error.mean())
                     metrics["force_rmse"].append((force_error**2).mean())
@@ -586,7 +591,7 @@ def calculate_metrics_forces(
                     all_f_true.extend(f_true.flatten())
 
                     s_pred = np.array(res["stress"])
-                    s_true = gt_atoms.get_stress()
+                    s_true = gt_atoms.get_stress()  # type: ignore[union-attr]
                     stress_error = np.abs(s_pred - s_true)
                     metrics["stress_mae"].append(stress_error.mean())
                     metrics["stress_rmse"].append((stress_error**2).mean())
@@ -644,6 +649,8 @@ def run_benchmark_hog(
     _inject_meta_metrics(meta_metrics_source)
 
     # Collect hardware and model info
+    assert get_hardware_info is not None, "meta_metrics not injected"
+    assert extract_model_info is not None, "meta_metrics not injected"
     hardware_info = get_hardware_info()
     model_info = extract_model_info(model_packages)
     logger.info(f"Hardware: {hardware_info}")
@@ -739,6 +746,7 @@ def run_benchmark_hog(
             traceback.print_exc()
             metrics = {"error": f"Metrics calculation failed: {e}"}
 
+        assert calculate_run_metadata is not None, "meta_metrics not injected"
         run_metadata = calculate_run_metadata(
             hardware_info=hardware_info,
             model_info=model_info,
@@ -771,7 +779,7 @@ def run_benchmark_hog(
     use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
     # Use sched_getaffinity to get cores available to this job, not total cores on node
     try:
-        total_cores = len(os.sched_getaffinity(0))
+        total_cores = len(os.sched_getaffinity(0))  # type: ignore[attr-defined]
     except AttributeError:
         # Fallback for systems without sched_getaffinity (e.g., macOS)
         total_cores = os.cpu_count() or 1
@@ -872,6 +880,7 @@ def run_benchmark_hog(
         metrics = {"error": f"Metrics calculation failed: {e}"}
 
     # Calculate run metadata
+    assert calculate_run_metadata is not None, "meta_metrics not injected"
     run_metadata = calculate_run_metadata(
         hardware_info=hardware_info,
         model_info=model_info,
@@ -1067,8 +1076,8 @@ def _prepare_runner_config(
                 else:
                     seed = num_structures.seed
         elif hasattr(num_structures, "subset"):  # DatasetConfig
-            subset = num_structures.subset.value
-            seed = num_structures.seed
+            subset = num_structures.subset.value  # type: ignore[union-attr]
+            seed = num_structures.seed  # type: ignore[union-attr]
         elif isinstance(num_structures, int):
             subset = "full"
             # We handle int as limit in load_dataset
@@ -1111,7 +1120,7 @@ def _generate_checkpoint_name(
     def _run_task(
         model_factory: Any,
         model_packages: str | List[str],
-        num_structures: int | "DatasetSize" | "DatasetConfig",
+        num_structures: int | str | DatasetSize | DatasetConfig,
         checkpoint_name: str | None,
         checkpoint_path: str | None,
         process_fn: Any,
@@ -1203,7 +1212,7 @@ def _run_task(
     def IS2RE(
         model_factory: Any,
         model_packages: str | List[str],
-        num_structures: int | "DatasetSize" | "DatasetConfig" = "full",
+        num_structures: int | str | DatasetSize | DatasetConfig = "full",
         checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
         sys_path: List[str] | None = None,
@@ -1227,7 +1236,7 @@ def IS2RE(
     def RS2RE(
         model_factory: Any,
         model_packages: str | List[str],
-        num_structures: int | "DatasetSize" | "DatasetConfig" = "full",
+        num_structures: int | str | DatasetSize | DatasetConfig = "full",
         checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
         sys_path: List[str] | None = None,
@@ -1251,7 +1260,7 @@ def RS2RE(
     def S2EFS(
         model_factory: Any,
         model_packages: str | List[str],
-        num_structures: int | "DatasetSize" | "DatasetConfig" = "full",
+        num_structures: int | str | DatasetSize | DatasetConfig = "full",
         checkpoint_name: str | None = None,
         checkpoint_path: str | None = None,
         sys_path: List[str] | None = None,
diff --git a/garden_ai/benchmarks/utils/meta_metrics.py b/garden_ai/benchmarks/utils/meta_metrics.py
index 481a2b35..951d549f 100644
--- a/garden_ai/benchmarks/utils/meta_metrics.py
+++ b/garden_ai/benchmarks/utils/meta_metrics.py
@@ -51,11 +51,10 @@ def get_hardware_info() -> Dict[str, Any]:
 
         if torch.cuda.is_available():
             info["device_type"] = "cuda"
-            info["num_gpus"] = torch.cuda.device_count()
-            info["gpu_names"] = [
-                torch.cuda.get_device_name(i) for i in range(info["num_gpus"])
-            ]
-            if info["num_gpus"] > 0:
+            num_gpus = torch.cuda.device_count()
+            info["num_gpus"] = num_gpus
+            info["gpu_names"] = [torch.cuda.get_device_name(i) for i in range(num_gpus)]
+            if num_gpus > 0:
                 props = torch.cuda.get_device_properties(0)
                 info["gpu_memory_gb"] = round(props.total_memory / (1024**3), 1)
         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():

From 0c2997bca3b3d64c383b81954ea6c21527e4a36a Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 14:08:32 -0700
Subject: [PATCH 15/23] appease mypy

---
 .../examples/matbench_equiformerv2.py            |  4 +---
 garden_ai/benchmarks/matbench_discovery/tasks.py | 16 ++++++++--------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
index afd3b2d5..eb4d64ea 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
@@ -8,9 +8,7 @@
 
 
 def create_equiformerv2_model(device):
-    from fairchem.core.calculate.ase_calculator import (
-        Calculator,  # type: ignore[import-not-found]
-    )
+    from fairchem.core.calculate.ase_calculator import Calculator  # type: ignore
 
     # Use pre-trained checkpoint - will auto-download from HuggingFace
     return Calculator(
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index 25c045a0..45ec9701 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -25,8 +25,8 @@
 
 import groundhog_hpc as hog
 import numpy as np
-import pandas as pd  # type: ignore[import-untyped]
-from sklearn.metrics import r2_score  # type: ignore[import-untyped]
+import pandas as pd  # type: ignore
+from sklearn.metrics import r2_score  # type: ignore
 
 
 class DatasetSize(str, Enum):
@@ -327,7 +327,7 @@ def get_material_ids_for_subset(
     if subset_type == "full":
         return None
 
-    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
+    from matbench_discovery.data import DataFiles  # type: ignore
 
     df = pd.read_csv(DataFiles.wbm_summary.path)
 
@@ -479,19 +479,19 @@ def compute(model, atoms):
 
 
 def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Any]:
-    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
+    from matbench_discovery.data import DataFiles  # type: ignore
 
     return _load_dataset_common(config, DataFiles.wbm_initial_atoms.path)
 
 
 def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Any]:
-    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
+    from matbench_discovery.data import DataFiles  # type: ignore
 
     return _load_dataset_common(config, DataFiles.wbm_relaxed_atoms.path)
 
 
 def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]:
-    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
+    from matbench_discovery.data import DataFiles  # type: ignore
 
     return _load_dataset_common(config, DataFiles.mp_trj_extxyz.path, read_index=":")
 
@@ -499,7 +499,7 @@ def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]:
 def calculate_metrics_energy(
     results: Dict[str, Any], config: Dict[str, Any]
 ) -> Dict[str, Any]:
-    from matbench_discovery.data import df_wbm  # type: ignore[import-untyped]
+    from matbench_discovery.data import df_wbm  # type: ignore
 
     if len(results) == 0:
         return {"error": "No results to evaluate"}
@@ -544,7 +544,7 @@ def calculate_metrics_forces(
     from zipfile import ZipFile
 
     from ase.io import read
-    from matbench_discovery.data import DataFiles  # type: ignore[import-untyped]
+    from matbench_discovery.data import DataFiles  # type: ignore
 
     metrics: Dict[str, List[float]] = {
         "energy_mae": [],

From 2f96ee4d1ab6ce3d1cee57dfc25d1dc5c98adee3 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 14:45:18 -0700
Subject: [PATCH 16/23] implement publish_benchmark_result helper

---
 garden_ai/backend_client.py                   | 10 +++
 garden_ai/benchmarks/__init__.py              | 90 ++++++++++++++++---
 .../benchmarks/matbench_discovery/tasks.py    | 47 +++++++---
 3 files changed, 122 insertions(+), 25 deletions(-)

diff --git a/garden_ai/backend_client.py b/garden_ai/backend_client.py
index 63d5e7a0..9574fa1d 100644
--- a/garden_ai/backend_client.py
+++ b/garden_ai/backend_client.py
@@ -6,6 +6,10 @@
 
 from garden_ai.constants import GardenConstants
 from garden_ai.gardens import Garden
+from garden_ai.schemas.benchmark import (
+    BenchmarkResultCreateRequest,
+    BenchmarkResultResponse,
+)
 from garden_ai.schemas.garden import GardenMetadata
 from garden_ai.schemas.hpc import HpcInvocationCreateRequest
 from garden_ai.schemas.modal import (
@@ -157,3 +161,9 @@ def search_gardens(self, payload: dict) -> dict:
     def create_hpc_invocation(self, payload: HpcInvocationCreateRequest) -> dict:
         response = self._post("/hpc/invocations", payload.model_dump(mode="json"))
         return response
+
+    def publish_benchmark_result(
+        self, payload: BenchmarkResultCreateRequest
+    ) -> BenchmarkResultResponse:
+        response = self._post("/benchmarks", payload.model_dump(mode="json"))
+        return BenchmarkResultResponse(**response)
diff --git a/garden_ai/benchmarks/__init__.py b/garden_ai/benchmarks/__init__.py
index 5d40ae5a..5dc7dd7a 100644
--- a/garden_ai/benchmarks/__init__.py
+++ b/garden_ai/benchmarks/__init__.py
@@ -7,6 +7,11 @@
     - MatbenchDiscovery: Materials discovery benchmark suite
 """
 
+from typing import Any, Dict, Optional
+
+from garden_ai.client import GardenClient
+from garden_ai.schemas.benchmark import BenchmarkResultCreateRequest
+
 from .matbench_discovery.enums import DatasetSize, MatbenchTask
 from .matbench_discovery.tasks import MatbenchDiscovery
 
@@ -14,22 +19,85 @@
     "MatbenchDiscovery",
     "MatbenchTask",
     "DatasetSize",
+    "publish_benchmark_result",
 ]
 
 
-def publish_benchmark_result(benchmark, model, results):
-    """Publish benchmark results to Garden AI backend.
+def publish_benchmark_result(
+    result: Dict[str, Any],
+    benchmark_name: Optional[str] = None,
+    task_name: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Publish benchmark results to the Garden AI backend.
 
-    This is a placeholder for future functionality to store benchmark
-    results alongside published models.
+    This function takes the output from a benchmark task (e.g., MatbenchDiscovery.IS2RE.remote())
+    and publishes it to the Garden backend for tracking and leaderboard purposes.
 
     Args:
-        benchmark: Benchmark adapter instance
-        model: Model that was benchmarked
-        results: Dictionary of benchmark metrics
+        result: The output dictionary from a benchmark task. Should contain:
+            - 'metrics': Dictionary of benchmark metrics (F1, DAF, MAE, etc.)
+            - 'run_metadata': Optional run metadata (hardware, timing, cost)
+            - '_benchmark_info': Auto-injected benchmark/task info (if from wrapped method)
+        benchmark_name: Override for benchmark name (defaults to auto-detected from result)
+        task_name: Override for task name (defaults to auto-detected from result)
+
+    Returns:
+        Dictionary containing the response from the backend, including the result ID.
+
+    Raises:
+        ValueError: If benchmark_name or task_name cannot be determined.
+        requests.HTTPError: If the backend request fails.
+
+    Example:
+        ```python
+        from garden_ai.benchmarks import MatbenchDiscovery, publish_benchmark_result
+
+        # Run a benchmark
+        output = MatbenchDiscovery.IS2RE.remote(
+            endpoint="your-endpoint-id",
+            model_factory=create_model,
+            model_packages="mace-torch",
+        )
+
+        # Publish the results
+        response = publish_benchmark_result(output)
+        print(f"Published with ID: {response['id']}")
+        ```
     """
-    # TODO: Implement when backend API is ready
-    raise NotImplementedError(
-        "Publishing benchmark results is not yet implemented. "
-        "For now, save results locally or to your own storage."
+    # Extract benchmark info from result or use provided overrides
+    benchmark_info = result.get("_benchmark_info", {})
+
+    final_benchmark_name = benchmark_name or benchmark_info.get("benchmark_name")
+    final_task_name = task_name or benchmark_info.get("task_name")
+
+    if not final_benchmark_name:
+        raise ValueError(
+            "benchmark_name is required. Either pass it explicitly or use a result "
+            "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())."
+        )
+
+    if not final_task_name:
+        raise ValueError(
+            "task_name is required. Either pass it explicitly or use a result "
+            "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())."
+        )
+
+    # Extract metrics and run_metadata
+    metrics = result.get("metrics", {})
+    run_metadata = result.get("run_metadata")
+
+    if not metrics:
+        raise ValueError("Result must contain 'metrics' dictionary.")
+
+    # Create the request payload
+    payload = BenchmarkResultCreateRequest(
+        benchmark_name=final_benchmark_name,
+        benchmark_task_name=final_task_name,
+        metrics=metrics,
+        run_metadata=run_metadata,
     )
+
+    # Get authenticated client and publish
+    client = GardenClient()
+    response = client.backend_client.publish_benchmark_result(payload)
+    return response.model_dump()
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index 45ec9701..28f09fbd 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -899,9 +899,17 @@ def run_benchmark_hog(
 class BenchmarkMethod:
     """Wrapper around groundhog Method that handles source extraction for remote execution."""
 
-    def __init__(self, hog_method):
-        """Initialize wrapper with the underlying groundhog Method."""
+    BENCHMARK_NAME = "matbench_discovery"
+
+    def __init__(self, hog_method, task_name: str):
+        """Initialize wrapper with the underlying groundhog Method.
+
+        Args:
+            hog_method: The underlying groundhog method to wrap.
+            task_name: Name of the benchmark task (e.g., 'IS2RE', 'S2EFS').
+        """
         self._hog_method = hog_method
+        self._task_name = task_name
 
     def _extract_sources(self, kwargs):
         """Extract source code from model_factory and meta_metrics for remote execution."""
@@ -1018,17 +1026,28 @@ def _print_checkpoint_info(self, kwargs, is_remote: bool):
             print(f'   checkpoint_path="{identifier}"')
         print("=" * 80)
 
+    def _add_benchmark_info(self, result: Dict[str, Any]) -> Dict[str, Any]:
+        """Add benchmark metadata to the result for publishing."""
+        if isinstance(result, dict):
+            result["_benchmark_info"] = {
+                "benchmark_name": self.BENCHMARK_NAME,
+                "task_name": self._task_name,
+            }
+        return result
+
     def remote(self, *args, **kwargs):
         """Execute remotely with automatic source extraction."""
         kwargs = self._extract_sources(kwargs)
         self._print_checkpoint_info(kwargs, is_remote=True)
-        return self._hog_method.remote(*args, **kwargs)
+        result = self._hog_method.remote(*args, **kwargs)
+        return self._add_benchmark_info(result)
 
     def local(self, *args, **kwargs):
         """Execute locally with automatic source extraction."""
         kwargs = self._extract_sources(kwargs)
         self._print_checkpoint_info(kwargs, is_remote=False)
-        return self._hog_method.local(*args, **kwargs)
+        result = self._hog_method.local(*args, **kwargs)
+        return self._add_benchmark_info(result)
 
     def submit(self, *args, **kwargs):
         """Submit for async execution with automatic source extraction."""
@@ -1349,15 +1368,15 @@ def create_mace_model(device):
     _run_task = _MatbenchDiscoveryBase._run_task
 
     # Main benchmark tasks - wrapped for automatic model_factory source extraction
-    IS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.IS2RE)
-    RS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RS2RE)
-    S2EFS = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFS)
+    IS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.IS2RE, "IS2RE")
+    RS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RS2RE, "RS2RE")
+    S2EFS = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFS, "S2EFS")
 
     # Aliases
-    S2EF = BenchmarkMethod(_MatbenchDiscoveryBase.S2EF)
-    S2EFSM = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFSM)
-    IS2E = BenchmarkMethod(_MatbenchDiscoveryBase.IS2E)
-    S2E = BenchmarkMethod(_MatbenchDiscoveryBase.S2E)
-    S2RE = BenchmarkMethod(_MatbenchDiscoveryBase.S2RE)
-    RP2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RP2RE)
-    IP2E = BenchmarkMethod(_MatbenchDiscoveryBase.IP2E)
+    S2EF = BenchmarkMethod(_MatbenchDiscoveryBase.S2EF, "S2EF")
+    S2EFSM = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFSM, "S2EFSM")
+    IS2E = BenchmarkMethod(_MatbenchDiscoveryBase.IS2E, "IS2E")
+    S2E = BenchmarkMethod(_MatbenchDiscoveryBase.S2E, "S2E")
+    S2RE = BenchmarkMethod(_MatbenchDiscoveryBase.S2RE, "S2RE")
+    RP2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RP2RE, "RP2RE")
+    IP2E = BenchmarkMethod(_MatbenchDiscoveryBase.IP2E, "IP2E")

From 723bb580778e410aaf73857ffd4100536afa6406 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 14:48:00 -0700
Subject: [PATCH 17/23] cleanup comments

---
 garden_ai/benchmarks/matbench_discovery/enums.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/garden_ai/benchmarks/matbench_discovery/enums.py b/garden_ai/benchmarks/matbench_discovery/enums.py
index 5c34cb6b..fed3b514 100644
--- a/garden_ai/benchmarks/matbench_discovery/enums.py
+++ b/garden_ai/benchmarks/matbench_discovery/enums.py
@@ -4,13 +4,7 @@
 
 
 class MatbenchTask(Enum):
-    """Available Matbench Discovery benchmark tasks.
-
-    Currently only IS2RE is implemented for the MVP.
-    Future tasks could include:
-    - RS2RE: Relaxed Structure to Relaxed Energy
-    - S2EFS: Structure to Energy, Forces, and Stress
-    """
+    """Available Matbench Discovery benchmark tasks."""
 
     IS2RE = "IS2RE"  # Initial Structure to Relaxed Energy
     RS2RE = "RS2RE"  # Relaxed Structure to Relaxed Energy

From 83baabac33a88e065d03b293ac89cd15e22a1b86 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 14:55:06 -0700
Subject: [PATCH 18/23] add py.typed to appease mypy as per PEP 561

---
 garden_ai/py.typed | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 garden_ai/py.typed

diff --git a/garden_ai/py.typed b/garden_ai/py.typed
new file mode 100644
index 00000000..e69de29b

From ac03466841165f5fbdf23c352fcae6111ee6b369 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 10 Dec 2025 14:59:17 -0700
Subject: [PATCH 19/23] add missing schema file :facepalm

---
 garden_ai/schemas/benchmark.py | 36 ++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 garden_ai/schemas/benchmark.py

diff --git a/garden_ai/schemas/benchmark.py b/garden_ai/schemas/benchmark.py
new file mode 100644
index 00000000..f607c8a8
--- /dev/null
+++ b/garden_ai/schemas/benchmark.py
@@ -0,0 +1,36 @@
+"""Benchmark-related schemas for API requests/responses."""
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+
+class BenchmarkResultCreateRequest(BaseModel):
+    """Request schema for publishing benchmark results to the backend."""
+
+    benchmark_name: str = Field(
+        ...,
+        description="Name of the benchmark suite (e.g., 'matbench_discovery')",
+    )
+    benchmark_task_name: str = Field(
+        ...,
+        description="Name of the specific task within the benchmark (e.g., 'IS2RE', 'S2EFS')",
+    )
+    metrics: Dict[str, Any] = Field(
+        ...,
+        description="Dictionary of benchmark metrics (F1, DAF, MAE, etc.)",
+    )
+    run_metadata: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Optional run metadata (hardware info, timing, cost estimates)",
+    )
+
+
+class BenchmarkResultResponse(BaseModel):
+    """Response schema from the benchmark result creation endpoint."""
+
+    id: str = Field(..., description="Unique identifier for the benchmark result")
+    benchmark_name: str
+    benchmark_task_name: str
+    metrics: Dict[str, Any]
+    run_metadata: Optional[Dict[str, Any]] = None

From 964c4f14124622614472013693190fe089dbd537 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Thu, 11 Dec 2025 10:21:24 -0700
Subject: [PATCH 20/23] fix checkpoint resume bug, clean up examples

---
 .../examples/local_execution.py               | 33 ++++------
 .../examples/matbench_equiformerv2.py         | 43 ++++--------
 .../examples/matbench_mace_multi_gpu.py       | 55 ++++++----------
 .../examples/matbench_mattersim.py            | 42 ++++--------
 .../examples/matbench_sevennet.py             | 42 ++++--------
 .../benchmarks/matbench_discovery/tasks.py    | 65 +++++++++++++++----
 6 files changed, 121 insertions(+), 159 deletions(-)

diff --git a/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py b/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py
index 6414f5cf..1f482904 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py
@@ -10,22 +10,17 @@ def create_mattersim_model(device):
     return MatterSimCalculator(device=device)
 
 
-def main():
-    print("Running MatterSim benchmark locally...")
-
-    # Run IS2RE task locally
-    # Note: Requires a GPU or MPS if using MatterSim, or CPU if specified/supported
-    output = MatbenchDiscovery.IS2RE.local(
-        model_factory=create_mattersim_model,
-        model_packages="mattersim",
-        num_structures="random_100",
-    )
-
-    if "error" in output.get("metrics", {}):
-        print(f"Error: {output['metrics']['error']}")
-    else:
-        print("Benchmark Results:", output.get("metrics"))
-
-
-if __name__ == "__main__":
-    main()
+print("Running MatterSim benchmark locally...")
+
+# Run IS2RE task locally
+# Note: Requires a GPU or MPS if using MatterSim, or CPU if specified/supported
+output = MatbenchDiscovery.IS2RE.local(
+    model_factory=create_mattersim_model,
+    model_packages="mattersim",
+    num_structures="random_100",
+)
+
+if "error" in output.get("metrics", {}):
+    print(f"Error: {output['metrics']['error']}")
+else:
+    print("Benchmark Results:", output.get("metrics"))
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
index eb4d64ea..b32cfed0 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py
@@ -3,9 +3,6 @@
 
 from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
-# Globus Compute endpoint
-ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
-
 
 def create_equiformerv2_model(device):
     from fairchem.core.calculate.ase_calculator import Calculator  # type: ignore
@@ -16,32 +13,16 @@ def create_equiformerv2_model(device):
     )
 
 
-def main():
-    print(f"Running EquiformerV2 benchmark on endpoint {ENDPOINT_ID}...")
-
-    # Run S2EFS task (structure to energy/forces/stress)
-    output = MatbenchDiscovery.S2EFS.remote(
-        endpoint=ENDPOINT_ID,
-        user_endpoint_config={
-            "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n",
-            "walltime": 7200,
-            "qos": "gpu",
-            "partition": "gpu-debug",
-            "account": "cis250461-gpu",
-            "cores_per_node": 16,
-            "mem_per_node": 32,
-            "requirements": "",
-        },
-        model_factory=create_equiformerv2_model,
-        model_packages="fairchem-core",
-        num_structures="random_100",
-    )
-
-    if "error" in output.get("metrics", {}):
-        print(f"Error: {output['metrics']['error']}")
-    else:
-        print("Benchmark Results:", output.get("metrics"))
-
+# Run S2EFS task (structure to energy/forces/stress)
+output = MatbenchDiscovery.S2EFS.remote(
+    endpoint="anvil",
+    account="your-account-here",
+    model_factory=create_equiformerv2_model,
+    model_packages="fairchem-core",
+    num_structures="random_10k",
+)
 
-if __name__ == "__main__":
-    main()
+if "error" in output.get("metrics", {}):
+    print(f"Error: {output['metrics']['error']}")
+else:
+    print("Benchmark Results:", output.get("metrics"))
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
index e0fb0003..90e7134b 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 """Matbench Discovery Benchmark - MACE Multi-GPU Example"""
 
-from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
+from rich import print
 
-ANVIL = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
 
 def create_mace_model(device):
@@ -12,35 +12,22 @@ def create_mace_model(device):
     return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64")
 
 
-def main():
-    print(f"Running MACE benchmark on endpoint {ANVIL}...")
-
-    results = MatbenchDiscovery.IS2RE.remote(
-        endpoint=ANVIL,
-        user_endpoint_config={
-            "scheduler_options": "#SBATCH --gpus-per-node=4\n",
-            "walltime": "05:00:00",
-            "qos": "gpu",
-            "partition": "gpu",
-            "account": "cis250461-gpu",
-            "cores_per_node": 16,
-            "requirements": "",  # 'requirements' is required for Anvil endpoint
-        },
-        model_factory=create_mace_model,
-        model_packages=[
-            "mace-torch",
-            "cuequivariance",
-            "cuequivariance-torch",
-            "cuequivariance-ops-torch-cu12",
-        ],
-        checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json",
-    )
-
-    if "error" in results.get("metrics", {}):
-        print(f"Error: {results['metrics']['error']}")
-    else:
-        print("Benchmark Results:", results.get("metrics"))
-
-
-if __name__ == "__main__":
-    main()
+print("Running MACE benchmark on endpoint anvil...")
+
+results = MatbenchDiscovery.IS2RE.remote(
+    endpoint="anvil",
+    account="cis250461-gpu",
+    model_factory=create_mace_model,
+    model_packages=[
+        "mace-torch",
+        "cuequivariance",
+        "cuequivariance-torch",
+        "cuequivariance-ops-torch-cu12",
+    ],
+    num_structures="random_100",
+)
+
+if "error" in results.get("metrics", {}):
+    print(f"Error: {results['metrics']['error']}")
+else:
+    print("Benchmark Results:", results)
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
index 8a7636ba..f9a5c4c8 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py
@@ -3,9 +3,6 @@
 
 from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
-# Globus Compute endpoint
-ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
-
 
 def create_mattersim_model(device):
     from mattersim.forcefield import MatterSimCalculator
@@ -13,32 +10,15 @@ def create_mattersim_model(device):
     return MatterSimCalculator(device=device)
 
 
-def main():
-    print(f"Running MatterSim benchmark on endpoint {ENDPOINT_ID}...")
-
-    # Run IS2RE task
-    output = MatbenchDiscovery.IS2RE.remote(
-        endpoint=ENDPOINT_ID,
-        user_endpoint_config={
-            "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n",
-            "walltime": 7200,
-            "qos": "gpu",
-            "partition": "gpu-debug",
-            "account": "cis250461-gpu",
-            "cores_per_node": 16,
-            "mem_per_node": 32,
-            "requirements": "",
-        },
-        model_factory=create_mattersim_model,
-        model_packages="mattersim",
-        num_structures="random_100",
-    )
-
-    if "error" in output.get("metrics", {}):
-        print(f"Error: {output['metrics']['error']}")
-    else:
-        print("Benchmark Results:", output.get("metrics"))
-
+output = MatbenchDiscovery.IS2RE.remote(
+    endpoint="anvil",
+    account="your-account-here",
+    model_factory=create_mattersim_model,
+    model_packages="mattersim",
+    num_structures="random_100",
+)
 
-if __name__ == "__main__":
-    main()
+if "error" in output.get("metrics", {}):
+    print(f"Error: {output['metrics']['error']}")
+else:
+    print("Benchmark Results:", output)
diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
index 411c64e1..da69d7ab 100644
--- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
+++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py
@@ -3,9 +3,6 @@
 
 from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery
 
-# Globus Compute endpoint (replace with your endpoint UUID)
-ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f"
-
 
 def create_sevennet_model(device):
     from sevenn.calculator import SevenNetCalculator
@@ -13,32 +10,15 @@ def create_sevennet_model(device):
     return SevenNetCalculator(model="7net-0", device=device)
 
 
-def main():
-    print(f"Running SevenNet benchmark on endpoint {ENDPOINT_ID}...")
-
-    # Run IS2RE task
-    output = MatbenchDiscovery.IS2RE.remote(
-        endpoint=ENDPOINT_ID,
-        user_endpoint_config={
-            "scheduler_options": "#SBATCH --gpus-per-node=2\n",
-            "walltime": 7200,
-            "qos": "gpu",
-            "partition": "gpu-debug",
-            "account": "cis250461-gpu",
-            "cores_per_node": 16,
-            "mem_per_node": 32,
-            "requirements": "",
-        },
-        model_factory=create_sevennet_model,
-        model_packages="sevenn",
-        num_structures="random_100",
-    )
-
-    if "error" in output.get("metrics", {}):
-        print(f"Error: {output['metrics']['error']}")
-    else:
-        print("Benchmark Results:", output.get("metrics"))
-
+output = MatbenchDiscovery.IS2RE.remote(
+    endpoint="anvil",
+    account="your-account-here",
+    model_factory=create_sevennet_model,
+    model_packages="sevenn",
+    num_structures="random_100",
+)
 
-if __name__ == "__main__":
-    main()
+if "error" in output.get("metrics", {}):
+    print(f"Error: {output['metrics']['error']}")
+else:
+    print("Benchmark Results:", output)
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index 28f09fbd..b2710467 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -9,6 +9,15 @@
 #     "torch",
 #     "matbench-discovery",
 # ]
+#
+# [tool.hog.anvil]
+# endpoint = "5aafb4c1-27b2-40d8-a038-a0277611868f"
+# account = "replace with your account"
+# qos = "gpu"
+# partition = "gpu"
+# cores_per_node = 16
+# scheduler_options = "#SBATCH --gpus-per-node=4\n"
+# requirements = ""
 # ///
 
 from __future__ import annotations
@@ -706,13 +715,29 @@ def run_benchmark_hog(
 
     checkpoint_path = config.get("checkpoint_path")
     results = {}
+    prior_elapsed = 0.0  # Cumulative time from previous sessions
 
     if checkpoint_path and os.path.exists(checkpoint_path):
         logger.info(f"Loading checkpoint from {checkpoint_path}")
         try:
             with open(checkpoint_path) as f:
-                results = json.load(f)
-            logger.info(f"Found {len(results)} processed items in checkpoint")
+                checkpoint_data = json.load(f)
+
+            # Handle new format with metadata vs old format (plain results dict)
+            if "_checkpoint_meta" in checkpoint_data:
+                results = checkpoint_data.get("results", {})
+                meta = checkpoint_data["_checkpoint_meta"]
+                prior_elapsed = meta.get("elapsed_seconds", 0.0)
+                logger.info(
+                    f"Found {len(results)} processed items in checkpoint "
+                    f"(prior elapsed: {prior_elapsed:.1f}s)"
+                )
+            else:
+                # Backward compatibility: old format is plain results dict
+                results = checkpoint_data
+                logger.info(
+                    f"Found {len(results)} processed items in checkpoint (legacy format)"
+                )
         except Exception as e:
             logger.warning(f"Failed to load checkpoint: {e}. Starting fresh.")
 
@@ -746,14 +771,15 @@ def run_benchmark_hog(
             traceback.print_exc()
             metrics = {"error": f"Metrics calculation failed: {e}"}
 
+        # Use cumulative values from checkpoint metadata
         assert calculate_run_metadata is not None, "meta_metrics not injected"
         run_metadata = calculate_run_metadata(
             hardware_info=hardware_info,
             model_info=model_info,
-            total_elapsed=0,
+            total_elapsed=prior_elapsed,
             num_workers=0,
             num_structures_total=len(all_items),
-            num_structures_processed=0,
+            num_structures_processed=len(results),
         )
         return {"metrics": metrics, "run_metadata": run_metadata}
 
@@ -850,9 +876,20 @@ def run_benchmark_hog(
             if checkpoint_path:
                 try:
                     tmp_path = checkpoint_path + ".tmp"
+                    # Calculate cumulative elapsed time for checkpoint
+                    current_elapsed = time.time() - start_time
+                    cumulative_elapsed = prior_elapsed + current_elapsed
+
+                    # Save checkpoint with metadata for resume
+                    checkpoint_data = {
+                        "results": convert_numpy_types(results),
+                        "_checkpoint_meta": {
+                            "elapsed_seconds": cumulative_elapsed,
+                            "structures_processed": len(results),
+                        },
+                    }
                     with open(tmp_path, "w") as f:
-                        clean_results = convert_numpy_types(results)
-                        json.dump(clean_results, f, indent=2)
+                        json.dump(checkpoint_data, f, indent=2)
                     os.replace(tmp_path, checkpoint_path)
                     logger.info(f"Checkpoint saved to {checkpoint_path}")
                 except Exception as e:
@@ -865,8 +902,12 @@ def run_benchmark_hog(
             elapsed = time.time() - chunk_start
             logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s")
 
-    total_elapsed = time.time() - start_time
-    logger.info(f"Benchmark complete in {total_elapsed:.1f}s.")
+    session_elapsed = time.time() - start_time
+    total_elapsed = prior_elapsed + session_elapsed
+    logger.info(
+        f"Session complete in {session_elapsed:.1f}s. "
+        f"Total elapsed: {total_elapsed:.1f}s."
+    )
 
     logger.info("Calculating metrics...")
     try:
@@ -879,7 +920,7 @@ def run_benchmark_hog(
         traceback.print_exc()
         metrics = {"error": f"Metrics calculation failed: {e}"}
 
-    # Calculate run metadata
+    # Calculate run metadata using cumulative values
     assert calculate_run_metadata is not None, "meta_metrics not injected"
     run_metadata = calculate_run_metadata(
         hardware_info=hardware_info,
@@ -887,7 +928,7 @@ def run_benchmark_hog(
         total_elapsed=total_elapsed,
         num_workers=num_workers,
         num_structures_total=len(all_items),
-        num_structures_processed=len(items_to_process),
+        num_structures_processed=len(results),
     )
     logger.info(f"Run metadata: {run_metadata}")
 
@@ -1310,9 +1351,7 @@ def S2EFSM(*args, **kwargs):
 
     @hog.method()
     def IS2E(*args, **kwargs):
-        # Same as IS2RE but static? No, IS2E is Initial Structure to Energy (Static).
-        # IS2RE is Relaxation.
-        # IS2E logic:
+        # IS2E is Initial Structure to Energy (Static).
         return _MatbenchDiscoveryBase._run_task(
             *args,
             **kwargs,

From cf7b6c59e28ad5ab6f9da4a522fef1fe2af4e9a3 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Fri, 12 Dec 2025 09:44:32 -0700
Subject: [PATCH 21/23] fix missing import

---
 garden_ai/benchmarks/matbench_discovery/tasks.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index b2710467..99cbb8f7 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -30,12 +30,13 @@
 import sys
 import time
 from enum import Enum
-from typing import Any, Callable, Dict, List, Optional, Sequence
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence
 
 import groundhog_hpc as hog
 import numpy as np
-import pandas as pd  # type: ignore
-from sklearn.metrics import r2_score  # type: ignore
+
+if TYPE_CHECKING:
+    import pandas as pd
 
 
 class DatasetSize(str, Enum):
@@ -148,6 +149,8 @@ def classify_stable(
     stability_threshold: float = 0.0,
     fillna: bool = True,
 ) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
+    import pandas as pd
+
     if len(each_true) != len(each_pred):
         raise ValueError(f"{len(each_true)=} != {len(each_pred)=}")
 
@@ -233,6 +236,8 @@ def stable_metrics(
     else:
         f1_score = 2 * (precision * recall) / (precision + recall)
 
+    from sklearn.metrics import r2_score  # type: ignore
+
     return dict(
         F1=f1_score,
         DAF=precision / prevalence if prevalence > 0 else float("nan"),
@@ -336,6 +341,7 @@ def get_material_ids_for_subset(
     if subset_type == "full":
         return None
 
+    import pandas as pd
     from matbench_discovery.data import DataFiles  # type: ignore
 
     df = pd.read_csv(DataFiles.wbm_summary.path)
@@ -610,6 +616,8 @@ def calculate_metrics_forces(
             except Exception:
                 pass
 
+    from sklearn.metrics import r2_score  # type: ignore
+
     result_metrics = {}
     if metrics["energy_mae"]:
         result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"]))

From 5342023881c82bf39f7547b40a65cc81e112f0c3 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Wed, 14 Jan 2026 13:40:50 -0700
Subject: [PATCH 22/23] fix a few bugs, update request schemas

---
 garden_ai/benchmarks/__init__.py              |  33 ++-
 .../benchmarks/matbench_discovery/tasks.py    | 238 +++++++++---------
 garden_ai/client.py                           |   5 +-
 garden_ai/schemas/benchmark.py                |   2 +-
 4 files changed, 145 insertions(+), 133 deletions(-)

diff --git a/garden_ai/benchmarks/__init__.py b/garden_ai/benchmarks/__init__.py
index 5dc7dd7a..c3281dcd 100644
--- a/garden_ai/benchmarks/__init__.py
+++ b/garden_ai/benchmarks/__init__.py
@@ -25,6 +25,8 @@
 
 def publish_benchmark_result(
     result: Dict[str, Any],
+    model_name: str,
+    garden_doi: Optional[str] = None,
     benchmark_name: Optional[str] = None,
     task_name: Optional[str] = None,
 ) -> Dict[str, Any]:
@@ -38,6 +40,9 @@ def publish_benchmark_result(
             - 'metrics': Dictionary of benchmark metrics (F1, DAF, MAE, etc.)
             - 'run_metadata': Optional run metadata (hardware, timing, cost)
             - '_benchmark_info': Auto-injected benchmark/task info (if from wrapped method)
+        model_name: The specific name/variant of the model (e.g., "mace-mp-0-medium", "chgnet-v0.3.0").
+            This is required to identify the model on the leaderboard.
+        garden_doi: Optional DOI for the Garden publication associated with this benchmark result.
         benchmark_name: Override for benchmark name (defaults to auto-detected from result)
         task_name: Override for task name (defaults to auto-detected from result)
 
@@ -53,14 +58,10 @@ def publish_benchmark_result(
         from garden_ai.benchmarks import MatbenchDiscovery, publish_benchmark_result
 
         # Run a benchmark
-        output = MatbenchDiscovery.IS2RE.remote(
-            endpoint="your-endpoint-id",
-            model_factory=create_model,
-            model_packages="mace-torch",
-        )
+        output = MatbenchDiscovery.IS2RE.remote(...)
 
         # Publish the results
-        response = publish_benchmark_result(output)
+        response = publish_benchmark_result(output, model_name="mace-medium", garden_doi="10.26311/example.doi")
         print(f"Published with ID: {response['id']}")
         ```
     """
@@ -82,19 +83,25 @@ def publish_benchmark_result(
             "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())."
         )
 
-    # Extract metrics and run_metadata
-    metrics = result.get("metrics", {})
-    run_metadata = result.get("run_metadata")
+    # Inject model name into run_metadata
+    if "run_metadata" not in result:
+        result["run_metadata"] = {}
+    if "model" not in result["run_metadata"]:
+        result["run_metadata"]["model"] = {}
+
+    result["run_metadata"]["model"]["variant"] = model_name
 
-    if not metrics:
-        raise ValueError("Result must contain 'metrics' dictionary.")
+    # Inject garden_doi if provided
+    if garden_doi:
+        result["run_metadata"]["garden_doi"] = garden_doi
 
     # Create the request payload
+    # Note: We pass the modified result (containing metrics and metadata) as 'metrics'
+    # This assumes the backend handles the unified blob or we rely on the schema field description.
     payload = BenchmarkResultCreateRequest(
         benchmark_name=final_benchmark_name,
         benchmark_task_name=final_task_name,
-        metrics=metrics,
-        run_metadata=run_metadata,
+        metrics=result,
     )
 
     # Get authenticated client and publish
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index 99cbb8f7..d2d0ee7a 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -1,5 +1,5 @@
 # /// script
-# requires-python = ">=3.10"
+# requires-python = "==3.12"
 # dependencies = [
 #     "groundhog-hpc",
 #     "ase",
@@ -8,16 +8,22 @@
 #     "scikit-learn",
 #     "torch",
 #     "matbench-discovery",
+#     "bibtexparser<1.4.3",
 # ]
 #
 # [tool.hog.anvil]
 # endpoint = "5aafb4c1-27b2-40d8-a038-a0277611868f"
-# account = "replace with your account"
 # qos = "gpu"
 # partition = "gpu"
 # cores_per_node = 16
+# mem_per_mode = 32
 # scheduler_options = "#SBATCH --gpus-per-node=4\n"
 # requirements = ""
+#
+# [tool.hog.sophia]
+# endpoint = "8d07224c-ceaa-4b7f-946d-fae3f7423d5b"
+# account = "Garden-Ai"
+# queue = "by-gpu"
 # ///
 
 from __future__ import annotations
@@ -27,6 +33,7 @@
 import logging
 import multiprocessing
 import os
+import random
 import sys
 import time
 from enum import Enum
@@ -140,8 +147,20 @@ def _get_meta_metrics_source() -> str:
 _MODEL_CACHE = None
 
 
-# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/tree/main/matbench_discovery/metrics
-# Since they aren't setup to be easily imported, we just copy them here
+# Helper functions from matbench-discovery/metrics/geo_opt.py and phonons.py
+
+
+def calc_rmsd(
+    coords_true: np.ndarray,
+    coords_pred: np.ndarray,
+) -> float:
+    """Calculate the Root Mean Square Deviation (RMSD) between two sets of coordinates.
+    Assumes atoms are in the same order.
+    """
+    return np.sqrt(((coords_true - coords_pred) ** 2).mean())
+
+
+# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/blob/main/matbench_discovery/metrics/discovery.py
 def classify_stable(
     each_true: Sequence[float] | pd.Series | np.ndarray,
     each_pred: Sequence[float] | pd.Series | np.ndarray,
@@ -238,6 +257,7 @@ def stable_metrics(
 
     from sklearn.metrics import r2_score  # type: ignore
 
+    # Return the standard discovery metrics
     return dict(
         F1=f1_score,
         DAF=precision / prevalence if prevalence > 0 else float("nan"),
@@ -252,13 +272,13 @@ def stable_metrics(
         FPR=FPR,
         TNR=TNR,
         FNR=FNR,
-        TP=n_true_pos,
-        FP=n_false_pos,
-        TN=n_true_neg,
-        FN=n_false_neg,
         MAE=np.abs(each_true - each_pred).mean(),
         RMSE=((each_true - each_pred) ** 2).mean() ** 0.5,
-        R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"),
+        **{
+            "R^2": r2_score(each_true, each_pred)
+            if len(each_true) > 1
+            else float("nan")
+        },
     )
 
 
@@ -514,12 +534,46 @@ def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]:
 def calculate_metrics_energy(
     results: Dict[str, Any], config: Dict[str, Any]
 ) -> Dict[str, Any]:
-    from matbench_discovery.data import df_wbm  # type: ignore
+    from io import TextIOWrapper
+    from zipfile import ZipFile
+
+    from ase.io import read
+    from matbench_discovery.data import DataFiles, df_wbm  # type: ignore
 
     if len(results) == 0:
         return {"error": "No results to evaluate"}
 
     model_energies = {}
+    rmsd_list = []
+
+    # Calculate RMSD if positions are returned (e.g. for IS2RE)
+    try:
+        # Check if any result has positions
+        first_res = next(iter(results.values()))
+        if isinstance(first_res, dict) and "positions" in first_res:
+            with ZipFile(DataFiles.wbm_relaxed_atoms.path, "r") as zf:
+                for sid, res in results.items():
+                    if isinstance(res, dict) and "positions" in res:
+                        try:
+                            # Load GT structure
+                            # sid is the filename in the zip (e.g. "material_id.extxyz")
+                            with zf.open(sid) as f:
+                                text_stream = TextIOWrapper(f, encoding="utf-8")
+                                # Read first frame (should be only one for relaxed)
+                                gt_atoms = read(text_stream, format="extxyz")
+
+                                pred_pos = np.array(res["positions"])
+                                gt_pos = gt_atoms.get_positions()  # type: ignore
+
+                                if pred_pos.shape == gt_pos.shape:
+                                    # Use helper function
+                                    rmsd = calc_rmsd(gt_pos, pred_pos)
+                                    rmsd_list.append(rmsd)
+                        except Exception:
+                            pass
+    except Exception as e:
+        print(f"Warning: RMSD calculation failed: {e}")
+
     for sid, res in results.items():
         if isinstance(res, dict) and res.get("energy") is not None:
             mat_id = sid.replace(".extxyz", "")
@@ -549,6 +603,10 @@ def calculate_metrics_energy(
 
     metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence)
     metrics["num_evaluated"] = len(common_ids)
+
+    # Inject RMSD
+    metrics["RMSD"] = float(np.mean(rmsd_list)) if rmsd_list else float("nan")
+
     return metrics
 
 
@@ -561,20 +619,9 @@ def calculate_metrics_forces(
     from ase.io import read
     from matbench_discovery.data import DataFiles  # type: ignore
 
-    metrics: Dict[str, List[float]] = {
-        "energy_mae": [],
-        "energy_rmse": [],
-        "force_mae": [],
-        "force_rmse": [],
-        "stress_mae": [],
-        "stress_rmse": [],
-    }
+    # We will use the standard stable_metrics for energy predictions in the trajectory
     all_e_pred: List[float] = []
     all_e_true: List[float] = []
-    all_f_pred: List[float] = []
-    all_f_true: List[float] = []
-    all_s_pred: List[float] = []
-    all_s_true: List[float] = []
 
     zip_path = DataFiles.mp_trj_extxyz.path
 
@@ -583,71 +630,32 @@ def calculate_metrics_forces(
             if "error" in res:
                 continue
             try:
-                with zf.open(sid) as f:
-                    text_stream = TextIOWrapper(f, encoding="utf-8")
-                    atoms_list = read(text_stream, format="extxyz", index=":")
-                    gt_atoms = atoms_list[-1]
-
-                    e_pred = res["energy"]
-                    e_true = gt_atoms.get_potential_energy()  # type: ignore[union-attr]
-                    n_atoms = len(gt_atoms)  # type: ignore[arg-type]
-                    energy_error = abs(e_pred - e_true) / n_atoms
-                    metrics["energy_mae"].append(energy_error)
-                    metrics["energy_rmse"].append(energy_error**2)
-                    all_e_pred.append(e_pred / n_atoms)
-                    all_e_true.append(e_true / n_atoms)
-
-                    f_pred = np.array(res["forces"])
-                    f_true = gt_atoms.get_forces()  # type: ignore[union-attr]
-                    force_error = np.abs(f_pred - f_true)
-                    metrics["force_mae"].append(force_error.mean())
-                    metrics["force_rmse"].append((force_error**2).mean())
-                    all_f_pred.extend(f_pred.flatten())
-                    all_f_true.extend(f_true.flatten())
-
-                    s_pred = np.array(res["stress"])
-                    s_true = gt_atoms.get_stress()  # type: ignore[union-attr]
-                    stress_error = np.abs(s_pred - s_true)
-                    metrics["stress_mae"].append(stress_error.mean())
-                    metrics["stress_rmse"].append((stress_error**2).mean())
-                    all_s_pred.extend(s_pred.flatten())
-                    all_s_true.extend(s_true.flatten())
-
+                if isinstance(res, dict) and "energy" in res:
+                    with zf.open(sid) as f:
+                        text_stream = TextIOWrapper(f, encoding="utf-8")
+                        atoms_list = read(text_stream, format="extxyz", index=":")
+                        gt_atoms = atoms_list[-1]  # type: ignore
+
+                        e_pred = res["energy"]
+                        e_true = gt_atoms.get_potential_energy()  # type: ignore
+                        n_atoms = len(gt_atoms)  # type: ignore
+
+                        # Normalize per atom
+                        all_e_pred.append(e_pred / n_atoms)
+                        all_e_true.append(e_true / n_atoms)
             except Exception:
                 pass
 
-    from sklearn.metrics import r2_score  # type: ignore
-
-    result_metrics = {}
-    if metrics["energy_mae"]:
-        result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"]))
-        result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"])))
-        result_metrics["energy_r2"] = (
-            float(r2_score(all_e_true, all_e_pred))
-            if len(all_e_true) > 1
-            else float("nan")
-        )
-
-    if metrics["force_mae"]:
-        result_metrics["force_mae"] = float(np.mean(metrics["force_mae"]))
-        result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"])))
-        result_metrics["force_r2"] = (
-            float(r2_score(all_f_true, all_f_pred))
-            if len(all_f_true) > 1
-            else float("nan")
-        )
+    if not all_e_true:
+        return {"error": "No valid energy comparisons found"}
 
-    if metrics["stress_mae"]:
-        result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"]))
-        result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"])))
-        result_metrics["stress_r2"] = (
-            float(r2_score(all_s_true, all_s_pred))
-            if len(all_s_true) > 1
-            else float("nan")
-        )
+    each_true = np.array(all_e_true)
+    each_pred = np.array(all_e_pred)
 
-    result_metrics["num_evaluated"] = len(metrics["energy_mae"])
-    return result_metrics
+    # Calculate standard discovery metrics on energies
+    metrics = stable_metrics(each_true, each_pred)
+    metrics["num_evaluated"] = len(all_e_true)
+    return metrics
 
 
 def run_benchmark_hog(
@@ -763,6 +771,37 @@ def run_benchmark_hog(
         (item_id, item) for item_id, item in all_items if str(item_id) not in results
     ]
 
+    try:
+        import torch
+
+        if torch.cuda.is_available():
+            num_gpus = torch.cuda.device_count()
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            num_gpus = 1
+        else:
+            num_gpus = 0
+    except ImportError:
+        num_gpus = 0
+
+    use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
+    # Use sched_getaffinity to get cores available to this job, not total cores on node
+    try:
+        total_cores = len(os.sched_getaffinity(0))  # type: ignore[attr-defined]
+    except AttributeError:
+        # Fallback for systems without sched_getaffinity (e.g., macOS)
+        total_cores = os.cpu_count() or 1
+    num_workers = num_gpus if use_multi_gpu else 1
+    available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores
+    threads_per_worker = max(1, available_cores // num_workers)
+
+    # MPS (Apple Silicon) performance degrades with high thread counts due to contention
+    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        threads_per_worker = 1
+
+    logger.info(
+        f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)"
+    )
+
     if not items_to_process:
         logger.info(
             "All items already processed! Calculating metrics from checkpoint..."
@@ -785,7 +824,7 @@ def run_benchmark_hog(
             hardware_info=hardware_info,
             model_info=model_info,
             total_elapsed=prior_elapsed,
-            num_workers=0,
+            num_workers=num_workers,
             num_structures_total=len(all_items),
             num_structures_processed=len(results),
         )
@@ -793,42 +832,9 @@ def run_benchmark_hog(
 
     logger.info(f"Processing {len(items_to_process)} remaining items")
 
-    import random
-
     random.seed(42)
     random.shuffle(items_to_process)
 
-    try:
-        import torch
-
-        if torch.cuda.is_available():
-            num_gpus = torch.cuda.device_count()
-        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            num_gpus = 1
-        else:
-            num_gpus = 0
-    except ImportError:
-        num_gpus = 0
-
-    use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1
-    # Use sched_getaffinity to get cores available to this job, not total cores on node
-    try:
-        total_cores = len(os.sched_getaffinity(0))  # type: ignore[attr-defined]
-    except AttributeError:
-        # Fallback for systems without sched_getaffinity (e.g., macOS)
-        total_cores = os.cpu_count() or 1
-    num_workers = num_gpus if use_multi_gpu else 1
-    available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores
-    threads_per_worker = max(1, available_cores // num_workers)
-
-    # MPS (Apple Silicon) performance degrades with high thread counts due to contention
-    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-        threads_per_worker = 1
-
-    logger.info(
-        f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)"
-    )
-
     start_time = time.time()
     chunk_size = 1000 * num_workers
     chunks = [
diff --git a/garden_ai/client.py b/garden_ai/client.py
index 87b8e0e4..2b3ee425 100644
--- a/garden_ai/client.py
+++ b/garden_ai/client.py
@@ -23,9 +23,9 @@
 from globus_sdk.authorizers import GlobusAuthorizer
 from globus_sdk.scopes import ScopeBuilder
 from globus_sdk.tokenstorage import SimpleJSONFileAdapter
-from modal.cli._traceback import setup_rich_traceback
 from rich import print
 from rich.prompt import Prompt
+from rich.traceback import install
 
 from garden_ai.backend_client import BackendClient
 from garden_ai.constants import GardenConstants
@@ -34,8 +34,7 @@
 from garden_ai.hpc.gardens.mlip_garden import MLIPGarden
 
 logger = logging.getLogger()
-# modal helper replacement for rich.traceback.install
-setup_rich_traceback()
+install()
 
 
 class AuthException(Exception):
diff --git a/garden_ai/schemas/benchmark.py b/garden_ai/schemas/benchmark.py
index f607c8a8..283f7fd3 100644
--- a/garden_ai/schemas/benchmark.py
+++ b/garden_ai/schemas/benchmark.py
@@ -29,7 +29,7 @@ class BenchmarkResultCreateRequest(BaseModel):
 class BenchmarkResultResponse(BaseModel):
     """Response schema from the benchmark result creation endpoint."""
 
-    id: str = Field(..., description="Unique identifier for the benchmark result")
+    id: int = Field(..., description="Unique identifier for the benchmark result")
     benchmark_name: str
     benchmark_task_name: str
     metrics: Dict[str, Any]

From d9ee1ffdcf1a2491b6ed2e2082d020dda4cca385 Mon Sep 17 00:00:00 2001
From: hholb <hholbrook@uchicago.edu>
Date: Thu, 22 Jan 2026 12:49:32 -0700
Subject: [PATCH 23/23] fix gpu assignements

---
 .../benchmarks/matbench_discovery/tasks.py    | 32 ++++++++++++++++---
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py
index d2d0ee7a..cdf44cca 100644
--- a/garden_ai/benchmarks/matbench_discovery/tasks.py
+++ b/garden_ai/benchmarks/matbench_discovery/tasks.py
@@ -296,14 +296,18 @@ def _process_batch_common(
     import re
     import time
 
+    gpu_id = model_config.get("gpu_id")
+    if gpu_id is not None:
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+        device = "cuda"
+    else:
+        device = setup_device(gpu_id)
+
     import torch
 
     os.environ["OMP_NUM_THREADS"] = str(num_threads)
     torch.set_num_threads(num_threads)
 
-    gpu_id = model_config.get("gpu_id")
-    device = setup_device(gpu_id)
-
     worker_logger = logging.getLogger(f"worker_{batch_id}")
     worker_logger.info(
         f"Started {task_name} on {device} with {len(structures)} structures. Threads: {num_threads}"
@@ -590,10 +594,28 @@ def calculate_metrics_energy(
 
     df_subset = df_wbm_indexed.loc[common_ids]
     y_pred = np.array([model_energies[mid] for mid in common_ids])
-    y_true = df_subset["uncorrected_energy"].values
     n_atoms = df_subset["n_sites"].values
 
-    e_form_error = (y_pred - y_true) / n_atoms
+    # CRITICAL FIX: Compute formation energy error, not total energy error
+    # Formation energy is defined as: E_formation = E_total - Σ(n_i × E_ref_i)
+    # where E_ref_i are elemental reference energies in their standard states
+
+    # Get ground truth formation energy per atom (uncorrected, matches model prediction level)
+    y_true_form = df_subset["e_form_per_atom_uncorrected"].values  # eV/atom
+
+    # Compute reference energy per atom from known DFT data
+    # E_ref_per_atom = E_total_per_atom - E_form_per_atom
+    y_true_total = df_subset["uncorrected_energy"].values
+    ref_energy_per_atom = (y_true_total / n_atoms) - y_true_form
+
+    # Compute model's predicted formation energy per atom
+    # E_form_pred = E_total_pred / n_atoms - E_ref_per_atom
+    y_pred_form = (y_pred / n_atoms) - ref_energy_per_atom
+
+    # Formation energy error (this is what affects stability predictions!)
+    e_form_error = y_pred_form - y_true_form
+
+    # Predict energy above hull by adding formation energy error to ground truth hull distance
     each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values
     each_pred = each_true + e_form_error