From 478b6d1688f65f0c16eadbd35244db204cb1fd5d Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 19 Nov 2025 13:57:47 -0700 Subject: [PATCH 01/23] first pass at matbench running locally and on a remote HPC via globus compute, multi-gpu support --- garden_ai/benchmarks/__init__.py | 34 ++ .../benchmarks/matbench_discovery/README.md | 285 +++++++++++ .../benchmarks/matbench_discovery/__init__.py | 156 ++++++ .../benchmarks/matbench_discovery/enums.py | 15 + .../examples/matbench_1000_structures.py | 83 ++++ .../examples/matbench_test.py | 31 ++ .../examples/matbench_test_local_mps.py | 46 ++ .../examples/matbench_test_remote.py | 59 +++ .../matbench_discovery/remote_runner.py | 470 ++++++++++++++++++ .../benchmarks/matbench_discovery/tasks.py | 321 ++++++++++++ 10 files changed, 1500 insertions(+) create mode 100644 garden_ai/benchmarks/__init__.py create mode 100644 garden_ai/benchmarks/matbench_discovery/README.md create mode 100644 garden_ai/benchmarks/matbench_discovery/__init__.py create mode 100644 garden_ai/benchmarks/matbench_discovery/enums.py create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py create mode 100644 garden_ai/benchmarks/matbench_discovery/remote_runner.py create mode 100644 garden_ai/benchmarks/matbench_discovery/tasks.py diff --git a/garden_ai/benchmarks/__init__.py b/garden_ai/benchmarks/__init__.py new file mode 100644 index 00000000..329de6cc --- /dev/null +++ b/garden_ai/benchmarks/__init__.py @@ -0,0 +1,34 @@ +"""Garden AI benchmarking framework. + +This module provides interfaces for running standardized benchmarks on +models hosted in Garden AI or developed locally. 
+ +Available benchmarks: + - MatbenchDiscovery: Materials discovery benchmark suite +""" + +from .matbench_discovery import IS2RETask, MatbenchDiscovery, MatbenchTask + +__all__ = [ + "MatbenchDiscovery", + "MatbenchTask", + "IS2RETask", +] + + +def publish_benchmark_result(benchmark, model, results): + """Publish benchmark results to Garden AI backend. + + This is a placeholder for future functionality to store benchmark + results alongside published models. + + Args: + benchmark: Benchmark adapter instance + model: Model that was benchmarked + results: Dictionary of benchmark metrics + """ + # TODO: Implement when backend API is ready + raise NotImplementedError( + "Publishing benchmark results is not yet implemented. " + "For now, save results locally or to your own storage." + ) diff --git a/garden_ai/benchmarks/matbench_discovery/README.md b/garden_ai/benchmarks/matbench_discovery/README.md new file mode 100644 index 00000000..4273cd02 --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/README.md @@ -0,0 +1,285 @@ +# Matbench Discovery Benchmark Adapter + +Minimal viable implementation for running [Matbench Discovery](https://matbench-discovery.materialsproject.org/) benchmarks on remote HPC systems via Globus Compute. + +## Overview + +This adapter enables Garden AI users to benchmark their materials models against the Matbench Discovery test suite without manually managing HPC jobs, environment setup, or data transfers. 
+ +### Current Status: MVP + +**Implemented:** +- ✅ IS2RE (Initial Structure to Relaxed Energy) task +- ✅ Remote environment setup with UV +- ✅ Automatic dependency installation +- ✅ Basic metric calculation +- ✅ Multi-GPU parallelization (automatic GPU detection and work distribution) + +**Future Work:** +- ⏳ Additional tasks (RS2RE, S2EFS, thermal conductivity) +- ⏳ Globus Transfer for model weights and large datasets +- ⏳ Checkpointing and failure recovery +- ⏳ Full metric calculation against DFT ground truth +- ⏳ Backend integration for result publishing + +## Architecture + +``` +User's Machine Remote HPC Endpoint +├─ MatbenchDiscovery ├─ Clone matbench-discovery repo +│ ├─ tasks.IS2RE │ ├─ Set up UV virtual environment +│ └─ Globus Compute Executor ───┼─>├─ Install dependencies + │ │ ├─ matbench-discovery + │ │ └─ model package (e.g., mace-torch) + │ ├─ Load test structures via DataFiles + │ ├─ Run structure relaxations + │ ├─ Calculate metrics + │ └─ Return results +``` + +## File Structure + +``` +matbench_discovery/ +├── __init__.py # Main adapter class (MatbenchDiscovery) +├── tasks.py # Task implementations (IS2RETask) +├── remote_runner.py # Remote execution functions +├── enums.py # Task enumerations +├── examples/ # Usage examples +└── README.md # This file +``` + +## Usage + +### Basic Example + +```python +from garden_ai.benchmarks import MatbenchDiscovery +from my_model import MyModel + +# Configure endpoint +endpoint_id = "your-endpoint-uuid" +endpoint_config = { + "account": "project-account", + "partition": "gpu-debug", + "scheduler_options": "#SBATCH --gpus-per-node=1" +} + +# Run benchmark +with MatbenchDiscovery(endpoint_id, endpoint_config) as bench: + model = MyModel() + task = bench.tasks.IS2RE + + # Submit job (returns immediately) + future = task.submit(model, num_structures=100) + + # Wait for completion + results = future.result() + + # Calculate metrics + metrics = task.calculate_metrics(results) + print(metrics) +``` + +### Multi-GPU 
Parallelization + +The adapter automatically detects and uses all available GPUs on the compute node for parallel processing. This significantly improves throughput for large-scale benchmarks. + +**Example: 4-GPU Configuration on Anvil** + +```python +from garden_ai.benchmarks import MatbenchDiscovery + +endpoint_id = "your-endpoint-uuid" +endpoint_config = { + "account": "your-account", + "qos": "gpu", + "partition": "gpu", + "scheduler_options": "#SBATCH --gpus-per-node=4\n#SBATCH --time=4:00:00\n#SBATCH --mem=64G", + "worker_init": "pip install --user uv", +} + +with MatbenchDiscovery(endpoint_id, endpoint_config) as bench: + task = bench.tasks.IS2RE + + # Multi-GPU is enabled by default + future = task.submit( + model_package="mace-torch", + model_factory="mace_mp", + model_kwargs={"model": "medium", "device": "cuda"}, + num_structures=1000, + use_multi_gpu=True, # Default: True + ) + + results = future.result() + metrics = task.calculate_metrics(results) +``` + +**How it works:** +1. Automatically detects available GPUs using `torch.cuda.device_count()` +2. Splits structures into equal batches (one per GPU) +3. Processes batches in parallel using multiprocessing +4. Aggregates results from all workers + +**Performance expectations:** +- **Single GPU**: ~10-20 structures/hour (baseline) +- **4 GPUs**: ~3-4x speedup (~40-80 structures/hour) +- Actual performance depends on model complexity and structure size + +**Disabling multi-GPU:** +```python +future = task.submit( + model_package="mace-torch", + model_factory="mace_mp", + model_kwargs={"model": "medium", "device": "cuda"}, + num_structures=100, + use_multi_gpu=False, # Use single GPU/CPU +) +``` + +### Scaling Guide + +**Recommended test progression:** + +1. **Small test (10-100 structures)**: Verify setup and model compatibility + - Partition: `gpu-debug` + - Time: 30 minutes + - GPUs: 1-4 + +2. 
**Medium test (1000 structures)**: Test multi-GPU parallelization + - Partition: `gpu` + - Time: 4 hours + - GPUs: 4 + - Expected throughput: ~250-300 structures/hour with 4 GPUs + +3. **Full dataset (~257k structures)**: Production run + - Partition: `gpu` + - Time: 48+ hours + - GPUs: 4 + - Consider implementing checkpointing for runs >24 hours + +### Model Requirements + +For the MVP, models must: + +1. **Be pip-installable** (or provide package name) +2. **Implement ASE calculator interface** (or be convertible to one) +3. **Have a checkpoint file** (optional, can be None for models with default weights) + +Example model: + +```python +class MyModel: + def __init__(self): + self.checkpoint_path = "/path/to/checkpoint.pt" + + # ASE calculator interface + def calculate(self, atoms, properties, system_changes): + # Calculate energy, forces, stress + ... +``` + +### Workflow Details + +When you call `task.submit(model)`: + +1. **Model introspection**: Extracts model class name, module, and checkpoint path +2. **Remote submission**: Sends job to Globus Compute endpoint +3. **Environment setup** (on remote): + - Clones matbench-discovery repository + - Creates Python 3.11 virtual environment with UV + - Installs matbench-discovery package + - Installs model package (e.g., `pip install mace-torch`) +4. **Benchmark execution**: + - Loads test structures using `DataFiles.wbm_initial_atoms` + - Instantiates model and loads checkpoint + - Runs geometry optimizations (ASE FIRE optimizer) + - Collects results +5. 
**Result return**: Returns energies, convergence stats, and failures + +## Configuration Options + +### MatbenchDiscovery + +```python +MatbenchDiscovery( + endpoint_id="uuid", # Optional: Globus Compute endpoint (required for remote execution) + user_endpoint_config=dict, # Optional: HPC scheduler config + repo_ref="main", # Optional: Git ref to use + model_package="mace-torch" # Optional: Default model package +) +``` + +### IS2RETask.submit() + +```python +task.submit( + model, # Required: Model instance + num_structures=100, # Optional: Number of structures to test + model_package="mace-torch", # Optional: Override default package + use_multi_gpu=True, # Optional: Enable multi-GPU (default: True) +) +``` + +## Design Decisions + +### Why UV? +- Fast, deterministic installs +- Handles both `pyproject.toml` and `requirements.txt` +- Built-in venv creation with specific Python versions + +### Why DataFiles auto-download? +- Avoids manual Globus Transfer setup for MVP +- Matbench's DataFiles handles caching automatically +- Can optimize with explicit transfer later + +### Why ASE calculator interface? +- Standard in materials modeling community +- Most interatomic potentials support it (MACE, M3GNet, CHGNet, etc.) +- Simple adaptation layer if needed + +### Why multiprocessing for multi-GPU? +- Simple and effective for within-node parallelization +- Uses the `spawn` start method, avoiding CUDA initialization issues with fork +- Each GPU gets isolated process with dedicated memory +- Easy to debug and monitor per-GPU progress + +## Limitations + +1. **No weight transfer**: Model checkpoints must be accessible from remote (URL or shared filesystem) +2. **Basic metrics**: Only reports convergence stats, not comparison to DFT ground truth +3. **IS2RE only**: Other tasks not yet implemented +4. **No checkpointing**: If job fails, must restart from scratch (checkpointing is recommended for runs >24 hours) +5. **No result publishing**: Backend integration not yet implemented +6. 
**Single-node parallelization**: Multi-GPU works within a node; SLURM array jobs for multi-node not yet implemented + +## Next Steps + +To generalize beyond Matbench: + +1. **Extract base classes**: `BenchmarkAdapter`, `BenchmarkTask`, `RemoteRunner` +2. **Add data staging**: Implement Globus Transfer for weights/datasets +3. **Define model interface**: Standard protocol for model serialization +4. **Add checkpointing**: Save intermediate results for failure recovery +5. **Implement batching**: Distribute work across SLURM array jobs + +## Testing + +```bash +# Install dependencies +cd garden_ai/benchmarks/matbench_discovery +pip install -e . + +# Update example.py with your endpoint details +vim example.py + +# Run example +python example.py +``` + +## References + +- [Matbench Discovery](https://matbench-discovery.materialsproject.org/) +- [Matbench Discovery GitHub](https://github.com/janosh/matbench-discovery) +- [Globus Compute](https://globus-compute.readthedocs.io/) +- [ASE Calculator Interface](https://wiki.fysik.dtu.dk/ase/ase/calculators/calculators.html) diff --git a/garden_ai/benchmarks/matbench_discovery/__init__.py b/garden_ai/benchmarks/matbench_discovery/__init__.py new file mode 100644 index 00000000..3f687a6c --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/__init__.py @@ -0,0 +1,156 @@ +"""Matbench Discovery benchmark adapter for Garden AI. + +This module provides a clean interface for running Matbench Discovery benchmarks +on remote HPC systems via Globus Compute. It handles environment setup, +dependency installation, and benchmark execution. + +Example usage: + >>> from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery + >>> from my_model import MyModel + >>> + >>> # Configure for your HPC endpoint + >>> endpoint_id = "your-endpoint-uuid" + >>> endpoint_config = { + ... "account": "project-account", + ... "partition": "gpu", + ... "scheduler_options": "#SBATCH --gpus-per-node=1" + ... 
} + >>> + >>> # Run benchmark + >>> with MatbenchDiscovery(endpoint_id, endpoint_config) as bench: + ... model = MyModel() + ... task = bench.tasks.IS2RE + ... future = task.submit(model, num_structures=100) + ... results = future.result() + ... metrics = task.calculate_metrics(results) + ... print(metrics) +""" + +from typing import Any + +from globus_compute_sdk import Executor +from globus_compute_sdk.serialize import CombinedCode, ComputeSerializer + +from .enums import MatbenchTask +from .tasks import IS2RETask + +__all__ = [ + "MatbenchDiscovery", + "MatbenchTask", + "IS2RETask", +] + + +class MatbenchDiscovery: + """Adapter for running Matbench Discovery benchmarks locally or remotely. + + This class manages the lifecycle of benchmark execution: + - Provides access to benchmark tasks (IS2RE, etc.) + - For remote execution: creates and manages Globus Compute executor + - For local execution: runs in ephemeral UV environment + + Use as a context manager to ensure proper cleanup: + # Local execution + with MatbenchDiscovery() as bench: + result = bench.tasks.IS2RE.local(...) + + # Remote execution + with MatbenchDiscovery(endpoint_id="uuid", endpoint_config={...}) as bench: + future = bench.tasks.IS2RE.submit(...) + + Attributes: + tasks: Namespace containing available benchmark tasks + - tasks.IS2RE: Initial Structure to Relaxed Energy task + """ + + # Matbench Discovery repository configuration + REPO_URL = "https://github.com/janosh/matbench-discovery" + REPO_REF = "main" + PYTHON_VERSION = "3.11" + + def __init__( + self, + endpoint_id: str | None = None, + user_endpoint_config: dict[str, Any] | None = None, + repo_ref: str | None = None, + model_package: str | None = None, + ): + """Initialize Matbench Discovery adapter. + + Args: + endpoint_id: Globus Compute endpoint UUID for remote execution. + If None, only local execution (.local()) is available. + user_endpoint_config: Optional HPC configuration for remote endpoint. 
+ Example for SLURM: + { + "account": "project-account", + "partition": "gpu-debug", + "scheduler_options": "#SBATCH --gpus-per-node=1" + } + repo_ref: Git branch/tag/commit to use (default: "main") + model_package: Default model package to install for all tasks + (can be overridden per task) + """ + self.endpoint_id = endpoint_id + self.user_endpoint_config = user_endpoint_config + self.repo_ref = repo_ref or self.REPO_REF + self.model_package = model_package + + # Executor is created lazily on first submit() call + self._executor: Executor | None = None + self.tasks: Any = None + + def _get_executor(self) -> Executor: + """Get or create the Globus Compute executor (lazy initialization). + + Returns: + Executor instance + + Raises: + ValueError: If endpoint_id was not provided during initialization + """ + if self._executor is None: + if self.endpoint_id is None: + raise ValueError( + "endpoint_id is required for remote execution. " + "Either provide endpoint_id during initialization or use .local() method." 
+ ) + + executor_kwargs = {"endpoint_id": self.endpoint_id} + if self.user_endpoint_config: + executor_kwargs["user_endpoint_config"] = self.user_endpoint_config + + # Use CombinedCode serialization to send actual function code + # rather than module references (avoids needing garden_ai installed remotely) + executor_kwargs["serializer"] = ComputeSerializer( + strategy_code=CombinedCode() + ) + + self._executor = Executor(**executor_kwargs) + + return self._executor + + def __enter__(self): + """Set up tasks when entering context.""" + # Initialize tasks - executor will be created lazily when needed + # Using a simple namespace object for dot access + self.tasks = type( + "Tasks", + (), + { + "IS2RE": IS2RETask( + adapter=self, # Pass adapter instead of executor + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, + ) + }, + )() + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Clean up executor when exiting context.""" + if self._executor: + self._executor.shutdown(wait=True) + return False # Don't suppress exceptions diff --git a/garden_ai/benchmarks/matbench_discovery/enums.py b/garden_ai/benchmarks/matbench_discovery/enums.py new file mode 100644 index 00000000..8cc2f99b --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/enums.py @@ -0,0 +1,15 @@ +"""Enums for Matbench Discovery benchmark tasks.""" + +from enum import Enum + + +class MatbenchTask(Enum): + """Available Matbench Discovery benchmark tasks. + + Currently only IS2RE is implemented for the MVP. 
+ Future tasks could include: + - RS2RE: Relaxed Structure to Relaxed Energy + - S2EFS: Structure to Energy, Forces, and Stress + """ + + IS2RE = "is2re" # Initial Structure to Relaxed Energy diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py new file mode 100644 index 00000000..151b043e --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py @@ -0,0 +1,83 @@ +"""Test Matbench Discovery benchmark on Anvil HPC with 1000 structures. + +This script demonstrates scaling to 1000 structures using 2 GPUs in parallel. +It's designed to test the multi-GPU parallelization implementation and measure +throughput before attempting the full dataset. +""" + +from garden_ai.benchmarks import MatbenchDiscovery + +ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" + +ENDPOINT_CONFIG = { + "account": "cis250461-gpu", # HPC allocation/account + "qos": "gpu", + "partition": "gpu-debug", # Debug partition; use the full "gpu" partition for longer runs + "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --time=00:30:00\n#SBATCH --mem=32G", + "worker_init": "pip install --user uv", # Install uv on worker startup +} + +MODEL_PACKAGE = "mace-torch" +MODEL_FACTORY = "mace_mp" +MODEL_KWARGS = { + "model": "medium", + "device": "cuda", # Use GPU on HPC + "default_dtype": "float64", +} + +NUM_STRUCTURES = 1000 + +print("=" * 80) +print("Matbench Discovery IS2RE Benchmark - 1000 Structures") +print("=" * 80) +print(f"Endpoint: {ENDPOINT_ID}") +print(f"Model: {MODEL_PACKAGE} / {MODEL_FACTORY}") +print(f"Structures: {NUM_STRUCTURES}") +print("Multi-GPU: Enabled (2 GPUs)") +print("=" * 80) + +with MatbenchDiscovery( + endpoint_id=ENDPOINT_ID, + user_endpoint_config=ENDPOINT_CONFIG, +) as bench: + task = bench.tasks.IS2RE + + future = task.submit( + model_package=MODEL_PACKAGE, + model_factory=MODEL_FACTORY, + model_kwargs=MODEL_KWARGS, + 
num_structures=NUM_STRUCTURES, + use_multi_gpu=True, # Enable multi-GPU parallelization + ) + + print("\nJob submitted! Waiting for results...") + print("This may take a while. You can monitor progress in the Globus Compute logs.") + print() + + try: + result = future.result() + metrics = task.calculate_metrics(result) + + print("\nResults:") + print("=" * 80) + for key, value in metrics.items(): + print(f" {key}: {value}") + + print("=" * 80) + print("\nRaw Results:") + print(f" Converged: {result['num_converged']}") + print(f" Failed: {len(result.get('failed_indices', []))}") + if result.get("energies"): + valid_energies = [e for e in result["energies"] if e is not None] + if valid_energies: + print(f" Sample energies: {valid_energies[:3]}") + + # Calculate and display throughput + if "num_converged" in result and result["num_converged"] > 0: + print("\nPerformance:") + print(f" Success rate: {metrics.get('success_rate', 0):.1%}") + print(" Note: Check job logs for detailed throughput (structures/hour)") + + except Exception as e: + print(f"\n[ERROR] Benchmark failed: {e}") + raise diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py new file mode 100644 index 00000000..3b2912f8 --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py @@ -0,0 +1,31 @@ +"""Test Matbench Discovery benchmark locally.""" + +from garden_ai.benchmarks import MatbenchDiscovery + +print("Matbench Discovery IS2RE Benchmark") +print("=" * 80) + +with MatbenchDiscovery() as bench: + task = bench.tasks.IS2RE + + # Run benchmark locally + result = task.local( + model_package="mace-torch", + model_factory="mace_mp", + model_kwargs={ + "model": "medium", + "device": "cpu", + "default_dtype": "float32", + }, + num_structures=10, + ) + + # Calculate metrics + metrics = task.calculate_metrics(result) + + # Display results + print("\nResults:") + print("=" * 80) + for key, 
value in metrics.items(): + print(f" {key}: {value}") + print("=" * 80) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py new file mode 100644 index 00000000..9f9bfd8c --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py @@ -0,0 +1,46 @@ +"""Test Matbench Discovery benchmark locally on Mac. + +This script tests the benchmark implementation locally. Note that MPS (Apple Silicon +GPU) is not compatible with MACE model checkpoints which use float64, so this runs +on CPU. This is still useful for verifying the workflow works before using Anvil. +""" + +from garden_ai.benchmarks import MatbenchDiscovery + +print("=" * 80) +print("Matbench Discovery Local Test") +print("=" * 80) + +# Run benchmark locally on CPU (MPS is not used; see the note printed below) +with MatbenchDiscovery() as bench: + task = bench.tasks.IS2RE + + print("\nRunning local benchmark...") + print("Note: Using CPU because MACE model checkpoints use float64,") + print("which is not supported by MPS. 
This is still useful for testing") + print("the workflow before running on Anvil with CUDA.\n") + + result = task.local( + model_package="mace-torch", + model_factory="mace_mp", + model_kwargs={ + "model": "medium", + "device": "cpu", # MPS doesn't support float64 used by MACE checkpoints + "default_dtype": "float32", + }, + num_structures=10, # Small test to verify workflow + use_multi_gpu=False, + ) + + # Calculate metrics + metrics = task.calculate_metrics(result) + + # Display results + print("\nResults:") + print("=" * 80) + for key, value in metrics.items(): + print(f" {key}: {value}") + print("=" * 80) + + print("\nLocal test complete!") + print("If this works, you can proceed with confidence to run on Anvil.") diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py new file mode 100644 index 00000000..08be5dca --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py @@ -0,0 +1,59 @@ +"""Test Matbench Discovery benchmark on a remote HPC endpoint.""" + +from garden_ai.benchmarks import MatbenchDiscovery + +ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" + +ENDPOINT_CONFIG = { + "account": "cis250461-gpu", # HPC allocation/account + "qos": "gpu", + "partition": "gpu-debug", # SLURM partition + "scheduler_options": "#SBATCH --gpus-per-node=4", # Request 4 GPUs + "worker_init": "pip install --user uv", # Install uv on worker startup +} + +MODEL_PACKAGE = "mace-torch" +MODEL_FACTORY = "mace_mp" +MODEL_KWARGS = { + "model": "medium", + "device": "cuda", # Use GPU on HPC + "default_dtype": "float32", +} + +NUM_STRUCTURES = 100 # Increased from 10 to test multi-GPU parallelization + +with MatbenchDiscovery( + endpoint_id=ENDPOINT_ID, + user_endpoint_config=ENDPOINT_CONFIG, +) as bench: + task = bench.tasks.IS2RE + + future = task.submit( + model_package=MODEL_PACKAGE, + model_factory=MODEL_FACTORY, + 
model_kwargs=MODEL_KWARGS, + num_structures=NUM_STRUCTURES, + use_multi_gpu=True, # Enable multi-GPU parallelization + ) + + try: + result = future.result() + metrics = task.calculate_metrics(result) + + print("\nResults:") + print("=" * 80) + for key, value in metrics.items(): + print(f" {key}: {value}") + + print("=" * 80) + print("\nRaw Results:") + print(f" Converged: {result['num_converged']}") + print(f" Failed: {len(result.get('failed_indices', []))}") + if result.get("energies"): + valid_energies = [e for e in result["energies"] if e is not None] + if valid_energies: + print(f" Sample energies: {valid_energies[:3]}") + + except Exception as e: + print(f"\n[ERROR] Benchmark failed: {e}") + raise diff --git a/garden_ai/benchmarks/matbench_discovery/remote_runner.py b/garden_ai/benchmarks/matbench_discovery/remote_runner.py new file mode 100644 index 00000000..109c9944 --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/remote_runner.py @@ -0,0 +1,470 @@ +"""Remote execution functions for Matbench Discovery benchmarks. + +These functions are serialized and executed on Globus Compute endpoints. +They handle environment setup, dependency installation, and benchmark execution. +""" + + +def run_matbench_is2re( + repo_url: str, + repo_ref: str, + model_package: str, + model_factory: str, + model_kwargs: dict, + model_checkpoint: str | None, + num_structures: int, + use_multi_gpu: bool = True, +) -> dict: + """Run Matbench IS2RE benchmark on remote Globus Compute endpoint. + + This function performs the complete benchmark workflow: + 1. Set up Python environment with UV + 2. Install dependencies (matbench-discovery + model package) + 3. Execute benchmark runner script in the environment + 4. 
Return results + + Args: + repo_url: GitHub URL for matbench-discovery repo + repo_ref: Git branch/tag/commit to checkout + model_package: Python package name to install (e.g., "mace-torch") + model_factory: Function or class name to create model (e.g., "mace_mp", "MACE") + model_kwargs: Dictionary of kwargs to pass when creating model + model_checkpoint: Path/URL to model checkpoint file (optional) + num_structures: Number of test structures to run (subset for MVP) + use_multi_gpu: If True, automatically detect and use all available GPUs + in parallel. If False, use single GPU/CPU. (default: True) + + Returns: + Dictionary with benchmark results: + - energies: List of final energies (None for failed relaxations) + - num_converged: Count of successful relaxations + - failed_indices: List of structure indices that failed + + Raises: + RuntimeError: If benchmark execution fails + """ + # All imports must be inside the function for CombinedCode serialization + import json + import logging + import os + import subprocess + import sys + import tempfile + from pathlib import Path + + # Configure logging + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + stream=sys.stdout, + force=True, + ) + # Ensure stdout is unbuffered + if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(line_buffering=True) + + logger = logging.getLogger(__name__) + + # Create isolated working directory + work_dir = Path(tempfile.mkdtemp(prefix="matbench_benchmark_")) + + # This script runs INSIDE the virtual environment + BENCHMARK_RUNNER_SCRIPT = ''' +import json +import sys +import time +import logging +import os +import concurrent.futures +from pathlib import Path +from typing import List, Dict, Any, Optional + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [%(levelname)s] [%(name)s] [PID:%(process)d] %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + stream=sys.stdout, + 
force=True +) +logger = logging.getLogger("benchmark_runner") + +def setup_device(gpu_id: Optional[int] = None) -> str: + """Setup compute device for this process.""" + import torch + + if gpu_id is not None and torch.cuda.is_available(): + # Set visible devices to just this GPU to avoid contention + # and ensure model uses the correct device + os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + return "cuda:0" + elif torch.cuda.is_available(): + return "cuda" + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return "mps" + else: + return "cpu" + +def process_batch( + batch_id: int, + structures: List[Any], + start_idx: int, + model_config: Dict[str, Any] +) -> Dict[str, Any]: + """Process a batch of structures on a specific device.""" + + # Setup logging for this worker + worker_logger = logging.getLogger(f"worker_{batch_id}") + worker_logger.setLevel(logging.INFO) + + gpu_id = model_config.get("gpu_id") + device = setup_device(gpu_id) + worker_logger.info(f"Worker {batch_id} started on {device} with {len(structures)} structures") + + # Initialize model + try: + import importlib + + package_name = model_config["package"] + factory_name = model_config["factory"] + kwargs = model_config["kwargs"].copy() + checkpoint = model_config.get("checkpoint") + + # Update device in kwargs + if "device" in kwargs: + kwargs["device"] = device + + # Import factory + module_parts = package_name.split(".") + if len(module_parts) > 1: + module = importlib.import_module(package_name) + factory = getattr(module, factory_name) + else: + base_module = module_parts[0].split("-")[0] + try: + module = importlib.import_module(f"{base_module}.calculators") + factory = getattr(module, factory_name) + except (ImportError, AttributeError): + module = importlib.import_module(base_module) + factory = getattr(module, factory_name) + + # Create model + model = factory(**kwargs) + + # Load checkpoint + if checkpoint and checkpoint != "None": + if hasattr(model, 
"load_checkpoint"): + model.load_checkpoint(checkpoint) + elif hasattr(model, "load_state_dict"): + import torch + model.load_state_dict(torch.load(checkpoint)) + + except Exception as e: + worker_logger.error(f"Failed to initialize model: {e}") + return { + "energies": [None] * len(structures), + "num_converged": 0, + "failed_indices": [start_idx + i for i in range(len(structures))], + "error": str(e) + } + + # Run relaxations + from ase.optimize import FIRE + + energies = [] + failed_indices = [] + num_converged = 0 + + batch_start = time.time() + + for i, atoms in enumerate(structures): + global_idx = start_idx + i + try: + atoms.calc = model + opt = FIRE(atoms, logfile=None) + opt.run(fmax=0.05, steps=500) + + energy = atoms.get_potential_energy() + energies.append(energy) + num_converged += 1 + + # Log progress occasionally + if (i + 1) % 10 == 0: + elapsed = time.time() - batch_start + rate = (i + 1) / elapsed if elapsed > 0 else 0 + eta = (len(structures) - i - 1) / rate if rate > 0 else 0 + worker_logger.info( + f"Progress: {i+1}/{len(structures)} " + f"({rate:.2f} struct/s, ETA: {eta/60:.1f}m)" + ) + + except Exception as e: + worker_logger.warning(f"Structure {global_idx} failed: {e}") + energies.append(None) + failed_indices.append(global_idx) + + return { + "energies": energies, + "num_converged": num_converged, + "failed_indices": failed_indices + } + +def main(): + if len(sys.argv) != 2: + print("Usage: python benchmark_runner.py ") + sys.exit(1) + + config_path = sys.argv[1] + with open(config_path) as f: + config = json.load(f) + + logger.info("Starting benchmark runner...") + + # Load structures + logger.info("Loading structures...") + try: + from matbench_discovery.data import DataFiles + from zipfile import ZipFile + from ase.io import read + from io import TextIOWrapper + + structures = [] + zip_path = DataFiles.wbm_initial_atoms.path + num_structures = config["num_structures"] + + with ZipFile(zip_path, 'r') as zf: + file_list = sorted( + 
zf.namelist(), + key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else float('inf') + ) + for i, filename in enumerate(file_list[:num_structures]): + with zf.open(filename) as f: + text_stream = TextIOWrapper(f, encoding='utf-8') + atoms = read(text_stream, format='extxyz') + structures.append(atoms) + + logger.info(f"Loaded {len(structures)} structures") + + except Exception as e: + logger.error(f"Failed to load structures: {e}") + sys.exit(1) + + # Determine parallelization strategy + import torch + num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 + use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1 + + results = { + "energies": [], + "num_converged": 0, + "failed_indices": [] + } + + start_time = time.time() + + if use_multi_gpu: + logger.info(f"Running on {num_gpus} GPUs in parallel") + + # Split structures + batch_size = len(structures) // num_gpus + futures = [] + + # Use 'spawn' start method for CUDA compatibility + import multiprocessing + ctx = multiprocessing.get_context('spawn') + + with concurrent.futures.ProcessPoolExecutor(max_workers=num_gpus, mp_context=ctx) as executor: + for i in range(num_gpus): + start_idx = i * batch_size + end_idx = len(structures) if i == num_gpus - 1 else (i + 1) * batch_size + batch_structures = structures[start_idx:end_idx] + + model_config = { + "package": config["model_package"], + "factory": config["model_factory"], + "kwargs": config["model_kwargs"], + "checkpoint": config["model_checkpoint"], + "gpu_id": i + } + + futures.append( + executor.submit( + process_batch, + i, + batch_structures, + start_idx, + model_config + ) + ) + + # Collect results + for future in concurrent.futures.as_completed(futures): + try: + batch_res = future.result() + results["energies"].extend(batch_res["energies"]) + results["num_converged"] += batch_res["num_converged"] + results["failed_indices"].extend(batch_res["failed_indices"]) + except Exception as e: + logger.error(f"Worker 
failed: {e}") + + else: + logger.info("Running in single process") + model_config = { + "package": config["model_package"], + "factory": config["model_factory"], + "kwargs": config["model_kwargs"], + "checkpoint": config["model_checkpoint"], + # No gpu_id means let model decide or use default + } + + batch_res = process_batch(0, structures, 0, model_config) + results = batch_res + + elapsed = time.time() - start_time + logger.info(f"Benchmark complete in {elapsed:.1f}s") + logger.info(f"Converged: {results['num_converged']}/{len(structures)}") + + # Save results + with open("results.json", "w") as f: + json.dump(results, f, indent=2) + +if __name__ == "__main__": + main() +''' + + try: + # ---------------------------------------------------------------------- + # 1. ENVIRONMENT SETUP + # ---------------------------------------------------------------------- + logger.info("Step 1/4: Setting up environment...") + + uv_bin = ( + subprocess.run( + ["python", "-c", "import uv; print(uv.find_uv_bin())"], + capture_output=True, + ) + .stdout.decode("utf-8") + .strip() + ) + + # Create UV virtual environment + subprocess.run( + [uv_bin, "venv", "--python", "3.11"], + cwd=work_dir, + check=True, + capture_output=True, + text=True, + ) + + venv_python = work_dir / ".venv/bin/python" + if not venv_python.exists(): + # Windows path + venv_python = work_dir / ".venv/Scripts/python.exe" + + if not venv_python.exists(): + raise RuntimeError(f"Virtual environment python not found at {venv_python}") + + # Install matbench-discovery and model package + logger.info("Installing dependencies...") + subprocess.run( + [ + uv_bin, + "pip", + "install", + "--python", + str(venv_python), + "matbench-discovery", + ], + cwd=work_dir, + check=True, + ) + subprocess.run( + [uv_bin, "pip", "install", "--python", str(venv_python), model_package], + cwd=work_dir, + check=True, + ) + + # Set SSL cert file to certifi's CA bundle to fix HPC SSL verification issues + env = dict(os.environ) + 
env["MBD_AUTO_DOWNLOAD_FILES"] = "true" + + try: + certifi_path = subprocess.run( + [str(venv_python), "-c", "import certifi; print(certifi.where())"], + capture_output=True, + text=True, + check=True, + ).stdout.strip() + env["SSL_CERT_FILE"] = certifi_path + except Exception as e: + logger.warning(f"Failed to set SSL_CERT_FILE: {e}") + + # ---------------------------------------------------------------------- + # 2. PREPARE BENCHMARK SCRIPT + # ---------------------------------------------------------------------- + logger.info("Step 2/4: Preparing benchmark script...") + + # Write runner script + runner_path = work_dir / "benchmark_runner.py" + runner_path.write_text(BENCHMARK_RUNNER_SCRIPT) + + # Write config + config = { + "repo_url": repo_url, + "repo_ref": repo_ref, + "model_package": model_package, + "model_factory": model_factory, + "model_kwargs": model_kwargs, + "model_checkpoint": model_checkpoint, + "num_structures": num_structures, + "use_multi_gpu": use_multi_gpu, + } + + config_path = work_dir / "config.json" + with open(config_path, "w") as f: + json.dump(config, f, indent=2) + + # ---------------------------------------------------------------------- + # 3. EXECUTE BENCHMARK + # ---------------------------------------------------------------------- + logger.info("Step 3/4: Executing benchmark...") + + # Run the runner script inside the venv + # We stream output directly to stdout so the user sees progress + proc = subprocess.run( + [str(venv_python), str(runner_path), str(config_path)], + cwd=work_dir, + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + check=False, # We check return code manually + ) + + if proc.returncode != 0: + raise RuntimeError( + f"Benchmark runner failed with return code {proc.returncode}" + ) + + # ---------------------------------------------------------------------- + # 4. 
COLLECT RESULTS + # ---------------------------------------------------------------------- + logger.info("Step 4/4: Collecting results...") + + results_path = work_dir / "results.json" + if not results_path.exists(): + raise RuntimeError( + "Results file not found - benchmark may have crashed silently" + ) + + with open(results_path) as f: + results = json.load(f) + + logger.info("Benchmark completed successfully.") + return results + + finally: + # Cleanup working directory + import shutil + + shutil.rmtree(work_dir, ignore_errors=True) diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py new file mode 100644 index 00000000..f35b1306 --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -0,0 +1,321 @@ +"""Matbench Discovery benchmark task implementations.""" + +from typing import TYPE_CHECKING, Any + +from .remote_runner import run_matbench_is2re + +if TYPE_CHECKING: + from . import MatbenchDiscovery + + +class IS2RETask: + """Initial Structure to Relaxed Energy benchmark task. + + This task evaluates a model's ability to predict the relaxed energy + and geometry of crystal structures starting from unrelaxed initial + configurations. + + The task: + 1. Loads initial (unrelaxed) structures from the WBM test set + 2. Uses the model to perform geometry optimization + 3. Records final energies and relaxed structures + 4. Calculates metrics comparing to DFT ground truth + """ + + def __init__( + self, + adapter: "MatbenchDiscovery", + repo_url: str, + repo_ref: str, + model_package: str | None = None, + ): + """Initialize IS2RE task. 
+ + Args: + adapter: MatbenchDiscovery adapter instance + repo_url: Matbench Discovery repository URL + repo_ref: Git ref (branch/tag/commit) to use + model_package: Default model package to install (can override in submit) + """ + self.adapter = adapter + self.repo_url = repo_url + self.repo_ref = repo_ref + self.model_package = model_package + self.name = "IS2RE" + + def submit( + self, + model=None, + num_structures: int = 100, + model_package: str | None = None, + model_factory: str | None = None, + model_kwargs: dict | None = None, + use_multi_gpu: bool = True, + ): + """Submit IS2RE benchmark job to remote executor. + + You can specify the model in two ways: + 1. Pass a local model instance (will introspect to get remote construction info) + 2. Explicitly specify model_package and model_factory + + Args: + model: (Optional) Local model instance. If provided, will extract + package, class, and checkpoint information from it. + num_structures: Number of test structures to evaluate (default: 100). + Full test set has ~257k structures. Use smaller values + for quick testing. + model_package: Python package name to install (e.g., "mace-torch"). + Required if model is None. + model_factory: How to instantiate the model on remote. Can be: + - Function name: "mace_mp" (will call as function) + - Class name: "MACE" (will instantiate as class) + Required if model is None. + model_kwargs: Dictionary of kwargs to pass when creating model remotely. + Example: {"model": "medium", "device": "cuda"} + use_multi_gpu: If True, automatically detect and use all available GPUs + in parallel for faster processing. If False, use single + GPU/CPU. (default: True) + + Returns: + Future object that will contain benchmark results when complete. + Call .result() to block and wait for completion. 
+ + Examples: + Using local model instance: + >>> from mace.calculators import mace_mp + >>> model = mace_mp(model="medium") + >>> future = task.submit(model, num_structures=50) + + Specifying remote construction explicitly: + >>> future = task.submit( + ... model_package="mace-torch", + ... model_factory="mace_mp", + ... model_kwargs={"model": "medium", "device": "cuda"}, + ... num_structures=50, + ... use_multi_gpu=True + ... ) + """ + # Determine how to construct model remotely + if model is not None: + # Extract info from local model instance + if model_package is None: + if self.model_package is not None: + model_package = self.model_package + else: + # Infer from model's module + model_package = model.__class__.__module__.split(".")[0] + + if model_factory is None: + model_factory = model.__class__.__name__ + + # Get checkpoint path if model has one + model_checkpoint = None + if hasattr(model, "checkpoint_path"): + model_checkpoint = model.checkpoint_path + elif hasattr(model, "checkpoint"): + model_checkpoint = model.checkpoint + + # Try to extract initialization kwargs if available + if model_kwargs is None and hasattr(model, "_init_kwargs"): + model_kwargs = model._init_kwargs + + else: + # Must provide explicit construction info + if model_package is None or model_factory is None: + raise ValueError( + "If model is not provided, must specify both " + "model_package and model_factory" + ) + model_checkpoint = None + + if model_kwargs is None: + model_kwargs = {} + + # Get executor (will create if needed) and submit remote execution + executor = self.adapter._get_executor() + future = executor.submit( + run_matbench_is2re, + repo_url=self.repo_url, + repo_ref=self.repo_ref, + model_package=model_package, + model_factory=model_factory, + model_kwargs=model_kwargs, + model_checkpoint=model_checkpoint, + num_structures=num_structures, + use_multi_gpu=use_multi_gpu, + ) + + return future + + def local( + self, + model=None, + num_structures: int = 100, + 
model_package: str | None = None, + model_factory: str | None = None, + model_kwargs: dict | None = None, + use_multi_gpu: bool = True, + ) -> dict: + """Run benchmark locally in ephemeral UV environment. + + This executes the same benchmark workflow locally instead of submitting + to a remote Globus Compute endpoint. Useful for testing and development. + + Args: + model: Optional local model instance to extract metadata from + num_structures: Number of test structures to evaluate + model_package: Python package name to install (e.g., "mace-torch") + model_factory: Function or class name to create model + model_kwargs: Dictionary of kwargs for model creation + use_multi_gpu: If True, automatically detect and use all available GPUs + in parallel. If False, use single GPU/CPU. (default: True) + + Returns: + Dictionary with benchmark results (same format as remote execution) + + Example: + >>> results = task.local( + ... model_package="mace-torch", + ... model_factory="mace_mp", + ... model_kwargs={"model": "medium", "device": "cpu"}, + ... num_structures=10, + ... use_multi_gpu=False + ... 
) + """ + import json + import subprocess + import tempfile + from pathlib import Path + + # Extract model metadata if model instance provided + if model is not None: + if model_package is None: + if self.model_package is not None: + model_package = self.model_package + else: + model_package = model.__class__.__module__.split(".")[0] + + if model_factory is None: + model_factory = model.__class__.__name__ + + model_checkpoint = None + if hasattr(model, "checkpoint_path"): + model_checkpoint = model.checkpoint_path + elif hasattr(model, "checkpoint"): + model_checkpoint = model.checkpoint + + if model_kwargs is None and hasattr(model, "_init_kwargs"): + model_kwargs = model._init_kwargs + else: + if model_package is None or model_factory is None: + raise ValueError( + "If model is not provided, must specify both " + "model_package and model_factory" + ) + model_checkpoint = None + + if model_kwargs is None: + model_kwargs = {} + + # Run benchmark in subprocess with isolated environment + import sys + + config = { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "model_package": model_package, + "model_factory": model_factory, + "model_kwargs": model_kwargs, + "model_checkpoint": model_checkpoint, + "num_structures": num_structures, + "use_multi_gpu": use_multi_gpu, + } + + results_file_path = ( + Path(tempfile.gettempdir()) / f"benchmark_results_{id(config)}.json" + ) + + wrapper_script = f''' +import json +from garden_ai.benchmarks.matbench_discovery.remote_runner import run_matbench_is2re + +config = {repr(config)} +results = run_matbench_is2re(**config) + +with open("{results_file_path}", "w") as f: + json.dump(results, f, indent=2) +''' + + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: + f.write(wrapper_script) + wrapper_path = f.name + + try: + # Run without capturing output so logs stream to console in real-time + result = subprocess.run( + [sys.executable, wrapper_path], + timeout=3600, + # Don't capture output - let 
it stream to console + stdout=None, + stderr=None, + ) + + if result.returncode != 0: + raise RuntimeError( + f"Local benchmark failed with return code {result.returncode}" + ) + + if not results_file_path.exists(): + raise RuntimeError( + f"Benchmark results file not found at {results_file_path}" + ) + + with open(results_file_path) as f: + return json.load(f) + + finally: + Path(wrapper_path).unlink(missing_ok=True) + results_file_path.unlink(missing_ok=True) + + def calculate_metrics(self, outputs: dict) -> dict[str, Any]: + """Calculate benchmark metrics from raw outputs. + + For MVP, this returns basic statistics. Future versions will compare + against DFT ground truth and calculate proper benchmark metrics like + F1 score, discovery yield, etc. + + Args: + outputs: Dictionary from remote execution containing: + - energies: List of relaxed energies + - num_converged: Number of successful relaxations + - failed_indices: Indices of failed structures + + Returns: + Dictionary of calculated metrics: + - num_attempted: Total structures attempted + - num_converged: Number of successful relaxations + - success_rate: Fraction of successful relaxations + - mean_energy: Average final energy (eV/atom, if available) + - num_failed: Count of failed relaxations + """ + energies = outputs.get("energies", []) + num_converged = outputs.get("num_converged", 0) + failed_indices = outputs.get("failed_indices", []) + + # Filter out None values (failed relaxations) + valid_energies = [e for e in energies if e is not None] + + metrics = { + "num_attempted": len(energies), + "num_converged": num_converged, + "num_failed": len(failed_indices), + "success_rate": num_converged / len(energies) if energies else 0.0, + } + + # Calculate energy statistics if we have valid results + if valid_energies: + metrics["mean_energy"] = sum(valid_energies) / len(valid_energies) + metrics["min_energy"] = min(valid_energies) + metrics["max_energy"] = max(valid_energies) + + return metrics From 
5653925e9ab77ac16d7599ba9a9d6ffee9260a8f Mon Sep 17 00:00:00 2001 From: hholb Date: Fri, 21 Nov 2025 09:46:20 -0700 Subject: [PATCH 02/23] multi-gpu setup working nicely --- .../examples/matbench_1000_structures.py | 83 ------ .../examples/matbench_mace_multi_gpu.py | 97 +++++++ .../matbench_discovery/remote_runner.py | 262 ++++++++---------- .../benchmarks/matbench_discovery/tasks.py | 2 + 4 files changed, 214 insertions(+), 230 deletions(-) delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py deleted file mode 100644 index 151b043e..00000000 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_1000_structures.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Test Matbench Discovery benchmark on Anvil HPC with 1000 structures. - -This script demonstrates scaling to 1000 structures using 4 GPUs in parallel. -It's designed to test the multi-GPU parallelization implementation and measure -throughput before attempting the full dataset. 
-""" - -from garden_ai.benchmarks import MatbenchDiscovery - -ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" - -ENDPOINT_CONFIG = { - "account": "cis250461-gpu", # HPC allocation/account - "qos": "gpu", - "partition": "gpu-debug", # Use full partition (not debug) for longer run - "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --time=00:30:00\n#SBATCH --mem=32G", - "worker_init": "pip install --user uv", # Install uv on worker startup -} - -MODEL_PACKAGE = "mace-torch" -MODEL_FACTORY = "mace_mp" -MODEL_KWARGS = { - "model": "medium", - "device": "cuda", # Use GPU on HPC - "default_dtype": "float64", -} - -NUM_STRUCTURES = 1000 - -print("=" * 80) -print("Matbench Discovery IS2RE Benchmark - 1000 Structures") -print("=" * 80) -print(f"Endpoint: {ENDPOINT_ID}") -print(f"Model: {MODEL_PACKAGE} / {MODEL_FACTORY}") -print(f"Structures: {NUM_STRUCTURES}") -print("Multi-GPU: Enabled (2 GPUs)") -print("=" * 80) - -with MatbenchDiscovery( - endpoint_id=ENDPOINT_ID, - user_endpoint_config=ENDPOINT_CONFIG, -) as bench: - task = bench.tasks.IS2RE - - future = task.submit( - model_package=MODEL_PACKAGE, - model_factory=MODEL_FACTORY, - model_kwargs=MODEL_KWARGS, - num_structures=NUM_STRUCTURES, - use_multi_gpu=True, # Enable multi-GPU parallelization - ) - - print("\nJob submitted! Waiting for results...") - print("This may take a while. 
You can monitor progress in the Globus Compute logs.") - print() - - try: - result = future.result() - metrics = task.calculate_metrics(result) - - print("\nResults:") - print("=" * 80) - for key, value in metrics.items(): - print(f" {key}: {value}") - - print("=" * 80) - print("\nRaw Results:") - print(f" Converged: {result['num_converged']}") - print(f" Failed: {len(result.get('failed_indices', []))}") - if result.get("energies"): - valid_energies = [e for e in result["energies"] if e is not None] - if valid_energies: - print(f" Sample energies: {valid_energies[:3]}") - - # Calculate and display throughput - if "num_converged" in result and result["num_converged"] > 0: - print("\nPerformance:") - print(f" Success rate: {metrics.get('success_rate', 0):.1%}") - print(" Note: Check job logs for detailed throughput (structures/hour)") - - except Exception as e: - print(f"\n[ERROR] Benchmark failed: {e}") - raise diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py new file mode 100644 index 00000000..475ef079 --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py @@ -0,0 +1,97 @@ +"""Test Matbench Discovery benchmark on Anvil HPC. + +This script demonstrates running the IS2RE benchmark with a subset of structures +using multi-GPU parallelization on a Globus Compute endpoint. 
+""" + +from garden_ai.benchmarks import MatbenchDiscovery + +# ------------------------------------------------------------------------------ +# Configuration +# ------------------------------------------------------------------------------ + +# Globus Compute Endpoint ID (Anvil HPC) +ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" + +# Job Configuration +NUM_GPUS = 2 +ENDPOINT_CONFIG = { + "account": "cis250461-gpu", + "qos": "gpu", + "partition": "gpu", + "scheduler_options": f"#SBATCH --gpus-per-node={NUM_GPUS}\n#SBATCH --time=00:30:00\n", + "cores_per_node": 32, + "mem_per_node": 32, # GB + "worker_init": "pip install --user uv", # Ensure uv is available +} + +# Model Configuration +MODEL_PACKAGE = "mace-torch" +MODEL_FACTORY = "mace_mp" +MODEL_KWARGS = { + "model": "medium", + "device": "cuda", + "default_dtype": "float64", +} + +# Benchmark Configuration +NUM_STRUCTURES = 500 + + +def main(): + print("=" * 80) + print("Matbench Discovery IS2RE Benchmark") + print("=" * 80) + print(f"Endpoint: {ENDPOINT_ID}") + print(f"Model: {MODEL_PACKAGE} / {MODEL_FACTORY}") + print(f"Structures: {NUM_STRUCTURES}") + print(f"Resources: {NUM_GPUS} GPUs (Multi-GPU Enabled)") + print("=" * 80) + + with MatbenchDiscovery( + endpoint_id=ENDPOINT_ID, + user_endpoint_config=ENDPOINT_CONFIG, + ) as bench: + task = bench.tasks.IS2RE + + print("\nSubmitting task to endpoint...") + future = task.submit( + model_package=MODEL_PACKAGE, + model_factory=MODEL_FACTORY, + model_kwargs=MODEL_KWARGS, + num_structures=NUM_STRUCTURES, + use_multi_gpu=True, + ) + + print("Job submitted! 
Waiting for results (this may take a while)...") + + try: + result = future.result() + metrics = task.calculate_metrics(result) + + print("\n" + "=" * 80) + print("Benchmark Results") + print("=" * 80) + + # Print primary metrics + for key, value in metrics.items(): + print(f"{key:<20}: {value}") + + print("-" * 80) + print(f"Converged: {result['num_converged']} / {NUM_STRUCTURES}") + print(f"Failed: {len(result.get('failed_indices', []))}") + + if result.get("energies"): + valid_energies = [e for e in result["energies"] if e is not None] + if valid_energies: + print(f"Sample energies: {valid_energies[:3]} ...") + + print("=" * 80) + + except Exception as e: + print(f"\n[ERROR] Benchmark failed: {e}") + raise + + +if __name__ == "__main__": + main() diff --git a/garden_ai/benchmarks/matbench_discovery/remote_runner.py b/garden_ai/benchmarks/matbench_discovery/remote_runner.py index 109c9944..432d43cc 100644 --- a/garden_ai/benchmarks/matbench_discovery/remote_runner.py +++ b/garden_ai/benchmarks/matbench_discovery/remote_runner.py @@ -55,12 +55,10 @@ def run_matbench_is2re( # Configure logging logging.basicConfig( level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", stream=sys.stdout, force=True, + format="%(asctime)s [%(levelname)s] %(message)s", ) - # Ensure stdout is unbuffered if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(line_buffering=True) @@ -77,13 +75,21 @@ def run_matbench_is2re( import logging import os import concurrent.futures +import importlib from pathlib import Path from typing import List, Dict, Any, Optional +from zipfile import ZipFile +from io import TextIOWrapper + +import torch +from ase.io import read +from ase.optimize import FIRE +from matbench_discovery.data import DataFiles # Configure logging logging.basicConfig( level=logging.INFO, - format='%(asctime)s [%(levelname)s] [%(name)s] [PID:%(process)d] %(message)s', + format='%(asctime)s [%(levelname)s] [PID:%(process)d] 
%(message)s', datefmt='%Y-%m-%d %H:%M:%S', stream=sys.stdout, force=True @@ -92,74 +98,72 @@ def run_matbench_is2re( def setup_device(gpu_id: Optional[int] = None) -> str: """Setup compute device for this process.""" - import torch - - if gpu_id is not None and torch.cuda.is_available(): - # Set visible devices to just this GPU to avoid contention - # and ensure model uses the correct device - os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) - return "cuda:0" - elif torch.cuda.is_available(): - return "cuda" + if torch.cuda.is_available(): + return f"cuda:{gpu_id}" if gpu_id is not None else "cuda" elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): return "mps" - else: - return "cpu" + return "cpu" -def process_batch( - batch_id: int, - structures: List[Any], - start_idx: int, - model_config: Dict[str, Any] -) -> Dict[str, Any]: - """Process a batch of structures on a specific device.""" +def load_model(config: Dict[str, Any], device: str): + """Initialize the model from configuration.""" + package_name = config["package"] + factory_name = config["factory"] + kwargs = config["kwargs"].copy() + checkpoint = config.get("checkpoint") - # Setup logging for this worker - worker_logger = logging.getLogger(f"worker_{batch_id}") - worker_logger.setLevel(logging.INFO) - - gpu_id = model_config.get("gpu_id") - device = setup_device(gpu_id) - worker_logger.info(f"Worker {batch_id} started on {device} with {len(structures)} structures") + if "device" in kwargs: + kwargs["device"] = device - # Initialize model + # Import factory function + module_parts = package_name.split(".") try: - import importlib - - package_name = model_config["package"] - factory_name = model_config["factory"] - kwargs = model_config["kwargs"].copy() - checkpoint = model_config.get("checkpoint") - - # Update device in kwargs - if "device" in kwargs: - kwargs["device"] = device - - # Import factory - module_parts = package_name.split(".") if len(module_parts) > 1: module = 
importlib.import_module(package_name) - factory = getattr(module, factory_name) else: + # Try common patterns for model packages base_module = module_parts[0].split("-")[0] try: module = importlib.import_module(f"{base_module}.calculators") - factory = getattr(module, factory_name) - except (ImportError, AttributeError): + except ImportError: module = importlib.import_module(base_module) - factory = getattr(module, factory_name) - # Create model - model = factory(**kwargs) + factory = getattr(module, factory_name) + except (ImportError, AttributeError) as e: + raise ImportError(f"Could not load model factory {factory_name} from {package_name}: {e}") + + # Create model + model = factory(**kwargs) + + # Load checkpoint if provided + if checkpoint and checkpoint != "None": + if hasattr(model, "load_checkpoint"): + model.load_checkpoint(checkpoint) + elif hasattr(model, "load_state_dict"): + model.load_state_dict(torch.load(checkpoint)) + + return model + +def process_batch( + batch_id: int, + structures: List[Any], + start_idx: int, + model_config: Dict[str, Any], + num_threads: int +) -> Dict[str, Any]: + """Process a batch of structures on a specific device.""" + + # Configure thread limits to avoid contention + os.environ["OMP_NUM_THREADS"] = str(num_threads) + torch.set_num_threads(num_threads) - # Load checkpoint - if checkpoint and checkpoint != "None": - if hasattr(model, "load_checkpoint"): - model.load_checkpoint(checkpoint) - elif hasattr(model, "load_state_dict"): - import torch - model.load_state_dict(torch.load(checkpoint)) + gpu_id = model_config.get("gpu_id") + device = setup_device(gpu_id) + worker_logger = logging.getLogger(f"worker_{batch_id}") + worker_logger.info(f"Started on {device} with {len(structures)} structures. 
Threads: {num_threads}") + + try: + model = load_model(model_config, device) except Exception as e: worker_logger.error(f"Failed to initialize model: {e}") return { @@ -169,13 +173,9 @@ def process_batch( "error": str(e) } - # Run relaxations - from ase.optimize import FIRE - energies = [] failed_indices = [] num_converged = 0 - batch_start = time.time() for i, atoms in enumerate(structures): @@ -185,19 +185,13 @@ def process_batch( opt = FIRE(atoms, logfile=None) opt.run(fmax=0.05, steps=500) - energy = atoms.get_potential_energy() - energies.append(energy) + energies.append(atoms.get_potential_energy()) num_converged += 1 - # Log progress occasionally if (i + 1) % 10 == 0: elapsed = time.time() - batch_start rate = (i + 1) / elapsed if elapsed > 0 else 0 - eta = (len(structures) - i - 1) / rate if rate > 0 else 0 - worker_logger.info( - f"Progress: {i+1}/{len(structures)} " - f"({rate:.2f} struct/s, ETA: {eta/60:.1f}m)" - ) + worker_logger.info(f"Progress: {i+1}/{len(structures)} ({rate:.2f} struct/s)") except Exception as e: worker_logger.warning(f"Structure {global_idx} failed: {e}") @@ -210,75 +204,69 @@ def process_batch( "failed_indices": failed_indices } +def load_structures(num_structures: int) -> List[Any]: + """Load structures from the Matbench Discovery dataset.""" + structures = [] + zip_path = DataFiles.wbm_initial_atoms.path + + with ZipFile(zip_path, 'r') as zf: + # Sort files numerically + file_list = sorted( + zf.namelist(), + key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else float('inf') + ) + for filename in file_list[:num_structures]: + with zf.open(filename) as f: + text_stream = TextIOWrapper(f, encoding='utf-8') + structures.append(read(text_stream, format='extxyz')) + return structures + def main(): if len(sys.argv) != 2: - print("Usage: python benchmark_runner.py ") - sys.exit(1) + sys.exit("Usage: python benchmark_runner.py ") - config_path = sys.argv[1] - with open(config_path) as f: + with open(sys.argv[1]) as f: 
config = json.load(f) logger.info("Starting benchmark runner...") - # Load structures - logger.info("Loading structures...") try: - from matbench_discovery.data import DataFiles - from zipfile import ZipFile - from ase.io import read - from io import TextIOWrapper - - structures = [] - zip_path = DataFiles.wbm_initial_atoms.path - num_structures = config["num_structures"] - - with ZipFile(zip_path, 'r') as zf: - file_list = sorted( - zf.namelist(), - key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else float('inf') - ) - for i, filename in enumerate(file_list[:num_structures]): - with zf.open(filename) as f: - text_stream = TextIOWrapper(f, encoding='utf-8') - atoms = read(text_stream, format='extxyz') - structures.append(atoms) - + structures = load_structures(config["num_structures"]) logger.info(f"Loaded {len(structures)} structures") - except Exception as e: logger.error(f"Failed to load structures: {e}") sys.exit(1) - # Determine parallelization strategy - import torch + # Shuffle for load balancing + import random + random.seed(42) + random.shuffle(structures) + + # Resource detection num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1 - results = { - "energies": [], - "num_converged": 0, - "failed_indices": [] - } + total_cores = os.cpu_count() or 1 + num_workers = num_gpus if use_multi_gpu else 1 + # Reserve cores for overhead if possible + available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores + threads_per_worker = max(1, available_cores // num_workers) + + logger.info(f"Resources: {num_gpus} GPUs, {total_cores} Cores. 
Using {num_workers} workers ({threads_per_worker} threads/worker)") + results = {"energies": [], "num_converged": 0, "failed_indices": []} start_time = time.time() if use_multi_gpu: - logger.info(f"Running on {num_gpus} GPUs in parallel") - - # Split structures + logger.info(f"Parallel execution on {num_gpus} GPUs") batch_size = len(structures) // num_gpus futures = [] - # Use 'spawn' start method for CUDA compatibility - import multiprocessing ctx = multiprocessing.get_context('spawn') - with concurrent.futures.ProcessPoolExecutor(max_workers=num_gpus, mp_context=ctx) as executor: for i in range(num_gpus): start_idx = i * batch_size end_idx = len(structures) if i == num_gpus - 1 else (i + 1) * batch_size - batch_structures = structures[start_idx:end_idx] model_config = { "package": config["model_package"], @@ -288,17 +276,10 @@ def main(): "gpu_id": i } - futures.append( - executor.submit( - process_batch, - i, - batch_structures, - start_idx, - model_config - ) - ) - - # Collect results + futures.append(executor.submit( + process_batch, i, structures[start_idx:end_idx], start_idx, model_config, threads_per_worker + )) + for future in concurrent.futures.as_completed(futures): try: batch_res = future.result() @@ -307,29 +288,24 @@ def main(): results["failed_indices"].extend(batch_res["failed_indices"]) except Exception as e: logger.error(f"Worker failed: {e}") - else: - logger.info("Running in single process") + logger.info("Single process execution") model_config = { "package": config["model_package"], "factory": config["model_factory"], "kwargs": config["model_kwargs"], - "checkpoint": config["model_checkpoint"], - # No gpu_id means let model decide or use default + "checkpoint": config["model_checkpoint"] } - - batch_res = process_batch(0, structures, 0, model_config) - results = batch_res + results = process_batch(0, structures, 0, model_config, threads_per_worker) elapsed = time.time() - start_time - logger.info(f"Benchmark complete in {elapsed:.1f}s") - 
logger.info(f"Converged: {results['num_converged']}/{len(structures)}") + logger.info(f"Benchmark complete in {elapsed:.1f}s. Converged: {results['num_converged']}/{len(structures)}") - # Save results with open("results.json", "w") as f: json.dump(results, f, indent=2) if __name__ == "__main__": + import multiprocessing main() ''' @@ -339,14 +315,10 @@ def main(): # ---------------------------------------------------------------------- logger.info("Step 1/4: Setting up environment...") - uv_bin = ( - subprocess.run( - ["python", "-c", "import uv; print(uv.find_uv_bin())"], - capture_output=True, - ) - .stdout.decode("utf-8") - .strip() - ) + # Find UV binary + uv_bin = subprocess.check_output( + [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True + ).strip() # Create UV virtual environment subprocess.run( @@ -354,18 +326,16 @@ def main(): cwd=work_dir, check=True, capture_output=True, - text=True, ) venv_python = work_dir / ".venv/bin/python" if not venv_python.exists(): - # Windows path - venv_python = work_dir / ".venv/Scripts/python.exe" + venv_python = work_dir / ".venv/Scripts/python.exe" # Windows fallback if not venv_python.exists(): raise RuntimeError(f"Virtual environment python not found at {venv_python}") - # Install matbench-discovery and model package + # Install dependencies logger.info("Installing dependencies...") subprocess.run( [ @@ -385,17 +355,15 @@ def main(): check=True, ) - # Set SSL cert file to certifi's CA bundle to fix HPC SSL verification issues + # Set SSL cert file for HPC env = dict(os.environ) env["MBD_AUTO_DOWNLOAD_FILES"] = "true" try: - certifi_path = subprocess.run( + certifi_path = subprocess.check_output( [str(venv_python), "-c", "import certifi; print(certifi.where())"], - capture_output=True, text=True, - check=True, - ).stdout.strip() + ).strip() env["SSL_CERT_FILE"] = certifi_path except Exception as e: logger.warning(f"Failed to set SSL_CERT_FILE: {e}") diff --git 
a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index f35b1306..c8f95e5d 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -278,6 +278,8 @@ def local( results_file_path.unlink(missing_ok=True) def calculate_metrics(self, outputs: dict) -> dict[str, Any]: + # TODO: implement the full metrics calculation, + # this is just a placeholder for now """Calculate benchmark metrics from raw outputs. For MVP, this returns basic statistics. Future versions will compare From e2c2a44216a73bbb9bc8e53b164efae2df544eb9 Mon Sep 17 00:00:00 2001 From: hholb Date: Tue, 2 Dec 2025 10:39:02 -0700 Subject: [PATCH 03/23] checkpoint/resume, more examples --- .../benchmarks/matbench_discovery/__init__.py | 90 +- .../benchmarks/matbench_discovery/enums.py | 46 +- .../examples/matbench_equiformerv2.py | 129 ++ .../examples/matbench_mace_multi_gpu.py | 112 +- .../examples/matbench_mattersim.py | 108 ++ .../examples/matbench_sevennet.py | 111 ++ .../examples/matbench_test.py | 31 - .../examples/matbench_test_local_mps.py | 46 - .../examples/matbench_test_remote.py | 59 - .../examples/run_random_10k_benchmark.py | 223 ++++ .../benchmarks/matbench_discovery/metrics.py | 193 +++ .../matbench_discovery/remote_runner.py | 438 ------- .../benchmarks/matbench_discovery/tasks.py | 1126 +++++++++++++---- 13 files changed, 1806 insertions(+), 906 deletions(-) create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py delete mode 100644 
garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py create mode 100644 garden_ai/benchmarks/matbench_discovery/metrics.py delete mode 100644 garden_ai/benchmarks/matbench_discovery/remote_runner.py diff --git a/garden_ai/benchmarks/matbench_discovery/__init__.py b/garden_ai/benchmarks/matbench_discovery/__init__.py index 3f687a6c..3256522e 100644 --- a/garden_ai/benchmarks/matbench_discovery/__init__.py +++ b/garden_ai/benchmarks/matbench_discovery/__init__.py @@ -31,13 +31,34 @@ from globus_compute_sdk import Executor from globus_compute_sdk.serialize import CombinedCode, ComputeSerializer -from .enums import MatbenchTask -from .tasks import IS2RETask +from .enums import DatasetSize, MatbenchTask +from .tasks import ( + IP2ETask, + IS2ETask, + IS2RETask, + RP2RETask, + RS2RETask, + S2EFSMTask, + S2EFSTask, + S2EFTask, + S2ETask, + S2RETask, +) __all__ = [ "MatbenchDiscovery", "MatbenchTask", + "DatasetSize", "IS2RETask", + "RS2RETask", + "S2EFSTask", + "S2EFTask", + "S2EFSMTask", + "IS2ETask", + "S2ETask", + "S2RETask", + "RP2RETask", + "IP2ETask", ] @@ -92,7 +113,12 @@ def __init__( (can be overridden per task) """ self.endpoint_id = endpoint_id - self.user_endpoint_config = user_endpoint_config + self.user_endpoint_config = user_endpoint_config or {} + + # Ensure 'requirements' is present to avoid endpoint template errors + if "requirements" not in self.user_endpoint_config: + self.user_endpoint_config["requirements"] = "" + self.repo_ref = repo_ref or self.REPO_REF self.model_package = model_package @@ -139,11 +165,65 @@ def __enter__(self): (), { "IS2RE": IS2RETask( - adapter=self, # Pass adapter instead of executor + adapter=self, repo_url=self.REPO_URL, repo_ref=self.repo_ref, model_package=self.model_package, - ) + ), + "RS2RE": RS2RETask( + adapter=self, + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, 
+ ), + "S2EFS": S2EFSTask( + adapter=self, + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, + ), + "S2EF": S2EFTask( + adapter=self, + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, + ), + "S2EFSM": S2EFSMTask( + adapter=self, + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, + ), + "IS2E": IS2ETask( + adapter=self, + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, + ), + "S2E": S2ETask( + adapter=self, + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, + ), + "S2RE": S2RETask( + adapter=self, + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, + ), + "RP2RE": RP2RETask( + adapter=self, + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, + ), + "IP2E": IP2ETask( + adapter=self, + repo_url=self.REPO_URL, + repo_ref=self.repo_ref, + model_package=self.model_package, + ), }, )() diff --git a/garden_ai/benchmarks/matbench_discovery/enums.py b/garden_ai/benchmarks/matbench_discovery/enums.py index 8cc2f99b..5c34cb6b 100644 --- a/garden_ai/benchmarks/matbench_discovery/enums.py +++ b/garden_ai/benchmarks/matbench_discovery/enums.py @@ -12,4 +12,48 @@ class MatbenchTask(Enum): - S2EFS: Structure to Energy, Forces, and Stress """ - IS2RE = "is2re" # Initial Structure to Relaxed Energy + IS2RE = "IS2RE" # Initial Structure to Relaxed Energy + RS2RE = "RS2RE" # Relaxed Structure to Relaxed Energy + S2EFS = "S2EFS" # Structure to Energy, Forces, Stress + S2EF = "S2EF" # Structure to Energy, Force + S2EFSM = "S2EFSM" # Structure to Energy, Force, Stress, Magmoms + IS2E = "IS2E" # Initial Structure to Energy + S2E = "S2E" # Structure to Energy + S2RE = "S2RE" # Structure to Relaxed Energy + RP2RE = "RP2RE" # Relaxed Prototype to Relaxed Energy + IP2E = "IP2E" # Initial Prototype to Energy + + +class DatasetSize(str, 
Enum): + """Predefined dataset sizes for Matbench Discovery benchmarks. + + These correspond to different subsets of the WBM test set that are commonly + used for evaluating materials discovery models. + """ + + FULL = "full" + """Full WBM test set (~257k structures)""" + + UNIQUE_PROTOS = "unique_protos" + """Unique prototypes subset (~215k structures) - removes duplicate prototypes""" + + RANDOM_10K = "random_10k" + """Random 10k structures from the unique prototypes subset (fixed seed)""" + + RANDOM_100 = "random_100" + """Random 100 structures for quick testing (fixed seed)""" + + def seed(self, seed: int) -> "DatasetConfig": + """Return a configuration with a custom random seed.""" + return DatasetConfig(self, seed) + + +class DatasetConfig: + """Configuration for a dataset subset with a specific random seed.""" + + def __init__(self, subset: DatasetSize, seed: int): + self.subset = subset + self.seed = seed + + def __repr__(self): + return f"{self.subset.name}(seed={self.seed})" diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py new file mode 100644 index 00000000..ec3afe91 --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" +Matbench Discovery Benchmark - EquiformerV2 Example + +EquiformerV2 is an improved equivariant transformer from FAIR-Chem (formerly OCP). +Paper: https://arxiv.org/abs/2306.12059 +GitHub: https://github.com/Open-Catalyst-Project/ocp + +Note: This example uses the S2EFS task (Structure to Energy, Forces, Stress) +instead of IS2RE because EquiformerV2 doesn't support geometry relaxation. 
+""" + +from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery + +# ============================================================================= +# Configuration +# ============================================================================= + +# Globus Compute endpoint +ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" + +# HPC endpoint configuration +ENDPOINT_CONFIG = { + "account": "cis250461-gpu", + "partition": "gpu-debug", + "qos": "gpu", + "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8", +} + + +# Model factory function for EquiformerV2 +def create_equiformerv2_model(device): + """Create EquiformerV2 model calculator. + + Args: + device: Device to load model on ("cuda" or "cpu") + + Returns: + ASE calculator for EquiformerV2 + """ + from fairchem.core.calculate.ase_calculator import Calculator + + # Use pre-trained checkpoint - will auto-download from HuggingFace + return Calculator( + model_name="EquiformerV2-31M-S2EF-OC20-All+MD", cpu=(device == "cpu") + ) + + +# Benchmark parameters +NUM_STRUCTURES = 1000 +USE_MULTI_GPU = True + +# ============================================================================= +# Run Benchmark +# ============================================================================= + + +def main(): + """Run Matbench Discovery S2EFS benchmark with EquiformerV2.""" + + print("=" * 80) + print("Matbench Discovery S2EFS Benchmark") + print("=" * 80) + print(f"Endpoint: {ENDPOINT_ID}") + print("Model: EquiformerV2-31M") + print("Task: S2EFS (Structure to Energy, Forces, Stress)") + print(f"Structures: {NUM_STRUCTURES}") + print(f"Resources: {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}") + print("=" * 80) + print() + + with MatbenchDiscovery( + endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG + ) as bench: + # Run S2EFS task (uses relaxed structures, no geometry optimization) + # This is suitable for EquiformerV2 which doesn't support relaxation + print("Submitting S2EFS 
task...") + future = bench.tasks.S2EFS.submit( + model_factory=create_equiformerv2_model, + model_package="fairchem-core", + num_structures=NUM_STRUCTURES, + use_multi_gpu=USE_MULTI_GPU, + ) + + print("Waiting for results (this may take a while)...") + output = future.result() + + # Display metrics + print() + print("=" * 80) + print("Benchmark Results") + print("=" * 80) + + metrics = output.get("metrics", {}) + if "error" in metrics: + print(f"Error: {metrics['error']}") + else: + # Energy metrics + if "energy_mae" in metrics: + print("Energy Metrics:") + print(f" MAE (eV/atom): {metrics.get('energy_mae', 'N/A'):.6f}") + print(f" RMSE (eV/atom): {metrics.get('energy_rmse', 'N/A'):.6f}") + print(f" R²: {metrics.get('energy_r2', 'N/A'):.6f}") + print() + + # Force metrics + if "force_mae" in metrics: + print("Force Metrics:") + print(f" MAE (eV/Å): {metrics.get('force_mae', 'N/A'):.6f}") + print(f" RMSE (eV/Å): {metrics.get('force_rmse', 'N/A'):.6f}") + print(f" R²: {metrics.get('force_r2', 'N/A'):.6f}") + print() + + # Stress metrics + if "stress_mae" in metrics: + print("Stress Metrics:") + print(f" MAE (GPa): {metrics.get('stress_mae', 'N/A'):.6f}") + print(f" RMSE (GPa): {metrics.get('stress_rmse', 'N/A'):.6f}") + print(f" R²: {metrics.get('stress_r2', 'N/A'):.6f}") + print() + + if "num_evaluated" in metrics: + print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") + + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py index 475ef079..9f971086 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py @@ -4,89 +4,77 @@ using multi-GPU parallelization on a Globus Compute endpoint. 
""" -from garden_ai.benchmarks import MatbenchDiscovery +from garden_ai.benchmarks.matbench_discovery import DatasetSize, MatbenchDiscovery -# ------------------------------------------------------------------------------ -# Configuration -# ------------------------------------------------------------------------------ - -# Globus Compute Endpoint ID (Anvil HPC) +# Globus Compute endpoint ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# Job Configuration -NUM_GPUS = 2 +# HPC endpoint configuration ENDPOINT_CONFIG = { "account": "cis250461-gpu", + "partition": "gpu-debug", "qos": "gpu", - "partition": "gpu", - "scheduler_options": f"#SBATCH --gpus-per-node={NUM_GPUS}\n#SBATCH --time=00:30:00\n", - "cores_per_node": 32, - "mem_per_node": 32, # GB - "worker_init": "pip install --user uv", # Ensure uv is available + "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8", } -# Model Configuration -MODEL_PACKAGE = "mace-torch" -MODEL_FACTORY = "mace_mp" -MODEL_KWARGS = { - "model": "medium", - "device": "cuda", - "default_dtype": "float64", -} -# Benchmark Configuration -NUM_STRUCTURES = 500 +# Model factory function for MACE +def create_mace_model(device): + from mace.calculators import mace_mp + + return mace_mp(model="medium", device=device, default_dtype="float64") + + +NUM_STRUCTURES = DatasetSize.RANDOM_100 def main(): - print("=" * 80) - print("Matbench Discovery IS2RE Benchmark") - print("=" * 80) - print(f"Endpoint: {ENDPOINT_ID}") - print(f"Model: {MODEL_PACKAGE} / {MODEL_FACTORY}") - print(f"Structures: {NUM_STRUCTURES}") - print(f"Resources: {NUM_GPUS} GPUs (Multi-GPU Enabled)") - print("=" * 80) + """Run Matbench Discovery IS2RE benchmark with MACE.""" with MatbenchDiscovery( - endpoint_id=ENDPOINT_ID, - user_endpoint_config=ENDPOINT_CONFIG, + endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG ) as bench: - task = bench.tasks.IS2RE - - print("\nSubmitting task to endpoint...") - future = task.submit( - 
model_package=MODEL_PACKAGE, - model_factory=MODEL_FACTORY, - model_kwargs=MODEL_KWARGS, + # Run IS2RE task (Initial Structure to Relaxed Energy) + future = bench.tasks.IS2RE.submit( + model_factory=create_mace_model, + model_packages="mace-torch", num_structures=NUM_STRUCTURES, - use_multi_gpu=True, ) print("Job submitted! Waiting for results (this may take a while)...") try: - result = future.result() - metrics = task.calculate_metrics(result) - - print("\n" + "=" * 80) - print("Benchmark Results") - print("=" * 80) - - # Print primary metrics - for key, value in metrics.items(): - print(f"{key:<20}: {value}") - - print("-" * 80) - print(f"Converged: {result['num_converged']} / {NUM_STRUCTURES}") - print(f"Failed: {len(result.get('failed_indices', []))}") - - if result.get("energies"): - valid_energies = [e for e in result["energies"] if e is not None] - if valid_energies: - print(f"Sample energies: {valid_energies[:3]} ...") - - print("=" * 80) + output = future.result() + metrics = output.get("metrics", {}) + + if "error" in metrics: + print(f"error : {metrics['error']}") + else: + # Discovery metrics (stability classification) + if "F1" in metrics: + print(f"F1 : {metrics['F1']:.6f}") + print(f"DAF : {metrics['DAF']:.2f}x") + print(f"Precision : {metrics['Precision']:.6f}") + print(f"Recall : {metrics['Recall']:.6f}") + print(f"Accuracy : {metrics['Accuracy']:.6f}") + + # Regression metrics + if "MAE" in metrics: + print(f"MAE (eV/atom) : {metrics['MAE']:.6f}") + print(f"RMSE (eV/atom) : {metrics['RMSE']:.6f}") + print(f"R2 : {metrics['R2']:.6f}") + + # Force metrics (if S2EFS task) + if "force_mae" in metrics: + print(f"force_mae : {metrics['force_mae']:.6f}") + print(f"force_rmse : {metrics['force_rmse']:.6f}") + print(f"force_r2 : {metrics['force_r2']:.6f}") + print(f"stress_mae : {metrics['stress_mae']:.6f}") + print(f"stress_rmse : {metrics['stress_rmse']:.6f}") + print(f"stress_r2 : {metrics['stress_r2']:.6f}") + + if "num_evaluated" in metrics: + 
print(f"num_evaluated : {metrics['num_evaluated']}") except Exception as e: print(f"\n[ERROR] Benchmark failed: {e}") diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py new file mode 100644 index 00000000..fcf77a1c --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Matbench Discovery Benchmark - MatterSim Example + +MatterSim is a deep learning atomistic model for general material simulations. +Paper: https://arxiv.org/abs/2405.04967 +GitHub: https://github.com/microsoft/mattersim +""" + +from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery + +# ============================================================================= +# Configuration +# ============================================================================= + +# Globus Compute endpoint +ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" + +# HPC endpoint configuration +ENDPOINT_CONFIG = { + "account": "cis250461-gpu", + "partition": "gpu-debug", + "qos": "gpu", + "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8", +} + + +# Model factory function for MatterSim +def create_mattersim_model(device): + """Create MatterSim model calculator. 
+ + Args: + device: Device to load model on ("cuda" or "cpu") + + Returns: + ASE calculator for MatterSim + """ + from mattersim.forcefield import MatterSimCalculator + + return MatterSimCalculator(device=device) + + +# Benchmark parameters +NUM_STRUCTURES = 1000 +USE_MULTI_GPU = True + +# ============================================================================= +# Run Benchmark +# ============================================================================= + + +def main(): + """Run Matbench Discovery IS2RE benchmark with MatterSim.""" + + print("=" * 80) + print("Matbench Discovery IS2RE Benchmark") + print("=" * 80) + print(f"Endpoint: {ENDPOINT_ID}") + print("Model: MatterSim") + print(f"Structures: {NUM_STRUCTURES}") + print(f"Resources: {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}") + print("=" * 80) + print() + + with MatbenchDiscovery( + endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG + ) as bench: + # Run IS2RE task + print("Submitting IS2RE task...") + future = bench.tasks.IS2RE.submit( + model_factory=create_mattersim_model, + model_package="mattersim", + num_structures=NUM_STRUCTURES, + use_multi_gpu=USE_MULTI_GPU, + ) + + print("Waiting for results (this may take a while)...") + output = future.result() + + # Display metrics + print() + print("=" * 80) + print("Benchmark Results") + print("=" * 80) + + metrics = output.get("metrics", {}) + if "error" in metrics: + print(f"Error: {metrics['error']}") + else: + # Discovery metrics + print(f"F1 Score: {metrics.get('F1', 'N/A'):.6f}") + print(f"DAF: {metrics.get('DAF', 'N/A'):.2f}x") + print(f"Precision: {metrics.get('Precision', 'N/A'):.6f}") + print(f"Recall: {metrics.get('Recall', 'N/A'):.6f}") + print(f"Accuracy: {metrics.get('Accuracy', 'N/A'):.6f}") + print() + # Regression metrics + print(f"MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") + print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") + print(f"R²: {metrics.get('R2', 'N/A'):.6f}") + print() + print(f"Structures: 
{metrics.get('num_evaluated', 'N/A')}") + + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py new file mode 100644 index 00000000..d028b740 --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +""" +Matbench Discovery Benchmark - SevenNet Example + +This script demonstrates running the Matbench Discovery IS2RE benchmark +using SevenNet as the MLIP model on a remote Globus Compute endpoint. + +SevenNet is a graph neural network potential with good transferability. +""" + +from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery + +# ============================================================================= +# Configuration +# ============================================================================= + +# Globus Compute endpoint (replace with your endpoint UUID) +ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" + +# HPC endpoint configuration (adjust for your cluster) +ENDPOINT_CONFIG = { + "account": "cis250461-gpu", + "partition": "gpu-debug", + "qos": "gpu", + "scheduler_options": "#SBATCH --gpus-per-node=2\n", + "cores_per_node": 16, + "mem_per_node": 32, # GB +} + + +# Model factory function for SevenNet +def create_sevennet_model(device): + """Create SevenNet model calculator. 
+ + Args: + device: Device to load model on ("cuda" or "cpu") + + Returns: + ASE calculator for SevenNet + """ + from sevenn.calculator import SevenNetCalculator + + return SevenNetCalculator(model="7net-0", device=device) + + +# Benchmark parameters +NUM_STRUCTURES = 1000 # Number of structures to evaluate +USE_MULTI_GPU = True # Enable multi-GPU parallelization + +# ============================================================================= +# Run Benchmark +# ============================================================================= + + +def main(): + """Run Matbench Discovery IS2RE benchmark with SevenNet.""" + + print("=" * 80) + print("Matbench Discovery IS2RE Benchmark") + print("=" * 80) + print(f"Endpoint: {ENDPOINT_ID}") + print("Model: SevenNet (7net-0)") + print(f"Structures: {NUM_STRUCTURES}") + print(f"Resources: {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}") + print("=" * 80) + print() + + with MatbenchDiscovery( + endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG + ) as bench: + # Run IS2RE task (Initial Structure to Relaxed Energy) + print("Submitting IS2RE task...") + future = bench.tasks.IS2RE.submit( + model_factory=create_sevennet_model, + model_package="sevenn", + num_structures=NUM_STRUCTURES, + use_multi_gpu=USE_MULTI_GPU, + ) + + print("Waiting for results (this may take a while)...") + output = future.result() + + # Display metrics + print() + print("=" * 80) + print("Benchmark Results") + print("=" * 80) + + metrics = output.get("metrics", {}) + if "error" in metrics: + print(f"Error: {metrics['error']}") + else: + # Discovery metrics + print(f"F1 Score: {metrics.get('F1', 'N/A'):.6f}") + print(f"DAF: {metrics.get('DAF', 'N/A'):.2f}x") + print(f"Precision: {metrics.get('Precision', 'N/A'):.6f}") + print(f"Recall: {metrics.get('Recall', 'N/A'):.6f}") + print(f"Accuracy: {metrics.get('Accuracy', 'N/A'):.6f}") + print() + # Regression metrics + print(f"MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") + print(f"RMSE 
(eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") + print(f"R²: {metrics.get('R2', 'N/A'):.6f}") + print() + print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") + + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py deleted file mode 100644 index 3b2912f8..00000000 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Test Matbench Discovery benchmark locally.""" - -from garden_ai.benchmarks import MatbenchDiscovery - -print("Matbench Discovery IS2RE Benchmark") -print("=" * 80) - -with MatbenchDiscovery() as bench: - task = bench.tasks.IS2RE - - # Run benchmark locally - result = task.local( - model_package="mace-torch", - model_factory="mace_mp", - model_kwargs={ - "model": "medium", - "device": "cpu", - "default_dtype": "float32", - }, - num_structures=10, - ) - - # Calculate metrics - metrics = task.calculate_metrics(result) - - # Display results - print("\nResults:") - print("=" * 80) - for key, value in metrics.items(): - print(f" {key}: {value}") - print("=" * 80) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py deleted file mode 100644 index 9f9bfd8c..00000000 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_local_mps.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Test Matbench Discovery benchmark locally on Mac. - -This script tests the benchmark implementation locally. Note that MPS (Apple Silicon -GPU) is not compatible with MACE model checkpoints which use float64, so this runs -on CPU. This is still useful for verifying the workflow works before using Anvil. 
-""" - -from garden_ai.benchmarks import MatbenchDiscovery - -print("=" * 80) -print("Matbench Discovery Local Test") -print("=" * 80) - -# Run benchmark locally with MPS acceleration -with MatbenchDiscovery() as bench: - task = bench.tasks.IS2RE - - print("\nRunning local benchmark...") - print("Note: Using CPU because MACE model checkpoints use float64,") - print("which is not supported by MPS. This is still useful for testing") - print("the workflow before running on Anvil with CUDA.\n") - - result = task.local( - model_package="mace-torch", - model_factory="mace_mp", - model_kwargs={ - "model": "medium", - "device": "cpu", # MPS doesn't support float64 used by MACE checkpoints - "default_dtype": "float32", - }, - num_structures=10, # Small test to verify workflow - use_multi_gpu=False, - ) - - # Calculate metrics - metrics = task.calculate_metrics(result) - - # Display results - print("\nResults:") - print("=" * 80) - for key, value in metrics.items(): - print(f" {key}: {value}") - print("=" * 80) - - print("\nLocal test complete!") - print("If this works, you can proceed with confidence to run on Anvil.") diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py deleted file mode 100644 index 08be5dca..00000000 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_test_remote.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Test Matbench Discovery benchmark on remote a HPC endpoint.""" - -from garden_ai.benchmarks import MatbenchDiscovery - -ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" - -ENDPOINT_CONFIG = { - "account": "cis250461-gpu", # HPC allocation/account - "qos": "gpu", - "partition": "gpu-debug", # SLURM partition - "scheduler_options": "#SBATCH --gpus-per-node=4", # Request 4 GPUs - "worker_init": "pip install --user uv", # Install uv on worker startup -} - -MODEL_PACKAGE = "mace-torch" -MODEL_FACTORY = "mace_mp" -MODEL_KWARGS = { - "model": 
"medium", - "device": "cuda", # Use GPU on HPC - "default_dtype": "float32", -} - -NUM_STRUCTURES = 100 # Increased from 10 to test multi-GPU parallelization - -with MatbenchDiscovery( - endpoint_id=ENDPOINT_ID, - user_endpoint_config=ENDPOINT_CONFIG, -) as bench: - task = bench.tasks.IS2RE - - future = task.submit( - model_package=MODEL_PACKAGE, - model_factory=MODEL_FACTORY, - model_kwargs=MODEL_KWARGS, - num_structures=NUM_STRUCTURES, - use_multi_gpu=True, # Enable multi-GPU parallelization - ) - - try: - result = future.result() - metrics = task.calculate_metrics(result) - - print("\nResults:") - print("=" * 80) - for key, value in metrics.items(): - print(f" {key}: {value}") - - print("=" * 80) - print("\nRaw Results:") - print(f" Converged: {result['num_converged']}") - print(f" Failed: {len(result.get('failed_indices', []))}") - if result.get("energies"): - valid_energies = [e for e in result["energies"] if e is not None] - if valid_energies: - print(f" Sample energies: {valid_energies[:3]}") - - except Exception as e: - print(f"\n[ERROR] Benchmark failed: {e}") - raise diff --git a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py new file mode 100644 index 00000000..3bdfd6ca --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 +""" +Run Matbench Discovery benchmarks on 10k most stable structures. + +This script benchmarks MACE, MatterSim, and SevenNet on the 10k most stable +materials from the unique prototypes subset and saves comprehensive metrics to JSON. 
def _format_metric(value, spec):
    """Format a numeric metric with *spec*, or return "N/A" when it is not a number.

    Applying a float format spec such as ".6f" directly to a missing metric
    (``None`` or a placeholder string) raises, so the conversion is attempted
    first and any failure is rendered as "N/A".
    """
    try:
        return format(float(value), spec)
    except (TypeError, ValueError):
        return "N/A"


def _submit_is2re(bench, config, **submit_kwargs):
    """Submit one IS2RE job for the model described by *config*.

    Args:
        bench: Open ``MatbenchDiscovery`` context used to submit the job.
        config: Entry of ``MODELS`` with ``"package"`` and ``"factory"`` keys.
        **submit_kwargs: Extra keyword arguments forwarded to ``submit``
            (used to pass ``checkpoint_name`` on a retry).

    Returns:
        The future returned by ``bench.tasks.IS2RE.submit``.
    """
    return bench.tasks.IS2RE.submit(
        model_factory=config["factory"],
        model_packages=[
            config["package"],
            "cuequivariance",
            "cuequivariance-torch",
            "cuequivariance-ops-torch-cu12",
        ],
        num_structures=DatasetSize.RANDOM_10K,
        **submit_kwargs,
    )


def main():
    """Run benchmarks on all models and save results.

    For every entry in ``MODELS`` an IS2RE job is submitted to the configured
    endpoint. If waiting on the result raises, the job is resubmitted once
    with the checkpoint name of the failed future so it can resume. The
    collected outputs are written to ``OUTPUT_FILE`` as JSON and a summary
    table is printed.
    """
    banner = "=" * 80

    print(banner)
    print("Matbench Discovery Benchmark - Stable 10k")
    print(banner)
    print("Dataset: 10k Most Stable Structures")
    print(f"Models: {', '.join(MODELS.keys())}")
    print(f"Endpoint: {ENDPOINT_ID}")
    print(banner)
    print()

    # NOTE(review): the labels say "stable_10k" while the jobs below request
    # DatasetSize.RANDOM_10K - confirm which one is intended.
    results = {
        "metadata": {
            "timestamp": datetime.now().isoformat(),
            "dataset": "stable_10k",
            "dataset_size": 10000,
            "endpoint_id": ENDPOINT_ID,
        },
        "models": {},
    }

    with MatbenchDiscovery(
        endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG
    ) as bench:
        for model_name, config in MODELS.items():
            print(f"\n{banner}")
            print(f"Running {model_name}...")
            print(f"{banner}\n")

            # One failing model must not abort the remaining ones, so any
            # error is recorded in the results and the loop continues.
            try:
                future = _submit_is2re(bench, config)

                print(f"Job submitted for {model_name}. Waiting for results...")

                try:
                    output = future.result()
                except Exception as e:
                    print(f"⚠️ {model_name} failed first attempt: {e}")
                    print(f" Resuming from checkpoint: {future.checkpoint_path}")

                    # Resubmit with the same checkpoint name to resume
                    checkpoint_name = Path(future.checkpoint_path).name
                    retry_future = _submit_is2re(
                        bench, config, checkpoint_name=checkpoint_name
                    )

                    try:
                        print(" Retry job submitted. Waiting for results...")
                        output = retry_future.result()
                        print(" ✅ Retry successful!")
                    except Exception as retry_e:
                        print(f"❌ {model_name} failed retry: {retry_e}")
                        results["models"][model_name] = {
                            "status": "error",
                            "error": str(retry_e),
                        }
                        continue  # Skip to next model

                # Store complete output (metrics + per-structure results)
                results["models"][model_name] = {
                    "status": "success",
                    **output,
                }

                metrics = output.get("metrics", {})
                if "error" in metrics:
                    print(f"❌ {model_name} failed: {metrics['error']}")
                    results["models"][model_name]["status"] = "failed"
                    results["models"][model_name]["error"] = metrics["error"]
                else:
                    # Missing metrics are shown as "N/A" instead of raising a
                    # format error that would discard a successful result.
                    print(f"✅ {model_name} completed successfully!")
                    print(f" F1 Score: {_format_metric(metrics.get('F1'), '.6f')}")
                    print(f" DAF: {_format_metric(metrics.get('DAF'), '.2f')}x")
                    print(
                        f" MAE (eV/atom): {_format_metric(metrics.get('MAE'), '.6f')}"
                    )
                    print(
                        f" RMSE (eV/atom): {_format_metric(metrics.get('RMSE'), '.6f')}"
                    )
                    print(f" Structures: {metrics.get('num_evaluated', 'N/A')}")

            except Exception as e:
                print(f"❌ {model_name} error: {e}")
                results["models"][model_name] = {
                    "status": "error",
                    "error": str(e),
                }

    # Save results to JSON
    output_path = Path(OUTPUT_FILE)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f"\n{banner}")
    print("Benchmark Complete!")
    print(banner)
    print(f"\nResults saved to: {output_path.absolute()}")

    # Print summary table
    print(f"\n{banner}")
    print("Summary")
    print(f"{banner}\n")
    print(f"{'Model':<15} {'Status':<10} {'F1':<10} {'DAF':<10} {'MAE':<10}")
    print("-" * 80)

    for model_name, data in results["models"].items():
        if data["status"] == "success":
            # A successful output without a "metrics" entry still gets a row.
            metrics = data.get("metrics") or {}
            print(
                f"{model_name:<15} {data['status']:<10} "
                f"{_format_metric(metrics.get('F1'), '.6f'):<10} "
                f"{_format_metric(metrics.get('DAF'), '.2f'):<10} "
                f"{_format_metric(metrics.get('MAE'), '.6f'):<10}"
            )
        else:
            print(
                f"{model_name:<15} {data['status']:<10} {'N/A':<10} {'N/A':<10} {'N/A':<10}"
            )

    print()
def stable_metrics(
    each_true: Sequence[float] | pd.Series | np.ndarray,
    each_pred: Sequence[float] | pd.Series | np.ndarray,
    *,
    stability_threshold: float = 0.0,
    fillna: bool = True,
    prevalence: float | None = None,
) -> dict[str, float]:
    """Get a dictionary of stability prediction metrics. Mostly binary classification
    metrics, but also MAE, RMSE and R2.

    Args:
        each_true (Sequence[float] | pd.Series): true energy above convex hull
        each_pred (Sequence[float] | pd.Series): predicted energy above convex hull
        stability_threshold (float): Where to place stability threshold relative to
            convex hull in eV/atom, usually 0 or 0.1 eV. Default = 0.0.
        fillna (bool): Whether to fill NaNs as the model predicting unstable. Defaults
            to True.
        prevalence (float, optional): Prevalence of stable materials in the dataset.
            If None, calculated from the input data. Defaults to None.

    Note: Should give equivalent classification metrics to
        sklearn.metrics.classification_report(
            each_true > stability_threshold,
            each_pred > stability_threshold,
            output_dict=True,
        )
        when using the same stability_threshold.

    Returns:
        dict[str, float]: dictionary of classification metrics with keys F1, DAF,
            Precision, Recall, Accuracy, TPR, FPR, TNR, FNR, TP, FP, TN, FN, MAE,
            RMSE, R2. The TP/FP/TN/FN counts are built-in ints so the dictionary
            can be serialized with the json module. Ratios whose denominator is
            zero are NaN.

    Raises:
        ValueError: If FPR + TNR don't add up to 1.
        ValueError: If TPR + FNR don't add up to 1.
    """
    nan = float("nan")
    tolerance = 1e-6  # floating point tolerance for the rate sanity checks

    # Vectorized count of each confusion-matrix cell; int() converts the numpy
    # integer returned by Series.sum() into a plain Python int.
    n_true_pos, n_false_neg, n_false_pos, n_true_neg = (
        int(mask.sum())
        for mask in classify_stable(
            each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna
        )
    )

    n_total_pos = n_true_pos + n_false_neg
    n_total_neg = n_true_neg + n_false_pos
    n_total = n_total_pos + n_total_neg
    n_pred_pos = n_true_pos + n_false_pos

    # prevalence: dummy discovery rate of stable crystals by selecting randomly from
    # all materials
    if prevalence is None:
        prevalence = n_total_pos / n_total if n_total > 0 else nan

    # Ratios are guarded against division by zero
    precision = n_true_pos / n_pred_pos if n_pred_pos > 0 else nan
    recall = n_true_pos / n_total_pos if n_total_pos > 0 else nan

    TPR = recall
    FPR = n_false_pos / n_total_neg if n_total_neg > 0 else nan
    TNR = n_true_neg / n_total_neg if n_total_neg > 0 else nan
    FNR = n_false_neg / n_total_pos if n_total_pos > 0 else nan

    # sanity check: false positives + true negatives = all negatives
    if FPR > 0 and TNR > 0 and abs(FPR + TNR - 1) > tolerance:
        raise ValueError(f"{FPR=} {TNR=} don't add up to 1")

    # sanity check: true positives + false negatives = all positives
    if TPR > 0 and FNR > 0 and abs(TPR + FNR - 1) > tolerance:
        raise ValueError(f"{TPR=} {FNR=} don't add up to 1")

    # Regression metrics only use pairs where both values are present
    true_arr = np.asarray(each_true, dtype=float)
    pred_arr = np.asarray(each_pred, dtype=float)
    valid = ~(np.isnan(true_arr) | np.isnan(pred_arr))
    true_arr, pred_arr = true_arr[valid], pred_arr[valid]
    n_valid = len(true_arr)
    residual = true_arr - pred_arr

    if precision + recall == 0:  # Calculate F1 score, handling division by zero
        f1_score = nan
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    return dict(
        F1=f1_score,
        DAF=precision / prevalence if prevalence > 0 else nan,
        Precision=precision,
        Recall=recall,
        Accuracy=(n_true_pos + n_true_neg) / n_total if n_total > 0 else nan,
        TPR=TPR,
        FPR=FPR,
        TNR=TNR,
        FNR=FNR,
        TP=n_true_pos,
        FP=n_false_pos,
        TN=n_true_neg,
        FN=n_false_neg,
        # An empty selection yields NaN without numpy's mean-of-empty warning
        MAE=float(np.abs(residual).mean()) if n_valid else nan,
        RMSE=float((residual**2).mean() ** 0.5) if n_valid else nan,
        R2=r2_score(true_arr, pred_arr) if n_valid > 1 else nan,
    )
(default: True) - - Returns: - Dictionary with benchmark results: - - energies: List of final energies (None for failed relaxations) - - num_converged: Count of successful relaxations - - failed_indices: List of structure indices that failed - - Raises: - RuntimeError: If benchmark execution fails - """ - # All imports must be inside the function for CombinedCode serialization - import json - import logging - import os - import subprocess - import sys - import tempfile - from pathlib import Path - - # Configure logging - logging.basicConfig( - level=logging.INFO, - stream=sys.stdout, - force=True, - format="%(asctime)s [%(levelname)s] %(message)s", - ) - if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(line_buffering=True) - - logger = logging.getLogger(__name__) - - # Create isolated working directory - work_dir = Path(tempfile.mkdtemp(prefix="matbench_benchmark_")) - - # This script runs INSIDE the virtual environment - BENCHMARK_RUNNER_SCRIPT = ''' -import json -import sys -import time -import logging -import os -import concurrent.futures -import importlib -from pathlib import Path -from typing import List, Dict, Any, Optional -from zipfile import ZipFile -from io import TextIOWrapper - -import torch -from ase.io import read -from ase.optimize import FIRE -from matbench_discovery.data import DataFiles - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s [%(levelname)s] [PID:%(process)d] %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - stream=sys.stdout, - force=True -) -logger = logging.getLogger("benchmark_runner") - -def setup_device(gpu_id: Optional[int] = None) -> str: - """Setup compute device for this process.""" - if torch.cuda.is_available(): - return f"cuda:{gpu_id}" if gpu_id is not None else "cuda" - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - return "mps" - return "cpu" - -def load_model(config: Dict[str, Any], device: str): - """Initialize the model from 
configuration.""" - package_name = config["package"] - factory_name = config["factory"] - kwargs = config["kwargs"].copy() - checkpoint = config.get("checkpoint") - - if "device" in kwargs: - kwargs["device"] = device - - # Import factory function - module_parts = package_name.split(".") - try: - if len(module_parts) > 1: - module = importlib.import_module(package_name) - else: - # Try common patterns for model packages - base_module = module_parts[0].split("-")[0] - try: - module = importlib.import_module(f"{base_module}.calculators") - except ImportError: - module = importlib.import_module(base_module) - - factory = getattr(module, factory_name) - except (ImportError, AttributeError) as e: - raise ImportError(f"Could not load model factory {factory_name} from {package_name}: {e}") - - # Create model - model = factory(**kwargs) - - # Load checkpoint if provided - if checkpoint and checkpoint != "None": - if hasattr(model, "load_checkpoint"): - model.load_checkpoint(checkpoint) - elif hasattr(model, "load_state_dict"): - model.load_state_dict(torch.load(checkpoint)) - - return model - -def process_batch( - batch_id: int, - structures: List[Any], - start_idx: int, - model_config: Dict[str, Any], - num_threads: int -) -> Dict[str, Any]: - """Process a batch of structures on a specific device.""" - - # Configure thread limits to avoid contention - os.environ["OMP_NUM_THREADS"] = str(num_threads) - torch.set_num_threads(num_threads) - - gpu_id = model_config.get("gpu_id") - device = setup_device(gpu_id) - - worker_logger = logging.getLogger(f"worker_{batch_id}") - worker_logger.info(f"Started on {device} with {len(structures)} structures. 
Threads: {num_threads}") - - try: - model = load_model(model_config, device) - except Exception as e: - worker_logger.error(f"Failed to initialize model: {e}") - return { - "energies": [None] * len(structures), - "num_converged": 0, - "failed_indices": [start_idx + i for i in range(len(structures))], - "error": str(e) - } - - energies = [] - failed_indices = [] - num_converged = 0 - batch_start = time.time() - - for i, atoms in enumerate(structures): - global_idx = start_idx + i - try: - atoms.calc = model - opt = FIRE(atoms, logfile=None) - opt.run(fmax=0.05, steps=500) - - energies.append(atoms.get_potential_energy()) - num_converged += 1 - - if (i + 1) % 10 == 0: - elapsed = time.time() - batch_start - rate = (i + 1) / elapsed if elapsed > 0 else 0 - worker_logger.info(f"Progress: {i+1}/{len(structures)} ({rate:.2f} struct/s)") - - except Exception as e: - worker_logger.warning(f"Structure {global_idx} failed: {e}") - energies.append(None) - failed_indices.append(global_idx) - - return { - "energies": energies, - "num_converged": num_converged, - "failed_indices": failed_indices - } - -def load_structures(num_structures: int) -> List[Any]: - """Load structures from the Matbench Discovery dataset.""" - structures = [] - zip_path = DataFiles.wbm_initial_atoms.path - - with ZipFile(zip_path, 'r') as zf: - # Sort files numerically - file_list = sorted( - zf.namelist(), - key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else float('inf') - ) - for filename in file_list[:num_structures]: - with zf.open(filename) as f: - text_stream = TextIOWrapper(f, encoding='utf-8') - structures.append(read(text_stream, format='extxyz')) - return structures - -def main(): - if len(sys.argv) != 2: - sys.exit("Usage: python benchmark_runner.py ") - - with open(sys.argv[1]) as f: - config = json.load(f) - - logger.info("Starting benchmark runner...") - - try: - structures = load_structures(config["num_structures"]) - logger.info(f"Loaded {len(structures)} structures") - 
except Exception as e: - logger.error(f"Failed to load structures: {e}") - sys.exit(1) - - # Shuffle for load balancing - import random - random.seed(42) - random.shuffle(structures) - - # Resource detection - num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 - use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1 - - total_cores = os.cpu_count() or 1 - num_workers = num_gpus if use_multi_gpu else 1 - # Reserve cores for overhead if possible - available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores - threads_per_worker = max(1, available_cores // num_workers) - - logger.info(f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)") - - results = {"energies": [], "num_converged": 0, "failed_indices": []} - start_time = time.time() - - if use_multi_gpu: - logger.info(f"Parallel execution on {num_gpus} GPUs") - batch_size = len(structures) // num_gpus - futures = [] - - ctx = multiprocessing.get_context('spawn') - with concurrent.futures.ProcessPoolExecutor(max_workers=num_gpus, mp_context=ctx) as executor: - for i in range(num_gpus): - start_idx = i * batch_size - end_idx = len(structures) if i == num_gpus - 1 else (i + 1) * batch_size - - model_config = { - "package": config["model_package"], - "factory": config["model_factory"], - "kwargs": config["model_kwargs"], - "checkpoint": config["model_checkpoint"], - "gpu_id": i - } - - futures.append(executor.submit( - process_batch, i, structures[start_idx:end_idx], start_idx, model_config, threads_per_worker - )) - - for future in concurrent.futures.as_completed(futures): - try: - batch_res = future.result() - results["energies"].extend(batch_res["energies"]) - results["num_converged"] += batch_res["num_converged"] - results["failed_indices"].extend(batch_res["failed_indices"]) - except Exception as e: - logger.error(f"Worker failed: {e}") - else: - logger.info("Single process execution") - model_config 
= { - "package": config["model_package"], - "factory": config["model_factory"], - "kwargs": config["model_kwargs"], - "checkpoint": config["model_checkpoint"] - } - results = process_batch(0, structures, 0, model_config, threads_per_worker) - - elapsed = time.time() - start_time - logger.info(f"Benchmark complete in {elapsed:.1f}s. Converged: {results['num_converged']}/{len(structures)}") - - with open("results.json", "w") as f: - json.dump(results, f, indent=2) - -if __name__ == "__main__": - import multiprocessing - main() -''' - - try: - # ---------------------------------------------------------------------- - # 1. ENVIRONMENT SETUP - # ---------------------------------------------------------------------- - logger.info("Step 1/4: Setting up environment...") - - # Find UV binary - uv_bin = subprocess.check_output( - [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True - ).strip() - - # Create UV virtual environment - subprocess.run( - [uv_bin, "venv", "--python", "3.11"], - cwd=work_dir, - check=True, - capture_output=True, - ) - - venv_python = work_dir / ".venv/bin/python" - if not venv_python.exists(): - venv_python = work_dir / ".venv/Scripts/python.exe" # Windows fallback - - if not venv_python.exists(): - raise RuntimeError(f"Virtual environment python not found at {venv_python}") - - # Install dependencies - logger.info("Installing dependencies...") - subprocess.run( - [ - uv_bin, - "pip", - "install", - "--python", - str(venv_python), - "matbench-discovery", - ], - cwd=work_dir, - check=True, - ) - subprocess.run( - [uv_bin, "pip", "install", "--python", str(venv_python), model_package], - cwd=work_dir, - check=True, - ) - - # Set SSL cert file for HPC - env = dict(os.environ) - env["MBD_AUTO_DOWNLOAD_FILES"] = "true" - - try: - certifi_path = subprocess.check_output( - [str(venv_python), "-c", "import certifi; print(certifi.where())"], - text=True, - ).strip() - env["SSL_CERT_FILE"] = certifi_path - except Exception as e: - 
logger.warning(f"Failed to set SSL_CERT_FILE: {e}") - - # ---------------------------------------------------------------------- - # 2. PREPARE BENCHMARK SCRIPT - # ---------------------------------------------------------------------- - logger.info("Step 2/4: Preparing benchmark script...") - - # Write runner script - runner_path = work_dir / "benchmark_runner.py" - runner_path.write_text(BENCHMARK_RUNNER_SCRIPT) - - # Write config - config = { - "repo_url": repo_url, - "repo_ref": repo_ref, - "model_package": model_package, - "model_factory": model_factory, - "model_kwargs": model_kwargs, - "model_checkpoint": model_checkpoint, - "num_structures": num_structures, - "use_multi_gpu": use_multi_gpu, - } - - config_path = work_dir / "config.json" - with open(config_path, "w") as f: - json.dump(config, f, indent=2) - - # ---------------------------------------------------------------------- - # 3. EXECUTE BENCHMARK - # ---------------------------------------------------------------------- - logger.info("Step 3/4: Executing benchmark...") - - # Run the runner script inside the venv - # We stream output directly to stdout so the user sees progress - proc = subprocess.run( - [str(venv_python), str(runner_path), str(config_path)], - cwd=work_dir, - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - check=False, # We check return code manually - ) - - if proc.returncode != 0: - raise RuntimeError( - f"Benchmark runner failed with return code {proc.returncode}" - ) - - # ---------------------------------------------------------------------- - # 4. 
COLLECT RESULTS - # ---------------------------------------------------------------------- - logger.info("Step 4/4: Collecting results...") - - results_path = work_dir / "results.json" - if not results_path.exists(): - raise RuntimeError( - "Results file not found - benchmark may have crashed silently" - ) - - with open(results_path) as f: - results = json.load(f) - - logger.info("Benchmark completed successfully.") - return results - - finally: - # Cleanup working directory - import shutil - - shutil.rmtree(work_dir, ignore_errors=True) diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index c8f95e5d..b4348d27 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -1,26 +1,599 @@ """Matbench Discovery benchmark task implementations.""" -from typing import TYPE_CHECKING, Any +from __future__ import annotations -from .remote_runner import run_matbench_is2re +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +from ..utils.remote_execution import run_remote_benchmark +from ..utils.script_builder import BenchmarkScriptBuilder +from ..utils.task import BaseBenchmarkTask if TYPE_CHECKING: from . import MatbenchDiscovery + from .enums import DatasetConfig, DatasetSize + +from .metrics import classify_stable, stable_metrics + +# ------------------------------------------------------------------------------ +# REMOTE FUNCTIONS +# These functions are injected into the remote script. +# They must be self-contained (imports inside or provided by builder). +# ------------------------------------------------------------------------------ + + +def load_model(device: str): + """Initialize the model using the user-provided factory function. + + The factory function is injected into this script by the benchmark framework. 
def get_material_ids_for_subset(
    subset_type: str, seed: int = 42
) -> Optional[List[str]]:
    """Get material IDs for a specific dataset subset.

    The subset name is validated before any data is imported or read, so an
    unknown name fails immediately instead of after loading the summary CSV.

    Args:
        subset_type: One of 'full', 'unique_protos', 'random_10k', 'random_100'
        seed: Random seed for sampling (default: 42)

    Returns:
        List of material IDs, or None for 'full' (load all)

    Raises:
        ValueError: If subset_type is not one of the names listed above.
    """
    # Number of unique prototypes drawn for each random subset; the fixed seed
    # keeps the sample reproducible. Kept local because this function is
    # injected into the remote script and has to be self-contained.
    sample_sizes = {"random_10k": 10000, "random_100": 100}

    if subset_type == "full":
        return None  # Load all materials

    if subset_type != "unique_protos" and subset_type not in sample_sizes:
        raise ValueError(f"Unknown subset_type: {subset_type}")

    import pandas as pd
    from matbench_discovery.data import DataFiles

    # Every non-full subset starts from the unique prototypes in wbm_summary
    # (removes duplicates and MP overlaps)
    df_filtered = pd.read_csv(DataFiles.wbm_summary.path).query("unique_prototype")

    sample_size = sample_sizes.get(subset_type)
    if sample_size is not None:
        df_filtered = df_filtered.sample(n=sample_size, random_state=seed)

    return df_filtered["material_id"].tolist()
def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Tuple[str, Any]]:
    """Load initial structures for IS2RE.

    Reads ``dataset_subset`` (default "full") and ``dataset_seed`` (default 42)
    from *config* to pick the material IDs. For the full dataset the archive
    members are ordered by their numeric file stem and cut to
    ``num_structures`` (default 100); otherwise only members whose name, with
    ".extxyz" removed, is among the selected IDs are read.

    Returns:
        List of ``(archive member name, structure)`` tuples.
    """
    from io import TextIOWrapper
    from zipfile import ZipFile

    from ase.io import read
    from matbench_discovery.data import DataFiles

    def _numeric_order(member_name):
        # Members with a non-numeric stem sort after all numbered ones
        stem = member_name.split(".")[0]
        return int(stem) if stem.isdigit() else float("inf")

    wanted_ids = get_material_ids_for_subset(
        config.get("dataset_subset", "full"), seed=config.get("dataset_seed", 42)
    )
    archive_path = DataFiles.wbm_initial_atoms.path

    loaded = []
    with ZipFile(archive_path, "r") as archive:
        members = archive.namelist()
        if wanted_ids is not None:
            # Filter to specific material IDs
            wanted = set(wanted_ids)
            selected = [
                name for name in members if name.replace(".extxyz", "") in wanted
            ]
        else:
            # Full dataset, limited to the requested number of structures
            limit = config.get("num_structures", 100)
            selected = sorted(members, key=_numeric_order)[:limit]

        for name in selected:
            with archive.open(name) as raw:
                atoms = read(TextIOWrapper(raw, encoding="utf-8"), format="extxyz")
                loaded.append((name, atoms))
    return loaded
def process_batch_static(
    batch_id: int,
    structures: List[Tuple[str, Any]],
    model_config: Dict[str, Any],
    num_threads: int,
) -> Dict[str, Any]:
    """Process a batch of structures for RS2RE (Static Calculation).

    Args:
        batch_id: Identifier used to name this worker's logger.
        structures: ``(structure id, atoms)`` pairs to evaluate.
        model_config: Worker configuration; only ``"gpu_id"`` is read here.
        num_threads: Thread limit applied to OpenMP and torch.

    Returns:
        Mapping of structure id to ``{"energy": value}``, or to
        ``{"energy": None, "error": message}`` when the calculation failed.
        If the model cannot be initialized, every structure in the batch is
        reported with that error.
    """
    import logging
    import os
    import time

    import torch

    # Configure thread limits to avoid contention
    os.environ["OMP_NUM_THREADS"] = str(num_threads)
    torch.set_num_threads(num_threads)

    gpu_id = model_config.get("gpu_id")
    device = setup_device(gpu_id)  # noqa: F821

    worker_logger = logging.getLogger(f"worker_{batch_id}")
    worker_logger.info(
        f"Started static calculation on {device} with {len(structures)} structures."
    )

    # The model is created once and reused for later batches.
    # NOTE(review): _MODEL_CACHE is not defined in this block; presumably the
    # script builder provides it in the remote script - confirm.
    global _MODEL_CACHE
    try:
        if _MODEL_CACHE is None:
            model = load_model(device)
            _MODEL_CACHE = model
        else:
            model = _MODEL_CACHE
    except Exception as e:
        # Log the failure so it is visible in the worker output, matching the
        # relaxation worker, instead of only burying it in the result payload.
        worker_logger.error(f"Failed to initialize model: {e}")
        return {sid: {"energy": None, "error": str(e)} for sid, _ in structures}

    results = {}
    batch_start = time.time()

    for i, (struct_id, atoms) in enumerate(structures):
        try:
            atoms.calc = model
            # No relaxation, just static energy
            energy = atoms.get_potential_energy()
            results[struct_id] = {"energy": energy}

            if (i + 1) % 50 == 0:
                elapsed = time.time() - batch_start
                rate = (i + 1) / elapsed if elapsed > 0 else 0
                worker_logger.info(
                    f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)"
                )

        except Exception as e:
            # A single failing structure must not abort the batch
            worker_logger.warning(f"Structure {struct_id} failed: {e}")
            results[struct_id] = {"energy": None, "error": str(e)}

    return results
x.split(".")[0].isdigit() + else float("inf"), + ) + num_structures = config.get("num_structures", 100) + file_list = file_list[:num_structures] + else: + # Filter to specific material IDs + mat_id_set = set(mat_ids) + file_list = [ + f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set + ] + + for filename in file_list: + with zf.open(filename) as f: + text_stream = TextIOWrapper(f, encoding="utf-8") + structures.append((filename, read(text_stream, format="extxyz"))) + return structures + + +# Reuse calculate_metrics_energy for all energy-only tasks + + +def process_batch_forces( + batch_id: int, + structures: List[Tuple[str, Any]], + model_config: Dict[str, Any], + num_threads: int, +) -> Dict[str, Any]: + """Process a batch of structures for S2EFS (Energy, Forces, Stress).""" + import logging + import os + import time + + import torch + + os.environ["OMP_NUM_THREADS"] = str(num_threads) + torch.set_num_threads(num_threads) + + gpu_id = model_config.get("gpu_id") + device = setup_device(gpu_id) # noqa: F821 + + worker_logger = logging.getLogger(f"worker_{batch_id}") + worker_logger.info( + f"Started forces calculation on {device} with {len(structures)} structures." + ) + + global _MODEL_CACHE + try: + if _MODEL_CACHE is None: + model = load_model(device) + _MODEL_CACHE = model + else: + model = _MODEL_CACHE + except Exception as e: + return {sid: {"error": str(e)} for sid, _ in structures} + results = {} + batch_start = time.time() -class IS2RETask: - """Initial Structure to Relaxed Energy benchmark task. + for i, (struct_id, atoms) in enumerate(structures): + try: + atoms.calc = model - This task evaluates a model's ability to predict the relaxed energy - and geometry of crystal structures starting from unrelaxed initial - configurations. 
+ energy = atoms.get_potential_energy() + forces = atoms.get_forces().tolist() + stress = atoms.get_stress().tolist() + + results[struct_id] = {"energy": energy, "forces": forces, "stress": stress} + + if (i + 1) % 50 == 0: + elapsed = time.time() - batch_start + rate = (i + 1) / elapsed if elapsed > 0 else 0 + worker_logger.info( + f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)" + ) - The task: - 1. Loads initial (unrelaxed) structures from the WBM test set - 2. Uses the model to perform geometry optimization - 3. Records final energies and relaxed structures - 4. Calculates metrics comparing to DFT ground truth + except Exception as e: + worker_logger.warning(f"Structure {struct_id} failed: {e}") + results[struct_id] = {"error": str(e)} + + return results + + +def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Tuple[str, Any]]: + """Load MP trajectories for S2EFS.""" + from io import TextIOWrapper + from zipfile import ZipFile + + from ase.io import read + from matbench_discovery.data import DataFiles + + num_structures = config.get("num_structures", 100) + structures = [] + # Use MP trajectories + zip_path = DataFiles.mp_trj_extxyz.path + + with ZipFile(zip_path, "r") as zf: + file_list = sorted(zf.namelist()) + for filename in file_list[:num_structures]: + with zf.open(filename) as f: + text_stream = TextIOWrapper(f, encoding="utf-8") + # Read all frames? Or just one? Usually S2EFS is on frames. + # Let's assume we evaluate on the last frame or all frames. + # For simplicity, let's take the last frame (relaxed?) or random? + # Actually, MP trj contains relaxation steps. + # Let's read the last frame for now as a proxy for "a structure". + # Or better, read all frames and treat them as separate tasks? + # For this benchmark, let's just treat the file as containing one structure per file if possible, + # or just take the last one. 
+ atoms_list = read(text_stream, format="extxyz", index=":") + if atoms_list: + # Just take the last one for now + structures.append((filename, atoms_list[-1])) + return structures + + +def calculate_metrics_forces( + results: Dict[str, Any], config: Dict[str, Any] +) -> Dict[str, Any]: + """Calculate comprehensive S2EFS metrics (Energy, Forces, Stress). + + Returns MAE, RMSE, and R2 for each component. """ + from io import TextIOWrapper + from zipfile import ZipFile + + import numpy as np + from ase.io import read + from matbench_discovery.data import DataFiles + from sklearn.metrics import r2_score + + # We need to load ground truth from the dataset itself because MP trj has E/F/S in the extxyz + # This is expensive to re-read. Ideally we should have passed GT in results or loaded it efficiently. + # For now, let's re-read the GT for the processed IDs. + + metrics = { + "energy_mae": [], + "energy_rmse": [], + "force_mae": [], + "force_rmse": [], + "stress_mae": [], + "stress_rmse": [], + } + + # Collect all predictions and ground truth for R2 calculation + all_e_pred, all_e_true = [], [] + all_f_pred, all_f_true = [], [] + all_s_pred, all_s_true = [], [] + + zip_path = DataFiles.mp_trj_extxyz.path + + with ZipFile(zip_path, "r") as zf: + for sid, res in results.items(): + if "error" in res: + continue + + try: + with zf.open(sid) as f: + text_stream = TextIOWrapper(f, encoding="utf-8") + atoms_list = read(text_stream, format="extxyz", index=":") + gt_atoms = atoms_list[-1] # Matching load_dataset logic + + # Energy (per atom) + e_pred = res["energy"] + e_true = gt_atoms.get_potential_energy() + n_atoms = len(gt_atoms) + + energy_error = abs(e_pred - e_true) / n_atoms + metrics["energy_mae"].append(energy_error) + metrics["energy_rmse"].append(energy_error**2) + + all_e_pred.append(e_pred / n_atoms) + all_e_true.append(e_true / n_atoms) + + # Forces + f_pred = np.array(res["forces"]) + f_true = gt_atoms.get_forces() + force_error = np.abs(f_pred - f_true) + 
metrics["force_mae"].append(force_error.mean()) + metrics["force_rmse"].append((force_error**2).mean()) + + all_f_pred.extend(f_pred.flatten()) + all_f_true.extend(f_true.flatten()) + + # Stress + s_pred = np.array(res["stress"]) + s_true = gt_atoms.get_stress() + stress_error = np.abs(s_pred - s_true) + metrics["stress_mae"].append(stress_error.mean()) + metrics["stress_rmse"].append((stress_error**2).mean()) + + all_s_pred.extend(s_pred.flatten()) + all_s_true.extend(s_true.flatten()) + + except Exception: + pass + + # Calculate final metrics + result_metrics = {} + + if metrics["energy_mae"]: + result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"])) + result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"]))) + result_metrics["energy_r2"] = ( + float(r2_score(all_e_true, all_e_pred)) + if len(all_e_true) > 1 + else float("nan") + ) + + if metrics["force_mae"]: + result_metrics["force_mae"] = float(np.mean(metrics["force_mae"])) + result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"]))) + result_metrics["force_r2"] = ( + float(r2_score(all_f_true, all_f_pred)) + if len(all_f_true) > 1 + else float("nan") + ) + + if metrics["stress_mae"]: + result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"])) + result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"]))) + result_metrics["stress_r2"] = ( + float(r2_score(all_s_true, all_s_pred)) + if len(all_s_true) > 1 + else float("nan") + ) + + result_metrics["num_evaluated"] = len(metrics["energy_mae"]) + + return result_metrics + + +# ------------------------------------------------------------------------------ +# Task Classes +# ------------------------------------------------------------------------------ + + +class MatbenchTask(BaseBenchmarkTask): + """Base class for Matbench Discovery tasks.""" def __init__( self, @@ -28,296 +601,321 @@ def __init__( repo_url: str, repo_ref: str, model_package: str | None = None, + task_name: str = 
"unknown", ): - """Initialize IS2RE task. + super().__init__(adapter, repo_url, repo_ref, model_package) + self.name = task_name + + def calculate_metrics(self, output: Dict[str, Any]) -> Dict[str, Any]: + """Retrieve metrics from the remote output.""" + return output.get("metrics", {}) + + def _build_script( + self, process_fn, load_dataset_fn, calc_metrics_fn, model_factory + ) -> str: + """Build the remote execution script with specific functions. Args: - adapter: MatbenchDiscovery adapter instance - repo_url: Matbench Discovery repository URL - repo_ref: Git ref (branch/tag/commit) to use - model_package: Default model package to install (can override in submit) + process_fn: Task-specific process_batch function + load_dataset_fn: Task-specific load_dataset function + calc_metrics_fn: Task-specific calculate_metrics function + model_factory: User-provided function that creates the model """ - self.adapter = adapter - self.repo_url = repo_url - self.repo_ref = repo_ref - self.model_package = model_package - self.name = "IS2RE" + builder = BenchmarkScriptBuilder() + + # Add global model cache + builder.add_preamble("_MODEL_CACHE = None") + + # Common imports + builder.add_import("from typing import List, Dict, Any, Tuple, Optional") + builder.add_import("import torch") + builder.add_import("from ase.optimize import FIRE") + builder.add_import("from ase.io import read") + builder.add_import("from matbench_discovery.data import DataFiles") + builder.add_import("from zipfile import ZipFile") + builder.add_import("from io import TextIOWrapper") + builder.add_import("import pandas as pd") + builder.add_import("import numpy as np") + builder.add_import("from collections.abc import Sequence") + builder.add_import("from sklearn.metrics import r2_score") + + # Add user's model factory (renamed to load_model_user so load_model can call it) + builder.add_function(model_factory, name="load_model_user") + + # Add our load_model wrapper that calls load_model_user + 
builder.add_function(load_model) + + # Add helper function for dataset subset filtering + builder.add_function(get_material_ids_for_subset) + + # Add task-specific functions with standard names expected by runner + builder.add_function(process_fn, name="process_batch") + builder.add_function(load_dataset_fn, name="load_dataset") + builder.add_function(calc_metrics_fn, name="calculate_metrics_remote") + + # Inject metrics helper functions + builder.add_function(classify_stable) + builder.add_function(stable_metrics) + + return builder.build() def submit( self, - model=None, - num_structures: int = 100, - model_package: str | None = None, - model_factory: str | None = None, - model_kwargs: dict | None = None, - use_multi_gpu: bool = True, + model_factory: callable, + model_packages: str | List[str], + num_structures: int | "DatasetSize" | "DatasetConfig" = 100, + checkpoint_name: str | None = None, ): - """Submit IS2RE benchmark job to remote executor. - - You can specify the model in two ways: - 1. Pass a local model instance (will introspect to get remote construction info) - 2. Explicitly specify model_package and model_factory + """Submit benchmark job to remote executor. Args: - model: (Optional) Local model instance. If provided, will extract - package, class, and checkpoint information from it. - num_structures: Number of test structures to evaluate (default: 100). - Full test set has ~257k structures. Use smaller values - for quick testing. - model_package: Python package name to install (e.g., "mace-torch"). - Required if model is None. - model_factory: How to instantiate the model on remote. Can be: - - Function name: "mace_mp" (will call as function) - - Class name: "MACE" (will instantiate as class) - Required if model is None. - model_kwargs: Dictionary of kwargs to pass when creating model remotely. 
- Example: {"model": "medium", "device": "cuda"} - use_multi_gpu: If True, automatically detect and use all available GPUs - in parallel for faster processing. If False, use single - GPU/CPU. (default: True) - - Returns: - Future object that will contain benchmark results when complete. - Call .result() to block and wait for completion. - - Examples: - Using local model instance: - >>> from mace.calculators import mace_mp - >>> model = mace_mp(model="medium") - >>> future = task.submit(model, num_structures=50) - - Specifying remote construction explicitly: - >>> future = task.submit( - ... model_package="mace-torch", - ... model_factory="mace_mp", - ... model_kwargs={"model": "medium", "device": "cuda"}, - ... num_structures=50, - ... use_multi_gpu=True - ... ) + model_factory: User-provided function that takes device and returns an ASE calculator. + Example: lambda device: mace_mp(model="medium", device=device) + model_packages: Python package(s) to install. Can be a single package string + (e.g., "mace-torch") or a list (e.g., ["torch", "mace-torch"]) + num_structures: Number of structures to evaluate, or DatasetSize enum, or DatasetConfig + (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10)) + checkpoint_name: Optional name for the checkpoint file (e.g. "my_checkpoint.json"). + If not provided, one will be generated. 
""" - # Determine how to construct model remotely - if model is not None: - # Extract info from local model instance - if model_package is None: - if self.model_package is not None: - model_package = self.model_package - else: - # Infer from model's module - model_package = model.__class__.__module__.split(".")[0] - - if model_factory is None: - model_factory = model.__class__.__name__ - - # Get checkpoint path if model has one - model_checkpoint = None - if hasattr(model, "checkpoint_path"): - model_checkpoint = model.checkpoint_path - elif hasattr(model, "checkpoint"): - model_checkpoint = model.checkpoint - - # Try to extract initialization kwargs if available - if model_kwargs is None and hasattr(model, "_init_kwargs"): - model_kwargs = model._init_kwargs + import time + import uuid + from .enums import DatasetConfig, DatasetSize + + # Build script with task-specific functions AND user's factory + script_content = self._build_script( + self.process_fn, + self.load_dataset_fn, + self.calc_metrics_fn, + model_factory, # Inject user's factory function + ) + + # Handle single package string or list of packages + packages = ( + [model_packages] if isinstance(model_packages, str) else model_packages + ) + dependencies = ["matbench-discovery>=1.3.0"] + packages + + # Handle DatasetSize enum, DatasetConfig, or integer + if isinstance(num_structures, DatasetSize): + runner_config = { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "dataset_subset": num_structures.value, + "dataset_seed": 42, # Default seed + } + elif isinstance(num_structures, DatasetConfig): + runner_config = { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "dataset_subset": num_structures.subset.value, + "dataset_seed": num_structures.seed, + } else: - # Must provide explicit construction info - if model_package is None or model_factory is None: - raise ValueError( - "If model is not provided, must specify both " - "model_package and model_factory" - ) - model_checkpoint = 
None + # Integer - use traditional num_structures approach + runner_config = { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "num_structures": num_structures, + "dataset_subset": "full", + } + + # Generate checkpoint name if not provided + if not checkpoint_name: + # Format: matbench_{model}_{subset}_{timestamp}_{uuid}.json + # Clean up model name for filename + model_str = ( + str(model_packages) + .replace("[", "") + .replace("]", "") + .replace("'", "") + .replace('"', "") + .replace(",", "_") + .replace(" ", "") + ) + subset_str = runner_config.get("dataset_subset", "custom") + timestamp = int(time.time()) + short_uuid = str(uuid.uuid4())[:8] + checkpoint_name = ( + f"matbench_{model_str}_{subset_str}_{timestamp}_{short_uuid}.json" + ) - if model_kwargs is None: - model_kwargs = {} + print(f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}") - # Get executor (will create if needed) and submit remote execution executor = self.adapter._get_executor() future = executor.submit( - run_matbench_is2re, - repo_url=self.repo_url, - repo_ref=self.repo_ref, - model_package=model_package, - model_factory=model_factory, - model_kwargs=model_kwargs, - model_checkpoint=model_checkpoint, - num_structures=num_structures, - use_multi_gpu=use_multi_gpu, + run_remote_benchmark, + script_content=script_content, + dependencies=dependencies, + config=runner_config, + checkpoint_name=checkpoint_name, ) + # Attach checkpoint path to future for programmatic access + future.checkpoint_path = f"~/.garden/benchmarks/{checkpoint_name}" + return future def local( self, - model=None, - num_structures: int = 100, - model_package: str | None = None, - model_factory: str | None = None, - model_kwargs: dict | None = None, - use_multi_gpu: bool = True, + model_factory: callable, + model_packages: str | List[str], + num_structures: int | "DatasetSize" | "DatasetConfig" = 100, + checkpoint_path: str | None = None, ) -> dict: - """Run benchmark locally in ephemeral UV 
environment. - - This executes the same benchmark workflow locally instead of submitting - to a remote Globus Compute endpoint. Useful for testing and development. + """Run benchmark locally. Args: - model: Optional local model instance to extract metadata from - num_structures: Number of test structures to evaluate - model_package: Python package name to install (e.g., "mace-torch") - model_factory: Function or class name to create model - model_kwargs: Dictionary of kwargs for model creation - use_multi_gpu: If True, automatically detect and use all available GPUs - in parallel. If False, use single GPU/CPU. (default: True) - - Returns: - Dictionary with benchmark results (same format as remote execution) - - Example: - >>> results = task.local( - ... model_package="mace-torch", - ... model_factory="mace_mp", - ... model_kwargs={"model": "medium", "device": "cpu"}, - ... num_structures=10, - ... use_multi_gpu=False - ... ) + model_factory: User-provided function that takes device and returns an ASE calculator + model_packages: Python package(s) to install. 
Can be a single package string + (e.g., "mace-torch") or a list (e.g., ["torch", "mace-torch"]) + num_structures: Number of structures to evaluate, or DatasetSize enum, or DatasetConfig + (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10)) + checkpoint_path: Optional path to resume from checkpoint """ - import json - import subprocess - import tempfile - from pathlib import Path - - # Extract model metadata if model instance provided - if model is not None: - if model_package is None: - if self.model_package is not None: - model_package = self.model_package - else: - model_package = model.__class__.__module__.split(".")[0] - - if model_factory is None: - model_factory = model.__class__.__name__ - - model_checkpoint = None - if hasattr(model, "checkpoint_path"): - model_checkpoint = model.checkpoint_path - elif hasattr(model, "checkpoint"): - model_checkpoint = model.checkpoint - - if model_kwargs is None and hasattr(model, "_init_kwargs"): - model_kwargs = model._init_kwargs + from ..utils.remote_execution import run_remote_benchmark + from .enums import DatasetConfig, DatasetSize + + # Build script with task-specific functions AND user's factory + script_content = self._build_script( + self.process_fn, self.load_dataset_fn, self.calc_metrics_fn, model_factory + ) + + # Handle single package string or list of packages + packages = ( + [model_packages] if isinstance(model_packages, str) else model_packages + ) + dependencies = ["matbench-discovery>=1.3.0"] + packages + + # Handle DatasetSize enum, DatasetConfig, or integer + if isinstance(num_structures, DatasetSize): + runner_config = { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "dataset_subset": num_structures.value, + "dataset_seed": 42, # Default seed + } + elif isinstance(num_structures, DatasetConfig): + runner_config = { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "dataset_subset": num_structures.subset.value, + "dataset_seed": num_structures.seed, + } 
else: - if model_package is None or model_factory is None: - raise ValueError( - "If model is not provided, must specify both " - "model_package and model_factory" - ) - model_checkpoint = None - - if model_kwargs is None: - model_kwargs = {} - - # Run benchmark in subprocess with isolated environment - import sys - - config = { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, - "model_package": model_package, - "model_factory": model_factory, - "model_kwargs": model_kwargs, - "model_checkpoint": model_checkpoint, - "num_structures": num_structures, - "use_multi_gpu": use_multi_gpu, - } - - results_file_path = ( - Path(tempfile.gettempdir()) / f"benchmark_results_{id(config)}.json" + # Integer - use traditional num_structures approach + runner_config = { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "num_structures": num_structures, + "dataset_subset": "full", + } + + # Run locally (no Globus Compute) + return run_remote_benchmark( + script_content=script_content, + dependencies=dependencies, + config=runner_config, + checkpoint_path=checkpoint_path, ) - wrapper_script = f''' -import json -from garden_ai.benchmarks.matbench_discovery.remote_runner import run_matbench_is2re -config = {repr(config)} -results = run_matbench_is2re(**config) +class IS2RETask(MatbenchTask): + """Initial Structure to Relaxed Energy.""" -with open("{results_file_path}", "w") as f: - json.dump(results, f, indent=2) -''' + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="IS2RE", **kwargs) + self.process_fn = process_batch_relaxation + self.load_dataset_fn = load_dataset_wbm_initial + self.calc_metrics_fn = calculate_metrics_energy - with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: - f.write(wrapper_script) - wrapper_path = f.name - try: - # Run without capturing output so logs stream to console in real-time - result = subprocess.run( - [sys.executable, wrapper_path], - timeout=3600, - # Don't capture output - let it 
stream to console - stdout=None, - stderr=None, - ) +class RS2RETask(MatbenchTask): + """Relaxed Structure to Relaxed Energy.""" - if result.returncode != 0: - raise RuntimeError( - f"Local benchmark failed with return code {result.returncode}" - ) + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="RS2RE", **kwargs) + self.process_fn = process_batch_static + self.load_dataset_fn = load_dataset_wbm_relaxed + self.calc_metrics_fn = calculate_metrics_energy - if not results_file_path.exists(): - raise RuntimeError( - f"Benchmark results file not found at {results_file_path}" - ) - with open(results_file_path) as f: - return json.load(f) +class S2EFSTask(MatbenchTask): + """Structure to Energy, Forces, Stress.""" - finally: - Path(wrapper_path).unlink(missing_ok=True) - results_file_path.unlink(missing_ok=True) + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="S2EFS", **kwargs) + self.process_fn = process_batch_forces + self.load_dataset_fn = load_dataset_mp_trj + self.calc_metrics_fn = calculate_metrics_forces - def calculate_metrics(self, outputs: dict) -> dict[str, Any]: - # TODO: implement the full metrics calculation, - # this is just a placeholder for now - """Calculate benchmark metrics from raw outputs. - For MVP, this returns basic statistics. Future versions will compare - against DFT ground truth and calculate proper benchmark metrics like - F1 score, discovery yield, etc. 
+class S2EFTask(MatbenchTask): + """Structure to Energy, Force.""" - Args: - outputs: Dictionary from remote execution containing: - - energies: List of relaxed energies - - num_converged: Number of successful relaxations - - failed_indices: Indices of failed structures - - Returns: - Dictionary of calculated metrics: - - num_attempted: Total structures attempted - - num_converged: Number of successful relaxations - - success_rate: Fraction of successful relaxations - - mean_energy: Average final energy (eV/atom, if available) - - num_failed: Count of failed relaxations - """ - energies = outputs.get("energies", []) - num_converged = outputs.get("num_converged", 0) - failed_indices = outputs.get("failed_indices", []) - - # Filter out None values (failed relaxations) - valid_energies = [e for e in energies if e is not None] - - metrics = { - "num_attempted": len(energies), - "num_converged": num_converged, - "num_failed": len(failed_indices), - "success_rate": num_converged / len(energies) if energies else 0.0, - } - - # Calculate energy statistics if we have valid results - if valid_energies: - metrics["mean_energy"] = sum(valid_energies) / len(valid_energies) - metrics["min_energy"] = min(valid_energies) - metrics["max_energy"] = max(valid_energies) - - return metrics + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="S2EF", **kwargs) + self.process_fn = process_batch_forces + self.load_dataset_fn = load_dataset_mp_trj + self.calc_metrics_fn = calculate_metrics_forces + + +class S2EFSMTask(MatbenchTask): + """Structure to Energy, Force, Stress, Magmoms.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="S2EFSM", **kwargs) + self.process_fn = process_batch_forces + self.load_dataset_fn = load_dataset_mp_trj + self.calc_metrics_fn = calculate_metrics_forces + + +class IS2ETask(MatbenchTask): + """Initial Structure to Energy.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="IS2E", 
**kwargs) + self.process_fn = process_batch_static + self.load_dataset_fn = load_dataset_wbm_initial + self.calc_metrics_fn = calculate_metrics_energy + + +class S2ETask(MatbenchTask): + """Structure to Energy.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="S2E", **kwargs) + self.process_fn = process_batch_static + self.load_dataset_fn = load_dataset_wbm_relaxed + self.calc_metrics_fn = calculate_metrics_energy + + +class S2RETask(MatbenchTask): + """Structure to Relaxed Energy.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="S2RE", **kwargs) + self.process_fn = process_batch_relaxation + self.load_dataset_fn = load_dataset_wbm_initial + self.calc_metrics_fn = calculate_metrics_energy + + +class RP2RETask(MatbenchTask): + """Relaxed Prototype to Relaxed Energy.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="RP2RE", **kwargs) + self.process_fn = process_batch_relaxation + self.load_dataset_fn = load_dataset_wbm_initial # Placeholder + self.calc_metrics_fn = calculate_metrics_energy + + +class IP2ETask(MatbenchTask): + """Initial Prototype to Energy.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, task_name="IP2E", **kwargs) + self.process_fn = process_batch_static + self.load_dataset_fn = load_dataset_wbm_initial # Placeholder + self.calc_metrics_fn = calculate_metrics_energy From 77a4a01675f21d74664c0824629c7a90f74d77e7 Mon Sep 17 00:00:00 2001 From: hholb Date: Tue, 2 Dec 2025 11:34:27 -0700 Subject: [PATCH 04/23] WIP cleanup --- .../examples/run_random_10k_benchmark.py | 2 +- .../benchmarks/matbench_discovery/remote.py | 485 +++++++++++ .../benchmarks/matbench_discovery/tasks.py | 757 +++--------------- garden_ai/benchmarks/templates/base_runner.py | 248 ++++++ garden_ai/benchmarks/utils/remote.py | 176 ++++ .../benchmarks/utils/remote_execution.py | 202 +++++ garden_ai/benchmarks/utils/script_builder.py | 96 +++ 
garden_ai/benchmarks/utils/task.py | 132 +++ 8 files changed, 1444 insertions(+), 654 deletions(-) create mode 100644 garden_ai/benchmarks/matbench_discovery/remote.py create mode 100644 garden_ai/benchmarks/templates/base_runner.py create mode 100644 garden_ai/benchmarks/utils/remote.py create mode 100644 garden_ai/benchmarks/utils/remote_execution.py create mode 100644 garden_ai/benchmarks/utils/script_builder.py create mode 100644 garden_ai/benchmarks/utils/task.py diff --git a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py index 3bdfd6ca..96c8208f 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py @@ -55,7 +55,7 @@ def create_sevennet_model(device): """Create SevenNet model calculator.""" from sevenn.calculator import SevenNetCalculator - return SevenNetCalculator(model="7net-0", device=device) + return SevenNetCalculator(model="7net-l3i5", device=device) # Model configurations diff --git a/garden_ai/benchmarks/matbench_discovery/remote.py b/garden_ai/benchmarks/matbench_discovery/remote.py new file mode 100644 index 00000000..247d2ba2 --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/remote.py @@ -0,0 +1,485 @@ +"""Remote functions for Matbench Discovery benchmark. + +These functions are injected into the remote script. +They must be self-contained (imports inside or provided by builder). 
+""" + +from typing import Any, Callable, Dict, List, Optional, Tuple + +# ------------------------------------------------------------------------------ +# Common Helpers +# ------------------------------------------------------------------------------ + + +def _process_batch_common( + batch_id: int, + structures: List[Tuple[str, Any]], + model_config: Dict[str, Any], + num_threads: int, + compute_fn: Callable[[Any, Any], Dict[str, Any]], + task_name: str, +) -> Dict[str, Any]: + """Common logic for processing a batch of structures. + + Args: + batch_id: ID of the current batch + structures: List of (id, atoms) tuples + model_config: Configuration for the model + num_threads: Number of threads to use + compute_fn: Function taking (model, atoms) and returning a result dict + task_name: Name of the task for logging + """ + import logging + import os + import time + + import torch + + # Configure thread limits to avoid contention + os.environ["OMP_NUM_THREADS"] = str(num_threads) + torch.set_num_threads(num_threads) + + gpu_id = model_config.get("gpu_id") + device = setup_device(gpu_id) # noqa: F821 + + worker_logger = logging.getLogger(f"worker_{batch_id}") + worker_logger.info( + f"Started {task_name} on {device} with {len(structures)} structures. 
Threads: {num_threads}" + ) + + global _MODEL_CACHE + try: + if _MODEL_CACHE is None: + model = load_model(device) # noqa: F821 + _MODEL_CACHE = model + else: + model = _MODEL_CACHE + except Exception as e: + worker_logger.error(f"Failed to initialize model: {e}") + worker_logger.error( + "Model initialization is critical - cannot continue benchmark" + ) + raise RuntimeError(f"Model initialization failed: {e}") from e + + results = {} + batch_start = time.time() + + for i, (struct_id, atoms) in enumerate(structures): + try: + # Run the specific computation + result = compute_fn(model, atoms) + results[struct_id] = result + + if (i + 1) % 10 == 0: + elapsed = time.time() - batch_start + rate = (i + 1) / elapsed if elapsed > 0 else 0 + worker_logger.info( + f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)" + ) + + except Exception as e: + worker_logger.warning(f"Structure {struct_id} failed: {e}") + results[struct_id] = {"error": str(e)} + + return results + + +def _load_dataset_common( + config: Dict[str, Any], + zip_path: str, + read_format: str = "extxyz", + read_index: str | slice = None, +) -> List[Tuple[str, Any]]: + """Common logic for loading datasets from a zip file.""" + from io import TextIOWrapper + from zipfile import ZipFile + + from ase.io import read + + # get_material_ids_for_subset is injected + dataset_subset = config.get("dataset_subset", "full") + dataset_seed = config.get("dataset_seed", 42) + mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed) # noqa: F821 + + structures = [] + + with ZipFile(zip_path, "r") as zf: + if mat_ids is None: + # Load all files (full dataset) + # Sort by numeric ID if possible + file_list = sorted( + zf.namelist(), + key=lambda x: int(x.split(".")[0]) + if x.split(".")[0].isdigit() + else float("inf"), + ) + num_structures = config.get("num_structures", 100) + file_list = file_list[:num_structures] + else: + # Filter to specific material IDs + mat_id_set = set(mat_ids) + file_list = [ + 
def get_material_ids_for_subset(
    subset_type: str, seed: int = 42
) -> Optional[List[str]]:
    """Get material IDs for a specific dataset subset.

    Args:
        subset_type: One of 'full', 'unique_protos', 'random_10k', 'random_100'
        seed: Random seed for sampling (default: 42)

    Returns:
        List of material IDs, or None for 'full' (load all)

    Raises:
        ValueError: If ``subset_type`` is not one of the names above. The check
            runs before any heavy import or file read.
    """
    # Kept inside the function: this function is injected by source into the
    # generated remote script, so it cannot rely on module-level constants.
    # Value = number of unique prototypes to sample, None = keep all of them.
    sample_sizes = {
        "unique_protos": None,
        "random_10k": 10000,
        "random_100": 100,
    }

    if subset_type == "full":
        return None  # Load all materials
    if subset_type not in sample_sizes:
        # Fail fast instead of importing pandas and parsing the summary CSV first.
        raise ValueError(f"Unknown subset_type: {subset_type}")

    import pandas as pd
    from matbench_discovery.data import DataFiles

    # Every non-'full' subset starts from the unique-prototype rows of wbm_summary.
    df_unique = pd.read_csv(DataFiles.wbm_summary.path).query("unique_prototype")

    n_samples = sample_sizes[subset_type]
    if n_samples is not None:
        # Fixed seed keeps the random subsets reproducible across runs.
        df_unique = df_unique.sample(n=n_samples, random_state=seed)

    return df_unique["material_id"].tolist()
def process_batch_forces(
    batch_id: int,
    structures: List[Tuple[str, Any]],
    model_config: Dict[str, Any],
    num_threads: int,
) -> Dict[str, Any]:
    """Run the S2EFS (Energy, Forces, Stress) evaluation over one batch.

    All batch handling is delegated to ``_process_batch_common``; this function
    only supplies the per-structure callback and the task label.

    Args:
        batch_id: Identifier of this batch, forwarded unchanged.
        structures: ``(structure_id, atoms)`` pairs to evaluate.
        model_config: Model/device settings, forwarded unchanged.
        num_threads: Thread count, forwarded unchanged.

    Returns:
        Whatever ``_process_batch_common`` returns for this batch.
    """

    def evaluate_efs(calculator, structure):
        # Attach the model via ``.calc``, then query energy, forces and stress
        # in that order; arrays are converted to plain lists.
        structure.calc = calculator
        return {
            "energy": structure.get_potential_energy(),
            "forces": structure.get_forces().tolist(),
            "stress": structure.get_stress().tolist(),
        }

    task_label = "forces calculation"
    return _process_batch_common(
        batch_id, structures, model_config, num_threads, evaluate_efs, task_label
    )
+ Returns: F1, DAF, Precision, Recall, Accuracy, TPR, FPR, TNR, FNR, TP, FP, TN, FN, MAE, RMSE, R2 + """ + import logging + + import numpy as np + + logger = logging.getLogger("metrics") + + # Results format: {id: {"energy": float, "error": str}} + if len(results) == 0: + return {"error": "No results to evaluate"} + + try: + # Import matbench-discovery data + from matbench_discovery.data import df_wbm + except Exception as e: + return {"error": f"Failed to import matbench-discovery: {e}"} + + # Extract model energies + model_energies = {} + for sid, res in results.items(): + if isinstance(res, dict) and res.get("energy") is not None: + mat_id = sid.replace(".extxyz", "") + model_energies[mat_id] = res["energy"] + + if not model_energies: + return {"error": "No valid energies found in results"} + + # Get common IDs between predictions and ground truth + # Use direct string column names instead of MbdKey enum to avoid issues + df_wbm_indexed = df_wbm.set_index("material_id") + common_ids = list(set(model_energies.keys()) & set(df_wbm_indexed.index)) + + if not common_ids: + return {"error": "No matching IDs between results and ground truth"} + + # Get subset of data + df_subset = df_wbm_indexed.loc[common_ids] + + # Calculate predicted formation energies + y_pred = np.array([model_energies[mid] for mid in common_ids]) + y_true = df_subset["uncorrected_energy"].values # Uncorrected total energy + n_atoms = df_subset["n_sites"].values + + # Predicted formation energy ERROR per atom (from total energy difference) + # This is the ERROR: (E_pred - E_dft) / n_atoms + e_form_error = (y_pred - y_true) / n_atoms + + # Get ground truth e_above_hull for stability classification + each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values + + # Calculate predicted e_above_hull + # Since e_form_error is already (E_pred - E_dft)/n_atoms, we just add it to each_true + each_pred = each_true + e_form_error + + # Debug logging to understand the distribution + 
def calculate_metrics_forces(
    results: Dict[str, Any], config: Dict[str, Any]
) -> Dict[str, Any]:
    """Calculate comprehensive S2EFS metrics (Energy, Forces, Stress).

    Ground truth is re-read from the MP trajectory archive: for each evaluated
    ID the last frame of the matching archive member is used, mirroring what
    the dataset loader keeps when it reads all frames.

    Args:
        results: Mapping of archive member name to the per-structure output.
            Entries that are not dicts or that contain an ``"error"`` key are
            ignored.
        config: Runner configuration. Not used by this function.

    Returns:
        Dict with ``<component>_mae``, ``<component>_rmse`` and
        ``<component>_r2`` for ``energy`` (per atom), ``force`` and ``stress``
        when at least one structure was evaluated, plus ``num_evaluated`` and
        ``num_failed``. R2 is NaN when fewer than two values are available.
        ``num_failed`` counts structures whose ground truth could not be read
        or compared.
    """
    import logging

    logger = logging.getLogger("metrics")

    # Drop failed/malformed entries first; a non-dict value would otherwise
    # raise on the '"error" in res' membership test.
    evaluable = {
        sid: res
        for sid, res in results.items()
        if isinstance(res, dict) and "error" not in res
    }
    if not evaluable:
        # Nothing to compare: skip the heavy imports and the archive entirely.
        return {"num_evaluated": 0, "num_failed": 0}

    from io import TextIOWrapper
    from zipfile import ZipFile

    import numpy as np
    from ase.io import read
    from matbench_discovery.data import DataFiles
    from sklearn.metrics import r2_score

    components = ("energy", "force", "stress")
    abs_errors = {name: [] for name in components}  # per-structure mean |error|
    sq_errors = {name: [] for name in components}  # per-structure mean error**2
    pooled_pred = {name: [] for name in components}  # flattened values for R2
    pooled_true = {name: [] for name in components}
    num_failed = 0

    with ZipFile(DataFiles.mp_trj_extxyz.path, "r") as zf:
        for sid, res in evaluable.items():
            try:
                with zf.open(sid) as f:
                    text_stream = TextIOWrapper(f, encoding="utf-8")
                    # Last frame, matching the dataset loading logic.
                    gt_atoms = read(text_stream, format="extxyz", index=":")[-1]

                    n_atoms = len(gt_atoms)
                    e_pred = res["energy"]
                    e_true = gt_atoms.get_potential_energy()
                    energy_error = abs(e_pred - e_true) / n_atoms

                    f_pred = np.array(res["forces"])
                    f_true = gt_atoms.get_forces()
                    force_error = np.abs(f_pred - f_true)

                    s_pred = np.array(res["stress"])
                    s_true = gt_atoms.get_stress()
                    stress_error = np.abs(s_pred - s_true)
            except Exception as exc:
                # Best effort: keep going, but make the dropped structure visible.
                num_failed += 1
                logger.warning(f"Skipping {sid} in S2EFS metrics: {exc}")
                continue

            # Record all three components together so that a structure which
            # fails part-way never contributes to only some of the metrics.
            abs_errors["energy"].append(energy_error)
            sq_errors["energy"].append(energy_error**2)
            pooled_pred["energy"].append(e_pred / n_atoms)
            pooled_true["energy"].append(e_true / n_atoms)

            abs_errors["force"].append(force_error.mean())
            sq_errors["force"].append((force_error**2).mean())
            pooled_pred["force"].extend(f_pred.flatten())
            pooled_true["force"].extend(f_true.flatten())

            abs_errors["stress"].append(stress_error.mean())
            sq_errors["stress"].append((stress_error**2).mean())
            pooled_pred["stress"].extend(s_pred.flatten())
            pooled_true["stress"].extend(s_true.flatten())

    # Calculate final metrics
    result_metrics = {}
    for name in components:
        if not abs_errors[name]:
            continue
        result_metrics[f"{name}_mae"] = float(np.mean(abs_errors[name]))
        result_metrics[f"{name}_rmse"] = float(np.sqrt(np.mean(sq_errors[name])))
        result_metrics[f"{name}_r2"] = (
            float(r2_score(pooled_true[name], pooled_pred[name]))
            if len(pooled_true[name]) > 1
            else float("nan")
        )

    result_metrics["num_evaluated"] = len(abs_errors["energy"])
    result_metrics["num_failed"] = num_failed

    if num_failed:
        logger.warning(
            f"S2EFS metrics: {num_failed} of {len(evaluable)} structures were skipped"
        )

    return result_metrics
- - Args: - subset_type: One of 'full', 'unique_protos', 'random_10k', 'random_100' - seed: Random seed for sampling (default: 42) - - Returns: - List of material IDs, or None for 'full' (load all) - """ - if subset_type == "full": - return None # Load all materials - - import pandas as pd - from matbench_discovery.data import DataFiles - - # Load wbm_summary - df = pd.read_csv(DataFiles.wbm_summary.path) - - if subset_type == "unique_protos": - # Filter to unique prototypes (removes duplicates and MP overlaps) - df_filtered = df.query("unique_prototype") - return df_filtered["material_id"].tolist() - - elif subset_type == "random_10k": - # Random sample of 10k unique prototypes (fixed seed for reproducibility) - df_filtered = df.query("unique_prototype") - df_sampled = df_filtered.sample(n=10000, random_state=seed) - return df_sampled["material_id"].tolist() - - elif subset_type == "random_100": - # Random sample of 100 unique prototypes (fixed seed for reproducibility) - # Useful for quick end-to-end testing - df_filtered = df.query("unique_prototype") - df_sampled = df_filtered.sample(n=100, random_state=seed) - return df_sampled["material_id"].tolist() - - else: - raise ValueError(f"Unknown subset_type: {subset_type}") - - -# --- Reusable Process Functions --- - - -def process_batch_relaxation( - batch_id: int, - structures: List[Tuple[str, Any]], - model_config: Dict[str, Any], - num_threads: int, -) -> Dict[str, Any]: - """Process a batch of structures for IS2RE (Relaxation).""" - import logging - import os - import time - - import torch - from ase.optimize import FIRE - - # Configure thread limits to avoid contention - os.environ["OMP_NUM_THREADS"] = str(num_threads) - torch.set_num_threads(num_threads) - - gpu_id = model_config.get("gpu_id") - device = setup_device(gpu_id) # noqa: F821 - - worker_logger = logging.getLogger(f"worker_{batch_id}") - worker_logger.info( - f"Started relaxation on {device} with {len(structures)} structures. 
Threads: {num_threads}" - ) - - global _MODEL_CACHE - try: - if _MODEL_CACHE is None: - model = load_model(device) - _MODEL_CACHE = model - else: - model = _MODEL_CACHE - except Exception as e: - worker_logger.error(f"Failed to initialize model: {e}") - worker_logger.error( - "Model initialization is critical - cannot continue benchmark" - ) - raise RuntimeError(f"Model initialization failed: {e}") from e - - results = {} - batch_start = time.time() - - for i, (struct_id, atoms) in enumerate(structures): - try: - atoms.calc = model - opt = FIRE(atoms, logfile=None) - opt.run(fmax=0.05, steps=500) - - energy = atoms.get_potential_energy() - results[struct_id] = {"energy": energy} - - if (i + 1) % 10 == 0: - elapsed = time.time() - batch_start - rate = (i + 1) / elapsed if elapsed > 0 else 0 - worker_logger.info( - f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)" - ) - - except Exception as e: - worker_logger.warning(f"Structure {struct_id} failed: {e}") - results[struct_id] = {"energy": None, "error": str(e)} - - return results - - -def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Tuple[str, Any]]: - """Load initial structures for IS2RE.""" - from io import TextIOWrapper - from zipfile import ZipFile - - from ase.io import read - from matbench_discovery.data import DataFiles - - dataset_subset = config.get("dataset_subset", "full") - dataset_seed = config.get("dataset_seed", 42) - mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed) - - structures = [] - zip_path = DataFiles.wbm_initial_atoms.path - - with ZipFile(zip_path, "r") as zf: - if mat_ids is None: - # Load all files (full dataset) - file_list = sorted( - zf.namelist(), - key=lambda x: int(x.split(".")[0]) - if x.split(".")[0].isdigit() - else float("inf"), - ) - num_structures = config.get("num_structures", 100) - file_list = file_list[:num_structures] - else: - # Filter to specific material IDs - mat_id_set = set(mat_ids) - file_list = [ - f for f in 
zf.namelist() if f.replace(".extxyz", "") in mat_id_set - ] - - for filename in file_list: - with zf.open(filename) as f: - text_stream = TextIOWrapper(f, encoding="utf-8") - structures.append((filename, read(text_stream, format="extxyz"))) - return structures - - -def calculate_metrics_energy( - results: Dict[str, Any], config: Dict[str, Any] -) -> Dict[str, Any]: - """Calculate energy metrics using matbench-discovery's stable_metrics algorithm. - - Uses the injected stable_metrics function. - Returns: F1, DAF, Precision, Recall, Accuracy, TPR, FPR, TNR, FNR, TP, FP, TN, FN, MAE, RMSE, R2 - """ - import logging - - import numpy as np - - logger = logging.getLogger("metrics") - - # Results format: {id: {"energy": float, "error": str}} - if len(results) == 0: - return {"error": "No results to evaluate"} - - try: - # Import matbench-discovery data - from matbench_discovery.data import df_wbm - except Exception as e: - return {"error": f"Failed to import matbench-discovery: {e}"} - - # Extract model energies - model_energies = {} - for sid, res in results.items(): - if isinstance(res, dict) and res.get("energy") is not None: - mat_id = sid.replace(".extxyz", "") - model_energies[mat_id] = res["energy"] - - if not model_energies: - return {"error": "No valid energies found in results"} - - # Get common IDs between predictions and ground truth - # Use direct string column names instead of MbdKey enum to avoid issues - df_wbm_indexed = df_wbm.set_index("material_id") - common_ids = list(set(model_energies.keys()) & set(df_wbm_indexed.index)) - - if not common_ids: - return {"error": "No matching IDs between results and ground truth"} - - # Get subset of data - df_subset = df_wbm_indexed.loc[common_ids] - - # Calculate predicted formation energies - y_pred = np.array([model_energies[mid] for mid in common_ids]) - y_true = df_subset["uncorrected_energy"].values # Uncorrected total energy - n_atoms = df_subset["n_sites"].values - - # Predicted formation energy ERROR per 
atom (from total energy difference) - # This is the ERROR: (E_pred - E_dft) / n_atoms - e_form_error = (y_pred - y_true) / n_atoms - - # Get ground truth e_above_hull for stability classification - each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values - - # Calculate predicted e_above_hull - # Since e_form_error is already (E_pred - E_dft)/n_atoms, we just add it to each_true - each_pred = each_true + e_form_error - - # Debug logging to understand the distribution - logger.info("Energy statistics:") - logger.info( - f" each_true: min={each_true.min():.4f}, max={each_true.max():.4f}, mean={each_true.mean():.4f}" - ) - logger.info( - f" each_pred: min={each_pred.min():.4f}, max={each_pred.max():.4f}, mean={each_pred.mean():.4f}" - ) - - # Calculate global prevalence for DAF normalization (matches official leaderboard) - # Filter to unique prototypes - df_unique = df_wbm.query("unique_prototype") - # Calculate prevalence: (stable count) / (total count) - # Stability threshold is 0.0 - stable_count = (df_unique["e_above_hull_mp2020_corrected_ppd_mp"] <= 0).sum() - global_prevalence = stable_count / len(df_unique) - - logger.info( - f"Using global prevalence for DAF: {global_prevalence:.6f} ({stable_count}/{len(df_unique)})" - ) - - # Calculate metrics using the injected function - # stable_metrics is injected into the script scope - metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence) - - # Add num_evaluated - metrics["num_evaluated"] = len(common_ids) - - return metrics - - -def process_batch_static( - batch_id: int, - structures: List[Tuple[str, Any]], - model_config: Dict[str, Any], - num_threads: int, -) -> Dict[str, Any]: - """Process a batch of structures for RS2RE (Static Calculation).""" - import logging - import os - import time - - import torch - - os.environ["OMP_NUM_THREADS"] = str(num_threads) - torch.set_num_threads(num_threads) - - gpu_id = model_config.get("gpu_id") - device = setup_device(gpu_id) # noqa: F821 - - 
worker_logger = logging.getLogger(f"worker_{batch_id}") - worker_logger.info( - f"Started static calculation on {device} with {len(structures)} structures." - ) - - global _MODEL_CACHE - try: - if _MODEL_CACHE is None: - model = load_model(device) - _MODEL_CACHE = model - else: - model = _MODEL_CACHE - except Exception as e: - return {sid: {"energy": None, "error": str(e)} for sid, _ in structures} - - results = {} - batch_start = time.time() - - for i, (struct_id, atoms) in enumerate(structures): - try: - atoms.calc = model - # No relaxation, just static energy - energy = atoms.get_potential_energy() - results[struct_id] = {"energy": energy} - - if (i + 1) % 50 == 0: - elapsed = time.time() - batch_start - rate = (i + 1) / elapsed if elapsed > 0 else 0 - worker_logger.info( - f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)" - ) - - except Exception as e: - worker_logger.warning(f"Structure {struct_id} failed: {e}") - results[struct_id] = {"energy": None, "error": str(e)} - - return results - - -def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Tuple[str, Any]]: - """Load relaxed structures for RS2RE.""" - from io import TextIOWrapper - from zipfile import ZipFile - - from ase.io import read - from matbench_discovery.data import DataFiles - - dataset_subset = config.get("dataset_subset", "full") - dataset_seed = config.get("dataset_seed", 42) - mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed) - - structures = [] - # Use relaxed atoms - zip_path = DataFiles.wbm_relaxed_atoms.path - - with ZipFile(zip_path, "r") as zf: - if mat_ids is None: - # Load all files (full dataset) - file_list = sorted( - zf.namelist(), - key=lambda x: int(x.split(".")[0]) - if x.split(".")[0].isdigit() - else float("inf"), - ) - num_structures = config.get("num_structures", 100) - file_list = file_list[:num_structures] - else: - # Filter to specific material IDs - mat_id_set = set(mat_ids) - file_list = [ - f for f in zf.namelist() if 
f.replace(".extxyz", "") in mat_id_set - ] - - for filename in file_list: - with zf.open(filename) as f: - text_stream = TextIOWrapper(f, encoding="utf-8") - structures.append((filename, read(text_stream, format="extxyz"))) - return structures - - -# Reuse calculate_metrics_energy for all energy-only tasks - - -def process_batch_forces( - batch_id: int, - structures: List[Tuple[str, Any]], - model_config: Dict[str, Any], - num_threads: int, -) -> Dict[str, Any]: - """Process a batch of structures for S2EFS (Energy, Forces, Stress).""" - import logging - import os - import time - - import torch - - os.environ["OMP_NUM_THREADS"] = str(num_threads) - torch.set_num_threads(num_threads) - - gpu_id = model_config.get("gpu_id") - device = setup_device(gpu_id) # noqa: F821 - - worker_logger = logging.getLogger(f"worker_{batch_id}") - worker_logger.info( - f"Started forces calculation on {device} with {len(structures)} structures." - ) - - global _MODEL_CACHE - try: - if _MODEL_CACHE is None: - model = load_model(device) - _MODEL_CACHE = model - else: - model = _MODEL_CACHE - except Exception as e: - return {sid: {"error": str(e)} for sid, _ in structures} - - results = {} - batch_start = time.time() - - for i, (struct_id, atoms) in enumerate(structures): - try: - atoms.calc = model - - energy = atoms.get_potential_energy() - forces = atoms.get_forces().tolist() - stress = atoms.get_stress().tolist() - - results[struct_id] = {"energy": energy, "forces": forces, "stress": stress} - - if (i + 1) % 50 == 0: - elapsed = time.time() - batch_start - rate = (i + 1) / elapsed if elapsed > 0 else 0 - worker_logger.info( - f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)" - ) - - except Exception as e: - worker_logger.warning(f"Structure {struct_id} failed: {e}") - results[struct_id] = {"error": str(e)} - - return results - - -def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Tuple[str, Any]]: - """Load MP trajectories for S2EFS.""" - from io import TextIOWrapper - 
from zipfile import ZipFile - - from ase.io import read - from matbench_discovery.data import DataFiles - - num_structures = config.get("num_structures", 100) - structures = [] - # Use MP trajectories - zip_path = DataFiles.mp_trj_extxyz.path - - with ZipFile(zip_path, "r") as zf: - file_list = sorted(zf.namelist()) - for filename in file_list[:num_structures]: - with zf.open(filename) as f: - text_stream = TextIOWrapper(f, encoding="utf-8") - # Read all frames? Or just one? Usually S2EFS is on frames. - # Let's assume we evaluate on the last frame or all frames. - # For simplicity, let's take the last frame (relaxed?) or random? - # Actually, MP trj contains relaxation steps. - # Let's read the last frame for now as a proxy for "a structure". - # Or better, read all frames and treat them as separate tasks? - # For this benchmark, let's just treat the file as containing one structure per file if possible, - # or just take the last one. - atoms_list = read(text_stream, format="extxyz", index=":") - if atoms_list: - # Just take the last one for now - structures.append((filename, atoms_list[-1])) - return structures - - -def calculate_metrics_forces( - results: Dict[str, Any], config: Dict[str, Any] -) -> Dict[str, Any]: - """Calculate comprehensive S2EFS metrics (Energy, Forces, Stress). - - Returns MAE, RMSE, and R2 for each component. - """ - from io import TextIOWrapper - from zipfile import ZipFile - - import numpy as np - from ase.io import read - from matbench_discovery.data import DataFiles - from sklearn.metrics import r2_score - - # We need to load ground truth from the dataset itself because MP trj has E/F/S in the extxyz - # This is expensive to re-read. Ideally we should have passed GT in results or loaded it efficiently. - # For now, let's re-read the GT for the processed IDs. 
- - metrics = { - "energy_mae": [], - "energy_rmse": [], - "force_mae": [], - "force_rmse": [], - "stress_mae": [], - "stress_rmse": [], - } - - # Collect all predictions and ground truth for R2 calculation - all_e_pred, all_e_true = [], [] - all_f_pred, all_f_true = [], [] - all_s_pred, all_s_true = [], [] - - zip_path = DataFiles.mp_trj_extxyz.path - - with ZipFile(zip_path, "r") as zf: - for sid, res in results.items(): - if "error" in res: - continue - - try: - with zf.open(sid) as f: - text_stream = TextIOWrapper(f, encoding="utf-8") - atoms_list = read(text_stream, format="extxyz", index=":") - gt_atoms = atoms_list[-1] # Matching load_dataset logic - - # Energy (per atom) - e_pred = res["energy"] - e_true = gt_atoms.get_potential_energy() - n_atoms = len(gt_atoms) - - energy_error = abs(e_pred - e_true) / n_atoms - metrics["energy_mae"].append(energy_error) - metrics["energy_rmse"].append(energy_error**2) - - all_e_pred.append(e_pred / n_atoms) - all_e_true.append(e_true / n_atoms) - - # Forces - f_pred = np.array(res["forces"]) - f_true = gt_atoms.get_forces() - force_error = np.abs(f_pred - f_true) - metrics["force_mae"].append(force_error.mean()) - metrics["force_rmse"].append((force_error**2).mean()) - - all_f_pred.extend(f_pred.flatten()) - all_f_true.extend(f_true.flatten()) - - # Stress - s_pred = np.array(res["stress"]) - s_true = gt_atoms.get_stress() - stress_error = np.abs(s_pred - s_true) - metrics["stress_mae"].append(stress_error.mean()) - metrics["stress_rmse"].append((stress_error**2).mean()) - - all_s_pred.extend(s_pred.flatten()) - all_s_true.extend(s_true.flatten()) - - except Exception: - pass - - # Calculate final metrics - result_metrics = {} - - if metrics["energy_mae"]: - result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"])) - result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"]))) - result_metrics["energy_r2"] = ( - float(r2_score(all_e_true, all_e_pred)) - if len(all_e_true) > 1 - else 
float("nan") - ) - - if metrics["force_mae"]: - result_metrics["force_mae"] = float(np.mean(metrics["force_mae"])) - result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"]))) - result_metrics["force_r2"] = ( - float(r2_score(all_f_true, all_f_pred)) - if len(all_f_true) > 1 - else float("nan") - ) - - if metrics["stress_mae"]: - result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"])) - result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"]))) - result_metrics["stress_r2"] = ( - float(r2_score(all_s_true, all_s_pred)) - if len(all_s_true) > 1 - else float("nan") - ) - - result_metrics["num_evaluated"] = len(metrics["energy_mae"]) - - return result_metrics - +# ------------------------------------------------------------------------------ +# REMOTE FUNCTIONS +# These functions are injected into the remote script. +# They are now imported from remote.py to keep this file clean. +# ------------------------------------------------------------------------------ +from .remote import ( + _load_dataset_common, + _process_batch_common, + calculate_metrics_energy, + calculate_metrics_forces, + get_material_ids_for_subset, + load_dataset_mp_trj, + load_dataset_wbm_initial, + load_dataset_wbm_relaxed, + load_model, + process_batch_forces, + process_batch_relaxation, + process_batch_static, +) # ------------------------------------------------------------------------------ # Task Classes @@ -627,7 +79,9 @@ def _build_script( builder.add_preamble("_MODEL_CACHE = None") # Common imports - builder.add_import("from typing import List, Dict, Any, Tuple, Optional") + builder.add_import( + "from typing import List, Dict, Any, Tuple, Optional, Callable" + ) builder.add_import("import torch") builder.add_import("from ase.optimize import FIRE") builder.add_import("from ase.io import read") @@ -648,6 +102,10 @@ def _build_script( # Add helper function for dataset subset filtering builder.add_function(get_material_ids_for_subset) + # 
Add common helpers + builder.add_function(_process_batch_common) + builder.add_function(_load_dataset_common) + # Add task-specific functions with standard names expected by runner builder.add_function(process_fn, name="process_batch") builder.add_function(load_dataset_fn, name="load_dataset") @@ -659,12 +117,69 @@ def _build_script( return builder.build() + def _prepare_runner_config( + self, num_structures: int | "DatasetSize" | "DatasetConfig" + ) -> Dict[str, Any]: + """Prepare the runner configuration based on num_structures.""" + from .enums import DatasetConfig, DatasetSize + + if isinstance(num_structures, DatasetSize): + return { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "dataset_subset": num_structures.value, + "dataset_seed": 42, + } + elif isinstance(num_structures, DatasetConfig): + return { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "dataset_subset": num_structures.subset.value, + "dataset_seed": num_structures.seed, + } + else: + return { + "repo_url": self.repo_url, + "repo_ref": self.repo_ref, + "num_structures": num_structures, + "dataset_subset": "full", + } + + def _prepare_dependencies(self, model_packages: str | List[str]) -> List[str]: + """Prepare the list of dependencies.""" + packages = ( + [model_packages] if isinstance(model_packages, str) else model_packages + ) + return ["matbench-discovery>=1.3.0"] + packages + + def _generate_checkpoint_name( + self, model_packages: str | List[str], runner_config: Dict[str, Any] + ) -> str: + """Generate a unique checkpoint name.""" + import time + import uuid + + model_str = ( + str(model_packages) + .replace("[", "") + .replace("]", "") + .replace("'", "") + .replace('"', "") + .replace(",", "_") + .replace(" ", "") + ) + subset_str = runner_config.get("dataset_subset", "custom") + timestamp = int(time.time()) + short_uuid = str(uuid.uuid4())[:8] + return f"matbench_{model_str}_{subset_str}_{timestamp}_{short_uuid}.json" + def submit( self, model_factory: 
callable, model_packages: str | List[str], num_structures: int | "DatasetSize" | "DatasetConfig" = 100, checkpoint_name: str | None = None, + checkpoint_path: str | None = None, ): """Submit benchmark job to remote executor. @@ -677,12 +192,9 @@ def submit( (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10)) checkpoint_name: Optional name for the checkpoint file (e.g. "my_checkpoint.json"). If not provided, one will be generated. + checkpoint_path: Optional path to an existing checkpoint file to resume from. + If provided, checkpoint_name is ignored and no new checkpoint is created. """ - import time - import uuid - - from .enums import DatasetConfig, DatasetSize - # Build script with task-specific functions AND user's factory script_content = self._build_script( self.process_fn, @@ -691,57 +203,23 @@ def submit( model_factory, # Inject user's factory function ) - # Handle single package string or list of packages - packages = ( - [model_packages] if isinstance(model_packages, str) else model_packages - ) - dependencies = ["matbench-discovery>=1.3.0"] + packages + dependencies = self._prepare_dependencies(model_packages) + runner_config = self._prepare_runner_config(num_structures) - # Handle DatasetSize enum, DatasetConfig, or integer - if isinstance(num_structures, DatasetSize): - runner_config = { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, - "dataset_subset": num_structures.value, - "dataset_seed": 42, # Default seed - } - elif isinstance(num_structures, DatasetConfig): - runner_config = { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, - "dataset_subset": num_structures.subset.value, - "dataset_seed": num_structures.seed, - } - else: - # Integer - use traditional num_structures approach - runner_config = { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, - "num_structures": num_structures, - "dataset_subset": "full", - } - - # Generate checkpoint name if not provided - if not checkpoint_name: - # Format: 
matbench_{model}_{subset}_{timestamp}_{uuid}.json - # Clean up model name for filename - model_str = ( - str(model_packages) - .replace("[", "") - .replace("]", "") - .replace("'", "") - .replace('"', "") - .replace(",", "_") - .replace(" ", "") - ) - subset_str = runner_config.get("dataset_subset", "custom") - timestamp = int(time.time()) - short_uuid = str(uuid.uuid4())[:8] - checkpoint_name = ( - f"matbench_{model_str}_{subset_str}_{timestamp}_{short_uuid}.json" + # Generate checkpoint name if not provided AND no checkpoint_path is provided + if not checkpoint_name and not checkpoint_path: + checkpoint_name = self._generate_checkpoint_name( + model_packages, runner_config ) - print(f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}") + if checkpoint_path: + print(f"Resuming from checkpoint: {checkpoint_path}") + final_checkpoint_path = checkpoint_path + else: + print( + f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}" + ) + final_checkpoint_path = f"~/.garden/benchmarks/{checkpoint_name}" executor = self.adapter._get_executor() future = executor.submit( @@ -750,10 +228,11 @@ def submit( dependencies=dependencies, config=runner_config, checkpoint_name=checkpoint_name, + checkpoint_path=checkpoint_path, ) # Attach checkpoint path to future for programmatic access - future.checkpoint_path = f"~/.garden/benchmarks/{checkpoint_name}" + future.checkpoint_path = final_checkpoint_path return future @@ -775,42 +254,14 @@ def local( checkpoint_path: Optional path to resume from checkpoint """ from ..utils.remote_execution import run_remote_benchmark - from .enums import DatasetConfig, DatasetSize # Build script with task-specific functions AND user's factory script_content = self._build_script( self.process_fn, self.load_dataset_fn, self.calc_metrics_fn, model_factory ) - # Handle single package string or list of packages - packages = ( - [model_packages] if isinstance(model_packages, str) else model_packages - ) - dependencies = 
["matbench-discovery>=1.3.0"] + packages - - # Handle DatasetSize enum, DatasetConfig, or integer - if isinstance(num_structures, DatasetSize): - runner_config = { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, - "dataset_subset": num_structures.value, - "dataset_seed": 42, # Default seed - } - elif isinstance(num_structures, DatasetConfig): - runner_config = { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, - "dataset_subset": num_structures.subset.value, - "dataset_seed": num_structures.seed, - } - else: - # Integer - use traditional num_structures approach - runner_config = { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, - "num_structures": num_structures, - "dataset_subset": "full", - } + dependencies = self._prepare_dependencies(model_packages) + runner_config = self._prepare_runner_config(num_structures) # Run locally (no Globus Compute) return run_remote_benchmark( diff --git a/garden_ai/benchmarks/templates/base_runner.py b/garden_ai/benchmarks/templates/base_runner.py new file mode 100644 index 00000000..60ed80d6 --- /dev/null +++ b/garden_ai/benchmarks/templates/base_runner.py @@ -0,0 +1,248 @@ +import concurrent.futures +import json +import logging +import multiprocessing +import os +import sys +import time +from typing import Optional + +# ------------------------------------------------------------------------------ +# BOILERPLATE: Logging & Device Setup +# ------------------------------------------------------------------------------ + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] [PID:%(process)d] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + stream=sys.stdout, + force=True, +) +logger = logging.getLogger("benchmark_runner") + + +def setup_device(gpu_id: Optional[int] = None) -> str: + """Setup compute device for this process.""" + try: + import torch + + if torch.cuda.is_available(): + return f"cuda:{gpu_id}" if gpu_id is not None else "cuda" + elif 
hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return "mps" + except ImportError: + pass + return "cpu" + + +def convert_numpy_types(obj): + """Convert numpy types to Python native types for JSON serialization.""" + import numpy as np + + if isinstance(obj, (np.integer, np.floating)): + return obj.item() + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {k: convert_numpy_types(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_numpy_types(item) for item in obj] + return obj + + +# ------------------------------------------------------------------------------ +# USER DEFINED FUNCTIONS (Injected) +# ------------------------------------------------------------------------------ +# - load_model(config, device) +# - process_batch(batch_id, batch_data, model_config, num_threads) +# - load_dataset(config) -> List[Any] +# ------------------------------------------------------------------------------ + +# ------------------------------------------------------------------------------ +# MAIN EXECUTION LOOP +# ------------------------------------------------------------------------------ + + +def main(): + if len(sys.argv) != 2: + sys.exit("Usage: python benchmark_runner.py ") + + with open(sys.argv[1]) as f: + config = json.load(f) + + logger.info("Starting benchmark runner...") + + checkpoint_path = config.get("checkpoint_path") + results = {} + + # Load existing checkpoint if available + if checkpoint_path and os.path.exists(checkpoint_path): + logger.info(f"Loading checkpoint from {checkpoint_path}") + try: + with open(checkpoint_path) as f: + results = json.load(f) + logger.info(f"Found {len(results)} processed items in checkpoint") + except Exception as e: + logger.warning(f"Failed to load checkpoint: {e}. 
Starting fresh.") + + # Load Dataset + try: + all_items = load_dataset(config) # noqa: F821 + logger.info(f"Loaded {len(all_items)} total items") + except Exception as e: + logger.error(f"Failed to load dataset: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + # Filter out already processed items + # Assuming items are (id, data) tuples + items_to_process = [ + (item_id, item) for item_id, item in all_items if str(item_id) not in results + ] + + if not items_to_process: + logger.info("All items already processed!") + with open("results.json", "w") as f: + json.dump(results, f, indent=2) + return + + logger.info(f"Processing {len(items_to_process)} remaining items") + + # Shuffle for load balancing + import random + + random.seed(42) + random.shuffle(items_to_process) + + # Resource detection + try: + import torch + + num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 + except ImportError: + num_gpus = 0 + + use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1 + + total_cores = os.cpu_count() or 1 + num_workers = num_gpus if use_multi_gpu else 1 + # Reserve some cores for system/overhead if possible + available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores + threads_per_worker = max(1, available_cores // num_workers) + + logger.info( + f"Resources: {num_gpus} GPUs, {total_cores} Cores. 
Using {num_workers} workers ({threads_per_worker} threads/worker)" + ) + + start_time = time.time() + + # Chunk items into smaller batches to allow frequent checkpointing + chunk_size = 1000 * num_workers + chunks = [ + items_to_process[i : i + chunk_size] + for i in range(0, len(items_to_process), chunk_size) + ] + + logger.info(f"Split into {len(chunks)} chunks for processing") + + ctx = multiprocessing.get_context("spawn") + + with concurrent.futures.ProcessPoolExecutor( + max_workers=num_workers, mp_context=ctx + ) as executor: + for chunk_idx, chunk in enumerate(chunks): + chunk_start = time.time() + logger.info( + f"Starting chunk {chunk_idx + 1}/{len(chunks)} ({len(chunk)} items)" + ) + + # Split chunk among workers + futures = [] + batch_size = (len(chunk) + num_workers - 1) // num_workers + + for i in range(num_workers): + start = i * batch_size + end = min((i + 1) * batch_size, len(chunk)) + if start < end: + batch = chunk[start:end] + + # Inject worker specific config + worker_config = config.copy() + worker_config["gpu_id"] = i if use_multi_gpu else None + + futures.append( + executor.submit( + process_batch, # noqa: F821 + i, + batch, + worker_config, + threads_per_worker, + ) + ) + + # Collect results for this chunk + chunk_results = {} + for future in concurrent.futures.as_completed(futures): + try: + batch_res = future.result() + chunk_results.update(batch_res) + except Exception as e: + logger.error(f"Worker failed in chunk {chunk_idx}: {e}") + import traceback + + traceback.print_exc() + # Critical failure - abort benchmark immediately + logger.error("Aborting benchmark due to worker failure") + sys.exit(1) + + # Update main results and save checkpoint + results.update(chunk_results) + + if checkpoint_path: + try: + tmp_path = checkpoint_path + ".tmp" + with open(tmp_path, "w") as f: + # Convert numpy types before saving checkpoint + clean_results = convert_numpy_types(results) + json.dump(clean_results, f, indent=2) + os.replace(tmp_path, 
checkpoint_path) + logger.info(f"Checkpoint saved to {checkpoint_path}") + except Exception as e: + logger.error(f"Failed to save checkpoint: {e}") + + elapsed = time.time() - chunk_start + logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s") + + total_elapsed = time.time() - start_time + logger.info(f"Benchmark complete in {total_elapsed:.1f}s.") + + # Calculate metrics from results + logger.info("Calculating metrics...") + try: + metrics = calculate_metrics_remote(results, config) # noqa: F821 + logger.info(f"Metrics calculated: {metrics}") + except Exception as e: + logger.error(f"Failed to calculate metrics: {e}") + import traceback + + traceback.print_exc() + metrics = {"error": f"Metrics calculation failed: {e}"} + + # Write both results and metrics + output = {"results": results, "metrics": metrics} + + # Custom JSON encoder to handle numpy types + # convert_numpy_types moved to global scope + + # Convert numpy types before serialization + output = convert_numpy_types(output) + + with open("results.json", "w") as f: + json.dump(output, f, indent=2) + + +if __name__ == "__main__": + main() diff --git a/garden_ai/benchmarks/utils/remote.py b/garden_ai/benchmarks/utils/remote.py new file mode 100644 index 00000000..b9a4780e --- /dev/null +++ b/garden_ai/benchmarks/utils/remote.py @@ -0,0 +1,176 @@ +import json +import logging +import os +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import Any, Dict, List + +logger = logging.getLogger(__name__) + + +class RemoteBenchmarkRunner: + """ + Handles the setup and execution of benchmarks on remote Globus Compute endpoints. + + This class manages: + 1. Creating an isolated working directory + 2. Setting up a Python environment using `uv` + 3. Installing dependencies + 4. Executing the benchmark script + 5. 
Collecting results + """ + + def __init__(self, work_dir_prefix: str = "garden_benchmark_"): + self.work_dir = Path(tempfile.mkdtemp(prefix=work_dir_prefix)) + self.uv_bin = None + self.venv_python = None + self.env = dict(os.environ) + + # Configure logging if not already configured + if not logging.getLogger().handlers: + logging.basicConfig( + level=logging.INFO, + stream=sys.stdout, + force=True, + format="%(asctime)s [%(levelname)s] %(message)s", + ) + + def setup_environment(self, python_version: str = "3.11"): + """Find uv and create virtual environment.""" + logger.info("Setting up environment...") + + # Find UV binary + try: + self.uv_bin = subprocess.check_output( + [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True + ).strip() + except subprocess.CalledProcessError: + import shutil + + self.uv_bin = shutil.which("uv") + if not self.uv_bin: + raise RuntimeError("Could not find uv binary. Please install uv.") + + # Create UV virtual environment + subprocess.run( + [self.uv_bin, "venv", "--python", python_version], + cwd=self.work_dir, + check=True, + capture_output=True, + ) + + self.venv_python = self.work_dir / ".venv/bin/python" + if not self.venv_python.exists(): + self.venv_python = ( + self.work_dir / ".venv/Scripts/python.exe" + ) # Windows fallback + + if not self.venv_python.exists(): + raise RuntimeError( + f"Virtual environment python not found at {self.venv_python}" + ) + + # Set SSL cert file for HPC if needed + self._setup_ssl_cert() + + def _setup_ssl_cert(self): + """Set SSL_CERT_FILE environment variable if certifi is available.""" + try: + certifi_path = subprocess.check_output( + [str(self.venv_python), "-c", "import certifi; print(certifi.where())"], + text=True, + ).strip() + self.env["SSL_CERT_FILE"] = certifi_path + except Exception as e: + logger.warning(f"Failed to set SSL_CERT_FILE: {e}") + + def install_dependencies(self, packages: List[str]): + """Install Python packages into the virtual environment.""" + 
logger.info(f"Installing dependencies: {packages}") + if not self.uv_bin or not self.venv_python: + raise RuntimeError("Environment not setup. Call setup_environment() first.") + + cmd = [ + self.uv_bin, + "pip", + "install", + "--python", + str(self.venv_python), + ] + packages + + subprocess.run(cmd, cwd=self.work_dir, check=True) + + def run_benchmark( + self, + script_content: str, + config: Dict[str, Any], + script_name: str = "benchmark_runner.py", + ) -> Dict[str, Any]: + """ + Execute the benchmark script. + + Args: + script_content: The Python script to run. + config: Configuration dictionary to pass to the script (saved as config.json). + script_name: Filename for the script. + + Returns: + Dictionary containing the results loaded from results.json. + """ + if not self.venv_python: + raise RuntimeError("Environment not setup. Call setup_environment() first.") + + logger.info("Preparing benchmark script...") + + # Write runner script + runner_path = self.work_dir / script_name + runner_path.write_text(script_content) + + # Write config + config_path = self.work_dir / "config.json" + with open(config_path, "w") as f: + json.dump(config, f, indent=2) + + logger.info("Executing benchmark...") + + # Run the runner script inside the venv + proc = subprocess.run( + [str(self.venv_python), str(runner_path), str(config_path)], + cwd=self.work_dir, + env=self.env, + stdout=sys.stdout, + stderr=sys.stderr, + check=False, + ) + + if proc.returncode != 0: + raise RuntimeError( + f"Benchmark runner failed with return code {proc.returncode}" + ) + + logger.info("Collecting results...") + results_path = self.work_dir / "results.json" + if not results_path.exists(): + raise RuntimeError( + "Results file not found - benchmark may have crashed silently" + ) + + with open(results_path) as f: + results = json.load(f) + + logger.info("Benchmark completed successfully.") + return results + + def cleanup(self): + """Remove the working directory.""" + import shutil + + 
shutil.rmtree(self.work_dir, ignore_errors=True) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.cleanup() diff --git a/garden_ai/benchmarks/utils/remote_execution.py b/garden_ai/benchmarks/utils/remote_execution.py new file mode 100644 index 00000000..d6541695 --- /dev/null +++ b/garden_ai/benchmarks/utils/remote_execution.py @@ -0,0 +1,202 @@ +"""Generic remote execution utility for benchmarks. + +This module contains the `run_remote_benchmark` function which is designed to be +serialized and executed on Globus Compute endpoints. It handles the boilerplate +of setting up a Python environment, installing dependencies, and running a +provided benchmark script. +""" + + +def run_remote_benchmark( + script_content: str, + dependencies: list[str], + config: dict, + checkpoint_name: str | None = None, + checkpoint_path: str | None = None, +) -> dict: + """Run a generic benchmark script on a remote Globus Compute endpoint. + + This function: + 1. Creates a temporary working directory. + 2. Sets up a Python environment using `uv`. + 3. Installs the specified dependencies. + 4. Writes the `script_content` to a file. + 5. Writes the `config` to a JSON file. + 6. Executes the script in the environment. + 7. Returns the results from `results.json`. + + Args: + script_content: The full Python script to execute. + dependencies: List of Python packages to install (e.g. ["numpy", "torch"]). + config: Dictionary of configuration parameters to pass to the script. + Written to `config.json`. + checkpoint_name: Name of the checkpoint file (e.g. "checkpoint_123.json"). + Saved to ~/.garden/benchmarks/. + checkpoint_path: Optional path to an existing checkpoint file to resume from. + If provided, this path is used directly. + + Returns: + The content of `results.json` produced by the script. 
+ """ + # All imports must be inside the function for serialization + import json + import logging + import os + import subprocess + import sys + import tempfile + from pathlib import Path + + # Configure logging + logging.basicConfig( + level=logging.INFO, + stream=sys.stdout, + force=True, + format="%(asctime)s [%(levelname)s] %(message)s", + ) + if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(line_buffering=True) + + logger = logging.getLogger(__name__) + + # Create isolated working directory + work_dir = Path(tempfile.mkdtemp(prefix="garden_benchmark_")) + + try: + # ---------------------------------------------------------------------- + # 1. ENVIRONMENT SETUP + # ---------------------------------------------------------------------- + logger.info("Step 1/4: Setting up environment...") + + # Find UV binary + try: + uv_bin = subprocess.check_output( + [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True + ).strip() + except subprocess.CalledProcessError: + import shutil + + uv_bin = shutil.which("uv") + if not uv_bin: + raise RuntimeError("Could not find uv binary. 
Please install uv.") + + # Create UV virtual environment + subprocess.run( + [uv_bin, "venv", "--python", "3.11"], + cwd=work_dir, + check=True, + capture_output=True, + ) + + venv_python = work_dir / ".venv/bin/python" + if not venv_python.exists(): + venv_python = work_dir / ".venv/Scripts/python.exe" # Windows fallback + + if not venv_python.exists(): + raise RuntimeError(f"Virtual environment python not found at {venv_python}") + + # Install dependencies + logger.info(f"Installing dependencies: {dependencies}") + # Install in one go for better resolution + cmd = [uv_bin, "pip", "install", "--python", str(venv_python)] + dependencies + subprocess.run( + cmd, + cwd=work_dir, + check=True, + ) + + # Set SSL cert file for HPC if needed + env = dict(os.environ) + + # Propagate common useful env vars if present + for key in ["MBD_AUTO_DOWNLOAD_FILES", "HF_TOKEN", "WANDB_API_KEY"]: + if key in os.environ: + env[key] = os.environ[key] + + try: + certifi_path = subprocess.check_output( + [str(venv_python), "-c", "import certifi; print(certifi.where())"], + text=True, + ).strip() + env["SSL_CERT_FILE"] = certifi_path + except Exception as e: + logger.warning(f"Failed to set SSL_CERT_FILE: {e}") + + # ---------------------------------------------------------------------- + # 2. 
PREPARE BENCHMARK SCRIPT + # ---------------------------------------------------------------------- + logger.info("Step 2/4: Preparing benchmark script...") + + # Write runner script + runner_path = work_dir / "benchmark_runner.py" + runner_path.write_text(script_content) + + # Determine checkpoint path + if checkpoint_path: + # User provided a specific path to resume from + final_checkpoint_path = checkpoint_path + elif checkpoint_name: + # Use persistent location in user home + checkpoint_dir = Path.home() / ".garden" / "benchmarks" + checkpoint_dir.mkdir(parents=True, exist_ok=True) + final_checkpoint_path = str(checkpoint_dir / checkpoint_name) + else: + # Fallback to tmp dir if no name provided (legacy behavior) + final_checkpoint_path = str(work_dir / "checkpoint.json") + + config["checkpoint_path"] = final_checkpoint_path + + # Log checkpoint path prominently for user reference + print(f"{'=' * 80}") + print(f"Checkpoint will be saved to: {final_checkpoint_path}") + print("To resume this job if it fails, use:") + print(f' checkpoint_path="{final_checkpoint_path}"') + print(f"{'=' * 80}") + + # Write config + config_path = work_dir / "config.json" + with open(config_path, "w") as f: + json.dump(config, f, indent=2) + + # ---------------------------------------------------------------------- + # 3. EXECUTE BENCHMARK + # ---------------------------------------------------------------------- + logger.info("Step 3/4: Executing benchmark...") + + # Run the runner script inside the venv + # DO NOT capture output - let it stream to stdout/stderr in real-time + # so we can see errors immediately + proc = subprocess.run( + [str(venv_python), str(runner_path), str(config_path)], + cwd=work_dir, + env=env, + check=False, # Don't raise immediately, we'll check returncode + ) + + if proc.returncode != 0: + raise RuntimeError( + f"Benchmark runner failed with return code {proc.returncode}." 
+ ) + + # ---------------------------------------------------------------------- + # 4. COLLECT RESULTS + # ---------------------------------------------------------------------- + logger.info("Step 4/4: Collecting results...") + + results_path = work_dir / "results.json" + if not results_path.exists(): + raise RuntimeError( + "Results file not found - benchmark may have crashed silently" + ) + + with open(results_path) as f: + results = json.load(f) + + logger.info("Benchmark completed successfully.") + return results + + finally: + # Cleanup working directory + import shutil + + shutil.rmtree(work_dir, ignore_errors=True) diff --git a/garden_ai/benchmarks/utils/script_builder.py b/garden_ai/benchmarks/utils/script_builder.py new file mode 100644 index 00000000..7987bdc7 --- /dev/null +++ b/garden_ai/benchmarks/utils/script_builder.py @@ -0,0 +1,96 @@ +import inspect +from pathlib import Path +from typing import Callable + + +class BenchmarkScriptBuilder: + """Helper to build a self-contained benchmark script from a template.""" + + def __init__(self, template_path: str | Path = None): + if template_path is None: + # Default to the base_runner.py in templates + template_path = ( + Path(__file__).parent.parent / "templates" / "base_runner.py" + ) + + self.template_path = Path(template_path) + self.imports = set() + self.functions = [] + self.preamble = [] + + def add_import(self, import_stmt: str): + """Add an import statement (e.g. 'import numpy as np').""" + self.imports.add(import_stmt) + return self + + def add_preamble(self, code: str): + """Add arbitrary code to the top of the script (after imports).""" + self.preamble.append(code) + return self + + def add_function(self, func: Callable, name: str = None): + """Add a function definition to the script. + + The function source code is inspected and appended. + If name is provided, the function definition is renamed. 
+ """ + source = inspect.getsource(func) + + if name: + import re + + # Replace 'def old_name(' with 'def new_name(' + # This is a simple regex replacement, assuming standard formatting + pattern = r"def\s+" + func.__name__ + r"\s*\(" + replacement = f"def {name}(" + source = re.sub(pattern, replacement, source, count=1) + + self.functions.append(source) + return self + + def build(self) -> str: + """Assemble the final script.""" + if not self.template_path.exists(): + raise FileNotFoundError(f"Template not found at {self.template_path}") + + template_content = self.template_path.read_text() + + # Construct sections + imports_block = "\n".join(sorted(self.imports)) + preamble_block = "\n".join(self.preamble) + functions_block = "\n\n".join(self.functions) + + # We inject our custom code BEFORE the template's main execution logic + # but AFTER the template's own imports (which are inside the file). + # Actually, the template has imports at the top. We should probably prepend ours. + + # Simple strategy: Prepend everything to the template, but the template + # has "USER DEFINED FUNCTIONS" placeholders. We can just append our functions + # before the main block? + + # Better strategy: The template is designed to have functions injected. + # Let's just put imports at the top, then functions, then the template content. + # But we need to be careful about imports in the template. 
+ + final_script = f""" +# ------------------------------------------------------------------------------ +# INJECTED IMPORTS +# ------------------------------------------------------------------------------ +{imports_block} + +# ------------------------------------------------------------------------------ +# INJECTED PREAMBLE +# ------------------------------------------------------------------------------ +{preamble_block} + +# ------------------------------------------------------------------------------ +# INJECTED FUNCTIONS +# ------------------------------------------------------------------------------ +{functions_block} + +# ------------------------------------------------------------------------------ +# BASE RUNNER TEMPLATE +# ------------------------------------------------------------------------------ +{template_content} +""" + return final_script diff --git a/garden_ai/benchmarks/utils/task.py b/garden_ai/benchmarks/utils/task.py new file mode 100644 index 00000000..4ca2fd60 --- /dev/null +++ b/garden_ai/benchmarks/utils/task.py @@ -0,0 +1,132 @@ +import json +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import Any, Dict, Optional + + +class BaseBenchmarkTask: + """ + Base class for benchmark tasks. + + Provides common utilities for: + - Extracting model metadata (package, factory, kwargs) + - Running benchmarks locally for testing + """ + + def __init__( + self, adapter, repo_url: str, repo_ref: str, model_package: Optional[str] = None + ): + self.adapter = adapter + self.repo_url = repo_url + self.repo_ref = repo_ref + self.model_package = model_package + + def _extract_model_config( + self, + model: Any = None, + model_package: Optional[str] = None, + model_factory: Optional[str] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """ + Helper to resolve model configuration from either a local instance or explicit arguments. 
+ """ + model_checkpoint = None + + if model is not None: + # Extract info from local model instance + if model_package is None: + if self.model_package is not None: + model_package = self.model_package + else: + # Infer from model's module + model_package = model.__class__.__module__.split(".")[0] + + if model_factory is None: + model_factory = model.__class__.__name__ + + # Get checkpoint path if model has one + if hasattr(model, "checkpoint_path"): + model_checkpoint = model.checkpoint_path + elif hasattr(model, "checkpoint"): + model_checkpoint = model.checkpoint + + # Try to extract initialization kwargs if available + if model_kwargs is None and hasattr(model, "_init_kwargs"): + model_kwargs = model._init_kwargs + + else: + # Must provide explicit construction info + if model_package is None or model_factory is None: + raise ValueError( + "If model is not provided, must specify both " + "model_package and model_factory" + ) + + if model_kwargs is None: + model_kwargs = {} + + return { + "model_package": model_package, + "model_factory": model_factory, + "model_kwargs": model_kwargs, + "model_checkpoint": model_checkpoint, + } + + def _run_local_wrapper( + self, runner_func_path: str, runner_func_name: str, config: Dict[str, Any] + ) -> Dict[str, Any]: + """ + Execute a benchmark runner function locally in a subprocess. + + Args: + runner_func_path: Import path to the runner function (e.g. 'garden_ai.benchmarks.matbench_discovery.remote_runner') + runner_func_name: Name of the runner function (e.g. 'run_matbench_is2re') + config: Configuration dictionary to pass to the runner function. 
+ """ + results_file_path = ( + Path(tempfile.gettempdir()) / f"benchmark_results_{id(config)}.json" + ) + + wrapper_script = f''' +import json +from {runner_func_path} import {runner_func_name} + +config = {repr(config)} +results = {runner_func_name}(**config) + +with open("{results_file_path}", "w") as f: + json.dump(results, f, indent=2) +''' + + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: + f.write(wrapper_script) + wrapper_path = f.name + + try: + # Run without capturing output so logs stream to console in real-time + result = subprocess.run( + [sys.executable, wrapper_path], + timeout=3600, + stdout=None, + stderr=None, + ) + + if result.returncode != 0: + raise RuntimeError( + f"Local benchmark failed with return code {result.returncode}" + ) + + if not results_file_path.exists(): + raise RuntimeError( + f"Benchmark results file not found at {results_file_path}" + ) + + with open(results_file_path) as f: + return json.load(f) + + finally: + Path(wrapper_path).unlink(missing_ok=True) + results_file_path.unlink(missing_ok=True) From f5b888e789615a3019b4a1b8e41f89a5531754bb Mon Sep 17 00:00:00 2001 From: hholb Date: Thu, 4 Dec 2025 11:15:42 -0700 Subject: [PATCH 05/23] refactor to use groundhog functions --- garden_ai/benchmarks/__init__.py | 5 +- .../benchmarks/matbench_discovery/__init__.py | 230 +--- .../examples/dummy_model.py | 17 + .../examples/matbench_mace_multi_gpu.py | 80 +- .../examples/test_hog_refactor.py | 70 + .../benchmarks/matbench_discovery/remote.py | 485 ------- .../benchmarks/matbench_discovery/tasks.py | 1163 +++++++++++++---- garden_ai/benchmarks/templates/base_runner.py | 248 ---- garden_ai/benchmarks/utils/script_builder.py | 23 +- 9 files changed, 1003 insertions(+), 1318 deletions(-) create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py create mode 100644 garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py delete mode 100644 
garden_ai/benchmarks/matbench_discovery/remote.py delete mode 100644 garden_ai/benchmarks/templates/base_runner.py diff --git a/garden_ai/benchmarks/__init__.py b/garden_ai/benchmarks/__init__.py index 329de6cc..5d40ae5a 100644 --- a/garden_ai/benchmarks/__init__.py +++ b/garden_ai/benchmarks/__init__.py @@ -7,12 +7,13 @@ - MatbenchDiscovery: Materials discovery benchmark suite """ -from .matbench_discovery import IS2RETask, MatbenchDiscovery, MatbenchTask +from .matbench_discovery.enums import DatasetSize, MatbenchTask +from .matbench_discovery.tasks import MatbenchDiscovery __all__ = [ "MatbenchDiscovery", "MatbenchTask", - "IS2RETask", + "DatasetSize", ] diff --git a/garden_ai/benchmarks/matbench_discovery/__init__.py b/garden_ai/benchmarks/matbench_discovery/__init__.py index 3256522e..1a5b9516 100644 --- a/garden_ai/benchmarks/matbench_discovery/__init__.py +++ b/garden_ai/benchmarks/matbench_discovery/__init__.py @@ -1,236 +1,10 @@ -"""Matbench Discovery benchmark adapter for Garden AI. - -This module provides a clean interface for running Matbench Discovery benchmarks -on remote HPC systems via Globus Compute. It handles environment setup, -dependency installation, and benchmark execution. - -Example usage: - >>> from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery - >>> from my_model import MyModel - >>> - >>> # Configure for your HPC endpoint - >>> endpoint_id = "your-endpoint-uuid" - >>> endpoint_config = { - ... "account": "project-account", - ... "partition": "gpu", - ... "scheduler_options": "#SBATCH --gpus-per-node=1" - ... } - >>> - >>> # Run benchmark - >>> with MatbenchDiscovery(endpoint_id, endpoint_config) as bench: - ... model = MyModel() - ... task = bench.tasks.IS2RE - ... future = task.submit(model, num_structures=100) - ... results = future.result() - ... metrics = task.calculate_metrics(results) - ... 
print(metrics) -""" - -from typing import Any - -from globus_compute_sdk import Executor -from globus_compute_sdk.serialize import CombinedCode, ComputeSerializer +"""Matbench Discovery benchmark adapter for Garden AI.""" from .enums import DatasetSize, MatbenchTask -from .tasks import ( - IP2ETask, - IS2ETask, - IS2RETask, - RP2RETask, - RS2RETask, - S2EFSMTask, - S2EFSTask, - S2EFTask, - S2ETask, - S2RETask, -) +from .tasks import MatbenchDiscovery __all__ = [ "MatbenchDiscovery", "MatbenchTask", "DatasetSize", - "IS2RETask", - "RS2RETask", - "S2EFSTask", - "S2EFTask", - "S2EFSMTask", - "IS2ETask", - "S2ETask", - "S2RETask", - "RP2RETask", - "IP2ETask", ] - - -class MatbenchDiscovery: - """Adapter for running Matbench Discovery benchmarks locally or remotely. - - This class manages the lifecycle of benchmark execution: - - Provides access to benchmark tasks (IS2RE, etc.) - - For remote execution: creates and manages Globus Compute executor - - For local execution: runs in ephemeral UV environment - - Use as a context manager to ensure proper cleanup: - # Local execution - with MatbenchDiscovery() as bench: - result = bench.tasks.IS2RE.local(...) - - # Remote execution - with MatbenchDiscovery(endpoint_id="uuid", endpoint_config={...}) as bench: - future = bench.tasks.IS2RE.submit(...) - - Attributes: - tasks: Namespace containing available benchmark tasks - - tasks.IS2RE: Initial Structure to Relaxed Energy task - """ - - # Matbench Discovery repository configuration - REPO_URL = "https://github.com/janosh/matbench-discovery" - REPO_REF = "main" - PYTHON_VERSION = "3.11" - - def __init__( - self, - endpoint_id: str | None = None, - user_endpoint_config: dict[str, Any] | None = None, - repo_ref: str | None = None, - model_package: str | None = None, - ): - """Initialize Matbench Discovery adapter. - - Args: - endpoint_id: Globus Compute endpoint UUID for remote execution. - If None, only local execution (.local()) is available. 
- user_endpoint_config: Optional HPC configuration for remote endpoint. - Example for SLURM: - { - "account": "project-account", - "partition": "gpu-debug", - "scheduler_options": "#SBATCH --gpus-per-node=1" - } - repo_ref: Git branch/tag/commit to use (default: "main") - model_package: Default model package to install for all tasks - (can be overridden per task) - """ - self.endpoint_id = endpoint_id - self.user_endpoint_config = user_endpoint_config or {} - - # Ensure 'requirements' is present to avoid endpoint template errors - if "requirements" not in self.user_endpoint_config: - self.user_endpoint_config["requirements"] = "" - - self.repo_ref = repo_ref or self.REPO_REF - self.model_package = model_package - - # Executor is created lazily on first submit() call - self._executor: Executor | None = None - self.tasks: Any = None - - def _get_executor(self) -> Executor: - """Get or create the Globus Compute executor (lazy initialization). - - Returns: - Executor instance - - Raises: - ValueError: If endpoint_id was not provided during initialization - """ - if self._executor is None: - if self.endpoint_id is None: - raise ValueError( - "endpoint_id is required for remote execution. " - "Either provide endpoint_id during initialization or use .local() method." 
- ) - - executor_kwargs = {"endpoint_id": self.endpoint_id} - if self.user_endpoint_config: - executor_kwargs["user_endpoint_config"] = self.user_endpoint_config - - # Use CombinedCode serialization to send actual function code - # rather than module references (avoids needing garden_ai installed remotely) - executor_kwargs["serializer"] = ComputeSerializer( - strategy_code=CombinedCode() - ) - - self._executor = Executor(**executor_kwargs) - - return self._executor - - def __enter__(self): - """Set up tasks when entering context.""" - # Initialize tasks - executor will be created lazily when needed - # Using a simple namespace object for dot access - self.tasks = type( - "Tasks", - (), - { - "IS2RE": IS2RETask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), - "RS2RE": RS2RETask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), - "S2EFS": S2EFSTask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), - "S2EF": S2EFTask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), - "S2EFSM": S2EFSMTask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), - "IS2E": IS2ETask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), - "S2E": S2ETask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), - "S2RE": S2RETask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), - "RP2RE": RP2RETask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), - "IP2E": IP2ETask( - adapter=self, - repo_url=self.REPO_URL, - repo_ref=self.repo_ref, - model_package=self.model_package, - ), 
def create_dummy_model(device):
    """Create a dummy ASE calculator for testing.

    The calculator does no physics: it reports an energy of -1.0 per atom and
    all-zero forces and stress, which is enough to exercise the benchmark
    plumbing without a real model.

    Args:
        device: Not used by the dummy calculator. It mirrors the ``(device)``
            signature of the other model factories in these examples.

    Returns:
        An instance of an ASE ``Calculator`` subclass.
    """
    import numpy as np
    from ase.calculators.calculator import Calculator, all_changes

    class DummyCalc(Calculator):
        implemented_properties = ["energy", "forces", "stress"]

        def calculate(self, atoms=None, properties=None, system_changes=all_changes):
            # Avoid a shared mutable list as the default argument.
            if properties is None:
                properties = ["energy"]
            super().calculate(atoms, properties, system_changes)
            n_atoms = len(self.atoms)
            self.results["energy"] = -1.0 * n_atoms
            self.results["forces"] = np.zeros((n_atoms, 3))
            # Six stress components, all zero.
            self.results["stress"] = np.zeros(6)

    return DummyCalc()
device=device, default_dtype="float64") -NUM_STRUCTURES = DatasetSize.RANDOM_100 - - -def main(): - """Run Matbench Discovery IS2RE benchmark with MACE.""" - - with MatbenchDiscovery( - endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG - ) as bench: - # Run IS2RE task (Initial Structure to Relaxed Energy) - future = bench.tasks.IS2RE.submit( - model_factory=create_mace_model, - model_packages="mace-torch", - num_structures=NUM_STRUCTURES, - ) - - print("Job submitted! Waiting for results (this may take a while)...") - - try: - output = future.result() - metrics = output.get("metrics", {}) - - if "error" in metrics: - print(f"error : {metrics['error']}") - else: - # Discovery metrics (stability classification) - if "F1" in metrics: - print(f"F1 : {metrics['F1']:.6f}") - print(f"DAF : {metrics['DAF']:.2f}x") - print(f"Precision : {metrics['Precision']:.6f}") - print(f"Recall : {metrics['Recall']:.6f}") - print(f"Accuracy : {metrics['Accuracy']:.6f}") - - # Regression metrics - if "MAE" in metrics: - print(f"MAE (eV/atom) : {metrics['MAE']:.6f}") - print(f"RMSE (eV/atom) : {metrics['RMSE']:.6f}") - print(f"R2 : {metrics['R2']:.6f}") - - # Force metrics (if S2EFS task) - if "force_mae" in metrics: - print(f"force_mae : {metrics['force_mae']:.6f}") - print(f"force_rmse : {metrics['force_rmse']:.6f}") - print(f"force_r2 : {metrics['force_r2']:.6f}") - print(f"stress_mae : {metrics['stress_mae']:.6f}") - print(f"stress_rmse : {metrics['stress_rmse']:.6f}") - print(f"stress_r2 : {metrics['stress_r2']:.6f}") - - if "num_evaluated" in metrics: - print(f"num_evaluated : {metrics['num_evaluated']}") - - except Exception as e: - print(f"\n[ERROR] Benchmark failed: {e}") - raise - +results = MatbenchDiscovery.IS2RE.remote( + endpoint="5aafb4c1-27b2-40d8-a038-a0277611868f", + walltime="01:00:00", + scheduler_options={"gpus-per-node": 2, "cores-per-node": 16}, + account="youraccount", + partition="gpu-debug", + qos="gpu", + model_factory=create_mace_model, + 
def main():
    """Run the IS2RE task once, locally, with the dummy model and print the outcome.

    Any exception from the local run is reported with its traceback instead of
    being propagated.
    """
    banner = "=" * 80
    print(banner)
    print("Matbench Discovery Test - Groundhog Refactor")
    print(banner)

    print("Running LOCAL test...")

    # Put the working directory on PYTHONPATH so a subprocess can import
    # dummy_model.
    working_dir = os.getcwd()
    inherited_path = os.environ.get("PYTHONPATH", "")
    os.environ["PYTHONPATH"] = working_dir + os.pathsep + inherited_path

    try:
        # Local execution through the static-method style task API.
        output = MatbenchDiscovery.IS2RE.local(
            model_factory=create_dummy_model,
            model_packages=["numpy", "ase"],  # Minimal deps
            num_structures=1,
            sys_path=[working_dir],
        )
        print("Local run output keys:", output.keys())
        metrics = output.get("metrics", {})
        if "error" in metrics:
            print("Local metrics error:", metrics["error"])
        else:
            print("Local run successful!")
            print("Metrics:", output.get("metrics"))
    except Exception as e:
        print(f"Local run failed: {e}")
        import traceback

        traceback.print_exc()
- - Args: - batch_id: ID of the current batch - structures: List of (id, atoms) tuples - model_config: Configuration for the model - num_threads: Number of threads to use - compute_fn: Function taking (model, atoms) and returning a result dict - task_name: Name of the task for logging - """ - import logging - import os - import time - - import torch - - # Configure thread limits to avoid contention - os.environ["OMP_NUM_THREADS"] = str(num_threads) - torch.set_num_threads(num_threads) - - gpu_id = model_config.get("gpu_id") - device = setup_device(gpu_id) # noqa: F821 - - worker_logger = logging.getLogger(f"worker_{batch_id}") - worker_logger.info( - f"Started {task_name} on {device} with {len(structures)} structures. Threads: {num_threads}" - ) - - global _MODEL_CACHE - try: - if _MODEL_CACHE is None: - model = load_model(device) # noqa: F821 - _MODEL_CACHE = model - else: - model = _MODEL_CACHE - except Exception as e: - worker_logger.error(f"Failed to initialize model: {e}") - worker_logger.error( - "Model initialization is critical - cannot continue benchmark" - ) - raise RuntimeError(f"Model initialization failed: {e}") from e - - results = {} - batch_start = time.time() - - for i, (struct_id, atoms) in enumerate(structures): - try: - # Run the specific computation - result = compute_fn(model, atoms) - results[struct_id] = result - - if (i + 1) % 10 == 0: - elapsed = time.time() - batch_start - rate = (i + 1) / elapsed if elapsed > 0 else 0 - worker_logger.info( - f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)" - ) - - except Exception as e: - worker_logger.warning(f"Structure {struct_id} failed: {e}") - results[struct_id] = {"error": str(e)} - - return results - - -def _load_dataset_common( - config: Dict[str, Any], - zip_path: str, - read_format: str = "extxyz", - read_index: str | slice = None, -) -> List[Tuple[str, Any]]: - """Common logic for loading datasets from a zip file.""" - from io import TextIOWrapper - from zipfile import ZipFile - - 
from ase.io import read - - # get_material_ids_for_subset is injected - dataset_subset = config.get("dataset_subset", "full") - dataset_seed = config.get("dataset_seed", 42) - mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed) # noqa: F821 - - structures = [] - - with ZipFile(zip_path, "r") as zf: - if mat_ids is None: - # Load all files (full dataset) - # Sort by numeric ID if possible - file_list = sorted( - zf.namelist(), - key=lambda x: int(x.split(".")[0]) - if x.split(".")[0].isdigit() - else float("inf"), - ) - num_structures = config.get("num_structures", 100) - file_list = file_list[:num_structures] - else: - # Filter to specific material IDs - mat_id_set = set(mat_ids) - file_list = [ - f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set - ] - - for filename in file_list: - with zf.open(filename) as f: - text_stream = TextIOWrapper(f, encoding="utf-8") - if read_index is not None: - atoms_list = read(text_stream, format=read_format, index=read_index) - # If we got a list and need one item, take the last one (common for trajectories) - if isinstance(atoms_list, list) and atoms_list: - structures.append((filename, atoms_list[-1])) - elif not isinstance(atoms_list, list): - structures.append((filename, atoms_list)) - else: - structures.append((filename, read(text_stream, format=read_format))) - - return structures - - -# ------------------------------------------------------------------------------ -# Injected Functions -# ------------------------------------------------------------------------------ - - -def load_model(device: str): - """Initialize the model using the user-provided factory function. - - The factory function is injected into this script by the benchmark framework. 
- """ - # Call the user's factory function (injected as load_model_user) - model = load_model_user(device) # noqa: F821 - return model - - -def get_material_ids_for_subset( - subset_type: str, seed: int = 42 -) -> Optional[List[str]]: - """Get material IDs for a specific dataset subset. - - Args: - subset_type: One of 'full', 'unique_protos', 'random_10k', 'random_100' - seed: Random seed for sampling (default: 42) - - Returns: - List of material IDs, or None for 'full' (load all) - """ - if subset_type == "full": - return None # Load all materials - - import pandas as pd - from matbench_discovery.data import DataFiles - - # Load wbm_summary - df = pd.read_csv(DataFiles.wbm_summary.path) - - if subset_type == "unique_protos": - # Filter to unique prototypes (removes duplicates and MP overlaps) - df_filtered = df.query("unique_prototype") - return df_filtered["material_id"].tolist() - - elif subset_type == "random_10k": - # Random sample of 10k unique prototypes (fixed seed for reproducibility) - df_filtered = df.query("unique_prototype") - df_sampled = df_filtered.sample(n=10000, random_state=seed) - return df_sampled["material_id"].tolist() - - elif subset_type == "random_100": - # Random sample of 100 unique prototypes (fixed seed for reproducibility) - # Useful for quick end-to-end testing - df_filtered = df.query("unique_prototype") - df_sampled = df_filtered.sample(n=100, random_state=seed) - return df_sampled["material_id"].tolist() - - else: - raise ValueError(f"Unknown subset_type: {subset_type}") - - -def process_batch_relaxation( - batch_id: int, - structures: List[Tuple[str, Any]], - model_config: Dict[str, Any], - num_threads: int, -) -> Dict[str, Any]: - """Process a batch of structures for IS2RE (Relaxation).""" - from ase.optimize import FIRE - - def compute(model, atoms): - atoms.calc = model - opt = FIRE(atoms, logfile=None) - opt.run(fmax=0.05, steps=500) - energy = atoms.get_potential_energy() - return {"energy": energy} - - return 
_process_batch_common( - batch_id, structures, model_config, num_threads, compute, "relaxation" - ) - - -def process_batch_static( - batch_id: int, - structures: List[Tuple[str, Any]], - model_config: Dict[str, Any], - num_threads: int, -) -> Dict[str, Any]: - """Process a batch of structures for RS2RE (Static Calculation).""" - - def compute(model, atoms): - atoms.calc = model - energy = atoms.get_potential_energy() - return {"energy": energy} - - return _process_batch_common( - batch_id, structures, model_config, num_threads, compute, "static calculation" - ) - - -def process_batch_forces( - batch_id: int, - structures: List[Tuple[str, Any]], - model_config: Dict[str, Any], - num_threads: int, -) -> Dict[str, Any]: - """Process a batch of structures for S2EFS (Energy, Forces, Stress).""" - - def compute(model, atoms): - atoms.calc = model - energy = atoms.get_potential_energy() - forces = atoms.get_forces().tolist() - stress = atoms.get_stress().tolist() - return {"energy": energy, "forces": forces, "stress": stress} - - return _process_batch_common( - batch_id, structures, model_config, num_threads, compute, "forces calculation" - ) - - -def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Tuple[str, Any]]: - """Load initial structures for IS2RE.""" - from matbench_discovery.data import DataFiles - - return _load_dataset_common(config, DataFiles.wbm_initial_atoms.path) - - -def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Tuple[str, Any]]: - """Load relaxed structures for RS2RE.""" - from matbench_discovery.data import DataFiles - - return _load_dataset_common(config, DataFiles.wbm_relaxed_atoms.path) - - -def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Tuple[str, Any]]: - """Load MP trajectories for S2EFS.""" - from matbench_discovery.data import DataFiles - - # Use index=":" to read all frames, but _load_dataset_common handles taking the last one - return _load_dataset_common(config, DataFiles.mp_trj_extxyz.path, read_index=":") - - 
-def calculate_metrics_energy( - results: Dict[str, Any], config: Dict[str, Any] -) -> Dict[str, Any]: - """Calculate energy metrics using matbench-discovery's stable_metrics algorithm. - - Uses the injected stable_metrics function. - Returns: F1, DAF, Precision, Recall, Accuracy, TPR, FPR, TNR, FNR, TP, FP, TN, FN, MAE, RMSE, R2 - """ - import logging - - import numpy as np - - logger = logging.getLogger("metrics") - - # Results format: {id: {"energy": float, "error": str}} - if len(results) == 0: - return {"error": "No results to evaluate"} - - try: - # Import matbench-discovery data - from matbench_discovery.data import df_wbm - except Exception as e: - return {"error": f"Failed to import matbench-discovery: {e}"} - - # Extract model energies - model_energies = {} - for sid, res in results.items(): - if isinstance(res, dict) and res.get("energy") is not None: - mat_id = sid.replace(".extxyz", "") - model_energies[mat_id] = res["energy"] - - if not model_energies: - return {"error": "No valid energies found in results"} - - # Get common IDs between predictions and ground truth - # Use direct string column names instead of MbdKey enum to avoid issues - df_wbm_indexed = df_wbm.set_index("material_id") - common_ids = list(set(model_energies.keys()) & set(df_wbm_indexed.index)) - - if not common_ids: - return {"error": "No matching IDs between results and ground truth"} - - # Get subset of data - df_subset = df_wbm_indexed.loc[common_ids] - - # Calculate predicted formation energies - y_pred = np.array([model_energies[mid] for mid in common_ids]) - y_true = df_subset["uncorrected_energy"].values # Uncorrected total energy - n_atoms = df_subset["n_sites"].values - - # Predicted formation energy ERROR per atom (from total energy difference) - # This is the ERROR: (E_pred - E_dft) / n_atoms - e_form_error = (y_pred - y_true) / n_atoms - - # Get ground truth e_above_hull for stability classification - each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values - 
- # Calculate predicted e_above_hull - # Since e_form_error is already (E_pred - E_dft)/n_atoms, we just add it to each_true - each_pred = each_true + e_form_error - - # Debug logging to understand the distribution - logger.info("Energy statistics:") - logger.info( - f" each_true: min={each_true.min():.4f}, max={each_true.max():.4f}, mean={each_true.mean():.4f}" - ) - logger.info( - f" each_pred: min={each_pred.min():.4f}, max={each_pred.max():.4f}, mean={each_pred.mean():.4f}" - ) - - # Calculate global prevalence for DAF normalization (matches official leaderboard) - # Filter to unique prototypes - df_unique = df_wbm.query("unique_prototype") - # Calculate prevalence: (stable count) / (total count) - # Stability threshold is 0.0 - stable_count = (df_unique["e_above_hull_mp2020_corrected_ppd_mp"] <= 0).sum() - global_prevalence = stable_count / len(df_unique) - - logger.info( - f"Using global prevalence for DAF: {global_prevalence:.6f} ({stable_count}/{len(df_unique)})" - ) - - # Calculate metrics using the injected function - # stable_metrics is injected into the script scope - metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence) # noqa: F821 - - # Add num_evaluated - metrics["num_evaluated"] = len(common_ids) - - return metrics - - -def calculate_metrics_forces( - results: Dict[str, Any], config: Dict[str, Any] -) -> Dict[str, Any]: - """Calculate comprehensive S2EFS metrics (Energy, Forces, Stress). - - Returns MAE, RMSE, and R2 for each component. - """ - from io import TextIOWrapper - from zipfile import ZipFile - - import numpy as np - from ase.io import read - from matbench_discovery.data import DataFiles - from sklearn.metrics import r2_score - - # We need to load ground truth from the dataset itself because MP trj has E/F/S in the extxyz - # This is expensive to re-read. Ideally we should have passed GT in results or loaded it efficiently. - # For now, let's re-read the GT for the processed IDs. 
- - metrics = { - "energy_mae": [], - "energy_rmse": [], - "force_mae": [], - "force_rmse": [], - "stress_mae": [], - "stress_rmse": [], - } - - # Collect all predictions and ground truth for R2 calculation - all_e_pred, all_e_true = [], [] - all_f_pred, all_f_true = [], [] - all_s_pred, all_s_true = [], [] - - zip_path = DataFiles.mp_trj_extxyz.path - - with ZipFile(zip_path, "r") as zf: - for sid, res in results.items(): - if "error" in res: - continue - - try: - with zf.open(sid) as f: - text_stream = TextIOWrapper(f, encoding="utf-8") - atoms_list = read(text_stream, format="extxyz", index=":") - gt_atoms = atoms_list[-1] # Matching load_dataset logic - - # Energy (per atom) - e_pred = res["energy"] - e_true = gt_atoms.get_potential_energy() - n_atoms = len(gt_atoms) - - energy_error = abs(e_pred - e_true) / n_atoms - metrics["energy_mae"].append(energy_error) - metrics["energy_rmse"].append(energy_error**2) - - all_e_pred.append(e_pred / n_atoms) - all_e_true.append(e_true / n_atoms) - - # Forces - f_pred = np.array(res["forces"]) - f_true = gt_atoms.get_forces() - force_error = np.abs(f_pred - f_true) - metrics["force_mae"].append(force_error.mean()) - metrics["force_rmse"].append((force_error**2).mean()) - - all_f_pred.extend(f_pred.flatten()) - all_f_true.extend(f_true.flatten()) - - # Stress - s_pred = np.array(res["stress"]) - s_true = gt_atoms.get_stress() - stress_error = np.abs(s_pred - s_true) - metrics["stress_mae"].append(stress_error.mean()) - metrics["stress_rmse"].append((stress_error**2).mean()) - - all_s_pred.extend(s_pred.flatten()) - all_s_true.extend(s_true.flatten()) - - except Exception: - pass - - # Calculate final metrics - result_metrics = {} - - if metrics["energy_mae"]: - result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"])) - result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"]))) - result_metrics["energy_r2"] = ( - float(r2_score(all_e_true, all_e_pred)) - if len(all_e_true) > 1 - else 
float("nan") - ) - - if metrics["force_mae"]: - result_metrics["force_mae"] = float(np.mean(metrics["force_mae"])) - result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"]))) - result_metrics["force_r2"] = ( - float(r2_score(all_f_true, all_f_pred)) - if len(all_f_true) > 1 - else float("nan") - ) - - if metrics["stress_mae"]: - result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"])) - result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"]))) - result_metrics["stress_r2"] = ( - float(r2_score(all_s_true, all_s_pred)) - if len(all_s_true) > 1 - else float("nan") - ) - - result_metrics["num_evaluated"] = len(metrics["energy_mae"]) - - return result_metrics diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index e26045d6..b130912b 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -1,161 +1,794 @@ -"""Matbench Discovery benchmark task implementations.""" +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "groundhog-hpc", +# "garden-ai", +# "ase", +# "numpy", +# "pandas", +# "scikit-learn", +# "torch", +# "matbench-discovery", +# ] +# /// +"""Matbench Discovery benchmark task implementations using Groundhog HPC.""" from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, List +import concurrent.futures +import json +import logging +import multiprocessing +import os +import sys +import time +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence -from ..utils.remote_execution import run_remote_benchmark -from ..utils.script_builder import BenchmarkScriptBuilder -from ..utils.task import BaseBenchmarkTask +import groundhog_hpc as hog +import numpy as np +import pandas as pd +from sklearn.metrics import r2_score + +# Ensure local modules can be imported during local execution +sys.path.append(os.getcwd()) if TYPE_CHECKING: - 
from . import MatbenchDiscovery from .enums import DatasetConfig, DatasetSize -from .metrics import classify_stable, stable_metrics - # ------------------------------------------------------------------------------ -# REMOTE FUNCTIONS -# These functions are injected into the remote script. -# They must be self-contained (imports inside or provided by builder). +# BOILERPLATE: Logging & Device Setup # ------------------------------------------------------------------------------ + + +def setup_logging(): + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] [PID:%(process)d] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + stream=sys.stdout, + force=True, + ) + return logging.getLogger("benchmark_runner") + + +def setup_device(gpu_id: Optional[int] = None) -> str: + """Setup compute device for this process.""" + try: + import torch + + if torch.cuda.is_available(): + return f"cuda:{gpu_id}" if gpu_id is not None else "cuda" + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return "mps" + except ImportError: + pass + return "cpu" + + +def convert_numpy_types(obj): + """Convert numpy types to Python native types for JSON serialization.""" + import numpy as np + + if isinstance(obj, (np.integer, np.floating)): + return obj.item() + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {k: convert_numpy_types(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_numpy_types(item) for item in obj] + return obj + + # ------------------------------------------------------------------------------ -# REMOTE FUNCTIONS -# These functions are injected into the remote script. -# They are now imported from remote.py to keep this file clean. 
+# METRICS HELPERS (Inlined from metrics.py) # ------------------------------------------------------------------------------ -from .remote import ( - _load_dataset_common, - _process_batch_common, - calculate_metrics_energy, - calculate_metrics_forces, - get_material_ids_for_subset, - load_dataset_mp_trj, - load_dataset_wbm_initial, - load_dataset_wbm_relaxed, - load_model, - process_batch_forces, - process_batch_relaxation, - process_batch_static, -) + + +def classify_stable( + each_true: Sequence[float] | pd.Series | np.ndarray, + each_pred: Sequence[float] | pd.Series | np.ndarray, + *, + stability_threshold: float = 0.0, + fillna: bool = True, +) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]: + if len(each_true) != len(each_pred): + raise ValueError(f"{len(each_true)=} != {len(each_pred)=}") + + each_true_arr, each_pred_arr = pd.Series(each_true), pd.Series(each_pred) + + if stability_threshold is None or np.isnan(stability_threshold): + raise ValueError("stability_threshold must be a real number") + actual_pos = each_true_arr <= (stability_threshold or 0) + actual_neg = each_true_arr > (stability_threshold or 0) + + model_pos = each_pred_arr <= (stability_threshold or 0) + model_neg = each_pred_arr > (stability_threshold or 0) + + if fillna: + nan_mask = np.isnan(each_pred) + model_pos[nan_mask] = False + model_neg[nan_mask] = True + + n_pos, n_neg, total = model_pos.sum(), model_neg.sum(), len(each_pred) + if n_pos + n_neg != total: + raise ValueError( + f"after filling NaNs, the sum of positive ({n_pos}) and negative " + f"({n_neg}) predictions should add up to {total=}" + ) + + true_pos = actual_pos & model_pos + false_neg = actual_pos & model_neg + false_pos = actual_neg & model_pos + true_neg = actual_neg & model_neg + + return true_pos, false_neg, false_pos, true_neg + + +def stable_metrics( + each_true: Sequence[float] | pd.Series | np.ndarray, + each_pred: Sequence[float] | pd.Series | np.ndarray, + *, + stability_threshold: float = 0.0, + 
fillna: bool = True, + prevalence: float | None = None, +) -> dict[str, float]: + n_true_pos, n_false_neg, n_false_pos, n_true_neg = map( + sum, + classify_stable( + each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna + ), + ) + + n_total_pos = n_true_pos + n_false_neg + n_total_neg = n_true_neg + n_false_pos + if prevalence is None: + prevalence = ( + n_total_pos / (n_total_pos + n_total_neg) + if (n_total_pos + n_total_neg) > 0 + else float("nan") + ) + precision = ( + n_true_pos / (n_true_pos + n_false_pos) + if (n_true_pos + n_false_pos) > 0 + else float("nan") + ) + recall = n_true_pos / n_total_pos if n_total_pos > 0 else float("nan") + + TPR = recall + FPR = n_false_pos / n_total_neg if n_total_neg > 0 else float("nan") + TNR = n_true_neg / n_total_neg if n_total_neg > 0 else float("nan") + FNR = n_false_neg / n_total_pos if n_total_pos > 0 else float("nan") + + if FPR > 0 and TNR > 0 and FPR + TNR != 1: + if abs(FPR + TNR - 1) > 1e-6: + raise ValueError(f"{FPR=} {TNR=} don't add up to 1") + + if TPR > 0 and FNR > 0 and TPR + FNR != 1: + if abs(TPR + FNR - 1) > 1e-6: + raise ValueError(f"{TPR=} {FNR=} don't add up to 1") + + is_nan = np.isnan(each_true) | np.isnan(each_pred) + each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan] + + if precision + recall == 0: + f1_score = float("nan") + else: + f1_score = 2 * (precision * recall) / (precision + recall) + + return dict( + F1=f1_score, + DAF=precision / prevalence if prevalence > 0 else float("nan"), + Precision=precision, + Recall=recall, + Accuracy=( + (n_true_pos + n_true_neg) / (n_total_pos + n_total_neg) + if (n_total_pos + n_total_neg > 0) + else float("nan") + ), + TPR=TPR, + FPR=FPR, + TNR=TNR, + FNR=FNR, + TP=n_true_pos, + FP=n_false_pos, + TN=n_true_neg, + FN=n_false_neg, + MAE=np.abs(each_true - each_pred).mean(), + RMSE=((each_true - each_pred) ** 2).mean() ** 0.5, + R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"), + ) 
+ # ------------------------------------------------------------------------------ -# Task Classes +# REMOTE HELPERS (Inlined from remote.py) # ------------------------------------------------------------------------------ +_MODEL_CACHE = None -class MatbenchTask(BaseBenchmarkTask): - """Base class for Matbench Discovery tasks.""" - def __init__( - self, - adapter: "MatbenchDiscovery", - repo_url: str, - repo_ref: str, - model_package: str | None = None, - task_name: str = "unknown", - ): - super().__init__(adapter, repo_url, repo_ref, model_package) - self.name = task_name +def _process_batch_common( + batch_id: int, + structures: List[Any], + model_config: Dict[str, Any], + num_threads: int, + compute_fn: Callable[[Any, Any], Dict[str, Any]], + task_name: str, + model_factory: Callable[[str], Any], +) -> Dict[str, Any]: + import logging + import os + import time - def calculate_metrics(self, output: Dict[str, Any]) -> Dict[str, Any]: - """Retrieve metrics from the remote output.""" - return output.get("metrics", {}) + import torch - def _build_script( - self, process_fn, load_dataset_fn, calc_metrics_fn, model_factory - ) -> str: - """Build the remote execution script with specific functions. - - Args: - process_fn: Task-specific process_batch function - load_dataset_fn: Task-specific load_dataset function - calc_metrics_fn: Task-specific calculate_metrics function - model_factory: User-provided function that creates the model - """ - builder = BenchmarkScriptBuilder() - - # Add global model cache - builder.add_preamble("_MODEL_CACHE = None") - - # Common imports - builder.add_import( - "from typing import List, Dict, Any, Tuple, Optional, Callable" + os.environ["OMP_NUM_THREADS"] = str(num_threads) + torch.set_num_threads(num_threads) + + gpu_id = model_config.get("gpu_id") + device = setup_device(gpu_id) + + worker_logger = logging.getLogger(f"worker_{batch_id}") + worker_logger.info( + f"Started {task_name} on {device} with {len(structures)} structures. 
Threads: {num_threads}" + ) + + global _MODEL_CACHE + try: + if _MODEL_CACHE is None: + model = model_factory(device) + _MODEL_CACHE = model + else: + model = _MODEL_CACHE + except Exception as e: + worker_logger.error(f"Failed to initialize model: {e}") + raise RuntimeError(f"Model initialization failed: {e}") from e + + results = {} + batch_start = time.time() + + for i, (struct_id, atoms) in enumerate(structures): + try: + result = compute_fn(model, atoms) + results[struct_id] = result + + if (i + 1) % 10 == 0: + elapsed = time.time() - batch_start + rate = (i + 1) / elapsed if elapsed > 0 else 0 + worker_logger.info( + f"Progress: {i + 1}/{len(structures)} ({rate:.2f} struct/s)" + ) + + except Exception as e: + worker_logger.warning(f"Structure {struct_id} failed: {e}") + results[struct_id] = {"error": str(e)} + + return results + + +def get_material_ids_for_subset( + subset_type: str, seed: int = 42 +) -> Optional[List[str]]: + if subset_type == "full": + return None + + import pandas as pd + from matbench_discovery.data import DataFiles + + df = pd.read_csv(DataFiles.wbm_summary.path) + + if subset_type == "unique_protos": + df_filtered = df.query("unique_prototype") + return df_filtered["material_id"].tolist() + + elif subset_type == "random_10k": + df_filtered = df.query("unique_prototype") + df_sampled = df_filtered.sample(n=10000, random_state=seed) + return df_sampled["material_id"].tolist() + + elif subset_type == "random_100": + df_filtered = df.query("unique_prototype") + df_sampled = df_filtered.sample(n=100, random_state=seed) + return df_sampled["material_id"].tolist() + + else: + raise ValueError(f"Unknown subset_type: {subset_type}") + + +def _load_dataset_common( + config: Dict[str, Any], + zip_path: str, + read_format: str = "extxyz", + read_index: str | slice = None, +) -> List[Any]: + from io import TextIOWrapper + from zipfile import ZipFile + + from ase.io import read + + dataset_subset = config.get("dataset_subset", "full") + dataset_seed 
= config.get("dataset_seed", 42) + mat_ids = get_material_ids_for_subset(dataset_subset, seed=dataset_seed) + + structures = [] + + with ZipFile(zip_path, "r") as zf: + if mat_ids is None: + file_list = sorted( + zf.namelist(), + key=lambda x: int(x.split(".")[0]) + if x.split(".")[0].isdigit() + else float("inf"), + ) + num_structures = config.get("num_structures", 100) + if isinstance(num_structures, int): + file_list = file_list[:num_structures] + else: + mat_id_set = set(mat_ids) + file_list = [ + f for f in zf.namelist() if f.replace(".extxyz", "") in mat_id_set + ] + + for filename in file_list: + with zf.open(filename) as f: + text_stream = TextIOWrapper(f, encoding="utf-8") + if read_index is not None: + atoms_list = read(text_stream, format=read_format, index=read_index) + if isinstance(atoms_list, list) and atoms_list: + structures.append((filename, atoms_list[-1])) + elif not isinstance(atoms_list, list): + structures.append((filename, atoms_list)) + else: + structures.append((filename, read(text_stream, format=read_format))) + + return structures + + +# Task-specific helpers +def process_batch_relaxation( + batch_id: int, + structures: List[Any], + model_config: Dict[str, Any], + num_threads: int, + model_factory: Callable[[str], Any], +) -> Dict[str, Any]: + from ase.optimize import FIRE + + def compute(model, atoms): + atoms.calc = model + opt = FIRE(atoms, logfile=None) + opt.run(fmax=0.05, steps=500) + energy = atoms.get_potential_energy() + return {"energy": energy} + + return _process_batch_common( + batch_id, + structures, + model_config, + num_threads, + compute, + "relaxation", + model_factory, + ) + + +def process_batch_static( + batch_id: int, + structures: List[Any], + model_config: Dict[str, Any], + num_threads: int, + model_factory: Callable[[str], Any], +) -> Dict[str, Any]: + def compute(model, atoms): + atoms.calc = model + energy = atoms.get_potential_energy() + return {"energy": energy} + + return _process_batch_common( + batch_id, + 
structures, + model_config, + num_threads, + compute, + "static calculation", + model_factory, + ) + + +def process_batch_forces( + batch_id: int, + structures: List[Any], + model_config: Dict[str, Any], + num_threads: int, + model_factory: Callable[[str], Any], +) -> Dict[str, Any]: + def compute(model, atoms): + atoms.calc = model + energy = atoms.get_potential_energy() + forces = atoms.get_forces().tolist() + stress = atoms.get_stress().tolist() + return {"energy": energy, "forces": forces, "stress": stress} + + return _process_batch_common( + batch_id, + structures, + model_config, + num_threads, + compute, + "forces calculation", + model_factory, + ) + + +def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Any]: + from matbench_discovery.data import DataFiles + + return _load_dataset_common(config, DataFiles.wbm_initial_atoms.path) + + +def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Any]: + from matbench_discovery.data import DataFiles + + return _load_dataset_common(config, DataFiles.wbm_relaxed_atoms.path) + + +def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]: + from matbench_discovery.data import DataFiles + + return _load_dataset_common(config, DataFiles.mp_trj_extxyz.path, read_index=":") + + +def calculate_metrics_energy( + results: Dict[str, Any], config: Dict[str, Any] +) -> Dict[str, Any]: + import numpy as np + from matbench_discovery.data import df_wbm + + if len(results) == 0: + return {"error": "No results to evaluate"} + + model_energies = {} + for sid, res in results.items(): + if isinstance(res, dict) and res.get("energy") is not None: + mat_id = sid.replace(".extxyz", "") + model_energies[mat_id] = res["energy"] + + if not model_energies: + return {"error": "No valid energies found in results"} + + df_wbm_indexed = df_wbm.set_index("material_id") + common_ids = list(set(model_energies.keys()) & set(df_wbm_indexed.index)) + + if not common_ids: + return {"error": "No matching IDs between results and ground 
truth"} + + df_subset = df_wbm_indexed.loc[common_ids] + y_pred = np.array([model_energies[mid] for mid in common_ids]) + y_true = df_subset["uncorrected_energy"].values + n_atoms = df_subset["n_sites"].values + + e_form_error = (y_pred - y_true) / n_atoms + each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values + each_pred = each_true + e_form_error + + df_unique = df_wbm.query("unique_prototype") + stable_count = (df_unique["e_above_hull_mp2020_corrected_ppd_mp"] <= 0).sum() + global_prevalence = stable_count / len(df_unique) + + metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence) + metrics["num_evaluated"] = len(common_ids) + return metrics + + +def calculate_metrics_forces( + results: Dict[str, Any], config: Dict[str, Any] +) -> Dict[str, Any]: + from io import TextIOWrapper + from zipfile import ZipFile + + import numpy as np + from ase.io import read + from matbench_discovery.data import DataFiles + from sklearn.metrics import r2_score + + metrics = { + "energy_mae": [], + "energy_rmse": [], + "force_mae": [], + "force_rmse": [], + "stress_mae": [], + "stress_rmse": [], + } + all_e_pred, all_e_true = [], [] + all_f_pred, all_f_true = [], [] + all_s_pred, all_s_true = [], [] + + zip_path = DataFiles.mp_trj_extxyz.path + + with ZipFile(zip_path, "r") as zf: + for sid, res in results.items(): + if "error" in res: + continue + try: + with zf.open(sid) as f: + text_stream = TextIOWrapper(f, encoding="utf-8") + atoms_list = read(text_stream, format="extxyz", index=":") + gt_atoms = atoms_list[-1] + + e_pred = res["energy"] + e_true = gt_atoms.get_potential_energy() + n_atoms = len(gt_atoms) + energy_error = abs(e_pred - e_true) / n_atoms + metrics["energy_mae"].append(energy_error) + metrics["energy_rmse"].append(energy_error**2) + all_e_pred.append(e_pred / n_atoms) + all_e_true.append(e_true / n_atoms) + + f_pred = np.array(res["forces"]) + f_true = gt_atoms.get_forces() + force_error = np.abs(f_pred - f_true) + 
metrics["force_mae"].append(force_error.mean()) + metrics["force_rmse"].append((force_error**2).mean()) + all_f_pred.extend(f_pred.flatten()) + all_f_true.extend(f_true.flatten()) + + s_pred = np.array(res["stress"]) + s_true = gt_atoms.get_stress() + stress_error = np.abs(s_pred - s_true) + metrics["stress_mae"].append(stress_error.mean()) + metrics["stress_rmse"].append((stress_error**2).mean()) + all_s_pred.extend(s_pred.flatten()) + all_s_true.extend(s_true.flatten()) + + except Exception: + pass + + result_metrics = {} + if metrics["energy_mae"]: + result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"])) + result_metrics["energy_rmse"] = float(np.sqrt(np.mean(metrics["energy_rmse"]))) + result_metrics["energy_r2"] = ( + float(r2_score(all_e_true, all_e_pred)) + if len(all_e_true) > 1 + else float("nan") + ) + + if metrics["force_mae"]: + result_metrics["force_mae"] = float(np.mean(metrics["force_mae"])) + result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"]))) + result_metrics["force_r2"] = ( + float(r2_score(all_f_true, all_f_pred)) + if len(all_f_true) > 1 + else float("nan") ) - builder.add_import("import torch") - builder.add_import("from ase.optimize import FIRE") - builder.add_import("from ase.io import read") - builder.add_import("from matbench_discovery.data import DataFiles") - builder.add_import("from zipfile import ZipFile") - builder.add_import("from io import TextIOWrapper") - builder.add_import("import pandas as pd") - builder.add_import("import numpy as np") - builder.add_import("from collections.abc import Sequence") - builder.add_import("from sklearn.metrics import r2_score") - # Add user's model factory (renamed to load_model_user so load_model can call it) - builder.add_function(model_factory, name="load_model_user") + if metrics["stress_mae"]: + result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"])) + result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"]))) + 
result_metrics["stress_r2"] = ( + float(r2_score(all_s_true, all_s_pred)) + if len(all_s_true) > 1 + else float("nan") + ) + + result_metrics["num_evaluated"] = len(metrics["energy_mae"]) + return result_metrics + + +# ------------------------------------------------------------------------------ +# MAIN RUNNER (Inlined from runners.py) +# ------------------------------------------------------------------------------ - # Add our load_model wrapper that calls load_model_user - builder.add_function(load_model) - # Add helper function for dataset subset filtering - builder.add_function(get_material_ids_for_subset) +def run_benchmark_hog( + config: Dict[str, Any], + model_factory: Any, + load_dataset_fn: Any, + process_fn: Any, + calc_metrics_fn: Any, +) -> Dict[str, Any]: + logger = setup_logging() + logger.info("Starting benchmark runner...") + + checkpoint_path = config.get("checkpoint_path") + results = {} + + if checkpoint_path and os.path.exists(checkpoint_path): + logger.info(f"Loading checkpoint from {checkpoint_path}") + try: + with open(checkpoint_path) as f: + results = json.load(f) + logger.info(f"Found {len(results)} processed items in checkpoint") + except Exception as e: + logger.warning(f"Failed to load checkpoint: {e}. 
Starting fresh.") + + try: + all_items = load_dataset_fn(config) + logger.info(f"Loaded {len(all_items)} total items") + except Exception as e: + logger.error(f"Failed to load dataset: {e}") + import traceback + + traceback.print_exc() + raise + + items_to_process = [ + (item_id, item) for item_id, item in all_items if str(item_id) not in results + ] + + if not items_to_process: + logger.info("All items already processed!") + return {"results": results, "metrics": {}} + + logger.info(f"Processing {len(items_to_process)} remaining items") + + import random + + random.seed(42) + random.shuffle(items_to_process) + + try: + import torch + + num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 + except ImportError: + num_gpus = 0 + + use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1 + total_cores = os.cpu_count() or 1 + num_workers = num_gpus if use_multi_gpu else 1 + available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores + threads_per_worker = max(1, available_cores // num_workers) + + logger.info( + f"Resources: {num_gpus} GPUs, {total_cores} Cores. 
Using {num_workers} workers ({threads_per_worker} threads/worker)" + ) + + start_time = time.time() + chunk_size = 1000 * num_workers + chunks = [ + items_to_process[i : i + chunk_size] + for i in range(0, len(items_to_process), chunk_size) + ] + + ctx = multiprocessing.get_context("spawn") + + with concurrent.futures.ProcessPoolExecutor( + max_workers=num_workers, mp_context=ctx + ) as executor: + for chunk_idx, chunk in enumerate(chunks): + chunk_start = time.time() + logger.info( + f"Starting chunk {chunk_idx + 1}/{len(chunks)} ({len(chunk)} items)" + ) - # Add common helpers - builder.add_function(_process_batch_common) - builder.add_function(_load_dataset_common) + futures = [] + batch_size = (len(chunk) + num_workers - 1) // num_workers + + for i in range(num_workers): + start = i * batch_size + end = min((i + 1) * batch_size, len(chunk)) + if start < end: + batch = chunk[start:end] + worker_config = config.copy() + worker_config["gpu_id"] = i if use_multi_gpu else None + futures.append( + executor.submit( + process_fn, + i, + batch, + worker_config, + threads_per_worker, + model_factory, + ) + ) + + chunk_results = {} + for future in concurrent.futures.as_completed(futures): + try: + batch_res = future.result() + chunk_results.update(batch_res) + except Exception as e: + logger.error(f"Worker failed in chunk {chunk_idx}: {e}") + raise RuntimeError( + "Aborting benchmark due to worker failure" + ) from e + + results.update(chunk_results) + + if checkpoint_path: + try: + tmp_path = checkpoint_path + ".tmp" + with open(tmp_path, "w") as f: + clean_results = convert_numpy_types(results) + json.dump(clean_results, f, indent=2) + os.replace(tmp_path, checkpoint_path) + logger.info(f"Checkpoint saved to {checkpoint_path}") + except Exception as e: + logger.error(f"Failed to save checkpoint: {e}") + + elapsed = time.time() - chunk_start + logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s") + + total_elapsed = time.time() - start_time + 
logger.info(f"Benchmark complete in {total_elapsed:.1f}s.") + + logger.info("Calculating metrics...") + try: + metrics = calc_metrics_fn(results, config) + logger.info(f"Metrics calculated: {metrics}") + except Exception as e: + logger.error(f"Failed to calculate metrics: {e}") + import traceback + + traceback.print_exc() + metrics = {"error": f"Metrics calculation failed: {e}"} + + output = {"results": results, "metrics": metrics} + output = convert_numpy_types(output) + return output - # Add task-specific functions with standard names expected by runner - builder.add_function(process_fn, name="process_batch") - builder.add_function(load_dataset_fn, name="load_dataset") - builder.add_function(calc_metrics_fn, name="calculate_metrics_remote") - # Inject metrics helper functions - builder.add_function(classify_stable) - builder.add_function(stable_metrics) +# ------------------------------------------------------------------------------ +# CLASS DEFINITION +# ------------------------------------------------------------------------------ - return builder.build() +class MatbenchDiscovery: + """Matbench Discovery tasks using Groundhog HPC.""" + + REPO_URL = "https://github.com/janosh/matbench-discovery" + REPO_REF = "main" + + @staticmethod def _prepare_runner_config( - self, num_structures: int | "DatasetSize" | "DatasetConfig" + num_structures: int | "DatasetSize" | "DatasetConfig", + repo_url: str = REPO_URL, + repo_ref: str = REPO_REF, ) -> Dict[str, Any]: """Prepare the runner configuration based on num_structures.""" - from .enums import DatasetConfig, DatasetSize - - if isinstance(num_structures, DatasetSize): + # Need to handle DatasetSize/Config which might be passed as objects or values + # Since we are in the script, we might not have the enums imported if they are not in this file. + # But the user passes them. + # If they are passed as arguments, they are serialized. + # We need to extract value. + + # Simple heuristic: if it has 'value' attr, use it. 
+ subset = "full" + seed = 42 + + if hasattr(num_structures, "value"): # Enum + subset = num_structures.value + # Check for seed method/attr if it's our custom Config + if hasattr(num_structures, "seed"): + if callable(num_structures.seed): + pass # It's the method + else: + seed = num_structures.seed + elif hasattr(num_structures, "subset"): # DatasetConfig + subset = num_structures.subset.value + seed = num_structures.seed + elif isinstance(num_structures, int): + subset = "full" + # We handle int as limit in load_dataset return { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, - "dataset_subset": num_structures.value, - "dataset_seed": 42, - } - elif isinstance(num_structures, DatasetConfig): - return { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, - "dataset_subset": num_structures.subset.value, - "dataset_seed": num_structures.seed, - } - else: - return { - "repo_url": self.repo_url, - "repo_ref": self.repo_ref, + "repo_url": repo_url, + "repo_ref": repo_ref, "num_structures": num_structures, "dataset_subset": "full", } - def _prepare_dependencies(self, model_packages: str | List[str]) -> List[str]: - """Prepare the list of dependencies.""" - packages = ( - [model_packages] if isinstance(model_packages, str) else model_packages - ) - return ["matbench-discovery>=1.3.0"] + packages + return { + "repo_url": repo_url, + "repo_ref": repo_ref, + "dataset_subset": subset, + "dataset_seed": seed, + } + @staticmethod def _generate_checkpoint_name( - self, model_packages: str | List[str], runner_config: Dict[str, Any] + model_packages: str | List[str], runner_config: Dict[str, Any] ) -> str: - """Generate a unique checkpoint name.""" import time import uuid @@ -173,42 +806,30 @@ def _generate_checkpoint_name( short_uuid = str(uuid.uuid4())[:8] return f"matbench_{model_str}_{subset_str}_{timestamp}_{short_uuid}.json" - def submit( - self, - model_factory: callable, + @staticmethod + def _run_task( + model_factory: Any, model_packages: str | 
List[str], - num_structures: int | "DatasetSize" | "DatasetConfig" = 100, - checkpoint_name: str | None = None, - checkpoint_path: str | None = None, - ): - """Submit benchmark job to remote executor. - - Args: - model_factory: User-provided function that takes device and returns an ASE calculator. - Example: lambda device: mace_mp(model="medium", device=device) - model_packages: Python package(s) to install. Can be a single package string - (e.g., "mace-torch") or a list (e.g., ["torch", "mace-torch"]) - num_structures: Number of structures to evaluate, or DatasetSize enum, or DatasetConfig - (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10)) - checkpoint_name: Optional name for the checkpoint file (e.g. "my_checkpoint.json"). - If not provided, one will be generated. - checkpoint_path: Optional path to an existing checkpoint file to resume from. - If provided, checkpoint_name is ignored and no new checkpoint is created. - """ - # Build script with task-specific functions AND user's factory - script_content = self._build_script( - self.process_fn, - self.load_dataset_fn, - self.calc_metrics_fn, - model_factory, # Inject user's factory function - ) + num_structures: int | "DatasetSize" | "DatasetConfig", + checkpoint_name: str | None, + checkpoint_path: str | None, + process_fn: Any, + load_dataset_fn: Any, + calc_metrics_fn: Any, + sys_path: List[str] | None = None, + ) -> Dict[str, Any]: + # Add custom sys.path if provided (useful for local execution/testing) + if sys_path: + import sys - dependencies = self._prepare_dependencies(model_packages) - runner_config = self._prepare_runner_config(num_structures) + for p in sys_path: + if p not in sys.path: + sys.path.append(p) + + runner_config = MatbenchDiscovery._prepare_runner_config(num_structures) - # Generate checkpoint name if not provided AND no checkpoint_path is provided if not checkpoint_name and not checkpoint_path: - checkpoint_name = self._generate_checkpoint_name( + 
checkpoint_name = MatbenchDiscovery._generate_checkpoint_name( model_packages, runner_config ) @@ -219,154 +840,124 @@ def submit( print( f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}" ) - final_checkpoint_path = f"~/.garden/benchmarks/{checkpoint_name}" - - executor = self.adapter._get_executor() - future = executor.submit( - run_remote_benchmark, - script_content=script_content, - dependencies=dependencies, - config=runner_config, - checkpoint_name=checkpoint_name, - checkpoint_path=checkpoint_path, - ) + final_checkpoint_path = os.path.expanduser( + f"~/.garden/benchmarks/{checkpoint_name}" + ) + # Ensure directory exists + os.makedirs(os.path.dirname(final_checkpoint_path), exist_ok=True) - # Attach checkpoint path to future for programmatic access - future.checkpoint_path = final_checkpoint_path + runner_config["checkpoint_path"] = final_checkpoint_path - return future + return run_benchmark_hog( + runner_config, + model_factory, + load_dataset_fn, + process_fn, + calc_metrics_fn, + ) - def local( - self, - model_factory: callable, + @hog.method() + def IS2RE( + model_factory: Any, model_packages: str | List[str], num_structures: int | "DatasetSize" | "DatasetConfig" = 100, + checkpoint_name: str | None = None, checkpoint_path: str | None = None, - ) -> dict: - """Run benchmark locally. - - Args: - model_factory: User-provided function that takes device and returns an ASE calculator - model_packages: Python package(s) to install. 
Can be a single package string - (e.g., "mace-torch") or a list (e.g., ["torch", "mace-torch"]) - num_structures: Number of structures to evaluate, or DatasetSize enum, or DatasetConfig - (DatasetSize.FULL, DatasetSize.RANDOM_10K, DatasetSize.RANDOM_10K.seed(10)) - checkpoint_path: Optional path to resume from checkpoint - """ - from ..utils.remote_execution import run_remote_benchmark - - # Build script with task-specific functions AND user's factory - script_content = self._build_script( - self.process_fn, self.load_dataset_fn, self.calc_metrics_fn, model_factory + sys_path: List[str] | None = None, + ) -> Dict[str, Any]: + """Initial Structure to Relaxed Energy.""" + return MatbenchDiscovery._run_task( + model_factory, + model_packages, + num_structures, + checkpoint_name, + checkpoint_path, + process_batch_relaxation, + load_dataset_wbm_initial, + calculate_metrics_energy, + sys_path=sys_path, ) - dependencies = self._prepare_dependencies(model_packages) - runner_config = self._prepare_runner_config(num_structures) - - # Run locally (no Globus Compute) - return run_remote_benchmark( - script_content=script_content, - dependencies=dependencies, - config=runner_config, - checkpoint_path=checkpoint_path, + @hog.method() + def RS2RE( + model_factory: Any, + model_packages: str | List[str], + num_structures: int | "DatasetSize" | "DatasetConfig" = 100, + checkpoint_name: str | None = None, + checkpoint_path: str | None = None, + sys_path: List[str] | None = None, + ) -> Dict[str, Any]: + """Relaxed Structure to Relaxed Energy.""" + return MatbenchDiscovery._run_task( + model_factory, + model_packages, + num_structures, + checkpoint_name, + checkpoint_path, + process_batch_static, + load_dataset_wbm_relaxed, + calculate_metrics_energy, + sys_path=sys_path, ) + @hog.method() + def S2EFS( + model_factory: Any, + model_packages: str | List[str], + num_structures: int | "DatasetSize" | "DatasetConfig" = 100, + checkpoint_name: str | None = None, + checkpoint_path: str | 
None = None, + sys_path: List[str] | None = None, + ) -> Dict[str, Any]: + """Structure to Energy, Forces, Stress.""" + return MatbenchDiscovery._run_task( + model_factory, + model_packages, + num_structures, + checkpoint_name, + checkpoint_path, + process_batch_forces, + load_dataset_mp_trj, + calculate_metrics_forces, + sys_path=sys_path, + ) -class IS2RETask(MatbenchTask): - """Initial Structure to Relaxed Energy.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="IS2RE", **kwargs) - self.process_fn = process_batch_relaxation - self.load_dataset_fn = load_dataset_wbm_initial - self.calc_metrics_fn = calculate_metrics_energy - - -class RS2RETask(MatbenchTask): - """Relaxed Structure to Relaxed Energy.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="RS2RE", **kwargs) - self.process_fn = process_batch_static - self.load_dataset_fn = load_dataset_wbm_relaxed - self.calc_metrics_fn = calculate_metrics_energy - - -class S2EFSTask(MatbenchTask): - """Structure to Energy, Forces, Stress.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="S2EFS", **kwargs) - self.process_fn = process_batch_forces - self.load_dataset_fn = load_dataset_mp_trj - self.calc_metrics_fn = calculate_metrics_forces - - -class S2EFTask(MatbenchTask): - """Structure to Energy, Force.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="S2EF", **kwargs) - self.process_fn = process_batch_forces - self.load_dataset_fn = load_dataset_mp_trj - self.calc_metrics_fn = calculate_metrics_forces - - -class S2EFSMTask(MatbenchTask): - """Structure to Energy, Force, Stress, Magmoms.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="S2EFSM", **kwargs) - self.process_fn = process_batch_forces - self.load_dataset_fn = load_dataset_mp_trj - self.calc_metrics_fn = calculate_metrics_forces - - -class IS2ETask(MatbenchTask): - """Initial Structure to Energy.""" - - 
def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="IS2E", **kwargs) - self.process_fn = process_batch_static - self.load_dataset_fn = load_dataset_wbm_initial - self.calc_metrics_fn = calculate_metrics_energy - - -class S2ETask(MatbenchTask): - """Structure to Energy.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="S2E", **kwargs) - self.process_fn = process_batch_static - self.load_dataset_fn = load_dataset_wbm_relaxed - self.calc_metrics_fn = calculate_metrics_energy - - -class S2RETask(MatbenchTask): - """Structure to Relaxed Energy.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="S2RE", **kwargs) - self.process_fn = process_batch_relaxation - self.load_dataset_fn = load_dataset_wbm_initial - self.calc_metrics_fn = calculate_metrics_energy - - -class RP2RETask(MatbenchTask): - """Relaxed Prototype to Relaxed Energy.""" + # Aliases + @hog.method() + def S2EF(*args, **kwargs): + return MatbenchDiscovery.S2EFS(*args, **kwargs) + + @hog.method() + def S2EFSM(*args, **kwargs): + return MatbenchDiscovery.S2EFS(*args, **kwargs) + + @hog.method() + def IS2E(*args, **kwargs): + # Same as IS2RE but static? No, IS2E is Initial Structure to Energy (Static). + # IS2RE is Relaxation. 
+ # IS2E logic: + return MatbenchDiscovery._run_task( + *args, + **kwargs, + process_fn=process_batch_static, + load_dataset_fn=load_dataset_wbm_initial, + calc_metrics_fn=calculate_metrics_energy, + ) - def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="RP2RE", **kwargs) - self.process_fn = process_batch_relaxation - self.load_dataset_fn = load_dataset_wbm_initial # Placeholder - self.calc_metrics_fn = calculate_metrics_energy + @hog.method() + def S2E(*args, **kwargs): + # Structure to Energy (Relaxed Structure to Energy) -> RS2RE + return MatbenchDiscovery.RS2RE(*args, **kwargs) + @hog.method() + def S2RE(*args, **kwargs): + # Structure to Relaxed Energy -> IS2RE + return MatbenchDiscovery.IS2RE(*args, **kwargs) -class IP2ETask(MatbenchTask): - """Initial Prototype to Energy.""" + @hog.method() + def RP2RE(*args, **kwargs): + return MatbenchDiscovery.IS2RE(*args, **kwargs) - def __init__(self, *args, **kwargs): - super().__init__(*args, task_name="IP2E", **kwargs) - self.process_fn = process_batch_static - self.load_dataset_fn = load_dataset_wbm_initial # Placeholder - self.calc_metrics_fn = calculate_metrics_energy + @hog.method() + def IP2E(*args, **kwargs): + return MatbenchDiscovery.IS2E(*args, **kwargs) diff --git a/garden_ai/benchmarks/templates/base_runner.py b/garden_ai/benchmarks/templates/base_runner.py deleted file mode 100644 index 60ed80d6..00000000 --- a/garden_ai/benchmarks/templates/base_runner.py +++ /dev/null @@ -1,248 +0,0 @@ -import concurrent.futures -import json -import logging -import multiprocessing -import os -import sys -import time -from typing import Optional - -# ------------------------------------------------------------------------------ -# BOILERPLATE: Logging & Device Setup -# ------------------------------------------------------------------------------ - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] [PID:%(process)d] %(message)s", - 
datefmt="%Y-%m-%d %H:%M:%S", - stream=sys.stdout, - force=True, -) -logger = logging.getLogger("benchmark_runner") - - -def setup_device(gpu_id: Optional[int] = None) -> str: - """Setup compute device for this process.""" - try: - import torch - - if torch.cuda.is_available(): - return f"cuda:{gpu_id}" if gpu_id is not None else "cuda" - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - return "mps" - except ImportError: - pass - return "cpu" - - -def convert_numpy_types(obj): - """Convert numpy types to Python native types for JSON serialization.""" - import numpy as np - - if isinstance(obj, (np.integer, np.floating)): - return obj.item() - elif isinstance(obj, np.ndarray): - return obj.tolist() - elif isinstance(obj, dict): - return {k: convert_numpy_types(v) for k, v in obj.items()} - elif isinstance(obj, list): - return [convert_numpy_types(item) for item in obj] - return obj - - -# ------------------------------------------------------------------------------ -# USER DEFINED FUNCTIONS (Injected) -# ------------------------------------------------------------------------------ -# - load_model(config, device) -# - process_batch(batch_id, batch_data, model_config, num_threads) -# - load_dataset(config) -> List[Any] -# ------------------------------------------------------------------------------ - -# ------------------------------------------------------------------------------ -# MAIN EXECUTION LOOP -# ------------------------------------------------------------------------------ - - -def main(): - if len(sys.argv) != 2: - sys.exit("Usage: python benchmark_runner.py ") - - with open(sys.argv[1]) as f: - config = json.load(f) - - logger.info("Starting benchmark runner...") - - checkpoint_path = config.get("checkpoint_path") - results = {} - - # Load existing checkpoint if available - if checkpoint_path and os.path.exists(checkpoint_path): - logger.info(f"Loading checkpoint from {checkpoint_path}") - try: - with open(checkpoint_path) as 
f: - results = json.load(f) - logger.info(f"Found {len(results)} processed items in checkpoint") - except Exception as e: - logger.warning(f"Failed to load checkpoint: {e}. Starting fresh.") - - # Load Dataset - try: - all_items = load_dataset(config) # noqa: F821 - logger.info(f"Loaded {len(all_items)} total items") - except Exception as e: - logger.error(f"Failed to load dataset: {e}") - import traceback - - traceback.print_exc() - sys.exit(1) - - # Filter out already processed items - # Assuming items are (id, data) tuples - items_to_process = [ - (item_id, item) for item_id, item in all_items if str(item_id) not in results - ] - - if not items_to_process: - logger.info("All items already processed!") - with open("results.json", "w") as f: - json.dump(results, f, indent=2) - return - - logger.info(f"Processing {len(items_to_process)} remaining items") - - # Shuffle for load balancing - import random - - random.seed(42) - random.shuffle(items_to_process) - - # Resource detection - try: - import torch - - num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 - except ImportError: - num_gpus = 0 - - use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1 - - total_cores = os.cpu_count() or 1 - num_workers = num_gpus if use_multi_gpu else 1 - # Reserve some cores for system/overhead if possible - available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores - threads_per_worker = max(1, available_cores // num_workers) - - logger.info( - f"Resources: {num_gpus} GPUs, {total_cores} Cores. 
Using {num_workers} workers ({threads_per_worker} threads/worker)" - ) - - start_time = time.time() - - # Chunk items into smaller batches to allow frequent checkpointing - chunk_size = 1000 * num_workers - chunks = [ - items_to_process[i : i + chunk_size] - for i in range(0, len(items_to_process), chunk_size) - ] - - logger.info(f"Split into {len(chunks)} chunks for processing") - - ctx = multiprocessing.get_context("spawn") - - with concurrent.futures.ProcessPoolExecutor( - max_workers=num_workers, mp_context=ctx - ) as executor: - for chunk_idx, chunk in enumerate(chunks): - chunk_start = time.time() - logger.info( - f"Starting chunk {chunk_idx + 1}/{len(chunks)} ({len(chunk)} items)" - ) - - # Split chunk among workers - futures = [] - batch_size = (len(chunk) + num_workers - 1) // num_workers - - for i in range(num_workers): - start = i * batch_size - end = min((i + 1) * batch_size, len(chunk)) - if start < end: - batch = chunk[start:end] - - # Inject worker specific config - worker_config = config.copy() - worker_config["gpu_id"] = i if use_multi_gpu else None - - futures.append( - executor.submit( - process_batch, # noqa: F821 - i, - batch, - worker_config, - threads_per_worker, - ) - ) - - # Collect results for this chunk - chunk_results = {} - for future in concurrent.futures.as_completed(futures): - try: - batch_res = future.result() - chunk_results.update(batch_res) - except Exception as e: - logger.error(f"Worker failed in chunk {chunk_idx}: {e}") - import traceback - - traceback.print_exc() - # Critical failure - abort benchmark immediately - logger.error("Aborting benchmark due to worker failure") - sys.exit(1) - - # Update main results and save checkpoint - results.update(chunk_results) - - if checkpoint_path: - try: - tmp_path = checkpoint_path + ".tmp" - with open(tmp_path, "w") as f: - # Convert numpy types before saving checkpoint - clean_results = convert_numpy_types(results) - json.dump(clean_results, f, indent=2) - os.replace(tmp_path, 
checkpoint_path) - logger.info(f"Checkpoint saved to {checkpoint_path}") - except Exception as e: - logger.error(f"Failed to save checkpoint: {e}") - - elapsed = time.time() - chunk_start - logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s") - - total_elapsed = time.time() - start_time - logger.info(f"Benchmark complete in {total_elapsed:.1f}s.") - - # Calculate metrics from results - logger.info("Calculating metrics...") - try: - metrics = calculate_metrics_remote(results, config) # noqa: F821 - logger.info(f"Metrics calculated: {metrics}") - except Exception as e: - logger.error(f"Failed to calculate metrics: {e}") - import traceback - - traceback.print_exc() - metrics = {"error": f"Metrics calculation failed: {e}"} - - # Write both results and metrics - output = {"results": results, "metrics": metrics} - - # Custom JSON encoder to handle numpy types - # convert_numpy_types moved to global scope - - # Convert numpy types before serialization - output = convert_numpy_types(output) - - with open("results.json", "w") as f: - json.dump(output, f, indent=2) - - -if __name__ == "__main__": - main() diff --git a/garden_ai/benchmarks/utils/script_builder.py b/garden_ai/benchmarks/utils/script_builder.py index 7987bdc7..9613923c 100644 --- a/garden_ai/benchmarks/utils/script_builder.py +++ b/garden_ai/benchmarks/utils/script_builder.py @@ -17,6 +17,8 @@ def __init__(self, template_path: str | Path = None): self.imports = set() self.functions = [] self.preamble = [] + self.pep723_dependencies = [] + self.pep723_requires_python = None def add_import(self, import_stmt: str): """Add an import statement (e.g. 
'import numpy as np').""" @@ -28,6 +30,14 @@ def add_preamble(self, code: str): self.preamble.append(code) return self + def add_pep723_metadata( + self, dependencies: list[str], requires_python: str = ">=3.10" + ): + """Add PEP 723 script metadata.""" + self.pep723_dependencies.extend(dependencies) + self.pep723_requires_python = requires_python + return self + def add_function(self, func: Callable, name: str = None): """Add a function definition to the script. @@ -72,7 +82,18 @@ def build(self) -> str: # Let's just put imports at the top, then functions, then the template content. # But we need to be careful about imports in the template. - final_script = f""" + # Construct PEP 723 block + pep723_block = "" + if self.pep723_dependencies or self.pep723_requires_python: + pep723_block = "# /// script\n" + if self.pep723_requires_python: + pep723_block += f'# requires-python = "{self.pep723_requires_python}"\n' + if self.pep723_dependencies: + deps_list = '",\n# "'.join(self.pep723_dependencies) + pep723_block += f'# dependencies = [\n# "{deps_list}",\n# ]\n' + pep723_block += "# ///\n" + + final_script = f"""{pep723_block} # ------------------------------------------------------------------------------ # INJECTED IMPORTS # ------------------------------------------------------------------------------ From a9bc8cf3110d973bc6008e24c4d5a08ab1ff39b0 Mon Sep 17 00:00:00 2001 From: hholb Date: Fri, 5 Dec 2025 11:18:46 -0700 Subject: [PATCH 06/23] update examples scripts, tweak task setup --- .../examples/matbench_equiformerv2.py | 134 +++++------ .../examples/matbench_mace_multi_gpu.py | 30 ++- .../examples/matbench_mattersim.py | 103 ++++---- .../examples/matbench_sevennet.py | 109 ++++----- .../examples/run_random_10k_benchmark.py | 167 ++++++------- .../benchmarks/matbench_discovery/tasks.py | 226 ++++++++++++++++-- 6 files changed, 449 insertions(+), 320 deletions(-) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py 
b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py index ec3afe91..7855f825 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py @@ -19,16 +19,12 @@ # Globus Compute endpoint ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# HPC endpoint configuration -ENDPOINT_CONFIG = { - "account": "cis250461-gpu", - "partition": "gpu-debug", - "qos": "gpu", - "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8", -} + +# ============================================================================= +# Model Factory +# ============================================================================= -# Model factory function for EquiformerV2 def create_equiformerv2_model(device): """Create EquiformerV2 model calculator. @@ -46,10 +42,6 @@ def create_equiformerv2_model(device): ) -# Benchmark parameters -NUM_STRUCTURES = 1000 -USE_MULTI_GPU = True - # ============================================================================= # Run Benchmark # ============================================================================= @@ -59,70 +51,66 @@ def main(): """Run Matbench Discovery S2EFS benchmark with EquiformerV2.""" print("=" * 80) - print("Matbench Discovery S2EFS Benchmark") - print("=" * 80) - print(f"Endpoint: {ENDPOINT_ID}") - print("Model: EquiformerV2-31M") - print("Task: S2EFS (Structure to Energy, Forces, Stress)") - print(f"Structures: {NUM_STRUCTURES}") - print(f"Resources: {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}") + print("Matbench Discovery S2EFS Benchmark - EquiformerV2") print("=" * 80) + + # Run S2EFS task using the new groundhog API + # S2EFS is suitable for EquiformerV2 which doesn't support relaxation + output = MatbenchDiscovery.S2EFS.remote( + endpoint=ENDPOINT_ID, + user_endpoint_config={ + "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n", + "walltime": 7200, # 
2 hours in seconds + "qos": "gpu", + "partition": "gpu-debug", + "account": "cis250461-gpu", + "cores_per_node": 16, + "mem_per_node": 32, + "requirements": "", + }, + model_factory=create_equiformerv2_model, + model_packages="fairchem-core", + num_structures="random_100", + ) + + # Display metrics print() + print("=" * 80) + print("Benchmark Results") + print("=" * 80) + + metrics = output.get("metrics", {}) + if "error" in metrics: + print(f"Error: {metrics['error']}") + else: + # Energy metrics + if "energy_mae" in metrics: + print("Energy Metrics:") + print(f" MAE (eV/atom): {metrics.get('energy_mae', 'N/A'):.6f}") + print(f" RMSE (eV/atom): {metrics.get('energy_rmse', 'N/A'):.6f}") + print(f" R²: {metrics.get('energy_r2', 'N/A'):.6f}") + print() + + # Force metrics + if "force_mae" in metrics: + print("Force Metrics:") + print(f" MAE (eV/Å): {metrics.get('force_mae', 'N/A'):.6f}") + print(f" RMSE (eV/Å): {metrics.get('force_rmse', 'N/A'):.6f}") + print(f" R²: {metrics.get('force_r2', 'N/A'):.6f}") + print() + + # Stress metrics + if "stress_mae" in metrics: + print("Stress Metrics:") + print(f" MAE (GPa): {metrics.get('stress_mae', 'N/A'):.6f}") + print(f" RMSE (GPa): {metrics.get('stress_rmse', 'N/A'):.6f}") + print(f" R²: {metrics.get('stress_r2', 'N/A'):.6f}") + print() + + if "num_evaluated" in metrics: + print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") - with MatbenchDiscovery( - endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG - ) as bench: - # Run S2EFS task (uses relaxed structures, no geometry optimization) - # This is suitable for EquiformerV2 which doesn't support relaxation - print("Submitting S2EFS task...") - future = bench.tasks.S2EFS.submit( - model_factory=create_equiformerv2_model, - model_package="fairchem-core", - num_structures=NUM_STRUCTURES, - use_multi_gpu=USE_MULTI_GPU, - ) - - print("Waiting for results (this may take a while)...") - output = future.result() - - # Display metrics - print() - print("=" * 80) - 
print("Benchmark Results") - print("=" * 80) - - metrics = output.get("metrics", {}) - if "error" in metrics: - print(f"Error: {metrics['error']}") - else: - # Energy metrics - if "energy_mae" in metrics: - print("Energy Metrics:") - print(f" MAE (eV/atom): {metrics.get('energy_mae', 'N/A'):.6f}") - print(f" RMSE (eV/atom): {metrics.get('energy_rmse', 'N/A'):.6f}") - print(f" R²: {metrics.get('energy_r2', 'N/A'):.6f}") - print() - - # Force metrics - if "force_mae" in metrics: - print("Force Metrics:") - print(f" MAE (eV/Å): {metrics.get('force_mae', 'N/A'):.6f}") - print(f" RMSE (eV/Å): {metrics.get('force_rmse', 'N/A'):.6f}") - print(f" R²: {metrics.get('force_r2', 'N/A'):.6f}") - print() - - # Stress metrics - if "stress_mae" in metrics: - print("Stress Metrics:") - print(f" MAE (GPa): {metrics.get('stress_mae', 'N/A'):.6f}") - print(f" RMSE (GPa): {metrics.get('stress_rmse', 'N/A'):.6f}") - print(f" R²: {metrics.get('stress_r2', 'N/A'):.6f}") - print() - - if "num_evaluated" in metrics: - print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") - - print("=" * 80) + print("=" * 80) if __name__ == "__main__": diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py index 3707f57f..7b3783c2 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py @@ -4,7 +4,9 @@ using multi-GPU parallelization on a Globus Compute endpoint. 
""" -from garden_ai.benchmarks.matbench_discovery import DatasetSize, MatbenchDiscovery +from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery + +ANVIL = "5aafb4c1-27b2-40d8-a038-a0277611868f" # Model factory function for MACE @@ -15,15 +17,25 @@ def create_mace_model(device): results = MatbenchDiscovery.IS2RE.remote( - endpoint="5aafb4c1-27b2-40d8-a038-a0277611868f", - walltime="01:00:00", - scheduler_options={"gpus-per-node": 2, "cores-per-node": 16}, - account="youraccount", - partition="gpu-debug", - qos="gpu", + endpoint=ANVIL, + user_endpoint_config={ + "scheduler_options": "#SBATCH --gpus-per-node=2\n", + "walltime": 3600, + "qos": "gpu", + "partition": "gpu-debug", + "account": "cis250461-gpu", + "cores_per_node": 16, + "mem_per_node": 32, + "requirements": "", # 'requirements' is required for Anvil endpoint + }, model_factory=create_mace_model, - model_packages="mace-torch", - num_structures=DatasetSize.RANDOM_100, + model_packages=[ + "mace-torch", + "cuequivariance", + "cuequivariance-torch", + "cuequivariance-ops-torch-cu12", + ], + num_structures="random_100", ) print(results["metrics"]) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py index fcf77a1c..22099e9e 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py @@ -16,16 +16,12 @@ # Globus Compute endpoint ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# HPC endpoint configuration -ENDPOINT_CONFIG = { - "account": "cis250461-gpu", - "partition": "gpu-debug", - "qos": "gpu", - "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8", -} + +# ============================================================================= +# Model Factory +# ============================================================================= -# Model factory function for 
MatterSim def create_mattersim_model(device): """Create MatterSim model calculator. @@ -40,10 +36,6 @@ def create_mattersim_model(device): return MatterSimCalculator(device=device) -# Benchmark parameters -NUM_STRUCTURES = 1000 -USE_MULTI_GPU = True - # ============================================================================= # Run Benchmark # ============================================================================= @@ -53,55 +45,52 @@ def main(): """Run Matbench Discovery IS2RE benchmark with MatterSim.""" print("=" * 80) - print("Matbench Discovery IS2RE Benchmark") - print("=" * 80) - print(f"Endpoint: {ENDPOINT_ID}") - print("Model: MatterSim") - print(f"Structures: {NUM_STRUCTURES}") - print(f"Resources: {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}") + print("Matbench Discovery IS2RE Benchmark - MatterSim") print("=" * 80) + + # Run IS2RE task using the new groundhog API + output = MatbenchDiscovery.IS2RE.remote( + endpoint=ENDPOINT_ID, + user_endpoint_config={ + "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n", + "walltime": 7200, # 2 hours in seconds + "qos": "gpu", + "partition": "gpu-debug", + "account": "cis250461-gpu", + "cores_per_node": 16, + "mem_per_node": 32, + "requirements": "", + }, + model_factory=create_mattersim_model, + model_packages="mattersim", + num_structures="random_100", + ) + + # Display metrics print() + print("=" * 80) + print("Benchmark Results") + print("=" * 80) - with MatbenchDiscovery( - endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG - ) as bench: - # Run IS2RE task - print("Submitting IS2RE task...") - future = bench.tasks.IS2RE.submit( - model_factory=create_mattersim_model, - model_package="mattersim", - num_structures=NUM_STRUCTURES, - use_multi_gpu=USE_MULTI_GPU, - ) - - print("Waiting for results (this may take a while)...") - output = future.result() - - # Display metrics + metrics = output.get("metrics", {}) + if "error" in metrics: + print(f"Error: 
{metrics['error']}") + else: + # Discovery metrics + print(f"F1 Score: {metrics.get('F1', 'N/A'):.6f}") + print(f"DAF: {metrics.get('DAF', 'N/A'):.2f}x") + print(f"Precision: {metrics.get('Precision', 'N/A'):.6f}") + print(f"Recall: {metrics.get('Recall', 'N/A'):.6f}") + print(f"Accuracy: {metrics.get('Accuracy', 'N/A'):.6f}") + print() + # Regression metrics + print(f"MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") + print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") + print(f"R²: {metrics.get('R2', 'N/A'):.6f}") print() - print("=" * 80) - print("Benchmark Results") - print("=" * 80) - - metrics = output.get("metrics", {}) - if "error" in metrics: - print(f"Error: {metrics['error']}") - else: - # Discovery metrics - print(f"F1 Score: {metrics.get('F1', 'N/A'):.6f}") - print(f"DAF: {metrics.get('DAF', 'N/A'):.2f}x") - print(f"Precision: {metrics.get('Precision', 'N/A'):.6f}") - print(f"Recall: {metrics.get('Recall', 'N/A'):.6f}") - print(f"Accuracy: {metrics.get('Accuracy', 'N/A'):.6f}") - print() - # Regression metrics - print(f"MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") - print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") - print(f"R²: {metrics.get('R2', 'N/A'):.6f}") - print() - print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") - - print("=" * 80) + print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") + + print("=" * 80) if __name__ == "__main__": diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py index d028b740..e24d0d69 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py @@ -17,18 +17,12 @@ # Globus Compute endpoint (replace with your endpoint UUID) ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# HPC endpoint configuration (adjust for your cluster) -ENDPOINT_CONFIG = { - "account": "cis250461-gpu", - 
"partition": "gpu-debug", - "qos": "gpu", - "scheduler_options": "#SBATCH --gpus-per-node=2\n", - "cores_per_node": 16, - "mem_per_node": 32, # GB -} - - -# Model factory function for SevenNet + +# ============================================================================= +# Model Factory +# ============================================================================= + + def create_sevennet_model(device): """Create SevenNet model calculator. @@ -43,10 +37,6 @@ def create_sevennet_model(device): return SevenNetCalculator(model="7net-0", device=device) -# Benchmark parameters -NUM_STRUCTURES = 1000 # Number of structures to evaluate -USE_MULTI_GPU = True # Enable multi-GPU parallelization - # ============================================================================= # Run Benchmark # ============================================================================= @@ -56,55 +46,52 @@ def main(): """Run Matbench Discovery IS2RE benchmark with SevenNet.""" print("=" * 80) - print("Matbench Discovery IS2RE Benchmark") - print("=" * 80) - print(f"Endpoint: {ENDPOINT_ID}") - print("Model: SevenNet (7net-0)") - print(f"Structures: {NUM_STRUCTURES}") - print(f"Resources: {'Multi-GPU' if USE_MULTI_GPU else 'Single GPU'}") + print("Matbench Discovery IS2RE Benchmark - SevenNet") print("=" * 80) + + # Run IS2RE task using the new groundhog API + output = MatbenchDiscovery.IS2RE.remote( + endpoint=ENDPOINT_ID, + user_endpoint_config={ + "scheduler_options": "#SBATCH --gpus-per-node=2\n", + "walltime": 7200, # 2 hours in seconds + "qos": "gpu", + "partition": "gpu-debug", + "account": "cis250461-gpu", + "cores_per_node": 16, + "mem_per_node": 32, + "requirements": "", + }, + model_factory=create_sevennet_model, + model_packages="sevenn", + num_structures="random_100", + ) + + # Display metrics print() + print("=" * 80) + print("Benchmark Results") + print("=" * 80) - with MatbenchDiscovery( - endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG - ) as bench: - # Run 
IS2RE task (Initial Structure to Relaxed Energy) - print("Submitting IS2RE task...") - future = bench.tasks.IS2RE.submit( - model_factory=create_sevennet_model, - model_package="sevenn", - num_structures=NUM_STRUCTURES, - use_multi_gpu=USE_MULTI_GPU, - ) - - print("Waiting for results (this may take a while)...") - output = future.result() - - # Display metrics + metrics = output.get("metrics", {}) + if "error" in metrics: + print(f"Error: {metrics['error']}") + else: + # Discovery metrics + print(f"F1 Score: {metrics.get('F1', 'N/A'):.6f}") + print(f"DAF: {metrics.get('DAF', 'N/A'):.2f}x") + print(f"Precision: {metrics.get('Precision', 'N/A'):.6f}") + print(f"Recall: {metrics.get('Recall', 'N/A'):.6f}") + print(f"Accuracy: {metrics.get('Accuracy', 'N/A'):.6f}") print() - print("=" * 80) - print("Benchmark Results") - print("=" * 80) - - metrics = output.get("metrics", {}) - if "error" in metrics: - print(f"Error: {metrics['error']}") - else: - # Discovery metrics - print(f"F1 Score: {metrics.get('F1', 'N/A'):.6f}") - print(f"DAF: {metrics.get('DAF', 'N/A'):.2f}x") - print(f"Precision: {metrics.get('Precision', 'N/A'):.6f}") - print(f"Recall: {metrics.get('Recall', 'N/A'):.6f}") - print(f"Accuracy: {metrics.get('Accuracy', 'N/A'):.6f}") - print() - # Regression metrics - print(f"MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") - print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") - print(f"R²: {metrics.get('R2', 'N/A'):.6f}") - print() - print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") - - print("=" * 80) + # Regression metrics + print(f"MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") + print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") + print(f"R²: {metrics.get('R2', 'N/A'):.6f}") + print() + print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") + + print("=" * 80) if __name__ == "__main__": diff --git a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py 
b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py index 96c8208f..c171239e 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 """ -Run Matbench Discovery benchmarks on 10k most stable structures. +Run Matbench Discovery benchmarks on 10k random structures. -This script benchmarks MACE, MatterSim, and SevenNet on the 10k most stable -materials from the unique prototypes subset and saves comprehensive metrics to JSON. +This script benchmarks MACE, MatterSim, and SevenNet on a random 10k +sample from the unique prototypes subset and saves comprehensive metrics to JSON. """ import json from datetime import datetime from pathlib import Path -from garden_ai.benchmarks.matbench_discovery import DatasetSize, MatbenchDiscovery +from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery # ============================================================================= # Configuration @@ -19,18 +19,21 @@ # Globus Compute endpoint ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# HPC endpoint configuration +# Common endpoint configuration ENDPOINT_CONFIG = { - "account": "cis250461-gpu", - "partition": "gpu", - "qos": "gpu", "scheduler_options": "#SBATCH --gpus-per-node=4\n", - "cores_per_node": 8, - "mem_per_node": 32, + "walltime": 14400, # 4 hours in seconds + "qos": "gpu", + "partition": "gpu", + "account": "cis250461-gpu", + "cores_per_node": 16, + "mem_per_node": 64, + "requirements": "", } # Output file for metrics -OUTPUT_FILE = "stable_10k_benchmark_results.json" +OUTPUT_FILE = "random_10k_benchmark_results.json" + # ============================================================================= # Model Factory Functions @@ -61,19 +64,25 @@ def create_sevennet_model(device): # Model configurations MODELS = { "MACE": { - "package": "mace-torch", + "packages": [ + 
"mace-torch", + "cuequivariance", + "cuequivariance-torch", + "cuequivariance-ops-torch-cu12", + ], "factory": create_mace_model, }, "MatterSim": { - "package": "mattersim", + "packages": ["mattersim"], "factory": create_mattersim_model, }, "SevenNet": { - "package": "sevenn", + "packages": ["sevenn"], "factory": create_sevennet_model, }, } + # ============================================================================= # Run Benchmarks # ============================================================================= @@ -83,9 +92,9 @@ def main(): """Run benchmarks on all models and save results.""" print("=" * 80) - print("Matbench Discovery Benchmark - Stable 10k") + print("Matbench Discovery Benchmark - Random 10k") print("=" * 80) - print("Dataset: 10k Most Stable Structures") + print("Dataset: Random 10k from Unique Prototypes") print(f"Models: {', '.join(MODELS.keys())}") print(f"Endpoint: {ENDPOINT_ID}") print("=" * 80) @@ -94,96 +103,54 @@ def main(): results = { "metadata": { "timestamp": datetime.now().isoformat(), - "dataset": "stable_10k", + "dataset": "random_10k", "dataset_size": 10000, "endpoint_id": ENDPOINT_ID, }, "models": {}, } - with MatbenchDiscovery( - endpoint_id=ENDPOINT_ID, user_endpoint_config=ENDPOINT_CONFIG - ) as bench: - for model_name, config in MODELS.items(): - print(f"\n{'=' * 80}") - print(f"Running {model_name}...") - print(f"{'=' * 80}\n") - - try: - # Submit job - future = bench.tasks.IS2RE.submit( - model_factory=config["factory"], - model_packages=[ - config["package"], - "cuequivariance", - "cuequivariance-torch", - "cuequivariance-ops-torch-cu12", - ], - num_structures=DatasetSize.RANDOM_10K, - ) - - print(f"Job submitted for {model_name}. 
Waiting for results...") - - try: - output = future.result() - except Exception as e: - print(f"⚠️ {model_name} failed first attempt: {e}") - print(f" Resuming from checkpoint: {future.checkpoint_path}") - - # Extract checkpoint name from path - checkpoint_name = Path(future.checkpoint_path).name - - # Resubmit with same checkpoint name to resume - retry_future = bench.tasks.IS2RE.submit( - model_factory=config["factory"], - model_packages=[ - config["package"], - "cuequivariance", - "cuequivariance-torch", - "cuequivariance-ops-torch-cu12", - ], - num_structures=DatasetSize.RANDOM_10K, - checkpoint_name=checkpoint_name, - ) - - try: - print(" Retry job submitted. Waiting for results...") - output = retry_future.result() - print(" ✅ Retry successful!") - except Exception as retry_e: - print(f"❌ {model_name} failed retry: {retry_e}") - results["models"][model_name] = { - "status": "error", - "error": str(retry_e), - } - continue # Skip to next model - - # Store complete output (contains both metrics and per-structure results) - results["models"][model_name] = { - "status": "success", - **output, # Unpack entire output dict (metrics + results) - } - - # Display metrics - metrics = output.get("metrics", {}) - if "error" in metrics: - print(f"❌ {model_name} failed: {metrics['error']}") - results["models"][model_name]["status"] = "failed" - results["models"][model_name]["error"] = metrics["error"] - else: - print(f"✅ {model_name} completed successfully!") - print(f" F1 Score: {metrics.get('F1', 'N/A'):.6f}") - print(f" DAF: {metrics.get('DAF', 'N/A'):.2f}x") - print(f" MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") - print(f" RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") - print(f" Structures: {metrics.get('num_evaluated', 'N/A')}") - - except Exception as e: - print(f"❌ {model_name} error: {e}") - results["models"][model_name] = { - "status": "error", - "error": str(e), - } + for model_name, config in MODELS.items(): + print(f"\n{'=' * 80}") + print(f"Running 
{model_name}...") + print(f"{'=' * 80}\n") + + try: + # Run benchmark using the new groundhog API + output = MatbenchDiscovery.IS2RE.remote( + endpoint=ENDPOINT_ID, + user_endpoint_config=ENDPOINT_CONFIG, + model_factory=config["factory"], + model_packages=config["packages"], + num_structures="random_10k", + ) + + # Store complete output (contains both metrics and per-structure results) + results["models"][model_name] = { + "status": "success", + **output, + } + + # Display metrics + metrics = output.get("metrics", {}) + if "error" in metrics: + print(f"❌ {model_name} failed: {metrics['error']}") + results["models"][model_name]["status"] = "failed" + results["models"][model_name]["error"] = metrics["error"] + else: + print(f"✅ {model_name} completed successfully!") + print(f" F1 Score: {metrics.get('F1', 'N/A'):.6f}") + print(f" DAF: {metrics.get('DAF', 'N/A'):.2f}x") + print(f" MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") + print(f" RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") + print(f" Structures: {metrics.get('num_evaluated', 'N/A')}") + + except Exception as e: + print(f"❌ {model_name} error: {e}") + results["models"][model_name] = { + "status": "error", + "error": str(e), + } # Save results to JSON output_path = Path(OUTPUT_FILE) diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index b130912b..7bb9592d 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -214,10 +214,11 @@ def _process_batch_common( num_threads: int, compute_fn: Callable[[Any, Any], Dict[str, Any]], task_name: str, - model_factory: Callable[[str], Any], + model_factory_source: str, ) -> Dict[str, Any]: import logging import os + import re import time import torch @@ -236,6 +237,19 @@ def _process_batch_common( global _MODEL_CACHE try: if _MODEL_CACHE is None: + # Reconstruct model_factory from source code + func_name_match = 
re.search(r"def\s+(\w+)\s*\(", model_factory_source) + if not func_name_match: + raise ValueError( + "Could not extract function name from model_factory source" + ) + func_name = func_name_match.group(1) + + # Execute the source to define the function + local_namespace = {} + exec(model_factory_source, local_namespace) + model_factory = local_namespace[func_name] + model = model_factory(device) _MODEL_CACHE = model else: @@ -350,7 +364,7 @@ def process_batch_relaxation( structures: List[Any], model_config: Dict[str, Any], num_threads: int, - model_factory: Callable[[str], Any], + model_factory_source: str, ) -> Dict[str, Any]: from ase.optimize import FIRE @@ -368,7 +382,7 @@ def compute(model, atoms): num_threads, compute, "relaxation", - model_factory, + model_factory_source, ) @@ -377,7 +391,7 @@ def process_batch_static( structures: List[Any], model_config: Dict[str, Any], num_threads: int, - model_factory: Callable[[str], Any], + model_factory_source: str, ) -> Dict[str, Any]: def compute(model, atoms): atoms.calc = model @@ -391,7 +405,7 @@ def compute(model, atoms): num_threads, compute, "static calculation", - model_factory, + model_factory_source, ) @@ -400,7 +414,7 @@ def process_batch_forces( structures: List[Any], model_config: Dict[str, Any], num_threads: int, - model_factory: Callable[[str], Any], + model_factory_source: str, ) -> Dict[str, Any]: def compute(model, atoms): atoms.calc = model @@ -416,7 +430,7 @@ def compute(model, atoms): num_threads, compute, "forces calculation", - model_factory, + model_factory_source, ) @@ -582,7 +596,8 @@ def calculate_metrics_forces( def run_benchmark_hog( config: Dict[str, Any], - model_factory: Any, + model_packages: str | List[str], + model_factory_source: str, load_dataset_fn: Any, process_fn: Any, calc_metrics_fn: Any, @@ -590,6 +605,54 @@ def run_benchmark_hog( logger = setup_logging() logger.info("Starting benchmark runner...") + # Install model packages if specified + if model_packages: + import 
subprocess + + packages = ( + model_packages if isinstance(model_packages, list) else [model_packages] + ) + logger.info(f"Installing model packages: {packages}") + try: + result = subprocess.run( + ["uv", "pip", "install"] + packages, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + if result.returncode != 0: + error_msg = ( + f"Failed to install model packages: {packages}\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}" + ) + logger.error(error_msg) + raise RuntimeError(error_msg) + logger.info("Model packages installed successfully") + except subprocess.TimeoutExpired: + error_msg = f"Model package installation timed out after 300s: {packages}" + logger.error(error_msg) + raise RuntimeError(error_msg) + except Exception as e: + if isinstance(e, RuntimeError): + raise # Re-raise our own errors + error_msg = f"Could not install model packages: {e}" + logger.error(error_msg) + raise RuntimeError(error_msg) from e + + # Fix SSL certificate issues on HPC nodes using certifi + try: + import ssl + + import certifi + + os.environ["SSL_CERT_FILE"] = certifi.where() + os.environ["REQUESTS_CA_BUNDLE"] = certifi.where() + ssl._create_default_https_context = ssl.create_default_context + logger.info(f"SSL certificates configured: {certifi.where()}") + except ImportError: + logger.warning("certifi not available, SSL issues may occur") + checkpoint_path = config.get("checkpoint_path") results = {} @@ -679,7 +742,7 @@ def run_benchmark_hog( batch, worker_config, threads_per_worker, - model_factory, + model_factory_source, ) ) @@ -729,12 +792,66 @@ def run_benchmark_hog( return output +# ------------------------------------------------------------------------------ +# BENCHMARK METHOD WRAPPER +# ------------------------------------------------------------------------------ + + +class BenchmarkMethod: + """Wrapper around groundhog Method that handles model_factory source extraction. 
+ + This wrapper intercepts .remote(), .local(), and .submit() calls to automatically + extract source code from the model_factory callable before passing to groundhog. + This avoids pickle serialization issues with functions defined in __main__. + """ + + def __init__(self, hog_method): + """Initialize wrapper with the underlying groundhog Method.""" + self._hog_method = hog_method + + def _extract_factory_source(self, kwargs): + """Extract source code from model_factory if it's a callable.""" + import inspect + + if "model_factory" in kwargs: + factory = kwargs["model_factory"] + if callable(factory) and not isinstance(factory, str): + try: + kwargs["model_factory"] = inspect.getsource(factory) + except (OSError, TypeError) as e: + raise ValueError( + f"Could not extract source code from model_factory. " + f"Ensure the function is defined in a file (not interactive/lambda). " + f"Error: {e}" + ) + return kwargs + + def remote(self, *args, **kwargs): + """Execute remotely with automatic model_factory source extraction.""" + kwargs = self._extract_factory_source(kwargs) + return self._hog_method.remote(*args, **kwargs) + + def local(self, *args, **kwargs): + """Execute locally with automatic model_factory source extraction.""" + kwargs = self._extract_factory_source(kwargs) + return self._hog_method.local(*args, **kwargs) + + def submit(self, *args, **kwargs): + """Submit for async execution with automatic model_factory source extraction.""" + kwargs = self._extract_factory_source(kwargs) + return self._hog_method.submit(*args, **kwargs) + + def __call__(self, *args, **kwargs): + """Direct call (for local execution within groundhog).""" + return self._hog_method(*args, **kwargs) + + # ------------------------------------------------------------------------------ # CLASS DEFINITION # ------------------------------------------------------------------------------ -class MatbenchDiscovery: +class _MatbenchDiscoveryBase: """Matbench Discovery tasks using Groundhog 
HPC.""" REPO_URL = "https://github.com/janosh/matbench-discovery" @@ -742,7 +859,7 @@ class MatbenchDiscovery: @staticmethod def _prepare_runner_config( - num_structures: int | "DatasetSize" | "DatasetConfig", + num_structures: int | "DatasetSize" | "DatasetConfig" | str, repo_url: str = REPO_URL, repo_ref: str = REPO_REF, ) -> Dict[str, Any]: @@ -757,7 +874,10 @@ def _prepare_runner_config( subset = "full" seed = 42 - if hasattr(num_structures, "value"): # Enum + if isinstance(num_structures, str): + # String value like "random_100" - use directly as subset + subset = num_structures + elif hasattr(num_structures, "value"): # Enum subset = num_structures.value # Check for seed method/attr if it's our custom Config if hasattr(num_structures, "seed"): @@ -818,6 +938,23 @@ def _run_task( calc_metrics_fn: Any, sys_path: List[str] | None = None, ) -> Dict[str, Any]: + import inspect + + # Handle model_factory as either a callable or source string + # For remote execution, user should pass inspect.getsource(factory) + # For local execution, can pass the function directly + if isinstance(model_factory, str): + model_factory_source = model_factory + else: + try: + model_factory_source = inspect.getsource(model_factory) + except (OSError, TypeError) as e: + raise ValueError( + f"Could not extract source code from model_factory. " + f"For remote execution, use: inspect.getsource(your_factory). 
" + f"Error: {e}" + ) + # Add custom sys.path if provided (useful for local execution/testing) if sys_path: import sys @@ -850,7 +987,8 @@ def _run_task( return run_benchmark_hog( runner_config, - model_factory, + model_packages, + model_factory_source, load_dataset_fn, process_fn, calc_metrics_fn, @@ -925,18 +1063,18 @@ def S2EFS( # Aliases @hog.method() def S2EF(*args, **kwargs): - return MatbenchDiscovery.S2EFS(*args, **kwargs) + return _MatbenchDiscoveryBase.S2EFS(*args, **kwargs) @hog.method() def S2EFSM(*args, **kwargs): - return MatbenchDiscovery.S2EFS(*args, **kwargs) + return _MatbenchDiscoveryBase.S2EFS(*args, **kwargs) @hog.method() def IS2E(*args, **kwargs): # Same as IS2RE but static? No, IS2E is Initial Structure to Energy (Static). # IS2RE is Relaxation. # IS2E logic: - return MatbenchDiscovery._run_task( + return _MatbenchDiscoveryBase._run_task( *args, **kwargs, process_fn=process_batch_static, @@ -947,17 +1085,65 @@ def IS2E(*args, **kwargs): @hog.method() def S2E(*args, **kwargs): # Structure to Energy (Relaxed Structure to Energy) -> RS2RE - return MatbenchDiscovery.RS2RE(*args, **kwargs) + return _MatbenchDiscoveryBase.RS2RE(*args, **kwargs) @hog.method() def S2RE(*args, **kwargs): # Structure to Relaxed Energy -> IS2RE - return MatbenchDiscovery.IS2RE(*args, **kwargs) + return _MatbenchDiscoveryBase.IS2RE(*args, **kwargs) @hog.method() def RP2RE(*args, **kwargs): - return MatbenchDiscovery.IS2RE(*args, **kwargs) + return _MatbenchDiscoveryBase.IS2RE(*args, **kwargs) @hog.method() def IP2E(*args, **kwargs): - return MatbenchDiscovery.IS2E(*args, **kwargs) + return _MatbenchDiscoveryBase.IS2E(*args, **kwargs) + + +# ------------------------------------------------------------------------------ +# PUBLIC API - Wrapped methods with automatic source extraction +# ------------------------------------------------------------------------------ + + +class MatbenchDiscovery: + """Matbench Discovery benchmark tasks. 
+ + This class provides wrapped methods that automatically handle model_factory + source extraction for remote execution. Users can pass callable functions + directly without needing to call inspect.getsource() themselves. + + Example: + def create_mace_model(device): + from mace.calculators import mace_mp + return mace_mp(model="medium", device=device) + + # Just pass the function - source extraction is automatic + results = MatbenchDiscovery.IS2RE.remote( + endpoint="your-endpoint-id", + model_factory=create_mace_model, + model_packages="mace-torch", + ) + """ + + REPO_URL = _MatbenchDiscoveryBase.REPO_URL + REPO_REF = _MatbenchDiscoveryBase.REPO_REF + + # Internal methods (needed for remote execution compatibility) + _prepare_runner_config = _MatbenchDiscoveryBase._prepare_runner_config + _generate_checkpoint_name = _MatbenchDiscoveryBase._generate_checkpoint_name + _run_task = _MatbenchDiscoveryBase._run_task + + # Main benchmark tasks - wrapped for automatic model_factory source extraction + IS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.IS2RE) + RS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RS2RE) + S2EFS = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFS) + + # Aliases + S2EF = BenchmarkMethod(_MatbenchDiscoveryBase.S2EF) + S2EFSM = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFSM) + IS2E = BenchmarkMethod(_MatbenchDiscoveryBase.IS2E) + S2E = BenchmarkMethod(_MatbenchDiscoveryBase.S2E) + S2RE = BenchmarkMethod(_MatbenchDiscoveryBase.S2RE) + RP2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RP2RE) + IP2E = BenchmarkMethod(_MatbenchDiscoveryBase.IP2E) From 9abea84be87f55be630d81b78cade75b409a7ca2 Mon Sep 17 00:00:00 2001 From: hholb Date: Mon, 8 Dec 2025 11:20:26 -0700 Subject: [PATCH 07/23] tweak metrics, print checkpoint path for sync calls --- .../examples/matbench_mace_multi_gpu.py | 12 +- .../benchmarks/matbench_discovery/tasks.py | 207 +++++++++++++----- garden_ai/benchmarks/utils/meta_metrics.py | 161 ++++++++++++++ 3 files changed, 320 insertions(+), 60 
deletions(-) create mode 100644 garden_ai/benchmarks/utils/meta_metrics.py diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py index 7b3783c2..787587c3 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py @@ -4,12 +4,13 @@ using multi-GPU parallelization on a Globus Compute endpoint. """ +from rich import print + from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery ANVIL = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# Model factory function for MACE def create_mace_model(device): from mace.calculators import mace_mp @@ -19,13 +20,12 @@ def create_mace_model(device): results = MatbenchDiscovery.IS2RE.remote( endpoint=ANVIL, user_endpoint_config={ - "scheduler_options": "#SBATCH --gpus-per-node=2\n", + "scheduler_options": "#SBATCH --gpus-per-node=4\n", "walltime": 3600, "qos": "gpu", - "partition": "gpu-debug", + "partition": "gpu", "account": "cis250461-gpu", "cores_per_node": 16, - "mem_per_node": 32, "requirements": "", # 'requirements' is required for Anvil endpoint }, model_factory=create_mace_model, @@ -35,7 +35,7 @@ def create_mace_model(device): "cuequivariance-torch", "cuequivariance-ops-torch-cu12", ], - num_structures="random_100", + num_structures=100, ) -print(results["metrics"]) +print(results) diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index 7bb9592d..1b27b79a 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -11,7 +11,6 @@ # "matbench-discovery", # ] # /// -"""Matbench Discovery benchmark task implementations using Groundhog HPC.""" from __future__ import annotations @@ -35,10 +34,6 @@ if TYPE_CHECKING: from .enums import DatasetConfig, DatasetSize -# 
------------------------------------------------------------------------------ -# BOILERPLATE: Logging & Device Setup -# ------------------------------------------------------------------------------ - def setup_logging(): logging.basicConfig( @@ -80,11 +75,33 @@ def convert_numpy_types(obj): return obj -# ------------------------------------------------------------------------------ -# METRICS HELPERS (Inlined from metrics.py) -# ------------------------------------------------------------------------------ +# Meta metrics functions - will be injected from source for remote execution +get_hardware_info = None +extract_model_info = None +calculate_run_metadata = None + + +def _inject_meta_metrics(source: str) -> None: + """Inject meta_metrics functions from source code for remote execution.""" + global get_hardware_info, extract_model_info, calculate_run_metadata + namespace = {} + exec(source, namespace) + get_hardware_info = namespace["get_hardware_info"] + extract_model_info = namespace["extract_model_info"] + calculate_run_metadata = namespace["calculate_run_metadata"] + + +def _get_meta_metrics_source() -> str: + """Get source code of meta_metrics module (called locally).""" + import inspect + from garden_ai.benchmarks.utils import meta_metrics + return inspect.getsource(meta_metrics) + + +# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/tree/main/matbench_discovery/metrics +# Since they aren't setup to be easily imported, we just copy them here def classify_stable( each_true: Sequence[float] | pd.Series | np.ndarray, each_pred: Sequence[float] | pd.Series | np.ndarray, @@ -125,6 +142,7 @@ def classify_stable( return true_pos, false_neg, false_pos, true_neg +# This is also coptied from the matbench-discovery repo def stable_metrics( each_true: Sequence[float] | pd.Series | np.ndarray, each_pred: Sequence[float] | pd.Series | np.ndarray, @@ -200,10 +218,6 @@ def stable_metrics( ) -# 
------------------------------------------------------------------------------ -# REMOTE HELPERS (Inlined from remote.py) -# ------------------------------------------------------------------------------ - _MODEL_CACHE = None @@ -589,15 +603,11 @@ def calculate_metrics_forces( return result_metrics -# ------------------------------------------------------------------------------ -# MAIN RUNNER (Inlined from runners.py) -# ------------------------------------------------------------------------------ - - def run_benchmark_hog( config: Dict[str, Any], model_packages: str | List[str], model_factory_source: str, + meta_metrics_source: str, load_dataset_fn: Any, process_fn: Any, calc_metrics_fn: Any, @@ -605,6 +615,15 @@ def run_benchmark_hog( logger = setup_logging() logger.info("Starting benchmark runner...") + # Inject meta_metrics functions from source + _inject_meta_metrics(meta_metrics_source) + + # Collect hardware and model info + hardware_info = get_hardware_info() + model_info = extract_model_info(model_packages) + logger.info(f"Hardware: {hardware_info}") + logger.info(f"Model: {model_info}") + # Install model packages if specified if model_packages: import subprocess @@ -681,7 +700,15 @@ def run_benchmark_hog( if not items_to_process: logger.info("All items already processed!") - return {"results": results, "metrics": {}} + run_metadata = calculate_run_metadata( + hardware_info=hardware_info, + model_info=model_info, + total_elapsed=0, + num_workers=0, + num_structures_total=len(all_items), + num_structures_processed=0, + ) + return {"metrics": {}, "run_metadata": run_metadata} logger.info(f"Processing {len(items_to_process)} remaining items") @@ -698,7 +725,12 @@ def run_benchmark_hog( num_gpus = 0 use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1 - total_cores = os.cpu_count() or 1 + # Use sched_getaffinity to get cores available to this job, not total cores on node + try: + total_cores = len(os.sched_getaffinity(0)) + except 
AttributeError: + # Fallback for systems without sched_getaffinity (e.g., macOS) + total_cores = os.cpu_count() or 1 num_workers = num_gpus if use_multi_gpu else 1 available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores threads_per_worker = max(1, available_cores // num_workers) @@ -787,32 +819,34 @@ def run_benchmark_hog( traceback.print_exc() metrics = {"error": f"Metrics calculation failed: {e}"} - output = {"results": results, "metrics": metrics} + # Calculate run metadata + run_metadata = calculate_run_metadata( + hardware_info=hardware_info, + model_info=model_info, + total_elapsed=total_elapsed, + num_workers=num_workers, + num_structures_total=len(all_items), + num_structures_processed=len(items_to_process), + ) + logger.info(f"Run metadata: {run_metadata}") + + output = {"metrics": metrics, "run_metadata": run_metadata} output = convert_numpy_types(output) return output -# ------------------------------------------------------------------------------ -# BENCHMARK METHOD WRAPPER -# ------------------------------------------------------------------------------ - - class BenchmarkMethod: - """Wrapper around groundhog Method that handles model_factory source extraction. - - This wrapper intercepts .remote(), .local(), and .submit() calls to automatically - extract source code from the model_factory callable before passing to groundhog. - This avoids pickle serialization issues with functions defined in __main__. 
- """ + """Wrapper around groundhog Method that handles source extraction for remote execution.""" def __init__(self, hog_method): """Initialize wrapper with the underlying groundhog Method.""" self._hog_method = hog_method - def _extract_factory_source(self, kwargs): - """Extract source code from model_factory if it's a callable.""" + def _extract_sources(self, kwargs): + """Extract source code from model_factory and meta_metrics for remote execution.""" import inspect + # Extract model_factory source if "model_factory" in kwargs: factory = kwargs["model_factory"] if callable(factory) and not isinstance(factory, str): @@ -824,21 +858,88 @@ def _extract_factory_source(self, kwargs): f"Ensure the function is defined in a file (not interactive/lambda). " f"Error: {e}" ) + + # Extract meta_metrics source (runs locally where garden_ai is available) + kwargs["meta_metrics_source"] = _get_meta_metrics_source() + return kwargs + def _get_checkpoint_path_info(self, kwargs): + """Determine and return checkpoint path information from kwargs.""" + checkpoint_path = kwargs.get("checkpoint_path") + checkpoint_name = kwargs.get("checkpoint_name") + model_packages = kwargs.get("model_packages", "") + + if checkpoint_path: + return checkpoint_path, "resuming" + elif checkpoint_name: + final_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}") + return final_path, "new" + else: + # Generate checkpoint name using same logic as _run_task + num_structures = kwargs.get("num_structures", 100) + + # Determine subset string for checkpoint name + subset = "full" + if isinstance(num_structures, str): + subset = num_structures + elif hasattr(num_structures, "value"): # DatasetSize enum + subset = num_structures.value + elif hasattr(num_structures, "subset"): # DatasetConfig + subset = num_structures.subset.value + elif isinstance(num_structures, int): + subset = "full" if num_structures >= 200000 else f"num_{num_structures}" + + # Extract model name from packages + model_str 
= "unknown" + if isinstance(model_packages, list): + model_str = "_".join( + pkg.split("/")[-1].split("@")[0] for pkg in model_packages[:2] + ) + elif isinstance(model_packages, str): + model_str = model_packages.split("/")[-1].split("@")[0] + + # Generate timestamp and uuid like in _generate_checkpoint_name + import time + import uuid + + timestamp = time.strftime("%Y%m%d_%H%M%S") + short_uuid = str(uuid.uuid4())[:8] + checkpoint_name = ( + f"matbench_{model_str}_{subset}_{timestamp}_{short_uuid}.json" + ) + final_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}") + return final_path, "new" + + def _print_checkpoint_info(self, kwargs): + """Print checkpoint information before execution.""" + checkpoint_path, checkpoint_type = self._get_checkpoint_path_info(kwargs) + + print("=" * 80) + if checkpoint_type == "resuming": + print(f"📂 Resuming from checkpoint: {checkpoint_path}") + else: + print(f"💾 Checkpoint will be saved to: {checkpoint_path}") + print(" To resume this benchmark if it fails, use:") + print(f' checkpoint_path="{checkpoint_path}"') + print("=" * 80) + def remote(self, *args, **kwargs): - """Execute remotely with automatic model_factory source extraction.""" - kwargs = self._extract_factory_source(kwargs) + """Execute remotely with automatic source extraction.""" + kwargs = self._extract_sources(kwargs) + self._print_checkpoint_info(kwargs) return self._hog_method.remote(*args, **kwargs) def local(self, *args, **kwargs): - """Execute locally with automatic model_factory source extraction.""" - kwargs = self._extract_factory_source(kwargs) + """Execute locally with automatic source extraction.""" + kwargs = self._extract_sources(kwargs) + self._print_checkpoint_info(kwargs) return self._hog_method.local(*args, **kwargs) def submit(self, *args, **kwargs): - """Submit for async execution with automatic model_factory source extraction.""" - kwargs = self._extract_factory_source(kwargs) + """Submit for async execution with automatic 
source extraction.""" + kwargs = self._extract_sources(kwargs) + self._print_checkpoint_info(kwargs) return self._hog_method.submit(*args, **kwargs) def __call__(self, *args, **kwargs): @@ -846,11 +947,6 @@ def __call__(self, *args, **kwargs): return self._hog_method(*args, **kwargs) -# ------------------------------------------------------------------------------ -# CLASS DEFINITION -# ------------------------------------------------------------------------------ - - class _MatbenchDiscoveryBase: """Matbench Discovery tasks using Groundhog HPC.""" @@ -937,12 +1033,11 @@ def _run_task( load_dataset_fn: Any, calc_metrics_fn: Any, sys_path: List[str] | None = None, + meta_metrics_source: str | None = None, ) -> Dict[str, Any]: import inspect # Handle model_factory as either a callable or source string - # For remote execution, user should pass inspect.getsource(factory) - # For local execution, can pass the function directly if isinstance(model_factory, str): model_factory_source = model_factory else: @@ -955,7 +1050,7 @@ def _run_task( f"Error: {e}" ) - # Add custom sys.path if provided (useful for local execution/testing) + # Add custom sys.path if provided if sys_path: import sys @@ -980,15 +1075,19 @@ def _run_task( final_checkpoint_path = os.path.expanduser( f"~/.garden/benchmarks/{checkpoint_name}" ) - # Ensure directory exists os.makedirs(os.path.dirname(final_checkpoint_path), exist_ok=True) runner_config["checkpoint_path"] = final_checkpoint_path + # meta_metrics_source is injected by BenchmarkMethod wrapper + if meta_metrics_source is None: + raise ValueError("meta_metrics_source required for benchmark execution") + return run_benchmark_hog( runner_config, model_packages, model_factory_source, + meta_metrics_source, load_dataset_fn, process_fn, calc_metrics_fn, @@ -1002,6 +1101,7 @@ def IS2RE( checkpoint_name: str | None = None, checkpoint_path: str | None = None, sys_path: List[str] | None = None, + meta_metrics_source: str | None = None, ) -> Dict[str, 
Any]: """Initial Structure to Relaxed Energy.""" return MatbenchDiscovery._run_task( @@ -1014,6 +1114,7 @@ def IS2RE( load_dataset_wbm_initial, calculate_metrics_energy, sys_path=sys_path, + meta_metrics_source=meta_metrics_source, ) @hog.method() @@ -1024,6 +1125,7 @@ def RS2RE( checkpoint_name: str | None = None, checkpoint_path: str | None = None, sys_path: List[str] | None = None, + meta_metrics_source: str | None = None, ) -> Dict[str, Any]: """Relaxed Structure to Relaxed Energy.""" return MatbenchDiscovery._run_task( @@ -1036,6 +1138,7 @@ def RS2RE( load_dataset_wbm_relaxed, calculate_metrics_energy, sys_path=sys_path, + meta_metrics_source=meta_metrics_source, ) @hog.method() @@ -1046,6 +1149,7 @@ def S2EFS( checkpoint_name: str | None = None, checkpoint_path: str | None = None, sys_path: List[str] | None = None, + meta_metrics_source: str | None = None, ) -> Dict[str, Any]: """Structure to Energy, Forces, Stress.""" return MatbenchDiscovery._run_task( @@ -1058,6 +1162,7 @@ def S2EFS( load_dataset_mp_trj, calculate_metrics_forces, sys_path=sys_path, + meta_metrics_source=meta_metrics_source, ) # Aliases @@ -1101,11 +1206,6 @@ def IP2E(*args, **kwargs): return _MatbenchDiscoveryBase.IS2E(*args, **kwargs) -# ------------------------------------------------------------------------------ -# PUBLIC API - Wrapped methods with automatic source extraction -# ------------------------------------------------------------------------------ - - class MatbenchDiscovery: """Matbench Discovery benchmark tasks. 
@@ -1118,7 +1218,6 @@ def create_mace_model(device): from mace.calculators import mace_mp return mace_mp(model="medium", device=device) - # Just pass the function - source extraction is automatic results = MatbenchDiscovery.IS2RE.remote( endpoint="your-endpoint-id", model_factory=create_mace_model, diff --git a/garden_ai/benchmarks/utils/meta_metrics.py b/garden_ai/benchmarks/utils/meta_metrics.py new file mode 100644 index 00000000..e18120d4 --- /dev/null +++ b/garden_ai/benchmarks/utils/meta_metrics.py @@ -0,0 +1,161 @@ +"""Meta-level benchmark metrics utilities. + +Shared utilities for collecting hardware info, estimating costs, and extracting +model metadata that can be reused across different benchmark implementations. +""" + +from __future__ import annotations + +from typing import Any, Dict, List + +# GPU hourly cost estimates (USD) - Modal pricing (https://modal.com/pricing) +GPU_HOURLY_COSTS = { + "B200": 6.25, # $0.001736/sec + "H200": 4.54, # $0.001261/sec + "H100": 3.95, # $0.001097/sec + "A100-80GB": 2.50, # $0.000694/sec (80GB variant) + "A100": 2.10, # $0.000583/sec (40GB variant) + "L40S": 1.95, # $0.000542/sec + "A10": 1.10, # $0.000306/sec + "L4": 0.80, # $0.000222/sec + "T4": 0.59, # $0.000164/sec + "default": 2.00, # Fallback for unknown GPUs +} + +# Model name inference from package names +MODEL_PACKAGE_NAMES = { + "mace": "MACE", + "mattersim": "MatterSim", + "sevennet": "SevenNet", + "chgnet": "CHGNet", + "equiformer": "EquiformerV2", + "orb": "Orb", + "m3gnet": "M3GNet", + "alignn": "ALIGNN", +} + + +def get_hardware_info() -> Dict[str, Any]: + """Collect hardware information about the execution environment. 
+ + Returns: + Dictionary containing: + - device_type: "cuda", "mps", or "cpu" + - num_gpus: Number of GPUs available + - gpu_names: List of GPU names + - gpu_memory_gb: Memory of first GPU in GB (if available) + """ + info = {"device_type": "cpu", "num_gpus": 0, "gpu_names": [], "gpu_memory_gb": None} + try: + import torch + + if torch.cuda.is_available(): + info["device_type"] = "cuda" + info["num_gpus"] = torch.cuda.device_count() + info["gpu_names"] = [ + torch.cuda.get_device_name(i) for i in range(info["num_gpus"]) + ] + if info["num_gpus"] > 0: + props = torch.cuda.get_device_properties(0) + info["gpu_memory_gb"] = round(props.total_memory / (1024**3), 1) + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + info["device_type"] = "mps" + except ImportError: + pass + return info + + +def get_gpu_hourly_cost(gpu_name: str) -> float: + """Estimate hourly cost for a GPU based on its name. + + Args: + gpu_name: GPU name string (e.g., "NVIDIA A100-SXM4-40GB") + + Returns: + Estimated hourly cost in USD + """ + gpu_name_upper = gpu_name.upper() + for key in GPU_HOURLY_COSTS: + if key != "default" and key.upper() in gpu_name_upper: + return GPU_HOURLY_COSTS[key] + return GPU_HOURLY_COSTS["default"] + + +def extract_model_info(model_packages: str | List[str]) -> Dict[str, Any]: + """Extract model info from package specification. 
+ + Args: + model_packages: Package name(s) used to install the model + + Returns: + Dictionary containing: + - model_name: Inferred model name or "unknown" + - model_packages: List of package names + """ + packages = model_packages if isinstance(model_packages, list) else [model_packages] + model_name = "unknown" + for pkg in packages: + pkg_lower = pkg.lower() + for key, name in MODEL_PACKAGE_NAMES.items(): + if key in pkg_lower: + model_name = name + break + if model_name != "unknown": + break + return {"model_name": model_name, "model_packages": packages} + + +def calculate_run_metadata( + hardware_info: Dict[str, Any], + model_info: Dict[str, Any], + total_elapsed: float, + num_workers: int, + num_structures_total: int, + num_structures_processed: int, +) -> Dict[str, Any]: + """Calculate run metadata including timing, cost, and hardware info. + + Args: + hardware_info: Output from get_hardware_info() + model_info: Output from extract_model_info() + total_elapsed: Total benchmark runtime in seconds + num_workers: Number of worker processes used + num_structures_total: Total structures in dataset + num_structures_processed: Structures processed in this run + + Returns: + Complete run_metadata dictionary + """ + throughput = num_structures_total / total_elapsed if total_elapsed > 0 else 0 + + # Calculate cost estimate + gpu_hourly_cost = ( + get_gpu_hourly_cost(hardware_info["gpu_names"][0]) + if hardware_info["gpu_names"] + else 0 + ) + total_gpu_hours = (total_elapsed / 3600) * num_workers + total_cost = total_gpu_hours * gpu_hourly_cost + cost_per_1k = ( + (total_cost / num_structures_total) * 1000 if num_structures_total > 0 else 0 + ) + + return { + "model": model_info, + "hardware": hardware_info, + "timing": { + "total_seconds": round(total_elapsed, 2), + "throughput_per_second": round(throughput, 3), + "num_workers": num_workers, + }, + "cost": { + "gpu_hourly_rate_usd": gpu_hourly_cost, + "total_gpu_hours": round(total_gpu_hours, 4), + 
"estimated_cost_usd": round(total_cost, 4), + "estimated_cost_per_1000_structures_usd": round(cost_per_1k, 4), + }, + "dataset": { + "num_structures_total": num_structures_total, + "num_structures_processed": num_structures_processed, + }, + } From 018bb369835c93edb9572cd4eb78c64efb3d76f2 Mon Sep 17 00:00:00 2001 From: hholb Date: Mon, 8 Dec 2025 11:29:39 -0700 Subject: [PATCH 08/23] fix checkpoint path for remote calls --- .../examples/matbench_mace_multi_gpu.py | 6 +- .../benchmarks/matbench_discovery/tasks.py | 94 +++++++++++++------ 2 files changed, 67 insertions(+), 33 deletions(-) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py index 787587c3..354ca456 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py @@ -14,14 +14,14 @@ def create_mace_model(device): from mace.calculators import mace_mp - return mace_mp(model="medium", device=device, default_dtype="float64") + return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64") results = MatbenchDiscovery.IS2RE.remote( endpoint=ANVIL, user_endpoint_config={ "scheduler_options": "#SBATCH --gpus-per-node=4\n", - "walltime": 3600, + "walltime": "05:00:00", "qos": "gpu", "partition": "gpu", "account": "cis250461-gpu", @@ -35,7 +35,7 @@ def create_mace_model(device): "cuequivariance-torch", "cuequivariance-ops-torch-cu12", ], - num_structures=100, + checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json", ) print(results) diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index 1b27b79a..7471b3bd 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -2,7 +2,6 @@ # requires-python = ">=3.10" # 
dependencies = [ # "groundhog-hpc", -# "garden-ai", # "ase", # "numpy", # "pandas", @@ -348,9 +347,11 @@ def _load_dataset_common( if x.split(".")[0].isdigit() else float("inf"), ) - num_structures = config.get("num_structures", 100) - if isinstance(num_structures, int): - file_list = file_list[:num_structures] + # Only limit structures if explicitly specified (not when using full dataset) + if "num_structures" in config: + num_structures = config["num_structures"] + if isinstance(num_structures, int): + file_list = file_list[:num_structures] else: mat_id_set = set(mat_ids) file_list = [ @@ -864,20 +865,31 @@ def _extract_sources(self, kwargs): return kwargs - def _get_checkpoint_path_info(self, kwargs): - """Determine and return checkpoint path information from kwargs.""" + def _get_checkpoint_info_for_display(self, kwargs, is_remote: bool): + """Get checkpoint information to display to the user. + + Args: + kwargs: Method keyword arguments + is_remote: True if this is a remote/submit call, False for local + + Returns: + Tuple of (display_message, checkpoint_identifier, is_resuming) + """ checkpoint_path = kwargs.get("checkpoint_path") checkpoint_name = kwargs.get("checkpoint_name") - model_packages = kwargs.get("model_packages", "") if checkpoint_path: - return checkpoint_path, "resuming" - elif checkpoint_name: - final_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}") - return final_path, "new" - else: - # Generate checkpoint name using same logic as _run_task - num_structures = kwargs.get("num_structures", 100) + # User provided explicit path + if is_remote: + msg = f"Resuming from checkpoint on remote system: {checkpoint_path}" + else: + msg = f"Resuming from checkpoint: {checkpoint_path}" + return msg, checkpoint_path, True + + # Generate checkpoint name + if not checkpoint_name: + model_packages = kwargs.get("model_packages", "") + num_structures = kwargs.get("num_structures", "full") # Determine subset string for checkpoint name subset = 
"full" @@ -908,38 +920,60 @@ def _get_checkpoint_path_info(self, kwargs): checkpoint_name = ( f"matbench_{model_str}_{subset}_{timestamp}_{short_uuid}.json" ) - final_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}") - return final_path, "new" - def _print_checkpoint_info(self, kwargs): - """Print checkpoint information before execution.""" - checkpoint_path, checkpoint_type = self._get_checkpoint_path_info(kwargs) + # Construct display message + if is_remote: + msg = f"Checkpoint will be saved on remote system: ~/.garden/benchmarks/{checkpoint_name}" + identifier = f"~/.garden/benchmarks/{checkpoint_name}" + else: + local_path = os.path.expanduser(f"~/.garden/benchmarks/{checkpoint_name}") + msg = f"Checkpoint will be saved locally: {local_path}" + identifier = local_path + + return msg, identifier, False + + def _print_checkpoint_info(self, kwargs, is_remote: bool): + """Print checkpoint information before execution. + + Args: + kwargs: Method keyword arguments + is_remote: True if this is a remote/submit call, False for local + """ + msg, identifier, is_resuming = self._get_checkpoint_info_for_display( + kwargs, is_remote + ) print("=" * 80) - if checkpoint_type == "resuming": - print(f"📂 Resuming from checkpoint: {checkpoint_path}") + if is_resuming: + print(f"📂 {msg}") + else: + print(f"💾 {msg}") + + if is_remote: + print(" To resume this benchmark if it fails, use:") + print(f' checkpoint_path="{identifier}"') + print(" Note: Checkpoint is on the remote system, not your local machine") else: - print(f"💾 Checkpoint will be saved to: {checkpoint_path}") - print(" To resume this benchmark if it fails, use:") - print(f' checkpoint_path="{checkpoint_path}"') + print(" To resume this benchmark if it fails, use:") + print(f' checkpoint_path="{identifier}"') print("=" * 80) def remote(self, *args, **kwargs): """Execute remotely with automatic source extraction.""" kwargs = self._extract_sources(kwargs) - self._print_checkpoint_info(kwargs) + 
self._print_checkpoint_info(kwargs, is_remote=True) return self._hog_method.remote(*args, **kwargs) def local(self, *args, **kwargs): """Execute locally with automatic source extraction.""" kwargs = self._extract_sources(kwargs) - self._print_checkpoint_info(kwargs) + self._print_checkpoint_info(kwargs, is_remote=False) return self._hog_method.local(*args, **kwargs) def submit(self, *args, **kwargs): """Submit for async execution with automatic source extraction.""" kwargs = self._extract_sources(kwargs) - self._print_checkpoint_info(kwargs) + self._print_checkpoint_info(kwargs, is_remote=True) return self._hog_method.submit(*args, **kwargs) def __call__(self, *args, **kwargs): @@ -1097,7 +1131,7 @@ def _run_task( def IS2RE( model_factory: Any, model_packages: str | List[str], - num_structures: int | "DatasetSize" | "DatasetConfig" = 100, + num_structures: int | "DatasetSize" | "DatasetConfig" = "full", checkpoint_name: str | None = None, checkpoint_path: str | None = None, sys_path: List[str] | None = None, @@ -1121,7 +1155,7 @@ def IS2RE( def RS2RE( model_factory: Any, model_packages: str | List[str], - num_structures: int | "DatasetSize" | "DatasetConfig" = 100, + num_structures: int | "DatasetSize" | "DatasetConfig" = "full", checkpoint_name: str | None = None, checkpoint_path: str | None = None, sys_path: List[str] | None = None, @@ -1145,7 +1179,7 @@ def RS2RE( def S2EFS( model_factory: Any, model_packages: str | List[str], - num_structures: int | "DatasetSize" | "DatasetConfig" = 100, + num_structures: int | "DatasetSize" | "DatasetConfig" = "full", checkpoint_name: str | None = None, checkpoint_path: str | None = None, sys_path: List[str] | None = None, From cd62506e86cb1a0d31aff668afaa74d8a26d348d Mon Sep 17 00:00:00 2001 From: hholb Date: Tue, 9 Dec 2025 14:11:47 -0700 Subject: [PATCH 09/23] cleanup, remove old examples --- .../examples/dummy_model.py | 17 -- .../examples/run_random_10k_benchmark.py | 190 ---------------- .../examples/test_hog_refactor.py 
| 70 ------ .../benchmarks/matbench_discovery/tasks.py | 141 +----------- garden_ai/benchmarks/utils/remote.py | 176 --------------- .../benchmarks/utils/remote_execution.py | 202 ------------------ garden_ai/benchmarks/utils/script_builder.py | 117 ---------- garden_ai/benchmarks/utils/task.py | 132 ------------ 8 files changed, 2 insertions(+), 1043 deletions(-) delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py delete mode 100644 garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py delete mode 100644 garden_ai/benchmarks/utils/remote.py delete mode 100644 garden_ai/benchmarks/utils/remote_execution.py delete mode 100644 garden_ai/benchmarks/utils/script_builder.py delete mode 100644 garden_ai/benchmarks/utils/task.py diff --git a/garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py b/garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py deleted file mode 100644 index 745eb1b1..00000000 --- a/garden_ai/benchmarks/matbench_discovery/examples/dummy_model.py +++ /dev/null @@ -1,17 +0,0 @@ -def create_dummy_model(device): - """Create a dummy calculator for testing.""" - import numpy as np - from ase.calculators.calculator import Calculator, all_changes - - class DummyCalc(Calculator): - implemented_properties = ["energy", "forces", "stress"] - - def calculate( - self, atoms=None, properties=["energy"], system_changes=all_changes - ): - super().calculate(atoms, properties, system_changes) - self.results["energy"] = -1.0 * len(self.atoms) - self.results["forces"] = np.zeros((len(self.atoms), 3)) - self.results["stress"] = np.zeros(6) - - return DummyCalc() diff --git a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py b/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py deleted file mode 100644 index c171239e..00000000 --- 
a/garden_ai/benchmarks/matbench_discovery/examples/run_random_10k_benchmark.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python3 -""" -Run Matbench Discovery benchmarks on 10k random structures. - -This script benchmarks MACE, MatterSim, and SevenNet on a random 10k -sample from the unique prototypes subset and saves comprehensive metrics to JSON. -""" - -import json -from datetime import datetime -from pathlib import Path - -from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery - -# ============================================================================= -# Configuration -# ============================================================================= - -# Globus Compute endpoint -ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" - -# Common endpoint configuration -ENDPOINT_CONFIG = { - "scheduler_options": "#SBATCH --gpus-per-node=4\n", - "walltime": 14400, # 4 hours in seconds - "qos": "gpu", - "partition": "gpu", - "account": "cis250461-gpu", - "cores_per_node": 16, - "mem_per_node": 64, - "requirements": "", -} - -# Output file for metrics -OUTPUT_FILE = "random_10k_benchmark_results.json" - - -# ============================================================================= -# Model Factory Functions -# ============================================================================= - - -def create_mace_model(device): - """Create MACE model calculator.""" - from mace.calculators import mace_mp - - return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64") - - -def create_mattersim_model(device): - """Create MatterSim model calculator.""" - from mattersim.forcefield import MatterSimCalculator - - return MatterSimCalculator(device=device) - - -def create_sevennet_model(device): - """Create SevenNet model calculator.""" - from sevenn.calculator import SevenNetCalculator - - return SevenNetCalculator(model="7net-l3i5", device=device) - - -# Model configurations -MODELS = { - "MACE": { - "packages": [ - "mace-torch", - 
"cuequivariance", - "cuequivariance-torch", - "cuequivariance-ops-torch-cu12", - ], - "factory": create_mace_model, - }, - "MatterSim": { - "packages": ["mattersim"], - "factory": create_mattersim_model, - }, - "SevenNet": { - "packages": ["sevenn"], - "factory": create_sevennet_model, - }, -} - - -# ============================================================================= -# Run Benchmarks -# ============================================================================= - - -def main(): - """Run benchmarks on all models and save results.""" - - print("=" * 80) - print("Matbench Discovery Benchmark - Random 10k") - print("=" * 80) - print("Dataset: Random 10k from Unique Prototypes") - print(f"Models: {', '.join(MODELS.keys())}") - print(f"Endpoint: {ENDPOINT_ID}") - print("=" * 80) - print() - - results = { - "metadata": { - "timestamp": datetime.now().isoformat(), - "dataset": "random_10k", - "dataset_size": 10000, - "endpoint_id": ENDPOINT_ID, - }, - "models": {}, - } - - for model_name, config in MODELS.items(): - print(f"\n{'=' * 80}") - print(f"Running {model_name}...") - print(f"{'=' * 80}\n") - - try: - # Run benchmark using the new groundhog API - output = MatbenchDiscovery.IS2RE.remote( - endpoint=ENDPOINT_ID, - user_endpoint_config=ENDPOINT_CONFIG, - model_factory=config["factory"], - model_packages=config["packages"], - num_structures="random_10k", - ) - - # Store complete output (contains both metrics and per-structure results) - results["models"][model_name] = { - "status": "success", - **output, - } - - # Display metrics - metrics = output.get("metrics", {}) - if "error" in metrics: - print(f"❌ {model_name} failed: {metrics['error']}") - results["models"][model_name]["status"] = "failed" - results["models"][model_name]["error"] = metrics["error"] - else: - print(f"✅ {model_name} completed successfully!") - print(f" F1 Score: {metrics.get('F1', 'N/A'):.6f}") - print(f" DAF: {metrics.get('DAF', 'N/A'):.2f}x") - print(f" MAE (eV/atom): 
{metrics.get('MAE', 'N/A'):.6f}") - print(f" RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") - print(f" Structures: {metrics.get('num_evaluated', 'N/A')}") - - except Exception as e: - print(f"❌ {model_name} error: {e}") - results["models"][model_name] = { - "status": "error", - "error": str(e), - } - - # Save results to JSON - output_path = Path(OUTPUT_FILE) - with open(output_path, "w") as f: - json.dump(results, f, indent=2) - - print(f"\n{'=' * 80}") - print("Benchmark Complete!") - print(f"{'=' * 80}") - print(f"\nResults saved to: {output_path.absolute()}") - - # Print summary table - print(f"\n{'=' * 80}") - print("Summary") - print(f"{'=' * 80}\n") - print(f"{'Model':<15} {'Status':<10} {'F1':<10} {'DAF':<10} {'MAE':<10}") - print("-" * 80) - - for model_name, data in results["models"].items(): - if data["status"] == "success": - metrics = data["metrics"] - print( - f"{model_name:<15} {data['status']:<10} " - f"{metrics.get('F1', 0):<10.6f} " - f"{metrics.get('DAF', 0):<10.2f} " - f"{metrics.get('MAE', 0):<10.6f}" - ) - else: - print( - f"{model_name:<15} {data['status']:<10} {'N/A':<10} {'N/A':<10} {'N/A':<10}" - ) - - print() - - -if __name__ == "__main__": - main() diff --git a/garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py b/garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py deleted file mode 100644 index eccf0489..00000000 --- a/garden_ai/benchmarks/matbench_discovery/examples/test_hog_refactor.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 -""" -Test Matbench Discovery refactor with Groundhog HPC. 
-""" - -import os - -from dummy_model import create_dummy_model - -from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery - -# ============================================================================= -# Configuration -# ============================================================================= - -# Globus Compute endpoint (use local if possible, or the one from example) -ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" - -# HPC endpoint configuration -ENDPOINT_CONFIG = { - "account": "cis250461-gpu", - "partition": "gpu", - "qos": "gpu", - "scheduler_options": "#SBATCH --gpus-per-node=1\n", - "cores_per_node": 4, - "mem_per_node": 16, -} - -# ============================================================================= -# Model Factory Functions -# ============================================================================= - - -def main(): - """Run benchmarks on all models and save results.""" - - print("=" * 80) - print("Matbench Discovery Test - Groundhog Refactor") - print("=" * 80) - - print("Running LOCAL test...") - - # Ensure subprocess can find dummy_model - cwd = os.getcwd() - os.environ["PYTHONPATH"] = cwd + os.pathsep + os.environ.get("PYTHONPATH", "") - - try: - # Run locally using the new static method API - output = MatbenchDiscovery.IS2RE.local( - model_factory=create_dummy_model, - model_packages=["numpy", "ase"], # Minimal deps - num_structures=1, - sys_path=[os.getcwd()], - ) - print("Local run output keys:", output.keys()) - if "error" in output.get("metrics", {}): - print("Local metrics error:", output["metrics"]["error"]) - else: - print("Local run successful!") - print("Metrics:", output.get("metrics")) - - except Exception as e: - print(f"Local run failed: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - main() diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index 7471b3bd..d087c06a 100644 --- 
a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -1,16 +1,3 @@ -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "groundhog-hpc", -# "ase", -# "numpy", -# "pandas", -# "scikit-learn", -# "torch", -# "matbench-discovery", -# ] -# /// - from __future__ import annotations import concurrent.futures @@ -20,15 +7,14 @@ import os import sys import time -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional import groundhog_hpc as hog import numpy as np import pandas as pd from sklearn.metrics import r2_score -# Ensure local modules can be imported during local execution -sys.path.append(os.getcwd()) +from .metrics import stable_metrics if TYPE_CHECKING: from .enums import DatasetConfig, DatasetSize @@ -61,7 +47,6 @@ def setup_device(gpu_id: Optional[int] = None) -> str: def convert_numpy_types(obj): """Convert numpy types to Python native types for JSON serialization.""" - import numpy as np if isinstance(obj, (np.integer, np.floating)): return obj.item() @@ -99,124 +84,6 @@ def _get_meta_metrics_source() -> str: return inspect.getsource(meta_metrics) -# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/tree/main/matbench_discovery/metrics -# Since they aren't setup to be easily imported, we just copy them here -def classify_stable( - each_true: Sequence[float] | pd.Series | np.ndarray, - each_pred: Sequence[float] | pd.Series | np.ndarray, - *, - stability_threshold: float = 0.0, - fillna: bool = True, -) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]: - if len(each_true) != len(each_pred): - raise ValueError(f"{len(each_true)=} != {len(each_pred)=}") - - each_true_arr, each_pred_arr = pd.Series(each_true), pd.Series(each_pred) - - if stability_threshold is None or np.isnan(stability_threshold): - raise ValueError("stability_threshold must be a real number") - 
actual_pos = each_true_arr <= (stability_threshold or 0) - actual_neg = each_true_arr > (stability_threshold or 0) - - model_pos = each_pred_arr <= (stability_threshold or 0) - model_neg = each_pred_arr > (stability_threshold or 0) - - if fillna: - nan_mask = np.isnan(each_pred) - model_pos[nan_mask] = False - model_neg[nan_mask] = True - - n_pos, n_neg, total = model_pos.sum(), model_neg.sum(), len(each_pred) - if n_pos + n_neg != total: - raise ValueError( - f"after filling NaNs, the sum of positive ({n_pos}) and negative " - f"({n_neg}) predictions should add up to {total=}" - ) - - true_pos = actual_pos & model_pos - false_neg = actual_pos & model_neg - false_pos = actual_neg & model_pos - true_neg = actual_neg & model_neg - - return true_pos, false_neg, false_pos, true_neg - - -# This is also coptied from the matbench-discovery repo -def stable_metrics( - each_true: Sequence[float] | pd.Series | np.ndarray, - each_pred: Sequence[float] | pd.Series | np.ndarray, - *, - stability_threshold: float = 0.0, - fillna: bool = True, - prevalence: float | None = None, -) -> dict[str, float]: - n_true_pos, n_false_neg, n_false_pos, n_true_neg = map( - sum, - classify_stable( - each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna - ), - ) - - n_total_pos = n_true_pos + n_false_neg - n_total_neg = n_true_neg + n_false_pos - if prevalence is None: - prevalence = ( - n_total_pos / (n_total_pos + n_total_neg) - if (n_total_pos + n_total_neg) > 0 - else float("nan") - ) - precision = ( - n_true_pos / (n_true_pos + n_false_pos) - if (n_true_pos + n_false_pos) > 0 - else float("nan") - ) - recall = n_true_pos / n_total_pos if n_total_pos > 0 else float("nan") - - TPR = recall - FPR = n_false_pos / n_total_neg if n_total_neg > 0 else float("nan") - TNR = n_true_neg / n_total_neg if n_total_neg > 0 else float("nan") - FNR = n_false_neg / n_total_pos if n_total_pos > 0 else float("nan") - - if FPR > 0 and TNR > 0 and FPR + TNR != 1: - if abs(FPR + TNR - 1) > 
1e-6: - raise ValueError(f"{FPR=} {TNR=} don't add up to 1") - - if TPR > 0 and FNR > 0 and TPR + FNR != 1: - if abs(TPR + FNR - 1) > 1e-6: - raise ValueError(f"{TPR=} {FNR=} don't add up to 1") - - is_nan = np.isnan(each_true) | np.isnan(each_pred) - each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan] - - if precision + recall == 0: - f1_score = float("nan") - else: - f1_score = 2 * (precision * recall) / (precision + recall) - - return dict( - F1=f1_score, - DAF=precision / prevalence if prevalence > 0 else float("nan"), - Precision=precision, - Recall=recall, - Accuracy=( - (n_true_pos + n_true_neg) / (n_total_pos + n_total_neg) - if (n_total_pos + n_total_neg > 0) - else float("nan") - ), - TPR=TPR, - FPR=FPR, - TNR=TNR, - FNR=FNR, - TP=n_true_pos, - FP=n_false_pos, - TN=n_true_neg, - FN=n_false_neg, - MAE=np.abs(each_true - each_pred).mean(), - RMSE=((each_true - each_pred) ** 2).mean() ** 0.5, - R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"), - ) - - _MODEL_CACHE = None @@ -299,7 +166,6 @@ def get_material_ids_for_subset( if subset_type == "full": return None - import pandas as pd from matbench_discovery.data import DataFiles df = pd.read_csv(DataFiles.wbm_summary.path) @@ -470,7 +336,6 @@ def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]: def calculate_metrics_energy( results: Dict[str, Any], config: Dict[str, Any] ) -> Dict[str, Any]: - import numpy as np from matbench_discovery.data import df_wbm if len(results) == 0: @@ -515,10 +380,8 @@ def calculate_metrics_forces( from io import TextIOWrapper from zipfile import ZipFile - import numpy as np from ase.io import read from matbench_discovery.data import DataFiles - from sklearn.metrics import r2_score metrics = { "energy_mae": [], diff --git a/garden_ai/benchmarks/utils/remote.py b/garden_ai/benchmarks/utils/remote.py deleted file mode 100644 index b9a4780e..00000000 --- a/garden_ai/benchmarks/utils/remote.py +++ /dev/null @@ -1,176 +0,0 
@@ -import json -import logging -import os -import subprocess -import sys -import tempfile -from pathlib import Path -from typing import Any, Dict, List - -logger = logging.getLogger(__name__) - - -class RemoteBenchmarkRunner: - """ - Handles the setup and execution of benchmarks on remote Globus Compute endpoints. - - This class manages: - 1. Creating an isolated working directory - 2. Setting up a Python environment using `uv` - 3. Installing dependencies - 4. Executing the benchmark script - 5. Collecting results - """ - - def __init__(self, work_dir_prefix: str = "garden_benchmark_"): - self.work_dir = Path(tempfile.mkdtemp(prefix=work_dir_prefix)) - self.uv_bin = None - self.venv_python = None - self.env = dict(os.environ) - - # Configure logging if not already configured - if not logging.getLogger().handlers: - logging.basicConfig( - level=logging.INFO, - stream=sys.stdout, - force=True, - format="%(asctime)s [%(levelname)s] %(message)s", - ) - - def setup_environment(self, python_version: str = "3.11"): - """Find uv and create virtual environment.""" - logger.info("Setting up environment...") - - # Find UV binary - try: - self.uv_bin = subprocess.check_output( - [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True - ).strip() - except subprocess.CalledProcessError: - import shutil - - self.uv_bin = shutil.which("uv") - if not self.uv_bin: - raise RuntimeError("Could not find uv binary. 
Please install uv.") - - # Create UV virtual environment - subprocess.run( - [self.uv_bin, "venv", "--python", python_version], - cwd=self.work_dir, - check=True, - capture_output=True, - ) - - self.venv_python = self.work_dir / ".venv/bin/python" - if not self.venv_python.exists(): - self.venv_python = ( - self.work_dir / ".venv/Scripts/python.exe" - ) # Windows fallback - - if not self.venv_python.exists(): - raise RuntimeError( - f"Virtual environment python not found at {self.venv_python}" - ) - - # Set SSL cert file for HPC if needed - self._setup_ssl_cert() - - def _setup_ssl_cert(self): - """Set SSL_CERT_FILE environment variable if certifi is available.""" - try: - certifi_path = subprocess.check_output( - [str(self.venv_python), "-c", "import certifi; print(certifi.where())"], - text=True, - ).strip() - self.env["SSL_CERT_FILE"] = certifi_path - except Exception as e: - logger.warning(f"Failed to set SSL_CERT_FILE: {e}") - - def install_dependencies(self, packages: List[str]): - """Install Python packages into the virtual environment.""" - logger.info(f"Installing dependencies: {packages}") - if not self.uv_bin or not self.venv_python: - raise RuntimeError("Environment not setup. Call setup_environment() first.") - - cmd = [ - self.uv_bin, - "pip", - "install", - "--python", - str(self.venv_python), - ] + packages - - subprocess.run(cmd, cwd=self.work_dir, check=True) - - def run_benchmark( - self, - script_content: str, - config: Dict[str, Any], - script_name: str = "benchmark_runner.py", - ) -> Dict[str, Any]: - """ - Execute the benchmark script. - - Args: - script_content: The Python script to run. - config: Configuration dictionary to pass to the script (saved as config.json). - script_name: Filename for the script. - - Returns: - Dictionary containing the results loaded from results.json. - """ - if not self.venv_python: - raise RuntimeError("Environment not setup. 
Call setup_environment() first.") - - logger.info("Preparing benchmark script...") - - # Write runner script - runner_path = self.work_dir / script_name - runner_path.write_text(script_content) - - # Write config - config_path = self.work_dir / "config.json" - with open(config_path, "w") as f: - json.dump(config, f, indent=2) - - logger.info("Executing benchmark...") - - # Run the runner script inside the venv - proc = subprocess.run( - [str(self.venv_python), str(runner_path), str(config_path)], - cwd=self.work_dir, - env=self.env, - stdout=sys.stdout, - stderr=sys.stderr, - check=False, - ) - - if proc.returncode != 0: - raise RuntimeError( - f"Benchmark runner failed with return code {proc.returncode}" - ) - - logger.info("Collecting results...") - results_path = self.work_dir / "results.json" - if not results_path.exists(): - raise RuntimeError( - "Results file not found - benchmark may have crashed silently" - ) - - with open(results_path) as f: - results = json.load(f) - - logger.info("Benchmark completed successfully.") - return results - - def cleanup(self): - """Remove the working directory.""" - import shutil - - shutil.rmtree(self.work_dir, ignore_errors=True) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.cleanup() diff --git a/garden_ai/benchmarks/utils/remote_execution.py b/garden_ai/benchmarks/utils/remote_execution.py deleted file mode 100644 index d6541695..00000000 --- a/garden_ai/benchmarks/utils/remote_execution.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Generic remote execution utility for benchmarks. - -This module contains the `run_remote_benchmark` function which is designed to be -serialized and executed on Globus Compute endpoints. It handles the boilerplate -of setting up a Python environment, installing dependencies, and running a -provided benchmark script. 
-""" - - -def run_remote_benchmark( - script_content: str, - dependencies: list[str], - config: dict, - checkpoint_name: str | None = None, - checkpoint_path: str | None = None, -) -> dict: - """Run a generic benchmark script on a remote Globus Compute endpoint. - - This function: - 1. Creates a temporary working directory. - 2. Sets up a Python environment using `uv`. - 3. Installs the specified dependencies. - 4. Writes the `script_content` to a file. - 5. Writes the `config` to a JSON file. - 6. Executes the script in the environment. - 7. Returns the results from `results.json`. - - Args: - script_content: The full Python script to execute. - dependencies: List of Python packages to install (e.g. ["numpy", "torch"]). - config: Dictionary of configuration parameters to pass to the script. - Written to `config.json`. - checkpoint_name: Name of the checkpoint file (e.g. "checkpoint_123.json"). - Saved to ~/.garden/benchmarks/. - checkpoint_path: Optional path to an existing checkpoint file to resume from. - If provided, this path is used directly. - - Returns: - The content of `results.json` produced by the script. - """ - # All imports must be inside the function for serialization - import json - import logging - import os - import subprocess - import sys - import tempfile - from pathlib import Path - - # Configure logging - logging.basicConfig( - level=logging.INFO, - stream=sys.stdout, - force=True, - format="%(asctime)s [%(levelname)s] %(message)s", - ) - if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(line_buffering=True) - - logger = logging.getLogger(__name__) - - # Create isolated working directory - work_dir = Path(tempfile.mkdtemp(prefix="garden_benchmark_")) - - try: - # ---------------------------------------------------------------------- - # 1. 
ENVIRONMENT SETUP - # ---------------------------------------------------------------------- - logger.info("Step 1/4: Setting up environment...") - - # Find UV binary - try: - uv_bin = subprocess.check_output( - [sys.executable, "-c", "import uv; print(uv.find_uv_bin())"], text=True - ).strip() - except subprocess.CalledProcessError: - import shutil - - uv_bin = shutil.which("uv") - if not uv_bin: - raise RuntimeError("Could not find uv binary. Please install uv.") - - # Create UV virtual environment - subprocess.run( - [uv_bin, "venv", "--python", "3.11"], - cwd=work_dir, - check=True, - capture_output=True, - ) - - venv_python = work_dir / ".venv/bin/python" - if not venv_python.exists(): - venv_python = work_dir / ".venv/Scripts/python.exe" # Windows fallback - - if not venv_python.exists(): - raise RuntimeError(f"Virtual environment python not found at {venv_python}") - - # Install dependencies - logger.info(f"Installing dependencies: {dependencies}") - # Install in one go for better resolution - cmd = [uv_bin, "pip", "install", "--python", str(venv_python)] + dependencies - subprocess.run( - cmd, - cwd=work_dir, - check=True, - ) - - # Set SSL cert file for HPC if needed - env = dict(os.environ) - - # Propagate common useful env vars if present - for key in ["MBD_AUTO_DOWNLOAD_FILES", "HF_TOKEN", "WANDB_API_KEY"]: - if key in os.environ: - env[key] = os.environ[key] - - try: - certifi_path = subprocess.check_output( - [str(venv_python), "-c", "import certifi; print(certifi.where())"], - text=True, - ).strip() - env["SSL_CERT_FILE"] = certifi_path - except Exception as e: - logger.warning(f"Failed to set SSL_CERT_FILE: {e}") - - # ---------------------------------------------------------------------- - # 2. 
PREPARE BENCHMARK SCRIPT - # ---------------------------------------------------------------------- - logger.info("Step 2/4: Preparing benchmark script...") - - # Write runner script - runner_path = work_dir / "benchmark_runner.py" - runner_path.write_text(script_content) - - # Determine checkpoint path - if checkpoint_path: - # User provided a specific path to resume from - final_checkpoint_path = checkpoint_path - elif checkpoint_name: - # Use persistent location in user home - checkpoint_dir = Path.home() / ".garden" / "benchmarks" - checkpoint_dir.mkdir(parents=True, exist_ok=True) - final_checkpoint_path = str(checkpoint_dir / checkpoint_name) - else: - # Fallback to tmp dir if no name provided (legacy behavior) - final_checkpoint_path = str(work_dir / "checkpoint.json") - - config["checkpoint_path"] = final_checkpoint_path - - # Log checkpoint path prominently for user reference - print(f"{'=' * 80}") - print(f"Checkpoint will be saved to: {final_checkpoint_path}") - print("To resume this job if it fails, use:") - print(f' checkpoint_path="{final_checkpoint_path}"') - print(f"{'=' * 80}") - - # Write config - config_path = work_dir / "config.json" - with open(config_path, "w") as f: - json.dump(config, f, indent=2) - - # ---------------------------------------------------------------------- - # 3. EXECUTE BENCHMARK - # ---------------------------------------------------------------------- - logger.info("Step 3/4: Executing benchmark...") - - # Run the runner script inside the venv - # DO NOT capture output - let it stream to stdout/stderr in real-time - # so we can see errors immediately - proc = subprocess.run( - [str(venv_python), str(runner_path), str(config_path)], - cwd=work_dir, - env=env, - check=False, # Don't raise immediately, we'll check returncode - ) - - if proc.returncode != 0: - raise RuntimeError( - f"Benchmark runner failed with return code {proc.returncode}." 
- ) - - # ---------------------------------------------------------------------- - # 4. COLLECT RESULTS - # ---------------------------------------------------------------------- - logger.info("Step 4/4: Collecting results...") - - results_path = work_dir / "results.json" - if not results_path.exists(): - raise RuntimeError( - "Results file not found - benchmark may have crashed silently" - ) - - with open(results_path) as f: - results = json.load(f) - - logger.info("Benchmark completed successfully.") - return results - - finally: - # Cleanup working directory - import shutil - - shutil.rmtree(work_dir, ignore_errors=True) diff --git a/garden_ai/benchmarks/utils/script_builder.py b/garden_ai/benchmarks/utils/script_builder.py deleted file mode 100644 index 9613923c..00000000 --- a/garden_ai/benchmarks/utils/script_builder.py +++ /dev/null @@ -1,117 +0,0 @@ -import inspect -from pathlib import Path -from typing import Callable - - -class BenchmarkScriptBuilder: - """Helper to build a self-contained benchmark script from a template.""" - - def __init__(self, template_path: str | Path = None): - if template_path is None: - # Default to the base_runner.py in templates - template_path = ( - Path(__file__).parent.parent / "templates" / "base_runner.py" - ) - - self.template_path = Path(template_path) - self.imports = set() - self.functions = [] - self.preamble = [] - self.pep723_dependencies = [] - self.pep723_requires_python = None - - def add_import(self, import_stmt: str): - """Add an import statement (e.g. 
'import numpy as np').""" - self.imports.add(import_stmt) - return self - - def add_preamble(self, code: str): - """Add arbitrary code to the top of the script (after imports).""" - self.preamble.append(code) - return self - - def add_pep723_metadata( - self, dependencies: list[str], requires_python: str = ">=3.10" - ): - """Add PEP 723 script metadata.""" - self.pep723_dependencies.extend(dependencies) - self.pep723_requires_python = requires_python - return self - - def add_function(self, func: Callable, name: str = None): - """Add a function definition to the script. - - The function source code is inspected and appended. - If name is provided, the function definition is renamed. - """ - source = inspect.getsource(func) - - if name: - import re - - # Replace 'def old_name(' with 'def new_name(' - # This is a simple regex replacement, assuming standard formatting - pattern = r"def\s+" + func.__name__ + r"\s*\(" - replacement = f"def {name}(" - source = re.sub(pattern, replacement, source, count=1) - - self.functions.append(source) - return self - - def build(self) -> str: - """Assemble the final script.""" - if not self.template_path.exists(): - raise FileNotFoundError(f"Template not found at {self.template_path}") - - template_content = self.template_path.read_text() - - # Construct sections - imports_block = "\n".join(sorted(self.imports)) - preamble_block = "\n".join(self.preamble) - functions_block = "\n\n".join(self.functions) - - # We inject our custom code BEFORE the template's main execution logic - # but AFTER the template's own imports (which are inside the file). - # Actually, the template has imports at the top. We should probably prepend ours. - - # Simple strategy: Prepend everything to the template, but the template - # has "USER DEFINED FUNCTIONS" placeholders. We can just append our functions - # before the main block? - - # Better strategy: The template is designed to have functions injected. 
- # Let's just put imports at the top, then functions, then the template content. - # But we need to be careful about imports in the template. - - # Construct PEP 723 block - pep723_block = "" - if self.pep723_dependencies or self.pep723_requires_python: - pep723_block = "# /// script\n" - if self.pep723_requires_python: - pep723_block += f'# requires-python = "{self.pep723_requires_python}"\n' - if self.pep723_dependencies: - deps_list = '",\n# "'.join(self.pep723_dependencies) - pep723_block += f'# dependencies = [\n# "{deps_list}",\n# ]\n' - pep723_block += "# ///\n" - - final_script = f"""{pep723_block} -# ------------------------------------------------------------------------------ -# INJECTED IMPORTS -# ------------------------------------------------------------------------------ -{imports_block} - -# ------------------------------------------------------------------------------ -# INJECTED PREAMBLE -# ------------------------------------------------------------------------------ -{preamble_block} - -# ------------------------------------------------------------------------------ -# INJECTED FUNCTIONS -# ------------------------------------------------------------------------------ -{functions_block} - -# ------------------------------------------------------------------------------ -# BASE RUNNER TEMPLATE -# ------------------------------------------------------------------------------ -{template_content} -""" - return final_script diff --git a/garden_ai/benchmarks/utils/task.py b/garden_ai/benchmarks/utils/task.py deleted file mode 100644 index 4ca2fd60..00000000 --- a/garden_ai/benchmarks/utils/task.py +++ /dev/null @@ -1,132 +0,0 @@ -import json -import subprocess -import sys -import tempfile -from pathlib import Path -from typing import Any, Dict, Optional - - -class BaseBenchmarkTask: - """ - Base class for benchmark tasks. 
- - Provides common utilities for: - - Extracting model metadata (package, factory, kwargs) - - Running benchmarks locally for testing - """ - - def __init__( - self, adapter, repo_url: str, repo_ref: str, model_package: Optional[str] = None - ): - self.adapter = adapter - self.repo_url = repo_url - self.repo_ref = repo_ref - self.model_package = model_package - - def _extract_model_config( - self, - model: Any = None, - model_package: Optional[str] = None, - model_factory: Optional[str] = None, - model_kwargs: Optional[Dict[str, Any]] = None, - ) -> Dict[str, Any]: - """ - Helper to resolve model configuration from either a local instance or explicit arguments. - """ - model_checkpoint = None - - if model is not None: - # Extract info from local model instance - if model_package is None: - if self.model_package is not None: - model_package = self.model_package - else: - # Infer from model's module - model_package = model.__class__.__module__.split(".")[0] - - if model_factory is None: - model_factory = model.__class__.__name__ - - # Get checkpoint path if model has one - if hasattr(model, "checkpoint_path"): - model_checkpoint = model.checkpoint_path - elif hasattr(model, "checkpoint"): - model_checkpoint = model.checkpoint - - # Try to extract initialization kwargs if available - if model_kwargs is None and hasattr(model, "_init_kwargs"): - model_kwargs = model._init_kwargs - - else: - # Must provide explicit construction info - if model_package is None or model_factory is None: - raise ValueError( - "If model is not provided, must specify both " - "model_package and model_factory" - ) - - if model_kwargs is None: - model_kwargs = {} - - return { - "model_package": model_package, - "model_factory": model_factory, - "model_kwargs": model_kwargs, - "model_checkpoint": model_checkpoint, - } - - def _run_local_wrapper( - self, runner_func_path: str, runner_func_name: str, config: Dict[str, Any] - ) -> Dict[str, Any]: - """ - Execute a benchmark runner function 
locally in a subprocess. - - Args: - runner_func_path: Import path to the runner function (e.g. 'garden_ai.benchmarks.matbench_discovery.remote_runner') - runner_func_name: Name of the runner function (e.g. 'run_matbench_is2re') - config: Configuration dictionary to pass to the runner function. - """ - results_file_path = ( - Path(tempfile.gettempdir()) / f"benchmark_results_{id(config)}.json" - ) - - wrapper_script = f''' -import json -from {runner_func_path} import {runner_func_name} - -config = {repr(config)} -results = {runner_func_name}(**config) - -with open("{results_file_path}", "w") as f: - json.dump(results, f, indent=2) -''' - - with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: - f.write(wrapper_script) - wrapper_path = f.name - - try: - # Run without capturing output so logs stream to console in real-time - result = subprocess.run( - [sys.executable, wrapper_path], - timeout=3600, - stdout=None, - stderr=None, - ) - - if result.returncode != 0: - raise RuntimeError( - f"Local benchmark failed with return code {result.returncode}" - ) - - if not results_file_path.exists(): - raise RuntimeError( - f"Benchmark results file not found at {results_file_path}" - ) - - with open(results_file_path) as f: - return json.load(f) - - finally: - Path(wrapper_path).unlink(missing_ok=True) - results_file_path.unlink(missing_ok=True) From b12f922551ab173b139463b21072106d5f97ecbe Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 09:15:08 -0700 Subject: [PATCH 10/23] fix checkpoint bug, clean up examples --- .../examples/local_execution.py | 31 +++ .../examples/matbench_equiformerv2.py | 84 +------ .../examples/matbench_mace_multi_gpu.py | 64 +++--- .../examples/matbench_mattersim.py | 67 +----- .../examples/matbench_sevennet.py | 68 +----- .../benchmarks/matbench_discovery/tasks.py | 211 +++++++++++++++++- garden_ai/benchmarks/utils/meta_metrics.py | 2 + 7 files changed, 292 insertions(+), 235 deletions(-) create mode 100644 
garden_ai/benchmarks/matbench_discovery/examples/local_execution.py diff --git a/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py b/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py new file mode 100644 index 00000000..6414f5cf --- /dev/null +++ b/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +"""Matbench Discovery Benchmark - Local Execution Example""" + +from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery + + +def create_mattersim_model(device): + from mattersim.forcefield import MatterSimCalculator + + return MatterSimCalculator(device=device) + + +def main(): + print("Running MatterSim benchmark locally...") + + # Run IS2RE task locally + # Note: Requires a GPU or MPS if using MatterSim, or CPU if specified/supported + output = MatbenchDiscovery.IS2RE.local( + model_factory=create_mattersim_model, + model_packages="mattersim", + num_structures="random_100", + ) + + if "error" in output.get("metrics", {}): + print(f"Error: {output['metrics']['error']}") + else: + print("Benchmark Results:", output.get("metrics")) + + +if __name__ == "__main__": + main() diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py index 7855f825..e877f230 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py @@ -1,39 +1,13 @@ #!/usr/bin/env python3 -""" -Matbench Discovery Benchmark - EquiformerV2 Example - -EquiformerV2 is an improved equivariant transformer from FAIR-Chem (formerly OCP). -Paper: https://arxiv.org/abs/2306.12059 -GitHub: https://github.com/Open-Catalyst-Project/ocp - -Note: This example uses the S2EFS task (Structure to Energy, Forces, Stress) -instead of IS2RE because EquiformerV2 doesn't support geometry relaxation. 
-""" +"""Matbench Discovery Benchmark - EquiformerV2 Example""" from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery -# ============================================================================= -# Configuration -# ============================================================================= - # Globus Compute endpoint ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# ============================================================================= -# Model Factory -# ============================================================================= - - def create_equiformerv2_model(device): - """Create EquiformerV2 model calculator. - - Args: - device: Device to load model on ("cuda" or "cpu") - - Returns: - ASE calculator for EquiformerV2 - """ from fairchem.core.calculate.ase_calculator import Calculator # Use pre-trained checkpoint - will auto-download from HuggingFace @@ -42,25 +16,15 @@ def create_equiformerv2_model(device): ) -# ============================================================================= -# Run Benchmark -# ============================================================================= - - def main(): - """Run Matbench Discovery S2EFS benchmark with EquiformerV2.""" - - print("=" * 80) - print("Matbench Discovery S2EFS Benchmark - EquiformerV2") - print("=" * 80) + print(f"Running EquiformerV2 benchmark on endpoint {ENDPOINT_ID}...") - # Run S2EFS task using the new groundhog API - # S2EFS is suitable for EquiformerV2 which doesn't support relaxation + # Run S2EFS task (structure to energy/forces/stress) output = MatbenchDiscovery.S2EFS.remote( endpoint=ENDPOINT_ID, user_endpoint_config={ "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n", - "walltime": 7200, # 2 hours in seconds + "walltime": 7200, "qos": "gpu", "partition": "gpu-debug", "account": "cis250461-gpu", @@ -73,44 +37,10 @@ def main(): num_structures="random_100", ) - # Display metrics - print() - print("=" * 80) - print("Benchmark 
Results") - print("=" * 80) - - metrics = output.get("metrics", {}) - if "error" in metrics: - print(f"Error: {metrics['error']}") + if "error" in output.get("metrics", {}): + print(f"Error: {output['metrics']['error']}") else: - # Energy metrics - if "energy_mae" in metrics: - print("Energy Metrics:") - print(f" MAE (eV/atom): {metrics.get('energy_mae', 'N/A'):.6f}") - print(f" RMSE (eV/atom): {metrics.get('energy_rmse', 'N/A'):.6f}") - print(f" R²: {metrics.get('energy_r2', 'N/A'):.6f}") - print() - - # Force metrics - if "force_mae" in metrics: - print("Force Metrics:") - print(f" MAE (eV/Å): {metrics.get('force_mae', 'N/A'):.6f}") - print(f" RMSE (eV/Å): {metrics.get('force_rmse', 'N/A'):.6f}") - print(f" R²: {metrics.get('force_r2', 'N/A'):.6f}") - print() - - # Stress metrics - if "stress_mae" in metrics: - print("Stress Metrics:") - print(f" MAE (GPa): {metrics.get('stress_mae', 'N/A'):.6f}") - print(f" RMSE (GPa): {metrics.get('stress_rmse', 'N/A'):.6f}") - print(f" R²: {metrics.get('stress_r2', 'N/A'):.6f}") - print() - - if "num_evaluated" in metrics: - print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") - - print("=" * 80) + print("Benchmark Results:", output.get("metrics")) if __name__ == "__main__": diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py index 354ca456..4fadc7f5 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py @@ -1,10 +1,5 @@ -"""Test Matbench Discovery benchmark on Anvil HPC. - -This script demonstrates running the IS2RE benchmark with a subset of structures -using multi-GPU parallelization on a Globus Compute endpoint. 
-""" - -from rich import print +#!/usr/bin/env python3 +"""Matbench Discovery Benchmark - MACE Multi-GPU Example""" from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery @@ -17,25 +12,36 @@ def create_mace_model(device): return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64") -results = MatbenchDiscovery.IS2RE.remote( - endpoint=ANVIL, - user_endpoint_config={ - "scheduler_options": "#SBATCH --gpus-per-node=4\n", - "walltime": "05:00:00", - "qos": "gpu", - "partition": "gpu", - "account": "cis250461-gpu", - "cores_per_node": 16, - "requirements": "", # 'requirements' is required for Anvil endpoint - }, - model_factory=create_mace_model, - model_packages=[ - "mace-torch", - "cuequivariance", - "cuequivariance-torch", - "cuequivariance-ops-torch-cu12", - ], - checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json", -) - -print(results) +def main(): + print(f"Running MACE benchmark on endpoint {ANVIL}...") + + results = MatbenchDiscovery.IS2RE.remote( + endpoint=ANVIL, + user_endpoint_config={ + "scheduler_options": "#SBATCH --gpus-per-node=4\n", + "walltime": "05:00:00", + "qos": "gpu", + "partition": "gpu", + "account": "cis250461-gpu", + "cores_per_node": 16, + "requirements": "", # 'requirements' is required for Anvil endpoint + }, + model_factory=create_mace_model, + model_packages=[ + "mace-torch", + "cuequivariance", + "cuequivariance-torch", + "cuequivariance-ops-torch-cu12", + ], + checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json", + num_structures="random_100", + ) + + if "error" in results.get("metrics", {}): + print(f"Error: {results['metrics']['error']}") + else: + print("Benchmark Results:", results.get("metrics")) + + +if __name__ == "__main__": + main() diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py 
b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py index 22099e9e..8a7636ba 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py @@ -1,59 +1,27 @@ #!/usr/bin/env python3 -""" -Matbench Discovery Benchmark - MatterSim Example - -MatterSim is a deep learning atomistic model for general material simulations. -Paper: https://arxiv.org/abs/2405.04967 -GitHub: https://github.com/microsoft/mattersim -""" +"""Matbench Discovery Benchmark - MatterSim Example""" from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery -# ============================================================================= -# Configuration -# ============================================================================= - # Globus Compute endpoint ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# ============================================================================= -# Model Factory -# ============================================================================= - - def create_mattersim_model(device): - """Create MatterSim model calculator. 
- - Args: - device: Device to load model on ("cuda" or "cpu") - - Returns: - ASE calculator for MatterSim - """ from mattersim.forcefield import MatterSimCalculator return MatterSimCalculator(device=device) -# ============================================================================= -# Run Benchmark -# ============================================================================= - - def main(): - """Run Matbench Discovery IS2RE benchmark with MatterSim.""" - - print("=" * 80) - print("Matbench Discovery IS2RE Benchmark - MatterSim") - print("=" * 80) + print(f"Running MatterSim benchmark on endpoint {ENDPOINT_ID}...") - # Run IS2RE task using the new groundhog API + # Run IS2RE task output = MatbenchDiscovery.IS2RE.remote( endpoint=ENDPOINT_ID, user_endpoint_config={ "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n", - "walltime": 7200, # 2 hours in seconds + "walltime": 7200, "qos": "gpu", "partition": "gpu-debug", "account": "cis250461-gpu", @@ -66,31 +34,10 @@ def main(): num_structures="random_100", ) - # Display metrics - print() - print("=" * 80) - print("Benchmark Results") - print("=" * 80) - - metrics = output.get("metrics", {}) - if "error" in metrics: - print(f"Error: {metrics['error']}") + if "error" in output.get("metrics", {}): + print(f"Error: {output['metrics']['error']}") else: - # Discovery metrics - print(f"F1 Score: {metrics.get('F1', 'N/A'):.6f}") - print(f"DAF: {metrics.get('DAF', 'N/A'):.2f}x") - print(f"Precision: {metrics.get('Precision', 'N/A'):.6f}") - print(f"Recall: {metrics.get('Recall', 'N/A'):.6f}") - print(f"Accuracy: {metrics.get('Accuracy', 'N/A'):.6f}") - print() - # Regression metrics - print(f"MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") - print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") - print(f"R²: {metrics.get('R2', 'N/A'):.6f}") - print() - print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") - - print("=" * 80) + print("Benchmark Results:", output.get("metrics")) if 
__name__ == "__main__": diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py index e24d0d69..411c64e1 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py @@ -1,60 +1,27 @@ #!/usr/bin/env python3 -""" -Matbench Discovery Benchmark - SevenNet Example - -This script demonstrates running the Matbench Discovery IS2RE benchmark -using SevenNet as the MLIP model on a remote Globus Compute endpoint. - -SevenNet is a graph neural network potential with good transferability. -""" +"""Matbench Discovery Benchmark - SevenNet Example""" from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery -# ============================================================================= -# Configuration -# ============================================================================= - # Globus Compute endpoint (replace with your endpoint UUID) ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# ============================================================================= -# Model Factory -# ============================================================================= - - def create_sevennet_model(device): - """Create SevenNet model calculator. 
- - Args: - device: Device to load model on ("cuda" or "cpu") - - Returns: - ASE calculator for SevenNet - """ from sevenn.calculator import SevenNetCalculator return SevenNetCalculator(model="7net-0", device=device) -# ============================================================================= -# Run Benchmark -# ============================================================================= - - def main(): - """Run Matbench Discovery IS2RE benchmark with SevenNet.""" + print(f"Running SevenNet benchmark on endpoint {ENDPOINT_ID}...") - print("=" * 80) - print("Matbench Discovery IS2RE Benchmark - SevenNet") - print("=" * 80) - - # Run IS2RE task using the new groundhog API + # Run IS2RE task output = MatbenchDiscovery.IS2RE.remote( endpoint=ENDPOINT_ID, user_endpoint_config={ "scheduler_options": "#SBATCH --gpus-per-node=2\n", - "walltime": 7200, # 2 hours in seconds + "walltime": 7200, "qos": "gpu", "partition": "gpu-debug", "account": "cis250461-gpu", @@ -67,31 +34,10 @@ def main(): num_structures="random_100", ) - # Display metrics - print() - print("=" * 80) - print("Benchmark Results") - print("=" * 80) - - metrics = output.get("metrics", {}) - if "error" in metrics: - print(f"Error: {metrics['error']}") + if "error" in output.get("metrics", {}): + print(f"Error: {output['metrics']['error']}") else: - # Discovery metrics - print(f"F1 Score: {metrics.get('F1', 'N/A'):.6f}") - print(f"DAF: {metrics.get('DAF', 'N/A'):.2f}x") - print(f"Precision: {metrics.get('Precision', 'N/A'):.6f}") - print(f"Recall: {metrics.get('Recall', 'N/A'):.6f}") - print(f"Accuracy: {metrics.get('Accuracy', 'N/A'):.6f}") - print() - # Regression metrics - print(f"MAE (eV/atom): {metrics.get('MAE', 'N/A'):.6f}") - print(f"RMSE (eV/atom): {metrics.get('RMSE', 'N/A'):.6f}") - print(f"R²: {metrics.get('R2', 'N/A'):.6f}") - print() - print(f"Structures: {metrics.get('num_evaluated', 'N/A')}") - - print("=" * 80) + print("Benchmark Results:", output.get("metrics")) if __name__ == "__main__": 
diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index d087c06a..9d59b708 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -1,3 +1,16 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "groundhog-hpc", +# "ase", +# "numpy", +# "pandas", +# "scikit-learn", +# "torch", +# "matbench-discovery", +# ] +# /// + from __future__ import annotations import concurrent.futures @@ -7,17 +20,48 @@ import os import sys import time -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional +from enum import Enum +from typing import Any, Callable, Dict, List, Optional, Sequence import groundhog_hpc as hog import numpy as np import pandas as pd from sklearn.metrics import r2_score -from .metrics import stable_metrics -if TYPE_CHECKING: - from .enums import DatasetConfig, DatasetSize +class DatasetSize(str, Enum): + """Predefined dataset sizes for Matbench Discovery benchmarks. + + These correspond to different subsets of the WBM test set that are commonly + used for evaluating materials discovery models. 
+ """ + + FULL = "full" + """Full WBM test set (~257k structures)""" + + UNIQUE_PROTOS = "unique_protos" + """Unique prototypes subset (~215k structures) - removes duplicate prototypes""" + + RANDOM_10K = "random_10k" + """Random 10k structures from the unique prototypes subset (fixed seed)""" + + RANDOM_100 = "random_100" + """Random 100 structures for quick testing (fixed seed)""" + + def seed(self, seed: int) -> "DatasetConfig": + """Return a configuration with a custom random seed.""" + return DatasetConfig(self, seed) + + +class DatasetConfig: + """Configuration for a dataset subset with a specific random seed.""" + + def __init__(self, subset: DatasetSize, seed: int): + self.subset = subset + self.seed = seed + + def __repr__(self): + return f"{self.subset.name}(seed={self.seed})" def setup_logging(): @@ -47,7 +91,6 @@ def setup_device(gpu_id: Optional[int] = None) -> str: def convert_numpy_types(obj): """Convert numpy types to Python native types for JSON serialization.""" - if isinstance(obj, (np.integer, np.floating)): return obj.item() elif isinstance(obj, np.ndarray): @@ -87,6 +130,124 @@ def _get_meta_metrics_source() -> str: _MODEL_CACHE = None +# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/tree/main/matbench_discovery/metrics +# Since they aren't setup to be easily imported, we just copy them here +def classify_stable( + each_true: Sequence[float] | pd.Series | np.ndarray, + each_pred: Sequence[float] | pd.Series | np.ndarray, + *, + stability_threshold: float = 0.0, + fillna: bool = True, +) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]: + if len(each_true) != len(each_pred): + raise ValueError(f"{len(each_true)=} != {len(each_pred)=}") + + each_true_arr, each_pred_arr = pd.Series(each_true), pd.Series(each_pred) + + if stability_threshold is None or np.isnan(stability_threshold): + raise ValueError("stability_threshold must be a real number") + actual_pos = each_true_arr <= (stability_threshold or 0) + 
actual_neg = each_true_arr > (stability_threshold or 0) + + model_pos = each_pred_arr <= (stability_threshold or 0) + model_neg = each_pred_arr > (stability_threshold or 0) + + if fillna: + nan_mask = np.isnan(each_pred) + model_pos[nan_mask] = False + model_neg[nan_mask] = True + + n_pos, n_neg, total = model_pos.sum(), model_neg.sum(), len(each_pred) + if n_pos + n_neg != total: + raise ValueError( + f"after filling NaNs, the sum of positive ({n_pos}) and negative " + f"({n_neg}) predictions should add up to {total=}" + ) + + true_pos = actual_pos & model_pos + false_neg = actual_pos & model_neg + false_pos = actual_neg & model_pos + true_neg = actual_neg & model_neg + + return true_pos, false_neg, false_pos, true_neg + + +# This is also coptied from the matbench-discovery repo +def stable_metrics( + each_true: Sequence[float] | pd.Series | np.ndarray, + each_pred: Sequence[float] | pd.Series | np.ndarray, + *, + stability_threshold: float = 0.0, + fillna: bool = True, + prevalence: float | None = None, +) -> dict[str, float]: + n_true_pos, n_false_neg, n_false_pos, n_true_neg = map( + sum, + classify_stable( + each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna + ), + ) + + n_total_pos = n_true_pos + n_false_neg + n_total_neg = n_true_neg + n_false_pos + if prevalence is None: + prevalence = ( + n_total_pos / (n_total_pos + n_total_neg) + if (n_total_pos + n_total_neg) > 0 + else float("nan") + ) + precision = ( + n_true_pos / (n_true_pos + n_false_pos) + if (n_true_pos + n_false_pos) > 0 + else float("nan") + ) + recall = n_true_pos / n_total_pos if n_total_pos > 0 else float("nan") + + TPR = recall + FPR = n_false_pos / n_total_neg if n_total_neg > 0 else float("nan") + TNR = n_true_neg / n_total_neg if n_total_neg > 0 else float("nan") + FNR = n_false_neg / n_total_pos if n_total_pos > 0 else float("nan") + + if FPR > 0 and TNR > 0 and FPR + TNR != 1: + if abs(FPR + TNR - 1) > 1e-6: + raise ValueError(f"{FPR=} {TNR=} don't add up to 
1") + + if TPR > 0 and FNR > 0 and TPR + FNR != 1: + if abs(TPR + FNR - 1) > 1e-6: + raise ValueError(f"{TPR=} {FNR=} don't add up to 1") + + is_nan = np.isnan(each_true) | np.isnan(each_pred) + each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan] + + if precision + recall == 0: + f1_score = float("nan") + else: + f1_score = 2 * (precision * recall) / (precision + recall) + + return dict( + F1=f1_score, + DAF=precision / prevalence if prevalence > 0 else float("nan"), + Precision=precision, + Recall=recall, + Accuracy=( + (n_true_pos + n_true_neg) / (n_total_pos + n_total_neg) + if (n_total_pos + n_total_neg > 0) + else float("nan") + ), + TPR=TPR, + FPR=FPR, + TNR=TNR, + FNR=FNR, + TP=n_true_pos, + FP=n_false_pos, + TN=n_true_neg, + FN=n_false_neg, + MAE=np.abs(each_true - each_pred).mean(), + RMSE=((each_true - each_pred) ** 2).mean() ** 0.5, + R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"), + ) + + def _process_batch_common( batch_id: int, structures: List[Any], @@ -584,7 +745,12 @@ def run_benchmark_hog( try: import torch - num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + num_gpus = 1 + else: + num_gpus = 0 except ImportError: num_gpus = 0 @@ -599,6 +765,10 @@ def run_benchmark_hog( available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores threads_per_worker = max(1, available_cores // num_workers) + # MPS (Apple Silicon) performance degrades with high thread counts due to contention + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + threads_per_worker = 1 + logger.info( f"Resources: {num_gpus} GPUs, {total_cores} Cores. 
Using {num_workers} workers ({threads_per_worker} threads/worker)" ) @@ -665,6 +835,10 @@ def run_benchmark_hog( logger.info(f"Checkpoint saved to {checkpoint_path}") except Exception as e: logger.error(f"Failed to save checkpoint: {e}") + raise RuntimeError( + f"Critical: Failed to save checkpoint to {checkpoint_path}. " + f"Aborting to prevent loss of progress. Error: {e}" + ) from e elapsed = time.time() - chunk_start logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s") @@ -963,8 +1137,17 @@ def _run_task( ) if checkpoint_path: - print(f"Resuming from checkpoint: {checkpoint_path}") - final_checkpoint_path = checkpoint_path + # Always expand tilde to home directory + final_checkpoint_path = os.path.expanduser(checkpoint_path) + if os.path.exists(final_checkpoint_path): + print(f"Resuming from checkpoint: {final_checkpoint_path}") + else: + print( + f"WARNING: Checkpoint file not found at {final_checkpoint_path}. " + f"Starting fresh and will save checkpoints to this path." + ) + # Ensure directory exists for new checkpoint + os.makedirs(os.path.dirname(final_checkpoint_path), exist_ok=True) else: print( f"Checkpoint will be saved to: ~/.garden/benchmarks/{checkpoint_name}" @@ -974,6 +1157,18 @@ def _run_task( ) os.makedirs(os.path.dirname(final_checkpoint_path), exist_ok=True) + # Validate we can write to the checkpoint path early to fail fast + try: + test_file = final_checkpoint_path + ".write_test" + with open(test_file, "w") as f: + f.write("test") + os.remove(test_file) + except Exception as e: + raise RuntimeError( + f"Cannot write to checkpoint path: {final_checkpoint_path}. " + f"Check permissions and disk space. 
Error: {e}" + ) from e + runner_config["checkpoint_path"] = final_checkpoint_path # meta_metrics_source is injected by BenchmarkMethod wrapper diff --git a/garden_ai/benchmarks/utils/meta_metrics.py b/garden_ai/benchmarks/utils/meta_metrics.py index e18120d4..481a2b35 100644 --- a/garden_ai/benchmarks/utils/meta_metrics.py +++ b/garden_ai/benchmarks/utils/meta_metrics.py @@ -60,6 +60,8 @@ def get_hardware_info() -> Dict[str, Any]: info["gpu_memory_gb"] = round(props.total_memory / (1024**3), 1) elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): info["device_type"] = "mps" + info["num_gpus"] = 1 + info["gpu_names"] = ["Apple Metal Performance Shaders"] except ImportError: pass return info From 5192800ed0c91158c84812c7e098d36400e31f36 Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 09:24:41 -0700 Subject: [PATCH 11/23] bump groundhog version --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6b3d1706..dec7f17a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ dependencies = [ # used transitively by modal -> grpclib, force 4.3.0 to reslove CVE-2025-57804 # Can remove once we upgrade to more current modal sdk version "h2>=4.3.0", - "groundhog-hpc>=0.5.0", + "groundhog-hpc>=0.5.6", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index 0d176456..6a2e9a3e 100644 --- a/uv.lock +++ b/uv.lock @@ -1098,7 +1098,7 @@ requires-dist = [ { name = "gitpython", specifier = ">=3.1.35,<4.0.0" }, { name = "globus-compute-sdk", specifier = ">=4.0.0" }, { name = "globus-sdk", specifier = ">=3.34.0,<4.0.0" }, - { name = "groundhog-hpc", specifier = ">=0.5.0" }, + { name = "groundhog-hpc", specifier = ">=0.5.6" }, { name = "h2", specifier = ">=4.3.0" }, { name = "huggingface-hub", specifier = "==0.18.0" }, { name = "ipython", specifier = "<8.13" }, @@ -1251,7 +1251,7 @@ wheels = [ [[package]] name = "groundhog-hpc" -version = 
"0.5.4" +version = "0.5.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "globus-compute-sdk" }, @@ -1265,9 +1265,9 @@ dependencies = [ { name = "typer" }, { name = "uv" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ea/4a/79c3bef59e0e4e538875949cec290d26cf5cefd601135e190723f9fc89de/groundhog_hpc-0.5.4.tar.gz", hash = "sha256:1f9ef486a6b62a3f28168689425b9b838c1abe92d76291a392299c70a4f5a0ec", size = 31705, upload-time = "2025-11-06T23:31:08.795Z" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/e7/adf855aaded946d2cff12851320c7b53114fed40d4b833efaf0081bc3aea/groundhog_hpc-0.5.6.tar.gz", hash = "sha256:cc5a25c0dfc6a0ddc641e631cc7dae1466e81b8f24984f102eb8300cf6340b42", size = 32346, upload-time = "2025-12-09T18:49:49.554Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/c4/abece517b27357edc102891244233e68a12de57c0cee7bd4a404ad86bb74/groundhog_hpc-0.5.4-py3-none-any.whl", hash = "sha256:287c91211f2d64fb89b84b3be0cd26611cd3c1a29c76b83efe1219f3ad8fc53f", size = 44364, upload-time = "2025-11-06T23:31:07.512Z" }, + { url = "https://files.pythonhosted.org/packages/7c/13/702590a7f6064c01609379225c678b42e6c1a56e72e85a8d14a23ec9213a/groundhog_hpc-0.5.6-py3-none-any.whl", hash = "sha256:d6347031c1f779e24379fd9619ca59dc2dce8df521fcd0c6cb51b00a7e807eab", size = 45086, upload-time = "2025-12-09T18:49:50.346Z" }, ] [[package]] From 060865ca8ddc7b0b80d9c50992731385b43ba880 Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 13:17:24 -0700 Subject: [PATCH 12/23] calculate metrics if the given checkpoint file has finished all of the materials --- .../benchmarks/matbench_discovery/tasks.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index 9d59b708..c4d5e0e3 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ 
b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -724,7 +724,21 @@ def run_benchmark_hog( ] if not items_to_process: - logger.info("All items already processed!") + logger.info( + "All items already processed! Calculating metrics from checkpoint..." + ) + + # Calculate metrics from checkpoint results + try: + metrics = calc_metrics_fn(results, config) + logger.info(f"Metrics calculated: {metrics}") + except Exception as e: + logger.error(f"Failed to calculate metrics: {e}") + import traceback + + traceback.print_exc() + metrics = {"error": f"Metrics calculation failed: {e}"} + run_metadata = calculate_run_metadata( hardware_info=hardware_info, model_info=model_info, @@ -733,7 +747,7 @@ def run_benchmark_hog( num_structures_total=len(all_items), num_structures_processed=0, ) - return {"metrics": {}, "run_metadata": run_metadata} + return {"metrics": metrics, "run_metadata": run_metadata} logger.info(f"Processing {len(items_to_process)} remaining items") From 8eeefb297aca329a8ffb566107499d7f3ecdae82 Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 13:43:08 -0700 Subject: [PATCH 13/23] remote unused files --- .../benchmarks/matbench_discovery/README.md | 285 ------------------ .../examples/matbench_mace_multi_gpu.py | 1 - .../benchmarks/matbench_discovery/metrics.py | 193 ------------ 3 files changed, 479 deletions(-) delete mode 100644 garden_ai/benchmarks/matbench_discovery/README.md delete mode 100644 garden_ai/benchmarks/matbench_discovery/metrics.py diff --git a/garden_ai/benchmarks/matbench_discovery/README.md b/garden_ai/benchmarks/matbench_discovery/README.md deleted file mode 100644 index 4273cd02..00000000 --- a/garden_ai/benchmarks/matbench_discovery/README.md +++ /dev/null @@ -1,285 +0,0 @@ -# Matbench Discovery Benchmark Adapter - -Minimal viable implementation for running [Matbench Discovery](https://matbench-discovery.materialsproject.org/) benchmarks on remote HPC systems via Globus Compute. 
- -## Overview - -This adapter enables Garden AI users to benchmark their materials models against the Matbench Discovery test suite without manually managing HPC jobs, environment setup, or data transfers. - -### Current Status: MVP - -**Implemented:** -- ✅ IS2RE (Initial Structure to Relaxed Energy) task -- ✅ Remote environment setup with UV -- ✅ Automatic dependency installation -- ✅ Basic metric calculation -- ✅ Multi-GPU parallelization (automatic GPU detection and work distribution) - -**Future Work:** -- ⏳ Additional tasks (RS2RE, S2EFS, thermal conductivity) -- ⏳ Globus Transfer for model weights and large datasets -- ⏳ Checkpointing and failure recovery -- ⏳ Full metric calculation against DFT ground truth -- ⏳ Backend integration for result publishing - -## Architecture - -``` -User's Machine Remote HPC Endpoint -├─ MatbenchDiscovery ├─ Clone matbench-discovery repo -│ ├─ tasks.IS2RE │ ├─ Set up UV virtual environment -│ └─ Globus Compute Executor ───┼─>├─ Install dependencies - │ │ ├─ matbench-discovery - │ │ └─ model package (e.g., mace-torch) - │ ├─ Load test structures via DataFiles - │ ├─ Run structure relaxations - │ ├─ Calculate metrics - │ └─ Return results -``` - -## File Structure - -``` -matbench_discovery/ -├── __init__.py # Main adapter class (MatbenchDiscovery) -├── tasks.py # Task implementations (IS2RETask) -├── remote_runner.py # Remote execution functions -├── enums.py # Task enumerations -├── example.py # Usage example -└── README.md # This file -``` - -## Usage - -### Basic Example - -```python -from garden_ai.benchmarks import MatbenchDiscovery -from my_model import MyModel - -# Configure endpoint -endpoint_id = "your-endpoint-uuid" -endpoint_config = { - "account": "project-account", - "partition": "gpu-debug", - "scheduler_options": "#SBATCH --gpus-per-node=1" -} - -# Run benchmark -with MatbenchDiscovery(endpoint_id, endpoint_config) as bench: - model = MyModel() - task = bench.tasks.IS2RE - - # Submit job (returns immediately) - 
future = task.submit(model, num_structures=100) - - # Wait for completion - results = future.result() - - # Calculate metrics - metrics = task.calculate_metrics(results) - print(metrics) -``` - -### Multi-GPU Parallelization - -The adapter automatically detects and uses all available GPUs on the compute node for parallel processing. This significantly improves throughput for large-scale benchmarks. - -**Example: 4-GPU Configuration on Anvil** - -```python -from garden_ai.benchmarks import MatbenchDiscovery - -endpoint_id = "your-endpoint-uuid" -endpoint_config = { - "account": "your-account", - "qos": "gpu", - "partition": "gpu", - "scheduler_options": "#SBATCH --gpus-per-node=4\n#SBATCH --time=4:00:00\n#SBATCH --mem=64G", - "worker_init": "pip install --user uv", -} - -with MatbenchDiscovery(endpoint_id, endpoint_config) as bench: - task = bench.tasks.IS2RE - - # Multi-GPU is enabled by default - future = task.submit( - model_package="mace-torch", - model_factory="mace_mp", - model_kwargs={"model": "medium", "device": "cuda"}, - num_structures=1000, - use_multi_gpu=True, # Default: True - ) - - results = future.result() - metrics = task.calculate_metrics(results) -``` - -**How it works:** -1. Automatically detects available GPUs using `torch.cuda.device_count()` -2. Splits structures into equal batches (one per GPU) -3. Processes batches in parallel using multiprocessing -4. Aggregates results from all workers - -**Performance expectations:** -- **Single GPU**: ~10-20 structures/hour (baseline) -- **4 GPUs**: ~3-4x speedup (~40-80 structures/hour) -- Actual performance depends on model complexity and structure size - -**Disabling multi-GPU:** -```python -future = task.submit( - model_package="mace-torch", - model_factory="mace_mp", - model_kwargs={"model": "medium", "device": "cuda"}, - num_structures=100, - use_multi_gpu=False, # Use single GPU/CPU -) -``` - -### Scaling Guide - -**Recommended test progression:** - -1. 
**Small test (10-100 structures)**: Verify setup and model compatibility - - Partition: `gpu-debug` - - Time: 30 minutes - - GPUs: 1-4 - -2. **Medium test (1000 structures)**: Test multi-GPU parallelization - - Partition: `gpu` - - Time: 4 hours - - GPUs: 4 - - Expected throughput: ~250-300 structures/hour with 4 GPUs - -3. **Full dataset (~257k structures)**: Production run - - Partition: `gpu` - - Time: 48+ hours - - GPUs: 4 - - Consider implementing checkpointing for runs >24 hours - -### Model Requirements - -For the MVP, models must: - -1. **Be pip-installable** (or provide package name) -2. **Implement ASE calculator interface** (or be convertible to one) -3. **Have a checkpoint file** (optional, can be None for models with default weights) - -Example model: - -```python -class MyModel: - def __init__(self): - self.checkpoint_path = "/path/to/checkpoint.pt" - - # ASE calculator interface - def calculate(self, atoms, properties, system_changes): - # Calculate energy, forces, stress - ... -``` - -### Workflow Details - -When you call `task.submit(model)`: - -1. **Model introspection**: Extracts model class name, module, and checkpoint path -2. **Remote submission**: Sends job to Globus Compute endpoint -3. **Environment setup** (on remote): - - Clones matbench-discovery repository - - Creates Python 3.11 virtual environment with UV - - Installs matbench-discovery package - - Installs model package (e.g., `pip install mace-torch`) -4. **Benchmark execution**: - - Loads test structures using `DataFiles.wbm_initial_structures` - - Instantiates model and loads checkpoint - - Runs geometry optimizations (ASE FIRE optimizer) - - Collects results -5. 
**Result return**: Returns energies, convergence stats, and failures - -## Configuration Options - -### MatbenchDiscovery - -```python -MatbenchDiscovery( - endpoint_id="uuid", # Required: Globus Compute endpoint - user_endpoint_config=dict, # Optional: HPC scheduler config - repo_ref="main", # Optional: Git ref to use - model_package="mace-torch" # Optional: Default model package -) -``` - -### IS2RETask.submit() - -```python -task.submit( - model, # Required: Model instance - num_structures=100, # Optional: Number of structures to test - model_package="mace-torch", # Optional: Override default package - use_multi_gpu=True, # Optional: Enable multi-GPU (default: True) -) -``` - -## Design Decisions - -### Why UV? -- Fast, deterministic installs -- Handles both `pyproject.toml` and `requirements.txt` -- Built-in venv creation with specific Python versions - -### Why DataFiles auto-download? -- Avoids manual Globus Transfer setup for MVP -- Matbench's DataFiles handles caching automatically -- Can optimize with explicit transfer later - -### Why ASE calculator interface? -- Standard in materials modeling community -- Most interatomic potentials support it (MACE, M3GNet, CHGNet, etc.) -- Simple adaptation layer if needed - -### Why multiprocessing for multi-GPU? -- Simple and effective for within-node parallelization -- Avoids CUDA initialization issues with fork -- Each GPU gets isolated process with dedicated memory -- Easy to debug and monitor per-GPU progress - -## Limitations - -1. **No weight transfer**: Model checkpoints must be accessible from remote (URL or shared filesystem) -2. **Basic metrics**: Only reports convergence stats, not comparison to DFT ground truth -3. **IS2RE only**: Other tasks not yet implemented -4. **No checkpointing**: If job fails, must restart from scratch (recommended for runs >24 hours) -5. **No result publishing**: Backend integration not yet implemented -6. 
**Single-node parallelization**: Multi-GPU works within a node; SLURM array jobs for multi-node not yet implemented - -## Next Steps - -To generalize beyond Matbench: - -1. **Extract base classes**: `BenchmarkAdapter`, `BenchmarkTask`, `RemoteRunner` -2. **Add data staging**: Implement Globus Transfer for weights/datasets -3. **Define model interface**: Standard protocol for model serialization -4. **Add checkpointing**: Save intermediate results for failure recovery -5. **Implement batching**: Distribute work across SLURM array jobs - -## Testing - -```bash -# Install dependencies -cd garden_ai/benchmarks/matbench_discovery -pip install -e . - -# Update example.py with your endpoint details -vim example.py - -# Run example -python example.py -``` - -## References - -- [Matbench Discovery](https://matbench-discovery.materialsproject.org/) -- [Matbench Discovery GitHub](https://github.com/janosh/matbench-discovery) -- [Globus Compute](https://globus-compute.readthedocs.io/) -- [ASE Calculator Interface](https://wiki.fysik.dtu.dk/ase/ase/calculators/calculators.html) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py index 4fadc7f5..e0fb0003 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py @@ -34,7 +34,6 @@ def main(): "cuequivariance-ops-torch-cu12", ], checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json", - num_structures="random_100", ) if "error" in results.get("metrics", {}): diff --git a/garden_ai/benchmarks/matbench_discovery/metrics.py b/garden_ai/benchmarks/matbench_discovery/metrics.py deleted file mode 100644 index c08bad2d..00000000 --- a/garden_ai/benchmarks/matbench_discovery/metrics.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Functions to classify energy above convex 
hull predictions as true/false -positive/negative and compute performance metrics. - -Adapted from matbench-discovery to avoid import issues. -Original source: https://github.com/janosh/matbench-discovery/blob/main/matbench_discovery/metrics/discovery.py -""" - -from collections.abc import Sequence - -import numpy as np -import pandas as pd -from sklearn.metrics import r2_score - -# Default stability threshold from matbench-discovery -# STABILITY_THRESHOLD = 0.0 - - -def classify_stable( - each_true: Sequence[float] | pd.Series | np.ndarray, - each_pred: Sequence[float] | pd.Series | np.ndarray, - *, - stability_threshold: float = 0.0, - fillna: bool = True, -) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]: - """Classify model stability predictions as true/false positive/negatives (usually - w.r.t DFT-ground truth labels). All energies are assumed to be in eV/atom - (but shouldn't really matter as long as they're consistent). - - Args: - each_true (Sequence[float] | pd.Series): Ground truth energy above convex hull - values. - each_pred (Sequence[float] | pd.Series): Model-predicted energy above convex - hull values. - stability_threshold (float, optional): Maximum energy above convex hull - for a material to still be considered stable. Usually 0, 0.05 or 0.1. - Defaults to 0.0, meaning a material has to be directly on - the hull to be called stable. Negative values mean a material has to pull - the known hull down by that amount to count as stable. Few materials lie - below the known hull, so only negative values very close to 0 make sense. - fillna (bool): Whether to fill NaNs as the model predicting unstable. Defaults - to True. - - Returns: - tuple[TP, FN, FP, TN]: Indices as pd.Series for true positives, - false negatives, false positives and true negatives (in this order). - - Raises: - ValueError: If sum of positive + negative preds doesn't add up to the total. 
- """ - if len(each_true) != len(each_pred): - raise ValueError(f"{len(each_true)=} != {len(each_pred)=}") - - each_true_arr, each_pred_arr = pd.Series(each_true), pd.Series(each_pred) - - if stability_threshold is None or np.isnan(stability_threshold): - raise ValueError("stability_threshold must be a real number") - actual_pos = each_true_arr <= (stability_threshold or 0) - actual_neg = each_true_arr > (stability_threshold or 0) - - model_pos = each_pred_arr <= (stability_threshold or 0) - model_neg = each_pred_arr > (stability_threshold or 0) - - if fillna: - nan_mask = np.isnan(each_pred) - # for in both the model's stable and unstable preds, fill NaNs as unstable - model_pos[nan_mask] = False - model_neg[nan_mask] = True - - n_pos, n_neg, total = model_pos.sum(), model_neg.sum(), len(each_pred) - if n_pos + n_neg != total: - raise ValueError( - f"after filling NaNs, the sum of positive ({n_pos}) and negative " - f"({n_neg}) predictions should add up to {total=}" - ) - - true_pos = actual_pos & model_pos - false_neg = actual_pos & model_neg - false_pos = actual_neg & model_pos - true_neg = actual_neg & model_neg - - return true_pos, false_neg, false_pos, true_neg - - -def stable_metrics( - each_true: Sequence[float] | pd.Series | np.ndarray, - each_pred: Sequence[float] | pd.Series | np.ndarray, - *, - stability_threshold: float = 0.0, - fillna: bool = True, - prevalence: float | None = None, -) -> dict[str, float]: - """Get a dictionary of stability prediction metrics. Mostly binary classification - metrics, but also MAE, RMSE and R2. - - Args: - each_true (Sequence[float] | pd.Series): true energy above convex hull - each_pred (Sequence[float] | pd.Series): predicted energy above convex hull - stability_threshold (float): Where to place stability threshold relative to - convex hull in eV/atom, usually 0 or 0.1 eV. Default = 0.0. - fillna (bool): Whether to fill NaNs as the model predicting unstable. Defaults - to True. 
- prevalence (float, optional): Prevalence of stable materials in the dataset. - If None, calculated from the input data. Defaults to None. - - Note: Should give equivalent classification metrics to - sklearn.metrics.classification_report( - each_true > stability_threshold, - each_pred > stability_threshold, - output_dict=True, - ) - when using the same stability_threshold. - - Returns: - dict[str, float]: dictionary of classification metrics with keys DAF, Precision, - Recall, Accuracy, F1, TPR, FPR, TNR, FNR, MAE, RMSE, R2. - - Raises: - ValueError: If FPR + TNR don't add up to 1. - ValueError: If TPR + FNR don't add up to 1. - """ - n_true_pos, n_false_neg, n_false_pos, n_true_neg = map( - sum, - classify_stable( - each_true, each_pred, stability_threshold=stability_threshold, fillna=fillna - ), - ) - - n_total_pos = n_true_pos + n_false_neg - n_total_neg = n_true_neg + n_false_pos - # prevalence: dummy discovery rate of stable crystals by selecting randomly from - # all materials - if prevalence is None: - prevalence = ( - n_total_pos / (n_total_pos + n_total_neg) - if (n_total_pos + n_total_neg) > 0 - else float("nan") - ) - # Calculate ratios with guards against division by zero - precision = ( - n_true_pos / (n_true_pos + n_false_pos) - if (n_true_pos + n_false_pos) > 0 - else float("nan") - ) - recall = n_true_pos / n_total_pos if n_total_pos > 0 else float("nan") - - TPR = recall - FPR = n_false_pos / n_total_neg if n_total_neg > 0 else float("nan") - TNR = n_true_neg / n_total_neg if n_total_neg > 0 else float("nan") - FNR = n_false_neg / n_total_pos if n_total_pos > 0 else float("nan") - - # sanity check: false positives + true negatives = all negatives - if FPR > 0 and TNR > 0 and FPR + TNR != 1: - # Floating point tolerance - if abs(FPR + TNR - 1) > 1e-6: - raise ValueError(f"{FPR=} {TNR=} don't add up to 1") - - # sanity check: true positives + false negatives = all positives - if TPR > 0 and FNR > 0 and TPR + FNR != 1: - # Floating point tolerance - 
if abs(TPR + FNR - 1) > 1e-6: - raise ValueError(f"{TPR=} {FNR=} don't add up to 1") - - # Drop NaNs to calculate regression metrics - is_nan = np.isnan(each_true) | np.isnan(each_pred) - each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan] - - if precision + recall == 0: # Calculate F1 score, handling division by zero - f1_score = float("nan") - else: - f1_score = 2 * (precision * recall) / (precision + recall) - - return dict( - F1=f1_score, - DAF=precision / prevalence if prevalence > 0 else float("nan"), - Precision=precision, - Recall=recall, - Accuracy=( - (n_true_pos + n_true_neg) / (n_total_pos + n_total_neg) - if (n_total_pos + n_total_neg > 0) - else float("nan") - ), - TPR=TPR, - FPR=FPR, - TNR=TNR, - FNR=FNR, - TP=n_true_pos, - FP=n_false_pos, - TN=n_true_neg, - FN=n_false_neg, - MAE=np.abs(each_true - each_pred).mean(), - RMSE=((each_true - each_pred) ** 2).mean() ** 0.5, - R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"), - ) From 97d7291e0b78cc3341295a9b0d174012433c9d1d Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 14:05:25 -0700 Subject: [PATCH 14/23] fix type errors --- .../examples/matbench_equiformerv2.py | 4 +- .../benchmarks/matbench_discovery/tasks.py | 63 +++++++++++-------- garden_ai/benchmarks/utils/meta_metrics.py | 9 ++- 3 files changed, 43 insertions(+), 33 deletions(-) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py index e877f230..afd3b2d5 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py @@ -8,7 +8,9 @@ def create_equiformerv2_model(device): - from fairchem.core.calculate.ase_calculator import Calculator + from fairchem.core.calculate.ase_calculator import ( + Calculator, # type: ignore[import-not-found] + ) # Use pre-trained checkpoint - 
will auto-download from HuggingFace return Calculator( diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index c4d5e0e3..25c045a0 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -25,8 +25,8 @@ import groundhog_hpc as hog import numpy as np -import pandas as pd -from sklearn.metrics import r2_score +import pandas as pd # type: ignore[import-untyped] +from sklearn.metrics import r2_score # type: ignore[import-untyped] class DatasetSize(str, Enum): @@ -111,7 +111,7 @@ def convert_numpy_types(obj): def _inject_meta_metrics(source: str) -> None: """Inject meta_metrics functions from source code for remote execution.""" global get_hardware_info, extract_model_info, calculate_run_metadata - namespace = {} + namespace: Dict[str, Any] = {} exec(source, namespace) get_hardware_info = namespace["get_hardware_info"] extract_model_info = namespace["extract_model_info"] @@ -287,7 +287,7 @@ def _process_batch_common( func_name = func_name_match.group(1) # Execute the source to define the function - local_namespace = {} + local_namespace: Dict[str, Any] = {} exec(model_factory_source, local_namespace) model_factory = local_namespace[func_name] @@ -327,7 +327,7 @@ def get_material_ids_for_subset( if subset_type == "full": return None - from matbench_discovery.data import DataFiles + from matbench_discovery.data import DataFiles # type: ignore[import-untyped] df = pd.read_csv(DataFiles.wbm_summary.path) @@ -353,7 +353,7 @@ def _load_dataset_common( config: Dict[str, Any], zip_path: str, read_format: str = "extxyz", - read_index: str | slice = None, + read_index: Optional[str | slice] = None, ) -> List[Any]: from io import TextIOWrapper from zipfile import ZipFile @@ -395,7 +395,9 @@ def _load_dataset_common( elif not isinstance(atoms_list, list): structures.append((filename, atoms_list)) else: - structures.append((filename, read(text_stream, 
format=read_format))) + structures.append( + (filename, read(text_stream, format=read_format)) # type: ignore[arg-type] + ) return structures @@ -477,19 +479,19 @@ def compute(model, atoms): def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Any]: - from matbench_discovery.data import DataFiles + from matbench_discovery.data import DataFiles # type: ignore[import-untyped] return _load_dataset_common(config, DataFiles.wbm_initial_atoms.path) def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Any]: - from matbench_discovery.data import DataFiles + from matbench_discovery.data import DataFiles # type: ignore[import-untyped] return _load_dataset_common(config, DataFiles.wbm_relaxed_atoms.path) def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]: - from matbench_discovery.data import DataFiles + from matbench_discovery.data import DataFiles # type: ignore[import-untyped] return _load_dataset_common(config, DataFiles.mp_trj_extxyz.path, read_index=":") @@ -497,7 +499,7 @@ def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]: def calculate_metrics_energy( results: Dict[str, Any], config: Dict[str, Any] ) -> Dict[str, Any]: - from matbench_discovery.data import df_wbm + from matbench_discovery.data import df_wbm # type: ignore[import-untyped] if len(results) == 0: return {"error": "No results to evaluate"} @@ -542,9 +544,9 @@ def calculate_metrics_forces( from zipfile import ZipFile from ase.io import read - from matbench_discovery.data import DataFiles + from matbench_discovery.data import DataFiles # type: ignore[import-untyped] - metrics = { + metrics: Dict[str, List[float]] = { "energy_mae": [], "energy_rmse": [], "force_mae": [], @@ -552,9 +554,12 @@ def calculate_metrics_forces( "stress_mae": [], "stress_rmse": [], } - all_e_pred, all_e_true = [], [] - all_f_pred, all_f_true = [], [] - all_s_pred, all_s_true = [], [] + all_e_pred: List[float] = [] + all_e_true: List[float] = [] + all_f_pred: List[float] = [] + all_f_true: 
List[float] = [] + all_s_pred: List[float] = [] + all_s_true: List[float] = [] zip_path = DataFiles.mp_trj_extxyz.path @@ -569,8 +574,8 @@ def calculate_metrics_forces( gt_atoms = atoms_list[-1] e_pred = res["energy"] - e_true = gt_atoms.get_potential_energy() - n_atoms = len(gt_atoms) + e_true = gt_atoms.get_potential_energy() # type: ignore[union-attr] + n_atoms = len(gt_atoms) # type: ignore[arg-type] energy_error = abs(e_pred - e_true) / n_atoms metrics["energy_mae"].append(energy_error) metrics["energy_rmse"].append(energy_error**2) @@ -578,7 +583,7 @@ def calculate_metrics_forces( all_e_true.append(e_true / n_atoms) f_pred = np.array(res["forces"]) - f_true = gt_atoms.get_forces() + f_true = gt_atoms.get_forces() # type: ignore[union-attr] force_error = np.abs(f_pred - f_true) metrics["force_mae"].append(force_error.mean()) metrics["force_rmse"].append((force_error**2).mean()) @@ -586,7 +591,7 @@ def calculate_metrics_forces( all_f_true.extend(f_true.flatten()) s_pred = np.array(res["stress"]) - s_true = gt_atoms.get_stress() + s_true = gt_atoms.get_stress() # type: ignore[union-attr] stress_error = np.abs(s_pred - s_true) metrics["stress_mae"].append(stress_error.mean()) metrics["stress_rmse"].append((stress_error**2).mean()) @@ -644,6 +649,8 @@ def run_benchmark_hog( _inject_meta_metrics(meta_metrics_source) # Collect hardware and model info + assert get_hardware_info is not None, "meta_metrics not injected" + assert extract_model_info is not None, "meta_metrics not injected" hardware_info = get_hardware_info() model_info = extract_model_info(model_packages) logger.info(f"Hardware: {hardware_info}") @@ -739,6 +746,7 @@ def run_benchmark_hog( traceback.print_exc() metrics = {"error": f"Metrics calculation failed: {e}"} + assert calculate_run_metadata is not None, "meta_metrics not injected" run_metadata = calculate_run_metadata( hardware_info=hardware_info, model_info=model_info, @@ -771,7 +779,7 @@ def run_benchmark_hog( use_multi_gpu = 
config.get("use_multi_gpu", True) and num_gpus > 1 # Use sched_getaffinity to get cores available to this job, not total cores on node try: - total_cores = len(os.sched_getaffinity(0)) + total_cores = len(os.sched_getaffinity(0)) # type: ignore[attr-defined] except AttributeError: # Fallback for systems without sched_getaffinity (e.g., macOS) total_cores = os.cpu_count() or 1 @@ -872,6 +880,7 @@ def run_benchmark_hog( metrics = {"error": f"Metrics calculation failed: {e}"} # Calculate run metadata + assert calculate_run_metadata is not None, "meta_metrics not injected" run_metadata = calculate_run_metadata( hardware_info=hardware_info, model_info=model_info, @@ -1067,8 +1076,8 @@ def _prepare_runner_config( else: seed = num_structures.seed elif hasattr(num_structures, "subset"): # DatasetConfig - subset = num_structures.subset.value - seed = num_structures.seed + subset = num_structures.subset.value # type: ignore[union-attr] + seed = num_structures.seed # type: ignore[union-attr] elif isinstance(num_structures, int): subset = "full" # We handle int as limit in load_dataset @@ -1111,7 +1120,7 @@ def _generate_checkpoint_name( def _run_task( model_factory: Any, model_packages: str | List[str], - num_structures: int | "DatasetSize" | "DatasetConfig", + num_structures: int | str | DatasetSize | DatasetConfig, checkpoint_name: str | None, checkpoint_path: str | None, process_fn: Any, @@ -1203,7 +1212,7 @@ def _run_task( def IS2RE( model_factory: Any, model_packages: str | List[str], - num_structures: int | "DatasetSize" | "DatasetConfig" = "full", + num_structures: int | str | DatasetSize | DatasetConfig = "full", checkpoint_name: str | None = None, checkpoint_path: str | None = None, sys_path: List[str] | None = None, @@ -1227,7 +1236,7 @@ def IS2RE( def RS2RE( model_factory: Any, model_packages: str | List[str], - num_structures: int | "DatasetSize" | "DatasetConfig" = "full", + num_structures: int | str | DatasetSize | DatasetConfig = "full", checkpoint_name: str | 
None = None, checkpoint_path: str | None = None, sys_path: List[str] | None = None, @@ -1251,7 +1260,7 @@ def RS2RE( def S2EFS( model_factory: Any, model_packages: str | List[str], - num_structures: int | "DatasetSize" | "DatasetConfig" = "full", + num_structures: int | str | DatasetSize | DatasetConfig = "full", checkpoint_name: str | None = None, checkpoint_path: str | None = None, sys_path: List[str] | None = None, diff --git a/garden_ai/benchmarks/utils/meta_metrics.py b/garden_ai/benchmarks/utils/meta_metrics.py index 481a2b35..951d549f 100644 --- a/garden_ai/benchmarks/utils/meta_metrics.py +++ b/garden_ai/benchmarks/utils/meta_metrics.py @@ -51,11 +51,10 @@ def get_hardware_info() -> Dict[str, Any]: if torch.cuda.is_available(): info["device_type"] = "cuda" - info["num_gpus"] = torch.cuda.device_count() - info["gpu_names"] = [ - torch.cuda.get_device_name(i) for i in range(info["num_gpus"]) - ] - if info["num_gpus"] > 0: + num_gpus = torch.cuda.device_count() + info["num_gpus"] = num_gpus + info["gpu_names"] = [torch.cuda.get_device_name(i) for i in range(num_gpus)] + if num_gpus > 0: props = torch.cuda.get_device_properties(0) info["gpu_memory_gb"] = round(props.total_memory / (1024**3), 1) elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): From 0c2997bca3b3d64c383b81954ea6c21527e4a36a Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 14:08:32 -0700 Subject: [PATCH 15/23] appease mypy --- .../examples/matbench_equiformerv2.py | 4 +--- garden_ai/benchmarks/matbench_discovery/tasks.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py index afd3b2d5..eb4d64ea 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py @@ -8,9 +8,7 @@ def 
create_equiformerv2_model(device): - from fairchem.core.calculate.ase_calculator import ( - Calculator, # type: ignore[import-not-found] - ) + from fairchem.core.calculate.ase_calculator import Calculator # type: ignore # Use pre-trained checkpoint - will auto-download from HuggingFace return Calculator( diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index 25c045a0..45ec9701 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -25,8 +25,8 @@ import groundhog_hpc as hog import numpy as np -import pandas as pd # type: ignore[import-untyped] -from sklearn.metrics import r2_score # type: ignore[import-untyped] +import pandas as pd # type: ignore +from sklearn.metrics import r2_score # type: ignore class DatasetSize(str, Enum): @@ -327,7 +327,7 @@ def get_material_ids_for_subset( if subset_type == "full": return None - from matbench_discovery.data import DataFiles # type: ignore[import-untyped] + from matbench_discovery.data import DataFiles # type: ignore df = pd.read_csv(DataFiles.wbm_summary.path) @@ -479,19 +479,19 @@ def compute(model, atoms): def load_dataset_wbm_initial(config: Dict[str, Any]) -> List[Any]: - from matbench_discovery.data import DataFiles # type: ignore[import-untyped] + from matbench_discovery.data import DataFiles # type: ignore return _load_dataset_common(config, DataFiles.wbm_initial_atoms.path) def load_dataset_wbm_relaxed(config: Dict[str, Any]) -> List[Any]: - from matbench_discovery.data import DataFiles # type: ignore[import-untyped] + from matbench_discovery.data import DataFiles # type: ignore return _load_dataset_common(config, DataFiles.wbm_relaxed_atoms.path) def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]: - from matbench_discovery.data import DataFiles # type: ignore[import-untyped] + from matbench_discovery.data import DataFiles # type: ignore return _load_dataset_common(config, 
DataFiles.mp_trj_extxyz.path, read_index=":") @@ -499,7 +499,7 @@ def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]: def calculate_metrics_energy( results: Dict[str, Any], config: Dict[str, Any] ) -> Dict[str, Any]: - from matbench_discovery.data import df_wbm # type: ignore[import-untyped] + from matbench_discovery.data import df_wbm # type: ignore if len(results) == 0: return {"error": "No results to evaluate"} @@ -544,7 +544,7 @@ def calculate_metrics_forces( from zipfile import ZipFile from ase.io import read - from matbench_discovery.data import DataFiles # type: ignore[import-untyped] + from matbench_discovery.data import DataFiles # type: ignore metrics: Dict[str, List[float]] = { "energy_mae": [], From 2f96ee4d1ab6ce3d1cee57dfc25d1dc5c98adee3 Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 14:45:18 -0700 Subject: [PATCH 16/23] implement publish_benchmark_result helper --- garden_ai/backend_client.py | 10 +++ garden_ai/benchmarks/__init__.py | 90 ++++++++++++++++--- .../benchmarks/matbench_discovery/tasks.py | 47 +++++++--- 3 files changed, 122 insertions(+), 25 deletions(-) diff --git a/garden_ai/backend_client.py b/garden_ai/backend_client.py index 63d5e7a0..9574fa1d 100644 --- a/garden_ai/backend_client.py +++ b/garden_ai/backend_client.py @@ -6,6 +6,10 @@ from garden_ai.constants import GardenConstants from garden_ai.gardens import Garden +from garden_ai.schemas.benchmark import ( + BenchmarkResultCreateRequest, + BenchmarkResultResponse, +) from garden_ai.schemas.garden import GardenMetadata from garden_ai.schemas.hpc import HpcInvocationCreateRequest from garden_ai.schemas.modal import ( @@ -157,3 +161,9 @@ def search_gardens(self, payload: dict) -> dict: def create_hpc_invocation(self, payload: HpcInvocationCreateRequest) -> dict: response = self._post("/hpc/invocations", payload.model_dump(mode="json")) return response + + def publish_benchmark_result( + self, payload: BenchmarkResultCreateRequest + ) -> 
BenchmarkResultResponse: + response = self._post("/benchmarks", payload.model_dump(mode="json")) + return BenchmarkResultResponse(**response) diff --git a/garden_ai/benchmarks/__init__.py b/garden_ai/benchmarks/__init__.py index 5d40ae5a..5dc7dd7a 100644 --- a/garden_ai/benchmarks/__init__.py +++ b/garden_ai/benchmarks/__init__.py @@ -7,6 +7,11 @@ - MatbenchDiscovery: Materials discovery benchmark suite """ +from typing import Any, Dict, Optional + +from garden_ai.client import GardenClient +from garden_ai.schemas.benchmark import BenchmarkResultCreateRequest + from .matbench_discovery.enums import DatasetSize, MatbenchTask from .matbench_discovery.tasks import MatbenchDiscovery @@ -14,22 +19,85 @@ "MatbenchDiscovery", "MatbenchTask", "DatasetSize", + "publish_benchmark_result", ] -def publish_benchmark_result(benchmark, model, results): - """Publish benchmark results to Garden AI backend. +def publish_benchmark_result( + result: Dict[str, Any], + benchmark_name: Optional[str] = None, + task_name: Optional[str] = None, +) -> Dict[str, Any]: + """Publish benchmark results to the Garden AI backend. - This is a placeholder for future functionality to store benchmark - results alongside published models. + This function takes the output from a benchmark task (e.g., MatbenchDiscovery.IS2RE.remote()) + and publishes it to the Garden backend for tracking and leaderboard purposes. Args: - benchmark: Benchmark adapter instance - model: Model that was benchmarked - results: Dictionary of benchmark metrics + result: The output dictionary from a benchmark task. Should contain: + - 'metrics': Dictionary of benchmark metrics (F1, DAF, MAE, etc.) 
+ - 'run_metadata': Optional run metadata (hardware, timing, cost) + - '_benchmark_info': Auto-injected benchmark/task info (if from wrapped method) + benchmark_name: Override for benchmark name (defaults to auto-detected from result) + task_name: Override for task name (defaults to auto-detected from result) + + Returns: + Dictionary containing the response from the backend, including the result ID. + + Raises: + ValueError: If benchmark_name or task_name cannot be determined. + requests.HTTPError: If the backend request fails. + + Example: + ```python + from garden_ai.benchmarks import MatbenchDiscovery, publish_benchmark_result + + # Run a benchmark + output = MatbenchDiscovery.IS2RE.remote( + endpoint="your-endpoint-id", + model_factory=create_model, + model_packages="mace-torch", + ) + + # Publish the results + response = publish_benchmark_result(output) + print(f"Published with ID: {response['id']}") + ``` """ - # TODO: Implement when backend API is ready - raise NotImplementedError( - "Publishing benchmark results is not yet implemented. " - "For now, save results locally or to your own storage." + # Extract benchmark info from result or use provided overrides + benchmark_info = result.get("_benchmark_info", {}) + + final_benchmark_name = benchmark_name or benchmark_info.get("benchmark_name") + final_task_name = task_name or benchmark_info.get("task_name") + + if not final_benchmark_name: + raise ValueError( + "benchmark_name is required. Either pass it explicitly or use a result " + "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())." + ) + + if not final_task_name: + raise ValueError( + "task_name is required. Either pass it explicitly or use a result " + "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())." 
+ ) + + # Extract metrics and run_metadata + metrics = result.get("metrics", {}) + run_metadata = result.get("run_metadata") + + if not metrics: + raise ValueError("Result must contain 'metrics' dictionary.") + + # Create the request payload + payload = BenchmarkResultCreateRequest( + benchmark_name=final_benchmark_name, + benchmark_task_name=final_task_name, + metrics=metrics, + run_metadata=run_metadata, ) + + # Get authenticated client and publish + client = GardenClient() + response = client.backend_client.publish_benchmark_result(payload) + return response.model_dump() diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index 45ec9701..28f09fbd 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -899,9 +899,17 @@ def run_benchmark_hog( class BenchmarkMethod: """Wrapper around groundhog Method that handles source extraction for remote execution.""" - def __init__(self, hog_method): - """Initialize wrapper with the underlying groundhog Method.""" + BENCHMARK_NAME = "matbench_discovery" + + def __init__(self, hog_method, task_name: str): + """Initialize wrapper with the underlying groundhog Method. + + Args: + hog_method: The underlying groundhog method to wrap. + task_name: Name of the benchmark task (e.g., 'IS2RE', 'S2EFS'). 
+ """ self._hog_method = hog_method + self._task_name = task_name def _extract_sources(self, kwargs): """Extract source code from model_factory and meta_metrics for remote execution.""" @@ -1018,17 +1026,28 @@ def _print_checkpoint_info(self, kwargs, is_remote: bool): print(f' checkpoint_path="{identifier}"') print("=" * 80) + def _add_benchmark_info(self, result: Dict[str, Any]) -> Dict[str, Any]: + """Add benchmark metadata to the result for publishing.""" + if isinstance(result, dict): + result["_benchmark_info"] = { + "benchmark_name": self.BENCHMARK_NAME, + "task_name": self._task_name, + } + return result + def remote(self, *args, **kwargs): """Execute remotely with automatic source extraction.""" kwargs = self._extract_sources(kwargs) self._print_checkpoint_info(kwargs, is_remote=True) - return self._hog_method.remote(*args, **kwargs) + result = self._hog_method.remote(*args, **kwargs) + return self._add_benchmark_info(result) def local(self, *args, **kwargs): """Execute locally with automatic source extraction.""" kwargs = self._extract_sources(kwargs) self._print_checkpoint_info(kwargs, is_remote=False) - return self._hog_method.local(*args, **kwargs) + result = self._hog_method.local(*args, **kwargs) + return self._add_benchmark_info(result) def submit(self, *args, **kwargs): """Submit for async execution with automatic source extraction.""" @@ -1349,15 +1368,15 @@ def create_mace_model(device): _run_task = _MatbenchDiscoveryBase._run_task # Main benchmark tasks - wrapped for automatic model_factory source extraction - IS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.IS2RE) - RS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RS2RE) - S2EFS = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFS) + IS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.IS2RE, "IS2RE") + RS2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RS2RE, "RS2RE") + S2EFS = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFS, "S2EFS") # Aliases - S2EF = BenchmarkMethod(_MatbenchDiscoveryBase.S2EF) - S2EFSM = 
BenchmarkMethod(_MatbenchDiscoveryBase.S2EFSM) - IS2E = BenchmarkMethod(_MatbenchDiscoveryBase.IS2E) - S2E = BenchmarkMethod(_MatbenchDiscoveryBase.S2E) - S2RE = BenchmarkMethod(_MatbenchDiscoveryBase.S2RE) - RP2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RP2RE) - IP2E = BenchmarkMethod(_MatbenchDiscoveryBase.IP2E) + S2EF = BenchmarkMethod(_MatbenchDiscoveryBase.S2EF, "S2EF") + S2EFSM = BenchmarkMethod(_MatbenchDiscoveryBase.S2EFSM, "S2EFSM") + IS2E = BenchmarkMethod(_MatbenchDiscoveryBase.IS2E, "IS2E") + S2E = BenchmarkMethod(_MatbenchDiscoveryBase.S2E, "S2E") + S2RE = BenchmarkMethod(_MatbenchDiscoveryBase.S2RE, "S2RE") + RP2RE = BenchmarkMethod(_MatbenchDiscoveryBase.RP2RE, "RP2RE") + IP2E = BenchmarkMethod(_MatbenchDiscoveryBase.IP2E, "IP2E") From 723bb580778e410aaf73857ffd4100536afa6406 Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 14:48:00 -0700 Subject: [PATCH 17/23] cleanup comments --- garden_ai/benchmarks/matbench_discovery/enums.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/garden_ai/benchmarks/matbench_discovery/enums.py b/garden_ai/benchmarks/matbench_discovery/enums.py index 5c34cb6b..fed3b514 100644 --- a/garden_ai/benchmarks/matbench_discovery/enums.py +++ b/garden_ai/benchmarks/matbench_discovery/enums.py @@ -4,13 +4,7 @@ class MatbenchTask(Enum): - """Available Matbench Discovery benchmark tasks. - - Currently only IS2RE is implemented for the MVP. 
- Future tasks could include: - - RS2RE: Relaxed Structure to Relaxed Energy - - S2EFS: Structure to Energy, Forces, and Stress - """ + """Available Matbench Discovery benchmark tasks.""" IS2RE = "IS2RE" # Initial Structure to Relaxed Energy RS2RE = "RS2RE" # Relaxed Structure to Relaxed Energy From 83baabac33a88e065d03b293ac89cd15e22a1b86 Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 14:55:06 -0700 Subject: [PATCH 18/23] add py.typed to appease mypy as per PEP 561 --- garden_ai/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 garden_ai/py.typed diff --git a/garden_ai/py.typed b/garden_ai/py.typed new file mode 100644 index 00000000..e69de29b From ac03466841165f5fbdf23c352fcae6111ee6b369 Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 10 Dec 2025 14:59:17 -0700 Subject: [PATCH 19/23] add missing schema file :facepalm --- garden_ai/schemas/benchmark.py | 36 ++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 garden_ai/schemas/benchmark.py diff --git a/garden_ai/schemas/benchmark.py b/garden_ai/schemas/benchmark.py new file mode 100644 index 00000000..f607c8a8 --- /dev/null +++ b/garden_ai/schemas/benchmark.py @@ -0,0 +1,36 @@ +"""Benchmark-related schemas for API requests/responses.""" + +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field + + +class BenchmarkResultCreateRequest(BaseModel): + """Request schema for publishing benchmark results to the backend.""" + + benchmark_name: str = Field( + ..., + description="Name of the benchmark suite (e.g., 'matbench_discovery')", + ) + benchmark_task_name: str = Field( + ..., + description="Name of the specific task within the benchmark (e.g., 'IS2RE', 'S2EFS')", + ) + metrics: Dict[str, Any] = Field( + ..., + description="Dictionary of benchmark metrics (F1, DAF, MAE, etc.)", + ) + run_metadata: Optional[Dict[str, Any]] = Field( + default=None, + description="Optional run metadata (hardware info, timing, cost 
estimates)", + ) + + +class BenchmarkResultResponse(BaseModel): + """Response schema from the benchmark result creation endpoint.""" + + id: str = Field(..., description="Unique identifier for the benchmark result") + benchmark_name: str + benchmark_task_name: str + metrics: Dict[str, Any] + run_metadata: Optional[Dict[str, Any]] = None From 964c4f14124622614472013693190fe089dbd537 Mon Sep 17 00:00:00 2001 From: hholb Date: Thu, 11 Dec 2025 10:21:24 -0700 Subject: [PATCH 20/23] fix checkpoint resume bug, clean up examples --- .../examples/local_execution.py | 33 ++++------ .../examples/matbench_equiformerv2.py | 43 ++++-------- .../examples/matbench_mace_multi_gpu.py | 55 ++++++---------- .../examples/matbench_mattersim.py | 42 ++++-------- .../examples/matbench_sevennet.py | 42 ++++-------- .../benchmarks/matbench_discovery/tasks.py | 65 +++++++++++++++---- 6 files changed, 121 insertions(+), 159 deletions(-) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py b/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py index 6414f5cf..1f482904 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/local_execution.py @@ -10,22 +10,17 @@ def create_mattersim_model(device): return MatterSimCalculator(device=device) -def main(): - print("Running MatterSim benchmark locally...") - - # Run IS2RE task locally - # Note: Requires a GPU or MPS if using MatterSim, or CPU if specified/supported - output = MatbenchDiscovery.IS2RE.local( - model_factory=create_mattersim_model, - model_packages="mattersim", - num_structures="random_100", - ) - - if "error" in output.get("metrics", {}): - print(f"Error: {output['metrics']['error']}") - else: - print("Benchmark Results:", output.get("metrics")) - - -if __name__ == "__main__": - main() +print("Running MatterSim benchmark locally...") + +# Run IS2RE task locally +# Note: Requires a GPU or MPS if using MatterSim, or 
CPU if specified/supported +output = MatbenchDiscovery.IS2RE.local( + model_factory=create_mattersim_model, + model_packages="mattersim", + num_structures="random_100", +) + +if "error" in output.get("metrics", {}): + print(f"Error: {output['metrics']['error']}") +else: + print("Benchmark Results:", output.get("metrics")) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py index eb4d64ea..b32cfed0 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_equiformerv2.py @@ -3,9 +3,6 @@ from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery -# Globus Compute endpoint -ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" - def create_equiformerv2_model(device): from fairchem.core.calculate.ase_calculator import Calculator # type: ignore @@ -16,32 +13,16 @@ def create_equiformerv2_model(device): ) -def main(): - print(f"Running EquiformerV2 benchmark on endpoint {ENDPOINT_ID}...") - - # Run S2EFS task (structure to energy/forces/stress) - output = MatbenchDiscovery.S2EFS.remote( - endpoint=ENDPOINT_ID, - user_endpoint_config={ - "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n", - "walltime": 7200, - "qos": "gpu", - "partition": "gpu-debug", - "account": "cis250461-gpu", - "cores_per_node": 16, - "mem_per_node": 32, - "requirements": "", - }, - model_factory=create_equiformerv2_model, - model_packages="fairchem-core", - num_structures="random_100", - ) - - if "error" in output.get("metrics", {}): - print(f"Error: {output['metrics']['error']}") - else: - print("Benchmark Results:", output.get("metrics")) - +# Run S2EFS task (structure to energy/forces/stress) +output = MatbenchDiscovery.S2EFS.remote( + endpoint="anvil", + account="your-account-here", + model_factory=create_equiformerv2_model, + model_packages="fairchem-core", 
+ num_structures="random_10k", +) -if __name__ == "__main__": - main() +if "error" in output.get("metrics", {}): + print(f"Error: {output['metrics']['error']}") +else: + print("Benchmark Results:", output.get("metrics")) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py index e0fb0003..90e7134b 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mace_multi_gpu.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 """Matbench Discovery Benchmark - MACE Multi-GPU Example""" -from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery +from rich import print -ANVIL = "5aafb4c1-27b2-40d8-a038-a0277611868f" +from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery def create_mace_model(device): @@ -12,35 +12,22 @@ def create_mace_model(device): return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64") -def main(): - print(f"Running MACE benchmark on endpoint {ANVIL}...") - - results = MatbenchDiscovery.IS2RE.remote( - endpoint=ANVIL, - user_endpoint_config={ - "scheduler_options": "#SBATCH --gpus-per-node=4\n", - "walltime": "05:00:00", - "qos": "gpu", - "partition": "gpu", - "account": "cis250461-gpu", - "cores_per_node": 16, - "requirements": "", # 'requirements' is required for Anvil endpoint - }, - model_factory=create_mace_model, - model_packages=[ - "mace-torch", - "cuequivariance", - "cuequivariance-torch", - "cuequivariance-ops-torch-cu12", - ], - checkpoint_path="~/.garden/benchmarks/matbench_mace-torch_cuequivariance_full_20251208_115719_ed2e47af.json", - ) - - if "error" in results.get("metrics", {}): - print(f"Error: {results['metrics']['error']}") - else: - print("Benchmark Results:", results.get("metrics")) - - -if __name__ == "__main__": - main() +print("Running MACE benchmark on endpoint anvil...") + +results = 
MatbenchDiscovery.IS2RE.remote( + endpoint="anvil", + account="cis250461-gpu", + model_factory=create_mace_model, + model_packages=[ + "mace-torch", + "cuequivariance", + "cuequivariance-torch", + "cuequivariance-ops-torch-cu12", + ], + num_structures="random_100", +) + +if "error" in results.get("metrics", {}): + print(f"Error: {results['metrics']['error']}") +else: + print("Benchmark Results:", results) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py index 8a7636ba..f9a5c4c8 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_mattersim.py @@ -3,9 +3,6 @@ from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery -# Globus Compute endpoint -ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" - def create_mattersim_model(device): from mattersim.forcefield import MatterSimCalculator @@ -13,32 +10,15 @@ def create_mattersim_model(device): return MatterSimCalculator(device=device) -def main(): - print(f"Running MatterSim benchmark on endpoint {ENDPOINT_ID}...") - - # Run IS2RE task - output = MatbenchDiscovery.IS2RE.remote( - endpoint=ENDPOINT_ID, - user_endpoint_config={ - "scheduler_options": "#SBATCH --gpus-per-node=2\n#SBATCH --cpus-per-task=8\n", - "walltime": 7200, - "qos": "gpu", - "partition": "gpu-debug", - "account": "cis250461-gpu", - "cores_per_node": 16, - "mem_per_node": 32, - "requirements": "", - }, - model_factory=create_mattersim_model, - model_packages="mattersim", - num_structures="random_100", - ) - - if "error" in output.get("metrics", {}): - print(f"Error: {output['metrics']['error']}") - else: - print("Benchmark Results:", output.get("metrics")) - +output = MatbenchDiscovery.IS2RE.remote( + endpoint="anvil", + account="your-account-here", + model_factory=create_mattersim_model, + model_packages="mattersim", + 
num_structures="random_100", +) -if __name__ == "__main__": - main() +if "error" in output.get("metrics", {}): + print(f"Error: {output['metrics']['error']}") +else: + print("Benchmark Results:", output) diff --git a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py index 411c64e1..da69d7ab 100644 --- a/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py +++ b/garden_ai/benchmarks/matbench_discovery/examples/matbench_sevennet.py @@ -3,9 +3,6 @@ from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery -# Globus Compute endpoint (replace with your endpoint UUID) -ENDPOINT_ID = "5aafb4c1-27b2-40d8-a038-a0277611868f" - def create_sevennet_model(device): from sevenn.calculator import SevenNetCalculator @@ -13,32 +10,15 @@ def create_sevennet_model(device): return SevenNetCalculator(model="7net-0", device=device) -def main(): - print(f"Running SevenNet benchmark on endpoint {ENDPOINT_ID}...") - - # Run IS2RE task - output = MatbenchDiscovery.IS2RE.remote( - endpoint=ENDPOINT_ID, - user_endpoint_config={ - "scheduler_options": "#SBATCH --gpus-per-node=2\n", - "walltime": 7200, - "qos": "gpu", - "partition": "gpu-debug", - "account": "cis250461-gpu", - "cores_per_node": 16, - "mem_per_node": 32, - "requirements": "", - }, - model_factory=create_sevennet_model, - model_packages="sevenn", - num_structures="random_100", - ) - - if "error" in output.get("metrics", {}): - print(f"Error: {output['metrics']['error']}") - else: - print("Benchmark Results:", output.get("metrics")) - +output = MatbenchDiscovery.IS2RE.remote( + endpoint="anvil", + account="your-account-here", + model_factory=create_sevennet_model, + model_packages="sevenn", + num_structures="random_100", +) -if __name__ == "__main__": - main() +if "error" in output.get("metrics", {}): + print(f"Error: {output['metrics']['error']}") +else: + print("Benchmark Results:", output) diff --git 
a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index 28f09fbd..b2710467 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -9,6 +9,15 @@ # "torch", # "matbench-discovery", # ] +# +# [tool.hog.anvil] +# endpoint = "5aafb4c1-27b2-40d8-a038-a0277611868f" +# account = "replace with your account" +# qos = "gpu" +# partition = "gpu" +# cores_per_node = 16 +# scheduler_options = "#SBATCH --gpus-per-node=4\n" +# requirements = "" # /// from __future__ import annotations @@ -706,13 +715,29 @@ def run_benchmark_hog( checkpoint_path = config.get("checkpoint_path") results = {} + prior_elapsed = 0.0 # Cumulative time from previous sessions if checkpoint_path and os.path.exists(checkpoint_path): logger.info(f"Loading checkpoint from {checkpoint_path}") try: with open(checkpoint_path) as f: - results = json.load(f) - logger.info(f"Found {len(results)} processed items in checkpoint") + checkpoint_data = json.load(f) + + # Handle new format with metadata vs old format (plain results dict) + if "_checkpoint_meta" in checkpoint_data: + results = checkpoint_data.get("results", {}) + meta = checkpoint_data["_checkpoint_meta"] + prior_elapsed = meta.get("elapsed_seconds", 0.0) + logger.info( + f"Found {len(results)} processed items in checkpoint " + f"(prior elapsed: {prior_elapsed:.1f}s)" + ) + else: + # Backward compatibility: old format is plain results dict + results = checkpoint_data + logger.info( + f"Found {len(results)} processed items in checkpoint (legacy format)" + ) except Exception as e: logger.warning(f"Failed to load checkpoint: {e}. 
Starting fresh.") @@ -746,14 +771,15 @@ def run_benchmark_hog( traceback.print_exc() metrics = {"error": f"Metrics calculation failed: {e}"} + # Use cumulative values from checkpoint metadata assert calculate_run_metadata is not None, "meta_metrics not injected" run_metadata = calculate_run_metadata( hardware_info=hardware_info, model_info=model_info, - total_elapsed=0, + total_elapsed=prior_elapsed, num_workers=0, num_structures_total=len(all_items), - num_structures_processed=0, + num_structures_processed=len(results), ) return {"metrics": metrics, "run_metadata": run_metadata} @@ -850,9 +876,20 @@ def run_benchmark_hog( if checkpoint_path: try: tmp_path = checkpoint_path + ".tmp" + # Calculate cumulative elapsed time for checkpoint + current_elapsed = time.time() - start_time + cumulative_elapsed = prior_elapsed + current_elapsed + + # Save checkpoint with metadata for resume + checkpoint_data = { + "results": convert_numpy_types(results), + "_checkpoint_meta": { + "elapsed_seconds": cumulative_elapsed, + "structures_processed": len(results), + }, + } with open(tmp_path, "w") as f: - clean_results = convert_numpy_types(results) - json.dump(clean_results, f, indent=2) + json.dump(checkpoint_data, f, indent=2) os.replace(tmp_path, checkpoint_path) logger.info(f"Checkpoint saved to {checkpoint_path}") except Exception as e: @@ -865,8 +902,12 @@ def run_benchmark_hog( elapsed = time.time() - chunk_start logger.info(f"Chunk {chunk_idx + 1} complete in {elapsed:.1f}s") - total_elapsed = time.time() - start_time - logger.info(f"Benchmark complete in {total_elapsed:.1f}s.") + session_elapsed = time.time() - start_time + total_elapsed = prior_elapsed + session_elapsed + logger.info( + f"Session complete in {session_elapsed:.1f}s. " + f"Total elapsed: {total_elapsed:.1f}s." 
+ ) logger.info("Calculating metrics...") try: @@ -879,7 +920,7 @@ def run_benchmark_hog( traceback.print_exc() metrics = {"error": f"Metrics calculation failed: {e}"} - # Calculate run metadata + # Calculate run metadata using cumulative values assert calculate_run_metadata is not None, "meta_metrics not injected" run_metadata = calculate_run_metadata( hardware_info=hardware_info, @@ -887,7 +928,7 @@ def run_benchmark_hog( total_elapsed=total_elapsed, num_workers=num_workers, num_structures_total=len(all_items), - num_structures_processed=len(items_to_process), + num_structures_processed=len(results), ) logger.info(f"Run metadata: {run_metadata}") @@ -1310,9 +1351,7 @@ def S2EFSM(*args, **kwargs): @hog.method() def IS2E(*args, **kwargs): - # Same as IS2RE but static? No, IS2E is Initial Structure to Energy (Static). - # IS2RE is Relaxation. - # IS2E logic: + # IS2E is Initial Structure to Energy (Static). return _MatbenchDiscoveryBase._run_task( *args, **kwargs, From cf7b6c59e28ad5ab6f9da4a522fef1fe2af4e9a3 Mon Sep 17 00:00:00 2001 From: hholb Date: Fri, 12 Dec 2025 09:44:32 -0700 Subject: [PATCH 21/23] fix missing import --- garden_ai/benchmarks/matbench_discovery/tasks.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index b2710467..99cbb8f7 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -30,12 +30,13 @@ import sys import time from enum import Enum -from typing import Any, Callable, Dict, List, Optional, Sequence +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence import groundhog_hpc as hog import numpy as np -import pandas as pd # type: ignore -from sklearn.metrics import r2_score # type: ignore + +if TYPE_CHECKING: + import pandas as pd class DatasetSize(str, Enum): @@ -148,6 +149,8 @@ def classify_stable( 
stability_threshold: float = 0.0, fillna: bool = True, ) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]: + import pandas as pd + if len(each_true) != len(each_pred): raise ValueError(f"{len(each_true)=} != {len(each_pred)=}") @@ -233,6 +236,8 @@ def stable_metrics( else: f1_score = 2 * (precision * recall) / (precision + recall) + from sklearn.metrics import r2_score # type: ignore + return dict( F1=f1_score, DAF=precision / prevalence if prevalence > 0 else float("nan"), @@ -336,6 +341,7 @@ def get_material_ids_for_subset( if subset_type == "full": return None + import pandas as pd from matbench_discovery.data import DataFiles # type: ignore df = pd.read_csv(DataFiles.wbm_summary.path) @@ -610,6 +616,8 @@ def calculate_metrics_forces( except Exception: pass + from sklearn.metrics import r2_score # type: ignore + result_metrics = {} if metrics["energy_mae"]: result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"])) From 5342023881c82bf39f7547b40a65cc81e112f0c3 Mon Sep 17 00:00:00 2001 From: hholb Date: Wed, 14 Jan 2026 13:40:50 -0700 Subject: [PATCH 22/23] fix a few bugs, update request schemas --- garden_ai/benchmarks/__init__.py | 33 ++- .../benchmarks/matbench_discovery/tasks.py | 238 +++++++++--------- garden_ai/client.py | 5 +- garden_ai/schemas/benchmark.py | 2 +- 4 files changed, 145 insertions(+), 133 deletions(-) diff --git a/garden_ai/benchmarks/__init__.py b/garden_ai/benchmarks/__init__.py index 5dc7dd7a..c3281dcd 100644 --- a/garden_ai/benchmarks/__init__.py +++ b/garden_ai/benchmarks/__init__.py @@ -25,6 +25,8 @@ def publish_benchmark_result( result: Dict[str, Any], + model_name: str, + garden_doi: Optional[str] = None, benchmark_name: Optional[str] = None, task_name: Optional[str] = None, ) -> Dict[str, Any]: @@ -38,6 +40,9 @@ def publish_benchmark_result( - 'metrics': Dictionary of benchmark metrics (F1, DAF, MAE, etc.) 
- 'run_metadata': Optional run metadata (hardware, timing, cost) - '_benchmark_info': Auto-injected benchmark/task info (if from wrapped method) + model_name: The specific name/variant of the model (e.g., "mace-mp-0-medium", "chgnet-v0.3.0"). + This is required to identify the model on the leaderboard. + garden_doi: Optional DOI for the Garden publication associated with this benchmark result. benchmark_name: Override for benchmark name (defaults to auto-detected from result) task_name: Override for task name (defaults to auto-detected from result) @@ -53,14 +58,10 @@ def publish_benchmark_result( from garden_ai.benchmarks import MatbenchDiscovery, publish_benchmark_result # Run a benchmark - output = MatbenchDiscovery.IS2RE.remote( - endpoint="your-endpoint-id", - model_factory=create_model, - model_packages="mace-torch", - ) + output = MatbenchDiscovery.IS2RE.remote(...) # Publish the results - response = publish_benchmark_result(output) + response = publish_benchmark_result(output, model_name="mace-medium", garden_doi="10.26311/example.doi") print(f"Published with ID: {response['id']}") ``` """ @@ -82,19 +83,25 @@ def publish_benchmark_result( "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())." 
) - # Extract metrics and run_metadata - metrics = result.get("metrics", {}) - run_metadata = result.get("run_metadata") + # Inject model name into run_metadata + if "run_metadata" not in result: + result["run_metadata"] = {} + if "model" not in result["run_metadata"]: + result["run_metadata"]["model"] = {} + + result["run_metadata"]["model"]["variant"] = model_name - if not metrics: - raise ValueError("Result must contain 'metrics' dictionary.") + # Inject garden_doi if provided + if garden_doi: + result["run_metadata"]["garden_doi"] = garden_doi # Create the request payload + # Note: We pass the modified result (containing metrics and metadata) as 'metrics' + # This assumes the backend handles the unified blob or we rely on the schema field description. payload = BenchmarkResultCreateRequest( benchmark_name=final_benchmark_name, benchmark_task_name=final_task_name, - metrics=metrics, - run_metadata=run_metadata, + metrics=result, ) # Get authenticated client and publish diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index 99cbb8f7..d2d0ee7a 100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -1,5 +1,5 @@ # /// script -# requires-python = ">=3.10" +# requires-python = "==3.12" # dependencies = [ # "groundhog-hpc", # "ase", @@ -8,16 +8,22 @@ # "scikit-learn", # "torch", # "matbench-discovery", +# "bibtexparser<1.4.3", # ] # # [tool.hog.anvil] # endpoint = "5aafb4c1-27b2-40d8-a038-a0277611868f" -# account = "replace with your account" # qos = "gpu" # partition = "gpu" # cores_per_node = 16 +# mem_per_mode = 32 # scheduler_options = "#SBATCH --gpus-per-node=4\n" # requirements = "" +# +# [tool.hog.sophia] +# endpoint = "8d07224c-ceaa-4b7f-946d-fae3f7423d5b" +# account = "Garden-Ai" +# queue = "by-gpu" # /// from __future__ import annotations @@ -27,6 +33,7 @@ import logging import multiprocessing import os +import random import sys 
import time from enum import Enum @@ -140,8 +147,20 @@ def _get_meta_metrics_source() -> str: _MODEL_CACHE = None -# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/tree/main/matbench_discovery/metrics -# Since they aren't setup to be easily imported, we just copy them here +# Helper functions from matbench-discovery/metrics/geo_opt.py and phonons.py + + +def calc_rmsd( + coords_true: np.ndarray, + coords_pred: np.ndarray, +) -> float: + """Calculate the Root Mean Square Deviation (RMSD) between two sets of coordinates. + Assumes atoms are in the same order. + """ + return np.sqrt(((coords_true - coords_pred) ** 2).mean()) + + +# Metrics calculations lifted from https://github.com/janosh/matbench-discovery/blob/main/matbench_discovery/metrics/discovery.py def classify_stable( each_true: Sequence[float] | pd.Series | np.ndarray, each_pred: Sequence[float] | pd.Series | np.ndarray, @@ -238,6 +257,7 @@ def stable_metrics( from sklearn.metrics import r2_score # type: ignore + # Return the standard discovery metrics return dict( F1=f1_score, DAF=precision / prevalence if prevalence > 0 else float("nan"), @@ -252,13 +272,13 @@ def stable_metrics( FPR=FPR, TNR=TNR, FNR=FNR, - TP=n_true_pos, - FP=n_false_pos, - TN=n_true_neg, - FN=n_false_neg, MAE=np.abs(each_true - each_pred).mean(), RMSE=((each_true - each_pred) ** 2).mean() ** 0.5, - R2=r2_score(each_true, each_pred) if len(each_true) > 1 else float("nan"), + **{ + "R^2": r2_score(each_true, each_pred) + if len(each_true) > 1 + else float("nan") + }, ) @@ -514,12 +534,46 @@ def load_dataset_mp_trj(config: Dict[str, Any]) -> List[Any]: def calculate_metrics_energy( results: Dict[str, Any], config: Dict[str, Any] ) -> Dict[str, Any]: - from matbench_discovery.data import df_wbm # type: ignore + from io import TextIOWrapper + from zipfile import ZipFile + + from ase.io import read + from matbench_discovery.data import DataFiles, df_wbm # type: ignore if len(results) == 0: return {"error": "No 
results to evaluate"} model_energies = {} + rmsd_list = [] + + # Calculate RMSD if positions are returned (e.g. for IS2RE) + try: + # Check if any result has positions + first_res = next(iter(results.values())) + if isinstance(first_res, dict) and "positions" in first_res: + with ZipFile(DataFiles.wbm_relaxed_atoms.path, "r") as zf: + for sid, res in results.items(): + if isinstance(res, dict) and "positions" in res: + try: + # Load GT structure + # sid is the filename in the zip (e.g. "material_id.extxyz") + with zf.open(sid) as f: + text_stream = TextIOWrapper(f, encoding="utf-8") + # Read first frame (should be only one for relaxed) + gt_atoms = read(text_stream, format="extxyz") + + pred_pos = np.array(res["positions"]) + gt_pos = gt_atoms.get_positions() # type: ignore + + if pred_pos.shape == gt_pos.shape: + # Use helper function + rmsd = calc_rmsd(gt_pos, pred_pos) + rmsd_list.append(rmsd) + except Exception: + pass + except Exception as e: + print(f"Warning: RMSD calculation failed: {e}") + for sid, res in results.items(): if isinstance(res, dict) and res.get("energy") is not None: mat_id = sid.replace(".extxyz", "") @@ -549,6 +603,10 @@ def calculate_metrics_energy( metrics = stable_metrics(each_true, each_pred, prevalence=global_prevalence) metrics["num_evaluated"] = len(common_ids) + + # Inject RMSD + metrics["RMSD"] = float(np.mean(rmsd_list)) if rmsd_list else float("nan") + return metrics @@ -561,20 +619,9 @@ def calculate_metrics_forces( from ase.io import read from matbench_discovery.data import DataFiles # type: ignore - metrics: Dict[str, List[float]] = { - "energy_mae": [], - "energy_rmse": [], - "force_mae": [], - "force_rmse": [], - "stress_mae": [], - "stress_rmse": [], - } + # We will use the standard stable_metrics for energy predictions in the trajectory all_e_pred: List[float] = [] all_e_true: List[float] = [] - all_f_pred: List[float] = [] - all_f_true: List[float] = [] - all_s_pred: List[float] = [] - all_s_true: List[float] = [] 
zip_path = DataFiles.mp_trj_extxyz.path @@ -583,71 +630,32 @@ def calculate_metrics_forces( if "error" in res: continue try: - with zf.open(sid) as f: - text_stream = TextIOWrapper(f, encoding="utf-8") - atoms_list = read(text_stream, format="extxyz", index=":") - gt_atoms = atoms_list[-1] - - e_pred = res["energy"] - e_true = gt_atoms.get_potential_energy() # type: ignore[union-attr] - n_atoms = len(gt_atoms) # type: ignore[arg-type] - energy_error = abs(e_pred - e_true) / n_atoms - metrics["energy_mae"].append(energy_error) - metrics["energy_rmse"].append(energy_error**2) - all_e_pred.append(e_pred / n_atoms) - all_e_true.append(e_true / n_atoms) - - f_pred = np.array(res["forces"]) - f_true = gt_atoms.get_forces() # type: ignore[union-attr] - force_error = np.abs(f_pred - f_true) - metrics["force_mae"].append(force_error.mean()) - metrics["force_rmse"].append((force_error**2).mean()) - all_f_pred.extend(f_pred.flatten()) - all_f_true.extend(f_true.flatten()) - - s_pred = np.array(res["stress"]) - s_true = gt_atoms.get_stress() # type: ignore[union-attr] - stress_error = np.abs(s_pred - s_true) - metrics["stress_mae"].append(stress_error.mean()) - metrics["stress_rmse"].append((stress_error**2).mean()) - all_s_pred.extend(s_pred.flatten()) - all_s_true.extend(s_true.flatten()) - + if isinstance(res, dict) and "energy" in res: + with zf.open(sid) as f: + text_stream = TextIOWrapper(f, encoding="utf-8") + atoms_list = read(text_stream, format="extxyz", index=":") + gt_atoms = atoms_list[-1] # type: ignore + + e_pred = res["energy"] + e_true = gt_atoms.get_potential_energy() # type: ignore + n_atoms = len(gt_atoms) # type: ignore + + # Normalize per atom + all_e_pred.append(e_pred / n_atoms) + all_e_true.append(e_true / n_atoms) except Exception: pass - from sklearn.metrics import r2_score # type: ignore - - result_metrics = {} - if metrics["energy_mae"]: - result_metrics["energy_mae"] = float(np.mean(metrics["energy_mae"])) - result_metrics["energy_rmse"] = 
float(np.sqrt(np.mean(metrics["energy_rmse"]))) - result_metrics["energy_r2"] = ( - float(r2_score(all_e_true, all_e_pred)) - if len(all_e_true) > 1 - else float("nan") - ) - - if metrics["force_mae"]: - result_metrics["force_mae"] = float(np.mean(metrics["force_mae"])) - result_metrics["force_rmse"] = float(np.sqrt(np.mean(metrics["force_rmse"]))) - result_metrics["force_r2"] = ( - float(r2_score(all_f_true, all_f_pred)) - if len(all_f_true) > 1 - else float("nan") - ) + if not all_e_true: + return {"error": "No valid energy comparisons found"} - if metrics["stress_mae"]: - result_metrics["stress_mae"] = float(np.mean(metrics["stress_mae"])) - result_metrics["stress_rmse"] = float(np.sqrt(np.mean(metrics["stress_rmse"]))) - result_metrics["stress_r2"] = ( - float(r2_score(all_s_true, all_s_pred)) - if len(all_s_true) > 1 - else float("nan") - ) + each_true = np.array(all_e_true) + each_pred = np.array(all_e_pred) - result_metrics["num_evaluated"] = len(metrics["energy_mae"]) - return result_metrics + # Calculate standard discovery metrics on energies + metrics = stable_metrics(each_true, each_pred) + metrics["num_evaluated"] = len(all_e_true) + return metrics def run_benchmark_hog( @@ -763,6 +771,37 @@ def run_benchmark_hog( (item_id, item) for item_id, item in all_items if str(item_id) not in results ] + try: + import torch + + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + num_gpus = 1 + else: + num_gpus = 0 + except ImportError: + num_gpus = 0 + + use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1 + # Use sched_getaffinity to get cores available to this job, not total cores on node + try: + total_cores = len(os.sched_getaffinity(0)) # type: ignore[attr-defined] + except AttributeError: + # Fallback for systems without sched_getaffinity (e.g., macOS) + total_cores = os.cpu_count() or 1 + num_workers = num_gpus if use_multi_gpu else 1 + 
available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores + threads_per_worker = max(1, available_cores // num_workers) + + # MPS (Apple Silicon) performance degrades with high thread counts due to contention + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + threads_per_worker = 1 + + logger.info( + f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)" + ) + if not items_to_process: logger.info( "All items already processed! Calculating metrics from checkpoint..." @@ -785,7 +824,7 @@ def run_benchmark_hog( hardware_info=hardware_info, model_info=model_info, total_elapsed=prior_elapsed, - num_workers=0, + num_workers=num_workers, num_structures_total=len(all_items), num_structures_processed=len(results), ) @@ -793,42 +832,9 @@ def run_benchmark_hog( logger.info(f"Processing {len(items_to_process)} remaining items") - import random - random.seed(42) random.shuffle(items_to_process) - try: - import torch - - if torch.cuda.is_available(): - num_gpus = torch.cuda.device_count() - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - num_gpus = 1 - else: - num_gpus = 0 - except ImportError: - num_gpus = 0 - - use_multi_gpu = config.get("use_multi_gpu", True) and num_gpus > 1 - # Use sched_getaffinity to get cores available to this job, not total cores on node - try: - total_cores = len(os.sched_getaffinity(0)) # type: ignore[attr-defined] - except AttributeError: - # Fallback for systems without sched_getaffinity (e.g., macOS) - total_cores = os.cpu_count() or 1 - num_workers = num_gpus if use_multi_gpu else 1 - available_cores = max(1, total_cores - 2) if total_cores > 4 else total_cores - threads_per_worker = max(1, available_cores // num_workers) - - # MPS (Apple Silicon) performance degrades with high thread counts due to contention - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - threads_per_worker = 1 - - 
logger.info( - f"Resources: {num_gpus} GPUs, {total_cores} Cores. Using {num_workers} workers ({threads_per_worker} threads/worker)" - ) - start_time = time.time() chunk_size = 1000 * num_workers chunks = [ diff --git a/garden_ai/client.py b/garden_ai/client.py index 87b8e0e4..2b3ee425 100644 --- a/garden_ai/client.py +++ b/garden_ai/client.py @@ -23,9 +23,9 @@ from globus_sdk.authorizers import GlobusAuthorizer from globus_sdk.scopes import ScopeBuilder from globus_sdk.tokenstorage import SimpleJSONFileAdapter -from modal.cli._traceback import setup_rich_traceback from rich import print from rich.prompt import Prompt +from rich.traceback import install from garden_ai.backend_client import BackendClient from garden_ai.constants import GardenConstants @@ -34,8 +34,7 @@ from garden_ai.hpc.gardens.mlip_garden import MLIPGarden logger = logging.getLogger() -# modal helper replacement for rich.traceback.install -setup_rich_traceback() +install() class AuthException(Exception): diff --git a/garden_ai/schemas/benchmark.py b/garden_ai/schemas/benchmark.py index f607c8a8..283f7fd3 100644 --- a/garden_ai/schemas/benchmark.py +++ b/garden_ai/schemas/benchmark.py @@ -29,7 +29,7 @@ class BenchmarkResultCreateRequest(BaseModel): class BenchmarkResultResponse(BaseModel): """Response schema from the benchmark result creation endpoint.""" - id: str = Field(..., description="Unique identifier for the benchmark result") + id: int = Field(..., description="Unique identifier for the benchmark result") benchmark_name: str benchmark_task_name: str metrics: Dict[str, Any] From d9ee1ffdcf1a2491b6ed2e2082d020dda4cca385 Mon Sep 17 00:00:00 2001 From: hholb Date: Thu, 22 Jan 2026 12:49:32 -0700 Subject: [PATCH 23/23] fix gpu assignments --- .../benchmarks/matbench_discovery/tasks.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/garden_ai/benchmarks/matbench_discovery/tasks.py b/garden_ai/benchmarks/matbench_discovery/tasks.py index d2d0ee7a..cdf44cca
100644 --- a/garden_ai/benchmarks/matbench_discovery/tasks.py +++ b/garden_ai/benchmarks/matbench_discovery/tasks.py @@ -296,14 +296,18 @@ def _process_batch_common( import re import time + gpu_id = model_config.get("gpu_id") + if gpu_id is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + device = "cuda" + else: + device = setup_device(gpu_id) + import torch os.environ["OMP_NUM_THREADS"] = str(num_threads) torch.set_num_threads(num_threads) - gpu_id = model_config.get("gpu_id") - device = setup_device(gpu_id) - worker_logger = logging.getLogger(f"worker_{batch_id}") worker_logger.info( f"Started {task_name} on {device} with {len(structures)} structures. Threads: {num_threads}" @@ -590,10 +594,28 @@ def calculate_metrics_energy( df_subset = df_wbm_indexed.loc[common_ids] y_pred = np.array([model_energies[mid] for mid in common_ids]) - y_true = df_subset["uncorrected_energy"].values n_atoms = df_subset["n_sites"].values - e_form_error = (y_pred - y_true) / n_atoms + # CRITICAL FIX: Compute formation energy error, not total energy error + # Formation energy is defined as: E_formation = E_total - Σ(n_i × E_ref_i) + # where E_ref_i are elemental reference energies in their standard states + + # Get ground truth formation energy per atom (uncorrected, matches model prediction level) + y_true_form = df_subset["e_form_per_atom_uncorrected"].values # eV/atom + + # Compute reference energy per atom from known DFT data + # E_ref_per_atom = E_total_per_atom - E_form_per_atom + y_true_total = df_subset["uncorrected_energy"].values + ref_energy_per_atom = (y_true_total / n_atoms) - y_true_form + + # Compute model's predicted formation energy per atom + # E_form_pred = E_total_pred / n_atoms - E_ref_per_atom + y_pred_form = (y_pred / n_atoms) - ref_energy_per_atom + + # Formation energy error (this is what affects stability predictions!) 
+ e_form_error = y_pred_form - y_true_form + + # Predict energy above hull by adding formation energy error to ground truth hull distance each_true = df_subset["e_above_hull_mp2020_corrected_ppd_mp"].values each_pred = each_true + e_form_error