OpenADMET · dwwest · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -59,3 +59,13 @@ The CLI entry point is `openadmet` (`openadmet/models/cli/cli.py`), with subcomm
 - Ruff + Black formatting; isort with Black-compatible profile
 - Sentence case in comments and print statements; acronyms (MPNN, MVE, ADMET, FFN) stay capitalized
 - Do not number steps in comments; do not end comments with a period
+
+## Unit Testing & Refactoring Rules
+
+When writing or refactoring tests, you must strictly adhere to the following guidelines to ensure tests are mathematically sound, robust, and non-tautological:
+
+* **Avoid Tautological Mocks:** Do not mock the system under test. Mock expensive I/O, external dependencies, or large data loads, but ensure the core logic of the target function is actually executed. Use lightweight synthetic datasets (e.g., small tensors or pandas DataFrames) instead of bypassing execution entirely.
+* **Standard Mocking:** Never write custom nested dummy classes or custom mock fixtures. Always use the standard `pytest-mock` library (the `mocker` fixture) to patch objects and verify calls.
+* **No Lazy Assertions:** Never use `assert True`. Assert actual state changes, specific dictionary keys, object types (e.g., `isinstance(obj, matplotlib.figure.Figure)`), or verify file creation via the `tmp_path` fixture.
+* **Robust ML Data Testing:** When testing data splitters or clustering algorithms, you must explicitly assert that the resulting train/validation/test sets are mutually exclusive (e.g., checking that set intersections of indices or arrays are empty). Ensure synthetic testing data has enough variance (e.g., diverse SMILES scaffolds) to meaningfully test the algorithm.
+* **Safe Floating-Point Math:** Never use strict equality (`==`) to compare floating-point numbers. Always use `pytest.approx()` or `numpy.testing.assert_allclose()` (which NumPy recommends over `numpy.testing.assert_almost_equal()`) to prevent cross-platform precision failures. Assert the actual math (e.g., UQ or metric calculations), not just the existence of the output.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
   - id: black
     files: ^openadmet_models
 - repo: https://github.com/PyCQA/isort
-  rev: 8.0.0
+  rev: 8.0.1
   hooks:
   - id: isort
     files: ^openadmet_models
@@ -37,7 +37,7 @@ repos:
     - --py39-plus
 - repo: https://github.com/astral-sh/ruff-pre-commit
   # Ruff version.
-  rev: v0.15.2
+  rev: v0.15.4
   hooks:
     # Run the linter.
     - id: ruff-check

diff --git a/devtools/conda-envs/openadmet-models-gpu.yaml b/devtools/conda-envs/openadmet-models-gpu.yaml
@@ -33,6 +33,7 @@ dependencies:
   - pytorch_scatter
   - pytorch_sparse
   - pytest
+  - pytest-mock
   - pytest-cov
   - pytest-xdist
   - rdkit

diff --git a/devtools/conda-envs/openadmet-models.yaml b/devtools/conda-envs/openadmet-models.yaml
@@ -33,6 +33,7 @@ dependencies:
   - pytorch_scatter
   - pytorch_sparse
   - pytest
+  - pytest-mock
   - pytest-cov
   - pytest-xdist
   - rdkit

diff --git a/openadmet/models/active_learning/committee.py b/openadmet/models/active_learning/committee.py
@@ -382,8 +382,8 @@ def _predict(self, X, return_std=False, **kwargs):
         if return_std is False:
             return mean
 
-        # Compute standard deviation
-        std = np.std(preds, axis=-1)
+        # Compute standard deviation, guard against zero std
+        std = np.maximum(np.std(preds, axis=-1), 1e-8)
 
         # Calibrate std if calibration model is available
         if self.calibrated:

diff --git a/openadmet/models/anvil/specification.py b/openadmet/models/anvil/specification.py
@@ -15,8 +15,8 @@
 from openadmet.models.active_learning.ensemble_base import (
     get_ensemble_class,
 )
-from openadmet.models.drivers import DriverType
 from openadmet.models.architecture.model_base import get_mod_class
+from openadmet.models.drivers import DriverType
 from openadmet.models.eval.eval_base import get_eval_class
 from openadmet.models.features.feature_base import get_featurizer_class
 from openadmet.models.registries import *  # noqa: F401, F403
@@ -449,9 +449,23 @@ class SplitSpec(AnvilSection):
 
 
 class FeatureSpec(AnvilSection):
-    """Featurization specification."""
+    """
+    Featurization specification.
+
+    Attributes
+    ----------
+    section_name : ClassVar[str]
+        The name of the section.
+    type : Optional[str]
+        The type of featurizer to use.
+    params : dict
+        The parameters for the featurizer.
+
+    """
 
     section_name: ClassVar[str] = "feat"
+    type: str | None = None
+    params: dict = Field(default_factory=dict)
 
 
 class ModelSpec(AnvilSection):
@@ -546,8 +560,8 @@ class EnsembleSpec(AnvilSection):
 
     section_name: ClassVar[str] = "ensemble"
     n_models: int
-    calibration_method: str | None = "isotonic-regression"
-    use_bagging: bool = True
+    calibration_method: str | None = None
+    use_bagging: bool = False
     param_paths: list[str] | None = None
     serial_paths: list[str] | None = None
 
@@ -729,6 +743,26 @@ def to_workflow(self):
         # Pull driver from associated trainer to choose the correct workflow
         trainer_class = self.procedure.train.to_class()
         driver = _DRIVER_TO_CLASS[trainer_class._driver_type]
+        model_kwargs = {
+            "param_path": self.procedure.model.param_path,
+            "serial_path": self.procedure.model.serial_path,
+            "freeze_weights": self.procedure.model.freeze_weights,
+        }
+        ensemble_kwargs = (
+            {
+                "n_models": self.procedure.ensemble.n_models,
+                "calibration_method": self.procedure.ensemble.calibration_method,
+                "param_paths": self.procedure.ensemble.param_paths,
+                "serial_paths": self.procedure.ensemble.serial_paths,
+                "use_bagging": self.procedure.ensemble.use_bagging,
+            }
+            if self.procedure.ensemble
+            else {}
+        )
+        feat_kwargs = {
+            "type": self.procedure.feat.type,
+            "params": self.procedure.feat.params,
+        }
 
         return driver(
             metadata=self.metadata,
@@ -744,5 +778,34 @@ def to_workflow(self):
             feat=self.procedure.feat.to_class(),
             trainer=self.procedure.train.to_class(),
             evals=[eval.to_class() for eval in self.report.eval],
-            parent_spec=self,
+            model_kwargs=model_kwargs,
+            ensemble_kwargs=ensemble_kwargs,
+            feat_kwargs=feat_kwargs,
+        )
+
+    def run(
+        self,
+        output_dir: PathLike = "anvil_training",
+        debug: bool = False,
+        tag: str = None,  # When None, the written recipe keeps the spec's own tag
+    ):
+        """Run the Anvil workflow from this specification."""
+        workflow = self.to_workflow()
+        result = workflow.run(output_dir=output_dir, debug=debug, tag=tag)
+        # Use the directory the workflow resolved, else fall back to output_dir
+        resolved_output_dir = workflow.resolved_output_dir or Path(output_dir)
+        resolved_output_dir.mkdir(parents=True, exist_ok=True)
+        provenance_spec = self.model_copy(deep=True)  # Tag is set on a copy, not self
+        if tag is not None:
+            provenance_spec.metadata.tag = tag
+        # Save the full recipe, then metadata, procedure, data and report YAML files
+        provenance_spec.to_recipe(resolved_output_dir / "anvil_recipe.yaml")
+        recipe_components = resolved_output_dir / "recipe_components"
+        recipe_components.mkdir(parents=True, exist_ok=True)
+        provenance_spec.to_multi_yaml(
+            metadata_yaml=recipe_components / "metadata.yaml",
+            procedure_yaml=recipe_components / "procedure.yaml",
+            data_yaml=recipe_components / "data.yaml",
+            report_yaml=recipe_components / "eval.yaml",  # Report goes to eval.yaml
         )
+        return result  # Result of workflow.run, passed through unchanged