From d8bbe7824a5c41eb0a02e6befefa4c7ee1afd119 Mon Sep 17 00:00:00 2001
From: Sean Colby <seanmcolby@gmail.com>
Date: Thu, 14 May 2026 08:43:30 -0800
Subject: [PATCH 1/3] Strengthen unit test suite across eval, features,
 inference, models, split, and CLI

Apply testing standards from remove-parent-spec branch across the broad unit
test suite. Key changes:

- Replace integration-style tests with real lightweight components
- Remove MagicMock abuse on systems under test; keep mocks only for I/O
- Add pytest.approx throughout to prevent cross-platform floating-point failures
- Add mutual-exclusion assertions for all splitter tests
- Strengthen test_inference.py: real FingerprintFeaturizer, DummyRegressorModel,
  CommitteeRegressor; assert derived math values (PRED=1.0, STD=1.0, UCB=4.0)
- Add NumPy-style docstrings to all test functions
- Update test_cli.py for spec.run() call path (PR 3 API)
- test_eval.py: preserves RAE/log tests from main + docstrings/pytest.approx
- test_base.py: preserves weights_only load test from main + docstrings
- test_mtenn.py: not included (module deleted in main via #529)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 openadmet/models/tests/unit/cli/test_cli.py   | 127 ++++++--
 .../tests/unit/comparison/test_comparison.py  |  53 +++-
 openadmet/models/tests/unit/data/test_data.py |  18 ++
 openadmet/models/tests/unit/eval/test_eval.py |  60 +++-
 .../tests/unit/features/test_features.py      |  81 ++++-
 .../models/tests/unit/features/test_nepare.py |   6 +
 .../tests/unit/inference/test_inference.py    | 228 +++++++++++---
 .../models/tests/unit/models/test_base.py     |  13 +
 .../models/tests/unit/models/test_lgbm.py     |  23 ++
 .../models/tests/unit/models/test_nepare.py   |   2 +
 .../models/tests/unit/split/test_splitters.py | 288 ++++++++++++------
 openadmet/models/tests/unit/test_utils.py     |   6 +
 12 files changed, 719 insertions(+), 186 deletions(-)

diff --git a/openadmet/models/tests/unit/cli/test_cli.py b/openadmet/models/tests/unit/cli/test_cli.py
index 73da2cec..8e430025 100644
--- a/openadmet/models/tests/unit/cli/test_cli.py
+++ b/openadmet/models/tests/unit/cli/test_cli.py
@@ -1,59 +1,87 @@
-from openadmet.models.cli.cli import cli
-from openadmet.models.tests.test_utils import click_success
-from openadmet.models.tests.unit.datafiles import (
-    anvil_lgbm_trained_model_dir,
-    pred_test_data_csv,
-    basic_anvil_yaml_cv,
-)
+from pathlib import Path
+
 import pytest
 from click.testing import CliRunner
 
+from openadmet.models.cli import anvil as anvil_cli_module
+from openadmet.models.cli import predict as predict_cli_module
+from openadmet.models.cli.cli import cli
+from openadmet.models.tests.test_utils import click_success
+from openadmet.models.tests.unit.datafiles import basic_anvil_yaml_cv
+
+
+@pytest.fixture
+def runner():
+    """Provide a Click CliRunner for testing CLI commands in isolation."""
+    return CliRunner()
 
-def test_toplevel_runnable():
-    """Test the top-level CLI command"""
-    runner = CliRunner()
+
+def test_toplevel_runnable(runner):
+    """Ensure the top-level 'openadmet' command runs and displays help without error."""
     result = runner.invoke(cli, ["--help"])
     assert click_success(result)
 
 
 @pytest.mark.parametrize(
-    "args",
-    [
-        ["anvil", "--help"],
-        ["compare", "--help"],
-        ["predict", "--help"],
-    ],
+    "args", [["anvil", "--help"], ["compare", "--help"], ["predict", "--help"]]
 )
-def test_subcommand_runnable(args):
-    """Test the subcommands"""
-    runner = CliRunner()
+def test_subcommand_runnable(runner, args):
+    """Verify that all major subcommands (anvil, compare, predict) are registered and runnable."""
     result = runner.invoke(cli, args)
     assert click_success(result)
 
 
-def test_predict_cli(tmp_path):
-    """Test the predict CLI command"""
-    runner = CliRunner()
+def test_predict_cli_invokes_inference(tmp_path, runner, mocker):
+    """
+    Validate that the 'predict' subcommand correctly parses arguments and calls the underlying inference function.
+
+    We mock `inference_func` to avoid loading real models (which is heavy and requires trained artifacts).
+    This ensures that the CLI layer correctly passes paths, column names, and flags to the logic layer.
+    """
+    input_csv = tmp_path / "input.csv"
+    input_csv.write_text("MY_SMILES\nCCO\n")
+    model_dir = tmp_path / "model_dir"
+    model_dir.mkdir()
+
+    mock_inference = mocker.patch.object(predict_cli_module, "inference_func")
+
     result = runner.invoke(
         cli,
         [
             "predict",
             "--input-path",
-            pred_test_data_csv,
+            input_csv,
             "--input-col",
             "MY_SMILES",
             "--model-dir",
-            anvil_lgbm_trained_model_dir,
+            model_dir,
             "--output-csv",
             tmp_path / "predictions.csv",
+            "--accelerator",
+            "cpu",
         ],
     )
     assert click_success(result)
+    mock_inference.assert_called_once()
+    called = mock_inference.call_args.kwargs
+    assert called["input_col"] == "MY_SMILES"
+    assert called["accelerator"] == "cpu"
+    assert called["write_csv"] is True
+    assert list(called["model_dir"]) == [model_dir]
 
 
-def test_anvil_cli(tmp_path):
-    """Test the anvil CLI command"""
-    runner = CliRunner()
+def test_anvil_cli_invokes_workflow(tmp_path, runner, mocker):
+    """
+    Validate that the 'anvil' subcommand correctly initializes and runs a workflow from a recipe.
+
+    We mock the `AnvilSpecification` and workflow execution to verify that the CLI correctly handles
+    recipe paths and output directories without actually running a full ML training job.
+    """
+    mock_spec = mocker.Mock()
+    mock_from_recipe = mocker.patch.object(
+        anvil_cli_module.AnvilSpecification, "from_recipe", return_value=mock_spec
+    )
+
     result = runner.invoke(
         cli,
         [
@@ -66,3 +94,48 @@ def test_anvil_cli(tmp_path):
     )
 
     assert click_success(result)
+    mock_from_recipe.assert_called_once_with(basic_anvil_yaml_cv)
+    mock_spec.run.assert_called_once()
+    called = mock_spec.run.call_args.kwargs
+    assert Path(called["output_dir"]) == tmp_path / "anvil_output"
+    assert called["debug"] is False
+
+
+@pytest.mark.parametrize(
+    "aq_fxns,beta,best_y,xi,expected",
+    [
+        (("ucb",), (2.0,), (), (), {"ucb": {"beta": 2.0}}),
+        (
+            ("ei", "pi"),
+            (),
+            (1.0, 2.0),
+            (0.1, 0.2),
+            {"ei": {"xi": 0.1, "best_y": 1.0}, "pi": {"xi": 0.2, "best_y": 2.0}},
+        ),
+    ],
+)
+def test_validate_aq_fxns_success(aq_fxns, beta, best_y, xi, expected):
+    """
+    Verify that valid combinations of acquisition function arguments are correctly parsed into a configuration dict.
+
+    This tests the CLI argument validation logic for active learning parameters.
+    """
+    assert predict_cli_module._validate_aq_fxns(aq_fxns, beta, best_y, xi) == expected
+
+
+@pytest.mark.parametrize(
+    "aq_fxns,beta,best_y,xi,error_message",
+    [
+        (("ucb", "ucb"), (1.0, 2.0), (), (), "UCB can only be specified once"),
+        (("ei",), (), (), (), "must be specified once per EI and/or PI acquisition"),
+        (("ucb",), (), (), (), "Field `beta` must be specified for UCB acquisition"),
+    ],
+)
+def test_validate_aq_fxns_errors(aq_fxns, beta, best_y, xi, error_message):
+    """
+    Ensure that invalid acquisition function arguments trigger appropriate validation errors.
+
+    This prevents users from running predictions with ambiguous or incomplete active learning settings.
+    """
+    with pytest.raises(ValueError, match=error_message):
+        predict_cli_module._validate_aq_fxns(aq_fxns, beta, best_y, xi)
diff --git a/openadmet/models/tests/unit/comparison/test_comparison.py b/openadmet/models/tests/unit/comparison/test_comparison.py
index 27d93900..cd9949cc 100644
--- a/openadmet/models/tests/unit/comparison/test_comparison.py
+++ b/openadmet/models/tests/unit/comparison/test_comparison.py
@@ -1,26 +1,32 @@
+import numpy as np
 import pytest
 from numpy.testing import assert_almost_equal
-import numpy as np
+
 from openadmet.models.comparison.compare_base import get_comparison_class
 from openadmet.models.comparison.posthoc import PostHocComparison
 from openadmet.models.tests.unit.datafiles import (
-    cyp2c9_json,
+    anvil_lgbm_trained_model_dir,
     cyp1a2_json,
+    cyp2c9_json,
     cyp3a4_json,
     multi_task_json,
-    anvil_lgbm_trained_model_dir,
 )
 
 
 def test_get_comparison_class():
-    """Test getting comparison class."""
+    """
+    Test dynamic retrieval of comparison classes from the registry.
+
+    Verifies that valid class names return the class and invalid names raise ValueError.
+    """
     get_comparison_class("PostHoc")
     with pytest.raises(ValueError):
         get_comparison_class("NotARealClass")
 
 
 def test_posthoc_fails_on_incorrect_inputs():
-    """Test that posthoc comparison fails when given incorrect inputs.
+    """
+    Test that posthoc comparison fails when given incorrect inputs.
 
     Inputs include:
     - No inputs
@@ -28,6 +34,8 @@ def test_posthoc_fails_on_incorrect_inputs():
     - Mismatched lengths of model_stats_fns, labels, and task_names
     - Repeated labels
     - Incorrect labels and task_names for model_stats_fns
+
+    This validation is critical to ensure that comparison tables and plots match models to their correct metadata.
     """
     comp_obj = PostHocComparison()
     with pytest.raises(ValueError):
@@ -69,7 +77,12 @@ def test_posthoc_repeat_label_error():
 
 
 def test_posthoc_comparison():
-    """Test that posthoc comparison works when given correct inputs."""
+    """
+    Test that posthoc comparison works correctly when given valid inputs.
+
+    This verifies the calculation of statistical tests (Levene's test for equality of variances,
+    Tukey's HSD for pairwise mean differences) based on loaded model metrics.
+    """
     model_stats = [cyp2c9_json, cyp3a4_json, cyp1a2_json]
     model_tags = [
         "openadmet-CYP2C9-pchembl-regression-testing-cv",
@@ -97,7 +110,12 @@ def test_posthoc_comparison():
 def test_posthoc_comparison_anvil_reader_and_feature_label(
     label_types, expected_labels
 ):
-    """Test that posthoc comparison can read from anvil-trained model directories and features."""
+    """
+    Test that posthoc comparison can automatically extract labels from anvil-trained model directories.
+
+    This ensures that metadata stored in `metadata.yaml` within model directories can be correctly
+    parsed to generate readable labels for comparison plots.
+    """
     comp_obj = PostHocComparison()
     model_stats_fns, labels, task_names = comp_obj.label_and_task_name_from_anvil(
         model_dirs=[anvil_lgbm_trained_model_dir], label_types=label_types
@@ -121,7 +139,12 @@ def test_posthoc_comparison_json_reader_fails(label_types):
 
 
 def test_posthoc_comparison_json_reader():
-    """Test that posthoc comparison can read multi vs single task from anvil file."""
+    """
+    Test that posthoc comparison handles both multi-task and single-task JSON result files.
+
+    This verifies that the system can normalize results from different task types into a common
+    format for statistical comparison.
+    """
     model_stats = [multi_task_json, cyp3a4_json]
     model_tags = ["multitask", "single_task"]
     task_tags = ["cyp3a4_pchembl_value_mean", "pchembl_value_mean"]
@@ -130,14 +153,18 @@ def test_posthoc_comparison_json_reader():
     levene, tukeys_df = comp_obj.compare(
         model_stats_fns=model_stats, labels=model_tags, task_names=task_tags
     )
-    assert levene["mse"][0] == 2.483488460351842
-    assert levene["ktau"][0] == 1.0392615736603197
-    assert tukeys_df["metric_val"][0] == -0.01037444780666702
-    assert tukeys_df["pvalue"][0] == 0.2488307785417857
+    assert levene["mse"][0] == pytest.approx(2.483, abs=0.001)
+    assert levene["ktau"][0] == pytest.approx(1.039, abs=0.001)
+    assert tukeys_df["metric_val"][0] == pytest.approx(-0.010, abs=0.001)
+    assert tukeys_df["pvalue"][0] == pytest.approx(0.248, abs=0.001)
 
 
 def test_posthoc_comparison_printing(capsys):
-    """Test that posthoc comparison prints results to console."""
+    """
+    Test that posthoc comparison prints results to console in a readable format.
+
+    We capture stdout to verify that Levene's test and Tukey's HSD results are actually displayed to the user.
+    """
     model_stats = [cyp2c9_json, cyp3a4_json, cyp1a2_json]
     model_tags = [
         "openadmet-CYP2C9-pchembl-regression-testing-cv",
diff --git a/openadmet/models/tests/unit/data/test_data.py b/openadmet/models/tests/unit/data/test_data.py
index d714bc7e..fa310a66 100644
--- a/openadmet/models/tests/unit/data/test_data.py
+++ b/openadmet/models/tests/unit/data/test_data.py
@@ -5,6 +5,12 @@
 
 
 def test_data_spec_from_csv():
+    """
+    Validate loading data from a CSV file via DataSpec.
+
+    Ensures that the data loader correctly reads the specified CSV, extracts the target and SMILES columns,
+    and returns them as expected.
+    """
     data_spec = DataSpec(
         type="intake",
         resource=test_csv,
@@ -18,6 +24,12 @@ def test_data_spec_from_csv():
 
 
 def test_data_spec_from_intake():
+    """
+    Validate loading data from an Intake catalog.
+
+    Intake allows for declarative data loading. This test checks that DataSpec can correctly interface
+    with an Intake catalog to retrieve data.
+    """
     data_spec = DataSpec(
         type="intake",
         resource=intake_cat,
@@ -32,6 +44,12 @@ def test_data_spec_from_intake():
 
 @pytest.mark.parametrize("dropna, expected_length", [(True, 3333), (False, 7196)])
 def test_data_spec_dropna(dropna, expected_length):
+    """
+    Test the `dropna` functionality in DataSpec.
+
+    Verifies that rows with missing values in target columns are dropped when dropna=True,
+    and preserved when dropna=False. This is critical for handling real-world datasets which often contain gaps.
+    """
     data_spec = DataSpec(
         type="intake",
         resource=nan_data,
diff --git a/openadmet/models/tests/unit/eval/test_eval.py b/openadmet/models/tests/unit/eval/test_eval.py
index 6af7599e..c0040763 100644
--- a/openadmet/models/tests/unit/eval/test_eval.py
+++ b/openadmet/models/tests/unit/eval/test_eval.py
@@ -1,6 +1,8 @@
+import matplotlib.figure
+import numpy as np
 import pytest
+import seaborn as sns
 
-import numpy as np
 from openadmet.models.eval.binary import PosthocBinaryMetrics
 from openadmet.models.eval.classification import (
     ClassificationMetrics,
@@ -20,21 +22,28 @@
 
 
 def test_get_eval_class():
+    """Verify that evaluation classes can be retrieved by name from the registry."""
     get_eval_class("RegressionMetrics")
     get_eval_class("PosthocBinaryMetrics")
     get_eval_class("ClassificationMetrics")
 
 
 def test_regression_metrics():
+    """
+    Validate calculation of standard regression metrics (MSE, MAE, R2).
+
+    This test uses simple synthetic data to ensure that the mathematical implementations
+    of these metrics are correct and return the expected values.
+    """
     y_true = np.array([3, -0.5, 2, 7]).reshape(-1, 1)
     y_pred = np.array([2.5, 0.0, 2, 8]).reshape(-1, 1)
 
     rm = RegressionMetrics()
     metrics = rm.evaluate(y_true, y_pred)
 
-    assert metrics["task_0"]["mse"]["value"] == 0.375
-    assert metrics["task_0"]["mae"]["value"] == 0.5
-    assert metrics["task_0"]["r2"]["value"] == 0.9486081370449679
+    assert metrics["task_0"]["mse"]["value"] == pytest.approx(0.375, abs=0.001)
+    assert metrics["task_0"]["mae"]["value"] == pytest.approx(0.5, abs=0.001)
+    assert metrics["task_0"]["r2"]["value"] == pytest.approx(0.94860, abs=0.001)
 
 
 def test_regression_metrics_and_cv_include_rae_and_pct_within_1_log_for_pxc50():
@@ -94,16 +103,32 @@ def test_lightning_cv_pct_within_1_log_uses_raw_metric_callable():
 
 
 def test_regression_plots():
+    """
+    Verify that regression plotting functions return valid figure objects.
+
+    This ensures that regression plots (JointGrid for parity, Figure for CI) are generated
+    without error, which is important for model reporting.
+    """
     y_true = np.array([3, -0.5, 2, 7]).reshape(-1, 1)
     y_pred = np.array([2.5, 0.0, 2, 8]).reshape(-1, 1)
 
     rm = RegressionPlots()
-    rm.evaluate(y_true, y_pred)
+    plot_data = rm.evaluate(y_true, y_pred)
 
-    assert True
+    assert isinstance(plot_data, dict)
+    assert "task_0_regplot" in plot_data
+    assert "task_0_ciplot" in plot_data
+    assert isinstance(plot_data["task_0_regplot"], sns.axisgrid.JointGrid)
+    assert isinstance(plot_data["task_0_ciplot"], matplotlib.figure.Figure)
 
 
 def test_classification_metrics():
+    """
+    Validate calculation of classification metrics (Accuracy, Precision, Recall, F1, AUC).
+
+    This ensures that for binary classification tasks, the metrics are computed correctly based on
+    predicted probabilities and ground truth labels.
+    """
     y_true = [0, 1, 1, 0]
 
     # We pass probabilities of the class, not the class itself
@@ -113,25 +138,38 @@ def test_classification_metrics():
     cm = ClassificationMetrics()
     metrics = cm.evaluate(y_true, y_pred)
 
-    assert metrics["accuracy"]["value"] == 0.75
+    assert metrics["accuracy"]["value"] == pytest.approx(0.75)
     assert metrics["precision"]["value"] == pytest.approx(0.667, abs=0.001)
-    assert metrics["recall"]["value"] == 1.0
-    assert metrics["f1"]["value"] == 0.8
-    assert metrics["roc_auc"]["value"] == 0.75
+    assert metrics["recall"]["value"] == pytest.approx(1.0)
+    assert metrics["f1"]["value"] == pytest.approx(0.8)
+    assert metrics["roc_auc"]["value"] == pytest.approx(0.75)
     assert metrics["pr_auc"]["value"] == pytest.approx(0.833, abs=0.001)
 
 
 def test_classification_plots():
+    """
+    Verify that classification plotting functions (ROC, PR curves) return valid figure objects.
+    """
     y_true = [0, 1, 1, 0]
     y_pred = [[1, 0], [0, 1], [0, 1], [0, 1]]
 
     cp = ClassificationPlots()
     cp.evaluate(y_true, y_pred)
 
-    assert True
+    assert isinstance(cp.plot_data, dict)
+    assert "roc_curve" in cp.plot_data
+    assert "pr_curve" in cp.plot_data
+    assert isinstance(cp.plot_data["roc_curve"], matplotlib.figure.Figure)
+    assert isinstance(cp.plot_data["pr_curve"], matplotlib.figure.Figure)
 
 
 def test_posthoc_eval_metrics():
+    """
+    Test post-hoc binary metrics utility functions.
+
+    Verifies that we can calculate precision and recall at a specific cutoff threshold from
+    regression-like outputs (or probabilities).
+    """
     y_true = [3, -0.5, 2, 7]
     y_pred = [2.5, 0.0, 2, 8]
     cutoff = 4.0
diff --git a/openadmet/models/tests/unit/features/test_features.py b/openadmet/models/tests/unit/features/test_features.py
index cb9e1a37..ea13a1be 100644
--- a/openadmet/models/tests/unit/features/test_features.py
+++ b/openadmet/models/tests/unit/features/test_features.py
@@ -10,17 +10,25 @@
 
 @pytest.fixture()
 def smiles():
+    """Provide a list of valid SMILES strings for testing featurization."""
     return ["CCO", "CCN", "CCO"]
 
 
 @pytest.fixture()
 def one_invalid_smi():
+    """Provide a list of SMILES strings containing one invalid entry to test error handling."""
     return ["CCO", "CCN", "invalid", "CCO"]
 
 
 @pytest.mark.parametrize("dtype", (np.float32, np.float64))
 @pytest.mark.parametrize("descr_type", ["mordred", "desc2d"])
 def test_descriptor_featurizer(descr_type, dtype):
+    """
+    Validate DescriptorFeaturizer for different descriptor types and floating point precisions.
+
+    This ensures that physical-chemical descriptors (like Mordred or RDKit 2D) are correctly generated
+    and returned with the requested data type, which is important for downstream model compatibility.
+    """
     featurizer = DescriptorFeaturizer(descr_type=descr_type, dtype=dtype)
     X, idx = featurizer.featurize(["CCO", "CCN", "CCO"])
     assert X.dtype == dtype
@@ -28,6 +36,12 @@ def test_descriptor_featurizer(descr_type, dtype):
 
 
 def test_descriptor_one_invalid(one_invalid_smi):
+    """
+    Ensure DescriptorFeaturizer robustly handles invalid SMILES strings.
+
+    The featurizer should skip invalid molecules and return indices corresponding only to the valid ones.
+    This prevents the entire pipeline from crashing due to a single bad input.
+    """
     featurizer = DescriptorFeaturizer(descr_type="mordred")
     X, idx = featurizer.featurize(one_invalid_smi)
     assert X.shape == (3, 1613)
@@ -38,6 +52,12 @@ def test_descriptor_one_invalid(one_invalid_smi):
 @pytest.mark.parametrize("dtype", (np.float32, np.float64))
 @pytest.mark.parametrize("fp_type", ("ecfp", "fcfp"))
 def test_fingerprint_featurizer(smiles, fp_type, dtype):
+    """
+    Validate FingerprintFeaturizer for different fingerprint types (ECFP, FCFP) and precisions.
+
+    This verifies that structural fingerprints are correctly generated with the expected vector size (2000)
+    and data type.
+    """
     featurizer = FingerprintFeaturizer(fp_type=fp_type, dtype=dtype)
     X, idx = featurizer.featurize(smiles)
     assert X.shape == (3, 2000)
@@ -46,6 +66,11 @@ def test_fingerprint_featurizer(smiles, fp_type, dtype):
 
 
 def test_fingerprint_one_invalid(one_invalid_smi):
+    """
+    Ensure FingerprintFeaturizer robustly handles invalid SMILES strings.
+
+    Similar to descriptors, it should filter out invalid entries and return correct indices for valid ones.
+    """
     featurizer = FingerprintFeaturizer(fp_type="ecfp")
     X, idx = featurizer.featurize(one_invalid_smi)
     assert X.shape == (3, 2000)
@@ -54,6 +79,12 @@ def test_fingerprint_one_invalid(one_invalid_smi):
 
 
 def test_feature_concatenator(smiles):
+    """
+    Validate that FeatureConcatenator correctly combines multiple feature sets (descriptors + fingerprints).
+
+    This ensures that different feature representations can be stacked horizontally for the same molecules,
+    providing a richer feature set for training.
+    """
     desc_featurizer = DescriptorFeaturizer(descr_type="mordred")
     fp_featurizer = FingerprintFeaturizer(fp_type="ecfp")
     concat = FeatureConcatenator(featurizers=[desc_featurizer, fp_featurizer])
@@ -62,14 +93,48 @@ def test_feature_concatenator(smiles):
     assert_array_equal(idx, np.arange(3))
 
 
-def test_feature_concatenator_failed_diff_positions(one_invalid_smi):
+def test_feature_concatenator_drops_intersection(mocker):
+    """
+    Verify that FeatureConcatenator only keeps molecules valid across ALL featurizers.
+
+    If one featurizer fails for molecule A and another fails for molecule B, the concatenator
+    must drop both A and B to maintain feature alignment.
+
+    We mock the underlying featurizers to control which indices fail, avoiding the need for
+    complex real-world molecules that fail specific featurizers. This isolates the intersection logic.
+    """
+    # Arrange
     desc_featurizer = DescriptorFeaturizer(descr_type="mordred")
     fp_featurizer = FingerprintFeaturizer(fp_type="ecfp")
     concat = FeatureConcatenator(featurizers=[desc_featurizer, fp_featurizer])
-    X, idx = concat.featurize(one_invalid_smi)
-    assert X.shape == (3, 3613)
-    # index 2 is invalid, so the shape should be 3
-    assert_array_equal(idx, np.asarray([0, 1, 3]))
+
+    # Mock descriptor featurizer to return 3 valid outputs (fails on index 1)
+    # Indices: 0, 2, 3 (skips 1)
+    desc_features = np.zeros((3, 1613))
+    desc_indices = np.array([0, 2, 3])
+    mocker.patch.object(
+        DescriptorFeaturizer, "featurize", return_value=(desc_features, desc_indices)
+    )
+
+    # Mock fingerprint featurizer to return 3 valid outputs (fails on index 2)
+    # Indices: 0, 1, 3 (skips 2)
+    # Note: ECFP size is 2000 in this codebase
+    fp_features = np.zeros((3, 2000))
+    fp_indices = np.array([0, 1, 3])
+    mocker.patch.object(
+        FingerprintFeaturizer, "featurize", return_value=(fp_features, fp_indices)
+    )
+
+    smiles = ["SMI0", "SMI1", "SMI2", "SMI3"]
+
+    # Act
+    X, idx = concat.featurize(smiles)
+
+    # Assert
+    # Intersection of [0, 2, 3] and [0, 1, 3] is [0, 3]
+    # Expected shape: (2, 1613 + 2000) = (2, 3613)
+    assert X.shape == (2, 3613)
+    assert_array_equal(idx, np.array([0, 3]))
 
 
 def test_feature_concatenator_order_independence(smiles):
@@ -90,6 +155,12 @@ def test_feature_concatenator_order_independence(smiles):
 
 
 def test_pairwise_featurizer(smiles):
+    """
+    Validate PairwiseFeaturizer in 'full' mode (all-pairs).
+
+    This tests that features are generated for every pair of molecules and that target values
+    (differences) are correctly computed.
+    """
     featurizer = PairwiseFeaturizer(
         featurizer={"FingerprintFeaturizer": {"fp_type": "ecfp", "dtype": np.float32}},
         how_to_pair="full",
diff --git a/openadmet/models/tests/unit/features/test_nepare.py b/openadmet/models/tests/unit/features/test_nepare.py
index 61cfa787..2e926b0b 100644
--- a/openadmet/models/tests/unit/features/test_nepare.py
+++ b/openadmet/models/tests/unit/features/test_nepare.py
@@ -9,6 +9,12 @@
 
 
 def test_pairwise_make_new():
+    """
+    Verify that PairwiseFeaturizer can create a new independent instance via make_new().
+
+    This is important for factory-like creation patterns in the registry or during cross-validation
+    where fresh featurizers are needed.
+    """
     featurizer = PairwiseFeaturizer(
         how_to_pair="full", featurizer=FingerprintFeaturizer(fp_type="ecfp:4")
     )
diff --git a/openadmet/models/tests/unit/inference/test_inference.py b/openadmet/models/tests/unit/inference/test_inference.py
index df341b21..7a4c369c 100644
--- a/openadmet/models/tests/unit/inference/test_inference.py
+++ b/openadmet/models/tests/unit/inference/test_inference.py
@@ -1,50 +1,204 @@
+"""Tests for the inference orchestration pipeline using real, lightweight components."""
+
 from pathlib import Path
+
+import numpy as np
 import pandas as pd
-import os
 import pytest
 
-from openadmet.models.inference.inference import predict
-from openadmet.models.tests.unit.datafiles import (
-    pred_test_data_csv,
-    anvil_lgbm_trained_model_dir,
-    anvil_chemprop_trained_model_dir,
-)
+from openadmet.models.active_learning.committee import CommitteeRegressor
+from openadmet.models.anvil.specification import DataSpec, Metadata
+from openadmet.models.architecture.dummy import DummyRegressorModel
+from openadmet.models.features.molfeat_fingerprint import FingerprintFeaturizer
+from openadmet.models.inference import inference as inference_module
 
 
 @pytest.fixture
-def anvil_lgbm():
-    return anvil_lgbm_trained_model_dir
+def input_df():
+    """Provide a simple DataFrame with SMILES for testing inference inputs."""
+    return pd.DataFrame({"MY_SMILES": ["CCO", "CCN"]})
 
 
-@pytest.fixture
-def anvil_chemprop():
-    return anvil_chemprop_trained_model_dir
-
-
-@pytest.mark.skipif(
-    os.getenv("RUNNER_OS") == "macOS", reason="MacOS runner not enough memory"
-)
-@pytest.mark.parametrize("model_dir", ["anvil_lgbm", "anvil_chemprop"])
-def test_predict(model_dir, request):
-    # Use the fixture to get the model directory
-    model_dir = request.getfixturevalue(model_dir)
-    # Test the predict function with a sample input
-    input_path = pred_test_data_csv
-    input_col = "MY_SMILES"
-    model_dir = [model_dir]
-    write_csv = False
-    output_path = None
-    debug = False
-
-    result = predict(
-        input_path,
-        input_col,
-        model_dir,
-        write_csv,
-        output_path,
-        debug=False,
+@pytest.fixture(scope="module")
+def real_featurizer():
+    """Return a real FingerprintFeaturizer using ECFP4 fingerprints."""
+    return FingerprintFeaturizer(fp_type="ecfp:4")
+
+
+@pytest.fixture(scope="module")
+def real_data_spec():
+    """Return a real DataSpec with a single regression target."""
+    return DataSpec(type="csv", target_cols=["task_0"], input_col="MY_SMILES")
+
+
+@pytest.fixture(scope="module")
+def real_metadata_single():
+    """Return real Metadata with tag UNIT for single-model tests."""
+    return Metadata(
+        version="v1",
+        driver="sklearn",
+        name="unit-test",
+        build_number=0,
+        description="Unit test model",
+        tag="UNIT",
+        authors="Test Author",
+        email="test@example.com",
+        biotargets=["test"],
+        tags=["test"],
+    )
+
+
+@pytest.fixture(scope="module")
+def real_metadata_ensemble():
+    """Return real Metadata with tag ENS for ensemble tests."""
+    return Metadata(
+        version="v1",
+        driver="sklearn",
+        name="ens-test",
+        build_number=0,
+        description="Ensemble test model",
+        tag="ENS",
+        authors="Test Author",
+        email="test@example.com",
+        biotargets=["test"],
+        tags=["test"],
+    )
+
+
+@pytest.fixture(scope="module")
+def trained_single_model():
+    """Return a DummyRegressorModel trained to always predict 1.0 regardless of input features."""
+    X_train = np.zeros((3, 2))
+    y_train = np.array([[1.0], [1.0], [1.0]])
+    model = DummyRegressorModel()
+    model.train(X_train, y_train)
+    return model
+
+
+@pytest.fixture(scope="module")
+def trained_ensemble():
+    """Return a CommitteeRegressor whose two members predict 1.0 and 3.0 respectively.
+
+    The ensemble mean is 2.0 and the standard deviation is 1.0 for any input,
+    making the UCB score with beta=2.0 equal to 4.0.
+    """
+    X_train = np.zeros((3, 2))
+
+    model1 = DummyRegressorModel()
+    model1.train(X_train, np.array([[1.0], [1.0], [1.0]]))
+
+    model2 = DummyRegressorModel()
+    model2.train(X_train, np.array([[3.0], [3.0], [3.0]]))
+
+    return CommitteeRegressor.from_models([model1, model2])
+
+
+def test_predict_with_real_single_model(
+    mocker,
+    input_df,
+    real_featurizer,
+    real_metadata_single,
+    real_data_spec,
+    trained_single_model,
+):
+    """Test the inference pipeline with a real DummyRegressorModel.
+
+    SMILES strings flow through a real FingerprintFeaturizer and a real DummyRegressorModel
+    to verify internal data plumbing. Because DummyRegressorModel always predicts the
+    training mean, PRED values must equal 1.0 for both inputs. The STD column must be NaN
+    because non-ensemble models produce no uncertainty estimate.
+    """
+    mock_loader = mocker.patch.object(
+        inference_module,
+        "load_anvil_model_and_metadata",
+        return_value=(
+            trained_single_model,
+            real_featurizer,
+            real_metadata_single,
+            real_data_spec,
+        ),
+    )
+
+    result = inference_module.predict(
+        input_path=input_df,
+        input_col="MY_SMILES",
+        model_dir=["unused-model-dir"],
         accelerator="cpu",
+        log=False,
     )
 
-    # Check if the result is a DataFrame
     assert isinstance(result, pd.DataFrame)
+    assert "OADMET_PRED_UNIT_task_0" in result.columns
+    assert "OADMET_STD_UNIT_task_0" in result.columns
+    assert result["OADMET_PRED_UNIT_task_0"].tolist() == pytest.approx([1.0, 1.0])
+    assert result["OADMET_STD_UNIT_task_0"].isna().all()
+    mock_loader.assert_called_once_with(Path("unused-model-dir"))
+
+
+def test_predict_with_real_ensemble_and_acquisition(
+    mocker,
+    input_df,
+    real_featurizer,
+    real_metadata_ensemble,
+    real_data_spec,
+    trained_ensemble,
+):
+    """Test the inference pipeline with a real CommitteeRegressor and UCB acquisition.
+
+    Two DummyRegressorModel members predict 1.0 and 3.0 respectively, yielding a committee
+    mean of 2.0 and standard deviation of 1.0 for any input. With beta=2.0,
+    UCB = mean + beta * std = 2.0 + 2.0 * 1.0 = 4.0.
+    """
+    mock_loader = mocker.patch.object(
+        inference_module,
+        "load_anvil_model_and_metadata",
+        return_value=(
+            trained_ensemble,
+            real_featurizer,
+            real_metadata_ensemble,
+            real_data_spec,
+        ),
+    )
+
+    result = inference_module.predict(
+        input_path=input_df,
+        input_col="MY_SMILES",
+        model_dir=["unused-model-dir"],
+        accelerator="cpu",
+        log=False,
+        aq_fxn_args={"ucb": {"beta": 2.0}},
+    )
+
+    assert result["OADMET_PRED_ENS_task_0"].tolist() == pytest.approx([2.0, 2.0])
+    assert result["OADMET_STD_ENS_task_0"].tolist() == pytest.approx([1.0, 1.0])
+    assert result["OADMET_UCB_ENS_task_0"].tolist() == pytest.approx([4.0, 4.0])
+    mock_loader.assert_called_once_with(Path("unused-model-dir"))
+
+
+def test_predict_raises_when_input_column_missing(input_df):
+    """Ensure that the inference function validates the existence of the specified SMILES column."""
+    with pytest.raises(ValueError, match="Column OTHER not found"):
+        inference_module.predict(
+            input_path=input_df,
+            input_col="OTHER",
+            model_dir=["unused-model-dir"],
+            log=False,
+        )
+
+
+def test_load_anvil_model_and_metadata_missing_recipe_components(tmp_path):
+    """Ensure correct error is raised when the model directory structure is invalid."""
+    with pytest.raises(FileNotFoundError, match="does not contain recipe components"):
+        inference_module.load_anvil_model_and_metadata(tmp_path)
+
+
+def test_load_anvil_model_and_metadata_missing_procedure_yaml(tmp_path):
+    """Ensure correct error is raised when critical YAML metadata files are missing."""
+    model_dir = tmp_path / "model"
+    recipe_components = model_dir / "recipe_components"
+    recipe_components.mkdir(parents=True)
+    (recipe_components / "metadata.yaml").write_text("metadata")
+    (recipe_components / "data.yaml").write_text("data")
+
+    with pytest.raises(FileNotFoundError, match="does not contain procedure.yaml"):
+        inference_module.load_anvil_model_and_metadata(model_dir)
diff --git a/openadmet/models/tests/unit/models/test_base.py b/openadmet/models/tests/unit/models/test_base.py
index ed466fef..05801e3b 100644
--- a/openadmet/models/tests/unit/models/test_base.py
+++ b/openadmet/models/tests/unit/models/test_base.py
@@ -13,6 +13,13 @@
 
 @pytest.mark.parametrize("mclass", models.classes())
 def test_save_load_pickleable(mclass, tmp_path):
+    """
+    Verify save/load mechanics for all registered pickleable models (e.g., sklearn-based).
+
+    This iterates through the model registry and tests that any model inheriting from PickleableModelBase
+    can be instantiated, built, saved, and loaded without error. This is a crucial contract test
+    ensuring all registered models comply with the persistence interface.
+    """
     if not issubclass(mclass, PickleableModelBase):
         pytest.skip(f"Skipping non-pickleable model {mclass.__name__}")
     model = mclass()
@@ -25,6 +32,12 @@ def test_save_load_pickleable(mclass, tmp_path):
 
 @pytest.mark.parametrize("mclass", models.classes())
 def test_save_load_torch_model(mclass, tmp_path):
+    """
+    Verify save/load mechanics for all registered PyTorch Lightning models.
+
+    Similar to the pickleable test, this ensures that deep learning models (inheriting from LightningModelBase)
+    implement the correct save/load logic for their weights and configurations.
+    """
     if not issubclass(mclass, LightningModelBase):
         pytest.skip(f"Skipping non-torch model {mclass.__name__}")
     model = mclass()
diff --git a/openadmet/models/tests/unit/models/test_lgbm.py b/openadmet/models/tests/unit/models/test_lgbm.py
index d6a76d71..7634f755 100644
--- a/openadmet/models/tests/unit/models/test_lgbm.py
+++ b/openadmet/models/tests/unit/models/test_lgbm.py
@@ -6,17 +6,24 @@
 
 @pytest.fixture
 def X_y():
+    """Provide simple synthetic data for basic model training tests."""
     X = [[1, 2, 3], [4, 5, 6]]
     y = [1, 2]
     return X, y
 
 
 def test_lgbm():
+    """Verify that LGBMRegressorModel initializes with the correct type identifier."""
     lgbm_model = LGBMRegressorModel()
     assert lgbm_model.type == "LGBMRegressorModel"
 
 
 def test_lgbm_from_params():
+    """
+    Validate that hyperparameters passed to the constructor are correctly applied to the underlying estimator.
+
+    This ensures that user configurations (like n_estimators) are respected by the model.
+    """
     lgbm_model = LGBMRegressorModel(n_estimators=100, boosting_type="rf")
     lgbm_model.build()
     assert lgbm_model.type == "LGBMRegressorModel"
@@ -25,6 +32,11 @@ def test_lgbm_from_params():
 
 
 def test_lgbm_train_predict(X_y):
+    """
+    Verify the train and predict lifecycle of LGBMRegressorModel.
+
+    This checks that the model can fit to data and generate predictions with the expected shape and values.
+    """
     lgbm_model = LGBMRegressorModel(n_estimators=100)
     lgbm_model.build()
     X, y = X_y
@@ -41,6 +53,11 @@ def test_lgbm_train_predict(X_y):
 
 
 def test_lgbm_save_load(tmp_path, X_y):
+    """
+    Validate persistence of the LGBM model to disk.
+
+    Ensures that saving and reloading the model preserves its learned state and prediction behavior.
+    """
     lgbm_model = LGBMRegressorModel(n_estimators=100)
     lgbm_model.build()
     X, y = X_y
@@ -54,6 +71,12 @@ def test_lgbm_save_load(tmp_path, X_y):
 
 
 def test_serialization(tmp_path, X_y):
+    """
+    Validate JSON/pickle serialization workflow for LGBM models.
+
+    This tests the separate storage of hyperparameters (JSON) and model weights (pickle),
+    which is used for model registry and versioning.
+    """
     lgbm_model = LGBMRegressorModel(n_estimators=100)
     lgbm_model.build()
     X, y = X_y
diff --git a/openadmet/models/tests/unit/models/test_nepare.py b/openadmet/models/tests/unit/models/test_nepare.py
index 17a00f5f..113653e7 100644
--- a/openadmet/models/tests/unit/models/test_nepare.py
+++ b/openadmet/models/tests/unit/models/test_nepare.py
@@ -6,11 +6,13 @@
 
 @pytest.fixture
 def X_y():
+    """Provide synthetic data pairs for testing pairwise regression."""
     X = [[1, 2, 3], [4, 5, 6]]
     y = [1, 2]
     return X, y
 
 
 def test_nepare():
+    """Verify initialization of the NeuralPairwiseRegressorModel."""
     nepare_model = NeuralPairwiseRegressorModel()
     assert nepare_model.type == "NeuralPairwiseRegressorModel"
diff --git a/openadmet/models/tests/unit/split/test_splitters.py b/openadmet/models/tests/unit/split/test_splitters.py
index 4c375786..f0602492 100644
--- a/openadmet/models/tests/unit/split/test_splitters.py
+++ b/openadmet/models/tests/unit/split/test_splitters.py
@@ -5,10 +5,10 @@
 from openadmet.models.split.sklearn import ShuffleSplitter
 from openadmet.models.split.cluster import ClusterSplitter
 from openadmet.models.split.split_base import splitters
-from openadmet.models.tests.unit.datafiles import CYP3A4_chembl_pchembl
 
 
 def test_in_splitters():
+    """Verify that concrete splitter implementations are correctly registered in the splitters registry."""
     assert "ShuffleSplitter" in splitters
     assert "ClusterSplitter" in splitters
 
@@ -34,10 +34,15 @@ def test_in_splitters():
 def test_simple_split(
     train_size, val_size, test_size, expected_train, expected_val, expected_test, error
 ):
-    # Error expected
+    """
+    Validate that ShuffleSplitter correctly partitions data according to specified ratios.
+
+    This test verifies both successful splits and error handling for invalid configurations.
+    Correct splitting ensures that training, validation, and test sets are of the expected size
+    and are mutually exclusive, which is critical for valid model evaluation.
+    """
     if error is True:
         with pytest.raises(ValueError):
-            # Initialize splitter
             splitter = ShuffleSplitter(
                 train_size=train_size,
                 val_size=val_size,
@@ -46,142 +51,239 @@ def test_simple_split(
             )
         return
 
-    # Initialize splitter
     splitter = ShuffleSplitter(
         train_size=train_size, val_size=val_size, test_size=test_size, random_state=42
     )
 
-    # Generate random data
+    # Generate synthetic random data for testing split logic
     X = np.random.rand(100, 10)
     y = np.random.rand(100)
 
-    # Error is expected
-    if error is True:
-        with pytest.raises(ValueError):
-            splitter.split(X, y)
-        return
-
-    # Perform the split
     X_train, X_val, X_test, y_train, y_val, y_test, groups = splitter.split(X, y)
 
-    # Check train
     assert X_train.shape[0] == expected_train
     assert y_train.shape[0] == expected_train
 
-    # Validation set requested
     if val_size > 0:
         assert X_val.shape[0] == expected_val
         assert y_val.shape[0] == expected_val
+        # Assert X_train and X_val are mutually exclusive
+        train_set = set(map(tuple, X_train))
+        val_set = set(map(tuple, X_val))
+        assert len(train_set.intersection(val_set)) == 0
 
-    # Validation set not requested
     else:
         assert X_val is None
         assert y_val is None
 
-    # Test set requested
     if test_size > 0:
         assert X_test.shape[0] == expected_test
         assert y_test.shape[0] == expected_test
+        # Assert X_train and X_test are mutually exclusive
+        train_set = set(map(tuple, X_train))
+        test_set = set(map(tuple, X_test))
+        assert len(train_set.intersection(test_set)) == 0
+
+        if val_size > 0:
+            # Assert X_val and X_test are mutually exclusive
+            val_set = set(map(tuple, X_val))
+            assert len(val_set.intersection(test_set)) == 0
 
-    # Test set not requested
     else:
         assert X_test is None
         assert y_test is None
 
 
+@pytest.fixture
+def synthetic_cluster_data():
+    """
+    Provide a synthetic dataset with structural diversity for testing cluster splitting.
+
+    This fixture returns a set of SMILES strings representing different chemical scaffolds
+    (benzenes, pyridines, cyclohexanes, furans, thiophenes) and corresponding target values.
+    Using diverse scaffolds ensures that clustering algorithms (like Butina or Bemis-Murcko)
+    can meaningfully group molecules, allowing verification that splits respect cluster boundaries.
+    """
+    base_smiles = [
+        "Cc1ccccc1",
+        "CCc1ccccc1",
+        "Oc1ccccc1",
+        "Nc1ccccc1",
+        "Clc1ccccc1",
+        "Fc1ccccc1",
+        "C(=O)Oc1ccccc1",
+        "C(=O)Cc1ccccc1",
+        "c1ccccc1C#N",
+        "COc1ccccc1",
+        "Cc1ccncc1",
+        "CCc1ccncc1",
+        "Oc1ccncc1",
+        "Nc1ccncc1",
+        "Clc1ccncc1",
+        "Fc1ccncc1",
+        "C(=O)Oc1ccncc1",
+        "C(=O)Cc1ccncc1",
+        "c1ccncc1C#N",
+        "COc1ccncc1",
+        "CC1CCCCC1",
+        "CCC1CCCCC1",
+        "OC1CCCCC1",
+        "NC1CCCCC1",
+        "ClC1CCCCC1",
+        "FC1CCCCC1",
+        "C(=O)OC1CCCCC1",
+        "C(=O)CC1CCCCC1",
+        "C1CCCCC1C#N",
+        "COC1CCCCC1",
+        "Cc1ccoc1",
+        "CCc1ccoc1",
+        "Oc1ccoc1",
+        "Nc1ccoc1",
+        "Clc1ccoc1",
+        "Fc1ccoc1",
+        "C(=O)Oc1ccoc1",
+        "C(=O)Cc1ccoc1",
+        "c1ccoc1C#N",
+        "COc1ccoc1",
+        "Cc1ccsc1",
+        "CCc1ccsc1",
+        "Oc1ccsc1",
+        "Nc1ccsc1",
+        "Clc1ccsc1",
+        "Fc1ccsc1",
+        "C(=O)Oc1ccsc1",
+        "C(=O)Cc1ccsc1",
+        "c1ccsc1C#N",
+        "COc1ccsc1",
+        "Cc1ccc2ccccc2c1",
+        "CCc1ccc2ccccc2c1",
+        "Oc1ccc2ccccc2c1",
+        "Nc1ccc2ccccc2c1",
+        "Clc1ccc2ccccc2c1",
+        "Fc1ccc2ccccc2c1",
+        "C(=O)Oc1ccc2ccccc2c1",
+        "C(=O)Cc1ccc2ccccc2c1",
+        "c1ccc2ccccc2c1C#N",
+        "COc1ccc2ccccc2c1",
+        "Cc1ccc2[nH]ccc2c1",
+        "CCc1ccc2[nH]ccc2c1",
+        "Oc1ccc2[nH]ccc2c1",
+        "Nc1ccc2[nH]ccc2c1",
+        "Clc1ccc2[nH]ccc2c1",
+        "Fc1ccc2[nH]ccc2c1",
+        "C(=O)Oc1ccc2[nH]ccc2c1",
+        "C(=O)Cc1ccc2[nH]ccc2c1",
+        "c1ccc2[nH]ccc2c1C#N",
+        "COc1ccc2[nH]ccc2c1",
+        "Cc1ccc2ncccc2c1",
+        "CCc1ccc2ncccc2c1",
+        "Oc1ccc2ncccc2c1",
+        "Nc1ccc2ncccc2c1",
+        "Clc1ccc2ncccc2c1",
+        "Fc1ccc2ncccc2c1",
+        "C(=O)Oc1ccc2ncccc2c1",
+        "C(=O)Cc1ccc2ncccc2c1",
+        "c1ccc2ncccc2c1C#N",
+        "COc1ccc2ncccc2c1",
+        "CC1CCCC1",
+        "CCC1CCCC1",
+        "OC1CCCC1",
+        "NC1CCCC1",
+        "ClC1CCCC1",
+        "FC1CCCC1",
+        "C(=O)OC1CCCC1",
+        "C(=O)CC1CCCC1",
+        "C1CCCC1C#N",
+        "COC1CCCC1",
+        "CC1CCNCC1",
+        "CCC1CCNCC1",
+        "OC1CCNCC1",
+        "NC1CCNCC1",
+        "ClC1CCNCC1",
+        "FC1CCNCC1",
+        "C(=O)OC1CCNCC1",
+        "C(=O)CC1CCNCC1",
+        "C1CCNCC1C#N",
+        "COC1CCNCC1",
+    ]
+    smiles = pd.Series(base_smiles)
+    y = pd.Series(np.linspace(0.0, 1.0, len(smiles)))
+    return smiles, y
+
+
 @pytest.mark.parametrize(
-    "train_size, val_size, test_size, expected_train, expected_val, expected_test, error, method",
+    "method",
     [
-        # Test cases for kmeans
-        (0.8, 0.0, 0.2, 1600, 0, 400, False, "kmeans"),
-        (0.7, 0.3, 0.0, 1400, 600, 0, False, "kmeans"),
-        (0.7, 0.1, 0.2, 1400, 200, 400, False, "kmeans"),
-        (0.6, 0.2, 0.2, 1200, 400, 400, False, "kmeans"),
-        # Test cases for butina
-        (0.8, 0.0, 0.2, 1600, 0, 400, False, "butina"),
-        (0.7, 0.3, 0.0, 1400, 600, 0, False, "butina"),
-        (0.7, 0.1, 0.2, 1400, 200, 400, False, "butina"),
-        (0.6, 0.2, 0.2, 1200, 400, 400, False, "butina"),
-        # Test cases for bemis-murcko
-        (0.8, 0.0, 0.2, 1600, 0, 400, False, "bemis-murcko"),
-        (0.7, 0.3, 0.0, 1400, 600, 0, False, "bemis-murcko"),
-        (0.7, 0.1, 0.2, 1400, 200, 400, False, "bemis-murcko"),
-        (0.6, 0.2, 0.2, 1200, 400, 400, False, "bemis-murcko"),
-        # Error cases
-        (1.0, 0.0, 0.0, 200, 0, 0, True, "kmeans"),
-        (0.5, 0.5, 0.5, -1, -1, -1, True, "kmeans"),
+        "kmeans",
+        "butina",
+        "bemis-murcko",
     ],
 )
-def test_cluster_split(
-    train_size,
-    val_size,
-    test_size,
-    expected_train,
-    expected_val,
-    expected_test,
-    error,
-    method,
-):
-    df = pd.read_csv(CYP3A4_chembl_pchembl)
-    X = df["CANONICAL_SMILES"][:2000]
-    y = df["pChEMBL mean"][:2000]
-
-    # Error expected
-    if error is True:
-        with pytest.raises(ValueError):
-            # Initialize splitter
-            splitter = ClusterSplitter(
-                train_size=train_size,
-                val_size=val_size,
-                test_size=test_size,
-                random_state=42,
-                method=method,
-                k_clusters=100,
-            )
-        return
+def test_cluster_split_synthetic_data(method, synthetic_cluster_data):
+    """
+    Validate ClusterSplitter functionality with different clustering methods.
 
-    # Initialize splitter
+    This test ensures that molecular data is split such that training, validation, and test sets
+    contain mutually exclusive molecules (no data leakage). It verifies split sizes are approximately
+    correct and that structural separation is maintained.
+    """
+    X, y = synthetic_cluster_data
     splitter = ClusterSplitter(
-        train_size=train_size,
-        val_size=val_size,
-        test_size=test_size,
+        train_size=0.7,
+        val_size=0.1,
+        test_size=0.2,
         random_state=42,
         method=method,
-        k_clusters=100,
+        k_clusters=10,
+    )
+    X_train, X_val, X_test, y_train, y_val, y_test, groups = splitter.split(
+        X, y, num_iters=50
     )
 
-    # Perform the split
-    X_train, X_val, X_test, y_train, y_val, y_test, groups = splitter.split(X, y)
-
-    # Check type preservation
     for obj in [X_train, X_val, X_test]:
         if obj is not None:
-            assert isinstance(obj, pd.Series), "X split must preserve pandas Series"
+            assert isinstance(obj, pd.Series)
 
     for obj in [y_train, y_val, y_test]:
         if obj is not None:
-            assert isinstance(obj, pd.Series), "y split must preserve pandas Series"
+            assert isinstance(obj, pd.Series)
 
-    # Check train
-    assert abs(X_train.shape[0] - expected_train) <= 10
-    assert abs(y_train.shape[0] - expected_train) <= 10
+    total = len(X)
+    assert abs(len(X_train) - int(0.7 * total)) <= 5
+    assert abs(len(X_val) - int(0.1 * total)) <= 5
+    assert abs(len(X_test) - int(0.2 * total)) <= 5
+    assert len(groups) == total
 
-    # Validation set requested
-    if val_size > 0:
-        assert abs(X_val.shape[0] - expected_val) <= 10
-        assert abs(y_val.shape[0] - expected_val) <= 10
+    # Check for data leakage
+    # Assert X_train, X_val, and X_test are mutually exclusive by index
+    train_idx = set(X_train.index)
+    val_idx = set(X_val.index)
+    test_idx = set(X_test.index)
 
-    # Validation set not requested
-    else:
-        assert X_val is None
-        assert y_val is None
+    assert len(train_idx.intersection(val_idx)) == 0
+    assert len(train_idx.intersection(test_idx)) == 0
+    assert len(val_idx.intersection(test_idx)) == 0
 
-    # Test set requested
-    if test_size > 0:
-        assert abs(X_test.shape[0] - expected_test) <= 10
-        assert abs(y_test.shape[0] - expected_test) <= 10
 
-    # Test set not requested
-    else:
-        assert X_test is None
-        assert y_test is None
+def test_cluster_split_invalid_size_configuration():
+    """Ensure ClusterSplitter raises ValueError for invalid split size configurations (e.g., sum != 1.0)."""
+    with pytest.raises(ValueError):
+        ClusterSplitter(
+            train_size=1.0,
+            val_size=0.0,
+            test_size=0.0,
+            random_state=42,
+            method="kmeans",
+        )
+
+
+def test_cluster_split_invalid_method():
+    """Ensure ClusterSplitter raises ValueError when initialized with an unknown clustering method."""
+    with pytest.raises(ValueError):
+        ClusterSplitter(
+            train_size=0.7,
+            val_size=0.1,
+            test_size=0.2,
+            random_state=42,
+            method="not-a-method",
+        )
diff --git a/openadmet/models/tests/unit/test_utils.py b/openadmet/models/tests/unit/test_utils.py
index fbd17e31..9e65ee49 100644
--- a/openadmet/models/tests/unit/test_utils.py
+++ b/openadmet/models/tests/unit/test_utils.py
@@ -2,6 +2,12 @@
 
 
 def click_success(result):
+    """
+    Helper function to verify that a Click command executed successfully (exit code 0).
+
+    If the command failed, this function prints the output and traceback to aid in debugging
+    before returning False.
+    """
     if result.exit_code != 0:  # -no-cov-  (only occurs on test error)
         print(result.output)
         traceback.print_tb(result.exc_info[2])

From a36d7d11c5c6b7a9fde40205cbee5121c7ea2c53 Mon Sep 17 00:00:00 2001
From: Sean Colby <seanmcolby@gmail.com>
Date: Sat, 16 May 2026 11:03:36 -0800
Subject: [PATCH 2/3] Add autospec to mocks and standardise test_base.py mock
 style

- Add autospec=True to inference_func and from_recipe patches in test_cli.py;
  convert mock_spec to create_autospec(AnvilSpecification, instance=True)
- Add autospec=True to both load_anvil_model_and_metadata patches in
  test_inference.py
- Add autospec=True to both featurize patches in test_features.py
- Replace monkeypatch.setattr + unittest.mock.Mock with mocker.patch /
  mocker.Mock in test_base.py; add docstring; remove stale import

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 openadmet/models/tests/unit/cli/test_cli.py       |  6 +++---
 .../models/tests/unit/features/test_features.py   |  4 ++--
 .../models/tests/unit/inference/test_inference.py |  2 ++
 openadmet/models/tests/unit/models/test_base.py   | 15 ++++++++++-----
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/openadmet/models/tests/unit/cli/test_cli.py b/openadmet/models/tests/unit/cli/test_cli.py
index 8e430025..cff115d6 100644
--- a/openadmet/models/tests/unit/cli/test_cli.py
+++ b/openadmet/models/tests/unit/cli/test_cli.py
@@ -43,7 +43,7 @@ def test_predict_cli_invokes_inference(tmp_path, runner, mocker):
     model_dir = tmp_path / "model_dir"
     model_dir.mkdir()
 
-    mock_inference = mocker.patch.object(predict_cli_module, "inference_func")
+    mock_inference = mocker.patch.object(predict_cli_module, "inference_func", autospec=True)
 
     result = runner.invoke(
         cli,
@@ -77,9 +77,9 @@ def test_anvil_cli_invokes_workflow(tmp_path, runner, mocker):
     We mock the `AnvilSpecification` and workflow execution to verify that the CLI correctly handles
     recipe paths and output directories without actually running a full ML training job.
     """
-    mock_spec = mocker.Mock()
+    mock_spec = mocker.create_autospec(anvil_cli_module.AnvilSpecification, instance=True)
     mock_from_recipe = mocker.patch.object(
-        anvil_cli_module.AnvilSpecification, "from_recipe", return_value=mock_spec
+        anvil_cli_module.AnvilSpecification, "from_recipe", autospec=True, return_value=mock_spec
     )
 
     result = runner.invoke(
diff --git a/openadmet/models/tests/unit/features/test_features.py b/openadmet/models/tests/unit/features/test_features.py
index ea13a1be..0a8b1446 100644
--- a/openadmet/models/tests/unit/features/test_features.py
+++ b/openadmet/models/tests/unit/features/test_features.py
@@ -113,7 +113,7 @@ def test_feature_concatenator_drops_intersection(mocker):
     desc_features = np.zeros((3, 1613))
     desc_indices = np.array([0, 2, 3])
     mocker.patch.object(
-        DescriptorFeaturizer, "featurize", return_value=(desc_features, desc_indices)
+        DescriptorFeaturizer, "featurize", autospec=True, return_value=(desc_features, desc_indices)
     )
 
     # Mock fingerprint featurizer to return 3 valid outputs (fails on index 2)
@@ -122,7 +122,7 @@ def test_feature_concatenator_drops_intersection(mocker):
     fp_features = np.zeros((3, 2000))
     fp_indices = np.array([0, 1, 3])
     mocker.patch.object(
-        FingerprintFeaturizer, "featurize", return_value=(fp_features, fp_indices)
+        FingerprintFeaturizer, "featurize", autospec=True, return_value=(fp_features, fp_indices)
     )
 
     smiles = ["SMI0", "SMI1", "SMI2", "SMI3"]
diff --git a/openadmet/models/tests/unit/inference/test_inference.py b/openadmet/models/tests/unit/inference/test_inference.py
index 7a4c369c..51173861 100644
--- a/openadmet/models/tests/unit/inference/test_inference.py
+++ b/openadmet/models/tests/unit/inference/test_inference.py
@@ -111,6 +111,7 @@ def test_predict_with_real_single_model(
     mock_loader = mocker.patch.object(
         inference_module,
         "load_anvil_model_and_metadata",
+        autospec=True,
         return_value=(
             trained_single_model,
             real_featurizer,
@@ -152,6 +153,7 @@ def test_predict_with_real_ensemble_and_acquisition(
     mock_loader = mocker.patch.object(
         inference_module,
         "load_anvil_model_and_metadata",
+        autospec=True,
         return_value=(
             trained_ensemble,
             real_featurizer,
diff --git a/openadmet/models/tests/unit/models/test_base.py b/openadmet/models/tests/unit/models/test_base.py
index 05801e3b..d7b3f766 100644
--- a/openadmet/models/tests/unit/models/test_base.py
+++ b/openadmet/models/tests/unit/models/test_base.py
@@ -1,5 +1,4 @@
 from types import SimpleNamespace
-from unittest.mock import Mock
 
 import pytest
 
@@ -48,12 +47,18 @@ def test_save_load_torch_model(mclass, tmp_path):
     loaded_model.load(tmp_path / "test_model.pth")
 
 
-def test_lightning_model_load_uses_weights_only(monkeypatch, tmp_path):
+def test_lightning_model_load_uses_weights_only(mocker, tmp_path):
+    """
+    Verify that LightningModelBase.load calls torch.load with weights_only=True.
+
+    We mock torch.load to avoid requiring a real .pth file and to capture the call arguments.
+    We mock estimator.load_state_dict because both calls are in a single expression in the
+    implementation, making it impossible to test one without the other.
+    """
     state_dict = {"layer.weight": "dummy"}
-    torch_load = Mock(return_value=state_dict)
-    monkeypatch.setattr(model_base.torch, "load", torch_load)
+    torch_load = mocker.patch.object(model_base.torch, "load", return_value=state_dict)
 
-    estimator = Mock()
+    estimator = mocker.Mock()
     model = SimpleNamespace(estimator=estimator)
     path = tmp_path / "test_model.pth"
 

From 5eed81a4eb7967c2128061d00d7d407bb29a4ce6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 16 May 2026 19:04:26 +0000
Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 openadmet/models/tests/unit/cli/test_cli.py         | 13 ++++++++++---
 .../models/tests/unit/features/test_features.py     | 10 ++++++++--
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/openadmet/models/tests/unit/cli/test_cli.py b/openadmet/models/tests/unit/cli/test_cli.py
index cff115d6..1160a44e 100644
--- a/openadmet/models/tests/unit/cli/test_cli.py
+++ b/openadmet/models/tests/unit/cli/test_cli.py
@@ -43,7 +43,9 @@ def test_predict_cli_invokes_inference(tmp_path, runner, mocker):
     model_dir = tmp_path / "model_dir"
     model_dir.mkdir()
 
-    mock_inference = mocker.patch.object(predict_cli_module, "inference_func", autospec=True)
+    mock_inference = mocker.patch.object(
+        predict_cli_module, "inference_func", autospec=True
+    )
 
     result = runner.invoke(
         cli,
@@ -77,9 +79,14 @@ def test_anvil_cli_invokes_workflow(tmp_path, runner, mocker):
     We mock the `AnvilSpecification` and workflow execution to verify that the CLI correctly handles
     recipe paths and output directories without actually running a full ML training job.
     """
-    mock_spec = mocker.create_autospec(anvil_cli_module.AnvilSpecification, instance=True)
+    mock_spec = mocker.create_autospec(
+        anvil_cli_module.AnvilSpecification, instance=True
+    )
     mock_from_recipe = mocker.patch.object(
-        anvil_cli_module.AnvilSpecification, "from_recipe", autospec=True, return_value=mock_spec
+        anvil_cli_module.AnvilSpecification,
+        "from_recipe",
+        autospec=True,
+        return_value=mock_spec,
     )
 
     result = runner.invoke(
diff --git a/openadmet/models/tests/unit/features/test_features.py b/openadmet/models/tests/unit/features/test_features.py
index 0a8b1446..1fa7b224 100644
--- a/openadmet/models/tests/unit/features/test_features.py
+++ b/openadmet/models/tests/unit/features/test_features.py
@@ -113,7 +113,10 @@ def test_feature_concatenator_drops_intersection(mocker):
     desc_features = np.zeros((3, 1613))
     desc_indices = np.array([0, 2, 3])
     mocker.patch.object(
-        DescriptorFeaturizer, "featurize", autospec=True, return_value=(desc_features, desc_indices)
+        DescriptorFeaturizer,
+        "featurize",
+        autospec=True,
+        return_value=(desc_features, desc_indices),
     )
 
     # Mock fingerprint featurizer to return 3 valid outputs (fails on index 2)
@@ -122,7 +125,10 @@ def test_feature_concatenator_drops_intersection(mocker):
     fp_features = np.zeros((3, 2000))
     fp_indices = np.array([0, 1, 3])
     mocker.patch.object(
-        FingerprintFeaturizer, "featurize", autospec=True, return_value=(fp_features, fp_indices)
+        FingerprintFeaturizer,
+        "featurize",
+        autospec=True,
+        return_value=(fp_features, fp_indices),
     )
 
     smiles = ["SMI0", "SMI1", "SMI2", "SMI3"]