From d8bbe7824a5c41eb0a02e6befefa4c7ee1afd119 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Thu, 14 May 2026 08:43:30 -0800 Subject: [PATCH 1/3] Strengthen unit test suite across eval, features, inference, models, split, and CLI Apply testing standards from remove-parent-spec branch across the broad unit test suite. Key changes: - Replace integration-style tests with real lightweight components - Remove MagicMock abuse on systems under test; keep mocks only for I/O - Add pytest.approx throughout to prevent cross-platform floating-point failures - Add mutual-exclusion assertions for all splitter tests - Strengthen test_inference.py: real FingerprintFeaturizer, DummyRegressorModel, CommitteeRegressor; assert derived math values (PRED=1.0, STD=1.0, UCB=4.0) - Add NumPy-style docstrings to all test functions - Update test_cli.py for spec.run() call path (PR 3 API) - test_eval.py: preserves RAE/log tests from main + docstrings/pytest.approx - test_base.py: preserves weights_only load test from main + docstrings - test_mtenn.py: not included (module deleted in main via #529) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- openadmet/models/tests/unit/cli/test_cli.py | 127 ++++++-- .../tests/unit/comparison/test_comparison.py | 53 +++- openadmet/models/tests/unit/data/test_data.py | 18 ++ openadmet/models/tests/unit/eval/test_eval.py | 60 +++- .../tests/unit/features/test_features.py | 81 ++++- .../models/tests/unit/features/test_nepare.py | 6 + .../tests/unit/inference/test_inference.py | 228 +++++++++++--- .../models/tests/unit/models/test_base.py | 13 + .../models/tests/unit/models/test_lgbm.py | 23 ++ .../models/tests/unit/models/test_nepare.py | 2 + .../models/tests/unit/split/test_splitters.py | 288 ++++++++++++------ openadmet/models/tests/unit/test_utils.py | 6 + 12 files changed, 719 insertions(+), 186 deletions(-) diff --git a/openadmet/models/tests/unit/cli/test_cli.py b/openadmet/models/tests/unit/cli/test_cli.py index 73da2cec..8e430025 100644 --- a/openadmet/models/tests/unit/cli/test_cli.py +++ b/openadmet/models/tests/unit/cli/test_cli.py @@ -1,59 +1,87 @@ -from openadmet.models.cli.cli import cli -from openadmet.models.tests.test_utils import click_success -from openadmet.models.tests.unit.datafiles import ( - anvil_lgbm_trained_model_dir, - pred_test_data_csv, - basic_anvil_yaml_cv, -) +from pathlib import Path + import pytest from click.testing import CliRunner +from openadmet.models.cli import anvil as anvil_cli_module +from openadmet.models.cli import predict as predict_cli_module +from openadmet.models.cli.cli import cli +from openadmet.models.tests.test_utils import click_success +from openadmet.models.tests.unit.datafiles import basic_anvil_yaml_cv + + +@pytest.fixture +def runner(): + """Provide a Click CliRunner for testing CLI commands in isolation.""" + return CliRunner() -def test_toplevel_runnable(): - """Test the top-level CLI command""" - runner = CliRunner() + +def test_toplevel_runnable(runner): + """Ensure the top-level 'openadmet' command runs and displays help without error.""" result = runner.invoke(cli, ["--help"]) assert click_success(result) @pytest.mark.parametrize( - "args", - [ - ["anvil", "--help"], - ["compare", "--help"], - ["predict", "--help"], - ], + "args", [["anvil", "--help"], ["compare", "--help"], ["predict", "--help"]] ) -def test_subcommand_runnable(args): - """Test the subcommands""" - runner = CliRunner() +def test_subcommand_runnable(runner, args): + """Verify that all major subcommands (anvil, compare, predict) are registered and runnable.""" result = runner.invoke(cli, args) assert click_success(result) -def test_predict_cli(tmp_path): - """Test the predict CLI command""" - runner = CliRunner() +def test_predict_cli_invokes_inference(tmp_path, runner, mocker): + """ + Validate that the 'predict' subcommand correctly parses arguments and calls the underlying inference function. + + We mock `inference_func` to avoid loading real models (which is heavy and requires trained artifacts). + This ensures that the CLI layer correctly passes paths, column names, and flags to the logic layer. + """ + input_csv = tmp_path / "input.csv" + input_csv.write_text("MY_SMILES\nCCO\n") + model_dir = tmp_path / "model_dir" + model_dir.mkdir() + + mock_inference = mocker.patch.object(predict_cli_module, "inference_func") + result = runner.invoke( cli, [ "predict", "--input-path", - pred_test_data_csv, + input_csv, "--input-col", "MY_SMILES", "--model-dir", - anvil_lgbm_trained_model_dir, + model_dir, "--output-csv", tmp_path / "predictions.csv", + "--accelerator", + "cpu", ], ) assert click_success(result) + mock_inference.assert_called_once() + called = mock_inference.call_args.kwargs + assert called["input_col"] == "MY_SMILES" + assert called["accelerator"] == "cpu" + assert called["write_csv"] is True + assert list(called["model_dir"]) == [model_dir] -def test_anvil_cli(tmp_path): - """Test the anvil CLI command""" - runner = CliRunner() +def test_anvil_cli_invokes_workflow(tmp_path, runner, mocker): + """ + Validate that the 'anvil' subcommand correctly initializes and runs a workflow from a recipe. + + We mock the `AnvilSpecification` and workflow execution to verify that the CLI correctly handles + recipe paths and output directories without actually running a full ML training job. + """ + mock_spec = mocker.Mock() + mock_from_recipe = mocker.patch.object( + anvil_cli_module.AnvilSpecification, "from_recipe", return_value=mock_spec + ) + result = runner.invoke( cli, [ @@ -66,3 +94,48 @@ def test_anvil_cli(tmp_path): ) assert click_success(result) + mock_from_recipe.assert_called_once_with(basic_anvil_yaml_cv) + mock_spec.run.assert_called_once() + called = mock_spec.run.call_args.kwargs + assert Path(called["output_dir"]) == tmp_path / "anvil_output" + assert called["debug"] is False + + +@pytest.mark.parametrize( + "aq_fxns,beta,best_y,xi,expected", + [ + (("ucb",), (2.0,), (), (), {"ucb": {"beta": 2.0}}), + ( + ("ei", "pi"), + (), + (1.0, 2.0), + (0.1, 0.2), + {"ei": {"xi": 0.1, "best_y": 1.0}, "pi": {"xi": 0.2, "best_y": 2.0}}, + ), + ], +) +def test_validate_aq_fxns_success(aq_fxns, beta, best_y, xi, expected): + """ + Verify that valid combinations of acquisition function arguments are correctly parsed into a configuration dict. + + This tests the CLI argument validation logic for active learning parameters. + """ + assert predict_cli_module._validate_aq_fxns(aq_fxns, beta, best_y, xi) == expected + + +@pytest.mark.parametrize( + "aq_fxns,beta,best_y,xi,error_message", + [ + (("ucb", "ucb"), (1.0, 2.0), (), (), "UCB can only be specified once"), + (("ei",), (), (), (), "must be specified once per EI and/or PI acquisition"), + (("ucb",), (), (), (), "Field `beta` must be specified for UCB acquisition"), + ], +) +def test_validate_aq_fxns_errors(aq_fxns, beta, best_y, xi, error_message): + """ + Ensure that invalid acquisition function arguments trigger appropriate validation errors. + + This prevents users from running predictions with ambiguous or incomplete active learning settings. + """ + with pytest.raises(ValueError, match=error_message): + predict_cli_module._validate_aq_fxns(aq_fxns, beta, best_y, xi) diff --git a/openadmet/models/tests/unit/comparison/test_comparison.py b/openadmet/models/tests/unit/comparison/test_comparison.py index 27d93900..cd9949cc 100644 --- a/openadmet/models/tests/unit/comparison/test_comparison.py +++ b/openadmet/models/tests/unit/comparison/test_comparison.py @@ -1,26 +1,32 @@ +import numpy as np import pytest from numpy.testing import assert_almost_equal -import numpy as np + from openadmet.models.comparison.compare_base import get_comparison_class from openadmet.models.comparison.posthoc import PostHocComparison from openadmet.models.tests.unit.datafiles import ( - cyp2c9_json, + anvil_lgbm_trained_model_dir, cyp1a2_json, + cyp2c9_json, cyp3a4_json, multi_task_json, - anvil_lgbm_trained_model_dir, ) def test_get_comparison_class(): - """Test getting comparison class.""" + """ + Test dynamic retrieval of comparison classes from the registry. + + Verifies that valid class names return the class and invalid names raise ValueError. + """ get_comparison_class("PostHoc") with pytest.raises(ValueError): get_comparison_class("NotARealClass") def test_posthoc_fails_on_incorrect_inputs(): - """Test that posthoc comparison fails when given incorrect inputs. + """ + Test that posthoc comparison fails when given incorrect inputs. Inputs include: - No inputs @@ -28,6 +34,8 @@ def test_posthoc_fails_on_incorrect_inputs(): - Mismatched lengths of model_stats_fns, labels, and task_names - Repeated labels - Incorrect labels and task_names for model_stats_fns + + This validation is critical to ensure that comparison tables and plots match models to their correct metadata. """ comp_obj = PostHocComparison() with pytest.raises(ValueError): @@ -69,7 +77,12 @@ def test_posthoc_repeat_label_error(): def test_posthoc_comparison(): - """Test that posthoc comparison works when given correct inputs.""" + """ + Test that posthoc comparison works correctly when given valid inputs. + + This verifies the calculation of statistical tests (Levene's test for equality of variances, + Tukey's HSD for pairwise mean differences) based on loaded model metrics. + """ model_stats = [cyp2c9_json, cyp3a4_json, cyp1a2_json] model_tags = [ "openadmet-CYP2C9-pchembl-regression-testing-cv", @@ -97,7 +110,12 @@ def test_posthoc_comparison(): def test_posthoc_comparison_anvil_reader_and_feature_label( label_types, expected_labels ): - """Test that posthoc comparison can read from anvil-trained model directories and features.""" + """ + Test that posthoc comparison can automatically extract labels from anvil-trained model directories. + + This ensures that metadata stored in `metadata.yaml` within model directories can be correctly + parsed to generate readable labels for comparison plots. + """ comp_obj = PostHocComparison() model_stats_fns, labels, task_names = comp_obj.label_and_task_name_from_anvil( model_dirs=[anvil_lgbm_trained_model_dir], label_types=label_types @@ -121,7 +139,12 @@ def test_posthoc_comparison_json_reader_fails(label_types): def test_posthoc_comparison_json_reader(): - """Test that posthoc comparison can read multi vs single task from anvil file.""" + """ + Test that posthoc comparison handles both multi-task and single-task JSON result files. + + This verifies that the system can normalize results from different task types into a common + format for statistical comparison. + """ model_stats = [multi_task_json, cyp3a4_json] model_tags = ["multitask", "single_task"] task_tags = ["cyp3a4_pchembl_value_mean", "pchembl_value_mean"] @@ -130,14 +153,18 @@ def test_posthoc_comparison_json_reader(): levene, tukeys_df = comp_obj.compare( model_stats_fns=model_stats, labels=model_tags, task_names=task_tags ) - assert levene["mse"][0] == 2.483488460351842 - assert levene["ktau"][0] == 1.0392615736603197 - assert tukeys_df["metric_val"][0] == -0.01037444780666702 - assert tukeys_df["pvalue"][0] == 0.2488307785417857 + assert levene["mse"][0] == pytest.approx(2.483, abs=0.001) + assert levene["ktau"][0] == pytest.approx(1.039, abs=0.001) + assert tukeys_df["metric_val"][0] == pytest.approx(-0.010, abs=0.001) + assert tukeys_df["pvalue"][0] == pytest.approx(0.248, abs=0.001) def test_posthoc_comparison_printing(capsys): - """Test that posthoc comparison prints results to console.""" + """ + Test that posthoc comparison prints results to console in a readable format. + + We capture stdout to verify that Levene's test and Tukey's HSD results are actually displayed to the user. + """ model_stats = [cyp2c9_json, cyp3a4_json, cyp1a2_json] model_tags = [ "openadmet-CYP2C9-pchembl-regression-testing-cv", diff --git a/openadmet/models/tests/unit/data/test_data.py b/openadmet/models/tests/unit/data/test_data.py index d714bc7e..fa310a66 100644 --- a/openadmet/models/tests/unit/data/test_data.py +++ b/openadmet/models/tests/unit/data/test_data.py @@ -5,6 +5,12 @@ def test_data_spec_from_csv(): + """ + Validate loading data from a CSV file via DataSpec. + + Ensures that the data loader correctly reads the specified CSV, extracts the target and SMILES columns, + and returns them as expected. + """ data_spec = DataSpec( type="intake", resource=test_csv, @@ -18,6 +24,12 @@ def test_data_spec_from_csv(): def test_data_spec_from_intake(): + """ + Validate loading data from an Intake catalog. + + Intake allows for declarative data loading. This test checks that DataSpec can correctly interface + with an Intake catalog to retrieve data. + """ data_spec = DataSpec( type="intake", resource=intake_cat, @@ -32,6 +44,12 @@ def test_data_spec_from_intake(): @pytest.mark.parametrize("dropna, expected_length", [(True, 3333), (False, 7196)]) def test_data_spec_dropna(dropna, expected_length): + """ + Test the `dropna` functionality in DataSpec. + + Verifies that rows with missing values in target columns are dropped when dropna=True, + and preserved when dropna=False. This is critical for handling real-world datasets which often contain gaps. + """ data_spec = DataSpec( type="intake", resource=nan_data, diff --git a/openadmet/models/tests/unit/eval/test_eval.py b/openadmet/models/tests/unit/eval/test_eval.py index 6af7599e..c0040763 100644 --- a/openadmet/models/tests/unit/eval/test_eval.py +++ b/openadmet/models/tests/unit/eval/test_eval.py @@ -1,6 +1,8 @@ +import matplotlib.figure +import numpy as np import pytest +import seaborn as sns -import numpy as np from openadmet.models.eval.binary import PosthocBinaryMetrics from openadmet.models.eval.classification import ( ClassificationMetrics, @@ -20,21 +22,28 @@ def test_get_eval_class(): + """Verify that evaluation classes can be retrieved by name from the registry.""" get_eval_class("RegressionMetrics") get_eval_class("PosthocBinaryMetrics") get_eval_class("ClassificationMetrics") def test_regression_metrics(): + """ + Validate calculation of standard regression metrics (MSE, MAE, R2). + + This test uses simple synthetic data to ensure that the mathematical implementations + of these metrics are correct and return the expected values. + """ y_true = np.array([3, -0.5, 2, 7]).reshape(-1, 1) y_pred = np.array([2.5, 0.0, 2, 8]).reshape(-1, 1) rm = RegressionMetrics() metrics = rm.evaluate(y_true, y_pred) - assert metrics["task_0"]["mse"]["value"] == 0.375 - assert metrics["task_0"]["mae"]["value"] == 0.5 - assert metrics["task_0"]["r2"]["value"] == 0.9486081370449679 + assert metrics["task_0"]["mse"]["value"] == pytest.approx(0.375, abs=0.001) + assert metrics["task_0"]["mae"]["value"] == pytest.approx(0.5, abs=0.001) + assert metrics["task_0"]["r2"]["value"] == pytest.approx(0.94860, abs=0.001) def test_regression_metrics_and_cv_include_rae_and_pct_within_1_log_for_pxc50(): @@ -94,16 +103,32 @@ def test_lightning_cv_pct_within_1_log_uses_raw_metric_callable(): def test_regression_plots(): + """ + Verify that regression plotting functions return valid figure objects. + + This ensures that regression plots (JointGrid for parity, Figure for CI) are generated + without error, which is important for model reporting. + """ y_true = np.array([3, -0.5, 2, 7]).reshape(-1, 1) y_pred = np.array([2.5, 0.0, 2, 8]).reshape(-1, 1) rm = RegressionPlots() - rm.evaluate(y_true, y_pred) + plot_data = rm.evaluate(y_true, y_pred) - assert True + assert isinstance(plot_data, dict) + assert "task_0_regplot" in plot_data + assert "task_0_ciplot" in plot_data + assert isinstance(plot_data["task_0_regplot"], sns.axisgrid.JointGrid) + assert isinstance(plot_data["task_0_ciplot"], matplotlib.figure.Figure) def test_classification_metrics(): + """ + Validate calculation of classification metrics (Accuracy, Precision, Recall, F1, AUC). + + This ensures that for binary classification tasks, the metrics are computed correctly based on + predicted probabilities and ground truth labels. + """ y_true = [0, 1, 1, 0] # We pass probabilities of the class, not the class itself @@ -113,25 +138,38 @@ def test_classification_metrics(): cm = ClassificationMetrics() metrics = cm.evaluate(y_true, y_pred) - assert metrics["accuracy"]["value"] == 0.75 + assert metrics["accuracy"]["value"] == pytest.approx(0.75) assert metrics["precision"]["value"] == pytest.approx(0.667, abs=0.001) - assert metrics["recall"]["value"] == 1.0 - assert metrics["f1"]["value"] == 0.8 - assert metrics["roc_auc"]["value"] == 0.75 + assert metrics["recall"]["value"] == pytest.approx(1.0) + assert metrics["f1"]["value"] == pytest.approx(0.8) + assert metrics["roc_auc"]["value"] == pytest.approx(0.75) assert metrics["pr_auc"]["value"] == pytest.approx(0.833, abs=0.001) def test_classification_plots(): + """ + Verify that classification plotting functions (ROC, PR curves) return valid figure objects. + """ y_true = [0, 1, 1, 0] y_pred = [[1, 0], [0, 1], [0, 1], [0, 1]] cp = ClassificationPlots() cp.evaluate(y_true, y_pred) - assert True + assert isinstance(cp.plot_data, dict) + assert "roc_curve" in cp.plot_data + assert "pr_curve" in cp.plot_data + assert isinstance(cp.plot_data["roc_curve"], matplotlib.figure.Figure) + assert isinstance(cp.plot_data["pr_curve"], matplotlib.figure.Figure) def test_posthoc_eval_metrics(): + """ + Test post-hoc binary metrics utility functions. + + Verifies that we can calculate precision and recall at a specific cutoff threshold from + regression-like outputs (or probabilities). + """ y_true = [3, -0.5, 2, 7] y_pred = [2.5, 0.0, 2, 8] cutoff = 4.0 diff --git a/openadmet/models/tests/unit/features/test_features.py b/openadmet/models/tests/unit/features/test_features.py index cb9e1a37..ea13a1be 100644 --- a/openadmet/models/tests/unit/features/test_features.py +++ b/openadmet/models/tests/unit/features/test_features.py @@ -10,17 +10,25 @@ @pytest.fixture() def smiles(): + """Provide a list of valid SMILES strings for testing featurization.""" return ["CCO", "CCN", "CCO"] @pytest.fixture() def one_invalid_smi(): + """Provide a list of SMILES strings containing one invalid entry to test error handling.""" return ["CCO", "CCN", "invalid", "CCO"] @pytest.mark.parametrize("dtype", (np.float32, np.float64)) @pytest.mark.parametrize("descr_type", ["mordred", "desc2d"]) def test_descriptor_featurizer(descr_type, dtype): + """ + Validate DescriptorFeaturizer for different descriptor types and floating point precisions. + + This ensures that physical-chemical descriptors (like Mordred or RDKit 2D) are correctly generated + and returned with the requested data type, which is important for downstream model compatibility. + """ featurizer = DescriptorFeaturizer(descr_type=descr_type, dtype=dtype) X, idx = featurizer.featurize(["CCO", "CCN", "CCO"]) assert X.dtype == dtype @@ -28,6 +36,12 @@ def test_descriptor_featurizer(descr_type, dtype): def test_descriptor_one_invalid(one_invalid_smi): + """ + Ensure DescriptorFeaturizer robustly handles invalid SMILES strings. + + The featurizer should skip invalid molecules and return indices corresponding only to the valid ones. + This prevents the entire pipeline from crashing due to a single bad input. + """ featurizer = DescriptorFeaturizer(descr_type="mordred") X, idx = featurizer.featurize(one_invalid_smi) assert X.shape == (3, 1613) @@ -38,6 +52,12 @@ def test_descriptor_one_invalid(one_invalid_smi): @pytest.mark.parametrize("dtype", (np.float32, np.float64)) @pytest.mark.parametrize("fp_type", ("ecfp", "fcfp")) def test_fingerprint_featurizer(smiles, fp_type, dtype): + """ + Validate FingerprintFeaturizer for different fingerprint types (ECFP, FCFP) and precisions. + + This verifies that structural fingerprints are correctly generated with the expected vector size (2000) + and data type. + """ featurizer = FingerprintFeaturizer(fp_type=fp_type, dtype=dtype) X, idx = featurizer.featurize(smiles) assert X.shape == (3, 2000) @@ -46,6 +66,11 @@ def test_fingerprint_featurizer(smiles, fp_type, dtype): def test_fingerprint_one_invalid(one_invalid_smi): + """ + Ensure FingerprintFeaturizer robustly handles invalid SMILES strings. + + Similar to descriptors, it should filter out invalid entries and return correct indices for valid ones. + """ featurizer = FingerprintFeaturizer(fp_type="ecfp") X, idx = featurizer.featurize(one_invalid_smi) assert X.shape == (3, 2000) @@ -54,6 +79,12 @@ def test_fingerprint_one_invalid(one_invalid_smi): def test_feature_concatenator(smiles): + """ + Validate that FeatureConcatenator correctly combines multiple feature sets (descriptors + fingerprints). + + This ensures that different feature representations can be stacked horizontally for the same molecules, + providing a richer feature set for training. + """ desc_featurizer = DescriptorFeaturizer(descr_type="mordred") fp_featurizer = FingerprintFeaturizer(fp_type="ecfp") concat = FeatureConcatenator(featurizers=[desc_featurizer, fp_featurizer]) @@ -62,14 +93,48 @@ def test_feature_concatenator(smiles): assert_array_equal(idx, np.arange(3)) -def test_feature_concatenator_failed_diff_positions(one_invalid_smi): +def test_feature_concatenator_drops_intersection(mocker): + """ + Verify that FeatureConcatenator only keeps molecules valid across ALL featurizers. + + If one featurizer fails for molecule A and another fails for molecule B, the concatenator + must drop both A and B to maintain feature alignment. + + We mock the underlying featurizers to control which indices fail, avoiding the need for + complex real-world molecules that fail specific featurizers. This isolates the intersection logic. + """ + # Arrange desc_featurizer = DescriptorFeaturizer(descr_type="mordred") fp_featurizer = FingerprintFeaturizer(fp_type="ecfp") concat = FeatureConcatenator(featurizers=[desc_featurizer, fp_featurizer]) - X, idx = concat.featurize(one_invalid_smi) - assert X.shape == (3, 3613) - # index 2 is invalid, so the shape should be 3 - assert_array_equal(idx, np.asarray([0, 1, 3])) + + # Mock descriptor featurizer to return 3 valid outputs (fails on index 1) + # Indices: 0, 2, 3 (skips 1) + desc_features = np.zeros((3, 1613)) + desc_indices = np.array([0, 2, 3]) + mocker.patch.object( + DescriptorFeaturizer, "featurize", return_value=(desc_features, desc_indices) + ) + + # Mock fingerprint featurizer to return 3 valid outputs (fails on index 2) + # Indices: 0, 1, 3 (skips 2) + # Note: ECFP size is 2000 in this codebase + fp_features = np.zeros((3, 2000)) + fp_indices = np.array([0, 1, 3]) + mocker.patch.object( + FingerprintFeaturizer, "featurize", return_value=(fp_features, fp_indices) + ) + + smiles = ["SMI0", "SMI1", "SMI2", "SMI3"] + + # Act + X, idx = concat.featurize(smiles) + + # Assert + # Intersection of [0, 2, 3] and [0, 1, 3] is [0, 3] + # Expected shape: (2, 1613 + 2000) = (2, 3613) + assert X.shape == (2, 3613) + assert_array_equal(idx, np.array([0, 3])) def test_feature_concatenator_order_independence(smiles): @@ -90,6 +155,12 @@ def test_feature_concatenator_order_independence(smiles): def test_pairwise_featurizer(smiles): + """ + Validate PairwiseFeaturizer in 'full' mode (all-pairs). + + This tests that features are generated for every pair of molecules and that target values + (differences) are correctly computed. + """ featurizer = PairwiseFeaturizer( featurizer={"FingerprintFeaturizer": {"fp_type": "ecfp", "dtype": np.float32}}, how_to_pair="full", diff --git a/openadmet/models/tests/unit/features/test_nepare.py b/openadmet/models/tests/unit/features/test_nepare.py index 61cfa787..2e926b0b 100644 --- a/openadmet/models/tests/unit/features/test_nepare.py +++ b/openadmet/models/tests/unit/features/test_nepare.py @@ -9,6 +9,12 @@ def test_pairwise_make_new(): + """ + Verify that PairwiseFeaturizer can create a new independent instance via make_new(). + + This is important for factory-like creation patterns in the registry or during cross-validation + where fresh featurizers are needed. + """ featurizer = PairwiseFeaturizer( how_to_pair="full", featurizer=FingerprintFeaturizer(fp_type="ecfp:4") ) diff --git a/openadmet/models/tests/unit/inference/test_inference.py b/openadmet/models/tests/unit/inference/test_inference.py index df341b21..7a4c369c 100644 --- a/openadmet/models/tests/unit/inference/test_inference.py +++ b/openadmet/models/tests/unit/inference/test_inference.py @@ -1,50 +1,204 @@ +"""Tests for the inference orchestration pipeline using real, lightweight components.""" + from pathlib import Path + +import numpy as np import pandas as pd -import os import pytest -from openadmet.models.inference.inference import predict -from openadmet.models.tests.unit.datafiles import ( - pred_test_data_csv, - anvil_lgbm_trained_model_dir, - anvil_chemprop_trained_model_dir, -) +from openadmet.models.active_learning.committee import CommitteeRegressor +from openadmet.models.anvil.specification import DataSpec, Metadata +from openadmet.models.architecture.dummy import DummyRegressorModel +from openadmet.models.features.molfeat_fingerprint import FingerprintFeaturizer +from openadmet.models.inference import inference as inference_module @pytest.fixture -def anvil_lgbm(): - return anvil_lgbm_trained_model_dir +def input_df(): + """Provide a simple DataFrame with SMILES for testing inference inputs.""" + return pd.DataFrame({"MY_SMILES": ["CCO", "CCN"]}) -@pytest.fixture -def anvil_chemprop(): - return anvil_chemprop_trained_model_dir - - -@pytest.mark.skipif( - os.getenv("RUNNER_OS") == "macOS", reason="MacOS runner not enough memory" -) -@pytest.mark.parametrize("model_dir", ["anvil_lgbm", "anvil_chemprop"]) -def test_predict(model_dir, request): - # Use the fixture to get the model directory - model_dir = request.getfixturevalue(model_dir) - # Test the predict function with a sample input - input_path = pred_test_data_csv - input_col = "MY_SMILES" - model_dir = [model_dir] - write_csv = False - output_path = None - debug = False - - result = predict( - input_path, - input_col, - model_dir, - write_csv, - output_path, - debug=False, +@pytest.fixture(scope="module") +def real_featurizer(): + """Return a real FingerprintFeaturizer using ECFP4 fingerprints.""" + return FingerprintFeaturizer(fp_type="ecfp:4") + + +@pytest.fixture(scope="module") +def real_data_spec(): + """Return a real DataSpec with a single regression target.""" + return DataSpec(type="csv", target_cols=["task_0"], input_col="MY_SMILES") + + +@pytest.fixture(scope="module") +def real_metadata_single(): + """Return real Metadata with tag UNIT for single-model tests.""" + return Metadata( + version="v1", + driver="sklearn", + name="unit-test", + build_number=0, + description="Unit test model", + tag="UNIT", + authors="Test Author", + email="test@example.com", + biotargets=["test"], + tags=["test"], + ) + + +@pytest.fixture(scope="module") +def real_metadata_ensemble(): + """Return real Metadata with tag ENS for ensemble tests.""" + return Metadata( + version="v1", + driver="sklearn", + name="ens-test", + build_number=0, + description="Ensemble test model", + tag="ENS", + authors="Test Author", + email="test@example.com", + biotargets=["test"], + tags=["test"], + ) + + +@pytest.fixture(scope="module") +def trained_single_model(): + """Return a DummyRegressorModel trained to always predict 1.0 regardless of input features.""" + X_train = np.zeros((3, 2)) + y_train = np.array([[1.0], [1.0], [1.0]]) + model = DummyRegressorModel() + model.train(X_train, y_train) + return model + + +@pytest.fixture(scope="module") +def trained_ensemble(): + """Return a CommitteeRegressor whose two members predict 1.0 and 3.0 respectively. + + The ensemble mean is 2.0 and the standard deviation is 1.0 for any input, + making the UCB score with beta=2.0 equal to 4.0. + """ + X_train = np.zeros((3, 2)) + + model1 = DummyRegressorModel() + model1.train(X_train, np.array([[1.0], [1.0], [1.0]])) + + model2 = DummyRegressorModel() + model2.train(X_train, np.array([[3.0], [3.0], [3.0]])) + + return CommitteeRegressor.from_models([model1, model2]) + + +def test_predict_with_real_single_model( + mocker, + input_df, + real_featurizer, + real_metadata_single, + real_data_spec, + trained_single_model, +): + """Test the inference pipeline with a real DummyRegressorModel. + + SMILES strings flow through a real FingerprintFeaturizer and a real DummyRegressorModel + to verify internal data plumbing. Because DummyRegressorModel always predicts the + training mean, PRED values must equal 1.0 for both inputs. The STD column must be NaN + because non-ensemble models produce no uncertainty estimate. + """ + mock_loader = mocker.patch.object( + inference_module, + "load_anvil_model_and_metadata", + return_value=( + trained_single_model, + real_featurizer, + real_metadata_single, + real_data_spec, + ), + ) + + result = inference_module.predict( + input_path=input_df, + input_col="MY_SMILES", + model_dir=["unused-model-dir"], accelerator="cpu", + log=False, ) - # Check if the result is a DataFrame assert isinstance(result, pd.DataFrame) + assert "OADMET_PRED_UNIT_task_0" in result.columns + assert "OADMET_STD_UNIT_task_0" in result.columns + assert result["OADMET_PRED_UNIT_task_0"].tolist() == pytest.approx([1.0, 1.0]) + assert result["OADMET_STD_UNIT_task_0"].isna().all() + mock_loader.assert_called_once_with(Path("unused-model-dir")) + + +def test_predict_with_real_ensemble_and_acquisition( + mocker, + input_df, + real_featurizer, + real_metadata_ensemble, + real_data_spec, + trained_ensemble, +): + """Test the inference pipeline with a real CommitteeRegressor and UCB acquisition. + + Two DummyRegressorModel members predict 1.0 and 3.0 respectively, yielding a committee + mean of 2.0 and standard deviation of 1.0 for any input. With beta=2.0, + UCB = mean + beta * std = 2.0 + 2.0 * 1.0 = 4.0. + """ + mock_loader = mocker.patch.object( + inference_module, + "load_anvil_model_and_metadata", + return_value=( + trained_ensemble, + real_featurizer, + real_metadata_ensemble, + real_data_spec, + ), + ) + + result = inference_module.predict( + input_path=input_df, + input_col="MY_SMILES", + model_dir=["unused-model-dir"], + accelerator="cpu", + log=False, + aq_fxn_args={"ucb": {"beta": 2.0}}, + ) + + assert result["OADMET_PRED_ENS_task_0"].tolist() == pytest.approx([2.0, 2.0]) + assert result["OADMET_STD_ENS_task_0"].tolist() == pytest.approx([1.0, 1.0]) + assert result["OADMET_UCB_ENS_task_0"].tolist() == pytest.approx([4.0, 4.0]) + mock_loader.assert_called_once_with(Path("unused-model-dir")) + + +def test_predict_raises_when_input_column_missing(input_df): + """Ensure that the inference function validates the existence of the specified SMILES column.""" + with pytest.raises(ValueError, match="Column OTHER not found"): + inference_module.predict( + input_path=input_df, + input_col="OTHER", + model_dir=["unused-model-dir"], + log=False, + ) + + +def test_load_anvil_model_and_metadata_missing_recipe_components(tmp_path): + """Ensure correct error is raised when the model directory structure is invalid.""" + with pytest.raises(FileNotFoundError, match="does not contain recipe components"): + inference_module.load_anvil_model_and_metadata(tmp_path) + + +def test_load_anvil_model_and_metadata_missing_procedure_yaml(tmp_path): + """Ensure correct error is raised when critical YAML metadata files are missing.""" + model_dir = tmp_path / "model" + recipe_components = model_dir / "recipe_components" + recipe_components.mkdir(parents=True) + (recipe_components / "metadata.yaml").write_text("metadata") + (recipe_components / "data.yaml").write_text("data") + + with pytest.raises(FileNotFoundError, match="does not contain procedure.yaml"): + inference_module.load_anvil_model_and_metadata(model_dir) diff --git a/openadmet/models/tests/unit/models/test_base.py b/openadmet/models/tests/unit/models/test_base.py index ed466fef..05801e3b 100644 --- a/openadmet/models/tests/unit/models/test_base.py +++ b/openadmet/models/tests/unit/models/test_base.py @@ -13,6 +13,13 @@ @pytest.mark.parametrize("mclass", models.classes()) def test_save_load_pickleable(mclass, tmp_path): + """ + Verify save/load mechanics for all registered pickleable models (e.g., sklearn-based). + + This iterates through the model registry and tests that any model inheriting from PickleableModelBase + can be instantiated, built, saved, and loaded without error. This is a crucial contract test + ensuring all registered models comply with the persistence interface. + """ if not issubclass(mclass, PickleableModelBase): pytest.skip(f"Skipping non-pickleable model {mclass.__name__}") model = mclass() @@ -25,6 +32,12 @@ def test_save_load_pickleable(mclass, tmp_path): @pytest.mark.parametrize("mclass", models.classes()) def test_save_load_torch_model(mclass, tmp_path): + """ + Verify save/load mechanics for all registered PyTorch Lightning models. + + Similar to the pickleable test, this ensures that deep learning models (inheriting from LightningModelBase) + implement the correct save/load logic for their weights and configurations. + """ if not issubclass(mclass, LightningModelBase): pytest.skip(f"Skipping non-torch model {mclass.__name__}") model = mclass() diff --git a/openadmet/models/tests/unit/models/test_lgbm.py b/openadmet/models/tests/unit/models/test_lgbm.py index d6a76d71..7634f755 100644 --- a/openadmet/models/tests/unit/models/test_lgbm.py +++ b/openadmet/models/tests/unit/models/test_lgbm.py @@ -6,17 +6,24 @@ @pytest.fixture def X_y(): + """Provide simple synthetic data for basic model training tests.""" X = [[1, 2, 3], [4, 5, 6]] y = [1, 2] return X, y def test_lgbm(): + """Verify that LGBMRegressorModel initializes with the correct type identifier.""" lgbm_model = LGBMRegressorModel() assert lgbm_model.type == "LGBMRegressorModel" def test_lgbm_from_params(): + """ + Validate that hyperparameters passed to the constructor are correctly applied to the underlying estimator. + + This ensures that user configurations (like n_estimators) are respected by the model. + """ lgbm_model = LGBMRegressorModel(n_estimators=100, boosting_type="rf") lgbm_model.build() assert lgbm_model.type == "LGBMRegressorModel" @@ -25,6 +32,11 @@ def test_lgbm_from_params(): def test_lgbm_train_predict(X_y): + """ + Verify the train and predict lifecycle of LGBMRegressorModel. + + This checks that the model can fit to data and generate predictions with the expected shape and values. + """ lgbm_model = LGBMRegressorModel(n_estimators=100) lgbm_model.build() X, y = X_y @@ -41,6 +53,11 @@ def test_lgbm_train_predict(X_y): def test_lgbm_save_load(tmp_path, X_y): + """ + Validate persistence of the LGBM model to disk. + + Ensures that saving and reloading the model preserves its learned state and prediction behavior. + """ lgbm_model = LGBMRegressorModel(n_estimators=100) lgbm_model.build() X, y = X_y @@ -54,6 +71,12 @@ def test_lgbm_save_load(tmp_path, X_y): def test_serialization(tmp_path, X_y): + """ + Validate JSON/pickle serialization workflow for LGBM models. + + This tests the separate storage of hyperparameters (JSON) and model weights (pickle), + which is used for model registry and versioning. + """ lgbm_model = LGBMRegressorModel(n_estimators=100) lgbm_model.build() X, y = X_y diff --git a/openadmet/models/tests/unit/models/test_nepare.py b/openadmet/models/tests/unit/models/test_nepare.py index 17a00f5f..113653e7 100644 --- a/openadmet/models/tests/unit/models/test_nepare.py +++ b/openadmet/models/tests/unit/models/test_nepare.py @@ -6,11 +6,13 @@ @pytest.fixture def X_y(): + """Provide synthetic data pairs for testing pairwise regression.""" X = [[1, 2, 3], [4, 5, 6]] y = [1, 2] return X, y def test_nepare(): + """Verify initialization of the NeuralPairwiseRegressorModel.""" nepare_model = NeuralPairwiseRegressorModel() assert nepare_model.type == "NeuralPairwiseRegressorModel" diff --git a/openadmet/models/tests/unit/split/test_splitters.py b/openadmet/models/tests/unit/split/test_splitters.py index 4c375786..f0602492 100644 --- a/openadmet/models/tests/unit/split/test_splitters.py +++ b/openadmet/models/tests/unit/split/test_splitters.py @@ -5,10 +5,10 @@ from openadmet.models.split.sklearn import ShuffleSplitter from openadmet.models.split.cluster import ClusterSplitter from openadmet.models.split.split_base import splitters -from openadmet.models.tests.unit.datafiles import CYP3A4_chembl_pchembl def test_in_splitters(): + """Verify that concrete splitter implementations are correctly registered in the splitters registry.""" assert "ShuffleSplitter" in splitters assert "ClusterSplitter" in splitters @@ -34,10 +34,15 @@ def test_in_splitters(): def test_simple_split( train_size, val_size, test_size, expected_train, expected_val, expected_test, error ): - # Error expected + """ + Validate that ShuffleSplitter correctly partitions data according to specified ratios. + + This test verifies both successful splits and error handling for invalid configurations. + Correct splitting ensures that training, validation, and test sets are of the expected size + and are mutually exclusive, which is critical for valid model evaluation. + """ if error is True: with pytest.raises(ValueError): - # Initialize splitter splitter = ShuffleSplitter( train_size=train_size, val_size=val_size, @@ -46,142 +51,239 @@ def test_simple_split( ) return - # Initialize splitter splitter = ShuffleSplitter( train_size=train_size, val_size=val_size, test_size=test_size, random_state=42 ) - # Generate random data + # Generate synthetic random data for testing split logic X = np.random.rand(100, 10) y = np.random.rand(100) - # Error is expected - if error is True: - with pytest.raises(ValueError): - splitter.split(X, y) - return - - # Perform the split X_train, X_val, X_test, y_train, y_val, y_test, groups = splitter.split(X, y) - # Check train assert X_train.shape[0] == expected_train assert y_train.shape[0] == expected_train - # Validation set requested if val_size > 0: assert X_val.shape[0] == expected_val assert y_val.shape[0] == expected_val + # Assert X_train and X_val are mutually exclusive + train_set = set(map(tuple, X_train)) + val_set = set(map(tuple, X_val)) + assert len(train_set.intersection(val_set)) == 0 - # Validation set not requested else: assert X_val is None assert y_val is None - # Test set requested if test_size > 0: assert X_test.shape[0] == expected_test assert y_test.shape[0] == expected_test + # Assert X_train and X_test are mutually exclusive + train_set = set(map(tuple, X_train)) + test_set = set(map(tuple, X_test)) + assert len(train_set.intersection(test_set)) == 0 + + if val_size > 0: + # Assert X_val and X_test are mutually exclusive + val_set = set(map(tuple, X_val)) + assert len(val_set.intersection(test_set)) == 0 - # Test set not requested else: assert X_test is None assert y_test is None +@pytest.fixture +def synthetic_cluster_data(): + """ + Provide a synthetic dataset with structural diversity for testing cluster splitting. + + This fixture returns a set of SMILES strings representing different chemical scaffolds + (benzenes, pyridines, cyclohexanes, furans, thiophenes) and corresponding target values. + Using diverse scaffolds ensures that clustering algorithms (like Butina or Bemis-Murcko) + can meaningfully group molecules, allowing verification that splits respect cluster boundaries. + """ + base_smiles = [ + "Cc1ccccc1", + "CCc1ccccc1", + "Oc1ccccc1", + "Nc1ccccc1", + "Clc1ccccc1", + "Fc1ccccc1", + "C(=O)Oc1ccccc1", + "C(=O)Cc1ccccc1", + "c1ccccc1C#N", + "COc1ccccc1", + "Cc1ccncc1", + "CCc1ccncc1", + "Oc1ccncc1", + "Nc1ccncc1", + "Clc1ccncc1", + "Fc1ccncc1", + "C(=O)Oc1ccncc1", + "C(=O)Cc1ccncc1", + "c1ccncc1C#N", + "COc1ccncc1", + "CC1CCCCC1", + "CCC1CCCCC1", + "OC1CCCCC1", + "NC1CCCCC1", + "ClC1CCCCC1", + "FC1CCCCC1", + "C(=O)OC1CCCCC1", + "C(=O)CC1CCCCC1", + "C1CCCCC1C#N", + "COC1CCCCC1", + "Cc1ccoc1", + "CCc1ccoc1", + "Oc1ccoc1", + "Nc1ccoc1", + "Clc1ccoc1", + "Fc1ccoc1", + "C(=O)Oc1ccoc1", + "C(=O)Cc1ccoc1", + "c1ccoc1C#N", + "COc1ccoc1", + "Cc1ccsc1", + "CCc1ccsc1", + "Oc1ccsc1", + "Nc1ccsc1", + "Clc1ccsc1", + "Fc1ccsc1", + "C(=O)Oc1ccsc1", + "C(=O)Cc1ccsc1", + "c1ccsc1C#N", + "COc1ccsc1", + "Cc1ccc2ccccc2c1", + "CCc1ccc2ccccc2c1", + "Oc1ccc2ccccc2c1", + "Nc1ccc2ccccc2c1", + "Clc1ccc2ccccc2c1", + "Fc1ccc2ccccc2c1", + "C(=O)Oc1ccc2ccccc2c1", + "C(=O)Cc1ccc2ccccc2c1", + "c1ccc2ccccc2c1C#N", + "COc1ccc2ccccc2c1", + "Cc1ccc2[nH]ccc2c1", + "CCc1ccc2[nH]ccc2c1", + "Oc1ccc2[nH]ccc2c1", + "Nc1ccc2[nH]ccc2c1", + "Clc1ccc2[nH]ccc2c1", + "Fc1ccc2[nH]ccc2c1", + "C(=O)Oc1ccc2[nH]ccc2c1", + "C(=O)Cc1ccc2[nH]ccc2c1", + "c1ccc2[nH]ccc2c1C#N", + "COc1ccc2[nH]ccc2c1", + "Cc1ccc2ncccc2c1", + "CCc1ccc2ncccc2c1", + "Oc1ccc2ncccc2c1", + "Nc1ccc2ncccc2c1", + "Clc1ccc2ncccc2c1", + "Fc1ccc2ncccc2c1", + "C(=O)Oc1ccc2ncccc2c1", + "C(=O)Cc1ccc2ncccc2c1", + "c1ccc2ncccc2c1C#N", + "COc1ccc2ncccc2c1", + "CC1CCCC1", + "CCC1CCCC1", + "OC1CCCC1", + "NC1CCCC1", + "ClC1CCCC1", + "FC1CCCC1", + "C(=O)OC1CCCC1", + "C(=O)CC1CCCC1", + "C1CCCC1C#N", + "COC1CCCC1", + "CC1CCNCC1", + "CCC1CCNCC1", + "OC1CCNCC1", + "NC1CCNCC1", + "ClC1CCNCC1", + "FC1CCNCC1", + "C(=O)OC1CCNCC1", + "C(=O)CC1CCNCC1", + "C1CCNCC1C#N", + "COC1CCNCC1", + ] + smiles = pd.Series(base_smiles) + y = pd.Series(np.linspace(0.0, 1.0, len(smiles))) + return smiles, y + + @pytest.mark.parametrize( - "train_size, val_size, test_size, expected_train, expected_val, expected_test, error, method", + "method", [ - # Test cases for kmeans - (0.8, 0.0, 0.2, 1600, 0, 400, False, "kmeans"), - (0.7, 0.3, 0.0, 1400, 600, 0, False, "kmeans"), - (0.7, 0.1, 0.2, 1400, 200, 400, False, "kmeans"), - (0.6, 0.2, 0.2, 1200, 400, 400, False, "kmeans"), - # Test cases for butina - (0.8, 0.0, 0.2, 1600, 0, 400, False, "butina"), - (0.7, 0.3, 0.0, 1400, 600, 0, False, "butina"), - (0.7, 0.1, 0.2, 1400, 200, 400, False, "butina"), - (0.6, 0.2, 0.2, 1200, 400, 400, False, "butina"), - # Test cases for bemis-murcko - (0.8, 0.0, 0.2, 1600, 0, 400, False, "bemis-murcko"), - (0.7, 0.3, 0.0, 1400, 600, 0, False, "bemis-murcko"), - (0.7, 0.1, 0.2, 1400, 200, 400, False, "bemis-murcko"), - (0.6, 0.2, 0.2, 1200, 400, 400, False, "bemis-murcko"), - # Error cases - (1.0, 0.0, 0.0, 200, 0, 0, True, "kmeans"), - (0.5, 0.5, 0.5, -1, -1, -1, True, "kmeans"), + "kmeans", + "butina", + "bemis-murcko", ], ) -def test_cluster_split( - train_size, - val_size, - test_size, - expected_train, - expected_val, - expected_test, - error, - method, -): - df = pd.read_csv(CYP3A4_chembl_pchembl) - X = df["CANONICAL_SMILES"][:2000] - y = df["pChEMBL mean"][:2000] - - # Error expected - if error is True: - with pytest.raises(ValueError): - # Initialize splitter - splitter = ClusterSplitter( - train_size=train_size, - val_size=val_size, - test_size=test_size, - random_state=42, - method=method, - k_clusters=100, - ) - return +def test_cluster_split_synthetic_data(method, synthetic_cluster_data): + """ + Validate ClusterSplitter functionality with different clustering methods. - # Initialize splitter + This test ensures that molecular data is split such that training, validation, and test sets + contain mutually exclusive molecules (no data leakage). It verifies split sizes are approximately + correct and that structural separation is maintained. + """ + X, y = synthetic_cluster_data splitter = ClusterSplitter( - train_size=train_size, - val_size=val_size, - test_size=test_size, + train_size=0.7, + val_size=0.1, + test_size=0.2, random_state=42, method=method, - k_clusters=100, + k_clusters=10, + ) + X_train, X_val, X_test, y_train, y_val, y_test, groups = splitter.split( + X, y, num_iters=50 ) - # Perform the split - X_train, X_val, X_test, y_train, y_val, y_test, groups = splitter.split(X, y) - - # Check type preservation for obj in [X_train, X_val, X_test]: if obj is not None: - assert isinstance(obj, pd.Series), "X split must preserve pandas Series" + assert isinstance(obj, pd.Series) for obj in [y_train, y_val, y_test]: if obj is not None: - assert isinstance(obj, pd.Series), "y split must preserve pandas Series" + assert isinstance(obj, pd.Series) - # Check train - assert abs(X_train.shape[0] - expected_train) <= 10 - assert abs(y_train.shape[0] - expected_train) <= 10 + total = len(X) + assert abs(len(X_train) - int(0.7 * total)) <= 5 + assert abs(len(X_val) - int(0.1 * total)) <= 5 + assert abs(len(X_test) - int(0.2 * total)) <= 5 + assert len(groups) == total - # Validation set requested - if val_size > 0: - assert abs(X_val.shape[0] - expected_val) <= 10 - assert abs(y_val.shape[0] - expected_val) <= 10 + # Check for data leakage + # Assert X_train, X_val, and X_test are mutually exclusive by index + train_idx = set(X_train.index) + val_idx = set(X_val.index) + test_idx = set(X_test.index) - # Validation set not requested - else: - assert X_val is None - assert y_val is None + assert len(train_idx.intersection(val_idx)) == 0 + assert len(train_idx.intersection(test_idx)) == 0 + assert len(val_idx.intersection(test_idx)) == 0 - # Test set requested - if test_size > 0: - assert abs(X_test.shape[0] - expected_test) <= 10 - assert abs(y_test.shape[0] - expected_test) <= 10 - # Test set not requested - else: - assert X_test is None - assert y_test is None +def test_cluster_split_invalid_size_configuration(): + """Ensure ClusterSplitter raises ValueError for invalid split size configurations (e.g., sum != 1.0).""" + with pytest.raises(ValueError): + ClusterSplitter( + train_size=1.0, + val_size=0.0, + test_size=0.0, + random_state=42, + method="kmeans", + ) + + +def test_cluster_split_invalid_method(): + """Ensure ClusterSplitter raises ValueError when initialized with an unknown clustering method.""" + with pytest.raises(ValueError): + ClusterSplitter( + train_size=0.7, + val_size=0.1, + test_size=0.2, + random_state=42, + method="not-a-method", + ) diff --git a/openadmet/models/tests/unit/test_utils.py b/openadmet/models/tests/unit/test_utils.py index fbd17e31..9e65ee49 100644 --- a/openadmet/models/tests/unit/test_utils.py +++ b/openadmet/models/tests/unit/test_utils.py @@ -2,6 +2,12 @@ def click_success(result): + """ + Helper function to verify that a Click command executed successfully (exit code 0). + + If the command failed, this function prints the output and traceback to aid in debugging + before returning False. + """ if result.exit_code != 0: # -no-cov- (only occurs on test error) print(result.output) traceback.print_tb(result.exc_info[2]) From a36d7d11c5c6b7a9fde40205cbee5121c7ea2c53 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Sat, 16 May 2026 11:03:36 -0800 Subject: [PATCH 2/3] Add autospec to mocks and standardise test_base.py mock style - Add autospec=True to inference_func and from_recipe patches in test_cli.py; convert mock_spec to create_autospec(AnvilSpecification, instance=True) - Add autospec=True to both load_anvil_model_and_metadata patches in test_inference.py - Add autospec=True to both featurize patches in test_features.py - Replace monkeypatch.setattr + unittest.mock.Mock with mocker.patch / mocker.Mock in test_base.py; add docstring; remove stale import Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- openadmet/models/tests/unit/cli/test_cli.py | 6 +++--- .../models/tests/unit/features/test_features.py | 4 ++-- .../models/tests/unit/inference/test_inference.py | 2 ++ openadmet/models/tests/unit/models/test_base.py | 15 ++++++++++----- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/openadmet/models/tests/unit/cli/test_cli.py b/openadmet/models/tests/unit/cli/test_cli.py index 8e430025..cff115d6 100644 --- a/openadmet/models/tests/unit/cli/test_cli.py +++ b/openadmet/models/tests/unit/cli/test_cli.py @@ -43,7 +43,7 @@ def test_predict_cli_invokes_inference(tmp_path, runner, mocker): model_dir = tmp_path / "model_dir" model_dir.mkdir() - mock_inference = mocker.patch.object(predict_cli_module, "inference_func") + mock_inference = mocker.patch.object(predict_cli_module, "inference_func", autospec=True) result = runner.invoke( cli, @@ -77,9 +77,9 @@ def test_anvil_cli_invokes_workflow(tmp_path, runner, mocker): We mock the `AnvilSpecification` and workflow execution to verify that the CLI correctly handles recipe paths and output directories without actually running a full ML training job. """ - mock_spec = mocker.Mock() + mock_spec = mocker.create_autospec(anvil_cli_module.AnvilSpecification, instance=True) mock_from_recipe = mocker.patch.object( - anvil_cli_module.AnvilSpecification, "from_recipe", return_value=mock_spec + anvil_cli_module.AnvilSpecification, "from_recipe", autospec=True, return_value=mock_spec ) result = runner.invoke( diff --git a/openadmet/models/tests/unit/features/test_features.py b/openadmet/models/tests/unit/features/test_features.py index ea13a1be..0a8b1446 100644 --- a/openadmet/models/tests/unit/features/test_features.py +++ b/openadmet/models/tests/unit/features/test_features.py @@ -113,7 +113,7 @@ def test_feature_concatenator_drops_intersection(mocker): desc_features = np.zeros((3, 1613)) desc_indices = np.array([0, 2, 3]) mocker.patch.object( - DescriptorFeaturizer, "featurize", return_value=(desc_features, desc_indices) + DescriptorFeaturizer, "featurize", autospec=True, return_value=(desc_features, desc_indices) ) # Mock fingerprint featurizer to return 3 valid outputs (fails on index 2) @@ -122,7 +122,7 @@ def test_feature_concatenator_drops_intersection(mocker): fp_features = np.zeros((3, 2000)) fp_indices = np.array([0, 1, 3]) mocker.patch.object( - FingerprintFeaturizer, "featurize", return_value=(fp_features, fp_indices) + FingerprintFeaturizer, "featurize", autospec=True, return_value=(fp_features, fp_indices) ) smiles = ["SMI0", "SMI1", "SMI2", "SMI3"] diff --git a/openadmet/models/tests/unit/inference/test_inference.py b/openadmet/models/tests/unit/inference/test_inference.py index 7a4c369c..51173861 100644 --- a/openadmet/models/tests/unit/inference/test_inference.py +++ b/openadmet/models/tests/unit/inference/test_inference.py @@ -111,6 +111,7 @@ def test_predict_with_real_single_model( mock_loader = mocker.patch.object( inference_module, "load_anvil_model_and_metadata", + autospec=True, return_value=( trained_single_model, real_featurizer, @@ -152,6 +153,7 @@ def test_predict_with_real_ensemble_and_acquisition( mock_loader = mocker.patch.object( inference_module, "load_anvil_model_and_metadata", + autospec=True, return_value=( trained_ensemble, real_featurizer, diff --git a/openadmet/models/tests/unit/models/test_base.py b/openadmet/models/tests/unit/models/test_base.py index 05801e3b..d7b3f766 100644 --- a/openadmet/models/tests/unit/models/test_base.py +++ b/openadmet/models/tests/unit/models/test_base.py @@ -1,5 +1,4 @@ from types import SimpleNamespace -from unittest.mock import Mock import pytest @@ -48,12 +47,18 @@ def test_save_load_torch_model(mclass, tmp_path): loaded_model.load(tmp_path / "test_model.pth") -def test_lightning_model_load_uses_weights_only(monkeypatch, tmp_path): +def test_lightning_model_load_uses_weights_only(mocker, tmp_path): + """ + Verify that LightningModelBase.load calls torch.load with weights_only=True. + + We mock torch.load to avoid requiring a real .pth file and to capture the call arguments. + We mock estimator.load_state_dict because both calls are in a single expression in the + implementation, making it impossible to test one without the other. + """ state_dict = {"layer.weight": "dummy"} - torch_load = Mock(return_value=state_dict) - monkeypatch.setattr(model_base.torch, "load", torch_load) + torch_load = mocker.patch.object(model_base.torch, "load", return_value=state_dict) - estimator = Mock() + estimator = mocker.Mock() model = SimpleNamespace(estimator=estimator) path = tmp_path / "test_model.pth" From 5eed81a4eb7967c2128061d00d7d407bb29a4ce6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 16 May 2026 19:04:26 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openadmet/models/tests/unit/cli/test_cli.py | 13 ++++++++++--- .../models/tests/unit/features/test_features.py | 10 ++++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/openadmet/models/tests/unit/cli/test_cli.py b/openadmet/models/tests/unit/cli/test_cli.py index cff115d6..1160a44e 100644 --- a/openadmet/models/tests/unit/cli/test_cli.py +++ b/openadmet/models/tests/unit/cli/test_cli.py @@ -43,7 +43,9 @@ def test_predict_cli_invokes_inference(tmp_path, runner, mocker): model_dir = tmp_path / "model_dir" model_dir.mkdir() - mock_inference = mocker.patch.object(predict_cli_module, "inference_func", autospec=True) + mock_inference = mocker.patch.object( + predict_cli_module, "inference_func", autospec=True + ) result = runner.invoke( cli, @@ -77,9 +79,14 @@ def test_anvil_cli_invokes_workflow(tmp_path, runner, mocker): We mock the `AnvilSpecification` and workflow execution to verify that the CLI correctly handles recipe paths and output directories without actually running a full ML training job. """ - mock_spec = mocker.create_autospec(anvil_cli_module.AnvilSpecification, instance=True) + mock_spec = mocker.create_autospec( + anvil_cli_module.AnvilSpecification, instance=True + ) mock_from_recipe = mocker.patch.object( - anvil_cli_module.AnvilSpecification, "from_recipe", autospec=True, return_value=mock_spec + anvil_cli_module.AnvilSpecification, + "from_recipe", + autospec=True, + return_value=mock_spec, ) result = runner.invoke( diff --git a/openadmet/models/tests/unit/features/test_features.py b/openadmet/models/tests/unit/features/test_features.py index 0a8b1446..1fa7b224 100644 --- a/openadmet/models/tests/unit/features/test_features.py +++ b/openadmet/models/tests/unit/features/test_features.py @@ -113,7 +113,10 @@ def test_feature_concatenator_drops_intersection(mocker): desc_features = np.zeros((3, 1613)) desc_indices = np.array([0, 2, 3]) mocker.patch.object( - DescriptorFeaturizer, "featurize", autospec=True, return_value=(desc_features, desc_indices) + DescriptorFeaturizer, + "featurize", + autospec=True, + return_value=(desc_features, desc_indices), ) # Mock fingerprint featurizer to return 3 valid outputs (fails on index 2) @@ -122,7 +125,10 @@ def test_feature_concatenator_drops_intersection(mocker): fp_features = np.zeros((3, 2000)) fp_indices = np.array([0, 1, 3]) mocker.patch.object( - FingerprintFeaturizer, "featurize", autospec=True, return_value=(fp_features, fp_indices) + FingerprintFeaturizer, + "featurize", + autospec=True, + return_value=(fp_features, fp_indices), ) smiles = ["SMI0", "SMI1", "SMI2", "SMI3"]