diff --git a/CHANGELOG.md b/CHANGELOG.md index c9b8bc3d..dc216b8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ Changes: membership thresholds. A sample is predicted as a member when its observed score exceeds the predicted threshold at quantile level (1 - alpha). No shadow models or architecture knowledge required. Registered in the attack factory as `"qmia"`. +* Refactor: move `InstanceBasedAttack._unwrap_model` to `sacroml.attacks.utils` as the + public function `unwrap_model` so it can be reused by other attacks that need to split + a scikit-learn `Pipeline` into its final estimator and preprocessing stages + ([#455](https://github.com/AI-SDC/SACRO-ML/issues/455)). ## Version 1.4.3 (Jan 29, 2026) diff --git a/sacroml/attacks/instance_based_attack.py b/sacroml/attacks/instance_based_attack.py index fc96bbeb..ea981f43 100644 --- a/sacroml/attacks/instance_based_attack.py +++ b/sacroml/attacks/instance_based_attack.py @@ -18,18 +18,13 @@ import numpy as np from fpdf import FPDF -from sklearn.base import BaseEstimator from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from sklearn.svm import SVC, SVR, NuSVC, NuSVR, OneClassSVM -try: - from sklearn.pipeline import Pipeline -except ImportError: # pragma: no cover - Pipeline = None - from sacroml.attacks import report from sacroml.attacks.attack import Attack from sacroml.attacks.target import Target +from sacroml.attacks.utils import unwrap_model logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -154,24 +149,6 @@ def attackable(cls, target: Target) -> bool: return False return True - @staticmethod - def _unwrap_model(model: BaseEstimator) -> tuple[BaseEstimator, Pipeline | None]: - """Extract the final estimator and preprocessor from a Pipeline. 
- - Returns - ------- - tuple - (final_estimator, preprocessor_pipeline_or_None) - If the model is a Pipeline with preprocessing steps, returns - a Pipeline of just the preprocessing steps so X_train can be - transformed to the same space as the stored instances. - """ - if Pipeline is not None and isinstance(model, Pipeline): - final_estimator = model.steps[-1][1] - preprocessor = Pipeline(model.steps[:-1]) if len(model.steps) > 1 else None - return final_estimator, preprocessor - return model, None - def _compare_instances( self, stored_instances: np.ndarray, @@ -289,7 +266,7 @@ def _attack(self, target: Target) -> dict: dict Attack report dictionary. """ - raw_model, preprocessor = self._unwrap_model(target.model.model) + raw_model, preprocessor = unwrap_model(target.model.model) model_type = type(raw_model).__name__ is_svm = isinstance(raw_model, SVM_TYPES) diff --git a/sacroml/attacks/utils.py b/sacroml/attacks/utils.py index 9543f1bc..79301267 100644 --- a/sacroml/attacks/utils.py +++ b/sacroml/attacks/utils.py @@ -10,6 +10,7 @@ import numpy as np from scipy.stats import shapiro from sklearn.base import BaseEstimator +from sklearn.pipeline import Pipeline from sacroml.attacks.model import Model from sacroml.attacks.target import Target @@ -278,3 +279,35 @@ def get_class_by_name(class_path: str) -> type[object]: module_path, class_name = class_path.rsplit(".", 1) module = importlib.import_module(module_path) return getattr(module, class_name) + + +def unwrap_model( + model: BaseEstimator, +) -> tuple[BaseEstimator, Pipeline | None]: + """Extract the final estimator and preprocessor from a scikit-learn model. + + If ``model`` is a :class:`sklearn.pipeline.Pipeline`, the final step is + returned as the estimator and a new ``Pipeline`` containing the remaining + earlier steps is returned as the preprocessor. This allows callers to + transform inputs into the same feature space the final estimator was + fitted on. 
If the Pipeline has only one step, no preprocessor exists and + ``None`` is returned in its place. Non-Pipeline models are returned + unchanged with ``None`` as the preprocessor. + + Parameters + ---------- + model : BaseEstimator + A fitted scikit-learn estimator, optionally wrapped in a ``Pipeline``. + + Returns + ------- + tuple[BaseEstimator, Pipeline | None] + ``(final_estimator, preprocessor)`` where ``preprocessor`` is a + Pipeline of all steps except the last, or ``None`` if the input is + not a Pipeline or is a single-step Pipeline. + """ + if isinstance(model, Pipeline): + final_estimator = model.steps[-1][1] + preprocessor = Pipeline(model.steps[:-1]) if len(model.steps) > 1 else None + return final_estimator, preprocessor + return model, None diff --git a/tests/attacks/test_utils.py b/tests/attacks/test_utils.py new file mode 100644 index 00000000..8364cbba --- /dev/null +++ b/tests/attacks/test_utils.py @@ -0,0 +1,58 @@ +"""Tests for sacroml.attacks.utils helper functions.""" + +from __future__ import annotations + +import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC + +from sacroml.attacks.utils import unwrap_model + + +class TestUnwrapModel: + """Tests for ``unwrap_model``.""" + + def test_non_pipeline_returns_model_and_none(self): + """A plain estimator is returned unchanged with no preprocessor.""" + model = SVC(gamma=0.1) + estimator, preprocessor = unwrap_model(model) + assert estimator is model + assert preprocessor is None + + def test_single_step_pipeline_returns_final_step_only(self): + """A one-step Pipeline yields its final estimator and no preprocessor.""" + final = LogisticRegression() + pipe = Pipeline([("clf", final)]) + estimator, preprocessor = unwrap_model(pipe) + assert estimator is final + assert preprocessor is None + + def test_multi_step_pipeline_splits_preprocessor_from_estimator(self): + 
"""A multi-step Pipeline yields the final step and a Pipeline of the rest.""" + scaler = StandardScaler() + final = LogisticRegression() + pipe = Pipeline([("scaler", scaler), ("clf", final)]) + + estimator, preprocessor = unwrap_model(pipe) + + assert estimator is final + assert isinstance(preprocessor, Pipeline) + assert [name for name, _ in preprocessor.steps] == ["scaler"] + assert preprocessor.steps[0][1] is scaler + + def test_multi_step_preprocessor_transforms_input(self): + """The returned preprocessor can transform inputs end-to-end.""" + rng = np.random.default_rng(0) + X = rng.normal(size=(20, 3)) + y = rng.integers(0, 2, size=20) + + pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())]) + pipe.fit(X, y) + + _, preprocessor = unwrap_model(pipe) + transformed = preprocessor.transform(X) + + np.testing.assert_allclose(transformed.mean(axis=0), 0, atol=1e-8) + np.testing.assert_allclose(transformed.std(axis=0), 1, atol=1e-1)