diff --git a/docs/automl.md b/docs/automl.md index cc680f6..11a0f91 100644 --- a/docs/automl.md +++ b/docs/automl.md @@ -262,6 +262,21 @@ print(report.metrics["brier_score"]) # lower is better-calibrated default. The evaluator also reports **`average_precision`** (PR-AUC, important on imbalanced data) and **`brier_score`** (probability quality) for binary tasks alongside `roc_auc`/`accuracy`. +## Ensembling + +Single-best selection leaves accuracy on the table. Pass `ensemble=True` to stack the **top-k** +leaderboard candidates into one model via a cross-fit meta-learner: + +```python +result = AutoML(ensemble=True, ensemble_size=3).fit(train) +print(result.best_model.name) # "stacking_ensemble" +print(result.best_model.params["members"]) # the base learners that were stacked +``` + +The winner is built by an `EnsemblePort` (default: `StackingEnsemble`, i.e. scikit-learn `StackingClassifier`/`StackingRegressor` +over the top-`ensemble_size` trainers, with a logistic / ridge meta-learner). Off by default; +`ensemble` and `calibrate` compose (the stack can itself be calibrated). With fewer than two evaluated candidates the single best model is served instead. The `EnsemblePort` is DI-resolvable via `from_context`. + ## See also - [Datasets and loaders](datasets.md) — build the `Dataset` you feed to `fit`. 
diff --git a/src/fireflyframework_datascience/automl/facade.py b/src/fireflyframework_datascience/automl/facade.py index 281b0a1..794b98f 100644 --- a/src/fireflyframework_datascience/automl/facade.py +++ b/src/fireflyframework_datascience/automl/facade.py @@ -20,6 +20,7 @@ from fireflyframework_datascience.features import FeatureEngineerPort from fireflyframework_datascience.models import Model, TrainerPort from fireflyframework_datascience.models.calibration import CalibratorPort +from fireflyframework_datascience.models.ensemble import EnsemblePort from fireflyframework_datascience.search import SearchPolicyPort from fireflyframework_datascience.tracking import TrackerPort from fireflyframework_datascience.validation import ValidatorPort @@ -42,6 +43,9 @@ def __init__( feature_engineer: FeatureEngineerPort | None = None, calibrate: bool = False, calibrator: CalibratorPort | None = None, + ensemble: bool = False, + ensemble_size: int = 3, + ensemble_impl: EnsemblePort | None = None, cv: int = 5, n_trials: int = 20, random_state: int = 42, @@ -55,6 +59,9 @@ def __init__( self._feature_engineer = feature_engineer self._calibrate = calibrate self._calibrator = calibrator + self._ensemble = ensemble + self._ensemble_size = ensemble_size + self._ensemble_impl = ensemble_impl self._cv = cv self._n_trials = n_trials self._random_state = random_state @@ -73,6 +80,7 @@ def from_context(cls, context: Any, **overrides: Any) -> AutoML: explainer=container.resolve_optional(ExplainerPort), feature_engineer=container.resolve_optional(FeatureEngineerPort), calibrator=container.resolve_optional(CalibratorPort), + ensemble_impl=container.resolve_optional(EnsemblePort), **overrides, ) @@ -99,7 +107,7 @@ def fit(self, dataset: Dataset, *, task: TaskType | None = None, metric: str | N run = self._tracker.start_run(f"automl:{dataset.name}") if self._tracker else None leaderboard: list[LeaderboardEntry] = [] - best: tuple[float, TrainerPort, dict[str, Any]] | None = None + evaluated: 
list[tuple[float, TrainerPort, dict[str, Any]]] = [] for trainer in candidates: space = trainer.param_space(task) if self._n_trials > 1 else {} @@ -111,25 +119,23 @@ def fit(self, dataset: Dataset, *, task: TaskType | None = None, metric: str | N ) leaderboard.append(LeaderboardEntry(trainer.name, dict(result.best_params), result.best_score, metric)) logger.info("AutoML candidate %s: cv %s=%.4f", trainer.name, metric, result.best_score) - if best is None or result.best_score > best[0]: - best = (result.best_score, trainer, dict(result.best_params)) - - assert best is not None - _, best_trainer, best_params = best - estimator = self._pipeline(best_trainer.make_estimator(task, best_params), dataset.X) - # Optional probability calibration of the winner (classification only): wrap + cross-fit so the - # served model's probabilities are trustworthy. Off by default; classical-first is unchanged. + evaluated.append((result.best_score, trainer, dict(result.best_params))) + + evaluated.sort(key=lambda e: e[0], reverse=True) + # Pick the winner: a stacking ensemble of the top-k candidates (if ensemble=True) or the single + # best. Then optionally calibrate (classification only). Both are off by default — classical-first. 
def _build_winner(
    self, evaluated: list[tuple[float, TrainerPort, dict[str, Any]]], task: TaskType, X: Any
) -> tuple[Any, str, dict[str, Any]]:
    """Return the unfitted estimator to serve, with the name and params recorded on the model.

    A stacking ensemble of the top-k candidates is built only when ensembling is enabled,
    at least two members would be stacked, and the ensemble implementation supports
    ``task``. In every other case the single best candidate is returned.

    Args:
        evaluated: ``(cv_score, trainer, best_params)`` per candidate. ``fit`` sorts this
            list best-first before calling, so index 0 is the single-best candidate.
        task: Task the estimators are created for.
        X: Feature matrix, forwarded to ``self._pipeline`` when wrapping each estimator.

    Returns:
        ``(estimator, name, params)``. For a stack, ``name`` is the ensemble's ``name`` and
        ``params`` is ``{"members": [<trainer names>]}``; otherwise they are the best
        trainer's name and its best params.

    Raises:
        ValueError: If ``evaluated`` is empty (there is nothing to serve).
    """
    if not evaluated:
        raise ValueError("AutoML has no evaluated candidates to build a model from")

    if self._ensemble:
        # Clamp at 0 so a negative ensemble_size cannot turn into an "all but the last n" slice.
        top = evaluated[: max(self._ensemble_size, 0)]
        # A stack needs at least two members; ensemble_size <= 1 falls back to the single best.
        if len(top) >= 2:
            ensemble = self._ensemble_impl if self._ensemble_impl is not None else _default_ensemble()
            # Mirror the calibrator path: silently skip an implementation that rejects the task.
            if ensemble.supports(task):
                bases = [(t.name, self._pipeline(t.make_estimator(task, p), X)) for _, t, p in top]
                return ensemble.build(bases, task), ensemble.name, {"members": [name for name, _ in bases]}

    _, trainer, params = evaluated[0]
    return self._pipeline(trainer.make_estimator(task, params), X), trainer.name, params
@runtime_checkable
class EnsemblePort(Protocol):
    """Port for strategies that combine named base learners into one (unfitted) estimator."""

    # Identifier of the ensembling strategy.
    name: str

    def supports(self, task: TaskType) -> bool:
        """Return whether this strategy can build an ensemble for ``task``."""

    def build(self, base_estimators: list[tuple[str, Any]], task: TaskType) -> Any:
        """Return the combined estimator for ``task`` from ``(name, estimator)`` pairs."""
class StackingEnsemble:
    """Stack base learners with scikit-learn ``StackingClassifier`` / ``StackingRegressor``.

    Classification tasks get a ``LogisticRegression(max_iter=1000)`` meta-learner; every
    other task gets ``RidgeCV``. ``cv`` is forwarded to the stacking estimator.
    """

    name = "stacking_ensemble"

    def __init__(self, *, cv: int = 3) -> None:
        """Store ``cv``, which ``build`` passes unchanged as the stacking estimator's ``cv``."""
        self._cv = cv

    def supports(self, task: TaskType) -> bool:
        """Report every task as supported (``build`` treats non-classification as regression)."""
        return True

    def build(self, base_estimators: list[tuple[str, Any]], task: TaskType) -> Any:
        """Return an unfitted stacking estimator over ``base_estimators``.

        Args:
            base_estimators: ``(name, estimator)`` pairs; names must be unique.
            task: Selects the classifier or the regressor variant via ``is_classification()``.

        Returns:
            A ``StackingClassifier`` when ``task.is_classification()`` is true, otherwise a
            ``StackingRegressor``.

        Raises:
            ValueError: If ``base_estimators`` is empty or contains duplicate names. Both are
                rejected here rather than later inside scikit-learn's ``fit``.
        """
        # Copy so later mutation of the caller's list cannot change the built estimator.
        estimators = list(base_estimators)
        if not estimators:
            raise ValueError("StackingEnsemble.build requires at least one base estimator")
        names = [est_name for est_name, _ in estimators]
        if len(set(names)) != len(names):
            raise ValueError(f"StackingEnsemble.build requires unique base estimator names, got {names!r}")

        # scikit-learn is imported lazily, as in the original, so importing this module stays cheap.
        if task.is_classification():
            from sklearn.ensemble import StackingClassifier
            from sklearn.linear_model import LogisticRegression

            return StackingClassifier(
                estimators=estimators, final_estimator=LogisticRegression(max_iter=1000), cv=self._cv
            )
        from sklearn.ensemble import StackingRegressor
        from sklearn.linear_model import RidgeCV

        return StackingRegressor(estimators=estimators, final_estimator=RidgeCV(), cv=self._cv)
def test_automl_ensemble_builds_a_competitive_stack() -> None:
    """With ``ensemble=True`` the served model is a stacking classifier over >= 2 members."""
    from sklearn.ensemble import StackingClassifier

    from fireflyframework_datascience.automl import AutoML
    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader

    dataset = SklearnDatasetLoader().load("breast_cancer")
    train, holdout = dataset.train_test_split(test_size=0.25, random_state=0)

    automl = AutoML(cv=3, n_trials=1, random_state=0, ensemble=True)
    result = automl.fit(train)

    winner = result.best_model
    assert winner.name == "stacking_ensemble"
    assert isinstance(winner.estimator, StackingClassifier)
    # The stack must combine at least two base learners and stay competitive on the holdout split.
    members = result.best_model.params["members"]
    assert len(members) >= 2
    holdout_metrics = result.evaluate(holdout).metrics
    assert holdout_metrics["roc_auc"] > 0.95


def test_automl_without_ensemble_is_single_model() -> None:
    """Without ``ensemble=True`` the served model is a single, non-stacked estimator."""
    from sklearn.ensemble import StackingClassifier

    from fireflyframework_datascience.automl import AutoML
    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader

    dataset = SklearnDatasetLoader().load("breast_cancer")
    train, _ = dataset.train_test_split(test_size=0.25, random_state=0)

    automl = AutoML(cv=3, n_trials=1, random_state=0)
    result = automl.fit(train)

    winner = result.best_model
    assert not isinstance(winner.estimator, StackingClassifier)
    assert result.best_model.name != "stacking_ensemble"