Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/automl.md
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,21 @@ print(report.metrics["brier_score"]) # lower is better-calibrated
default. The evaluator also reports **`average_precision`** (PR-AUC, important on imbalanced data) and
**`brier_score`** (probability quality) for binary tasks alongside `roc_auc`/`accuracy`.

## Ensembling

Single-best selection leaves accuracy on the table. Pass `ensemble=True` to stack the **top-k**
leaderboard candidates into one model via a cross-fit meta-learner:

```python
result = AutoML(ensemble=True, ensemble_size=3).fit(train)
print(result.best_model.name) # "stacking_ensemble"
print(result.best_model.params["members"]) # the base learners that were stacked
```

The winner is the estimator built by an `EnsemblePort` (default: scikit-learn `StackingClassifier`/`StackingRegressor`
over the top-`ensemble_size` trainers, with a logistic-regression / ridge meta-learner). Off by default, and applied
only when at least two candidates were evaluated; otherwise the single best model is served.
`ensemble` and `calibrate` compose (the stack can itself be calibrated). DI-resolvable via `from_context`.

## See also

- [Datasets and loaders](datasets.md) — build the `Dataset` you feed to `fit`.
Expand Down
46 changes: 35 additions & 11 deletions src/fireflyframework_datascience/automl/facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from fireflyframework_datascience.features import FeatureEngineerPort
from fireflyframework_datascience.models import Model, TrainerPort
from fireflyframework_datascience.models.calibration import CalibratorPort
from fireflyframework_datascience.models.ensemble import EnsemblePort
from fireflyframework_datascience.search import SearchPolicyPort
from fireflyframework_datascience.tracking import TrackerPort
from fireflyframework_datascience.validation import ValidatorPort
Expand All @@ -42,6 +43,9 @@ def __init__(
feature_engineer: FeatureEngineerPort | None = None,
calibrate: bool = False,
calibrator: CalibratorPort | None = None,
ensemble: bool = False,
ensemble_size: int = 3,
ensemble_impl: EnsemblePort | None = None,
cv: int = 5,
n_trials: int = 20,
random_state: int = 42,
Expand All @@ -55,6 +59,9 @@ def __init__(
self._feature_engineer = feature_engineer
self._calibrate = calibrate
self._calibrator = calibrator
self._ensemble = ensemble
self._ensemble_size = ensemble_size
self._ensemble_impl = ensemble_impl
self._cv = cv
self._n_trials = n_trials
self._random_state = random_state
Expand All @@ -73,6 +80,7 @@ def from_context(cls, context: Any, **overrides: Any) -> AutoML:
explainer=container.resolve_optional(ExplainerPort),
feature_engineer=container.resolve_optional(FeatureEngineerPort),
calibrator=container.resolve_optional(CalibratorPort),
ensemble_impl=container.resolve_optional(EnsemblePort),
**overrides,
)

Expand All @@ -99,7 +107,7 @@ def fit(self, dataset: Dataset, *, task: TaskType | None = None, metric: str | N

run = self._tracker.start_run(f"automl:{dataset.name}") if self._tracker else None
leaderboard: list[LeaderboardEntry] = []
best: tuple[float, TrainerPort, dict[str, Any]] | None = None
evaluated: list[tuple[float, TrainerPort, dict[str, Any]]] = []

for trainer in candidates:
space = trainer.param_space(task) if self._n_trials > 1 else {}
Expand All @@ -111,25 +119,23 @@ def fit(self, dataset: Dataset, *, task: TaskType | None = None, metric: str | N
)
leaderboard.append(LeaderboardEntry(trainer.name, dict(result.best_params), result.best_score, metric))
logger.info("AutoML candidate %s: cv %s=%.4f", trainer.name, metric, result.best_score)
if best is None or result.best_score > best[0]:
best = (result.best_score, trainer, dict(result.best_params))

assert best is not None
_, best_trainer, best_params = best
estimator = self._pipeline(best_trainer.make_estimator(task, best_params), dataset.X)
# Optional probability calibration of the winner (classification only): wrap + cross-fit so the
# served model's probabilities are trustworthy. Off by default; classical-first is unchanged.
evaluated.append((result.best_score, trainer, dict(result.best_params)))

evaluated.sort(key=lambda e: e[0], reverse=True)
# Pick the winner: a stacking ensemble of the top-k candidates (if ensemble=True) or the single
# best. Then optionally calibrate (classification only). Both are off by default — classical-first.
estimator, model_name, model_params = self._build_winner(evaluated, task, dataset.X)
calibrator = self._calibrator or (_default_calibrator() if self._calibrate else None)
if self._calibrate and calibrator is not None and calibrator.supports(task):
estimator = calibrator.calibrate(estimator, dataset.X, dataset.y, task)
else:
estimator.fit(dataset.X, dataset.y)
model = Model(
name=best_trainer.name,
name=model_name,
estimator=estimator,
task=task,
feature_names=list(dataset.feature_names),
params=best_params,
params=model_params,
)
leaderboard.sort(key=lambda e: e.cv_score, reverse=True)
self._track_results(run, model, leaderboard, metric)
Expand All @@ -146,6 +152,18 @@ def fit(self, dataset: Dataset, *, task: TaskType | None = None, metric: str | N

# -- internals --------------------------------------------------------

def _build_winner(
    self, evaluated: list[tuple[float, TrainerPort, dict[str, Any]]], task: TaskType, X: Any
) -> tuple[Any, str, dict[str, Any]]:
    """Build the (unfitted) estimator to serve: a top-k ensemble, or the single best candidate.

    Args:
        evaluated: ``(cv_score, trainer, best_params)`` triples, already sorted best-first.
        task: The task the estimator is built for.
        X: Feature data forwarded to ``self._pipeline`` when wrapping each estimator.

    Returns:
        ``(estimator, model_name, model_params)``. For an ensemble, ``model_name`` is the
        ensemble's ``name`` and ``model_params`` is ``{"members": [<trainer names>]}``;
        otherwise they are the best trainer's name and its tuned params.

    Raises:
        ValueError: If ``evaluated`` is empty.
    """
    if not evaluated:
        raise ValueError("AutoML has no evaluated candidates to build a model from")

    if self._ensemble:
        # Clamp to what is available. Fewer than two members is not an ensemble, and a zero or
        # negative size would otherwise slice ``evaluated`` incorrectly, so fall through instead.
        k = min(self._ensemble_size, len(evaluated))
        if k >= 2:
            ensemble = self._ensemble_impl if self._ensemble_impl is not None else _default_ensemble()
            # Honour the port contract: only build when the implementation supports the task.
            if ensemble.supports(task):
                members = evaluated[:k]
                bases = [(t.name, self._pipeline(t.make_estimator(task, p), X)) for _, t, p in members]
                return ensemble.build(bases, task), ensemble.name, {"members": [t.name for _, t, _ in members]}

    # Single-best fallback: ensembling disabled, not applicable, or unsupported for this task.
    _, trainer, params = evaluated[0]
    return self._pipeline(trainer.make_estimator(task, params), X), trainer.name, params

def _objective(self, trainer: TrainerPort, task: TaskType, dataset: Dataset, scoring: str):
from sklearn.model_selection import cross_val_score

Expand Down Expand Up @@ -232,4 +250,10 @@ def _default_calibrator() -> CalibratorPort:
return SklearnCalibrator()


def _default_ensemble() -> EnsemblePort:
    """Create the fallback ensemble used when no ``EnsemblePort`` was supplied."""
    # Imported locally, inside the function body, rather than at module import time.
    from fireflyframework_datascience.models.ensemble import StackingEnsemble

    stacker = StackingEnsemble()
    return stacker


__all__ = ["AutoML"]
53 changes: 53 additions & 0 deletions src/fireflyframework_datascience/models/ensemble.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright 2026 Firefly Software Foundation.
"""Ensembling — combine the top-k AutoML candidates into one stronger model.

Single-best selection leaves accuracy on the table; stacking the strongest candidates with a
cross-fit meta-learner is the standard last-mile lift in production AutoML. An :class:`EnsemblePort`
builds the combined estimator from the leaderboard's base learners.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable

if TYPE_CHECKING:
from fireflyframework_datascience.core.types import TaskType


@runtime_checkable
class EnsemblePort(Protocol):
    """Contract for strategies that combine named base learners into one unfitted estimator."""

    # Identifier of the ensemble strategy.
    name: str

    def supports(self, task: TaskType) -> bool:
        """Report whether this ensemble can be built for ``task``."""

    def build(self, base_estimators: list[tuple[str, Any]], task: TaskType) -> Any:
        """Return the combined, unfitted estimator for ``task`` from ``(name, estimator)`` pairs."""


class StackingEnsemble:
    """Ensemble that stacks base learners with scikit-learn's ``StackingClassifier`` / ``StackingRegressor``.

    Classification tasks get a ``LogisticRegression(max_iter=1000)`` meta-learner; every other task
    gets a ``RidgeCV`` meta-learner.
    """

    name = "stacking_ensemble"

    def __init__(self, *, cv: int = 3) -> None:
        """Store ``cv``, which is forwarded unchanged as the stacking estimator's ``cv`` argument."""
        self._cv = cv

    def supports(self, task: TaskType) -> bool:
        """Accept every task."""
        return True

    def build(self, base_estimators: list[tuple[str, Any]], task: TaskType) -> Any:
        """Return an unfitted stacking estimator over ``base_estimators`` for ``task``."""
        # scikit-learn is imported inside the branches, so only the names the task needs are loaded.
        if task.is_classification():
            from sklearn.ensemble import StackingClassifier
            from sklearn.linear_model import LogisticRegression

            stacker_cls = StackingClassifier
            meta_learner = LogisticRegression(max_iter=1000)
        else:
            from sklearn.ensemble import StackingRegressor
            from sklearn.linear_model import RidgeCV

            stacker_cls = StackingRegressor
            meta_learner = RidgeCV()
        return stacker_cls(estimators=base_estimators, final_estimator=meta_learner, cv=self._cv)


__all__ = ["EnsemblePort", "StackingEnsemble"]
39 changes: 39 additions & 0 deletions tests/test_ensemble.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright 2026 Firefly Software Foundation.
"""AutoML(ensemble=True) stacks the top-k leaderboard models into one stronger model.

Single-best selection leaves accuracy on the table; stacking the strongest candidates is the standard
last-mile lift in production AutoML. Real data (breast_cancer), no fakes.
"""

from __future__ import annotations


def test_automl_ensemble_builds_a_competitive_stack() -> None:
    """With ``ensemble=True`` the served model is a stacking classifier that scores well on holdout."""
    from sklearn.ensemble import StackingClassifier

    from fireflyframework_datascience.automl import AutoML
    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader

    dataset = SklearnDatasetLoader().load("breast_cancer")
    train_split, holdout = dataset.train_test_split(test_size=0.25, random_state=0)

    automl = AutoML(cv=3, n_trials=1, random_state=0, ensemble=True)
    fitted = automl.fit(train_split)

    assert fitted.best_model.name == "stacking_ensemble"
    assert isinstance(fitted.best_model.estimator, StackingClassifier)
    # at least two base learners were stacked, and the stack holds up on unseen data
    stacked_members = fitted.best_model.params["members"]
    assert len(stacked_members) >= 2
    holdout_metrics = fitted.evaluate(holdout).metrics
    assert holdout_metrics["roc_auc"] > 0.95


def test_automl_without_ensemble_is_single_model() -> None:
    """Without ``ensemble=True`` the served model is not a stacking ensemble."""
    from sklearn.ensemble import StackingClassifier

    from fireflyframework_datascience.automl import AutoML
    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader

    dataset = SklearnDatasetLoader().load("breast_cancer")
    train_split, _ = dataset.train_test_split(test_size=0.25, random_state=0)

    automl = AutoML(cv=3, n_trials=1, random_state=0)
    fitted = automl.fit(train_split)

    assert not isinstance(fitted.best_model.estimator, StackingClassifier)
    assert fitted.best_model.name != "stacking_ensemble"
Loading