diff --git a/docs/automl.md b/docs/automl.md index 11a0f91..dd3c37c 100644 --- a/docs/automl.md +++ b/docs/automl.md @@ -100,6 +100,21 @@ the `SklearnMetricsEvaluator`, and the `DefaultSearchPolicy`. A `validator` and unless you supply them — when present, the validator runs first and raises on failure, and the tracker logs the winner's params, CV score, and model artifact. +!!! tip "`cv` accepts a splitter, not just a fold count" + + `cv` is passed straight to scikit-learn's `cross_val_score`, so beyond an `int` you can hand it any + splitter to control *how* folds are drawn — and to avoid silent leakage: + + ```python + from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedKFold + + AutoML(cv=TimeSeriesSplit(n_splits=5)) # temporal data: forward-chaining, no future leakage + AutoML(cv=StratifiedKFold(5, shuffle=True, random_state=0)) # explicit stratification + reproducible shuffling + AutoML(cv=GroupKFold(n_splits=5)) # grouped data: no group appears in both train and test + ``` + + The same splitter drives every candidate's cross-validation, so the leaderboard stays comparable. + ### Wiring it up === "Imperative / notebook" @@ -185,8 +200,9 @@ Both policies return a `SearchResult(best_params, best_score, n_trials)`. The se The default `SklearnMetricsEvaluator` (`name="sklearn"`) supplies CV scoring names and a panel of held-out metrics: -- **Classification**: `accuracy`, `f1` (weighted), `precision` (weighted), `recall` (weighted), plus `roc_auc` - and `log_loss` when probabilities are available. +- **Classification**: `accuracy`, `f1` (weighted), `precision` (weighted), `recall` (weighted), plus + `roc_auc` and `log_loss` when probabilities are available. For **binary** tasks the panel also reports + `average_precision` (PR-AUC) and `brier_score` (probability quality / calibration). - **Regression**: `rmse`, `mae`, `r2`.
```python @@ -201,6 +217,16 @@ The leaderboard and CV objective use the *scoring* name, not the raw metric: `f1 multiclass `roc_auc` becomes `roc_auc_ovr_weighted`. This is why CV scores are always maximized — a lower RMSE shows up as a larger (less negative) `neg_root_mean_squared_error`. +!!! tip "Select on PR-AUC for imbalanced binary problems" + + ROC-AUC over-credits a classifier on heavily imbalanced data. Pass `metric="average_precision"` to + select the winner on **PR-AUC** instead — it is a first-class CV scorer, so the leaderboard, the + refit winner, and `result.cv_scoring` all reflect it: + + ```python + result = AutoML().fit(train, metric="average_precision") # winner chosen by PR-AUC, not accuracy + ``` + !!! tip "Two scores, one winner" `result.metric` is the human-facing metric name (e.g. `roc_auc`); `result.cv_scoring` is the sklearn diff --git a/src/fireflyframework_datascience/automl/facade.py b/src/fireflyframework_datascience/automl/facade.py index 794b98f..90520f7 100644 --- a/src/fireflyframework_datascience/automl/facade.py +++ b/src/fireflyframework_datascience/automl/facade.py @@ -46,7 +46,7 @@ def __init__( ensemble: bool = False, ensemble_size: int = 3, ensemble_impl: EnsemblePort | None = None, - cv: int = 5, + cv: int | Any = 5, n_trials: int = 20, random_state: int = 42, ) -> None: diff --git a/src/fireflyframework_datascience/evaluation/adapters.py b/src/fireflyframework_datascience/evaluation/adapters.py index 6ce6f57..a9870dc 100644 --- a/src/fireflyframework_datascience/evaluation/adapters.py +++ b/src/fireflyframework_datascience/evaluation/adapters.py @@ -19,6 +19,8 @@ "f1": "f1_weighted", "roc_auc": "roc_auc", "roc_auc_ovr": "roc_auc_ovr_weighted", + # PR-AUC: the right selection target for imbalanced binary problems (where ROC-AUC over-credits). 
+ "average_precision": "average_precision", "rmse": "neg_root_mean_squared_error", "mae": "neg_mean_absolute_error", "r2": "r2", diff --git a/tests/test_cv_and_prauc_selection.py b/tests/test_cv_and_prauc_selection.py new file mode 100644 index 0000000..6521423 --- /dev/null +++ b/tests/test_cv_and_prauc_selection.py @@ -0,0 +1,72 @@ +# Copyright 2026 Firefly Software Foundation. +"""Robust model selection: select on PR-AUC, and cross-validate with a custom splitter. + +Two gaps these tests close (real data — breast_cancer — no fakes): + 1. ``average_precision`` (PR-AUC) is reported on holdout but was NOT a selectable CV metric: + ``fit(metric="average_precision")`` silently fell back to selecting on accuracy. On imbalanced + binary problems that picks the wrong winner. + 2. ``AutoML(cv=...)`` must accept a scikit-learn splitter (TimeSeriesSplit for temporal data, + StratifiedKFold for explicit control), not only an int — so users can avoid silent leakage. +""" + +from __future__ import annotations + + +def test_pr_auc_is_a_selectable_cv_metric() -> None: + from fireflyframework_datascience.core.types import TaskType + from fireflyframework_datascience.evaluation.adapters import SklearnMetricsEvaluator + + ev = SklearnMetricsEvaluator() + # PR-AUC must map to sklearn's average_precision scorer (greater-is-better), not fall back to accuracy.
+ assert ev.scoring_name(TaskType.BINARY, "average_precision") == "average_precision" + assert ev.greater_is_better("average_precision") is True + + +def test_automl_selects_on_pr_auc() -> None: + from fireflyframework_datascience.automl import AutoML + from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader + + ds = SklearnDatasetLoader().load("breast_cancer") + train, _ = ds.train_test_split(test_size=0.25, random_state=0) + + result = AutoML(cv=3, n_trials=1, random_state=0).fit(train, metric="average_precision") + + assert result.metric == "average_precision" + # The winner was selected using the PR-AUC scorer, not the accuracy fallback. + assert result.cv_scoring == "average_precision" + assert result.leaderboard[0].metric == "average_precision" + # PR-AUC is a score in [0, 1], as is accuracy, so this is only a sanity bound on the winner's CV score. + assert 0.5 <= result.leaderboard[0].cv_score <= 1.0 + + +def test_automl_accepts_a_cv_splitter() -> None: + from sklearn.model_selection import StratifiedKFold + + from fireflyframework_datascience.automl import AutoML + from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader + + ds = SklearnDatasetLoader().load("breast_cancer") + train, test = ds.train_test_split(test_size=0.25, random_state=0) + + # A caller-supplied splitter must flow through to cross-validation and yield a fitted, usable model.
+ splitter = StratifiedKFold(n_splits=4, shuffle=True, random_state=0) + result = AutoML(cv=splitter, n_trials=1, random_state=0).fit(train) + + assert result.best_model is not None + assert result.evaluate(test).metrics["roc_auc"] > 0.95 + + +def test_automl_accepts_a_time_series_splitter() -> None: + from sklearn.model_selection import TimeSeriesSplit + + from fireflyframework_datascience.automl import AutoML + from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader + + ds = SklearnDatasetLoader().load("breast_cancer") + train, _ = ds.train_test_split(test_size=0.25, random_state=0) + + # Forward-chaining CV must be accepted and yield a positive leaderboard score (plumbing check, not leakage). + result = AutoML(cv=TimeSeriesSplit(n_splits=3), n_trials=1, random_state=0).fit(train) + + assert result.best_model is not None + assert result.leaderboard[0].cv_score > 0.0