Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions docs/automl.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,21 @@ the `SklearnMetricsEvaluator`, and the `DefaultSearchPolicy`. A `validator` and
unless you supply them — when present, the validator runs first and raises on failure, and the tracker
logs the winner's params, CV score, and model artifact.

!!! tip "`cv` accepts a splitter, not just a fold count"

`cv` is passed straight to scikit-learn's `cross_val_score`, so beyond an `int` you can hand it any
splitter to control *how* folds are drawn — and to avoid silent leakage:

```python
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedKFold

AutoML(cv=TimeSeriesSplit(n_splits=5)) # temporal data: forward-chaining, no future leakage
AutoML(cv=StratifiedKFold(5, shuffle=True)) # explicit stratification + shuffling control
AutoML(cv=GroupKFold(n_splits=5)) # grouped data: a group never appears in both train and test
```

The same splitter drives every candidate's cross-validation, so the leaderboard stays comparable.

### Wiring it up

=== "Imperative / notebook"
Expand Down Expand Up @@ -185,8 +200,9 @@ Both policies return a `SearchResult(best_params, best_score, n_trials)`. The se
The default `SklearnMetricsEvaluator` (`name="sklearn"`) supplies CV scoring names and a panel of held-out
metrics:

- **Classification**: `accuracy`, `f1` (weighted), `precision` (weighted), `recall` (weighted), plus `roc_auc`
and `log_loss` when probabilities are available.
- **Classification**: `accuracy`, `f1` (weighted), `precision` (weighted), `recall` (weighted), plus
`roc_auc` and `log_loss` when probabilities are available. For **binary** tasks the panel also reports
`average_precision` (PR-AUC) and `brier_score` (probability quality / calibration).
- **Regression**: `rmse`, `mae`, `r2`.

```python
Expand All @@ -201,6 +217,16 @@ The leaderboard and CV objective use the *scoring* name, not the raw metric: `f1
multiclass `roc_auc` becomes `roc_auc_ovr_weighted`. This is why CV scores are always maximized — a lower
RMSE shows up as a larger (less negative) `neg_root_mean_squared_error`.

!!! tip "Select on PR-AUC for imbalanced binary problems"

ROC-AUC can over-credit a classifier on heavily imbalanced data, because the large negative class keeps
the false-positive rate low even when many positive predictions are wrong. Pass
`metric="average_precision"` to select the winner on **PR-AUC** instead — it is a first-class CV scorer,
so the leaderboard, the refit winner, and `result.cv_scoring` all reflect it:

```python
result = AutoML().fit(train, metric="average_precision") # winner chosen by PR-AUC, not accuracy
```

!!! tip "Two scores, one winner"

`result.metric` is the human-facing metric name (e.g. `roc_auc`); `result.cv_scoring` is the sklearn
Expand Down
2 changes: 1 addition & 1 deletion src/fireflyframework_datascience/automl/facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(
ensemble: bool = False,
ensemble_size: int = 3,
ensemble_impl: EnsemblePort | None = None,
cv: int = 5,
cv: int | Any = 5,
n_trials: int = 20,
random_state: int = 42,
) -> None:
Expand Down
2 changes: 2 additions & 0 deletions src/fireflyframework_datascience/evaluation/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
"f1": "f1_weighted",
"roc_auc": "roc_auc",
"roc_auc_ovr": "roc_auc_ovr_weighted",
# PR-AUC: the right selection target for imbalanced binary problems (where ROC-AUC over-credits).
"average_precision": "average_precision",
"rmse": "neg_root_mean_squared_error",
"mae": "neg_mean_absolute_error",
"r2": "r2",
Expand Down
72 changes: 72 additions & 0 deletions tests/test_cv_and_prauc_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright 2026 Firefly Software Foundation.
"""Robust model selection: select on PR-AUC, and cross-validate with a custom splitter.

Two gaps these tests close (real data — breast_cancer — no fakes):
1. ``average_precision`` (PR-AUC) is reported on holdout but was NOT a selectable CV metric:
``fit(metric="average_precision")`` silently fell back to selecting on accuracy. On imbalanced
binary problems that picks the wrong winner.
2. ``AutoML(cv=...)`` must accept a scikit-learn splitter (TimeSeriesSplit for temporal data,
StratifiedKFold for explicit control), not only an int — so users can avoid silent leakage.
"""

from __future__ import annotations


def test_pr_auc_is_a_selectable_cv_metric() -> None:
    """Check that ``average_precision`` resolves to its own greater-is-better CV scorer."""
    from fireflyframework_datascience.core.types import TaskType
    from fireflyframework_datascience.evaluation.adapters import SklearnMetricsEvaluator

    evaluator = SklearnMetricsEvaluator()
    metric_name = "average_precision"

    # The metric must map onto sklearn's own average_precision scorer rather than
    # falling back to accuracy.
    resolved_scorer = evaluator.scoring_name(TaskType.BINARY, metric_name)
    assert resolved_scorer == "average_precision"

    # Higher PR-AUC is better, so the evaluator must report it as a maximized metric.
    assert evaluator.greater_is_better(metric_name) is True


def test_automl_selects_on_pr_auc() -> None:
    """Fit AutoML with ``metric="average_precision"`` and check the winner was picked on PR-AUC."""
    from fireflyframework_datascience.automl import AutoML
    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader

    dataset = SklearnDatasetLoader().load("breast_cancer")
    train_split, _ = dataset.train_test_split(test_size=0.25, random_state=0)

    automl = AutoML(cv=3, n_trials=1, random_state=0)
    result = automl.fit(train_split, metric="average_precision")

    assert result.metric == "average_precision"
    # Selection must have used the PR-AUC scorer, not the accuracy fallback.
    assert result.cv_scoring == "average_precision"

    winner = result.leaderboard[0]
    assert winner.metric == "average_precision"
    # Sanity bound: average precision lies in [0, 1], and the winner is expected to beat 0.5 here.
    winner_score = winner.cv_score
    assert 0.5 <= winner_score <= 1.0


def test_automl_accepts_a_cv_splitter() -> None:
    """Pass a ``StratifiedKFold`` instance as ``cv`` and check AutoML still yields a usable model."""
    from sklearn.model_selection import StratifiedKFold

    from fireflyframework_datascience.automl import AutoML
    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader

    dataset = SklearnDatasetLoader().load("breast_cancer")
    train_split, holdout = dataset.train_test_split(test_size=0.25, random_state=0)

    # The caller-supplied splitter (rather than an int fold count) must be accepted
    # by the facade and carried through to cross-validation.
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
    automl = AutoML(cv=folds, n_trials=1, random_state=0)
    result = automl.fit(train_split)

    assert result.best_model is not None
    holdout_metrics = result.evaluate(holdout).metrics
    assert holdout_metrics["roc_auc"] > 0.95


def test_automl_accepts_a_time_series_splitter() -> None:
    """Pass a ``TimeSeriesSplit`` instance as ``cv`` and check a positive leaderboard score results."""
    from sklearn.model_selection import TimeSeriesSplit

    from fireflyframework_datascience.automl import AutoML
    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader

    dataset = SklearnDatasetLoader().load("breast_cancer")
    train_split, _ = dataset.train_test_split(test_size=0.25, random_state=0)

    # Forward-chaining folds (no future leakage) must be accepted just like an int fold count.
    forward_folds = TimeSeriesSplit(n_splits=3)
    automl = AutoML(cv=forward_folds, n_trials=1, random_state=0)
    result = automl.fit(train_split)

    assert result.best_model is not None
    top_score = result.leaderboard[0].cv_score
    assert top_score > 0.0
Loading