fireflyframework · ancongui · Jun 25, 2026 · Jun 25, 2026
diff --git a/docs/automl.md b/docs/automl.md
@@ -100,6 +100,21 @@ the `SklearnMetricsEvaluator`, and the `DefaultSearchPolicy`. A `validator` and
 unless you supply them — when present, the validator runs first and raises on failure, and the tracker
 logs the winner's params, CV score, and model artifact.
 
+!!! tip "`cv` accepts a splitter, not just a fold count"
+
+    `cv` is passed straight to scikit-learn's `cross_val_score`, so beyond an `int` you can hand it any
+    splitter to control *how* folds are drawn — and to avoid silent leakage:
+
+    ```python
+    from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedKFold
+
+    AutoML(cv=TimeSeriesSplit(n_splits=5))                        # temporal data: forward-chaining, no future leakage
+    AutoML(cv=StratifiedKFold(5, shuffle=True, random_state=42))  # explicit stratification + reproducible shuffling
+    AutoML(cv=GroupKFold(n_splits=5))                             # grouped data: no group spans train and test (requires `groups`)
+    ```
+
+    The same splitter drives every candidate's cross-validation, so the leaderboard stays comparable (seed any shuffling splitter).
+
 ### Wiring it up
 
 === "Imperative / notebook"
@@ -185,8 +200,9 @@ Both policies return a `SearchResult(best_params, best_score, n_trials)`. The se
 The default `SklearnMetricsEvaluator` (`name="sklearn"`) supplies CV scoring names and a panel of held-out
 metrics:
 
-- **Classification**: `accuracy`, `f1` (weighted), `precision` (weighted), `recall` (weighted), plus `roc_auc`
-  and `log_loss` when probabilities are available.
+- **Classification**: `accuracy`, `f1` (weighted), `precision` (weighted), `recall` (weighted), plus
+  `roc_auc` and `log_loss` when probabilities are available. For **binary** tasks the panel also reports
+  `average_precision` (PR-AUC) and `brier_score` (probability quality / calibration).
 - **Regression**: `rmse`, `mae`, `r2`.
 
 ```python
@@ -201,6 +217,16 @@ The leaderboard and CV objective use the *scoring* name, not the raw metric: `f1
 multiclass `roc_auc` becomes `roc_auc_ovr_weighted`. This is why CV scores are always maximized — a lower
 RMSE shows up as a larger (less negative) `neg_root_mean_squared_error`.
 
+!!! tip "Select on PR-AUC for imbalanced binary problems"
+
+    ROC-AUC over-credits a classifier on heavily imbalanced data. Pass `metric="average_precision"` to
+    select the winner on **PR-AUC** instead — it is a first-class CV scorer, so the leaderboard, the
+    refit winner, and `result.cv_scoring` all reflect it:
+
+    ```python
+    result = AutoML().fit(train, metric="average_precision")   # winner chosen by PR-AUC, not accuracy
+    ```
+
 !!! tip "Two scores, one winner"
 
     `result.metric` is the human-facing metric name (e.g. `roc_auc`); `result.cv_scoring` is the sklearn

diff --git a/src/fireflyframework_datascience/automl/facade.py b/src/fireflyframework_datascience/automl/facade.py
@@ -46,7 +46,7 @@ def __init__(
         ensemble: bool = False,
         ensemble_size: int = 3,
         ensemble_impl: EnsemblePort | None = None,
-        cv: int = 5,
+        cv: int | Any = 5,
         n_trials: int = 20,
         random_state: int = 42,
     ) -> None:

diff --git a/src/fireflyframework_datascience/evaluation/adapters.py b/src/fireflyframework_datascience/evaluation/adapters.py
@@ -19,6 +19,8 @@
     "f1": "f1_weighted",
     "roc_auc": "roc_auc",
     "roc_auc_ovr": "roc_auc_ovr_weighted",
+    # PR-AUC: the right selection target for imbalanced binary problems (where ROC-AUC over-credits).
+    "average_precision": "average_precision",
     "rmse": "neg_root_mean_squared_error",
     "mae": "neg_mean_absolute_error",
     "r2": "r2",

diff --git a/tests/test_cv_and_prauc_selection.py b/tests/test_cv_and_prauc_selection.py
@@ -0,0 +1,72 @@
+# Copyright 2026 Firefly Software Foundation.
+"""Robust model selection: select on PR-AUC, and cross-validate with a custom splitter.
+
+Two gaps these tests close (real data — breast_cancer — no fakes):
+  1. ``average_precision`` (PR-AUC) is reported on holdout but was NOT a selectable CV metric:
+     ``fit(metric="average_precision")`` silently fell back to selecting on accuracy. On imbalanced
+     binary problems that picks the wrong winner.
+  2. ``AutoML(cv=...)`` must accept a scikit-learn splitter (TimeSeriesSplit for temporal data,
+     StratifiedKFold for explicit control), not only an int — so users can avoid silent leakage.
+"""
+
+from __future__ import annotations
+
+
+def test_pr_auc_is_a_selectable_cv_metric() -> None:
+    from fireflyframework_datascience.core.types import TaskType
+    from fireflyframework_datascience.evaluation.adapters import SklearnMetricsEvaluator
+
+    ev = SklearnMetricsEvaluator()
+    # Evaluator-level guard: PR-AUC must resolve to the `average_precision` scoring name (maximized), not accuracy.
+    assert ev.scoring_name(TaskType.BINARY, "average_precision") == "average_precision"
+    assert ev.greater_is_better("average_precision") is True
+
+
+def test_automl_selects_on_pr_auc() -> None:
+    from fireflyframework_datascience.automl import AutoML
+    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader
+
+    ds = SklearnDatasetLoader().load("breast_cancer")
+    train, _ = ds.train_test_split(test_size=0.25, random_state=0)
+
+    result = AutoML(cv=3, n_trials=1, random_state=0).fit(train, metric="average_precision")
+
+    assert result.metric == "average_precision"
+    # The winner was selected using the PR-AUC scorer, not the accuracy fallback.
+    assert result.cv_scoring == "average_precision"
+    assert result.leaderboard[0].metric == "average_precision"
+    # Sanity bound only: accuracy is also in [0, 1], so the cv_scoring assert above is the real fallback guard.
+    assert 0.5 <= result.leaderboard[0].cv_score <= 1.0
+
+
+def test_automl_accepts_a_cv_splitter() -> None:
+    from sklearn.model_selection import StratifiedKFold
+
+    from fireflyframework_datascience.automl import AutoML
+    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader
+
+    ds = SklearnDatasetLoader().load("breast_cancer")
+    train, test = ds.train_test_split(test_size=0.25, random_state=0)
+
+    # Smoke test: a caller-supplied (seeded) splitter must be accepted and still yield a fitted, usable model.
+    splitter = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
+    result = AutoML(cv=splitter, n_trials=1, random_state=0).fit(train)
+
+    assert result.best_model is not None
+    assert result.evaluate(test).metrics["roc_auc"] > 0.95
+
+
+def test_automl_accepts_a_time_series_splitter() -> None:
+    from sklearn.model_selection import TimeSeriesSplit
+
+    from fireflyframework_datascience.automl import AutoML
+    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader
+
+    ds = SklearnDatasetLoader().load("breast_cancer")
+    train, _ = ds.train_test_split(test_size=0.25, random_state=0)
+
+    # Smoke test: a forward-chaining splitter (TimeSeriesSplit) must be accepted and give a positive CV score.
+    result = AutoML(cv=TimeSeriesSplit(n_splits=3), n_trials=1, random_state=0).fit(train)
+
+    assert result.best_model is not None
+    assert result.leaderboard[0].cv_score > 0.0