From d3ec1547160aa0c383477a971596f11df4f0ca70 Mon Sep 17 00:00:00 2001 From: Andres Contreras Date: Thu, 25 Jun 2026 20:54:08 +0200 Subject: [PATCH] feat: probability calibration + richer classification metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modeling-depth batch: - CalibratorPort + SklearnCalibrator (cross-validated CalibratedClassifierCV); opt-in AutoML(calibrate=True) wraps the winning classifier after selection so served probabilities are trustworthy. Off by default (classical-first unchanged); DI-resolvable. - Evaluator now reports average_precision (PR-AUC, key on imbalanced data) and brier_score (probability quality) for binary tasks, alongside roc_auc/accuracy. Additive — no change to defaults or CV scoring. TDD on real breast_cancer data; docs note in automl.md. --- docs/automl.md | 16 +++++++ .../automl/facade.py | 20 +++++++- .../evaluation/adapters.py | 5 +- .../models/calibration.py | 48 +++++++++++++++++++ tests/evaluation/test_richer_metrics.py | 29 +++++++++++ tests/test_calibration.py | 41 ++++++++++++++++ 6 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 src/fireflyframework_datascience/models/calibration.py create mode 100644 tests/evaluation/test_richer_metrics.py create mode 100644 tests/test_calibration.py diff --git a/docs/automl.md b/docs/automl.md index 45f87c5..cc680f6 100644 --- a/docs/automl.md +++ b/docs/automl.md @@ -246,6 +246,22 @@ Each `LeaderboardEntry` holds `model_name`, `params`, `cv_score`, and `metric`, linear roc_auc=0.9886 ``` +## Probability calibration + +Tree and boosting models often produce over-confident probabilities. 
For risk- or cost-sensitive +decisions, calibrate the winner so its probabilities are trustworthy — pass `calibrate=True`: + +```python +result = AutoML(calibrate=True).fit(train) # wraps the winner in cross-validated calibration +report = result.evaluate(test) +print(report.metrics["brier_score"]) # lower is better-calibrated +``` + +`calibrate` wraps the selected classifier in a `CalibratorPort` (default: scikit-learn +`CalibratedClassifierCV`, isotonic, cross-fit) after model selection — classification only, off by +default. The evaluator also reports **`average_precision`** (PR-AUC, important on imbalanced data) and +**`brier_score`** (probability quality) for binary tasks alongside `roc_auc`/`accuracy`. + ## See also - [Datasets and loaders](datasets.md) — build the `Dataset` you feed to `fit`. diff --git a/src/fireflyframework_datascience/automl/facade.py b/src/fireflyframework_datascience/automl/facade.py index dec7cb6..281b0a1 100644 --- a/src/fireflyframework_datascience/automl/facade.py +++ b/src/fireflyframework_datascience/automl/facade.py @@ -19,6 +19,7 @@ from fireflyframework_datascience.explainability import ExplainerPort from fireflyframework_datascience.features import FeatureEngineerPort from fireflyframework_datascience.models import Model, TrainerPort +from fireflyframework_datascience.models.calibration import CalibratorPort from fireflyframework_datascience.search import SearchPolicyPort from fireflyframework_datascience.tracking import TrackerPort from fireflyframework_datascience.validation import ValidatorPort @@ -39,6 +40,8 @@ def __init__( tracker: TrackerPort | None = None, explainer: ExplainerPort | None = None, feature_engineer: FeatureEngineerPort | None = None, + calibrate: bool = False, + calibrator: CalibratorPort | None = None, cv: int = 5, n_trials: int = 20, random_state: int = 42, @@ -50,6 +53,8 @@ def __init__( self._tracker = tracker self._explainer = explainer self._feature_engineer = feature_engineer + self._calibrate = 
calibrate + self._calibrator = calibrator self._cv = cv self._n_trials = n_trials self._random_state = random_state @@ -67,6 +72,7 @@ def from_context(cls, context: Any, **overrides: Any) -> AutoML: tracker=container.resolve_optional(TrackerPort), explainer=container.resolve_optional(ExplainerPort), feature_engineer=container.resolve_optional(FeatureEngineerPort), + calibrator=container.resolve_optional(CalibratorPort), **overrides, ) @@ -111,7 +117,13 @@ def fit(self, dataset: Dataset, *, task: TaskType | None = None, metric: str | N assert best is not None _, best_trainer, best_params = best estimator = self._pipeline(best_trainer.make_estimator(task, best_params), dataset.X) - estimator.fit(dataset.X, dataset.y) + # Optional probability calibration of the winner (classification only): wrap + cross-fit so the + # served model's probabilities are trustworthy. Off by default; classical-first is unchanged. + calibrator = self._calibrator or (_default_calibrator() if self._calibrate else None) + if self._calibrate and calibrator is not None and calibrator.supports(task): + estimator = calibrator.calibrate(estimator, dataset.X, dataset.y, task) + else: + estimator.fit(dataset.X, dataset.y) model = Model( name=best_trainer.name, estimator=estimator, @@ -214,4 +226,10 @@ def _default_search() -> SearchPolicyPort: return DefaultSearchPolicy() +def _default_calibrator() -> CalibratorPort: + from fireflyframework_datascience.models.calibration import SklearnCalibrator + + return SklearnCalibrator() + + __all__ = ["AutoML"] diff --git a/src/fireflyframework_datascience/evaluation/adapters.py b/src/fireflyframework_datascience/evaluation/adapters.py index f37f644..6ce6f57 100644 --- a/src/fireflyframework_datascience/evaluation/adapters.py +++ b/src/fireflyframework_datascience/evaluation/adapters.py @@ -70,12 +70,15 @@ def _classification(self, y_true: Any, y_pred: Any, y_proba: Any, task: TaskType return metrics def _add_proba_metrics(self, metrics: dict[str, float], 
y_true: Any, y_proba: Any, task: TaskType) -> None: - from sklearn.metrics import log_loss, roc_auc_score + from sklearn.metrics import average_precision_score, brier_score_loss, log_loss, roc_auc_score try: if task is TaskType.BINARY: positive = y_proba[:, 1] if getattr(y_proba, "ndim", 1) == 2 else y_proba metrics["roc_auc"] = float(roc_auc_score(y_true, positive)) + # PR-AUC (key on imbalanced data) and the Brier score (probability quality / calibration) + metrics["average_precision"] = float(average_precision_score(y_true, positive)) + metrics["brier_score"] = float(brier_score_loss(y_true, positive)) else: metrics["roc_auc"] = float(roc_auc_score(y_true, y_proba, multi_class="ovr", average="weighted")) metrics["log_loss"] = float(log_loss(y_true, y_proba)) diff --git a/src/fireflyframework_datascience/models/calibration.py b/src/fireflyframework_datascience/models/calibration.py new file mode 100644 index 0000000..3fc0544 --- /dev/null +++ b/src/fireflyframework_datascience/models/calibration.py @@ -0,0 +1,48 @@ +# Copyright 2026 Firefly Software Foundation. +"""Probability calibration — wrap the selected classifier so its predicted probabilities are trustworthy. + +Tree and boosting models often produce over-confident probabilities; risk- and cost-sensitive +decisions (lending, medicine) need them well-calibrated. A :class:`CalibratorPort` wraps and fits the +selected model; the default adapter uses scikit-learn's cross-validated calibration. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable + +if TYPE_CHECKING: + from fireflyframework_datascience.core.types import TaskType + + +@runtime_checkable +class CalibratorPort(Protocol): + """Calibrates a classifier's probabilities, returning a fitted, calibrated estimator.""" + + name: str + + def supports(self, task: TaskType) -> bool: ... + + def calibrate(self, estimator: Any, X: Any, y: Any, task: TaskType) -> Any: ... 
+ + +class SklearnCalibrator: + """Cross-validated calibration via scikit-learn ``CalibratedClassifierCV`` (isotonic by default).""" + + name = "sklearn_calibration" + + def __init__(self, *, method: str = "isotonic", cv: int = 3) -> None: + self._method = method + self._cv = cv + + def supports(self, task: TaskType) -> bool: + return task.is_classification() + + def calibrate(self, estimator: Any, X: Any, y: Any, task: TaskType) -> Any: + from sklearn.calibration import CalibratedClassifierCV + + calibrated = CalibratedClassifierCV(estimator, method=self._method, cv=self._cv) + calibrated.fit(X, y) + return calibrated + + +__all__ = ["CalibratorPort", "SklearnCalibrator"] diff --git a/tests/evaluation/test_richer_metrics.py b/tests/evaluation/test_richer_metrics.py new file mode 100644 index 0000000..6b85eae --- /dev/null +++ b/tests/evaluation/test_richer_metrics.py @@ -0,0 +1,29 @@ +# Copyright 2026 Firefly Software Foundation. +"""The evaluator should report PR-AUC (average precision) and the Brier score for classification. + +These matter for imbalanced data (PR-AUC) and probability quality / calibration (Brier). Additive — +they do not change defaults or CV scoring. Real computed metrics, no fakes. 
+""" + +from __future__ import annotations + +import numpy as np + +from fireflyframework_datascience.core.types import TaskType +from fireflyframework_datascience.evaluation.adapters import SklearnMetricsEvaluator + + +def test_binary_evaluation_reports_pr_auc_and_brier() -> None: + y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0]) + y_proba = np.array([[0.8, 0.2], [0.7, 0.3], [0.2, 0.8], [0.1, 0.9], [0.6, 0.4], [0.3, 0.7], [0.4, 0.6], [0.9, 0.1]]) + y_pred = y_proba.argmax(axis=1) + + result = SklearnMetricsEvaluator().evaluate(TaskType.BINARY, y_true, y_pred, y_proba, metric="roc_auc") + + assert "average_precision" in result.metrics + assert "brier_score" in result.metrics + assert 0.0 <= result.metrics["average_precision"] <= 1.0 + assert 0.0 <= result.metrics["brier_score"] <= 1.0 + # this is a strongly-separated set, so PR-AUC should be high and Brier low + assert result.metrics["average_precision"] > 0.8 + assert result.metrics["brier_score"] < 0.2 diff --git a/tests/test_calibration.py b/tests/test_calibration.py new file mode 100644 index 0000000..0e9bf29 --- /dev/null +++ b/tests/test_calibration.py @@ -0,0 +1,41 @@ +# Copyright 2026 Firefly Software Foundation. +"""AutoML(calibrate=True) wraps the winning classifier so its probabilities are trustworthy. + +Tree/boosting probabilities are often poorly calibrated; risk-/cost-sensitive decisions need +trustworthy probabilities. Real data (breast_cancer), no fakes. 
+""" + +from __future__ import annotations + + +def test_automl_calibrate_wraps_winner_with_valid_probabilities() -> None: + from sklearn.calibration import CalibratedClassifierCV + + from fireflyframework_datascience.automl import AutoML + from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader + + ds = SklearnDatasetLoader().load("breast_cancer") + train, test = ds.train_test_split(test_size=0.25, random_state=0) + + result = AutoML(cv=3, n_trials=1, random_state=0, calibrate=True).fit(train) + + # the winner is wrapped in a calibrator and still exposes valid probabilities + assert isinstance(result.best_model.estimator, CalibratedClassifierCV) + proba = result.predict_proba(test.X) + assert proba.shape[0] == test.n_rows + assert ((proba >= 0.0) & (proba <= 1.0)).all() + # calibrated probabilities are trustworthy: a finite, low Brier score on holdout + brier = result.evaluate(test).metrics["brier_score"] + assert 0.0 <= brier <= 0.25 + + +def test_automl_without_calibrate_is_unchanged() -> None: + from sklearn.calibration import CalibratedClassifierCV + + from fireflyframework_datascience.automl import AutoML + from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader + + ds = SklearnDatasetLoader().load("breast_cancer") + train, _ = ds.train_test_split(test_size=0.25, random_state=0) + result = AutoML(cv=3, n_trials=1, random_state=0).fit(train) + assert not isinstance(result.best_model.estimator, CalibratedClassifierCV)