Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions docs/automl.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,22 @@ Each `LeaderboardEntry` holds `model_name`, `params`, `cv_score`, and `metric`,
linear roc_auc=0.9886
```

## Probability calibration

Tree and boosting models often produce over-confident probabilities. For risk- or cost-sensitive
decisions, calibrate the winner so its probabilities are trustworthy — pass `calibrate=True`:

```python
result = AutoML(calibrate=True).fit(train) # wraps the winner in cross-validated calibration
report = result.evaluate(test)
print(report.metrics["brier_score"]) # lower is better-calibrated
```

`calibrate=True` hands the selected classifier to a `CalibratorPort` (default: scikit-learn
`CalibratedClassifierCV`, isotonic, cross-fit) after model selection — classification only, off by
default. A custom `calibrator` is only applied when `calibrate=True` is also set. The evaluator also
reports **`average_precision`** (PR-AUC, important on imbalanced data) and
**`brier_score`** (probability quality) for binary tasks alongside `roc_auc`/`accuracy`.

## See also

- [Datasets and loaders](datasets.md) — build the `Dataset` you feed to `fit`.
Expand Down
20 changes: 19 additions & 1 deletion src/fireflyframework_datascience/automl/facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from fireflyframework_datascience.explainability import ExplainerPort
from fireflyframework_datascience.features import FeatureEngineerPort
from fireflyframework_datascience.models import Model, TrainerPort
from fireflyframework_datascience.models.calibration import CalibratorPort
from fireflyframework_datascience.search import SearchPolicyPort
from fireflyframework_datascience.tracking import TrackerPort
from fireflyframework_datascience.validation import ValidatorPort
Expand All @@ -39,6 +40,8 @@ def __init__(
tracker: TrackerPort | None = None,
explainer: ExplainerPort | None = None,
feature_engineer: FeatureEngineerPort | None = None,
calibrate: bool = False,
calibrator: CalibratorPort | None = None,
cv: int = 5,
n_trials: int = 20,
random_state: int = 42,
Expand All @@ -50,6 +53,8 @@ def __init__(
self._tracker = tracker
self._explainer = explainer
self._feature_engineer = feature_engineer
self._calibrate = calibrate
self._calibrator = calibrator
self._cv = cv
self._n_trials = n_trials
self._random_state = random_state
Expand All @@ -67,6 +72,7 @@ def from_context(cls, context: Any, **overrides: Any) -> AutoML:
tracker=container.resolve_optional(TrackerPort),
explainer=container.resolve_optional(ExplainerPort),
feature_engineer=container.resolve_optional(FeatureEngineerPort),
calibrator=container.resolve_optional(CalibratorPort),
**overrides,
)

Expand Down Expand Up @@ -111,7 +117,13 @@ def fit(self, dataset: Dataset, *, task: TaskType | None = None, metric: str | N
assert best is not None
_, best_trainer, best_params = best
estimator = self._pipeline(best_trainer.make_estimator(task, best_params), dataset.X)
estimator.fit(dataset.X, dataset.y)
# Optional probability calibration of the winner (classification only): wrap + cross-fit so the
# served model's probabilities are trustworthy. Off by default; classical-first is unchanged.
calibrator = self._calibrator or (_default_calibrator() if self._calibrate else None)
if self._calibrate and calibrator is not None and calibrator.supports(task):
estimator = calibrator.calibrate(estimator, dataset.X, dataset.y, task)
else:
estimator.fit(dataset.X, dataset.y)
model = Model(
name=best_trainer.name,
estimator=estimator,
Expand Down Expand Up @@ -214,4 +226,10 @@ def _default_search() -> SearchPolicyPort:
return DefaultSearchPolicy()


def _default_calibrator() -> CalibratorPort:
    """Return a freshly constructed ``SklearnCalibrator`` as the default ``CalibratorPort``.

    The calibration module is imported at call time, inside the function body.
    """
    import fireflyframework_datascience.models.calibration as calibration_module

    return calibration_module.SklearnCalibrator()


__all__ = ["AutoML"]
5 changes: 4 additions & 1 deletion src/fireflyframework_datascience/evaluation/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,15 @@ def _classification(self, y_true: Any, y_pred: Any, y_proba: Any, task: TaskType
return metrics

def _add_proba_metrics(self, metrics: dict[str, float], y_true: Any, y_proba: Any, task: TaskType) -> None:
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.metrics import average_precision_score, brier_score_loss, log_loss, roc_auc_score

try:
if task is TaskType.BINARY:
positive = y_proba[:, 1] if getattr(y_proba, "ndim", 1) == 2 else y_proba
metrics["roc_auc"] = float(roc_auc_score(y_true, positive))
# PR-AUC (key on imbalanced data) and the Brier score (probability quality / calibration)
metrics["average_precision"] = float(average_precision_score(y_true, positive))
metrics["brier_score"] = float(brier_score_loss(y_true, positive))
else:
metrics["roc_auc"] = float(roc_auc_score(y_true, y_proba, multi_class="ovr", average="weighted"))
metrics["log_loss"] = float(log_loss(y_true, y_proba))
Expand Down
48 changes: 48 additions & 0 deletions src/fireflyframework_datascience/models/calibration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2026 Firefly Software Foundation.
"""Probability calibration — wrap a classifier so its predicted probabilities are trustworthy.

Tree and boosting models often produce over-confident probabilities; risk- and cost-sensitive
decisions (lending, medicine) need them well-calibrated. A :class:`CalibratorPort` post-processes the
selected model; the default adapter uses scikit-learn's cross-validated calibration.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable

if TYPE_CHECKING:
from fireflyframework_datascience.core.types import TaskType


@runtime_checkable
class CalibratorPort(Protocol):
    """Structural interface for probability calibrators.

    An implementation receives a classifier together with training data and hands back a fitted
    estimator whose predicted probabilities have been calibrated.
    """

    # Identifier of the calibrator implementation.
    name: str

    def supports(self, task: TaskType) -> bool:
        """Tell whether this calibrator can handle ``task``."""

    def calibrate(self, estimator: Any, X: Any, y: Any, task: TaskType) -> Any:
        """Calibrate ``estimator`` on ``X``/``y`` for ``task`` and return the calibrated estimator."""


class SklearnCalibrator:
    """Default calibrator backed by scikit-learn's ``CalibratedClassifierCV``.

    ``method`` and ``cv`` are forwarded unchanged to ``CalibratedClassifierCV``; the defaults are
    isotonic calibration with ``cv=3``.
    """

    name = "sklearn_calibration"

    def __init__(self, *, method: str = "isotonic", cv: int = 3) -> None:
        """Store the calibration ``method`` and the ``cv`` setting (both keyword-only)."""
        self._method, self._cv = method, cv

    def supports(self, task: TaskType) -> bool:
        """Return whatever ``task.is_classification()`` reports."""
        is_classification = task.is_classification()
        return is_classification

    def calibrate(self, estimator: Any, X: Any, y: Any, task: TaskType) -> Any:
        """Wrap ``estimator`` in ``CalibratedClassifierCV``, fit it on ``X``/``y`` and return the wrapper.

        ``task`` is part of the port signature and is not used by this implementation.
        """
        from sklearn.calibration import CalibratedClassifierCV

        options = {"method": self._method, "cv": self._cv}
        wrapper = CalibratedClassifierCV(estimator, **options)
        wrapper.fit(X, y)
        return wrapper


__all__ = ["CalibratorPort", "SklearnCalibrator"]
29 changes: 29 additions & 0 deletions tests/evaluation/test_richer_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2026 Firefly Software Foundation.
"""The evaluator should report PR-AUC (average precision) and the Brier score for classification.

These matter for imbalanced data (PR-AUC) and probability quality / calibration (Brier). Additive —
they do not change defaults or CV scoring. Real computed metrics, no fakes.
"""

from __future__ import annotations

import numpy as np

from fireflyframework_datascience.core.types import TaskType
from fireflyframework_datascience.evaluation.adapters import SklearnMetricsEvaluator


def test_binary_evaluation_reports_pr_auc_and_brier() -> None:
    """Binary evaluation must expose ``average_precision`` and ``brier_score`` with sane values."""
    labels = np.array([0, 0, 1, 1, 0, 1, 1, 0])
    probabilities = np.array(
        [[0.8, 0.2], [0.7, 0.3], [0.2, 0.8], [0.1, 0.9], [0.6, 0.4], [0.3, 0.7], [0.4, 0.6], [0.9, 0.1]]
    )
    hard_predictions = probabilities.argmax(axis=1)

    evaluator = SklearnMetricsEvaluator()
    report = evaluator.evaluate(TaskType.BINARY, labels, hard_predictions, probabilities, metric="roc_auc")
    metrics = report.metrics

    added_keys = ("average_precision", "brier_score")
    # both metrics must be present ...
    for key in added_keys:
        assert key in metrics
    # ... and each must lie in the unit interval
    for key in added_keys:
        assert 0.0 <= metrics[key] <= 1.0
    # the classes are strongly separated here, so expect a high PR-AUC and a low Brier score
    assert metrics["average_precision"] > 0.8
    assert metrics["brier_score"] < 0.2
41 changes: 41 additions & 0 deletions tests/test_calibration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright 2026 Firefly Software Foundation.
"""AutoML(calibrate=True) wraps the winning classifier so its probabilities are trustworthy.

Tree/boosting probabilities are often poorly calibrated; risk-/cost-sensitive decisions need
trustworthy probabilities. Real data (breast_cancer), no fakes.
"""

from __future__ import annotations


def test_automl_calibrate_wraps_winner_with_valid_probabilities() -> None:
    """With ``calibrate=True`` the best estimator is a ``CalibratedClassifierCV`` with usable probabilities."""
    from sklearn.calibration import CalibratedClassifierCV

    from fireflyframework_datascience.automl import AutoML
    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader

    dataset = SklearnDatasetLoader().load("breast_cancer")
    train_split, holdout = dataset.train_test_split(test_size=0.25, random_state=0)

    automl = AutoML(cv=3, n_trials=1, random_state=0, calibrate=True)
    fitted = automl.fit(train_split)

    # the selected model must have been wrapped by the calibrator
    assert isinstance(fitted.best_model.estimator, CalibratedClassifierCV)

    # one row of probabilities per holdout row, every value inside [0, 1]
    probabilities = fitted.predict_proba(holdout.X)
    assert probabilities.shape[0] == holdout.n_rows
    within_unit_interval = (probabilities >= 0.0) & (probabilities <= 1.0)
    assert within_unit_interval.all()

    # the holdout Brier score must be reported and stay low
    holdout_metrics = fitted.evaluate(holdout).metrics
    brier = holdout_metrics["brier_score"]
    assert 0.0 <= brier <= 0.25


def test_automl_without_calibrate_is_unchanged() -> None:
    """Without ``calibrate`` the best estimator must not be a ``CalibratedClassifierCV``."""
    from sklearn.calibration import CalibratedClassifierCV

    from fireflyframework_datascience.automl import AutoML
    from fireflyframework_datascience.datasets.adapters import SklearnDatasetLoader

    dataset = SklearnDatasetLoader().load("breast_cancer")
    train_split, _unused_holdout = dataset.train_test_split(test_size=0.25, random_state=0)

    automl = AutoML(cv=3, n_trials=1, random_state=0)
    fitted = automl.fit(train_split)

    best_estimator = fitted.best_model.estimator
    assert not isinstance(best_estimator, CalibratedClassifierCV)
Loading