From 8ebb5d4d1c1ec1e7fa7fc845e6ca91734ad8e5f6 Mon Sep 17 00:00:00 2001
From: Felix Divo <felix.divo@tu-darmstadt.de>
Date: Sat, 13 Jun 2026 22:31:50 +0000
Subject: [PATCH] FIX objective/metrics: keep benchmark importable + test
 datasets resolvable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`benchopt test` (benchopt_dev CI) failed with
`Patterns ['simulated'] did not match any dataset`. Two issues, both caused by
benchopt loading the benchmark in environments where the objective's
requirements are not installed:

1. Dataset resolution under skip_import. When benchopt builds the test env it
   loads the objective under `skip_import_ctx` — it never imports the module,
   it statically AST-parses class attributes, and only extracts attributes that
   exist on `BaseObjective`. `test_config` is not a base attribute, so our
   `test_config['dataset']['name']` was invisible and resolution fell back to
   the non-existent `simulated` dataset. Fix: also set `test_dataset_name`
   (a base attribute, so it IS extracted statically) to a real dataset.

2. sklearn import at module load. `benchmark_utils.metrics` imported sklearn at
   module top level, so importing objective.py (and the AD datasets, which pull
   `AD_METRICS` for its keys) failed when sklearn was absent. Fix: import
   sklearn lazily inside the metric functions; the registries and metric names
   are now available with only numpy installed. The functions themselves still
   require sklearn when called (it is an objective requirement, installed for
   real runs).

This supersedes the earlier `safe_import_context` attempt, which made benchopt
flag the whole objective as a failed import under `skip_import_ctx`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 benchmark_utils/metrics.py | 26 +++++++++++++++++++-------
 objective.py               |  9 ++++++++-
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/benchmark_utils/metrics.py b/benchmark_utils/metrics.py
index 2f162ae..24abe00 100644
--- a/benchmark_utils/metrics.py
+++ b/benchmark_utils/metrics.py
@@ -15,16 +15,16 @@
 """
 
 import numpy as np
-from sklearn.metrics import (
-    accuracy_score,
-    average_precision_score,
-    balanced_accuracy_score,
-    f1_score,
-    roc_auc_score,
-)
 
 from benchmark_utils.outputs import ForecastOutput
 
+# NB: sklearn is imported lazily inside the classification / anomaly-detection
+# metrics below, so this module (and the registries at the bottom) can be
+# imported with only numpy installed. benchopt imports objective.py and the
+# dataset files just to read metadata in envs without the objective's
+# requirements; a top-level sklearn import would break that (and ``benchopt
+# test`` env setup). The metric functions only run when sklearn is present.
+
 # ---------------------------------------------------------------------------
 # Forecasting — internal helpers
 # ---------------------------------------------------------------------------
@@ -172,14 +172,20 @@ def mcis(y_true, forecast: ForecastOutput, alpha=0.05, **_):
 
 
 def accuracy(y_true, y_pred):
+    from sklearn.metrics import accuracy_score
+
     return float(accuracy_score(y_true, y_pred))
 
 
 def balanced_accuracy(y_true, y_pred):
+    from sklearn.metrics import balanced_accuracy_score
+
     return float(balanced_accuracy_score(y_true, y_pred))
 
 
 def f1_weighted(y_true, y_pred):
+    from sklearn.metrics import f1_score
+
     return float(f1_score(y_true, y_pred, average="weighted", zero_division=0))
 
 
@@ -196,6 +202,8 @@ def auc_roc(y_true, y_score):
     y_true  : list of (T_j,) int arrays, concatenated
     y_score : list of (T_j,) float arrays, concatenated
     """
+    from sklearn.metrics import roc_auc_score
+
     y_true = np.concatenate([np.asarray(y) for y in y_true])
     y_score = np.concatenate([np.asarray(y) for y in y_score])
     if y_true.sum() == 0:
@@ -205,6 +213,8 @@ def auc_roc(y_true, y_score):
 
 def auc_pr(y_true, y_score):
     """Area under Precision-Recall curve."""
+    from sklearn.metrics import average_precision_score
+
     y_true = np.concatenate([np.asarray(y) for y in y_true])
     y_score = np.concatenate([np.asarray(y) for y in y_score])
     if y_true.sum() == 0:
@@ -224,6 +234,8 @@ def f1_pa(y_true, y_score, threshold=None):
         If None, the threshold is chosen to maximise F1 on the test set
         (oracle threshold — for benchmarking purposes only).
     """
+    from sklearn.metrics import f1_score
+
     y_true_cat = np.concatenate([np.asarray(y) for y in y_true])
     y_score_cat = np.concatenate([np.asarray(y) for y in y_score])
 
diff --git a/objective.py b/objective.py
index 5964026..92f3742 100644
--- a/objective.py
+++ b/objective.py
@@ -67,7 +67,14 @@ class Objective(BaseObjective):
 
     sampling_strategy = "run_once"
 
-    # Minimal config for ``benchopt test``
+    # Test dataset for ``benchopt test``. ``test_dataset_name`` is the only
+    # selector benchopt can read statically (via AST) when it builds the test
+    # env under ``skip_import_ctx`` without importing this module — ``test_config``
+    # is not a base-class attribute, so it is invisible there and resolution
+    # would otherwise fall back to the non-existent ``simulated`` dataset.
+    test_dataset_name = "monash"
+    # Richer config used once the objective is actually imported: exercise more
+    # datasets, all in debug mode for speed.
     test_config = {
         "dataset": {
             # Skipping MITDB for now due to timeout in download