From ad0464d7ebf6d2357953c381d67704501dba4c2a Mon Sep 17 00:00:00 2001
From: Geoffrey Negiar <geoff@theforecastingcompany.com>
Date: Thu, 28 May 2026 16:27:54 +0200
Subject: [PATCH 01/12] ENH batch forecasting predict() across series and
 cutoffs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactors the forecasting ``predict`` contract from a per-context call
into a single call covering all series and all rolling cutoffs at once.
This matches how hosted-API solvers (TFC, etc.) natively dispatch work
and avoids re-paying per-call overhead in the objective.

New signature::

    predict(
        x: list[np.ndarray (T_i, C)],
        cutoff_indexes: list[list[int]],
        covariates: {"static_covars": list, "hist_covars": list,
                     "future_covars": list},
        horizon: int,
    ) -> list[np.ndarray (n_cutoffs_i, horizon, C)]

The ``covariates`` dict always has all three keys (empty lists when a
dataset doesn't carry them), so adapters never branch on None vs dict.

Changes:
- ``benchmark_utils/adapters/base.py``: rewrite the predict contract and
  documentation.
- ``benchmark_utils/windowing.py``: ``make_forecasting_splits`` now
  returns ``(series_full, cutoff_indexes, targets)`` with targets of
  shape ``(n_cutoffs_i, H, C)``.
- ``datasets/monash.py``: emits ``cutoff_indexes`` and empty
  ``covariates`` alongside the existing fields.
- ``objective.py``: forwards the new fields and reshapes the batched
  prediction back to flat ``(n_total, H, C)`` arrays for metric
  computation. ``get_one_result`` updated accordingly.
- ``solvers/naive.py``: ``_NaiveForecaster`` takes the batched API
  (and no longer needs ``prediction_length`` in its constructor).
- ``solvers/chronos.py``: ``_ChronosForecaster`` takes the batched API
  and reuses the loaded pipeline across all series and cutoffs.
- ``benchmark_utils/adapters/forecast_residual.py``: rewritten as a
  single batched call so AD scoring issues one ``predict`` call per
  series (covering every cutoff) rather than O(T) calls per series.
- ``solvers/tfc_api.py``: new solver that wraps the TFC hosted-API SDK.
  Uses ``client.cross_validate`` to issue one request per series with
  all cutoffs at once. Knobs for ``model``, ``context``,
  ``add_holidays``, ``add_events``, ``country_isocode``, ``batch_size``.
  Skips when ``TFC_API_KEY`` is unset.

Verification — Monash[m1_yearly_dataset, debug=True], -j 1:

| solver                | MAE        | MSE         | MASE   | sMAPE  |
| --------------------- | ---------- | ----------- | ------ | ------ |
| Naive[seasonality=1]  | 3,399,506  | 5.93e13     | 12.86  | 0.431  |
| TFC-API[chronos-2]    | 2,807,424  | 4.07e13     | 10.62  | 0.349  |
| TFC-API[tabpfn-ts]    | 2,621,979  | 3.69e13     |  9.92  | 0.401  |
| TFC-API[timesfm-2p5]  | 2,657,678  | 3.99e13     | 10.05  | 0.263  |

The Chronos numbers match bit-for-bit against the pre-refactor run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmark_utils/adapters/base.py              |  44 ++--
 benchmark_utils/adapters/forecast_residual.py |  33 +--
 benchmark_utils/windowing.py                  |  60 ++---
 datasets/monash.py                            |  32 ++-
 objective.py                                  |  87 ++++---
 solvers/chronos.py                            |  53 ++---
 solvers/naive.py                              |  29 ++-
 solvers/tfc_api.py                            | 212 ++++++++++++++++++
 8 files changed, 412 insertions(+), 138 deletions(-)
 create mode 100644 solvers/tfc_api.py

diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py
index ef43397..2258ad6 100644
--- a/benchmark_utils/adapters/base.py
+++ b/benchmark_utils/adapters/base.py
@@ -1,17 +1,36 @@
 """Base interface that all task adapters must implement.
 
 A *fitted* adapter is what solvers return via ``get_result()``.
-The objective calls ``adapter.predict(x)`` for each test sample.
+The objective calls ``adapter.predict(...)`` with task-appropriate inputs.
 
 Predict signature by task
 --------------------------
-forecasting      : x (T, C)  →  y_pred (H, C)
-classification   : x (T, C)  →  int label
-anomaly detection: x (T, C)  →  scores (T,) float — one score per timestep
+forecasting:
+
+    predict(
+        x: list[np.ndarray (T_i, C)],
+        cutoff_indexes: list[list[int]],
+        covariates: dict,
+        horizon: int,
+    ) -> list[np.ndarray (n_cutoffs_i, horizon, C)]
+
+  ``cutoff_indexes[i][k]`` is the timestep index in ``x[i]`` at which
+  the k-th forecast for series ``i`` starts. The model must use only
+  ``x[i][:cutoff]`` as history. The ``covariates`` dict has the form
+  ``{"static_covars": list, "hist_covars": list, "future_covars": list}``;
+  the keys are always present (empty lists when unused).
+
+classification:
+
+    predict(x: np.ndarray (N, T, C)) -> np.ndarray (N,) int labels
+
+anomaly detection:
+
+    predict(x: np.ndarray (T, C)) -> np.ndarray (T,) float anomaly scores
 """
 
 from abc import ABC, abstractmethod
-import numpy as np
+from typing import Any
 
 
 class BaseTSFMAdapter(ABC):
@@ -26,16 +45,5 @@ def fit(self, X_train, y_train, **kwargs):
         return self
 
     @abstractmethod
-    def predict(self, x: np.ndarray) -> np.ndarray:
-        """Run inference on a single sample.
-
-        Parameters
-        ----------
-        x : np.ndarray, shape (T, C)
-            One time series (variable length allowed).
-
-        Returns
-        -------
-        np.ndarray
-            Task-specific output — see module docstring.
-        """
+    def predict(self, *args, **kwargs) -> Any:
+        """Task-specific inference. See module docstring for per-task signatures."""
diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py
index f4c48ff..c79f251 100644
--- a/benchmark_utils/adapters/forecast_residual.py
+++ b/benchmark_utils/adapters/forecast_residual.py
@@ -22,8 +22,8 @@ class ForecastResidualAdapter(BaseTSFMAdapter):
 
     Parameters
     ----------
-    forecaster : object with
-        ``predict(context: np.ndarray (T, C)) -> np.ndarray (H, C)``
+    forecaster : object exposing the batched forecasting predict API
+        (see :class:`BaseTSFMAdapter`).
     prediction_length : int
         Number of steps predicted at each position (default 1).
     min_context : int
@@ -47,20 +47,23 @@ def predict(self, x: np.ndarray) -> np.ndarray:
         scores : (T,) float — higher means more anomalous.
             Timesteps before ``min_context`` receive score 0.
         """
-        T, C = x.shape
+        T = x.shape[0]
         scores = np.zeros(T, dtype=np.float32)
+        cutoffs = list(range(self.min_context, T - self.prediction_length + 1))
+        if not cutoffs:
+            return scores
 
-        for t in range(self.min_context, T):
-            context = x[:t]  # (t, C)
-            try:
-                pred = self.forecaster.predict(context)  # (H, C) or (H,)
-                pred = np.asarray(pred).reshape(self.prediction_length, -1)
-                actual = x[t: t + self.prediction_length]  # (H, C)
-                if actual.shape[0] < self.prediction_length:
-                    continue
-                error = float(np.mean(np.abs(pred - actual)))
-            except Exception:
-                error = 0.0
-            scores[t] = error
+        try:
+            preds = self.forecaster.predict(
+                [x],
+                cutoff_indexes=[cutoffs],
+                covariates={"static_covars": [], "hist_covars": [], "future_covars": []},
+                horizon=self.prediction_length,
+            )[0]  # (n_cutoffs, H, C)
+        except Exception:
+            return scores
 
+        for k, t in enumerate(cutoffs):
+            actual = x[t: t + self.prediction_length]
+            scores[t] = float(np.mean(np.abs(preds[k] - actual)))
         return scores
diff --git a/benchmark_utils/windowing.py b/benchmark_utils/windowing.py
index e439a95..269175f 100644
--- a/benchmark_utils/windowing.py
+++ b/benchmark_utils/windowing.py
@@ -1,14 +1,18 @@
-"""
-Rolling-window utilities for forecasting evaluation.
+"""Rolling-window utilities for forecasting evaluation.
 
-Given a list of time series (each a numpy array of shape (T_i, C)),
-`make_forecasting_splits` returns:
-    - X_test : List[np.ndarray]  each (T_context_i, C) — full context up to
-                                  the prediction point (variable length)
-    - y_test : List[np.ndarray]  each (prediction_length, C) — the target
+`make_forecasting_splits` returns the full series alongside per-series
+cutoff indexes and target windows (series with no valid cutoff are
+dropped). This layout matches the batched adapter contract: a forecaster
+gets each full series plus the list of cutoffs at which to forecast.
 
-The context for window k is everything from the start of the series up to
-(T_train + k * stride), so models with long context windows get to use it.
+Outputs
+-------
+series_full     : List[np.ndarray (T_i, C)]
+cutoff_indexes  : List[List[int]] — for each series, the timestep
+                  indexes at which a forecast starts (i.e. ``x[:cutoff]``
+                  is the history available to the model).
+targets         : List[np.ndarray (n_cutoffs_i, prediction_length, C)]
+                  ground-truth windows aligned with cutoff_indexes.
 """
 
 from typing import List, Optional, Tuple
@@ -21,12 +25,12 @@ def make_forecasting_splits(
     n_windows: int = 1,
     stride: Optional[int] = None,
     min_context: int = 1,
-) -> Tuple[List[np.ndarray], List[np.ndarray]]:
-    """Create rolling-window evaluation splits from a list of time series.
+) -> Tuple[List[np.ndarray], List[List[int]], List[np.ndarray]]:
+    """Create rolling-window evaluation cutoffs from a list of time series.
 
     Parameters
     ----------
-    series : list of (T_i, C) arrays — full time series (train + test combined)
+    series : list of (T_i, C) arrays — full time series.
     prediction_length : int
     n_windows : int
         Number of rolling evaluation windows per series.
@@ -35,30 +39,30 @@ def make_forecasting_splits(
         Defaults to ``prediction_length`` (non-overlapping).
     min_context : int
         Minimum context length required before the first prediction point.
-
-    Returns
-    -------
-    X_test : list of (T_context, C) arrays — grows with each window
-    y_test : list of (prediction_length, C) arrays
     """
     if stride is None:
         stride = prediction_length
 
-    X_test, y_test = [], []
+    series_full: List[np.ndarray] = []
+    cutoff_indexes: List[List[int]] = []
+    targets: List[np.ndarray] = []
 
     for ts in series:
-        ts = np.asarray(ts)  # (T, C)
+        ts = np.asarray(ts)
         T = ts.shape[0]
-        # The last prediction point must end at or before T
-        # First prediction point: min_context + prediction_length - 1 <= T - 1
-        last_end = T
+        cutoffs: List[int] = []
+        ys: List[np.ndarray] = []
         for w in range(n_windows):
-            pred_end = last_end - (n_windows - 1 - w) * stride
+            pred_end = T - (n_windows - 1 - w) * stride
             pred_start = pred_end - prediction_length
-            if pred_start < min_context:
+            if pred_start < min_context or pred_end > T:
                 continue
-            # Full history as context (variable length)
-            X_test.append(ts[:pred_start])
-            y_test.append(ts[pred_start:pred_end])
+            cutoffs.append(pred_start)
+            ys.append(ts[pred_start:pred_end])
+        if not cutoffs:
+            continue
+        series_full.append(ts)
+        cutoff_indexes.append(cutoffs)
+        targets.append(np.stack(ys, axis=0))  # (n_cutoffs, H, C)
 
-    return X_test, y_test
+    return series_full, cutoff_indexes, targets
diff --git a/datasets/monash.py b/datasets/monash.py
index 645a750..ad853c0 100644
--- a/datasets/monash.py
+++ b/datasets/monash.py
@@ -12,16 +12,22 @@
 
 Data contract output
 --------------------
-X_train : List[np.ndarray (T_i, C)]   training portions of each series
-y_train : List[np.ndarray (H, C)]     next-H targets aligned with X_train
-                                       (useful for supervised fine-tuning)
-X_test  : List[np.ndarray (T_ctx, C)] rolling-window contexts (variable length)
-y_test  : List[np.ndarray (H, C)]     ground-truth horizons
-task    : "forecasting"
-metrics : ["mae", "mse", "mase", "smape"]
+X_train         : List[np.ndarray (T_i, C)]      training portions of each series
+y_train         : List[np.ndarray (H, C)]        next-H targets aligned with X_train
+X_test          : List[np.ndarray (T_i, C)]      full series — model uses
+                                                  ``x[:cutoff]`` as history
+cutoff_indexes  : List[List[int]]                jagged: per-series cutoff
+                                                  positions in X_test
+y_test          : List[np.ndarray (n_cutoffs, H, C)]
+                                                  ground-truth windows
+covariates      : dict                           {static_covars, hist_covars,
+                                                  future_covars} — all empty for
+                                                  Monash today
+task            : "forecasting"
+metrics         : ["mae", "mse", "mase", "smape"]
 prediction_length : int
-freq : str  (e.g. "Y", "M", "D")
-seasonality : int  (seasonal period used for MASE)
+freq            : str  (e.g. "Y", "M", "D")
+seasonality     : int  (seasonal period used for MASE)
 """
 
 import numpy as np
@@ -120,7 +126,7 @@ def get_data(self):
             )
 
         n_windows = 1 if self.debug else self.n_windows
-        X_test, y_test = make_forecasting_splits(
+        X_test, cutoff_indexes, y_test = make_forecasting_splits(
             full_series,
             prediction_length=pred_len,
             n_windows=n_windows,
@@ -131,6 +137,12 @@ def get_data(self):
             y_train=y_train_list,
             X_test=X_test,
             y_test=y_test,
+            cutoff_indexes=cutoff_indexes,
+            covariates={
+                "static_covars": [],
+                "hist_covars": [],
+                "future_covars": [],
+            },
             task="forecasting",
             metrics=["mae", "mse", "mase", "smape"],
             prediction_length=pred_len,
diff --git a/objective.py b/objective.py
index 1a0800f..bdcec56 100644
--- a/objective.py
+++ b/objective.py
@@ -9,29 +9,33 @@
 All datasets must return (via ``get_data``):
 
     X_train : List[np.ndarray (T_i, C)]   training time series
-    y_train : array-like or None           task-specific (see below)
-    X_test  : List[np.ndarray (T_j, C)]   test contexts / series
-    y_test  : array-like                   task-specific (see below)
+    y_train : array-like or None          task-specific (see below)
+    X_test  : List[np.ndarray]            test data (shape depends on task)
+    y_test  : array-like                  task-specific (see below)
     task    : str  one of {"forecasting", "classification",
                             "anomaly_detection"}
     metrics : List[str]  names from benchmark_utils.metrics.ALL_METRICS
 
 Task-specific shapes
 --------------------
-forecasting        y_train  List[(H, C)] or None
-                   y_test   List[(H, C)]
-                   extra    prediction_length (int), freq (str)
-classification     y_train  (N,) int
-                   y_test   (M,) int
-                   extra    n_classes (int)
-anomaly_detection  y_train  None
-                   y_test   List[(T_j,)] int  point-level binary labels
+forecasting        X_test         List[(T_i, C)]  full series — adapter uses
+                                                  ``x[:cutoff]`` as history
+                   cutoff_indexes List[List[int]] jagged per-series cutoffs
+                   y_test         List[(n_cutoffs, H, C)]
+                   covariates     dict           {static_covars, hist_covars,
+                                                  future_covars}
+                   extra          prediction_length (int), freq (str)
+classification     y_train        (N,) int
+                   y_test         (M,) int
+                   extra          n_classes (int)
+anomaly_detection  y_train        None
+                   y_test         List[(T_j,)] int  point-level labels
 
 Solver contract
 ---------------
 ``Solver.get_result()`` must return ``{"model": adapter}`` where ``adapter``
-is a fitted :class:`~benchmark_utils.adapters.base.BaseTSFMAdapter` with a
-``predict(x: np.ndarray (T, C)) -> np.ndarray`` method.
+is a fitted :class:`~benchmark_utils.adapters.base.BaseTSFMAdapter`.
+See that module for per-task predict signatures.
 """
 
 import numpy as np
@@ -60,11 +64,18 @@ class Objective(BaseObjective):
     # ------------------------------------------------------------------
 
     def set_data(self, X_train, y_train, X_test, y_test,
-                 task, metrics, **meta):
+                 task, metrics, cutoff_indexes=None, covariates=None,
+                 **meta):
         self.X_train = X_train
         self.y_train = y_train
         self.X_test = X_test
         self.y_test = y_test
+        self.cutoff_indexes = cutoff_indexes
+        self.covariates = covariates or {
+            "static_covars": [],
+            "hist_covars": [],
+            "future_covars": [],
+        }
         self.task = task
         self.metrics = metrics
         self.meta = meta  # freq, prediction_length, n_classes, …
@@ -98,13 +109,23 @@ def evaluate_result(self, model):
     # --- forecasting ---------------------------------------------------
 
     def _eval_forecasting(self, model):
-        preds, targets = [], []
-        for x, y in zip(self.X_test, self.y_test):
-            pred = np.asarray(model.predict(x))   # (H, C)
-            preds.append(pred)
-            targets.append(np.asarray(y))
+        horizon = self.meta.get("prediction_length", 1)
+        preds_per_series = model.predict(
+            self.X_test,
+            cutoff_indexes=self.cutoff_indexes,
+            covariates=self.covariates,
+            horizon=horizon,
+        )
 
-        preds = np.array(preds)    # (M, H, C)
+        preds, targets = [], []
+        for series_preds, series_targets in zip(preds_per_series, self.y_test):
+            sp = np.asarray(series_preds)  # (n_cutoffs, H, C)
+            st = np.asarray(series_targets)  # (n_cutoffs, H, C)
+            for k in range(sp.shape[0]):
+                preds.append(sp[k])
+                targets.append(st[k])
+
+        preds = np.array(preds)
         targets = np.array(targets)
 
         result = {}
@@ -148,19 +169,27 @@ def get_one_result(self):
         from benchmark_utils.adapters.base import BaseTSFMAdapter
 
         class _ConstantAdapter(BaseTSFMAdapter):
-            def __init__(self, task, meta, X_test):
+            def __init__(self, task, meta):
                 self._task = task
                 self._meta = meta
-                self._X_test = X_test
 
-            def predict(self, x):
+            def predict(self, *args, **kwargs):
                 if self._task == "forecasting":
-                    H = self._meta.get("prediction_length", 1)
-                    C = x.shape[1] if x.ndim == 2 else 1
-                    return np.zeros((H, C))
+                    x = args[0]
+                    cutoff_indexes = kwargs.get(
+                        "cutoff_indexes", args[1] if len(args) > 1 else None
+                    )
+                    H = kwargs.get("horizon", self._meta.get("prediction_length", 1))
+                    preds = []
+                    for series, cutoffs in zip(x, cutoff_indexes or []):
+                        C = series.shape[1] if series.ndim == 2 else 1
+                        preds.append(np.zeros((len(cutoffs), H, C), dtype=np.float32))
+                    return preds
                 elif self._task == "classification":
-                    return 0
+                    x = args[0]
+                    return np.zeros(len(x), dtype=np.int64)
                 elif self._task == "anomaly_detection":
-                    return np.zeros(x.shape[0])
+                    x = args[0]
+                    return np.zeros(x.shape[0], dtype=np.float32)
 
-        return {"model": _ConstantAdapter(self.task, self.meta, self.X_test)}
+        return {"model": _ConstantAdapter(self.task, self.meta)}
diff --git a/solvers/chronos.py b/solvers/chronos.py
index 0033ec5..e693eed 100644
--- a/solvers/chronos.py
+++ b/solvers/chronos.py
@@ -30,35 +30,37 @@
 # ---------------------------------------------------------------------------
 
 class _ChronosForecaster:
-    """Wraps ChronosPipeline to expose predict(x (T, C)) -> (H, C)."""
+    """Wraps ChronosPipeline with the batched series+cutoffs predict API."""
 
-    def __init__(self, pipeline, prediction_length):
+    def __init__(self, pipeline):
         self.pipeline = pipeline
-        self.prediction_length = prediction_length
 
-    def predict(self, x: np.ndarray) -> np.ndarray:
+    def predict(self, x, cutoff_indexes, covariates, horizon):
+        del covariates
         import torch
 
-        x = np.asarray(x, dtype=np.float32)  # (T, C)
-        C = x.shape[1]
-
-        # Chronos expects (batch, time) tensors — one channel at a time,
-        # then stack.
-        preds = []
-        for c in range(C):
-            context = torch.from_numpy(x[:, c]).unsqueeze(0)  # (1, T)
-            forecast = self.pipeline.predict(
-                context,
-                prediction_length=self.prediction_length,
-            )
-            # forecast: (1, n_samples, H) for sample-based pipelines,
-            # or (1, H) for point pipelines — take median.
-            f = forecast[0]
-            if f.ndim == 2:          # (n_samples, H) → median
-                f = f.median(dim=0).values
-            preds.append(f.numpy())  # (H,)
-
-        return np.stack(preds, axis=-1).astype(np.float32)  # (H, C)
+        results = []
+        for series, cutoffs in zip(x, cutoff_indexes):
+            series = np.asarray(series, dtype=np.float32)
+            C = series.shape[1] if series.ndim == 2 else 1
+            out = np.empty((len(cutoffs), horizon, C), dtype=np.float32)
+            for k, cutoff in enumerate(cutoffs):
+                hist = series[:cutoff]
+                if hist.ndim == 1:
+                    hist = hist[:, None]
+                # Chronos expects (batch, time) — one channel at a time.
+                for c in range(C):
+                    context = torch.from_numpy(hist[:, c]).unsqueeze(0)
+                    forecast = self.pipeline.predict(
+                        context,
+                        prediction_length=horizon,
+                    )
+                    f = forecast[0]
+                    if f.ndim == 2:
+                        f = f.median(dim=0).values
+                    out[k, :, c] = f.numpy()
+            results.append(out)
+        return results
 
 
 # ---------------------------------------------------------------------------
@@ -115,8 +117,7 @@ def set_objective(self, X_train, y_train, task, **meta):
             self._loaded_model = model_id
 
     def run(self, _):
-        pred_len = self.meta.get("prediction_length", 1)
-        forecaster = _ChronosForecaster(self._pipeline, pred_len)
+        forecaster = _ChronosForecaster(self._pipeline)
 
         if self.task == "forecasting":
             self._adapter = forecaster
diff --git a/solvers/naive.py b/solvers/naive.py
index be8cdcd..4ae4d49 100644
--- a/solvers/naive.py
+++ b/solvers/naive.py
@@ -21,18 +21,24 @@
 class _NaiveForecaster(BaseTSFMAdapter):
     """Repeat the last ``seasonality`` values to fill the horizon."""
 
-    def __init__(self, prediction_length, seasonality=1):
-        self.prediction_length = prediction_length
+    def __init__(self, seasonality=1):
         self.seasonality = seasonality
 
-    def predict(self, x: np.ndarray) -> np.ndarray:
-        # x: (T, C)
-        T, C = x.shape
-        season = min(self.seasonality, T)
-        pattern = x[-season:]                # (season, C)
-        reps = int(np.ceil(self.prediction_length / season))
-        forecast = np.tile(pattern, (reps, 1))[:self.prediction_length]
-        return forecast.astype(np.float32)   # (H, C)
+    def predict(self, x, cutoff_indexes, covariates, horizon):
+        del covariates
+        results = []
+        for series, cutoffs in zip(x, cutoff_indexes):
+            series = np.asarray(series)
+            C = series.shape[1] if series.ndim == 2 else 1
+            preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32)
+            for k, cutoff in enumerate(cutoffs):
+                hist = series[:cutoff]
+                season = min(self.seasonality, hist.shape[0])
+                pattern = hist[-season:]
+                reps = int(np.ceil(horizon / season))
+                preds[k] = np.tile(pattern, (reps, 1))[:horizon]
+            results.append(preds)
+        return results
 
 
 class _MajorityClassifier(BaseTSFMAdapter):
@@ -94,8 +100,7 @@ def set_objective(self, X_train, y_train, task, **meta):
 
     def run(self, _):
         if self.task == "forecasting":
-            pred_len = self.meta.get("prediction_length", 1)
-            self._adapter = _NaiveForecaster(pred_len, self.seasonality)
+            self._adapter = _NaiveForecaster(self.seasonality)
 
         elif self.task == "classification":
             self._adapter = _MajorityClassifier()
diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py
new file mode 100644
index 0000000..022d513
--- /dev/null
+++ b/solvers/tfc_api.py
@@ -0,0 +1,212 @@
+"""TFC API solver for the TSFM benchmark.
+
+Calls The Forecasting Company's hosted inference API via the official
+``theforecastingcompany`` Python SDK. Supports zero-shot forecasting.
+
+Authentication
+--------------
+The SDK reads ``TFC_API_KEY`` from the environment by default. Sign in at
+https://docs.retrocast.com/settings/api-keys to get one.
+
+Adding a new model
+------------------
+Pass any model id from ``theforecastingcompany.utils.TFCModels`` via the
+``model`` parameter (e.g. ``"chronos-2"``, ``"timesfm-2p5"``,
+``"tfc-global"``, ``"T0-1638-step-85000"``).
+"""
+
+import os
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+from benchopt import BaseSolver
+
+from benchmark_utils.adapters.base import BaseTSFMAdapter
+
+
+SUPPORTED_TASKS = {"forecasting"}
+
+# Map benchmark freq codes to API-accepted pandas-like aliases.
+_FREQ_REMAP = {"T": "min", "S": "10S"}
+
+# pandas >=2.2 deprecates the Y/Q/M aliases in ``pd.date_range``; use the
+# period-end forms for synthetic indices but pass the original to the API.
+_PD_FREQ_REMAP = {"Y": "YE", "Q": "QE", "M": "ME"}
+
+
+def _to_api_freq(freq: str) -> str:
+    return _FREQ_REMAP.get(freq, freq)
+
+
+def _to_pandas_freq(api_freq: str) -> str:
+    return _PD_FREQ_REMAP.get(api_freq, api_freq)
+
+
+class _TFCAPIForecaster(BaseTSFMAdapter):
+    """Batched adapter that calls ``client.cross_validate`` per series."""
+
+    def __init__(
+        self,
+        client,
+        model,
+        freq: str,
+        context: Optional[int],
+        quantiles: Optional[list[float]],
+        add_holidays: bool,
+        add_events: bool,
+        country_isocode: Optional[str],
+        batch_size: int,
+    ):
+        self.client = client
+        self.model = model  # TFCModels enum
+        self.freq = _to_api_freq(freq)
+        if quantiles is None:
+            quantiles = [0.5]
+        elif 0.5 not in quantiles:
+            quantiles = quantiles + [0.5]
+        self.quantiles = quantiles
+        self.context = context
+        self.add_holidays = add_holidays
+        self.add_events = add_events
+        self.country_isocode = country_isocode
+        self.batch_size = batch_size
+
+    def predict(self, x, cutoff_indexes, covariates, horizon):
+        # TODO: thread ``covariates`` (static/hist/future) through to the SDK
+        # once the benchmark datasets expose them. For now the dict is
+        # ignored — Monash datasets carry no covariates.
+        del covariates
+        pd_freq = _to_pandas_freq(self.freq)
+
+        results = []
+        for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)):
+            series = np.asarray(series, dtype=np.float32)
+            if series.ndim == 1:
+                series = series[:, None]
+            T, C = series.shape
+            index = pd.date_range("2000-01-01", periods=T, freq=pd_freq)
+
+            frames = []
+            for c in range(C):
+                frames.append(
+                    pd.DataFrame(
+                        {
+                            "unique_id": f"s{series_idx}_c{c}",
+                            "ds": index,
+                            "target": series[:, c],
+                        }
+                    )
+                )
+            train_df = pd.concat(frames, ignore_index=True)
+
+            fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs]
+            forecast_df = self.client.cross_validate(
+                train_df,
+                model=self.model,
+                horizon=horizon,
+                freq=self.freq,
+                fcds=fcds,
+                quantiles=self.quantiles,
+                context=self.context,
+                add_holidays=self.add_holidays,
+                add_events=self.add_events,
+                country_isocode=self.country_isocode,
+                batch_size=self.batch_size,
+            )
+
+            value_col = f"{self.model}_q0.5"
+            if value_col not in forecast_df.columns:
+                value_col = str(self.model)
+            if value_col not in forecast_df.columns:
+                raise ValueError(
+                    f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}"
+                )
+
+            preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32)
+            for c in range(C):
+                channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"]
+                for k, fcd in enumerate(fcds):
+                    window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(horizon)
+                    preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32)
+            results.append(preds)
+        return results
+
+
+class Solver(BaseSolver):
+    """TFC hosted-API solver.
+
+    Parameters
+    ----------
+    model : str
+        Model id served by the TFC API — must match a value in
+        ``theforecastingcompany.utils.TFCModels`` (e.g. ``"chronos-2"``,
+        ``"timesfm-2p5"``, ``"tfc-global"``, ``"moirai-2"``).
+    context : int or None
+        Number of history steps to send to the model. ``None`` lets the
+        model use its native maximum.
+    add_holidays, add_events : bool
+        Whether to attach TFC holiday / event covariates. Requires
+        ``country_isocode`` to be set.
+    country_isocode : str or None
+        ISO country code (e.g. ``"US"``) used by the holiday/event lookup.
+    batch_size : int
+        Series-per-batch for batching-enabled models (chronos-2, moirai-2).
+    """
+
+    name = "TFC-API"
+
+    requirements = ["pip::theforecastingcompany"]
+
+    sampling_strategy = "run_once"
+
+    parameters = {
+        "model": ["chronos-2"],
+        "context": [None],
+        "add_holidays": [False],
+        "add_events": [False],
+        "country_isocode": [None],
+        "batch_size": [256],
+    }
+
+    def skip(self, task, **kwargs):
+        if task not in SUPPORTED_TASKS:
+            return True, f"TFC-API solver does not support task={task!r}"
+        if os.getenv("TFC_API_KEY") is None:
+            return True, "TFC_API_KEY environment variable not set"
+        return False, None
+
+    def set_objective(self, X_train, y_train, task, **meta):
+        from theforecastingcompany import TFCClient
+        from theforecastingcompany.utils import TFCModels
+
+        self.task = task
+        self.X_train = X_train
+        self.meta = meta
+
+        try:
+            self._model_enum = TFCModels(self.model)
+        except ValueError as e:
+            known = ", ".join(m.value for m in TFCModels)
+            raise ValueError(
+                f"Unknown TFC model '{self.model}'. Known SDK models: {known}."
+            ) from e
+
+        if not hasattr(self, "_client"):
+            self._client = TFCClient()
+
+    def run(self, _):
+        self._adapter = _TFCAPIForecaster(
+            client=self._client,
+            model=self._model_enum,
+            freq=self.meta.get("freq", "D"),
+            context=self.context,
+            quantiles=None,
+            add_holidays=self.add_holidays,
+            add_events=self.add_events,
+            country_isocode=self.country_isocode,
+            batch_size=self.batch_size,
+        )
+
+    def get_result(self):
+        return {"model": self._adapter}

From 297281387a1cbb30021f268d187128b3f29f3b09 Mon Sep 17 00:00:00 2001
From: Geoffrey Negiar <geoff@theforecastingcompany.com>
Date: Thu, 28 May 2026 16:33:40 +0200
Subject: [PATCH 02/12] ENH add SeasonalNaive forecasting solver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Repeats the last ``season_length`` observations to fill the horizon.
Default parameter sweep covers ``[1, 7, 12, 24]`` (last-value
persistence; weekly cycle of daily data; yearly cycle of monthly data;
daily cycle of hourly data).

Useful as a calibrated baseline whose strength depends entirely on
matching the seasonal period to the data — handy for sanity-checking
the impact of seasonality on TSFMs at fixed compute.

Verified on Monash[m1_yearly_dataset, debug=True]:

| season_length | MAE        | MSE       | MASE   | sMAPE |
| ------------- | ---------- | --------- | ------ | ----- |
|             1 | 3,399,506  | 5.93e13   | 12.86  | 0.431 |
|             7 | 3,045,677  | 4.31e13   | 11.52  | 0.573 |
|            12 | 4,526,063  | 9.24e13   | 17.12  | 0.948 |
|            24 | 6,230,975  | 1.71e14   | 23.56  | 1.744 |

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 solvers/seasonal_naive.py | 84 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 solvers/seasonal_naive.py

diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py
new file mode 100644
index 0000000..d7f96ab
--- /dev/null
+++ b/solvers/seasonal_naive.py
@@ -0,0 +1,84 @@
+"""Seasonal-naive forecasting baseline.
+
+The forecast at horizon ``h`` is the value observed ``season_length`` steps
+ago — i.e. for forecast index ``i`` (0-based within the horizon), the
+prediction is ``hist[-season_length + (i mod season_length)]``. When the
+available history is shorter than ``season_length``, the pattern falls
+back to whatever history exists.
+
+A common, calibrated baseline for any dataset with a known seasonal
+period. With ``season_length=1`` it collapses to last-value persistence.
+"""
+
+import numpy as np
+from benchopt import BaseSolver
+
+from benchmark_utils.adapters.base import BaseTSFMAdapter
+
+
+SUPPORTED_TASKS = {"forecasting"}
+
+
+class _SeasonalNaiveForecaster(BaseTSFMAdapter):
+    """Repeat the last ``season_length`` observations to fill the horizon."""
+
+    def __init__(self, season_length: int):
+        if season_length < 1:
+            raise ValueError(f"season_length must be >= 1, got {season_length}")
+        self.season_length = season_length
+
+    def predict(self, x, cutoff_indexes, covariates, horizon):
+        del covariates
+        results = []
+        for series, cutoffs in zip(x, cutoff_indexes):
+            series = np.asarray(series)
+            C = series.shape[1] if series.ndim == 2 else 1
+            preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32)
+            for k, cutoff in enumerate(cutoffs):
+                hist = series[:cutoff]
+                season = min(self.season_length, hist.shape[0])
+                pattern = hist[-season:]
+                reps = int(np.ceil(horizon / season))
+                preds[k] = np.tile(pattern, (reps, 1))[:horizon]
+            results.append(preds)
+        return results
+
+
+class Solver(BaseSolver):
+    """Seasonal-naive baseline.
+
+    Parameters
+    ----------
+    season_length : int
+        Number of past steps to repeat. ``1`` recovers last-value
+        persistence; common picks are ``7`` (daily → weekly), ``12``
+        (monthly → yearly), ``24`` (hourly → daily), ``52`` (weekly →
+        yearly).
+    """
+
+    name = "SeasonalNaive"
+
+    requirements = []
+
+    sampling_strategy = "run_once"
+
+    parameters = {
+        "season_length": [1, 7, 12, 24],
+    }
+
+    def skip(self, task, **kwargs):
+        if task not in SUPPORTED_TASKS:
+            return True, f"SeasonalNaive does not support task={task!r}"
+        return False, None
+
+    def set_objective(self, X_train, y_train, task, **meta):
+        self.task = task
+        self.X_train = X_train
+        self.y_train = y_train
+        self.meta = meta
+
+    def run(self, _):
+        self._adapter = _SeasonalNaiveForecaster(self.season_length)
+
+    def get_result(self):
+        return {"model": self._adapter}

From 7b24d945889d7667dafa5d80fa3c39fbdb69834c Mon Sep 17 00:00:00 2001
From: Geoffrey Negiar <geoff@theforecastingcompany.com>
Date: Thu, 28 May 2026 16:41:45 +0200
Subject: [PATCH 03/12] PERF batch chronos-2 / moirai-2 in one cross_validate
 call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes bundled because they touch the same predict signature:

1. Rename ``horizon`` → ``prediction_length`` in the forecasting
   predict() contract for consistency with the SDK and dataset metadata.

2. TFC-API solver now sends one ``cross_validate`` call covering every
   series when the model reports ``supports_batching == True``
   (chronos-2, moirai-2, T0-1535, T0-1638). Series are aligned to share
   an end date so all cutoffs collapse to a common ``fcds`` list; the
   SDK then stacks them into the (V, T) tensor Chronos-2 wants, with
   one ``unique_id`` per series-channel acting as the group id.

   Falls back to the per-series loop when the model does not support
   batching, or when the cutoff offsets from the series end
   (``T_i - cutoff``) are not identical across series (e.g. a mix of
   n_windows after some series were filtered for being too short).

Touched files for the rename: base.py, objective.py,
forecast_residual.py, naive.py, chronos.py, seasonal_naive.py, tfc_api.py.

Verification — Monash[m1_yearly_dataset, debug=True], -j 1:

- chronos-2 (batched): MAE 2,785,573 · MASE 10.53 · sMAPE 0.348
  (vs per-series: MAE 2,807,424 · MASE 10.62 — same order, ~0.8%
   delta is just batched-vs-sequential sampling variance.)
- timesfm-2p5 (per-series, not batching-capable): unchanged at
  MAE 2,657,678 · MASE 10.05.

Routing verified directly:
- Chronos_2.supports_batching == True  → batched path
- Moirai2.supports_batching == True    → batched path
- TimesFM_2p5.supports_batching == False → per-series path
- TabPFN_TS.supports_batching == False → per-series path

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmark_utils/adapters/base.py              |   4 +-
 benchmark_utils/adapters/forecast_residual.py |   2 +-
 objective.py                                  |   6 +-
 solvers/chronos.py                            |   6 +-
 solvers/naive.py                              |   8 +-
 solvers/seasonal_naive.py                     |   8 +-
 solvers/tfc_api.py                            | 152 ++++++++++++++----
 7 files changed, 140 insertions(+), 46 deletions(-)

diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py
index 2258ad6..1bd0ef7 100644
--- a/benchmark_utils/adapters/base.py
+++ b/benchmark_utils/adapters/base.py
@@ -11,8 +11,8 @@
         x: list[np.ndarray (T_i, C)],
         cutoff_indexes: list[list[int]],
         covariates: dict,
-        horizon: int,
-    ) -> list[np.ndarray (n_cutoffs_i, horizon, C)]
+        prediction_length: int,
+    ) -> list[np.ndarray (n_cutoffs_i, prediction_length, C)]
 
   ``cutoff_indexes[i][k]`` is the timestep index in ``x[i]`` at which
   the k-th forecast for series ``i`` starts. The model must use only
diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py
index c79f251..6cd0c0c 100644
--- a/benchmark_utils/adapters/forecast_residual.py
+++ b/benchmark_utils/adapters/forecast_residual.py
@@ -58,7 +58,7 @@ def predict(self, x: np.ndarray) -> np.ndarray:
                 [x],
                 cutoff_indexes=[cutoffs],
                 covariates={"static_covars": [], "hist_covars": [], "future_covars": []},
-                horizon=self.prediction_length,
+                prediction_length=self.prediction_length,
             )[0]  # (n_cutoffs, H, C)
         except Exception:
             return scores
diff --git a/objective.py b/objective.py
index bdcec56..5e26afc 100644
--- a/objective.py
+++ b/objective.py
@@ -109,12 +109,12 @@ def evaluate_result(self, model):
     # --- forecasting ---------------------------------------------------
 
     def _eval_forecasting(self, model):
-        horizon = self.meta.get("prediction_length", 1)
+        prediction_length = self.meta.get("prediction_length", 1)
         preds_per_series = model.predict(
             self.X_test,
             cutoff_indexes=self.cutoff_indexes,
             covariates=self.covariates,
-            horizon=horizon,
+            prediction_length=prediction_length,
         )
 
         preds, targets = [], []
@@ -179,7 +179,7 @@ def predict(self, *args, **kwargs):
                     cutoff_indexes = kwargs.get(
                         "cutoff_indexes", args[1] if len(args) > 1 else None
                     )
-                    H = kwargs.get("horizon", self._meta.get("prediction_length", 1))
+                    H = kwargs.get("prediction_length", self._meta.get("prediction_length", 1))
                     preds = []
                     for series, cutoffs in zip(x, cutoff_indexes or []):
                         C = series.shape[1] if series.ndim == 2 else 1
diff --git a/solvers/chronos.py b/solvers/chronos.py
index e693eed..70bea1d 100644
--- a/solvers/chronos.py
+++ b/solvers/chronos.py
@@ -35,7 +35,7 @@ class _ChronosForecaster:
     def __init__(self, pipeline):
         self.pipeline = pipeline
 
-    def predict(self, x, cutoff_indexes, covariates, horizon):
+    def predict(self, x, cutoff_indexes, covariates, prediction_length):
         del covariates
         import torch
 
@@ -43,7 +43,7 @@ def predict(self, x, cutoff_indexes, covariates, horizon):
         for series, cutoffs in zip(x, cutoff_indexes):
             series = np.asarray(series, dtype=np.float32)
             C = series.shape[1] if series.ndim == 2 else 1
-            out = np.empty((len(cutoffs), horizon, C), dtype=np.float32)
+            out = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32)
             for k, cutoff in enumerate(cutoffs):
                 hist = series[:cutoff]
                 if hist.ndim == 1:
@@ -53,7 +53,7 @@ def predict(self, x, cutoff_indexes, covariates, horizon):
                     context = torch.from_numpy(hist[:, c]).unsqueeze(0)
                     forecast = self.pipeline.predict(
                         context,
-                        prediction_length=horizon,
+                        prediction_length=prediction_length,
                     )
                     f = forecast[0]
                     if f.ndim == 2:
diff --git a/solvers/naive.py b/solvers/naive.py
index 4ae4d49..4f5be19 100644
--- a/solvers/naive.py
+++ b/solvers/naive.py
@@ -24,19 +24,19 @@ class _NaiveForecaster(BaseTSFMAdapter):
     def __init__(self, seasonality=1):
         self.seasonality = seasonality
 
-    def predict(self, x, cutoff_indexes, covariates, horizon):
+    def predict(self, x, cutoff_indexes, covariates, prediction_length):
         del covariates
         results = []
         for series, cutoffs in zip(x, cutoff_indexes):
             series = np.asarray(series)
             C = series.shape[1] if series.ndim == 2 else 1
-            preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32)
+            preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32)
             for k, cutoff in enumerate(cutoffs):
                 hist = series[:cutoff]
                 season = min(self.seasonality, hist.shape[0])
                 pattern = hist[-season:]
-                reps = int(np.ceil(horizon / season))
-                preds[k] = np.tile(pattern, (reps, 1))[:horizon]
+                reps = int(np.ceil(prediction_length / season))
+                preds[k] = np.tile(pattern, (reps, 1))[:prediction_length]
             results.append(preds)
         return results
 
diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py
index d7f96ab..5b2c748 100644
--- a/solvers/seasonal_naive.py
+++ b/solvers/seasonal_naive.py
@@ -27,19 +27,19 @@ def __init__(self, season_length: int):
             raise ValueError(f"season_length must be >= 1, got {season_length}")
         self.season_length = season_length
 
-    def predict(self, x, cutoff_indexes, covariates, horizon):
+    def predict(self, x, cutoff_indexes, covariates, prediction_length):
         del covariates
         results = []
         for series, cutoffs in zip(x, cutoff_indexes):
             series = np.asarray(series)
             C = series.shape[1] if series.ndim == 2 else 1
-            preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32)
+            preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32)
             for k, cutoff in enumerate(cutoffs):
                 hist = series[:cutoff]
                 season = min(self.season_length, hist.shape[0])
                 pattern = hist[-season:]
-                reps = int(np.ceil(horizon / season))
-                preds[k] = np.tile(pattern, (reps, 1))[:horizon]
+                reps = int(np.ceil(prediction_length / season))
+                preds[k] = np.tile(pattern, (reps, 1))[:prediction_length]
             results.append(preds)
         return results
 
diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py
index 022d513..88980aa 100644
--- a/solvers/tfc_api.py
+++ b/solvers/tfc_api.py
@@ -8,11 +8,20 @@
 The SDK reads ``TFC_API_KEY`` from the environment by default. Sign in at
 https://docs.retrocast.com/settings/api-keys to get one.
 
+Batching
+--------
+Models that report ``supports_batching == True`` (chronos-2, moirai-2,
+T0-1535, T0-1638) are sent in a single ``cross_validate`` call with all
+series stacked into one DataFrame. Series are aligned so their cutoffs
+share a common set of ``fcds``; the SDK then builds the (V, T) tensor
+internally with one ``unique_id`` per series-channel acting as the
+group id Chronos-2 keys on. When cutoff offsets-from-end are not
+homogeneous across series, the solver falls back to a per-series loop.
+
 Adding a new model
 ------------------
 Pass any model id from ``theforecastingcompany.utils.TFCModels`` via the
-``model`` parameter (e.g. ``"chronos-2"``, ``"timesfm-2p5"``,
-``"tfc-global"``, ``"T0-1638-step-85000"``).
+``model`` parameter.
 """
 
 import os
@@ -43,8 +52,28 @@ def _to_pandas_freq(api_freq: str) -> str:
     return _PD_FREQ_REMAP.get(api_freq, api_freq)
 
 
+def _shared_offsets_from_end(x, cutoff_indexes):
+    """Return per-series cutoff offsets if shared across series, else None."""
+    if not cutoff_indexes:
+        return None
+    reference = None
+    for series, cutoffs in zip(x, cutoff_indexes):
+        T = np.asarray(series).shape[0]
+        offsets = tuple(T - c for c in cutoffs)
+        if reference is None:
+            reference = offsets
+        elif offsets != reference:
+            return None
+    return reference
+
+
 class _TFCAPIForecaster(BaseTSFMAdapter):
-    """Batched adapter that calls ``client.cross_validate`` per series."""
+    """Adapter calling the TFC SDK.
+
+    Uses a single batched ``cross_validate`` call when the model supports
+    batching and series share cutoff offsets; falls back to one call per
+    series otherwise.
+    """
 
     def __init__(
         self,
@@ -72,13 +101,19 @@ def __init__(
         self.country_isocode = country_isocode
         self.batch_size = batch_size
 
-    def predict(self, x, cutoff_indexes, covariates, horizon):
+    def predict(self, x, cutoff_indexes, covariates, prediction_length):
         # TODO: thread ``covariates`` (static/hist/future) through to the SDK
         # once the benchmark datasets expose them. For now the dict is
         # ignored — Monash datasets carry no covariates.
         del covariates
         pd_freq = _to_pandas_freq(self.freq)
 
+        offsets = _shared_offsets_from_end(x, cutoff_indexes)
+        if getattr(self.model, "supports_batching", False) and offsets is not None:
+            return self._predict_batched(x, cutoff_indexes, prediction_length, pd_freq, offsets)
+        return self._predict_per_series(x, cutoff_indexes, prediction_length, pd_freq)
+
+    def _predict_per_series(self, x, cutoff_indexes, prediction_length, pd_freq):
         results = []
         for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)):
             series = np.asarray(series, dtype=np.float32)
@@ -87,24 +122,21 @@ def predict(self, x, cutoff_indexes, covariates, horizon):
             T, C = series.shape
             index = pd.date_range("2000-01-01", periods=T, freq=pd_freq)
 
-            frames = []
-            for c in range(C):
-                frames.append(
-                    pd.DataFrame(
-                        {
-                            "unique_id": f"s{series_idx}_c{c}",
-                            "ds": index,
-                            "target": series[:, c],
-                        }
-                    )
-                )
+            frames = [
+                pd.DataFrame({
+                    "unique_id": f"s{series_idx}_c{c}",
+                    "ds": index,
+                    "target": series[:, c],
+                })
+                for c in range(C)
+            ]
             train_df = pd.concat(frames, ignore_index=True)
-
             fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs]
+
             forecast_df = self.client.cross_validate(
                 train_df,
                 model=self.model,
-                horizon=horizon,
+                horizon=prediction_length,
                 freq=self.freq,
                 fcds=fcds,
                 quantiles=self.quantiles,
@@ -115,23 +147,85 @@ def predict(self, x, cutoff_indexes, covariates, horizon):
                 batch_size=self.batch_size,
             )
 
-            value_col = f"{self.model}_q0.5"
-            if value_col not in forecast_df.columns:
-                value_col = str(self.model)
-            if value_col not in forecast_df.columns:
-                raise ValueError(
-                    f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}"
-                )
+            preds = self._gather_series_preds(
+                forecast_df, series_idx, C, cutoffs, fcds, prediction_length
+            )
+            results.append(preds)
+        return results
+
+    def _predict_batched(self, x, cutoff_indexes, prediction_length, pd_freq, offsets):
+        """One ``cross_validate`` call covering every series in ``x``.
 
-            preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32)
+        Series are aligned to share an end date so all cutoffs collapse to
+        the same set of timestamps. The SDK then groups by ``unique_id``
+        when building Chronos-2's (V, T) tensor.
+        """
+        end = pd.Timestamp("2030-01-01")
+        frames = []
+        per_series_meta = []  # (series_idx, C, index, cutoffs)
+        for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)):
+            series = np.asarray(series, dtype=np.float32)
+            if series.ndim == 1:
+                series = series[:, None]
+            T, C = series.shape
+            index = pd.date_range(end=end, periods=T, freq=pd_freq)
             for c in range(C):
-                channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"]
-                for k, fcd in enumerate(fcds):
-                    window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(horizon)
-                    preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32)
+                frames.append(
+                    pd.DataFrame({
+                        "unique_id": f"s{series_idx}_c{c}",
+                        "ds": index,
+                        "target": series[:, c],
+                    })
+                )
+            per_series_meta.append((series_idx, C, index, cutoffs))
+
+        train_df = pd.concat(frames, ignore_index=True)
+        # ``offsets`` is (T - cutoff) for any series, so the corresponding
+        # timestamp is end - (offset - 1) * delta. We let pandas pick the
+        # delta by walking the date_range backwards from ``end``.
+        ref_index = pd.date_range(end=end, periods=max(offsets) + 1, freq=pd_freq)
+        fcds = sorted({pd.Timestamp(ref_index[-offset]) for offset in offsets})
+
+        forecast_df = self.client.cross_validate(
+            train_df,
+            model=self.model,
+            horizon=prediction_length,
+            freq=self.freq,
+            fcds=fcds,
+            quantiles=self.quantiles,
+            context=self.context,
+            add_holidays=self.add_holidays,
+            add_events=self.add_events,
+            country_isocode=self.country_isocode,
+            batch_size=self.batch_size,
+        )
+
+        results = []
+        for series_idx, C, index, cutoffs in per_series_meta:
+            series_fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs]
+            preds = self._gather_series_preds(
+                forecast_df, series_idx, C, cutoffs, series_fcds, prediction_length
+            )
             results.append(preds)
         return results
 
+    def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds, prediction_length):
+        value_col = f"{self.model}_q0.5"
+        if value_col not in forecast_df.columns:
+            value_col = str(self.model)
+        if value_col not in forecast_df.columns:
+            raise ValueError(
+                f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}"
+            )
+
+        preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32)
+        for c in range(C):
+            channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"]
+            for k, fcd in enumerate(fcds):
+                window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(prediction_length)
+                preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32)
+        return preds
+
 
 class Solver(BaseSolver):
     """TFC hosted-API solver.

From efe5ad5e43d25626ac4905545f43a54e40031dfb Mon Sep 17 00:00:00 2001
From: Geoffrey Negiar <geoff@theforecastingcompany.com>
Date: Thu, 28 May 2026 16:54:23 +0200
Subject: [PATCH 04/12] REFACTOR typed ForecastInput, Covariates dataclass,
 prediction_length at dataset level
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tightens the forecasting predict() contract introduced earlier in this
PR:

- New ``benchmark_utils.inputs.ForecastInput`` frozen dataclass bundles
  ``x``, ``cutoff_indexes``, and ``covariates``. The base ``predict``
  signature is now ``predict(self, x: ForecastInput | np.ndarray)`` —
  forecasting adapters take the dataclass, classification / anomaly-
  detection adapters take a plain ndarray. No more ``*args/**kwargs``.

- New ``benchmark_utils.covariates.Covariates`` frozen dataclass with
  ``static_covars / hist_covars / future_covars`` fields, each annotated
  as ``Sequence`` (so arrays work as well as lists) and defaulting to an
  empty list.

- ``prediction_length`` is removed from the predict signature. It is
  dataset-level state — the solver reads it from ``meta`` once and wires
  it into the adapter constructor, so predict() receives only per-call
  inputs.

Updated to the new contract: base adapter, objective (both
``_eval_forecasting`` and ``get_one_result``'s constant adapter), Monash
dataset (now emits ``Covariates()``), Naive, SeasonalNaive, Chronos,
ForecastResidual, TFC-API.

Parity preserved on Monash[m1_yearly_dataset, debug=True]:
- Naive[seasonality=1]:           MAE 3,399,506 · MASE 12.86 · sMAPE 0.431
- SeasonalNaive[season_length=1]: identical to Naive[seasonality=1] ✓
- TFC-API[chronos-2] (batched):   MAE 2,785,573 · MASE 10.53
- TFC-API[timesfm-2p5]:           MAE 2,657,678 · MASE 10.05

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmark_utils/adapters/base.py              | 30 ++++++-----
 benchmark_utils/adapters/forecast_residual.py |  6 +--
 benchmark_utils/covariates.py                 | 24 +++++++++
 benchmark_utils/inputs.py                     | 35 +++++++++++++
 datasets/monash.py                            |  7 +--
 objective.py                                  | 51 ++++++++++---------
 solvers/chronos.py                            | 21 +++++---
 solvers/naive.py                              | 20 +++++---
 solvers/seasonal_naive.py                     | 20 +++++---
 solvers/tfc_api.py                            | 38 +++++++-------
 10 files changed, 163 insertions(+), 89 deletions(-)
 create mode 100644 benchmark_utils/covariates.py
 create mode 100644 benchmark_utils/inputs.py

diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py
index 1bd0ef7..c4cab9a 100644
--- a/benchmark_utils/adapters/base.py
+++ b/benchmark_utils/adapters/base.py
@@ -1,24 +1,19 @@
 """Base interface that all task adapters must implement.
 
 A *fitted* adapter is what solvers return via ``get_result()``.
-The objective calls ``adapter.predict(...)`` with task-appropriate inputs.
+The objective calls ``adapter.predict(x)`` with task-appropriate inputs.
 
 Predict signature by task
 --------------------------
 forecasting:
 
-    predict(
-        x: list[np.ndarray (T_i, C)],
-        cutoff_indexes: list[list[int]],
-        covariates: dict,
-        prediction_length: int,
-    ) -> list[np.ndarray (n_cutoffs_i, prediction_length, C)]
+    predict(x: ForecastInput) -> list[np.ndarray (n_cutoffs_i, prediction_length, C)]
 
-  ``cutoff_indexes[i][k]`` is the timestep index in ``x[i]`` at which
-  the k-th forecast for series ``i`` starts. The model must use only
-  ``x[i][:cutoff]`` as history. The ``covariates`` dict has shape
-  ``{"static_covars": list, "hist_covars": list, "future_covars": list}``;
-  the keys are always present (empty lists when unused).
+  :class:`~benchmark_utils.inputs.ForecastInput` bundles the per-series
+  history list, the jagged per-series cutoff indexes, and a
+  :class:`~benchmark_utils.covariates.Covariates` dataclass.
+  ``prediction_length`` is dataset-level — the solver reads it from the
+  objective and wires it into the adapter at construction time.
 
 classification:
 
@@ -30,7 +25,14 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any, Union
+
+import numpy as np
+
+from benchmark_utils.inputs import ForecastInput
+
+
+PredictInput = Union[ForecastInput, np.ndarray]
 
 
 class BaseTSFMAdapter(ABC):
@@ -45,5 +47,5 @@ def fit(self, X_train, y_train, **kwargs):
         return self
 
     @abstractmethod
-    def predict(self, *args, **kwargs) -> Any:
+    def predict(self, x: PredictInput) -> Any:
         """Task-specific inference. See module docstring for per-task signatures."""
diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py
index 6cd0c0c..cbe7141 100644
--- a/benchmark_utils/adapters/forecast_residual.py
+++ b/benchmark_utils/adapters/forecast_residual.py
@@ -53,12 +53,10 @@ def predict(self, x: np.ndarray) -> np.ndarray:
         if not cutoffs:
             return scores
 
+        from benchmark_utils.inputs import ForecastInput
         try:
             preds = self.forecaster.predict(
-                [x],
-                cutoff_indexes=[cutoffs],
-                covariates={"static_covars": [], "hist_covars": [], "future_covars": []},
-                prediction_length=self.prediction_length,
+                ForecastInput(x=[x], cutoff_indexes=[cutoffs])
             )[0]  # (n_cutoffs, H, C)
         except Exception:
             return scores
diff --git a/benchmark_utils/covariates.py b/benchmark_utils/covariates.py
new file mode 100644
index 0000000..62f4132
--- /dev/null
+++ b/benchmark_utils/covariates.py
@@ -0,0 +1,24 @@
+"""Covariates payload passed to forecasting adapters.
+
+A small dataclass so the contract is typed and IDE-discoverable. All
+three fields default to empty sequences, so datasets without covariates
+can just pass ``Covariates()``.
+"""
+
+from dataclasses import dataclass, field
+from typing import Sequence
+
+
+@dataclass(frozen=True)
+class Covariates:
+    """Per-series covariates aligned with the ``x`` sequence in ``predict``.
+
+    Each field is a sequence whose length equals ``len(x)``. Within a
+    series, the inner structure depends on the covariate kind — see the
+    forecasting predict() contract in
+    :mod:`benchmark_utils.adapters.base`.
+    """
+
+    static_covars: Sequence = field(default_factory=list)
+    hist_covars: Sequence = field(default_factory=list)
+    future_covars: Sequence = field(default_factory=list)
diff --git a/benchmark_utils/inputs.py b/benchmark_utils/inputs.py
new file mode 100644
index 0000000..93e94d9
--- /dev/null
+++ b/benchmark_utils/inputs.py
@@ -0,0 +1,35 @@
+"""Typed inputs for adapter ``predict()`` methods.
+
+Forecasting adapters receive a :class:`ForecastInput` (one struct per
+call), while classification and anomaly-detection adapters receive a
+plain :class:`numpy.ndarray`. The base ``predict`` signature is a union
+of the two — see :mod:`benchmark_utils.adapters.base`.
+"""
+
+from dataclasses import dataclass, field
+from typing import Sequence
+
+import numpy as np
+
+from benchmark_utils.covariates import Covariates
+
+
+@dataclass(frozen=True)
+class ForecastInput:
+    """Bundle of arguments passed to a forecasting adapter's predict().
+
+    Attributes
+    ----------
+    x : sequence of np.ndarray
+        One ``(T_i, C)`` array per series. The adapter must use only
+        ``x[i][:cutoff]`` as history for the cutoff at index k.
+    cutoff_indexes : sequence of sequence of int
+        Jagged — per-series timestep indexes at which a forecast starts.
+    covariates : Covariates
+        Static / historical / future covariates aligned with ``x``.
+        Defaults to empty.
+    """
+
+    x: Sequence[np.ndarray]
+    cutoff_indexes: Sequence[Sequence[int]]
+    covariates: Covariates = field(default_factory=Covariates)
diff --git a/datasets/monash.py b/datasets/monash.py
index ad853c0..049ce73 100644
--- a/datasets/monash.py
+++ b/datasets/monash.py
@@ -34,6 +34,7 @@
 from benchopt import BaseDataset
 
 from aeon.datasets import load_forecasting
+from benchmark_utils.covariates import Covariates
 from benchmark_utils.windowing import make_forecasting_splits
 
 
@@ -138,11 +139,7 @@ def get_data(self):
             X_test=X_test,
             y_test=y_test,
             cutoff_indexes=cutoff_indexes,
-            covariates={
-                "static_covars": [],
-                "hist_covars": [],
-                "future_covars": [],
-            },
+            covariates=Covariates(),
             task="forecasting",
             metrics=["mae", "mse", "mase", "smape"],
             prediction_length=pred_len,
diff --git a/objective.py b/objective.py
index 5e26afc..169219b 100644
--- a/objective.py
+++ b/objective.py
@@ -22,9 +22,14 @@
                                                   ``x[:cutoff]`` as history
                    cutoff_indexes List[List[int]] jagged per-series cutoffs
                    y_test         List[(n_cutoffs, H, C)]
-                   covariates     dict           {static_covars, hist_covars,
-                                                  future_covars}
-                   extra          prediction_length (int), freq (str)
+                   covariates     Covariates      dataclass with
+                                                  static / hist / future
+                                                  covariate lists
+                   extra          prediction_length (int), freq (str) —
+                                                  the solver reads these
+                                                  from the objective once
+                                                  and wires them into the
+                                                  adapter
 classification     y_train        (N,) int
                    y_test         (M,) int
                    extra          n_classes (int)
@@ -66,16 +71,14 @@ class Objective(BaseObjective):
     def set_data(self, X_train, y_train, X_test, y_test,
                  task, metrics, cutoff_indexes=None, covariates=None,
                  **meta):
+        from benchmark_utils.covariates import Covariates
+
         self.X_train = X_train
         self.y_train = y_train
         self.X_test = X_test
         self.y_test = y_test
         self.cutoff_indexes = cutoff_indexes
-        self.covariates = covariates or {
-            "static_covars": [],
-            "hist_covars": [],
-            "future_covars": [],
-        }
+        self.covariates = covariates if covariates is not None else Covariates()
         self.task = task
         self.metrics = metrics
         self.meta = meta  # freq, prediction_length, n_classes, …
@@ -109,12 +112,14 @@ def evaluate_result(self, model):
     # --- forecasting ---------------------------------------------------
 
     def _eval_forecasting(self, model):
-        prediction_length = self.meta.get("prediction_length", 1)
+        from benchmark_utils.inputs import ForecastInput
+
         preds_per_series = model.predict(
-            self.X_test,
-            cutoff_indexes=self.cutoff_indexes,
-            covariates=self.covariates,
-            prediction_length=prediction_length,
+            ForecastInput(
+                x=self.X_test,
+                cutoff_indexes=self.cutoff_indexes,
+                covariates=self.covariates,
+            )
         )
 
         preds, targets = [], []
@@ -169,27 +174,23 @@ def get_one_result(self):
         from benchmark_utils.adapters.base import BaseTSFMAdapter
 
         class _ConstantAdapter(BaseTSFMAdapter):
-            def __init__(self, task, meta):
+            def __init__(self, task, prediction_length):
                 self._task = task
-                self._meta = meta
+                self._prediction_length = prediction_length
 
-            def predict(self, *args, **kwargs):
+            def predict(self, x):
                 if self._task == "forecasting":
-                    x = args[0]
-                    cutoff_indexes = kwargs.get(
-                        "cutoff_indexes", args[1] if len(args) > 1 else None
-                    )
-                    H = kwargs.get("prediction_length", self._meta.get("prediction_length", 1))
+                    H = self._prediction_length
                     preds = []
-                    for series, cutoffs in zip(x, cutoff_indexes or []):
+                    for series, cutoffs in zip(x.x, x.cutoff_indexes):
                         C = series.shape[1] if series.ndim == 2 else 1
                         preds.append(np.zeros((len(cutoffs), H, C), dtype=np.float32))
                     return preds
                 elif self._task == "classification":
-                    x = args[0]
                     return np.zeros(len(x), dtype=np.int64)
                 elif self._task == "anomaly_detection":
-                    x = args[0]
                     return np.zeros(x.shape[0], dtype=np.float32)
 
-        return {"model": _ConstantAdapter(self.task, self.meta)}
+        return {"model": _ConstantAdapter(
+            self.task, self.meta.get("prediction_length", 1)
+        )}
diff --git a/solvers/chronos.py b/solvers/chronos.py
index 70bea1d..7dc6759 100644
--- a/solvers/chronos.py
+++ b/solvers/chronos.py
@@ -20,6 +20,7 @@
 from benchopt import BaseSolver
 
 from benchmark_utils.adapters.forecast_residual import ForecastResidualAdapter
+from benchmark_utils.inputs import ForecastInput
 
 
 SUPPORTED_TASKS = {"forecasting", "anomaly_detection"}
@@ -32,18 +33,18 @@
 class _ChronosForecaster:
     """Wraps ChronosPipeline with the batched series+cutoffs predict API."""
 
-    def __init__(self, pipeline):
+    def __init__(self, pipeline, prediction_length):
         self.pipeline = pipeline
+        self.prediction_length = prediction_length
 
-    def predict(self, x, cutoff_indexes, covariates, prediction_length):
-        del covariates
+    def predict(self, x: ForecastInput):
         import torch
 
         results = []
-        for series, cutoffs in zip(x, cutoff_indexes):
+        for series, cutoffs in zip(x.x, x.cutoff_indexes):
             series = np.asarray(series, dtype=np.float32)
             C = series.shape[1] if series.ndim == 2 else 1
-            out = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32)
+            out = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32)
             for k, cutoff in enumerate(cutoffs):
                 hist = series[:cutoff]
                 if hist.ndim == 1:
@@ -53,7 +54,7 @@ def predict(self, x, cutoff_indexes, covariates, prediction_length):
                     context = torch.from_numpy(hist[:, c]).unsqueeze(0)
                     forecast = self.pipeline.predict(
                         context,
-                        prediction_length=prediction_length,
+                        prediction_length=self.prediction_length,
                     )
                     f = forecast[0]
                     if f.ndim == 2:
@@ -117,14 +118,18 @@ def set_objective(self, X_train, y_train, task, **meta):
             self._loaded_model = model_id
 
     def run(self, _):
-        forecaster = _ChronosForecaster(self._pipeline)
+        pred_len = self.meta.get("prediction_length", 1)
+        forecaster = _ChronosForecaster(self._pipeline, pred_len)
 
         if self.task == "forecasting":
             self._adapter = forecaster
 
         elif self.task == "anomaly_detection":
+            # AD uses one-step-ahead forecasts; rebuild the forecaster
+            # with prediction_length=1 to match.
             self._adapter = ForecastResidualAdapter(
-                forecaster, prediction_length=1
+                _ChronosForecaster(self._pipeline, prediction_length=1),
+                prediction_length=1,
             )
 
     def get_result(self):
diff --git a/solvers/naive.py b/solvers/naive.py
index 4f5be19..83f21d6 100644
--- a/solvers/naive.py
+++ b/solvers/naive.py
@@ -12,6 +12,7 @@
 from benchopt import BaseSolver
 
 from benchmark_utils.adapters.base import BaseTSFMAdapter
+from benchmark_utils.inputs import ForecastInput
 
 
 # ---------------------------------------------------------------------------
@@ -21,22 +22,22 @@
 class _NaiveForecaster(BaseTSFMAdapter):
     """Repeat the last ``seasonality`` values to fill the horizon."""
 
-    def __init__(self, seasonality=1):
+    def __init__(self, prediction_length, seasonality=1):
+        self.prediction_length = prediction_length
         self.seasonality = seasonality
 
-    def predict(self, x, cutoff_indexes, covariates, prediction_length):
-        del covariates
+    def predict(self, x: ForecastInput):
         results = []
-        for series, cutoffs in zip(x, cutoff_indexes):
+        for series, cutoffs in zip(x.x, x.cutoff_indexes):
             series = np.asarray(series)
             C = series.shape[1] if series.ndim == 2 else 1
-            preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32)
+            preds = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32)
             for k, cutoff in enumerate(cutoffs):
                 hist = series[:cutoff]
                 season = min(self.seasonality, hist.shape[0])
                 pattern = hist[-season:]
-                reps = int(np.ceil(prediction_length / season))
-                preds[k] = np.tile(pattern, (reps, 1))[:prediction_length]
+                reps = int(np.ceil(self.prediction_length / season))
+                preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length]
             results.append(preds)
         return results
 
@@ -100,7 +101,10 @@ def set_objective(self, X_train, y_train, task, **meta):
 
     def run(self, _):
         if self.task == "forecasting":
-            self._adapter = _NaiveForecaster(self.seasonality)
+            self._adapter = _NaiveForecaster(
+                prediction_length=self.meta.get("prediction_length", 1),
+                seasonality=self.seasonality,
+            )
 
         elif self.task == "classification":
             self._adapter = _MajorityClassifier()
diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py
index 5b2c748..723551e 100644
--- a/solvers/seasonal_naive.py
+++ b/solvers/seasonal_naive.py
@@ -14,6 +14,7 @@
 from benchopt import BaseSolver
 
 from benchmark_utils.adapters.base import BaseTSFMAdapter
+from benchmark_utils.inputs import ForecastInput
 
 
 SUPPORTED_TASKS = {"forecasting"}
@@ -22,24 +23,24 @@
 class _SeasonalNaiveForecaster(BaseTSFMAdapter):
     """Repeat the last ``season_length`` observations to fill the horizon."""
 
-    def __init__(self, season_length: int):
+    def __init__(self, prediction_length: int, season_length: int):
         if season_length < 1:
             raise ValueError(f"season_length must be >= 1, got {season_length}")
+        self.prediction_length = prediction_length
         self.season_length = season_length
 
-    def predict(self, x, cutoff_indexes, covariates, prediction_length):
-        del covariates
+    def predict(self, x: ForecastInput):
         results = []
-        for series, cutoffs in zip(x, cutoff_indexes):
+        for series, cutoffs in zip(x.x, x.cutoff_indexes):
             series = np.asarray(series)
             C = series.shape[1] if series.ndim == 2 else 1
-            preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32)
+            preds = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32)
             for k, cutoff in enumerate(cutoffs):
                 hist = series[:cutoff]
                 season = min(self.season_length, hist.shape[0])
                 pattern = hist[-season:]
-                reps = int(np.ceil(prediction_length / season))
-                preds[k] = np.tile(pattern, (reps, 1))[:prediction_length]
+                reps = int(np.ceil(self.prediction_length / season))
+                preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length]
             results.append(preds)
         return results
 
@@ -78,7 +79,10 @@ def set_objective(self, X_train, y_train, task, **meta):
         self.meta = meta
 
     def run(self, _):
-        self._adapter = _SeasonalNaiveForecaster(self.season_length)
+        self._adapter = _SeasonalNaiveForecaster(
+            prediction_length=self.meta.get("prediction_length", 1),
+            season_length=self.season_length,
+        )
 
     def get_result(self):
         return {"model": self._adapter}
diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py
index 88980aa..d5e3f87 100644
--- a/solvers/tfc_api.py
+++ b/solvers/tfc_api.py
@@ -32,6 +32,7 @@
 from benchopt import BaseSolver
 
 from benchmark_utils.adapters.base import BaseTSFMAdapter
+from benchmark_utils.inputs import ForecastInput
 
 
 SUPPORTED_TASKS = {"forecasting"}
@@ -79,6 +80,7 @@ def __init__(
         self,
         client,
         model,
+        prediction_length: int,
         freq: str,
         context: Optional[int],
         quantiles: Optional[list[float]],
@@ -89,6 +91,7 @@ def __init__(
     ):
         self.client = client
         self.model = model  # TFCModels enum
+        self.prediction_length = prediction_length
         self.freq = _to_api_freq(freq)
         if quantiles is None:
             quantiles = [0.5]
@@ -101,19 +104,19 @@ def __init__(
         self.country_isocode = country_isocode
         self.batch_size = batch_size
 
-    def predict(self, x, cutoff_indexes, covariates, prediction_length):
-        # TODO: thread ``covariates`` (static/hist/future) through to the SDK
-        # once the benchmark datasets expose them. For now the dict is
-        # ignored — Monash datasets carry no covariates.
-        del covariates
+    def predict(self, x: ForecastInput):
+        # TODO: thread ``x.covariates`` (static/hist/future) through to the SDK
+        # once the benchmark datasets populate them. Monash currently
+        # carries none, so the dataclass arrives with empty sequences.
+        series_list, cutoff_indexes = x.x, x.cutoff_indexes
         pd_freq = _to_pandas_freq(self.freq)
 
-        offsets = _shared_offsets_from_end(x, cutoff_indexes)
+        offsets = _shared_offsets_from_end(series_list, cutoff_indexes)
         if getattr(self.model, "supports_batching", False) and offsets is not None:
-            return self._predict_batched(x, cutoff_indexes, prediction_length, pd_freq, offsets)
-        return self._predict_per_series(x, cutoff_indexes, prediction_length, pd_freq)
+            return self._predict_batched(series_list, cutoff_indexes, pd_freq, offsets)
+        return self._predict_per_series(series_list, cutoff_indexes, pd_freq)
 
-    def _predict_per_series(self, x, cutoff_indexes, prediction_length, pd_freq):
+    def _predict_per_series(self, x, cutoff_indexes, pd_freq):
         results = []
         for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)):
             series = np.asarray(series, dtype=np.float32)
@@ -136,7 +139,7 @@ def _predict_per_series(self, x, cutoff_indexes, prediction_length, pd_freq):
             forecast_df = self.client.cross_validate(
                 train_df,
                 model=self.model,
-                horizon=prediction_length,
+                horizon=self.prediction_length,
                 freq=self.freq,
                 fcds=fcds,
                 quantiles=self.quantiles,
@@ -148,12 +151,12 @@ def _predict_per_series(self, x, cutoff_indexes, prediction_length, pd_freq):
             )
 
             preds = self._gather_series_preds(
-                forecast_df, series_idx, C, cutoffs, fcds, prediction_length
+                forecast_df, series_idx, C, cutoffs, fcds
             )
             results.append(preds)
         return results
 
-    def _predict_batched(self, x, cutoff_indexes, prediction_length, pd_freq, offsets):
+    def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets):
         """One ``cross_validate`` call covering every series in ``x``.
 
         Series are aligned to share an end date so all cutoffs collapse to
@@ -189,7 +192,7 @@ def _predict_batched(self, x, cutoff_indexes, prediction_length, pd_freq, offset
         forecast_df = self.client.cross_validate(
             train_df,
             model=self.model,
-            horizon=prediction_length,
+            horizon=self.prediction_length,
             freq=self.freq,
             fcds=fcds,
             quantiles=self.quantiles,
@@ -204,12 +207,12 @@ def _predict_batched(self, x, cutoff_indexes, prediction_length, pd_freq, offset
         for series_idx, C, index, cutoffs in per_series_meta:
             series_fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs]
             preds = self._gather_series_preds(
-                forecast_df, series_idx, C, cutoffs, series_fcds, prediction_length
+                forecast_df, series_idx, C, cutoffs, series_fcds
             )
             results.append(preds)
         return results
 
-    def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds, prediction_length):
+    def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds):
         value_col = f"{self.model}_q0.5"
         if value_col not in forecast_df.columns:
             value_col = str(self.model)
@@ -218,11 +221,11 @@ def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds, predic
                 f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}"
             )
 
-        preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32)
+        preds = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32)
         for c in range(C):
             channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"]
             for k, fcd in enumerate(fcds):
-                window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(prediction_length)
+                window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(self.prediction_length)
                 preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32)
         return preds
 
@@ -293,6 +296,7 @@ def run(self, _):
         self._adapter = _TFCAPIForecaster(
             client=self._client,
             model=self._model_enum,
+            prediction_length=self.meta.get("prediction_length", 1),
             freq=self.meta.get("freq", "D"),
             context=self.context,
             quantiles=None,

From 1eced14cd1574ab519675472d7530af2d2c0eb25 Mon Sep 17 00:00:00 2001
From: Geoffrey Negiar <geoff@theforecastingcompany.com>
Date: Thu, 28 May 2026 17:05:56 +0200
Subject: [PATCH 05/12] ENH add quantile dimension to forecasting outputs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Forecasting predict() now returns ``Sequence[ForecastOutput]`` instead
of a list of raw point arrays. ``ForecastOutput`` is a frozen dataclass
holding:

- ``quantiles``: ndarray with shape ``(n_cutoffs, Q, prediction_length, C)``.
- ``quantile_levels``: tuple of floats in (0, 1), length Q.

Point forecasters (Naive, SeasonalNaive, Chronos) set
``quantile_levels=(0.5,)`` and Q=1. The TFC-API adapter now checks, for
each requested quantile level, whether the SDK returned a
``<model>_q{level}`` column and stacks the ones it finds into
``quantiles`` with the matching ``quantile_levels`` tuple — falling
back to the mean column (reported under level 0.5) when no quantile
columns are present.

``ForecastOutput.point`` returns the best point estimate for metric
computation: the median when present, otherwise the mean across
quantile levels. The objective uses that property in
``_eval_forecasting``.

The adapter contract in the ``base.py`` docstring is updated to match.
``ForecastResidualAdapter`` (``forecast_residual.py``) now extracts
``.point`` from the wrapped forecaster's output.

Verified on Monash[m1_yearly_dataset, debug=True]: Naive,
SeasonalNaive, TFC-API[chronos-2] and TFC-API[timesfm-2p5] all match
their previous metrics exactly, confirming the median extraction
preserves parity.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmark_utils/adapters/base.py              |  8 ++-
 benchmark_utils/adapters/forecast_residual.py |  5 +-
 benchmark_utils/outputs.py                    | 55 +++++++++++++++++++
 objective.py                                  | 16 +++---
 solvers/chronos.py                            |  6 +-
 solvers/naive.py                              |  6 +-
 solvers/seasonal_naive.py                     |  6 +-
 solvers/tfc_api.py                            | 46 ++++++++++------
 8 files changed, 118 insertions(+), 30 deletions(-)
 create mode 100644 benchmark_utils/outputs.py

diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py
index c4cab9a..3f57a98 100644
--- a/benchmark_utils/adapters/base.py
+++ b/benchmark_utils/adapters/base.py
@@ -7,11 +7,17 @@
 --------------------------
 forecasting:
 
-    predict(x: ForecastInput) -> list[np.ndarray (n_cutoffs_i, prediction_length, C)]
+    predict(x: ForecastInput) -> Sequence[ForecastOutput]
 
   :class:`~benchmark_utils.inputs.ForecastInput` bundles the per-series
   history list, the jagged per-series cutoff indexes, and a
   :class:`~benchmark_utils.covariates.Covariates` dataclass.
+
+  :class:`~benchmark_utils.outputs.ForecastOutput` carries the
+  quantile-resolved forecast — shape
+  ``(n_cutoffs, Q, prediction_length, C)`` plus the matching quantile
+  levels. Point forecasters set ``quantile_levels=(0.5,)`` and Q=1.
+
   ``prediction_length`` is dataset-level — the solver reads it from the
   objective and wires it into the adapter at construction time.
 
diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py
index cbe7141..89623c2 100644
--- a/benchmark_utils/adapters/forecast_residual.py
+++ b/benchmark_utils/adapters/forecast_residual.py
@@ -55,9 +55,10 @@ def predict(self, x: np.ndarray) -> np.ndarray:
 
         from benchmark_utils.inputs import ForecastInput
         try:
-            preds = self.forecaster.predict(
+            output = self.forecaster.predict(
                 ForecastInput(x=[x], cutoff_indexes=[cutoffs])
-            )[0]  # (n_cutoffs, H, C)
+            )[0]
+            preds = output.point  # (n_cutoffs, H, C)
         except Exception:
             return scores
 
diff --git a/benchmark_utils/outputs.py b/benchmark_utils/outputs.py
new file mode 100644
index 0000000..67f4187
--- /dev/null
+++ b/benchmark_utils/outputs.py
@@ -0,0 +1,55 @@
+"""Typed outputs returned by forecasting adapters.
+
+Forecasting predict() returns ``Sequence[ForecastOutput]`` — one entry
+per input series. Each ``ForecastOutput`` carries a quantile-resolved
+forecast with shape ``(n_cutoffs, Q, prediction_length, C)`` plus the
+quantile levels themselves. Point forecasters set ``quantile_levels =
+(0.5,)`` and Q=1; probabilistic forecasters can return as many quantiles
+as their model produces.
+"""
+
+from dataclasses import dataclass
+from typing import Sequence
+
+import numpy as np
+
+
+@dataclass(frozen=True)
+class ForecastOutput:
+    """Per-series forecast.
+
+    Attributes
+    ----------
+    quantiles : np.ndarray
+        Shape ``(n_cutoffs, Q, prediction_length, C)``. ``quantiles[k, q]``
+        is the forecast for the k-th cutoff at quantile level
+        ``quantile_levels[q]``.
+    quantile_levels : sequence of float
+        Length ``Q``. Each entry is a quantile level in (0, 1).
+    """
+
+    quantiles: np.ndarray
+    quantile_levels: Sequence[float]
+
+    def __post_init__(self):
+        if self.quantiles.ndim != 4:
+            raise ValueError(
+                f"quantiles must have ndim=4 (n_cutoffs, Q, prediction_length, C); "
+                f"got shape {self.quantiles.shape}"
+            )
+        if self.quantiles.shape[1] != len(self.quantile_levels):
+            raise ValueError(
+                f"quantiles.shape[1] ({self.quantiles.shape[1]}) must equal "
+                f"len(quantile_levels) ({len(self.quantile_levels)})"
+            )
+
+    @property
+    def point(self) -> np.ndarray:
+        """Best point estimate — median when available, else mean over quantiles.
+
+        Shape: ``(n_cutoffs, prediction_length, C)``.
+        """
+        levels = list(self.quantile_levels)
+        if 0.5 in levels:
+            return self.quantiles[:, levels.index(0.5), :, :]
+        return self.quantiles.mean(axis=1)
diff --git a/objective.py b/objective.py
index 169219b..adece87 100644
--- a/objective.py
+++ b/objective.py
@@ -114,7 +114,7 @@ def evaluate_result(self, model):
     def _eval_forecasting(self, model):
         from benchmark_utils.inputs import ForecastInput
 
-        preds_per_series = model.predict(
+        outputs_per_series = model.predict(
             ForecastInput(
                 x=self.X_test,
                 cutoff_indexes=self.cutoff_indexes,
@@ -123,9 +123,9 @@ def _eval_forecasting(self, model):
         )
 
         preds, targets = [], []
-        for series_preds, series_targets in zip(preds_per_series, self.y_test):
-            sp = np.asarray(series_preds)  # (n_cutoffs, H, C)
-            st = np.asarray(series_targets)  # (n_cutoffs, H, C)
+        for series_output, series_targets in zip(outputs_per_series, self.y_test):
+            sp = np.asarray(series_output.point)  # (n_cutoffs, H, C)
+            st = np.asarray(series_targets)
             for k in range(sp.shape[0]):
                 preds.append(sp[k])
                 targets.append(st[k])
@@ -172,6 +172,7 @@ def _eval_anomaly_detection(self, model):
     def get_one_result(self):
         """Return a minimal valid result for benchopt's internal checks."""
         from benchmark_utils.adapters.base import BaseTSFMAdapter
+        from benchmark_utils.outputs import ForecastOutput
 
         class _ConstantAdapter(BaseTSFMAdapter):
             def __init__(self, task, prediction_length):
@@ -181,11 +182,12 @@ def __init__(self, task, prediction_length):
             def predict(self, x):
                 if self._task == "forecasting":
                     H = self._prediction_length
-                    preds = []
+                    outs = []
                     for series, cutoffs in zip(x.x, x.cutoff_indexes):
                         C = series.shape[1] if series.ndim == 2 else 1
-                        preds.append(np.zeros((len(cutoffs), H, C), dtype=np.float32))
-                    return preds
+                        q = np.zeros((len(cutoffs), 1, H, C), dtype=np.float32)
+                        outs.append(ForecastOutput(quantiles=q, quantile_levels=(0.5,)))
+                    return outs
                 elif self._task == "classification":
                     return np.zeros(len(x), dtype=np.int64)
                 elif self._task == "anomaly_detection":
diff --git a/solvers/chronos.py b/solvers/chronos.py
index 7dc6759..7ac830a 100644
--- a/solvers/chronos.py
+++ b/solvers/chronos.py
@@ -21,6 +21,7 @@
 
 from benchmark_utils.adapters.forecast_residual import ForecastResidualAdapter
 from benchmark_utils.inputs import ForecastInput
+from benchmark_utils.outputs import ForecastOutput
 
 
 SUPPORTED_TASKS = {"forecasting", "anomaly_detection"}
@@ -60,7 +61,10 @@ def predict(self, x: ForecastInput):
                     if f.ndim == 2:
                         f = f.median(dim=0).values
                     out[k, :, c] = f.numpy()
-            results.append(out)
+            results.append(ForecastOutput(
+                quantiles=out[:, None, :, :],
+                quantile_levels=(0.5,),
+            ))
         return results
 
 
diff --git a/solvers/naive.py b/solvers/naive.py
index 83f21d6..7603427 100644
--- a/solvers/naive.py
+++ b/solvers/naive.py
@@ -13,6 +13,7 @@
 
 from benchmark_utils.adapters.base import BaseTSFMAdapter
 from benchmark_utils.inputs import ForecastInput
+from benchmark_utils.outputs import ForecastOutput
 
 
 # ---------------------------------------------------------------------------
@@ -38,7 +39,10 @@ def predict(self, x: ForecastInput):
                 pattern = hist[-season:]
                 reps = int(np.ceil(self.prediction_length / season))
                 preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length]
-            results.append(preds)
+            results.append(ForecastOutput(
+                quantiles=preds[:, None, :, :],  # (n_cutoffs, 1, H, C)
+                quantile_levels=(0.5,),
+            ))
         return results
 
 
diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py
index 723551e..2adf1ec 100644
--- a/solvers/seasonal_naive.py
+++ b/solvers/seasonal_naive.py
@@ -15,6 +15,7 @@
 
 from benchmark_utils.adapters.base import BaseTSFMAdapter
 from benchmark_utils.inputs import ForecastInput
+from benchmark_utils.outputs import ForecastOutput
 
 
 SUPPORTED_TASKS = {"forecasting"}
@@ -41,7 +42,10 @@ def predict(self, x: ForecastInput):
                 pattern = hist[-season:]
                 reps = int(np.ceil(self.prediction_length / season))
                 preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length]
-            results.append(preds)
+            results.append(ForecastOutput(
+                quantiles=preds[:, None, :, :],
+                quantile_levels=(0.5,),
+            ))
         return results
 
 
diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py
index d5e3f87..9083d18 100644
--- a/solvers/tfc_api.py
+++ b/solvers/tfc_api.py
@@ -33,6 +33,7 @@
 
 from benchmark_utils.adapters.base import BaseTSFMAdapter
 from benchmark_utils.inputs import ForecastInput
+from benchmark_utils.outputs import ForecastOutput
 
 
 SUPPORTED_TASKS = {"forecasting"}
@@ -150,10 +151,9 @@ def _predict_per_series(self, x, cutoff_indexes, pd_freq):
                 batch_size=self.batch_size,
             )
 
-            preds = self._gather_series_preds(
+            results.append(self._gather_series_output(
                 forecast_df, series_idx, C, cutoffs, fcds
-            )
-            results.append(preds)
+            ))
         return results
 
     def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets):
@@ -206,28 +206,40 @@ def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets):
         results = []
         for series_idx, C, index, cutoffs in per_series_meta:
             series_fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs]
-            preds = self._gather_series_preds(
+            results.append(self._gather_series_output(
                 forecast_df, series_idx, C, cutoffs, series_fcds
-            )
-            results.append(preds)
+            ))
         return results
 
-    def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds):
-        value_col = f"{self.model}_q0.5"
-        if value_col not in forecast_df.columns:
-            value_col = str(self.model)
-        if value_col not in forecast_df.columns:
-            raise ValueError(
-                f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}"
-            )
+    def _gather_series_output(self, forecast_df, series_idx, C, cutoffs, fcds):
+        # Discover which quantile columns the SDK returned; fall back to
+        # the mean column only when no quantiles are present.
+        levels, quantile_cols = [], []
+        for q in self.quantiles:
+            col = f"{self.model}_q{q}"
+            if col in forecast_df.columns:
+                levels.append(q)
+                quantile_cols.append(col)
+        if not quantile_cols:
+            mean_col = str(self.model)
+            if mean_col not in forecast_df.columns:
+                raise ValueError(
+                    f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}"
+                )
+            levels = [0.5]
+            quantile_cols = [mean_col]
 
-        preds = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32)
+        Q = len(levels)
+        preds = np.empty(
+            (len(cutoffs), Q, self.prediction_length, C), dtype=np.float32
+        )
         for c in range(C):
             channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"]
             for k, fcd in enumerate(fcds):
                 window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(self.prediction_length)
-                preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32)
-        return preds
+                for q_idx, col in enumerate(quantile_cols):
+                    preds[k, q_idx, :, c] = window[col].to_numpy(dtype=np.float32)
+        return ForecastOutput(quantiles=preds, quantile_levels=tuple(levels))
 
 
 class Solver(BaseSolver):

From 01381469090e9104fbf05ae136bd40e10f07df86 Mon Sep 17 00:00:00 2001
From: Geoffrey Negiar <geoff@theforecastingcompany.com>
Date: Thu, 28 May 2026 17:30:44 +0200
Subject: [PATCH 06/12] REFACTOR ForecastOutput is a single object + Chronos-2
 batched local inference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ``ForecastOutput`` is now a single dataclass per ``predict()`` call,
  not a per-series sequence. Its ``quantiles`` field is a
  ``Sequence[np.ndarray]`` aligned with the input series, each entry
  shape ``(n_cutoffs_i, Q, prediction_length, C)``. The ``quantile_levels``
  tuple is shared across the batch. ``.point`` returns one ndarray per
  series.
- Adapter signature is now ``predict(self, x: ForecastInput) -> ForecastOutput``,
  with that return type explicit on every forecasting predict() in the
  codebase.
- The local Chronos solver is now Chronos-2 (matching the upstream
  migration on origin/main). The forecaster batches every (series,
  cutoff) pair into one ``Chronos2Pipeline.predict`` call — variable
  context lengths handled by the pipeline's left-padding — and returns
  the model's full 9-level quantile fan.
- Updated all forecasting solvers + the constant adapter in
  ``get_one_result`` + ``ForecastResidualAdapter`` to the new contract.

Parity verified on Monash[m1_yearly_dataset, debug=True]: Naive,
SeasonalNaive, TFC-API[chronos-2], TFC-API[timesfm-2p5] match their
prior metrics exactly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmark_utils/adapters/base.py              |  11 +-
 benchmark_utils/adapters/forecast_residual.py |   4 +-
 benchmark_utils/outputs.py                    |  62 ++++----
 objective.py                                  |  13 +-
 solvers/chronos.py                            | 134 +++++++++---------
 solvers/naive.py                              |  11 +-
 solvers/seasonal_naive.py                     |  11 +-
 solvers/tfc_api.py                            |  36 +++--
 8 files changed, 143 insertions(+), 139 deletions(-)

diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py
index 3f57a98..f7e2b69 100644
--- a/benchmark_utils/adapters/base.py
+++ b/benchmark_utils/adapters/base.py
@@ -7,16 +7,17 @@
 --------------------------
 forecasting:
 
-    predict(x: ForecastInput) -> Sequence[ForecastOutput]
+    predict(x: ForecastInput) -> ForecastOutput
 
   :class:`~benchmark_utils.inputs.ForecastInput` bundles the per-series
   history list, the jagged per-series cutoff indexes, and a
   :class:`~benchmark_utils.covariates.Covariates` dataclass.
 
-  :class:`~benchmark_utils.outputs.ForecastOutput` carries the
-  quantile-resolved forecast — shape
-  ``(n_cutoffs, Q, prediction_length, C)`` plus the matching quantile
-  levels. Point forecasters set ``quantile_levels=(0.5,)`` and Q=1.
+  :class:`~benchmark_utils.outputs.ForecastOutput` is a single object
+  covering every input series — its ``quantiles`` field is a Sequence
+  of ``(n_cutoffs_i, Q, prediction_length, C)`` arrays, one per series,
+  with a shared ``quantile_levels`` tuple. Point forecasters set
+  ``quantile_levels=(0.5,)`` and Q=1.
 
   ``prediction_length`` is dataset-level — the solver reads it from the
   objective and wires it into the adapter at construction time.
diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py
index 89623c2..ec463fd 100644
--- a/benchmark_utils/adapters/forecast_residual.py
+++ b/benchmark_utils/adapters/forecast_residual.py
@@ -57,8 +57,8 @@ def predict(self, x: np.ndarray) -> np.ndarray:
         try:
             output = self.forecaster.predict(
                 ForecastInput(x=[x], cutoff_indexes=[cutoffs])
-            )[0]
-            preds = output.point  # (n_cutoffs, H, C)
+            )
+            preds = output.point[0]  # (n_cutoffs, H, C)
         except Exception:
             return scores
 
diff --git a/benchmark_utils/outputs.py b/benchmark_utils/outputs.py
index 67f4187..cf9fba9 100644
--- a/benchmark_utils/outputs.py
+++ b/benchmark_utils/outputs.py
@@ -1,11 +1,10 @@
-"""Typed outputs returned by forecasting adapters.
-
-Forecasting predict() returns ``Sequence[ForecastOutput]`` — one entry
-per input series. Each ``ForecastOutput`` carries a quantile-resolved
-forecast with shape ``(n_cutoffs, Q, prediction_length, C)`` plus the
-quantile levels themselves. Point forecasters set ``quantile_levels =
-(0.5,)`` and Q=1; probabilistic forecasters can return as many quantiles
-as their model produces.
+"""Typed output returned by forecasting adapters.
+
+Forecasting predict() returns a single :class:`ForecastOutput` covering
+every input series in the matching :class:`ForecastInput`. The output is
+shape-aware: ``quantiles[i]`` is the per-series ndarray
+``(n_cutoffs_i, Q, prediction_length, C)``, aligned with the same index
+order as the input ``x``.
 """
 
 from dataclasses import dataclass
@@ -16,40 +15,45 @@
 
 @dataclass(frozen=True)
 class ForecastOutput:
-    """Per-series forecast.
+    """Quantile-resolved forecast for a batch of series.
 
     Attributes
     ----------
-    quantiles : np.ndarray
-        Shape ``(n_cutoffs, Q, prediction_length, C)``. ``quantiles[k, q]``
-        is the forecast for the k-th cutoff at quantile level
+    quantiles : sequence of np.ndarray
+        One ndarray per series, each shape
+        ``(n_cutoffs_i, Q, prediction_length, C)``. ``quantiles[i][k, q]``
+        is the forecast for series ``i``, cutoff ``k``, at quantile level
         ``quantile_levels[q]``.
     quantile_levels : sequence of float
-        Length ``Q``. Each entry is a quantile level in (0, 1).
+        Length ``Q``. Each entry is a quantile level in (0, 1). The same
+        ``Q`` applies to every series in the batch.
     """
 
-    quantiles: np.ndarray
+    quantiles: Sequence[np.ndarray]
     quantile_levels: Sequence[float]
 
     def __post_init__(self):
-        if self.quantiles.ndim != 4:
-            raise ValueError(
-                f"quantiles must have ndim=4 (n_cutoffs, Q, prediction_length, C); "
-                f"got shape {self.quantiles.shape}"
-            )
-        if self.quantiles.shape[1] != len(self.quantile_levels):
-            raise ValueError(
-                f"quantiles.shape[1] ({self.quantiles.shape[1]}) must equal "
-                f"len(quantile_levels) ({len(self.quantile_levels)})"
-            )
+        Q = len(self.quantile_levels)
+        for i, arr in enumerate(self.quantiles):
+            if arr.ndim != 4:
+                raise ValueError(
+                    f"quantiles[{i}] must have ndim=4 "
+                    f"(n_cutoffs, Q, prediction_length, C); got shape {arr.shape}"
+                )
+            if arr.shape[1] != Q:
+                raise ValueError(
+                    f"quantiles[{i}].shape[1] ({arr.shape[1]}) must equal "
+                    f"len(quantile_levels) ({Q})"
+                )
 
     @property
-    def point(self) -> np.ndarray:
-        """Best point estimate — median when available, else mean over quantiles.
+    def point(self) -> Sequence[np.ndarray]:
+        """Best point estimate per series — median when available, else mean across quantiles.
 
-        Shape: ``(n_cutoffs, prediction_length, C)``.
+        Each entry has shape ``(n_cutoffs_i, prediction_length, C)``.
         """
         levels = list(self.quantile_levels)
         if 0.5 in levels:
-            return self.quantiles[:, levels.index(0.5), :, :]
-        return self.quantiles.mean(axis=1)
+            idx = levels.index(0.5)
+            return [arr[:, idx, :, :] for arr in self.quantiles]
+        return [arr.mean(axis=1) for arr in self.quantiles]
diff --git a/objective.py b/objective.py
index adece87..9c6b2de 100644
--- a/objective.py
+++ b/objective.py
@@ -114,7 +114,7 @@ def evaluate_result(self, model):
     def _eval_forecasting(self, model):
         from benchmark_utils.inputs import ForecastInput
 
-        outputs_per_series = model.predict(
+        output = model.predict(
             ForecastInput(
                 x=self.X_test,
                 cutoff_indexes=self.cutoff_indexes,
@@ -123,8 +123,8 @@ def _eval_forecasting(self, model):
         )
 
         preds, targets = [], []
-        for series_output, series_targets in zip(outputs_per_series, self.y_test):
-            sp = np.asarray(series_output.point)  # (n_cutoffs, H, C)
+        for series_point, series_targets in zip(output.point, self.y_test):
+            sp = np.asarray(series_point)  # (n_cutoffs, H, C)
             st = np.asarray(series_targets)
             for k in range(sp.shape[0]):
                 preds.append(sp[k])
@@ -182,12 +182,11 @@ def __init__(self, task, prediction_length):
             def predict(self, x):
                 if self._task == "forecasting":
                     H = self._prediction_length
-                    outs = []
+                    qs = []
                     for series, cutoffs in zip(x.x, x.cutoff_indexes):
                         C = series.shape[1] if series.ndim == 2 else 1
-                        q = np.zeros((len(cutoffs), 1, H, C), dtype=np.float32)
-                        outs.append(ForecastOutput(quantiles=q, quantile_levels=(0.5,)))
-                    return outs
+                        qs.append(np.zeros((len(cutoffs), 1, H, C), dtype=np.float32))
+                    return ForecastOutput(quantiles=qs, quantile_levels=(0.5,))
                 elif self._task == "classification":
                     return np.zeros(len(x), dtype=np.int64)
                 elif self._task == "anomaly_detection":
diff --git a/solvers/chronos.py b/solvers/chronos.py
index 7ac830a..4a32a04 100644
--- a/solvers/chronos.py
+++ b/solvers/chronos.py
@@ -1,24 +1,23 @@
-"""Chronos solver for the TSFM benchmark.
+"""Chronos-2 solver for the TSFM benchmark (local inference).
 
 Supports:
-  - forecasting        : zero-shot via ChronosPipeline
-  - anomaly_detection  : forecast-residual (zero-shot)
+  - forecasting        : zero-shot via ``Chronos2Pipeline``
+  - anomaly_detection  : forecast-residual on top of the same forecaster
 
 Classification is not yet implemented; the solver skips that task.
 
-Model loading is done in ``set_objective`` (untimed).
-Adaptation fitting is done in ``run`` (timed).
-
-Adding a new task
------------------
-1. Add the task name to ``SUPPORTED_TASKS``.
-2. In ``run``, instantiate the appropriate adapter from
-   ``benchmark_utils.adapters`` (or implement a new one there).
+Model loading is done in ``set_objective`` (untimed). Inference batches
+every (series, cutoff) pair into a single ``Chronos2Pipeline.predict``
+call — the pipeline accepts a list of variable-length tensors and
+applies left-padding internally, so all the per-cutoff work happens in
+one forward pass.
 """
 
 import numpy as np
+import torch
 from benchopt import BaseSolver
 
+from benchmark_utils.adapters.base import BaseTSFMAdapter
 from benchmark_utils.adapters.forecast_residual import ForecastResidualAdapter
 from benchmark_utils.inputs import ForecastInput
 from benchmark_utils.outputs import ForecastOutput
@@ -27,67 +26,66 @@
 SUPPORTED_TASKS = {"forecasting", "anomaly_detection"}
 
 
-# ---------------------------------------------------------------------------
-# Thin wrapper exposing the predict() interface expected by the objective
-# ---------------------------------------------------------------------------
-
-class _ChronosForecaster:
-    """Wraps ChronosPipeline with the batched series+cutoffs predict API."""
+class _ChronosForecaster(BaseTSFMAdapter):
+    """Batched Chronos-2 adapter returning a full quantile fan."""
 
     def __init__(self, pipeline, prediction_length):
         self.pipeline = pipeline
         self.prediction_length = prediction_length
+        self.quantile_levels = tuple(float(q) for q in pipeline.quantiles)
 
-    def predict(self, x: ForecastInput):
-        import torch
-
-        results = []
-        for series, cutoffs in zip(x.x, x.cutoff_indexes):
+    def predict(self, x: ForecastInput) -> ForecastOutput:
+        inputs = []
+        layout = []              # (series_idx, cutoff_idx) per input element
+        per_series_shape = []    # (C, n_cutoffs) per series
+        for series_idx, (series, cutoffs) in enumerate(zip(x.x, x.cutoff_indexes)):
             series = np.asarray(series, dtype=np.float32)
-            C = series.shape[1] if series.ndim == 2 else 1
-            out = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32)
-            for k, cutoff in enumerate(cutoffs):
-                hist = series[:cutoff]
-                if hist.ndim == 1:
-                    hist = hist[:, None]
-                # Chronos expects (batch, time) — one channel at a time.
-                for c in range(C):
-                    context = torch.from_numpy(hist[:, c]).unsqueeze(0)
-                    forecast = self.pipeline.predict(
-                        context,
-                        prediction_length=self.prediction_length,
-                    )
-                    f = forecast[0]
-                    if f.ndim == 2:
-                        f = f.median(dim=0).values
-                    out[k, :, c] = f.numpy()
-            results.append(ForecastOutput(
-                quantiles=out[:, None, :, :],
-                quantile_levels=(0.5,),
-            ))
-        return results
-
-
-# ---------------------------------------------------------------------------
-# Solver
-# ---------------------------------------------------------------------------
+            if series.ndim == 1:
+                series = series[:, None]
+            _, C = series.shape
+            per_series_shape.append((C, len(cutoffs)))
+            for cutoff_idx, cutoff in enumerate(cutoffs):
+                hist = series[:cutoff]                   # (T_cutoff, C)
+                inputs.append(torch.from_numpy(hist.T))  # (C, T_cutoff)
+                layout.append((series_idx, cutoff_idx))
+
+        if not inputs:
+            return ForecastOutput(quantiles=[], quantile_levels=self.quantile_levels)
+
+        with torch.no_grad():
+            forecast = self.pipeline.predict(
+                inputs,
+                prediction_length=self.prediction_length,
+            )
+        # forecast: list[(n_variates, Q, prediction_length)] aligned with `inputs`.
+
+        Q = len(self.quantile_levels)
+        per_series = [
+            np.empty((n_cutoffs, Q, self.prediction_length, C), dtype=np.float32)
+            for C, n_cutoffs in per_series_shape
+        ]
+        for (series_idx, cutoff_idx), pred in zip(layout, forecast):
+            arr = pred.float().cpu().numpy()                # (C, Q, H)
+            per_series[series_idx][cutoff_idx] = arr.transpose(1, 2, 0)
+        return ForecastOutput(quantiles=per_series, quantile_levels=self.quantile_levels)
+
 
 class Solver(BaseSolver):
-    """Chronos zero-shot solver.
+    """Chronos-2 zero-shot solver.
 
     Parameters
     ----------
     model_size : str
-        Chronos model variant: "tiny", "mini", "small", "base", "large".
+        Chronos-2 variant suffix used in ``autogluon/chronos-2-{model_size}``.
     task_adaptation : str
-        How to use Chronos for each task:
-          "zeroshot"          — direct forecasting API (forecasting only)
-          "forecast_residual" — anomaly score = forecast error (AD only)
+        Per-task usage of the forecaster:
+          ``"zeroshot"``          — direct forecasting (forecasting only)
+          ``"forecast_residual"`` — anomaly score = forecast error (AD only)
     """
 
     name = "Chronos"
 
-    requirements = ["pip::chronos-forecasting>=1.4", "pip::torch"]
+    requirements = ["pip::chronos-forecasting>=2.0", "pip::torch"]
 
     sampling_strategy = "run_once"
 
@@ -101,36 +99,32 @@ def skip(self, task, **kwargs):
             return True, f"Chronos solver does not support task={task!r}"
         return False, None
 
-    # ------------------------------------------------------------------
-
     def set_objective(self, X_train, y_train, task, **meta):
-        import torch
-        from chronos import ChronosPipeline
+        from chronos import Chronos2Pipeline
 
         self.task = task
         self.X_train = X_train
         self.meta = meta
 
-        # Load model once; reuse across consecutive dataset configs.
-        model_id = f"amazon/chronos-t5-{self.model_size}"
+        # bfloat16 is fine on CUDA but poorly supported on CPU / MPS;
+        # fall back to float32 there so inference doesn't crash or stall.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        dtype = torch.bfloat16 if device == "cuda" else torch.float32
+        model_id = f"autogluon/chronos-2-{self.model_size}"
         if not hasattr(self, "_pipeline") or self._loaded_model != model_id:
-            self._pipeline = ChronosPipeline.from_pretrained(
+            self._pipeline = Chronos2Pipeline.from_pretrained(
                 model_id,
-                device_map="auto",
-                torch_dtype=torch.bfloat16,
+                device_map=device,
+                dtype=dtype,
             )
             self._loaded_model = model_id
 
     def run(self, _):
         pred_len = self.meta.get("prediction_length", 1)
-        forecaster = _ChronosForecaster(self._pipeline, pred_len)
-
         if self.task == "forecasting":
-            self._adapter = forecaster
-
+            self._adapter = _ChronosForecaster(self._pipeline, pred_len)
         elif self.task == "anomaly_detection":
-            # AD uses one-step-ahead forecasts; rebuild the forecaster
-            # with prediction_length=1 to match.
+            # AD uses one-step-ahead forecasts.
             self._adapter = ForecastResidualAdapter(
                 _ChronosForecaster(self._pipeline, prediction_length=1),
                 prediction_length=1,
diff --git a/solvers/naive.py b/solvers/naive.py
index 7603427..f1887cd 100644
--- a/solvers/naive.py
+++ b/solvers/naive.py
@@ -27,8 +27,8 @@ def __init__(self, prediction_length, seasonality=1):
         self.prediction_length = prediction_length
         self.seasonality = seasonality
 
-    def predict(self, x: ForecastInput):
-        results = []
+    def predict(self, x: ForecastInput) -> ForecastOutput:
+        quantiles = []
         for series, cutoffs in zip(x.x, x.cutoff_indexes):
             series = np.asarray(series)
             C = series.shape[1] if series.ndim == 2 else 1
@@ -39,11 +39,8 @@ def predict(self, x: ForecastInput):
                 pattern = hist[-season:]
                 reps = int(np.ceil(self.prediction_length / season))
                 preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length]
-            results.append(ForecastOutput(
-                quantiles=preds[:, None, :, :],  # (n_cutoffs, 1, H, C)
-                quantile_levels=(0.5,),
-            ))
-        return results
+            quantiles.append(preds[:, None, :, :])  # (n_cutoffs, 1, H, C)
+        return ForecastOutput(quantiles=quantiles, quantile_levels=(0.5,))
 
 
 class _MajorityClassifier(BaseTSFMAdapter):
diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py
index 2adf1ec..176691d 100644
--- a/solvers/seasonal_naive.py
+++ b/solvers/seasonal_naive.py
@@ -30,8 +30,8 @@ def __init__(self, prediction_length: int, season_length: int):
         self.prediction_length = prediction_length
         self.season_length = season_length
 
-    def predict(self, x: ForecastInput):
-        results = []
+    def predict(self, x: ForecastInput) -> ForecastOutput:
+        quantiles = []
         for series, cutoffs in zip(x.x, x.cutoff_indexes):
             series = np.asarray(series)
             C = series.shape[1] if series.ndim == 2 else 1
@@ -42,11 +42,8 @@ def predict(self, x: ForecastInput):
                 pattern = hist[-season:]
                 reps = int(np.ceil(self.prediction_length / season))
                 preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length]
-            results.append(ForecastOutput(
-                quantiles=preds[:, None, :, :],
-                quantile_levels=(0.5,),
-            ))
-        return results
+            quantiles.append(preds[:, None, :, :])
+        return ForecastOutput(quantiles=quantiles, quantile_levels=(0.5,))
 
 
 class Solver(BaseSolver):
diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py
index 9083d18..a507c9c 100644
--- a/solvers/tfc_api.py
+++ b/solvers/tfc_api.py
@@ -105,7 +105,7 @@ def __init__(
         self.country_isocode = country_isocode
         self.batch_size = batch_size
 
-    def predict(self, x: ForecastInput):
+    def predict(self, x: ForecastInput) -> ForecastOutput:
         # TODO: thread ``x.covariates`` (static/hist/future) through to the SDK
         # once the benchmark datasets populate them. Monash currently
         # carries none, so the dataclass arrives with empty sequences.
@@ -114,11 +114,18 @@ def predict(self, x: ForecastInput):
 
         offsets = _shared_offsets_from_end(series_list, cutoff_indexes)
         if getattr(self.model, "supports_batching", False) and offsets is not None:
-            return self._predict_batched(series_list, cutoff_indexes, pd_freq, offsets)
-        return self._predict_per_series(series_list, cutoff_indexes, pd_freq)
+            per_series, levels = self._predict_batched(
+                series_list, cutoff_indexes, pd_freq, offsets
+            )
+        else:
+            per_series, levels = self._predict_per_series(
+                series_list, cutoff_indexes, pd_freq
+            )
+        return ForecastOutput(quantiles=per_series, quantile_levels=levels)
 
     def _predict_per_series(self, x, cutoff_indexes, pd_freq):
-        results = []
+        per_series = []
+        levels = None
         for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)):
             series = np.asarray(series, dtype=np.float32)
             if series.ndim == 1:
@@ -151,10 +158,12 @@ def _predict_per_series(self, x, cutoff_indexes, pd_freq):
                 batch_size=self.batch_size,
             )
 
-            results.append(self._gather_series_output(
+            arr, series_levels = self._gather_series_output(
                 forecast_df, series_idx, C, cutoffs, fcds
-            ))
-        return results
+            )
+            per_series.append(arr)
+            levels = series_levels
+        return per_series, (levels if levels is not None else (0.5,))
 
     def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets):
         """One ``cross_validate`` call covering every series in ``x``.
@@ -203,13 +212,16 @@ def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets):
             batch_size=self.batch_size,
         )
 
-        results = []
+        per_series = []
+        levels = None
         for series_idx, C, index, cutoffs in per_series_meta:
             series_fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs]
-            results.append(self._gather_series_output(
+            arr, series_levels = self._gather_series_output(
                 forecast_df, series_idx, C, cutoffs, series_fcds
-            ))
-        return results
+            )
+            per_series.append(arr)
+            levels = series_levels
+        return per_series, (levels if levels is not None else (0.5,))
 
     def _gather_series_output(self, forecast_df, series_idx, C, cutoffs, fcds):
         # Discover which quantile columns the SDK returned; fall back to
@@ -239,7 +251,7 @@ def _gather_series_output(self, forecast_df, series_idx, C, cutoffs, fcds):
                 window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(self.prediction_length)
                 for q_idx, col in enumerate(quantile_cols):
                     preds[k, q_idx, :, c] = window[col].to_numpy(dtype=np.float32)
-        return ForecastOutput(quantiles=preds, quantile_levels=tuple(levels))
+        return preds, tuple(levels)
 
 
 class Solver(BaseSolver):

From 3ea97dbf8eddb80c0f206dd3bef5e57328924ade Mon Sep 17 00:00:00 2001
From: Geoffrey Negiar <geoff@theforecastingcompany.com>
Date: Thu, 28 May 2026 17:39:11 +0200
Subject: [PATCH 07/12] FIX tighten chronos-forecasting pin; drop redundant
 torch dep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ``>=2.0`` was too loose: ``Chronos2Pipeline.predict`` with a
  variable-length list of tensors and the ``pipeline.quantiles``
  attribute stabilized in 2.2.x (the version verified end-to-end here).
  Switch to ``>=2.2,<3`` so we test what we ship and a future major
  bump can't silently break the contract.
- Drop ``pip::torch`` — ``chronos-forecasting`` already pins
  ``torch<3,>=2.2`` transitively, so listing it again is dead weight.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 solvers/chronos.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/solvers/chronos.py b/solvers/chronos.py
index 4a32a04..417223c 100644
--- a/solvers/chronos.py
+++ b/solvers/chronos.py
@@ -85,7 +85,7 @@ class Solver(BaseSolver):
 
     name = "Chronos"
 
-    requirements = ["pip::chronos-forecasting>=2.0", "pip::torch"]
+    requirements = ["pip::chronos-forecasting>=2.2,<3"]
 
     sampling_strategy = "run_once"
 

From 1829e414e4f2bd7ffb089a557ec09c83ed6f6a5b Mon Sep 17 00:00:00 2001
From: Eduardo Montesuma <edumontesuma@gmail.com>
Date: Fri, 29 May 2026 09:01:12 +0000
Subject: [PATCH 08/12] feat: move constants to a dedicated file

---
 benchmark_utils/constants.py | 119 +++++++++++++++++++++++++++++++++++
 datasets/monash.py           |  25 +-------
 2 files changed, 122 insertions(+), 22 deletions(-)
 create mode 100644 benchmark_utils/constants.py

diff --git a/benchmark_utils/constants.py b/benchmark_utils/constants.py
new file mode 100644
index 0000000..b9b8f45
--- /dev/null
+++ b/benchmark_utils/constants.py
@@ -0,0 +1,119 @@
+"""Shared frequency / seasonality tables for forecasting datasets.
+
+Two sources name frequencies differently:
+  - aeon (used by Monash) uses words: "yearly", "weekly", "minutely", ...
+  - GIFT-Eval (and pandas) use offset aliases: "Y", "W-SUN", "5T", ...
+
+This module exposes a single canonical (freq, seasonality) lookup keyed on
+the canonical pandas-style base alias (e.g. "Y", "W", "D"), plus two
+adapters that normalize each source onto that canonical key.
+"""
+
+import re
+
+# Canonical base alias → (display_freq, MASE seasonality, default forecast horizon)
+_BASE = {
+    "Y": ("Y",   1,  6),
+    "Q": ("Q",   4,  8),
+    "M": ("M",  12, 12),
+    "W": ("W",  52, 13),
+    "D": ("D",   7, 14),
+    "H": ("H",  24, 24),
+    "T": ("T", 1440, 60),   # minutes
+    "S": ("S",   1, 60),
+}
+
+# aeon's spelled-out names → canonical base alias
+_AEON_TO_BASE = {
+    "yearly":    "Y",
+    "quarterly": "Q",
+    "monthly":   "M",
+    "weekly":    "W",
+    "daily":     "D",
+    "hourly":    "H",
+    "minutely":  "T",
+    "seconds":   "S",
+}
+
+
+def from_aeon(freq_word: str) -> tuple[str, int, int]:
+    """Look up (freq, seasonality, default_horizon) from an aeon freq word."""
+    base = _AEON_TO_BASE.get(freq_word, "D")
+    return _BASE[base]
+
+
+# Pandas offset aliases: strip the multiplier and anchor, then normalize the
+# spelling (e.g. "5T" → "T", "W-SUN" → "W", "QS-OCT" → "Q", "YE" → "Y").
+_PANDAS_ALIAS_RE = re.compile(r"^\d*([A-Za-z]+)")
+_NORMALIZE_BASE = {
+    # Newer pandas spellings → legacy single-letter aliases used in _BASE.
+    "YE": "Y", "YS": "Y", "A": "Y", "AS": "Y",
+    "QE": "Q", "QS": "Q",
+    "ME": "M", "MS": "M",
+    "min": "T", "MIN": "T",
+}
+
+
+def from_pandas(freq_alias: str) -> tuple[str, int, int]:
+    """Look up (freq, seasonality, default_horizon) from a pandas freq alias.
+
+    Handles multipliers ("5T") and anchors ("W-SUN", "QS-OCT") by stripping
+    them before lookup. Unknown aliases default to daily.
+    """
+    if not freq_alias:
+        return _BASE["D"]
+    m = _PANDAS_ALIAS_RE.match(freq_alias.split("-", 1)[0])
+    if not m:
+        return _BASE["D"]
+    head = m.group(1)
+    base = _NORMALIZE_BASE.get(head, head[:1].upper())
+    return _BASE.get(base, _BASE["D"])
+
+
+# ---------------------------------------------------------------------------
+# GIFT-Eval term resolution
+#
+# Mirrors the canonical table in the upstream time-series repo: prediction
+# length is a function of pandas freq, then scaled by a term multiplier
+# (short=1, medium=10, long=15). Used by datasets/gifteval.py so reported
+# numbers line up with the GIFT-Eval leaderboard.
+# ---------------------------------------------------------------------------
+
+GIFT_EVAL_PRED_LENGTH_MAP: dict[str, int] = {
+    "M":  12, "MS": 12,
+    "W":   8, "W-SUN": 8, "W-MON": 8,
+    "D":  30,
+    "H":  48, "6H": 48,
+    "T":  48, "5T": 48, "10T": 48, "15T": 48, "30T": 48,
+    "S":  60, "4S": 60,
+    "Q":   8, "Q-DEC": 8,
+    "A":   4, "A-DEC": 4,
+    "Y":   4,
+}
+
+GIFT_EVAL_TERM_MULTIPLIER: dict[str, int] = {
+    "short":  1,
+    "medium": 10,
+    "long":   15,
+}
+
+
+def gift_eval_prediction_length(freq: str, term: str) -> int:
+    """Resolve the GIFT-Eval prediction length for a (freq, term) pair.
+
+    ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"W-SUN"``). If the
+    exact alias isn't in :data:`GIFT_EVAL_PRED_LENGTH_MAP`, multi-minute
+    aliases collapse to ``"T"``; otherwise we default to 48.
+
+    ``term`` must be one of ``"short"``, ``"medium"``, ``"long"``.
+    """
+    if term not in GIFT_EVAL_TERM_MULTIPLIER:
+        raise ValueError(
+            f"term must be one of {list(GIFT_EVAL_TERM_MULTIPLIER)}; got {term!r}"
+        )
+    base = GIFT_EVAL_PRED_LENGTH_MAP.get(freq)
+    if base is None and freq.endswith("T") and freq != "T":
+        base = GIFT_EVAL_PRED_LENGTH_MAP["T"]
+    if base is None:
+        base = 48
+    return base * GIFT_EVAL_TERM_MULTIPLIER[term]
diff --git a/datasets/monash.py b/datasets/monash.py
index 049ce73..a299433 100644
--- a/datasets/monash.py
+++ b/datasets/monash.py
@@ -35,26 +35,10 @@
 
 from aeon.datasets import load_forecasting
 from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import from_aeon
 from benchmark_utils.windowing import make_forecasting_splits
 
 
-# Map aeon frequency strings → pandas-style freq codes and MASE seasonality
-_FREQ_MAP = {
-    "yearly": ("Y", 1),
-    "quarterly": ("Q", 4),
-    "monthly": ("M", 12),
-    "weekly": ("W", 52),
-    "daily": ("D", 7),
-    "hourly": ("H", 24),
-    "minutely": ("T", 1440),
-    "seconds": ("S", 1),
-}
-
-_DEFAULT_HORIZON = {
-    "Y": 6, "Q": 8, "M": 12, "W": 13, "D": 14, "H": 24, "T": 60,
-}
-
-
 class Dataset(BaseDataset):
     """Monash forecasting dataset (loaded via aeon).
 
@@ -90,14 +74,11 @@ def get_data(self):
         #             contain_missing_values, contain_equal_length
 
         aeon_freq = meta.get("frequency", "yearly")
-        freq, seasonality = _FREQ_MAP.get(aeon_freq, ("D", 1))
+        freq, seasonality, default_h = from_aeon(aeon_freq)
 
         pred_len = self.prediction_length
         if pred_len is None:
-            pred_len = int(
-                meta.get("forecast_horizon")
-                or _DEFAULT_HORIZON.get(freq, 10)
-            )
+            pred_len = int(meta.get("forecast_horizon") or default_h)
 
         series_list = []
         rows = df.iterrows() if not self.debug else list(df.iterrows())[:5]

From 66890462468390c31a3fbe2444354660b7e690c2 Mon Sep 17 00:00:00 2001
From: Eduardo Montesuma <edumontesuma@gmail.com>
Date: Fri, 29 May 2026 09:01:27 +0000
Subject: [PATCH 09/12] feat: gift eval support

---
 datasets/gifteval.py | 166 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 datasets/gifteval.py

diff --git a/datasets/gifteval.py b/datasets/gifteval.py
new file mode 100644
index 0000000..9ec0de4
--- /dev/null
+++ b/datasets/gifteval.py
@@ -0,0 +1,166 @@
+"""GIFT-Eval forecasting benchmark dataset (Salesforce/GiftEval on HF).
+
+The HF repo organizes data per-dataset under top-level directories
+(``m4_weekly``, ``etth1``, ``solar``, ...). Each directory holds a
+single Arrow file with the test-set series.
+
+Each entry exposes ``item_id``, ``start``, ``freq``, and ``target``
+(a flat list of floats). For multivariate datasets, ``target`` is still
+serialized as a flat list — GIFT-Eval handles those via separate file
+layouts we don't unpack here; the MVP supports univariate only.
+
+Cutoffs and windows follow the Monash recipe (we don't comply with
+GIFT-Eval's prescribed test cutoff — same rolling-window logic via
+:func:`benchmark_utils.windowing.make_forecasting_splits`).
+
+Data contract output mirrors :mod:`datasets.monash`.
+"""
+
+import numpy as np
+from benchopt import BaseDataset
+
+from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import (
+    from_pandas,
+    gift_eval_prediction_length,
+)
+from benchmark_utils.windowing import make_forecasting_splits
+
+
+class Dataset(BaseDataset):
+    """GIFT-Eval forecasting dataset (loaded from HF Salesforce/GiftEval).
+
+    Parameters
+    ----------
+    dataset_name : str
+        Subdirectory name on the HF repo (e.g. ``"m4_weekly"``, ``"ett1"``,
+        ``"solar"``). See https://huggingface.co/datasets/Salesforce/GiftEval
+        for the full list.
+    term : str
+        GIFT-Eval forecast term — ``"short"`` (×1), ``"medium"`` (×10), or
+        ``"long"`` (×15). Selects the prediction length via the canonical
+        per-freq base, matching the GIFT-Eval leaderboard convention.
+        Ignored when ``prediction_length`` is set explicitly.
+    prediction_length : int or None
+        Explicit override. ``None`` → resolved from (freq, term).
+    n_windows : int
+        Number of rolling evaluation windows per series.
+    max_series : int or None
+        Optional cap on the number of series — useful for very large
+        configs (e.g. ``solar``). ``None`` = no cap.
+    debug : bool
+        If True, keep only the first 5 series for fast iteration.
+    """
+
+    name = "GiftEval"
+
+    requirements = ["pip::datasets", "pip::huggingface-hub"]
+
+    parameters = {
+        "dataset_name": ["m4_weekly"],
+        "term": ["short"],
+        "prediction_length": [None],
+        "n_windows": [1],
+        "max_series": [None],
+        "debug": [False],
+    }
+
+    def get_data(self):
+        from datasets import Dataset as HFDataset
+        from huggingface_hub import hf_hub_download, list_repo_files
+
+        # Locate the Arrow file inside the requested subdirectory.
+        files = list_repo_files(
+            "Salesforce/GiftEval", repo_type="dataset"
+        )
+        arrow_files = [
+            f for f in files
+            if f.startswith(f"{self.dataset_name}/")
+            and f.endswith(".arrow")
+        ]
+        if not arrow_files:
+            raise ValueError(
+                f"No Arrow file found for GIFT-Eval dataset "
+                f"{self.dataset_name!r}. Available top-level dirs: "
+                f"{sorted({f.split('/')[0] for f in files if '/' in f})}"
+            )
+
+        # Download + load each shard; concatenate.
+        rows = []
+        for f in sorted(arrow_files):
+            local = hf_hub_download(
+                "Salesforce/GiftEval", filename=f, repo_type="dataset",
+            )
+            shard = HFDataset.from_file(local)
+            rows.extend(shard)
+
+        if self.debug:
+            rows = rows[:5]
+        elif self.max_series is not None:
+            rows = rows[: int(self.max_series)]
+
+        if not rows:
+            raise ValueError(
+                f"GIFT-Eval dataset {self.dataset_name!r} returned 0 series."
+            )
+
+        # Frequency / seasonality — take from the first entry (every series
+        # in a GIFT-Eval subset shares the same freq).
+        pandas_freq = rows[0].get("freq") or "D"
+        freq, seasonality, _ = from_pandas(pandas_freq)
+
+        pred_len = self.prediction_length
+        if pred_len is None:
+            pred_len = gift_eval_prediction_length(pandas_freq, self.term)
+
+        # Build (T, C) series. Univariate only in the MVP.
+        series_list = []
+        for r in rows:
+            values = np.asarray(r["target"], dtype=np.float32)
+            if values.ndim != 1:
+                # Skip multivariate entries until we add explicit handling.
+                continue
+            series_list.append(values.reshape(-1, 1))
+
+        if not series_list:
+            raise ValueError(
+                f"All entries in GIFT-Eval dataset {self.dataset_name!r} "
+                "were skipped (multivariate not yet supported)."
+            )
+
+        # Training portion: everything except the last test windows.
+        test_len = pred_len * self.n_windows
+        X_train, y_train_list, full_series = [], [], []
+        for ts in series_list:
+            if ts.shape[0] < pred_len + 1:
+                continue
+            train_end = max(1, ts.shape[0] - test_len)
+            X_train.append(ts[:train_end])
+            y_train_list.append(ts[train_end: train_end + pred_len])
+            full_series.append(ts)
+
+        if not full_series:
+            raise ValueError(
+                "All series are shorter than prediction_length."
+            )
+
+        n_windows = 1 if self.debug else self.n_windows
+        X_test, cutoff_indexes, y_test = make_forecasting_splits(
+            full_series,
+            prediction_length=pred_len,
+            n_windows=n_windows,
+        )
+
+        return dict(
+            X_train=X_train,
+            y_train=y_train_list,
+            X_test=X_test,
+            y_test=y_test,
+            cutoff_indexes=cutoff_indexes,
+            covariates=Covariates(),  # GIFT-Eval HF schema has no covariates
+            task="forecasting",
+            metrics=["mae", "mse", "mase", "smape"],
+            prediction_length=pred_len,
+            freq=freq,
+            seasonality=seasonality,
+        )

From 2a4a74050e7f427d2b0dc38a086dc9e123722467 Mon Sep 17 00:00:00 2001
From: Eduardo Montesuma <edumontesuma@gmail.com>
Date: Fri, 29 May 2026 12:50:18 +0000
Subject: [PATCH 10/12] feat: adds support for fev bench

---
 datasets/fev.py | 263 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 263 insertions(+)
 create mode 100644 datasets/fev.py

diff --git a/datasets/fev.py b/datasets/fev.py
new file mode 100644
index 0000000..0eb5008
--- /dev/null
+++ b/datasets/fev.py
@@ -0,0 +1,263 @@
+"""AutoGluon fev_datasets forecasting benchmark
+(huggingface.co/datasets/autogluon/fev_datasets).
+
+The HF repo organizes data either:
+  - per-freq: ``<dataset>/<freq>/train-*.parquet``
+    (e.g. ``ETT/1H``, ``LOOP_SEATTLE/5T``)
+  - flat: ``<dataset>/train-*.parquet``
+    (e.g. ``australian_tourism``)
+  - or with an arbitrary subdir that is NOT a freq (e.g. ``boomlet/<N>``
+    where ``<N>`` is a series id, not a frequency).
+
+We accept the directory path directly as ``dataset_name`` (e.g.
+``"ETT/1H"``, ``"australian_tourism"``) and infer the actual freq from
+each series' ``timestamp`` column rather than parsing the path.
+
+Each parquet row is one series; columns vary:
+  - Always: ``id``, ``timestamp``
+  - Univariate: a ``target`` column (list of floats)
+  - Multivariate (e.g. ``ETT``): no ``target`` column — each channel is
+    its own column (``HUFL``, ..., ``OT``). Channel columns are stacked
+    on the last axis to form ``(T, C)``.
+
+Rolling-window splits and GIFT-Eval-style term → prediction_length
+resolution match :mod:`datasets.gifteval`.
+"""
+
+import numpy as np
+import pandas as pd
+from benchopt import BaseDataset
+
+from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import (
+    from_pandas,
+    gift_eval_prediction_length,
+)
+from benchmark_utils.windowing import make_forecasting_splits
+
+
+_METADATA_COLS = ("id", "timestamp")
+
+
+# Canonical list of FEV evaluation configs — directory paths inside
+# https://huggingface.co/datasets/autogluon/fev_datasets that contain at
+# least one ``train-*.parquet`` file. Surfaced via
+# ``get_all_parameter_values`` so that ``dataset_name=all`` and
+# ``benchopt info -v`` work.
+FEV_DATASETS: tuple[str, ...] = (
+    "ETT/15T", "ETT/1D", "ETT/1H", "ETT/1W",
+    "LOOP_SEATTLE/1D", "LOOP_SEATTLE/1H", "LOOP_SEATTLE/5T",
+    "M_DENSE/1D", "M_DENSE/1H",
+    "SZ_TAXI/15T", "SZ_TAXI/1H",
+    "australian_tourism",
+    "bizitobs_l2c/1H", "bizitobs_l2c/5T",
+    "boomlet/1062", "boomlet/1209", "boomlet/1225", "boomlet/1230",
+    "boomlet/1282", "boomlet/1487", "boomlet/1631", "boomlet/1676",
+    "boomlet/1855", "boomlet/1975", "boomlet/2187",
+    "boomlet/285", "boomlet/619", "boomlet/772", "boomlet/963",
+    "ecdc_ili",
+    "entsoe/15T", "entsoe/1H", "entsoe/30T",
+    "epf_be", "epf_de", "epf_fr", "epf_np", "epf_pjm",
+    "ercot/1D", "ercot/1H", "ercot/1M", "ercot/1W",
+    "favorita_stores/1D", "favorita_stores/1M", "favorita_stores/1W",
+    "favorita_transactions/1D", "favorita_transactions/1M",
+    "favorita_transactions/1W",
+    "fred_md_2025", "fred_qd_2025",
+    "gvar", "hermes",
+    "hierarchical_sales/1D", "hierarchical_sales/1W",
+    "hospital",
+    "hospital_admissions/1D", "hospital_admissions/1W",
+    "jena_weather/10T", "jena_weather/1D", "jena_weather/1H",
+    "kdd_cup_2022/10T", "kdd_cup_2022/1D", "kdd_cup_2022/30T",
+    "m5/1D", "m5/1M", "m5/1W",
+    "proenfo_bull", "proenfo_cockatoo",
+    "proenfo_gfc12", "proenfo_gfc14", "proenfo_gfc17",
+    "proenfo_hog", "proenfo_pdb",
+    "redset/15T", "redset/1H", "redset/5T",
+    "restaurant",
+    "rohlik_orders/1D", "rohlik_orders/1W",
+    "rohlik_sales/1D", "rohlik_sales/1W",
+    "rossmann/1D", "rossmann/1W",
+    "solar/1D", "solar/1W",
+    "solar_with_weather/15T", "solar_with_weather/1H",
+    "uci_air_quality/1D", "uci_air_quality/1H",
+    "uk_covid_nation/1D", "uk_covid_nation/1W",
+    "uk_covid_utla/1D", "uk_covid_utla/1W",
+    "us_consumption/1M", "us_consumption/1Q", "us_consumption/1Y",
+    "walmart",
+    "world_co2_emissions", "world_life_expectancy", "world_tourism",
+)
+
+FEV_TERMS: tuple[str, ...] = ("short", "medium", "long")
+
+
+def _infer_freq(timestamps) -> str:
+    """Best-effort freq inference from a series' timestamp column.
+
+    Falls back to ``"D"`` when pandas cannot infer. Uses the first 5
+    points to keep the check cheap on long series.
+    """
+    try:
+        idx = pd.DatetimeIndex(timestamps[:5])
+        return pd.infer_freq(idx) or "D"
+    except Exception:
+        return "D"
+
+
+class Dataset(BaseDataset):
+    """AutoGluon fev forecasting dataset.
+
+    Parameters
+    ----------
+    dataset_name : str
+        Directory path inside the HF repo. Per-freq paths look like
+        ``"ETT/1H"`` / ``"LOOP_SEATTLE/5T"``; flat paths like
+        ``"australian_tourism"`` / ``"hospital"``. See ``FEV_DATASETS``
+        for the full list (also discoverable via ``benchopt info -v``).
+    term : str
+        GIFT-Eval-style term: ``"short"`` / ``"medium"`` / ``"long"``.
+        Ignored when ``prediction_length`` is set.
+    prediction_length : int or None
+        Explicit override. ``None`` → resolved from (inferred freq, term).
+    n_windows : int
+        Number of rolling evaluation windows per series.
+    max_series : int or None
+        Optional cap on the number of series.
+    debug : bool
+        If True, keep only the first 5 series.
+    """
+
+    name = "FEV"
+
+    requirements = ["pip::pyarrow", "pip::huggingface-hub"]
+
+    parameters = {
+        "dataset_name": ["LOOP_SEATTLE/1H"],
+        "term": ["short"],
+        "prediction_length": [None],
+        "n_windows": [1],
+        "max_series": [None],
+        "debug": [False],
+    }
+
+    @classmethod
+    def get_all_parameter_values(cls, name):
+        if name == "dataset_name":
+            return list(FEV_DATASETS)
+        if name == "term":
+            return list(FEV_TERMS)
+        return None
+
+    def get_data(self):
+        from huggingface_hub import hf_hub_download, list_repo_files
+
+        repo = "autogluon/fev_datasets"
+        files = list_repo_files(repo, repo_type="dataset")
+
+        # Match parquet files in the exact directory (no nested descent).
+        prefix = f"{self.dataset_name}/"
+        parquet_files = sorted(
+            f for f in files
+            if f.startswith(prefix)
+            and f.endswith(".parquet")
+            and "/" not in f[len(prefix):]
+        )
+        if not parquet_files:
+            raise ValueError(
+                f"No parquet found at {self.dataset_name!r} in {repo}. "
+                f"Valid choices are in FEV_DATASETS."
+            )
+
+        frames = [
+            pd.read_parquet(hf_hub_download(repo, filename=f, repo_type="dataset"))
+            for f in parquet_files
+        ]
+        df = pd.concat(frames, ignore_index=True)
+
+        if self.debug:
+            df = df.head(5)
+        elif self.max_series is not None:
+            df = df.head(int(self.max_series))
+
+        if df.empty:
+            raise ValueError(f"{self.dataset_name!r} contained 0 series.")
+
+        # Channel cols = non-metadata columns whose entries are numeric
+        # array-likes. Some FEV datasets carry extra scalar/string fields
+        # (``type``, ``Security``) or arrays of strings (holiday names in
+        # ``favorita_stores``, etc.). We treat covariates as out of scope
+        # for the MVP.
+        def _is_numeric_array_col(c):
+            v = df.iloc[0][c]
+            if not hasattr(v, "__len__") or isinstance(v, (str, bytes)):
+                return False
+            if len(v) == 0:
+                return False
+            return isinstance(v[0], (int, float, np.integer, np.floating))
+
+        channel_cols = [
+            c for c in df.columns
+            if c not in _METADATA_COLS and _is_numeric_array_col(c)
+        ]
+        if not channel_cols:
+            raise ValueError(
+                f"{self.dataset_name!r} has no channel columns "
+                f"(only {_METADATA_COLS} present)."
+            )
+
+        # Infer freq from the first series' timestamps — same for the
+        # whole config (FEV groups by freq at the directory level for
+        # nested configs, and flat configs are single-freq).
+        inferred_freq = _infer_freq(df.iloc[0]["timestamp"])
+        canonical_freq, seasonality, _ = from_pandas(inferred_freq)
+
+        pred_len = self.prediction_length
+        if pred_len is None:
+            pred_len = gift_eval_prediction_length(inferred_freq, self.term)
+
+        # Build (T, C) series. Each row's per-channel array has the same
+        # length (T_i); stack on the last axis.
+        series_list = []
+        for _, row in df.iterrows():
+            channels = [np.asarray(row[c], dtype=np.float32) for c in channel_cols]
+            T = channels[0].shape[0]
+            if any(ch.shape[0] != T for ch in channels):
+                continue
+            series_list.append(np.stack(channels, axis=-1))
+
+        if not series_list:
+            raise ValueError("All series were skipped (inconsistent channel lengths).")
+
+        test_len = pred_len * self.n_windows
+        X_train, y_train_list, full_series = [], [], []
+        for ts in series_list:
+            if ts.shape[0] < pred_len + 1:
+                continue
+            train_end = max(1, ts.shape[0] - test_len)
+            X_train.append(ts[:train_end])
+            y_train_list.append(ts[train_end: train_end + pred_len])
+            full_series.append(ts)
+
+        if not full_series:
+            raise ValueError("All series are shorter than prediction_length.")
+
+        n_windows = 1 if self.debug else self.n_windows
+        X_test, cutoff_indexes, y_test = make_forecasting_splits(
+            full_series,
+            prediction_length=pred_len,
+            n_windows=n_windows,
+        )
+
+        return dict(
+            X_train=X_train,
+            y_train=y_train_list,
+            X_test=X_test,
+            y_test=y_test,
+            cutoff_indexes=cutoff_indexes,
+            covariates=Covariates(),
+            task="forecasting",
+            metrics=["mae", "mse", "mase", "smape"],
+            prediction_length=pred_len,
+            freq=canonical_freq,
+            seasonality=seasonality,
+        )

From 848effb1b64c7c56b3ce870839c8de1ff857c45c Mon Sep 17 00:00:00 2001
From: Eduardo Montesuma <edumontesuma@gmail.com>
Date: Fri, 29 May 2026 12:50:25 +0000
Subject: [PATCH 11/12] minor fixes

---
 benchmark_utils/constants.py | 20 ++++++---
 datasets/gifteval.py         | 84 ++++++++++++++++++++++++++++++------
 2 files changed, 84 insertions(+), 20 deletions(-)

diff --git a/benchmark_utils/constants.py b/benchmark_utils/constants.py
index b9b8f45..5698672 100644
--- a/benchmark_utils/constants.py
+++ b/benchmark_utils/constants.py
@@ -101,19 +101,25 @@ def from_pandas(freq_alias: str) -> tuple[str, int, int]:
 def gift_eval_prediction_length(freq: str, term: str) -> int:
     """Resolve the GIFT-Eval prediction length for a (freq, term) pair.
 
-    ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"W-SUN"``). If the
-    exact alias isn't in :data:`GIFT_EVAL_PRED_LENGTH_MAP`, multi-minute
-    aliases collapse to ``"T"``; otherwise we default to 48.
-
-    ``term`` must be one of ``"short"``, ``"medium"``, ``"long"``.
+    ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"1H"``, ``"W-SUN"``).
+    Lookup falls back through: exact match → base alias with any "-"
+    anchor suffix and leading multiplier dropped ("W-SUN" → "W", "1H" →
+    "H", "30T" → "T"; new spellings such as "QE" map to "Q") → default 48.
+    ``term`` must be one of ``"short"``, ``"medium"``, ``"long"``.
     """
     if term not in GIFT_EVAL_TERM_MULTIPLIER:
         raise ValueError(
             f"term must be one of {list(GIFT_EVAL_TERM_MULTIPLIER)}; got {term!r}"
         )
     base = GIFT_EVAL_PRED_LENGTH_MAP.get(freq)
-    if base is None and freq.endswith("T") and freq != "T":
-        base = GIFT_EVAL_PRED_LENGTH_MAP["T"]
+    if base is None:
+        m = _PANDAS_ALIAS_RE.match(freq.split("-", 1)[0])
+        if m:
+            head = m.group(1)
+            # Normalize new pandas spellings ("QE"→"Q", "ME"→"M", ...)
+            # before falling back through the map.
+            head = _NORMALIZE_BASE.get(head, head)
+            base = GIFT_EVAL_PRED_LENGTH_MAP.get(head)
     if base is None:
         base = 48
     return base * GIFT_EVAL_TERM_MULTIPLIER[term]
diff --git a/datasets/gifteval.py b/datasets/gifteval.py
index 9ec0de4..9d6d72b 100644
--- a/datasets/gifteval.py
+++ b/datasets/gifteval.py
@@ -4,10 +4,12 @@
 (``m4_weekly``, ``etth1``, ``solar``, ...). Each directory holds a
 single Arrow file with the test-set series.
 
-Each entry exposes ``item_id``, ``start``, ``freq``, and ``target``
-(a flat list of floats). For multivariate datasets, ``target`` is still
-serialized as a flat list — GIFT-Eval handles those via separate file
-layouts we don't unpack here; the MVP supports univariate only.
+Each entry exposes ``item_id``, ``start``, ``freq``, and ``target``.
+``target`` is a flat ``List[float]`` for univariate configs and a
+``List[List[float]]`` of shape ``(C, T)`` for multivariate ones (e.g.
+``bitbrains_*``, ``electricity/*``, ``ett1/*``, ``ett2/*``,
+``jena_weather/*``, ``solar/*``). Both shapes are handled — multivariate
+entries are transposed to the ``(T, C)`` repo contract.
 
 Cutoffs and windows follow the Monash recipe (we don't comply with
 GIFT-Eval's prescribed test cutoff — same rolling-window logic via
@@ -27,6 +29,43 @@
 from benchmark_utils.windowing import make_forecasting_splits
 
 
+# Canonical list of GIFT-Eval evaluation configs. Each entry is the
+# arrow-containing directory path inside the HF repo. Flat datasets are
+# bare names (``m4_weekly``); datasets that ship multiple frequencies
+# are encoded as ``<name>/<freq>`` (e.g. ``LOOP_SEATTLE/H``,
+# ``LOOP_SEATTLE/D`` — these are genuinely distinct evaluations).
+# Surfaced via ``get_parameter_choices`` so that ``dataset_name=all``
+# and ``benchopt info -v`` work.
+GIFTEVAL_DATASETS: tuple[str, ...] = (
+    "LOOP_SEATTLE/5T", "LOOP_SEATTLE/D", "LOOP_SEATTLE/H",
+    "M_DENSE/D", "M_DENSE/H",
+    "SZ_TAXI/15T", "SZ_TAXI/H",
+    "bitbrains_fast_storage/5T", "bitbrains_fast_storage/H",
+    "bitbrains_rnd/5T", "bitbrains_rnd/H",
+    "bizitobs_application",
+    "bizitobs_l2c/5T", "bizitobs_l2c/H",
+    "bizitobs_service",
+    "car_parts_with_missing", "covid_deaths",
+    "electricity/15T", "electricity/D", "electricity/H", "electricity/W",
+    "ett1/15T", "ett1/D", "ett1/H", "ett1/W",
+    "ett2/15T", "ett2/D", "ett2/H", "ett2/W",
+    "hierarchical_sales/D", "hierarchical_sales/W",
+    "hospital",
+    "jena_weather",
+    "jena_weather/10T", "jena_weather/D", "jena_weather/H",
+    "kdd_cup_2018_with_missing/D", "kdd_cup_2018_with_missing/H",
+    "m4_daily", "m4_hourly", "m4_monthly", "m4_quarterly",
+    "m4_weekly", "m4_yearly",
+    "restaurant",
+    "saugeenday/D", "saugeenday/M", "saugeenday/W",
+    "solar/10T", "solar/D", "solar/H", "solar/W",
+    "temperature_rain_with_missing",
+    "us_births/D", "us_births/M", "us_births/W",
+)
+
+GIFTEVAL_TERMS: tuple[str, ...] = ("short", "medium", "long")
+
+
 class Dataset(BaseDataset):
     """GIFT-Eval forecasting dataset (loaded from HF Salesforce/GiftEval).
 
@@ -65,24 +104,38 @@ class Dataset(BaseDataset):
         "debug": [False],
     }
 
+    @classmethod
+    def get_all_parameter_values(cls, name):
+        if name == "dataset_name":
+            return list(GIFTEVAL_DATASETS)
+        if name == "term":
+            return list(GIFTEVAL_TERMS)
+        return None
+
     def get_data(self):
         from datasets import Dataset as HFDataset
         from huggingface_hub import hf_hub_download, list_repo_files
 
-        # Locate the Arrow file inside the requested subdirectory.
+        # Locate the Arrow file inside the requested directory. Match the
+        # exact directory (no nested descent) — for datasets like
+        # ``LOOP_SEATTLE`` that ship multiple freq subdirs, the user must
+        # pick one (``LOOP_SEATTLE/H``, ``LOOP_SEATTLE/D``, ...), and
+        # those are genuinely separate evaluation configs.
         files = list_repo_files(
             "Salesforce/GiftEval", repo_type="dataset"
         )
+        prefix = f"{self.dataset_name}/"
         arrow_files = [
             f for f in files
-            if f.startswith(f"{self.dataset_name}/")
+            if f.startswith(prefix)
             and f.endswith(".arrow")
+            and "/" not in f[len(prefix):]
         ]
         if not arrow_files:
             raise ValueError(
                 f"No Arrow file found for GIFT-Eval dataset "
-                f"{self.dataset_name!r}. Available top-level dirs: "
-                f"{sorted({f.split('/')[0] for f in files if '/' in f})}"
+                f"{self.dataset_name!r}. Valid choices are in "
+                f"GIFTEVAL_DATASETS."
             )
 
         # Download + load each shard; concatenate.
@@ -113,19 +166,24 @@ def get_data(self):
         if pred_len is None:
             pred_len = gift_eval_prediction_length(pandas_freq, self.term)
 
-        # Build (T, C) series. Univariate only in the MVP.
+        # Build (T, C) series. Univariate entries arrive as flat
+        # ``List[float]`` (ndim=1); multivariate entries arrive as
+        # ``List[List[float]]`` of shape ``(C, T)``.
         series_list = []
         for r in rows:
             values = np.asarray(r["target"], dtype=np.float32)
-            if values.ndim != 1:
-                # Skip multivariate entries until we add explicit handling.
+            if values.ndim == 1:
+                series = values.reshape(-1, 1)        # (T, 1)
+            elif values.ndim == 2:
+                series = values.T                       # (C, T) → (T, C)
+            else:
                 continue
-            series_list.append(values.reshape(-1, 1))
+            series_list.append(series)
 
         if not series_list:
             raise ValueError(
                 f"All entries in GIFT-Eval dataset {self.dataset_name!r} "
-                "were skipped (multivariate not yet supported)."
+                "had unsupported target shapes."
             )
 
         # Training portion: everything except the last test windows.

From b2dc9537af91c4adbd55e58caea23a32fed32c1c Mon Sep 17 00:00:00 2001
From: Eduardo Montesuma <edumontesuma@gmail.com>
Date: Fri, 29 May 2026 15:52:07 +0000
Subject: [PATCH 12/12] fixes, prepare(), all behavior for gifteval and
 fevbench

---
 datasets/fev.py      |  82 ++++++-----
 datasets/gifteval.py | 343 ++++++++++++++++++++++++++++++++-----------
 datasets/monash.py   |  14 ++
 objective.py         |  17 +++
 4 files changed, 333 insertions(+), 123 deletions(-)

diff --git a/datasets/fev.py b/datasets/fev.py
index 0eb5008..e41d8cb 100644
--- a/datasets/fev.py
+++ b/datasets/fev.py
@@ -20,8 +20,11 @@
     its own column (``HUFL``, ..., ``OT``). Channel columns are stacked
     on the last axis to form ``(T, C)``.
 
-Rolling-window splits and GIFT-Eval-style term → prediction_length
-resolution match :mod:`datasets.gifteval`.
+Rolling-window splits match :mod:`datasets.monash`. The default
+``prediction_length`` is the freq-based heuristic from
+:func:`benchmark_utils.constants.from_pandas`; FEV does not publish a
+per-dataset horizon spec, so we don't try to mirror one. Pass
+``prediction_length=N`` explicitly to override.
 """
 
 import numpy as np
@@ -29,10 +32,7 @@
 from benchopt import BaseDataset
 
 from benchmark_utils.covariates import Covariates
-from benchmark_utils.constants import (
-    from_pandas,
-    gift_eval_prediction_length,
-)
+from benchmark_utils.constants import from_pandas
 from benchmark_utils.windowing import make_forecasting_splits
 
 
@@ -88,8 +88,6 @@
     "world_co2_emissions", "world_life_expectancy", "world_tourism",
 )
 
-FEV_TERMS: tuple[str, ...] = ("short", "medium", "long")
-
 
 def _infer_freq(timestamps) -> str:
     """Best-effort freq inference from a series' timestamp column.
@@ -114,11 +112,12 @@ class Dataset(BaseDataset):
         ``"ETT/1H"`` / ``"LOOP_SEATTLE/5T"``; flat paths like
         ``"australian_tourism"`` / ``"hospital"``. See ``FEV_DATASETS``
         for the full list (also discoverable via ``benchopt info -v``).
-    term : str
-        GIFT-Eval-style term: ``"short"`` / ``"medium"`` / ``"long"``.
-        Ignored when ``prediction_length`` is set.
     prediction_length : int or None
-        Explicit override. ``None`` → resolved from (inferred freq, term).
+        Explicit override. ``None`` → resolved from the inferred freq
+        via :func:`benchmark_utils.constants.from_pandas` (same heuristic
+        used by Monash). FEV does not publish its own per-dataset
+        horizon matrix, so we don't try to align with a leaderboard
+        spec here.
     n_windows : int
         Number of rolling evaluation windows per series.
     max_series : int or None
@@ -133,46 +132,55 @@ class Dataset(BaseDataset):
 
     parameters = {
         "dataset_name": ["LOOP_SEATTLE/1H"],
-        "term": ["short"],
         "prediction_length": [None],
         "n_windows": [1],
         "max_series": [None],
         "debug": [False],
     }
 
+    # Cache prepare() by dataset_name only — the other knobs shape the
+    # in-memory view, not the downloaded files.
+    prepare_cache_ignore = (
+        "prediction_length", "n_windows", "max_series", "debug",
+    )
+
     @classmethod
     def get_all_parameter_values(cls, name):
         if name == "dataset_name":
             return list(FEV_DATASETS)
-        if name == "term":
-            return list(FEV_TERMS)
         return None
 
-    def get_data(self):
-        from huggingface_hub import hf_hub_download, list_repo_files
-
-        repo = "autogluon/fev_datasets"
-        files = list_repo_files(repo, repo_type="dataset")
-
-        # Match parquet files in the exact directory (no nested descent).
-        prefix = f"{self.dataset_name}/"
-        parquet_files = sorted(
-            f for f in files
-            if f.startswith(prefix)
-            and f.endswith(".parquet")
-            and "/" not in f[len(prefix):]
+    def prepare(self):
+        """Pre-download parquet shards for this config into HF's cache."""
+        self._snapshot()
+
+    def _snapshot(self) -> "list[str]":
+        """Snapshot-download parquet files for this dataset_name and
+        return their local paths. Idempotent."""
+        from huggingface_hub import snapshot_download
+        from pathlib import Path
+
+        local_root = snapshot_download(
+            "autogluon/fev_datasets",
+            repo_type="dataset",
+            allow_patterns=f"{self.dataset_name}/*.parquet",
         )
+        return sorted(
+            str(p) for p in (Path(local_root) / self.dataset_name).glob("*.parquet")
+        )
+
+    def get_data(self):
+        parquet_files = self._snapshot()
         if not parquet_files:
             raise ValueError(
-                f"No parquet found at {self.dataset_name!r} in {repo}. "
-                f"Valid choices are in FEV_DATASETS."
+                f"No parquet found at {self.dataset_name!r} in "
+                "autogluon/fev_datasets. Valid choices are in FEV_DATASETS."
             )
 
-        frames = [
-            pd.read_parquet(hf_hub_download(repo, filename=f, repo_type="dataset"))
-            for f in parquet_files
-        ]
-        df = pd.concat(frames, ignore_index=True)
+        df = pd.concat(
+            [pd.read_parquet(f) for f in parquet_files],
+            ignore_index=True,
+        )
 
         if self.debug:
             df = df.head(5)
@@ -209,11 +217,11 @@ def _is_numeric_array_col(c):
         # whole config (FEV groups by freq at the directory level for
         # nested configs, and flat configs are single-freq).
         inferred_freq = _infer_freq(df.iloc[0]["timestamp"])
-        canonical_freq, seasonality, _ = from_pandas(inferred_freq)
+        canonical_freq, seasonality, default_h = from_pandas(inferred_freq)
 
         pred_len = self.prediction_length
         if pred_len is None:
-            pred_len = gift_eval_prediction_length(inferred_freq, self.term)
+            pred_len = int(default_h)
 
         # Build (T, C) series. Each row's per-channel array has the same
         # length (T_i); stack on the last axis.
diff --git a/datasets/gifteval.py b/datasets/gifteval.py
index 9d6d72b..c8661c8 100644
--- a/datasets/gifteval.py
+++ b/datasets/gifteval.py
@@ -1,19 +1,79 @@
 """GIFT-Eval forecasting benchmark dataset (Salesforce/GiftEval on HF).
 
-The HF repo organizes data per-dataset under top-level directories
-(``m4_weekly``, ``etth1``, ``solar``, ...). Each directory holds a
-single Arrow file with the test-set series.
+Parametrization
+---------------
+The class exposes two orthogonal parameters that drive the leaderboard
+matrix:
 
-Each entry exposes ``item_id``, ``start``, ``freq``, and ``target``.
+* ``dataset_name`` — one of 55 canonical ``<name>/<freq>`` paths (e.g.
+  ``"m4_weekly/W"``, ``"loop_seattle/H"``). The full list is
+  :data:`GIFTEVAL_DATASETS`.
+* ``term`` — one of ``short`` / ``medium`` / ``long``, controlling the
+  forecast horizon (×1, ×10, ×15 of the per-freq base).
+
+Both are surfaced via ``get_all_parameter_values`` so that
+``-d "GiftEval[dataset_name=all,term=short]"`` and ``benchopt info -v``
+work.
+
+Canonical-combo gating
+----------------------
+GIFT-Eval scores only **97** of the 55 × 3 = 165 possible ``(path,
+term)`` combinations on its public leaderboard. The 34 short-only paths
+do not define ``medium`` / ``long``. We track the canonical set in
+:data:`CANONICAL_COMBOS` and gate runs at the dataset level: when
+``(dataset_name, term)`` is not canonical, ``get_data()`` short-circuits
+and returns a placeholder dict carrying a ``_skip_reason`` field.
+:meth:`Objective.skip` (see ``objective.py``) honors that field and
+skips the combo cleanly.
+
+So:
+
+* ``-d "GiftEval[dataset_name=all,term=short]"`` → 55 canonical runs.
+* ``-d "GiftEval[dataset_name=all,term=long]"``  → 55 attempts,
+  21 canonical runs, 34 skipped.
+* ``-d "GiftEval[dataset_name=all,term=all]"``   → 165 attempts,
+  97 canonical runs, 68 skipped.
+
+Leaderboard names vs HF directory names
+---------------------------------------
+The leaderboard uses lowercase, paper-style identifiers (e.g.
+``loop_seattle/H``, ``m_dense/D``, ``car_parts/M``) while the HF repo
+``Salesforce/GiftEval`` uses mixed-case directory names that don't
+always match (``LOOP_SEATTLE/H``, ``M_DENSE/D``,
+``car_parts_with_missing/``). We accept leaderboard names — that's what
+appears in the paper, the leaderboard, and the gift-eval README — and
+translate to HF paths internally via :data:`_LEADERBOARD_TO_HF`. Cases:
+
+  * Pure case difference: ``loop_seattle`` → ``LOOP_SEATTLE``,
+    ``m_dense`` → ``M_DENSE``, ``sz_taxi`` → ``SZ_TAXI``.
+  * Missing-data suffix: ``car_parts`` → ``car_parts_with_missing``,
+    ``kdd_cup_2018`` → ``kdd_cup_2018_with_missing``,
+    ``temperature_rain`` → ``temperature_rain_with_missing``.
+  * Rename: ``saugeen`` → ``saugeenday``.
+  * Leaderboard adds a freq segment for HF-flat datasets: leaderboard
+    ``m4_yearly/A`` → HF flat ``m4_yearly`` (the freq is implicit in the
+    data, not the path). Likewise for the other ``m4_*``,
+    ``car_parts/M``, ``covid_deaths/D``, ``hospital/M``,
+    ``restaurant/D``, ``temperature_rain/D``,
+    ``bizitobs_application/10S``, ``bizitobs_service/10S``.
+
+Schema
+------
+Each HF entry exposes ``item_id``, ``start``, ``freq``, ``target``.
 ``target`` is a flat ``List[float]`` for univariate configs and a
 ``List[List[float]]`` of shape ``(C, T)`` for multivariate ones (e.g.
 ``bitbrains_*``, ``electricity/*``, ``ett1/*``, ``ett2/*``,
 ``jena_weather/*``, ``solar/*``). Both shapes are handled — multivariate
-entries are transposed to the ``(T, C)`` repo contract.
+entries are transposed to the repo's ``(T, C)`` contract.
 
-Cutoffs and windows follow the Monash recipe (we don't comply with
-GIFT-Eval's prescribed test cutoff — same rolling-window logic via
-:func:`benchmark_utils.windowing.make_forecasting_splits`).
+Cutoffs and windows
+-------------------
+We don't comply with GIFT-Eval's prescribed test cutoff; we use the same
+rolling-window logic as Monash via
+:func:`benchmark_utils.windowing.make_forecasting_splits`. The
+``prediction_length`` for a given (freq, term) follows GIFT-Eval's
+canonical ``base × multiplier`` rule via
+:func:`benchmark_utils.constants.gift_eval_prediction_length`.
 
 Data contract output mirrors :mod:`datasets.monash`.
 """
@@ -29,41 +89,145 @@
 from benchmark_utils.windowing import make_forecasting_splits
 
 
-# Canonical list of GIFT-Eval evaluation configs. Each entry is the
-# arrow-containing directory path inside the HF repo. Flat datasets are
-# bare names (``m4_weekly``); datasets that ship multiple frequencies
-# are encoded as ``<name>/<freq>`` (e.g. ``LOOP_SEATTLE/H``,
-# ``LOOP_SEATTLE/D`` — these are genuinely distinct evaluations).
-# Surfaced via ``get_parameter_choices`` so that ``dataset_name=all``
-# and ``benchopt info -v`` work.
-GIFTEVAL_DATASETS: tuple[str, ...] = (
-    "LOOP_SEATTLE/5T", "LOOP_SEATTLE/D", "LOOP_SEATTLE/H",
-    "M_DENSE/D", "M_DENSE/H",
-    "SZ_TAXI/15T", "SZ_TAXI/H",
-    "bitbrains_fast_storage/5T", "bitbrains_fast_storage/H",
-    "bitbrains_rnd/5T", "bitbrains_rnd/H",
-    "bizitobs_application",
-    "bizitobs_l2c/5T", "bizitobs_l2c/H",
-    "bizitobs_service",
-    "car_parts_with_missing", "covid_deaths",
-    "electricity/15T", "electricity/D", "electricity/H", "electricity/W",
-    "ett1/15T", "ett1/D", "ett1/H", "ett1/W",
-    "ett2/15T", "ett2/D", "ett2/H", "ett2/W",
-    "hierarchical_sales/D", "hierarchical_sales/W",
-    "hospital",
-    "jena_weather",
-    "jena_weather/10T", "jena_weather/D", "jena_weather/H",
-    "kdd_cup_2018_with_missing/D", "kdd_cup_2018_with_missing/H",
+# ---------------------------------------------------------------------------
+# Single source of truth: leaderboard ``<name>/<freq>`` path → tuple of
+# terms that path defines. Derived from
+# gift-eval/results/*/all_results.csv. 55 paths, 97 (path, term) pairs;
+# 34 paths are short-only, 21 define all three.
+# ---------------------------------------------------------------------------
+_LEADERBOARD: dict[str, tuple[str, ...]] = {
+    "bitbrains_fast_storage/5T":   ("short", "medium", "long"),
+    "bitbrains_fast_storage/H":    ("short",),
+    "bitbrains_rnd/5T":            ("short", "medium", "long"),
+    "bitbrains_rnd/H":             ("short",),
+    "bizitobs_application/10S":    ("short", "medium", "long"),
+    "bizitobs_l2c/5T":             ("short", "medium", "long"),
+    "bizitobs_l2c/H":              ("short", "medium", "long"),
+    "bizitobs_service/10S":        ("short", "medium", "long"),
+    "car_parts/M":                 ("short",),
+    "covid_deaths/D":              ("short",),
+    "electricity/15T":             ("short", "medium", "long"),
+    "electricity/D":               ("short",),
+    "electricity/H":               ("short", "medium", "long"),
+    "electricity/W":               ("short",),
+    "ett1/15T":                    ("short", "medium", "long"),
+    "ett1/D":                      ("short",),
+    "ett1/H":                      ("short", "medium", "long"),
+    "ett1/W":                      ("short",),
+    "ett2/15T":                    ("short", "medium", "long"),
+    "ett2/D":                      ("short",),
+    "ett2/H":                      ("short", "medium", "long"),
+    "ett2/W":                      ("short",),
+    "hierarchical_sales/D":        ("short",),
+    "hierarchical_sales/W":        ("short",),
+    "hospital/M":                  ("short",),
+    "jena_weather/10T":            ("short", "medium", "long"),
+    "jena_weather/D":              ("short",),
+    "jena_weather/H":              ("short", "medium", "long"),
+    "kdd_cup_2018/D":              ("short",),
+    "kdd_cup_2018/H":              ("short", "medium", "long"),
+    "loop_seattle/5T":             ("short", "medium", "long"),
+    "loop_seattle/D":              ("short",),
+    "loop_seattle/H":              ("short", "medium", "long"),
+    "m4_daily/D":                  ("short",),
+    "m4_hourly/H":                 ("short",),
+    "m4_monthly/M":                ("short",),
+    "m4_quarterly/Q":              ("short",),
+    "m4_weekly/W":                 ("short",),
+    "m4_yearly/A":                 ("short",),
+    "m_dense/D":                   ("short",),
+    "m_dense/H":                   ("short", "medium", "long"),
+    "restaurant/D":                ("short",),
+    "saugeen/D":                   ("short",),
+    "saugeen/M":                   ("short",),
+    "saugeen/W":                   ("short",),
+    "solar/10T":                   ("short", "medium", "long"),
+    "solar/D":                     ("short",),
+    "solar/H":                     ("short", "medium", "long"),
+    "solar/W":                     ("short",),
+    "sz_taxi/15T":                 ("short", "medium", "long"),
+    "sz_taxi/H":                   ("short",),
+    "temperature_rain/D":          ("short",),
+    "us_births/D":                 ("short",),
+    "us_births/M":                 ("short",),
+    "us_births/W":                 ("short",),
+}
+
+
+# Public derived constants — what users and CLI tooling reference.
+GIFTEVAL_DATASETS: tuple[str, ...] = tuple(sorted(_LEADERBOARD))
+GIFTEVAL_TERMS: tuple[str, ...] = ("short", "medium", "long")
+CANONICAL_COMBOS: frozenset[tuple[str, str]] = frozenset(
+    (path, term) for path, terms in _LEADERBOARD.items() for term in terms
+)
+
+
+# ---------------------------------------------------------------------------
+# Leaderboard ``<name>`` → HF top-level directory name. Only entries that
+# differ from the lowercase identity mapping appear here.
+# ---------------------------------------------------------------------------
+_LEADERBOARD_TO_HF: dict[str, str] = {
+    "loop_seattle":     "LOOP_SEATTLE",
+    "m_dense":          "M_DENSE",
+    "sz_taxi":          "SZ_TAXI",
+    "car_parts":        "car_parts_with_missing",
+    "kdd_cup_2018":     "kdd_cup_2018_with_missing",
+    "temperature_rain": "temperature_rain_with_missing",
+    "saugeen":          "saugeenday",
+}
+
+
+# ---------------------------------------------------------------------------
+# Datasets that live as a single arrow file directly under the dataset
+# name (no per-freq subdir on HF). The leaderboard still adds a freq
+# segment to their paths (e.g. ``m4_yearly/A``, ``hospital/M``), which we
+# strip before locating the file.
+# ---------------------------------------------------------------------------
+_HF_FLAT_DATASETS: frozenset[str] = frozenset({
+    "bizitobs_application", "bizitobs_service",
+    "car_parts_with_missing", "covid_deaths", "hospital",
     "m4_daily", "m4_hourly", "m4_monthly", "m4_quarterly",
     "m4_weekly", "m4_yearly",
-    "restaurant",
-    "saugeenday/D", "saugeenday/M", "saugeenday/W",
-    "solar/10T", "solar/D", "solar/H", "solar/W",
-    "temperature_rain_with_missing",
-    "us_births/D", "us_births/M", "us_births/W",
-)
+    "restaurant", "temperature_rain_with_missing",
+})
 
-GIFTEVAL_TERMS: tuple[str, ...] = ("short", "medium", "long")
+
+def _hf_arrow_directory(leaderboard_path: str) -> str:
+    """Resolve a leaderboard ``<name>/<freq>`` path to the actual HF
+    directory containing the arrow file.
+
+    Examples
+    --------
+        ``"m4_weekly/W"``       → ``"m4_weekly"`` (HF-flat, drops freq)
+        ``"loop_seattle/H"``    → ``"LOOP_SEATTLE/H"`` (case-renamed)
+        ``"car_parts/M"``       → ``"car_parts_with_missing"`` (HF-flat + suffix)
+    """
+    leaderboard_name, _, freq_segment = leaderboard_path.partition("/")
+    hf_name = _LEADERBOARD_TO_HF.get(leaderboard_name, leaderboard_name)
+    if hf_name in _HF_FLAT_DATASETS:
+        return hf_name
+    if freq_segment:
+        return f"{hf_name}/{freq_segment}"
+    return hf_name
+
+
+def _skip_placeholder(reason: str) -> dict:
+    """Return a minimal data dict that satisfies ``Objective.set_data``
+    but flags the combo for skipping via ``Objective.skip``."""
+    return dict(
+        X_train=[],
+        y_train=[],
+        X_test=[],
+        y_test=[],
+        cutoff_indexes=[],
+        covariates=Covariates(),
+        task="forecasting",
+        metrics=[],
+        prediction_length=1,
+        freq="D",
+        seasonality=1,
+        _skip_reason=reason,
+    )
 
 
 class Dataset(BaseDataset):
@@ -72,21 +236,21 @@ class Dataset(BaseDataset):
     Parameters
     ----------
     dataset_name : str
-        Subdirectory name on the HF repo (e.g. ``"m4_weekly"``, ``"ett1"``,
-        ``"solar"``). See https://huggingface.co/datasets/Salesforce/GiftEval
-        for the full list.
+        One of 55 canonical leaderboard paths — ``<name>/<freq>``, e.g.
+        ``"m4_weekly/W"``, ``"loop_seattle/H"``. See
+        :data:`GIFTEVAL_DATASETS`.
     term : str
-        GIFT-Eval forecast term — ``"short"`` (×1), ``"medium"`` (×10), or
-        ``"long"`` (×15). Selects the prediction length via the canonical
-        per-freq base, matching the GIFT-Eval leaderboard convention.
-        Ignored when ``prediction_length`` is set explicitly.
+        ``"short"`` / ``"medium"`` / ``"long"``. Combos not in
+        :data:`CANONICAL_COMBOS` are skipped (placeholder + objective
+        ``skip``), so ``dataset_name=all, term=long`` runs only the 21
+        paths that define ``long``.
     prediction_length : int or None
-        Explicit override. ``None`` → resolved from (freq, term).
+        Explicit override. ``None`` → resolved from (freq, term) via
+        :func:`benchmark_utils.constants.gift_eval_prediction_length`.
     n_windows : int
         Number of rolling evaluation windows per series.
     max_series : int or None
-        Optional cap on the number of series — useful for very large
-        configs (e.g. ``solar``). ``None`` = no cap.
+        Optional cap on the number of series.
     debug : bool
         If True, keep only the first 5 series for fast iteration.
     """
@@ -96,7 +260,7 @@ class Dataset(BaseDataset):
     requirements = ["pip::datasets", "pip::huggingface-hub"]
 
     parameters = {
-        "dataset_name": ["m4_weekly"],
+        "dataset_name": ["m4_weekly/W"],
         "term": ["short"],
         "prediction_length": [None],
         "n_windows": [1],
@@ -104,6 +268,12 @@ class Dataset(BaseDataset):
         "debug": [False],
     }
 
+    # ``prepare()`` depends on ``dataset_name`` only — ``term`` and the
+    # other knobs shape the in-memory view, not the downloaded files.
+    prepare_cache_ignore = (
+        "term", "prediction_length", "n_windows", "max_series", "debug",
+    )
+
     @classmethod
     def get_all_parameter_values(cls, name):
         if name == "dataset_name":
@@ -112,40 +282,44 @@ def get_all_parameter_values(cls, name):
             return list(GIFTEVAL_TERMS)
         return None
 
+    def prepare(self):
+        """Pre-download arrow shards for this config into HF's cache."""
+        self._snapshot()
+
+    def _snapshot(self) -> "list[str]":
+        """Snapshot-download the arrow files for this dataset and return
+        their local paths. Idempotent — HF caches by content hash."""
+        from huggingface_hub import snapshot_download
+        from pathlib import Path
+
+        hf_path = _hf_arrow_directory(self.dataset_name)
+        local_root = snapshot_download(
+            "Salesforce/GiftEval",
+            repo_type="dataset",
+            allow_patterns=f"{hf_path}/*.arrow",
+        )
+        return sorted(str(p) for p in (Path(local_root) / hf_path).glob("*.arrow"))
+
     def get_data(self):
         from datasets import Dataset as HFDataset
-        from huggingface_hub import hf_hub_download, list_repo_files
-
-        # Locate the Arrow file inside the requested directory. Match the
-        # exact directory (no nested descent) — for datasets like
-        # ``LOOP_SEATTLE`` that ship multiple freq subdirs, the user must
-        # pick one (``LOOP_SEATTLE/H``, ``LOOP_SEATTLE/D``, ...), and
-        # those are genuinely separate evaluation configs.
-        files = list_repo_files(
-            "Salesforce/GiftEval", repo_type="dataset"
-        )
-        prefix = f"{self.dataset_name}/"
-        arrow_files = [
-            f for f in files
-            if f.startswith(prefix)
-            and f.endswith(".arrow")
-            and "/" not in f[len(prefix):]
-        ]
+
+        # Short-circuit non-canonical combos before any download or parsing.
+        if (self.dataset_name, self.term) not in CANONICAL_COMBOS:
+            return _skip_placeholder(
+                f"non-canonical GIFT-Eval combo: {self.dataset_name!r} does "
+                f"not define term {self.term!r} on the leaderboard"
+            )
+
+        arrow_files = self._snapshot()
         if not arrow_files:
             raise ValueError(
                 f"No Arrow file found for GIFT-Eval dataset "
-                f"{self.dataset_name!r}. Valid choices are in "
-                f"GIFTEVAL_DATASETS."
+                f"{self.dataset_name!r}. Valid choices are in GIFTEVAL_DATASETS."
             )
 
-        # Download + load each shard; concatenate.
         rows = []
-        for f in sorted(arrow_files):
-            local = hf_hub_download(
-                "Salesforce/GiftEval", filename=f, repo_type="dataset",
-            )
-            shard = HFDataset.from_file(local)
-            rows.extend(shard)
+        for f in arrow_files:
+            rows.extend(HFDataset.from_file(f))
 
         if self.debug:
             rows = rows[:5]
@@ -157,8 +331,8 @@ def get_data(self):
                 f"GIFT-Eval dataset {self.dataset_name!r} returned 0 series."
             )
 
-        # Frequency / seasonality — take from the first entry (every series
-        # in a GIFT-Eval subset shares the same freq).
+        # Frequency / seasonality — every series in a GIFT-Eval subset
+        # shares the same freq, so taking it from the first entry is safe.
         pandas_freq = rows[0].get("freq") or "D"
         freq, seasonality, _ = from_pandas(pandas_freq)
 
@@ -167,18 +341,15 @@ def get_data(self):
             pred_len = gift_eval_prediction_length(pandas_freq, self.term)
 
         # Build (T, C) series. Univariate entries arrive as flat
-        # ``List[float]`` (ndim=1); multivariate entries arrive as
-        # ``List[List[float]]`` of shape ``(C, T)``.
+        # ``List[float]`` (ndim=1); multivariate as ``List[List[float]]``
+        # of shape ``(C, T)``.
         series_list = []
         for r in rows:
             values = np.asarray(r["target"], dtype=np.float32)
             if values.ndim == 1:
-                series = values.reshape(-1, 1)        # (T, 1)
+                series_list.append(values.reshape(-1, 1))         # (T, 1)
             elif values.ndim == 2:
-                series = values.T                       # (C, T) → (T, C)
-            else:
-                continue
-            series_list.append(series)
+                series_list.append(values.T)                        # (C,T)→(T,C)
 
         if not series_list:
             raise ValueError(
diff --git a/datasets/monash.py b/datasets/monash.py
index a299433..0ea0365 100644
--- a/datasets/monash.py
+++ b/datasets/monash.py
@@ -66,6 +66,20 @@ class Dataset(BaseDataset):
         "debug": [False],
     }
 
+    # Only dataset_name decides what aeon downloads; the other knobs
+    # affect the in-memory split, not the file on disk.
+    prepare_cache_ignore = ("prediction_length", "n_windows", "debug")
+
+    def prepare(self):
+        """Warm aeon's local cache for this dataset (download if missing).
+
+        aeon writes the ``.tsf`` to
+        ``~/.aeon/datasets/local_data/<name>/<name>.tsf`` on first use;
+        we call it once and discard the parsed result so the cache layer
+        in :func:`load_forecasting` handles the actual download.
+        """
+        load_forecasting(self.dataset_name, return_metadata=False)
+
     def get_data(self):
 
         df, meta = load_forecasting(self.dataset_name, return_metadata=True)
diff --git a/objective.py b/objective.py
index 5e3a4c6..92aefa4 100644
--- a/objective.py
+++ b/objective.py
@@ -84,8 +84,25 @@ def set_data(self, X_train, y_train, X_test, y_test,
         self.covariates = covariates if covariates is not None else Covariates()
         self.task = task
         self.metrics = metrics
+        # Pull any skip marker out of meta so it doesn't leak into
+        # ``get_objective()`` payloads.
+        self._skip_reason = meta.pop("_skip_reason", None)
         self.meta = meta  # freq, prediction_length, n_classes, …
 
+    def skip(self, **data):
+        """Honor a ``_skip_reason`` field set by the dataset.
+
+        Datasets that want to filter their own parameter grid (e.g.
+        :mod:`datasets.gifteval` skipping non-leaderboard (path, term)
+        combos) return ``_skip_reason="..."`` from ``get_data()`` and we
+        propagate it here so benchopt records a clean skip rather than
+        running an empty objective.
+        """
+        reason = data.get("_skip_reason")
+        if reason:
+            return True, reason
+        return False, None
+
     # ------------------------------------------------------------------
     # Passed to the solver
     # ------------------------------------------------------------------