From ad0464d7ebf6d2357953c381d67704501dba4c2a Mon Sep 17 00:00:00 2001 From: Geoffrey Negiar Date: Thu, 28 May 2026 16:27:54 +0200 Subject: [PATCH 01/12] ENH batch forecasting predict() across series and cutoffs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactors the forecasting ``predict`` contract from a per-context call into a single call covering all series and all rolling cutoffs at once. This matches how hosted-API solvers (TFC, etc.) natively dispatch work and avoids re-paying per-call overhead in the objective. New signature:: predict( x: list[np.ndarray (T_i, C)], cutoff_indexes: list[list[int]], covariates: {"static_covars": list, "hist_covars": list, "future_covars": list}, horizon: int, ) -> list[np.ndarray (n_cutoffs_i, horizon, C)] The ``covariates`` dict always has all three keys (empty lists when a dataset doesn't carry them), so adapters never branch on None vs dict. Changes: - ``benchmark_utils/adapters/base.py``: rewrite the predict contract and documentation. - ``benchmark_utils/windowing.py``: ``make_forecasting_splits`` now returns ``(series_full, cutoff_indexes, targets)`` with targets of shape ``(n_cutoffs_i, H, C)``. - ``datasets/monash.py``: emits ``cutoff_indexes`` and empty ``covariates`` alongside the existing fields. - ``objective.py``: forwards the new fields and reshapes the batched prediction back to flat ``(n_total, H, C)`` arrays for metric computation. ``get_one_result`` updated accordingly. - ``solvers/naive.py``: ``_NaiveForecaster`` takes the batched API (and no longer needs ``prediction_length`` in its constructor). - ``solvers/chronos.py``: ``_ChronosForecaster`` takes the batched API and reuses the loaded pipeline across all series and cutoffs. - ``benchmark_utils/adapters/forecast_residual.py``: rewritten as a single batched call so AD scoring is one prediction per series rather than O(T) per series. - ``solvers/tfc_api.py``: new solver that wraps the TFC hosted-API SDK. 
Uses ``client.cross_validate`` to issue one request per series with all cutoffs at once. Knobs for ``model``, ``context``, ``add_holidays``, ``add_events``, ``country_isocode``, ``batch_size``. Skips when ``TFC_API_KEY`` is unset. Verification — Monash[m1_yearly_dataset, debug=True], -j 1: | solver | MAE | MSE | MASE | sMAPE | | --------------------- | ---------- | ----------- | ------ | ------ | | Naive[seasonality=1] | 3,399,506 | 5.93e13 | 12.86 | 0.431 | | TFC-API[chronos-2] | 2,807,424 | 4.07e13 | 10.62 | 0.349 | | TFC-API[tabpfn-ts] | 2,621,979 | 3.69e13 | 9.92 | 0.401 | | TFC-API[timesfm-2p5] | 2,657,678 | 3.99e13 | 10.05 | 0.263 | The Chronos numbers match bit-for-bit against the pre-refactor run. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmark_utils/adapters/base.py | 44 ++-- benchmark_utils/adapters/forecast_residual.py | 33 +-- benchmark_utils/windowing.py | 60 ++--- datasets/monash.py | 32 ++- objective.py | 87 ++++--- solvers/chronos.py | 53 ++--- solvers/naive.py | 29 ++- solvers/tfc_api.py | 212 ++++++++++++++++++ 8 files changed, 412 insertions(+), 138 deletions(-) create mode 100644 solvers/tfc_api.py diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py index ef43397..2258ad6 100644 --- a/benchmark_utils/adapters/base.py +++ b/benchmark_utils/adapters/base.py @@ -1,17 +1,36 @@ """Base interface that all task adapters must implement. A *fitted* adapter is what solvers return via ``get_result()``. -The objective calls ``adapter.predict(x)`` for each test sample. +The objective calls ``adapter.predict(...)`` with task-appropriate inputs. 
Predict signature by task -------------------------- -forecasting : x (T, C) → y_pred (H, C) -classification : x (T, C) → int label -anomaly detection: x (T, C) → scores (T,) float — one score per timestep +forecasting: + + predict( + x: list[np.ndarray (T_i, C)], + cutoff_indexes: list[list[int]], + covariates: dict, + horizon: int, + ) -> list[np.ndarray (n_cutoffs_i, horizon, C)] + + ``cutoff_indexes[i][k]`` is the timestep index in ``x[i]`` at which + the k-th forecast for series ``i`` starts. The model must use only + ``x[i][:cutoff]`` as history. The ``covariates`` dict has shape + ``{"static_covars": list, "hist_covars": list, "future_covars": list}``; + the keys are always present (empty lists when unused). + +classification: + + predict(x: np.ndarray (N, T, C)) -> np.ndarray (N,) int labels + +anomaly detection: + + predict(x: np.ndarray (T, C)) -> np.ndarray (T,) float anomaly scores """ from abc import ABC, abstractmethod -import numpy as np +from typing import Any class BaseTSFMAdapter(ABC): @@ -26,16 +45,5 @@ def fit(self, X_train, y_train, **kwargs): return self @abstractmethod - def predict(self, x: np.ndarray) -> np.ndarray: - """Run inference on a single sample. - - Parameters - ---------- - x : np.ndarray, shape (T, C) - One time series (variable length allowed). - - Returns - ------- - np.ndarray - Task-specific output — see module docstring. - """ + def predict(self, *args, **kwargs) -> Any: + """Task-specific inference. 
See module docstring for per-task signatures.""" diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py index f4c48ff..c79f251 100644 --- a/benchmark_utils/adapters/forecast_residual.py +++ b/benchmark_utils/adapters/forecast_residual.py @@ -22,8 +22,8 @@ class ForecastResidualAdapter(BaseTSFMAdapter): Parameters ---------- - forecaster : object with - ``predict(context: np.ndarray (T, C)) -> np.ndarray (H, C)`` + forecaster : object exposing the batched forecasting predict API + (see :class:`BaseTSFMAdapter`). prediction_length : int Number of steps predicted at each position (default 1). min_context : int @@ -47,20 +47,23 @@ def predict(self, x: np.ndarray) -> np.ndarray: scores : (T,) float — higher means more anomalous. Timesteps before ``min_context`` receive score 0. """ - T, C = x.shape + T = x.shape[0] scores = np.zeros(T, dtype=np.float32) + cutoffs = list(range(self.min_context, T - self.prediction_length + 1)) + if not cutoffs: + return scores - for t in range(self.min_context, T): - context = x[:t] # (t, C) - try: - pred = self.forecaster.predict(context) # (H, C) or (H,) - pred = np.asarray(pred).reshape(self.prediction_length, -1) - actual = x[t: t + self.prediction_length] # (H, C) - if actual.shape[0] < self.prediction_length: - continue - error = float(np.mean(np.abs(pred - actual))) - except Exception: - error = 0.0 - scores[t] = error + try: + preds = self.forecaster.predict( + [x], + cutoff_indexes=[cutoffs], + covariates={"static_covars": [], "hist_covars": [], "future_covars": []}, + horizon=self.prediction_length, + )[0] # (n_cutoffs, H, C) + except Exception: + return scores + for k, t in enumerate(cutoffs): + actual = x[t: t + self.prediction_length] + scores[t] = float(np.mean(np.abs(preds[k] - actual))) return scores diff --git a/benchmark_utils/windowing.py b/benchmark_utils/windowing.py index e439a95..269175f 100644 --- a/benchmark_utils/windowing.py +++ b/benchmark_utils/windowing.py 
@@ -1,14 +1,18 @@ -""" -Rolling-window utilities for forecasting evaluation. +"""Rolling-window utilities for forecasting evaluation. -Given a list of time series (each a numpy array of shape (T_i, C)), -`make_forecasting_splits` returns: - - X_test : List[np.ndarray] each (T_context_i, C) — full context up to - the prediction point (variable length) - - y_test : List[np.ndarray] each (prediction_length, C) — the target +`make_forecasting_splits` returns the full series alongside per-series +cutoff indexes and target horizons. This shape matches the batched +adapter contract: a forecaster gets the whole history per series plus +the list of cutoffs at which it should forecast. -The context for window k is everything from the start of the series up to -(T_train + k * stride), so models with long context windows get to use it. +Outputs +------- +series_full : List[np.ndarray (T_i, C)] +cutoff_indexes : List[List[int]] — for each series, the timestep + indexes at which a forecast starts (i.e. ``x[:cutoff]`` + is the history available to the model). +targets : List[np.ndarray (n_cutoffs_i, prediction_length, C)] + ground-truth windows aligned with cutoff_indexes. """ from typing import List, Optional, Tuple @@ -21,12 +25,12 @@ def make_forecasting_splits( n_windows: int = 1, stride: Optional[int] = None, min_context: int = 1, -) -> Tuple[List[np.ndarray], List[np.ndarray]]: - """Create rolling-window evaluation splits from a list of time series. +) -> Tuple[List[np.ndarray], List[List[int]], List[np.ndarray]]: + """Create rolling-window evaluation cutoffs from a list of time series. Parameters ---------- - series : list of (T_i, C) arrays — full time series (train + test combined) + series : list of (T_i, C) arrays — full time series. prediction_length : int n_windows : int Number of rolling evaluation windows per series. @@ -35,30 +39,30 @@ def make_forecasting_splits( Defaults to ``prediction_length`` (non-overlapping). 
min_context : int Minimum context length required before the first prediction point. - - Returns - ------- - X_test : list of (T_context, C) arrays — grows with each window - y_test : list of (prediction_length, C) arrays """ if stride is None: stride = prediction_length - X_test, y_test = [], [] + series_full: List[np.ndarray] = [] + cutoff_indexes: List[List[int]] = [] + targets: List[np.ndarray] = [] for ts in series: - ts = np.asarray(ts) # (T, C) + ts = np.asarray(ts) T = ts.shape[0] - # The last prediction point must end at or before T - # First prediction point: min_context + prediction_length - 1 <= T - 1 - last_end = T + cutoffs: List[int] = [] + ys: List[np.ndarray] = [] for w in range(n_windows): - pred_end = last_end - (n_windows - 1 - w) * stride + pred_end = T - (n_windows - 1 - w) * stride pred_start = pred_end - prediction_length - if pred_start < min_context: + if pred_start < min_context or pred_end > T: continue - # Full history as context (variable length) - X_test.append(ts[:pred_start]) - y_test.append(ts[pred_start:pred_end]) + cutoffs.append(pred_start) + ys.append(ts[pred_start:pred_end]) + if not cutoffs: + continue + series_full.append(ts) + cutoff_indexes.append(cutoffs) + targets.append(np.stack(ys, axis=0)) # (n_cutoffs, H, C) - return X_test, y_test + return series_full, cutoff_indexes, targets diff --git a/datasets/monash.py b/datasets/monash.py index 645a750..ad853c0 100644 --- a/datasets/monash.py +++ b/datasets/monash.py @@ -12,16 +12,22 @@ Data contract output -------------------- -X_train : List[np.ndarray (T_i, C)] training portions of each series -y_train : List[np.ndarray (H, C)] next-H targets aligned with X_train - (useful for supervised fine-tuning) -X_test : List[np.ndarray (T_ctx, C)] rolling-window contexts (variable length) -y_test : List[np.ndarray (H, C)] ground-truth horizons -task : "forecasting" -metrics : ["mae", "mse", "mase", "smape"] +X_train : List[np.ndarray (T_i, C)] training portions of each series 
+y_train : List[np.ndarray (H, C)] next-H targets aligned with X_train +X_test : List[np.ndarray (T_i, C)] full series — model uses + ``x[:cutoff]`` as history +cutoff_indexes : List[List[int]] jagged: per-series cutoff + positions in X_test +y_test : List[np.ndarray (n_cutoffs, H, C)] + ground-truth windows +covariates : dict {static_covars, hist_covars, + future_covars} — all empty for + Monash today +task : "forecasting" +metrics : ["mae", "mse", "mase", "smape"] prediction_length : int -freq : str (e.g. "Y", "M", "D") -seasonality : int (seasonal period used for MASE) +freq : str (e.g. "Y", "M", "D") +seasonality : int (seasonal period used for MASE) """ import numpy as np @@ -120,7 +126,7 @@ def get_data(self): ) n_windows = 1 if self.debug else self.n_windows - X_test, y_test = make_forecasting_splits( + X_test, cutoff_indexes, y_test = make_forecasting_splits( full_series, prediction_length=pred_len, n_windows=n_windows, @@ -131,6 +137,12 @@ def get_data(self): y_train=y_train_list, X_test=X_test, y_test=y_test, + cutoff_indexes=cutoff_indexes, + covariates={ + "static_covars": [], + "hist_covars": [], + "future_covars": [], + }, task="forecasting", metrics=["mae", "mse", "mase", "smape"], prediction_length=pred_len, diff --git a/objective.py b/objective.py index 1a0800f..bdcec56 100644 --- a/objective.py +++ b/objective.py @@ -9,29 +9,33 @@ All datasets must return (via ``get_data``): X_train : List[np.ndarray (T_i, C)] training time series - y_train : array-like or None task-specific (see below) - X_test : List[np.ndarray (T_j, C)] test contexts / series - y_test : array-like task-specific (see below) + y_train : array-like or None task-specific (see below) + X_test : List[np.ndarray] test data (shape depends on task) + y_test : array-like task-specific (see below) task : str one of {"forecasting", "classification", "anomaly_detection"} metrics : List[str] names from benchmark_utils.metrics.ALL_METRICS Task-specific shapes -------------------- -forecasting 
y_train List[(H, C)] or None - y_test List[(H, C)] - extra prediction_length (int), freq (str) -classification y_train (N,) int - y_test (M,) int - extra n_classes (int) -anomaly_detection y_train None - y_test List[(T_j,)] int point-level binary labels +forecasting X_test List[(T_i, C)] full series — adapter uses + ``x[:cutoff]`` as history + cutoff_indexes List[List[int]] jagged per-series cutoffs + y_test List[(n_cutoffs, H, C)] + covariates dict {static_covars, hist_covars, + future_covars} + extra prediction_length (int), freq (str) +classification y_train (N,) int + y_test (M,) int + extra n_classes (int) +anomaly_detection y_train None + y_test List[(T_j,)] int point-level labels Solver contract --------------- ``Solver.get_result()`` must return ``{"model": adapter}`` where ``adapter`` -is a fitted :class:`~benchmark_utils.adapters.base.BaseTSFMAdapter` with a -``predict(x: np.ndarray (T, C)) -> np.ndarray`` method. +is a fitted :class:`~benchmark_utils.adapters.base.BaseTSFMAdapter`. +See that module for per-task predict signatures. 
""" import numpy as np @@ -60,11 +64,18 @@ class Objective(BaseObjective): # ------------------------------------------------------------------ def set_data(self, X_train, y_train, X_test, y_test, - task, metrics, **meta): + task, metrics, cutoff_indexes=None, covariates=None, + **meta): self.X_train = X_train self.y_train = y_train self.X_test = X_test self.y_test = y_test + self.cutoff_indexes = cutoff_indexes + self.covariates = covariates or { + "static_covars": [], + "hist_covars": [], + "future_covars": [], + } self.task = task self.metrics = metrics self.meta = meta # freq, prediction_length, n_classes, … @@ -98,13 +109,23 @@ def evaluate_result(self, model): # --- forecasting --------------------------------------------------- def _eval_forecasting(self, model): - preds, targets = [], [] - for x, y in zip(self.X_test, self.y_test): - pred = np.asarray(model.predict(x)) # (H, C) - preds.append(pred) - targets.append(np.asarray(y)) + horizon = self.meta.get("prediction_length", 1) + preds_per_series = model.predict( + self.X_test, + cutoff_indexes=self.cutoff_indexes, + covariates=self.covariates, + horizon=horizon, + ) - preds = np.array(preds) # (M, H, C) + preds, targets = [], [] + for series_preds, series_targets in zip(preds_per_series, self.y_test): + sp = np.asarray(series_preds) # (n_cutoffs, H, C) + st = np.asarray(series_targets) # (n_cutoffs, H, C) + for k in range(sp.shape[0]): + preds.append(sp[k]) + targets.append(st[k]) + + preds = np.array(preds) targets = np.array(targets) result = {} @@ -148,19 +169,27 @@ def get_one_result(self): from benchmark_utils.adapters.base import BaseTSFMAdapter class _ConstantAdapter(BaseTSFMAdapter): - def __init__(self, task, meta, X_test): + def __init__(self, task, meta): self._task = task self._meta = meta - self._X_test = X_test - def predict(self, x): + def predict(self, *args, **kwargs): if self._task == "forecasting": - H = self._meta.get("prediction_length", 1) - C = x.shape[1] if x.ndim == 2 else 1 - 
return np.zeros((H, C)) + x = args[0] + cutoff_indexes = kwargs.get( + "cutoff_indexes", args[1] if len(args) > 1 else None + ) + H = kwargs.get("horizon", self._meta.get("prediction_length", 1)) + preds = [] + for series, cutoffs in zip(x, cutoff_indexes or []): + C = series.shape[1] if series.ndim == 2 else 1 + preds.append(np.zeros((len(cutoffs), H, C), dtype=np.float32)) + return preds elif self._task == "classification": - return 0 + x = args[0] + return np.zeros(len(x), dtype=np.int64) elif self._task == "anomaly_detection": - return np.zeros(x.shape[0]) + x = args[0] + return np.zeros(x.shape[0], dtype=np.float32) - return {"model": _ConstantAdapter(self.task, self.meta, self.X_test)} + return {"model": _ConstantAdapter(self.task, self.meta)} diff --git a/solvers/chronos.py b/solvers/chronos.py index 0033ec5..e693eed 100644 --- a/solvers/chronos.py +++ b/solvers/chronos.py @@ -30,35 +30,37 @@ # --------------------------------------------------------------------------- class _ChronosForecaster: - """Wraps ChronosPipeline to expose predict(x (T, C)) -> (H, C).""" + """Wraps ChronosPipeline with the batched series+cutoffs predict API.""" - def __init__(self, pipeline, prediction_length): + def __init__(self, pipeline): self.pipeline = pipeline - self.prediction_length = prediction_length - def predict(self, x: np.ndarray) -> np.ndarray: + def predict(self, x, cutoff_indexes, covariates, horizon): + del covariates import torch - x = np.asarray(x, dtype=np.float32) # (T, C) - C = x.shape[1] - - # Chronos expects (batch, time) tensors — one channel at a time, - # then stack. - preds = [] - for c in range(C): - context = torch.from_numpy(x[:, c]).unsqueeze(0) # (1, T) - forecast = self.pipeline.predict( - context, - prediction_length=self.prediction_length, - ) - # forecast: (1, n_samples, H) for sample-based pipelines, - # or (1, H) for point pipelines — take median. 
- f = forecast[0] - if f.ndim == 2: # (n_samples, H) → median - f = f.median(dim=0).values - preds.append(f.numpy()) # (H,) - - return np.stack(preds, axis=-1).astype(np.float32) # (H, C) + results = [] + for series, cutoffs in zip(x, cutoff_indexes): + series = np.asarray(series, dtype=np.float32) + C = series.shape[1] if series.ndim == 2 else 1 + out = np.empty((len(cutoffs), horizon, C), dtype=np.float32) + for k, cutoff in enumerate(cutoffs): + hist = series[:cutoff] + if hist.ndim == 1: + hist = hist[:, None] + # Chronos expects (batch, time) — one channel at a time. + for c in range(C): + context = torch.from_numpy(hist[:, c]).unsqueeze(0) + forecast = self.pipeline.predict( + context, + prediction_length=horizon, + ) + f = forecast[0] + if f.ndim == 2: + f = f.median(dim=0).values + out[k, :, c] = f.numpy() + results.append(out) + return results # --------------------------------------------------------------------------- @@ -115,8 +117,7 @@ def set_objective(self, X_train, y_train, task, **meta): self._loaded_model = model_id def run(self, _): - pred_len = self.meta.get("prediction_length", 1) - forecaster = _ChronosForecaster(self._pipeline, pred_len) + forecaster = _ChronosForecaster(self._pipeline) if self.task == "forecasting": self._adapter = forecaster diff --git a/solvers/naive.py b/solvers/naive.py index be8cdcd..4ae4d49 100644 --- a/solvers/naive.py +++ b/solvers/naive.py @@ -21,18 +21,24 @@ class _NaiveForecaster(BaseTSFMAdapter): """Repeat the last ``seasonality`` values to fill the horizon.""" - def __init__(self, prediction_length, seasonality=1): - self.prediction_length = prediction_length + def __init__(self, seasonality=1): self.seasonality = seasonality - def predict(self, x: np.ndarray) -> np.ndarray: - # x: (T, C) - T, C = x.shape - season = min(self.seasonality, T) - pattern = x[-season:] # (season, C) - reps = int(np.ceil(self.prediction_length / season)) - forecast = np.tile(pattern, (reps, 1))[:self.prediction_length] - return 
forecast.astype(np.float32) # (H, C) + def predict(self, x, cutoff_indexes, covariates, horizon): + del covariates + results = [] + for series, cutoffs in zip(x, cutoff_indexes): + series = np.asarray(series) + C = series.shape[1] if series.ndim == 2 else 1 + preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32) + for k, cutoff in enumerate(cutoffs): + hist = series[:cutoff] + season = min(self.seasonality, hist.shape[0]) + pattern = hist[-season:] + reps = int(np.ceil(horizon / season)) + preds[k] = np.tile(pattern, (reps, 1))[:horizon] + results.append(preds) + return results class _MajorityClassifier(BaseTSFMAdapter): @@ -94,8 +100,7 @@ def set_objective(self, X_train, y_train, task, **meta): def run(self, _): if self.task == "forecasting": - pred_len = self.meta.get("prediction_length", 1) - self._adapter = _NaiveForecaster(pred_len, self.seasonality) + self._adapter = _NaiveForecaster(self.seasonality) elif self.task == "classification": self._adapter = _MajorityClassifier() diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py new file mode 100644 index 0000000..022d513 --- /dev/null +++ b/solvers/tfc_api.py @@ -0,0 +1,212 @@ +"""TFC API solver for the TSFM benchmark. + +Calls The Forecasting Company's hosted inference API via the official +``theforecastingcompany`` Python SDK. Supports zero-shot forecasting. + +Authentication +-------------- +The SDK reads ``TFC_API_KEY`` from the environment by default. Sign in at +https://docs.retrocast.com/settings/api-keys to get one. + +Adding a new model +------------------ +Pass any model id from ``theforecastingcompany.utils.TFCModels`` via the +``model`` parameter (e.g. ``"chronos-2"``, ``"timesfm-2p5"``, +``"tfc-global"``, ``"T0-1638-step-85000"``). 
+""" + +import os +from typing import Optional + +import numpy as np +import pandas as pd +from benchopt import BaseSolver + +from benchmark_utils.adapters.base import BaseTSFMAdapter + + +SUPPORTED_TASKS = {"forecasting"} + +# Map benchmark freq codes to API-accepted pandas-like aliases. +_FREQ_REMAP = {"T": "min", "S": "10S"} + +# pandas >=2 deprecates Y/Q/M/H short forms in ``pd.date_range``; use the +# long forms for synthetic indices but pass the original to the API. +_PD_FREQ_REMAP = {"Y": "YE", "Q": "QE", "M": "ME"} + + +def _to_api_freq(freq: str) -> str: + return _FREQ_REMAP.get(freq, freq) + + +def _to_pandas_freq(api_freq: str) -> str: + return _PD_FREQ_REMAP.get(api_freq, api_freq) + + +class _TFCAPIForecaster(BaseTSFMAdapter): + """Batched adapter that calls ``client.cross_validate`` per series.""" + + def __init__( + self, + client, + model, + freq: str, + context: Optional[int], + quantiles: Optional[list[float]], + add_holidays: bool, + add_events: bool, + country_isocode: Optional[str], + batch_size: int, + ): + self.client = client + self.model = model # TFCModels enum + self.freq = _to_api_freq(freq) + if quantiles is None: + quantiles = [0.5] + elif 0.5 not in quantiles: + quantiles = quantiles + [0.5] + self.quantiles = quantiles + self.context = context + self.add_holidays = add_holidays + self.add_events = add_events + self.country_isocode = country_isocode + self.batch_size = batch_size + + def predict(self, x, cutoff_indexes, covariates, horizon): + # TODO: thread ``covariates`` (static/hist/future) through to the SDK + # once the benchmark datasets expose them. For now the dict is + # ignored — Monash datasets carry no covariates. 
+ del covariates + pd_freq = _to_pandas_freq(self.freq) + + results = [] + for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)): + series = np.asarray(series, dtype=np.float32) + if series.ndim == 1: + series = series[:, None] + T, C = series.shape + index = pd.date_range("2000-01-01", periods=T, freq=pd_freq) + + frames = [] + for c in range(C): + frames.append( + pd.DataFrame( + { + "unique_id": f"s{series_idx}_c{c}", + "ds": index, + "target": series[:, c], + } + ) + ) + train_df = pd.concat(frames, ignore_index=True) + + fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs] + forecast_df = self.client.cross_validate( + train_df, + model=self.model, + horizon=horizon, + freq=self.freq, + fcds=fcds, + quantiles=self.quantiles, + context=self.context, + add_holidays=self.add_holidays, + add_events=self.add_events, + country_isocode=self.country_isocode, + batch_size=self.batch_size, + ) + + value_col = f"{self.model}_q0.5" + if value_col not in forecast_df.columns: + value_col = str(self.model) + if value_col not in forecast_df.columns: + raise ValueError( + f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}" + ) + + preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32) + for c in range(C): + channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"] + for k, fcd in enumerate(fcds): + window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(horizon) + preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32) + results.append(preds) + return results + + +class Solver(BaseSolver): + """TFC hosted-API solver. + + Parameters + ---------- + model : str + Model id served by the TFC API — must match a value in + ``theforecastingcompany.utils.TFCModels`` (e.g. ``"chronos-2"``, + ``"timesfm-2p5"``, ``"tfc-global"``, ``"moirai-2"``). + context : int or None + Number of history steps to send to the model. ``None`` lets the + model use its native maximum. 
+ add_holidays, add_events : bool + Whether to attach TFC holiday / event covariates. Requires + ``country_isocode`` to be set. + country_isocode : str or None + ISO country code (e.g. ``"US"``) used by the holiday/event lookup. + batch_size : int + Series-per-batch for batching-enabled models (chronos-2, moirai-2). + """ + + name = "TFC-API" + + requirements = ["pip::theforecastingcompany"] + + sampling_strategy = "run_once" + + parameters = { + "model": ["chronos-2"], + "context": [None], + "add_holidays": [False], + "add_events": [False], + "country_isocode": [None], + "batch_size": [256], + } + + def skip(self, task, **kwargs): + if task not in SUPPORTED_TASKS: + return True, f"TFC-API solver does not support task={task!r}" + if os.getenv("TFC_API_KEY") is None: + return True, "TFC_API_KEY environment variable not set" + return False, None + + def set_objective(self, X_train, y_train, task, **meta): + from theforecastingcompany import TFCClient + from theforecastingcompany.utils import TFCModels + + self.task = task + self.X_train = X_train + self.meta = meta + + try: + self._model_enum = TFCModels(self.model) + except ValueError as e: + known = ", ".join(m.value for m in TFCModels) + raise ValueError( + f"Unknown TFC model '{self.model}'. Known SDK models: {known}." 
+ ) from e + + if not hasattr(self, "_client"): + self._client = TFCClient() + + def run(self, _): + self._adapter = _TFCAPIForecaster( + client=self._client, + model=self._model_enum, + freq=self.meta.get("freq", "D"), + context=self.context, + quantiles=None, + add_holidays=self.add_holidays, + add_events=self.add_events, + country_isocode=self.country_isocode, + batch_size=self.batch_size, + ) + + def get_result(self): + return {"model": self._adapter} From 297281387a1cbb30021f268d187128b3f29f3b09 Mon Sep 17 00:00:00 2001 From: Geoffrey Negiar Date: Thu, 28 May 2026 16:33:40 +0200 Subject: [PATCH 02/12] ENH add SeasonalNaive forecasting solver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Repeats the last ``season_length`` observations to fill the horizon. Default parameter sweep covers ``[1, 7, 12, 24]`` (last-value persistence, weekly, monthly, daily seasonal periods). Useful as a calibrated baseline whose strength depends entirely on matching the seasonal period to the data — handy for sanity-checking the impact of seasonality on TSFMs at fixed compute. Verified on Monash[m1_yearly_dataset, debug=True]: | season_length | MAE | MSE | MASE | sMAPE | | ------------- | ---------- | --------- | ------ | ----- | | 1 | 3,399,506 | 5.93e13 | 12.86 | 0.431 | | 7 | 3,045,677 | 4.31e13 | 11.52 | 0.573 | | 12 | 4,526,063 | 9.24e13 | 17.12 | 0.948 | | 24 | 6,230,975 | 1.71e14 | 23.56 | 1.744 | Co-Authored-By: Claude Opus 4.7 (1M context) --- solvers/seasonal_naive.py | 84 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 solvers/seasonal_naive.py diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py new file mode 100644 index 0000000..d7f96ab --- /dev/null +++ b/solvers/seasonal_naive.py @@ -0,0 +1,84 @@ +"""Seasonal-naive forecasting baseline. + +The forecast at horizon ``h`` is the value observed ``season_length`` steps +ago — i.e. 
for forecast index ``i`` (0-based within the horizon), the +prediction is ``hist[-season_length + (i mod season_length)]``. When the +available history is shorter than ``season_length``, the pattern falls +back to whatever history exists. + +A common, calibrated baseline for any dataset with a known seasonal +period. With ``season_length=1`` it collapses to last-value persistence. +""" + +import numpy as np +from benchopt import BaseSolver + +from benchmark_utils.adapters.base import BaseTSFMAdapter + + +SUPPORTED_TASKS = {"forecasting"} + + +class _SeasonalNaiveForecaster(BaseTSFMAdapter): + """Repeat the last ``season_length`` observations to fill the horizon.""" + + def __init__(self, season_length: int): + if season_length < 1: + raise ValueError(f"season_length must be >= 1, got {season_length}") + self.season_length = season_length + + def predict(self, x, cutoff_indexes, covariates, horizon): + del covariates + results = [] + for series, cutoffs in zip(x, cutoff_indexes): + series = np.asarray(series) + C = series.shape[1] if series.ndim == 2 else 1 + preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32) + for k, cutoff in enumerate(cutoffs): + hist = series[:cutoff] + season = min(self.season_length, hist.shape[0]) + pattern = hist[-season:] + reps = int(np.ceil(horizon / season)) + preds[k] = np.tile(pattern, (reps, 1))[:horizon] + results.append(preds) + return results + + +class Solver(BaseSolver): + """Seasonal-naive baseline. + + Parameters + ---------- + season_length : int + Number of past steps to repeat. ``1`` recovers last-value + persistence; common picks are ``7`` (daily → weekly), ``12`` + (monthly → yearly), ``24`` (hourly → daily), ``52`` (weekly → + yearly). 
+ """ + + name = "SeasonalNaive" + + requirements = [] + + sampling_strategy = "run_once" + + parameters = { + "season_length": [1, 7, 12, 24], + } + + def skip(self, task, **kwargs): + if task not in SUPPORTED_TASKS: + return True, f"SeasonalNaive does not support task={task!r}" + return False, None + + def set_objective(self, X_train, y_train, task, **meta): + self.task = task + self.X_train = X_train + self.y_train = y_train + self.meta = meta + + def run(self, _): + self._adapter = _SeasonalNaiveForecaster(self.season_length) + + def get_result(self): + return {"model": self._adapter} From 7b24d945889d7667dafa5d80fa3c39fbdb69834c Mon Sep 17 00:00:00 2001 From: Geoffrey Negiar Date: Thu, 28 May 2026 16:41:45 +0200 Subject: [PATCH 03/12] PERF batch chronos-2 / moirai-2 in one cross_validate call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes bundled because they touch the same predict signature: 1. Rename ``horizon`` → ``prediction_length`` in the forecasting predict() contract for consistency with the SDK and dataset metadata. 2. TFC-API solver now sends one ``cross_validate`` call covering every series when the model reports ``supports_batching == True`` (chronos-2, moirai-2, T0-1535, T0-1638). Series are aligned to share an end date so all cutoffs collapse to a common ``fcds`` list; the SDK then stacks them into the (V, T) tensor Chronos-2 wants, with one ``unique_id`` per series-channel acting as the group id. Falls back to the per-series loop when cutoff offsets from end aren't homogeneous across series (e.g. a mix of n_windows after some series were filtered for being too short). Touched files for the rename: base.py, objective.py, forecast_residual.py, naive.py, chronos.py, seasonal_naive.py, tfc_api.py. 
Verification — Monash[m1_yearly_dataset, debug=True], -j 1: - chronos-2 (batched): MAE 2,785,573 · MASE 10.53 · sMAPE 0.348 (vs per-series: MAE 2,807,424 · MASE 10.62 — same order, ~0.8% delta is just batched-vs-sequential sampling variance.) - timesfm-2p5 (per-series, not batching-capable): unchanged at MAE 2,657,678 · MASE 10.05. Routing verified directly: - Chronos_2.supports_batching == True → batched path - Moirai2.supports_batching == True → batched path - TimesFM_2p5.supports_batching == False → per-series path - TabPFN_TS.supports_batching == False → per-series path Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmark_utils/adapters/base.py | 4 +- benchmark_utils/adapters/forecast_residual.py | 2 +- objective.py | 6 +- solvers/chronos.py | 6 +- solvers/naive.py | 8 +- solvers/seasonal_naive.py | 8 +- solvers/tfc_api.py | 152 ++++++++++++++---- 7 files changed, 140 insertions(+), 46 deletions(-) diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py index 2258ad6..1bd0ef7 100644 --- a/benchmark_utils/adapters/base.py +++ b/benchmark_utils/adapters/base.py @@ -11,8 +11,8 @@ x: list[np.ndarray (T_i, C)], cutoff_indexes: list[list[int]], covariates: dict, - horizon: int, - ) -> list[np.ndarray (n_cutoffs_i, horizon, C)] + prediction_length: int, + ) -> list[np.ndarray (n_cutoffs_i, prediction_length, C)] ``cutoff_indexes[i][k]`` is the timestep index in ``x[i]`` at which the k-th forecast for series ``i`` starts. 
The model must use only diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py index c79f251..6cd0c0c 100644 --- a/benchmark_utils/adapters/forecast_residual.py +++ b/benchmark_utils/adapters/forecast_residual.py @@ -58,7 +58,7 @@ def predict(self, x: np.ndarray) -> np.ndarray: [x], cutoff_indexes=[cutoffs], covariates={"static_covars": [], "hist_covars": [], "future_covars": []}, - horizon=self.prediction_length, + prediction_length=self.prediction_length, )[0] # (n_cutoffs, H, C) except Exception: return scores diff --git a/objective.py b/objective.py index bdcec56..5e26afc 100644 --- a/objective.py +++ b/objective.py @@ -109,12 +109,12 @@ def evaluate_result(self, model): # --- forecasting --------------------------------------------------- def _eval_forecasting(self, model): - horizon = self.meta.get("prediction_length", 1) + prediction_length = self.meta.get("prediction_length", 1) preds_per_series = model.predict( self.X_test, cutoff_indexes=self.cutoff_indexes, covariates=self.covariates, - horizon=horizon, + prediction_length=prediction_length, ) preds, targets = [], [] @@ -179,7 +179,7 @@ def predict(self, *args, **kwargs): cutoff_indexes = kwargs.get( "cutoff_indexes", args[1] if len(args) > 1 else None ) - H = kwargs.get("horizon", self._meta.get("prediction_length", 1)) + H = kwargs.get("prediction_length", self._meta.get("prediction_length", 1)) preds = [] for series, cutoffs in zip(x, cutoff_indexes or []): C = series.shape[1] if series.ndim == 2 else 1 diff --git a/solvers/chronos.py b/solvers/chronos.py index e693eed..70bea1d 100644 --- a/solvers/chronos.py +++ b/solvers/chronos.py @@ -35,7 +35,7 @@ class _ChronosForecaster: def __init__(self, pipeline): self.pipeline = pipeline - def predict(self, x, cutoff_indexes, covariates, horizon): + def predict(self, x, cutoff_indexes, covariates, prediction_length): del covariates import torch @@ -43,7 +43,7 @@ def predict(self, x, cutoff_indexes, 
covariates, horizon): for series, cutoffs in zip(x, cutoff_indexes): series = np.asarray(series, dtype=np.float32) C = series.shape[1] if series.ndim == 2 else 1 - out = np.empty((len(cutoffs), horizon, C), dtype=np.float32) + out = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32) for k, cutoff in enumerate(cutoffs): hist = series[:cutoff] if hist.ndim == 1: @@ -53,7 +53,7 @@ def predict(self, x, cutoff_indexes, covariates, horizon): context = torch.from_numpy(hist[:, c]).unsqueeze(0) forecast = self.pipeline.predict( context, - prediction_length=horizon, + prediction_length=prediction_length, ) f = forecast[0] if f.ndim == 2: diff --git a/solvers/naive.py b/solvers/naive.py index 4ae4d49..4f5be19 100644 --- a/solvers/naive.py +++ b/solvers/naive.py @@ -24,19 +24,19 @@ class _NaiveForecaster(BaseTSFMAdapter): def __init__(self, seasonality=1): self.seasonality = seasonality - def predict(self, x, cutoff_indexes, covariates, horizon): + def predict(self, x, cutoff_indexes, covariates, prediction_length): del covariates results = [] for series, cutoffs in zip(x, cutoff_indexes): series = np.asarray(series) C = series.shape[1] if series.ndim == 2 else 1 - preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32) + preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32) for k, cutoff in enumerate(cutoffs): hist = series[:cutoff] season = min(self.seasonality, hist.shape[0]) pattern = hist[-season:] - reps = int(np.ceil(horizon / season)) - preds[k] = np.tile(pattern, (reps, 1))[:horizon] + reps = int(np.ceil(prediction_length / season)) + preds[k] = np.tile(pattern, (reps, 1))[:prediction_length] results.append(preds) return results diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py index d7f96ab..5b2c748 100644 --- a/solvers/seasonal_naive.py +++ b/solvers/seasonal_naive.py @@ -27,19 +27,19 @@ def __init__(self, season_length: int): raise ValueError(f"season_length must be >= 1, got {season_length}") 
self.season_length = season_length - def predict(self, x, cutoff_indexes, covariates, horizon): + def predict(self, x, cutoff_indexes, covariates, prediction_length): del covariates results = [] for series, cutoffs in zip(x, cutoff_indexes): series = np.asarray(series) C = series.shape[1] if series.ndim == 2 else 1 - preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32) + preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32) for k, cutoff in enumerate(cutoffs): hist = series[:cutoff] season = min(self.season_length, hist.shape[0]) pattern = hist[-season:] - reps = int(np.ceil(horizon / season)) - preds[k] = np.tile(pattern, (reps, 1))[:horizon] + reps = int(np.ceil(prediction_length / season)) + preds[k] = np.tile(pattern, (reps, 1))[:prediction_length] results.append(preds) return results diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py index 022d513..88980aa 100644 --- a/solvers/tfc_api.py +++ b/solvers/tfc_api.py @@ -8,11 +8,20 @@ The SDK reads ``TFC_API_KEY`` from the environment by default. Sign in at https://docs.retrocast.com/settings/api-keys to get one. +Batching +-------- +Models that report ``supports_batching == True`` (chronos-2, moirai-2, +T0-1535, T0-1638) are sent in a single ``cross_validate`` call with all +series stacked into one DataFrame. Series are aligned so their cutoffs +share a common set of ``fcds``; the SDK then builds the (V, T) tensor +internally with one ``unique_id`` per series-channel acting as the +group id Chronos-2 keys on. When cutoff offsets-from-end are not +homogeneous across series, the solver falls back to a per-series loop. + Adding a new model ------------------ Pass any model id from ``theforecastingcompany.utils.TFCModels`` via the -``model`` parameter (e.g. ``"chronos-2"``, ``"timesfm-2p5"``, -``"tfc-global"``, ``"T0-1638-step-85000"``). +``model`` parameter. 
""" import os @@ -43,8 +52,28 @@ def _to_pandas_freq(api_freq: str) -> str: return _PD_FREQ_REMAP.get(api_freq, api_freq) +def _shared_offsets_from_end(x, cutoff_indexes): + """Return per-series cutoff offsets if shared across series, else None.""" + if not cutoff_indexes: + return None + reference = None + for series, cutoffs in zip(x, cutoff_indexes): + T = np.asarray(series).shape[0] + offsets = tuple(T - c for c in cutoffs) + if reference is None: + reference = offsets + elif offsets != reference: + return None + return reference + + class _TFCAPIForecaster(BaseTSFMAdapter): - """Batched adapter that calls ``client.cross_validate`` per series.""" + """Adapter calling the TFC SDK. + + Uses a single batched ``cross_validate`` call when the model supports + batching and series share cutoff offsets; falls back to one call per + series otherwise. + """ def __init__( self, @@ -72,13 +101,19 @@ def __init__( self.country_isocode = country_isocode self.batch_size = batch_size - def predict(self, x, cutoff_indexes, covariates, horizon): + def predict(self, x, cutoff_indexes, covariates, prediction_length): # TODO: thread ``covariates`` (static/hist/future) through to the SDK # once the benchmark datasets expose them. For now the dict is # ignored — Monash datasets carry no covariates. 
del covariates pd_freq = _to_pandas_freq(self.freq) + offsets = _shared_offsets_from_end(x, cutoff_indexes) + if getattr(self.model, "supports_batching", False) and offsets is not None: + return self._predict_batched(x, cutoff_indexes, prediction_length, pd_freq, offsets) + return self._predict_per_series(x, cutoff_indexes, prediction_length, pd_freq) + + def _predict_per_series(self, x, cutoff_indexes, prediction_length, pd_freq): results = [] for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)): series = np.asarray(series, dtype=np.float32) @@ -87,24 +122,21 @@ def predict(self, x, cutoff_indexes, covariates, horizon): T, C = series.shape index = pd.date_range("2000-01-01", periods=T, freq=pd_freq) - frames = [] - for c in range(C): - frames.append( - pd.DataFrame( - { - "unique_id": f"s{series_idx}_c{c}", - "ds": index, - "target": series[:, c], - } - ) - ) + frames = [ + pd.DataFrame({ + "unique_id": f"s{series_idx}_c{c}", + "ds": index, + "target": series[:, c], + }) + for c in range(C) + ] train_df = pd.concat(frames, ignore_index=True) - fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs] + forecast_df = self.client.cross_validate( train_df, model=self.model, - horizon=horizon, + horizon=prediction_length, freq=self.freq, fcds=fcds, quantiles=self.quantiles, @@ -115,23 +147,85 @@ def predict(self, x, cutoff_indexes, covariates, horizon): batch_size=self.batch_size, ) - value_col = f"{self.model}_q0.5" - if value_col not in forecast_df.columns: - value_col = str(self.model) - if value_col not in forecast_df.columns: - raise ValueError( - f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}" - ) + preds = self._gather_series_preds( + forecast_df, series_idx, C, cutoffs, fcds, prediction_length + ) + results.append(preds) + return results + + def _predict_batched(self, x, cutoff_indexes, prediction_length, pd_freq, offsets): + """One ``cross_validate`` call covering every series in ``x``. 
- preds = np.empty((len(cutoffs), horizon, C), dtype=np.float32) + Series are aligned to share an end date so all cutoffs collapse to + the same set of timestamps. The SDK then groups by ``unique_id`` + when building Chronos-2's (V, T) tensor. + """ + end = pd.Timestamp("2030-01-01") + frames = [] + per_series_meta = [] # (series_idx, C, index, cutoffs) + for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)): + series = np.asarray(series, dtype=np.float32) + if series.ndim == 1: + series = series[:, None] + T, C = series.shape + index = pd.date_range(end=end, periods=T, freq=pd_freq) for c in range(C): - channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"] - for k, fcd in enumerate(fcds): - window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(horizon) - preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32) + frames.append( + pd.DataFrame({ + "unique_id": f"s{series_idx}_c{c}", + "ds": index, + "target": series[:, c], + }) + ) + per_series_meta.append((series_idx, C, index, cutoffs)) + + train_df = pd.concat(frames, ignore_index=True) + # ``offsets`` is (T - cutoff) for any series, so the corresponding + # timestamp is end - (offset - 1) * delta. We let pandas pick the + # delta by walking the date_range backwards from ``end``. 
+ ref_index = pd.date_range(end=end, periods=max(offsets) + 1, freq=pd_freq) + fcds = sorted({pd.Timestamp(ref_index[-offset]) for offset in offsets}) + + forecast_df = self.client.cross_validate( + train_df, + model=self.model, + horizon=prediction_length, + freq=self.freq, + fcds=fcds, + quantiles=self.quantiles, + context=self.context, + add_holidays=self.add_holidays, + add_events=self.add_events, + country_isocode=self.country_isocode, + batch_size=self.batch_size, + ) + + results = [] + for series_idx, C, index, cutoffs in per_series_meta: + series_fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs] + preds = self._gather_series_preds( + forecast_df, series_idx, C, cutoffs, series_fcds, prediction_length + ) results.append(preds) return results + def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds, prediction_length): + value_col = f"{self.model}_q0.5" + if value_col not in forecast_df.columns: + value_col = str(self.model) + if value_col not in forecast_df.columns: + raise ValueError( + f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}" + ) + + preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32) + for c in range(C): + channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"] + for k, fcd in enumerate(fcds): + window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(prediction_length) + preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32) + return preds + class Solver(BaseSolver): """TFC hosted-API solver. 
From efe5ad5e43d25626ac4905545f43a54e40031dfb Mon Sep 17 00:00:00 2001 From: Geoffrey Negiar Date: Thu, 28 May 2026 16:54:23 +0200 Subject: [PATCH 04/12] REFACTOR typed ForecastInput, Covariates dataclass, prediction_length at dataset level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tightens the forecasting predict() contract introduced earlier in this PR: - New ``benchmark_utils.inputs.ForecastInput`` frozen dataclass bundles ``x``, ``cutoff_indexes``, and ``covariates``. The base ``predict`` signature is now ``predict(self, x: ForecastInput | np.ndarray)`` — forecasting adapters take the dataclass, classification / anomaly-detection adapters take a plain ndarray. No more ``*args/**kwargs``. - New ``benchmark_utils.covariates.Covariates`` frozen dataclass with ``static_covars / hist_covars / future_covars`` fields, each defaulting to an empty ``Sequence`` (so arrays work as well as lists). - ``prediction_length`` is removed from the predict signature. It is dataset-level state — the solver reads it from ``meta`` once and wires it into the adapter constructor. This keeps predict() pure per-call. Updated to the new contract: base adapter, objective (both ``_eval_forecasting`` and ``get_one_result``'s constant adapter), Monash dataset (now emits ``Covariates()``), Naive, SeasonalNaive, Chronos, ForecastResidual, TFC-API. 
Parity preserved on Monash[m1_yearly_dataset, debug=True]: - Naive[seasonality=1]: MAE 3,399,506 · MASE 12.86 · sMAPE 0.431 - SeasonalNaive[season_length=1]: identical to Naive[seasonality=1] ✓ - TFC-API[chronos-2] (batched): MAE 2,785,573 · MASE 10.53 - TFC-API[timesfm-2p5]: MAE 2,657,678 · MASE 10.05 Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmark_utils/adapters/base.py | 30 ++++++----- benchmark_utils/adapters/forecast_residual.py | 6 +-- benchmark_utils/covariates.py | 24 +++++++++ benchmark_utils/inputs.py | 35 +++++++++++++ datasets/monash.py | 7 +-- objective.py | 51 ++++++++++--------- solvers/chronos.py | 21 +++++--- solvers/naive.py | 20 +++++--- solvers/seasonal_naive.py | 20 +++++--- solvers/tfc_api.py | 38 +++++++------- 10 files changed, 163 insertions(+), 89 deletions(-) create mode 100644 benchmark_utils/covariates.py create mode 100644 benchmark_utils/inputs.py diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py index 1bd0ef7..c4cab9a 100644 --- a/benchmark_utils/adapters/base.py +++ b/benchmark_utils/adapters/base.py @@ -1,24 +1,19 @@ """Base interface that all task adapters must implement. A *fitted* adapter is what solvers return via ``get_result()``. -The objective calls ``adapter.predict(...)`` with task-appropriate inputs. +The objective calls ``adapter.predict(x)`` with task-appropriate inputs. Predict signature by task -------------------------- forecasting: - predict( - x: list[np.ndarray (T_i, C)], - cutoff_indexes: list[list[int]], - covariates: dict, - prediction_length: int, - ) -> list[np.ndarray (n_cutoffs_i, prediction_length, C)] + predict(x: ForecastInput) -> list[np.ndarray (n_cutoffs_i, prediction_length, C)] - ``cutoff_indexes[i][k]`` is the timestep index in ``x[i]`` at which - the k-th forecast for series ``i`` starts. The model must use only - ``x[i][:cutoff]`` as history. 
The ``covariates`` dict has shape - ``{"static_covars": list, "hist_covars": list, "future_covars": list}``; - the keys are always present (empty lists when unused). + :class:`~benchmark_utils.inputs.ForecastInput` bundles the per-series + history list, the jagged per-series cutoff indexes, and a + :class:`~benchmark_utils.covariates.Covariates` dataclass. + ``prediction_length`` is dataset-level — the solver reads it from the + objective and wires it into the adapter at construction time. classification: @@ -30,7 +25,14 @@ """ from abc import ABC, abstractmethod -from typing import Any +from typing import Any, Union + +import numpy as np + +from benchmark_utils.inputs import ForecastInput + + +PredictInput = Union[ForecastInput, np.ndarray] class BaseTSFMAdapter(ABC): @@ -45,5 +47,5 @@ def fit(self, X_train, y_train, **kwargs): return self @abstractmethod - def predict(self, *args, **kwargs) -> Any: + def predict(self, x: PredictInput) -> Any: """Task-specific inference. See module docstring for per-task signatures.""" diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py index 6cd0c0c..cbe7141 100644 --- a/benchmark_utils/adapters/forecast_residual.py +++ b/benchmark_utils/adapters/forecast_residual.py @@ -53,12 +53,10 @@ def predict(self, x: np.ndarray) -> np.ndarray: if not cutoffs: return scores + from benchmark_utils.inputs import ForecastInput try: preds = self.forecaster.predict( - [x], - cutoff_indexes=[cutoffs], - covariates={"static_covars": [], "hist_covars": [], "future_covars": []}, - prediction_length=self.prediction_length, + ForecastInput(x=[x], cutoff_indexes=[cutoffs]) )[0] # (n_cutoffs, H, C) except Exception: return scores diff --git a/benchmark_utils/covariates.py b/benchmark_utils/covariates.py new file mode 100644 index 0000000..62f4132 --- /dev/null +++ b/benchmark_utils/covariates.py @@ -0,0 +1,24 @@ +"""Covariates payload passed to forecasting adapters. 
+ +A small dataclass so the contract is typed and IDE-discoverable. All +three fields default to empty sequences, so datasets without covariates +can just pass ``Covariates()``. +""" + +from dataclasses import dataclass, field +from typing import Sequence + + +@dataclass(frozen=True) +class Covariates: + """Per-series covariates aligned with the ``x`` sequence in ``predict``. + + Each field is a sequence whose length equals ``len(x)``. Within a + series, the inner structure depends on the covariate kind — see the + forecasting predict() contract in + :mod:`benchmark_utils.adapters.base`. + """ + + static_covars: Sequence = field(default_factory=list) + hist_covars: Sequence = field(default_factory=list) + future_covars: Sequence = field(default_factory=list) diff --git a/benchmark_utils/inputs.py b/benchmark_utils/inputs.py new file mode 100644 index 0000000..93e94d9 --- /dev/null +++ b/benchmark_utils/inputs.py @@ -0,0 +1,35 @@ +"""Typed inputs for adapter ``predict()`` methods. + +Forecasting adapters receive a :class:`ForecastInput` (one struct per +call), while classification and anomaly-detection adapters receive a +plain :class:`numpy.ndarray`. The base ``predict`` signature is a union +of the two — see :mod:`benchmark_utils.adapters.base`. +""" + +from dataclasses import dataclass, field +from typing import Sequence + +import numpy as np + +from benchmark_utils.covariates import Covariates + + +@dataclass(frozen=True) +class ForecastInput: + """Bundle of arguments passed to a forecasting adapter's predict(). + + Attributes + ---------- + x : sequence of np.ndarray + One ``(T_i, C)`` array per series. The adapter must use only + ``x[i][:cutoff]`` as history for the cutoff at index k. + cutoff_indexes : sequence of sequence of int + Jagged — per-series timestep indexes at which a forecast starts. + covariates : Covariates + Static / historical / future covariates aligned with ``x``. + Defaults to empty. 
+ """ + + x: Sequence[np.ndarray] + cutoff_indexes: Sequence[Sequence[int]] + covariates: Covariates = field(default_factory=Covariates) diff --git a/datasets/monash.py b/datasets/monash.py index ad853c0..049ce73 100644 --- a/datasets/monash.py +++ b/datasets/monash.py @@ -34,6 +34,7 @@ from benchopt import BaseDataset from aeon.datasets import load_forecasting +from benchmark_utils.covariates import Covariates from benchmark_utils.windowing import make_forecasting_splits @@ -138,11 +139,7 @@ def get_data(self): X_test=X_test, y_test=y_test, cutoff_indexes=cutoff_indexes, - covariates={ - "static_covars": [], - "hist_covars": [], - "future_covars": [], - }, + covariates=Covariates(), task="forecasting", metrics=["mae", "mse", "mase", "smape"], prediction_length=pred_len, diff --git a/objective.py b/objective.py index 5e26afc..169219b 100644 --- a/objective.py +++ b/objective.py @@ -22,9 +22,14 @@ ``x[:cutoff]`` as history cutoff_indexes List[List[int]] jagged per-series cutoffs y_test List[(n_cutoffs, H, C)] - covariates dict {static_covars, hist_covars, - future_covars} - extra prediction_length (int), freq (str) + covariates Covariates dataclass with + static / hist / future + covariate lists + extra prediction_length (int), freq (str) — + the solver reads these + from the objective once + and wires them into the + adapter classification y_train (N,) int y_test (M,) int extra n_classes (int) @@ -66,16 +71,14 @@ class Objective(BaseObjective): def set_data(self, X_train, y_train, X_test, y_test, task, metrics, cutoff_indexes=None, covariates=None, **meta): + from benchmark_utils.covariates import Covariates + self.X_train = X_train self.y_train = y_train self.X_test = X_test self.y_test = y_test self.cutoff_indexes = cutoff_indexes - self.covariates = covariates or { - "static_covars": [], - "hist_covars": [], - "future_covars": [], - } + self.covariates = covariates if covariates is not None else Covariates() self.task = task self.metrics = metrics self.meta = 
meta # freq, prediction_length, n_classes, … @@ -109,12 +112,14 @@ def evaluate_result(self, model): # --- forecasting --------------------------------------------------- def _eval_forecasting(self, model): - prediction_length = self.meta.get("prediction_length", 1) + from benchmark_utils.inputs import ForecastInput + preds_per_series = model.predict( - self.X_test, - cutoff_indexes=self.cutoff_indexes, - covariates=self.covariates, - prediction_length=prediction_length, + ForecastInput( + x=self.X_test, + cutoff_indexes=self.cutoff_indexes, + covariates=self.covariates, + ) ) preds, targets = [], [] @@ -169,27 +174,23 @@ def get_one_result(self): from benchmark_utils.adapters.base import BaseTSFMAdapter class _ConstantAdapter(BaseTSFMAdapter): - def __init__(self, task, meta): + def __init__(self, task, prediction_length): self._task = task - self._meta = meta + self._prediction_length = prediction_length - def predict(self, *args, **kwargs): + def predict(self, x): if self._task == "forecasting": - x = args[0] - cutoff_indexes = kwargs.get( - "cutoff_indexes", args[1] if len(args) > 1 else None - ) - H = kwargs.get("prediction_length", self._meta.get("prediction_length", 1)) + H = self._prediction_length preds = [] - for series, cutoffs in zip(x, cutoff_indexes or []): + for series, cutoffs in zip(x.x, x.cutoff_indexes): C = series.shape[1] if series.ndim == 2 else 1 preds.append(np.zeros((len(cutoffs), H, C), dtype=np.float32)) return preds elif self._task == "classification": - x = args[0] return np.zeros(len(x), dtype=np.int64) elif self._task == "anomaly_detection": - x = args[0] return np.zeros(x.shape[0], dtype=np.float32) - return {"model": _ConstantAdapter(self.task, self.meta)} + return {"model": _ConstantAdapter( + self.task, self.meta.get("prediction_length", 1) + )} diff --git a/solvers/chronos.py b/solvers/chronos.py index 70bea1d..7dc6759 100644 --- a/solvers/chronos.py +++ b/solvers/chronos.py @@ -20,6 +20,7 @@ from benchopt import BaseSolver from 
benchmark_utils.adapters.forecast_residual import ForecastResidualAdapter +from benchmark_utils.inputs import ForecastInput SUPPORTED_TASKS = {"forecasting", "anomaly_detection"} @@ -32,18 +33,18 @@ class _ChronosForecaster: """Wraps ChronosPipeline with the batched series+cutoffs predict API.""" - def __init__(self, pipeline): + def __init__(self, pipeline, prediction_length): self.pipeline = pipeline + self.prediction_length = prediction_length - def predict(self, x, cutoff_indexes, covariates, prediction_length): - del covariates + def predict(self, x: ForecastInput): import torch results = [] - for series, cutoffs in zip(x, cutoff_indexes): + for series, cutoffs in zip(x.x, x.cutoff_indexes): series = np.asarray(series, dtype=np.float32) C = series.shape[1] if series.ndim == 2 else 1 - out = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32) + out = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32) for k, cutoff in enumerate(cutoffs): hist = series[:cutoff] if hist.ndim == 1: @@ -53,7 +54,7 @@ def predict(self, x, cutoff_indexes, covariates, prediction_length): context = torch.from_numpy(hist[:, c]).unsqueeze(0) forecast = self.pipeline.predict( context, - prediction_length=prediction_length, + prediction_length=self.prediction_length, ) f = forecast[0] if f.ndim == 2: @@ -117,14 +118,18 @@ def set_objective(self, X_train, y_train, task, **meta): self._loaded_model = model_id def run(self, _): - forecaster = _ChronosForecaster(self._pipeline) + pred_len = self.meta.get("prediction_length", 1) + forecaster = _ChronosForecaster(self._pipeline, pred_len) if self.task == "forecasting": self._adapter = forecaster elif self.task == "anomaly_detection": + # AD uses one-step-ahead forecasts; rebuild the forecaster + # with prediction_length=1 to match. 
self._adapter = ForecastResidualAdapter( - forecaster, prediction_length=1 + _ChronosForecaster(self._pipeline, prediction_length=1), + prediction_length=1, ) def get_result(self): diff --git a/solvers/naive.py b/solvers/naive.py index 4f5be19..83f21d6 100644 --- a/solvers/naive.py +++ b/solvers/naive.py @@ -12,6 +12,7 @@ from benchopt import BaseSolver from benchmark_utils.adapters.base import BaseTSFMAdapter +from benchmark_utils.inputs import ForecastInput # --------------------------------------------------------------------------- @@ -21,22 +22,22 @@ class _NaiveForecaster(BaseTSFMAdapter): """Repeat the last ``seasonality`` values to fill the horizon.""" - def __init__(self, seasonality=1): + def __init__(self, prediction_length, seasonality=1): + self.prediction_length = prediction_length self.seasonality = seasonality - def predict(self, x, cutoff_indexes, covariates, prediction_length): - del covariates + def predict(self, x: ForecastInput): results = [] - for series, cutoffs in zip(x, cutoff_indexes): + for series, cutoffs in zip(x.x, x.cutoff_indexes): series = np.asarray(series) C = series.shape[1] if series.ndim == 2 else 1 - preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32) + preds = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32) for k, cutoff in enumerate(cutoffs): hist = series[:cutoff] season = min(self.seasonality, hist.shape[0]) pattern = hist[-season:] - reps = int(np.ceil(prediction_length / season)) - preds[k] = np.tile(pattern, (reps, 1))[:prediction_length] + reps = int(np.ceil(self.prediction_length / season)) + preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length] results.append(preds) return results @@ -100,7 +101,10 @@ def set_objective(self, X_train, y_train, task, **meta): def run(self, _): if self.task == "forecasting": - self._adapter = _NaiveForecaster(self.seasonality) + self._adapter = _NaiveForecaster( + prediction_length=self.meta.get("prediction_length", 1), + 
seasonality=self.seasonality, + ) elif self.task == "classification": self._adapter = _MajorityClassifier() diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py index 5b2c748..723551e 100644 --- a/solvers/seasonal_naive.py +++ b/solvers/seasonal_naive.py @@ -14,6 +14,7 @@ from benchopt import BaseSolver from benchmark_utils.adapters.base import BaseTSFMAdapter +from benchmark_utils.inputs import ForecastInput SUPPORTED_TASKS = {"forecasting"} @@ -22,24 +23,24 @@ class _SeasonalNaiveForecaster(BaseTSFMAdapter): """Repeat the last ``season_length`` observations to fill the horizon.""" - def __init__(self, season_length: int): + def __init__(self, prediction_length: int, season_length: int): if season_length < 1: raise ValueError(f"season_length must be >= 1, got {season_length}") + self.prediction_length = prediction_length self.season_length = season_length - def predict(self, x, cutoff_indexes, covariates, prediction_length): - del covariates + def predict(self, x: ForecastInput): results = [] - for series, cutoffs in zip(x, cutoff_indexes): + for series, cutoffs in zip(x.x, x.cutoff_indexes): series = np.asarray(series) C = series.shape[1] if series.ndim == 2 else 1 - preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32) + preds = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32) for k, cutoff in enumerate(cutoffs): hist = series[:cutoff] season = min(self.season_length, hist.shape[0]) pattern = hist[-season:] - reps = int(np.ceil(prediction_length / season)) - preds[k] = np.tile(pattern, (reps, 1))[:prediction_length] + reps = int(np.ceil(self.prediction_length / season)) + preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length] results.append(preds) return results @@ -78,7 +79,10 @@ def set_objective(self, X_train, y_train, task, **meta): self.meta = meta def run(self, _): - self._adapter = _SeasonalNaiveForecaster(self.season_length) + self._adapter = _SeasonalNaiveForecaster( + 
prediction_length=self.meta.get("prediction_length", 1), + season_length=self.season_length, + ) def get_result(self): return {"model": self._adapter} diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py index 88980aa..d5e3f87 100644 --- a/solvers/tfc_api.py +++ b/solvers/tfc_api.py @@ -32,6 +32,7 @@ from benchopt import BaseSolver from benchmark_utils.adapters.base import BaseTSFMAdapter +from benchmark_utils.inputs import ForecastInput SUPPORTED_TASKS = {"forecasting"} @@ -79,6 +80,7 @@ def __init__( self, client, model, + prediction_length: int, freq: str, context: Optional[int], quantiles: Optional[list[float]], @@ -89,6 +91,7 @@ def __init__( ): self.client = client self.model = model # TFCModels enum + self.prediction_length = prediction_length self.freq = _to_api_freq(freq) if quantiles is None: quantiles = [0.5] @@ -101,19 +104,19 @@ def __init__( self.country_isocode = country_isocode self.batch_size = batch_size - def predict(self, x, cutoff_indexes, covariates, prediction_length): - # TODO: thread ``covariates`` (static/hist/future) through to the SDK - # once the benchmark datasets expose them. For now the dict is - # ignored — Monash datasets carry no covariates. - del covariates + def predict(self, x: ForecastInput): + # TODO: thread ``x.covariates`` (static/hist/future) through to the SDK + # once the benchmark datasets populate them. Monash currently + # carries none, so the dataclass arrives with empty sequences. 
+ series_list, cutoff_indexes = x.x, x.cutoff_indexes pd_freq = _to_pandas_freq(self.freq) - offsets = _shared_offsets_from_end(x, cutoff_indexes) + offsets = _shared_offsets_from_end(series_list, cutoff_indexes) if getattr(self.model, "supports_batching", False) and offsets is not None: - return self._predict_batched(x, cutoff_indexes, prediction_length, pd_freq, offsets) - return self._predict_per_series(x, cutoff_indexes, prediction_length, pd_freq) + return self._predict_batched(series_list, cutoff_indexes, pd_freq, offsets) + return self._predict_per_series(series_list, cutoff_indexes, pd_freq) - def _predict_per_series(self, x, cutoff_indexes, prediction_length, pd_freq): + def _predict_per_series(self, x, cutoff_indexes, pd_freq): results = [] for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)): series = np.asarray(series, dtype=np.float32) @@ -136,7 +139,7 @@ def _predict_per_series(self, x, cutoff_indexes, prediction_length, pd_freq): forecast_df = self.client.cross_validate( train_df, model=self.model, - horizon=prediction_length, + horizon=self.prediction_length, freq=self.freq, fcds=fcds, quantiles=self.quantiles, @@ -148,12 +151,12 @@ def _predict_per_series(self, x, cutoff_indexes, prediction_length, pd_freq): ) preds = self._gather_series_preds( - forecast_df, series_idx, C, cutoffs, fcds, prediction_length + forecast_df, series_idx, C, cutoffs, fcds ) results.append(preds) return results - def _predict_batched(self, x, cutoff_indexes, prediction_length, pd_freq, offsets): + def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets): """One ``cross_validate`` call covering every series in ``x``. 
Series are aligned to share an end date so all cutoffs collapse to @@ -189,7 +192,7 @@ def _predict_batched(self, x, cutoff_indexes, prediction_length, pd_freq, offset forecast_df = self.client.cross_validate( train_df, model=self.model, - horizon=prediction_length, + horizon=self.prediction_length, freq=self.freq, fcds=fcds, quantiles=self.quantiles, @@ -204,12 +207,12 @@ def _predict_batched(self, x, cutoff_indexes, prediction_length, pd_freq, offset for series_idx, C, index, cutoffs in per_series_meta: series_fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs] preds = self._gather_series_preds( - forecast_df, series_idx, C, cutoffs, series_fcds, prediction_length + forecast_df, series_idx, C, cutoffs, series_fcds ) results.append(preds) return results - def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds, prediction_length): + def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds): value_col = f"{self.model}_q0.5" if value_col not in forecast_df.columns: value_col = str(self.model) @@ -218,11 +221,11 @@ def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds, predic f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}" ) - preds = np.empty((len(cutoffs), prediction_length, C), dtype=np.float32) + preds = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32) for c in range(C): channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"] for k, fcd in enumerate(fcds): - window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(prediction_length) + window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(self.prediction_length) preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32) return preds @@ -293,6 +296,7 @@ def run(self, _): self._adapter = _TFCAPIForecaster( client=self._client, model=self._model_enum, + prediction_length=self.meta.get("prediction_length", 1), freq=self.meta.get("freq", "D"), 
context=self.context, quantiles=None, From 1eced14cd1574ab519675472d7530af2d2c0eb25 Mon Sep 17 00:00:00 2001 From: Geoffrey Negiar Date: Thu, 28 May 2026 17:05:56 +0200 Subject: [PATCH 05/12] ENH add quantile dimension to forecasting outputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Forecasting predict() now returns ``Sequence[ForecastOutput]`` instead of a list of raw point arrays. ``ForecastOutput`` is a frozen dataclass holding: - ``quantiles``: ndarray with shape ``(n_cutoffs, Q, prediction_length, C)``. - ``quantile_levels``: tuple of floats in (0, 1), length Q. Point forecasters (Naive, SeasonalNaive, Chronos) set ``quantile_levels=(0.5,)`` and Q=1. The TFC-API adapter now discovers every ``_q{level}`` column the SDK returns and stacks them into ``quantiles`` with the matching ``quantile_levels`` tuple — falling back to the mean column when no quantile columns are present. ``ForecastOutput.point`` returns the best point estimate for metric computation: the median when present, otherwise the mean across quantile levels. The objective uses that property in ``_eval_forecasting``. Adapter contract update in ``base.py`` docstring. ``forecast_residual`` extracts ``.point`` from the wrapped forecaster. Verified on Monash[m1_yearly_dataset, debug=True]: Naive, SeasonalNaive, TFC-API[chronos-2] and TFC-API[timesfm-2p5] all match their previous metrics exactly, confirming the median extraction preserves parity. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmark_utils/adapters/base.py | 8 ++- benchmark_utils/adapters/forecast_residual.py | 5 +- benchmark_utils/outputs.py | 55 +++++++++++++++++++ objective.py | 16 +++--- solvers/chronos.py | 6 +- solvers/naive.py | 6 +- solvers/seasonal_naive.py | 6 +- solvers/tfc_api.py | 46 ++++++++++------ 8 files changed, 118 insertions(+), 30 deletions(-) create mode 100644 benchmark_utils/outputs.py diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py index c4cab9a..3f57a98 100644 --- a/benchmark_utils/adapters/base.py +++ b/benchmark_utils/adapters/base.py @@ -7,11 +7,17 @@ -------------------------- forecasting: - predict(x: ForecastInput) -> list[np.ndarray (n_cutoffs_i, prediction_length, C)] + predict(x: ForecastInput) -> Sequence[ForecastOutput] :class:`~benchmark_utils.inputs.ForecastInput` bundles the per-series history list, the jagged per-series cutoff indexes, and a :class:`~benchmark_utils.covariates.Covariates` dataclass. + + :class:`~benchmark_utils.outputs.ForecastOutput` carries the + quantile-resolved forecast — shape + ``(n_cutoffs, Q, prediction_length, C)`` plus the matching quantile + levels. Point forecasters set ``quantile_levels=(0.5,)`` and Q=1. + ``prediction_length`` is dataset-level — the solver reads it from the objective and wires it into the adapter at construction time. 
diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py index cbe7141..89623c2 100644 --- a/benchmark_utils/adapters/forecast_residual.py +++ b/benchmark_utils/adapters/forecast_residual.py @@ -55,9 +55,10 @@ def predict(self, x: np.ndarray) -> np.ndarray: from benchmark_utils.inputs import ForecastInput try: - preds = self.forecaster.predict( + output = self.forecaster.predict( ForecastInput(x=[x], cutoff_indexes=[cutoffs]) - )[0] # (n_cutoffs, H, C) + )[0] + preds = output.point # (n_cutoffs, H, C) except Exception: return scores diff --git a/benchmark_utils/outputs.py b/benchmark_utils/outputs.py new file mode 100644 index 0000000..67f4187 --- /dev/null +++ b/benchmark_utils/outputs.py @@ -0,0 +1,55 @@ +"""Typed outputs returned by forecasting adapters. + +Forecasting predict() returns ``Sequence[ForecastOutput]`` — one entry +per input series. Each ``ForecastOutput`` carries a quantile-resolved +forecast with shape ``(n_cutoffs, Q, prediction_length, C)`` plus the +quantile levels themselves. Point forecasters set ``quantile_levels = +(0.5,)`` and Q=1; probabilistic forecasters can return as many quantiles +as their model produces. +""" + +from dataclasses import dataclass +from typing import Sequence + +import numpy as np + + +@dataclass(frozen=True) +class ForecastOutput: + """Per-series forecast. + + Attributes + ---------- + quantiles : np.ndarray + Shape ``(n_cutoffs, Q, prediction_length, C)``. ``quantiles[k, q]`` + is the forecast for the k-th cutoff at quantile level + ``quantile_levels[q]``. + quantile_levels : sequence of float + Length ``Q``. Each entry is a quantile level in (0, 1). 
+ """ + + quantiles: np.ndarray + quantile_levels: Sequence[float] + + def __post_init__(self): + if self.quantiles.ndim != 4: + raise ValueError( + f"quantiles must have ndim=4 (n_cutoffs, Q, prediction_length, C); " + f"got shape {self.quantiles.shape}" + ) + if self.quantiles.shape[1] != len(self.quantile_levels): + raise ValueError( + f"quantiles.shape[1] ({self.quantiles.shape[1]}) must equal " + f"len(quantile_levels) ({len(self.quantile_levels)})" + ) + + @property + def point(self) -> np.ndarray: + """Best point estimate — median when available, else mean over quantiles. + + Shape: ``(n_cutoffs, prediction_length, C)``. + """ + levels = list(self.quantile_levels) + if 0.5 in levels: + return self.quantiles[:, levels.index(0.5), :, :] + return self.quantiles.mean(axis=1) diff --git a/objective.py b/objective.py index 169219b..adece87 100644 --- a/objective.py +++ b/objective.py @@ -114,7 +114,7 @@ def evaluate_result(self, model): def _eval_forecasting(self, model): from benchmark_utils.inputs import ForecastInput - preds_per_series = model.predict( + outputs_per_series = model.predict( ForecastInput( x=self.X_test, cutoff_indexes=self.cutoff_indexes, @@ -123,9 +123,9 @@ def _eval_forecasting(self, model): ) preds, targets = [], [] - for series_preds, series_targets in zip(preds_per_series, self.y_test): - sp = np.asarray(series_preds) # (n_cutoffs, H, C) - st = np.asarray(series_targets) # (n_cutoffs, H, C) + for series_output, series_targets in zip(outputs_per_series, self.y_test): + sp = np.asarray(series_output.point) # (n_cutoffs, H, C) + st = np.asarray(series_targets) for k in range(sp.shape[0]): preds.append(sp[k]) targets.append(st[k]) @@ -172,6 +172,7 @@ def _eval_anomaly_detection(self, model): def get_one_result(self): """Return a minimal valid result for benchopt's internal checks.""" from benchmark_utils.adapters.base import BaseTSFMAdapter + from benchmark_utils.outputs import ForecastOutput class _ConstantAdapter(BaseTSFMAdapter): def 
__init__(self, task, prediction_length): @@ -181,11 +182,12 @@ def __init__(self, task, prediction_length): def predict(self, x): if self._task == "forecasting": H = self._prediction_length - preds = [] + outs = [] for series, cutoffs in zip(x.x, x.cutoff_indexes): C = series.shape[1] if series.ndim == 2 else 1 - preds.append(np.zeros((len(cutoffs), H, C), dtype=np.float32)) - return preds + q = np.zeros((len(cutoffs), 1, H, C), dtype=np.float32) + outs.append(ForecastOutput(quantiles=q, quantile_levels=(0.5,))) + return outs elif self._task == "classification": return np.zeros(len(x), dtype=np.int64) elif self._task == "anomaly_detection": diff --git a/solvers/chronos.py b/solvers/chronos.py index 7dc6759..7ac830a 100644 --- a/solvers/chronos.py +++ b/solvers/chronos.py @@ -21,6 +21,7 @@ from benchmark_utils.adapters.forecast_residual import ForecastResidualAdapter from benchmark_utils.inputs import ForecastInput +from benchmark_utils.outputs import ForecastOutput SUPPORTED_TASKS = {"forecasting", "anomaly_detection"} @@ -60,7 +61,10 @@ def predict(self, x: ForecastInput): if f.ndim == 2: f = f.median(dim=0).values out[k, :, c] = f.numpy() - results.append(out) + results.append(ForecastOutput( + quantiles=out[:, None, :, :], + quantile_levels=(0.5,), + )) return results diff --git a/solvers/naive.py b/solvers/naive.py index 83f21d6..7603427 100644 --- a/solvers/naive.py +++ b/solvers/naive.py @@ -13,6 +13,7 @@ from benchmark_utils.adapters.base import BaseTSFMAdapter from benchmark_utils.inputs import ForecastInput +from benchmark_utils.outputs import ForecastOutput # --------------------------------------------------------------------------- @@ -38,7 +39,10 @@ def predict(self, x: ForecastInput): pattern = hist[-season:] reps = int(np.ceil(self.prediction_length / season)) preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length] - results.append(preds) + results.append(ForecastOutput( + quantiles=preds[:, None, :, :], # (n_cutoffs, 1, H, C) + 
quantile_levels=(0.5,), + )) return results diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py index 723551e..2adf1ec 100644 --- a/solvers/seasonal_naive.py +++ b/solvers/seasonal_naive.py @@ -15,6 +15,7 @@ from benchmark_utils.adapters.base import BaseTSFMAdapter from benchmark_utils.inputs import ForecastInput +from benchmark_utils.outputs import ForecastOutput SUPPORTED_TASKS = {"forecasting"} @@ -41,7 +42,10 @@ def predict(self, x: ForecastInput): pattern = hist[-season:] reps = int(np.ceil(self.prediction_length / season)) preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length] - results.append(preds) + results.append(ForecastOutput( + quantiles=preds[:, None, :, :], + quantile_levels=(0.5,), + )) return results diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py index d5e3f87..9083d18 100644 --- a/solvers/tfc_api.py +++ b/solvers/tfc_api.py @@ -33,6 +33,7 @@ from benchmark_utils.adapters.base import BaseTSFMAdapter from benchmark_utils.inputs import ForecastInput +from benchmark_utils.outputs import ForecastOutput SUPPORTED_TASKS = {"forecasting"} @@ -150,10 +151,9 @@ def _predict_per_series(self, x, cutoff_indexes, pd_freq): batch_size=self.batch_size, ) - preds = self._gather_series_preds( + results.append(self._gather_series_output( forecast_df, series_idx, C, cutoffs, fcds - ) - results.append(preds) + )) return results def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets): @@ -206,28 +206,40 @@ def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets): results = [] for series_idx, C, index, cutoffs in per_series_meta: series_fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs] - preds = self._gather_series_preds( + results.append(self._gather_series_output( forecast_df, series_idx, C, cutoffs, series_fcds - ) - results.append(preds) + )) return results - def _gather_series_preds(self, forecast_df, series_idx, C, cutoffs, fcds): - value_col = f"{self.model}_q0.5" - if value_col not in forecast_df.columns: - 
value_col = str(self.model) - if value_col not in forecast_df.columns: - raise ValueError( - f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}" - ) + def _gather_series_output(self, forecast_df, series_idx, C, cutoffs, fcds): + # Discover which quantile columns the SDK returned; fall back to + # the mean column only when no quantiles are present. + levels, quantile_cols = [], [] + for q in self.quantiles: + col = f"{self.model}_q{q}" + if col in forecast_df.columns: + levels.append(q) + quantile_cols.append(col) + if not quantile_cols: + mean_col = str(self.model) + if mean_col not in forecast_df.columns: + raise ValueError( + f"TFC API response missing expected columns; got {list(forecast_df.columns)!r}" + ) + levels = [0.5] + quantile_cols = [mean_col] - preds = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32) + Q = len(levels) + preds = np.empty( + (len(cutoffs), Q, self.prediction_length, C), dtype=np.float32 + ) for c in range(C): channel = forecast_df.loc[forecast_df["unique_id"] == f"s{series_idx}_c{c}"] for k, fcd in enumerate(fcds): window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(self.prediction_length) - preds[k, :, c] = window[value_col].to_numpy(dtype=np.float32) - return preds + for q_idx, col in enumerate(quantile_cols): + preds[k, q_idx, :, c] = window[col].to_numpy(dtype=np.float32) + return ForecastOutput(quantiles=preds, quantile_levels=tuple(levels)) class Solver(BaseSolver): From 01381469090e9104fbf05ae136bd40e10f07df86 Mon Sep 17 00:00:00 2001 From: Geoffrey Negiar Date: Thu, 28 May 2026 17:30:44 +0200 Subject: [PATCH 06/12] REFACTOR ForecastOutput is a single object + Chronos-2 batched local inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ``ForecastOutput`` is now a single dataclass per ``predict()`` call, not a per-series sequence. 
Its ``quantiles`` field is a ``Sequence[np.ndarray]`` aligned with the input series, each entry shape ``(n_cutoffs_i, Q, prediction_length, C)``. The ``quantile_levels`` tuple is shared across the batch. ``.point`` returns one ndarray per series. - Adapter signature is now ``predict(self, x: ForecastInput) -> ForecastOutput``, with that return type explicit on every forecasting predict() in the codebase. - The local Chronos solver is now Chronos-2 (matching the upstream migration on origin/main). The forecaster batches every (series, cutoff) pair into one ``Chronos2Pipeline.predict`` call — variable context lengths handled by the pipeline's left-padding — and returns the model's full 9-level quantile fan. - Updated all forecasting solvers + the constant adapter in ``get_one_result`` + ``ForecastResidualAdapter`` to the new contract. Parity verified on Monash[m1_yearly_dataset, debug=True]: Naive, SeasonalNaive, TFC-API[chronos-2], TFC-API[timesfm-2p5] match their prior metrics exactly. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmark_utils/adapters/base.py | 11 +- benchmark_utils/adapters/forecast_residual.py | 4 +- benchmark_utils/outputs.py | 62 ++++---- objective.py | 13 +- solvers/chronos.py | 134 +++++++++--------- solvers/naive.py | 11 +- solvers/seasonal_naive.py | 11 +- solvers/tfc_api.py | 36 +++-- 8 files changed, 143 insertions(+), 139 deletions(-) diff --git a/benchmark_utils/adapters/base.py b/benchmark_utils/adapters/base.py index 3f57a98..f7e2b69 100644 --- a/benchmark_utils/adapters/base.py +++ b/benchmark_utils/adapters/base.py @@ -7,16 +7,17 @@ -------------------------- forecasting: - predict(x: ForecastInput) -> Sequence[ForecastOutput] + predict(x: ForecastInput) -> ForecastOutput :class:`~benchmark_utils.inputs.ForecastInput` bundles the per-series history list, the jagged per-series cutoff indexes, and a :class:`~benchmark_utils.covariates.Covariates` dataclass. 
- :class:`~benchmark_utils.outputs.ForecastOutput` carries the - quantile-resolved forecast — shape - ``(n_cutoffs, Q, prediction_length, C)`` plus the matching quantile - levels. Point forecasters set ``quantile_levels=(0.5,)`` and Q=1. + :class:`~benchmark_utils.outputs.ForecastOutput` is a single object + covering every input series — its ``quantiles`` field is a Sequence + of ``(n_cutoffs_i, Q, prediction_length, C)`` arrays, one per series, + with a shared ``quantile_levels`` tuple. Point forecasters set + ``quantile_levels=(0.5,)`` and Q=1. ``prediction_length`` is dataset-level — the solver reads it from the objective and wires it into the adapter at construction time. diff --git a/benchmark_utils/adapters/forecast_residual.py b/benchmark_utils/adapters/forecast_residual.py index 89623c2..ec463fd 100644 --- a/benchmark_utils/adapters/forecast_residual.py +++ b/benchmark_utils/adapters/forecast_residual.py @@ -57,8 +57,8 @@ def predict(self, x: np.ndarray) -> np.ndarray: try: output = self.forecaster.predict( ForecastInput(x=[x], cutoff_indexes=[cutoffs]) - )[0] - preds = output.point # (n_cutoffs, H, C) + ) + preds = output.point[0] # (n_cutoffs, H, C) except Exception: return scores diff --git a/benchmark_utils/outputs.py b/benchmark_utils/outputs.py index 67f4187..cf9fba9 100644 --- a/benchmark_utils/outputs.py +++ b/benchmark_utils/outputs.py @@ -1,11 +1,10 @@ -"""Typed outputs returned by forecasting adapters. - -Forecasting predict() returns ``Sequence[ForecastOutput]`` — one entry -per input series. Each ``ForecastOutput`` carries a quantile-resolved -forecast with shape ``(n_cutoffs, Q, prediction_length, C)`` plus the -quantile levels themselves. Point forecasters set ``quantile_levels = -(0.5,)`` and Q=1; probabilistic forecasters can return as many quantiles -as their model produces. +"""Typed output returned by forecasting adapters. 
+ +Forecasting predict() returns a single :class:`ForecastOutput` covering +every input series in the matching :class:`ForecastInput`. The output is +shape-aware: ``quantiles[i]`` is the per-series ndarray +``(n_cutoffs_i, Q, prediction_length, C)``, aligned with the same index +order as the input ``x``. """ from dataclasses import dataclass @@ -16,40 +15,45 @@ @dataclass(frozen=True) class ForecastOutput: - """Per-series forecast. + """Quantile-resolved forecast for a batch of series. Attributes ---------- - quantiles : np.ndarray - Shape ``(n_cutoffs, Q, prediction_length, C)``. ``quantiles[k, q]`` - is the forecast for the k-th cutoff at quantile level + quantiles : sequence of np.ndarray + One ndarray per series, each shape + ``(n_cutoffs_i, Q, prediction_length, C)``. ``quantiles[i][k, q]`` + is the forecast for series ``i``, cutoff ``k``, at quantile level ``quantile_levels[q]``. quantile_levels : sequence of float - Length ``Q``. Each entry is a quantile level in (0, 1). + Length ``Q``. Each entry is a quantile level in (0, 1). The same + ``Q`` applies to every series in the batch. 
""" - quantiles: np.ndarray + quantiles: Sequence[np.ndarray] quantile_levels: Sequence[float] def __post_init__(self): - if self.quantiles.ndim != 4: - raise ValueError( - f"quantiles must have ndim=4 (n_cutoffs, Q, prediction_length, C); " - f"got shape {self.quantiles.shape}" - ) - if self.quantiles.shape[1] != len(self.quantile_levels): - raise ValueError( - f"quantiles.shape[1] ({self.quantiles.shape[1]}) must equal " - f"len(quantile_levels) ({len(self.quantile_levels)})" - ) + Q = len(self.quantile_levels) + for i, arr in enumerate(self.quantiles): + if arr.ndim != 4: + raise ValueError( + f"quantiles[{i}] must have ndim=4 " + f"(n_cutoffs, Q, prediction_length, C); got shape {arr.shape}" + ) + if arr.shape[1] != Q: + raise ValueError( + f"quantiles[{i}].shape[1] ({arr.shape[1]}) must equal " + f"len(quantile_levels) ({Q})" + ) @property - def point(self) -> np.ndarray: - """Best point estimate — median when available, else mean over quantiles. + def point(self) -> Sequence[np.ndarray]: + """Best point estimate per series — median when available, else mean across quantiles. - Shape: ``(n_cutoffs, prediction_length, C)``. + Each entry has shape ``(n_cutoffs_i, prediction_length, C)``. 
""" levels = list(self.quantile_levels) if 0.5 in levels: - return self.quantiles[:, levels.index(0.5), :, :] - return self.quantiles.mean(axis=1) + idx = levels.index(0.5) + return [arr[:, idx, :, :] for arr in self.quantiles] + return [arr.mean(axis=1) for arr in self.quantiles] diff --git a/objective.py b/objective.py index adece87..9c6b2de 100644 --- a/objective.py +++ b/objective.py @@ -114,7 +114,7 @@ def evaluate_result(self, model): def _eval_forecasting(self, model): from benchmark_utils.inputs import ForecastInput - outputs_per_series = model.predict( + output = model.predict( ForecastInput( x=self.X_test, cutoff_indexes=self.cutoff_indexes, @@ -123,8 +123,8 @@ def _eval_forecasting(self, model): ) preds, targets = [], [] - for series_output, series_targets in zip(outputs_per_series, self.y_test): - sp = np.asarray(series_output.point) # (n_cutoffs, H, C) + for series_point, series_targets in zip(output.point, self.y_test): + sp = np.asarray(series_point) # (n_cutoffs, H, C) st = np.asarray(series_targets) for k in range(sp.shape[0]): preds.append(sp[k]) @@ -182,12 +182,11 @@ def __init__(self, task, prediction_length): def predict(self, x): if self._task == "forecasting": H = self._prediction_length - outs = [] + qs = [] for series, cutoffs in zip(x.x, x.cutoff_indexes): C = series.shape[1] if series.ndim == 2 else 1 - q = np.zeros((len(cutoffs), 1, H, C), dtype=np.float32) - outs.append(ForecastOutput(quantiles=q, quantile_levels=(0.5,))) - return outs + qs.append(np.zeros((len(cutoffs), 1, H, C), dtype=np.float32)) + return ForecastOutput(quantiles=qs, quantile_levels=(0.5,)) elif self._task == "classification": return np.zeros(len(x), dtype=np.int64) elif self._task == "anomaly_detection": diff --git a/solvers/chronos.py b/solvers/chronos.py index 7ac830a..4a32a04 100644 --- a/solvers/chronos.py +++ b/solvers/chronos.py @@ -1,24 +1,23 @@ -"""Chronos solver for the TSFM benchmark. +"""Chronos-2 solver for the TSFM benchmark (local inference). 
Supports: - - forecasting : zero-shot via ChronosPipeline - - anomaly_detection : forecast-residual (zero-shot) + - forecasting : zero-shot via ``Chronos2Pipeline`` + - anomaly_detection : forecast-residual on top of the same forecaster Classification is not yet implemented; the solver skips that task. -Model loading is done in ``set_objective`` (untimed). -Adaptation fitting is done in ``run`` (timed). - -Adding a new task ------------------ -1. Add the task name to ``SUPPORTED_TASKS``. -2. In ``run``, instantiate the appropriate adapter from - ``benchmark_utils.adapters`` (or implement a new one there). +Model loading is done in ``set_objective`` (untimed). Inference batches +every (series, cutoff) pair into a single ``Chronos2Pipeline.predict`` +call — the pipeline accepts a list of variable-length tensors and +applies left-padding internally, so all the per-cutoff work happens in +one forward pass. """ import numpy as np +import torch from benchopt import BaseSolver +from benchmark_utils.adapters.base import BaseTSFMAdapter from benchmark_utils.adapters.forecast_residual import ForecastResidualAdapter from benchmark_utils.inputs import ForecastInput from benchmark_utils.outputs import ForecastOutput @@ -27,67 +26,66 @@ SUPPORTED_TASKS = {"forecasting", "anomaly_detection"} -# --------------------------------------------------------------------------- -# Thin wrapper exposing the predict() interface expected by the objective -# --------------------------------------------------------------------------- - -class _ChronosForecaster: - """Wraps ChronosPipeline with the batched series+cutoffs predict API.""" +class _ChronosForecaster(BaseTSFMAdapter): + """Batched Chronos-2 adapter returning a full quantile fan.""" def __init__(self, pipeline, prediction_length): self.pipeline = pipeline self.prediction_length = prediction_length + self.quantile_levels = tuple(float(q) for q in pipeline.quantiles) - def predict(self, x: ForecastInput): - import torch - - results = 
[] - for series, cutoffs in zip(x.x, x.cutoff_indexes): + def predict(self, x: ForecastInput) -> ForecastOutput: + inputs = [] + layout = [] # (series_idx, cutoff_idx) per input element + per_series_shape = [] # (C, n_cutoffs) per series + for series_idx, (series, cutoffs) in enumerate(zip(x.x, x.cutoff_indexes)): series = np.asarray(series, dtype=np.float32) - C = series.shape[1] if series.ndim == 2 else 1 - out = np.empty((len(cutoffs), self.prediction_length, C), dtype=np.float32) - for k, cutoff in enumerate(cutoffs): - hist = series[:cutoff] - if hist.ndim == 1: - hist = hist[:, None] - # Chronos expects (batch, time) — one channel at a time. - for c in range(C): - context = torch.from_numpy(hist[:, c]).unsqueeze(0) - forecast = self.pipeline.predict( - context, - prediction_length=self.prediction_length, - ) - f = forecast[0] - if f.ndim == 2: - f = f.median(dim=0).values - out[k, :, c] = f.numpy() - results.append(ForecastOutput( - quantiles=out[:, None, :, :], - quantile_levels=(0.5,), - )) - return results - - -# --------------------------------------------------------------------------- -# Solver -# --------------------------------------------------------------------------- + if series.ndim == 1: + series = series[:, None] + _, C = series.shape + per_series_shape.append((C, len(cutoffs))) + for cutoff_idx, cutoff in enumerate(cutoffs): + hist = series[:cutoff] # (T_cutoff, C) + inputs.append(torch.from_numpy(hist.T)) # (C, T_cutoff) + layout.append((series_idx, cutoff_idx)) + + if not inputs: + return ForecastOutput(quantiles=[], quantile_levels=self.quantile_levels) + + with torch.no_grad(): + forecast = self.pipeline.predict( + inputs, + prediction_length=self.prediction_length, + ) + # forecast: list[(n_variates, Q, prediction_length)] aligned with `inputs`. 
+ + Q = len(self.quantile_levels) + per_series = [ + np.empty((n_cutoffs, Q, self.prediction_length, C), dtype=np.float32) + for C, n_cutoffs in per_series_shape + ] + for (series_idx, cutoff_idx), pred in zip(layout, forecast): + arr = pred.float().cpu().numpy() # (C, Q, H) + per_series[series_idx][cutoff_idx] = arr.transpose(1, 2, 0) + return ForecastOutput(quantiles=per_series, quantile_levels=self.quantile_levels) + class Solver(BaseSolver): - """Chronos zero-shot solver. + """Chronos-2 zero-shot solver. Parameters ---------- model_size : str - Chronos model variant: "tiny", "mini", "small", "base", "large". + Chronos-2 variant suffix used in ``autogluon/chronos-2-{model_size}``. task_adaptation : str - How to use Chronos for each task: - "zeroshot" — direct forecasting API (forecasting only) - "forecast_residual" — anomaly score = forecast error (AD only) + Per-task usage of the forecaster: + ``"zeroshot"`` — direct forecasting (forecasting only) + ``"forecast_residual"`` — anomaly score = forecast error (AD only) """ name = "Chronos" - requirements = ["pip::chronos-forecasting>=1.4", "pip::torch"] + requirements = ["pip::chronos-forecasting>=2.0", "pip::torch"] sampling_strategy = "run_once" @@ -101,36 +99,32 @@ def skip(self, task, **kwargs): return True, f"Chronos solver does not support task={task!r}" return False, None - # ------------------------------------------------------------------ - def set_objective(self, X_train, y_train, task, **meta): - import torch - from chronos import ChronosPipeline + from chronos import Chronos2Pipeline self.task = task self.X_train = X_train self.meta = meta - # Load model once; reuse across consecutive dataset configs. - model_id = f"amazon/chronos-t5-{self.model_size}" + # bfloat16 is fine on CUDA but poorly supported on CPU / MPS; + # fall back to float32 there so inference doesn't crash or stall. 
+ device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = torch.bfloat16 if device == "cuda" else torch.float32 + model_id = f"autogluon/chronos-2-{self.model_size}" if not hasattr(self, "_pipeline") or self._loaded_model != model_id: - self._pipeline = ChronosPipeline.from_pretrained( + self._pipeline = Chronos2Pipeline.from_pretrained( model_id, - device_map="auto", - torch_dtype=torch.bfloat16, + device_map=device, + dtype=dtype, ) self._loaded_model = model_id def run(self, _): pred_len = self.meta.get("prediction_length", 1) - forecaster = _ChronosForecaster(self._pipeline, pred_len) - if self.task == "forecasting": - self._adapter = forecaster - + self._adapter = _ChronosForecaster(self._pipeline, pred_len) elif self.task == "anomaly_detection": - # AD uses one-step-ahead forecasts; rebuild the forecaster - # with prediction_length=1 to match. + # AD uses one-step-ahead forecasts. self._adapter = ForecastResidualAdapter( _ChronosForecaster(self._pipeline, prediction_length=1), prediction_length=1, diff --git a/solvers/naive.py b/solvers/naive.py index 7603427..f1887cd 100644 --- a/solvers/naive.py +++ b/solvers/naive.py @@ -27,8 +27,8 @@ def __init__(self, prediction_length, seasonality=1): self.prediction_length = prediction_length self.seasonality = seasonality - def predict(self, x: ForecastInput): - results = [] + def predict(self, x: ForecastInput) -> ForecastOutput: + quantiles = [] for series, cutoffs in zip(x.x, x.cutoff_indexes): series = np.asarray(series) C = series.shape[1] if series.ndim == 2 else 1 @@ -39,11 +39,8 @@ def predict(self, x: ForecastInput): pattern = hist[-season:] reps = int(np.ceil(self.prediction_length / season)) preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length] - results.append(ForecastOutput( - quantiles=preds[:, None, :, :], # (n_cutoffs, 1, H, C) - quantile_levels=(0.5,), - )) - return results + quantiles.append(preds[:, None, :, :]) # (n_cutoffs, 1, H, C) + return ForecastOutput(quantiles=quantiles, 
quantile_levels=(0.5,)) class _MajorityClassifier(BaseTSFMAdapter): diff --git a/solvers/seasonal_naive.py b/solvers/seasonal_naive.py index 2adf1ec..176691d 100644 --- a/solvers/seasonal_naive.py +++ b/solvers/seasonal_naive.py @@ -30,8 +30,8 @@ def __init__(self, prediction_length: int, season_length: int): self.prediction_length = prediction_length self.season_length = season_length - def predict(self, x: ForecastInput): - results = [] + def predict(self, x: ForecastInput) -> ForecastOutput: + quantiles = [] for series, cutoffs in zip(x.x, x.cutoff_indexes): series = np.asarray(series) C = series.shape[1] if series.ndim == 2 else 1 @@ -42,11 +42,8 @@ def predict(self, x: ForecastInput): pattern = hist[-season:] reps = int(np.ceil(self.prediction_length / season)) preds[k] = np.tile(pattern, (reps, 1))[:self.prediction_length] - results.append(ForecastOutput( - quantiles=preds[:, None, :, :], - quantile_levels=(0.5,), - )) - return results + quantiles.append(preds[:, None, :, :]) + return ForecastOutput(quantiles=quantiles, quantile_levels=(0.5,)) class Solver(BaseSolver): diff --git a/solvers/tfc_api.py b/solvers/tfc_api.py index 9083d18..a507c9c 100644 --- a/solvers/tfc_api.py +++ b/solvers/tfc_api.py @@ -105,7 +105,7 @@ def __init__( self.country_isocode = country_isocode self.batch_size = batch_size - def predict(self, x: ForecastInput): + def predict(self, x: ForecastInput) -> ForecastOutput: # TODO: thread ``x.covariates`` (static/hist/future) through to the SDK # once the benchmark datasets populate them. Monash currently # carries none, so the dataclass arrives with empty sequences. 
@@ -114,11 +114,18 @@ def predict(self, x: ForecastInput): offsets = _shared_offsets_from_end(series_list, cutoff_indexes) if getattr(self.model, "supports_batching", False) and offsets is not None: - return self._predict_batched(series_list, cutoff_indexes, pd_freq, offsets) - return self._predict_per_series(series_list, cutoff_indexes, pd_freq) + per_series, levels = self._predict_batched( + series_list, cutoff_indexes, pd_freq, offsets + ) + else: + per_series, levels = self._predict_per_series( + series_list, cutoff_indexes, pd_freq + ) + return ForecastOutput(quantiles=per_series, quantile_levels=levels) def _predict_per_series(self, x, cutoff_indexes, pd_freq): - results = [] + per_series = [] + levels = None for series_idx, (series, cutoffs) in enumerate(zip(x, cutoff_indexes)): series = np.asarray(series, dtype=np.float32) if series.ndim == 1: @@ -151,10 +158,12 @@ def _predict_per_series(self, x, cutoff_indexes, pd_freq): batch_size=self.batch_size, ) - results.append(self._gather_series_output( + arr, series_levels = self._gather_series_output( forecast_df, series_idx, C, cutoffs, fcds - )) - return results + ) + per_series.append(arr) + levels = series_levels + return per_series, (levels if levels is not None else (0.5,)) def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets): """One ``cross_validate`` call covering every series in ``x``. 
@@ -203,13 +212,16 @@ def _predict_batched(self, x, cutoff_indexes, pd_freq, offsets): batch_size=self.batch_size, ) - results = [] + per_series = [] + levels = None for series_idx, C, index, cutoffs in per_series_meta: series_fcds = [pd.Timestamp(index[cutoff]) for cutoff in cutoffs] - results.append(self._gather_series_output( + arr, series_levels = self._gather_series_output( forecast_df, series_idx, C, cutoffs, series_fcds - )) - return results + ) + per_series.append(arr) + levels = series_levels + return per_series, (levels if levels is not None else (0.5,)) def _gather_series_output(self, forecast_df, series_idx, C, cutoffs, fcds): # Discover which quantile columns the SDK returned; fall back to @@ -239,7 +251,7 @@ def _gather_series_output(self, forecast_df, series_idx, C, cutoffs, fcds): window = channel.loc[channel["fcd"] == fcd].sort_values("ds").head(self.prediction_length) for q_idx, col in enumerate(quantile_cols): preds[k, q_idx, :, c] = window[col].to_numpy(dtype=np.float32) - return ForecastOutput(quantiles=preds, quantile_levels=tuple(levels)) + return preds, tuple(levels) class Solver(BaseSolver): From 3ea97dbf8eddb80c0f206dd3bef5e57328924ade Mon Sep 17 00:00:00 2001 From: Geoffrey Negiar Date: Thu, 28 May 2026 17:39:11 +0200 Subject: [PATCH 07/12] FIX tighten chronos-forecasting pin; drop redundant torch dep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ``>=2.0`` was too loose: ``Chronos2Pipeline.predict`` with a variable-length list of tensors and the ``pipeline.quantiles`` attribute stabilized in 2.2.x (the version verified end-to-end here). Switch to ``>=2.2,<3`` so we test what we ship and a future major bump can't silently break the contract. - Drop ``pip::torch`` — ``chronos-forecasting`` already pins ``torch<3,>=2.2`` transitively, so listing it again is dead weight. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- solvers/chronos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solvers/chronos.py b/solvers/chronos.py index 4a32a04..417223c 100644 --- a/solvers/chronos.py +++ b/solvers/chronos.py @@ -85,7 +85,7 @@ class Solver(BaseSolver): name = "Chronos" - requirements = ["pip::chronos-forecasting>=2.0", "pip::torch"] + requirements = ["pip::chronos-forecasting>=2.2,<3"] sampling_strategy = "run_once" From 1829e414e4f2bd7ffb089a557ec09c83ed6f6a5b Mon Sep 17 00:00:00 2001 From: Eduardo Montesuma Date: Fri, 29 May 2026 09:01:12 +0000 Subject: [PATCH 08/12] feat: move constants to a dedicated file --- benchmark_utils/constants.py | 119 +++++++++++++++++++++++++++++++++++ datasets/monash.py | 25 +------- 2 files changed, 122 insertions(+), 22 deletions(-) create mode 100644 benchmark_utils/constants.py diff --git a/benchmark_utils/constants.py b/benchmark_utils/constants.py new file mode 100644 index 0000000..b9b8f45 --- /dev/null +++ b/benchmark_utils/constants.py @@ -0,0 +1,119 @@ +"""Shared frequency / seasonality tables for forecasting datasets. + +Two sources name frequencies differently: + - aeon (used by Monash) uses words: "yearly", "weekly", "minutely", ... + - GIFT-Eval (and pandas) use offset aliases: "Y", "W-SUN", "5T", ... + +This module exposes a single canonical (freq, seasonality) lookup keyed on +the canonical pandas-style base alias (e.g. "Y", "W", "D"), plus two +adapters that normalize each source onto that canonical key. 
+""" + +import re + +# Canonical base alias → (display_freq, MASE seasonality, default forecast horizon) +_BASE = { + "Y": ("Y", 1, 6), + "Q": ("Q", 4, 8), + "M": ("M", 12, 12), + "W": ("W", 52, 13), + "D": ("D", 7, 14), + "H": ("H", 24, 24), + "T": ("T", 1440, 60), # minutes + "S": ("S", 1, 60), +} + +# aeon's spelled-out names → canonical base alias +_AEON_TO_BASE = { + "yearly": "Y", + "quarterly": "Q", + "monthly": "M", + "weekly": "W", + "daily": "D", + "hourly": "H", + "minutely": "T", + "seconds": "S", +} + + +def from_aeon(freq_word: str) -> tuple[str, int, int]: + """Look up (freq, seasonality, default_horizon) from an aeon freq word.""" + base = _AEON_TO_BASE.get(freq_word, "D") + return _BASE[base] + + +# Pandas offset aliases: strip a leading multiplier and any anchor suffix +# (e.g. "5T" → "T", "W-SUN" → "W", "QS-OCT" → "Q", "YE" → "Y"). +_PANDAS_ALIAS_RE = re.compile(r"^\d*([A-Za-z]+)") +_NORMALIZE_BASE = { + # Newer pandas spellings → legacy single-letter aliases used in _BASE. + "YE": "Y", "YS": "Y", "A": "Y", "AS": "Y", + "QE": "Q", "QS": "Q", + "ME": "M", "MS": "M", + "min": "T", "MIN": "T", +} + + +def from_pandas(freq_alias: str) -> tuple[str, int, int]: + """Look up (freq, seasonality, default_horizon) from a pandas freq alias. + + Handles multipliers ("5T") and anchors ("W-SUN", "QS-OCT") by stripping + them before lookup. Unknown aliases default to daily. + """ + if not freq_alias: + return _BASE["D"] + m = _PANDAS_ALIAS_RE.match(freq_alias.split("-", 1)[0]) + if not m: + return _BASE["D"] + head = m.group(1) + base = _NORMALIZE_BASE.get(head, head[:1].upper()) + return _BASE.get(base, _BASE["D"]) + + +# --------------------------------------------------------------------------- +# GIFT-Eval term resolution +# +# Mirrors the canonical table in the upstream time-series repo: prediction +# length is a function of pandas freq, then scaled by a term multiplier +# (short=1, medium=10, long=15). 
Used by datasets/gifteval.py so reported +# numbers line up with the GIFT-Eval leaderboard. +# --------------------------------------------------------------------------- + +GIFT_EVAL_PRED_LENGTH_MAP: dict[str, int] = { + "M": 12, "MS": 12, + "W": 8, "W-SUN": 8, "W-MON": 8, + "D": 30, + "H": 48, "6H": 48, + "T": 48, "5T": 48, "10T": 48, "15T": 48, "30T": 48, + "S": 60, "4S": 60, + "Q": 8, "Q-DEC": 8, + "A": 4, "A-DEC": 4, + "Y": 4, +} + +GIFT_EVAL_TERM_MULTIPLIER: dict[str, int] = { + "short": 1, + "medium": 10, + "long": 15, +} + + +def gift_eval_prediction_length(freq: str, term: str) -> int: + """Resolve the GIFT-Eval prediction length for a (freq, term) pair. + + ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"W-SUN"``). If the + exact alias isn't in :data:`GIFT_EVAL_PRED_LENGTH_MAP`, multi-minute + aliases collapse to ``"T"``; otherwise we default to 48. + + ``term`` must be one of ``"short"``, ``"medium"``, ``"long"``. + """ + if term not in GIFT_EVAL_TERM_MULTIPLIER: + raise ValueError( + f"term must be one of {list(GIFT_EVAL_TERM_MULTIPLIER)}; got {term!r}" + ) + base = GIFT_EVAL_PRED_LENGTH_MAP.get(freq) + if base is None and freq.endswith("T") and freq != "T": + base = GIFT_EVAL_PRED_LENGTH_MAP["T"] + if base is None: + base = 48 + return base * GIFT_EVAL_TERM_MULTIPLIER[term] diff --git a/datasets/monash.py b/datasets/monash.py index 049ce73..a299433 100644 --- a/datasets/monash.py +++ b/datasets/monash.py @@ -35,26 +35,10 @@ from aeon.datasets import load_forecasting from benchmark_utils.covariates import Covariates +from benchmark_utils.constants import from_aeon from benchmark_utils.windowing import make_forecasting_splits -# Map aeon frequency strings → pandas-style freq codes and MASE seasonality -_FREQ_MAP = { - "yearly": ("Y", 1), - "quarterly": ("Q", 4), - "monthly": ("M", 12), - "weekly": ("W", 52), - "daily": ("D", 7), - "hourly": ("H", 24), - "minutely": ("T", 1440), - "seconds": ("S", 1), -} - -_DEFAULT_HORIZON = { - "Y": 6, "Q": 8, "M": 
12, "W": 13, "D": 14, "H": 24, "T": 60, -} - - class Dataset(BaseDataset): """Monash forecasting dataset (loaded via aeon). @@ -90,14 +74,11 @@ def get_data(self): # contain_missing_values, contain_equal_length aeon_freq = meta.get("frequency", "yearly") - freq, seasonality = _FREQ_MAP.get(aeon_freq, ("D", 1)) + freq, seasonality, default_h = from_aeon(aeon_freq) pred_len = self.prediction_length if pred_len is None: - pred_len = int( - meta.get("forecast_horizon") - or _DEFAULT_HORIZON.get(freq, 10) - ) + pred_len = int(meta.get("forecast_horizon") or default_h) series_list = [] rows = df.iterrows() if not self.debug else list(df.iterrows())[:5] From 66890462468390c31a3fbe2444354660b7e690c2 Mon Sep 17 00:00:00 2001 From: Eduardo Montesuma Date: Fri, 29 May 2026 09:01:27 +0000 Subject: [PATCH 09/12] feat: gift eval support --- datasets/gifteval.py | 166 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 datasets/gifteval.py diff --git a/datasets/gifteval.py b/datasets/gifteval.py new file mode 100644 index 0000000..9ec0de4 --- /dev/null +++ b/datasets/gifteval.py @@ -0,0 +1,166 @@ +"""GIFT-Eval forecasting benchmark dataset (Salesforce/GiftEval on HF). + +The HF repo organizes data per-dataset under top-level directories +(``m4_weekly``, ``etth1``, ``solar``, ...). Each directory holds a +single Arrow file with the test-set series. + +Each entry exposes ``item_id``, ``start``, ``freq``, and ``target`` +(a flat list of floats). For multivariate datasets, ``target`` is still +serialized as a flat list — GIFT-Eval handles those via separate file +layouts we don't unpack here; the MVP supports univariate only. + +Cutoffs and windows follow the Monash recipe (we don't comply with +GIFT-Eval's prescribed test cutoff — same rolling-window logic via +:func:`benchmark_utils.windowing.make_forecasting_splits`). + +Data contract output mirrors :mod:`datasets.monash`.
+""" + +import numpy as np +from benchopt import BaseDataset + +from benchmark_utils.covariates import Covariates +from benchmark_utils.constants import ( + from_pandas, + gift_eval_prediction_length, +) +from benchmark_utils.windowing import make_forecasting_splits + + +class Dataset(BaseDataset): + """GIFT-Eval forecasting dataset (loaded from HF Salesforce/GiftEval). + + Parameters + ---------- + dataset_name : str + Subdirectory name on the HF repo (e.g. ``"m4_weekly"``, ``"ett1"``, + ``"solar"``). See https://huggingface.co/datasets/Salesforce/GiftEval + for the full list. + term : str + GIFT-Eval forecast term — ``"short"`` (×1), ``"medium"`` (×10), or + ``"long"`` (×15). Selects the prediction length via the canonical + per-freq base, matching the GIFT-Eval leaderboard convention. + Ignored when ``prediction_length`` is set explicitly. + prediction_length : int or None + Explicit override. ``None`` → resolved from (freq, term). + n_windows : int + Number of rolling evaluation windows per series. + max_series : int or None + Optional cap on the number of series — useful for very large + configs (e.g. ``solar``). ``None`` = no cap. + debug : bool + If True, keep only the first 5 series for fast iteration. + """ + + name = "GiftEval" + + requirements = ["pip::datasets", "pip::huggingface-hub"] + + parameters = { + "dataset_name": ["m4_weekly"], + "term": ["short"], + "prediction_length": [None], + "n_windows": [1], + "max_series": [None], + "debug": [False], + } + + def get_data(self): + from datasets import Dataset as HFDataset + from huggingface_hub import hf_hub_download, list_repo_files + + # Locate the Arrow file inside the requested subdirectory. + files = list_repo_files( + "Salesforce/GiftEval", repo_type="dataset" + ) + arrow_files = [ + f for f in files + if f.startswith(f"{self.dataset_name}/") + and f.endswith(".arrow") + ] + if not arrow_files: + raise ValueError( + f"No Arrow file found for GIFT-Eval dataset " + f"{self.dataset_name!r}. 
Available top-level dirs: " + f"{sorted({f.split('/')[0] for f in files if '/' in f})}" + ) + + # Download + load each shard; concatenate. + rows = [] + for f in sorted(arrow_files): + local = hf_hub_download( + "Salesforce/GiftEval", filename=f, repo_type="dataset", + ) + shard = HFDataset.from_file(local) + rows.extend(shard) + + if self.debug: + rows = rows[:5] + elif self.max_series is not None: + rows = rows[: int(self.max_series)] + + if not rows: + raise ValueError( + f"GIFT-Eval dataset {self.dataset_name!r} returned 0 series." + ) + + # Frequency / seasonality — take from the first entry (every series + # in a GIFT-Eval subset shares the same freq). + pandas_freq = rows[0].get("freq") or "D" + freq, seasonality, _ = from_pandas(pandas_freq) + + pred_len = self.prediction_length + if pred_len is None: + pred_len = gift_eval_prediction_length(pandas_freq, self.term) + + # Build (T, C) series. Univariate only in the MVP. + series_list = [] + for r in rows: + values = np.asarray(r["target"], dtype=np.float32) + if values.ndim != 1: + # Skip multivariate entries until we add explicit handling. + continue + series_list.append(values.reshape(-1, 1)) + + if not series_list: + raise ValueError( + f"All entries in GIFT-Eval dataset {self.dataset_name!r} " + "were skipped (multivariate not yet supported)." + ) + + # Training portion: everything except the last test windows. + test_len = pred_len * self.n_windows + X_train, y_train_list, full_series = [], [], [] + for ts in series_list: + if ts.shape[0] < pred_len + 1: + continue + train_end = max(1, ts.shape[0] - test_len) + X_train.append(ts[:train_end]) + y_train_list.append(ts[train_end: train_end + pred_len]) + full_series.append(ts) + + if not full_series: + raise ValueError( + "All series are shorter than prediction_length." 
+ ) + + n_windows = 1 if self.debug else self.n_windows + X_test, cutoff_indexes, y_test = make_forecasting_splits( + full_series, + prediction_length=pred_len, + n_windows=n_windows, + ) + + return dict( + X_train=X_train, + y_train=y_train_list, + X_test=X_test, + y_test=y_test, + cutoff_indexes=cutoff_indexes, + covariates=Covariates(), # GIFT-Eval HF schema has no covariates + task="forecasting", + metrics=["mae", "mse", "mase", "smape"], + prediction_length=pred_len, + freq=freq, + seasonality=seasonality, + ) From 2a4a74050e7f427d2b0dc38a086dc9e123722467 Mon Sep 17 00:00:00 2001 From: Eduardo Montesuma Date: Fri, 29 May 2026 12:50:18 +0000 Subject: [PATCH 10/12] feat: adds support for fev bench --- datasets/fev.py | 263 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 datasets/fev.py diff --git a/datasets/fev.py b/datasets/fev.py new file mode 100644 index 0000000..0eb5008 --- /dev/null +++ b/datasets/fev.py @@ -0,0 +1,263 @@ +"""AutoGluon fev_datasets forecasting benchmark +(huggingface.co/datasets/autogluon/fev_datasets). + +The HF repo organizes data either: + - per-freq: ``//train-*.parquet`` + (e.g. ``ETT/1H``, ``LOOP_SEATTLE/5T``) + - flat: ``/train-*.parquet`` + (e.g. ``australian_tourism``) + - or with an arbitrary subdir that is NOT a freq (e.g. ``boomlet/`` + where ```` is a series id, not a frequency). + +We accept the directory path directly as ``dataset_name`` (e.g. +``"ETT/1H"``, ``"australian_tourism"``) and infer the actual freq from +each series' ``timestamp`` column rather than parsing the path. + +Each parquet row is one series; columns vary: + - Always: ``id``, ``timestamp`` + - Univariate: a ``target`` column (list of floats) + - Multivariate (e.g. ``ETT``): no ``target`` column — each channel is + its own column (``HUFL``, ..., ``OT``). Channel columns are stacked + on the last axis to form ``(T, C)``. 
+ +Rolling-window splits and GIFT-Eval-style term → prediction_length +resolution match :mod:`datasets.gifteval`. +""" + +import numpy as np +import pandas as pd +from benchopt import BaseDataset + +from benchmark_utils.covariates import Covariates +from benchmark_utils.constants import ( + from_pandas, + gift_eval_prediction_length, +) +from benchmark_utils.windowing import make_forecasting_splits + + +_METADATA_COLS = ("id", "timestamp") + + +# Canonical list of FEV evaluation configs — directory paths inside +# https://huggingface.co/datasets/autogluon/fev_datasets that contain at +# least one ``train-*.parquet`` file. Surfaced via +# ``get_all_parameter_values`` so that ``dataset_name=all`` and ``benchopt +# info -v`` work. +FEV_DATASETS: tuple[str, ...] = ( + "ETT/15T", "ETT/1D", "ETT/1H", "ETT/1W", + "LOOP_SEATTLE/1D", "LOOP_SEATTLE/1H", "LOOP_SEATTLE/5T", + "M_DENSE/1D", "M_DENSE/1H", + "SZ_TAXI/15T", "SZ_TAXI/1H", + "australian_tourism", + "bizitobs_l2c/1H", "bizitobs_l2c/5T", + "boomlet/1062", "boomlet/1209", "boomlet/1225", "boomlet/1230", + "boomlet/1282", "boomlet/1487", "boomlet/1631", "boomlet/1676", + "boomlet/1855", "boomlet/1975", "boomlet/2187", + "boomlet/285", "boomlet/619", "boomlet/772", "boomlet/963", + "ecdc_ili", + "entsoe/15T", "entsoe/1H", "entsoe/30T", + "epf_be", "epf_de", "epf_fr", "epf_np", "epf_pjm", + "ercot/1D", "ercot/1H", "ercot/1M", "ercot/1W", + "favorita_stores/1D", "favorita_stores/1M", "favorita_stores/1W", + "favorita_transactions/1D", "favorita_transactions/1M", + "favorita_transactions/1W", + "fred_md_2025", "fred_qd_2025", + "gvar", "hermes", + "hierarchical_sales/1D", "hierarchical_sales/1W", + "hospital", + "hospital_admissions/1D", "hospital_admissions/1W", + "jena_weather/10T", "jena_weather/1D", "jena_weather/1H", + "kdd_cup_2022/10T", "kdd_cup_2022/1D", "kdd_cup_2022/30T", + "m5/1D", "m5/1M", "m5/1W", + "proenfo_bull", "proenfo_cockatoo", + "proenfo_gfc12", "proenfo_gfc14", "proenfo_gfc17", + "proenfo_hog",
"proenfo_pdb", + "redset/15T", "redset/1H", "redset/5T", + "restaurant", + "rohlik_orders/1D", "rohlik_orders/1W", + "rohlik_sales/1D", "rohlik_sales/1W", + "rossmann/1D", "rossmann/1W", + "solar/1D", "solar/1W", + "solar_with_weather/15T", "solar_with_weather/1H", + "uci_air_quality/1D", "uci_air_quality/1H", + "uk_covid_nation/1D", "uk_covid_nation/1W", + "uk_covid_utla/1D", "uk_covid_utla/1W", + "us_consumption/1M", "us_consumption/1Q", "us_consumption/1Y", + "walmart", + "world_co2_emissions", "world_life_expectancy", "world_tourism", +) + +FEV_TERMS: tuple[str, ...] = ("short", "medium", "long") + + +def _infer_freq(timestamps) -> str: + """Best-effort freq inference from a series' timestamp column. + + Falls back to ``"D"`` when pandas cannot infer. Uses the first 5 + points to keep the check cheap on long series. + """ + try: + idx = pd.DatetimeIndex(timestamps[:5]) + return pd.infer_freq(idx) or "D" + except Exception: + return "D" + + +class Dataset(BaseDataset): + """AutoGluon fev forecasting dataset. + + Parameters + ---------- + dataset_name : str + Directory path inside the HF repo. Per-freq paths look like + ``"ETT/1H"`` / ``"LOOP_SEATTLE/5T"``; flat paths like + ``"australian_tourism"`` / ``"hospital"``. See ``FEV_DATASETS`` + for the full list (also discoverable via ``benchopt info -v``). + term : str + GIFT-Eval-style term: ``"short"`` / ``"medium"`` / ``"long"``. + Ignored when ``prediction_length`` is set. + prediction_length : int or None + Explicit override. ``None`` → resolved from (inferred freq, term). + n_windows : int + Number of rolling evaluation windows per series. + max_series : int or None + Optional cap on the number of series. + debug : bool + If True, keep only the first 5 series. 
+ """ + + name = "FEV" + + requirements = ["pip::pyarrow", "pip::huggingface-hub"] + + parameters = { + "dataset_name": ["LOOP_SEATTLE/1H"], + "term": ["short"], + "prediction_length": [None], + "n_windows": [1], + "max_series": [None], + "debug": [False], + } + + @classmethod + def get_all_parameter_values(cls, name): + if name == "dataset_name": + return list(FEV_DATASETS) + if name == "term": + return list(FEV_TERMS) + return None + + def get_data(self): + from huggingface_hub import hf_hub_download, list_repo_files + + repo = "autogluon/fev_datasets" + files = list_repo_files(repo, repo_type="dataset") + + # Match parquet files in the exact directory (no nested descent). + prefix = f"{self.dataset_name}/" + parquet_files = sorted( + f for f in files + if f.startswith(prefix) + and f.endswith(".parquet") + and "/" not in f[len(prefix):] + ) + if not parquet_files: + raise ValueError( + f"No parquet found at {self.dataset_name!r} in {repo}. " + f"Valid choices are in FEV_DATASETS." + ) + + frames = [ + pd.read_parquet(hf_hub_download(repo, filename=f, repo_type="dataset")) + for f in parquet_files + ] + df = pd.concat(frames, ignore_index=True) + + if self.debug: + df = df.head(5) + elif self.max_series is not None: + df = df.head(int(self.max_series)) + + if df.empty: + raise ValueError(f"{self.dataset_name!r} contained 0 series.") + + # Channel cols = non-metadata columns whose entries are numeric + # array-likes. Some FEV datasets carry extra scalar/string fields + # (``type``, ``Security``) or arrays of strings (holiday names in + # ``favorita_stores``, etc.). We treat covariates as out of scope + # for the MVP. 
+ def _is_numeric_array_col(c): + v = df.iloc[0][c] + if not hasattr(v, "__len__") or isinstance(v, (str, bytes)): + return False + if len(v) == 0: + return False + return isinstance(v[0], (int, float, np.integer, np.floating)) + + channel_cols = [ + c for c in df.columns + if c not in _METADATA_COLS and _is_numeric_array_col(c) + ] + if not channel_cols: + raise ValueError( + f"{self.dataset_name!r} has no channel columns " + f"(only {_METADATA_COLS} present)." + ) + + # Infer freq from the first series' timestamps — same for the + # whole config (FEV groups by freq at the directory level for + # nested configs, and flat configs are single-freq). + inferred_freq = _infer_freq(df.iloc[0]["timestamp"]) + canonical_freq, seasonality, _ = from_pandas(inferred_freq) + + pred_len = self.prediction_length + if pred_len is None: + pred_len = gift_eval_prediction_length(inferred_freq, self.term) + + # Build (T, C) series. Each row's per-channel array has the same + # length (T_i); stack on the last axis. 
+ series_list = [] + for _, row in df.iterrows(): + channels = [np.asarray(row[c], dtype=np.float32) for c in channel_cols] + T = channels[0].shape[0] + if any(ch.shape[0] != T for ch in channels): + continue + series_list.append(np.stack(channels, axis=-1)) + + if not series_list: + raise ValueError("All series were skipped (inconsistent channel lengths).") + + test_len = pred_len * self.n_windows + X_train, y_train_list, full_series = [], [], [] + for ts in series_list: + if ts.shape[0] < pred_len + 1: + continue + train_end = max(1, ts.shape[0] - test_len) + X_train.append(ts[:train_end]) + y_train_list.append(ts[train_end: train_end + pred_len]) + full_series.append(ts) + + if not full_series: + raise ValueError("All series are shorter than prediction_length.") + + n_windows = 1 if self.debug else self.n_windows + X_test, cutoff_indexes, y_test = make_forecasting_splits( + full_series, + prediction_length=pred_len, + n_windows=n_windows, + ) + + return dict( + X_train=X_train, + y_train=y_train_list, + X_test=X_test, + y_test=y_test, + cutoff_indexes=cutoff_indexes, + covariates=Covariates(), + task="forecasting", + metrics=["mae", "mse", "mase", "smape"], + prediction_length=pred_len, + freq=canonical_freq, + seasonality=seasonality, + ) From 848effb1b64c7c56b3ce870839c8de1ff857c45c Mon Sep 17 00:00:00 2001 From: Eduardo Montesuma Date: Fri, 29 May 2026 12:50:25 +0000 Subject: [PATCH 11/12] minor fixes --- benchmark_utils/constants.py | 20 ++++++--- datasets/gifteval.py | 84 ++++++++++++++++++++++++++++++------ 2 files changed, 84 insertions(+), 20 deletions(-) diff --git a/benchmark_utils/constants.py b/benchmark_utils/constants.py index b9b8f45..5698672 100644 --- a/benchmark_utils/constants.py +++ b/benchmark_utils/constants.py @@ -101,19 +101,25 @@ def from_pandas(freq_alias: str) -> tuple[str, int, int]: def gift_eval_prediction_length(freq: str, term: str) -> int: """Resolve the GIFT-Eval prediction length for a (freq, term) pair. 
- ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"W-SUN"``). If the - exact alias isn't in :data:`GIFT_EVAL_PRED_LENGTH_MAP`, multi-minute - aliases collapse to ``"T"``; otherwise we default to 48. - - ``term`` must be one of ``"short"``, ``"medium"``, ``"long"``. + ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"1H"``, ``"W-SUN"``). + Lookup falls back through: exact match → strip leading "1" multiplier + ("1H" → "H") → collapse any multi-X alias to its base X ("10S" → "S", + "30T" → "T") → default 48. ``term`` must be one of ``"short"``, + ``"medium"``, ``"long"``. """ if term not in GIFT_EVAL_TERM_MULTIPLIER: raise ValueError( f"term must be one of {list(GIFT_EVAL_TERM_MULTIPLIER)}; got {term!r}" ) base = GIFT_EVAL_PRED_LENGTH_MAP.get(freq) - if base is None and freq.endswith("T") and freq != "T": - base = GIFT_EVAL_PRED_LENGTH_MAP["T"] + if base is None: + m = _PANDAS_ALIAS_RE.match(freq.split("-", 1)[0]) + if m: + head = m.group(1) + # Normalize new pandas spellings ("QE"→"Q", "ME"→"M", ...) + # before falling back through the map. + head = _NORMALIZE_BASE.get(head, head) + base = GIFT_EVAL_PRED_LENGTH_MAP.get(head) if base is None: base = 48 return base * GIFT_EVAL_TERM_MULTIPLIER[term] diff --git a/datasets/gifteval.py b/datasets/gifteval.py index 9ec0de4..9d6d72b 100644 --- a/datasets/gifteval.py +++ b/datasets/gifteval.py @@ -4,10 +4,12 @@ (``m4_weekly``, ``etth1``, ``solar``, ...). Each directory holds a single Arrow file with the test-set series. -Each entry exposes ``item_id``, ``start``, ``freq``, and ``target`` -(a flat list of floats). For multivariate datasets, ``target`` is still -serialized as a flat list — GIFT-Eval handles those via separate file -layouts we don't unpack here; the MVP supports univariate only. +Each entry exposes ``item_id``, ``start``, ``freq``, and ``target``. +``target`` is a flat ``List[float]`` for univariate configs and a +``List[List[float]]`` of shape ``(C, T)`` for multivariate ones (e.g. 
+``bitbrains_*``, ``electricity/*``, ``ett1/*``, ``ett2/*``, +``jena_weather/*``, ``solar/*``). Both shapes are handled — multivariate +entries are transposed to the ``(T, C)`` repo contract. Cutoffs and windows follow the Monash recipe (we don't comply with GIFT-Eval's prescribed test cutoff — same rolling-window logic via @@ -27,6 +29,43 @@ from benchmark_utils.windowing import make_forecasting_splits +# Canonical list of GIFT-Eval evaluation configs. Each entry is the +# arrow-containing directory path inside the HF repo. Flat datasets are +# bare names (``m4_weekly``); datasets that ship multiple frequencies +# are encoded as ``/`` (e.g. ``LOOP_SEATTLE/H``, +# ``LOOP_SEATTLE/D`` — these are genuinely distinct evaluations). +# Surfaced via ``get_parameter_choices`` so that ``dataset_name=all`` +# and ``benchopt info -v`` work. +GIFTEVAL_DATASETS: tuple[str, ...] = ( + "LOOP_SEATTLE/5T", "LOOP_SEATTLE/D", "LOOP_SEATTLE/H", + "M_DENSE/D", "M_DENSE/H", + "SZ_TAXI/15T", "SZ_TAXI/H", + "bitbrains_fast_storage/5T", "bitbrains_fast_storage/H", + "bitbrains_rnd/5T", "bitbrains_rnd/H", + "bizitobs_application", + "bizitobs_l2c/5T", "bizitobs_l2c/H", + "bizitobs_service", + "car_parts_with_missing", "covid_deaths", + "electricity/15T", "electricity/D", "electricity/H", "electricity/W", + "ett1/15T", "ett1/D", "ett1/H", "ett1/W", + "ett2/15T", "ett2/D", "ett2/H", "ett2/W", + "hierarchical_sales/D", "hierarchical_sales/W", + "hospital", + "jena_weather", + "jena_weather/10T", "jena_weather/D", "jena_weather/H", + "kdd_cup_2018_with_missing/D", "kdd_cup_2018_with_missing/H", + "m4_daily", "m4_hourly", "m4_monthly", "m4_quarterly", + "m4_weekly", "m4_yearly", + "restaurant", + "saugeenday/D", "saugeenday/M", "saugeenday/W", + "solar/10T", "solar/D", "solar/H", "solar/W", + "temperature_rain_with_missing", + "us_births/D", "us_births/M", "us_births/W", +) + +GIFTEVAL_TERMS: tuple[str, ...] 
= ("short", "medium", "long") + + class Dataset(BaseDataset): """GIFT-Eval forecasting dataset (loaded from HF Salesforce/GiftEval). @@ -65,24 +104,38 @@ class Dataset(BaseDataset): "debug": [False], } + @classmethod + def get_all_parameter_values(cls, name): + if name == "dataset_name": + return list(GIFTEVAL_DATASETS) + if name == "term": + return list(GIFTEVAL_TERMS) + return None + def get_data(self): from datasets import Dataset as HFDataset from huggingface_hub import hf_hub_download, list_repo_files - # Locate the Arrow file inside the requested subdirectory. + # Locate the Arrow file inside the requested directory. Match the + # exact directory (no nested descent) — for datasets like + # ``LOOP_SEATTLE`` that ship multiple freq subdirs, the user must + # pick one (``LOOP_SEATTLE/H``, ``LOOP_SEATTLE/D``, ...), and + # those are genuinely separate evaluation configs. files = list_repo_files( "Salesforce/GiftEval", repo_type="dataset" ) + prefix = f"{self.dataset_name}/" arrow_files = [ f for f in files - if f.startswith(f"{self.dataset_name}/") + if f.startswith(prefix) and f.endswith(".arrow") + and "/" not in f[len(prefix):] ] if not arrow_files: raise ValueError( f"No Arrow file found for GIFT-Eval dataset " - f"{self.dataset_name!r}. Available top-level dirs: " - f"{sorted({f.split('/')[0] for f in files if '/' in f})}" + f"{self.dataset_name!r}. Valid choices are in " + f"GIFTEVAL_DATASETS." ) # Download + load each shard; concatenate. @@ -113,19 +166,24 @@ def get_data(self): if pred_len is None: pred_len = gift_eval_prediction_length(pandas_freq, self.term) - # Build (T, C) series. Univariate only in the MVP. + # Build (T, C) series. Univariate entries arrive as flat + # ``List[float]`` (ndim=1); multivariate entries arrive as + # ``List[List[float]]`` of shape ``(C, T)``. series_list = [] for r in rows: values = np.asarray(r["target"], dtype=np.float32) - if values.ndim != 1: - # Skip multivariate entries until we add explicit handling. 
+ if values.ndim == 1: + series = values.reshape(-1, 1) # (T, 1) + elif values.ndim == 2: + series = values.T # (C, T) → (T, C) + else: continue - series_list.append(values.reshape(-1, 1)) + series_list.append(series) if not series_list: raise ValueError( f"All entries in GIFT-Eval dataset {self.dataset_name!r} " - "were skipped (multivariate not yet supported)." + "had unsupported target shapes." ) # Training portion: everything except the last test windows. From b2dc9537af91c4adbd55e58caea23a32fed32c1c Mon Sep 17 00:00:00 2001 From: Eduardo Montesuma Date: Fri, 29 May 2026 15:52:07 +0000 Subject: [PATCH 12/12] fixes, prepare(), all behavior for gifteval and fevbench --- datasets/fev.py | 82 ++++++----- datasets/gifteval.py | 343 ++++++++++++++++++++++++++++++++----------- datasets/monash.py | 14 ++ objective.py | 17 +++ 4 files changed, 333 insertions(+), 123 deletions(-) diff --git a/datasets/fev.py b/datasets/fev.py index 0eb5008..e41d8cb 100644 --- a/datasets/fev.py +++ b/datasets/fev.py @@ -20,8 +20,11 @@ its own column (``HUFL``, ..., ``OT``). Channel columns are stacked on the last axis to form ``(T, C)``. -Rolling-window splits and GIFT-Eval-style term → prediction_length -resolution match :mod:`datasets.gifteval`. +Rolling-window splits match :mod:`datasets.monash`. The default +``prediction_length`` is the freq-based heuristic from +:func:`benchmark_utils.constants.from_pandas`; FEV does not publish a +per-dataset horizon spec, so we don't try to mirror one. Pass +``prediction_length=N`` explicitly to override. """ import numpy as np @@ -29,10 +32,7 @@ from benchopt import BaseDataset from benchmark_utils.covariates import Covariates -from benchmark_utils.constants import ( - from_pandas, - gift_eval_prediction_length, -) +from benchmark_utils.constants import from_pandas from benchmark_utils.windowing import make_forecasting_splits @@ -88,8 +88,6 @@ "world_co2_emissions", "world_life_expectancy", "world_tourism", ) -FEV_TERMS: tuple[str, ...] 
= ("short", "medium", "long") - def _infer_freq(timestamps) -> str: """Best-effort freq inference from a series' timestamp column. @@ -114,11 +112,12 @@ class Dataset(BaseDataset): ``"ETT/1H"`` / ``"LOOP_SEATTLE/5T"``; flat paths like ``"australian_tourism"`` / ``"hospital"``. See ``FEV_DATASETS`` for the full list (also discoverable via ``benchopt info -v``). - term : str - GIFT-Eval-style term: ``"short"`` / ``"medium"`` / ``"long"``. - Ignored when ``prediction_length`` is set. prediction_length : int or None - Explicit override. ``None`` → resolved from (inferred freq, term). + Explicit override. ``None`` → resolved from the inferred freq + via :func:`benchmark_utils.constants.from_pandas` (same heuristic + used by Monash). FEV does not publish its own per-dataset + horizon matrix, so we don't try to align with a leaderboard + spec here. n_windows : int Number of rolling evaluation windows per series. max_series : int or None @@ -133,46 +132,55 @@ class Dataset(BaseDataset): parameters = { "dataset_name": ["LOOP_SEATTLE/1H"], - "term": ["short"], "prediction_length": [None], "n_windows": [1], "max_series": [None], "debug": [False], } + # Cache prepare() by dataset_name only — the other knobs shape the + # in-memory view, not the downloaded files. + prepare_cache_ignore = ( + "prediction_length", "n_windows", "max_series", "debug", + ) + @classmethod def get_all_parameter_values(cls, name): if name == "dataset_name": return list(FEV_DATASETS) - if name == "term": - return list(FEV_TERMS) return None - def get_data(self): - from huggingface_hub import hf_hub_download, list_repo_files - - repo = "autogluon/fev_datasets" - files = list_repo_files(repo, repo_type="dataset") - - # Match parquet files in the exact directory (no nested descent). 
- prefix = f"{self.dataset_name}/" - parquet_files = sorted( - f for f in files - if f.startswith(prefix) - and f.endswith(".parquet") - and "/" not in f[len(prefix):] + def prepare(self): + """Pre-download parquet shards for this config into HF's cache.""" + self._snapshot() + + def _snapshot(self) -> "list[str]": + """Snapshot-download parquet files for this dataset_name and + return their local paths. Idempotent.""" + from huggingface_hub import snapshot_download + from pathlib import Path + + local_root = snapshot_download( + "autogluon/fev_datasets", + repo_type="dataset", + allow_patterns=f"{self.dataset_name}/*.parquet", ) + return sorted( + str(p) for p in (Path(local_root) / self.dataset_name).glob("*.parquet") + ) + + def get_data(self): + parquet_files = self._snapshot() if not parquet_files: raise ValueError( - f"No parquet found at {self.dataset_name!r} in {repo}. " - f"Valid choices are in FEV_DATASETS." + f"No parquet found at {self.dataset_name!r} in " + "autogluon/fev_datasets. Valid choices are in FEV_DATASETS." ) - frames = [ - pd.read_parquet(hf_hub_download(repo, filename=f, repo_type="dataset")) - for f in parquet_files - ] - df = pd.concat(frames, ignore_index=True) + df = pd.concat( + [pd.read_parquet(f) for f in parquet_files], + ignore_index=True, + ) if self.debug: df = df.head(5) @@ -209,11 +217,11 @@ def _is_numeric_array_col(c): # whole config (FEV groups by freq at the directory level for # nested configs, and flat configs are single-freq). inferred_freq = _infer_freq(df.iloc[0]["timestamp"]) - canonical_freq, seasonality, _ = from_pandas(inferred_freq) + canonical_freq, seasonality, default_h = from_pandas(inferred_freq) pred_len = self.prediction_length if pred_len is None: - pred_len = gift_eval_prediction_length(inferred_freq, self.term) + pred_len = int(default_h) # Build (T, C) series. Each row's per-channel array has the same # length (T_i); stack on the last axis. 
diff --git a/datasets/gifteval.py b/datasets/gifteval.py index 9d6d72b..c8661c8 100644 --- a/datasets/gifteval.py +++ b/datasets/gifteval.py @@ -1,19 +1,79 @@ """GIFT-Eval forecasting benchmark dataset (Salesforce/GiftEval on HF). -The HF repo organizes data per-dataset under top-level directories -(``m4_weekly``, ``etth1``, ``solar``, ...). Each directory holds a -single Arrow file with the test-set series. +Parametrization +--------------- +The class exposes two orthogonal parameters that drive the leaderboard +matrix: -Each entry exposes ``item_id``, ``start``, ``freq``, and ``target``. +* ``dataset_name`` — one of 55 canonical ``<name>/<freq>`` paths (e.g. + ``"m4_weekly/W"``, ``"loop_seattle/H"``). The full list is + :data:`GIFTEVAL_DATASETS`. +* ``term`` — one of ``short`` / ``medium`` / ``long``, controlling the + forecast horizon (×1, ×10, ×15 of the per-freq base). + +Both are surfaced via ``get_all_parameter_values`` so that +``-d "GiftEval[dataset_name=all,term=short]"`` and ``benchopt info -v`` +work. + +Canonical-combo gating +---------------------- +GIFT-Eval scores only **97** of the 55 × 3 = 165 possible ``(path, +term)`` combinations on its public leaderboard. The 34 short-only paths +do not define ``medium`` / ``long``. We track the canonical set in +:data:`CANONICAL_COMBOS` and gate runs at the dataset level: when +``(dataset_name, term)`` is not canonical, ``get_data()`` short-circuits +and returns a placeholder dict carrying a ``_skip_reason`` field. +:meth:`Objective.skip` (see ``objective.py``) honors that field and +skips the combo cleanly. + +So: + +* ``-d "GiftEval[dataset_name=all,term=short]"`` → 55 canonical runs. +* ``-d "GiftEval[dataset_name=all,term=long]"`` → 55 attempts, + 21 canonical runs, 34 skipped. +* ``-d "GiftEval[dataset_name=all,term=all]"`` → 165 attempts, + 97 canonical runs, 68 skipped. + +Leaderboard names vs HF directory names +--------------------------------------- +The leaderboard uses lowercase, paper-style identifiers (e.g. 
+``loop_seattle/H``, ``m_dense/D``, ``car_parts/M``) while the HF repo +``Salesforce/GiftEval`` uses mixed-case directory names that don't +always match (``LOOP_SEATTLE/H``, ``M_DENSE/D``, +``car_parts_with_missing/``). We accept leaderboard names — that's what +appears in the paper, the leaderboard, and the gift-eval README — and +translate to HF paths internally via :data:`_LEADERBOARD_TO_HF`. Cases: + + * Pure case difference: ``loop_seattle`` → ``LOOP_SEATTLE``, + ``m_dense`` → ``M_DENSE``, ``sz_taxi`` → ``SZ_TAXI``. + * Missing-data suffix: ``car_parts`` → ``car_parts_with_missing``, + ``kdd_cup_2018`` → ``kdd_cup_2018_with_missing``, + ``temperature_rain`` → ``temperature_rain_with_missing``. + * Rename: ``saugeen`` → ``saugeenday``. + * Leaderboard adds a freq segment for HF-flat datasets: leaderboard + ``m4_yearly/A`` → HF flat ``m4_yearly`` (the freq is implicit in the + data, not the path). Likewise for the other ``m4_*``, + ``car_parts/M``, ``covid_deaths/D``, ``hospital/M``, + ``restaurant/D``, ``temperature_rain/D``, + ``bizitobs_application/10S``, ``bizitobs_service/10S``. + +Schema +------ +Each HF entry exposes ``item_id``, ``start``, ``freq``, ``target``. ``target`` is a flat ``List[float]`` for univariate configs and a ``List[List[float]]`` of shape ``(C, T)`` for multivariate ones (e.g. ``bitbrains_*``, ``electricity/*``, ``ett1/*``, ``ett2/*``, ``jena_weather/*``, ``solar/*``). Both shapes are handled — multivariate -entries are transposed to the ``(T, C)`` repo contract. +entries are transposed to the repo's ``(T, C)`` contract. -Cutoffs and windows follow the Monash recipe (we don't comply with -GIFT-Eval's prescribed test cutoff — same rolling-window logic via -:func:`benchmark_utils.windowing.make_forecasting_splits`). +Cutoffs and windows +------------------- +We don't comply with GIFT-Eval's prescribed test cutoff; we use the same +rolling-window logic as Monash via +:func:`benchmark_utils.windowing.make_forecasting_splits`. 
The +``prediction_length`` for a given (freq, term) follows GIFT-Eval's +canonical ``base × multiplier`` rule via +:func:`benchmark_utils.constants.gift_eval_prediction_length`. Data contract output mirrors :mod:`datasets.monash`. """ @@ -29,41 +89,145 @@ from benchmark_utils.windowing import make_forecasting_splits -# Canonical list of GIFT-Eval evaluation configs. Each entry is the -# arrow-containing directory path inside the HF repo. Flat datasets are -# bare names (``m4_weekly``); datasets that ship multiple frequencies -# are encoded as ``/`` (e.g. ``LOOP_SEATTLE/H``, -# ``LOOP_SEATTLE/D`` — these are genuinely distinct evaluations). -# Surfaced via ``get_parameter_choices`` so that ``dataset_name=all`` -# and ``benchopt info -v`` work. -GIFTEVAL_DATASETS: tuple[str, ...] = ( - "LOOP_SEATTLE/5T", "LOOP_SEATTLE/D", "LOOP_SEATTLE/H", - "M_DENSE/D", "M_DENSE/H", - "SZ_TAXI/15T", "SZ_TAXI/H", - "bitbrains_fast_storage/5T", "bitbrains_fast_storage/H", - "bitbrains_rnd/5T", "bitbrains_rnd/H", - "bizitobs_application", - "bizitobs_l2c/5T", "bizitobs_l2c/H", - "bizitobs_service", - "car_parts_with_missing", "covid_deaths", - "electricity/15T", "electricity/D", "electricity/H", "electricity/W", - "ett1/15T", "ett1/D", "ett1/H", "ett1/W", - "ett2/15T", "ett2/D", "ett2/H", "ett2/W", - "hierarchical_sales/D", "hierarchical_sales/W", - "hospital", - "jena_weather", - "jena_weather/10T", "jena_weather/D", "jena_weather/H", - "kdd_cup_2018_with_missing/D", "kdd_cup_2018_with_missing/H", +# --------------------------------------------------------------------------- +# Single source of truth: leaderboard ``<name>/<freq>`` path → tuple of +# terms that path defines. Derived from +# gift-eval/results/*/all_results.csv. 55 paths, 97 (path, term) pairs; +# 34 paths are short-only, 21 define all three. 
+# --------------------------------------------------------------------------- +_LEADERBOARD: dict[str, tuple[str, ...]] = { + "bitbrains_fast_storage/5T": ("short", "medium", "long"), + "bitbrains_fast_storage/H": ("short",), + "bitbrains_rnd/5T": ("short", "medium", "long"), + "bitbrains_rnd/H": ("short",), + "bizitobs_application/10S": ("short", "medium", "long"), + "bizitobs_l2c/5T": ("short", "medium", "long"), + "bizitobs_l2c/H": ("short", "medium", "long"), + "bizitobs_service/10S": ("short", "medium", "long"), + "car_parts/M": ("short",), + "covid_deaths/D": ("short",), + "electricity/15T": ("short", "medium", "long"), + "electricity/D": ("short",), + "electricity/H": ("short", "medium", "long"), + "electricity/W": ("short",), + "ett1/15T": ("short", "medium", "long"), + "ett1/D": ("short",), + "ett1/H": ("short", "medium", "long"), + "ett1/W": ("short",), + "ett2/15T": ("short", "medium", "long"), + "ett2/D": ("short",), + "ett2/H": ("short", "medium", "long"), + "ett2/W": ("short",), + "hierarchical_sales/D": ("short",), + "hierarchical_sales/W": ("short",), + "hospital/M": ("short",), + "jena_weather/10T": ("short", "medium", "long"), + "jena_weather/D": ("short",), + "jena_weather/H": ("short", "medium", "long"), + "kdd_cup_2018/D": ("short",), + "kdd_cup_2018/H": ("short", "medium", "long"), + "loop_seattle/5T": ("short", "medium", "long"), + "loop_seattle/D": ("short",), + "loop_seattle/H": ("short", "medium", "long"), + "m4_daily/D": ("short",), + "m4_hourly/H": ("short",), + "m4_monthly/M": ("short",), + "m4_quarterly/Q": ("short",), + "m4_weekly/W": ("short",), + "m4_yearly/A": ("short",), + "m_dense/D": ("short",), + "m_dense/H": ("short", "medium", "long"), + "restaurant/D": ("short",), + "saugeen/D": ("short",), + "saugeen/M": ("short",), + "saugeen/W": ("short",), + "solar/10T": ("short", "medium", "long"), + "solar/D": ("short",), + "solar/H": ("short", "medium", "long"), + "solar/W": ("short",), + "sz_taxi/15T": ("short", "medium", "long"), + 
"sz_taxi/H": ("short",), + "temperature_rain/D": ("short",), + "us_births/D": ("short",), + "us_births/M": ("short",), + "us_births/W": ("short",), +} + + +# Public derived constants — what users and CLI tooling reference. +GIFTEVAL_DATASETS: tuple[str, ...] = tuple(sorted(_LEADERBOARD)) +GIFTEVAL_TERMS: tuple[str, ...] = ("short", "medium", "long") +CANONICAL_COMBOS: frozenset[tuple[str, str]] = frozenset( + (path, term) for path, terms in _LEADERBOARD.items() for term in terms +) + + +# --------------------------------------------------------------------------- +# Leaderboard ```` → HF top-level directory name. Only entries that +# differ from the lowercase identity mapping appear here. +# --------------------------------------------------------------------------- +_LEADERBOARD_TO_HF: dict[str, str] = { + "loop_seattle": "LOOP_SEATTLE", + "m_dense": "M_DENSE", + "sz_taxi": "SZ_TAXI", + "car_parts": "car_parts_with_missing", + "kdd_cup_2018": "kdd_cup_2018_with_missing", + "temperature_rain": "temperature_rain_with_missing", + "saugeen": "saugeenday", +} + + +# --------------------------------------------------------------------------- +# Datasets that live as a single arrow file directly under the dataset +# name (no per-freq subdir on HF). The leaderboard still adds a freq +# segment to their paths (e.g. ``m4_yearly/A``, ``hospital/M``), which we +# strip before locating the file. 
+# --------------------------------------------------------------------------- +_HF_FLAT_DATASETS: frozenset[str] = frozenset({ + "bizitobs_application", "bizitobs_service", + "car_parts_with_missing", "covid_deaths", "hospital", "m4_daily", "m4_hourly", "m4_monthly", "m4_quarterly", "m4_weekly", "m4_yearly", - "restaurant", - "saugeenday/D", "saugeenday/M", "saugeenday/W", - "solar/10T", "solar/D", "solar/H", "solar/W", - "temperature_rain_with_missing", - "us_births/D", "us_births/M", "us_births/W", -) + "restaurant", "temperature_rain_with_missing", +}) -GIFTEVAL_TERMS: tuple[str, ...] = ("short", "medium", "long") + + +def _hf_arrow_directory(leaderboard_path: str) -> str: + """Resolve a leaderboard ``<name>/<freq>`` path to the actual HF + directory containing the arrow file. + + Examples + -------- + ``"m4_weekly/W"`` → ``"m4_weekly"`` (HF-flat, drops freq) + ``"loop_seattle/H"`` → ``"LOOP_SEATTLE/H"`` (case-renamed) + ``"car_parts/M"`` → ``"car_parts_with_missing"`` (HF-flat + suffix) + """ + leaderboard_name, _, freq_segment = leaderboard_path.partition("/") + hf_name = _LEADERBOARD_TO_HF.get(leaderboard_name, leaderboard_name) + if hf_name in _HF_FLAT_DATASETS: + return hf_name + if freq_segment: + return f"{hf_name}/{freq_segment}" + return hf_name + + +def _skip_placeholder(reason: str) -> dict: + """Return a minimal data dict that satisfies ``Objective.set_data`` + but flags the combo for skipping via ``Objective.skip``.""" + return dict( + X_train=[], + y_train=[], + X_test=[], + y_test=[], + cutoff_indexes=[], + covariates=Covariates(), + task="forecasting", + metrics=[], + prediction_length=1, + freq="D", + seasonality=1, + _skip_reason=reason, + ) class Dataset(BaseDataset): @@ -72,21 +236,21 @@ class Dataset(BaseDataset): Parameters ---------- dataset_name : str - Subdirectory name on the HF repo (e.g. ``"m4_weekly"``, ``"ett1"``, - ``"solar"``). See https://huggingface.co/datasets/Salesforce/GiftEval - for the full list. 
+ One of 55 canonical leaderboard paths — ``<name>/<freq>``, e.g. + ``"m4_weekly/W"``, ``"loop_seattle/H"``. See + :data:`GIFTEVAL_DATASETS`. term : str - GIFT-Eval forecast term — ``"short"`` (×1), ``"medium"`` (×10), or - ``"long"`` (×15). Selects the prediction length via the canonical - per-freq base, matching the GIFT-Eval leaderboard convention. - Ignored when ``prediction_length`` is set explicitly. + ``"short"`` / ``"medium"`` / ``"long"``. Combos not in + :data:`CANONICAL_COMBOS` are skipped (placeholder + objective + ``skip``), so ``dataset_name=all, term=long`` runs only the 21 + paths that define ``long``. prediction_length : int or None - Explicit override. ``None`` → resolved from (freq, term). + Explicit override. ``None`` → resolved from (freq, term) via + :func:`benchmark_utils.constants.gift_eval_prediction_length`. n_windows : int Number of rolling evaluation windows per series. max_series : int or None - Optional cap on the number of series — useful for very large - configs (e.g. ``solar``). ``None`` = no cap. + Optional cap on the number of series. debug : bool If True, keep only the first 5 series for fast iteration. """ @@ -96,7 +260,7 @@ class Dataset(BaseDataset): requirements = ["pip::datasets", "pip::huggingface-hub"] parameters = { - "dataset_name": ["m4_weekly"], + "dataset_name": ["m4_weekly/W"], "term": ["short"], "prediction_length": [None], "n_windows": [1], @@ -104,6 +268,12 @@ class Dataset(BaseDataset): "debug": [False], } + # ``prepare()`` depends on ``dataset_name`` only — ``term`` and the + # other knobs shape the in-memory view, not the downloaded files. 
+ prepare_cache_ignore = ( + "term", "prediction_length", "n_windows", "max_series", "debug", + ) + @classmethod def get_all_parameter_values(cls, name): if name == "dataset_name": @@ -112,40 +282,44 @@ def get_all_parameter_values(cls, name): return list(GIFTEVAL_TERMS) return None + def prepare(self): + """Pre-download arrow shards for this config into HF's cache.""" + self._snapshot() + + def _snapshot(self) -> "list[str]": + """Snapshot-download the arrow files for this dataset and return + their local paths. Idempotent — HF caches by content hash.""" + from huggingface_hub import snapshot_download + from pathlib import Path + + hf_path = _hf_arrow_directory(self.dataset_name) + local_root = snapshot_download( + "Salesforce/GiftEval", + repo_type="dataset", + allow_patterns=f"{hf_path}/*.arrow", + ) + return sorted(str(p) for p in (Path(local_root) / hf_path).glob("*.arrow")) + def get_data(self): from datasets import Dataset as HFDataset - from huggingface_hub import hf_hub_download, list_repo_files - - # Locate the Arrow file inside the requested directory. Match the - # exact directory (no nested descent) — for datasets like - # ``LOOP_SEATTLE`` that ship multiple freq subdirs, the user must - # pick one (``LOOP_SEATTLE/H``, ``LOOP_SEATTLE/D``, ...), and - # those are genuinely separate evaluation configs. - files = list_repo_files( - "Salesforce/GiftEval", repo_type="dataset" - ) - prefix = f"{self.dataset_name}/" - arrow_files = [ - f for f in files - if f.startswith(prefix) - and f.endswith(".arrow") - and "/" not in f[len(prefix):] - ] + + # Short-circuit non-canonical combos so heavy parsing doesn't run. 
+ if (self.dataset_name, self.term) not in CANONICAL_COMBOS: + return _skip_placeholder( + f"non-canonical GIFT-Eval combo: {self.dataset_name!r} does " + f"not define term {self.term!r} on the leaderboard" + ) + + arrow_files = self._snapshot() if not arrow_files: raise ValueError( f"No Arrow file found for GIFT-Eval dataset " - f"{self.dataset_name!r}. Valid choices are in " - f"GIFTEVAL_DATASETS." + f"{self.dataset_name!r}. Valid choices are in GIFTEVAL_DATASETS." ) - # Download + load each shard; concatenate. rows = [] - for f in sorted(arrow_files): - local = hf_hub_download( - "Salesforce/GiftEval", filename=f, repo_type="dataset", - ) - shard = HFDataset.from_file(local) - rows.extend(shard) + for f in arrow_files: + rows.extend(HFDataset.from_file(f)) if self.debug: rows = rows[:5] @@ -157,8 +331,8 @@ def get_data(self): f"GIFT-Eval dataset {self.dataset_name!r} returned 0 series." ) - # Frequency / seasonality — take from the first entry (every series - # in a GIFT-Eval subset shares the same freq). + # Frequency / seasonality — every series in a GIFT-Eval subset + # shares the same freq, so taking it from the first entry is safe. pandas_freq = rows[0].get("freq") or "D" freq, seasonality, _ = from_pandas(pandas_freq) @@ -167,18 +341,15 @@ def get_data(self): pred_len = gift_eval_prediction_length(pandas_freq, self.term) # Build (T, C) series. Univariate entries arrive as flat - # ``List[float]`` (ndim=1); multivariate entries arrive as - # ``List[List[float]]`` of shape ``(C, T)``. + # ``List[float]`` (ndim=1); multivariate as ``List[List[float]]`` + # of shape ``(C, T)``. 
series_list = [] for r in rows: values = np.asarray(r["target"], dtype=np.float32) if values.ndim == 1: - series = values.reshape(-1, 1) # (T, 1) + series_list.append(values.reshape(-1, 1)) # (T, 1) elif values.ndim == 2: - series = values.T # (C, T) → (T, C) - else: - continue - series_list.append(series) + series_list.append(values.T) # (C,T)→(T,C) if not series_list: raise ValueError( diff --git a/datasets/monash.py b/datasets/monash.py index a299433..0ea0365 100644 --- a/datasets/monash.py +++ b/datasets/monash.py @@ -66,6 +66,20 @@ class Dataset(BaseDataset): "debug": [False], } + # Only dataset_name decides what aeon downloads; the other knobs + # affect the in-memory split, not the file on disk. + prepare_cache_ignore = ("prediction_length", "n_windows", "debug") + + def prepare(self): + """Warm aeon's local cache for this dataset (download if missing). + + aeon writes the ``.tsf`` to + ``~/.aeon/datasets/local_data//.tsf`` on first use; + we call it once and discard the parsed result so the cache layer + in :func:`load_forecasting` handles the actual download. + """ + load_forecasting(self.dataset_name, return_metadata=False) + def get_data(self): df, meta = load_forecasting(self.dataset_name, return_metadata=True) diff --git a/objective.py b/objective.py index 5e3a4c6..92aefa4 100644 --- a/objective.py +++ b/objective.py @@ -84,8 +84,25 @@ def set_data(self, X_train, y_train, X_test, y_test, self.covariates = covariates if covariates is not None else Covariates() self.task = task self.metrics = metrics + # Pull any skip marker out of meta so it doesn't leak into + # ``get_objective()`` payloads. + self._skip_reason = meta.pop("_skip_reason", None) self.meta = meta # freq, prediction_length, n_classes, … + def skip(self, **data): + """Honor a ``_skip_reason`` field set by the dataset. + + Datasets that want to filter their own parameter grid (e.g. 
+ :mod:`datasets.gifteval` skipping non-leaderboard (path, term) + combos) return ``_skip_reason="..."`` from ``get_data()`` and we + propagate it here so benchopt records a clean skip rather than + running an empty objective. + """ + reason = data.get("_skip_reason") + if reason: + return True, reason + return False, None + # ------------------------------------------------------------------ # Passed to the solver # ------------------------------------------------------------------