benchopt · eddardd · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/benchmark_utils/constants.py b/benchmark_utils/constants.py
@@ -0,0 +1,125 @@
+"""Shared frequency / seasonality tables for forecasting datasets.
+
+Two sources name frequencies differently:
+  - aeon (used by Monash) uses words: "yearly", "weekly", "minutely", ...
+  - GIFT-Eval (and pandas) use offset aliases: "Y", "W-SUN", "5T", ...
+
+This module exposes a single canonical (freq, seasonality) lookup keyed on
+the canonical pandas-style base alias (e.g. "Y", "W", "D"), plus two
+adapters that normalize each source onto that canonical key.
+"""
+
+import re
+
+# Canonical base alias → (display_freq, MASE seasonality, default forecast
+# horizon). ``from_aeon`` and ``from_pandas`` both return one of these tuples.
+_BASE = {
+    "Y": ("Y", 1, 6),
+    "Q": ("Q", 4, 8),
+    "M": ("M", 12, 12),
+    "W": ("W", 52, 13),
+    "D": ("D", 7, 14),
+    "H": ("H", 24, 24),
+    "T": ("T", 1440, 60),  # minutes
+    "S": ("S", 1, 60),
+}
+
+# aeon's spelled-out names → canonical base alias. "minutes" / "seconds" are
+# the unit words left once a numeric multiplier has been stripped from
+# frequencies such as "10_minutes" or "4_seconds".
+_AEON_TO_BASE = {
+    "yearly":    "Y",
+    "quarterly": "Q",
+    "monthly":   "M",
+    "weekly":    "W",
+    "daily":     "D",
+    "hourly":    "H",
+    "minutely":  "T",
+    "minutes":   "T",
+    "seconds":   "S",
+}
+
+
+def from_aeon(freq_word: str) -> tuple[str, int, int]:
+    """Look up (freq, seasonality, default_horizon) from an aeon freq word.
+
+    The lookup ignores case and surrounding whitespace. A numeric
+    multiplier prefix (``"10_minutes"``, ``"4_seconds"``) is stripped
+    before the lookup, mirroring how ``from_pandas`` treats ``"5T"``.
+
+    Parameters
+    ----------
+    freq_word : str
+        Frequency word as reported by aeon, e.g. ``"weekly"``.
+
+    Returns
+    -------
+    tuple[str, int, int]
+        The ``_BASE`` entry for the resolved alias. Unknown, empty or
+        non-string inputs fall back to the daily entry.
+    """
+    word = freq_word.strip().lower() if isinstance(freq_word, str) else ""
+    base = _AEON_TO_BASE.get(word)
+    if base is None:
+        # "<N>_<unit>": keep the unit, drop the multiplier.
+        multiplier, sep, unit = word.partition("_")
+        if sep and multiplier.isdigit():
+            base = _AEON_TO_BASE.get(unit)
+    # NOTE(review): "half_hourly" still falls through to daily; decide
+    # which canonical alias it should map to before adding it to the table.
+    return _BASE[base or "D"]
+
+
+# Matches an optional leading integer multiplier followed by the first run
+# of ASCII letters, which is captured ("5T" → "T", "15min" → "min").
+# Callers drop any "-ANCHOR" suffix ("W-SUN", "QS-OCT") before matching.
+_PANDAS_ALIAS_RE = re.compile(r"^\d*([A-Za-z]+)")
+
+# Alternate pandas spellings folded onto the single-letter aliases that key
+# _BASE: year/quarter/month start and end variants, plus the "min" forms of
+# the minute alias.
+_NORMALIZE_BASE = {
+    spelling: canonical
+    for canonical, spellings in (
+        ("Y", ("YE", "YS", "A", "AS")),
+        ("Q", ("QE", "QS")),
+        ("M", ("ME", "MS")),
+        ("T", ("min", "MIN")),
+    )
+    for spelling in spellings
+}
+
+
+def from_pandas(freq_alias: str) -> tuple[str, int, int]:
+    """Look up (freq, seasonality, default_horizon) from a pandas freq alias.
+
+    Handles multipliers ("5T") and anchors ("W-SUN", "QS-OCT") by stripping
+    them before lookup. Spellings listed in ``_NORMALIZE_BASE`` ("ME",
+    "QS", "min", ...) are folded onto their single-letter alias, and
+    single-letter aliases are matched case-insensitively ("h" → "H").
+
+    Parameters
+    ----------
+    freq_alias : str
+        Pandas offset alias, optionally with a multiplier and/or anchor.
+
+    Returns
+    -------
+    tuple[str, int, int]
+        The matching ``_BASE`` entry. Empty, unparsable or unknown
+        aliases default to daily.
+    """
+    default = _BASE["D"]
+    if not freq_alias:
+        return default
+    m = _PANDAS_ALIAS_RE.match(freq_alias.split("-", 1)[0])
+    if m is None:
+        return default
+    head = m.group(1)
+    base = _NORMALIZE_BASE.get(head)
+    if base is None:
+        # Only a single-letter head may be matched by upper-casing it.
+        # Taking the first letter of a longer alias misreads "ms"
+        # (milliseconds) as monthly and "SM" (semi-month) as seconds.
+        if len(head) != 1:
+            return default
+        base = head.upper()
+    return _BASE.get(base, default)
+
+
+# ---------------------------------------------------------------------------
+# GIFT-Eval term resolution
+#
+# Mirrors the canonical table in the upstream time-series repo: prediction
+# length is a function of pandas freq, then scaled by a term multiplier
+# (short=1, medium=10, long=15). Used by datasets/gifteval.py so reported
+# numbers line up with the GIFT-Eval leaderboard.
+# ---------------------------------------------------------------------------
+
+# Base ("short" term) prediction length per pandas frequency alias. Keys
+# include bare aliases as well as specific multiplied / anchored spellings.
+GIFT_EVAL_PRED_LENGTH_MAP: dict[str, int] = {
+    # monthly
+    "M": 12,
+    "MS": 12,
+    # weekly
+    "W": 8,
+    "W-SUN": 8,
+    "W-MON": 8,
+    # daily
+    "D": 30,
+    # hourly
+    "H": 48,
+    "6H": 48,
+    # minutely
+    "T": 48,
+    "5T": 48,
+    "10T": 48,
+    "15T": 48,
+    "30T": 48,
+    # secondly
+    "S": 60,
+    "4S": 60,
+    # quarterly
+    "Q": 8,
+    "Q-DEC": 8,
+    # yearly
+    "A": 4,
+    "A-DEC": 4,
+    "Y": 4,
+}
+
+# Factor applied to the base prediction length for each evaluation term.
+GIFT_EVAL_TERM_MULTIPLIER: dict[str, int] = dict(short=1, medium=10, long=15)
+
+
+def gift_eval_prediction_length(freq: str, term: str) -> int:
+    """Resolve the GIFT-Eval prediction length for a (freq, term) pair.
+
+    ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"1H"``, ``"W-SUN"``).
+    Lookup falls back through: exact match → strip any leading multiplier
+    and anchor suffix ("1H" → "H", "10S" → "S", "30T" → "T") → normalize
+    alternate spellings via ``_NORMALIZE_BASE`` ("QE" → "Q", "min" → "T")
+    → accept the lowercase hour / second aliases ("h", "s") → default 48.
+
+    Parameters
+    ----------
+    freq : str
+        Pandas-style frequency alias. An empty or ``None`` value resolves
+        to the default base length of 48.
+    term : str
+        One of ``"short"``, ``"medium"``, ``"long"``.
+
+    Returns
+    -------
+    int
+        Base prediction length multiplied by the term multiplier.
+
+    Raises
+    ------
+    ValueError
+        If ``term`` is not a key of ``GIFT_EVAL_TERM_MULTIPLIER``.
+    """
+    if term not in GIFT_EVAL_TERM_MULTIPLIER:
+        raise ValueError(
+            f"term must be one of {list(GIFT_EVAL_TERM_MULTIPLIER)}; got {term!r}"
+        )
+    base = GIFT_EVAL_PRED_LENGTH_MAP.get(freq) if freq else None
+    if base is None and freq:
+        m = _PANDAS_ALIAS_RE.match(freq.split("-", 1)[0])
+        if m:
+            # Normalize new pandas spellings ("QE"→"Q", "ME"→"M", ...)
+            # before falling back through the map.
+            head = _NORMALIZE_BASE.get(m.group(1), m.group(1))
+            base = GIFT_EVAL_PRED_LENGTH_MAP.get(head)
+            if base is None and head in ("h", "s"):
+                # pandas >= 2.2 spells hourly / secondly in lowercase;
+                # the map only carries the legacy uppercase keys.
+                base = GIFT_EVAL_PRED_LENGTH_MAP.get(head.upper())
+    if base is None:
+        base = 48
+    return base * GIFT_EVAL_TERM_MULTIPLIER[term]
diff --git a/datasets/fev.py b/datasets/fev.py
@@ -0,0 +1,271 @@
+"""AutoGluon fev_datasets forecasting benchmark
+(huggingface.co/datasets/autogluon/fev_datasets).
+
+The HF repo organizes data either:
+  - per-freq: ``<dataset>/<freq>/train-*.parquet``
+    (e.g. ``ETT/1H``, ``LOOP_SEATTLE/5T``)
+  - flat: ``<dataset>/train-*.parquet``
+    (e.g. ``australian_tourism``)
+  - or with an arbitrary subdir that is NOT a freq (e.g. ``boomlet/<N>``
+    where ``<N>`` is a series id, not a frequency).
+
+We accept the directory path directly as ``dataset_name`` (e.g.
+``"ETT/1H"``, ``"australian_tourism"``) and infer the actual freq from
+each series' ``timestamp`` column rather than parsing the path.
+
+Each parquet row is one series; columns vary:
+  - Always: ``id``, ``timestamp``
+  - Univariate: a ``target`` column (list of floats)
+  - Multivariate (e.g. ``ETT``): no ``target`` column — each channel is
+    its own column (``HUFL``, ..., ``OT``). Channel columns are stacked
+    on the last axis to form ``(T, C)``.
+
+Rolling-window splits match :mod:`datasets.monash`. The default
+``prediction_length`` is the freq-based heuristic from
+:func:`benchmark_utils.constants.from_pandas`; FEV does not publish a
+per-dataset horizon spec, so we don't try to mirror one. Pass
+``prediction_length=N`` explicitly to override.
+"""
+
+import numpy as np
+import pandas as pd
+from benchopt import BaseDataset
+
+from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import from_pandas
+from benchmark_utils.windowing import make_forecasting_splits
+
+
+# Columns that identify a series rather than carry channel values; they
+# are excluded when ``Dataset.get_data`` picks the channel columns.
+_METADATA_COLS = ("id", "timestamp")
+
+
+# Canonical list of FEV evaluation configs — directory paths inside
+# https://huggingface.co/datasets/autogluon/fev_datasets that contain at
+# least one ``train-*.parquet`` file. Returned by
+# ``Dataset.get_all_parameter_values("dataset_name")`` so that
+# ``dataset_name=all`` and ``benchopt info -v`` work.
+# Entries with a slash are nested ``<dataset>/<subdir>`` paths; entries
+# without one are flat configs.
+FEV_DATASETS: tuple[str, ...] = (
+    "ETT/15T", "ETT/1D", "ETT/1H", "ETT/1W",
+    "LOOP_SEATTLE/1D", "LOOP_SEATTLE/1H", "LOOP_SEATTLE/5T",
+    "M_DENSE/1D", "M_DENSE/1H",
+    "SZ_TAXI/15T", "SZ_TAXI/1H",
+    "australian_tourism",
+    "bizitobs_l2c/1H", "bizitobs_l2c/5T",
+    "boomlet/1062", "boomlet/1209", "boomlet/1225", "boomlet/1230",
+    "boomlet/1282", "boomlet/1487", "boomlet/1631", "boomlet/1676",
+    "boomlet/1855", "boomlet/1975", "boomlet/2187",
+    "boomlet/285", "boomlet/619", "boomlet/772", "boomlet/963",
+    "ecdc_ili",
+    "entsoe/15T", "entsoe/1H", "entsoe/30T",
+    "epf_be", "epf_de", "epf_fr", "epf_np", "epf_pjm",
+    "ercot/1D", "ercot/1H", "ercot/1M", "ercot/1W",
+    "favorita_stores/1D", "favorita_stores/1M", "favorita_stores/1W",
+    "favorita_transactions/1D", "favorita_transactions/1M",
+    "favorita_transactions/1W",
+    "fred_md_2025", "fred_qd_2025",
+    "gvar", "hermes",
+    "hierarchical_sales/1D", "hierarchical_sales/1W",
+    "hospital",
+    "hospital_admissions/1D", "hospital_admissions/1W",
+    "jena_weather/10T", "jena_weather/1D", "jena_weather/1H",
+    "kdd_cup_2022/10T", "kdd_cup_2022/1D", "kdd_cup_2022/30T",
+    "m5/1D", "m5/1M", "m5/1W",
+    "proenfo_bull", "proenfo_cockatoo",
+    "proenfo_gfc12", "proenfo_gfc14", "proenfo_gfc17",
+    "proenfo_hog", "proenfo_pdb",
+    "redset/15T", "redset/1H", "redset/5T",
+    "restaurant",
+    "rohlik_orders/1D", "rohlik_orders/1W",
+    "rohlik_sales/1D", "rohlik_sales/1W",
+    "rossmann/1D", "rossmann/1W",
+    "solar/1D", "solar/1W",
+    "solar_with_weather/15T", "solar_with_weather/1H",
+    "uci_air_quality/1D", "uci_air_quality/1H",
+    "uk_covid_nation/1D", "uk_covid_nation/1W",
+    "uk_covid_utla/1D", "uk_covid_utla/1W",
+    "us_consumption/1M", "us_consumption/1Q", "us_consumption/1Y",
+    "walmart",
+    "world_co2_emissions", "world_life_expectancy", "world_tourism",
+)
+
+
+def _infer_freq(timestamps) -> str:
+    """Guess the pandas frequency alias of a series from its timestamps.
+
+    Only the first 5 entries are inspected so the check stays cheap on
+    long series. Returns ``"D"`` when pandas reports no frequency or when
+    building / inspecting the index raises for any reason.
+    """
+    fallback = "D"
+    try:
+        leading = pd.DatetimeIndex(timestamps[:5])
+        guessed = pd.infer_freq(leading)
+    except Exception:
+        # Best-effort helper: any failure means "unknown frequency".
+        return fallback
+    if guessed:
+        return guessed
+    return fallback
+
+
+class Dataset(BaseDataset):
+    """AutoGluon fev forecasting dataset.
+
+    Parameters
+    ----------
+    dataset_name : str
+        Directory path inside the HF repo. Per-freq paths look like
+        ``"ETT/1H"`` / ``"LOOP_SEATTLE/5T"``; flat paths like
+        ``"australian_tourism"`` / ``"hospital"``. See ``FEV_DATASETS``
+        for the full list (also discoverable via ``benchopt info -v``).
+    prediction_length : int or None
+        Explicit override. ``None`` → resolved from the inferred freq
+        via :func:`benchmark_utils.constants.from_pandas` (same heuristic
+        used by Monash). FEV does not publish its own per-dataset
+        horizon matrix, so we don't try to align with a leaderboard
+        spec here. Must be >= 1 when given.
+    n_windows : int
+        Number of rolling evaluation windows per series. Must be >= 1.
+    max_series : int or None
+        Optional cap on the number of series. Ignored when ``debug`` is
+        True.
+    debug : bool
+        If True, keep only the first 5 series and build a single test
+        window. The train split is still cut ``n_windows`` horizons
+        before the end of each series.
+    """
+
+    name = "FEV"
+
+    # ``get_data`` reads the shards with ``pd.read_parquet``, which needs a
+    # parquet engine, so pyarrow is declared next to the HF hub client.
+    requirements = ["pip::pyarrow", "pip::huggingface-hub"]
+
+    parameters = {
+        "dataset_name": ["LOOP_SEATTLE/1H"],
+        "prediction_length": [None],
+        "n_windows": [1],
+        "max_series": [None],
+        "debug": [False],
+    }
+
+    # Cache prepare() by dataset_name only — the other knobs shape the
+    # in-memory view, not the downloaded files.
+    prepare_cache_ignore = (
+        "prediction_length", "n_windows", "max_series", "debug",
+    )
+
+    @classmethod
+    def get_all_parameter_values(cls, name):
+        """Return every known value for parameter ``name``.
+
+        Only ``"dataset_name"`` is enumerable (the entries of
+        ``FEV_DATASETS``); any other parameter yields ``None``.
+        """
+        if name == "dataset_name":
+            return list(FEV_DATASETS)
+        return None
+
+    def prepare(self):
+        """Pre-download parquet shards for this config into HF's cache."""
+        self._snapshot()
+
+    def _snapshot(self) -> "list[str]":
+        """Snapshot-download parquet files for this dataset_name and
+        return their local paths, sorted. Idempotent.
+
+        Only ``*.parquet`` files directly inside the ``dataset_name``
+        directory of the local snapshot are returned.
+        """
+        from huggingface_hub import snapshot_download
+        from pathlib import Path
+
+        local_root = snapshot_download(
+            "autogluon/fev_datasets",
+            repo_type="dataset",
+            allow_patterns=f"{self.dataset_name}/*.parquet",
+        )
+        config_dir = Path(local_root) / self.dataset_name
+        return sorted(str(p) for p in config_dir.glob("*.parquet"))
+
+    def get_data(self):
+        """Load the config and build the train / rolling-test splits.
+
+        Returns
+        -------
+        dict
+            ``X_train`` / ``y_train``: per-series history and the
+            ``prediction_length`` steps that follow it; ``X_test``,
+            ``y_test``, ``cutoff_indexes``: output of
+            ``make_forecasting_splits``; plus ``covariates``, ``task``,
+            ``metrics``, ``prediction_length``, ``freq`` and
+            ``seasonality``.
+
+        Raises
+        ------
+        ValueError
+            If no parquet file is found, the frame is empty, no numeric
+            channel column exists, ``prediction_length`` or ``n_windows``
+            is below 1, or every series is skipped.
+        """
+        parquet_files = self._snapshot()
+        if not parquet_files:
+            raise ValueError(
+                f"No parquet found at {self.dataset_name!r} in "
+                "autogluon/fev_datasets. Valid choices are in FEV_DATASETS."
+            )
+
+        df = pd.concat(
+            [pd.read_parquet(f) for f in parquet_files],
+            ignore_index=True,
+        )
+
+        if self.debug:
+            df = df.head(5)
+        elif self.max_series is not None:
+            df = df.head(int(self.max_series))
+
+        if df.empty:
+            raise ValueError(f"{self.dataset_name!r} contained 0 series.")
+
+        # Column typing is decided from the first row only.
+        first_row = df.iloc[0]
+
+        # Channel cols = non-metadata columns whose entries are numeric
+        # array-likes. Some FEV datasets carry extra scalar/string fields
+        # (``type``, ``Security``) or arrays of strings (holiday names in
+        # ``favorita_stores``, etc.). We treat covariates as out of scope
+        # for the MVP.
+        def _is_numeric_array_col(c):
+            v = first_row[c]
+            if not hasattr(v, "__len__") or isinstance(v, (str, bytes)):
+                return False
+            if len(v) == 0:
+                return False
+            return isinstance(v[0], (int, float, np.integer, np.floating))
+
+        channel_cols = [
+            c for c in df.columns
+            if c not in _METADATA_COLS and _is_numeric_array_col(c)
+        ]
+        if not channel_cols:
+            raise ValueError(
+                f"{self.dataset_name!r} has no channel columns "
+                f"(only {_METADATA_COLS} present)."
+            )
+
+        # Infer freq from the first series' timestamps — same for the
+        # whole config (FEV groups by freq at the directory level for
+        # nested configs, and flat configs are single-freq).
+        inferred_freq = _infer_freq(first_row["timestamp"])
+        canonical_freq, seasonality, default_h = from_pandas(inferred_freq)
+
+        if self.prediction_length is None:
+            pred_len = int(default_h)
+        else:
+            pred_len = int(self.prediction_length)
+        if pred_len < 1:
+            raise ValueError(f"prediction_length must be >= 1; got {pred_len!r}")
+        if self.n_windows < 1:
+            raise ValueError(f"n_windows must be >= 1; got {self.n_windows!r}")
+
+        # Build (T, C) series. Each row's per-channel array has the same
+        # length (T_i); stack on the last axis. Rows whose channels
+        # disagree on length are dropped.
+        series_list = []
+        for values in zip(*(df[c] for c in channel_cols)):
+            channels = [np.asarray(v, dtype=np.float32) for v in values]
+            T = channels[0].shape[0]
+            if any(ch.shape[0] != T for ch in channels):
+                continue
+            series_list.append(np.stack(channels, axis=-1))
+
+        if not series_list:
+            raise ValueError("All series were skipped (inconsistent channel lengths).")
+
+        # The train split ends where the requested test windows start;
+        # at least one observation is always kept for training.
+        test_len = pred_len * self.n_windows
+        X_train, y_train_list, full_series = [], [], []
+        for ts in series_list:
+            if ts.shape[0] < pred_len + 1:
+                continue
+            train_end = max(1, ts.shape[0] - test_len)
+            X_train.append(ts[:train_end])
+            y_train_list.append(ts[train_end: train_end + pred_len])
+            full_series.append(ts)
+
+        if not full_series:
+            raise ValueError("All series are shorter than prediction_length.")
+
+        # Debug runs evaluate a single window regardless of n_windows.
+        n_windows = 1 if self.debug else self.n_windows
+        X_test, cutoff_indexes, y_test = make_forecasting_splits(
+            full_series,
+            prediction_length=pred_len,
+            n_windows=n_windows,
+        )
+
+        return dict(
+            X_train=X_train,
+            y_train=y_train_list,
+            X_test=X_test,
+            y_test=y_test,
+            cutoff_indexes=cutoff_indexes,
+            covariates=Covariates(),
+            task="forecasting",
+            metrics=["mae", "mse", "mase", "smape"],
+            prediction_length=pred_len,
+            freq=canonical_freq,
+            seasonality=seasonality,
+        )