diff --git a/benchmark_utils/constants.py b/benchmark_utils/constants.py
new file mode 100644
index 0000000..5698672
--- /dev/null
+++ b/benchmark_utils/constants.py
@@ -0,0 +1,125 @@
+"""Shared frequency / seasonality tables for forecasting datasets.
+
+Two sources name frequencies differently:
+  - aeon (used by Monash) uses words: "yearly", "weekly", "minutely", ...
+  - GIFT-Eval (and pandas) use offset aliases: "Y", "W-SUN", "5T", ...
+
+This module exposes a single (freq, seasonality, default_horizon) lookup
+keyed on the canonical pandas-style base alias (e.g. "Y", "W", "D"), plus
+two adapters that normalize each source onto that canonical key.
+"""
+
+import re
+
+# Canonical base alias → (display_freq, MASE seasonality, default forecast horizon)
+_BASE = {
+    "Y": ("Y",   1,  6),
+    "Q": ("Q",   4,  8),
+    "M": ("M",  12, 12),
+    "W": ("W",  52, 13),
+    "D": ("D",   7, 14),   # also the row used when a frequency is unrecognized
+    "H": ("H",  24, 24),
+    "T": ("T", 1440, 60),   # minutes; 1440 = 24 * 60
+    "S": ("S",   1, 60),   # seconds
+}
+
+# aeon's spelled-out names → canonical base alias (every value is a key of _BASE)
+_AEON_TO_BASE = {
+    "yearly":    "Y",
+    "quarterly": "Q",
+    "monthly":   "M",
+    "weekly":    "W",
+    "daily":     "D",
+    "hourly":    "H",
+    "minutely":  "T",
+    "seconds":   "S",
+}
+
+
+def from_aeon(freq_word: str) -> tuple[str, int, int]:
+    """Return (freq, seasonality, default_horizon) for an aeon freq word."""
+    alias = _AEON_TO_BASE[freq_word] if freq_word in _AEON_TO_BASE else "D"
+    return _BASE[alias]
+
+
+# Pandas offset aliases: the regex strips a leading multiplier ("5T" → "T");
+# callers drop any anchor suffix first ("W-SUN" → "W", "QS-OCT" → "QS").
+_PANDAS_ALIAS_RE = re.compile(r"^\d*([A-Za-z]+)")
+_NORMALIZE_BASE = {
+    # Other pandas spellings → legacy single-letter aliases used in _BASE.
+    "YE": "Y", "YS": "Y", "A": "Y", "AS": "Y",
+    "QE": "Q", "QS": "Q",
+    "ME": "M", "MS": "M",
+    "min": "T", "MIN": "T", "h": "H", "s": "S",  # pandas >= 2.2 lowercase
+}
+
+
+def from_pandas(freq_alias: str) -> tuple[str, int, int]:
+    """Look up (freq, seasonality, default_horizon) from a pandas freq alias.
+
+    Handles multipliers ("5T") and anchors ("W-SUN", "QS-OCT") by stripping
+    them before lookup. Empty or unknown aliases default to daily.
+    """
+    m = _PANDAS_ALIAS_RE.match(freq_alias.split("-", 1)[0]) if freq_alias else None
+    if not m:
+        return _BASE["D"]
+    head = m.group(1)
+    # Only a one-letter head is upper-cased ("h" → "H"). Longer unmapped heads
+    # keep their case so lowercase "ms"/"us"/"ns" never collapse onto "M"/"U"/"N".
+    base = _NORMALIZE_BASE.get(head) or (head.upper() if len(head) == 1 else head[:1])
+    return _BASE.get(base, _BASE["D"])
+
+
+# ---------------------------------------------------------------------------
+# GIFT-Eval term resolution
+#
+# Mirrors the canonical table in the upstream time-series repo: prediction
+# length is a function of pandas freq, then scaled by a term multiplier
+# (short=1, medium=10, long=15). Used by datasets/gifteval.py so reported
+# numbers line up with the GIFT-Eval leaderboard.
+# ---------------------------------------------------------------------------
+
+GIFT_EVAL_PRED_LENGTH_MAP: dict[str, int] = {
+    # monthly / weekly / daily
+    "M": 12, "MS": 12, "W": 8, "W-SUN": 8, "W-MON": 8, "D": 30,
+    # hourly / minutely
+    "H": 48, "6H": 48,
+    "T": 48, "5T": 48, "10T": 48, "15T": 48, "30T": 48,
+    # secondly
+    "S": 60, "4S": 60,
+    # quarterly / yearly
+    "Q": 8, "Q-DEC": 8, "A": 4, "A-DEC": 4, "Y": 4,
+}
+
+GIFT_EVAL_TERM_MULTIPLIER: dict[str, int] = dict(
+    short=1,
+    medium=10,
+    long=15,
+)
+
+
+def gift_eval_prediction_length(freq: str, term: str) -> int:
+    """Return the GIFT-Eval forecast horizon for a (freq, term) pair.
+
+    ``freq`` is a pandas-style alias such as ``"5T"``, ``"1H"`` or
+    ``"W-SUN"``. An exact table hit wins; otherwise the anchor suffix and
+    leading multiplier are removed ("1H" → "H", "10S" → "S"), newer pandas
+    spellings are normalized ("QE" → "Q"), and the table is tried again.
+    Anything still unresolved uses a base of 48. ``term`` must be
+    ``"short"``, ``"medium"`` or ``"long"``; other values raise
+    ``ValueError``.
+    """
+    if term not in GIFT_EVAL_TERM_MULTIPLIER:
+        raise ValueError(
+            f"term must be one of {list(GIFT_EVAL_TERM_MULTIPLIER)}; got {term!r}"
+        )
+    horizon = GIFT_EVAL_PRED_LENGTH_MAP.get(freq)
+    if horizon is None:
+        # Second chance: look up the bare alias without anchor/multiplier.
+        stripped = _PANDAS_ALIAS_RE.match(freq.split("-", 1)[0])
+        if stripped is not None:
+            alias = stripped.group(1)
+            alias = _NORMALIZE_BASE.get(alias, alias)
+            horizon = GIFT_EVAL_PRED_LENGTH_MAP.get(alias)
+    scale = GIFT_EVAL_TERM_MULTIPLIER[term]
+    return (48 if horizon is None else horizon) * scale
diff --git a/datasets/fev.py b/datasets/fev.py
new file mode 100644
index 0000000..e41d8cb
--- /dev/null
+++ b/datasets/fev.py
@@ -0,0 +1,271 @@
+"""AutoGluon fev_datasets forecasting benchmark
+(huggingface.co/datasets/autogluon/fev_datasets).
+
+The HF repo organizes data either:
+  - per-freq: ``<dataset>/<freq>/train-*.parquet``
+    (e.g. ``ETT/1H``, ``LOOP_SEATTLE/5T``)
+  - flat: ``<dataset>/train-*.parquet``
+    (e.g. ``australian_tourism``)
+  - or with an arbitrary subdir that is NOT a freq (e.g. ``boomlet/<N>``
+    where ``<N>`` is a series id, not a frequency).
+
+We accept the directory path directly as ``dataset_name`` (e.g.
+``"ETT/1H"``, ``"australian_tourism"``) and infer the actual freq from
+each series' ``timestamp`` column rather than parsing the path.
+
+Each parquet row is one series; columns vary:
+  - Always: ``id``, ``timestamp``
+  - Univariate: a ``target`` column (list of floats)
+  - Multivariate (e.g. ``ETT``): no ``target`` column — each channel is
+    its own column (``HUFL``, ..., ``OT``). Channel columns are stacked
+    on the last axis to form ``(T, C)``.
+
+Rolling-window splits match :mod:`datasets.monash`. The default
+``prediction_length`` is the freq-based heuristic from
+:func:`benchmark_utils.constants.from_pandas`; FEV does not publish a
+per-dataset horizon spec, so we don't try to mirror one. Pass
+``prediction_length=N`` explicitly to override.
+"""
+
+import numpy as np
+import pandas as pd
+from benchopt import BaseDataset
+
+from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import from_pandas
+from benchmark_utils.windowing import make_forecasting_splits
+
+
+_METADATA_COLS = ("id", "timestamp")
+
+
+# Canonical list of FEV evaluation configs — directory paths inside
+# https://huggingface.co/datasets/autogluon/fev_datasets that contain at
+# least one ``train-*.parquet`` file. Surfaced via
+# ``Dataset.get_all_parameter_values`` so that ``dataset_name=all`` and
+# ``benchopt info -v`` work.
+FEV_DATASETS: tuple[str, ...] = (
+    "ETT/15T", "ETT/1D", "ETT/1H", "ETT/1W",
+    "LOOP_SEATTLE/1D", "LOOP_SEATTLE/1H", "LOOP_SEATTLE/5T",
+    "M_DENSE/1D", "M_DENSE/1H",
+    "SZ_TAXI/15T", "SZ_TAXI/1H",
+    "australian_tourism",
+    "bizitobs_l2c/1H", "bizitobs_l2c/5T",
+    "boomlet/1062", "boomlet/1209", "boomlet/1225", "boomlet/1230",
+    "boomlet/1282", "boomlet/1487", "boomlet/1631", "boomlet/1676",
+    "boomlet/1855", "boomlet/1975", "boomlet/2187",
+    "boomlet/285", "boomlet/619", "boomlet/772", "boomlet/963",
+    "ecdc_ili",
+    "entsoe/15T", "entsoe/1H", "entsoe/30T",
+    "epf_be", "epf_de", "epf_fr", "epf_np", "epf_pjm",
+    "ercot/1D", "ercot/1H", "ercot/1M", "ercot/1W",
+    "favorita_stores/1D", "favorita_stores/1M", "favorita_stores/1W",
+    "favorita_transactions/1D", "favorita_transactions/1M",
+    "favorita_transactions/1W",
+    "fred_md_2025", "fred_qd_2025",
+    "gvar", "hermes",
+    "hierarchical_sales/1D", "hierarchical_sales/1W",
+    "hospital",
+    "hospital_admissions/1D", "hospital_admissions/1W",
+    "jena_weather/10T", "jena_weather/1D", "jena_weather/1H",
+    "kdd_cup_2022/10T", "kdd_cup_2022/1D", "kdd_cup_2022/30T",
+    "m5/1D", "m5/1M", "m5/1W",
+    "proenfo_bull", "proenfo_cockatoo",
+    "proenfo_gfc12", "proenfo_gfc14", "proenfo_gfc17",
+    "proenfo_hog", "proenfo_pdb",
+    "redset/15T", "redset/1H", "redset/5T",
+    "restaurant",
+    "rohlik_orders/1D", "rohlik_orders/1W",
+    "rohlik_sales/1D", "rohlik_sales/1W",
+    "rossmann/1D", "rossmann/1W",
+    "solar/1D", "solar/1W",
+    "solar_with_weather/15T", "solar_with_weather/1H",
+    "uci_air_quality/1D", "uci_air_quality/1H",
+    "uk_covid_nation/1D", "uk_covid_nation/1W",
+    "uk_covid_utla/1D", "uk_covid_utla/1W",
+    "us_consumption/1M", "us_consumption/1Q", "us_consumption/1Y",
+    "walmart",
+    "world_co2_emissions", "world_life_expectancy", "world_tourism",
+)
+
+
+def _infer_freq(timestamps) -> str:
+    """Guess the pandas frequency alias of one series' timestamps.
+
+    Inspects only the leading five points; failures or ``None`` yield ``"D"``.
+    """
+    fallback = "D"
+    try:
+        inferred = pd.infer_freq(pd.DatetimeIndex(timestamps[:5]))
+    except Exception:
+        return fallback
+    return inferred if inferred else fallback
+
+
+class Dataset(BaseDataset):
+    """AutoGluon fev forecasting dataset.
+
+    Parameters
+    ----------
+    dataset_name : str
+        Directory path inside the HF repo. Per-freq paths look like
+        ``"ETT/1H"`` / ``"LOOP_SEATTLE/5T"``; flat paths like
+        ``"australian_tourism"`` / ``"hospital"``. See ``FEV_DATASETS``
+        for the full list (also discoverable via ``benchopt info -v``).
+    prediction_length : int or None
+        Explicit override. ``None`` → resolved from the inferred freq
+        via :func:`benchmark_utils.constants.from_pandas` (same heuristic
+        used by Monash). FEV does not publish its own per-dataset
+        horizon matrix, so we don't try to align with a leaderboard
+        spec here.
+    n_windows : int
+        Number of rolling evaluation windows per series.
+    max_series : int or None
+        Optional cap on the number of series (ignored when ``debug``).
+    debug : bool
+        If True, keep only the first 5 series and build one test window.
+    """
+
+    name = "FEV"
+
+    requirements = ["pip::pyarrow", "pip::huggingface-hub"]
+
+    parameters = {
+        "dataset_name": ["LOOP_SEATTLE/1H"],
+        "prediction_length": [None],
+        "n_windows": [1],
+        "max_series": [None],
+        "debug": [False],
+    }
+
+    # Cache prepare() by dataset_name only — the other knobs shape the
+    # in-memory view, not the downloaded files.
+    prepare_cache_ignore = (
+        "prediction_length", "n_windows", "max_series", "debug",
+    )
+
+    @classmethod
+    def get_all_parameter_values(cls, name):
+        if name == "dataset_name":
+            return list(FEV_DATASETS)
+        return None
+
+    def prepare(self):
+        """Pre-download parquet shards for this config into HF's cache."""
+        self._snapshot()
+
+    def _snapshot(self) -> "list[str]":
+        """Snapshot-download parquet files for this dataset_name and
+        return their sorted local paths (empty list when none match)."""
+        from huggingface_hub import snapshot_download
+        from pathlib import Path
+
+        local_root = snapshot_download(
+            "autogluon/fev_datasets",
+            repo_type="dataset",
+            allow_patterns=f"{self.dataset_name}/*.parquet",
+        )
+        return sorted(
+            str(p) for p in (Path(local_root) / self.dataset_name).glob("*.parquet")
+        )
+
+    def get_data(self):
+        parquet_files = self._snapshot()
+        if not parquet_files:
+            raise ValueError(
+                f"No parquet found at {self.dataset_name!r} in "
+                "autogluon/fev_datasets. Valid choices are in FEV_DATASETS."
+            )
+
+        df = pd.concat(
+            [pd.read_parquet(f) for f in parquet_files],
+            ignore_index=True,
+        )
+
+        if self.debug:
+            df = df.head(5)
+        elif self.max_series is not None:
+            df = df.head(int(self.max_series))
+
+        if df.empty:
+            raise ValueError(f"{self.dataset_name!r} contained 0 series.")
+
+        # Channel cols = non-metadata columns whose entries are numeric
+        # array-likes. Some FEV datasets carry extra scalar/string fields
+        # (``type``, ``Security``) or arrays of strings (holiday names in
+        # ``favorita_stores``, etc.). We treat covariates as out of scope
+        # for the MVP. Only the first row is inspected to classify a column.
+        def _is_numeric_array_col(c):
+            v = df.iloc[0][c]
+            if not hasattr(v, "__len__") or isinstance(v, (str, bytes)):
+                return False
+            if len(v) == 0:
+                return False
+            return isinstance(v[0], (int, float, np.integer, np.floating))
+
+        channel_cols = [
+            c for c in df.columns
+            if c not in _METADATA_COLS and _is_numeric_array_col(c)
+        ]
+        if not channel_cols:
+            raise ValueError(
+                f"{self.dataset_name!r} has no channel columns "
+                f"(only {_METADATA_COLS} present)."
+            )
+
+        # Infer freq from the first series' timestamps — same for the
+        # whole config (FEV groups by freq at the directory level for
+        # nested configs, and flat configs are single-freq).
+        inferred_freq = _infer_freq(df.iloc[0]["timestamp"])
+        canonical_freq, seasonality, default_h = from_pandas(inferred_freq)
+
+        pred_len = self.prediction_length
+        if pred_len is None:
+            pred_len = int(default_h)
+
+        # Build (T, C) series by stacking channels on the last axis; rows
+        # whose per-channel arrays differ in length are skipped.
+        series_list = []
+        for _, row in df.iterrows():
+            channels = [np.asarray(row[c], dtype=np.float32) for c in channel_cols]
+            T = channels[0].shape[0]
+            if any(ch.shape[0] != T for ch in channels):
+                continue
+            series_list.append(np.stack(channels, axis=-1))
+
+        if not series_list:
+            raise ValueError("All series were skipped (inconsistent channel lengths).")
+
+        test_len = pred_len * self.n_windows
+        X_train, y_train_list, full_series = [], [], []
+        for ts in series_list:
+            if ts.shape[0] < pred_len + 1:
+                continue
+            train_end = max(1, ts.shape[0] - test_len)
+            X_train.append(ts[:train_end])
+            y_train_list.append(ts[train_end: train_end + pred_len])
+            full_series.append(ts)
+
+        if not full_series:
+            raise ValueError("All series are shorter than prediction_length.")
+
+        n_windows = 1 if self.debug else self.n_windows
+        X_test, cutoff_indexes, y_test = make_forecasting_splits(
+            full_series,
+            prediction_length=pred_len,
+            n_windows=n_windows,
+        )
+
+        return dict(
+            X_train=X_train,
+            y_train=y_train_list,
+            X_test=X_test,
+            y_test=y_test,
+            cutoff_indexes=cutoff_indexes,
+            covariates=Covariates(),
+            task="forecasting",
+            metrics=["mae", "mse", "mase", "smape"],
+            prediction_length=pred_len,
+            freq=canonical_freq,
+            seasonality=seasonality,
+        )
diff --git a/datasets/gifteval.py b/datasets/gifteval.py
new file mode 100644
index 0000000..c8661c8
--- /dev/null
+++ b/datasets/gifteval.py
@@ -0,0 +1,395 @@
+"""GIFT-Eval forecasting benchmark dataset (Salesforce/GiftEval on HF).
+
+Parametrization
+---------------
+The class exposes two orthogonal parameters that drive the leaderboard
+matrix:
+
+* ``dataset_name`` — one of 55 canonical ``<name>/<freq>`` paths (e.g.
+  ``"m4_weekly/W"``, ``"loop_seattle/H"``). The full list is
+  :data:`GIFTEVAL_DATASETS`.
+* ``term`` — one of ``short`` / ``medium`` / ``long``, controlling the
+  forecast horizon (×1, ×10, ×15 of the per-freq base).
+
+Both are surfaced via ``get_all_parameter_values`` so that
+``-d "GiftEval[dataset_name=all,term=short]"`` and ``benchopt info -v``
+work.
+
+Canonical-combo gating
+----------------------
+GIFT-Eval scores only **97** of the 55 × 3 = 165 possible ``(path,
+term)`` combinations on its public leaderboard. The 34 short-only paths
+do not define ``medium`` / ``long``. We track the canonical set in
+:data:`CANONICAL_COMBOS` and gate runs at the dataset level: when
+``(dataset_name, term)`` is not canonical, ``get_data()`` short-circuits
+and returns a placeholder dict carrying a ``_skip_reason`` field.
+:meth:`Objective.skip` (see ``objective.py``) honors that field and
+skips the combo cleanly.
+
+So:
+
+* ``-d "GiftEval[dataset_name=all,term=short]"`` → 55 canonical runs.
+* ``-d "GiftEval[dataset_name=all,term=long]"``  → 55 attempts,
+  21 canonical runs, 34 skipped.
+* ``-d "GiftEval[dataset_name=all,term=all]"``   → 165 attempts,
+  97 canonical runs, 68 skipped.
+
+Leaderboard names vs HF directory names
+---------------------------------------
+The leaderboard uses lowercase, paper-style identifiers (e.g.
+``loop_seattle/H``, ``m_dense/D``, ``car_parts/M``) while the HF repo
+``Salesforce/GiftEval`` uses mixed-case directory names that don't
+always match (``LOOP_SEATTLE/H``, ``M_DENSE/D``,
+``car_parts_with_missing/``). We accept leaderboard names — that's what
+appears in the paper, the leaderboard, and the gift-eval README — and
+translate to HF paths internally via :data:`_LEADERBOARD_TO_HF`. Cases:
+
+  * Pure case difference: ``loop_seattle`` → ``LOOP_SEATTLE``,
+    ``m_dense`` → ``M_DENSE``, ``sz_taxi`` → ``SZ_TAXI``.
+  * Missing-data suffix: ``car_parts`` → ``car_parts_with_missing``,
+    ``kdd_cup_2018`` → ``kdd_cup_2018_with_missing``,
+    ``temperature_rain`` → ``temperature_rain_with_missing``.
+  * Rename: ``saugeen`` → ``saugeenday``.
+  * Leaderboard adds a freq segment for HF-flat datasets: leaderboard
+    ``m4_yearly/A`` → HF flat ``m4_yearly`` (the freq is implicit in the
+    data, not the path). Likewise for the other ``m4_*``,
+    ``car_parts/M``, ``covid_deaths/D``, ``hospital/M``,
+    ``restaurant/D``, ``temperature_rain/D``,
+    ``bizitobs_application/10S``, ``bizitobs_service/10S``.
+
+Schema
+------
+Each HF entry exposes ``item_id``, ``start``, ``freq``, ``target``.
+``target`` is a flat ``List[float]`` for univariate configs and a
+``List[List[float]]`` of shape ``(C, T)`` for multivariate ones (e.g.
+``bitbrains_*``, ``electricity/*``, ``ett1/*``, ``ett2/*``,
+``jena_weather/*``, ``solar/*``). Both shapes are handled — multivariate
+entries are transposed to the repo's ``(T, C)`` contract.
+
+Cutoffs and windows
+-------------------
+We don't comply with GIFT-Eval's prescribed test cutoff; we use the same
+rolling-window logic as Monash via
+:func:`benchmark_utils.windowing.make_forecasting_splits`. The
+``prediction_length`` for a given (freq, term) follows GIFT-Eval's
+canonical ``base × multiplier`` rule via
+:func:`benchmark_utils.constants.gift_eval_prediction_length`.
+
+Data contract output mirrors :mod:`datasets.monash`.
+"""
+
+import numpy as np
+from benchopt import BaseDataset
+
+from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import (
+    from_pandas,
+    gift_eval_prediction_length,
+)
+from benchmark_utils.windowing import make_forecasting_splits
+
+
+# ---------------------------------------------------------------------------
+# Single source of truth: leaderboard ``<name>/<freq>`` path → tuple of
+# terms that path defines. Derived from
+# gift-eval/results/*/all_results.csv. 55 paths, 97 (path, term) pairs;
+# 34 paths are short-only, 21 define all three.
+# ---------------------------------------------------------------------------
+_LEADERBOARD: dict[str, tuple[str, ...]] = {
+    "bitbrains_fast_storage/5T":   ("short", "medium", "long"),
+    "bitbrains_fast_storage/H":    ("short",),
+    "bitbrains_rnd/5T":            ("short", "medium", "long"),
+    "bitbrains_rnd/H":             ("short",),
+    "bizitobs_application/10S":    ("short", "medium", "long"),
+    "bizitobs_l2c/5T":             ("short", "medium", "long"),
+    "bizitobs_l2c/H":              ("short", "medium", "long"),
+    "bizitobs_service/10S":        ("short", "medium", "long"),
+    "car_parts/M":                 ("short",),
+    "covid_deaths/D":              ("short",),
+    "electricity/15T":             ("short", "medium", "long"),
+    "electricity/D":               ("short",),
+    "electricity/H":               ("short", "medium", "long"),
+    "electricity/W":               ("short",),
+    "ett1/15T":                    ("short", "medium", "long"),
+    "ett1/D":                      ("short",),
+    "ett1/H":                      ("short", "medium", "long"),
+    "ett1/W":                      ("short",),
+    "ett2/15T":                    ("short", "medium", "long"),
+    "ett2/D":                      ("short",),
+    "ett2/H":                      ("short", "medium", "long"),
+    "ett2/W":                      ("short",),
+    "hierarchical_sales/D":        ("short",),
+    "hierarchical_sales/W":        ("short",),
+    "hospital/M":                  ("short",),
+    "jena_weather/10T":            ("short", "medium", "long"),
+    "jena_weather/D":              ("short",),
+    "jena_weather/H":              ("short", "medium", "long"),
+    "kdd_cup_2018/D":              ("short",),
+    "kdd_cup_2018/H":              ("short", "medium", "long"),
+    "loop_seattle/5T":             ("short", "medium", "long"),
+    "loop_seattle/D":              ("short",),
+    "loop_seattle/H":              ("short", "medium", "long"),
+    "m4_daily/D":                  ("short",),
+    "m4_hourly/H":                 ("short",),
+    "m4_monthly/M":                ("short",),
+    "m4_quarterly/Q":              ("short",),
+    "m4_weekly/W":                 ("short",),
+    "m4_yearly/A":                 ("short",),
+    "m_dense/D":                   ("short",),
+    "m_dense/H":                   ("short", "medium", "long"),
+    "restaurant/D":                ("short",),
+    "saugeen/D":                   ("short",),
+    "saugeen/M":                   ("short",),
+    "saugeen/W":                   ("short",),
+    "solar/10T":                   ("short", "medium", "long"),
+    "solar/D":                     ("short",),
+    "solar/H":                     ("short", "medium", "long"),
+    "solar/W":                     ("short",),
+    "sz_taxi/15T":                 ("short", "medium", "long"),
+    "sz_taxi/H":                   ("short",),
+    "temperature_rain/D":          ("short",),
+    "us_births/D":                 ("short",),
+    "us_births/M":                 ("short",),
+    "us_births/W":                 ("short",),
+}
+
+
+# Public derived constants — what users and CLI tooling reference.
+GIFTEVAL_DATASETS: tuple[str, ...] = tuple(sorted(_LEADERBOARD))
+GIFTEVAL_TERMS: tuple[str, ...] = ("short", "medium", "long")
+CANONICAL_COMBOS: frozenset[tuple[str, str]] = frozenset(
+    (path, term) for path, terms in _LEADERBOARD.items() for term in terms
+)
+
+
+# ---------------------------------------------------------------------------
+# Leaderboard ``<name>`` → HF top-level directory name. Only entries that
+# differ from the lowercase identity mapping appear here.
+# ---------------------------------------------------------------------------
+_LEADERBOARD_TO_HF: dict[str, str] = {
+    "loop_seattle":     "LOOP_SEATTLE",
+    "m_dense":          "M_DENSE",
+    "sz_taxi":          "SZ_TAXI",
+    "car_parts":        "car_parts_with_missing",
+    "kdd_cup_2018":     "kdd_cup_2018_with_missing",
+    "temperature_rain": "temperature_rain_with_missing",
+    "saugeen":          "saugeenday",
+}
+
+
+# ---------------------------------------------------------------------------
+# Datasets that live as a single arrow file directly under the dataset
+# name (no per-freq subdir on HF). The leaderboard still adds a freq
+# segment to their paths (e.g. ``m4_yearly/A``, ``hospital/M``), which we
+# strip before locating the file.
+# ---------------------------------------------------------------------------
+_HF_FLAT_DATASETS: frozenset[str] = frozenset({
+    "bizitobs_application", "bizitobs_service",
+    "car_parts_with_missing", "covid_deaths", "hospital",
+    "m4_daily", "m4_hourly", "m4_monthly", "m4_quarterly",
+    "m4_weekly", "m4_yearly",
+    "restaurant", "temperature_rain_with_missing",
+})
+
+
+def _hf_arrow_directory(leaderboard_path: str) -> str:
+    """Map a leaderboard ``<name>/<freq>`` path onto the HF directory
+    that holds its arrow file.
+
+    Examples
+    --------
+        ``"m4_weekly/W"``       → ``"m4_weekly"`` (HF-flat, drops freq)
+        ``"loop_seattle/H"``    → ``"LOOP_SEATTLE/H"`` (case-renamed)
+        ``"car_parts/M"``       → ``"car_parts_with_missing"`` (HF-flat + suffix)
+    """
+    name, _sep, freq = leaderboard_path.partition("/")
+    # Names absent from the rename table are used unchanged.
+    directory = _LEADERBOARD_TO_HF.get(name, name)
+    # Flat datasets (or inputs without a freq segment) get no subdirectory.
+    if directory in _HF_FLAT_DATASETS or not freq:
+        return directory
+    return f"{directory}/{freq}"
+
+
+def _skip_placeholder(reason: str) -> dict:
+    """Build a stub data dict that ``Objective.set_data`` accepts while
+    its ``_skip_reason`` entry tells ``Objective.skip`` to skip the combo."""
+    return {
+        "X_train": [],
+        "y_train": [],
+        "X_test": [],
+        "y_test": [],
+        "cutoff_indexes": [],
+        "covariates": Covariates(),
+        "task": "forecasting",
+        "metrics": [],
+        "prediction_length": 1,
+        "freq": "D",
+        "seasonality": 1,
+        "_skip_reason": reason,
+    }
+
+
+class Dataset(BaseDataset):
+    """GIFT-Eval forecasting dataset (loaded from HF Salesforce/GiftEval).
+
+    Parameters
+    ----------
+    dataset_name : str
+        One of 55 canonical leaderboard paths — ``<name>/<freq>``, e.g.
+        ``"m4_weekly/W"``, ``"loop_seattle/H"``. See
+        :data:`GIFTEVAL_DATASETS`.
+    term : str
+        ``"short"`` / ``"medium"`` / ``"long"``. Combos not in
+        :data:`CANONICAL_COMBOS` are skipped (placeholder + objective
+        ``skip``), so ``dataset_name=all, term=long`` runs only the 21
+        paths that define ``long``.
+    prediction_length : int or None
+        Explicit override. ``None`` → resolved from (freq, term) via
+        :func:`benchmark_utils.constants.gift_eval_prediction_length`.
+    n_windows : int
+        Number of rolling evaluation windows per series.
+    max_series : int or None
+        Optional cap on the number of series (ignored when ``debug``).
+    debug : bool
+        If True, keep only the first 5 series and build one test window.
+    """
+
+    name = "GiftEval"
+
+    requirements = ["pip::datasets", "pip::huggingface-hub"]
+
+    parameters = {
+        "dataset_name": ["m4_weekly/W"],
+        "term": ["short"],
+        "prediction_length": [None],
+        "n_windows": [1],
+        "max_series": [None],
+        "debug": [False],
+    }
+
+    # ``prepare()`` depends on ``dataset_name`` only — ``term`` and the
+    # other knobs shape the in-memory view, not the downloaded files.
+    prepare_cache_ignore = (
+        "term", "prediction_length", "n_windows", "max_series", "debug",
+    )
+
+    @classmethod
+    def get_all_parameter_values(cls, name):
+        if name == "dataset_name":
+            return list(GIFTEVAL_DATASETS)
+        if name == "term":
+            return list(GIFTEVAL_TERMS)
+        return None
+
+    def prepare(self):
+        """Pre-download arrow shards for this config into HF's cache."""
+        self._snapshot()
+
+    def _snapshot(self) -> "list[str]":
+        """Snapshot-download the arrow files for this dataset and return
+        their sorted local paths (empty list when none match)."""
+        from huggingface_hub import snapshot_download
+        from pathlib import Path
+
+        hf_path = _hf_arrow_directory(self.dataset_name)
+        local_root = snapshot_download(
+            "Salesforce/GiftEval",
+            repo_type="dataset",
+            allow_patterns=f"{hf_path}/*.arrow",
+        )
+        return sorted(str(p) for p in (Path(local_root) / hf_path).glob("*.arrow"))
+
+    def get_data(self):
+        from datasets import Dataset as HFDataset
+
+        # Short-circuit non-canonical combos so heavy parsing doesn't run.
+        if (self.dataset_name, self.term) not in CANONICAL_COMBOS:
+            return _skip_placeholder(
+                f"non-canonical GIFT-Eval combo: {self.dataset_name!r} does "
+                f"not define term {self.term!r} on the leaderboard"
+            )
+
+        arrow_files = self._snapshot()
+        if not arrow_files:
+            raise ValueError(
+                f"No Arrow file found for GIFT-Eval dataset "
+                f"{self.dataset_name!r}. Valid choices are in GIFTEVAL_DATASETS."
+            )
+
+        rows = []
+        for f in arrow_files:
+            rows.extend(HFDataset.from_file(f))
+
+        if self.debug:
+            rows = rows[:5]
+        elif self.max_series is not None:
+            rows = rows[: int(self.max_series)]
+
+        if not rows:
+            raise ValueError(
+                f"GIFT-Eval dataset {self.dataset_name!r} returned 0 series."
+            )
+
+        # Frequency / seasonality — every series in a GIFT-Eval subset
+        # shares the same freq, so taking it from the first entry is safe.
+        pandas_freq = rows[0].get("freq") or "D"
+        freq, seasonality, _ = from_pandas(pandas_freq)
+
+        pred_len = self.prediction_length
+        if pred_len is None:
+            pred_len = gift_eval_prediction_length(pandas_freq, self.term)
+
+        # Build (T, C) series. Univariate entries arrive as flat
+        # ``List[float]`` (ndim=1); multivariate as ``List[List[float]]``
+        # of shape ``(C, T)``. Entries of any other rank are dropped.
+        series_list = []
+        for r in rows:
+            values = np.asarray(r["target"], dtype=np.float32)
+            if values.ndim == 1:
+                series_list.append(values.reshape(-1, 1))         # (T, 1)
+            elif values.ndim == 2:
+                series_list.append(values.T)                        # (C,T)→(T,C)
+
+        if not series_list:
+            raise ValueError(
+                f"All entries in GIFT-Eval dataset {self.dataset_name!r} "
+                "had unsupported target shapes."
+            )
+
+        # Training portion: everything except the last test windows.
+        test_len = pred_len * self.n_windows
+        X_train, y_train_list, full_series = [], [], []
+        for ts in series_list:
+            if ts.shape[0] < pred_len + 1:
+                continue
+            train_end = max(1, ts.shape[0] - test_len)
+            X_train.append(ts[:train_end])
+            y_train_list.append(ts[train_end: train_end + pred_len])
+            full_series.append(ts)
+
+        if not full_series:
+            raise ValueError(
+                "All series are shorter than prediction_length."
+            )
+
+        n_windows = 1 if self.debug else self.n_windows
+        X_test, cutoff_indexes, y_test = make_forecasting_splits(
+            full_series,
+            prediction_length=pred_len,
+            n_windows=n_windows,
+        )
+
+        return dict(
+            X_train=X_train,
+            y_train=y_train_list,
+            X_test=X_test,
+            y_test=y_test,
+            cutoff_indexes=cutoff_indexes,
+            covariates=Covariates(),  # GIFT-Eval HF schema has no covariates
+            task="forecasting",
+            metrics=["mae", "mse", "mase", "smape"],
+            prediction_length=pred_len,
+            freq=freq,
+            seasonality=seasonality,
+        )
diff --git a/datasets/monash.py b/datasets/monash.py
index a607f64..f7f6ed3 100644
--- a/datasets/monash.py
+++ b/datasets/monash.py
@@ -36,30 +36,9 @@
 from benchopt import BaseDataset
 
 from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import from_aeon
 from benchmark_utils.windowing import make_forecasting_splits
 
-# Map aeon frequency strings → pandas-style freq codes and MASE seasonality
-_FREQ_MAP = {
-    "yearly": ("Y", 1),
-    "quarterly": ("Q", 4),
-    "monthly": ("M", 12),
-    "weekly": ("W", 52),
-    "daily": ("D", 7),
-    "hourly": ("H", 24),
-    "minutely": ("T", 1440),
-    "seconds": ("S", 1),
-}
-
-_DEFAULT_HORIZON = {
-    "Y": 6,
-    "Q": 8,
-    "M": 12,
-    "W": 13,
-    "D": 14,
-    "H": 24,
-    "T": 60,
-}
-
 
 class Dataset(BaseDataset):
     """Monash forecasting dataset (loaded via aeon).
@@ -88,6 +67,20 @@ class Dataset(BaseDataset):
         "debug": [False],
     }
 
+    # Only dataset_name decides what aeon downloads; the other knobs
+    # affect the in-memory split, not the file on disk.
+    prepare_cache_ignore = ("prediction_length", "n_windows", "debug")
+
+    def prepare(self):
+        """Populate aeon's on-disk cache for this dataset.
+
+        Per the aeon behaviour this relies on, the first
+        ``load_forecasting`` call stores the ``.tsf`` under
+        ``~/.aeon/datasets/local_data/<name>/<name>.tsf``; the parsed
+        result is discarded because only that download is wanted.
+        """
+        _ = load_forecasting(self.dataset_name, return_metadata=False)
+
     def get_data(self):
         df, meta = load_forecasting(self.dataset_name, return_metadata=True)
         # df columns: series_name, start_timestamp, series_value
@@ -95,13 +88,11 @@ def get_data(self):
         #             contain_missing_values, contain_equal_length
 
         aeon_freq = meta.get("frequency", "yearly")
-        freq, seasonality = _FREQ_MAP.get(aeon_freq, ("D", 1))
+        freq, seasonality, default_h = from_aeon(aeon_freq)
 
         pred_len = self.prediction_length
         if pred_len is None:
-            pred_len = int(
-                meta.get("forecast_horizon") or _DEFAULT_HORIZON.get(freq, 10)
-            )
+            pred_len = int(meta.get("forecast_horizon") or default_h)
 
         series_list = []
         rows = df.iterrows() if not self.debug else list(df.iterrows())[:5]
diff --git a/objective.py b/objective.py
index 5964026..9ad2ead 100644
--- a/objective.py
+++ b/objective.py
@@ -104,8 +104,25 @@ def set_data(
         self.covariates = covariates if covariates is not None else Covariates()
         self.task = task
         self.metrics = metrics
+        # Pull any skip marker out of meta so it doesn't leak into
+        # ``get_objective()`` payloads.
+        self._skip_reason = meta.pop("_skip_reason", None)
         self.meta = meta  # freq, prediction_length, n_classes, …
 
+    def skip(self, **data):
+        """Skip the run when the dataset supplied a ``_skip_reason``.
+
+        A dataset may prune its own parameter grid (e.g.
+        :mod:`datasets.gifteval` dropping non-leaderboard (path, term)
+        combos) by returning ``_skip_reason="..."`` from ``get_data()``.
+        A truthy reason is handed back as ``(True, reason)`` so benchopt
+        records a clean skip; otherwise ``(False, None)`` is returned.
+        """
+        skip_reason = data.get("_skip_reason")
+        if not skip_reason:
+            return False, None
+        return True, skip_reason
+
     # ------------------------------------------------------------------
     # Passed to the solver
     # ------------------------------------------------------------------