diff --git a/benchmark_utils/constants.py b/benchmark_utils/constants.py
new file mode 100644
index 0000000..5698672
--- /dev/null
+++ b/benchmark_utils/constants.py
@@ -0,0 +1,168 @@
+"""Shared frequency / seasonality tables for forecasting datasets.
+
+Two sources name frequencies differently:
+  - aeon (used by Monash) uses words: "yearly", "weekly", "minutely", ...
+  - GIFT-Eval (and pandas) use offset aliases: "Y", "W-SUN", "5T", ...
+
+This module exposes a single canonical (freq, seasonality) lookup keyed on
+the canonical pandas-style base alias (e.g. "Y", "W", "D"), plus two
+adapters that normalize each source onto that canonical key.
+"""
+
+import re
+
+# Canonical base alias → (display_freq, MASE seasonality, default forecast horizon)
+# Seasonality counts steps of ONE base unit; ``from_pandas`` rescales it when
+# the alias carries a multiplier ("5T" → 1440 / 5 = 288).
+_BASE = {
+    "Y": ("Y", 1, 6),
+    "Q": ("Q", 4, 8),
+    "M": ("M", 12, 12),
+    "W": ("W", 52, 13),
+    "D": ("D", 7, 14),
+    "H": ("H", 24, 24),
+    "T": ("T", 1440, 60),  # minutes
+    "S": ("S", 1, 60),
+}
+
+# aeon's spelled-out names → pandas-style alias (a multiplier is allowed).
+# "half_hourly", "10_minutes" and "4_seconds" are the sub-hourly spellings
+# found in Monash ``.tsf`` headers.
+_AEON_TO_BASE = {
+    "yearly": "Y",
+    "quarterly": "Q",
+    "monthly": "M",
+    "weekly": "W",
+    "daily": "D",
+    "hourly": "H",
+    "half_hourly": "30T",
+    "10_minutes": "10T",
+    "minutely": "T",
+    "4_seconds": "4S",
+    "seconds": "S",
+}
+
+
+def from_aeon(freq_word: str) -> tuple[str, int, int]:
+    """Look up (freq, seasonality, default_horizon) from an aeon freq word.
+
+    Sub-hourly words carry their multiplier ("half_hourly" → "30T"), so the
+    seasonality is rescaled by :func:`from_pandas`. Unknown words default
+    to daily.
+    """
+    return from_pandas(_AEON_TO_BASE.get(freq_word, "D"))
+
+
+# Pandas offset aliases: skip a leading multiplier and any anchor suffix
+# (e.g. "5T" → "T", "W-SUN" → "W", "QS-OCT" → "QS", "YE" → "YE").
+_PANDAS_ALIAS_RE = re.compile(r"^\d*([A-Za-z]+)")
+_NORMALIZE_BASE = {
+    # Newer / business-calendar pandas spellings → single-letter aliases used
+    # in _BASE. Matched case-sensitively so that "ms" (millisecond) is never
+    # confused with "MS" (month start).
+    "YE": "Y", "YS": "Y", "A": "Y", "AS": "Y",
+    "BYE": "Y", "BYS": "Y", "BA": "Y", "BAS": "Y",
+    "QE": "Q", "QS": "Q", "BQE": "Q", "BQS": "Q", "BQ": "Q",
+    "ME": "M", "MS": "M", "BME": "M", "BMS": "M", "BM": "M",
+    "min": "T", "MIN": "T",
+}
+
+
+def _parse_pandas_alias(freq_alias: str) -> "tuple[int, str | None]":
+    """Split a pandas alias into ``(multiplier, canonical base)``.
+
+    The anchor suffix ("-SUN", "-DEC") is dropped and a missing or zero
+    multiplier counts as 1. The base is ``None`` when the alias does not map
+    onto a key of ``_BASE``.
+    """
+    if not freq_alias:
+        return 1, None
+    stem = freq_alias.split("-", 1)[0]
+    m = _PANDAS_ALIAS_RE.match(stem)
+    if not m:
+        return 1, None
+    head = m.group(1)
+    base = _NORMALIZE_BASE.get(head)
+    if base is None and head.upper() in _BASE:
+        # Only the single-letter aliases are case-insensitive ("h", "s").
+        base = head.upper()
+    digits = stem[:m.start(1)]
+    return max(int(digits or 1), 1), base
+
+
+def from_pandas(freq_alias: str) -> tuple[str, int, int]:
+    """Look up (freq, seasonality, default_horizon) from a pandas freq alias.
+
+    Handles multipliers ("5T") and anchors ("W-SUN", "QS-OCT"). The
+    seasonality is divided by the multiplier when it divides evenly
+    ("5T" → 288, "6H" → 4) and is 1 otherwise. Unknown aliases default to
+    daily.
+    """
+    mult, base = _parse_pandas_alias(freq_alias)
+    if base is None:
+        return _BASE["D"]
+    display, seasonality, horizon = _BASE[base]
+    if mult > 1:
+        seasonality = seasonality // mult if seasonality % mult == 0 else 1
+    return display, seasonality, horizon
+
+
+# ---------------------------------------------------------------------------
+# GIFT-Eval term resolution
+#
+# Mirrors the canonical table in the upstream time-series repo: prediction
+# length is a function of pandas freq, then scaled by a term multiplier
+# (short=1, medium=10, long=15). Used by datasets/gifteval.py so reported
+# numbers line up with the GIFT-Eval leaderboard.
+# ---------------------------------------------------------------------------
+
+GIFT_EVAL_PRED_LENGTH_MAP: dict[str, int] = {
+    "M": 12, "MS": 12,
+    "W": 8, "W-SUN": 8, "W-MON": 8,
+    "D": 30,
+    "H": 48, "6H": 48,
+    "T": 48, "5T": 48, "10T": 48, "15T": 48, "30T": 48,
+    "S": 60, "4S": 60,
+    "Q": 8, "Q-DEC": 8,
+    "A": 4, "A-DEC": 4,
+    "Y": 4,
+}
+
+# M4 subsets keep the M4 competition horizons instead of the generic
+# per-freq table above. Keyed on the canonical base alias.
+GIFT_EVAL_M4_PRED_LENGTH_MAP: dict[str, int] = {
+    "Y": 6, "Q": 8, "M": 18,
+    "W": 13, "D": 14, "H": 48,
+}
+
+GIFT_EVAL_TERM_MULTIPLIER: dict[str, int] = {
+    "short": 1,
+    "medium": 10,
+    "long": 15,
+}
+
+
+def gift_eval_prediction_length(
+    freq: str, term: str, dataset_name: "str | None" = None
+) -> int:
+    """Resolve the GIFT-Eval prediction length for a (freq, term) pair.
+
+    ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"1H"``, ``"W-SUN"``).
+    Lookup falls back through: M4 table when ``dataset_name`` contains
+    ``"m4"`` → exact match → canonical base alias ("1H" → "H", "10s" → "S",
+    "QE-DEC" → "Q") → default 48. ``term`` must be one of ``"short"``,
+    ``"medium"``, ``"long"``; anything else raises ``ValueError``.
+    """
+    if term not in GIFT_EVAL_TERM_MULTIPLIER:
+        raise ValueError(
+            f"term must be one of {list(GIFT_EVAL_TERM_MULTIPLIER)}; got {term!r}"
+        )
+    _, canonical = _parse_pandas_alias(freq)
+    base = None
+    if dataset_name and "m4" in dataset_name:
+        base = GIFT_EVAL_M4_PRED_LENGTH_MAP.get(canonical)
+    if base is None:
+        base = GIFT_EVAL_PRED_LENGTH_MAP.get(freq)
+    if base is None:
+        base = GIFT_EVAL_PRED_LENGTH_MAP.get(canonical, 48)
+    return base * GIFT_EVAL_TERM_MULTIPLIER[term]
diff --git a/datasets/fev.py b/datasets/fev.py
new file mode 100644
index 0000000..e41d8cb
--- /dev/null
+++ b/datasets/fev.py
@@ -0,0 +1,271 @@
+"""AutoGluon fev_datasets forecasting benchmark
+(huggingface.co/datasets/autogluon/fev_datasets).
+
+The HF repo organizes data either:
+  - per-freq: ``{dataset}/{freq}/train-*.parquet``
+    (e.g. ``ETT/1H``, ``LOOP_SEATTLE/5T``)
+  - flat: ``{dataset}/train-*.parquet``
+    (e.g. ``australian_tourism``)
+  - or with an arbitrary subdir that is NOT a freq (e.g. ``boomlet/{id}``
+    where ``{id}`` is a series id, not a frequency).
+
+We accept the directory path directly as ``dataset_name`` (e.g.
+``"ETT/1H"``, ``"australian_tourism"``) and infer the actual freq from
+each series' ``timestamp`` column rather than parsing the path.
+
+Each parquet row is one series; columns vary:
+  - Always: ``id``, ``timestamp``
+  - Univariate: a ``target`` column (list of floats)
+  - Multivariate (e.g. ``ETT``): no ``target`` column — each channel is
+    its own column (``HUFL``, ..., ``OT``). Channel columns are stacked
+    on the last axis to form ``(T, C)``.
+
+Rolling-window splits match :mod:`datasets.monash`. The default
+``prediction_length`` is the freq-based heuristic from
+:func:`benchmark_utils.constants.from_pandas`; FEV does not publish a
+per-dataset horizon spec, so we don't try to mirror one. Pass
+``prediction_length=N`` explicitly to override.
+"""
+
+import numpy as np
+import pandas as pd
+from benchopt import BaseDataset
+
+from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import from_pandas
+from benchmark_utils.windowing import make_forecasting_splits
+
+
+_METADATA_COLS = ("id", "timestamp")
+
+
+# Canonical list of FEV evaluation configs — directory paths inside
+# https://huggingface.co/datasets/autogluon/fev_datasets that contain at
+# least one ``train-*.parquet`` file. Surfaced via
+# ``get_all_parameter_values`` so that ``dataset_name=all`` and ``benchopt
+# info -v`` work.
+FEV_DATASETS: tuple[str, ...] = (
+    "ETT/15T", "ETT/1D", "ETT/1H", "ETT/1W",
+    "LOOP_SEATTLE/1D", "LOOP_SEATTLE/1H", "LOOP_SEATTLE/5T",
+    "M_DENSE/1D", "M_DENSE/1H",
+    "SZ_TAXI/15T", "SZ_TAXI/1H",
+    "australian_tourism",
+    "bizitobs_l2c/1H", "bizitobs_l2c/5T",
+    "boomlet/1062", "boomlet/1209", "boomlet/1225", "boomlet/1230",
+    "boomlet/1282", "boomlet/1487", "boomlet/1631", "boomlet/1676",
+    "boomlet/1855", "boomlet/1975", "boomlet/2187",
+    "boomlet/285", "boomlet/619", "boomlet/772", "boomlet/963",
+    "ecdc_ili",
+    "entsoe/15T", "entsoe/1H", "entsoe/30T",
+    "epf_be", "epf_de", "epf_fr", "epf_np", "epf_pjm",
+    "ercot/1D", "ercot/1H", "ercot/1M", "ercot/1W",
+    "favorita_stores/1D", "favorita_stores/1M", "favorita_stores/1W",
+    "favorita_transactions/1D", "favorita_transactions/1M",
+    "favorita_transactions/1W",
+    "fred_md_2025", "fred_qd_2025",
+    "gvar", "hermes",
+    "hierarchical_sales/1D", "hierarchical_sales/1W",
+    "hospital",
+    "hospital_admissions/1D", "hospital_admissions/1W",
+    "jena_weather/10T", "jena_weather/1D", "jena_weather/1H",
+    "kdd_cup_2022/10T", "kdd_cup_2022/1D", "kdd_cup_2022/30T",
+    "m5/1D", "m5/1M", "m5/1W",
+    "proenfo_bull", "proenfo_cockatoo",
+    "proenfo_gfc12", "proenfo_gfc14", "proenfo_gfc17",
+    "proenfo_hog", "proenfo_pdb",
+    "redset/15T", "redset/1H", "redset/5T",
+    "restaurant",
+    "rohlik_orders/1D", "rohlik_orders/1W",
+    "rohlik_sales/1D", "rohlik_sales/1W",
+    "rossmann/1D", "rossmann/1W",
+    "solar/1D", "solar/1W",
+    "solar_with_weather/15T", "solar_with_weather/1H",
+    "uci_air_quality/1D", "uci_air_quality/1H",
+    "uk_covid_nation/1D", "uk_covid_nation/1W",
+    "uk_covid_utla/1D", "uk_covid_utla/1W",
+    "us_consumption/1M", "us_consumption/1Q", "us_consumption/1Y",
+    "walmart",
+    "world_co2_emissions", "world_life_expectancy", "world_tourism",
+)
+
+
+def _infer_freq(timestamps) -> str:
+    """Best-effort freq inference from a series' timestamp column.
+
+    Falls back to ``"D"`` when pandas cannot infer. Uses the first 5
+    points to keep the check cheap on long series.
+    """
+    try:
+        idx = pd.DatetimeIndex(timestamps[:5])
+        return pd.infer_freq(idx) or "D"
+    except Exception:
+        return "D"
+
+
+class Dataset(BaseDataset):
+    """AutoGluon fev forecasting dataset.
+
+    Parameters
+    ----------
+    dataset_name : str
+        Directory path inside the HF repo. Per-freq paths look like
+        ``"ETT/1H"`` / ``"LOOP_SEATTLE/5T"``; flat paths like
+        ``"australian_tourism"`` / ``"hospital"``. See ``FEV_DATASETS``
+        for the full list (also discoverable via ``benchopt info -v``).
+    prediction_length : int or None
+        Explicit override. ``None`` → resolved from the inferred freq
+        via :func:`benchmark_utils.constants.from_pandas` (same heuristic
+        used by Monash). FEV does not publish its own per-dataset
+        horizon matrix, so we don't try to align with a leaderboard
+        spec here.
+    n_windows : int
+        Number of rolling evaluation windows per series.
+    max_series : int or None
+        Optional cap on the number of series.
+    debug : bool
+        If True, keep only the first 5 series.
+    """
+
+    name = "FEV"
+
+    requirements = ["pip::pyarrow", "pip::huggingface-hub"]
+
+    parameters = {
+        "dataset_name": ["LOOP_SEATTLE/1H"],
+        "prediction_length": [None],
+        "n_windows": [1],
+        "max_series": [None],
+        "debug": [False],
+    }
+
+    # Cache prepare() by dataset_name only — the other knobs shape the
+    # in-memory view, not the downloaded files.
+    prepare_cache_ignore = (
+        "prediction_length", "n_windows", "max_series", "debug",
+    )
+
+    @classmethod
+    def get_all_parameter_values(cls, name):
+        if name == "dataset_name":
+            return list(FEV_DATASETS)
+        return None
+
+    def prepare(self):
+        """Pre-download parquet shards for this config into HF's cache."""
+        self._snapshot()
+
+    def _snapshot(self) -> "list[str]":
+        """Snapshot-download parquet files for this dataset_name and
+        return their local paths. Idempotent."""
+        from huggingface_hub import snapshot_download
+        from pathlib import Path
+
+        local_root = snapshot_download(
+            "autogluon/fev_datasets",
+            repo_type="dataset",
+            allow_patterns=f"{self.dataset_name}/*.parquet",
+        )
+        return sorted(
+            str(p) for p in (Path(local_root) / self.dataset_name).glob("*.parquet")
+        )
+
+    def get_data(self):
+        parquet_files = self._snapshot()
+        if not parquet_files:
+            raise ValueError(
+                f"No parquet found at {self.dataset_name!r} in "
+                "autogluon/fev_datasets. Valid choices are in FEV_DATASETS."
+            )
+
+        df = pd.concat(
+            [pd.read_parquet(f) for f in parquet_files],
+            ignore_index=True,
+        )
+
+        if self.debug:
+            df = df.head(5)
+        elif self.max_series is not None:
+            df = df.head(int(self.max_series))
+
+        if df.empty:
+            raise ValueError(f"{self.dataset_name!r} contained 0 series.")
+
+        # Channel cols = non-metadata columns whose entries are numeric
+        # array-likes. Some FEV datasets carry extra scalar/string fields
+        # (``type``, ``Security``) or arrays of strings (holiday names in
+        # ``favorita_stores``, etc.). We treat covariates as out of scope
+        # for the MVP.
+        def _is_numeric_array_col(c):
+            v = df.iloc[0][c]
+            if not hasattr(v, "__len__") or isinstance(v, (str, bytes)):
+                return False
+            if len(v) == 0:
+                return False
+            return isinstance(v[0], (int, float, np.integer, np.floating))
+
+        channel_cols = [
+            c for c in df.columns
+            if c not in _METADATA_COLS and _is_numeric_array_col(c)
+        ]
+        if not channel_cols:
+            raise ValueError(
+                f"{self.dataset_name!r} has no channel columns "
+                f"(only {_METADATA_COLS} present)."
+            )
+
+        # Infer freq from the first series' timestamps — same for the
+        # whole config (FEV groups by freq at the directory level for
+        # nested configs, and flat configs are single-freq).
+        inferred_freq = _infer_freq(df.iloc[0]["timestamp"])
+        canonical_freq, seasonality, default_h = from_pandas(inferred_freq)
+
+        pred_len = self.prediction_length
+        if pred_len is None:
+            pred_len = int(default_h)
+
+        # Build (T, C) series. Each row's per-channel array has the same
+        # length (T_i); stack on the last axis.
+        series_list = []
+        for _, row in df.iterrows():
+            channels = [np.asarray(row[c], dtype=np.float32) for c in channel_cols]
+            T = channels[0].shape[0]
+            if any(ch.shape[0] != T for ch in channels):
+                continue
+            series_list.append(np.stack(channels, axis=-1))
+
+        if not series_list:
+            raise ValueError("All series were skipped (inconsistent channel lengths).")
+
+        test_len = pred_len * self.n_windows
+        X_train, y_train_list, full_series = [], [], []
+        for ts in series_list:
+            if ts.shape[0] < pred_len + 1:
+                continue
+            train_end = max(1, ts.shape[0] - test_len)
+            X_train.append(ts[:train_end])
+            y_train_list.append(ts[train_end: train_end + pred_len])
+            full_series.append(ts)
+
+        if not full_series:
+            raise ValueError("All series are shorter than prediction_length.")
+
+        n_windows = 1 if self.debug else self.n_windows
+        X_test, cutoff_indexes, y_test = make_forecasting_splits(
+            full_series,
+            prediction_length=pred_len,
+            n_windows=n_windows,
+        )
+
+        return dict(
+            X_train=X_train,
+            y_train=y_train_list,
+            X_test=X_test,
+            y_test=y_test,
+            cutoff_indexes=cutoff_indexes,
+            covariates=Covariates(),
+            task="forecasting",
+            metrics=["mae", "mse", "mase", "smape"],
+            prediction_length=pred_len,
+            freq=canonical_freq,
+            seasonality=seasonality,
+        )
diff --git a/datasets/gifteval.py b/datasets/gifteval.py
new file mode 100644
index 0000000..c8661c8
--- /dev/null
+++ b/datasets/gifteval.py
@@ -0,0 +1,397 @@
+"""GIFT-Eval forecasting benchmark dataset (Salesforce/GiftEval on HF).
+
+Parametrization
+---------------
+The class exposes two orthogonal parameters that drive the leaderboard
+matrix:
+
+* ``dataset_name`` — one of 55 canonical ``{dataset}/{freq}`` paths (e.g.
+  ``"m4_weekly/W"``, ``"loop_seattle/H"``). The full list is
+  :data:`GIFTEVAL_DATASETS`.
+* ``term`` — one of ``short`` / ``medium`` / ``long``, controlling the
+  forecast horizon (×1, ×10, ×15 of the per-freq base).
+
+Both are surfaced via ``get_all_parameter_values`` so that
+``-d "GiftEval[dataset_name=all,term=short]"`` and ``benchopt info -v``
+work.
+
+Canonical-combo gating
+----------------------
+GIFT-Eval scores only **97** of the 55 × 3 = 165 possible ``(path,
+term)`` combinations on its public leaderboard. The 34 short-only paths
+do not define ``medium`` / ``long``. We track the canonical set in
+:data:`CANONICAL_COMBOS` and gate runs at the dataset level: when
+``(dataset_name, term)`` is not canonical, ``get_data()`` short-circuits
+and returns a placeholder dict carrying a ``_skip_reason`` field.
+:meth:`Objective.skip` (see ``objective.py``) honors that field and
+skips the combo cleanly.
+
+So:
+
+* ``-d "GiftEval[dataset_name=all,term=short]"`` → 55 canonical runs.
+* ``-d "GiftEval[dataset_name=all,term=long]"`` → 55 attempts,
+  21 canonical runs, 34 skipped.
+* ``-d "GiftEval[dataset_name=all,term=all]"`` → 165 attempts,
+  97 canonical runs, 68 skipped.
+
+Leaderboard names vs HF directory names
+---------------------------------------
+The leaderboard uses lowercase, paper-style identifiers (e.g.
+``loop_seattle/H``, ``m_dense/D``, ``car_parts/M``) while the HF repo
+``Salesforce/GiftEval`` uses mixed-case directory names that don't
+always match (``LOOP_SEATTLE/H``, ``M_DENSE/D``,
+``car_parts_with_missing/``). We accept leaderboard names — that's what
+appears in the paper, the leaderboard, and the gift-eval README — and
+translate to HF paths internally via :data:`_LEADERBOARD_TO_HF`. Cases:
+
+  * Pure case difference: ``loop_seattle`` → ``LOOP_SEATTLE``,
+    ``m_dense`` → ``M_DENSE``, ``sz_taxi`` → ``SZ_TAXI``.
+  * Missing-data suffix: ``car_parts`` → ``car_parts_with_missing``,
+    ``kdd_cup_2018`` → ``kdd_cup_2018_with_missing``,
+    ``temperature_rain`` → ``temperature_rain_with_missing``.
+  * Rename: ``saugeen`` → ``saugeenday``.
+  * Leaderboard adds a freq segment for HF-flat datasets: leaderboard
+    ``m4_yearly/A`` → HF flat ``m4_yearly`` (the freq is implicit in the
+    data, not the path). Likewise for the other ``m4_*``,
+    ``car_parts/M``, ``covid_deaths/D``, ``hospital/M``,
+    ``restaurant/D``, ``temperature_rain/D``,
+    ``bizitobs_application/10S``, ``bizitobs_service/10S``.
+
+Schema
+------
+Each HF entry exposes ``item_id``, ``start``, ``freq``, ``target``.
+``target`` is a flat ``List[float]`` for univariate configs and a
+``List[List[float]]`` of shape ``(C, T)`` for multivariate ones (e.g.
+``bitbrains_*``, ``electricity/*``, ``ett1/*``, ``ett2/*``,
+``jena_weather/*``, ``solar/*``). Both shapes are handled — multivariate
+entries are transposed to the repo's ``(T, C)`` contract.
+
+Cutoffs and windows
+-------------------
+We don't comply with GIFT-Eval's prescribed test cutoff; we use the same
+rolling-window logic as Monash via
+:func:`benchmark_utils.windowing.make_forecasting_splits`. The
+``prediction_length`` for a given (freq, term) follows GIFT-Eval's
+canonical ``base × multiplier`` rule via
+:func:`benchmark_utils.constants.gift_eval_prediction_length`.
+
+Data contract output mirrors :mod:`datasets.monash`.
+"""
+
+import numpy as np
+from benchopt import BaseDataset
+
+from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import (
+    from_pandas,
+    gift_eval_prediction_length,
+)
+from benchmark_utils.windowing import make_forecasting_splits
+
+
+# ---------------------------------------------------------------------------
+# Single source of truth: leaderboard ``{dataset}/{freq}`` path → tuple of
+# terms that path defines. Derived from
+# gift-eval/results/*/all_results.csv. 55 paths, 97 (path, term) pairs;
+# 34 paths are short-only, 21 define all three.
+# ---------------------------------------------------------------------------
+_LEADERBOARD: dict[str, tuple[str, ...]] = {
+    "bitbrains_fast_storage/5T": ("short", "medium", "long"),
+    "bitbrains_fast_storage/H": ("short",),
+    "bitbrains_rnd/5T": ("short", "medium", "long"),
+    "bitbrains_rnd/H": ("short",),
+    "bizitobs_application/10S": ("short", "medium", "long"),
+    "bizitobs_l2c/5T": ("short", "medium", "long"),
+    "bizitobs_l2c/H": ("short", "medium", "long"),
+    "bizitobs_service/10S": ("short", "medium", "long"),
+    "car_parts/M": ("short",),
+    "covid_deaths/D": ("short",),
+    "electricity/15T": ("short", "medium", "long"),
+    "electricity/D": ("short",),
+    "electricity/H": ("short", "medium", "long"),
+    "electricity/W": ("short",),
+    "ett1/15T": ("short", "medium", "long"),
+    "ett1/D": ("short",),
+    "ett1/H": ("short", "medium", "long"),
+    "ett1/W": ("short",),
+    "ett2/15T": ("short", "medium", "long"),
+    "ett2/D": ("short",),
+    "ett2/H": ("short", "medium", "long"),
+    "ett2/W": ("short",),
+    "hierarchical_sales/D": ("short",),
+    "hierarchical_sales/W": ("short",),
+    "hospital/M": ("short",),
+    "jena_weather/10T": ("short", "medium", "long"),
+    "jena_weather/D": ("short",),
+    "jena_weather/H": ("short", "medium", "long"),
+    "kdd_cup_2018/D": ("short",),
+    "kdd_cup_2018/H": ("short", "medium", "long"),
+    "loop_seattle/5T": ("short", "medium", "long"),
+    "loop_seattle/D": ("short",),
+    "loop_seattle/H": ("short", "medium", "long"),
+    "m4_daily/D": ("short",),
+    "m4_hourly/H": ("short",),
+    "m4_monthly/M": ("short",),
+    "m4_quarterly/Q": ("short",),
+    "m4_weekly/W": ("short",),
+    "m4_yearly/A": ("short",),
+    "m_dense/D": ("short",),
+    "m_dense/H": ("short", "medium", "long"),
+    "restaurant/D": ("short",),
+    "saugeen/D": ("short",),
+    "saugeen/M": ("short",),
+    "saugeen/W": ("short",),
+    "solar/10T": ("short", "medium", "long"),
+    "solar/D": ("short",),
+    "solar/H": ("short", "medium", "long"),
+    "solar/W": ("short",),
+    "sz_taxi/15T": ("short", "medium", "long"),
+    "sz_taxi/H": ("short",),
+    "temperature_rain/D": ("short",),
+    "us_births/D": ("short",),
+    "us_births/M": ("short",),
+    "us_births/W": ("short",),
+}
+
+
+# Public derived constants — what users and CLI tooling reference.
+GIFTEVAL_DATASETS: tuple[str, ...] = tuple(sorted(_LEADERBOARD))
+GIFTEVAL_TERMS: tuple[str, ...] = ("short", "medium", "long")
+CANONICAL_COMBOS: frozenset[tuple[str, str]] = frozenset(
+    (path, term) for path, terms in _LEADERBOARD.items() for term in terms
+)
+
+
+# ---------------------------------------------------------------------------
+# Leaderboard ``{dataset}`` → HF top-level directory name. Only entries that
+# differ from the lowercase identity mapping appear here.
+# ---------------------------------------------------------------------------
+_LEADERBOARD_TO_HF: dict[str, str] = {
+    "loop_seattle": "LOOP_SEATTLE",
+    "m_dense": "M_DENSE",
+    "sz_taxi": "SZ_TAXI",
+    "car_parts": "car_parts_with_missing",
+    "kdd_cup_2018": "kdd_cup_2018_with_missing",
+    "temperature_rain": "temperature_rain_with_missing",
+    "saugeen": "saugeenday",
+}
+
+
+# ---------------------------------------------------------------------------
+# Datasets that live as a single arrow file directly under the dataset
+# name (no per-freq subdir on HF). The leaderboard still adds a freq
+# segment to their paths (e.g. ``m4_yearly/A``, ``hospital/M``), which we
+# strip before locating the file.
+# ---------------------------------------------------------------------------
+_HF_FLAT_DATASETS: frozenset[str] = frozenset({
+    "bizitobs_application", "bizitobs_service",
+    "car_parts_with_missing", "covid_deaths", "hospital",
+    "m4_daily", "m4_hourly", "m4_monthly", "m4_quarterly",
+    "m4_weekly", "m4_yearly",
+    "restaurant", "temperature_rain_with_missing",
+})
+
+
+def _hf_arrow_directory(leaderboard_path: str) -> str:
+    """Resolve a leaderboard ``{dataset}/{freq}`` path to the actual HF
+    directory containing the arrow file.
+
+    Examples
+    --------
+    ``"m4_weekly/W"`` → ``"m4_weekly"`` (HF-flat, drops freq)
+    ``"loop_seattle/H"`` → ``"LOOP_SEATTLE/H"`` (case-renamed)
+    ``"car_parts/M"`` → ``"car_parts_with_missing"`` (HF-flat + suffix)
+    """
+    leaderboard_name, _, freq_segment = leaderboard_path.partition("/")
+    hf_name = _LEADERBOARD_TO_HF.get(leaderboard_name, leaderboard_name)
+    if hf_name in _HF_FLAT_DATASETS:
+        return hf_name
+    if freq_segment:
+        return f"{hf_name}/{freq_segment}"
+    return hf_name
+
+
+def _skip_placeholder(reason: str) -> dict:
+    """Return a minimal data dict that satisfies ``Objective.set_data``
+    but flags the combo for skipping via ``Objective.skip``."""
+    return dict(
+        X_train=[],
+        y_train=[],
+        X_test=[],
+        y_test=[],
+        cutoff_indexes=[],
+        covariates=Covariates(),
+        task="forecasting",
+        metrics=[],
+        prediction_length=1,
+        freq="D",
+        seasonality=1,
+        _skip_reason=reason,
+    )
+
+
+class Dataset(BaseDataset):
+    """GIFT-Eval forecasting dataset (loaded from HF Salesforce/GiftEval).
+
+    Parameters
+    ----------
+    dataset_name : str
+        One of 55 canonical leaderboard paths — ``{dataset}/{freq}``, e.g.
+        ``"m4_weekly/W"``, ``"loop_seattle/H"``. See
+        :data:`GIFTEVAL_DATASETS`.
+    term : str
+        ``"short"`` / ``"medium"`` / ``"long"``. Combos not in
+        :data:`CANONICAL_COMBOS` are skipped (placeholder + objective
+        ``skip``), so ``dataset_name=all, term=long`` runs only the 21
+        paths that define ``long``.
+    prediction_length : int or None
+        Explicit override. ``None`` → resolved from (freq, term) via
+        :func:`benchmark_utils.constants.gift_eval_prediction_length`.
+    n_windows : int
+        Number of rolling evaluation windows per series.
+    max_series : int or None
+        Optional cap on the number of series.
+    debug : bool
+        If True, keep only the first 5 series for fast iteration.
+    """
+
+    name = "GiftEval"
+
+    requirements = ["pip::datasets", "pip::huggingface-hub"]
+
+    parameters = {
+        "dataset_name": ["m4_weekly/W"],
+        "term": ["short"],
+        "prediction_length": [None],
+        "n_windows": [1],
+        "max_series": [None],
+        "debug": [False],
+    }
+
+    # ``prepare()`` depends on ``dataset_name`` only — ``term`` and the
+    # other knobs shape the in-memory view, not the downloaded files.
+    prepare_cache_ignore = (
+        "term", "prediction_length", "n_windows", "max_series", "debug",
+    )
+
+    @classmethod
+    def get_all_parameter_values(cls, name):
+        if name == "dataset_name":
+            return list(GIFTEVAL_DATASETS)
+        if name == "term":
+            return list(GIFTEVAL_TERMS)
+        return None
+
+    def prepare(self):
+        """Pre-download arrow shards for this config into HF's cache."""
+        self._snapshot()
+
+    def _snapshot(self) -> "list[str]":
+        """Snapshot-download the arrow files for this dataset and return
+        their local paths. Idempotent — HF caches by content hash."""
+        from huggingface_hub import snapshot_download
+        from pathlib import Path
+
+        hf_path = _hf_arrow_directory(self.dataset_name)
+        local_root = snapshot_download(
+            "Salesforce/GiftEval",
+            repo_type="dataset",
+            allow_patterns=f"{hf_path}/*.arrow",
+        )
+        return sorted(str(p) for p in (Path(local_root) / hf_path).glob("*.arrow"))
+
+    def get_data(self):
+        from datasets import Dataset as HFDataset
+
+        # Short-circuit non-canonical combos so heavy parsing doesn't run.
+        if (self.dataset_name, self.term) not in CANONICAL_COMBOS:
+            return _skip_placeholder(
+                f"non-canonical GIFT-Eval combo: {self.dataset_name!r} does "
+                f"not define term {self.term!r} on the leaderboard"
+            )
+
+        arrow_files = self._snapshot()
+        if not arrow_files:
+            raise ValueError(
+                f"No Arrow file found for GIFT-Eval dataset "
+                f"{self.dataset_name!r}. Valid choices are in GIFTEVAL_DATASETS."
+            )
+
+        rows = []
+        for f in arrow_files:
+            rows.extend(HFDataset.from_file(f))
+
+        if self.debug:
+            rows = rows[:5]
+        elif self.max_series is not None:
+            rows = rows[: int(self.max_series)]
+
+        if not rows:
+            raise ValueError(
+                f"GIFT-Eval dataset {self.dataset_name!r} returned 0 series."
+            )
+
+        # Frequency / seasonality — every series in a GIFT-Eval subset
+        # shares the same freq, so taking it from the first entry is safe.
+        pandas_freq = rows[0].get("freq") or "D"
+        freq, seasonality, _ = from_pandas(pandas_freq)
+
+        pred_len = self.prediction_length
+        if pred_len is None:
+            pred_len = gift_eval_prediction_length(
+                pandas_freq, self.term, dataset_name=self.dataset_name
+            )
+
+        # Build (T, C) series. Univariate entries arrive as flat
+        # ``List[float]`` (ndim=1); multivariate as ``List[List[float]]``
+        # of shape ``(C, T)``.
+        series_list = []
+        for r in rows:
+            values = np.asarray(r["target"], dtype=np.float32)
+            if values.ndim == 1:
+                series_list.append(values.reshape(-1, 1))  # (T, 1)
+            elif values.ndim == 2:
+                series_list.append(values.T)  # (C,T)→(T,C)
+
+        if not series_list:
+            raise ValueError(
+                f"All entries in GIFT-Eval dataset {self.dataset_name!r} "
+                "had unsupported target shapes."
+            )
+
+        # Training portion: everything except the last test windows.
+        test_len = pred_len * self.n_windows
+        X_train, y_train_list, full_series = [], [], []
+        for ts in series_list:
+            if ts.shape[0] < pred_len + 1:
+                continue
+            train_end = max(1, ts.shape[0] - test_len)
+            X_train.append(ts[:train_end])
+            y_train_list.append(ts[train_end: train_end + pred_len])
+            full_series.append(ts)
+
+        if not full_series:
+            raise ValueError(
+                "All series are shorter than prediction_length."
+            )
+
+        n_windows = 1 if self.debug else self.n_windows
+        X_test, cutoff_indexes, y_test = make_forecasting_splits(
+            full_series,
+            prediction_length=pred_len,
+            n_windows=n_windows,
+        )
+
+        return dict(
+            X_train=X_train,
+            y_train=y_train_list,
+            X_test=X_test,
+            y_test=y_test,
+            cutoff_indexes=cutoff_indexes,
+            covariates=Covariates(),  # GIFT-Eval HF schema has no covariates
+            task="forecasting",
+            metrics=["mae", "mse", "mase", "smape"],
+            prediction_length=pred_len,
+            freq=freq,
+            seasonality=seasonality,
+        )
diff --git a/datasets/monash.py b/datasets/monash.py
index a607f64..f7f6ed3 100644
--- a/datasets/monash.py
+++ b/datasets/monash.py
@@ -36,30 +36,9 @@
 from benchopt import BaseDataset
 
 from benchmark_utils.covariates import Covariates
+from benchmark_utils.constants import from_aeon
 from benchmark_utils.windowing import make_forecasting_splits
 
-# Map aeon frequency strings → pandas-style freq codes and MASE seasonality
-_FREQ_MAP = {
-    "yearly": ("Y", 1),
-    "quarterly": ("Q", 4),
-    "monthly": ("M", 12),
-    "weekly": ("W", 52),
-    "daily": ("D", 7),
-    "hourly": ("H", 24),
-    "minutely": ("T", 1440),
-    "seconds": ("S", 1),
-}
-
-_DEFAULT_HORIZON = {
-    "Y": 6,
-    "Q": 8,
-    "M": 12,
-    "W": 13,
-    "D": 14,
-    "H": 24,
-    "T": 60,
-}
-
 
 class Dataset(BaseDataset):
     """Monash forecasting dataset (loaded via aeon).
@@ -88,6 +67,20 @@ class Dataset(BaseDataset):
         "debug": [False],
     }
 
+    # Only dataset_name decides what aeon downloads; the other knobs
+    # affect the in-memory split, not the file on disk.
+    prepare_cache_ignore = ("prediction_length", "n_windows", "debug")
+
+    def prepare(self):
+        """Warm aeon's local cache for this dataset (download if missing).
+
+        aeon writes the ``.tsf`` to
+        ``~/.aeon/datasets/local_data/{name}/{name}.tsf`` on first use;
+        we call it once and discard the parsed result so the cache layer
+        in :func:`load_forecasting` handles the actual download.
+        """
+        load_forecasting(self.dataset_name, return_metadata=False)
+
     def get_data(self):
         df, meta = load_forecasting(self.dataset_name, return_metadata=True)
         # df columns: series_name, start_timestamp, series_value
@@ -95,13 +88,11 @@ def get_data(self):
         # contain_missing_values, contain_equal_length
 
         aeon_freq = meta.get("frequency", "yearly")
-        freq, seasonality = _FREQ_MAP.get(aeon_freq, ("D", 1))
+        freq, seasonality, default_h = from_aeon(aeon_freq)
 
         pred_len = self.prediction_length
         if pred_len is None:
-            pred_len = int(
-                meta.get("forecast_horizon") or _DEFAULT_HORIZON.get(freq, 10)
-            )
+            pred_len = int(meta.get("forecast_horizon") or default_h)
 
         series_list = []
         rows = df.iterrows() if not self.debug else list(df.iterrows())[:5]
diff --git a/objective.py b/objective.py
index 5964026..9ad2ead 100644
--- a/objective.py
+++ b/objective.py
@@ -104,8 +104,25 @@ def set_data(
         self.covariates = covariates if covariates is not None else Covariates()
         self.task = task
         self.metrics = metrics
+        # Pull any skip marker out of meta so it doesn't leak into
+        # ``get_objective()`` payloads.
+        self._skip_reason = meta.pop("_skip_reason", None)
         self.meta = meta  # freq, prediction_length, n_classes, …
 
+    def skip(self, **data):
+        """Honor a ``_skip_reason`` field set by the dataset.
+
+        Datasets that want to filter their own parameter grid (e.g.
+        :mod:`datasets.gifteval` skipping non-leaderboard (path, term)
+        combos) return ``_skip_reason="..."`` from ``get_data()`` and we
+        propagate it here so benchopt records a clean skip rather than
+        running an empty objective.
+        """
+        reason = data.get("_skip_reason")
+        if reason:
+            return True, reason
+        return False, None
+
     # ------------------------------------------------------------------
     # Passed to the solver
     # ------------------------------------------------------------------