-
Notifications
You must be signed in to change notification settings - Fork 21
ENH: Support for GiftEval and FEV-Bench #17
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
eddardd
wants to merge
17
commits into
benchopt:main
Choose a base branch
from
eddardd:feat/gift-eval-support
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
17 commits
Select commit
Hold shift + click to select a range
ad0464d
ENH batch forecasting predict() across series and cutoffs
GeoffNN 2972813
ENH add SeasonalNaive forecasting solver
GeoffNN 7b24d94
PERF batch chronos-2 / moirai-2 in one cross_validate call
GeoffNN efe5ad5
REFACTOR typed ForecastInput, Covariates dataclass, prediction_length…
GeoffNN 1eced14
ENH add quantile dimension to forecasting outputs
GeoffNN 0138146
REFACTOR ForecastOutput is a single object + Chronos-2 batched local …
GeoffNN d910ba5
Merge remote-tracking branch 'origin/main' into refactor/batched-pred…
GeoffNN 3ea97db
FIX tighten chronos-forecasting pin; drop redundant torch dep
GeoffNN 70a2873
Merge pull request #1 from GeoffNN/refactor/batched-predict-api
eddardd 5c1876a
Merge branch 'main' into main
eddardd 1829e41
feat: move constants to a dedicated file
eddardd 6689046
feat: gift evall support
eddardd 5e4a675
Merge branch 'main' into feat/gift-eval-support
eddardd 2a4a740
feat: adds support for fev bench
eddardd 848effb
minor fixes
eddardd b2dc953
fixes, prepare(), all behavior for gifteval and fevbench
eddardd b792561
Merge branch 'main' into feat/gift-eval-support
tomMoral File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| """Shared frequency / seasonality tables for forecasting datasets. | ||
|
|
||
| Two sources name frequencies differently: | ||
| - aeon (used by Monash) uses words: "yearly", "weekly", "minutely", ... | ||
| - GIFT-Eval (and pandas) use offset aliases: "Y", "W-SUN", "5T", ... | ||
|
|
||
| This module exposes a single canonical (freq, seasonality) lookup keyed on | ||
| the canonical pandas-style base alias (e.g. "Y", "W", "D"), plus two | ||
| adapters that normalize each source onto that canonical key. | ||
| """ | ||
|
|
||
| import re | ||
|
|
||
| # Canonical base alias → (display_freq, MASE seasonality, default forecast horizon) | ||
| _BASE = { | ||
| "Y": ("Y", 1, 6), | ||
| "Q": ("Q", 4, 8), | ||
| "M": ("M", 12, 12), | ||
| "W": ("W", 52, 13), | ||
| "D": ("D", 7, 14), | ||
| "H": ("H", 24, 24), | ||
| "T": ("T", 1440, 60), # minutes | ||
| "S": ("S", 1, 60), | ||
| } | ||
|
|
||
| # aeon's spelled-out names → canonical base alias | ||
| _AEON_TO_BASE = { | ||
| "yearly": "Y", | ||
| "quarterly": "Q", | ||
| "monthly": "M", | ||
| "weekly": "W", | ||
| "daily": "D", | ||
| "hourly": "H", | ||
| "minutely": "T", | ||
| "seconds": "S", | ||
| } | ||
|
|
||
|
|
||
def from_aeon(freq_word: str) -> tuple[str, int, int]:
    """Return ``(freq, seasonality, default_horizon)`` for an aeon freq word.

    Words that are not keys of ``_AEON_TO_BASE`` resolve to the daily
    (``"D"``) entry of ``_BASE``.
    """
    try:
        alias = _AEON_TO_BASE[freq_word]
    except KeyError:
        alias = "D"
    return _BASE[alias]
|
|
||
|
|
||
| # Pandas offset aliases: strip a leading multiplier and any anchor suffix | ||
| # (e.g. "5T" → "T", "W-SUN" → "W", "QS-OCT" → "Q", "YE" → "Y"). | ||
| _PANDAS_ALIAS_RE = re.compile(r"^\d*([A-Za-z]+)") | ||
| _NORMALIZE_BASE = { | ||
| # Newer pandas spellings → legacy single-letter aliases used in _BASE. | ||
| "YE": "Y", "YS": "Y", "A": "Y", "AS": "Y", | ||
| "QE": "Q", "QS": "Q", | ||
| "ME": "M", "MS": "M", | ||
| "min": "T", "MIN": "T", | ||
| } | ||
|
|
||
|
|
||
def from_pandas(freq_alias: str) -> tuple[str, int, int]:
    """Look up (freq, seasonality, default_horizon) from a pandas freq alias.

    Handles multipliers ("5T") and anchors ("W-SUN", "QS-OCT") by stripping
    them before lookup. The remaining alphabetic head is first normalized
    through ``_NORMALIZE_BASE``; heads not listed there fall back to their
    upper-cased first letter (so "h" → "H", "s" → "S"). Empty input and
    unknown aliases default to daily.

    Sub-second ("ms") and semi-month ("SM", "SME", "SMS") aliases are treated
    as unknown: their first letter collides with the monthly and seconds
    bases, which describe a different sampling rate.
    """
    if not freq_alias:
        return _BASE["D"]
    m = _PANDAS_ALIAS_RE.match(freq_alias.split("-", 1)[0])
    if not m:
        return _BASE["D"]
    head = m.group(1)
    base = _NORMALIZE_BASE.get(head)
    if base is None:
        # Guard the first-letter heuristic against aliases it would
        # misclassify ("ms" is milliseconds, not monthly; "SM*" is
        # semi-monthly, not seconds).
        if head in ("ms", "SM", "SME", "SMS"):
            return _BASE["D"]
        base = head[:1].upper()
    return _BASE.get(base, _BASE["D"])
|
|
||
|
|
||
# ---------------------------------------------------------------------------
# GIFT-Eval term resolution
#
# The prediction length is looked up from the pandas freq and then scaled by
# a term multiplier (short=1, medium=10, long=15). Per the original author's
# note, the table is meant to mirror the canonical one in the upstream
# time-series repo so that numbers reported through datasets/gifteval.py
# line up with the GIFT-Eval leaderboard.
# ---------------------------------------------------------------------------

# Base (term="short") prediction length per pandas freq alias; aliases that
# share a horizon are declared together.
GIFT_EVAL_PRED_LENGTH_MAP: dict[str, int] = {
    **dict.fromkeys(("M", "MS"), 12),
    **dict.fromkeys(("W", "W-SUN", "W-MON"), 8),
    **dict.fromkeys(("D",), 30),
    **dict.fromkeys(("H", "6H"), 48),
    **dict.fromkeys(("T", "5T", "10T", "15T", "30T"), 48),
    **dict.fromkeys(("S", "4S"), 60),
    **dict.fromkeys(("Q", "Q-DEC"), 8),
    **dict.fromkeys(("A", "A-DEC"), 4),
    **dict.fromkeys(("Y",), 4),
}

# Factor applied to the base prediction length for each evaluation term.
GIFT_EVAL_TERM_MULTIPLIER: dict[str, int] = dict(
    short=1,
    medium=10,
    long=15,
)
|
|
||
|
|
||
def gift_eval_prediction_length(freq: str, term: str) -> int:
    """Resolve the GIFT-Eval prediction length for a (freq, term) pair.

    ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"1H"``, ``"W-SUN"``).
    The base length is resolved by falling back through:

    1. exact match in ``GIFT_EVAL_PRED_LENGTH_MAP`` (``"6H"``, ``"W-SUN"``);
    2. the alphabetic head with multiplier and anchor removed, normalized
       through ``_NORMALIZE_BASE`` (``"1H"`` → ``"H"``, ``"10S"`` → ``"S"``,
       ``"QE-DEC"`` → ``"Q"``);
    3. for single-letter heads, the upper-cased letter, so the lowercase
       spellings emitted by pandas >= 2.2 resolve like their legacy
       counterparts (``"10s"`` → ``"S"``, ``"h"`` → ``"H"``);
    4. default 48 (also used for an empty or missing ``freq``).

    The base length is then multiplied by ``GIFT_EVAL_TERM_MULTIPLIER[term]``.

    Raises
    ------
    ValueError
        If ``term`` is not one of ``"short"``, ``"medium"``, ``"long"``.
    """
    if term not in GIFT_EVAL_TERM_MULTIPLIER:
        raise ValueError(
            f"term must be one of {list(GIFT_EVAL_TERM_MULTIPLIER)}; got {term!r}"
        )
    base = GIFT_EVAL_PRED_LENGTH_MAP.get(freq)
    if base is None and freq:
        m = _PANDAS_ALIAS_RE.match(freq.split("-", 1)[0])
        if m:
            head = m.group(1)
            # Normalize new pandas spellings ("QE"→"Q", "ME"→"M", ...)
            # before falling back through the map.
            head = _NORMALIZE_BASE.get(head, head)
            base = GIFT_EVAL_PRED_LENGTH_MAP.get(head)
            if base is None and len(head) == 1:
                # Only single letters are case-folded: upper-casing a longer
                # head would turn e.g. "ms" (milliseconds) into "MS"
                # (month start).
                base = GIFT_EVAL_PRED_LENGTH_MAP.get(head.upper())
    if base is None:
        base = 48
    return base * GIFT_EVAL_TERM_MULTIPLIER[term]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,271 @@ | ||
| """AutoGluon fev_datasets forecasting benchmark | ||
| (huggingface.co/datasets/autogluon/fev_datasets). | ||
|
|
||
| The HF repo organizes data either: | ||
| - per-freq: ``<dataset>/<freq>/train-*.parquet`` | ||
| (e.g. ``ETT/1H``, ``LOOP_SEATTLE/5T``) | ||
| - flat: ``<dataset>/train-*.parquet`` | ||
| (e.g. ``australian_tourism``) | ||
| - or with an arbitrary subdir that is NOT a freq (e.g. ``boomlet/<N>`` | ||
| where ``<N>`` is a series id, not a frequency). | ||
|
|
||
| We accept the directory path directly as ``dataset_name`` (e.g. | ||
| ``"ETT/1H"``, ``"australian_tourism"``) and infer the actual freq from | ||
| each series' ``timestamp`` column rather than parsing the path. | ||
|
|
||
| Each parquet row is one series; columns vary: | ||
| - Always: ``id``, ``timestamp`` | ||
| - Univariate: a ``target`` column (list of floats) | ||
| - Multivariate (e.g. ``ETT``): no ``target`` column — each channel is | ||
| its own column (``HUFL``, ..., ``OT``). Channel columns are stacked | ||
| on the last axis to form ``(T, C)``. | ||
|
|
||
| Rolling-window splits match :mod:`datasets.monash`. The default | ||
| ``prediction_length`` is the freq-based heuristic from | ||
| :func:`benchmark_utils.constants.from_pandas`; FEV does not publish a | ||
| per-dataset horizon spec, so we don't try to mirror one. Pass | ||
| ``prediction_length=N`` explicitly to override. | ||
| """ | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| from benchopt import BaseDataset | ||
|
|
||
| from benchmark_utils.covariates import Covariates | ||
| from benchmark_utils.constants import from_pandas | ||
| from benchmark_utils.windowing import make_forecasting_splits | ||
|
|
||
|
|
||
| _METADATA_COLS = ("id", "timestamp") | ||
|
|
||
|
|
||
| # Canonical list of FEV evaluation configs — directory paths inside | ||
| # https://huggingface.co/datasets/autogluon/fev_datasets that contain at | ||
| # least one ``train-*.parquet`` file. Surfaced via | ||
| # ``get_parameter_choices`` so that ``dataset_name=all`` and ``benchopt | ||
| # info -v`` work. | ||
| FEV_DATASETS: tuple[str, ...] = ( | ||
| "ETT/15T", "ETT/1D", "ETT/1H", "ETT/1W", | ||
| "LOOP_SEATTLE/1D", "LOOP_SEATTLE/1H", "LOOP_SEATTLE/5T", | ||
| "M_DENSE/1D", "M_DENSE/1H", | ||
| "SZ_TAXI/15T", "SZ_TAXI/1H", | ||
| "australian_tourism", | ||
| "bizitobs_l2c/1H", "bizitobs_l2c/5T", | ||
| "boomlet/1062", "boomlet/1209", "boomlet/1225", "boomlet/1230", | ||
| "boomlet/1282", "boomlet/1487", "boomlet/1631", "boomlet/1676", | ||
| "boomlet/1855", "boomlet/1975", "boomlet/2187", | ||
| "boomlet/285", "boomlet/619", "boomlet/772", "boomlet/963", | ||
| "ecdc_ili", | ||
| "entsoe/15T", "entsoe/1H", "entsoe/30T", | ||
| "epf_be", "epf_de", "epf_fr", "epf_np", "epf_pjm", | ||
| "ercot/1D", "ercot/1H", "ercot/1M", "ercot/1W", | ||
| "favorita_stores/1D", "favorita_stores/1M", "favorita_stores/1W", | ||
| "favorita_transactions/1D", "favorita_transactions/1M", | ||
| "favorita_transactions/1W", | ||
| "fred_md_2025", "fred_qd_2025", | ||
| "gvar", "hermes", | ||
| "hierarchical_sales/1D", "hierarchical_sales/1W", | ||
| "hospital", | ||
| "hospital_admissions/1D", "hospital_admissions/1W", | ||
| "jena_weather/10T", "jena_weather/1D", "jena_weather/1H", | ||
| "kdd_cup_2022/10T", "kdd_cup_2022/1D", "kdd_cup_2022/30T", | ||
| "m5/1D", "m5/1M", "m5/1W", | ||
| "proenfo_bull", "proenfo_cockatoo", | ||
| "proenfo_gfc12", "proenfo_gfc14", "proenfo_gfc17", | ||
| "proenfo_hog", "proenfo_pdb", | ||
| "redset/15T", "redset/1H", "redset/5T", | ||
| "restaurant", | ||
| "rohlik_orders/1D", "rohlik_orders/1W", | ||
| "rohlik_sales/1D", "rohlik_sales/1W", | ||
| "rossmann/1D", "rossmann/1W", | ||
| "solar/1D", "solar/1W", | ||
| "solar_with_weather/15T", "solar_with_weather/1H", | ||
| "uci_air_quality/1D", "uci_air_quality/1H", | ||
| "uk_covid_nation/1D", "uk_covid_nation/1W", | ||
| "uk_covid_utla/1D", "uk_covid_utla/1W", | ||
| "us_consumption/1M", "us_consumption/1Q", "us_consumption/1Y", | ||
| "walmart", | ||
| "world_co2_emissions", "world_life_expectancy", "world_tourism", | ||
| ) | ||
|
|
||
|
|
||
| def _infer_freq(timestamps) -> str: | ||
| """Best-effort freq inference from a series' timestamp column. | ||
|
|
||
| Falls back to ``"D"`` when pandas cannot infer. Uses the first 5 | ||
| points to keep the check cheap on long series. | ||
| """ | ||
| try: | ||
| idx = pd.DatetimeIndex(timestamps[:5]) | ||
| return pd.infer_freq(idx) or "D" | ||
| except Exception: | ||
| return "D" | ||
|
|
||
|
|
||
class Dataset(BaseDataset):
    """AutoGluon fev forecasting dataset.

    Parameters
    ----------
    dataset_name : str
        Directory path inside the HF repo. Per-freq paths look like
        ``"ETT/1H"`` / ``"LOOP_SEATTLE/5T"``; flat paths like
        ``"australian_tourism"`` / ``"hospital"``. See ``FEV_DATASETS``
        for the full list (also discoverable via ``benchopt info -v``).
    prediction_length : int or None
        Explicit override. ``None`` → resolved from the inferred freq
        via :func:`benchmark_utils.constants.from_pandas` (same heuristic
        used by Monash). FEV does not publish its own per-dataset
        horizon matrix, so we don't try to align with a leaderboard
        spec here.
    n_windows : int
        Number of rolling evaluation windows per series.
    max_series : int or None
        Optional cap on the number of series. Ignored when ``debug`` is
        True.
    debug : bool
        If True, keep only the first 5 series and build a single
        evaluation window per series regardless of ``n_windows``.
    """

    name = "FEV"

    # pyarrow is intentionally not listed: per maintainer review it is
    # already a requirement of benchopt, and re-installing it risks breaking
    # the conda/uv environment.
    requirements = ["pip::huggingface-hub"]

    parameters = {
        "dataset_name": ["LOOP_SEATTLE/1H"],
        "prediction_length": [None],
        "n_windows": [1],
        "max_series": [None],
        "debug": [False],
    }

    # Cache prepare() by dataset_name only — the other knobs shape the
    # in-memory view, not the downloaded files.
    prepare_cache_ignore = (
        "prediction_length", "n_windows", "max_series", "debug",
    )

    @classmethod
    def get_all_parameter_values(cls, name):
        """Return every known value for parameter ``name``.

        Only ``dataset_name`` is enumerable (from ``FEV_DATASETS``); any
        other parameter yields ``None``.
        """
        if name == "dataset_name":
            return list(FEV_DATASETS)
        return None

    def prepare(self):
        """Pre-download parquet shards for this config into HF's cache."""
        self._snapshot()

    def _snapshot(self) -> "list[str]":
        """Snapshot-download parquet files for this dataset_name and
        return their local paths. Idempotent.

        The returned list is sorted and is empty when the directory holds
        no ``*.parquet`` file.
        """
        from huggingface_hub import snapshot_download
        from pathlib import Path

        local_root = snapshot_download(
            "autogluon/fev_datasets",
            repo_type="dataset",
            allow_patterns=f"{self.dataset_name}/*.parquet",
        )
        dataset_dir = Path(local_root) / self.dataset_name
        return sorted(str(p) for p in dataset_dir.glob("*.parquet"))

    def get_data(self):
        """Load the config and build the forecasting train/test splits.

        Returns
        -------
        dict
            ``X_train`` / ``y_train`` (history and the ``prediction_length``
            steps that follow it, one entry per kept series), ``X_test`` /
            ``y_test`` / ``cutoff_indexes`` (from
            ``make_forecasting_splits``), an empty ``covariates`` object,
            and the ``task``, ``metrics``, ``prediction_length``, ``freq``
            and ``seasonality`` metadata. Every series is a ``(T, C)``
            float32 array.

        Raises
        ------
        ValueError
            If no parquet file is found, the config holds no series, no
            numeric channel column exists, every series has inconsistent
            channel lengths, or every series is shorter than
            ``prediction_length + 1``.
        """
        parquet_files = self._snapshot()
        if not parquet_files:
            raise ValueError(
                f"No parquet found at {self.dataset_name!r} in "
                "autogluon/fev_datasets. Valid choices are in FEV_DATASETS."
            )

        df = pd.concat(
            [pd.read_parquet(f) for f in parquet_files],
            ignore_index=True,
        )

        # debug takes precedence over max_series.
        if self.debug:
            df = df.head(5)
        elif self.max_series is not None:
            df = df.head(int(self.max_series))

        if df.empty:
            raise ValueError(f"{self.dataset_name!r} contained 0 series.")

        # Channel cols = non-metadata columns whose entries are numeric
        # array-likes. Some FEV datasets carry extra scalar/string fields
        # (``type``, ``Security``) or arrays of strings (holiday names in
        # ``favorita_stores``, etc.). We treat covariates as out of scope
        # for the MVP.
        def _is_numeric_array_col(c):
            # Decided from the first row's first element only.
            v = df.iloc[0][c]
            if not hasattr(v, "__len__") or isinstance(v, (str, bytes)):
                return False
            if len(v) == 0:
                return False
            return isinstance(v[0], (int, float, np.integer, np.floating))

        channel_cols = [
            c for c in df.columns
            if c not in _METADATA_COLS and _is_numeric_array_col(c)
        ]
        if not channel_cols:
            raise ValueError(
                f"{self.dataset_name!r} has no channel columns "
                f"(only {_METADATA_COLS} present)."
            )

        # Infer freq from the first series' timestamps — same for the
        # whole config (FEV groups by freq at the directory level for
        # nested configs, and flat configs are single-freq).
        inferred_freq = _infer_freq(df.iloc[0]["timestamp"])
        canonical_freq, seasonality, default_h = from_pandas(inferred_freq)

        pred_len = self.prediction_length
        if pred_len is None:
            pred_len = int(default_h)

        # Build (T, C) series. Each row's per-channel array is expected to
        # have the same length (T_i); stack on the last axis. Rows whose
        # channels disagree in length are skipped.
        series_list = []
        for _, row in df.iterrows():
            channels = [
                np.asarray(row[c], dtype=np.float32) for c in channel_cols
            ]
            T = channels[0].shape[0]
            if any(ch.shape[0] != T for ch in channels):
                continue
            series_list.append(np.stack(channels, axis=-1))

        if not series_list:
            raise ValueError(
                "All series were skipped (inconsistent channel lengths)."
            )

        # The training cut always reserves ``self.n_windows`` windows, even
        # in debug mode where only one test window is built below.
        test_len = pred_len * self.n_windows
        X_train, y_train_list, full_series = [], [], []
        for ts in series_list:
            if ts.shape[0] < pred_len + 1:
                continue
            # Keep at least one training step for short series.
            train_end = max(1, ts.shape[0] - test_len)
            X_train.append(ts[:train_end])
            y_train_list.append(ts[train_end: train_end + pred_len])
            full_series.append(ts)

        if not full_series:
            raise ValueError("All series are shorter than prediction_length.")

        n_windows = 1 if self.debug else self.n_windows
        X_test, cutoff_indexes, y_test = make_forecasting_splits(
            full_series,
            prediction_length=pred_len,
            n_windows=n_windows,
        )

        return dict(
            X_train=X_train,
            y_train=y_train_list,
            X_test=X_test,
            y_test=y_test,
            cutoff_indexes=cutoff_indexes,
            covariates=Covariates(),
            task="forecasting",
            metrics=["mae", "mse", "mase", "smape"],
            prediction_length=pred_len,
            freq=canonical_freq,
            seasonality=seasonality,
        )
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
pyarrow is a requirement of benchopt, no need to reinstall and risk breaking the conda/uv install