Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions benchmark_utils/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""Shared frequency / seasonality tables for forecasting datasets.

Two sources name frequencies differently:
- aeon (used by Monash) uses words: "yearly", "weekly", "minutely", ...
- GIFT-Eval (and pandas) use offset aliases: "Y", "W-SUN", "5T", ...

This module exposes a single canonical (freq, seasonality) lookup keyed on
the canonical pandas-style base alias (e.g. "Y", "W", "D"), plus two
adapters that normalize each source onto that canonical key.
"""

import re

# Canonical base alias -> (display_freq, MASE seasonality, default forecast
# horizon). The display freq is always the key itself, so the table is built
# from (alias, seasonality, horizon) rows.
_BASE = {
    alias: (alias, seasonality, horizon)
    for alias, seasonality, horizon in (
        ("Y", 1, 6),
        ("Q", 4, 8),
        ("M", 12, 12),
        ("W", 52, 13),
        ("D", 7, 14),
        ("H", 24, 24),
        ("T", 1440, 60),  # minutes
        ("S", 1, 60),
    )
}

# Frequency words as spelled by aeon, each mapped to the canonical base alias
# it corresponds to.
_AEON_TO_BASE = dict(
    yearly="Y",
    quarterly="Q",
    monthly="M",
    weekly="W",
    daily="D",
    hourly="H",
    minutely="T",
    seconds="S",
)


def from_aeon(freq_word: str) -> tuple[str, int, int]:
    """Look up (freq, seasonality, default_horizon) from an aeon freq word.

    Exact matches in ``_AEON_TO_BASE`` win. Otherwise the word is stripped
    and lower-cased and retried, and the multi-step spellings used by the
    Monash archive (``"half_hourly"``, ``"10_minutes"``, ``"4_seconds"``)
    are collapsed onto their base unit. This mirrors what ``from_pandas``
    does for the equivalent ``"30T"`` / ``"10T"`` / ``"4S"`` aliases: the
    multiplier is dropped and the base-unit row is returned. Anything still
    unrecognised falls back to the daily row.
    """
    base = _AEON_TO_BASE.get(freq_word)
    if base is None and isinstance(freq_word, str):
        word = freq_word.strip().lower()
        base = _AEON_TO_BASE.get(word)
        if base is None:
            # Sub-hourly words carry a step count; only the unit selects the
            # row, exactly as a pandas multiplier is ignored in from_pandas.
            if word == "half_hourly" or word.endswith(("_minute", "_minutes")):
                base = "T"
            elif word.endswith(("_second", "_seconds")):
                base = "S"
    return _BASE[base if base is not None else "D"]


# Pandas offset aliases. The pattern skips an optional leading multiplier and
# captures the alphabetic head (e.g. "5T" → "T", "15min" → "min",
# "YE" → "YE"). Anchor suffixes ("W-SUN", "QS-OCT") are not handled by the
# pattern: callers cut the alias at the first "-" before matching.
_PANDAS_ALIAS_RE = re.compile(r"^\d*([A-Za-z]+)")

# Alternative spellings of a head → the legacy single-letter alias used as a
# key of _BASE.
_NORMALIZE_BASE = {
    **dict.fromkeys(("YE", "YS", "A", "AS"), "Y"),
    **dict.fromkeys(("QE", "QS"), "Q"),
    **dict.fromkeys(("ME", "MS"), "M"),
    **dict.fromkeys(("min", "MIN"), "T"),
}


# Heads whose first letter collides with a _BASE key of a different unit:
# "ms" is milliseconds (not monthly) and "SM"/"SMS"/"SME" are semi-month
# offsets (not seconds). They have no row in _BASE, so they must take the
# documented "unknown → daily" path rather than the first-letter fallback.
_FIRST_LETTER_COLLISIONS = frozenset({"ms", "SM", "SMS", "SME"})


def from_pandas(freq_alias: str) -> tuple[str, int, int]:
    """Look up (freq, seasonality, default_horizon) from a pandas freq alias.

    The alias is cut at the first "-" to drop an anchor ("W-SUN",
    "QS-OCT"), a leading multiplier is skipped ("5T"), and the remaining
    head is resolved in this order:

    1. spellings listed in ``_NORMALIZE_BASE`` ("YE", "MS", "min", ...);
    2. heads in ``_FIRST_LETTER_COLLISIONS``, which are treated as unknown;
    3. the upper-cased first letter of the head ("h" → "H", "W" → "W").

    Empty, unparsable and unknown aliases all return the daily row. The
    multiplier does not change the result: "5T" returns the same row as
    "T".
    """
    daily = _BASE["D"]
    if not freq_alias:
        return daily
    m = _PANDAS_ALIAS_RE.match(freq_alias.split("-", 1)[0])
    if m is None:
        return daily
    head = m.group(1)
    base = _NORMALIZE_BASE.get(head)
    if base is None:
        if head in _FIRST_LETTER_COLLISIONS:
            return daily
        base = head[:1].upper()
    return _BASE.get(base, daily)


# ---------------------------------------------------------------------------
# GIFT-Eval term resolution
#
# Mirrors the canonical table in the upstream time-series repo: prediction
# length is a function of pandas freq, then scaled by a term multiplier
# (short=1, medium=10, long=15). Used by datasets/gifteval.py so reported
# numbers line up with the GIFT-Eval leaderboard.
# ---------------------------------------------------------------------------

# Short-term prediction length per pandas freq alias. Aliases that appear
# with a multiplier or anchor are listed explicitly next to their base alias
# so that an exact-match lookup succeeds for them.
GIFT_EVAL_PRED_LENGTH_MAP: dict[str, int] = {
    # monthly
    "M": 12,
    "MS": 12,
    # weekly
    "W": 8,
    "W-SUN": 8,
    "W-MON": 8,
    # daily
    "D": 30,
    # hourly
    "H": 48,
    "6H": 48,
    # minutely
    "T": 48,
    "5T": 48,
    "10T": 48,
    "15T": 48,
    "30T": 48,
    # secondly
    "S": 60,
    "4S": 60,
    # quarterly
    "Q": 8,
    "Q-DEC": 8,
    # yearly
    "A": 4,
    "A-DEC": 4,
    "Y": 4,
}

# Factor applied to the short-term prediction length for each term. Key order
# matters: it is the order in which valid terms are listed in error messages.
GIFT_EVAL_TERM_MULTIPLIER: dict[str, int] = dict(short=1, medium=10, long=15)


def gift_eval_prediction_length(freq: str, term: str) -> int:
    """Resolve the GIFT-Eval prediction length for a (freq, term) pair.

    ``freq`` is a pandas-style alias (e.g. ``"5T"``, ``"1H"``, ``"W-SUN"``).
    The short-term length is looked up in ``GIFT_EVAL_PRED_LENGTH_MAP``,
    falling back through:

    1. exact match on ``freq``;
    2. the alias head, i.e. ``freq`` cut at the first "-" with any leading
       multiplier dropped ("1H" → "H", "10S" → "S", "W-TUE" → "W"), after
       normalising alternative spellings via ``_NORMALIZE_BASE``
       ("QE" → "Q", "min" → "T");
    3. the upper-cased head for the lowercase hour/second spellings
       ("h" → "H", "10s" → "S"), so they resolve like their legacy
       uppercase forms instead of hitting the default;
    4. default 48.

    The result is multiplied by ``GIFT_EVAL_TERM_MULTIPLIER[term]``.

    Raises ``ValueError`` if ``term`` is not one of ``"short"``,
    ``"medium"``, ``"long"``.
    """
    if term not in GIFT_EVAL_TERM_MULTIPLIER:
        raise ValueError(
            f"term must be one of {list(GIFT_EVAL_TERM_MULTIPLIER)}; got {term!r}"
        )
    base = GIFT_EVAL_PRED_LENGTH_MAP.get(freq)
    if base is None:
        m = _PANDAS_ALIAS_RE.match(freq.split("-", 1)[0])
        if m is not None:
            # Normalize new pandas spellings ("QE"→"Q", "ME"→"M", ...)
            # before falling back through the map.
            head = m.group(1)
            head = _NORMALIZE_BASE.get(head, head)
            base = GIFT_EVAL_PRED_LENGTH_MAP.get(head)
            # Only "h" and "s" are upper-cased: blindly upper-casing would
            # turn "ms" (milliseconds) into "MS" (month start).
            if base is None and head in ("h", "s"):
                base = GIFT_EVAL_PRED_LENGTH_MAP.get(head.upper())
    if base is None:
        base = 48
    return base * GIFT_EVAL_TERM_MULTIPLIER[term]
271 changes: 271 additions & 0 deletions datasets/fev.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
"""AutoGluon fev_datasets forecasting benchmark
(huggingface.co/datasets/autogluon/fev_datasets).

The HF repo organizes data either:
- per-freq: ``<dataset>/<freq>/train-*.parquet``
(e.g. ``ETT/1H``, ``LOOP_SEATTLE/5T``)
- flat: ``<dataset>/train-*.parquet``
(e.g. ``australian_tourism``)
- or with an arbitrary subdir that is NOT a freq (e.g. ``boomlet/<N>``
where ``<N>`` is a series id, not a frequency).

We accept the directory path directly as ``dataset_name`` (e.g.
``"ETT/1H"``, ``"australian_tourism"``) and infer the actual freq from
each series' ``timestamp`` column rather than parsing the path.

Each parquet row is one series; columns vary:
- Always: ``id``, ``timestamp``
- Univariate: a ``target`` column (list of floats)
- Multivariate (e.g. ``ETT``): no ``target`` column — each channel is
its own column (``HUFL``, ..., ``OT``). Channel columns are stacked
on the last axis to form ``(T, C)``.

Rolling-window splits match :mod:`datasets.monash`. The default
``prediction_length`` is the freq-based heuristic from
:func:`benchmark_utils.constants.from_pandas`; FEV does not publish a
per-dataset horizon spec, so we don't try to mirror one. Pass
``prediction_length=N`` explicitly to override.
"""

import numpy as np
import pandas as pd
from benchopt import BaseDataset

from benchmark_utils.covariates import Covariates
from benchmark_utils.constants import from_pandas
from benchmark_utils.windowing import make_forecasting_splits


# Columns that identify a series rather than carry channel values; they are
# excluded when channel columns are selected.
_METADATA_COLS = ("id", "timestamp")


# Canonical list of FEV evaluation configs — directory paths inside
# https://huggingface.co/datasets/autogluon/fev_datasets that contain at
# least one ``train-*.parquet`` file. Surfaced via
# ``Dataset.get_all_parameter_values`` so that ``dataset_name=all`` and
# ``benchopt info -v`` work.
FEV_DATASETS: tuple[str, ...] = (
    "ETT/15T", "ETT/1D", "ETT/1H", "ETT/1W",
    "LOOP_SEATTLE/1D", "LOOP_SEATTLE/1H", "LOOP_SEATTLE/5T",
    "M_DENSE/1D", "M_DENSE/1H",
    "SZ_TAXI/15T", "SZ_TAXI/1H",
    "australian_tourism",
    "bizitobs_l2c/1H", "bizitobs_l2c/5T",
    # boomlet sub-directories are series ids, not frequencies.
    "boomlet/1062", "boomlet/1209", "boomlet/1225", "boomlet/1230",
    "boomlet/1282", "boomlet/1487", "boomlet/1631", "boomlet/1676",
    "boomlet/1855", "boomlet/1975", "boomlet/2187",
    "boomlet/285", "boomlet/619", "boomlet/772", "boomlet/963",
    "ecdc_ili",
    "entsoe/15T", "entsoe/1H", "entsoe/30T",
    "epf_be", "epf_de", "epf_fr", "epf_np", "epf_pjm",
    "ercot/1D", "ercot/1H", "ercot/1M", "ercot/1W",
    "favorita_stores/1D", "favorita_stores/1M", "favorita_stores/1W",
    "favorita_transactions/1D", "favorita_transactions/1M",
    "favorita_transactions/1W",
    "fred_md_2025", "fred_qd_2025",
    "gvar", "hermes",
    "hierarchical_sales/1D", "hierarchical_sales/1W",
    "hospital",
    "hospital_admissions/1D", "hospital_admissions/1W",
    "jena_weather/10T", "jena_weather/1D", "jena_weather/1H",
    "kdd_cup_2022/10T", "kdd_cup_2022/1D", "kdd_cup_2022/30T",
    "m5/1D", "m5/1M", "m5/1W",
    "proenfo_bull", "proenfo_cockatoo",
    "proenfo_gfc12", "proenfo_gfc14", "proenfo_gfc17",
    "proenfo_hog", "proenfo_pdb",
    "redset/15T", "redset/1H", "redset/5T",
    "restaurant",
    "rohlik_orders/1D", "rohlik_orders/1W",
    "rohlik_sales/1D", "rohlik_sales/1W",
    "rossmann/1D", "rossmann/1W",
    "solar/1D", "solar/1W",
    "solar_with_weather/15T", "solar_with_weather/1H",
    "uci_air_quality/1D", "uci_air_quality/1H",
    "uk_covid_nation/1D", "uk_covid_nation/1W",
    "uk_covid_utla/1D", "uk_covid_utla/1W",
    "us_consumption/1M", "us_consumption/1Q", "us_consumption/1Y",
    "walmart",
    "world_co2_emissions", "world_life_expectancy", "world_tourism",
)


def _infer_freq(timestamps) -> str:
    """Guess the pandas freq alias of a series from its timestamps.

    Only the first 5 timestamps are inspected so the check stays cheap on
    long series. This is best-effort: if pandas cannot infer a frequency,
    or building the index fails for any reason, ``"D"`` is returned.
    """
    fallback = "D"
    try:
        head = pd.DatetimeIndex(timestamps[:5])
        guessed = pd.infer_freq(head)
    except Exception:
        # Deliberately broad: any failure here means "unknown", not an error.
        return fallback
    return guessed if guessed else fallback


class Dataset(BaseDataset):
    """AutoGluon fev forecasting dataset.

    Parameters
    ----------
    dataset_name : str
        Directory path inside the HF repo. Per-freq paths look like
        ``"ETT/1H"`` / ``"LOOP_SEATTLE/5T"``; flat paths like
        ``"australian_tourism"`` / ``"hospital"``. See ``FEV_DATASETS``
        for the full list (also discoverable via ``benchopt info -v``).
    prediction_length : int or None
        Explicit override. ``None`` → resolved from the inferred freq
        via :func:`benchmark_utils.constants.from_pandas` (same heuristic
        used by Monash). FEV does not publish its own per-dataset
        horizon matrix, so we don't try to align with a leaderboard
        spec here.
    n_windows : int
        Number of rolling evaluation windows per series. Forced to 1
        when ``debug`` is True.
    max_series : int or None
        Optional cap on the number of series. Ignored when ``debug`` is
        True.
    debug : bool
        If True, keep only the first 5 series.
    """

    name = "FEV"

    # pyarrow is already a requirement of benchopt itself; listing it again
    # would reinstall it and risk breaking the conda/uv install.
    requirements = ["pip::huggingface-hub"]

    parameters = {
        "dataset_name": ["LOOP_SEATTLE/1H"],
        "prediction_length": [None],
        "n_windows": [1],
        "max_series": [None],
        "debug": [False],
    }

    # Cache prepare() by dataset_name only — the other knobs shape the
    # in-memory view, not the downloaded files.
    prepare_cache_ignore = (
        "prediction_length", "n_windows", "max_series", "debug",
    )

    @classmethod
    def get_all_parameter_values(cls, name):
        """Return every known value for parameter ``name``.

        Only ``"dataset_name"`` is enumerable (the entries of
        ``FEV_DATASETS``); any other parameter yields ``None``.
        """
        if name == "dataset_name":
            return list(FEV_DATASETS)
        return None

    def prepare(self):
        """Pre-download parquet shards for this config into HF's cache."""
        self._snapshot()

    def _snapshot(self) -> "list[str]":
        """Snapshot-download parquet files for this dataset_name and
        return their local paths, sorted. Idempotent.

        Only files directly inside the ``dataset_name`` directory are
        returned; an empty list means nothing matched.
        """
        from huggingface_hub import snapshot_download
        from pathlib import Path

        local_root = snapshot_download(
            "autogluon/fev_datasets",
            repo_type="dataset",
            allow_patterns=f"{self.dataset_name}/*.parquet",
        )
        config_dir = Path(local_root) / self.dataset_name
        return sorted(str(p) for p in config_dir.glob("*.parquet"))

    @staticmethod
    def _is_numeric_array(value) -> bool:
        """Tell whether ``value`` is a non-empty array-like of numbers.

        Only the first element is inspected. Strings, bytes, scalars,
        empty containers and containers that cannot be indexed by
        position (e.g. mappings) are rejected.
        """
        if not hasattr(value, "__len__") or isinstance(value, (str, bytes)):
            return False
        if len(value) == 0:
            return False
        try:
            first = value[0]
        except (KeyError, IndexError, TypeError):
            return False
        return isinstance(first, (int, float, np.integer, np.floating))

    def get_data(self):
        """Load the config and build the train / rolling-test splits.

        Returns a dict with ``X_train`` / ``y_train`` (one ``(T, C)`` array
        per kept series: history up to the train cutoff, and the
        ``prediction_length`` points right after it), ``X_test`` /
        ``y_test`` / ``cutoff_indexes`` as produced by
        ``make_forecasting_splits``, plus ``covariates``, ``task``,
        ``metrics``, ``prediction_length``, ``freq`` and ``seasonality``.

        Raises ``ValueError`` when no parquet file is found, the table is
        empty, no numeric channel column exists, every row is malformed,
        or every series is shorter than ``prediction_length + 1``.
        """
        parquet_files = self._snapshot()
        if not parquet_files:
            raise ValueError(
                f"No parquet found at {self.dataset_name!r} in "
                "autogluon/fev_datasets. Valid choices are in FEV_DATASETS."
            )

        df = pd.concat(
            [pd.read_parquet(f) for f in parquet_files],
            ignore_index=True,
        )

        if self.debug:
            df = df.head(5)
        elif self.max_series is not None:
            df = df.head(int(self.max_series))

        if df.empty:
            raise ValueError(f"{self.dataset_name!r} contained 0 series.")

        # Channel cols = non-metadata columns whose entries are numeric
        # array-likes, judged on the first row. Some FEV datasets carry
        # extra scalar/string fields (``type``, ``Security``) or arrays of
        # strings (holiday names in ``favorita_stores``, etc.). We treat
        # covariates as out of scope for the MVP.
        first_row = df.iloc[0]
        channel_cols = [
            c for c in df.columns
            if c not in _METADATA_COLS and self._is_numeric_array(first_row[c])
        ]
        if not channel_cols:
            raise ValueError(
                f"{self.dataset_name!r} has no channel columns "
                f"(only {_METADATA_COLS} present)."
            )

        # Infer freq from the first series' timestamps — same for the
        # whole config (FEV groups by freq at the directory level for
        # nested configs, and flat configs are single-freq).
        inferred_freq = _infer_freq(first_row["timestamp"])
        canonical_freq, seasonality, default_h = from_pandas(inferred_freq)

        pred_len = self.prediction_length
        if pred_len is None:
            pred_len = int(default_h)

        # Build (T, C) series by stacking the per-channel arrays of each row
        # on the last axis. Rows where a channel is not a 1-D array (e.g. a
        # missing value) or where channel lengths disagree are skipped.
        series_list = []
        for _, row in df.iterrows():
            channels = [
                np.asarray(row[c], dtype=np.float32) for c in channel_cols
            ]
            if any(ch.ndim != 1 for ch in channels):
                continue
            T = channels[0].shape[0]
            if any(ch.shape[0] != T for ch in channels):
                continue
            series_list.append(np.stack(channels, axis=-1))

        if not series_list:
            raise ValueError(
                "All series were skipped (inconsistent channel lengths)."
            )

        # Debug runs evaluate a single window. Resolve that once so the
        # train cutoff below and the test windows use the same count.
        n_windows = 1 if self.debug else self.n_windows
        test_len = pred_len * n_windows

        # NOTE(review): a series only needs pred_len + 1 points to be kept,
        # so with n_windows > 1 a short series gets its train cutoff clamped
        # to 1 — confirm make_forecasting_splits copes with series shorter
        # than n_windows * pred_len + 1.
        X_train, y_train_list, full_series = [], [], []
        for ts in series_list:
            if ts.shape[0] < pred_len + 1:
                continue
            train_end = max(1, ts.shape[0] - test_len)
            X_train.append(ts[:train_end])
            y_train_list.append(ts[train_end: train_end + pred_len])
            full_series.append(ts)

        if not full_series:
            raise ValueError("All series are shorter than prediction_length.")

        X_test, cutoff_indexes, y_test = make_forecasting_splits(
            full_series,
            prediction_length=pred_len,
            n_windows=n_windows,
        )

        return dict(
            X_train=X_train,
            y_train=y_train_list,
            X_test=X_test,
            y_test=y_test,
            cutoff_indexes=cutoff_indexes,
            covariates=Covariates(),
            task="forecasting",
            metrics=["mae", "mse", "mase", "smape"],
            prediction_length=pred_len,
            freq=canonical_freq,
            seasonality=seasonality,
        )
Loading
Loading