From ddd6391d52ceb505c795bb48a9daf674183d7425 Mon Sep 17 00:00:00 2001 From: mohossam01 Date: Fri, 15 May 2026 18:26:47 -0400 Subject: [PATCH] feat: record variance partition and gp kernel fit in the manifest --- docs/site/api-reference.md | 6 +- docs/site/manifest-reference.md | 224 ++++++++- docs/site/user-guide/archetypes.md | 29 ++ plotsim/gp.py | 216 +++++++++ plotsim/manifest.py | 535 ++++++++++++++++++++ tests/test_manifest_variance_gp.py | 752 +++++++++++++++++++++++++++++ 6 files changed, 1758 insertions(+), 4 deletions(-) create mode 100644 plotsim/gp.py create mode 100644 tests/test_manifest_variance_gp.py diff --git a/docs/site/api-reference.md b/docs/site/api-reference.md index b5734f0..11bd397 100644 --- a/docs/site/api-reference.md +++ b/docs/site/api-reference.md @@ -511,7 +511,9 @@ The manifest captures the *signal layer* a noisy fact table can't recover: archetype assignments, trajectory positions, event-firing periods, SCD band crossings, bridge associations, the engine's seasonal-strength inputs, per-pair regression summaries for declared -correlations, and reproducibility metadata. +correlations, nested-ANOVA variance partitions per metric, RBF +Gaussian-process kernel fits over each archetype's trajectory shape, +and reproducibility metadata. **Parameters** @@ -523,7 +525,7 @@ correlations, and reproducibility metadata. | `sample_rate` | Override for `config.manifest.trajectory_sample_rate`. `None` reads the config value. | | `scd_state` | Pass `state.scd` to record SCD Type 2 band crossings. `None` leaves `manifest.scd_events` empty. | | `bridge_state` | Pass `state.bridges` to record M:N associations. `None` leaves `manifest.bridge_associations` empty. | -| `entity_metrics` | Pass `state.entity_metrics` to populate `manifest.regression_pairs_global` and `manifest.regression_pairs_by_archetype` with pair-wise OLS summaries for every declared correlation pair. `None` leaves both sections at their empty defaults. | +| `entity_metrics` | Pass `state.entity_metrics` to populate `manifest.regression_pairs_global` / `manifest.regression_pairs_by_archetype` with pair-wise OLS summaries for every declared correlation pair AND `manifest.variance_partitions` / `manifest.variance_partitions_by_segment` with nested-ANOVA decompositions per metric. `None` leaves all four sections at their empty defaults. (`manifest.gp_kernel_fits` is populated independently from `config.archetypes` whenever the config declares at least one metric — the section does not need `entity_metrics` threaded.) | The function is pure — same inputs produce a byte-identical manifest. No RNG, no clock, no filesystem. diff --git a/docs/site/manifest-reference.md b/docs/site/manifest-reference.md index eabfb6e..2b771ef 100644 --- a/docs/site/manifest-reference.md +++ b/docs/site/manifest-reference.md @@ -68,13 +68,16 @@ produces a byte-identical `manifest.json`. Encoding: UTF-8, "noise_config": {...} | null, "seasonal_decomposition": {...}, "regression_pairs_global": [...], - "regression_pairs_by_archetype": {...} + "regression_pairs_by_archetype": {...}, + "variance_partitions": [...], + "variance_partitions_by_segment": [...], + "gp_kernel_fits": [...] } ``` | Field | Type | Description | |---|---|---| -| `schema_version` | `str` | Wire-shape version. Currently `"1.10"` (bumped over time as new additive sections — `causal_graph`, `correlations`, `outlier_injections`, multi-source mappings, `parent_child_relations`, `noise_config` — landed; 1.7 → 1.8 extended `noise_config` with `noise_family` / `degrees_of_freedom`; 1.8 → 1.9 added the optional `target_metric` field on the per-entity `treatment` and per-cohort `treatment_cohorts` records; 1.9 → 1.10 added the `seasonal_decomposition` snapshot plus per-pair OLS summaries in `regression_pairs_global` / `regression_pairs_by_archetype`) | +| `schema_version` | `str` | Wire-shape version. Currently `"1.10"` (bumped over time as new additive sections — `causal_graph`, `correlations`, `outlier_injections`, multi-source mappings, `parent_child_relations`, `noise_config` — landed; 1.7 → 1.8 extended `noise_config` with `noise_family` / `degrees_of_freedom`; 1.8 → 1.9 added the optional `target_metric` field on the per-entity `treatment` and per-cohort `treatment_cohorts` records; 1.9 → 1.10 added the `seasonal_decomposition` snapshot plus per-pair OLS summaries in `regression_pairs_global` / `regression_pairs_by_archetype`; the `variance_partitions` / `variance_partitions_by_segment` / `gp_kernel_fits` sections landed additively on 1.10 with no version bump) | | `seed` | `int` | The seed used for generation — `config.seed` | | `config_sha256` | `str` | Full SHA-256 hex of the JSON-serialized config. Detects config drift between generation and consumption | | `archetype_assignments` | array | One entry per entity; see below | @@ -95,6 +98,9 @@ produces a byte-identical `manifest.json`. Encoding: UTF-8, | `seasonal_decomposition` | object | Snapshot of the seasonal-strength inputs the engine consumed. Always emitted; configs without `seasonal_effects` get the empty-sentinel shape (empty list / empty dicts) | | `regression_pairs_global` | array | Pair-wise OLS summary (slope, intercept, r², residual variance) for every declared correlation pair, pooled across every entity. Empty list when no correlations are configured | | `regression_pairs_by_archetype` | object | Same OLS summary as `regression_pairs_global` but grouped by `Entity.archetype`. Keys are archetype names; values mirror the global list shape. Empty dict when no correlations are configured | +| `variance_partitions` | array | Nested-ANOVA variance decomposition per metric, with `Entity.archetype` as the between-group axis. One record per metric. Empty list when the config declares no metrics | +| `variance_partitions_by_segment` | array | Same nested-ANOVA decomposition with curve segment as the between-group axis, computed per archetype. One record per `(metric, archetype)` pair. Segments are never pooled across archetypes. Empty list when the config declares no metrics | +| `gp_kernel_fits` | array | RBF Gaussian-process kernel fits over each archetype's trajectory shape, plus per-entity records for entities that carry trajectory `overrides`. Empty list when the config declares no metrics | --- @@ -848,6 +854,220 @@ pooled fit will mispredict for the archetype whose β diverges most. --- +## `variance_partitions` + +Nested-ANOVA variance decomposition per metric, with `Entity.archetype` +as the between-group axis. + +```json +{ + "variance_partitions": [ + { + "metric": "engagement", + "scope": "archetype", + "scope_name": "all", + "ss_between": 12.4, + "ss_within_entity": 6.8, + "ss_residual": 41.0, + "fraction_between": 0.206, + "fraction_within_entity": 0.113, + "fraction_residual": 0.681, + "degrees_of_freedom_between": 1, + "degrees_of_freedom_within": 18, + "degrees_of_freedom_residual": 220, + "n_observations": 240, + "cold_start_entities_excluded": 0 + } + ] +} +``` + +| Field | Type | Description | +|---|---|---| +| `metric` | `str` | The metric this record decomposes (matches `config.metrics[i].name`) | +| `scope` | `str` | Always `"archetype"` for this section | +| `scope_name` | `str` | Always the literal sentinel `"all"` for this section — the partition spans every archetype the config declares | +| `ss_between` | `float` | Sum-of-squares attributable to the grouping axis (variance between archetype means around the grand mean) | +| `ss_within_entity` | `float` | Sum-of-squares between entity means within the same archetype | +| `ss_residual` | `float` | Within-entity sum-of-squares over time (residual to each entity's own mean) | +| `fraction_between` / `fraction_within_entity` / `fraction_residual` | `float` | Each `ss_*` divided by `ss_total = ss_between + ss_within_entity + ss_residual`. The three fractions sum to `1.0` (or to `0.0` for a fully constant metric) | +| `degrees_of_freedom_between` | `int` | `n_groups - 1` where `n_groups` is the number of archetypes that contributed observations | +| `degrees_of_freedom_within` | `int` | `n_cells - n_groups` where `n_cells` is the number of distinct `(archetype, entity)` pairs | +| `degrees_of_freedom_residual` | `int` | `n_observations - n_cells` | +| `n_observations` | `int` | Count of finite `(entity, period)` cells used. Cells with NaN (cold-start lead-ins, MCAR-rewritten values) are excluded | +| `cold_start_entities_excluded` | `int` | Count of entities that contributed at least one NaN cell to this partition. Surfaces "this section dropped data" without forcing a re-derivation of the NaN tally | + +The three sums-of-squares satisfy `ss_between + ss_within_entity + +ss_residual == ss_total` exactly (modulo floating-point rounding at +`rtol≈1e-10`). One record per metric, sorted by `metric` for stable JSON +output. + +**Use case** — diagnose how much of a metric's spread is explained by +the latent archetype label vs. by entity-level idiosyncrasy vs. by +within-entity time-series noise. A high `fraction_between` says the +archetype is the dominant driver of metric values; a high +`fraction_residual` with a low `fraction_between` says metric values +are essentially noise on top of an entity-specific mean. + +--- + +## `variance_partitions_by_segment` + +The same nested-ANOVA decomposition with curve segment as the between- +group axis, computed per archetype. + +```json +{ + "variance_partitions_by_segment": [ + { + "metric": "engagement", + "scope": "segment", + "scope_name": "growth", + "ss_between": 8.1, + "ss_within_entity": 4.6, + "ss_residual": 22.3, + "fraction_between": 0.231, + "fraction_within_entity": 0.131, + "fraction_residual": 0.638, + "degrees_of_freedom_between": 2, + "degrees_of_freedom_within": 27, + "degrees_of_freedom_residual": 90, + "n_observations": 120, + "cold_start_entities_excluded": 0 + } + ] +} +``` + +The schema is identical to `variance_partitions`. The differences: + +| Field | Difference | +|---|---| +| `scope` | Always `"segment"` | +| `scope_name` | The archetype name. Each archetype's segments are decomposed in isolation; the section never pools observations across archetypes | +| `degrees_of_freedom_between` | `n_curve_segments - 1` for the named archetype | +| `n_observations` | Restricted to entities of `scope_name`'s archetype only | + +One record per `(metric, archetype)` pair whose entities contributed at +least one finite observation. Sorted by `(metric, scope_name)` for +stable JSON output. + +Each entity's segment membership is derived from its own boundary +computation — entities with `start_period > 0` (cold-start) or with +trajectory `overrides` contribute observations to the segment they +actually occupied at each period, not to the archetype baseline +segment. Period membership is reported as `segment_0`, `segment_1`, … +internally and rolls up into the per-segment SS terms; downstream +consumers see only the partition totals. + +**Use case** — locate where in an archetype's curve a metric's +variance spreads most. A high `ss_between` for an archetype with three +distinct curve segments says the metric tracks the curve; a low +`ss_between` says the metric is decoupled from the archetype's +narrative phase structure. + +--- + +## `gp_kernel_fits` + +RBF Gaussian-process kernel fits over each archetype's trajectory +shape. Surfaces a smoothness characterization the trajectory tape +itself doesn't directly expose. + +```json +{ + "gp_kernel_fits": [ + { + "scope_type": "archetype", + "scope_name": "growth", + "kernel_type": "rbf", + "hyperparameters": { + "length_scale": 4.7, + "signal_variance": 0.31, + "noise_variance": 0.0008 + }, + "log_marginal_likelihood": -3.2, + "n_train": 12, + "converged": true + }, + { + "scope_type": "archetype", + "scope_name": "flat", + "kernel_type": "rbf", + "hyperparameters": {}, + "log_marginal_likelihood": null, + "n_train": 12, + "converged": false + }, + { + "scope_type": "entity", + "scope_name": "growers_001", + "kernel_type": "rbf", + "hyperparameters": { + "length_scale": 6.1, + "signal_variance": 0.28, + "noise_variance": 0.0012 + }, + "log_marginal_likelihood": -2.9, + "n_train": 10, + "converged": true + } + ] +} +``` + +| Field | Type | Description | +|---|---|---| +| `scope_type` | `str` | Either `"archetype"` (one fit per declared archetype, against the archetype's clean trajectory) or `"entity"` (one fit per override-bearing entity, against that entity's specific trajectory) | +| `scope_name` | `str` | The archetype name or the entity name, depending on `scope_type` | +| `kernel_type` | `str` | Always `"rbf"` for now. Reserved as a discriminator for future kernel families | +| `hyperparameters` | object | Three keys when `converged=true`: `length_scale`, `signal_variance`, `noise_variance`. All in the natural (unstandardized) scale — `length_scale` is in units of period indices. Empty `{}` when `converged=false` | +| `log_marginal_likelihood` | `float` or `null` | The maximized value (positive sign — the fitter minimizes the negative log likelihood internally and negates before reporting). `null` when `converged=false` | +| `n_train` | `int` | Count of finite `(period, position)` training pairs used. NaN cells (cold-start prefix periods) are excluded | +| `converged` | `bool` | `true` when the optimizer reported success AND produced finite hyperparameters. `false` otherwise — see below for the failure modes | + +### Records emitted + +* One `scope_type="archetype"` record per archetype the config + declares. The fit consumes the archetype's *clean* trajectory (no + overrides, no cold-start shift) so the kernel characterizes the + archetype's intrinsic shape, not any individual entity's realized + data. +* One `scope_type="entity"` record per entity carrying a non-`None` + `overrides` field. Default-trajectory entities do *not* produce + per-entity records — only override-bearing entities do. + +### When `converged=false` + +The optimizer's failure paths are surfaced as a non-fatal record (the +manifest build never raises on a failed fit): + +* **Flat trajectory** — variance below the floor (≈ `1e-12`). The RBF + likelihood surface is degenerate when the signal has no variance to + fit. +* **Sparse data** — fewer than three finite training points (the + kernel has three hyperparameters; under-determined fits are + short-circuited). +* **Optimizer failure** — `scipy.optimize.minimize` reports + non-success or returns a non-finite NLL. +* **Numerical blow-up** — Cholesky factorization fails on the + covariance matrix despite the noise-variance floor. + +`converged=false` records carry an empty `hyperparameters` dict and a +`null` log marginal likelihood. Consumers should gate downstream +usage on the flag rather than inspecting `hyperparameters` directly. + +**Use case** — characterize trajectory smoothness without re-fitting a +GP downstream. A short `length_scale` (≪ `n_periods`) says the +archetype oscillates or has fast transitions; a long `length_scale` +(≈ `n_periods`) says it's gradual or monotone. Compare per-entity +records against their parent archetype to detect override-driven +shape divergence — a per-entity `length_scale` that disagrees with +the archetype baseline is direct evidence that the override pushed +the entity's curve onto a different smoothness regime. + +--- + ## Reading the manifest in Python ```python diff --git a/docs/site/user-guide/archetypes.md b/docs/site/user-guide/archetypes.md index 5737f1b..6eef991 100644 --- a/docs/site/user-guide/archetypes.md +++ b/docs/site/user-guide/archetypes.md @@ -231,6 +231,35 @@ controls peak height above `base`. Each cycle ramps from `base` to --- +## What the manifest records about archetypes + +Every run emits three archetype-shaped sections in `manifest.json` +([reference](../manifest-reference.md)) so a downstream consumer can +characterize the engine's archetype mix without re-running it: + +* **`variance_partitions`** — one nested-ANOVA record per metric, with + `Entity.archetype` as the between-group axis. The + `fraction_between` value answers "how much of this metric's spread + comes from the latent archetype label vs. entity-level idiosyncrasy + vs. within-entity time-series noise?" +* **`variance_partitions_by_segment`** — the same decomposition with + curve segment as the axis, computed per archetype (segments are + never pooled across archetypes). Surfaces which curve phases + contribute most to a metric's spread within an archetype. +* **`gp_kernel_fits`** — RBF Gaussian-process fit over each + archetype's clean trajectory. The recovered `length_scale` + characterizes the archetype's smoothness — short for oscillating / + fast-transition shapes, long for monotone or gradual ones. Entities + carrying trajectory `overrides` (per-entity inflection shifts) get + their own per-entity record alongside the archetype baseline so + override-driven shape divergence is directly visible. + +Configs without metrics emit empty containers for all three sections. +Flat trajectories produce a `converged: false` GP record with null +hyperparameters — the build never aborts on a degenerate fit. + +--- + ## What to read next - [Metrics and connections](./metrics-and-connections.md) — the metric diff --git a/plotsim/gp.py b/plotsim/gp.py new file mode 100644 index 0000000..9a53790 --- /dev/null +++ b/plotsim/gp.py @@ -0,0 +1,216 @@ +"""plotsim.gp — RBF Gaussian-process kernel fit for trajectory characterization. + +Mission 026: emits a per-archetype (or per-entity for override-bearing +entities) RBF kernel fit into the manifest so downstream consumers can +characterize trajectory smoothness (length scale, signal variance) without +re-fitting their own GP against the realized metric values. + +This module is analytical-only and stateless: + + * No filesystem I/O. + * No engine state — callers translate trajectory arrays into ``(x, y)`` + training data and pass them in. + * No RNG — same input → same fit result. + +The kernel is RBF (squared exponential) plus a homoscedastic noise term: + + k(t, t') = signal_variance * exp(-0.5 * (t - t')² / length_scale²) + + noise_variance * I + +Hyperparameters are optimized in log-space (``L-BFGS-B``) for numerical +stability — positivity is guaranteed by the parametrization. Degenerate +inputs (constant trajectories, ``< 3`` finite training points, +zero-range support, Cholesky failures, optimizer non-convergence) are +caught and surfaced as ``RBFFitResult(converged=False, hyperparameters=None, +log_marginal_likelihood=None)`` — never as exceptions. This is the +contract the manifest builder relies on: a failed fit must not abort +manifest emission. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + +import numpy as np +from scipy.optimize import minimize + +_LOG_TWO_PI = float(np.log(2.0 * np.pi)) +# Trajectories whose variance is below this floor are treated as flat — +# the RBF likelihood surface is degenerate (signal_variance → 0 floor) +# and fitting has no informational content. +_FLAT_VARIANCE_FLOOR = 1e-12 +# Log-space bounds for each hyperparameter. Keeps the optimizer away from +# numerically unstable regions where Cholesky would fail. +_LOG_HYPER_MIN = -8.0 +_LOG_HYPER_MAX = 8.0 +# Jitter added to the kernel diagonal for numerical stability beyond the +# noise term itself — keeps near-singular matrices factorizable. +_JITTER = 1e-10 + + +@dataclass(frozen=True) +class RBFFitResult: + """Outcome of one RBF kernel fit. + + ``converged=True`` means the optimizer reported success AND produced + finite hyperparameters. ``converged=False`` is the catch-all for + every degenerate path; consumers should gate downstream usage on the + flag rather than inspecting ``hyperparameters`` directly. + """ + + converged: bool + hyperparameters: Optional[dict[str, float]] + log_marginal_likelihood: Optional[float] + n_train: int + + +def _neg_log_marginal_likelihood( + log_theta: np.ndarray, + y_centered: np.ndarray, + d2: np.ndarray, +) -> float: + """Negative log marginal likelihood for the RBF + noise kernel. + + ``d2`` is the pairwise squared-distance matrix of the training inputs; + ``y_centered`` is the training targets with the empirical mean + subtracted (so the GP prior mean of zero is appropriate). Returns + ``1e10`` as a soft penalty for Cholesky failures so the optimizer + backs off the bad region instead of crashing. + """ + log_length_scale, log_signal_var, log_noise_var = log_theta + length_scale = float(np.exp(log_length_scale)) + signal_var = float(np.exp(log_signal_var)) + noise_var = float(np.exp(log_noise_var)) + n = y_centered.shape[0] + k = signal_var * np.exp(-0.5 * d2 / (length_scale * length_scale)) + k = k + (noise_var + _JITTER) * np.eye(n) + try: + chol = np.linalg.cholesky(k) + except np.linalg.LinAlgError: + return 1e10 + alpha = np.linalg.solve(chol.T, np.linalg.solve(chol, y_centered)) + log_det = 2.0 * float(np.sum(np.log(np.diag(chol)))) + nll = 0.5 * float(y_centered @ alpha) + 0.5 * log_det + 0.5 * float(n) * _LOG_TWO_PI + if not np.isfinite(nll): + return 1e10 + return float(nll) + + +def fit_rbf(x: np.ndarray, y: np.ndarray) -> RBFFitResult: + """Fit an RBF kernel + homoscedastic noise to ``(x, y)``. + + Args: + x: 1-D training inputs (typically period indices as float64). + y: 1-D training targets (typically trajectory positions in [0, 1]). + + Returns: + ``RBFFitResult``. Non-finite cells in either ``x`` or ``y`` are + masked out before fitting. Returns ``converged=False`` (and null + hyperparameters / likelihood) when: + + * fewer than 3 finite training points remain after masking; + * the masked ``y`` has variance below ``_FLAT_VARIANCE_FLOOR``; + * the masked ``x`` has zero range (all inputs identical); + * the optimizer fails to converge OR returns a non-finite NLL. + + On success ``hyperparameters`` carries the three keys + ``length_scale``, ``signal_variance``, ``noise_variance`` (in the + original unstandardized scale) and ``log_marginal_likelihood`` + carries the maximized value (positive sign — the function + minimizes the *negative* log likelihood and the result is + negated before reporting). + """ + x_arr = np.asarray(x, dtype=np.float64) + y_arr = np.asarray(y, dtype=np.float64) + if x_arr.shape != y_arr.shape or x_arr.ndim != 1: + return RBFFitResult( + converged=False, + hyperparameters=None, + log_marginal_likelihood=None, + n_train=0, + ) + mask = np.isfinite(x_arr) & np.isfinite(y_arr) + xf = x_arr[mask] + yf = y_arr[mask] + n = int(xf.size) + if n < 3: + return RBFFitResult( + converged=False, + hyperparameters=None, + log_marginal_likelihood=None, + n_train=n, + ) + y_var = float(np.var(yf)) + if y_var < _FLAT_VARIANCE_FLOOR: + return RBFFitResult( + converged=False, + hyperparameters=None, + log_marginal_likelihood=None, + n_train=n, + ) + x_range = float(xf.max() - xf.min()) + if x_range <= 0.0: + return RBFFitResult( + converged=False, + hyperparameters=None, + log_marginal_likelihood=None, + n_train=n, + ) + + yf_centered = yf - float(np.mean(yf)) + diff = xf[:, None] - xf[None, :] + d2 = diff * diff + + init_length_scale = max(x_range / 4.0, 1.0) + init_signal_var = max(y_var, 1e-6) + init_noise_var = max(y_var * 1e-3, 1e-8) + log_theta0 = np.array( + [ + float(np.log(init_length_scale)), + float(np.log(init_signal_var)), + float(np.log(init_noise_var)), + ], + dtype=np.float64, + ) + bounds = [ + (_LOG_HYPER_MIN, _LOG_HYPER_MAX), + (_LOG_HYPER_MIN, _LOG_HYPER_MAX), + (_LOG_HYPER_MIN, _LOG_HYPER_MAX), + ] + try: + result = minimize( + _neg_log_marginal_likelihood, + log_theta0, + args=(yf_centered, d2), + method="L-BFGS-B", + bounds=bounds, + ) + except (ValueError, np.linalg.LinAlgError): + return RBFFitResult( + converged=False, + hyperparameters=None, + log_marginal_likelihood=None, + n_train=n, + ) + if not bool(result.success) or not np.isfinite(result.fun): + return RBFFitResult( + converged=False, + hyperparameters=None, + log_marginal_likelihood=None, + n_train=n, + ) + log_length_scale, log_signal_var, log_noise_var = result.x + return RBFFitResult( + converged=True, + hyperparameters={ + "length_scale": float(np.exp(log_length_scale)), + "signal_variance": float(np.exp(log_signal_var)), + "noise_variance": float(np.exp(log_noise_var)), + }, + log_marginal_likelihood=float(-result.fun), + n_train=n, + ) + + +__all__ = ["RBFFitResult", "fit_rbf"] diff --git a/plotsim/manifest.py b/plotsim/manifest.py index bcd695f..ab8e28d 100644 --- a/plotsim/manifest.py +++ b/plotsim/manifest.py @@ -722,6 +722,115 @@ class RegressionPair(_ManifestBase): n_observations: int +class VariancePartition(_ManifestBase): + """Mission 026: nested-ANOVA variance partition for one metric. + + A three-level decomposition of the realized metric series: + + * ``ss_between`` — variance attributable to the grouping axis. For + ``scope="archetype"`` the axis is ``Entity.archetype`` (variance + between archetype means around the grand mean). For + ``scope="segment"`` the axis is the curve segment index within + ``scope_name``'s archetype (variance between segment means + around the archetype's grand mean). + * ``ss_within_entity`` — variance between entity means within the + same group. Captures entity-to-entity dispersion that the + grouping axis does not explain. + * ``ss_residual`` — within-entity variance over time, residual to + the entity's own mean within its group. The unexplained + time-series noise floor. + + The three sum to ``ss_total = Σ (y - grand_mean)²`` exactly modulo + floating-point rounding (rtol ≈ 1e-10 holds in practice). + ``fraction_*`` fields are each ``ss_* / ss_total`` (or zero when + ``ss_total == 0``). + + Degrees of freedom follow the standard nested-ANOVA convention: + + * ``degrees_of_freedom_between = n_groups - 1`` + * ``degrees_of_freedom_within = n_cells - n_groups`` (cells = + unique ``(group, entity)`` pairs that contributed observations) + * ``degrees_of_freedom_residual = n_observations - n_cells`` + + Cold-start entities (those with ``Entity.start_period > 0``) and + other NaN-cell sources are excluded cell-by-cell — only finite + observations contribute. ``cold_start_entities_excluded`` counts + entities that had at least one NaN cell in this partition's + domain; it surfaces "this section dropped data" to the consumer + without forcing them to re-derive the NaN count. + + ``scope_name`` semantics: + + * ``scope="archetype"`` — single record per metric covering every + archetype. ``scope_name`` is the literal sentinel ``"all"``. + * ``scope="segment"`` — one record per ``(metric, archetype)``. + ``scope_name`` is the archetype name, and ``ss_between`` / + DOF reflect the segments within that archetype only. Segments + are NEVER pooled across archetypes — the manifest guarantees + the archetype-locality of every segment-scope record. + """ + + metric: str + scope: str + scope_name: str + ss_between: float + ss_within_entity: float + ss_residual: float + fraction_between: float + fraction_within_entity: float + fraction_residual: float + degrees_of_freedom_between: int + degrees_of_freedom_within: int + degrees_of_freedom_residual: int + n_observations: int + cold_start_entities_excluded: int = 0 + + +class GPKernelFit(_ManifestBase): + """Mission 026: RBF Gaussian-process fit summary for one trajectory shape. + + Emitted with ``scope_type="archetype"`` (one record per declared + archetype, fitted against the archetype's clean trajectory) plus + additional ``scope_type="entity"`` records for any entity carrying + a non-None ``overrides`` field (the entity's specific trajectory + diverges from the archetype baseline and warrants its own fit). + + * ``scope_name`` — the archetype name (for archetype scope) or + the entity name (for entity scope). + * ``kernel_type`` — currently always ``"rbf"``. Reserved for a + future multi-kernel extension; consumers should still gate on + the value for forward compatibility. + * ``hyperparameters`` — three keys when ``converged=True``: + ``length_scale``, ``signal_variance``, ``noise_variance``. All + three are in the natural (unstandardized) scale — + ``length_scale`` is in units of period indices, so a value of + ``10.0`` means the kernel correlation length spans ten periods. + Empty dict when ``converged=False``. + * ``log_marginal_likelihood`` — the maximized value (positive + sign; the fitter minimizes the negative log likelihood + internally and negates before reporting). ``None`` when + ``converged=False``. + * ``n_train`` — count of finite ``(period, position)`` training + pairs used. NaN cells (cold-start prefix periods) are + excluded. + * ``converged`` — ``True`` when the optimizer reported success + AND produced finite hyperparameters. ``False`` for flat + trajectories (variance below ``1e-12``), trajectories with + fewer than three finite training points, optimizer failures, + and Cholesky-blow-up paths. Consumers should gate downstream + usage on this flag rather than inspecting hyperparameters + directly. + """ + + scope_type: str + scope_name: str + kernel_type: str + hyperparameters: dict[str, float] = Field(default_factory=dict) + log_marginal_likelihood: Optional[float] = None + n_train: int + converged: bool + + class HoldoutInfo(_ManifestBase): """M109: ground-truth record of the temporal holdout split. @@ -883,6 +992,24 @@ class ManifestSchema(_ManifestBase): # OLS (rare; a single-entity archetype with cold-start NaN can # produce ``n_observations < 2``). regression_pairs_by_archetype: dict[str, list[RegressionPair]] = {} + # Mission 026: nested-ANOVA variance partitions with archetype as + # the between-group axis. One record per metric. Empty list when + # ``entity_metrics`` was not threaded or the config declares no + # metrics — matches the empty-section contract on no-metric runs. + variance_partitions: list[VariancePartition] = [] + # Mission 026: nested-ANOVA variance partitions with curve segment + # as the between-group axis, computed per archetype (never pooled + # across archetypes). One record per ``(metric, archetype)`` pair + # whose entities contributed at least one finite observation. + # Empty list under the same conditions as ``variance_partitions``. + variance_partitions_by_segment: list[VariancePartition] = [] + # Mission 026: per-archetype (and per-entity for entities with + # ``overrides``) RBF GP kernel fits over the trajectory shape. + # Empty list when ``config.metrics`` is empty (the section + # piggybacks on metric presence so no-metric configs stay + # byte-equivalent to the pre-026 lane modulo the schema version + # string and the new empty containers). + gp_kernel_fits: list[GPKernelFit] = [] # --- Helpers ----------------------------------------------------------------- @@ -1348,6 +1475,388 @@ def _build_regression_pairs_by_archetype( return out +# --- Variance partition / GP helpers (Mission 026) --------------------------- + + +def _entity_segment_membership( + config: PlotsimConfig, + n_periods: int, +) -> dict[str, list[int]]: + """Per-entity ``(period_index → segment_index)`` lookup table. + + Returns one length-``n_periods`` list per entity. Entry ``p`` is the + curve segment index that period maps to under that entity's own + boundary computation (overrides + ``start_period`` applied), or + ``-1`` for periods before the entity's arrival (cold-start prefix) + and for entities whose archetype is unresolvable. + + Reuses ``trajectory._segment_boundaries`` / ``_resolve_shift`` so + the segment definition is byte-equivalent to the trajectory the + engine actually generated. + """ + # Local imports: ``plotsim.trajectory`` already imports from + # ``plotsim.config``, so importing it at module top wouldn't cycle — + # but keeping the imports local matches the pattern M25 used for + # ``_build_seasonal_factors``. + from plotsim.trajectory import _resolve_shift, _segment_boundaries + + arch_by_name = {a.name: a for a in config.archetypes} + out: dict[str, list[int]] = {} + for entity in config.entities: + archetype = arch_by_name.get(entity.archetype) + membership = [-1] * n_periods + if archetype is None: + out[entity.name] = membership + continue + active_n = n_periods - entity.start_period + if active_n < 1: + out[entity.name] = membership + continue + overrides_dict = entity.overrides.model_dump() if entity.overrides is not None else None + shift = _resolve_shift(archetype, active_n, overrides_dict) + boundaries = _segment_boundaries(archetype, active_n, shift) + for seg_idx in range(len(archetype.curve_segments)): + local_start = boundaries[seg_idx] + local_end = boundaries[seg_idx + 1] + absolute_start = entity.start_period + local_start + absolute_end = entity.start_period + local_end + for p in range(absolute_start, absolute_end): + if 0 <= p < n_periods: + membership[p] = seg_idx + out[entity.name] = membership + return out + + +def _three_level_anova( + cells: list[tuple[str, str, np.ndarray]], +) -> Optional[dict[str, float]]: + """Compute the three-level nested-ANOVA SS / DOF decomposition. + + ``cells`` is a list of ``(group_label, entity_label, values)`` + tuples. Each tuple represents one ``(group, entity)`` cell with at + least one finite observation. The decomposition is: + + ss_total = ss_between + ss_within_entity + ss_residual + + where + + * ``ss_between`` is the variance between group means around the + grand mean (weighted by per-group N); + * ``ss_within_entity`` is the variance between entity means + around their group mean (weighted by per-cell N); + * ``ss_residual`` is the within-cell residual variance. + + Returns ``None`` when ``cells`` is empty or carries zero + observations in total. The caller filters those out — the manifest + section omits records with no data rather than emitting zero-only + entries. + """ + if not cells: + return None + all_values = np.concatenate([c[2] for c in cells]) + n_total = int(all_values.size) + if n_total == 0: + return None + grand_mean = float(np.mean(all_values)) + ss_total = float(np.sum((all_values - grand_mean) ** 2)) + + cell_n: list[int] = [] + cell_mean: list[float] = [] + cell_ss_residual: list[float] = [] + group_labels: list[str] = [] + for group_label, _entity_label, values in cells: + n = int(values.size) + if n == 0: + continue + mean = float(np.mean(values)) + cell_n.append(n) + cell_mean.append(mean) + cell_ss_residual.append(float(np.sum((values - mean) ** 2))) + group_labels.append(group_label) + + if not cell_n: + return None + + group_totals: dict[str, tuple[float, int]] = {} + for label, n, mean in zip(group_labels, cell_n, cell_mean): + prev_sum, prev_n = group_totals.get(label, (0.0, 0)) + group_totals[label] = (prev_sum + n * mean, prev_n + n) + group_mean_by_label: dict[str, float] = { + label: (s / n if n > 0 else 0.0) for label, (s, n) in group_totals.items() + } + + ss_between = 0.0 + for label, (_, n) in group_totals.items(): + diff = group_mean_by_label[label] - grand_mean + ss_between += float(n) * diff * diff + + ss_within_entity = 0.0 + for label, n, mean in zip(group_labels, cell_n, cell_mean): + diff = mean - group_mean_by_label[label] + ss_within_entity += float(n) * diff * diff + + ss_residual = float(sum(cell_ss_residual)) + + n_groups = len(group_totals) + n_cells = len(cell_n) + df_between = max(n_groups - 1, 0) + df_within = max(n_cells - n_groups, 0) + df_residual = max(n_total - n_cells, 0) + + return { + "ss_total": ss_total, + "ss_between": ss_between, + "ss_within_entity": ss_within_entity, + "ss_residual": ss_residual, + "df_between": float(df_between), + "df_within": float(df_within), + "df_residual": float(df_residual), + "n_observations": float(n_total), + } + + +def _make_variance_partition( + metric: str, + scope: str, + scope_name: str, + anova: dict[str, float], + cold_start_entities_excluded: int, +) -> VariancePartition: + """Translate an ``_three_level_anova`` result dict into a manifest record.""" + ss_total = anova["ss_total"] + if ss_total > 0.0: + f_between = anova["ss_between"] / ss_total + f_within = anova["ss_within_entity"] / ss_total + f_residual = anova["ss_residual"] / ss_total + else: + f_between = 0.0 + f_within = 0.0 + f_residual = 0.0 + return VariancePartition( + metric=metric, + scope=scope, + scope_name=scope_name, + ss_between=float(anova["ss_between"]), + ss_within_entity=float(anova["ss_within_entity"]), + ss_residual=float(anova["ss_residual"]), + fraction_between=float(f_between), + fraction_within_entity=float(f_within), + fraction_residual=float(f_residual), + degrees_of_freedom_between=int(anova["df_between"]), + degrees_of_freedom_within=int(anova["df_within"]), + degrees_of_freedom_residual=int(anova["df_residual"]), + n_observations=int(anova["n_observations"]), + cold_start_entities_excluded=int(cold_start_entities_excluded), + ) + + +def _build_variance_partitions_archetype( + config: PlotsimConfig, + entity_metrics: dict[str, dict[str, np.ndarray]], +) -> list[VariancePartition]: + """One ``VariancePartition`` per metric with archetype as the group axis. + + Groups every finite ``(entity, period)`` observation by + ``Entity.archetype``. Drops metrics with no finite observations + entirely — the section omits records with no data rather than + emitting zero-only entries (mirrors the pre-existing manifest + convention for sparse derived sections). + """ + if not config.metrics or not entity_metrics: + return [] + out: list[VariancePartition] = [] + for metric in config.metrics: + cells: list[tuple[str, str, np.ndarray]] = [] + cold_start = 0 + for entity in config.entities: + per_metric = entity_metrics.get(entity.name) + if per_metric is None: + continue + arr = per_metric.get(metric.name) + if arr is None: + continue + arr64 = np.asarray(arr, dtype=np.float64) + mask = np.isfinite(arr64) + n_kept = int(np.sum(mask)) + if n_kept < int(arr64.size): + cold_start += 1 + if n_kept == 0: + continue + cells.append((entity.archetype, entity.name, arr64[mask])) + anova = _three_level_anova(cells) + if anova is None: + continue + out.append( + _make_variance_partition( + metric.name, + scope="archetype", + scope_name="all", + anova=anova, + cold_start_entities_excluded=cold_start, + ) + ) + out.sort(key=lambda v: v.metric) + return out + + +def _build_variance_partitions_by_segment( + config: PlotsimConfig, + entity_metrics: dict[str, dict[str, np.ndarray]], + n_periods: int, +) -> list[VariancePartition]: + """One ``VariancePartition`` per ``(metric, archetype)`` with segment as the group axis. + + Restricted to entities of one archetype at a time — the section + never pools observations across archetypes. The segment index is + derived from each entity's own boundary computation + (``_entity_segment_membership``), so cold-start entities and + override-bearing entities contribute observations to the segment + they actually occupied at each period, not to the archetype + baseline segment. + """ + if not config.metrics or not entity_metrics: + return [] + arch_by_name = {a.name: a for a in config.archetypes} + entities_by_archetype: dict[str, list[Any]] = {} + for entity in config.entities: + entities_by_archetype.setdefault(entity.archetype, []).append(entity) + + membership = _entity_segment_membership(config, n_periods) + + out: list[VariancePartition] = [] + for metric in config.metrics: + for archetype_name in sorted(entities_by_archetype.keys()): + if archetype_name not in arch_by_name: + continue + entities = entities_by_archetype[archetype_name] + grouped: dict[tuple[int, str], list[float]] = {} + cold_start = 0 + for entity in entities: + per_metric = entity_metrics.get(entity.name) + if per_metric is None: + continue + arr = per_metric.get(metric.name) + if arr is None: + continue + arr64 = np.asarray(arr, dtype=np.float64) + seg_map = membership.get(entity.name, [-1] * n_periods) + has_nan = False + n_periods_local = int(arr64.size) + for p in range(n_periods_local): + val = arr64[p] + if not np.isfinite(val): + has_nan = True + continue + seg_idx = seg_map[p] if p < len(seg_map) else -1 + if seg_idx < 0: + continue + grouped.setdefault((seg_idx, entity.name), []).append(float(val)) + if has_nan: + cold_start += 1 + if not grouped: + continue + cells = [ + ( + f"segment_{seg_idx}", + entity_name, + np.asarray(values, dtype=np.float64), + ) + for (seg_idx, entity_name), values in grouped.items() + ] + anova = _three_level_anova(cells) + if anova is None: + continue + out.append( + _make_variance_partition( + metric.name, + scope="segment", + scope_name=archetype_name, + anova=anova, + cold_start_entities_excluded=cold_start, + ) + ) + out.sort(key=lambda v: (v.metric, v.scope_name)) + return out + + +def _build_gp_kernel_fits( + config: PlotsimConfig, + n_periods: int, +) -> list[GPKernelFit]: + """Per-archetype (and per-override-entity) RBF GP fits. + + ``scope_type="archetype"`` for every archetype: fit against the + archetype's clean trajectory (no overrides, no cold-start shift). + ``scope_type="entity"`` for every entity carrying a non-None + ``overrides`` field — those entities have trajectories that diverge + from their archetype baseline, so a per-entity record captures the + actual shape the engine generated. + + The piggyback on ``config.metrics`` (an empty list skips emission + entirely) matches the byte-equivalence contract: configs with no + metrics emit no GP records. + + Records are emitted even when the fit fails to converge — the + ``converged=False`` path carries null hyperparameters / likelihood + so consumers can distinguish "fit failed" from "fit not attempted." + """ + if not config.metrics or n_periods < 1: + return [] + # Local imports: ``plotsim.gp`` is a leaf analytical module that + # depends on scipy; importing at module top would force scipy load + # on every manifest build, including no-metric runs. + from plotsim.gp import fit_rbf + from plotsim.trajectory import compute_trajectory + + out: list[GPKernelFit] = [] + arch_by_name = {a.name: a for a in config.archetypes} + x = np.arange(n_periods, dtype=np.float64) + + for archetype_name in sorted(arch_by_name.keys()): + archetype = arch_by_name[archetype_name] + traj = compute_trajectory(archetype, n_periods) + result = fit_rbf(x, traj) + out.append( + GPKernelFit( + scope_type="archetype", + scope_name=archetype_name, + kernel_type="rbf", + hyperparameters=result.hyperparameters or {}, + log_marginal_likelihood=result.log_marginal_likelihood, + n_train=result.n_train, + converged=result.converged, + ) + ) + + for entity in config.entities: + if entity.overrides is None: + continue + entity_archetype = arch_by_name.get(entity.archetype) + if entity_archetype is None: + continue + overrides_dict = entity.overrides.model_dump() + traj = compute_trajectory( + entity_archetype, + n_periods, + overrides_dict, + entity.start_period, + ) + result = fit_rbf(x, traj) + out.append( + GPKernelFit( + scope_type="entity", + scope_name=entity.name, + kernel_type="rbf", + hyperparameters=result.hyperparameters or {}, + log_marginal_likelihood=result.log_marginal_likelihood, + n_train=result.n_train, + converged=result.converged, + ) + ) + + return out + + # --- Build / write ----------------------------------------------------------- @@ -1736,6 +2245,27 @@ def build_manifest( regression_pairs_global = [] regression_pairs_by_archetype = {} + # Mission 026: nested-ANOVA variance partitions (archetype-level and + # segment-within-archetype) plus per-archetype / per-override-entity + # GP kernel fits. All three sections piggyback on the same call-site + # contract: variance partitions need ``entity_metrics``; GP fits + # piggyback on ``config.metrics`` being non-empty so a no-metric + # config emits empty sections (the byte-equivalence contract). + if entity_metrics is not None: + variance_partitions = _build_variance_partitions_archetype( + config, + entity_metrics, + ) + variance_partitions_by_segment = _build_variance_partitions_by_segment( + config, + entity_metrics, + n_periods, + ) + else: + variance_partitions = [] + variance_partitions_by_segment = [] + gp_kernel_fits = _build_gp_kernel_fits(config, n_periods) + return ManifestSchema( schema_version=MANIFEST_SCHEMA_VERSION, seed=int(config.seed), @@ -1761,6 +2291,9 @@ def build_manifest( seasonal_decomposition=seasonal_decomposition, regression_pairs_global=regression_pairs_global, regression_pairs_by_archetype=regression_pairs_by_archetype, + variance_partitions=variance_partitions, + variance_partitions_by_segment=variance_partitions_by_segment, + gp_kernel_fits=gp_kernel_fits, ) @@ -1794,6 +2327,7 @@ def write_manifest(manifest: ManifestSchema, output_dir: Path) -> Path: "CorrelationCompensation", "CorrelationEntry", "EntityArchetypeAssignment", + "GPKernelFit", "TreatmentAssignment", "TreatmentCohort", "EventFiring", @@ -1808,6 +2342,7 @@ def write_manifest(manifest: ManifestSchema, output_dir: Path) -> Path: "SeasonalDecomposition", "SourceEntityMapping", "TrajectorySample", + "VariancePartition", "build_manifest", "config_sha256", "write_manifest", diff --git a/tests/test_manifest_variance_gp.py b/tests/test_manifest_variance_gp.py new file mode 100644 index 0000000..8664427 --- /dev/null +++ b/tests/test_manifest_variance_gp.py @@ -0,0 +1,752 @@ +"""Manifest enrichment: nested-ANOVA variance partition + RBF GP kernel fit. + +Mission 026 adds three additive sections to ``ManifestSchema`` (no schema +version bump — still 1.10): + +* ``variance_partitions`` — nested-ANOVA decomposition per metric with + ``Entity.archetype`` as the between-group axis. ``ss_between + + ss_within_entity + ss_residual == ss_total`` exactly (modulo + floating-point rounding). +* ``variance_partitions_by_segment`` — same decomposition with curve- + segment-within-archetype as the axis. Per-archetype only; segments are + never pooled across archetypes. +* ``gp_kernel_fits`` — per-archetype RBF kernel fits over the trajectory + shape, plus per-entity records for entities carrying ``overrides``. + +This module locks in: + +* ANOVA identity (``ss_between + ss_within_entity + ss_residual == + ss_total``) for the archetype scope at ``rtol=1e-10``. +* Per-archetype within-segment computation never pools across archetypes + (a multi-archetype config emits one record per archetype, each + restricted to its own entities). +* Cold-start NaN cells are excluded from the partition and the + ``cold_start_entities_excluded`` counter surfaces the dropped count. +* Empty sections for no-metric configs. +* RBF length scale recovered on a sinusoidal trajectory is shorter than + the total period span (captures the periodicity). +* Sigmoid trajectory fit converges with a length scale consistent with + monotone smoothness. +* Flat trajectory: ``converged=False`` with null hyperparameters; the + manifest build does not raise. +* Entity-level trajectory overrides emit ``scope_type="entity"`` records + alongside the archetype baseline. +* No new ``ManifestSchema`` field outside the three Mission 026 + additions — guards against future additive drift. +""" + +from __future__ import annotations + +import warnings + +import numpy as np + +from plotsim import generate_tables_with_state +from plotsim.config import ( + Archetype, + Column, + CurveSegment, + Domain, + Entity, + EntityOverrides, + Metric, + OutputConfig, + PlotsimConfig, + SurrogateKeyWarning, + Table, + TimeWindow, +) +from plotsim.gp import fit_rbf +from plotsim.manifest import ( + GPKernelFit, + MANIFEST_SCHEMA_VERSION, + VariancePartition, + build_manifest, +) + + +# --- Fixtures -------------------------------------------------------------- + + +def _config_with_archetypes( + archetypes: list[Archetype], + entities: list[Entity], + *, + metrics: list[Metric] | None = None, +) -> PlotsimConfig: + """Engine config with caller-supplied archetypes / entities. + + Defaults to a single beta-distributed metric ``m1`` so the realized + ``entity_metrics`` arrays have well-conditioned variance for the + ANOVA decomposition. Pass ``metrics=[]`` to drive a no-metric run + (variance partitions and GP fits both empty in that path). + """ + if metrics is None: + metrics = [ + Metric( + name="m1", + label="m1", + distribution="beta", + params={"alpha": 2.0, "beta": 5.0}, + polarity="positive", + ), + ] + if metrics: + fct_columns = [ + Column(name="date_key", dtype="id", source="fk:dim_date.date_key"), + Column(name="entity_id", dtype="id", source="fk:dim_entity.entity_id"), + ] + [Column(name=m.name, dtype="float", source=f"metric:{m.name}") for m in metrics] + fct = Table( + name="fct_m", + type="fact", + grain="per_entity_per_period", + primary_key=["date_key", "entity_id"], + foreign_keys=["dim_date.date_key", "dim_entity.entity_id"], + columns=fct_columns, + ) + tables = [ + Table( + name="dim_date", + type="dim", + grain="per_period", + primary_key="date_key", + columns=[ + Column(name="date_key", dtype="id", source="pk"), + Column(name="date", dtype="date", source="generated:date_key"), + ], + ), + Table( + name="dim_entity", + type="dim", + grain="per_entity", + primary_key="entity_id", + columns=[Column(name="entity_id", dtype="id", source="pk")], + ), + fct, + ] + else: + tables = [ + Table( + name="dim_date", + type="dim", + grain="per_period", + primary_key="date_key", + columns=[ + Column(name="date_key", dtype="id", source="pk"), + Column(name="date", dtype="date", source="generated:date_key"), + ], + ), + Table( + name="dim_entity", + type="dim", + grain="per_entity", + primary_key="entity_id", + columns=[Column(name="entity_id", dtype="id", source="pk")], + ), + ] + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SurrogateKeyWarning) + return PlotsimConfig( + domain=Domain( + name="t", + description="t", + entity_type="entity", + entity_label="Entities", + ), + time_window=TimeWindow( + start="2024-01", + end="2024-12", + granularity="monthly", + ), + seed=0, + metrics=metrics, + archetypes=archetypes, + entities=entities, + tables=tables, + output=OutputConfig(format="csv", directory="out/m26"), + ) + + +def _three_segment_archetype(name: str, levels: tuple[float, float, float]) -> Archetype: + """Three-segment plateau archetype with caller-specified levels. + + Used as the variance-partition workhorse: distinct per-segment + levels guarantee positive between-segment variance, so the segment- + scope partition has non-trivial structure. + """ + return Archetype( + name=name, + label=name, + description=f"three-segment plateau {levels}", + curve_segments=[ + CurveSegment( + curve="plateau", + params={"level": levels[0]}, + start_pct=0.0, + end_pct=0.34, + ), + CurveSegment( + curve="plateau", + params={"level": levels[1]}, + start_pct=0.34, + end_pct=0.67, + ), + CurveSegment( + curve="plateau", + params={"level": levels[2]}, + start_pct=0.67, + end_pct=1.0, + ), + ], + ) + + +def _flat_archetype(name: str = "flat", level: float = 0.5) -> Archetype: + return Archetype( + name=name, + label=name, + description="constant plateau", + curve_segments=[ + CurveSegment( + curve="plateau", + params={"level": level}, + start_pct=0.0, + end_pct=1.0, + ), + ], + ) + + +def _sigmoid_archetype(name: str = "sigmoid") -> Archetype: + """S-curve archetype: monotone rise from 0 to 1 over the window. + + Sigmoid takes a ``rising: bool`` parameter; default ``True`` produces + the canonical 0→1 transition centered mid-window. The RBF GP fit on + this trajectory should converge with a length scale consistent with + monotone smoothness — neither very short (flat) nor very long + (periodic). + """ + return Archetype( + name=name, + label=name, + description="sigmoid rise", + curve_segments=[ + CurveSegment( + curve="sigmoid", + params={"rising": True}, + start_pct=0.0, + end_pct=1.0, + ), + ], + ) + + +def _oscillating_archetype(name: str = "oscillating", period: int = 6) -> Archetype: + """Oscillating archetype: sinusoidal cycle. + + ``period`` is the cycle length in periods. With ``n_periods=12`` and + ``period=6`` the trajectory exhibits two full cycles, so the RBF GP + fit's recovered length scale should be measurably shorter than the + full window length. + """ + return Archetype( + name=name, + label=name, + description="sinusoidal cycle", + curve_segments=[ + CurveSegment( + curve="oscillating", + params={"period": period}, + start_pct=0.0, + end_pct=1.0, + ), + ], + ) + + +# --- Variance partition: ANOVA identity ------------------------------------ + + +def test_variance_partition_archetype_anova_identity(): + """Acceptance #1: ``ss_between + ss_within_entity + ss_residual == + ss_total`` (rtol=1e-10) for a config with two archetypes and one + metric. ``ss_total`` is computed from the same finite observations + the partition saw — guards against a decomposition that drops or + double-counts mass. + """ + archetypes = [ + _three_segment_archetype("a_low", (0.2, 0.4, 0.6)), + _three_segment_archetype("a_high", (0.5, 0.7, 0.9)), + ] + entities = [Entity(name=f"lo_{i}", archetype="a_low", size=1) for i in range(8)] + [ + Entity(name=f"hi_{i}", archetype="a_high", size=1) for i in range(8) + ] + cfg = _config_with_archetypes(archetypes, entities) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + assert len(manifest.variance_partitions) == 1 + rec = manifest.variance_partitions[0] + assert rec.metric == "m1" + assert rec.scope == "archetype" + assert rec.scope_name == "all" + pool = np.concatenate([state.entity_metrics[e.name]["m1"] for e in cfg.entities]) + pool = pool[np.isfinite(pool)] + ss_total = float(np.sum((pool - float(np.mean(pool))) ** 2)) + ss_sum = rec.ss_between + rec.ss_within_entity + rec.ss_residual + assert np.isclose(ss_sum, ss_total, rtol=1e-10, atol=1e-10) + # Fractions sum to 1 within the same tolerance. + fraction_sum = rec.fraction_between + rec.fraction_within_entity + rec.fraction_residual + assert np.isclose(fraction_sum, 1.0, rtol=1e-10, atol=1e-10) + # n_observations equals the pool size after NaN masking. + assert rec.n_observations == int(pool.size) + + +def test_variance_partition_by_segment_groups_within_archetype(): + """Acceptance #2: ``variance_partitions_by_segment`` emits one record + per ``(metric, archetype)`` and ``scope_name`` is the archetype name. + Segments are grouped within their parent archetype — the section + never pools observations across archetypes. + """ + archetypes = [ + _three_segment_archetype("a_low", (0.2, 0.4, 0.6)), + _three_segment_archetype("a_high", (0.5, 0.7, 0.9)), + ] + entities = [Entity(name=f"lo_{i}", archetype="a_low", size=1) for i in range(8)] + [ + Entity(name=f"hi_{i}", archetype="a_high", size=1) for i in range(8) + ] + cfg = _config_with_archetypes(archetypes, entities) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + by_segment = manifest.variance_partitions_by_segment + # One record per (metric, archetype); one metric × two archetypes. + assert len(by_segment) == 2 + scope_names = {r.scope_name for r in by_segment} + assert scope_names == {"a_low", "a_high"} + for rec in by_segment: + assert rec.metric == "m1" + assert rec.scope == "segment" + # The archetype-restricted observation count equals the per- + # archetype pool size (after NaN masking) — proves the segment + # scope never pools across archetypes. + names = [e.name for e in cfg.entities if e.archetype == rec.scope_name] + pool = np.concatenate([state.entity_metrics[n]["m1"] for n in names]) + pool = pool[np.isfinite(pool)] + assert rec.n_observations == int(pool.size) + ss_total = float(np.sum((pool - float(np.mean(pool))) ** 2)) + ss_sum = rec.ss_between + rec.ss_within_entity + rec.ss_residual + assert np.isclose(ss_sum, ss_total, rtol=1e-10, atol=1e-10) + + +def test_variance_partition_segment_between_is_positive_for_distinct_levels(): + """Within an archetype, three distinct plateau levels produce + positive between-segment variance. A bug that pooled segments into + a single group would yield ``ss_between == 0``. + """ + archetypes = [_three_segment_archetype("layered", (0.1, 0.5, 0.9))] + entities = [Entity(name=f"e_{i}", archetype="layered", size=1) for i in range(10)] + cfg = _config_with_archetypes(archetypes, entities) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + by_segment = manifest.variance_partitions_by_segment + assert len(by_segment) == 1 + rec = by_segment[0] + assert rec.scope_name == "layered" + assert rec.ss_between > 0.0 + # df_between for three segments = 3 - 1 = 2. + assert rec.degrees_of_freedom_between == 2 + + +# --- Cold-start handling ---------------------------------------------------- + + +def test_variance_partition_excludes_cold_start_periods(): + """Acceptance #3: a config with cold-start entities (arrival offset + > 0) excludes NaN-padded periods. ``n_observations`` is less than + ``n_entities × n_periods`` and ``cold_start_entities_excluded`` + counts the cold-start entities that contributed at least one NaN + cell. + """ + archetypes = [_flat_archetype("flat", level=0.5)] + entities = [ + Entity(name="e_warm", archetype="flat", size=1, start_period=0), + Entity(name="e_cold_1", archetype="flat", size=1, start_period=3), + Entity(name="e_cold_2", archetype="flat", size=1, start_period=6), + ] + cfg = _config_with_archetypes(archetypes, entities) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + n_periods = len(tables["dim_date"]) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + rec = manifest.variance_partitions[0] + assert rec.n_observations < len(entities) * n_periods + # Exact: e_warm contributes n_periods, e_cold_1 contributes + # n_periods - 3, e_cold_2 contributes n_periods - 6. + expected = n_periods + (n_periods - 3) + (n_periods - 6) + assert rec.n_observations == expected + # Two of the three entities had at least one NaN cell. + assert rec.cold_start_entities_excluded == 2 + + +# --- Empty section guard ---------------------------------------------------- + + +def test_variance_partition_empty_when_no_metrics(): + """Acceptance #7: a config with no metrics emits empty + ``variance_partitions`` and empty ``variance_partitions_by_segment``. + Same guarantee for ``gp_kernel_fits`` — the section piggybacks on + metric presence so no-metric configs are byte-equivalent to the + pre-M26 lane modulo the new empty containers. + """ + archetypes = [_flat_archetype()] + entities = [Entity(name=f"e_{i}", archetype="flat", size=1) for i in range(3)] + cfg = _config_with_archetypes(archetypes, entities, metrics=[]) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + assert manifest.variance_partitions == [] + assert manifest.variance_partitions_by_segment == [] + assert manifest.gp_kernel_fits == [] + + +# --- GP fits: shape characterization --------------------------------------- + + +def test_gp_fit_archetype_record_emitted_per_archetype(): + """One ``scope_type="archetype"`` record per declared archetype in + sorted-name order. The fit consumes the archetype's clean + trajectory (no overrides, no cold-start shift) so the kernel + characterizes the archetype's shape rather than any individual + entity's realized data. + """ + archetypes = [ + _flat_archetype("flat", level=0.5), + _sigmoid_archetype("sigmoid"), + ] + entities = [Entity(name=f"e_{i}", archetype="flat", size=1) for i in range(3)] + [ + Entity(name=f"s_{i}", archetype="sigmoid", size=1) for i in range(3) + ] + cfg = _config_with_archetypes(archetypes, entities) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + archetype_fits = [f for f in manifest.gp_kernel_fits if f.scope_type == "archetype"] + assert {f.scope_name for f in archetype_fits} == {"flat", "sigmoid"} + for fit in archetype_fits: + assert fit.kernel_type == "rbf" + + +def test_gp_fit_sigmoid_converges_and_length_scale_is_smooth(): + """Acceptance #5 (variant): a sigmoid archetype produces + ``converged=True`` with a length scale on the order of the + transition's smoothness — not a short cycle, not the full window + span. The exact value is optimizer-dependent; pin only the + monotone-smoothness signal: length scale > 1 (not collapsed to the + period grid) AND length scale < total span (not infinite). + """ + cfg = _config_with_archetypes( + archetypes=[_sigmoid_archetype("sigmoid")], + entities=[Entity(name="e_0", archetype="sigmoid", size=1)], + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + n_periods = len(tables["dim_date"]) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + fit = next(f for f in manifest.gp_kernel_fits if f.scope_name == "sigmoid") + assert fit.converged is True + assert fit.hyperparameters is not None + length_scale = fit.hyperparameters["length_scale"] + assert 1.0 < length_scale < float(n_periods) + + +def test_gp_fit_oscillating_recovers_short_length_scale(): + """Acceptance #4: an oscillating archetype's recovered length scale + is measurably shorter than the total period span — the kernel + captures the periodicity rather than averaging it out. + """ + cfg = _config_with_archetypes( + archetypes=[_oscillating_archetype("oscillating", period=4)], + entities=[Entity(name="e_0", archetype="oscillating", size=1)], + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + n_periods = len(tables["dim_date"]) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + fit = next(f for f in manifest.gp_kernel_fits if f.scope_name == "oscillating") + assert fit.converged is True + assert fit.hyperparameters is not None + length_scale = fit.hyperparameters["length_scale"] + # Periodicity ≪ window span — recovered length scale must reflect + # the cycle, not the full window. n_periods = 12 (Jan–Dec 2024). + assert length_scale < float(n_periods) / 2.0 + + +def test_gp_fit_flat_trajectory_does_not_converge(): + """Acceptance #5 (variant): a flat archetype produces + ``converged=False`` with empty hyperparameters and ``None`` log + marginal likelihood. The manifest build does not raise — the + failed fit is recorded as a non-fatal record. + """ + cfg = _config_with_archetypes( + archetypes=[_flat_archetype("flat", level=0.5)], + entities=[Entity(name="e_0", archetype="flat", size=1)], + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + fit = next(f for f in manifest.gp_kernel_fits if f.scope_name == "flat") + assert fit.converged is False + assert fit.hyperparameters == {} + assert fit.log_marginal_likelihood is None + + +def test_gp_fit_entity_override_emits_entity_scope_record(): + """Acceptance #6: a config with at least one entity carrying + ``overrides`` emits a ``scope_type="entity"`` record for that + entity AND keeps the ``scope_type="archetype"`` baseline record + for the archetype. Other entities sharing the archetype do NOT + produce per-entity records — only override-bearing entities do. + """ + cfg = _config_with_archetypes( + archetypes=[_sigmoid_archetype("sigmoid")], + entities=[ + Entity( + name="e_override", + archetype="sigmoid", + size=1, + overrides=EntityOverrides(inflection_month=2), + ), + Entity(name="e_default", archetype="sigmoid", size=1), + ], + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + by_scope = {(f.scope_type, f.scope_name) for f in manifest.gp_kernel_fits} + assert ("archetype", "sigmoid") in by_scope + assert ("entity", "e_override") in by_scope + # Default-entity does NOT get its own per-entity record. + assert ("entity", "e_default") not in by_scope + + +# --- Byte-equivalence guard ------------------------------------------------ + + +def test_no_metric_config_emits_empty_m26_sections(): + """Configs that don't trigger either Mission 026 section produce + a manifest with all three new fields at their empty defaults. The + test is structurally identical to the M25 byte-equivalence guard: + pop the new fields and assert they were the only delta from the + pre-M26 wire shape. + """ + cfg = _config_with_archetypes( + archetypes=[_flat_archetype()], + entities=[Entity(name=f"e_{i}", archetype="flat", size=1) for i in range(3)], + metrics=[], + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + payload = manifest.model_dump(mode="json") + assert payload.pop("variance_partitions") == [] + assert payload.pop("variance_partitions_by_segment") == [] + assert payload.pop("gp_kernel_fits") == [] + assert payload["schema_version"] == "1.10" + + +# --- Schema-pin guard ------------------------------------------------------ + + +def test_schema_version_still_pinned_at_1_10(): + """Mission 026 adds fields without bumping the schema version. Pins + the constant so a stray bump would surface as an import-time typed + assertion failure rather than a downstream consumer rejecting the + payload at parse time. + """ + assert MANIFEST_SCHEMA_VERSION == "1.10" + + +def test_variance_partition_model_carries_required_fields(): + """``VariancePartition`` exposes the documented field set. Guards + against a partial rename or a dropped field on a refactor. + """ + rec = VariancePartition( + metric="m1", + scope="archetype", + scope_name="all", + ss_between=1.0, + ss_within_entity=1.0, + ss_residual=1.0, + fraction_between=0.33, + fraction_within_entity=0.33, + fraction_residual=0.34, + degrees_of_freedom_between=1, + degrees_of_freedom_within=10, + degrees_of_freedom_residual=100, + n_observations=120, + cold_start_entities_excluded=2, + ) + payload = rec.model_dump(mode="json") + assert set(payload.keys()) == { + "metric", + "scope", + "scope_name", + "ss_between", + "ss_within_entity", + "ss_residual", + "fraction_between", + "fraction_within_entity", + "fraction_residual", + "degrees_of_freedom_between", + "degrees_of_freedom_within", + "degrees_of_freedom_residual", + "n_observations", + "cold_start_entities_excluded", + } + + +def test_gp_kernel_fit_model_carries_required_fields(): + """``GPKernelFit`` exposes the documented field set.""" + rec = GPKernelFit( + scope_type="archetype", + scope_name="flat", + kernel_type="rbf", + hyperparameters={"length_scale": 1.0, "signal_variance": 1.0, "noise_variance": 0.1}, + log_marginal_likelihood=-10.0, + n_train=12, + converged=True, + ) + payload = rec.model_dump(mode="json") + assert set(payload.keys()) == { + "scope_type", + "scope_name", + "kernel_type", + "hyperparameters", + "log_marginal_likelihood", + "n_train", + "converged", + } + + +# --- Direct GP module smoke ------------------------------------------------ + + +def test_fit_rbf_flat_signal_does_not_converge(): + """``fit_rbf`` on a constant input returns + ``converged=False`` / null hyperparameters / null likelihood. + Direct unit test of the leaf module — guards against a future + refactor that drops the flat-variance short-circuit. + """ + x = np.arange(20, dtype=np.float64) + y = np.full(20, 0.5, dtype=np.float64) + result = fit_rbf(x, y) + assert result.converged is False + assert result.hyperparameters is None + assert result.log_marginal_likelihood is None + assert result.n_train == 20 + + +def test_fit_rbf_sinusoidal_converges_with_short_length_scale(): + """``fit_rbf`` on a sinusoidal signal recovers a length scale much + smaller than the total span. The exact value is optimizer- + dependent; pin only the order-of-magnitude relationship. + """ + n = 60 + x = np.arange(n, dtype=np.float64) + y = np.sin(2.0 * np.pi * x / 10.0).astype(np.float64) + result = fit_rbf(x, y) + assert result.converged is True + assert result.hyperparameters is not None + length_scale = result.hyperparameters["length_scale"] + assert length_scale < float(n) / 2.0 + + +def test_fit_rbf_fewer_than_three_points_does_not_converge(): + """Edge case: under 3 finite training points produces + ``converged=False``. The kernel has 3 hyperparameters; fewer + observations than hyperparameters is degenerate. + """ + x = np.array([0.0, 1.0], dtype=np.float64) + y = np.array([0.0, 1.0], dtype=np.float64) + result = fit_rbf(x, y) + assert result.converged is False + assert result.n_train == 2