diff --git a/CHANGELOG.md b/CHANGELOG.md index b71a2db..4a894a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,24 @@ Versioning: [SemVer](https://semver.org/spec/v2.0.0.html). ### Added +- **Manifest decomposition + regression sections.** The manifest sidecar + now records two additional summaries of the engine's realized signal + layer. `seasonal_decomposition` snapshots the global per-period + seasonal-strength array plus the per-metric and per-entity sensitivity + multipliers — a downstream consumer can reproduce the effective + seasonal lift at any `(entity, period, metric)` cell without + re-reading the source config. `regression_pairs_global` carries + pair-wise OLS β + intercept (both directions), r², per-direction + residual variance, and the finite-observation count for every + declared correlation pair, pooled across all entities; + `regression_pairs_by_archetype` provides the same OLS surface + restricted to each archetype's entity subset, so consumers can see + which archetypes carry the correlation. Configs without + `seasonal_effects` emit an empty-sentinel `seasonal_decomposition` + (empty list and empty dicts); configs without `correlations` emit + empty regression sections. Manifest schema bumps 1.9 → 1.10 for the + three additive sections. + - **Per-metric treatment effects.** New optional `target_metric` field on the treatment surface (set on `SegmentInput.treatment` in the builder; mirrored as `Entity.treatment_target_metric` in the engine). diff --git a/docs/site/api-reference.md b/docs/site/api-reference.md index f43fad6..b5734f0 100644 --- a/docs/site/api-reference.md +++ b/docs/site/api-reference.md @@ -237,13 +237,14 @@ Use this when you need the ground-truth trajectory positions — the the primary consumers. Recovering positions from noisy fact-table cells is impossible in general; this function exposes them directly. 
-`GenerationState` is a frozen dataclass with three fields: +`GenerationState` is a frozen dataclass with four fields: | Field | Type | Contents | |---|---|---| | `trajectories` | `dict[str, ndarray]` | Per-entity position array, length `n_periods`, values in `[0, 1]` | | `scd` | `SCDState` | Per-dim SCD Type 2 versioning (empty when no SCD columns are configured) | | `bridges` | `BridgeAssociations` | Per-bridge association ground truth (empty when no bridges are configured) | +| `entity_metrics` | `dict[str, dict[str, ndarray]]` | Per-entity, per-metric realized series — the noise-free, distribution-shaped values the fact tables were built from. Consumed by `build_manifest` for the regression-pair sections; downstream feature pipelines pick it up here when they need the same arrays without re-running the engine | **Returns** — `(tables, state)`. @@ -255,8 +256,11 @@ is impossible in general; this function exposes them directly. from plotsim import generate_tables_with_state, build_manifest tables, state = generate_tables_with_state(cfg) -manifest = build_manifest(cfg, state.trajectories, tables, - scd_state=state.scd, bridge_state=state.bridges) +manifest = build_manifest( + cfg, state.trajectories, tables, + scd_state=state.scd, bridge_state=state.bridges, + entity_metrics=state.entity_metrics, +) ``` --- @@ -393,8 +397,11 @@ are still written so you can inspect the broken data. 
Block on from plotsim import generate_tables_with_state, build_manifest, write_tables tables, state = generate_tables_with_state(cfg) -manifest = build_manifest(cfg, state.trajectories, tables, - scd_state=state.scd, bridge_state=state.bridges) +manifest = build_manifest( + cfg, state.trajectories, tables, + scd_state=state.scd, bridge_state=state.bridges, + entity_metrics=state.entity_metrics, +) out = write_tables(tables, cfg, manifest=manifest) print(f"Wrote to {out}") ``` @@ -496,13 +503,15 @@ def build_manifest( sample_rate: float | None = None, scd_state: SCDState | None = None, bridge_state: BridgeAssociations | None = None, + entity_metrics: dict[str, dict[str, numpy.ndarray]] | None = None, ) -> ManifestSchema ``` The manifest captures the *signal layer* a noisy fact table can't recover: archetype assignments, trajectory positions, event-firing -periods, SCD band crossings, bridge associations, and reproducibility -metadata. +periods, SCD band crossings, bridge associations, the engine's +seasonal-strength inputs, per-pair regression summaries for declared +correlations, and reproducibility metadata. **Parameters** @@ -514,6 +523,7 @@ metadata. | `sample_rate` | Override for `config.manifest.trajectory_sample_rate`. `None` reads the config value. | | `scd_state` | Pass `state.scd` to record SCD Type 2 band crossings. `None` leaves `manifest.scd_events` empty. | | `bridge_state` | Pass `state.bridges` to record M:N associations. `None` leaves `manifest.bridge_associations` empty. | +| `entity_metrics` | Pass `state.entity_metrics` to populate `manifest.regression_pairs_global` and `manifest.regression_pairs_by_archetype` with pair-wise OLS summaries for every declared correlation pair. `None` leaves both sections at their empty defaults. | The function is pure — same inputs produce a byte-identical manifest. No RNG, no clock, no filesystem. 
@@ -531,6 +541,7 @@ tables, state = generate_tables_with_state(cfg) manifest = build_manifest( cfg, state.trajectories, tables, scd_state=state.scd, bridge_state=state.bridges, + entity_metrics=state.entity_metrics, ) write_manifest(manifest, Path("output")) ``` diff --git a/docs/site/manifest-reference.md b/docs/site/manifest-reference.md index b6dff82..eabfb6e 100644 --- a/docs/site/manifest-reference.md +++ b/docs/site/manifest-reference.md @@ -32,6 +32,7 @@ tables, state = generate_tables_with_state(cfg) manifest = build_manifest( cfg, state.trajectories, tables, scd_state=state.scd, bridge_state=state.bridges, + entity_metrics=state.entity_metrics, ) write_tables(tables, cfg, manifest=manifest) ``` @@ -46,7 +47,7 @@ produces a byte-identical `manifest.json`. Encoding: UTF-8, ```json { - "schema_version": "1.7", + "schema_version": "1.10", "seed": 42, "config_sha256": "<64-char hex>", "archetype_assignments": [...], @@ -64,13 +65,16 @@ produces a byte-identical `manifest.json`. Encoding: UTF-8, "correlations": [...], "outlier_injections": [...] | null, "parent_child_relations": [...], - "noise_config": {...} | null + "noise_config": {...} | null, + "seasonal_decomposition": {...}, + "regression_pairs_global": [...], + "regression_pairs_by_archetype": {...} } ``` | Field | Type | Description | |---|---|---| -| `schema_version` | `str` | Wire-shape version. Currently `"1.9"` (bumped over time as new additive sections — `causal_graph`, `correlations`, `outlier_injections`, multi-source mappings, `parent_child_relations`, `noise_config` — landed; 1.7 → 1.8 extended `noise_config` with `noise_family` / `degrees_of_freedom`; 1.8 → 1.9 added the optional `target_metric` field on the per-entity `treatment` and per-cohort `treatment_cohorts` records) | +| `schema_version` | `str` | Wire-shape version. 
Currently `"1.10"` (bumped over time as new additive sections — `causal_graph`, `correlations`, `outlier_injections`, multi-source mappings, `parent_child_relations`, `noise_config` — landed; 1.7 → 1.8 extended `noise_config` with `noise_family` / `degrees_of_freedom`; 1.8 → 1.9 added the optional `target_metric` field on the per-entity `treatment` and per-cohort `treatment_cohorts` records; 1.9 → 1.10 added the `seasonal_decomposition` snapshot plus per-pair OLS summaries in `regression_pairs_global` / `regression_pairs_by_archetype`) | | `seed` | `int` | The seed used for generation — `config.seed` | | `config_sha256` | `str` | Full SHA-256 hex of the JSON-serialized config. Detects config drift between generation and consumption | | `archetype_assignments` | array | One entry per entity; see below | @@ -88,6 +92,9 @@ produces a byte-identical `manifest.json`. Encoding: UTF-8, | `correlations` | array | One entry per user-declared `config.correlations` pair, with the realized (post-Higham, post-compensation) coefficient. Empty list when no correlations are configured | | `outlier_injections` | array or `null` | Per-cell outlier-fire log. `null` when skipped (no `outlier_rate`, vectorized mode, or cell budget exceeded). `[]` when the detector ran and observed no firings | | `noise_config` | object or `null` | Noise-model record. `null` when the run uses the default magnitude-scaled gaussian lane; populated when EITHER `noise.scale_with_trajectory` is `true` OR `noise.noise_family` is non-default (`"student_t"` / `"laplace"`) | +| `seasonal_decomposition` | object | Snapshot of the seasonal-strength inputs the engine consumed. Always emitted; configs without `seasonal_effects` get the empty-sentinel shape (empty list / empty dicts) | +| `regression_pairs_global` | array | Pair-wise OLS summary (slope, intercept, r², residual variance) for every declared correlation pair, pooled across every entity. 
Empty list when no correlations are configured | +| `regression_pairs_by_archetype` | object | Same OLS summary as `regression_pairs_global` but grouped by `Entity.archetype`. Keys are archetype names; values mirror the global list shape. Empty dict when no correlations are configured | --- @@ -666,6 +673,181 @@ the scorer well-calibrated under the heavier-tailed residuals. --- +## `seasonal_decomposition` + +Snapshot of the seasonal-strength inputs the engine consumed during +metric generation. + +```json +{ + "seasonal_decomposition": { + "seasonal_factors": [0.0, 0.8, 0.8, 0.0, 0.0, -0.3, -0.3, 0.0, 0.0, 0.0, 0.0, 0.8], + "metric_seasonal_sensitivities": { + "engagement": 1.0, + "mrr": 0.6 + }, + "entity_seasonal_sensitivities": { + "growers_001": 1.0, + "decliners_002": 0.0 + } + } +} +``` + +| Field | Type | Description | +|---|---|---| +| `seasonal_factors` | array of `float` | Length-`n_periods` global strength array. Entry `t` is the sum of every `SeasonalEffect.strength` whose `months` set contains period `t`'s calendar month | +| `metric_seasonal_sensitivities` | object | One entry per metric, keyed by `Metric.name` and valued by `Metric.seasonal_sensitivity`. The per-metric multiplier the engine applies on top of the global strength | +| `entity_seasonal_sensitivities` | object | One entry per entity, keyed by `Entity.name` and valued by `Entity.seasonal_sensitivity`. The per-entity multiplier the engine applies on top of the global strength | + +### When the section is the empty sentinel + +Configs without any `seasonal_effects` declared get the empty-sentinel +shape — `seasonal_factors: []`, `metric_seasonal_sensitivities: {}`, +`entity_seasonal_sensitivities: {}` — rather than `null`. The +sensitivity multipliers are inert in that lane (the engine short- +circuits before applying them), so recording them would just be noise. +Always present so a downstream consumer can iterate the section without +a None-check. 
+ +**Use case** — reconstruct the engine's effective seasonal lift at any +cell without re-reading the YAML config. For an `(entity, period, metric)` +triple: + +```python +lift = ( + manifest["seasonal_decomposition"]["seasonal_factors"][period] + * manifest["seasonal_decomposition"]["metric_seasonal_sensitivities"][metric] + * manifest["seasonal_decomposition"]["entity_seasonal_sensitivities"][entity] +) +``` + +A seasonality-aware anomaly detector can subtract this lift before +scoring; a feature pipeline can expose `seasonal_factor` as a regressor +that exactly mirrors the engine's modulation. + +--- + +## `regression_pairs_global` + +Pair-wise ordinary-least-squares fit for every declared correlation, +pooled across every entity and period. + +```json +{ + "regression_pairs_global": [ + { + "metric_a": "engagement", + "metric_b": "mrr", + "beta_a_to_b": 0.84, + "intercept_a_to_b": 12.3, + "beta_b_to_a": 0.71, + "intercept_b_to_a": -4.1, + "r_squared": 0.6, + "residual_variance_a_to_b": 18.7, + "residual_variance_b_to_a": 0.04, + "n_observations": 720 + } + ] +} +``` + +| Field | Type | Description | +|---|---|---| +| `metric_a` / `metric_b` | `str` | The pair, in the order the user declared them in `config.correlations` | +| `beta_a_to_b` | `float` | OLS slope for `b = beta * a + intercept` over the pooled `(a, b)` observations | +| `intercept_a_to_b` | `float` | OLS intercept for the same regression | +| `beta_b_to_a` | `float` | OLS slope for the reverse regression `a = beta * b + intercept` | +| `intercept_b_to_a` | `float` | OLS intercept for the reverse regression | +| `r_squared` | `float` | Direction-invariant coefficient of determination. 
Equal to `corr(a, b) ** 2` on the same observations | +| `residual_variance_a_to_b` | `float` | Variance of `b - (beta_a_to_b * a + intercept_a_to_b)` — the unexplained-noise scale for the `a → b` direction | +| `residual_variance_b_to_a` | `float` | Same for the reverse direction | +| `n_observations` | `int` | Count of finite `(a, b)` pairs used. Cells with NaN in either metric (cold-start lead-ins, MCAR-rewritten values) are excluded | + +One entry per pair in `config.correlations`. Auto-zero off-diagonals +(pairs the user did not declare) are not recorded. Sorted by +`(metric_a, metric_b)` for stable JSON output. + +**Distinct from** `correlations` (which records the realized Pearson +coefficient the copula targeted). `regression_pairs_global` describes +the *fitted linear relationship* between the realized series — slope +and intercept, plus the unexplained variance. A high `r_squared` +combined with a small `residual_variance` says the pair moves +tightly together along a straight line; a high `r_squared` with +asymmetric residual variances says one direction predicts the other +better than vice-versa (which is normal under unequal metric scales). + +`n_observations < 2` is a degenerate case (sparse cold-start, no +overlap between metric domains); the record's β / intercept / variance +fields are all `0.0` and downstream consumers should gate on the count +before reading the coefficients. + +**Use case** — score a regression baseline. A predictor of `mrr` from +`engagement` should land near `beta_a_to_b` with residual variance +close to `residual_variance_a_to_b`. Larger deviations flag either +model misspecification or that the consumer is over-fitting noise the +manifest already attributes to residuals. + +--- + +## `regression_pairs_by_archetype` + +The same OLS surface as `regression_pairs_global`, but restricted to +each archetype's entity subset so a consumer can see which archetypes +carry the declared correlations. 
+ +```json +{ + "regression_pairs_by_archetype": { + "growth": [ + { + "metric_a": "engagement", + "metric_b": "mrr", + "beta_a_to_b": 0.91, + "intercept_a_to_b": 9.2, + "beta_b_to_a": 0.86, + "intercept_b_to_a": -7.0, + "r_squared": 0.78, + "residual_variance_a_to_b": 10.4, + "residual_variance_b_to_a": 0.02, + "n_observations": 360 + } + ], + "decline": [ + { + "metric_a": "engagement", + "metric_b": "mrr", + "beta_a_to_b": 0.62, + "intercept_a_to_b": 15.8, + "beta_b_to_a": 0.41, + "intercept_b_to_a": 1.2, + "r_squared": 0.31, + "residual_variance_a_to_b": 25.6, + "residual_variance_b_to_a": 0.08, + "n_observations": 360 + } + ] + } +} +``` + +The top-level object's keys are archetype names (matching +`Entity.archetype`); each value list mirrors the +`regression_pairs_global` shape, one entry per declared pair. +Archetypes that contribute no finite observations are omitted entirely +(rather than mapped to an empty list) — the dict reflects archetypes +that actually contributed to the fit. + +Empty `{}` when no correlations are declared. + +**Use case** — diagnose where in the population a declared correlation +is strongest. A pair with a high pooled `r_squared` but per-archetype +values that swing widely is a signal that the correlation is a mixture +artefact, not a within-archetype relationship — a model trained on the +pooled fit will mispredict for the archetype whose β diverges most. + +--- + ## Reading the manifest in Python ```python diff --git a/docs/site/user-guide/metrics-and-connections.md b/docs/site/user-guide/metrics-and-connections.md index 3cad185..9830697 100644 --- a/docs/site/user-guide/metrics-and-connections.md +++ b/docs/site/user-guide/metrics-and-connections.md @@ -245,6 +245,17 @@ records the adjustments in `manifest.correlation_adjustments`. Strong mirrors (`mirrors`, `inverts`) on lots of metrics tends to over-constrain the matrix — a warning fires. 
+Beyond the correlation target itself, the manifest emits a pair-wise +OLS fit (slope, intercept, r², residual variance) for every declared +correlation in `manifest.regression_pairs_global` (pooled across all +entities) and `manifest.regression_pairs_by_archetype` (grouped by +archetype). The pooled fit answers "given the realized output, what +linear relationship do these metrics actually follow"; the +per-archetype fit answers "is that relationship the same in every +sub-population, or is the pooled correlation a mixture artefact?" +See [`manifest-reference.md`](../manifest-reference.md#regression_pairs_global) +for the field layout. + --- ## Causal lag — `follows` + `delay` diff --git a/docs/site/user-guide/seasonality.md b/docs/site/user-guide/seasonality.md index 88371da..f22bb4f 100644 --- a/docs/site/user-guide/seasonality.md +++ b/docs/site/user-guide/seasonality.md @@ -190,11 +190,26 @@ combined with windows shorter than 24 periods. Seasonal modulation is a deterministic function of the config — same `(config, seed)` produces the same `seasonal_factor` at every cell. The -manifest doesn't record per-cell seasonal factors directly; you can -reconstruct them from the config alone. - -If you need per-cell verification, [`trace_metric_cell`](../api-reference.md#trace_metric_cell) -returns the `seasonal_factor` and `modulated_center` for any single +manifest's `seasonal_decomposition` section captures the three inputs +the engine consumed so a consumer can reproduce the effective lift at +any cell without re-reading the YAML: + +- `seasonal_factors` — the length-`n_periods` global strength array + (entry `t` is the summed strength of every effect whose `months` + set contains period `t`'s calendar month). +- `metric_seasonal_sensitivities` — per-metric multipliers + (`Metric.seasonal_sensitivity`). +- `entity_seasonal_sensitivities` — per-entity multipliers + (`Entity.seasonal_sensitivity`). 
+ +The effective lift at cell `(entity, period, metric)` is the product +of those three values — the same multiplication the engine applies +during metric generation. Configs without any `seasonal_effects` +declared get the empty-sentinel shape (empty list and empty dicts). + +If you need per-cell verification rather than reconstruction, +[`trace_metric_cell`](../api-reference.md#trace_metric_cell) returns +the `seasonal_factor` and `modulated_center` for any single `(entity, period, metric)` triple. --- diff --git a/plotsim/cli.py b/plotsim/cli.py index 9766a52..427cdf3 100644 --- a/plotsim/cli.py +++ b/plotsim/cli.py @@ -280,6 +280,7 @@ def cmd_run(args: argparse.Namespace) -> int: tables, scd_state=gen_state.scd, bridge_state=gen_state.bridges, + entity_metrics=gen_state.entity_metrics, ) output_dir = Path(args.output_dir) if args.output_dir else None diff --git a/plotsim/manifest.py b/plotsim/manifest.py index fc1ccb5..bcd695f 100644 --- a/plotsim/manifest.py +++ b/plotsim/manifest.py @@ -57,7 +57,7 @@ import numpy as np import pandas as pd -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from plotsim.config import ( FKSource, @@ -125,7 +125,24 @@ # the new field's null default. Populated when the entity's # ``treatment_target_metric`` names a metric — the lift then applies # only to that metric's effective-position evaluation. -MANIFEST_SCHEMA_VERSION = "1.9" +# Bumped 1.9 → 1.10 for three additive sections summarizing the engine's +# realized signal layer alongside the existing trajectory tape: +# * ``seasonal_decomposition`` — the global per-period seasonal-strength +# array plus the per-metric / per-entity sensitivities used to scale +# it. Defaults to empty list + empty dicts on configs without +# seasonality, which matches the engine no-op lane. 
class SeasonalDecomposition(_ManifestBase):
    """Record of the seasonal-strength inputs for one run.

    The builder emits either a fully populated record or the empty
    sentinel (empty list plus two empty dicts) when the config declares
    no seasonal effects. With a populated record, the effective lift at
    a cell is reconstructed as
    ``seasonal_factors[period] * metric_seasonal_sensitivities[metric] *
    entity_seasonal_sensitivities[entity]``.
    """

    # Global per-period strength array as returned by
    # ``_build_seasonal_factors``; ``[]`` when no seasonal effects are
    # declared.
    seasonal_factors: list[float]
    # ``Metric.name`` -> ``Metric.seasonal_sensitivity``; ``{}`` in the
    # no-seasonality sentinel.
    metric_seasonal_sensitivities: dict[str, float]
    # ``Entity.name`` -> ``Entity.seasonal_sensitivity``; ``{}`` in the
    # no-seasonality sentinel.
    entity_seasonal_sensitivities: dict[str, float]


class RegressionPair(_ManifestBase):
    """Bidirectional OLS summary for one declared correlation pair.

    One record describes the straight-line fit between two metrics over
    a pooled set of ``(a, b)`` observations, in both directions, so a
    consumer never has to derive one direction from the other. Only
    pairs listed in ``config.correlations`` get a record.

    Degenerate inputs still produce a record: with fewer than two
    usable observations every coefficient, variance and ``r_squared``
    is ``0.0``; with a zero-variance column the slopes and ``r_squared``
    are ``0.0`` and the intercepts fall back to the column means.
    Consumers should gate on ``n_observations`` before trusting the
    coefficients.
    """

    # The pair, named in the order the user declared it.
    metric_a: str
    metric_b: str
    # Forward fit ``b = beta_a_to_b * a + intercept_a_to_b``.
    beta_a_to_b: float
    intercept_a_to_b: float
    # Reverse fit ``a = beta_b_to_a * b + intercept_b_to_a``.
    beta_b_to_a: float
    intercept_b_to_a: float
    # Squared correlation of the same observations; identical for both
    # directions.
    r_squared: float
    # Variance of the residuals left by the forward / reverse fit.
    residual_variance_a_to_b: float
    residual_variance_b_to_a: float
    # Number of observations where both metrics are finite; cells with
    # NaN or infinity in either metric are dropped before fitting.
    n_observations: int
# --- Decomposition / regression helpers --------------------------------------


def _build_seasonal_decomposition(
    config: PlotsimConfig,
    n_periods: int,
) -> SeasonalDecomposition:
    """Snapshot the seasonal-strength inputs into a manifest record.

    Args:
        config: Run configuration; ``seasonal_effects``, ``metrics`` and
            ``entities`` are read.
        n_periods: Number of periods, forwarded to
            ``_build_seasonal_factors``.

    Returns:
        The empty sentinel (empty list + empty dicts) when the config
        declares no seasonal effects; otherwise the global strength
        array plus the per-metric and per-entity sensitivities.
    """
    # Local import: ``plotsim.tables`` transitively imports
    # ``plotsim.manifest`` via the validation chain, so a top-level
    # import would introduce a cycle on cold load.
    from plotsim.tables import _build_seasonal_factors

    if not config.seasonal_effects:
        return SeasonalDecomposition(
            seasonal_factors=[],
            metric_seasonal_sensitivities={},
            entity_seasonal_sensitivities={},
        )

    factors = _build_seasonal_factors(config, n_periods)
    factor_list: list[float] = [float(x) for x in factors] if factors is not None else []
    return SeasonalDecomposition(
        seasonal_factors=factor_list,
        metric_seasonal_sensitivities={
            m.name: float(m.seasonal_sensitivity) for m in config.metrics
        },
        entity_seasonal_sensitivities={
            e.name: float(e.seasonal_sensitivity) for e in config.entities
        },
    )


def _ols_pair(
    a: np.ndarray,
    b: np.ndarray,
    metric_a: str,
    metric_b: str,
) -> RegressionPair:
    """Compute the bidirectional OLS summary for one ``(a, b)`` pair.

    Args:
        a: Observations of ``metric_a``; must be index-aligned with ``b``.
        b: Observations of ``metric_b``, same shape as ``a``.
        metric_a: Name recorded as ``metric_a`` on the result.
        metric_b: Name recorded as ``metric_b`` on the result.

    Returns:
        A ``RegressionPair``. Cells that are non-finite in either array
        are dropped first and the survivor count is reported as
        ``n_observations``. Fewer than two survivors yields a
        zero-filled record. A constant column yields zero slopes,
        ``r_squared == 0.0`` and mean-valued intercepts, because the
        slope is undefined when the regressor has no variance.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    mask = np.isfinite(a) & np.isfinite(b)
    a_f = a[mask]
    b_f = b[mask]
    n_obs = int(a_f.size)
    if n_obs < 2:
        return RegressionPair(
            metric_a=metric_a,
            metric_b=metric_b,
            beta_a_to_b=0.0,
            intercept_a_to_b=0.0,
            beta_b_to_a=0.0,
            intercept_b_to_a=0.0,
            r_squared=0.0,
            residual_variance_a_to_b=0.0,
            residual_variance_b_to_a=0.0,
            n_observations=n_obs,
        )

    # Detect constant columns by min == max rather than ``var == 0.0``:
    # rounding in the mean can leave a ~1e-34 variance on a constant
    # column, which would slip past an exact-zero test and produce
    # meaningless slopes and r².
    a_is_constant = bool(a_f.min() == a_f.max())
    b_is_constant = bool(b_f.min() == b_f.max())
    mean_a = float(a_f[0]) if a_is_constant else float(np.mean(a_f))
    mean_b = float(b_f[0]) if b_is_constant else float(np.mean(b_f))
    var_a = 0.0 if a_is_constant else float(np.var(a_f))
    var_b = 0.0 if b_is_constant else float(np.var(b_f))
    if var_a == 0.0 or var_b == 0.0:
        return RegressionPair(
            metric_a=metric_a,
            metric_b=metric_b,
            beta_a_to_b=0.0,
            intercept_a_to_b=mean_b,
            beta_b_to_a=0.0,
            intercept_b_to_a=mean_a,
            r_squared=0.0,
            residual_variance_a_to_b=var_b,
            residual_variance_b_to_a=var_a,
            n_observations=n_obs,
        )

    cov_ab = float(np.mean((a_f - mean_a) * (b_f - mean_b)))
    beta_a_to_b = cov_ab / var_a
    intercept_a_to_b = mean_b - beta_a_to_b * mean_a
    beta_b_to_a = cov_ab / var_b
    intercept_b_to_a = mean_a - beta_b_to_a * mean_b
    resid_a_to_b = b_f - (beta_a_to_b * a_f + intercept_a_to_b)
    resid_b_to_a = a_f - (beta_b_to_a * b_f + intercept_b_to_a)
    # r² is direction-invariant: corr² on the same observations. The
    # square roots are taken separately so the product of two very small
    # (or very large) variances cannot underflow (or overflow), and the
    # result is clamped because rounding can push it just above 1.
    corr = cov_ab / (float(np.sqrt(var_a)) * float(np.sqrt(var_b)))
    r_squared = float(min(1.0, corr * corr))
    return RegressionPair(
        metric_a=metric_a,
        metric_b=metric_b,
        beta_a_to_b=float(beta_a_to_b),
        intercept_a_to_b=float(intercept_a_to_b),
        beta_b_to_a=float(beta_b_to_a),
        intercept_b_to_a=float(intercept_b_to_a),
        r_squared=r_squared,
        residual_variance_a_to_b=float(np.var(resid_a_to_b)),
        residual_variance_b_to_a=float(np.var(resid_b_to_a)),
        n_observations=n_obs,
    )


def _pool_metric_arrays(
    entity_metrics: dict[str, dict[str, np.ndarray]],
    entity_names: list[str],
    metric: str,
) -> np.ndarray:
    """Flatten ``entity_metrics[e][metric]`` across ``entity_names``.

    Entities absent from ``entity_metrics`` and entities without the
    ``metric`` key are skipped silently. Because of that skipping, two
    separate calls for two different metrics are NOT guaranteed to be
    index-aligned; use ``_pool_pair_arrays`` when the result feeds a
    pair-wise fit.

    Returns:
        A 1-D ``float64`` array, empty when nothing matched.
    """
    chunks: list[np.ndarray] = []
    for ename in entity_names:
        per_metric = entity_metrics.get(ename)
        if per_metric is None:
            continue
        arr = per_metric.get(metric)
        if arr is None:
            continue
        chunks.append(np.asarray(arr, dtype=np.float64))
    if not chunks:
        return np.empty(0, dtype=np.float64)
    return np.concatenate(chunks)


def _pool_pair_arrays(
    entity_metrics: dict[str, dict[str, np.ndarray]],
    entity_names: list[str],
    metric_a: str,
    metric_b: str,
) -> tuple[np.ndarray, np.ndarray]:
    """Pool two metrics across entities while keeping them index-aligned.

    An entity contributes only when it carries BOTH series and the two
    series have the same length; otherwise it is skipped for this pair.
    This guarantees that position ``i`` of both returned arrays comes
    from the same entity and the same period.

    Returns:
        ``(a, b)`` as equal-length 1-D ``float64`` arrays; both empty
        when no entity qualifies.
    """
    chunks_a: list[np.ndarray] = []
    chunks_b: list[np.ndarray] = []
    for ename in entity_names:
        per_metric = entity_metrics.get(ename)
        if per_metric is None:
            continue
        raw_a = per_metric.get(metric_a)
        raw_b = per_metric.get(metric_b)
        if raw_a is None or raw_b is None:
            continue
        arr_a = np.asarray(raw_a, dtype=np.float64).ravel()
        arr_b = np.asarray(raw_b, dtype=np.float64).ravel()
        if arr_a.size != arr_b.size:
            # A per-entity length mismatch cannot be aligned by period;
            # dropping the entity is safer than fitting shifted data.
            continue
        chunks_a.append(arr_a)
        chunks_b.append(arr_b)
    if not chunks_a:
        empty = np.empty(0, dtype=np.float64)
        return empty, empty.copy()
    return np.concatenate(chunks_a), np.concatenate(chunks_b)


def _build_regression_pairs(
    config: PlotsimConfig,
    entity_metrics: dict[str, dict[str, np.ndarray]],
    entity_names: list[str],
) -> list[RegressionPair]:
    """One ``RegressionPair`` per declared correlation, pooled over a subset.

    Scope is intentionally narrowed to ``config.correlations`` (declared
    pairs only), which avoids O(n_metrics²) bloat and matches the scope
    of the ``correlations`` manifest section. A pair with fewer than two
    pooled observations still emits a zero-filled record, so a non-empty
    result mirrors ``config.correlations`` 1:1.

    Args:
        config: Run configuration; only ``correlations`` is read.
        entity_metrics: Per-entity, per-metric realized series.
        entity_names: Entities to pool over (all of them for the global
            section, one archetype's members for the grouped section).

    Returns:
        Records sorted by ``(metric_a, metric_b)``; ``[]`` when there
        are no correlations, no metrics, or no entity names.
    """
    if not config.correlations or not entity_metrics or not entity_names:
        return []
    out: list[RegressionPair] = []
    for pair in config.correlations:
        # Pool both metrics together so every (a, b) observation comes
        # from the same entity and period.
        a, b = _pool_pair_arrays(
            entity_metrics,
            entity_names,
            pair.metric_a,
            pair.metric_b,
        )
        out.append(_ols_pair(a, b, pair.metric_a, pair.metric_b))
    out.sort(key=lambda r: (r.metric_a, r.metric_b))
    return out


def _build_regression_pairs_by_archetype(
    config: PlotsimConfig,
    entity_metrics: dict[str, dict[str, np.ndarray]],
) -> dict[str, list[RegressionPair]]:
    """Group ``_build_regression_pairs`` output by ``Entity.archetype``.

    Archetypes that contribute no finite observation to any declared
    pair are omitted entirely (rather than mapped to a list of
    zero-filled records), so the dict reflects only archetypes that
    actually contributed to a fit. Keys are inserted in sorted order and
    each value list is sorted the same way as the global one.

    Returns:
        ``{archetype: [RegressionPair, ...]}``; ``{}`` when there are no
        correlations or no realized metrics.
    """
    if not config.correlations or not entity_metrics:
        return {}
    by_archetype: dict[str, list[str]] = {}
    for e in config.entities:
        by_archetype.setdefault(e.archetype, []).append(e.name)
    out: dict[str, list[RegressionPair]] = {}
    for archetype in sorted(by_archetype):
        pairs = _build_regression_pairs(
            config,
            entity_metrics,
            by_archetype[archetype],
        )
        # ``pairs`` is never empty here (zero-filled records are always
        # emitted), so gate on real observations instead of truthiness.
        if any(p.n_observations > 0 for p in pairs):
            out[archetype] = pairs
    return out
When supplied, ``regression_pairs_global`` + and ``regression_pairs_by_archetype`` are populated with pair-wise + OLS summaries for every declared correlation pair. ``None`` (older + callers, or callers deliberately skipping the regression section) + leaves both manifest fields at their empty defaults. + The function is pure and stateless — same inputs → same output. No RNG, no clock, no filesystem. """ @@ -1378,6 +1716,26 @@ def build_manifest( ), ) + # Schema 1.10: seasonal-decomposition snapshot + per-pair OLS + # summaries. Seasonal always emits (empty sentinel when no effects); + # regression sections emit only when both declared correlations and + # realized ``entity_metrics`` are present. + seasonal_decomposition = _build_seasonal_decomposition(config, n_periods) + if entity_metrics is not None: + every_entity = [e.name for e in config.entities] + regression_pairs_global = _build_regression_pairs( + config, + entity_metrics, + every_entity, + ) + regression_pairs_by_archetype = _build_regression_pairs_by_archetype( + config, + entity_metrics, + ) + else: + regression_pairs_global = [] + regression_pairs_by_archetype = {} + return ManifestSchema( schema_version=MANIFEST_SCHEMA_VERSION, seed=int(config.seed), @@ -1400,6 +1758,9 @@ def build_manifest( source_entity_mappings=source_entity_mappings, parent_child_relations=parent_child_relations, noise_config=noise_config_info, + seasonal_decomposition=seasonal_decomposition, + regression_pairs_global=regression_pairs_global, + regression_pairs_by_archetype=regression_pairs_by_archetype, ) @@ -1442,7 +1803,9 @@ def write_manifest(manifest: ManifestSchema, output_dir: Path) -> Path: "OutlierInjection", "ParentChildRelation", "QualityInjection", + "RegressionPair", "SCDEvent", + "SeasonalDecomposition", "SourceEntityMapping", "TrajectorySample", "build_manifest", diff --git a/plotsim/tables.py b/plotsim/tables.py index 0d58862..739be60 100644 --- a/plotsim/tables.py +++ b/plotsim/tables.py @@ -4377,6 +4377,15 @@ 
class GenerationState: bridge DataFrames. ``BridgeAssociations(bridges={})`` is the empty sentinel for configs without a ``bridges`` block. + ``entity_metrics`` carries the per-entity, per-metric realized series + produced by ``_compute_entity_metrics`` — the noise-free, distribution- + shaped values the fact tables were built from (before MCAR / outlier + injection rewrites cells). The manifest builder reads this for the + seasonal-decomposition and regression-pair sections; downstream + consumers that need the same arrays without re-running the engine + pick them up here. Empty dict for configs that never realized any + metric series (an edge case not produced by ``generate_tables_with_state``). + Future fields extend this dataclass; existing callers that destructure ``(tables, state)`` keep working because Python dataclass fields are accessed by name. @@ -4387,6 +4396,7 @@ class GenerationState: bridges: BridgeAssociations = field( default_factory=lambda: BridgeAssociations(bridges={}), ) + entity_metrics: dict[str, dict[str, np.ndarray]] = field(default_factory=dict) def _date_key_to_period_label(dim_date: pd.DataFrame) -> dict[int, str]: @@ -4794,4 +4804,5 @@ def generate_tables_with_state( trajectories=trajectories, scd=scd_state, bridges=bridge_associations, + entity_metrics=entity_metrics, ) diff --git a/tests/test_heteroscedastic_noise.py b/tests/test_heteroscedastic_noise.py index e683f9d..de77f64 100644 --- a/tests/test_heteroscedastic_noise.py +++ b/tests/test_heteroscedastic_noise.py @@ -277,7 +277,7 @@ def test_manifest_schema_version_pins_1_9(): warnings.simplefilter("ignore") cfg = _build_small_config(scale_with_trajectory=True) _tables, manifest = _generate_and_manifest(cfg) - assert manifest.schema_version == "1.9" + assert manifest.schema_version == "1.10" # --- End-to-end byte-identity for default-off engine path ------------------- diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 1caaa71..ffbcc43 100644 --- 
a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -384,12 +384,17 @@ def test_schema_version_bumped_to_1_9(): both per-entity ``TreatmentAssignment`` and per-cohort ``TreatmentCohort`` records (1.8 → 1.9); ``None`` default preserves byte-equivalence for pre-M24 configs modulo the version string. + Schema 1.10 added three additive sections — ``seasonal_decomposition`` + (always emitted with an empty sentinel on configs without + seasonality), ``regression_pairs_global``, and + ``regression_pairs_by_archetype`` — for per-pair OLS summaries and a + snapshot of the engine's seasonal-strength inputs. The version pin lives in this test rather than just the manifest module - so a downstream consumer pinning ``schema_version >= "1.9"`` has a + so a downstream consumer pinning ``schema_version >= "1.10"`` has a direct on-disk contract test it can reference. """ - assert MANIFEST_SCHEMA_VERSION == "1.9" + assert MANIFEST_SCHEMA_VERSION == "1.10" def test_causal_graph_emits_one_edge_per_metric_with_lag(saas_run): diff --git a/tests/test_manifest_decomposition_regression.py b/tests/test_manifest_decomposition_regression.py new file mode 100644 index 0000000..089c40f --- /dev/null +++ b/tests/test_manifest_decomposition_regression.py @@ -0,0 +1,547 @@ +"""Manifest enrichment: seasonal-decomposition snapshot + per-pair regression. + +Schema 1.10 adds three additive sections to ``ManifestSchema``: + +* ``seasonal_decomposition`` — the global per-period strength array plus + per-metric and per-entity sensitivity dicts. Configs without + ``seasonal_effects`` emit the empty-sentinel shape (empty list / empty + dicts) — the engine short-circuits on those configs and recording + the inert multipliers would just be noise. +* ``regression_pairs_global`` — pair-wise OLS β + intercept (both + directions), r², per-direction residual variance, and finite-cell + count for every declared correlation pair, pooled across all + entities. 
+* ``regression_pairs_by_archetype`` — the same OLS surface restricted + to the entity subset of each archetype. Archetypes that contribute + no finite observations are omitted entirely. + +This module locks in: + +* Seasonal factors match ``_build_seasonal_factors`` output value-for- + value, and the sensitivity dicts mirror the config. +* No-seasonality configs emit the empty-sentinel shape. +* Pooled OLS coefficients match a manual ``np.linalg.lstsq`` fit on the + same observations (rtol=1e-6). +* Both directions of β are emitted for every declared pair. +* Cold-start NaN cells are filtered out of the regression — fits still + succeed with the surviving observations, and ``n_observations`` + reports the surviving count. +* Configs without correlations / seasonality produce manifests + byte-equivalent to pre-1.10 modulo the schema version string and the + three new empty-sentinel containers. +* The schema version pin is ``"1.10"``. +""" + +from __future__ import annotations + +import warnings + +import numpy as np + +from plotsim import generate_tables_with_state +from plotsim.config import ( + Archetype, + Column, + CorrelationPair, + CurveSegment, + Domain, + Entity, + Metric, + OutputConfig, + PlotsimConfig, + SeasonalEffect, + SurrogateKeyWarning, + Table, + TimeWindow, +) +from plotsim.manifest import ( + MANIFEST_SCHEMA_VERSION, + RegressionPair, + SeasonalDecomposition, + build_manifest, +) +from plotsim.tables import _build_seasonal_factors + + +# --- Fixtures -------------------------------------------------------------- + + +def _two_metric_config( + *, + entities: list[Entity] | None = None, + archetypes: list[Archetype] | None = None, + correlations: list[CorrelationPair] | None = None, + seasonal_effects: list[SeasonalEffect] | None = None, + metric_seasonal_sensitivity: tuple[float, float] = (1.0, 1.0), +) -> PlotsimConfig: + """Two-metric engine config with one flat archetype. 
+ + Returns realized cell values in [0, 1] (beta distribution) so a + pooled OLS fit on entity_metrics has well-conditioned inputs. + ``correlations`` defaults to empty (no copula at draw time); pass a + non-empty list to engage the manifest's regression sections. + """ + if archetypes is None: + archetypes = [ + Archetype( + name="flat", + label="flat", + description="constant 0.5 plateau", + curve_segments=[ + CurveSegment( + curve="plateau", + params={"level": 0.5}, + start_pct=0.0, + end_pct=1.0, + ), + ], + ), + ] + m1 = Metric( + name="m1", + label="m1", + distribution="beta", + params={"alpha": 2.0, "beta": 5.0}, + polarity="positive", + seasonal_sensitivity=metric_seasonal_sensitivity[0], + ) + m2 = Metric( + name="m2", + label="m2", + distribution="beta", + params={"alpha": 2.0, "beta": 5.0}, + polarity="positive", + seasonal_sensitivity=metric_seasonal_sensitivity[1], + ) + fct = Table( + name="fct_m", + type="fact", + grain="per_entity_per_period", + primary_key=["date_key", "entity_id"], + foreign_keys=["dim_date.date_key", "dim_entity.entity_id"], + columns=[ + Column(name="date_key", dtype="id", source="fk:dim_date.date_key"), + Column(name="entity_id", dtype="id", source="fk:dim_entity.entity_id"), + Column(name="m1", dtype="float", source="metric:m1"), + Column(name="m2", dtype="float", source="metric:m2"), + ], + ) + dim_date = Table( + name="dim_date", + type="dim", + grain="per_period", + primary_key="date_key", + columns=[ + Column(name="date_key", dtype="id", source="pk"), + Column(name="date", dtype="date", source="generated:date_key"), + ], + ) + dim_entity = Table( + name="dim_entity", + type="dim", + grain="per_entity", + primary_key="entity_id", + columns=[ + Column(name="entity_id", dtype="id", source="pk"), + ], + ) + if entities is None: + entities = [Entity(name=f"e_{i}", archetype="flat", size=1) for i in range(10)] + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SurrogateKeyWarning) + return PlotsimConfig( + 
domain=Domain( + name="t", + description="t", + entity_type="entity", + entity_label="Entities", + ), + time_window=TimeWindow( + start="2024-01", + end="2024-12", + granularity="monthly", + ), + seed=0, + metrics=[m1, m2], + archetypes=archetypes, + entities=entities, + tables=[dim_date, dim_entity, fct], + correlations=correlations or [], + seasonal_effects=seasonal_effects or [], + output=OutputConfig(format="csv", directory="out/m25"), + ) + + +# --- Schema version pin ---------------------------------------------------- + + +def test_schema_version_is_1_10(): + """Lock the schema version at ``"1.10"`` so downstream readers + that pin on the constant catch the bump as a typed import-time + failure rather than an at-runtime field-shape surprise. + """ + assert MANIFEST_SCHEMA_VERSION == "1.10" + + +# --- Seasonal decomposition ------------------------------------------------ + + +def test_seasonal_decomposition_matches_engine_factors(): + """The manifest's ``seasonal_factors`` array is value-for-value the + output of ``_build_seasonal_factors`` for the same config — the + helper is the engine's own source of truth, so the manifest snapshot + must agree at the float level rather than at a normalized summary. 
+ """ + seasonal = [ + SeasonalEffect(months=(12, 1, 2), strength=0.8), + SeasonalEffect(months=(6, 7), strength=-0.3), + ] + cfg = _two_metric_config( + seasonal_effects=seasonal, + metric_seasonal_sensitivity=(0.5, 1.5), + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + decomp = manifest.seasonal_decomposition + assert isinstance(decomp, SeasonalDecomposition) + expected = _build_seasonal_factors(cfg, n_periods=len(tables["dim_date"])) + assert expected is not None + np.testing.assert_allclose( + np.asarray(decomp.seasonal_factors, dtype=np.float64), + expected, + rtol=0.0, + atol=0.0, + ) + assert decomp.metric_seasonal_sensitivities == {"m1": 0.5, "m2": 1.5} + assert set(decomp.entity_seasonal_sensitivities.keys()) == {e.name for e in cfg.entities} + # Per-entity sensitivities default to 1.0 — they were not overridden + # in this fixture, so every value should equal 1.0. + for v in decomp.entity_seasonal_sensitivities.values(): + assert v == 1.0 + + +def test_seasonal_decomposition_empty_when_no_effects(): + """Configs without seasonality emit the empty-sentinel shape: an + empty list for factors and empty dicts for both sensitivity maps. + Anchors D3 — D3 picked empty-containers over a null sentinel so + downstream consumers don't need a None-check before iterating. 
+ """ + cfg = _two_metric_config() + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + decomp = manifest.seasonal_decomposition + assert decomp.seasonal_factors == [] + assert decomp.metric_seasonal_sensitivities == {} + assert decomp.entity_seasonal_sensitivities == {} + + +# --- Regression pair correctness ------------------------------------------- + + +def _manual_ols(a: np.ndarray, b: np.ndarray) -> tuple[float, float]: + """Fit ``b = beta * a + intercept`` via ``np.linalg.lstsq``. + + Used as the cross-check oracle in + ``test_regression_beta_matches_numpy_lstsq``. Returns + ``(beta, intercept)`` so the assertion is direct. + """ + design = np.column_stack([a, np.ones_like(a)]) + coeffs, *_ = np.linalg.lstsq(design, b, rcond=None) + return float(coeffs[0]), float(coeffs[1]) + + +def test_regression_beta_matches_numpy_lstsq(): + """Acceptance #3: the manifest's pooled β for a declared correlation + pair matches a manual ``np.linalg.lstsq`` fit on the same + ``entity_metrics`` arrays, both directions, within rtol=1e-6. This + is the cross-check that ``_ols_pair``'s closed-form is right. 
+ """ + entities = [Entity(name=f"e_{i}", archetype="flat", size=1) for i in range(40)] + cfg = _two_metric_config( + entities=entities, + correlations=[CorrelationPair(metric_a="m1", metric_b="m2", coefficient=0.7)], + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + assert len(manifest.regression_pairs_global) == 1 + rec = manifest.regression_pairs_global[0] + assert (rec.metric_a, rec.metric_b) == ("m1", "m2") + + pool_a = np.concatenate([state.entity_metrics[e.name]["m1"] for e in cfg.entities]) + pool_b = np.concatenate([state.entity_metrics[e.name]["m2"] for e in cfg.entities]) + mask = np.isfinite(pool_a) & np.isfinite(pool_b) + pool_a = pool_a[mask] + pool_b = pool_b[mask] + expected_beta_a_to_b, expected_int_a_to_b = _manual_ols(pool_a, pool_b) + expected_beta_b_to_a, expected_int_b_to_a = _manual_ols(pool_b, pool_a) + assert np.isclose(rec.beta_a_to_b, expected_beta_a_to_b, rtol=1e-6, atol=1e-9) + assert np.isclose(rec.intercept_a_to_b, expected_int_a_to_b, rtol=1e-6, atol=1e-9) + assert np.isclose(rec.beta_b_to_a, expected_beta_b_to_a, rtol=1e-6, atol=1e-9) + assert np.isclose(rec.intercept_b_to_a, expected_int_b_to_a, rtol=1e-6, atol=1e-9) + expected_r2 = float(np.corrcoef(pool_a, pool_b)[0, 1] ** 2) + assert np.isclose(rec.r_squared, expected_r2, rtol=1e-6, atol=1e-9) + assert rec.n_observations == int(pool_a.size) + + +def test_regression_emits_both_directions(): + """Both ``β_{a→b}`` and ``β_{b→a}`` are emitted, and (given non- + degenerate variance) they're related by ``β_{a→b} * β_{b→a} == + r²``. The mathematical identity is the right invariant to pin — + it would catch a copy-paste error that reused the same direction's + β for both fields. 
+ """ + entities = [Entity(name=f"e_{i}", archetype="flat", size=1) for i in range(40)] + cfg = _two_metric_config( + entities=entities, + correlations=[CorrelationPair(metric_a="m1", metric_b="m2", coefficient=0.7)], + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + rec = manifest.regression_pairs_global[0] + product = rec.beta_a_to_b * rec.beta_b_to_a + assert np.isclose(product, rec.r_squared, rtol=1e-6, atol=1e-9) + + +def test_regression_by_archetype_emits_per_archetype(): + """Acceptance #4: with two archetypes and a declared correlation + pair, ``regression_pairs_by_archetype`` has one entry per archetype + and each entry's pooled β is computed over only that archetype's + entities. + """ + archetypes = [ + Archetype( + name="flat_low", + label="flat_low", + description="constant 0.3", + curve_segments=[ + CurveSegment( + curve="plateau", + params={"level": 0.3}, + start_pct=0.0, + end_pct=1.0, + ), + ], + ), + Archetype( + name="flat_high", + label="flat_high", + description="constant 0.7", + curve_segments=[ + CurveSegment( + curve="plateau", + params={"level": 0.7}, + start_pct=0.0, + end_pct=1.0, + ), + ], + ), + ] + entities = [Entity(name=f"lo_{i}", archetype="flat_low", size=1) for i in range(20)] + [ + Entity(name=f"hi_{i}", archetype="flat_high", size=1) for i in range(20) + ] + cfg = _two_metric_config( + entities=entities, + archetypes=archetypes, + correlations=[CorrelationPair(metric_a="m1", metric_b="m2", coefficient=0.6)], + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + by_arch = manifest.regression_pairs_by_archetype + 
assert set(by_arch.keys()) == {"flat_low", "flat_high"} + for archetype_name in ("flat_low", "flat_high"): + recs = by_arch[archetype_name] + assert len(recs) == 1 + rec = recs[0] + names = [e.name for e in cfg.entities if e.archetype == archetype_name] + pool_a = np.concatenate([state.entity_metrics[n]["m1"] for n in names]) + pool_b = np.concatenate([state.entity_metrics[n]["m2"] for n in names]) + mask = np.isfinite(pool_a) & np.isfinite(pool_b) + pool_a = pool_a[mask] + pool_b = pool_b[mask] + expected_beta, expected_int = _manual_ols(pool_a, pool_b) + assert np.isclose(rec.beta_a_to_b, expected_beta, rtol=1e-6, atol=1e-9) + assert np.isclose(rec.intercept_a_to_b, expected_int, rtol=1e-6, atol=1e-9) + + +def test_regression_pairs_empty_without_correlations(): + """Acceptance #5: configs without ``correlations`` emit empty + regression sections. Mirrors the existing ``correlations`` section's + contract — undeclared = no record. + """ + cfg = _two_metric_config() + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + assert manifest.regression_pairs_global == [] + assert manifest.regression_pairs_by_archetype == {} + + +def test_regression_nan_cells_are_skipped(): + """``n_observations`` reflects only finite ``(a, b)`` cells. + Synthetic NaN injection into the realized ``entity_metrics`` (cells + a downstream cold-start contract would produce as NaN) is masked + out by ``_ols_pair`` — the fit still succeeds on the surviving + observations and the count drops by the masked-cell count. 
+ """ + entities = [Entity(name=f"e_{i}", archetype="flat", size=1) for i in range(20)] + cfg = _two_metric_config( + entities=entities, + correlations=[CorrelationPair(metric_a="m1", metric_b="m2", coefficient=0.5)], + ) + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + # Drop the first three periods on one entity to NaN — emulates a + # cold-start lead-in pattern the regression layer must tolerate. + contaminated = { + ename: {metric: arr.copy() for metric, arr in per_metric.items()} + for ename, per_metric in state.entity_metrics.items() + } + contaminated["e_0"]["m1"][:3] = np.nan + n_periods = len(tables["dim_date"]) + expected_finite = len(entities) * n_periods - 3 + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=contaminated, + ) + rec = manifest.regression_pairs_global[0] + assert rec.n_observations == expected_finite + # Fit still produces a finite β under the mask. + assert np.isfinite(rec.beta_a_to_b) + assert np.isfinite(rec.beta_b_to_a) + + +# --- Byte-equivalence guard ------------------------------------------------ + + +def test_byte_equivalent_to_pre_1_10_modulo_new_fields(): + """Configs without correlations or seasonality emit a 1.10 manifest + whose serialized payload equals the equivalent pre-1.10 payload + modulo (a) ``schema_version``, (b) the three new sentinel-shaped + fields. Anchors the additive-only promise: 1.9 readers parsing a + 1.10 manifest see only the new fields' defaults — no existing + field's value or ordering changed. 
+ """ + cfg = _two_metric_config() + rng = np.random.default_rng(cfg.seed) + tables, state = generate_tables_with_state(cfg, rng) + manifest = build_manifest( + cfg, + state.trajectories, + tables, + scd_state=state.scd, + bridge_state=state.bridges, + entity_metrics=state.entity_metrics, + ) + payload = manifest.model_dump(mode="json") + # Strip the new and version fields; everything else must be empty + # or a default produced by the pre-1.10 build path. + assert payload.pop("schema_version") == "1.10" + seasonal = payload.pop("seasonal_decomposition") + assert seasonal == { + "seasonal_factors": [], + "metric_seasonal_sensitivities": {}, + "entity_seasonal_sensitivities": {}, + } + assert payload.pop("regression_pairs_global") == [] + assert payload.pop("regression_pairs_by_archetype") == {} + # The remaining keys are exactly the pre-1.10 field set. We don't + # snapshot the full dict (that's what schema-pin tests cover); we + # just assert the new fields are the only delta from the legacy + # shape by checking nothing pre-1.10 went missing. + for legacy_key in ( + "seed", + "config_sha256", + "archetype_assignments", + "trajectory_samples", + "event_firings", + "causal_graph", + "correlations", + "noise_config", + ): + assert legacy_key in payload, f"legacy field missing from 1.10 payload: {legacy_key}" + + +# --- RegressionPair shape -------------------------------------------------- + + +def test_regression_pair_carries_required_fields(): + """``RegressionPair`` exposes the ten fields the schema documents. + Pins the model surface independently of the engine path — would + catch a partial rename or a dropped field on a refactor. 
+ """ + rec = RegressionPair( + metric_a="x", + metric_b="y", + beta_a_to_b=1.0, + intercept_a_to_b=0.0, + beta_b_to_a=1.0, + intercept_b_to_a=0.0, + r_squared=1.0, + residual_variance_a_to_b=0.0, + residual_variance_b_to_a=0.0, + n_observations=10, + ) + payload = rec.model_dump(mode="json") + assert set(payload.keys()) == { + "metric_a", + "metric_b", + "beta_a_to_b", + "intercept_a_to_b", + "beta_b_to_a", + "intercept_b_to_a", + "r_squared", + "residual_variance_a_to_b", + "residual_variance_b_to_a", + "n_observations", + } diff --git a/tests/test_multi_source.py b/tests/test_multi_source.py index a4a8547..4996952 100644 --- a/tests/test_multi_source.py +++ b/tests/test_multi_source.py @@ -313,10 +313,12 @@ def test_manifest_schema_bumped_to_1_9(): # with ``noise_family`` / ``degrees_of_freedom`` and broadens its # emission criterion to cover non-gaussian families (0.6-M23); 1.9 # adds the optional ``target_metric`` field on ``TreatmentAssignment`` - # / ``TreatmentCohort`` for per-metric treatment effects (0.6-M24). + # / ``TreatmentCohort`` for per-metric treatment effects (0.6-M24); + # 1.10 adds the ``seasonal_decomposition`` snapshot plus + # ``regression_pairs_global`` / ``regression_pairs_by_archetype``. # This module's contract tracks the pin at the schema level, not the # field semantics. - assert MANIFEST_SCHEMA_VERSION == "1.9" + assert MANIFEST_SCHEMA_VERSION == "1.10" # ── AC6: single-source configs unchanged (no multi_source block) ────────── @@ -406,7 +408,7 @@ def test_bundled_template_loads_and_validates(tmp_path: Path): assert (out_dir / "dim_company_crm.csv").is_file() assert (out_dir / "dim_company_billing.csv").is_file() manifest_payload = json.loads((out_dir / "manifest.json").read_text(encoding="utf-8")) - assert manifest_payload["schema_version"] == "1.9" + assert manifest_payload["schema_version"] == "1.10" # 20 entities × 2 sources = 40 mapping records. 
assert len(manifest_payload["source_entity_mappings"]) == 40 diff --git a/tests/test_per_metric_treatment.py b/tests/test_per_metric_treatment.py index 39dcdf9..7ee492e 100644 --- a/tests/test_per_metric_treatment.py +++ b/tests/test_per_metric_treatment.py @@ -518,8 +518,11 @@ def test_manifest_schema_version_bumped_for_m24(): schema. Pre-M24 readers see a 1.9 manifest's new ``target_metric`` field default to ``None`` so they parse cleanly — but the schema string itself must advance to signal that the new field exists. + Subsequent additive sections continue to advance the pin + (1.9 → 1.10 for the seasonal-decomposition snapshot plus per-pair + regression summaries). """ - assert MANIFEST_SCHEMA_VERSION == "1.9" + assert MANIFEST_SCHEMA_VERSION == "1.10" # --- Builder propagation --------------------------------------------------- diff --git a/tests/test_time_varying_correlations.py b/tests/test_time_varying_correlations.py index 423c2c5..95eb431 100644 --- a/tests/test_time_varying_correlations.py +++ b/tests/test_time_varying_correlations.py @@ -686,8 +686,10 @@ def test_schema_version_is_1_9(self): # bumped 1.7 → 1.8 for ``noise_family`` / ``degrees_of_freedom`` # on ``NoiseConfigInfo`` and broadened its emission criterion; # 0.6-M24 bumped 1.8 → 1.9 for the additive ``target_metric`` - # field on ``TreatmentAssignment`` / ``TreatmentCohort``. - assert MANIFEST_SCHEMA_VERSION == "1.9" + # field on ``TreatmentAssignment`` / ``TreatmentCohort``; + # the 1.9 → 1.10 bump added the ``seasonal_decomposition`` + # snapshot plus per-pair OLS summary sections. + assert MANIFEST_SCHEMA_VERSION == "1.10" def test_no_phases_yields_empty_correlation_phases_list(self): cfg = _two_metric_config(