From d4ded979a3d49f9f8045f0144494a8d6b5f279ce Mon Sep 17 00:00:00 2001 From: mohossam01 Date: Sat, 16 May 2026 01:09:27 -0400 Subject: [PATCH 1/2] feat: overhaul bundled template catalog to align with new engine features Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 26 ++ CONTRIBUTING.md | 2 +- docs/site/api-reference.md | 8 +- docs/site/column-types.md | 10 +- docs/site/config-reference.md | 6 +- docs/site/cookbook/data-engineering.md | 9 +- docs/site/cookbook/data-science.md | 11 +- docs/site/feature-reference.md | 18 +- .../tutorial-notebooks/getting_started.ipynb | 8 +- docs/site/user-guide/cdc-facts.md | 10 +- .../user-guide/experiments-and-cohorts.md | 13 +- docs/site/user-guide/geo-hierarchy.md | 15 +- .../user-guide/metrics-and-connections.md | 4 +- docs/site/user-guide/multi-source.md | 19 +- docs/site/user-guide/narrative-source.md | 12 +- plotsim/configs/templates/banking.yaml | 321 +++++++++++++ plotsim/configs/templates/banking_template.py | 393 ++++++++++++++++ .../configs/templates/education_template.py | 279 ------------ .../configs/templates/education_template.yaml | 358 --------------- plotsim/configs/templates/health.yaml | 327 ++++++++++++++ plotsim/configs/templates/health_template.py | 401 +++++++++++++++++ plotsim/configs/templates/hr.yaml | 302 +++++++++++++ plotsim/configs/templates/hr_template.py | 212 ++++++--- plotsim/configs/templates/hr_template.yaml | 352 --------------- plotsim/configs/templates/marketing.yaml | 290 ++++++++++++ .../configs/templates/marketing_template.py | 128 +++--- .../configs/templates/marketing_template.yaml | 400 ----------------- plotsim/configs/templates/retail.yaml | 311 +++++++++++++ plotsim/configs/templates/retail_template.py | 424 ++++++++++-------- plotsim/configs/templates/saas.yaml | 259 +++++++++++ plotsim/configs/templates/saas_template.py | 274 +++++------ tests/configs/__init__.py | 14 + .../templates => tests/configs}/ab_trial.py | 0 .../templates => tests/configs}/ab_trial.yaml 
| 0 .../configs}/bare_minimum.py | 2 +- .../configs}/bare_minimum.yaml | 2 +- .../templates => tests/configs}/cdc_demo.py | 2 +- .../templates => tests/configs}/cdc_demo.yaml | 2 +- .../configs}/crm_billing_overlap.py | 0 .../configs}/crm_billing_overlap.yaml | 0 .../templates => tests/configs}/geo_retail.py | 0 .../configs}/geo_retail.yaml | 0 .../templates => tests/configs}/lakehouse.py | 0 .../configs}/lakehouse.yaml | 0 .../configs}/latency_skew.py | 0 .../configs}/latency_skew.yaml | 0 .../configs}/narrative_reviews.py | 0 .../configs}/narrative_reviews.yaml | 0 .../configs}/orders_template.py | 2 +- .../configs}/orders_template.yaml | 2 +- tests/configs/retail_template.py | 357 +++++++++++++++ .../configs}/retail_template.yaml | 0 tests/configs/saas_template.py | 300 +++++++++++++ .../configs}/saas_template.yaml | 0 tests/test_builder_input.py | 2 +- tests/test_builder_integration.py | 4 +- tests/test_builder_schema.py | 4 +- tests/test_bypass_observability.py | 6 +- tests/test_cdc_facts.py | 16 +- tests/test_cli.py | 15 +- tests/test_entity_expansion.py | 2 +- tests/test_fk_target_resolution.py | 4 +- tests/test_geo_provider.py | 27 +- tests/test_jsonl_output.py | 10 +- tests/test_multi_source.py | 17 +- tests/test_narrative_source.py | 13 +- tests/test_parent_child_facts.py | 12 +- tests/test_partitioned_parquet.py | 18 +- tests/test_pk_prefix.py | 11 +- tests/test_sql_output.py | 12 +- tests/test_streaming_parquet.py | 14 +- tests/test_templates_api.py | 12 +- 72 files changed, 4075 insertions(+), 2009 deletions(-) create mode 100644 plotsim/configs/templates/banking.yaml create mode 100644 plotsim/configs/templates/banking_template.py delete mode 100644 plotsim/configs/templates/education_template.py delete mode 100644 plotsim/configs/templates/education_template.yaml create mode 100644 plotsim/configs/templates/health.yaml create mode 100644 plotsim/configs/templates/health_template.py create mode 100644 plotsim/configs/templates/hr.yaml delete mode 
100644 plotsim/configs/templates/hr_template.yaml create mode 100644 plotsim/configs/templates/marketing.yaml delete mode 100644 plotsim/configs/templates/marketing_template.yaml create mode 100644 plotsim/configs/templates/retail.yaml create mode 100644 plotsim/configs/templates/saas.yaml create mode 100644 tests/configs/__init__.py rename {plotsim/configs/templates => tests/configs}/ab_trial.py (100%) rename {plotsim/configs/templates => tests/configs}/ab_trial.yaml (100%) rename {plotsim/configs/templates => tests/configs}/bare_minimum.py (93%) rename {plotsim/configs/templates => tests/configs}/bare_minimum.yaml (96%) rename {plotsim/configs/templates => tests/configs}/cdc_demo.py (97%) rename {plotsim/configs/templates => tests/configs}/cdc_demo.yaml (98%) rename {plotsim/configs/templates => tests/configs}/crm_billing_overlap.py (100%) rename {plotsim/configs/templates => tests/configs}/crm_billing_overlap.yaml (100%) rename {plotsim/configs/templates => tests/configs}/geo_retail.py (100%) rename {plotsim/configs/templates => tests/configs}/geo_retail.yaml (100%) rename {plotsim/configs/templates => tests/configs}/lakehouse.py (100%) rename {plotsim/configs/templates => tests/configs}/lakehouse.yaml (100%) rename {plotsim/configs/templates => tests/configs}/latency_skew.py (100%) rename {plotsim/configs/templates => tests/configs}/latency_skew.yaml (100%) rename {plotsim/configs/templates => tests/configs}/narrative_reviews.py (100%) rename {plotsim/configs/templates => tests/configs}/narrative_reviews.yaml (100%) rename {plotsim/configs/templates => tests/configs}/orders_template.py (98%) rename {plotsim/configs/templates => tests/configs}/orders_template.yaml (99%) create mode 100644 tests/configs/retail_template.py rename {plotsim/configs/templates => tests/configs}/retail_template.yaml (100%) create mode 100644 tests/configs/saas_template.py rename {plotsim/configs/templates => tests/configs}/saas_template.yaml (100%) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 4a894a1..ca7d3b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,32 @@ Versioning: [SemVer](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Changed + +- **Bundled template catalog refreshed.** `plotsim.list_templates()` + now returns exactly six domain templates: `banking`, `health`, + `hr`, `marketing`, `retail`, `saas`. Each is schema-realistic + (real column topology and FK shapes for the domain), + output-realistic (pool, range, distribution, correlation, and seasonality + choices match the domain's real data shape), and feature-deep — + every template exercises SCD2, lifecycle stages, 3+ correlations, + causal lag, seasonality, and 2 event tables; CDC on the relevant + fact for each domain; per-metric treatment cohorts on `marketing` + and `health`; bridge tables on `hr`, `retail`, `banking`, and + `health`; parent/child fact grain on `retail`, `banking`, and + `health`; cross-fact FK on `retail` and `health`; geo bundle on + `retail`, `banking`, and `health`; narrative columns on `hr`, + `retail`, `banking`, and `health`; heteroscedastic noise on + `saas` and `health`; student-t noise on `banking`; holdout splits + on `banking` and `health`; sub-entity dim on `saas`; multi-locale + on `retail`. Ten of the fourteen previously bundled templates + — `ab_trial`, `bare_minimum`, `cdc_demo`, + `crm_billing_overlap`, `education`, `geo_retail`, `lakehouse`, + `latency_skew`, `narrative_reviews`, `orders` — have been removed + from the public surface: the feature-vehicle YAMLs and `.py` + companions now live under `tests/configs/` (`education` is removed outright) and continue + to power the existing feature-coverage test files unchanged. + ### Added - **Manifest decomposition + regression sections.** The manifest sidecar diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1373e1a..4be2243 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -41,7 +41,7 @@ your config shape: Steps: -1. 
Copy an existing template (e.g. `saas.yaml` + `saas_template.py`) as a starting point. 2. Edit metrics, segments / archetypes, dimensions, facts, events, and any feature-specific blocks for the new use case. diff --git a/docs/site/api-reference.md b/docs/site/api-reference.md index 11bd397..cb2e205 100644 --- a/docs/site/api-reference.md +++ b/docs/site/api-reference.md @@ -131,12 +131,10 @@ Return the names of bundled builder templates. def list_templates() -> list[str] ``` -Names round-trip through [`load_template`](#load_template). Templates -whose filename ends in `_template` strip that suffix; `bare_minimum` -and the single-feature templates keep their full stems. Sorted -alphabetically. +Names round-trip through [`load_template`](#load_template). The bundled +catalog covers six domains, sorted alphabetically. -**Returns** — e.g. `["ab_trial", "bare_minimum", "cdc_demo", "crm_billing_overlap", "education", "geo_retail", "hr", "lakehouse", "latency_skew", "marketing", "narrative_reviews", "retail", "saas"]`. +**Returns** — `["banking", "health", "hr", "marketing", "retail", "saas"]`. **Example** diff --git a/docs/site/column-types.md b/docs/site/column-types.md index 56be247..c82fc66 100644 --- a/docs/site/column-types.md +++ b/docs/site/column-types.md @@ -144,7 +144,9 @@ Output dtype is `float` for `latitude` / `longitude` and `string` for everything else. `geo.` is dim-only; on facts and events the engine raises `unsupported generated provider`. See [Geo hierarchy](./user-guide/geo-hierarchy.md) for the underlying -dataset, determinism, and the bundled `geo_retail` template. +dataset, determinism, and the `tests/configs/geo_retail.yaml` +worked example; the bundled `retail`, `banking`, and `health` +domain templates each put a geo bundle on their customer/patient dim. --- @@ -182,8 +184,10 @@ builder API). `narrative` is fact-only and per_entity_per_period; the cell builder forces the scalar fact path because it consumes one RNG draw per slot per row. 
See [Narrative text source](./user-guide/narrative-source.md) for the -lexicon-design playbook, validation gates, and the bundled -`narrative_reviews` template. +lexicon-design playbook, validation gates, and the +`tests/configs/narrative_reviews.yaml` worked example; narrative +columns also ship on the bundled `hr`, `retail`, `banking`, and +`health` domain templates. --- diff --git a/docs/site/config-reference.md b/docs/site/config-reference.md index d277345..a071db9 100644 --- a/docs/site/config-reference.md +++ b/docs/site/config-reference.md @@ -725,7 +725,7 @@ output: |---|---|---|---| | `format` | `"csv"` / `"parquet"` / `"jsonl"` / `"sql"` | `"csv"` | `parquet` requires `pip install plotsim[parquet]` (pyarrow) and produces typed binary files ~5–10× smaller than CSV. `jsonl` writes newline-delimited JSON (one self-contained object per row) for streaming-ingestion / schema-on-read consumers. `sql` writes a single `data.sql` file with dialect-aware DDL + batched INSERTs instead of per-table files | | `directory` | `str` | `"output"` | Where `write_tables` writes. Override at call time with `write_tables(..., output_dir=...)` | -| `cell_budget` | `int ≥ 0` / `null` | `null` | Soft cell-count cap consumed by the load-time scale estimator. `null` falls through to `PLOTSIM_CELL_BUDGET` env var, then to the 2,000,000 default. `0` disables the soft cap entirely. See [Cell-count budget](#cell-count-budget) for precedence and the bundled `lakehouse` template for a worked example | +| `cell_budget` | `int ≥ 0` / `null` | `null` | Soft cell-count cap consumed by the load-time scale estimator. `null` falls through to `PLOTSIM_CELL_BUDGET` env var, then to the 2,000,000 default. `0` disables the soft cap entirely. See [Cell-count budget](#cell-count-budget) for precedence and `tests/configs/lakehouse.yaml` for a worked example | | `denormalized` | `bool` | `false` | Opt-in wide-table companion writer. 
When `true`, every fact table is left-joined with its FK'd dims (SCD2 dims filtered to current state) and emits `_wide.` alongside the normalized output. Under `format: sql` the wide tables emit as trailing blocks inside `data.sql` instead of separate files | | `partition_by` | `str` / `null` | `null` | Column name to partition Parquet output on. When set, every table that carries the column is written as a Hive-style directory (`//=/...`) via `pyarrow.parquet.write_to_dataset`. Tables without the column fall back to single files. Requires `format: parquet`; cross-validated at config load | | `sql_dialect` | `"postgresql"` / `"mysql"` / `"sqlite"` | `"postgresql"` | Dialect for the SQL dump writer — selects identifier quoting (`"col"` for PG/SQLite, `` `col` `` for MySQL), type words (PG `NUMERIC` / MySQL `DOUBLE` + `VARCHAR(255)` for string PKs / SQLite `REAL`), and boolean encoding. The default round-trips under any format; explicit `mysql` / `sqlite` requires `format: sql` (cross-validated at config load) | @@ -894,8 +894,8 @@ precedence order (the first one that resolves wins): 1. **Config field (recommended)** — set `output.cell_budget: N` in the YAML (or pass `output={"cell_budget": N}` to `create()`). Reproducible from the config alone — no env vars or flags - required, which is the contract the bundled `lakehouse` - template relies on. + required, which is the contract the `tests/configs/lakehouse.yaml` + worked example relies on. 2. **Environment variable** — `PLOTSIM_CELL_BUDGET=N` sets the soft cap to `N` cells when no config field is set. 3. **Default** — `2,000,000` cells. diff --git a/docs/site/cookbook/data-engineering.md b/docs/site/cookbook/data-engineering.md index 1770d1e..888ba40 100644 --- a/docs/site/cookbook/data-engineering.md +++ b/docs/site/cookbook/data-engineering.md @@ -56,8 +56,9 @@ whichever fits your workflow. 
Or skip the YAML round-trip entirely — the [`saas_template.py`](https://github.com/mohossam01/plotsim/blob/main/plotsim/configs/templates/saas_template.py) - bundled with plotsim shows the same template authored as - `create(**kwargs)` directly. + bundled with plotsim shows the same SaaS template authored as + `create(**kwargs)` directly, paired with `saas.yaml` in the + same directory. Pin `seed:` in the YAML (or pass `seed=42` to `create`) and the fixture is byte-stable across CI runs. @@ -351,8 +352,8 @@ in the config (recommended; reproducible from YAML alone), `output.cell_budget: 0` (or `PLOTSIM_CELL_BUDGET=0`) disables the soft cap entirely; only the `50,000,000`-cell hard ceiling still applies. See [Limits](../config-reference.md#limits-and-performance-gates) for -the full ladder and the bundled `lakehouse` template for a worked -example of a 1.5M-cell config. +the full ladder; `tests/configs/lakehouse.yaml` in the repo is a +worked example of a config near the 1.5M-cell range. --- diff --git a/docs/site/cookbook/data-science.md b/docs/site/cookbook/data-science.md index be74882..a370a19 100644 --- a/docs/site/cookbook/data-science.md +++ b/docs/site/cookbook/data-science.md @@ -50,8 +50,9 @@ multi-metric dataset with archetype ground truth. ``` The [`saas_template.py`](https://github.com/mohossam01/plotsim/blob/main/plotsim/configs/templates/saas_template.py) -companion shows the same template authored as a `create(**kwargs)` -call — every YAML field maps 1-1 to a Python keyword. +companion (paired with `saas.yaml` in the same directory) shows the +same SaaS template authored as a `create(**kwargs)` call — every YAML +field maps 1-1 to a Python keyword. --- @@ -228,9 +229,9 @@ time, not just larger ones. All six builder distribution families (`lognorm`, `gamma`, `weibull`, `beta`, `normal`, `poisson`) are pinnable the same way -via `MetricInput.distribution` + `distribution_params`. 
The bundled -`latency_skew` template (`plotsim template latency_skew`) exercises -all six on a single config. Full mechanics: +via `MetricInput.distribution` + `distribution_params`. The +`tests/configs/latency_skew.yaml` worked example exercises all six +on a single config. Full mechanics: [`metrics-and-connections.md` §pinning the distribution explicitly](../user-guide/metrics-and-connections.md#pinning-the-distribution-explicitly). --- diff --git a/docs/site/feature-reference.md b/docs/site/feature-reference.md index a08c8a5..83b93ea 100644 --- a/docs/site/feature-reference.md +++ b/docs/site/feature-reference.md @@ -22,7 +22,7 @@ Three surfaces today: |---|---|---| | Library | `plotsim.create`, `create_from_yaml`, `generate_tables`, `write_tables` | Python users in an IDE or notebook | | CLI | `plotsim run`, `validate`, `info`, `template`, `schema` | Terminal, CI, scripts | -| YAML | bundled templates: `ab_trial`, `bare_minimum`, `cdc_demo`, `crm_billing_overlap`, `education`, `geo_retail`, `hr`, `lakehouse`, `latency_skew`, `marketing`, `narrative_reviews`, `retail`, `saas` | Anyone who wants to hand-edit a config | +| YAML | bundled domain templates: `banking`, `health`, `hr`, `marketing`, `retail`, `saas` | Anyone who wants to hand-edit a config | --- @@ -39,7 +39,7 @@ integrity / provenance tooling. |---|---|---| | Trajectory-first metric generation | Every metric for an entity at time *t* is derived from one archetype-curve position | `generate_tables(cfg)` | | Determinism | Single seeded `numpy.random.Generator` flows through every random draw | YAML `seed:` (integer) | -| Cell-budget scale gate | Soft pre-flight guard that aborts runs above the configured cell ceiling. Precedence: `output.cell_budget` field > `PLOTSIM_CELL_BUDGET` env > 2M default; `0` disables. Bundled template `lakehouse` exercises a 1.5M-cell config. 
| YAML `output.cell_budget: `; env override `PLOTSIM_CELL_BUDGET` / `PLOTSIM_ALLOW_LARGE_DATASET` | +| Cell-budget scale gate | Soft pre-flight guard that aborts runs above the configured cell ceiling. Precedence: `output.cell_budget` field > `PLOTSIM_CELL_BUDGET` env > 2M default; `0` disables. `tests/configs/lakehouse.yaml` is a worked example near the 1.5M-cell range. | YAML `output.cell_budget: `; env override `PLOTSIM_CELL_BUDGET` / `PLOTSIM_ALLOW_LARGE_DATASET` | #### Tables emitted @@ -114,7 +114,7 @@ is no longer byte-identical to a pre-flag run of the same file. |---|---|---| | Lifecycle stages | Per-entity stage sequence with stage-specific archetype overrides | YAML `lifecycle:` | | Cohort arrival distribution | Per-segment entity arrival shape — `uniform` / `linear` / `step` / `explicit` — driving `Entity.start_period`, so the entity body grows or contracts across the window. Cold-start cells are NaN-filled and dropped pre-write. Validator enforces every entity has ≥2 active periods. | builder kwarg `arrival:` on segments (4-shape discriminated union); YAML `Entity.start_period` directly | -| Treatment / control cohorts | Per-entity treatment assignment with a logit-shift on trajectory position from `treatment_start_period` onward (`treatment_lift_log_odds`). Known effect → A/B test analysis, uplift modeling, causal inference. Manifest carries `TreatmentAssignment` per entity + `TreatmentCohort` per segment. Bundled template `ab_trial`. | YAML `Entity.treatment_group` / `treatment_lift_log_odds` / `treatment_start_period` | +| Treatment / control cohorts | Per-entity treatment assignment with a logit-shift on trajectory position from `treatment_start_period` onward (`treatment_lift_log_odds`). Known effect → A/B test analysis, uplift modeling, causal inference. Manifest carries `TreatmentAssignment` per entity + `TreatmentCohort` per segment. 
Demonstrated on bundled `marketing` and `health` (per-metric lifts) and `banking` (whole-trajectory lift); `tests/configs/ab_trial.yaml` is the dedicated worked example. | YAML `Entity.treatment_group` / `treatment_lift_log_odds` / `treatment_start_period` | ### 6. Dim columns + fact-grain text — fill non-metric cells with realistic content @@ -124,7 +124,7 @@ is no longer byte-identical to a pre-flag run of the same file. | Faker-backed text + identifiers | PII-shape providers wired into the engine: `name`, `email`, `phone_number`, `company`, `address`, `postcode`, `country`, `city`, `latitude`, `longitude`, `sentence`. Deterministic under the run seed. Useful for masking exercises and regex-validation scenarios; **does not read entity, archetype, or trajectory** (each call is an independent draw). | | Range source | `type: range` with `range: [min, max]` on fact / event columns produces a per-row uniform draw between the bounds. Integer bounds → `dtype: int` and inclusive upper bound; float bounds → `dtype: float` and exclusive upper bound (numpy conventions). Use it for `quantity ∈ [1, 5]`, `unit_price ∈ [10.0, 500.0]`, and similar shape constraints that `faker.random_int` / `faker.pyfloat` express less precisely. Deterministic under seed. | | Pool source on facts and events | `type: pool.` lifts the per-entity value pool (previously dim-only) onto per_entity_per_period facts, variable-grain facts, per_parent_row child facts, and event tables. Every row resolves to its entity's segment, then draws uniformly from `attributes[]` — so a `loyal` cohort customer's `channel` always lands in `[app, web]` while a `casual` customer's lands in `[sms, email]`. Per_period facts (the `dim_date`-style grain) remain out of scope — those rows have no per-row entity binding. | -| Narrative text source (trajectory-aware) | Per-archetype lexicons + a sentence template rendered into a `narrative` column on a fact table. 
Output vocabulary tracks the entity's trajectory position (a high-position `growth` entity produces systematically different text than a low-position `decline` entity); a simple bag-of-words classifier hits ≥0.55 accuracy on archetype prediction. Deterministic under seed; preserves the trajectory-first invariant. **Fact-only** (rejected on dim / event tables at config load). **Performance:** forces the scalar fact builder path (~3-10× slower than vectorized metric-only facts), so keep narrative on tables that genuinely need text. Bundled template `narrative_reviews`. See [Narrative source](./user-guide/narrative-source.md). | +| Narrative text source (trajectory-aware) | Per-archetype lexicons + a sentence template rendered into a `narrative` column on a fact table. Output vocabulary tracks the entity's trajectory position (a high-position `growth` entity produces systematically different text than a low-position `decline` entity); a simple bag-of-words classifier hits ≥0.55 accuracy on archetype prediction. Deterministic under seed; preserves the trajectory-first invariant. **Fact-only** (rejected on dim / event tables at config load). **Performance:** forces the scalar fact builder path (~3-10× slower than vectorized metric-only facts), so keep narrative on tables that genuinely need text. Demonstrated on bundled `hr`, `retail`, `banking`, `health`; `tests/configs/narrative_reviews.yaml` is the dedicated lexicon-design walkthrough. See [Narrative source](./user-guide/narrative-source.md). | ### 7. Audit + downstream-pipeline outputs @@ -132,12 +132,12 @@ is no longer byte-identical to a pre-flag run of the same file. |---|---| | SCD Type 2 | `dim_` expanded to N×versions with `valid_from_period` and band-crossing events surfaced in the manifest | | SCD Type 1 | default (no-op) | -| Fact-side CDC | `facts[].cdc: true` emits `_inserted_at` / `_updated_at` / `_op` audit columns; column-level quality issues flip `_op` to `"U"` on affected rows. 
Demonstrated in `cdc_demo` (dedicated) and `retail` (realistic POS purchase ledger). | +| Fact-side CDC | `facts[].cdc: true` emits `_inserted_at` / `_updated_at` / `_op` audit columns; column-level quality issues flip `_op` to `"U"` on affected rows. Demonstrated on bundled `saas` (revenue restatement), `marketing` (spend attribution), `retail` (purchase ledger), `banking` (loan disbursement), `health` (encounter chart amendment); `tests/configs/cdc_demo.yaml` is the dedicated minimal walkthrough. | | Holdout splits | `output.holdout: {fraction\|periods}` writes `{table}_train.` + `{table}_holdout.` instead of one file per fact, split by period index | | Denormalization | `output.denormalized: true` joins each fact with its FK'd dims (SCD2 current-only, audit columns excluded, dim columns prefixed `__`); emits `_wide.{csv\|parquet}` alongside normalized output for 1NF–3NF decomposition exercises. Demonstrated in `saas`. | -| Log-file writer | Event tables with `log_format: "{ts} ... "` + `log_filename: "..."` emit a structured `.log` file alongside the CSV/Parquet event table. Format string is `template.format(**row.to_dict())` per row; unknown placeholders raise. Demonstrated in `saas` (`evt_login` as syslog-flavoured lines). | -| Multi-source / overlap | `multi_source:` block emits per-source dim copies with controlled drift (casing / abbreviation / swap) and per-source key schemes; `source_entity_mappings` ground truth in the manifest. Demonstrated in `crm_billing_overlap` (CRM + billing dual-source, 40 mapping records). | -| Nested / JSON columns | `dtype: struct` (with `nested_schema`) or `dtype: array` (with `array_element_type`) paired with `source: nested` on dim columns. Parquet preserves native nested schema (`pa.struct(...)`); CSV serializes as JSON string. Dim-only, one level of nesting, primitive leaves in V1. Demonstrated in `retail` (`dim_product_category.catalog_metadata`). | +| Log-file writer | Event tables with `log_format: "{ts} ... 
"` + `log_filename: "..."` emit a structured `.log` file alongside the CSV/Parquet event table. Format string is `template.format(**row.to_dict())` per row; unknown placeholders raise. `tests/configs/saas_template.yaml` (`evt_login` as syslog-flavoured lines) is the worked example. | +| Multi-source / overlap | `multi_source:` block emits per-source dim copies with controlled drift (casing / abbreviation / swap) and per-source key schemes; `source_entity_mappings` ground truth in the manifest. `tests/configs/crm_billing_overlap.yaml` is the worked example (CRM + billing dual-source, 40 mapping records). | +| Nested / JSON columns | `dtype: struct` (with `nested_schema`) or `dtype: array` (with `array_element_type`) paired with `source: nested` on dim columns. Parquet preserves native nested schema (`pa.struct(...)`); CSV serializes as JSON string. Dim-only, one level of nesting, primitive leaves in V1. `tests/configs/retail_template.yaml` (`dim_product_category.catalog_metadata`) is the worked example. | ### 8. Validation, manifest, and provenance (advanced) @@ -223,7 +223,7 @@ convenience shapes: - `window=("2024-01", "2024-12", "monthly")` shorthand. Templates: `plotsim.list_templates()` → -`["ab_trial", "bare_minimum", "cdc_demo", "crm_billing_overlap", "education", "geo_retail", "hr", "lakehouse", "latency_skew", "marketing", "narrative_reviews", "retail", "saas"]`. +`["banking", "health", "hr", "marketing", "retail", "saas"]`. `plotsim.load_template("saas")` returns a `PlotsimConfig` ready to mutate or pass to `generate_tables`. 
diff --git a/docs/site/tutorial-notebooks/getting_started.ipynb b/docs/site/tutorial-notebooks/getting_started.ipynb index ef99aae..addf459 100644 --- a/docs/site/tutorial-notebooks/getting_started.ipynb +++ b/docs/site/tutorial-notebooks/getting_started.ipynb @@ -165,13 +165,7 @@ "cell_type": "markdown", "id": "7cdc8c89c7104fffa095e18ddfef8986", "metadata": {}, - "source": [ - "## Where to next\n", - "\n", - "- **Bundled templates** \\u2014 `from plotsim import create_from_yaml; create_from_yaml(\"plotsim/configs/templates/saas_template.yaml\")` for a richer starting point. Thirteen bundled templates ship in `plotsim/configs/templates/`: see `plotsim.list_templates()` for the current list (saas / hr / retail / education / marketing as domain flavors; ab_trial / cdc_demo / crm_billing_overlap / geo_retail / lakehouse / latency_skew / narrative_reviews / bare_minimum as single-feature templates).\n", - "- **Config fields** \\u2014 [`docs/site/config-reference.md`](https://github.com/mohossam01/plotsim/blob/main/docs/site/config-reference.md) catalogs every config field; companion docs for [column types](https://github.com/mohossam01/plotsim/blob/main/docs/site/column-types.md) and the [archetype DSL](https://github.com/mohossam01/plotsim/blob/main/docs/site/user-guide/archetypes.md).\n", - "- **CLI** \\u2014 the same generate-then-validate flow runs from the command line: `plotsim run config.yaml -o ./output --validate`." - ] + "source": "## Where to next\n\n- **Bundled templates** — `import plotsim; plotsim.load_template(\"saas\")` for a richer starting point. Six bundled domain templates ship in `plotsim/configs/templates/`: `banking`, `health`, `hr`, `marketing`, `retail`, `saas`. 
Run `plotsim.list_templates()` for the current list, or `plotsim list-templates` from the CLI.\n- **Config fields** — [`docs/site/config-reference.md`](https://github.com/mohossam01/plotsim/blob/main/docs/site/config-reference.md) catalogs every config field; companion docs for [column types](https://github.com/mohossam01/plotsim/blob/main/docs/site/column-types.md) and the [archetype DSL](https://github.com/mohossam01/plotsim/blob/main/docs/site/user-guide/archetypes.md).\n- **CLI** — the same generate-then-validate flow runs from the command line: `plotsim run config.yaml -o ./output --validate`." } ], "metadata": { diff --git a/docs/site/user-guide/cdc-facts.md b/docs/site/user-guide/cdc-facts.md index 10ed972..38d0824 100644 --- a/docs/site/user-guide/cdc-facts.md +++ b/docs/site/user-guide/cdc-facts.md @@ -62,14 +62,18 @@ existing user-declared columns stay where they were. ) ``` -The bundled `cdc_demo` template runs end-to-end: +A worked CDC example lives at `tests/configs/cdc_demo.yaml`. Run it +end-to-end from the CLI: ```bash -plotsim template cdc_demo -o cdc_demo.yaml -plotsim run cdc_demo.yaml -o ./cdc_demo_output +plotsim run tests/configs/cdc_demo.yaml -o ./cdc_demo_output head ./cdc_demo_output/fct_billing.csv ``` +CDC is also wired on the `saas`, `marketing`, `retail`, `banking`, and +`health` bundled domain templates — `plotsim run <template>` produces a +fact table with `_inserted_at` / `_updated_at` / `_op` columns. 
+ --- ## Combining CDC with quality issues diff --git a/docs/site/user-guide/experiments-and-cohorts.md b/docs/site/user-guide/experiments-and-cohorts.md index b0198e5..4d8004a 100644 --- a/docs/site/user-guide/experiments-and-cohorts.md +++ b/docs/site/user-guide/experiments-and-cohorts.md @@ -289,9 +289,9 @@ Rejected at config load: --- -## Bundled template +## Worked example -`plotsim run ab_trial` produces a SaaS trial-conversion A/B test +`tests/configs/ab_trial.yaml` is a SaaS trial-conversion A/B test dataset that exercises all three features together: a legacy cohort present from period 0, organic trial signups arriving via a back-loaded linear ramp, paid-ad trial signups arriving in two step @@ -302,12 +302,13 @@ analysis can recover the configured effect from the generated data via difference-in-means. ```bash -plotsim run ab_trial --output ./datasets/ab_trial +plotsim run tests/configs/ab_trial.yaml --output ./datasets/ab_trial ``` -The template's source (`ab_trial.yaml` + `ab_trial.py`) is the -recommended starting point for adapting plotsim to your own A/B test -scenarios. +The paired `ab_trial.py` in the same directory is the recommended +starting point for adapting plotsim to your own A/B test scenarios. +Per-metric treatment also ships on the bundled `marketing` and +`health` domain templates if you want a turnkey example. --- diff --git a/docs/site/user-guide/geo-hierarchy.md b/docs/site/user-guide/geo-hierarchy.md index f04d165..ef624ee 100644 --- a/docs/site/user-guide/geo-hierarchy.md +++ b/docs/site/user-guide/geo-hierarchy.md @@ -139,15 +139,18 @@ need to agree, because they answer different questions ("a plausible-looking city name" vs "a real city we have lat/lng for"). -## Bundled template +## Worked example -`plotsim run geo_retail` generates a 40-store retail chain with -the full geo hierarchy. The template is paired: +`tests/configs/geo_retail.yaml` generates a 40-store retail chain +with the full geo hierarchy. 
The config is paired: -- `plotsim/configs/templates/geo_retail.py` — builder surface -- `plotsim/configs/templates/geo_retail.yaml` — engine surface +- `tests/configs/geo_retail.py` — Python builder surface +- `tests/configs/geo_retail.yaml` — YAML form -Both produce identical tables given the same seed. +Both produce identical tables given the same seed. The geo bundle +itself is exercised by the bundled `retail`, `banking`, and +`health` domain templates — each puts a coherent country / region / +city onto its customer or patient dim. ## Reference dataset diff --git a/docs/site/user-guide/metrics-and-connections.md b/docs/site/user-guide/metrics-and-connections.md index 9830697..6b01a40 100644 --- a/docs/site/user-guide/metrics-and-connections.md +++ b/docs/site/user-guide/metrics-and-connections.md @@ -161,8 +161,8 @@ create(metrics=[{ }]) ``` -The bundled `latency_skew` template (`plotsim template latency_skew`) -showcases all six families on a single config. +The `tests/configs/latency_skew.yaml` worked example showcases all +six families on a single config. --- diff --git a/docs/site/user-guide/multi-source.md b/docs/site/user-guide/multi-source.md index 360798f..782b814 100644 --- a/docs/site/user-guide/multi-source.md +++ b/docs/site/user-guide/multi-source.md @@ -121,23 +121,24 @@ manifest.json # carries source_entity_mappings ``` YAML form mirrors the keyword form 1:1; see -`plotsim/configs/templates/crm_billing_overlap.yaml` for the bundled -template. +`tests/configs/crm_billing_overlap.yaml` for the worked example +config (multi-source is exercised by the test suite rather than a +bundled domain template). ## Entity resolution walkthrough -The canonical use case is teaching record linkage. Load the bundled -template, generate, then write a notebook that joins `dim_company_crm` -to `dim_company_billing` and scores its predictions against the -manifest's `source_entity_mappings`: +The canonical use case is teaching record linkage. 
Load the worked +example config, generate, then write a notebook that joins +`dim_company_crm` to `dim_company_billing` and scores its predictions +against the manifest's `source_entity_mappings`: ```python import json import pandas as pd -import plotsim +from plotsim import create_from_yaml, generate_tables -cfg = plotsim.load_template("crm_billing_overlap") -tables = plotsim.generate_tables(cfg) +cfg = create_from_yaml("tests/configs/crm_billing_overlap.yaml") +tables = generate_tables(cfg) crm = tables["dim_company_crm"] billing = tables["dim_company_billing"] diff --git a/docs/site/user-guide/narrative-source.md b/docs/site/user-guide/narrative-source.md index f6b83ee..c563173 100644 --- a/docs/site/user-guide/narrative-source.md +++ b/docs/site/user-guide/narrative-source.md @@ -132,14 +132,18 @@ tables don't have the per-row trajectory plumbing wired. ) ``` -The bundled `narrative_reviews` template runs end-to-end: +A worked narrative example lives at `tests/configs/narrative_reviews.yaml`. +Run it end-to-end: ```bash -plotsim template narrative_reviews -o narrative_reviews.yaml -plotsim run narrative_reviews.yaml -o ./narrative_out +plotsim run tests/configs/narrative_reviews.yaml -o ./narrative_out head ./narrative_out/fct_reviews.csv ``` +The `hr`, `retail`, `banking`, and `health` bundled domain templates +each carry a narrative column with trajectory-aligned domain language +(review notes, customer reviews, loan officer notes, encounter notes). + --- ## Lexicon design — the signal a classifier learns @@ -183,7 +187,7 @@ dual-use signal: shifted, and enough vocabulary that bag-of-words classifiers learn token distributions rather than memorizing per-row strings. -The bundled `narrative_reviews` template hits ≥ 0.55 bag-of-words +The `tests/configs/narrative_reviews.yaml` walkthrough config hits ≥ 0.55 bag-of-words classification accuracy on a held-out entity split (chance is 1/3 for three segments) while keeping sentiment recoverable per band. 
Lexicons that ship below the segment-classification threshold either have diff --git a/plotsim/configs/templates/banking.yaml b/plotsim/configs/templates/banking.yaml new file mode 100644 index 0000000..b470861 --- /dev/null +++ b/plotsim/configs/templates/banking.yaml @@ -0,0 +1,321 @@ +# ═══════════════════════════════════════════════════════ +# plotsim — Retail banking and credit risk +# ═══════════════════════════════════════════════════════ +# +# A digital bank's operational and risk warehouse. Customers +# open accounts, apply for loans, make transactions, and are +# monitored for default. Credit score band changes over time +# (SCD2). Loan applications produce child document records +# (parent/child). Accounts hold multiple products via an M:N +# bridge. Disbursement records get CDC audit. Student-t noise +# reflects heavy-tailed transaction amounts. A holdout split +# reserves the last quarter for credit-scoring validation. + +about: "Retail banking — accounts, loans, transactions, credit risk" +unit: customer + +seed: 51231 +noise: + gaussian_sigma: 0.05 + outlier_rate: 0.01 + mcar_rate: 0.0 + noise_family: student_t + degrees_of_freedom: 4.0 + +window: + start: 2023-01 + end: 2024-12 + every: monthly + + +# ── seasonality ───────────────────────────────────────── +# Holiday spend Nov-Dec, tax-refund surge Mar-Apr, summer dip Jun-Jul (end-of-month bill cycles fall below the monthly grain and are not modeled).
+seasonality: + - { months: [11, 12], strength: 0.30 } + - { months: [3, 4], strength: 0.20 } + - { months: [6, 7], strength: -0.10 } + + +# ── what we measure ───────────────────────────────────── + +metrics: + - { name: account_balance, label: Account balance, type: amount, polarity: positive, range: [0, 250000] } + - { name: transaction_volume, label: Transactions per period, type: count, polarity: positive } + - { name: credit_utilization, label: Credit utilization rate, type: score, polarity: negative } + - { name: payment_on_time, label: On-time payment ratio, type: score, polarity: positive } + - { name: delinquency_risk, label: Delinquency risk score, type: score, polarity: negative, follows: credit_utilization, delay: 2 } + - { name: loan_volume, label: New loan applications, type: count, polarity: positive } + - { name: default_risk, label: Loan default risk, type: score, polarity: negative } + + +connections: + - delinquency_risk related credit_utilization + - payment_on_time opposes delinquency_risk + - "default_risk 0.55 delinquency_risk" + - "account_balance 0.40 payment_on_time" + - transaction_volume related account_balance + + +# ── who we're simulating ──────────────────────────────── + +segments: + + - name: prime_borrower + count: 22 + archetype: flat + label: "Stable low-risk borrowers" + attributes: + account_type: [checking, savings, credit_card, mortgage] + employment_status: [employed_full_time, self_employed] + income_band: [80k_120k, 120k_200k, 200k_plus] + product_category: [checking, savings, mortgage, credit_card] + baseline: + account_balance: high + payment_on_time: high + default_risk: low + + - name: subprime_improving + count: 18 + archetype: decline + label: "Subprime customers with declining risk over the window" + attributes: + account_type: [checking, credit_card] + employment_status: [employed_full_time, employed_part_time, contract] + income_band: [under_40k, 40k_80k] + product_category: [checking, credit_card, 
personal_loan] + baseline: + credit_utilization: high + default_risk: high + + - name: mass_market + count: 24 + archetype: flat + label: "Stable mid-market accounts" + attributes: + account_type: [checking, savings, credit_card] + employment_status: [employed_full_time, self_employed, employed_part_time] + income_band: [40k_80k, 80k_120k] + product_category: [checking, savings, credit_card, auto_loan] + baseline: + account_balance: mid + transaction_volume: mid + + - name: deteriorating + count: 12 + archetype: flat > growth > spike_then_crash @ 8 @ 16 + label: "Deteriorating credit — risk rising into default" + attributes: + account_type: [credit_card, personal_loan] + employment_status: [employed_part_time, contract, unemployed] + income_band: [under_40k, 40k_80k] + product_category: [credit_card, personal_loan] + baseline: + credit_utilization: high + delinquency_risk: high + + - name: hnw + count: 8 + archetype: accelerating + label: "High-net-worth growing balances and product depth" + attributes: + account_type: [savings, mortgage, brokerage] + employment_status: [employed_full_time, self_employed] + income_band: [200k_plus] + product_category: [checking, savings, mortgage, brokerage, credit_card] + baseline: + account_balance: high + transaction_volume: high + + - name: new_customer + count: 14 + archetype: flat > growth @ 5 + label: "Newly onboarded, building credit history" + attributes: + account_type: [checking, savings, credit_card] + employment_status: [employed_full_time, employed_part_time] + income_band: [40k_80k, 80k_120k] + product_category: [checking, savings, credit_card] + baseline: + account_balance: low + payment_on_time: mid + # Treatment cohort: half the new customers get an immediate + # credit line increase at period 6 to measure spend response. 
+ treatment: + fraction: 0.5 + lift_log_odds: 0.4 + start_period: 6 + treatment_label: "credit_line_increase" + control_label: "standard_credit_line" + + +# ── lifecycle funnel ──────────────────────────────────── +lifecycle: + track: default_risk + stages: + - performing: 0.0 + - watch: 0.3 + - past_due: 0.55 + - default: 0.8 + + +# ── schema ────────────────────────────────────────────── + +dimensions: + - name: dim_date + per: period + columns: + - {name: date_key, type: id} + - {name: date, type: date} + - {name: year, type: int} + - {name: month, type: int} + - {name: quarter, type: int} + + - name: dim_customer + per: unit + columns: + - {name: customer_id, type: id} + - {name: customer_name, type: faker.name} + - {name: customer_email, type: faker.email} + - {name: onboarding_year, type: faker.year} + - {name: cohort_size, type: segment.count} + - {name: employment_status, type: pool.employment_status} + - {name: income_band, type: pool.income_band} + - {name: account_type, type: pool.account_type} + - {name: branch_country, type: geo.country} + - {name: branch_country_code, type: geo.country_code} + - {name: branch_region, type: geo.region} + - {name: branch_city, type: geo.city} + - name: credit_score_band + type: scd + tracks: default_risk + tiers: [super_prime, prime, near_prime, subprime] + at: [0.25, 0.55, 0.8] + + - name: dim_product + reference: true + columns: + - {name: product_id, type: id} + - {name: product_name, type: "static.checking,savings,credit_card,personal_loan,auto_loan,mortgage,brokerage,heloc"} + - {name: product_class,type: "static.deposit,deposit,credit,credit,credit,credit,investment,credit"} + + - name: dim_merchant_category + reference: true + columns: + - {name: category_id, type: id} + - {name: category_name, type: "static.grocery,fuel,dining,travel,utilities,entertainment,healthcare,retail,subscription,cash_advance"} + + +facts: + + # Per-customer-per-period activity exposing loan_volume + risk metrics. 
+ - name: fct_account_activity + metrics: [account_balance, transaction_volume, credit_utilization, payment_on_time, delinquency_risk, default_risk, loan_volume] + columns: + - {name: date_key, type: ref.dim_date} + - {name: customer_id, type: ref.dim_customer} + - {name: account_balance, type: metric.account_balance} + - {name: transaction_volume, type: metric.transaction_volume} + - {name: credit_utilization, type: metric.credit_utilization} + - {name: payment_on_time, type: metric.payment_on_time} + - {name: delinquency_risk, type: metric.delinquency_risk} + - {name: default_risk, type: metric.default_risk} + - {name: loan_volume, type: metric.loan_volume} + - name: loan_officer_notes + type: narrative + template: "{stem} {assessment}. {action}" + lexicons: + prime_borrower: &nar_block + stem: + low: ["Some concerns about", "Watch closely on", "Risk indicators rising on"] + mid: ["Account performing as expected for", "Steady profile on", "Routine review for"] + high: ["Strong performance for", "Top-tier credit profile for", "Excellent payment history on"] + assessment: + low: ["recent utilization spikes", "missed payments in cycle", "behavior outside norms"] + mid: ["expected pattern", "consistent usage", "no anomalies"] + high: ["disciplined credit use", "low utilization", "consistent on-time payments"] + action: + low: ["Recommend credit line freeze.", "Flag for collections review.", "Reduce exposure."] + mid: ["Continue routine monitoring.", "Annual review next cycle.", "No action."] + high: ["Eligible for credit line increase.", "Offer premium products.", "Cross-sell opportunity."] + subprime_improving: { <<: *nar_block } + mass_market: { <<: *nar_block } + deteriorating: { <<: *nar_block } + hnw: { <<: *nar_block } + new_customer: { <<: *nar_block } + + # Variable-grain parent: one row per loan application. 
+ - name: fct_loan_applications + row_count_driver: loan_volume + row_count_scale: 1.0 + cdc: true # disbursement amendments during settlement + columns: + - {name: application_id, type: id} + - {name: customer_id, type: ref.dim_customer} + - {name: application_date,type: ref.dim_date} + - {name: loan_purpose, type: "static.mortgage,auto,education,personal,business,debt_consolidation,home_improvement,medical"} + - {name: requested_amount,type: range, range: [5000, 750000]} + - {name: interest_rate, type: range, range: [3.5, 24.9]} + + # Per-parent-row child: 1..3 supporting documents per application. + - name: fct_loan_documents + parent_table: fct_loan_applications + children_per_row: [1, 3] + columns: + - {name: document_id, type: id} + - {name: customer_id, type: ref.dim_customer} + - {name: application_date, type: ref.dim_date} + - {name: document_type, type: "static.income_proof,id_verification,bank_statement,tax_return,property_appraisal,collateral_doc"} + - {name: doc_status, type: "static.received,received,received,pending,verified"} + + +events: + + - name: evt_transaction + trigger: proportional + driver: transaction_volume + scale: 5.0 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: customer_id, type: ref.dim_customer} + - {name: category_id, type: ref.dim_merchant_category} + - {name: txn_amount, type: range, range: [1.0, 5000.0]} + - {name: event_ts, type: timestamp} + + - name: evt_default + trigger: threshold + metric: default_risk + above: 0.65 + for: 2 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: customer_id, type: ref.dim_customer} + - {name: severity, type: "static.cured,charged_off,bankruptcy,settled"} + - {name: voluntary, type: flag} + + +# ── bridges ───────────────────────────────────────────── +# Customers hold 1-5 products (M:N). 
+bridges: + - name: bridge_customer_product + left: dim_customer + right: dim_product + cardinality: [1, 5] + driver: account_balance + + +# ── holdout ───────────────────────────────────────────── +# Reserve the last quarter for credit-scoring validation. The +# engine emits _train and _holdout file pairs for every fact and +# event table; the manifest records the cutoff period. +holdout: + target: default_risk + periods: 3 + + +# Quality injection is mutually exclusive with holdout in this engine +# version. Banking opts for holdout (credit-scoring validation) since +# the ML target workflow is the higher-value teaching surface for this +# domain; quality coverage is exercised by hr / saas / marketing / +# retail / health. diff --git a/plotsim/configs/templates/banking_template.py b/plotsim/configs/templates/banking_template.py new file mode 100644 index 0000000..b3ca8fd --- /dev/null +++ b/plotsim/configs/templates/banking_template.py @@ -0,0 +1,393 @@ +"""Banking template — Python form. + +Mirror of ``banking.yaml``. Retail banking + credit risk with student-t +noise on transactions, SCD2 credit score band, parent/child loan +applications + documents, M:N customer × product bridge, CDC on loan +disbursements, geo bundle on customer branches, narrative loan-officer +notes, treatment cohort for credit-line increase, and a holdout split +for credit-scoring validation. 
+ +Run: + >>> from plotsim.configs.templates.banking_template import config + >>> from plotsim import generate_tables + >>> tables = generate_tables(config) +""" + +from plotsim import create + + +_NAR_BLOCK = { + "stem": { + "low": ["Some concerns about", "Watch closely on", "Risk indicators rising on"], + "mid": ["Account performing as expected for", "Steady profile on", "Routine review for"], + "high": [ + "Strong performance for", + "Top-tier credit profile for", + "Excellent payment history on", + ], + }, + "assessment": { + "low": ["recent utilization spikes", "missed payments in cycle", "behavior outside norms"], + "mid": ["expected pattern", "consistent usage", "no anomalies"], + "high": ["disciplined credit use", "low utilization", "consistent on-time payments"], + }, + "action": { + "low": [ + "Recommend credit line freeze.", + "Flag for collections review.", + "Reduce exposure.", + ], + "mid": ["Continue routine monitoring.", "Annual review next cycle.", "No action."], + "high": [ + "Eligible for credit line increase.", + "Offer premium products.", + "Cross-sell opportunity.", + ], + }, +} + + +config = create( + about="Retail banking — accounts, loans, transactions, credit risk", + unit="customer", + seed=51231, + noise={ + "gaussian_sigma": 0.05, + "outlier_rate": 0.01, + "mcar_rate": 0.0, + "noise_family": "student_t", + "degrees_of_freedom": 4.0, + }, + window=("2023-01", "2024-12", "monthly"), + seasonality=[ + {"months": [11, 12], "strength": 0.30}, + {"months": [3, 4], "strength": 0.20}, + {"months": [6, 7], "strength": -0.10}, + ], + metrics=[ + { + "name": "account_balance", + "label": "Account balance", + "type": "amount", + "polarity": "positive", + "range": [0, 250000], + }, + { + "name": "transaction_volume", + "label": "Transactions per period", + "type": "count", + "polarity": "positive", + }, + { + "name": "credit_utilization", + "label": "Credit utilization rate", + "type": "score", + "polarity": "negative", + }, + { + "name": 
"payment_on_time", + "label": "On-time payment ratio", + "type": "score", + "polarity": "positive", + }, + { + "name": "delinquency_risk", + "label": "Delinquency risk score", + "type": "score", + "polarity": "negative", + "follows": "credit_utilization", + "delay": 2, + }, + { + "name": "loan_volume", + "label": "New loan applications", + "type": "count", + "polarity": "positive", + }, + { + "name": "default_risk", + "label": "Loan default risk", + "type": "score", + "polarity": "negative", + }, + ], + connections=[ + "delinquency_risk related credit_utilization", + "payment_on_time opposes delinquency_risk", + "default_risk 0.55 delinquency_risk", + "account_balance 0.40 payment_on_time", + "transaction_volume related account_balance", + ], + segments=[ + { + "name": "prime_borrower", + "count": 22, + "archetype": "flat", + "label": "Stable low-risk borrowers", + "attributes": { + "account_type": ["checking", "savings", "credit_card", "mortgage"], + "employment_status": ["employed_full_time", "self_employed"], + "income_band": ["80k_120k", "120k_200k", "200k_plus"], + "product_category": ["checking", "savings", "mortgage", "credit_card"], + }, + "baseline": { + "account_balance": "high", + "payment_on_time": "high", + "default_risk": "low", + }, + }, + { + "name": "subprime_improving", + "count": 18, + "archetype": "decline", + "label": "Subprime customers with declining risk over the window", + "attributes": { + "account_type": ["checking", "credit_card"], + "employment_status": ["employed_full_time", "employed_part_time", "contract"], + "income_band": ["under_40k", "40k_80k"], + "product_category": ["checking", "credit_card", "personal_loan"], + }, + "baseline": {"credit_utilization": "high", "default_risk": "high"}, + }, + { + "name": "mass_market", + "count": 24, + "archetype": "flat", + "label": "Stable mid-market accounts", + "attributes": { + "account_type": ["checking", "savings", "credit_card"], + "employment_status": ["employed_full_time", 
"self_employed", "employed_part_time"], + "income_band": ["40k_80k", "80k_120k"], + "product_category": ["checking", "savings", "credit_card", "auto_loan"], + }, + "baseline": {"account_balance": "mid", "transaction_volume": "mid"}, + }, + { + "name": "deteriorating", + "count": 12, + "archetype": "flat > growth > spike_then_crash @ 8 @ 16", + "label": "Deteriorating credit — risk rising into default", + "attributes": { + "account_type": ["credit_card", "personal_loan"], + "employment_status": ["employed_part_time", "contract", "unemployed"], + "income_band": ["under_40k", "40k_80k"], + "product_category": ["credit_card", "personal_loan"], + }, + "baseline": {"credit_utilization": "high", "delinquency_risk": "high"}, + }, + { + "name": "hnw", + "count": 8, + "archetype": "accelerating", + "label": "High-net-worth growing balances and product depth", + "attributes": { + "account_type": ["savings", "mortgage", "brokerage"], + "employment_status": ["employed_full_time", "self_employed"], + "income_band": ["200k_plus"], + "product_category": ["checking", "savings", "mortgage", "brokerage", "credit_card"], + }, + "baseline": {"account_balance": "high", "transaction_volume": "high"}, + }, + { + "name": "new_customer", + "count": 14, + "archetype": "flat > growth @ 5", + "label": "Newly onboarded, building credit history", + "attributes": { + "account_type": ["checking", "savings", "credit_card"], + "employment_status": ["employed_full_time", "employed_part_time"], + "income_band": ["40k_80k", "80k_120k"], + "product_category": ["checking", "savings", "credit_card"], + }, + "baseline": {"account_balance": "low", "payment_on_time": "mid"}, + "treatment": { + "fraction": 0.5, + "lift_log_odds": 0.4, + "start_period": 6, + "treatment_label": "credit_line_increase", + "control_label": "standard_credit_line", + }, + }, + ], + lifecycle={ + "track": "default_risk", + "stages": [{"performing": 0.0}, {"watch": 0.3}, {"past_due": 0.55}, {"default": 0.8}], + }, + dimensions=[ + { + 
"name": "dim_date", + "per": "period", + "columns": [ + {"name": "date_key", "type": "id"}, + {"name": "date", "type": "date"}, + {"name": "year", "type": "int"}, + {"name": "month", "type": "int"}, + {"name": "quarter", "type": "int"}, + ], + }, + { + "name": "dim_customer", + "per": "unit", + "columns": [ + {"name": "customer_id", "type": "id"}, + {"name": "customer_name", "type": "faker.name"}, + {"name": "customer_email", "type": "faker.email"}, + {"name": "onboarding_year", "type": "faker.year"}, + {"name": "cohort_size", "type": "segment.count"}, + {"name": "employment_status", "type": "pool.employment_status"}, + {"name": "income_band", "type": "pool.income_band"}, + {"name": "account_type", "type": "pool.account_type"}, + {"name": "branch_country", "type": "geo.country"}, + {"name": "branch_country_code", "type": "geo.country_code"}, + {"name": "branch_region", "type": "geo.region"}, + {"name": "branch_city", "type": "geo.city"}, + { + "name": "credit_score_band", + "type": "scd", + "tracks": "default_risk", + "tiers": ["super_prime", "prime", "near_prime", "subprime"], + "at": [0.25, 0.55, 0.8], + }, + ], + }, + { + "name": "dim_product", + "reference": True, + "columns": [ + {"name": "product_id", "type": "id"}, + { + "name": "product_name", + "type": "static.checking,savings,credit_card,personal_loan,auto_loan,mortgage,brokerage,heloc", + }, + { + "name": "product_class", + "type": "static.deposit,deposit,credit,credit,credit,credit,investment,credit", + }, + ], + }, + { + "name": "dim_merchant_category", + "reference": True, + "columns": [ + {"name": "category_id", "type": "id"}, + { + "name": "category_name", + "type": "static.grocery,fuel,dining,travel,utilities,entertainment,healthcare,retail,subscription,cash_advance", + }, + ], + }, + ], + facts=[ + { + "name": "fct_account_activity", + "metrics": [ + "account_balance", + "transaction_volume", + "credit_utilization", + "payment_on_time", + "delinquency_risk", + "default_risk", + "loan_volume", + ], 
+ "columns": [ + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "account_balance", "type": "metric.account_balance"}, + {"name": "transaction_volume", "type": "metric.transaction_volume"}, + {"name": "credit_utilization", "type": "metric.credit_utilization"}, + {"name": "payment_on_time", "type": "metric.payment_on_time"}, + {"name": "delinquency_risk", "type": "metric.delinquency_risk"}, + {"name": "default_risk", "type": "metric.default_risk"}, + {"name": "loan_volume", "type": "metric.loan_volume"}, + { + "name": "loan_officer_notes", + "type": "narrative", + "template": "{stem} {assessment}. {action}", + "lexicons": { + "prime_borrower": _NAR_BLOCK, + "subprime_improving": _NAR_BLOCK, + "mass_market": _NAR_BLOCK, + "deteriorating": _NAR_BLOCK, + "hnw": _NAR_BLOCK, + "new_customer": _NAR_BLOCK, + }, + }, + ], + }, + { + "name": "fct_loan_applications", + "row_count_driver": "loan_volume", + "row_count_scale": 1.0, + "cdc": True, + "columns": [ + {"name": "application_id", "type": "id"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "application_date", "type": "ref.dim_date"}, + { + "name": "loan_purpose", + "type": "static.mortgage,auto,education,personal,business,debt_consolidation,home_improvement,medical", + }, + {"name": "requested_amount", "type": "range", "range": [5000, 750000]}, + {"name": "interest_rate", "type": "range", "range": [3.5, 24.9]}, + ], + }, + { + "name": "fct_loan_documents", + "parent_table": "fct_loan_applications", + "children_per_row": [1, 3], + "columns": [ + {"name": "document_id", "type": "id"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "application_date", "type": "ref.dim_date"}, + { + "name": "document_type", + "type": "static.income_proof,id_verification,bank_statement,tax_return,property_appraisal,collateral_doc", + }, + { + "name": "doc_status", + "type": "static.received,received,received,pending,verified", + }, + ], + 
}, + ], + events=[ + { + "name": "evt_transaction", + "trigger": "proportional", + "driver": "transaction_volume", + "scale": 5.0, + "columns": [ + {"name": "event_id", "type": "id"}, + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "category_id", "type": "ref.dim_merchant_category"}, + {"name": "txn_amount", "type": "range", "range": [1.0, 5000.0]}, + {"name": "event_ts", "type": "timestamp"}, + ], + }, + { + "name": "evt_default", + "trigger": "threshold", + "metric": "default_risk", + "above": 0.65, + "for": 2, + "columns": [ + {"name": "event_id", "type": "id"}, + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "severity", "type": "static.cured,charged_off,bankruptcy,settled"}, + {"name": "voluntary", "type": "flag"}, + ], + }, + ], + bridges=[ + { + "name": "bridge_customer_product", + "left": "dim_customer", + "right": "dim_product", + "cardinality": [1, 5], + "driver": "account_balance", + }, + ], + holdout={"target": "default_risk", "periods": 3}, +) diff --git a/plotsim/configs/templates/education_template.py b/plotsim/configs/templates/education_template.py deleted file mode 100644 index 14c8bfc..0000000 --- a/plotsim/configs/templates/education_template.py +++ /dev/null @@ -1,279 +0,0 @@ -"""University student academic performance — Python builder template. - -Mirror of ``education_template.yaml``. 
Demonstrates: - -* ``noise: realistic`` — closer to live LMS data than perfectly clean -* multi-effect ``seasonality`` — fall (Sep-Nov) + spring (Feb-Apr) - lifts with summer (Jun-Aug) and finals-week (Dec) dips -* SCD2 ``academic_standing`` tracking ``assignment_score`` -""" - -from plotsim import create - -config = create( - about="University student academic performance and engagement", - unit="student", - seed=31337, - noise="realistic", - window=("2023-01", "2024-12", "monthly"), - seasonality=[ - {"months": [9, 10, 11], "strength": 0.20}, - {"months": [2, 3, 4], "strength": 0.15}, - {"months": [6, 7, 8], "strength": -0.30}, - {"months": [12], "strength": -0.10}, - ], - metrics=[ - { - "name": "assignment_score", - "label": "Average assignment score", - "type": "amount", - "polarity": "positive", - "range": [0, 100], - }, - { - "name": "attendance_rate", - "label": "Class attendance rate", - "type": "score", - "polarity": "positive", - }, - { - "name": "study_hours", - "label": "Weekly study hours", - "type": "count", - "polarity": "positive", - }, - { - "name": "participation", - "label": "Class participation index", - "type": "score", - "polarity": "positive", - }, - { - "name": "dropout_risk", - "label": "Dropout risk score", - "type": "score", - "polarity": "negative", - }, - { - "name": "stress_level", - "label": "Reported stress index", - "type": "score", - "polarity": "negative", - "follows": "study_hours", - "delay": 1, - }, - ], - connections=[ - ("attendance_rate", "driven_by", "participation"), - ("assignment_score", "related", "attendance_rate"), - ("participation", "opposes", "dropout_risk"), - ("assignment_score", "resists", "dropout_risk"), - ("stress_level", "hints_at", "dropout_risk"), - ], - segments=[ - { - "name": "high_achievers", - "count": 25, - "archetype": "growth", - "label": "Steady academic climb across both years", - "attributes": { - "program": ["computer_science", "engineering"], - "year": ["sophomore", "junior"], - }, - 
"baseline": { - "assignment_score": "high", - "attendance_rate": "high", - "dropout_risk": "low", - }, - }, - { - "name": "late_bloomers", - "count": 20, - "archetype": "flat > growth @ 8", - "label": "Struggled the first two terms, then found their footing", - "attributes": { - "program": ["biology", "mathematics", "history"], - "year": ["freshman", "sophomore"], - }, - "baseline": {"assignment_score": "mid", "participation": "mid"}, - }, - { - "name": "early_peakers", - "count": 15, - "archetype": "growth > decline @ 14", - "label": "Strong start, fade by senior year", - "attributes": {"program": ["business", "communications"], "year": ["junior", "senior"]}, - "baseline": {"assignment_score": "high", "attendance_rate": "mid"}, - }, - { - "name": "at_risk", - "count": 18, - "archetype": "decline", - "label": "Steady decline — at risk of dropout from term one", - "attributes": {"program": ["undeclared", "business"], "year": ["freshman"]}, - "baseline": { - "assignment_score": "low", - "attendance_rate": "low", - "dropout_risk": "high", - }, - }, - { - "name": "exam_burnout", - "count": 10, - "archetype": "growth > spike_then_crash > flat @ 8 @ 16", - "label": "Pushed hard before finals, crashed, never recovered", - "attributes": {"program": ["pre_med", "engineering"], "year": ["junior", "senior"]}, - "baseline": {"study_hours": "high", "stress_level": "high", "dropout_risk": "high"}, - }, - { - "name": "seasonal_engagement", - "count": 12, - "archetype": "seasonal", - "label": "Cyclical engagement — strong terms, weak summers", - "attributes": { - "program": ["arts", "music", "history"], - "year": ["sophomore", "junior"], - }, - "baseline": {"participation": "mid", "attendance_rate": "mid"}, - }, - ], - lifecycle={ - "track": "dropout_risk", - "stages": [ - ("thriving", 0.0), - ("stable", 0.15), - ("struggling", 0.4), - ("critical", 0.7), - ], - }, - dimensions=[ - { - "name": "dim_date", - "per": "period", - "columns": [ - {"name": "date_key", "type": "id"}, - 
{"name": "date", "type": "date"}, - {"name": "year", "type": "int"}, - {"name": "month", "type": "int"}, - {"name": "quarter", "type": "int"}, - ], - }, - { - "name": "dim_student", - "per": "unit", - "columns": [ - {"name": "student_id", "type": "id"}, - {"name": "full_name", "type": "faker.name"}, - {"name": "enroll_year", "type": "faker.year"}, - {"name": "cohort_size", "type": "segment.count"}, - { - "name": "academic_standing", - "type": "scd", - "tracks": "assignment_score", - "tiers": ["probation", "good_standing", "deans_list"], - "at": [0.4, 0.8], - }, - ], - }, - { - "name": "dim_course", - "reference": True, - "columns": [ - {"name": "course_id", "type": "id"}, - { - "name": "course_name", - "type": "static.intro_cs,calculus,history,physics,literature,statistics", - }, - {"name": "credits", "type": "static.3,4,3,4,3,4"}, - { - "name": "department", - "type": "static.computing,math,humanities,sciences,humanities,math", - }, - ], - }, - { - "name": "dim_term", - "reference": True, - "columns": [ - {"name": "term_id", "type": "id"}, - {"name": "term_name", "type": "static.spring_2023,fall_2023,spring_2024,fall_2024"}, - {"name": "term_type", "type": "static.spring,fall,spring,fall"}, - ], - }, - ], - facts=[ - { - "name": "fct_grades", - "metrics": ["assignment_score", "participation"], - "columns": [ - {"name": "date_key", "type": "ref.dim_date"}, - {"name": "student_id", "type": "ref.dim_student"}, - {"name": "course_id", "type": "ref.dim_course"}, - {"name": "assignment_score", "type": "metric.assignment_score"}, - {"name": "participation", "type": "metric.participation"}, - {"name": "grade_band", "type": "bucket", "labels": ["F", "D", "C", "B", "A"]}, - ], - }, - { - "name": "fct_engagement", - "metrics": ["attendance_rate", "study_hours", "stress_level"], - "columns": [ - {"name": "date_key", "type": "ref.dim_date"}, - {"name": "student_id", "type": "ref.dim_student"}, - {"name": "term_id", "type": "ref.dim_term"}, - {"name": "attendance_rate", "type": 
"metric.attendance_rate"}, - {"name": "study_hours", "type": "metric.study_hours"}, - {"name": "stress_level", "type": "metric.stress_level"}, - ], - }, - { - "name": "fct_risk", - "metrics": ["dropout_risk"], - "columns": [ - {"name": "date_key", "type": "ref.dim_date"}, - {"name": "student_id", "type": "ref.dim_student"}, - {"name": "dropout_risk", "type": "metric.dropout_risk"}, - ], - }, - ], - events=[ - { - "name": "evt_office_hours", - "trigger": "proportional", - "driver": "participation", - "scale": 3.0, - "columns": [ - {"name": "event_id", "type": "id"}, - {"name": "date_key", "type": "ref.dim_date"}, - {"name": "student_id", "type": "ref.dim_student"}, - {"name": "event_ts", "type": "timestamp"}, - ], - }, - { - "name": "evt_dropout", - "trigger": "threshold", - "metric": "dropout_risk", - "above": 0.65, - "for_periods": 2, - "columns": [ - {"name": "event_id", "type": "id"}, - {"name": "date_key", "type": "ref.dim_date"}, - {"name": "student_id", "type": "ref.dim_student"}, - {"name": "reason", "type": "faker.sentence"}, - {"name": "dropout_flag", "type": "flag"}, - ], - }, - ], - # 0.6-M15: data-quality issues for Data Quality Testing (DE L25) - # and Data Cleaning (DE L15). Manifest records every injection so - # students can score detectors against ground truth. 
- quality=[ - { - "table": "fct_grades", - "issue": "null_injection", - "rate": 0.03, - "column": "assignment_score", - }, - {"table": "fct_engagement", "issue": "duplicate_rows", "rate": 0.02}, - ], -) diff --git a/plotsim/configs/templates/education_template.yaml b/plotsim/configs/templates/education_template.yaml deleted file mode 100644 index cf95b04..0000000 --- a/plotsim/configs/templates/education_template.yaml +++ /dev/null @@ -1,358 +0,0 @@ -# ═══════════════════════════════════════════════════════ -# plotsim — Education / student cohort analytics -# ═══════════════════════════════════════════════════════ - -about: "University student academic performance and engagement" -unit: student - -# Determinism, light realistic noise, default csv output. -seed: 31337 -noise: realistic - -window: - start: 2023-01 - end: 2024-12 - every: monthly - - -# ── academic-calendar seasonality ─────────────────────── -# -# Engagement / attendance ride the term cycle: lift in -# Sep-Nov and Feb-Apr, dip in Jun-Aug (summer) and Dec -# (finals week + winter break). Strengths are signed -# multipliers added to 1.0 at each named month before -# per-metric `seasonal_sensitivity` applies. 
- -seasonality: - - { months: [9, 10, 11], strength: 0.20 } - - { months: [2, 3, 4], strength: 0.15 } - - { months: [6, 7, 8], strength: -0.30 } - - { months: [12], strength: -0.10 } - - -# ── what we measure ───────────────────────────────────── - -metrics: - - - name: assignment_score - label: Average assignment score - type: amount - polarity: positive - range: [0, 100] - - - name: attendance_rate - label: Class attendance rate - type: score - polarity: positive - - - name: study_hours - label: Weekly study hours - type: count - polarity: positive - - - name: participation - label: Class participation index - type: score - polarity: positive - - - name: dropout_risk - label: Dropout risk score - type: score - polarity: negative - - - name: stress_level - label: Reported stress index - type: score - polarity: negative - follows: study_hours - delay: 1 - - -# ── how metrics connect ───────────────────────────────── - -connections: - - attendance_rate driven_by participation - - assignment_score related attendance_rate - - participation opposes dropout_risk - - assignment_score resists dropout_risk - - stress_level hints_at dropout_risk - - -# ── who we're simulating ──────────────────────────────── - -segments: - - - name: high_achievers - count: 25 - archetype: growth - label: "Steady academic climb across both years" - attributes: - program: [computer_science, engineering] - year: [sophomore, junior] - baseline: - assignment_score: high - attendance_rate: high - dropout_risk: low - - - name: late_bloomers - count: 20 - archetype: flat > growth @ 8 - label: "Struggled the first two terms, then found their footing" - attributes: - program: [biology, mathematics, history] - year: [freshman, sophomore] - baseline: - assignment_score: mid - participation: mid - - - name: early_peakers - count: 15 - archetype: growth > decline @ 14 - label: "Strong start, fade by senior year" - attributes: - program: [business, communications] - year: [junior, senior] - baseline: - 
assignment_score: high - attendance_rate: mid - - - name: at_risk - count: 18 - archetype: decline - label: "Steady decline — at risk of dropout from term one" - attributes: - program: [undeclared, business] - year: [freshman] - baseline: - assignment_score: low - attendance_rate: low - dropout_risk: high - - - name: exam_burnout - count: 10 - archetype: growth > spike_then_crash > flat @ 8 @ 16 - label: "Pushed hard before finals, crashed, never recovered" - attributes: - program: [pre_med, engineering] - year: [junior, senior] - baseline: - study_hours: high - stress_level: high - dropout_risk: high - - - name: seasonal_engagement - count: 12 - archetype: seasonal - label: "Cyclical engagement — strong terms, weak summers" - attributes: - program: [arts, music, history] - year: [sophomore, junior] - baseline: - participation: mid - attendance_rate: mid - - -# ── lifecycle funnel ──────────────────────────────────── - -lifecycle: - track: dropout_risk - stages: - - thriving: 0.0 - - stable: 0.15 - - struggling: 0.4 - - critical: 0.7 - - -# ── schema ────────────────────────────────────────────── - -dimensions: - - - name: dim_date - per: period - columns: - - {name: date_key, type: id} - - {name: date, type: date} - - {name: year, type: int} - - {name: month, type: int} - - {name: quarter, type: int} - - - name: dim_student - per: unit - columns: - - {name: student_id, type: id} - - {name: full_name, type: faker.name} - - {name: enroll_year, type: faker.year} - - {name: cohort_size, type: segment.count} - - name: academic_standing - type: scd - tracks: assignment_score - tiers: [probation, good_standing, deans_list] - at: [0.4, 0.8] - - - name: dim_course - reference: true - columns: - - {name: course_id, type: id} - - {name: course_name, type: "static.intro_cs,calculus,history,physics,literature,statistics"} - - {name: credits, type: "static.3,4,3,4,3,4"} - - {name: department, type: "static.computing,math,humanities,sciences,humanities,math"} - - - name: 
dim_term - reference: true - columns: - - {name: term_id, type: id} - - {name: term_name, type: "static.spring_2023,fall_2023,spring_2024,fall_2024"} - - {name: term_type, type: "static.spring,fall,spring,fall"} - -facts: - - - name: fct_grades - metrics: [assignment_score, participation] - columns: - - {name: date_key, type: ref.dim_date} - - {name: student_id, type: ref.dim_student} - - {name: course_id, type: ref.dim_course} - - {name: assignment_score, type: metric.assignment_score} - - {name: participation, type: metric.participation} - - name: grade_band - type: bucket - labels: [F, D, C, B, A] - - - name: fct_engagement - metrics: [attendance_rate, study_hours, stress_level] - columns: - - {name: date_key, type: ref.dim_date} - - {name: student_id, type: ref.dim_student} - - {name: term_id, type: ref.dim_term} - - {name: attendance_rate, type: metric.attendance_rate} - - {name: study_hours, type: metric.study_hours} - - {name: stress_level, type: metric.stress_level} - - - name: fct_risk - metrics: [dropout_risk] - columns: - - {name: date_key, type: ref.dim_date} - - {name: student_id, type: ref.dim_student} - - {name: dropout_risk, type: metric.dropout_risk} - -events: - - - name: evt_office_hours - trigger: proportional - driver: participation - scale: 3.0 - columns: - - {name: event_id, type: id} - - {name: date_key, type: ref.dim_date} - - {name: student_id, type: ref.dim_student} - - {name: event_ts, type: timestamp} - - - name: evt_dropout - trigger: threshold - metric: dropout_risk - above: 0.65 - for: 2 - columns: - - {name: event_id, type: id} - - {name: date_key, type: ref.dim_date} - - {name: student_id, type: ref.dim_student} - - {name: reason, type: faker.sentence} - - {name: dropout_flag, type: flag} - - -# 0.6-M15: data-quality issues for Data Quality Testing (DE L25) and -# Data Cleaning (DE L15). Manifest records every injection so students -# can score detectors against ground truth. 
-quality: - - { table: fct_grades, issue: null_injection, rate: 0.03, column: assignment_score } - - { table: fct_engagement, issue: duplicate_rows, rate: 0.02 } - - -# ═══════════════════════════════════════════════════════ -# Legend -# ═══════════════════════════════════════════════════════ -# -# ── unit ─────────────────────────────────────────────── -# company SaaS / B2B -# employee HR / workforce -# customer retail / e-commerce -# campaign marketing -# student academic cohorts -# -# ── every ────────────────────────────────────────────── -# monthly calendar months -# weekly calendar weeks -# daily calendar days -# -# ── metric type ──────────────────────────────────────── -# score bounded [0,1] — attendance, participation -# amount bounded business range (requires range) -# count integer event counts — study hours, sessions -# index signed centered metric (requires range) -# -# ── polarity ─────────────────────────────────────────── -# positive higher is better -# negative higher is worse -# -# ── archetype patterns ───────────────────────────────── -# growth smooth S-curve rise — high_achievers -# decline exponential fade — at_risk -# seasonal 2 oscillation cycles — term-based engagement -# flat low and constant — stalled cohorts -# spike_then_crash rapid rise, drop, low plateau — exam burnout -# accelerating compound growth — overachievers -# -# ── archetype composition ────────────────────────────── -# pattern > pattern sequential — first then second -# @ N transition at period N -# flat > growth @ 8 -# growth > spike_then_crash > flat @ 8 @ 16 -# -# ── connections ──────────────────────────────────────── -# mirrors +0.75 nearly the same signal -# driven_by +0.55 strong positive link -# related +0.40 moderate positive -# hints_at +0.20 weak positive -# independent 0.00 no relationship -# hints_against -0.20 weak inverse -# resists -0.40 moderate inverse -# opposes -0.55 strong inverse -# inverts -0.75 nearly mirror-opposite -# -# ── baseline 
─────────────────────────────────────────── -# high upper third of metric range -# mid midpoint (default if omitted) -# low lower third of metric range -# -# ── follows + delay ──────────────────────────────────── -# follows: metric this metric lags behind another -# delay: N by N periods -# -# ── range ────────────────────────────────────────────── -# [min, max] required for amount and index -# score defaults to [0, 1] -# count has no range -# -# ── count ────────────────────────────────────────────── -# 1 – 5000 students per segment -# -# ── schema types ─────────────────────────────────────── -# id primary key, auto-generated -# ref.{table} foreign key to dim table -# metric.{name} populated from named metric -# faker.{type} generated via faker (name, year, sentence) -# static.{value} fixed value or comma-list for fan-out -# segment.count cohort population size -# timestamp generated datetime within period -# flag boolean derived from event trigger -# bucket categorical label derived from trajectory -# scd slowly changing dimension (requires tracks/tiers/at) -# -# ── event triggers ───────────────────────────────────── -# proportional row count = driver metric × scale per period -# threshold fires once when metric crosses value for N periods -# -# ── dimension per ────────────────────────────────────── -# period one row per time period (dim_date) -# unit one row per entity (dim_student) -# reference: true static lookup (dim_course, dim_term) diff --git a/plotsim/configs/templates/health.yaml b/plotsim/configs/templates/health.yaml new file mode 100644 index 0000000..ec3c45e --- /dev/null +++ b/plotsim/configs/templates/health.yaml @@ -0,0 +1,327 @@ +# ═══════════════════════════════════════════════════════ +# plotsim — Clinical and patient analytics +# ═══════════════════════════════════════════════════════ +# +# A healthcare system's clinical data warehouse. Patients visit +# providers, receive diagnoses, undergo lab tests, and are +# monitored for readmission. 
Risk stratification changes over +# time (SCD2). Diagnoses accumulate as M:N (bridge). Encounters +# produce child lab results (parent/child); prescriptions +# reference encounters (cross-fact FK). A per-metric treatment +# cohort isolates a clinical intervention's lift on a specific +# vital. Holdout reserves the last quarter for readmission +# prediction. + +about: "Clinical and patient analytics — encounters, labs, prescriptions, outcomes" +unit: patient + +seed: 70021 +noise: + gaussian_sigma: 0.04 + outlier_rate: 0.005 + mcar_rate: 0.0 + scale_with_trajectory: true + +window: + start: 2023-01 + end: 2024-12 + every: monthly + + +# ── seasonality ───────────────────────────────────────── +# Flu / respiratory season Oct-Feb, summer dip in acute visits. +seasonality: + - { months: [10, 11, 12, 1, 2], strength: 0.20 } + - { months: [6, 7, 8], strength: -0.15 } + + +# ── what we measure ───────────────────────────────────── + +metrics: + - { name: encounter_volume, label: Encounters per period, type: count, polarity: positive } + - { name: bp_systolic, label: Systolic blood pressure, type: amount, polarity: negative, range: [90, 200] } + - { name: a1c, label: HbA1c percentage, type: amount, polarity: negative, range: [4.0, 14.0] } + - { name: medication_adherence,label: Medication adherence rate, type: score, polarity: positive } + - { name: clinical_engagement, label: Patient engagement, type: score, polarity: positive } + - { name: readmission_risk, label: 30-day readmission risk, type: score, polarity: negative, follows: bp_systolic, delay: 1 } + - { name: lab_volume, label: Lab tests ordered per period, type: count, polarity: positive } + + +connections: + - readmission_risk related bp_systolic + - medication_adherence opposes readmission_risk + - "a1c -0.40 medication_adherence" + - "clinical_engagement 0.45 medication_adherence" + - encounter_volume related lab_volume + + +# ── who we're simulating ──────────────────────────────── + +segments: + + - name: 
chronic_progressive + count: 18 + archetype: growth + label: "Chronic conditions with risk increasing steadily" + attributes: + insurance_type: [commercial, medicare] + diagnosis_category: [cardiovascular, endocrine] + provider_specialty: [cardiology, endocrinology, primary_care] + department: [outpatient, primary_care, specialty] + baseline: + bp_systolic: high + a1c: high + readmission_risk: high + + - name: recovering + count: 16 + archetype: decline + label: "Risk dropping post-intervention" + attributes: + insurance_type: [commercial, medicare, medicaid] + diagnosis_category: [cardiovascular, orthopedic, surgical] + provider_specialty: [cardiology, orthopedics, primary_care] + department: [inpatient, rehab, outpatient] + baseline: + readmission_risk: high + medication_adherence: mid + # Per-metric treatment: a clinical intervention at period 6 lifts + # medication_adherence specifically (not other vitals). Students + # recover the per-metric ATE against the manifest's + # treatment_assignments. 
+ treatment: + fraction: 0.5 + lift_log_odds: 0.6 + start_period: 6 + treatment_label: "intervention_arm" + control_label: "standard_care" + target_metric: medication_adherence + + - name: acute_episodic + count: 20 + archetype: seasonal + label: "Episodic acute visits — flu season cycles" + attributes: + insurance_type: [commercial, medicaid, self_pay] + diagnosis_category: [respiratory, infectious, acute] + provider_specialty: [primary_care, urgent_care, emergency] + department: [urgent_care, emergency, primary_care] + baseline: + encounter_volume: mid + lab_volume: mid + + - name: well_managed + count: 22 + archetype: flat + label: "Stable chronic patients with consistent management" + attributes: + insurance_type: [commercial, medicare] + diagnosis_category: [endocrine, cardiovascular] + provider_specialty: [primary_care, endocrinology, cardiology] + department: [primary_care, outpatient] + baseline: + medication_adherence: high + clinical_engagement: high + readmission_risk: low + + - name: pediatric_routine + count: 14 + archetype: flat + label: "Pediatric routine well-child visits" + attributes: + insurance_type: [commercial, medicaid, tricare] + diagnosis_category: [routine, immunization, developmental] + provider_specialty: [pediatrics, primary_care] + department: [primary_care, outpatient] + baseline: + clinical_engagement: high + readmission_risk: low + + - name: high_risk_post_surgical + count: 12 + archetype: flat > spike_then_crash > flat @ 4 @ 12 + label: "Post-surgical patients with peri-operative risk window" + attributes: + insurance_type: [commercial, medicare] + diagnosis_category: [surgical, cardiovascular] + provider_specialty: [surgery, cardiology, primary_care] + department: [inpatient, rehab, outpatient] + baseline: + readmission_risk: high + bp_systolic: high + + +# ── lifecycle funnel ──────────────────────────────────── +lifecycle: + track: readmission_risk + stages: + - well: 0.0 + - watch: 0.25 + - high_risk: 0.55 + - critical: 0.8 + + 
+# ── schema ────────────────────────────────────────────── + +dimensions: + - name: dim_date + per: period + columns: + - {name: date_key, type: id} + - {name: date, type: date} + - {name: year, type: int} + - {name: month, type: int} + - {name: quarter, type: int} + + - name: dim_patient + per: unit + columns: + - {name: patient_id, type: id} + - {name: patient_name, type: faker.name} + - {name: patient_email, type: faker.email} + - {name: birth_year, type: faker.year} + - {name: cohort_size, type: segment.count} + - {name: insurance_type, type: pool.insurance_type} + - {name: diagnosis_category, type: pool.diagnosis_category} + - {name: provider_specialty, type: pool.provider_specialty} + - {name: home_country, type: geo.country} + - {name: home_country_code, type: geo.country_code} + - {name: home_region, type: geo.region} + - {name: home_city, type: geo.city} + - name: risk_stratification + type: scd + tracks: readmission_risk + tiers: [low, moderate, high, critical] + at: [0.25, 0.55, 0.8] + + - name: dim_diagnosis + reference: true + columns: + - {name: diagnosis_id, type: id} + - {name: diagnosis_name, type: "static.hypertension,type2_diabetes,asthma,copd,depression,arthritis,coronary_disease,obesity"} + - {name: chronicity, type: "static.chronic,chronic,chronic,chronic,chronic,chronic,chronic,chronic"} + + - name: dim_medication + reference: true + columns: + - {name: medication_id, type: id} + - {name: medication_class, type: "static.analgesic,antibiotic,antihypertensive,statin,ssri,bronchodilator,antidiabetic,anticoagulant,nsaid,beta_blocker"} + + +facts: + + # Per-patient-per-period clinical activity exposing encounter_volume + # (drives encounters parent fact) + the vitals + risk metrics + the + # narrative clinical notes. 
+ - name: fct_clinical_activity + metrics: [encounter_volume, bp_systolic, a1c, medication_adherence, clinical_engagement, readmission_risk, lab_volume] + columns: + - {name: date_key, type: ref.dim_date} + - {name: patient_id, type: ref.dim_patient} + - {name: encounter_volume, type: metric.encounter_volume} + - {name: bp_systolic, type: metric.bp_systolic} + - {name: a1c, type: metric.a1c} + - {name: medication_adherence, type: metric.medication_adherence} + - {name: clinical_engagement, type: metric.clinical_engagement} + - {name: readmission_risk, type: metric.readmission_risk} + - {name: lab_volume, type: metric.lab_volume} + - name: encounter_notes + type: narrative + template: "{stem} {assessment}. {plan}" + lexicons: + chronic_progressive: &nar_block + stem: + low: ["Patient presented with concerning", "Worsening trend in", "Acute escalation in"] + mid: ["Patient stable on", "Routine follow-up for", "Monitoring continues for"] + high: ["Patient improving on", "Strong response to therapy for", "Excellent control of"] + assessment: + low: ["uncontrolled markers", "missed dose pattern", "lab abnormalities"] + mid: ["stable markers", "expected variability", "consistent with baseline"] + high: ["controlled markers", "improving labs", "treatment goals met"] + plan: + low: ["Adjusting medications and reassessing in two weeks.", "Initiating intensive monitoring protocol.", "Referring to specialist."] + mid: ["Continue current plan.", "Routine follow-up in three months.", "Maintain medication regimen."] + high: ["Reduce monitoring frequency.", "Eligible for step-down therapy.", "Annual follow-up."] + recovering: { <<: *nar_block } + acute_episodic: { <<: *nar_block } + well_managed: { <<: *nar_block } + pediatric_routine: { <<: *nar_block } + high_risk_post_surgical: { <<: *nar_block } + + # Variable-grain parent: one row per encounter. 
+ - name: fct_encounters + row_count_driver: encounter_volume + row_count_scale: 1.0 + cdc: true # chart amendments after coding/billing review + columns: + - {name: encounter_id, type: id} + - {name: patient_id, type: ref.dim_patient} + - {name: encounter_date, type: ref.dim_date} + - {name: admission_type, type: "static.outpatient,outpatient,outpatient,inpatient,emergency,urgent,scheduled,follow_up"} + - {name: visit_duration, type: range, range: [10, 240]} + + # Per-parent-row child: 1..4 lab orders per encounter. + - name: fct_lab_results + parent_table: fct_encounters + children_per_row: [1, 4] + columns: + - {name: lab_id, type: id} + - {name: patient_id, type: ref.dim_patient} + - {name: encounter_date, type: ref.dim_date} + - {name: panel, type: "static.cbc,cmp,lipid_panel,a1c,tsh,urinalysis,bnp,d_dimer"} + - {name: result_value, type: range, range: [0.5, 500.0]} + + # Cross-fact FK: prescriptions reference encounters. + - name: fct_prescriptions + row_count_driver: encounter_volume + row_count_scale: 0.6 + columns: + - {name: prescription_id, type: id} + - {name: encounter_id, type: ref.fct_encounters} + - {name: patient_id, type: ref.dim_patient} + - {name: prescribed_date, type: ref.dim_date} + - {name: medication_id, type: ref.dim_medication} + - {name: days_supply, type: range, range: [7, 90]} + + +events: + + - name: evt_lab_order + trigger: proportional + driver: lab_volume + scale: 3.0 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: patient_id, type: ref.dim_patient} + - {name: panel, type: "static.cbc,cmp,lipid_panel,a1c,tsh,urinalysis"} + - {name: event_ts, type: timestamp} + + - name: evt_readmission + trigger: proportional + driver: readmission_risk + scale: 1.5 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: patient_id, type: ref.dim_patient} + - {name: reason, type: faker.sentence} + - {name: severity, type: "static.observation,inpatient,icu"} + - 
{name: event_ts, type: timestamp} + + +# ── bridges ───────────────────────────────────────────── +# Patients accumulate 1-4 diagnoses (M:N). +bridges: + - name: bridge_patient_diagnosis + left: dim_patient + right: dim_diagnosis + cardinality: [1, 4] + driver: readmission_risk + + +# ── holdout ───────────────────────────────────────────── +# Reserve the last 3 periods for readmission prediction validation. +holdout: + target: readmission_risk + periods: 3 diff --git a/plotsim/configs/templates/health_template.py b/plotsim/configs/templates/health_template.py new file mode 100644 index 0000000..fffb3f8 --- /dev/null +++ b/plotsim/configs/templates/health_template.py @@ -0,0 +1,401 @@ +"""Health template — Python form. + +Mirror of ``health.yaml``. Clinical analytics with SCD2 risk +stratification, parent/child encounters + labs, cross-fact FK on +prescriptions, M:N patient × diagnosis bridge, geo bundle on +patients, narrative encounter notes, CDC on encounters, +per-metric treatment cohort (lift on medication_adherence +specifically), and a holdout split for readmission prediction. 
+ +Run: + >>> from plotsim.configs.templates.health_template import config + >>> from plotsim import generate_tables + >>> tables = generate_tables(config) +""" + +from plotsim import create + + +_NAR_BLOCK = { + "stem": { + "low": ["Patient presented with concerning", "Worsening trend in", "Acute escalation in"], + "mid": ["Patient stable on", "Routine follow-up for", "Monitoring continues for"], + "high": ["Patient improving on", "Strong response to therapy for", "Excellent control of"], + }, + "assessment": { + "low": ["uncontrolled markers", "missed dose pattern", "lab abnormalities"], + "mid": ["stable markers", "expected variability", "consistent with baseline"], + "high": ["controlled markers", "improving labs", "treatment goals met"], + }, + "plan": { + "low": [ + "Adjusting medications and reassessing in two weeks.", + "Initiating intensive monitoring protocol.", + "Referring to specialist.", + ], + "mid": [ + "Continue current plan.", + "Routine follow-up in three months.", + "Maintain medication regimen.", + ], + "high": [ + "Reduce monitoring frequency.", + "Eligible for step-down therapy.", + "Annual follow-up.", + ], + }, +} + + +config = create( + about="Clinical and patient analytics — encounters, labs, prescriptions, outcomes", + unit="patient", + seed=70021, + noise={ + "gaussian_sigma": 0.04, + "outlier_rate": 0.005, + "mcar_rate": 0.0, + "scale_with_trajectory": True, + }, + window=("2023-01", "2024-12", "monthly"), + seasonality=[ + {"months": [10, 11, 12, 1, 2], "strength": 0.20}, + {"months": [6, 7, 8], "strength": -0.15}, + ], + metrics=[ + { + "name": "encounter_volume", + "label": "Encounters per period", + "type": "count", + "polarity": "positive", + }, + { + "name": "bp_systolic", + "label": "Systolic blood pressure", + "type": "amount", + "polarity": "negative", + "range": [90, 200], + }, + { + "name": "a1c", + "label": "HbA1c percentage", + "type": "amount", + "polarity": "negative", + "range": [4.0, 14.0], + }, + { + "name": 
"medication_adherence", + "label": "Medication adherence rate", + "type": "score", + "polarity": "positive", + }, + { + "name": "clinical_engagement", + "label": "Patient engagement", + "type": "score", + "polarity": "positive", + }, + { + "name": "readmission_risk", + "label": "30-day readmission risk", + "type": "score", + "polarity": "negative", + "follows": "bp_systolic", + "delay": 1, + }, + { + "name": "lab_volume", + "label": "Lab tests ordered per period", + "type": "count", + "polarity": "positive", + }, + ], + connections=[ + "readmission_risk related bp_systolic", + "medication_adherence opposes readmission_risk", + "a1c -0.40 medication_adherence", + "clinical_engagement 0.45 medication_adherence", + "encounter_volume related lab_volume", + ], + segments=[ + { + "name": "chronic_progressive", + "count": 18, + "archetype": "growth", + "label": "Chronic conditions with risk increasing steadily", + "attributes": { + "insurance_type": ["commercial", "medicare"], + "diagnosis_category": ["cardiovascular", "endocrine"], + "provider_specialty": ["cardiology", "endocrinology", "primary_care"], + "department": ["outpatient", "primary_care", "specialty"], + }, + "baseline": {"bp_systolic": "high", "a1c": "high", "readmission_risk": "high"}, + }, + { + "name": "recovering", + "count": 16, + "archetype": "decline", + "label": "Risk dropping post-intervention", + "attributes": { + "insurance_type": ["commercial", "medicare", "medicaid"], + "diagnosis_category": ["cardiovascular", "orthopedic", "surgical"], + "provider_specialty": ["cardiology", "orthopedics", "primary_care"], + "department": ["inpatient", "rehab", "outpatient"], + }, + "baseline": {"readmission_risk": "high", "medication_adherence": "mid"}, + "treatment": { + "fraction": 0.5, + "lift_log_odds": 0.6, + "start_period": 6, + "treatment_label": "intervention_arm", + "control_label": "standard_care", + "target_metric": "medication_adherence", + }, + }, + { + "name": "acute_episodic", + "count": 20, + 
"archetype": "seasonal", + "label": "Episodic acute visits — flu season cycles", + "attributes": { + "insurance_type": ["commercial", "medicaid", "self_pay"], + "diagnosis_category": ["respiratory", "infectious", "acute"], + "provider_specialty": ["primary_care", "urgent_care", "emergency"], + "department": ["urgent_care", "emergency", "primary_care"], + }, + "baseline": {"encounter_volume": "mid", "lab_volume": "mid"}, + }, + { + "name": "well_managed", + "count": 22, + "archetype": "flat", + "label": "Stable chronic patients with consistent management", + "attributes": { + "insurance_type": ["commercial", "medicare"], + "diagnosis_category": ["endocrine", "cardiovascular"], + "provider_specialty": ["primary_care", "endocrinology", "cardiology"], + "department": ["primary_care", "outpatient"], + }, + "baseline": { + "medication_adherence": "high", + "clinical_engagement": "high", + "readmission_risk": "low", + }, + }, + { + "name": "pediatric_routine", + "count": 14, + "archetype": "flat", + "label": "Pediatric routine well-child visits", + "attributes": { + "insurance_type": ["commercial", "medicaid", "tricare"], + "diagnosis_category": ["routine", "immunization", "developmental"], + "provider_specialty": ["pediatrics", "primary_care"], + "department": ["primary_care", "outpatient"], + }, + "baseline": {"clinical_engagement": "high", "readmission_risk": "low"}, + }, + { + "name": "high_risk_post_surgical", + "count": 12, + "archetype": "flat > spike_then_crash > flat @ 4 @ 12", + "label": "Post-surgical patients with peri-operative risk window", + "attributes": { + "insurance_type": ["commercial", "medicare"], + "diagnosis_category": ["surgical", "cardiovascular"], + "provider_specialty": ["surgery", "cardiology", "primary_care"], + "department": ["inpatient", "rehab", "outpatient"], + }, + "baseline": {"readmission_risk": "high", "bp_systolic": "high"}, + }, + ], + lifecycle={ + "track": "readmission_risk", + "stages": [{"well": 0.0}, {"watch": 0.25}, 
{"high_risk": 0.55}, {"critical": 0.8}], + }, + dimensions=[ + { + "name": "dim_date", + "per": "period", + "columns": [ + {"name": "date_key", "type": "id"}, + {"name": "date", "type": "date"}, + {"name": "year", "type": "int"}, + {"name": "month", "type": "int"}, + {"name": "quarter", "type": "int"}, + ], + }, + { + "name": "dim_patient", + "per": "unit", + "columns": [ + {"name": "patient_id", "type": "id"}, + {"name": "patient_name", "type": "faker.name"}, + {"name": "patient_email", "type": "faker.email"}, + {"name": "birth_year", "type": "faker.year"}, + {"name": "cohort_size", "type": "segment.count"}, + {"name": "insurance_type", "type": "pool.insurance_type"}, + {"name": "diagnosis_category", "type": "pool.diagnosis_category"}, + {"name": "provider_specialty", "type": "pool.provider_specialty"}, + {"name": "home_country", "type": "geo.country"}, + {"name": "home_country_code", "type": "geo.country_code"}, + {"name": "home_region", "type": "geo.region"}, + {"name": "home_city", "type": "geo.city"}, + { + "name": "risk_stratification", + "type": "scd", + "tracks": "readmission_risk", + "tiers": ["low", "moderate", "high", "critical"], + "at": [0.25, 0.55, 0.8], + }, + ], + }, + { + "name": "dim_diagnosis", + "reference": True, + "columns": [ + {"name": "diagnosis_id", "type": "id"}, + { + "name": "diagnosis_name", + "type": "static.hypertension,type2_diabetes,asthma,copd,depression,arthritis,coronary_disease,obesity", + }, + { + "name": "chronicity", + "type": "static.chronic,chronic,chronic,chronic,chronic,chronic,chronic,chronic", + }, + ], + }, + { + "name": "dim_medication", + "reference": True, + "columns": [ + {"name": "medication_id", "type": "id"}, + { + "name": "medication_class", + "type": "static.analgesic,antibiotic,antihypertensive,statin,ssri,bronchodilator,antidiabetic,anticoagulant,nsaid,beta_blocker", + }, + ], + }, + ], + facts=[ + { + "name": "fct_clinical_activity", + "metrics": [ + "encounter_volume", + "bp_systolic", + "a1c", + 
"medication_adherence", + "clinical_engagement", + "readmission_risk", + "lab_volume", + ], + "columns": [ + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "patient_id", "type": "ref.dim_patient"}, + {"name": "encounter_volume", "type": "metric.encounter_volume"}, + {"name": "bp_systolic", "type": "metric.bp_systolic"}, + {"name": "a1c", "type": "metric.a1c"}, + {"name": "medication_adherence", "type": "metric.medication_adherence"}, + {"name": "clinical_engagement", "type": "metric.clinical_engagement"}, + {"name": "readmission_risk", "type": "metric.readmission_risk"}, + {"name": "lab_volume", "type": "metric.lab_volume"}, + { + "name": "encounter_notes", + "type": "narrative", + "template": "{stem} {assessment}. {plan}", + "lexicons": { + "chronic_progressive": _NAR_BLOCK, + "recovering": _NAR_BLOCK, + "acute_episodic": _NAR_BLOCK, + "well_managed": _NAR_BLOCK, + "pediatric_routine": _NAR_BLOCK, + "high_risk_post_surgical": _NAR_BLOCK, + }, + }, + ], + }, + { + "name": "fct_encounters", + "row_count_driver": "encounter_volume", + "row_count_scale": 1.0, + "cdc": True, + "columns": [ + {"name": "encounter_id", "type": "id"}, + {"name": "patient_id", "type": "ref.dim_patient"}, + {"name": "encounter_date", "type": "ref.dim_date"}, + { + "name": "admission_type", + "type": "static.outpatient,outpatient,outpatient,inpatient,emergency,urgent,scheduled,follow_up", + }, + {"name": "visit_duration", "type": "range", "range": [10, 240]}, + ], + }, + { + "name": "fct_lab_results", + "parent_table": "fct_encounters", + "children_per_row": [1, 4], + "columns": [ + {"name": "lab_id", "type": "id"}, + {"name": "patient_id", "type": "ref.dim_patient"}, + {"name": "encounter_date", "type": "ref.dim_date"}, + { + "name": "panel", + "type": "static.cbc,cmp,lipid_panel,a1c,tsh,urinalysis,bnp,d_dimer", + }, + {"name": "result_value", "type": "range", "range": [0.5, 500.0]}, + ], + }, + { + "name": "fct_prescriptions", + "row_count_driver": "encounter_volume", + 
"row_count_scale": 0.6, + "columns": [ + {"name": "prescription_id", "type": "id"}, + {"name": "encounter_id", "type": "ref.fct_encounters"}, + {"name": "patient_id", "type": "ref.dim_patient"}, + {"name": "prescribed_date", "type": "ref.dim_date"}, + {"name": "medication_id", "type": "ref.dim_medication"}, + {"name": "days_supply", "type": "range", "range": [7, 90]}, + ], + }, + ], + events=[ + { + "name": "evt_lab_order", + "trigger": "proportional", + "driver": "lab_volume", + "scale": 3.0, + "columns": [ + {"name": "event_id", "type": "id"}, + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "patient_id", "type": "ref.dim_patient"}, + {"name": "panel", "type": "static.cbc,cmp,lipid_panel,a1c,tsh,urinalysis"}, + {"name": "event_ts", "type": "timestamp"}, + ], + }, + { + "name": "evt_readmission", + "trigger": "proportional", + "driver": "readmission_risk", + "scale": 1.5, + "columns": [ + {"name": "event_id", "type": "id"}, + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "patient_id", "type": "ref.dim_patient"}, + {"name": "reason", "type": "faker.sentence"}, + {"name": "severity", "type": "static.observation,inpatient,icu"}, + {"name": "event_ts", "type": "timestamp"}, + ], + }, + ], + bridges=[ + { + "name": "bridge_patient_diagnosis", + "left": "dim_patient", + "right": "dim_diagnosis", + "cardinality": [1, 4], + "driver": "readmission_risk", + }, + ], + holdout={"target": "readmission_risk", "periods": 3}, +) diff --git a/plotsim/configs/templates/hr.yaml b/plotsim/configs/templates/hr.yaml new file mode 100644 index 0000000..c29ef95 --- /dev/null +++ b/plotsim/configs/templates/hr.yaml @@ -0,0 +1,302 @@ +# ═══════════════════════════════════════════════════════ +# plotsim — HR / workforce analytics +# ═══════════════════════════════════════════════════════ +# +# A workforce analytics warehouse for a mid-size multinational. 
+# Employees belong to departments, work out of regional offices, +# hold a job level that changes over time (SCD2), and are assigned +# to one or more projects (M:N bridge). Performance reviews carry +# trajectory-aligned narrative text; compensation changes get a +# CDC audit trail; review-season + fiscal-close + summer dips +# drive the seasonality channel. + +about: "HR talent, performance, compensation and attrition analytics" +unit: employee + +seed: 5150 +noise: slightly_messy + +window: + start: 2023-01 + end: 2024-12 + every: monthly + + +# ── what we measure ───────────────────────────────────── + +metrics: + - { name: performance_score, label: Quarterly performance rating, type: score, polarity: positive } + - { name: engagement, label: Pulse engagement index, type: score, polarity: positive } + - { name: training_hours, label: Training hours completed, type: count, polarity: positive } + - { name: absence_rate, label: Monthly absence rate, type: score, polarity: negative, follows: engagement, delay: 1 } + - { name: attrition_risk, label: Attrition risk score, type: score, polarity: negative } + - { name: compensation, label: Total monthly compensation, type: amount, polarity: positive, range: [4000, 25000] } + + +# ── how metrics connect ───────────────────────────────── +# Coefficients calibrated off internal workforce-analytics studies: +# pay tracks performance loosely (r=0.35), engagement follows performance +# (r=0.55), opposes attrition risk (r=-0.55); absence tracks attrition (r=0.40). +connections: + - engagement driven_by performance_score + - engagement opposes attrition_risk + - absence_rate related attrition_risk + - "compensation 0.35 performance_score" + + +# ── seasonality ───────────────────────────────────────── +# Mar-Apr: annual review cycle bumps performance signal. +# Nov-Dec: fiscal year close, training compresses. +# Jul-Aug: summer dip — vacations, lower engagement. 
+seasonality: + - { months: [3, 4], strength: 0.20 } + - { months: [11, 12], strength: 0.15 } + - { months: [7, 8], strength: -0.20 } + + +# ── who we're simulating ──────────────────────────────── + +segments: + + - name: new_hire_ramp + count: 18 + archetype: flat > growth @ 6 + label: "Onboarding ramp, then sigmoid into full productivity" + attributes: + department: [Engineering, Product] + role_family: [individual_contributor] + project_type: [build, research] + baseline: + performance_score: mid + engagement: mid + training_hours: high + + - name: top_performer + count: 22 + archetype: accelerating + label: "Compounding performance — promotion track" + attributes: + department: [Engineering, Product, Finance] + role_family: [individual_contributor, manager] + project_type: [build, growth, research] + baseline: + performance_score: high + compensation: high + + - name: core_team + count: 28 + archetype: flat + label: "Reliable senior contributors at sustained baseline" + attributes: + department: [Engineering, Sales, Marketing, Finance, Legal] + role_family: [individual_contributor, manager] + project_type: [maintain, build] + baseline: + performance_score: high + engagement: high + attrition_risk: low + + - name: disengaging + count: 16 + archetype: flat > decline @ 12 + label: "Coasted for a year, then quietly disengaged" + attributes: + department: [Operations, Sales, HR] + role_family: [individual_contributor] + project_type: [maintain] + baseline: + engagement: low + absence_rate: high + attrition_risk: high + + - name: burnout_cohort + count: 10 + archetype: growth > spike_then_crash > flat @ 6 @ 14 + label: "Rapid early ramp, peak around month 6, crashed by 14" + attributes: + department: [Engineering, Operations] + role_family: [manager] + project_type: [build, growth] + baseline: + performance_score: high + engagement: mid + attrition_risk: high + + - name: comeback + count: 10 + archetype: decline > flat > growth @ 6 @ 14 + label: "Stalled, recovered 
with new manager at month 14" + attributes: + department: [Sales, Marketing] + role_family: [individual_contributor] + project_type: [growth, maintain] + baseline: + performance_score: mid + engagement: mid + + +# ── lifecycle funnel ──────────────────────────────────── + +lifecycle: + track: attrition_risk + enforce_order: true + stages: + - new_hire: 0.0 + - established: 0.15 + - disengaging: 0.4 + - exited: 0.7 + + +# ── schema ────────────────────────────────────────────── + +dimensions: + + - name: dim_date + per: period + columns: + - {name: date_key, type: id} + - {name: date, type: date} + - {name: year, type: int} + - {name: month, type: int} + - {name: quarter, type: int} + + - name: dim_employee + per: unit + columns: + - {name: employee_id, type: id} + - {name: full_name, type: faker.name} + - {name: email, type: faker.email} + - {name: hire_year, type: faker.year} + - {name: cohort_size, type: segment.count} + - {name: department, type: pool.department} + - {name: role_family, type: pool.role_family} + - {name: pay_band, type: "static.B1,B2,B3,B4,B5,B6,B7"} + - {name: office_country, type: geo.country} + - {name: office_country_code, type: geo.country_code} + - {name: office_region, type: geo.region} + - {name: office_city, type: geo.city} + - name: job_level + type: scd + tracks: performance_score + tiers: [ic, senior, lead, principal] + at: [0.3, 0.6, 0.85] + + - name: dim_manager + reference: true + columns: + - {name: manager_id, type: id} + - {name: manager_name, type: faker.name} + - {name: span_size, type: "static.5,8,10,12,15"} + + - name: dim_project + reference: true + columns: + - {name: project_id, type: id} + - {name: project_name, type: faker.company} + - {name: project_type, type: "static.build,growth,research,maintain,build,growth"} + - {name: budget_band, type: "static.small,medium,medium,large,xlarge,large"} + +facts: + + - name: fct_performance + metrics: [performance_score, engagement, training_hours] + columns: + - {name: date_key, 
type: ref.dim_date} + - {name: employee_id, type: ref.dim_employee} + - {name: manager_id, type: ref.dim_manager} + - {name: performance_score, type: metric.performance_score} + - {name: engagement, type: metric.engagement} + - {name: training_hours, type: metric.training_hours} + - name: review_outcome + type: bucket + labels: [improvement_plan, meets, exceeds, top_talent] + - name: review_notes + type: narrative + template: "{stem} {assessment}. {action}" + lexicons: + new_hire_ramp: &nar_block + stem: + low: ["Ramp slower than expected for", "Still finding footing on", "Behind on onboarding tasks for"] + mid: ["On track in", "Settling into", "Building competence in"] + high: ["Exceeding ramp targets in", "Picking up momentum on", "Already contributing on"] + assessment: + low: ["limited delivery so far", "minimal independent ownership", "few completed milestones"] + mid: ["steady delivery", "clear baseline ownership", "expected milestone cadence"] + high: ["strong early delivery", "broad ownership for tenure", "ahead-of-plan milestones"] + action: + low: ["Pair with mentor for the next cycle.", "Re-scope to smaller deliverables.", "Add structured check-ins."] + mid: ["Continue current ramp plan.", "Maintain mentor cadence.", "Hold present trajectory."] + high: ["Stretch with cross-team scope.", "Promote to mid-IC scope early.", "Surface for visibility projects."] + top_performer: + <<: *nar_block + core_team: + <<: *nar_block + disengaging: + <<: *nar_block + burnout_cohort: + <<: *nar_block + comeback: + <<: *nar_block + + - name: fct_compensation + metrics: [compensation, attrition_risk] + cdc: true + columns: + - {name: date_key, type: ref.dim_date} + - {name: employee_id, type: ref.dim_employee} + - {name: compensation, type: metric.compensation} + - {name: attrition_risk, type: metric.attrition_risk} + - {name: bonus_target, type: range, range: [0, 0.4]} + + - name: fct_attendance + metrics: [absence_rate] + columns: + - {name: date_key, type: ref.dim_date} 
+ - {name: employee_id, type: ref.dim_employee} + - {name: absence_rate, type: metric.absence_rate} + +events: + + - name: evt_training_completion + trigger: proportional + driver: engagement + scale: 4.0 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: employee_id, type: ref.dim_employee} + - {name: course_name, type: faker.word} + - {name: event_ts, type: timestamp} + + - name: evt_attrition + trigger: threshold + metric: attrition_risk + above: 0.7 + for: 3 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: employee_id, type: ref.dim_employee} + - {name: reason, type: faker.sentence} + - {name: voluntary, type: flag} + + +# ── bridges ───────────────────────────────────────────── +# Employees are assigned to 1-4 projects (M:N). Bridge driver +# is performance_score — higher performers carry more concurrent +# project load. +bridges: + - name: bridge_employee_project + left: dim_employee + right: dim_project + cardinality: [1, 4] + driver: performance_score + + +# ── data quality ──────────────────────────────────────── +# Realistic HR data hygiene: incomplete pulse surveys, late +# attendance records, duplicate survey submissions. +quality: + - { table: fct_performance, issue: null_injection, rate: 0.04, column: engagement } + - { table: fct_attendance, issue: late_arrival, rate: 0.02 } + - { table: evt_training_completion, issue: duplicate_rows, rate: 0.01 } diff --git a/plotsim/configs/templates/hr_template.py b/plotsim/configs/templates/hr_template.py index afa01e4..02b5ab7 100644 --- a/plotsim/configs/templates/hr_template.py +++ b/plotsim/configs/templates/hr_template.py @@ -1,19 +1,61 @@ -"""HR / workforce analytics — Python builder template. +"""HR template — Python form. -Mirror of ``hr_template.yaml``. Demonstrates: +Mirror of ``hr.yaml``. 
A workforce analytics warehouse for a mid-size +multinational: employees, departments, geographic offices (geo bundle +on dim_employee), a manager reference dim, project assignments +(M:N bridge), CDC on the compensation fact, narrative review notes, +and three quality issues across the fact + event surface. -* ``noise: slightly_messy`` for realistic survey jitter -* custom-coefficient connection (``compensation 0.27 performance_score``) -* ``lifecycle.enforce_order=True`` — monotonic stage walk so an - employee who enters ``disengaging`` doesn't bounce back on a - transient pulse-survey blip -* SCD2 ``job_level`` tracking ``performance_score`` +Run: + >>> from plotsim.configs.templates.hr_template import config + >>> from plotsim import generate_tables + >>> tables = generate_tables(config) """ from plotsim import create + +_NAR_BLOCK = { + "stem": { + "low": [ + "Ramp slower than expected for", + "Still finding footing on", + "Behind on onboarding tasks for", + ], + "mid": ["On track in", "Settling into", "Building competence in"], + "high": ["Exceeding ramp targets in", "Picking up momentum on", "Already contributing on"], + }, + "assessment": { + "low": [ + "limited delivery so far", + "minimal independent ownership", + "few completed milestones", + ], + "mid": ["steady delivery", "clear baseline ownership", "expected milestone cadence"], + "high": ["strong early delivery", "broad ownership for tenure", "ahead-of-plan milestones"], + }, + "action": { + "low": [ + "Pair with mentor for the next cycle.", + "Re-scope to smaller deliverables.", + "Add structured check-ins.", + ], + "mid": [ + "Continue current ramp plan.", + "Maintain mentor cadence.", + "Hold present trajectory.", + ], + "high": [ + "Stretch with cross-team scope.", + "Promote to mid-IC scope early.", + "Surface for visibility projects.", + ], + }, +} + + config = create( - about="HR talent and attrition analytics", + about="HR talent, performance, compensation and attrition analytics", 
unit="employee", seed=5150, noise="slightly_messy", @@ -59,31 +101,51 @@ "range": [4000, 25000], }, ], - # Custom coefficient on the comp ↔ performance pair — calibrated - # off internal performance / pay-band correlation studies. connections=[ - ("engagement", "driven_by", "performance_score"), - ("engagement", "opposes", "attrition_risk"), - ("absence_rate", "related", "attrition_risk"), - ("compensation", 0.27, "performance_score"), + "engagement driven_by performance_score", + "engagement opposes attrition_risk", + "absence_rate related attrition_risk", + "compensation 0.35 performance_score", + ], + seasonality=[ + {"months": [3, 4], "strength": 0.20}, + {"months": [11, 12], "strength": 0.15}, + {"months": [7, 8], "strength": -0.20}, ], segments=[ { "name": "new_hire_ramp", - "count": 20, + "count": 18, "archetype": "flat > growth @ 6", "label": "Onboarding ramp, then sigmoid into full productivity", - "attributes": {"department": ["Engineering", "Product"], "level": ["IC1", "IC2"]}, + "attributes": { + "department": ["Engineering", "Product"], + "role_family": ["individual_contributor"], + "project_type": ["build", "research"], + }, "baseline": {"performance_score": "mid", "engagement": "mid", "training_hours": "high"}, }, + { + "name": "top_performer", + "count": 22, + "archetype": "accelerating", + "label": "Compounding performance — promotion track", + "attributes": { + "department": ["Engineering", "Product", "Finance"], + "role_family": ["individual_contributor", "manager"], + "project_type": ["build", "growth", "research"], + }, + "baseline": {"performance_score": "high", "compensation": "high"}, + }, { "name": "core_team", - "count": 30, + "count": 28, "archetype": "flat", - "label": "Reliable senior contributors at sustained-high baseline", + "label": "Reliable senior contributors at sustained baseline", "attributes": { - "department": ["Engineering", "Sales", "Operations"], - "level": ["senior", "lead"], + "department": ["Engineering", "Sales", 
"Marketing", "Finance", "Legal"], + "role_family": ["individual_contributor", "manager"], + "project_type": ["maintain", "build"], }, "baseline": { "performance_score": "high", @@ -92,32 +154,26 @@ }, }, { - "name": "fast_riser", - "count": 12, - "archetype": "accelerating", - "label": "Compounding performance — promotion track", - "attributes": {"department": ["Engineering", "Product"], "level": ["senior"]}, - "baseline": {"performance_score": "high", "compensation": "high"}, - }, - { - "name": "quiet_quitter", - "count": 15, - "archetype": "flat > decline @ 14", + "name": "disengaging", + "count": 16, + "archetype": "flat > decline @ 12", "label": "Coasted for a year, then quietly disengaged", "attributes": { - "department": ["Sales", "Operations"], - "level": ["IC1", "IC2", "senior"], + "department": ["Operations", "Sales", "HR"], + "role_family": ["individual_contributor"], + "project_type": ["maintain"], }, "baseline": {"engagement": "low", "absence_rate": "high", "attrition_risk": "high"}, }, { "name": "burnout_cohort", - "count": 8, + "count": 10, "archetype": "growth > spike_then_crash > flat @ 6 @ 14", "label": "Rapid early ramp, peak around month 6, crashed by 14", "attributes": { "department": ["Engineering", "Operations"], - "level": ["senior", "lead"], + "role_family": ["manager"], + "project_type": ["build", "growth"], }, "baseline": { "performance_score": "high", @@ -129,20 +185,23 @@ "name": "comeback", "count": 10, "archetype": "decline > flat > growth @ 6 @ 14", - "label": "Stalled, hit bottom at month 6, recovered with new manager at 14", - "attributes": {"department": ["Sales", "Product"], "level": ["senior"]}, + "label": "Stalled, recovered with new manager at month 14", + "attributes": { + "department": ["Sales", "Marketing"], + "role_family": ["individual_contributor"], + "project_type": ["growth", "maintain"], + }, "baseline": {"performance_score": "mid", "engagement": "mid"}, }, ], - # Monotonic stage walk + free-form re-entry suppressed. 
lifecycle={ "track": "attrition_risk", "enforce_order": True, "stages": [ - ("new_hire", 0.0), - ("established", 0.15), - ("disengaging", 0.4), - ("exited", 0.7), + {"new_hire": 0.0}, + {"established": 0.15}, + {"disengaging": 0.4}, + {"exited": 0.7}, ], }, dimensions=[ @@ -163,33 +222,45 @@ "columns": [ {"name": "employee_id", "type": "id"}, {"name": "full_name", "type": "faker.name"}, + {"name": "email", "type": "faker.email"}, {"name": "hire_year", "type": "faker.year"}, {"name": "cohort_size", "type": "segment.count"}, + {"name": "department", "type": "pool.department"}, + {"name": "role_family", "type": "pool.role_family"}, + {"name": "pay_band", "type": "static.B1,B2,B3,B4,B5,B6,B7"}, + {"name": "office_country", "type": "geo.country"}, + {"name": "office_country_code", "type": "geo.country_code"}, + {"name": "office_region", "type": "geo.region"}, + {"name": "office_city", "type": "geo.city"}, { "name": "job_level", "type": "scd", "tracks": "performance_score", - "tiers": ["ic", "senior", "lead"], - "at": [0.4, 0.75], + "tiers": ["ic", "senior", "lead", "principal"], + "at": [0.3, 0.6, 0.85], }, ], }, { - "name": "dim_department", + "name": "dim_manager", "reference": True, "columns": [ - {"name": "department_id", "type": "id"}, - {"name": "department", "type": "static.engineering,sales,product,operations"}, - {"name": "cost_center", "type": "static.RnD,GTM,RnD,GnA"}, + {"name": "manager_id", "type": "id"}, + {"name": "manager_name", "type": "faker.name"}, + {"name": "span_size", "type": "static.5,8,10,12,15"}, ], }, { - "name": "dim_office", + "name": "dim_project", "reference": True, "columns": [ - {"name": "office_id", "type": "id"}, - {"name": "office", "type": "static.austin,berlin,singapore,remote"}, - {"name": "region", "type": "static.AMER,EMEA,APAC,GLOBAL"}, + {"name": "project_id", "type": "id"}, + {"name": "project_name", "type": "faker.company"}, + { + "name": "project_type", + "type": "static.build,growth,research,maintain,build,growth", + }, + 
{"name": "budget_band", "type": "static.small,medium,medium,large,xlarge,large"}, ], }, ], @@ -200,7 +271,7 @@ "columns": [ {"name": "date_key", "type": "ref.dim_date"}, {"name": "employee_id", "type": "ref.dim_employee"}, - {"name": "department_id", "type": "ref.dim_department"}, + {"name": "manager_id", "type": "ref.dim_manager"}, {"name": "performance_score", "type": "metric.performance_score"}, {"name": "engagement", "type": "metric.engagement"}, {"name": "training_hours", "type": "metric.training_hours"}, @@ -209,17 +280,31 @@ "type": "bucket", "labels": ["improvement_plan", "meets", "exceeds", "top_talent"], }, + { + "name": "review_notes", + "type": "narrative", + "template": "{stem} {assessment}. {action}", + "lexicons": { + "new_hire_ramp": _NAR_BLOCK, + "top_performer": _NAR_BLOCK, + "core_team": _NAR_BLOCK, + "disengaging": _NAR_BLOCK, + "burnout_cohort": _NAR_BLOCK, + "comeback": _NAR_BLOCK, + }, + }, ], }, { "name": "fct_compensation", "metrics": ["compensation", "attrition_risk"], + "cdc": True, "columns": [ {"name": "date_key", "type": "ref.dim_date"}, {"name": "employee_id", "type": "ref.dim_employee"}, - {"name": "office_id", "type": "ref.dim_office"}, {"name": "compensation", "type": "metric.compensation"}, {"name": "attrition_risk", "type": "metric.attrition_risk"}, + {"name": "bonus_target", "type": "range", "range": [0, 0.4]}, ], }, { @@ -251,7 +336,7 @@ "trigger": "threshold", "metric": "attrition_risk", "above": 0.7, - "for_periods": 3, + "for": 3, "columns": [ {"name": "event_id", "type": "id"}, {"name": "date_key", "type": "ref.dim_date"}, @@ -261,9 +346,15 @@ ], }, ], - # 0.6-M15: data-quality issues for Data Quality Testing (DE L25) - # and Data Cleaning (DE L15). Manifest records every injection so - # students can score detectors against ground truth. 
+ bridges=[ + { + "name": "bridge_employee_project", + "left": "dim_employee", + "right": "dim_project", + "cardinality": [1, 4], + "driver": "performance_score", + }, + ], quality=[ { "table": "fct_performance", @@ -271,6 +362,7 @@ "rate": 0.04, "column": "engagement", }, - {"table": "evt_training_completion", "issue": "late_arrival", "rate": 0.02}, + {"table": "fct_attendance", "issue": "late_arrival", "rate": 0.02}, + {"table": "evt_training_completion", "issue": "duplicate_rows", "rate": 0.01}, ], ) diff --git a/plotsim/configs/templates/hr_template.yaml b/plotsim/configs/templates/hr_template.yaml deleted file mode 100644 index daac122..0000000 --- a/plotsim/configs/templates/hr_template.yaml +++ /dev/null @@ -1,352 +0,0 @@ -# ═══════════════════════════════════════════════════════ -# plotsim — HR / workforce analytics -# ═══════════════════════════════════════════════════════ - -about: "HR talent and attrition analytics" -unit: employee - -# Determinism + a touch of noise to mimic real survey jitter. 
-seed: 5150 -noise: slightly_messy - -window: - start: 2023-01 - end: 2024-12 - every: monthly - - -# ── what we measure ───────────────────────────────────── - -metrics: - - - name: performance_score - label: Quarterly performance rating - type: score - polarity: positive - - - name: engagement - label: Pulse engagement index - type: score - polarity: positive - - - name: training_hours - label: Training hours completed - type: count - polarity: positive - - - name: absence_rate - label: Monthly absence rate - type: score - polarity: negative - follows: engagement - delay: 1 - - - name: attrition_risk - label: Attrition risk score - type: score - polarity: negative - - - name: compensation - label: Total monthly compensation - type: amount - polarity: positive - range: [4000, 25000] - - -# ── how metrics connect ───────────────────────────────── - -# Custom coefficient on the comp ↔ performance pair — calibrated -# off internal performance / pay-band correlation studies. The -# numeric form is interchangeable with the 9-word vocabulary. 
-connections: - - engagement driven_by performance_score - - engagement opposes attrition_risk - - absence_rate related attrition_risk - - "compensation 0.27 performance_score" - - -# ── who we're simulating ──────────────────────────────── - -segments: - - - name: new_hire_ramp - count: 20 - archetype: flat > growth @ 6 - label: "Onboarding ramp, then sigmoid into full productivity" - attributes: - department: [Engineering, Product] - level: [IC1, IC2] - baseline: - performance_score: mid - engagement: mid - training_hours: high - - - name: core_team - count: 30 - archetype: flat - label: "Reliable senior contributors at sustained-high baseline" - attributes: - department: [Engineering, Sales, Operations] - level: [senior, lead] - baseline: - performance_score: high - engagement: high - attrition_risk: low - - - name: fast_riser - count: 12 - archetype: accelerating - label: "Compounding performance — promotion track" - attributes: - department: [Engineering, Product] - level: [senior] - baseline: - performance_score: high - compensation: high - - - name: quiet_quitter - count: 15 - archetype: flat > decline @ 14 - label: "Coasted for a year, then quietly disengaged" - attributes: - department: [Sales, Operations] - level: [IC1, IC2, senior] - baseline: - engagement: low - absence_rate: high - attrition_risk: high - - - name: burnout_cohort - count: 8 - archetype: growth > spike_then_crash > flat @ 6 @ 14 - label: "Rapid early ramp, peak around month 6, crashed by 14" - attributes: - department: [Engineering, Operations] - level: [senior, lead] - baseline: - performance_score: high - engagement: mid - attrition_risk: high - - - name: comeback - count: 10 - archetype: decline > flat > growth @ 6 @ 14 - label: "Stalled, hit bottom at month 6, recovered with new manager at 14" - attributes: - department: [Sales, Product] - level: [senior] - baseline: - performance_score: mid - engagement: mid - - -# ── lifecycle funnel ──────────────────────────────────── - -# 
enforce_order: true emits a monotonic stage walk — once an -# employee enters `disengaging`, they don't bounce back to -# `established` on a transient pulse-survey blip. Without this the -# default free-mode stages let entities re-enter earlier stages -# whenever the realised risk score dips below the threshold again. -lifecycle: - track: attrition_risk - enforce_order: true - stages: - - new_hire: 0.0 - - established: 0.15 - - disengaging: 0.4 - - exited: 0.7 - - -# ── schema ────────────────────────────────────────────── - -dimensions: - - - name: dim_date - per: period - columns: - - {name: date_key, type: id} - - {name: date, type: date} - - {name: year, type: int} - - {name: month, type: int} - - {name: quarter, type: int} - - - name: dim_employee - per: unit - columns: - - {name: employee_id, type: id} - - {name: full_name, type: faker.name} - - {name: hire_year, type: faker.year} - - {name: cohort_size, type: segment.count} - - name: job_level - type: scd - tracks: performance_score - tiers: [ic, senior, lead] - at: [0.4, 0.75] - - - name: dim_department - reference: true - columns: - - {name: department_id, type: id} - - {name: department, type: "static.engineering,sales,product,operations"} - - {name: cost_center, type: "static.RnD,GTM,RnD,GnA"} - - - name: dim_office - reference: true - columns: - - {name: office_id, type: id} - - {name: office, type: "static.austin,berlin,singapore,remote"} - - {name: region, type: "static.AMER,EMEA,APAC,GLOBAL"} - -facts: - - - name: fct_performance - metrics: [performance_score, engagement, training_hours] - columns: - - {name: date_key, type: ref.dim_date} - - {name: employee_id, type: ref.dim_employee} - - {name: department_id, type: ref.dim_department} - - {name: performance_score, type: metric.performance_score} - - {name: engagement, type: metric.engagement} - - {name: training_hours, type: metric.training_hours} - - name: review_outcome - type: bucket - labels: [improvement_plan, meets, exceeds, top_talent] - - - 
name: fct_compensation - metrics: [compensation, attrition_risk] - columns: - - {name: date_key, type: ref.dim_date} - - {name: employee_id, type: ref.dim_employee} - - {name: office_id, type: ref.dim_office} - - {name: compensation, type: metric.compensation} - - {name: attrition_risk, type: metric.attrition_risk} - - - name: fct_attendance - metrics: [absence_rate] - columns: - - {name: date_key, type: ref.dim_date} - - {name: employee_id, type: ref.dim_employee} - - {name: absence_rate, type: metric.absence_rate} - -events: - - - name: evt_training_completion - trigger: proportional - driver: engagement - scale: 4.0 - columns: - - {name: event_id, type: id} - - {name: date_key, type: ref.dim_date} - - {name: employee_id, type: ref.dim_employee} - - {name: course_name, type: faker.word} - - {name: event_ts, type: timestamp} - - - name: evt_attrition - trigger: threshold - metric: attrition_risk - above: 0.7 - for: 3 - columns: - - {name: event_id, type: id} - - {name: date_key, type: ref.dim_date} - - {name: employee_id, type: ref.dim_employee} - - {name: reason, type: faker.sentence} - - {name: voluntary, type: flag} - - -# 0.6-M15: data-quality issues for Data Quality Testing (DE L25) and -# Data Cleaning (DE L15). Manifest records every injection so students -# can score detectors against ground truth. 
-quality: - - { table: fct_performance, issue: null_injection, rate: 0.04, column: engagement } - - { table: evt_training_completion, issue: late_arrival, rate: 0.02 } - - -# ═══════════════════════════════════════════════════════ -# Legend -# ═══════════════════════════════════════════════════════ -# -# ── unit ─────────────────────────────────────────────── -# company SaaS / B2B -# employee HR / workforce -# customer retail / e-commerce -# campaign marketing -# (any word) dim table becomes dim_{unit} -# -# ── every ────────────────────────────────────────────── -# monthly calendar months -# weekly calendar weeks -# daily calendar days -# -# ── metric type ──────────────────────────────────────── -# score bounded [0,1] — performance, engagement, risk -# amount bounded business range (requires range) -# count integer event counts — training hours, absences -# index signed centered metric (requires range) -# -# ── polarity ─────────────────────────────────────────── -# positive higher is better -# negative higher is worse -# -# ── archetype patterns ───────────────────────────────── -# growth smooth S-curve rise — new-hire ramp -# decline exponential fade — disengagement -# seasonal 2 oscillation cycles — cyclical roles -# flat low and constant — coasting baseline -# spike_then_crash rapid rise, drop, low plateau — burnout -# accelerating compound growth — promotion track -# -# ── archetype composition ────────────────────────────── -# pattern > pattern sequential — first then second -# @ N transition at period N -# flat > growth @ 6 -# growth > spike_then_crash > flat @ 6 @ 14 -# -# ── connections ──────────────────────────────────────── -# mirrors +0.75 nearly the same signal -# driven_by +0.55 strong positive link -# related +0.40 moderate positive -# hints_at +0.20 weak positive -# independent 0.00 no relationship -# hints_against -0.20 weak inverse -# resists -0.40 moderate inverse -# opposes -0.55 strong inverse -# inverts -0.75 nearly mirror-opposite -# -# ── 
baseline ─────────────────────────────────────────── -# high upper third of metric range -# mid midpoint (default if omitted) -# low lower third of metric range -# -# ── follows + delay ──────────────────────────────────── -# follows: metric this metric lags behind another -# delay: N by N periods -# -# ── range ────────────────────────────────────────────── -# [min, max] required for amount and index -# score defaults to [0, 1] -# count has no range -# -# ── count ────────────────────────────────────────────── -# 1 – 5000 employees per segment -# -# ── schema types ─────────────────────────────────────── -# id primary key, auto-generated -# ref.{table} foreign key to dim table -# metric.{name} populated from named metric -# faker.{type} generated via faker (name, year, sentence, word) -# static.{value} fixed value or comma-list for fan-out -# segment.count cohort population size -# timestamp generated datetime within period -# flag boolean derived from event trigger -# bucket categorical label derived from trajectory -# scd slowly changing dimension (requires tracks/tiers/at) -# -# ── event triggers ───────────────────────────────────── -# proportional row count = driver metric × scale per period -# threshold fires once when metric crosses value for N periods -# -# ── dimension per ────────────────────────────────────── -# period one row per time period (dim_date) -# unit one row per entity (dim_employee) -# reference: true static lookup (dim_department, dim_office) diff --git a/plotsim/configs/templates/marketing.yaml b/plotsim/configs/templates/marketing.yaml new file mode 100644 index 0000000..6cf09b0 --- /dev/null +++ b/plotsim/configs/templates/marketing.yaml @@ -0,0 +1,290 @@ +# ═══════════════════════════════════════════════════════ +# plotsim — Marketing campaign analytics +# ═══════════════════════════════════════════════════════ +# +# A digital marketing team's performance warehouse spanning paid, +# organic, and email channels. 
Campaigns run across channels and +# target audience segments; audiences and creative formats are +# reference dimensions on the performance fact. Spend gets a CDC +# audit (attribution restated as conversion windows close). A +# subset of awareness campaigns is on an A/B treatment targeted +# at conversion_rate (per-metric lift). Seasonality follows the +# Q4 holiday window. + +about: "Marketing campaign performance — spend, reach, conversion, revenue" +unit: campaign + +seed: 80211 +noise: slightly_messy + +window: + start: 2023-01 + end: 2024-12 + every: monthly + + +# ── seasonality ───────────────────────────────────────── +# Q4 holiday lift, summer slowdown, post-holiday January dip. +seasonality: + - { months: [11, 12], strength: 0.45 } + - { months: [6, 7, 8], strength: -0.10 } + - { months: [1], strength: -0.25 } + + +# ── what we measure ───────────────────────────────────── + +metrics: + - { name: ad_spend, label: Monthly paid-media spend, type: amount, polarity: positive, range: [500, 50000] } + - { name: impressions, label: Ad impressions delivered, type: count, polarity: positive } + - { name: click_through_rate, label: Ad click-through rate, type: score, polarity: positive } + - { name: conversion_rate, label: Visit-to-purchase rate, type: score, polarity: positive } + - { name: bounce_rate, label: Landing-page bounce rate, type: score, polarity: negative } + - { name: revenue, label: Attributed revenue, type: amount, polarity: positive, range: [0, 250000] } + - { name: roi, label: Return on ad spend, type: index, polarity: positive, range: [-1, 5] } + - { name: leads_generated, label: Marketing-qualified leads, type: count, polarity: positive, follows: impressions, delay: 1 } + # creative_quality is reported on fct_campaign_performance + - { name: creative_quality, label: Creative quality score, type: score, polarity: positive } + + +connections: + - click_through_rate driven_by impressions + - conversion_rate driven_by click_through_rate 
+ - bounce_rate opposes conversion_rate + - "revenue 0.62 conversion_rate" + - "roi 0.48 revenue" + - leads_generated related click_through_rate + + +# ── who we're simulating ──────────────────────────────── + +segments: + + - name: awareness_builder + count: 15 + archetype: growth + label: "Top-of-funnel brand awareness — steady reach growth" + attributes: + objective: [awareness] + channel: [paid_social, display] + pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] + bid_strategy: [target_cpa, max_clicks, max_conversions] + baseline: + impressions: high + ad_spend: high + conversion_rate: low + # Per-metric treatment: half the segment runs a new audience- + # targeting model that lifts conversion_rate specifically (not + # other metrics). Students recover the ATE against + # ``manifest.json``'s treatment_assignments. + treatment: + fraction: 0.5 + lift_log_odds: 0.5 + start_period: 8 + treatment_label: "new_audience_model" + control_label: "incumbent_audience_model" + target_metric: conversion_rate + + - name: paid_burst + count: 18 + archetype: growth > spike_then_crash @ 12 + label: "Heavy paid push, then sharp budget cut after Q4" + attributes: + objective: [conversion] + channel: [paid_search, paid_social] + pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] + bid_strategy: [target_cpa, max_conversions] + baseline: + ad_spend: high + impressions: high + bounce_rate: high + + - name: seasonal_promo + count: 20 + archetype: seasonal + label: "Cyclical holiday and seasonal-sale pushes" + attributes: + objective: [conversion, awareness] + channel: [paid_search, email, paid_social] + pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] + bid_strategy: [target_roas, max_conversions] + baseline: + ad_spend: mid + revenue: mid + + - name: delayed_breakthrough + count: 12 + archetype: flat > growth @ 10 + label: "Quiet ramp-up, breakthrough mid-campaign once 
creative landed" + attributes: + objective: [conversion] + channel: [paid_search, display] + pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] + bid_strategy: [target_cpa] + baseline: + ad_spend: mid + conversion_rate: mid + + - name: viral_compound + count: 10 + archetype: accelerating + label: "Compounding organic share — viral coefficient > 1" + attributes: + objective: [awareness, engagement] + channel: [organic_social, referral] + pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] + bid_strategy: [max_clicks] + baseline: + impressions: high + revenue: high + roi: high + + - name: end_of_life + count: 10 + archetype: decline + label: "Sunsetting campaign — winding down spend" + attributes: + objective: [retention] + channel: [email, retargeting] + pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] + bid_strategy: [target_roas] + baseline: + ad_spend: low + conversion_rate: low + + - name: retarget_revival + count: 10 + archetype: decline > flat > growth @ 6 @ 16 + label: "Stalled, paused, then relaunched with retargeting" + attributes: + objective: [retention, conversion] + channel: [retargeting, email] + pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] + bid_strategy: [target_cpa, max_conversions] + baseline: + conversion_rate: mid + bounce_rate: mid + + +# ── lifecycle funnel ──────────────────────────────────── +lifecycle: + track: conversion_rate + stages: + - launch: 0.0 + - ramping: 0.15 + - performing: 0.4 + - winning: 0.7 + + +# ── schema ────────────────────────────────────────────── + +dimensions: + - name: dim_date + per: period + columns: + - {name: date_key, type: id} + - {name: date, type: date} + - {name: year, type: int} + - {name: month, type: int} + - {name: quarter, type: int} + + - name: dim_campaign + per: unit + columns: + - {name: campaign_id, type: id} + - {name: campaign_name, type: 
faker.company} + - {name: launch_year, type: faker.year} + - {name: cohort_size, type: segment.count} + - {name: objective, type: pool.objective} + - {name: channel, type: pool.channel} + - {name: bid_strategy, type: pool.bid_strategy} + - name: campaign_phase + type: scd + tracks: revenue + tiers: [seed, scale, mature] + at: [0.3, 0.7] + + - name: dim_audience + reference: true + columns: + - {name: audience_id, type: id} + - {name: audience_name, type: "static.lookalike,prospecting,retargeting,loyalty,broad,interest"} + - {name: audience_size, type: "static.large,large,medium,small,large,medium"} + + - name: dim_creative_format + reference: true + columns: + - {name: format_id, type: id} + - {name: format_name, type: "static.video,static_image,carousel,story,native,collection"} + + +facts: + + # Per-period campaign spend + reach (CDC for reconciliation restatement). + - name: fct_campaign_performance + metrics: [ad_spend, impressions, click_through_rate, creative_quality] + cdc: true # spend restatements during attribution-window close + columns: + - {name: date_key, type: ref.dim_date} + - {name: campaign_id, type: ref.dim_campaign} + - {name: audience_id, type: ref.dim_audience} + - {name: format_id, type: ref.dim_creative_format} + - {name: ad_spend, type: metric.ad_spend} + - {name: impressions, type: metric.impressions} + - {name: click_through_rate, type: metric.click_through_rate} + - {name: creative_quality, type: metric.creative_quality} + - {name: budget_cap, type: range, range: [1000, 80000]} + - {name: bid_amount, type: range, range: [0.10, 12.0]} + + - name: fct_funnel + metrics: [conversion_rate, bounce_rate, leads_generated] + columns: + - {name: date_key, type: ref.dim_date} + - {name: campaign_id, type: ref.dim_campaign} + - {name: conversion_rate, type: metric.conversion_rate} + - {name: bounce_rate, type: metric.bounce_rate} + - {name: leads_generated, type: metric.leads_generated} + - name: funnel_stage + type: bucket + labels: [cold, warming, 
engaged, converted] + + - name: fct_revenue + metrics: [revenue, roi] + columns: + - {name: date_key, type: ref.dim_date} + - {name: campaign_id, type: ref.dim_campaign} + - {name: revenue, type: metric.revenue} + - {name: roi, type: metric.roi} + + +events: + + - name: evt_click + trigger: proportional + driver: click_through_rate + scale: 8.0 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: campaign_id, type: ref.dim_campaign} + - {name: event_ts, type: timestamp} + + - name: evt_campaign_pause + trigger: threshold + metric: conversion_rate + below: 0.1 + for: 3 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: campaign_id, type: ref.dim_campaign} + - {name: reason, type: pool.pause_reason} + - {name: pause_flag, type: flag} + + +# ── data quality ──────────────────────────────────────── +# Realistic marketing data hygiene: null spend during reconciliation, +# late-arriving click events, duplicate funnel rows. +quality: + - { table: fct_campaign_performance, issue: null_injection, rate: 0.04, column: ad_spend } + - { table: evt_click, issue: late_arrival, rate: 0.03 } + - { table: fct_funnel, issue: duplicate_rows, rate: 0.01 } diff --git a/plotsim/configs/templates/marketing_template.py b/plotsim/configs/templates/marketing_template.py index 725327b..6bb81dc 100644 --- a/plotsim/configs/templates/marketing_template.py +++ b/plotsim/configs/templates/marketing_template.py @@ -1,22 +1,30 @@ -"""Marketing campaign analytics — Python builder template. +"""Marketing template — Python form. -Mirror of ``marketing_template.yaml``. Demonstrates: +Mirror of ``marketing.yaml``. Digital marketing campaign analytics with +per-metric A/B treatment (lift targets conversion_rate specifically), +CDC on the spend fact (reconciliation restates prior periods), Q4 +seasonality + summer / January dips, and three quality issues. 
-* mixed-form ``connections`` — vocabulary words AND custom numeric - coefficients calibrated from real ad-platform data -* SCD2 ``campaign_phase`` tracking ``revenue`` -* threshold event with ``below`` (campaign-pause fires when - conversion crashes for 3+ periods) +Run: + >>> from plotsim.configs.templates.marketing_template import config + >>> from plotsim import generate_tables + >>> tables = generate_tables(config) """ from plotsim import create + config = create( about="Marketing campaign performance — spend, reach, conversion, revenue", unit="campaign", seed=80211, noise="slightly_messy", window=("2023-01", "2024-12", "monthly"), + seasonality=[ + {"months": [11, 12], "strength": 0.45}, + {"months": [6, 7, 8], "strength": -0.10}, + {"months": [1], "strength": -0.25}, + ], metrics=[ { "name": "ad_spend", @@ -39,7 +47,7 @@ }, { "name": "conversion_rate", - "label": "Visit-to-purchase conversion", + "label": "Visit-to-purchase rate", "type": "score", "polarity": "positive", }, @@ -71,17 +79,20 @@ "follows": "impressions", "delay": 1, }, + { + "name": "creative_quality", + "label": "Creative quality score", + "type": "score", + "polarity": "positive", + }, ], - # Numeric coefficients calibrated from a real ad-platform dataset. - # Mix of vocabulary words and explicit r values — both forms are - # interchangeable; the engine collects them into a single matrix. 
connections=[ - ("click_through_rate", "driven_by", "impressions"), - ("conversion_rate", "driven_by", "click_through_rate"), - ("bounce_rate", "opposes", "conversion_rate"), - ("revenue", 0.62, "conversion_rate"), - ("roi", 0.48, "revenue"), - ("leads_generated", "related", "click_through_rate"), + "click_through_rate driven_by impressions", + "conversion_rate driven_by click_through_rate", + "bounce_rate opposes conversion_rate", + "revenue 0.62 conversion_rate", + "roi 0.48 revenue", + "leads_generated related click_through_rate", ], segments=[ { @@ -98,16 +109,16 @@ "creative_fatigue", "audience_saturation", ], + "bid_strategy": ["target_cpa", "max_clicks", "max_conversions"], }, "baseline": {"impressions": "high", "ad_spend": "high", "conversion_rate": "low"}, - # 0.6-M15: A/B treatment cohort (M8c) — see - # ``marketing_template.yaml`` for the ATE recovery exercise. "treatment": { "fraction": 0.5, "lift_log_odds": 0.5, "start_period": 8, "treatment_label": "new_audience_model", "control_label": "incumbent_audience_model", + "target_metric": "conversion_rate", }, }, { @@ -124,6 +135,7 @@ "creative_fatigue", "audience_saturation", ], + "bid_strategy": ["target_cpa", "max_conversions"], }, "baseline": {"ad_spend": "high", "impressions": "high", "bounce_rate": "high"}, }, @@ -141,6 +153,7 @@ "creative_fatigue", "audience_saturation", ], + "bid_strategy": ["target_roas", "max_conversions"], }, "baseline": {"ad_spend": "mid", "revenue": "mid"}, }, @@ -158,6 +171,7 @@ "creative_fatigue", "audience_saturation", ], + "bid_strategy": ["target_cpa"], }, "baseline": {"ad_spend": "mid", "conversion_rate": "mid"}, }, @@ -175,14 +189,15 @@ "creative_fatigue", "audience_saturation", ], + "bid_strategy": ["max_clicks"], }, "baseline": {"impressions": "high", "revenue": "high", "roi": "high"}, }, { "name": "end_of_life", - "count": 8, + "count": 10, "archetype": "decline", - "label": "Sunsetting campaign — winding down spend over the window", + "label": "Sunsetting campaign — 
winding down spend", "attributes": { "objective": ["retention"], "channel": ["email", "retargeting"], @@ -192,6 +207,7 @@ "creative_fatigue", "audience_saturation", ], + "bid_strategy": ["target_roas"], }, "baseline": {"ad_spend": "low", "conversion_rate": "low"}, }, @@ -199,7 +215,7 @@ "name": "retarget_revival", "count": 10, "archetype": "decline > flat > growth @ 6 @ 16", - "label": "Stalled, paused, then relaunched with retargeting bump", + "label": "Stalled, paused, then relaunched with retargeting", "attributes": { "objective": ["retention", "conversion"], "channel": ["retargeting", "email"], @@ -209,18 +225,14 @@ "creative_fatigue", "audience_saturation", ], + "bid_strategy": ["target_cpa", "max_conversions"], }, "baseline": {"conversion_rate": "mid", "bounce_rate": "mid"}, }, ], lifecycle={ "track": "conversion_rate", - "stages": [ - ("launch", 0.0), - ("ramping", 0.15), - ("performing", 0.4), - ("winning", 0.7), - ], + "stages": [{"launch": 0.0}, {"ramping": 0.15}, {"performing": 0.4}, {"winning": 0.7}], }, dimensions=[ { @@ -242,6 +254,9 @@ {"name": "campaign_name", "type": "faker.company"}, {"name": "launch_year", "type": "faker.year"}, {"name": "cohort_size", "type": "segment.count"}, + {"name": "objective", "type": "pool.objective"}, + {"name": "channel", "type": "pool.channel"}, + {"name": "bid_strategy", "type": "pool.bid_strategy"}, { "name": "campaign_phase", "type": "scd", @@ -251,21 +266,6 @@ }, ], }, - { - "name": "dim_channel", - "reference": True, - "columns": [ - {"name": "channel_id", "type": "id"}, - { - "name": "channel_name", - "type": "static.paid_search,paid_social,display,email,organic_social,referral,retargeting", - }, - { - "name": "channel_type", - "type": "static.paid,paid,paid,owned,organic,organic,paid", - }, - ], - }, { "name": "dim_audience", "reference": True, @@ -273,33 +273,39 @@ {"name": "audience_id", "type": "id"}, { "name": "audience_name", - "type": "static.lookalike,prospecting,retargeting,loyalty,broad", + "type": 
"static.lookalike,prospecting,retargeting,loyalty,broad,interest", }, - {"name": "audience_size", "type": "static.large,large,medium,small,large"}, + {"name": "audience_size", "type": "static.large,large,medium,small,large,medium"}, ], }, { - "name": "dim_creative", + "name": "dim_creative_format", "reference": True, "columns": [ - {"name": "creative_id", "type": "id"}, - {"name": "format", "type": "static.video,static_image,carousel,story,native"}, - {"name": "variant", "type": "static.A,A,B,C,B"}, + {"name": "format_id", "type": "id"}, + { + "name": "format_name", + "type": "static.video,static_image,carousel,story,native,collection", + }, ], }, ], facts=[ { - "name": "fct_spend", - "metrics": ["ad_spend", "impressions", "click_through_rate"], + "name": "fct_campaign_performance", + "metrics": ["ad_spend", "impressions", "click_through_rate", "creative_quality"], + "cdc": True, "columns": [ {"name": "date_key", "type": "ref.dim_date"}, {"name": "campaign_id", "type": "ref.dim_campaign"}, - {"name": "channel_id", "type": "ref.dim_channel"}, - {"name": "creative_id", "type": "ref.dim_creative"}, + {"name": "audience_id", "type": "ref.dim_audience"}, + {"name": "format_id", "type": "ref.dim_creative_format"}, {"name": "ad_spend", "type": "metric.ad_spend"}, {"name": "impressions", "type": "metric.impressions"}, {"name": "click_through_rate", "type": "metric.click_through_rate"}, + {"name": "creative_quality", "type": "metric.creative_quality"}, + {"name": "budget_cap", "type": "range", "range": [1000, 80000]}, + {"name": "bid_amount", "type": "range", "range": [0.10, 12.0]}, ], }, { @@ -308,7 +314,6 @@ "columns": [ {"name": "date_key", "type": "ref.dim_date"}, {"name": "campaign_id", "type": "ref.dim_campaign"}, - {"name": "audience_id", "type": "ref.dim_audience"}, {"name": "conversion_rate", "type": "metric.conversion_rate"}, {"name": "bounce_rate", "type": "metric.bounce_rate"}, {"name": "leads_generated", "type": "metric.leads_generated"}, @@ -348,7 +353,7 @@ 
"trigger": "threshold", "metric": "conversion_rate", "below": 0.1, - "for_periods": 3, + "for": 3, "columns": [ {"name": "event_id", "type": "id"}, {"name": "date_key", "type": "ref.dim_date"}, @@ -358,11 +363,14 @@ ], }, ], - # 0.6-M15: data-quality issues for Data Quality Testing (DE L25) - # and Data Cleaning (DE L15). Manifest records every injection so - # students can score detectors against ground truth. quality=[ - {"table": "fct_spend", "issue": "null_injection", "rate": 0.03, "column": "ad_spend"}, - {"table": "evt_click", "issue": "late_arrival", "rate": 0.02}, + { + "table": "fct_campaign_performance", + "issue": "null_injection", + "rate": 0.04, + "column": "ad_spend", + }, + {"table": "evt_click", "issue": "late_arrival", "rate": 0.03}, + {"table": "fct_funnel", "issue": "duplicate_rows", "rate": 0.01}, ], ) diff --git a/plotsim/configs/templates/marketing_template.yaml b/plotsim/configs/templates/marketing_template.yaml deleted file mode 100644 index d10da77..0000000 --- a/plotsim/configs/templates/marketing_template.yaml +++ /dev/null @@ -1,400 +0,0 @@ -# ═══════════════════════════════════════════════════════ -# plotsim — Marketing campaign analytics -# ═══════════════════════════════════════════════════════ - -about: "Marketing campaign performance — spend, reach, conversion, revenue" -unit: campaign - -# Determinism + slightly-messy noise to mimic ad-platform jitter. 
-seed: 80211 -noise: slightly_messy - -window: - start: 2023-01 - end: 2024-12 - every: monthly - - -# ── what we measure ───────────────────────────────────── - -metrics: - - - name: ad_spend - label: Monthly paid-media spend - type: amount - polarity: positive - range: [500, 50000] - - - name: impressions - label: Ad impressions delivered - type: count - polarity: positive - - - name: click_through_rate - label: Ad click-through rate - type: score - polarity: positive - - - name: conversion_rate - label: Visit-to-purchase conversion - type: score - polarity: positive - - - name: bounce_rate - label: Landing-page bounce rate - type: score - polarity: negative - - - name: revenue - label: Attributed revenue - type: amount - polarity: positive - range: [0, 250000] - - - name: roi - label: Return on ad spend - type: index - polarity: positive - range: [-1, 5] - - - name: leads_generated - label: Marketing-qualified leads - type: count - polarity: positive - follows: impressions - delay: 1 - - -# ── how metrics connect ───────────────────────────────── - -# Numeric coefficients calibrated from a real ad-platform dataset. -# Mix of vocabulary words and explicit r values — both forms -# coexist; the engine treats them as a single correlation matrix. 
-connections: - - click_through_rate driven_by impressions - - conversion_rate driven_by click_through_rate - - bounce_rate opposes conversion_rate - - "revenue 0.62 conversion_rate" # measured r=0.62 - - [roi, 0.48, revenue] # measured r=0.48 - - leads_generated related click_through_rate - - -# ── who we're simulating ──────────────────────────────── - -segments: - - - name: awareness_builder - count: 15 - archetype: growth - label: "Top-of-funnel brand awareness — steady reach growth" - attributes: - objective: [awareness] - channel: [paid_social, display] - pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] - baseline: - impressions: high - ad_spend: high - conversion_rate: low - # 0.6-M15: A/B treatment cohort (M8c) demonstrated on a realistic - # marketing campaign. Half of the awareness campaigns flip to a new - # audience-targeting algorithm at period 8; the new algorithm lifts - # conversion_rate by ~0.5 log-odds. Students recover the ATE - # against the ``treatment_assignments`` ground truth in - # ``manifest.json``. 
- treatment: - fraction: 0.5 - lift_log_odds: 0.5 - start_period: 8 - treatment_label: "new_audience_model" - control_label: "incumbent_audience_model" - - - name: paid_burst - count: 18 - archetype: growth > spike_then_crash @ 12 - label: "Heavy paid push, then sharp budget cut after Q4" - attributes: - objective: [conversion] - channel: [paid_search, paid_social] - pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] - baseline: - ad_spend: high - impressions: high - bounce_rate: high - - - name: seasonal_promo - count: 20 - archetype: seasonal - label: "Cyclical holiday and seasonal-sale pushes" - attributes: - objective: [conversion, awareness] - channel: [paid_search, email, paid_social] - pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] - baseline: - ad_spend: mid - revenue: mid - - - name: delayed_breakthrough - count: 12 - archetype: flat > growth @ 10 - label: "Quiet ramp-up, breakthrough mid-campaign once creative landed" - attributes: - objective: [conversion] - channel: [paid_search, display] - pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] - baseline: - ad_spend: mid - conversion_rate: mid - - - name: viral_compound - count: 10 - archetype: accelerating - label: "Compounding organic share — viral coefficient > 1" - attributes: - objective: [awareness, engagement] - channel: [organic_social, referral] - pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] - baseline: - impressions: high - revenue: high - roi: high - - - name: end_of_life - count: 8 - archetype: decline - label: "Sunsetting campaign — winding down spend over the window" - attributes: - objective: [retention] - channel: [email, retargeting] - pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] - baseline: - ad_spend: low - conversion_rate: low - - - name: retarget_revival - count: 10 - archetype: decline > 
flat > growth @ 6 @ 16 - label: "Stalled, paused, then relaunched with retargeting bump" - attributes: - objective: [retention, conversion] - channel: [retargeting, email] - pause_reason: [budget_exhausted, low_conversion, creative_fatigue, audience_saturation] - baseline: - conversion_rate: mid - bounce_rate: mid - - -# ── lifecycle funnel ──────────────────────────────────── - -lifecycle: - track: conversion_rate - stages: - - launch: 0.0 - - ramping: 0.15 - - performing: 0.4 - - winning: 0.7 - - -# ── schema ────────────────────────────────────────────── - -dimensions: - - - name: dim_date - per: period - columns: - - {name: date_key, type: id} - - {name: date, type: date} - - {name: year, type: int} - - {name: month, type: int} - - {name: quarter, type: int} - - - name: dim_campaign - per: unit - columns: - - {name: campaign_id, type: id} - - {name: campaign_name, type: faker.company} - - {name: launch_year, type: faker.year} - - {name: cohort_size, type: segment.count} - - name: campaign_phase - type: scd - tracks: revenue - tiers: [seed, scale, mature] - at: [0.3, 0.7] - - - name: dim_channel - reference: true - columns: - - {name: channel_id, type: id} - - {name: channel_name, type: "static.paid_search,paid_social,display,email,organic_social,referral,retargeting"} - - {name: channel_type, type: "static.paid,paid,paid,owned,organic,organic,paid"} - - - name: dim_audience - reference: true - columns: - - {name: audience_id, type: id} - - {name: audience_name, type: "static.lookalike,prospecting,retargeting,loyalty,broad"} - - {name: audience_size, type: "static.large,large,medium,small,large"} - - - name: dim_creative - reference: true - columns: - - {name: creative_id, type: id} - - {name: format, type: "static.video,static_image,carousel,story,native"} - - {name: variant, type: "static.A,A,B,C,B"} - -facts: - - - name: fct_spend - metrics: [ad_spend, impressions, click_through_rate] - columns: - - {name: date_key, type: ref.dim_date} - - {name: campaign_id, 
type: ref.dim_campaign} - - {name: channel_id, type: ref.dim_channel} - - {name: creative_id, type: ref.dim_creative} - - {name: ad_spend, type: metric.ad_spend} - - {name: impressions, type: metric.impressions} - - {name: click_through_rate, type: metric.click_through_rate} - - - name: fct_funnel - metrics: [conversion_rate, bounce_rate, leads_generated] - columns: - - {name: date_key, type: ref.dim_date} - - {name: campaign_id, type: ref.dim_campaign} - - {name: audience_id, type: ref.dim_audience} - - {name: conversion_rate, type: metric.conversion_rate} - - {name: bounce_rate, type: metric.bounce_rate} - - {name: leads_generated, type: metric.leads_generated} - - name: funnel_stage - type: bucket - labels: [cold, warming, engaged, converted] - - - name: fct_revenue - metrics: [revenue, roi] - columns: - - {name: date_key, type: ref.dim_date} - - {name: campaign_id, type: ref.dim_campaign} - - {name: revenue, type: metric.revenue} - - {name: roi, type: metric.roi} - -events: - - - name: evt_click - trigger: proportional - driver: click_through_rate - scale: 8.0 - columns: - - {name: event_id, type: id} - - {name: date_key, type: ref.dim_date} - - {name: campaign_id, type: ref.dim_campaign} - - {name: event_ts, type: timestamp} - - - name: evt_campaign_pause - trigger: threshold - metric: conversion_rate - below: 0.1 - for: 3 - columns: - - {name: event_id, type: id} - - {name: date_key, type: ref.dim_date} - - {name: campaign_id, type: ref.dim_campaign} - - {name: reason, type: pool.pause_reason} - - {name: pause_flag, type: flag} - - -# 0.6-M15: data-quality issues for Data Quality Testing (DE L25) and -# Data Cleaning (DE L15). Manifest records every injection so students -# can score detectors against ground truth. 
-quality: - - { table: fct_spend, issue: null_injection, rate: 0.03, column: ad_spend } - - { table: evt_click, issue: late_arrival, rate: 0.02 } - - -# ═══════════════════════════════════════════════════════ -# Legend -# ═══════════════════════════════════════════════════════ -# -# ── unit ─────────────────────────────────────────────── -# company SaaS / B2B -# employee HR / workforce -# customer retail / e-commerce -# campaign marketing -# student academic cohorts -# -# ── every ────────────────────────────────────────────── -# monthly calendar months -# weekly calendar weeks -# daily calendar days -# -# ── metric type ──────────────────────────────────────── -# score bounded [0,1] — CTR, conversion, bounce -# amount bounded business range — spend, revenue -# count integer event counts — impressions, leads -# index signed centered metric — ROI -# -# ── polarity ─────────────────────────────────────────── -# positive higher is better -# negative higher is worse -# -# ── archetype patterns ───────────────────────────────── -# growth smooth S-curve rise — awareness build -# decline exponential fade — campaign sunset -# seasonal 2 oscillation cycles — promo cycles -# flat low and constant — dormant baseline -# spike_then_crash rapid rise, drop, low plateau — paid burst -# accelerating compound growth — viral compound -# -# ── archetype composition ────────────────────────────── -# pattern > pattern sequential — first then second -# @ N transition at period N -# growth > spike_then_crash @ 12 -# flat > growth @ 10 -# decline > flat > growth @ 6 @ 16 -# -# ── connections ──────────────────────────────────────── -# mirrors +0.75 nearly the same signal -# driven_by +0.55 strong positive link -# related +0.40 moderate positive -# hints_at +0.20 weak positive -# independent 0.00 no relationship -# hints_against -0.20 weak inverse -# resists -0.40 moderate inverse -# opposes -0.55 strong inverse -# inverts -0.75 nearly mirror-opposite -# -# ── baseline 
─────────────────────────────────────────── -# high upper third of metric range -# mid midpoint (default if omitted) -# low lower third of metric range -# -# ── follows + delay ──────────────────────────────────── -# follows: metric this metric lags behind another -# delay: N by N periods -# -# ── range ────────────────────────────────────────────── -# [min, max] required for amount and index -# score defaults to [0, 1] -# count has no range -# -# ── count ────────────────────────────────────────────── -# 1 – 5000 campaigns per segment -# -# ── schema types ─────────────────────────────────────── -# id primary key, auto-generated -# ref.{table} foreign key to dim table -# metric.{name} populated from named metric -# faker.{type} generated via faker (company, year, sentence) -# static.{value} fixed value or comma-list for fan-out -# segment.count cohort population size -# timestamp generated datetime within period -# flag boolean derived from event trigger -# bucket categorical label derived from trajectory -# scd slowly changing dimension (requires tracks/tiers/at) -# -# ── event triggers ───────────────────────────────────── -# proportional row count = driver metric × scale per period -# threshold fires once when metric crosses value for N periods -# (above for risk, below for performance drops) -# -# ── dimension per ────────────────────────────────────── -# period one row per time period (dim_date) -# unit one row per entity (dim_campaign) -# reference: true static lookup (dim_channel, dim_audience, dim_creative) diff --git a/plotsim/configs/templates/retail.yaml b/plotsim/configs/templates/retail.yaml new file mode 100644 index 0000000..67b4b7c --- /dev/null +++ b/plotsim/configs/templates/retail.yaml @@ -0,0 +1,311 @@ +# ═══════════════════════════════════════════════════════ +# plotsim — Omnichannel retail (customer, orders, returns) +# ═══════════════════════════════════════════════════════ +# +# A retail company operating across physical stores and digital 
+# channels. Customers shop and their loyalty evolves (SCD2 tier). +# Orders are variable-grain (one row per discrete order, count +# driven by order_volume); each order spawns 1..5 line items +# (per_parent_row child). Returns reference prior orders (cross- +# fact FK). Customers and promotions have M:N eligibility (bridge). +# Q4 holiday + summer + January seasonality. Customers carry a +# coherent geo bundle (home country/city). Multi-locale faker +# output reflects international operations. + +about: "Omnichannel retail — customers, orders, returns, loyalty" +unit: customer + +seed: 39131 +noise: slightly_messy +locale: [en_US, en_GB, fr_FR, de_DE] + +window: + start: 2023-01 + end: 2024-12 + every: monthly + + +# ── seasonality ───────────────────────────────────────── +seasonality: + - { months: [11, 12], strength: 0.45 } + - { months: [6, 7, 8], strength: -0.10 } + - { months: [1], strength: -0.25 } + + +# ── what we measure ───────────────────────────────────── + +metrics: + - { name: order_volume, label: Orders placed per period, type: count, polarity: positive } + - { name: cart_value, label: Average cart value, type: amount, polarity: positive, range: [10, 500] } + - { name: loyalty_score, label: Loyalty engagement, type: score, polarity: positive } + - { name: conversion_rate, label: Visit-to-purchase rate, type: score, polarity: positive } + - { name: return_rate, label: Return rate (drives evt), type: score, polarity: negative } + - { name: nps, label: Net promoter score, type: amount, polarity: positive, range: [0, 100] } + - { name: repeat_purchase_rate, label: Repeat purchase rate, type: score, polarity: positive } + + +connections: + - order_volume driven_by loyalty_score + - cart_value related conversion_rate + - "loyalty_score 0.55 repeat_purchase_rate" + - "nps 0.55 loyalty_score" + - return_rate opposes loyalty_score + + +# ── who we're simulating ──────────────────────────────── + +segments: + + - name: loyal_repeat + count: 22 + archetype: 
accelerating + label: "Compounding loyalty — high repeat rate" + attributes: + channel: [in_store, web, mobile_app] + payment_method: [credit, debit, mobile_wallet, gift_card] + return_reason: [damaged, wrong_size, defective, no_longer_needed, late_arrival] + promo_type: [loyalty_reward, member_discount, free_shipping] + baseline: + order_volume: high + loyalty_score: high + cart_value: high + + - name: holiday_shopper + count: 18 + archetype: seasonal + label: "Cyclical holiday spender — peaks Q4 and back-to-school" + attributes: + channel: [in_store, web] + payment_method: [credit, debit, mobile_wallet, gift_card] + return_reason: [damaged, wrong_size, defective, no_longer_needed, late_arrival] + promo_type: [seasonal_sale, doorbuster, bogo, free_shipping] + baseline: + order_volume: mid + cart_value: high + + - name: bargain_hunter + count: 22 + archetype: flat + label: "Steady low-value cart, promo-driven" + attributes: + channel: [web, mobile_app] + payment_method: [debit, credit, mobile_wallet, gift_card] + return_reason: [damaged, wrong_size, defective, no_longer_needed, late_arrival] + promo_type: [clearance, doorbuster, bogo] + baseline: + order_volume: mid + cart_value: low + + - name: churning + count: 16 + archetype: flat > decline @ 10 + label: "Coasted then quietly stopped buying" + attributes: + channel: [web] + payment_method: [credit, debit, mobile_wallet] + return_reason: [damaged, wrong_size, defective, no_longer_needed, late_arrival] + promo_type: [loyalty_reward, member_discount] + baseline: + loyalty_score: low + return_rate: high + + - name: new_customer + count: 14 + archetype: flat > growth @ 4 + label: "Recently acquired — ramping engagement" + attributes: + channel: [mobile_app, web] + payment_method: [credit, debit, mobile_wallet] + return_reason: [damaged, wrong_size, defective, no_longer_needed, late_arrival] + promo_type: [welcome_offer, first_order_discount] + baseline: + order_volume: low + loyalty_score: mid + + - name: vip + 
count: 8 + archetype: growth + label: "High-value VIP cohort with rising spend" + attributes: + channel: [in_store, web, mobile_app] + payment_method: [credit, mobile_wallet] + return_reason: [damaged, wrong_size, defective, no_longer_needed, late_arrival] + promo_type: [vip_exclusive, loyalty_reward] + baseline: + cart_value: high + nps: high + + +# ── lifecycle funnel ──────────────────────────────────── +lifecycle: + track: loyalty_score + stages: + - browser: 0.0 + - first_purchase: 0.15 + - returning: 0.4 + - loyal: 0.7 + + +# ── schema ────────────────────────────────────────────── + +dimensions: + - name: dim_date + per: period + columns: + - {name: date_key, type: id} + - {name: date, type: date} + - {name: year, type: int} + - {name: month, type: int} + - {name: quarter, type: int} + + - name: dim_customer + per: unit + columns: + - {name: customer_id, type: id} + - {name: customer_name, type: faker.name} + - {name: customer_email, type: faker.email} + - {name: signup_year, type: faker.year} + - {name: cohort_size, type: segment.count} + - {name: preferred_channel, type: pool.channel} + - {name: home_country, type: geo.country} + - {name: home_country_code, type: geo.country_code} + - {name: home_region, type: geo.region} + - {name: home_city, type: geo.city} + - name: loyalty_tier + type: scd + tracks: loyalty_score + tiers: [bronze, silver, gold, platinum] + at: [0.25, 0.55, 0.8] + + - name: dim_product + reference: true + columns: + - {name: product_id, type: id} + - {name: product_name, type: "static.widget,gadget,gizmo,sprocket,lantern,doohickey,thingamajig,whatsit"} + - {name: category, type: "static.hardware,hardware,hardware,hardware,outdoor,misc,misc,misc"} + - {name: price_band, type: "static.value,value,mid,mid,mid,premium,premium,luxury"} + + - name: dim_promotion + reference: true + columns: + - {name: promotion_id, type: id} + - {name: promotion_name, type: faker.company} + - {name: promo_type, type: 
"static.seasonal_sale,bogo,clearance,loyalty_reward,welcome_offer,doorbuster,member_discount,vip_exclusive,free_shipping,first_order_discount"} + - {name: discount_band, type: "static.10pct,20pct,25pct,30pct,40pct,bogo,member_only,vip_only,free_ship,no_promo"} + + +facts: + + # Per-customer-per-period activity exposing the loyalty/satisfaction + # metrics that drive downstream events, plus narrative review text. + # (order_volume is not emitted here; it drives fct_orders row count.) + - name: fct_customer_activity + metrics: [loyalty_score, conversion_rate, nps, repeat_purchase_rate, cart_value, return_rate] + columns: + - {name: date_key, type: ref.dim_date} + - {name: customer_id, type: ref.dim_customer} + - {name: loyalty_score, type: metric.loyalty_score} + - {name: conversion_rate, type: metric.conversion_rate} + - {name: nps, type: metric.nps} + - {name: repeat_purchase_rate, type: metric.repeat_purchase_rate} + - {name: cart_value, type: metric.cart_value} + - {name: return_rate, type: metric.return_rate} + - name: review_text + type: narrative + template: "{opener} {object}. 
{comment}" + lexicons: + loyal_repeat: &nar_block + opener: + low: ["I am disappointed by", "I am frustrated with", "I cannot recommend"] + mid: ["I keep using", "I find myself reaching for", "I keep returning to"] + high: ["I love", "I am thrilled with", "I keep recommending"] + object: + low: ["the broken release", "the buggy app", "the unreliable platform"] + mid: ["the product", "the service", "the standard offering"] + high: ["the polished platform", "the smooth experience", "the standout service"] + comment: + low: ["Not recommended.", "Going elsewhere.", "Disappointing."] + mid: ["Works as advertised.", "Fair value.", "Meets expectations."] + high: ["Highly recommend.", "Best in class.", "Worth every penny."] + holiday_shopper: { <<: *nar_block } + bargain_hunter: { <<: *nar_block } + churning: { <<: *nar_block } + new_customer: { <<: *nar_block } + vip: { <<: *nar_block } + + # Variable-grain parent: one row per order, count = round(order_volume * 1.2). + - name: fct_orders + row_count_driver: order_volume + row_count_scale: 1.2 + cdc: true # purchase audit for refund processing + columns: + - {name: order_id, type: id} + - {name: customer_id, type: ref.dim_customer} + - {name: order_date, type: ref.dim_date} + - {name: payment_method, type: pool.payment_method} + - {name: order_channel, type: pool.channel} + + # Per-parent-row child: 1..5 line items per order. + - name: fct_order_items + parent_table: fct_orders + children_per_row: [1, 5] + columns: + - {name: item_id, type: id} + - {name: customer_id, type: ref.dim_customer} + - {name: order_date, type: ref.dim_date} + - {name: product_id, type: ref.dim_product} + - {name: quantity, type: range, range: [1, 12]} + - {name: unit_price, type: range, range: [2.99, 499.99]} + - {name: discount_pct, type: range, range: [0, 40]} + + # Cross-fact FK: returns reference orders. 
+ - name: fct_returns + row_count_driver: return_rate + row_count_scale: 0.6 + columns: + - {name: return_id, type: id} + - {name: order_id, type: ref.fct_orders} + - {name: customer_id, type: ref.dim_customer} + - {name: return_date, type: ref.dim_date} + - {name: return_reason, type: pool.return_reason} + + +events: + + - name: evt_session + trigger: proportional + driver: conversion_rate + scale: 12.0 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: customer_id, type: ref.dim_customer} + - {name: event_ts, type: timestamp} + + - name: evt_return + trigger: proportional + driver: return_rate + scale: 2.0 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: customer_id, type: ref.dim_customer} + - {name: severity, type: "static.warning,investigation,fraud_review"} + - {name: event_ts, type: timestamp} + + +# ── bridges ───────────────────────────────────────────── +# Customers × promotion eligibility (M:N). +bridges: + - name: bridge_customer_promotion + left: dim_customer + right: dim_promotion + cardinality: [1, 4] + driver: loyalty_score + + +# ── data quality ──────────────────────────────────────── +quality: + - { table: fct_customer_activity, issue: null_injection, rate: 0.03, column: conversion_rate } + - { table: evt_session, issue: duplicate_rows, rate: 0.015 } + - { table: fct_orders, issue: late_arrival, rate: 0.02 } diff --git a/plotsim/configs/templates/retail_template.py b/plotsim/configs/templates/retail_template.py index 002dc45..8cecd7a 100644 --- a/plotsim/configs/templates/retail_template.py +++ b/plotsim/configs/templates/retail_template.py @@ -1,31 +1,54 @@ -"""Retail / e-commerce customer analytics — Python builder template. +"""Retail template — Python form. -Mirror of ``retail_template.yaml``. Demonstrates: +Mirror of ``retail.yaml``. 
Omnichannel retail with SCD2 loyalty tier, +parent/child orders + line items, cross-fact FK on returns, customer +× promotion M:N bridge, geo bundle on dim_customer, multi-locale +faker output, narrative reviews, CDC audit on the orders parent fact. -* multi-locale faker (``locale=["en_US", "en_GB", "fr_FR"]``) -* Q4 holiday-shopping seasonality -* SCD2 ``customer_tier`` tracking ``loyalty_score`` -* threshold event with ``below`` (churn fires when score crashes) +Run: + >>> from plotsim.configs.templates.retail_template import config + >>> from plotsim import generate_tables + >>> tables = generate_tables(config) """ from plotsim import create + +_NAR_BLOCK = { + "opener": { + "low": ["I am disappointed by", "I am frustrated with", "I cannot recommend"], + "mid": ["I keep using", "I find myself reaching for", "I keep returning to"], + "high": ["I love", "I am thrilled with", "I keep recommending"], + }, + "object": { + "low": ["the broken release", "the buggy app", "the unreliable platform"], + "mid": ["the product", "the service", "the standard offering"], + "high": ["the polished platform", "the smooth experience", "the standout service"], + }, + "comment": { + "low": ["Not recommended.", "Going elsewhere.", "Disappointing."], + "mid": ["Works as advertised.", "Fair value.", "Meets expectations."], + "high": ["Highly recommend.", "Best in class.", "Worth every penny."], + }, +} + + config = create( - about="Retail customer purchase and loyalty behavior", + about="Omnichannel retail — customers, orders, returns, loyalty", unit="customer", - seed=90210, - noise="realistic", - # output={"format": "parquet", "directory": "./out"}, # uncomment if pyarrow installed - locale=["en_US", "en_GB", "fr_FR"], + seed=39131, + noise="slightly_messy", + locale=["en_US", "en_GB", "fr_FR", "de_DE"], window=("2023-01", "2024-12", "monthly"), seasonality=[ {"months": [11, 12], "strength": 0.45}, - {"months": [7, 8], "strength": -0.15}, + {"months": [6, 7, 8], "strength": -0.10}, + 
{"months": [1], "strength": -0.25}, ], metrics=[ { - "name": "sessions", - "label": "Monthly site sessions", + "name": "order_volume", + "label": "Orders placed per period", "type": "count", "polarity": "positive", }, @@ -34,162 +57,166 @@ "label": "Average cart value", "type": "amount", "polarity": "positive", - "range": [10, 2000], + "range": [10, 500], + }, + { + "name": "loyalty_score", + "label": "Loyalty engagement", + "type": "score", + "polarity": "positive", }, { "name": "conversion_rate", - "label": "Session-to-purchase conversion", + "label": "Visit-to-purchase rate", "type": "score", "polarity": "positive", }, { "name": "return_rate", - "label": "Purchase return rate", + "label": "Return rate (drives evt)", "type": "score", "polarity": "negative", }, { - "name": "loyalty_score", - "label": "Customer loyalty index", - "type": "score", + "name": "nps", + "label": "Net promoter score", + "type": "amount", "polarity": "positive", + "range": [0, 100], }, { "name": "repeat_purchase_rate", "label": "Repeat purchase rate", "type": "score", "polarity": "positive", - "follows": "loyalty_score", - "delay": 1, - }, - { - "name": "nps", - "label": "Net promoter score", - "type": "index", - "polarity": "positive", - "range": [-100, 100], }, ], connections=[ - ("conversion_rate", "driven_by", "loyalty_score"), - ("cart_value", "related", "loyalty_score"), - ("return_rate", "opposes", "loyalty_score"), - ("repeat_purchase_rate", "driven_by", "conversion_rate"), - ("nps", "related", "loyalty_score"), + "order_volume driven_by loyalty_score", + "cart_value related conversion_rate", + "loyalty_score 0.55 repeat_purchase_rate", + "nps 0.55 loyalty_score", + "return_rate opposes loyalty_score", ], segments=[ { - "name": "loyal_climbers", - "count": 25, - "archetype": "growth", - "label": "Builds loyalty steadily across both years", + "name": "loyal_repeat", + "count": 22, + "archetype": "accelerating", + "label": "Compounding loyalty — high repeat rate", "attributes": { - 
"tier": ["gold", "platinum"], - "channel": ["web", "mobile"], - "churn_reason": [ - "account_dormant", - "low_engagement", - "payment_failure", - "service_interruption", + "channel": ["in_store", "web", "mobile_app"], + "payment_method": ["credit", "debit", "mobile_wallet", "gift_card"], + "return_reason": [ + "damaged", + "wrong_size", + "defective", + "no_longer_needed", + "late_arrival", ], + "promo_type": ["loyalty_reward", "member_discount", "free_shipping"], }, - "baseline": {"loyalty_score": "high", "cart_value": "high", "return_rate": "low"}, + "baseline": {"order_volume": "high", "loyalty_score": "high", "cart_value": "high"}, }, { - "name": "holiday_shoppers", - "count": 30, + "name": "holiday_shopper", + "count": 18, "archetype": "seasonal", - "label": "Cyclical demand around holidays — Q4 surges", + "label": "Cyclical holiday spender — peaks Q4 and back-to-school", "attributes": { - "tier": ["silver", "gold"], - "channel": ["web", "mobile", "marketplace"], - "churn_reason": [ - "account_dormant", - "low_engagement", - "payment_failure", - "service_interruption", + "channel": ["in_store", "web"], + "payment_method": ["credit", "debit", "mobile_wallet", "gift_card"], + "return_reason": [ + "damaged", + "wrong_size", + "defective", + "no_longer_needed", + "late_arrival", ], + "promo_type": ["seasonal_sale", "doorbuster", "bogo", "free_shipping"], }, - "baseline": {"cart_value": "mid", "conversion_rate": "mid"}, + "baseline": {"order_volume": "mid", "cart_value": "high"}, }, { - "name": "cooled_off", - "count": 18, - "archetype": "flat > decline @ 12", - "label": "Active first year, gradually disengaged in year two", + "name": "bargain_hunter", + "count": 22, + "archetype": "flat", + "label": "Steady low-value cart, promo-driven", "attributes": { - "tier": ["bronze", "silver"], - "channel": ["marketplace"], - "churn_reason": [ - "account_dormant", - "low_engagement", - "payment_failure", - "service_interruption", + "channel": ["web", "mobile_app"], + 
"payment_method": ["debit", "credit", "mobile_wallet", "gift_card"], + "return_reason": [ + "damaged", + "wrong_size", + "defective", + "no_longer_needed", + "late_arrival", ], + "promo_type": ["clearance", "doorbuster", "bogo"], }, - "baseline": {"loyalty_score": "low", "return_rate": "high"}, + "baseline": {"order_volume": "mid", "cart_value": "low"}, }, { - "name": "one_and_done", - "count": 15, - "archetype": "growth > spike_then_crash > flat @ 4 @ 8", - "label": "Tested the brand for a few months, then never returned", + "name": "churning", + "count": 16, + "archetype": "flat > decline @ 10", + "label": "Coasted then quietly stopped buying", "attributes": { - "tier": ["bronze"], "channel": ["web"], - "churn_reason": [ - "account_dormant", - "low_engagement", - "payment_failure", - "service_interruption", + "payment_method": ["credit", "debit", "mobile_wallet"], + "return_reason": [ + "damaged", + "wrong_size", + "defective", + "no_longer_needed", + "late_arrival", ], + "promo_type": ["loyalty_reward", "member_discount"], }, - "baseline": {"loyalty_score": "low", "cart_value": "low"}, + "baseline": {"loyalty_score": "low", "return_rate": "high"}, }, { - "name": "winback", - "count": 12, - "archetype": "decline > flat > growth @ 6 @ 14", - "label": "Churned, then reactivated by year-two campaign", + "name": "new_customer", + "count": 14, + "archetype": "flat > growth @ 4", + "label": "Recently acquired — ramping engagement", "attributes": { - "tier": ["silver"], - "channel": ["email", "web"], - "churn_reason": [ - "account_dormant", - "low_engagement", - "payment_failure", - "service_interruption", + "channel": ["mobile_app", "web"], + "payment_method": ["credit", "debit", "mobile_wallet"], + "return_reason": [ + "damaged", + "wrong_size", + "defective", + "no_longer_needed", + "late_arrival", ], + "promo_type": ["welcome_offer", "first_order_discount"], }, - "baseline": {"loyalty_score": "mid", "conversion_rate": "mid"}, + "baseline": {"order_volume": "low", 
"loyalty_score": "mid"}, }, { - "name": "escalating_basket", - "count": 10, - "archetype": "accelerating", - "label": "Compounding cart values as trust builds", + "name": "vip", + "count": 8, + "archetype": "growth", + "label": "High-value VIP cohort with rising spend", "attributes": { - "tier": ["gold", "platinum"], - "channel": ["web"], - "churn_reason": [ - "account_dormant", - "low_engagement", - "payment_failure", - "service_interruption", + "channel": ["in_store", "web", "mobile_app"], + "payment_method": ["credit", "mobile_wallet"], + "return_reason": [ + "damaged", + "wrong_size", + "defective", + "no_longer_needed", + "late_arrival", ], + "promo_type": ["vip_exclusive", "loyalty_reward"], }, - "baseline": {"cart_value": "high", "loyalty_score": "high"}, + "baseline": {"cart_value": "high", "nps": "high"}, }, ], lifecycle={ "track": "loyalty_score", - "stages": [ - ("new", 0.0), - ("casual", 0.2), - ("regular", 0.5), - ("loyal", 0.75), - ("champion", 0.9), - ], + "stages": [{"browser": 0.0}, {"first_purchase": 0.15}, {"returning": 0.4}, {"loyal": 0.7}], }, dimensions=[ { @@ -209,47 +236,39 @@ "columns": [ {"name": "customer_id", "type": "id"}, {"name": "customer_name", "type": "faker.name"}, + {"name": "customer_email", "type": "faker.email"}, {"name": "signup_year", "type": "faker.year"}, {"name": "cohort_size", "type": "segment.count"}, + {"name": "preferred_channel", "type": "pool.channel"}, + {"name": "home_country", "type": "geo.country"}, + {"name": "home_country_code", "type": "geo.country_code"}, + {"name": "home_region", "type": "geo.region"}, + {"name": "home_city", "type": "geo.city"}, { - "name": "customer_tier", + "name": "loyalty_tier", "type": "scd", "tracks": "loyalty_score", - "tiers": ["browser", "casual", "loyal"], - "at": [0.3, 0.7], + "tiers": ["bronze", "silver", "gold", "platinum"], + "at": [0.25, 0.55, 0.8], }, ], }, { - "name": "dim_product_category", + "name": "dim_product", "reference": True, "columns": [ - {"name": "category_id", 
"type": "id"}, - {"name": "category_name", "type": "static.electronics,apparel,home,grocery,beauty"}, - {"name": "margin_tier", "type": "static.high,standard,standard,low,high"}, - # 0.6-M15: nested struct column (M14c) — see - # ``retail_template.yaml`` for the Semi-Structured - # Flattening (DE L12) exercise rationale. + {"name": "product_id", "type": "id"}, { - "name": "catalog_metadata", - "type": "struct", - "nested_schema": { - "aisle": "string", - "seasonality": "string", - "avg_basket_position": "int", - }, + "name": "product_name", + "type": "static.widget,gadget,gizmo,sprocket,lantern,doohickey,thingamajig,whatsit", }, - ], - }, - { - "name": "dim_channel", - "reference": True, - "columns": [ - {"name": "channel_id", "type": "id"}, - {"name": "channel_name", "type": "static.web,mobile,marketplace,email,store"}, { - "name": "channel_type", - "type": "static.digital,digital,third_party,owned,physical", + "name": "category", + "type": "static.hardware,hardware,hardware,hardware,outdoor,misc,misc,misc", + }, + { + "name": "price_band", + "type": "static.value,value,mid,mid,mid,premium,premium,luxury", }, ], }, @@ -258,66 +277,99 @@ "reference": True, "columns": [ {"name": "promotion_id", "type": "id"}, + {"name": "promotion_name", "type": "faker.company"}, + { + "name": "promo_type", + "type": "static.seasonal_sale,bogo,clearance,loyalty_reward,welcome_offer,doorbuster,member_discount,vip_exclusive,free_shipping,first_order_discount", + }, { - "name": "promo_name", - "type": "static.clearance,seasonal_sale,loyalty_reward,flash_sale", + "name": "discount_band", + "type": "static.10pct,20pct,25pct,30pct,40pct,bogo,member_only,vip_only,free_ship,no_promo", }, - {"name": "discount_type", "type": "static.percentage,percentage,points,percentage"}, ], }, ], facts=[ { - "name": "fct_sessions", - "metrics": ["sessions", "conversion_rate"], + "name": "fct_customer_activity", + "metrics": [ + "loyalty_score", + "conversion_rate", + "nps", + "repeat_purchase_rate", + 
"cart_value", + "return_rate", + ], "columns": [ {"name": "date_key", "type": "ref.dim_date"}, {"name": "customer_id", "type": "ref.dim_customer"}, - {"name": "channel_id", "type": "ref.dim_channel"}, - {"name": "session_count", "type": "metric.sessions"}, + {"name": "loyalty_score", "type": "metric.loyalty_score"}, {"name": "conversion_rate", "type": "metric.conversion_rate"}, + {"name": "nps", "type": "metric.nps"}, + {"name": "repeat_purchase_rate", "type": "metric.repeat_purchase_rate"}, + {"name": "cart_value", "type": "metric.cart_value"}, + {"name": "return_rate", "type": "metric.return_rate"}, { - "name": "shopping_intent", - "type": "bucket", - "labels": ["browsing", "comparing", "purchasing", "loyal_repeat"], + "name": "review_text", + "type": "narrative", + "template": "{opener} {object}. {comment}", + "lexicons": { + "loyal_repeat": _NAR_BLOCK, + "holiday_shopper": _NAR_BLOCK, + "bargain_hunter": _NAR_BLOCK, + "churning": _NAR_BLOCK, + "new_customer": _NAR_BLOCK, + "vip": _NAR_BLOCK, + }, }, ], }, { - "name": "fct_purchases", - "metrics": ["cart_value", "return_rate", "loyalty_score", "repeat_purchase_rate"], - # 0.6-M15: CDC fact-side (M9c) — every row carries - # _inserted_at / _updated_at / _op audit columns. Column- - # level quality injections (see ``quality=`` below) flip - # _op to "U" on touched rows. See ``retail_template.yaml``. 
+ "name": "fct_orders", + "row_count_driver": "order_volume", + "row_count_scale": 1.2, "cdc": True, "columns": [ - {"name": "date_key", "type": "ref.dim_date"}, + {"name": "order_id", "type": "id"}, {"name": "customer_id", "type": "ref.dim_customer"}, - {"name": "category_id", "type": "ref.dim_product_category"}, - {"name": "promotion_id", "type": "ref.dim_promotion"}, - {"name": "cart_value", "type": "metric.cart_value"}, - {"name": "return_rate", "type": "metric.return_rate"}, - {"name": "loyalty_score", "type": "metric.loyalty_score"}, - {"name": "repeat_purchase_rate", "type": "metric.repeat_purchase_rate"}, + {"name": "order_date", "type": "ref.dim_date"}, + {"name": "payment_method", "type": "pool.payment_method"}, + {"name": "order_channel", "type": "pool.channel"}, ], }, { - "name": "fct_satisfaction", - "metrics": ["nps"], + "name": "fct_order_items", + "parent_table": "fct_orders", + "children_per_row": [1, 5], "columns": [ - {"name": "date_key", "type": "ref.dim_date"}, + {"name": "item_id", "type": "id"}, {"name": "customer_id", "type": "ref.dim_customer"}, - {"name": "nps", "type": "metric.nps"}, + {"name": "order_date", "type": "ref.dim_date"}, + {"name": "product_id", "type": "ref.dim_product"}, + {"name": "quantity", "type": "range", "range": [1, 12]}, + {"name": "unit_price", "type": "range", "range": [2.99, 499.99]}, + {"name": "discount_pct", "type": "range", "range": [0, 40]}, + ], + }, + { + "name": "fct_returns", + "row_count_driver": "return_rate", + "row_count_scale": 0.6, + "columns": [ + {"name": "return_id", "type": "id"}, + {"name": "order_id", "type": "ref.fct_orders"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "return_date", "type": "ref.dim_date"}, + {"name": "return_reason", "type": "pool.return_reason"}, ], }, ], events=[ { - "name": "evt_purchase", + "name": "evt_session", "trigger": "proportional", "driver": "conversion_rate", - "scale": 6.0, + "scale": 12.0, "columns": [ {"name": "event_id", "type": 
"id"}, {"name": "date_key", "type": "ref.dim_date"}, @@ -326,32 +378,36 @@ ], }, { - "name": "evt_churn", - "trigger": "threshold", - "metric": "loyalty_score", - "below": 0.15, - "for_periods": 4, + "name": "evt_return", + "trigger": "proportional", + "driver": "return_rate", + "scale": 2.0, "columns": [ {"name": "event_id", "type": "id"}, {"name": "date_key", "type": "ref.dim_date"}, {"name": "customer_id", "type": "ref.dim_customer"}, - {"name": "reason", "type": "pool.churn_reason"}, - {"name": "churn_flag", "type": "flag"}, + {"name": "severity", "type": "static.warning,investigation,fraud_review"}, + {"name": "event_ts", "type": "timestamp"}, ], }, ], - # 0.6-M15: data-quality issues — see ``retail_template.yaml`` for - # the Data Quality Testing (DE L25), Data Cleaning (DE L15), and - # Data Observability (DE L28) rationale. The volume_anomaly spike - # at period 18 is the canonical observability scenario. + bridges=[ + { + "name": "bridge_customer_promotion", + "left": "dim_customer", + "right": "dim_promotion", + "cardinality": [1, 4], + "driver": "loyalty_score", + }, + ], quality=[ - {"table": "fct_purchases", "issue": "null_injection", "rate": 0.03, "column": "cart_value"}, { - "table": "fct_sessions", - "issue": "volume_anomaly", - "rate": 0.5, - "mode": "spike", - "period": 18, + "table": "fct_customer_activity", + "issue": "null_injection", + "rate": 0.03, + "column": "conversion_rate", }, + {"table": "evt_session", "issue": "duplicate_rows", "rate": 0.015}, + {"table": "fct_orders", "issue": "late_arrival", "rate": 0.02}, ], ) diff --git a/plotsim/configs/templates/saas.yaml b/plotsim/configs/templates/saas.yaml new file mode 100644 index 0000000..25cb6b4 --- /dev/null +++ b/plotsim/configs/templates/saas.yaml @@ -0,0 +1,259 @@ +# ═══════════════════════════════════════════════════════ +# plotsim — B2B SaaS customer success + revenue +# ═══════════════════════════════════════════════════════ +# +# A product-led SaaS company's operational warehouse. 
Companies +# subscribe, onboard users, engage with the product, generate +# revenue, open support tickets, and eventually churn or expand. +# Plan tier is SCD2 (free → starter → growth → enterprise), +# revenue gets CDC audit (reconciliation restates prior periods), +# and MRR carries heteroscedastic noise (large accounts vary more). + +about: "B2B SaaS customer success, engagement and revenue" +unit: company + +seed: 1729 +noise: + gaussian_sigma: 0.04 + outlier_rate: 0.005 + mcar_rate: 0.0 + scale_with_trajectory: true # 0.6-M22 heteroscedastic noise + +output: + denormalized: true # 0.6-M14a wide-table companion alongside normalized output + +window: + start: 2023-01 + end: 2024-12 + every: monthly + + +# ── seasonality ───────────────────────────────────────── +# Enterprise buying patterns: Q4 budget flush (Nov-Dec) and +# summer slowdown (Jun-Aug) when decision-makers are on PTO. +seasonality: + - { months: [11, 12], strength: 0.30 } + - { months: [6, 7, 8], strength: -0.12 } + + +# ── what we measure ───────────────────────────────────── + +metrics: + - { name: engagement, label: Product engagement, type: score, polarity: positive } + - { name: feature_adoption,label: Feature adoption breadth, type: score, polarity: positive } + - { name: mrr, label: Monthly recurring revenue, type: amount, polarity: positive, range: [50, 50000] } + - { name: nps, label: Net promoter score (0-100),type: amount, polarity: positive, range: [0, 100] } + - { name: support_tickets, label: Tickets per period, type: count, polarity: negative } + - { name: churn_risk, label: Churn risk score, type: score, polarity: negative } + - { name: expansion, label: Expansion likelihood, type: score, polarity: positive, follows: engagement, delay: 2 } + + +# ── how metrics connect ───────────────────────────────── +connections: + - mrr driven_by engagement + - expansion related feature_adoption + - churn_risk opposes engagement + - "support_tickets -0.40 engagement" + - "nps 0.55 engagement" + 
+ +# ── who we're simulating ──────────────────────────────── + +segments: + + - name: rapid_adopter + count: 18 + archetype: growth + label: "Fast onboarding ramp, broad adoption inside the first quarter" + attributes: + industry: [saas, fintech] + region: [na, emea] + plan_tier: [growth, enterprise] + baseline: + engagement: high + feature_adoption: high + mrr: high + + - name: steady_grower + count: 24 + archetype: accelerating + label: "Compounding seat expansion with steady MRR growth" + attributes: + industry: [saas, ecommerce, education] + region: [na, emea, apac] + plan_tier: [starter, growth] + baseline: + engagement: mid + mrr: mid + + - name: enterprise_steady + count: 14 + archetype: flat + label: "Mature enterprise accounts at stable high baseline" + attributes: + industry: [healthcare, finance, manufacturing] + region: [na, emea] + plan_tier: [enterprise] + baseline: + engagement: high + mrr: high + nps: high + + - name: churning + count: 18 + archetype: flat > decline @ 10 + label: "Coasted for the first year, then disengaged and churned" + attributes: + industry: [retail, ecommerce, media] + region: [na, latam] + plan_tier: [free, starter] + baseline: + engagement: low + churn_risk: high + support_tickets: high + + - name: expansion_play + count: 12 + archetype: flat > growth @ 8 + label: "Held flat, then expanded after a successful product evaluation" + attributes: + industry: [saas, fintech] + region: [na, emea] + plan_tier: [growth, enterprise] + baseline: + engagement: mid + expansion: high + + - name: at_risk + count: 14 + archetype: growth > spike_then_crash > flat @ 6 @ 12 + label: "Promising start, hit a wall around month 6, never recovered" + attributes: + industry: [media, retail, education] + region: [latam, apac] + plan_tier: [starter, growth] + baseline: + engagement: mid + churn_risk: high + + +# ── lifecycle funnel ──────────────────────────────────── + +lifecycle: + track: engagement + stages: + - trial: 0.0 + - active: 0.2 + - 
engaged: 0.5 + - power_user: 0.75 + + +# ── schema ────────────────────────────────────────────── + +dimensions: + + - name: dim_date + per: period + columns: + - {name: date_key, type: id} + - {name: date, type: date} + - {name: year, type: int} + - {name: month, type: int} + - {name: quarter, type: int} + + - name: dim_company + per: unit + columns: + - {name: company_id, type: id} + - {name: company_name, type: faker.company} + - {name: signup_year, type: faker.year} + - {name: cohort_size, type: segment.count} + - {name: industry, type: pool.industry} + - {name: region, type: pool.region} + - name: plan_tier + type: scd + tracks: mrr + tiers: [free, starter, growth, enterprise] + at: [0.25, 0.55, 0.8] + + - name: dim_user + per: unit + count: 4 # 4 sub-entity users per company on average + columns: + - {name: user_id, type: id} + - {name: company_id, type: ref.dim_company} + - {name: user_name, type: faker.name} + - {name: user_email, type: faker.email} + - {name: user_role, type: "static.admin,member,member,member,viewer"} + + - name: dim_support_category + reference: true + columns: + - {name: category_id, type: id} + - {name: category_name, type: "static.bug,billing,onboarding,integration,feature_request,access,outage,docs"} + - {name: severity_band, type: "static.high,medium,medium,low,low,medium,high,low"} + +facts: + + - name: fct_engagement + metrics: [engagement, feature_adoption] + columns: + - {name: date_key, type: ref.dim_date} + - {name: company_id, type: ref.dim_company} + - {name: engagement, type: metric.engagement} + - {name: feature_adoption, type: metric.feature_adoption} + - {name: active_seats, type: range, range: [1, 200]} + + - name: fct_revenue + metrics: [mrr, expansion] + cdc: true # revenue restatements during reconciliation + columns: + - {name: date_key, type: ref.dim_date} + - {name: company_id, type: ref.dim_company} + - {name: mrr, type: metric.mrr} + - {name: expansion, type: metric.expansion} + - {name: discount_pct, type: 
range, range: [0, 25]} + + - name: fct_support + metrics: [support_tickets, nps, churn_risk] + columns: + - {name: date_key, type: ref.dim_date} + - {name: company_id, type: ref.dim_company} + - {name: category_id, type: ref.dim_support_category} + - {name: support_tickets, type: metric.support_tickets} + - {name: nps, type: metric.nps} + - {name: churn_risk, type: metric.churn_risk} + +events: + + - name: evt_login + trigger: proportional + driver: engagement + scale: 10.0 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: company_id, type: ref.dim_company} + - {name: event_ts, type: timestamp} + + - name: evt_churn + trigger: threshold + metric: churn_risk + above: 0.7 + for: 2 + columns: + - {name: event_id, type: id} + - {name: date_key, type: ref.dim_date} + - {name: company_id, type: ref.dim_company} + - {name: reason, type: faker.sentence} + - {name: voluntary, type: flag} + + +# ── data quality ──────────────────────────────────────── +# Realistic SaaS data hygiene: missing engagement during onboarding +# blackout, duplicate login events from client retry, late-arriving +# billing reconciliation. +quality: + - { table: fct_engagement, issue: null_injection, rate: 0.03, column: engagement } + - { table: evt_login, issue: duplicate_rows, rate: 0.015 } + - { table: fct_revenue, issue: late_arrival, rate: 0.02 } diff --git a/plotsim/configs/templates/saas_template.py b/plotsim/configs/templates/saas_template.py index d1bb4e6..aa0fcb8 100644 --- a/plotsim/configs/templates/saas_template.py +++ b/plotsim/configs/templates/saas_template.py @@ -1,35 +1,35 @@ -"""B2B SaaS customer success — Python-shaped builder template. +"""SaaS template — Python form. -This is the ``create(**kwargs)`` mirror of ``saas_template.yaml`` — -both produce identical engine configs given the same seed. Pick -whichever surface fits your workflow: +Mirror of ``saas.yaml``. 
A B2B SaaS warehouse with SCD2 plan tier on +``dim_company``, sub-entity ``dim_user`` (4 users per company), +heteroscedastic noise on MRR, CDC audit on the revenue fact, and +opt-in denormalized output. -* ``saas_template.yaml`` for config-as-data fixtures checked into git -* this file for code-shaped configs that compose with regular Python - -The new builder dials (``noise``, ``output``, ``locale``, -``seasonality``, custom-coefficient ``connections``) are demonstrated -inline below; comments mark the pieces that match the YAML 1-1. +Run: + >>> from plotsim.configs.templates.saas_template import config + >>> from plotsim import generate_tables + >>> tables = generate_tables(config) """ from plotsim import create + config = create( - about="B2B SaaS customer success", + about="B2B SaaS customer success, engagement and revenue", unit="company", - seed=1729, # determinism - noise="perfectly_clean", # also: slightly_messy, realistic, dirty - # locale=["en_US", "en_GB"], # multi-locale faker mix - # 0.6-M15: opt-in denormalization (M14a) — see ``saas_template.yaml`` - # for the rationale. Each fact is left-joined with its FK'd dims; - # ``_wide.csv`` is emitted alongside the normalized output. 
+ seed=1729, + noise={ + "gaussian_sigma": 0.04, + "outlier_rate": 0.005, + "mcar_rate": 0.0, + "scale_with_trajectory": True, + }, output={"denormalized": True}, window=("2023-01", "2024-12", "monthly"), seasonality=[ - {"months": [11, 12], "strength": 0.30}, # Q4 lift - {"months": [6, 7, 8], "strength": -0.10}, # summer dip + {"months": [11, 12], "strength": 0.30}, + {"months": [6, 7, 8], "strength": -0.12}, ], - # ── what we measure ───────────────────────────────── metrics=[ { "name": "engagement", @@ -37,26 +37,31 @@ "type": "score", "polarity": "positive", }, + { + "name": "feature_adoption", + "label": "Feature adoption breadth", + "type": "score", + "polarity": "positive", + }, { "name": "mrr", "label": "Monthly recurring revenue", "type": "amount", "polarity": "positive", - "range": [100, 50000], + "range": [50, 50000], + }, + { + "name": "nps", + "label": "Net promoter score (0-100)", + "type": "amount", + "polarity": "positive", + "range": [0, 100], }, { "name": "support_tickets", - "label": "Support ticket volume", + "label": "Tickets per period", "type": "count", "polarity": "negative", - "follows": "engagement", - "delay": 2, - }, - { - "name": "feature_adoption", - "label": "Feature adoption rate", - "type": "score", - "polarity": "positive", }, { "name": "churn_risk", @@ -65,110 +70,99 @@ "polarity": "negative", }, { - "name": "nps", - "label": "Net promoter score", - "type": "index", + "name": "expansion", + "label": "Expansion likelihood", + "type": "score", "polarity": "positive", - "range": [-100, 100], + "follows": "engagement", + "delay": 2, }, ], - # ── how metrics connect ───────────────────────────── - # Mix of vocabulary words and explicit numeric coefficients — - # both forms parse into the same correlation matrix. Numeric - # form is for cases where you've calibrated r from real data. 
connections=[ - ("engagement", "driven_by", "mrr"), - ("engagement", "opposes", "churn_risk"), - ("support_tickets", "related", "churn_risk"), - ("feature_adoption", 0.42, "mrr"), # custom coefficient - ("nps", 0.18, "engagement"), # custom coefficient + "mrr driven_by engagement", + "expansion related feature_adoption", + "churn_risk opposes engagement", + "support_tickets -0.40 engagement", + "nps 0.55 engagement", ], - # ── who we're simulating ──────────────────────────── segments=[ { - "name": "promising_client", - "count": 20, - "archetype": "growth > spike_then_crash > flat @ 8 @ 16", - "label": "Strong start, lost champion at month 8, went dormant by 16", + "name": "rapid_adopter", + "count": 18, + "archetype": "growth", + "label": "Fast onboarding ramp, broad adoption inside the first quarter", "attributes": { - "industry": ["Technology", "Finance", "Healthcare"], - "region": ["US", "EMEA"], - "tier": "enterprise", + "industry": ["saas", "fintech"], + "region": ["na", "emea"], + "plan_tier": ["growth", "enterprise"], }, - "baseline": {"mrr": "high", "engagement": "high", "support_tickets": "low"}, + "baseline": {"engagement": "high", "feature_adoption": "high", "mrr": "high"}, }, { - "name": "steady_enterprise", - "count": 25, - "archetype": "growth", - "label": "Reliable accounts, steady climb", + "name": "steady_grower", + "count": 24, + "archetype": "accelerating", + "label": "Compounding seat expansion with steady MRR growth", "attributes": { - "industry": ["Technology", "Finance"], - "region": ["US", "APAC"], - "tier": "enterprise", + "industry": ["saas", "ecommerce", "education"], + "region": ["na", "emea", "apac"], + "plan_tier": ["starter", "growth"], }, - "baseline": {"mrr": "high", "engagement": "high", "support_tickets": "low"}, + "baseline": {"engagement": "mid", "mrr": "mid"}, }, { - "name": "slow_churn", - "count": 15, - "archetype": "flat > decline @ 12", - "label": "Coasted for a year, then quietly faded", + "name": "enterprise_steady", + 
"count": 14, + "archetype": "flat", + "label": "Mature enterprise accounts at stable high baseline", "attributes": { - "industry": ["Media", "Hospitality"], - "region": ["EMEA"], - "tier": "starter", + "industry": ["healthcare", "finance", "manufacturing"], + "region": ["na", "emea"], + "plan_tier": ["enterprise"], }, - "baseline": {"mrr": "low", "engagement": "low", "support_tickets": "high"}, + "baseline": {"engagement": "high", "mrr": "high", "nps": "high"}, }, { - "name": "seasonal_accounts", - "count": 15, - "archetype": "growth > seasonal @ 6", - "label": "Ramped up first 6 months, settled into quarterly cycles", + "name": "churning", + "count": 18, + "archetype": "flat > decline @ 10", + "label": "Coasted for the first year, then disengaged and churned", "attributes": { - "industry": ["Retail", "Manufacturing"], - "region": ["US"], - "tier": "growth", + "industry": ["retail", "ecommerce", "media"], + "region": ["na", "latam"], + "plan_tier": ["free", "starter"], }, - "baseline": {"mrr": "mid", "engagement": "mid", "support_tickets": "mid"}, + "baseline": {"engagement": "low", "churn_risk": "high", "support_tickets": "high"}, }, { - "name": "dormant", - "count": 10, - "archetype": "flat", - "label": "Signed up, never activated", + "name": "expansion_play", + "count": 12, + "archetype": "flat > growth @ 8", + "label": "Held flat, then expanded after a successful product evaluation", "attributes": { - "industry": ["Education"], - "region": ["APAC"], - "tier": "starter", + "industry": ["saas", "fintech"], + "region": ["na", "emea"], + "plan_tier": ["growth", "enterprise"], }, - "baseline": {"mrr": "low", "engagement": "low", "support_tickets": "low"}, + "baseline": {"engagement": "mid", "expansion": "high"}, }, { - "name": "turnaround", - "count": 10, - "archetype": "decline > flat > growth @ 6 @ 14", - "label": "Declining, hit bottom at month 6, turned around at 14", + "name": "at_risk", + "count": 14, + "archetype": "growth > spike_then_crash > flat @ 6 @ 12", 
+ "label": "Promising start, hit a wall around month 6, never recovered", "attributes": { - "industry": ["Finance", "Healthcare"], - "region": ["US"], - "tier": "growth", + "industry": ["media", "retail", "education"], + "region": ["latam", "apac"], + "plan_tier": ["starter", "growth"], }, - "baseline": {"mrr": "mid", "engagement": "mid", "support_tickets": "mid"}, + "baseline": {"engagement": "mid", "churn_risk": "high"}, }, ], - # ── lifecycle funnel ──────────────────────────────── lifecycle={ - "track": "churn_risk", - "stages": [ - ("onboarding", 0.0), - ("active", 0.2), - ("at_risk", 0.5), - ("churned", 0.8), - ], + "track": "engagement", + "stages": [{"trial": 0.0}, {"active": 0.2}, {"engaged": 0.5}, {"power_user": 0.75}], }, - # ── schema ────────────────────────────────────────── dimensions=[ { "name": "dim_date", @@ -187,70 +181,81 @@ "columns": [ {"name": "company_id", "type": "id"}, {"name": "company_name", "type": "faker.company"}, - {"name": "industry", "type": "faker.industry"}, - {"name": "founded_year", "type": "faker.year"}, + {"name": "signup_year", "type": "faker.year"}, {"name": "cohort_size", "type": "segment.count"}, + {"name": "industry", "type": "pool.industry"}, + {"name": "region", "type": "pool.region"}, { "name": "plan_tier", "type": "scd", "tracks": "mrr", - "tiers": ["starter", "growth", "enterprise"], - "at": [0.4, 0.7], + "tiers": ["free", "starter", "growth", "enterprise"], + "at": [0.25, 0.55, 0.8], }, ], }, { "name": "dim_user", "per": "unit", + "count": 4, "columns": [ {"name": "user_id", "type": "id"}, {"name": "company_id", "type": "ref.dim_company"}, {"name": "user_name", "type": "faker.name"}, - {"name": "role", "type": "static.member"}, + {"name": "user_email", "type": "faker.email"}, + {"name": "user_role", "type": "static.admin,member,member,member,viewer"}, ], }, { - "name": "dim_plan", + "name": "dim_support_category", "reference": True, "columns": [ - {"name": "plan_id", "type": "id"}, - {"name": "plan_name", "type": 
"static.starter"}, - {"name": "monthly_price", "type": "static.99.00"}, + {"name": "category_id", "type": "id"}, + { + "name": "category_name", + "type": "static.bug,billing,onboarding,integration,feature_request,access,outage,docs", + }, + { + "name": "severity_band", + "type": "static.high,medium,medium,low,low,medium,high,low", + }, ], }, ], facts=[ { "name": "fct_engagement", + "metrics": ["engagement", "feature_adoption"], "columns": [ {"name": "date_key", "type": "ref.dim_date"}, {"name": "company_id", "type": "ref.dim_company"}, - {"name": "engagement_score", "type": "metric.engagement"}, + {"name": "engagement", "type": "metric.engagement"}, {"name": "feature_adoption", "type": "metric.feature_adoption"}, - { - "name": "customer_sentiment", - "type": "bucket", - "labels": ["at_risk", "lukewarm", "satisfied", "delighted"], - }, + {"name": "active_seats", "type": "range", "range": [1, 200]}, ], }, { "name": "fct_revenue", + "metrics": ["mrr", "expansion"], + "cdc": True, "columns": [ {"name": "date_key", "type": "ref.dim_date"}, {"name": "company_id", "type": "ref.dim_company"}, - {"name": "plan_id", "type": "ref.dim_plan"}, {"name": "mrr", "type": "metric.mrr"}, + {"name": "expansion", "type": "metric.expansion"}, + {"name": "discount_pct", "type": "range", "range": [0, 25]}, ], }, { - "name": "fct_support_tickets", + "name": "fct_support", + "metrics": ["support_tickets", "nps", "churn_risk"], "columns": [ {"name": "date_key", "type": "ref.dim_date"}, {"name": "company_id", "type": "ref.dim_company"}, - {"name": "ticket_count", "type": "metric.support_tickets"}, - {"name": "churn_risk", "type": "metric.churn_risk"}, + {"name": "category_id", "type": "ref.dim_support_category"}, + {"name": "support_tickets", "type": "metric.support_tickets"}, {"name": "nps", "type": "metric.nps"}, + {"name": "churn_risk", "type": "metric.churn_risk"}, ], }, ], @@ -259,16 +264,10 @@ "name": "evt_login", "trigger": "proportional", "driver": "engagement", - "scale": 5, - # 
0.6-M15: log-file writer (M14b) demonstrated on the login - # event stream — see ``saas_template.yaml`` for parsing - # exercises that join the .log file back to the CSV. - "log_format": "{event_ts} INFO user={user_id} company={company_id} action=login", - "log_filename": "evt_login.log", + "scale": 10.0, "columns": [ {"name": "event_id", "type": "id"}, {"name": "date_key", "type": "ref.dim_date"}, - {"name": "user_id", "type": "ref.dim_user"}, {"name": "company_id", "type": "ref.dim_company"}, {"name": "event_ts", "type": "timestamp"}, ], @@ -278,23 +277,24 @@ "trigger": "threshold", "metric": "churn_risk", "above": 0.7, - "for_periods": 3, + "for": 2, "columns": [ {"name": "event_id", "type": "id"}, {"name": "date_key", "type": "ref.dim_date"}, {"name": "company_id", "type": "ref.dim_company"}, - {"name": "churn_reason", "type": "faker.sentence"}, - {"name": "churn_flag", "type": "flag"}, + {"name": "reason", "type": "faker.sentence"}, + {"name": "voluntary", "type": "flag"}, ], }, ], - # 0.6-M15: data-quality issues for Data Quality Testing (DE L25) - # and Data Cleaning (DE L15). Manifest records every injection so - # students can score detectors against ground truth. Issue - # placement is on event tables here (not facts) — see - # ``saas_template.yaml`` for the streaming-parquet rationale. quality=[ - {"table": "evt_churn", "issue": "null_injection", "rate": 0.05, "column": "churn_reason"}, - {"table": "evt_login", "issue": "duplicate_rows", "rate": 0.02}, + { + "table": "fct_engagement", + "issue": "null_injection", + "rate": 0.03, + "column": "engagement", + }, + {"table": "evt_login", "issue": "duplicate_rows", "rate": 0.015}, + {"table": "fct_revenue", "issue": "late_arrival", "rate": 0.02}, ], ) diff --git a/tests/configs/__init__.py b/tests/configs/__init__.py new file mode 100644 index 0000000..27d4b59 --- /dev/null +++ b/tests/configs/__init__.py @@ -0,0 +1,14 @@ +"""Test-vehicle configs. 
+ +YAML and Python builder configs that exercise a specific engine feature in +the test suite. These are NOT public builder templates — they live here so +``plotsim.list_templates()`` can keep its catalog scoped to domain templates +while the feature tests retain dedicated, hand-tuned fixtures. + +A test loads a YAML vehicle via ``create_from_yaml(CONFIGS_DIR / "<name>.yaml")`` +and a ``.py`` vehicle via ``from tests.configs.<name> import config``. +""" + +from pathlib import Path + +CONFIGS_DIR = Path(__file__).resolve().parent diff --git a/plotsim/configs/templates/ab_trial.py b/tests/configs/ab_trial.py similarity index 100% rename from plotsim/configs/templates/ab_trial.py rename to tests/configs/ab_trial.py diff --git a/plotsim/configs/templates/ab_trial.yaml b/tests/configs/ab_trial.yaml similarity index 100% rename from plotsim/configs/templates/ab_trial.yaml rename to tests/configs/ab_trial.yaml diff --git a/plotsim/configs/templates/bare_minimum.py b/tests/configs/bare_minimum.py similarity index 93% rename from plotsim/configs/templates/bare_minimum.py rename to tests/configs/bare_minimum.py index abfdeb5..93a0c28 100644 --- a/plotsim/configs/templates/bare_minimum.py +++ b/tests/configs/bare_minimum.py @@ -6,7 +6,7 @@ ``dim_{unit}``, ``fct_{unit}`` carrying every metric). 
Run: - >>> from plotsim.configs.templates.bare_minimum import config + >>> from tests.configs.bare_minimum import config >>> from plotsim import generate_tables >>> tables = generate_tables(config) """ diff --git a/plotsim/configs/templates/bare_minimum.yaml b/tests/configs/bare_minimum.yaml similarity index 96% rename from plotsim/configs/templates/bare_minimum.yaml rename to tests/configs/bare_minimum.yaml index fbfdba8..490f43f 100644 --- a/plotsim/configs/templates/bare_minimum.yaml +++ b/tests/configs/bare_minimum.yaml @@ -12,7 +12,7 @@ # Run: # >>> from plotsim import create_from_yaml, generate_tables # >>> import numpy as np -# >>> cfg = create_from_yaml("plotsim/configs/templates/bare_minimum.yaml") +# >>> cfg = create_from_yaml("tests/configs/bare_minimum.yaml") # >>> tables = generate_tables(cfg, np.random.default_rng(cfg.seed)) # # Auto-generated outputs (because the schema section is omitted): diff --git a/plotsim/configs/templates/cdc_demo.py b/tests/configs/cdc_demo.py similarity index 97% rename from plotsim/configs/templates/cdc_demo.py rename to tests/configs/cdc_demo.py index b44c39b..c1a2283 100644 --- a/plotsim/configs/templates/cdc_demo.py +++ b/tests/configs/cdc_demo.py @@ -6,7 +6,7 @@ quality layer corrupted. Run: - >>> from plotsim.configs.templates.cdc_demo import config + >>> from tests.configs.cdc_demo import config >>> from plotsim import generate_tables, write_tables >>> tables = generate_tables(config) >>> write_tables(tables, config, output_dir="./cdc_demo_output") diff --git a/plotsim/configs/templates/cdc_demo.yaml b/tests/configs/cdc_demo.yaml similarity index 98% rename from plotsim/configs/templates/cdc_demo.yaml rename to tests/configs/cdc_demo.yaml index f7a3c3c..9428757 100644 --- a/plotsim/configs/templates/cdc_demo.yaml +++ b/tests/configs/cdc_demo.yaml @@ -21,7 +21,7 @@ # Run: # >>> from plotsim import create_from_yaml, generate_tables, write_tables # >>> cfg = create_from_yaml( -# ... 
"plotsim/configs/templates/cdc_demo.yaml" +# ... "tests/configs/cdc_demo.yaml" # ... ) # >>> tables = generate_tables(cfg) # >>> write_tables(tables, cfg, output_dir="./cdc_demo_output") diff --git a/plotsim/configs/templates/crm_billing_overlap.py b/tests/configs/crm_billing_overlap.py similarity index 100% rename from plotsim/configs/templates/crm_billing_overlap.py rename to tests/configs/crm_billing_overlap.py diff --git a/plotsim/configs/templates/crm_billing_overlap.yaml b/tests/configs/crm_billing_overlap.yaml similarity index 100% rename from plotsim/configs/templates/crm_billing_overlap.yaml rename to tests/configs/crm_billing_overlap.yaml diff --git a/plotsim/configs/templates/geo_retail.py b/tests/configs/geo_retail.py similarity index 100% rename from plotsim/configs/templates/geo_retail.py rename to tests/configs/geo_retail.py diff --git a/plotsim/configs/templates/geo_retail.yaml b/tests/configs/geo_retail.yaml similarity index 100% rename from plotsim/configs/templates/geo_retail.yaml rename to tests/configs/geo_retail.yaml diff --git a/plotsim/configs/templates/lakehouse.py b/tests/configs/lakehouse.py similarity index 100% rename from plotsim/configs/templates/lakehouse.py rename to tests/configs/lakehouse.py diff --git a/plotsim/configs/templates/lakehouse.yaml b/tests/configs/lakehouse.yaml similarity index 100% rename from plotsim/configs/templates/lakehouse.yaml rename to tests/configs/lakehouse.yaml diff --git a/plotsim/configs/templates/latency_skew.py b/tests/configs/latency_skew.py similarity index 100% rename from plotsim/configs/templates/latency_skew.py rename to tests/configs/latency_skew.py diff --git a/plotsim/configs/templates/latency_skew.yaml b/tests/configs/latency_skew.yaml similarity index 100% rename from plotsim/configs/templates/latency_skew.yaml rename to tests/configs/latency_skew.yaml diff --git a/plotsim/configs/templates/narrative_reviews.py b/tests/configs/narrative_reviews.py similarity index 100% rename from 
plotsim/configs/templates/narrative_reviews.py rename to tests/configs/narrative_reviews.py diff --git a/plotsim/configs/templates/narrative_reviews.yaml b/tests/configs/narrative_reviews.yaml similarity index 100% rename from plotsim/configs/templates/narrative_reviews.yaml rename to tests/configs/narrative_reviews.yaml diff --git a/plotsim/configs/templates/orders_template.py b/tests/configs/orders_template.py similarity index 98% rename from plotsim/configs/templates/orders_template.py rename to tests/configs/orders_template.py index fc5cc4f..5d69e0b 100644 --- a/plotsim/configs/templates/orders_template.py +++ b/tests/configs/orders_template.py @@ -7,7 +7,7 @@ out 1..5 line items per order. Run: - >>> from plotsim.configs.templates.orders_template import config + >>> from tests.configs.orders_template import config >>> from plotsim import generate_tables, write_tables >>> tables = generate_tables(config) >>> write_tables(tables, config, output_dir="./orders_output") diff --git a/plotsim/configs/templates/orders_template.yaml b/tests/configs/orders_template.yaml similarity index 99% rename from plotsim/configs/templates/orders_template.yaml rename to tests/configs/orders_template.yaml index 0ae4e06..bcced90 100644 --- a/plotsim/configs/templates/orders_template.yaml +++ b/tests/configs/orders_template.yaml @@ -27,7 +27,7 @@ # Run: # >>> from plotsim import create_from_yaml, generate_tables, write_tables # >>> cfg = create_from_yaml( -# ... "plotsim/configs/templates/orders_template.yaml" +# ... "tests/configs/orders_template.yaml" # ... ) # >>> tables = generate_tables(cfg) # >>> write_tables(tables, cfg, output_dir="./orders_output") diff --git a/tests/configs/retail_template.py b/tests/configs/retail_template.py new file mode 100644 index 0000000..002dc45 --- /dev/null +++ b/tests/configs/retail_template.py @@ -0,0 +1,357 @@ +"""Retail / e-commerce customer analytics — Python builder template. + +Mirror of ``retail_template.yaml``. 
Demonstrates: + +* multi-locale faker (``locale=["en_US", "en_GB", "fr_FR"]``) +* Q4 holiday-shopping seasonality +* SCD2 ``customer_tier`` tracking ``loyalty_score`` +* threshold event with ``below`` (churn fires when score crashes) +""" + +from plotsim import create + +config = create( + about="Retail customer purchase and loyalty behavior", + unit="customer", + seed=90210, + noise="realistic", + # output={"format": "parquet", "directory": "./out"}, # uncomment if pyarrow installed + locale=["en_US", "en_GB", "fr_FR"], + window=("2023-01", "2024-12", "monthly"), + seasonality=[ + {"months": [11, 12], "strength": 0.45}, + {"months": [7, 8], "strength": -0.15}, + ], + metrics=[ + { + "name": "sessions", + "label": "Monthly site sessions", + "type": "count", + "polarity": "positive", + }, + { + "name": "cart_value", + "label": "Average cart value", + "type": "amount", + "polarity": "positive", + "range": [10, 2000], + }, + { + "name": "conversion_rate", + "label": "Session-to-purchase conversion", + "type": "score", + "polarity": "positive", + }, + { + "name": "return_rate", + "label": "Purchase return rate", + "type": "score", + "polarity": "negative", + }, + { + "name": "loyalty_score", + "label": "Customer loyalty index", + "type": "score", + "polarity": "positive", + }, + { + "name": "repeat_purchase_rate", + "label": "Repeat purchase rate", + "type": "score", + "polarity": "positive", + "follows": "loyalty_score", + "delay": 1, + }, + { + "name": "nps", + "label": "Net promoter score", + "type": "index", + "polarity": "positive", + "range": [-100, 100], + }, + ], + connections=[ + ("conversion_rate", "driven_by", "loyalty_score"), + ("cart_value", "related", "loyalty_score"), + ("return_rate", "opposes", "loyalty_score"), + ("repeat_purchase_rate", "driven_by", "conversion_rate"), + ("nps", "related", "loyalty_score"), + ], + segments=[ + { + "name": "loyal_climbers", + "count": 25, + "archetype": "growth", + "label": "Builds loyalty steadily across both 
years", + "attributes": { + "tier": ["gold", "platinum"], + "channel": ["web", "mobile"], + "churn_reason": [ + "account_dormant", + "low_engagement", + "payment_failure", + "service_interruption", + ], + }, + "baseline": {"loyalty_score": "high", "cart_value": "high", "return_rate": "low"}, + }, + { + "name": "holiday_shoppers", + "count": 30, + "archetype": "seasonal", + "label": "Cyclical demand around holidays — Q4 surges", + "attributes": { + "tier": ["silver", "gold"], + "channel": ["web", "mobile", "marketplace"], + "churn_reason": [ + "account_dormant", + "low_engagement", + "payment_failure", + "service_interruption", + ], + }, + "baseline": {"cart_value": "mid", "conversion_rate": "mid"}, + }, + { + "name": "cooled_off", + "count": 18, + "archetype": "flat > decline @ 12", + "label": "Active first year, gradually disengaged in year two", + "attributes": { + "tier": ["bronze", "silver"], + "channel": ["marketplace"], + "churn_reason": [ + "account_dormant", + "low_engagement", + "payment_failure", + "service_interruption", + ], + }, + "baseline": {"loyalty_score": "low", "return_rate": "high"}, + }, + { + "name": "one_and_done", + "count": 15, + "archetype": "growth > spike_then_crash > flat @ 4 @ 8", + "label": "Tested the brand for a few months, then never returned", + "attributes": { + "tier": ["bronze"], + "channel": ["web"], + "churn_reason": [ + "account_dormant", + "low_engagement", + "payment_failure", + "service_interruption", + ], + }, + "baseline": {"loyalty_score": "low", "cart_value": "low"}, + }, + { + "name": "winback", + "count": 12, + "archetype": "decline > flat > growth @ 6 @ 14", + "label": "Churned, then reactivated by year-two campaign", + "attributes": { + "tier": ["silver"], + "channel": ["email", "web"], + "churn_reason": [ + "account_dormant", + "low_engagement", + "payment_failure", + "service_interruption", + ], + }, + "baseline": {"loyalty_score": "mid", "conversion_rate": "mid"}, + }, + { + "name": "escalating_basket", + 
"count": 10, + "archetype": "accelerating", + "label": "Compounding cart values as trust builds", + "attributes": { + "tier": ["gold", "platinum"], + "channel": ["web"], + "churn_reason": [ + "account_dormant", + "low_engagement", + "payment_failure", + "service_interruption", + ], + }, + "baseline": {"cart_value": "high", "loyalty_score": "high"}, + }, + ], + lifecycle={ + "track": "loyalty_score", + "stages": [ + ("new", 0.0), + ("casual", 0.2), + ("regular", 0.5), + ("loyal", 0.75), + ("champion", 0.9), + ], + }, + dimensions=[ + { + "name": "dim_date", + "per": "period", + "columns": [ + {"name": "date_key", "type": "id"}, + {"name": "date", "type": "date"}, + {"name": "year", "type": "int"}, + {"name": "month", "type": "int"}, + {"name": "quarter", "type": "int"}, + ], + }, + { + "name": "dim_customer", + "per": "unit", + "columns": [ + {"name": "customer_id", "type": "id"}, + {"name": "customer_name", "type": "faker.name"}, + {"name": "signup_year", "type": "faker.year"}, + {"name": "cohort_size", "type": "segment.count"}, + { + "name": "customer_tier", + "type": "scd", + "tracks": "loyalty_score", + "tiers": ["browser", "casual", "loyal"], + "at": [0.3, 0.7], + }, + ], + }, + { + "name": "dim_product_category", + "reference": True, + "columns": [ + {"name": "category_id", "type": "id"}, + {"name": "category_name", "type": "static.electronics,apparel,home,grocery,beauty"}, + {"name": "margin_tier", "type": "static.high,standard,standard,low,high"}, + # 0.6-M15: nested struct column (M14c) — see + # ``retail_template.yaml`` for the Semi-Structured + # Flattening (DE L12) exercise rationale. 
+ { + "name": "catalog_metadata", + "type": "struct", + "nested_schema": { + "aisle": "string", + "seasonality": "string", + "avg_basket_position": "int", + }, + }, + ], + }, + { + "name": "dim_channel", + "reference": True, + "columns": [ + {"name": "channel_id", "type": "id"}, + {"name": "channel_name", "type": "static.web,mobile,marketplace,email,store"}, + { + "name": "channel_type", + "type": "static.digital,digital,third_party,owned,physical", + }, + ], + }, + { + "name": "dim_promotion", + "reference": True, + "columns": [ + {"name": "promotion_id", "type": "id"}, + { + "name": "promo_name", + "type": "static.clearance,seasonal_sale,loyalty_reward,flash_sale", + }, + {"name": "discount_type", "type": "static.percentage,percentage,points,percentage"}, + ], + }, + ], + facts=[ + { + "name": "fct_sessions", + "metrics": ["sessions", "conversion_rate"], + "columns": [ + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "channel_id", "type": "ref.dim_channel"}, + {"name": "session_count", "type": "metric.sessions"}, + {"name": "conversion_rate", "type": "metric.conversion_rate"}, + { + "name": "shopping_intent", + "type": "bucket", + "labels": ["browsing", "comparing", "purchasing", "loyal_repeat"], + }, + ], + }, + { + "name": "fct_purchases", + "metrics": ["cart_value", "return_rate", "loyalty_score", "repeat_purchase_rate"], + # 0.6-M15: CDC fact-side (M9c) — every row carries + # _inserted_at / _updated_at / _op audit columns. Column- + # level quality injections (see ``quality=`` below) flip + # _op to "U" on touched rows. See ``retail_template.yaml``. 
+ "cdc": True, + "columns": [ + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "category_id", "type": "ref.dim_product_category"}, + {"name": "promotion_id", "type": "ref.dim_promotion"}, + {"name": "cart_value", "type": "metric.cart_value"}, + {"name": "return_rate", "type": "metric.return_rate"}, + {"name": "loyalty_score", "type": "metric.loyalty_score"}, + {"name": "repeat_purchase_rate", "type": "metric.repeat_purchase_rate"}, + ], + }, + { + "name": "fct_satisfaction", + "metrics": ["nps"], + "columns": [ + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "nps", "type": "metric.nps"}, + ], + }, + ], + events=[ + { + "name": "evt_purchase", + "trigger": "proportional", + "driver": "conversion_rate", + "scale": 6.0, + "columns": [ + {"name": "event_id", "type": "id"}, + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "event_ts", "type": "timestamp"}, + ], + }, + { + "name": "evt_churn", + "trigger": "threshold", + "metric": "loyalty_score", + "below": 0.15, + "for_periods": 4, + "columns": [ + {"name": "event_id", "type": "id"}, + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "customer_id", "type": "ref.dim_customer"}, + {"name": "reason", "type": "pool.churn_reason"}, + {"name": "churn_flag", "type": "flag"}, + ], + }, + ], + # 0.6-M15: data-quality issues — see ``retail_template.yaml`` for + # the Data Quality Testing (DE L25), Data Cleaning (DE L15), and + # Data Observability (DE L28) rationale. The volume_anomaly spike + # at period 18 is the canonical observability scenario. 
+ quality=[ + {"table": "fct_purchases", "issue": "null_injection", "rate": 0.03, "column": "cart_value"}, + { + "table": "fct_sessions", + "issue": "volume_anomaly", + "rate": 0.5, + "mode": "spike", + "period": 18, + }, + ], +) diff --git a/plotsim/configs/templates/retail_template.yaml b/tests/configs/retail_template.yaml similarity index 100% rename from plotsim/configs/templates/retail_template.yaml rename to tests/configs/retail_template.yaml diff --git a/tests/configs/saas_template.py b/tests/configs/saas_template.py new file mode 100644 index 0000000..d1bb4e6 --- /dev/null +++ b/tests/configs/saas_template.py @@ -0,0 +1,300 @@ +"""B2B SaaS customer success — Python-shaped builder template. + +This is the ``create(**kwargs)`` mirror of ``saas_template.yaml`` — +both produce identical engine configs given the same seed. Pick +whichever surface fits your workflow: + +* ``saas_template.yaml`` for config-as-data fixtures checked into git +* this file for code-shaped configs that compose with regular Python + +The new builder dials (``noise``, ``output``, ``locale``, +``seasonality``, custom-coefficient ``connections``) are demonstrated +inline below; comments mark the pieces that match the YAML 1-1. +""" + +from plotsim import create + +config = create( + about="B2B SaaS customer success", + unit="company", + seed=1729, # determinism + noise="perfectly_clean", # also: slightly_messy, realistic, dirty + # locale=["en_US", "en_GB"], # multi-locale faker mix + # 0.6-M15: opt-in denormalization (M14a) — see ``saas_template.yaml`` + # for the rationale. Each fact is left-joined with its FK'd dims; + # ``_wide.csv`` is emitted alongside the normalized output. 
+ output={"denormalized": True}, + window=("2023-01", "2024-12", "monthly"), + seasonality=[ + {"months": [11, 12], "strength": 0.30}, # Q4 lift + {"months": [6, 7, 8], "strength": -0.10}, # summer dip + ], + # ── what we measure ───────────────────────────────── + metrics=[ + { + "name": "engagement", + "label": "Product engagement", + "type": "score", + "polarity": "positive", + }, + { + "name": "mrr", + "label": "Monthly recurring revenue", + "type": "amount", + "polarity": "positive", + "range": [100, 50000], + }, + { + "name": "support_tickets", + "label": "Support ticket volume", + "type": "count", + "polarity": "negative", + "follows": "engagement", + "delay": 2, + }, + { + "name": "feature_adoption", + "label": "Feature adoption rate", + "type": "score", + "polarity": "positive", + }, + { + "name": "churn_risk", + "label": "Churn risk score", + "type": "score", + "polarity": "negative", + }, + { + "name": "nps", + "label": "Net promoter score", + "type": "index", + "polarity": "positive", + "range": [-100, 100], + }, + ], + # ── how metrics connect ───────────────────────────── + # Mix of vocabulary words and explicit numeric coefficients — + # both forms parse into the same correlation matrix. Numeric + # form is for cases where you've calibrated r from real data. 
+ connections=[ + ("engagement", "driven_by", "mrr"), + ("engagement", "opposes", "churn_risk"), + ("support_tickets", "related", "churn_risk"), + ("feature_adoption", 0.42, "mrr"), # custom coefficient + ("nps", 0.18, "engagement"), # custom coefficient + ], + # ── who we're simulating ──────────────────────────── + segments=[ + { + "name": "promising_client", + "count": 20, + "archetype": "growth > spike_then_crash > flat @ 8 @ 16", + "label": "Strong start, lost champion at month 8, went dormant by 16", + "attributes": { + "industry": ["Technology", "Finance", "Healthcare"], + "region": ["US", "EMEA"], + "tier": "enterprise", + }, + "baseline": {"mrr": "high", "engagement": "high", "support_tickets": "low"}, + }, + { + "name": "steady_enterprise", + "count": 25, + "archetype": "growth", + "label": "Reliable accounts, steady climb", + "attributes": { + "industry": ["Technology", "Finance"], + "region": ["US", "APAC"], + "tier": "enterprise", + }, + "baseline": {"mrr": "high", "engagement": "high", "support_tickets": "low"}, + }, + { + "name": "slow_churn", + "count": 15, + "archetype": "flat > decline @ 12", + "label": "Coasted for a year, then quietly faded", + "attributes": { + "industry": ["Media", "Hospitality"], + "region": ["EMEA"], + "tier": "starter", + }, + "baseline": {"mrr": "low", "engagement": "low", "support_tickets": "high"}, + }, + { + "name": "seasonal_accounts", + "count": 15, + "archetype": "growth > seasonal @ 6", + "label": "Ramped up first 6 months, settled into quarterly cycles", + "attributes": { + "industry": ["Retail", "Manufacturing"], + "region": ["US"], + "tier": "growth", + }, + "baseline": {"mrr": "mid", "engagement": "mid", "support_tickets": "mid"}, + }, + { + "name": "dormant", + "count": 10, + "archetype": "flat", + "label": "Signed up, never activated", + "attributes": { + "industry": ["Education"], + "region": ["APAC"], + "tier": "starter", + }, + "baseline": {"mrr": "low", "engagement": "low", "support_tickets": "low"}, + }, 
+ { + "name": "turnaround", + "count": 10, + "archetype": "decline > flat > growth @ 6 @ 14", + "label": "Declining, hit bottom at month 6, turned around at 14", + "attributes": { + "industry": ["Finance", "Healthcare"], + "region": ["US"], + "tier": "growth", + }, + "baseline": {"mrr": "mid", "engagement": "mid", "support_tickets": "mid"}, + }, + ], + # ── lifecycle funnel ──────────────────────────────── + lifecycle={ + "track": "churn_risk", + "stages": [ + ("onboarding", 0.0), + ("active", 0.2), + ("at_risk", 0.5), + ("churned", 0.8), + ], + }, + # ── schema ────────────────────────────────────────── + dimensions=[ + { + "name": "dim_date", + "per": "period", + "columns": [ + {"name": "date_key", "type": "id"}, + {"name": "date", "type": "date"}, + {"name": "year", "type": "int"}, + {"name": "month", "type": "int"}, + {"name": "quarter", "type": "int"}, + ], + }, + { + "name": "dim_company", + "per": "unit", + "columns": [ + {"name": "company_id", "type": "id"}, + {"name": "company_name", "type": "faker.company"}, + {"name": "industry", "type": "faker.industry"}, + {"name": "founded_year", "type": "faker.year"}, + {"name": "cohort_size", "type": "segment.count"}, + { + "name": "plan_tier", + "type": "scd", + "tracks": "mrr", + "tiers": ["starter", "growth", "enterprise"], + "at": [0.4, 0.7], + }, + ], + }, + { + "name": "dim_user", + "per": "unit", + "columns": [ + {"name": "user_id", "type": "id"}, + {"name": "company_id", "type": "ref.dim_company"}, + {"name": "user_name", "type": "faker.name"}, + {"name": "role", "type": "static.member"}, + ], + }, + { + "name": "dim_plan", + "reference": True, + "columns": [ + {"name": "plan_id", "type": "id"}, + {"name": "plan_name", "type": "static.starter"}, + {"name": "monthly_price", "type": "static.99.00"}, + ], + }, + ], + facts=[ + { + "name": "fct_engagement", + "columns": [ + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "company_id", "type": "ref.dim_company"}, + {"name": "engagement_score", "type": 
"metric.engagement"}, + {"name": "feature_adoption", "type": "metric.feature_adoption"}, + { + "name": "customer_sentiment", + "type": "bucket", + "labels": ["at_risk", "lukewarm", "satisfied", "delighted"], + }, + ], + }, + { + "name": "fct_revenue", + "columns": [ + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "company_id", "type": "ref.dim_company"}, + {"name": "plan_id", "type": "ref.dim_plan"}, + {"name": "mrr", "type": "metric.mrr"}, + ], + }, + { + "name": "fct_support_tickets", + "columns": [ + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "company_id", "type": "ref.dim_company"}, + {"name": "ticket_count", "type": "metric.support_tickets"}, + {"name": "churn_risk", "type": "metric.churn_risk"}, + {"name": "nps", "type": "metric.nps"}, + ], + }, + ], + events=[ + { + "name": "evt_login", + "trigger": "proportional", + "driver": "engagement", + "scale": 5, + # 0.6-M15: log-file writer (M14b) demonstrated on the login + # event stream — see ``saas_template.yaml`` for parsing + # exercises that join the .log file back to the CSV. + "log_format": "{event_ts} INFO user={user_id} company={company_id} action=login", + "log_filename": "evt_login.log", + "columns": [ + {"name": "event_id", "type": "id"}, + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "user_id", "type": "ref.dim_user"}, + {"name": "company_id", "type": "ref.dim_company"}, + {"name": "event_ts", "type": "timestamp"}, + ], + }, + { + "name": "evt_churn", + "trigger": "threshold", + "metric": "churn_risk", + "above": 0.7, + "for_periods": 3, + "columns": [ + {"name": "event_id", "type": "id"}, + {"name": "date_key", "type": "ref.dim_date"}, + {"name": "company_id", "type": "ref.dim_company"}, + {"name": "churn_reason", "type": "faker.sentence"}, + {"name": "churn_flag", "type": "flag"}, + ], + }, + ], + # 0.6-M15: data-quality issues for Data Quality Testing (DE L25) + # and Data Cleaning (DE L15). 
Manifest records every injection so + # students can score detectors against ground truth. Issue + # placement is on event tables here (not facts) — see + # ``saas_template.yaml`` for the streaming-parquet rationale. + quality=[ + {"table": "evt_churn", "issue": "null_injection", "rate": 0.05, "column": "churn_reason"}, + {"table": "evt_login", "issue": "duplicate_rows", "rate": 0.02}, + ], +) diff --git a/plotsim/configs/templates/saas_template.yaml b/tests/configs/saas_template.yaml similarity index 100% rename from plotsim/configs/templates/saas_template.yaml rename to tests/configs/saas_template.yaml diff --git a/tests/test_builder_input.py b/tests/test_builder_input.py index b86366e..855355f 100644 --- a/tests/test_builder_input.py +++ b/tests/test_builder_input.py @@ -688,7 +688,7 @@ def test_yaml_reference_template_loads_as_userinput(): from pathlib import Path repo_root = Path(__file__).resolve().parent.parent - template = repo_root / "plotsim" / "configs" / "templates" / "saas_template.yaml" + template = repo_root / "plotsim" / "configs" / "templates" / "saas.yaml" raw = yaml.safe_load(template.read_text(encoding="utf-8")) # YAML parses date-like strings (2023-01) into date objects in some # codepaths; coerce window strings back to ISO month form for diff --git a/tests/test_builder_integration.py b/tests/test_builder_integration.py index 66cc0f0..f953f9f 100644 --- a/tests/test_builder_integration.py +++ b/tests/test_builder_integration.py @@ -27,8 +27,8 @@ REPO_ROOT = Path(__file__).resolve().parent.parent -TEMPLATE_YAML = REPO_ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml" -TEMPLATE_PY = REPO_ROOT / "plotsim" / "configs" / "templates" / "saas_template.py" +TEMPLATE_YAML = REPO_ROOT / "tests" / "configs" / "saas_template.yaml" +TEMPLATE_PY = REPO_ROOT / "tests" / "configs" / "saas_template.py" # ── Helpers ───────────────────────────────────────────────────────────────── diff --git a/tests/test_builder_schema.py 
b/tests/test_builder_schema.py index c5ff0c3..d948f2a 100644 --- a/tests/test_builder_schema.py +++ b/tests/test_builder_schema.py @@ -38,8 +38,8 @@ ) -SAAS_TEMPLATE = Path("plotsim/configs/templates/saas_template.yaml") -BARE_TEMPLATE = Path("plotsim/configs/templates/bare_minimum.yaml") +SAAS_TEMPLATE = Path("plotsim/configs/templates/saas.yaml") +BARE_TEMPLATE = Path("tests/configs/bare_minimum.yaml") # ── Schema export shape ──────────────────────────────────────────────────── diff --git a/tests/test_bypass_observability.py b/tests/test_bypass_observability.py index 789e196..261638e 100644 --- a/tests/test_bypass_observability.py +++ b/tests/test_bypass_observability.py @@ -80,7 +80,7 @@ def test_keys_are_archetype_names(self): # so the keys-are-archetype-names invariant is vacuously true. # Kept as a regression guard against the field gaining un- # validated keys in a future change. - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg = cfg.model_copy(update={"generation_mode": "vectorized"}) m = _build_manifest_for(cfg) archetype_names = {e.archetype for e in cfg.entities} @@ -89,7 +89,7 @@ def test_keys_are_archetype_names(self): def test_counts_are_nonnegative_ints(self): # M127b: vacuously true on an empty dict; kept as a stable shape # invariant for any future repopulation. 
- cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg = cfg.model_copy(update={"generation_mode": "vectorized"}) m = _build_manifest_for(cfg) for arch, count in m.bypass_fallback_counts.items(): @@ -343,7 +343,7 @@ class TestManifestRoundTrip: the new model via the field defaults).""" def test_round_trip_preserves_bypass_counts(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg = cfg.model_copy(update={"generation_mode": "vectorized"}) m = _build_manifest_for(cfg) from plotsim.manifest import ManifestSchema diff --git a/tests/test_cdc_facts.py b/tests/test_cdc_facts.py index 0085906..641c9cf 100644 --- a/tests/test_cdc_facts.py +++ b/tests/test_cdc_facts.py @@ -29,7 +29,7 @@ columns. 6. **Builder passthrough** — ``FactInput.cdc`` routes onto engine ``Table.cdc``. - 7. **Bundled template** — ``plotsim.load_template("cdc_demo")`` + 7. **Bundled config** — the ``tests/configs/cdc_demo.yaml`` vehicle produces a working CDC config end-to-end. 8. **Determinism** — same ``(config, seed)`` produces the same ``_op`` sequence across runs. @@ -46,15 +46,15 @@ import pytest from pydantic import ValidationError -import plotsim from plotsim import ( build_manifest, create, + create_from_yaml, generate_tables_with_state, - load_template, write_tables, ) from plotsim.config import PlotsimConfig +from tests.configs import CONFIGS_DIR # --- 1. Config-level validation -------------------------------------------- @@ -479,15 +479,11 @@ def test_builder_fact_input_cdc_default_false(): assert fact.cdc is False -# --- 7. Bundled template --------------------------------------------------- +# --- 7. 
Bundled config ----------------------------------------------------- -def test_cdc_demo_template_in_list_templates(): - assert "cdc_demo" in plotsim.list_templates() - - -def test_cdc_demo_template_loads_and_generates(tmp_path): - cfg = load_template("cdc_demo") +def test_cdc_demo_config_loads_and_generates(tmp_path): + cfg = create_from_yaml(CONFIGS_DIR / "cdc_demo.yaml") assert isinstance(cfg, PlotsimConfig) fact = next(t for t in cfg.tables if t.name == "fct_billing") assert fact.cdc is True diff --git a/tests/test_cli.py b/tests/test_cli.py index 95bb58c..41b5ff7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -565,7 +565,7 @@ def test_cli_relative_output_dir_unchanged_behavior(tmp_path: Path, monkeypatch) # --- M124: builder-YAML dispatch in CLI commands ---------------------------- BUILDER_DIR = ROOT / "plotsim" / "configs" / "templates" -BUILDER_SAAS_YAML = BUILDER_DIR / "saas_template.yaml" +BUILDER_SAAS_YAML = BUILDER_DIR / "saas.yaml" def test_list_builder_templates_finds_directory(): @@ -574,7 +574,6 @@ def test_list_builder_templates_finds_directory(): names = {name for name, _ in builder} # Every YAML in plotsim/configs/templates/ should appear; saas is the canonical. assert "saas" in names - assert "bare_minimum" in names for _name, path in builder: assert path.exists() assert path.suffix == ".yaml" @@ -658,13 +657,13 @@ def test_is_builder_yaml_handles_malformed_input(tmp_path: Path): def test_find_template_resolves_builder_when_no_engine_match(): - """M124: ``find_template('bare_minimum')`` finds the builder template - when no engine-direct match exists. Engine-direct still wins for - overlapping names like ``saas``. + """M124: ``find_template('banking')`` finds the builder template when + no engine-direct match exists (no ``sample_banking.yaml``). Engine- + direct still wins for overlapping names like ``saas``. 
""" - bare = cli.find_template("bare_minimum") - assert bare is not None - assert bare.parent.name == "templates" + banking = cli.find_template("banking") + assert banking is not None + assert banking.parent.name == "templates" saas = cli.find_template("saas") assert saas is not None # Engine-direct precedence: ``saas`` resolves to ``sample_saas.yaml``. diff --git a/tests/test_entity_expansion.py b/tests/test_entity_expansion.py index 93d16eb..8abada1 100644 --- a/tests/test_entity_expansion.py +++ b/tests/test_entity_expansion.py @@ -37,7 +37,7 @@ REPO_ROOT = Path(__file__).resolve().parent.parent -TEMPLATE_YAML = REPO_ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml" +TEMPLATE_YAML = REPO_ROOT / "tests" / "configs" / "saas_template.yaml" BUNDLED_TEMPLATES = ( "sample_education.yaml", "sample_hr.yaml", diff --git a/tests/test_fk_target_resolution.py b/tests/test_fk_target_resolution.py index a21cc98..8275788 100644 --- a/tests/test_fk_target_resolution.py +++ b/tests/test_fk_target_resolution.py @@ -291,7 +291,7 @@ def _orders_template_parquet_partitioned(tmp_path: Path) -> PlotsimConfig: The template's ``fct_orders`` declares ``order_date`` (ref.dim_date) rather than a literal ``date_key`` column — exactly the case the FK-target resolution unlocks.""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "orders_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "orders_template.yaml") return cfg.model_copy( update={ "output": cfg.output.model_copy( @@ -367,7 +367,7 @@ def test_table_without_date_fk_stays_single_file(self, tmp_path): def test_validator_rejects_unmatched_partition_key(self): """Pre- and post-fix: a partition_by with no literal match AND no FK target match still raises at load.""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "orders_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "orders_template.yaml") payload = cfg.model_dump() 
payload["output"]["format"] = "parquet" payload["output"]["partition_by"] = "definitely_nonexistent_col" diff --git a/tests/test_geo_provider.py b/tests/test_geo_provider.py index 01d3b1a..a0bd8ed 100644 --- a/tests/test_geo_provider.py +++ b/tests/test_geo_provider.py @@ -13,8 +13,8 @@ - Unknown ``geo.`` strings are rejected with a clear message. - Builder shortcut ``geo.`` translates to the correct engine source + dtype. - - Bundled ``geo_retail`` template loads, runs end-to-end, and produces - a coherent dim_store frame. + - The ``tests/configs/geo_retail.yaml`` vehicle loads, runs end-to-end, + and produces a coherent dim_store frame. """ from __future__ import annotations @@ -24,8 +24,9 @@ import pytest import plotsim -from plotsim import generate_tables, list_templates, load_template +from plotsim import create_from_yaml, generate_tables from plotsim.builder import create +from tests.configs import CONFIGS_DIR from plotsim.config import Column, Entity, Table from plotsim.data import GEO_BUNDLE_FIELDS, GEO_LOCATIONS from plotsim.dimensions import ( @@ -435,23 +436,19 @@ def test_builder_geo_unknown_field_rejected_at_interpret_time(): _builder_dim_with_geo([("planet", "geo.planet")]) -# ── Bundled geo_retail template ──────────────────────────────────────── +# ── geo_retail test-vehicle config ───────────────────────────────────── -def test_geo_retail_template_in_list(): - assert "geo_retail" in list_templates() - - -def test_geo_retail_template_loads(): - cfg = load_template("geo_retail") +def test_geo_retail_config_loads(): + cfg = create_from_yaml(CONFIGS_DIR / "geo_retail.yaml") table_names = [t.name for t in cfg.tables] assert "dim_store" in table_names assert "fct_footfall" in table_names assert "fct_sales" in table_names -def test_geo_retail_template_runs_end_to_end_with_zero_mismatches(): - cfg = load_template("geo_retail") +def test_geo_retail_config_runs_end_to_end_with_zero_mismatches(): + cfg = create_from_yaml(CONFIGS_DIR / "geo_retail.yaml") 
tables = generate_tables(cfg) dim_store = tables["dim_store"] assert len(dim_store) == 40 # 12 flagship + 28 standard @@ -465,9 +462,9 @@ def test_geo_retail_template_runs_end_to_end_with_zero_mismatches(): assert expected["longitude"] == pytest.approx(row["longitude"]) -def test_geo_retail_template_deterministic_under_seed(): - a = generate_tables(load_template("geo_retail")) - b = generate_tables(load_template("geo_retail")) +def test_geo_retail_config_deterministic_under_seed(): + a = generate_tables(create_from_yaml(CONFIGS_DIR / "geo_retail.yaml")) + b = generate_tables(create_from_yaml(CONFIGS_DIR / "geo_retail.yaml")) pd.testing.assert_frame_equal(a["dim_store"], b["dim_store"]) pd.testing.assert_frame_equal( a["fct_footfall"].reset_index(drop=True), diff --git a/tests/test_jsonl_output.py b/tests/test_jsonl_output.py index f8052e6..fe211a6 100644 --- a/tests/test_jsonl_output.py +++ b/tests/test_jsonl_output.py @@ -39,7 +39,7 @@ def _saas_jsonl_config(tmp_path: Path): """Load the saas template and switch it to jsonl output.""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") return cfg.model_copy( update={ "output": cfg.output.model_copy(update={"format": "jsonl", "directory": str(tmp_path)}), @@ -132,7 +132,7 @@ def test_row_count_matches_csv_baseline(self, tmp_path): the same config — both formats consume the same post-CDC / post-quality ``tables_to_write`` dict, so any drift is a writer bug. 
Compares CSV minus header against JSONL line count.""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") csv_dir = tmp_path / "csv" jsonl_dir = tmp_path / "jsonl" csv_dir.mkdir() @@ -292,7 +292,7 @@ class TestNestedColumns: can't carry nested types natively).""" def test_struct_column_serialises_as_object(self, tmp_path): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "retail_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "retail_template.yaml") cfg_j = cfg.model_copy( update={ "output": cfg.output.model_copy( @@ -386,7 +386,7 @@ def test_unknown_format_rejected(self): OutputConfig(format="ndjson", directory="out") def test_resolve_output_format_returns_jsonl(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg_j = cfg.model_copy( update={"output": cfg.output.model_copy(update={"format": "jsonl"})}, ) @@ -432,7 +432,7 @@ class TestCsvUnchanged: config with the field omitted, run after run.""" def test_csv_output_byte_identical(self, tmp_path): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") a = tmp_path / "run_a" b = tmp_path / "run_b" a.mkdir() diff --git a/tests/test_multi_source.py b/tests/test_multi_source.py index 4996952..411e1c7 100644 --- a/tests/test_multi_source.py +++ b/tests/test_multi_source.py @@ -10,11 +10,12 @@ (entity, source) with all drifted fields listed. 6. Single-source (no multi_source block) configs unchanged. 7. Deterministic under seed. - 8. New template passes ``plotsim run`` / ``plotsim validate``. + 8. The vehicle config passes ``plotsim run`` / ``plotsim validate``. 
-The 8th criterion is exercised against the bundled -``crm_billing_overlap`` template; the rest run against minimal -hand-rolled configs so each AC has a dedicated, narrow assertion. +The 8th criterion is exercised against the +``tests/configs/crm_billing_overlap.yaml`` vehicle; the rest run +against minimal hand-rolled configs so each AC has a dedicated, +narrow assertion. """ from __future__ import annotations @@ -383,13 +384,15 @@ def _build(): assert cfg1._source_entity_mappings == cfg2._source_entity_mappings -# ── AC8: bundled template passes plotsim run + validate ─────────────────── +# ── AC8: vehicle config passes plotsim run + validate ──────────────────── -def test_bundled_template_loads_and_validates(tmp_path: Path): +def test_bundled_vehicle_loads_and_validates(tmp_path: Path): + from plotsim import create_from_yaml from plotsim.tables import generate_tables_with_state + from tests.configs import CONFIGS_DIR - cfg = plotsim.load_template("crm_billing_overlap") + cfg = create_from_yaml(CONFIGS_DIR / "crm_billing_overlap.yaml") rng = np.random.default_rng(cfg.seed) tables, gen_state = generate_tables_with_state(cfg, rng) report = plotsim.validate(cfg, tables) diff --git a/tests/test_narrative_source.py b/tests/test_narrative_source.py index 2f3d1df..0aabe2a 100644 --- a/tests/test_narrative_source.py +++ b/tests/test_narrative_source.py @@ -19,7 +19,7 @@ * Classifier accuracy — hand-rolled multinomial naive Bayes on entity-split bag-of-words, accuracy ≥ 0.55 on held-out entities (chance ≈ 0.333 for 3 segments). Threshold rationale documented inline. 
-* Bundled template — load_template / .py-vs-yaml parity / CLI +* Vehicle config — create_from_yaml / .py-vs-yaml parity / CLI """ from __future__ import annotations @@ -38,11 +38,12 @@ PlotsimConfig, SurrogateKeyWarning, create, + create_from_yaml, generate_tables, load_config, - load_template, ) from plotsim.config import parse_source +from tests.configs import CONFIGS_DIR ROOT = Path(__file__).resolve().parent.parent @@ -403,7 +404,7 @@ def test_cross_config_narrative_on_dim_rejected_at_load(tmp_path): with warnings.catch_warnings(): warnings.simplefilter("ignore", SurrogateKeyWarning) - cfg = load_template("narrative_reviews") + cfg = create_from_yaml(CONFIGS_DIR / "narrative_reviews.yaml") base = yaml.safe_load(dump_config(cfg)) # Move the narrative column from fct_reviews onto dim_customer. @@ -447,7 +448,7 @@ def test_narrative_dtype_boolean_rejected_at_load(tmp_path): with warnings.catch_warnings(): warnings.simplefilter("ignore", SurrogateKeyWarning) - cfg = load_template("narrative_reviews") + cfg = create_from_yaml(CONFIGS_DIR / "narrative_reviews.yaml") base = yaml.safe_load(dump_config(cfg)) for tbl in base["tables"]: if tbl["name"] == "fct_reviews": @@ -469,7 +470,7 @@ def test_narrative_dtype_boolean_rejected_at_load(tmp_path): def reviews_cfg() -> PlotsimConfig: with warnings.catch_warnings(): warnings.simplefilter("ignore", SurrogateKeyWarning) - return load_template("narrative_reviews") + return create_from_yaml(CONFIGS_DIR / "narrative_reviews.yaml") @pytest.fixture(scope="module") @@ -495,7 +496,7 @@ def test_template_text_columns_dtype_string(reviews_tables): def test_template_yaml_and_python_produce_identical_text(reviews_cfg): """The .py and .yaml templates resolve to byte-identical text columns under the same seed.""" - from plotsim.configs.templates.narrative_reviews import config as cfg_py + from tests.configs.narrative_reviews import config as cfg_py with warnings.catch_warnings(): warnings.simplefilter("ignore", SurrogateKeyWarning) diff 
--git a/tests/test_parent_child_facts.py b/tests/test_parent_child_facts.py index 0d8e60c..740c351 100644 --- a/tests/test_parent_child_facts.py +++ b/tests/test_parent_child_facts.py @@ -22,7 +22,7 @@ count surfaced in the stderr summary. 6. **Backwards compatibility** — configs without per_parent_row tables produce output byte-identical to pre-M18. - 7. **Bundled template** — ``plotsim.load_template("orders")`` + 7. **Vehicle config** — the ``tests/configs/orders_template.yaml`` vehicle produces a working parent/child config end-to-end. """ @@ -36,9 +36,9 @@ import pytest from pydantic import ValidationError -import plotsim -from plotsim import create, generate_tables, load_template +from plotsim import create, create_from_yaml, generate_tables, load_template from plotsim.config import PlotsimConfig +from tests.configs import CONFIGS_DIR # --- Fixture helpers -------------------------------------------------------- @@ -494,8 +494,8 @@ def test_config_without_per_parent_row_unchanged(): def test_orders_template_loads_and_generates(): - """plotsim.load_template('orders') produces a working config.""" - cfg = plotsim.load_template("orders") + """The orders vehicle config produces a working parent/child config.""" + cfg = create_from_yaml(CONFIGS_DIR / "orders_template.yaml") assert isinstance(cfg, PlotsimConfig) tables = generate_tables(cfg) assert "fct_orders" in tables @@ -811,7 +811,7 @@ def test_orders_template_manifest_has_parent_child_relation(): """The manifest carries one parent_child_relations record per child.""" from plotsim import build_manifest, generate_tables_with_state - cfg = plotsim.load_template("orders") + cfg = create_from_yaml(CONFIGS_DIR / "orders_template.yaml") rng = np.random.default_rng(cfg.seed) tables, state = generate_tables_with_state(cfg, rng) manifest = build_manifest( diff --git a/tests/test_partitioned_parquet.py b/tests/test_partitioned_parquet.py index b970b02..9b17711 100644 --- a/tests/test_partitioned_parquet.py +++ 
b/tests/test_partitioned_parquet.py @@ -42,7 +42,7 @@ def _saas_parquet_config(tmp_path: Path, *, partition_by: str | None = "date_key """Load the saas template and switch it to parquet output with the requested ``partition_by``. ``partition_by=None`` reverts to single- file parquet for the baseline-unchanged test.""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") return cfg.model_copy( update={ "output": cfg.output.model_copy( @@ -164,9 +164,7 @@ class TestBaselineParity: def test_partition_by_none_unchanged(self, tmp_path): cfg_a = _saas_parquet_config(tmp_path / "with_field", partition_by=None) - cfg_b = create_from_yaml( - ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml" - ).model_copy( + cfg_b = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml").model_copy( update={ "output": OutputConfig( format="parquet", @@ -200,7 +198,7 @@ class TestNestedColumns: between single-file and partitioned writers.""" def test_struct_column_survives_partitioning(self, tmp_path): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "retail_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "retail_template.yaml") cfg_p = cfg.model_copy( update={ "output": cfg.output.model_copy( @@ -242,7 +240,7 @@ def test_requires_parquet_format(self): OutputConfig(format="csv", directory="x", partition_by="date_key") def test_rejects_unknown_column(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") payload = cfg.model_dump() payload["output"]["format"] = "parquet" payload["output"]["partition_by"] = "nonexistent_col" @@ -250,7 +248,7 @@ def test_rejects_unknown_column(self): PlotsimConfig.model_validate(payload) def test_rejects_float_column(self): - cfg = create_from_yaml(ROOT / 
"plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") payload = cfg.model_dump() payload["output"]["format"] = "parquet" # mrr on fct_revenue is dtype=float — should be rejected as a @@ -269,7 +267,7 @@ class TestStreamingBypass: partitioned configs.""" def test_streaming_eligibility_false_when_partitioned(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg_v = cfg.model_copy( update={ "output": cfg.output.model_copy( @@ -281,7 +279,7 @@ def test_streaming_eligibility_false_when_partitioned(self): assert _streaming_parquet_eligible(cfg_v) is False def test_streaming_eligibility_true_without_partition(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg_v = cfg.model_copy( update={ "output": cfg.output.model_copy(update={"format": "parquet"}), @@ -302,7 +300,7 @@ class TestSidecars: """ def test_denormalized_wide_partitions(self, tmp_path): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg_p = cfg.model_copy( update={ "output": cfg.output.model_copy( diff --git a/tests/test_pk_prefix.py b/tests/test_pk_prefix.py index 858a1a0..c4d4899 100644 --- a/tests/test_pk_prefix.py +++ b/tests/test_pk_prefix.py @@ -28,7 +28,8 @@ import numpy as np import pytest -from plotsim import generate_tables, load_template +from plotsim import create_from_yaml, generate_tables +from tests.configs import CONFIGS_DIR from plotsim.config import ( Archetype, Column, @@ -282,14 +283,14 @@ def test_explicit_pk_prefix_collision_rejected(): _make_config([dim_a, dim_b]) -# --- End-to-end: orders template (the canonical collision case) ------------ 
+# --- End-to-end: orders vehicle (the canonical collision case) ------------ -def test_orders_template_produces_distinguishable_pks(): - """``orders`` template has fct_orders + fct_order_items (both +def test_orders_vehicle_produces_distinguishable_pks(): + """The orders vehicle has fct_orders + fct_order_items (both stripped → ``o``) plus fct_returns (``r``). The first two must get distinguishable PKs; fct_returns keeps its single ``r``.""" - cfg = load_template("orders") + cfg = create_from_yaml(CONFIGS_DIR / "orders_template.yaml") tables = generate_tables(cfg, np.random.default_rng(cfg.seed)) orders_pks = tables["fct_orders"]["order_id"].tolist() items_pks = tables["fct_order_items"]["item_id"].tolist() diff --git a/tests/test_sql_output.py b/tests/test_sql_output.py index a71db1a..27d3a6e 100644 --- a/tests/test_sql_output.py +++ b/tests/test_sql_output.py @@ -45,7 +45,7 @@ def _saas_sql_config(tmp_path: Path, *, dialect: str = "postgresql"): """Load the saas template and switch it to sql output.""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") return cfg.model_copy( update={ "output": cfg.output.model_copy( @@ -286,7 +286,7 @@ def test_dim_create_appears_before_fact_create(self, tmp_path): assert dim_pos < fact_pos, "dim_company CREATE must precede fct_engagement CREATE" def test_sql_table_order_dims_first(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") order = _sql_table_order(cfg) dim_names = {t.name for t in cfg.tables if t.type == "dim"} # All dim names appear before the first non-dim. 
@@ -468,7 +468,7 @@ class TestEntityFeaturesRejection: the single-file SQL dump's star-schema layout.""" def test_entity_features_plus_sql_rejected_at_load(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") payload = cfg.model_dump() payload["output"]["format"] = "sql" payload["entity_features"] = {"enabled": True} @@ -478,7 +478,7 @@ def test_entity_features_plus_sql_rejected_at_load(self): def test_entity_features_plus_csv_still_allowed(self): """Regression guard: the new gate only fires under sql; CSV and the other formats continue to support entity_features.""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") payload = cfg.model_dump() # entity_features requires manifest.include=true and zero # quality issues — the saas template carries quality issues by @@ -531,7 +531,7 @@ def test_default_dialect_with_non_sql_format_allowed(self): assert oc.sql_dialect == "postgresql" def test_resolve_output_format_returns_sql(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg_s = cfg.model_copy( update={"output": cfg.output.model_copy(update={"format": "sql"})}, ) @@ -636,7 +636,7 @@ class TestCsvUnchanged: baseline config with the field omitted, run after run.""" def test_csv_output_byte_identical(self, tmp_path): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") a = tmp_path / "run_a" b = tmp_path / "run_b" a.mkdir() diff --git a/tests/test_streaming_parquet.py b/tests/test_streaming_parquet.py index 2cd802c..f92b0aa 100644 --- a/tests/test_streaming_parquet.py +++ 
b/tests/test_streaming_parquet.py @@ -98,7 +98,7 @@ class TestIterFactChunks: independently because it's also a useful seam for analysis tooling.""" def test_chunk_count_matches_archetypes(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") with warnings.catch_warnings(): warnings.simplefilter("ignore") tables = generate_tables( @@ -114,7 +114,7 @@ def test_chunk_count_matches_archetypes(self): assert len(archetype_chunks) == len(unique_archetypes) def test_chunk_row_counts_match_entity_counts(self): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") with warnings.catch_warnings(): warnings.simplefilter("ignore") tables = generate_tables( @@ -137,7 +137,7 @@ def test_chunk_row_counts_match_entity_counts(self): def test_chunk_union_equals_unified(self): """Concatenating every chunk's fact DataFrame should reconstruct the unified DataFrame (row order may differ, but row sets match).""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") with warnings.catch_warnings(): warnings.simplefilter("ignore") tables = generate_tables( @@ -257,7 +257,7 @@ def test_streaming_matches_non_streaming(self, tmp_path): """Baseline: with no quality issues configured, the streaming Parquet write reads back cell-identical to a single-shot ``to_parquet`` write of the same in-memory tables dict.""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg_v = cfg.model_copy( update={ "output": cfg.output.model_copy(update={"format": "parquet"}), @@ -291,7 +291,7 @@ def 
test_streaming_matches_non_streaming_with_fact_column_null_injection(self, t place ``null_injection`` / ``type_mismatch`` / ``schema_drift`` on fact columns without invalidating the streaming path's contract.""" - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg_v = cfg.model_copy( update={ "output": cfg.output.model_copy(update={"format": "parquet"}), @@ -375,7 +375,7 @@ class TestRowGroups: batches for ``per_entity_per_period`` facts.""" def test_row_group_count_matches_archetypes(self, tmp_path): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg = cfg.model_copy( update={ "output": cfg.output.model_copy(update={"format": "parquet"}), @@ -392,7 +392,7 @@ def test_row_group_count_matches_archetypes(self, tmp_path): assert meta.num_row_groups == len(unique_archetypes), fact_name def test_row_group_sizes_match_entity_counts(self, tmp_path): - cfg = create_from_yaml(ROOT / "plotsim" / "configs" / "templates" / "saas_template.yaml") + cfg = create_from_yaml(ROOT / "tests" / "configs" / "saas_template.yaml") cfg = cfg.model_copy( update={ "output": cfg.output.model_copy(update={"format": "parquet"}), diff --git a/tests/test_templates_api.py b/tests/test_templates_api.py index a649751..58c0fb7 100644 --- a/tests/test_templates_api.py +++ b/tests/test_templates_api.py @@ -14,18 +14,10 @@ EXPECTED_TEMPLATES = { - "ab_trial", - "bare_minimum", - "cdc_demo", - "crm_billing_overlap", - "education", - "geo_retail", + "banking", + "health", "hr", - "lakehouse", - "latency_skew", "marketing", - "narrative_reviews", - "orders", "retail", "saas", } From cb8dd3de1fea9f84111220c9a6b0ffea6b1e8a10 Mon Sep 17 00:00:00 2001 From: mohossam01 Date: Sat, 16 May 2026 01:35:44 -0400 Subject: [PATCH 2/2] refactor: tighten geo and tests typing surface 
Co-Authored-By: Claude Opus 4.7 (1M context) --- plotsim/data/__init__.py | 3 ++- plotsim/data/geo_locations.py | 16 +++++++++++++++- plotsim/dimensions.py | 6 +++--- tests/test_builder_integration.py | 6 +++--- tests/test_cli.py | 3 ++- tests/test_geo_provider.py | 6 +++--- 6 files changed, 28 insertions(+), 12 deletions(-) diff --git a/plotsim/data/__init__.py b/plotsim/data/__init__.py index e9d4de2..99dca39 100644 --- a/plotsim/data/__init__.py +++ b/plotsim/data/__init__.py @@ -9,6 +9,7 @@ from plotsim.data.geo_locations import ( GEO_BUNDLE_FIELDS, GEO_LOCATIONS, + GeoEntry, ) -__all__ = ["GEO_BUNDLE_FIELDS", "GEO_LOCATIONS"] +__all__ = ["GEO_BUNDLE_FIELDS", "GEO_LOCATIONS", "GeoEntry"] diff --git a/plotsim/data/geo_locations.py b/plotsim/data/geo_locations.py index 3d421a2..25c46d3 100644 --- a/plotsim/data/geo_locations.py +++ b/plotsim/data/geo_locations.py @@ -21,6 +21,8 @@ from __future__ import annotations +from typing import TypedDict + # Provider name suffixes that the geo bundle resolves. A column source of # ``generated:geo.`` reads the corresponding bundle key. The set is # explicit so unknown ``geo.foo`` source strings raise instead of silently @@ -38,7 +40,19 @@ ) -GEO_LOCATIONS: tuple[dict[str, object], ...] = ( +class GeoEntry(TypedDict): + """One row of ``GEO_LOCATIONS``: five string fields + two float coords.""" + + country: str + country_code: str + region: str + city: str + postcode: str + latitude: float + longitude: float + + +GEO_LOCATIONS: tuple[GeoEntry, ...] 
= ( # --- United States (20) --- { "country": "United States", diff --git a/plotsim/dimensions.py b/plotsim/dimensions.py index 3f49d2d..7ba52c6 100644 --- a/plotsim/dimensions.py +++ b/plotsim/dimensions.py @@ -66,7 +66,7 @@ TimeWindow, parse_source, ) -from plotsim.data import GEO_BUNDLE_FIELDS, GEO_LOCATIONS +from plotsim.data import GEO_BUNDLE_FIELDS, GEO_LOCATIONS, GeoEntry # --- Helpers ---------------------------------------------------------------- @@ -185,7 +185,7 @@ def _assign_geo_bundles( columns: list[Column], n_rows: int, rng: np.random.Generator, -) -> Optional[list[dict[str, object]]]: +) -> Optional[list[GeoEntry]]: """Pre-allocate one geo bundle per row, or None if the table has no geo columns. A single ``rng.integers`` call draws ``n_rows`` indices from @@ -552,7 +552,7 @@ def _column_value_for_entity( entity_pk: str, fake: Faker, rng: Optional[np.random.Generator] = None, - geo_bundle: Optional[dict[str, object]] = None, + geo_bundle: Optional[GeoEntry] = None, ) -> Any: """Resolve one cell on a per_entity dim row. 
diff --git a/tests/test_builder_integration.py b/tests/test_builder_integration.py index f953f9f..c07a6d7 100644 --- a/tests/test_builder_integration.py +++ b/tests/test_builder_integration.py @@ -15,7 +15,7 @@ import warnings from pathlib import Path -from typing import Any +from typing import Any, cast import numpy as np import pandas as pd @@ -52,7 +52,7 @@ def _saas_py_config() -> PlotsimConfig: with warnings.catch_warnings(): warnings.simplefilter("ignore") result = runpy.run_path(str(TEMPLATE_PY)) - return result["config"] + return cast(PlotsimConfig, result["config"]) # ── Bare-minimum acceptance ───────────────────────────────────────────────── @@ -179,7 +179,7 @@ def saas_dataset() -> dict[str, Any]: def _entity_engagement_series(fact: pd.DataFrame, entity_id_col: str, entity_id: str) -> np.ndarray: rows = fact[fact[entity_id_col] == entity_id].sort_values("date_key") - return rows["engagement_score"].to_numpy() + return cast(np.ndarray, rows["engagement_score"].to_numpy()) def test_pearson_shape_recovery_for_non_plateau_archetypes(saas_dataset): diff --git a/tests/test_cli.py b/tests/test_cli.py index 41b5ff7..09fdf92 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -22,6 +22,7 @@ import io from contextlib import redirect_stdout, redirect_stderr from pathlib import Path +from typing import Literal import plotsim @@ -180,7 +181,7 @@ def test_info_hr_summary(): # --- FIX-02 acceptance: _estimate_periods daily branch ----------------------- -def _estimate_cfg(start: str, end: str, granularity: str): +def _estimate_cfg(start: str, end: str, granularity: Literal["monthly", "weekly", "daily"]): """Build a minimal PlotsimConfig covering only what _estimate_periods reads.""" from plotsim.config import ( Archetype, diff --git a/tests/test_geo_provider.py b/tests/test_geo_provider.py index a0bd8ed..bf9f158 100644 --- a/tests/test_geo_provider.py +++ b/tests/test_geo_provider.py @@ -28,7 +28,7 @@ from plotsim.builder import create from tests.configs import 
CONFIGS_DIR from plotsim.config import Column, Entity, Table -from plotsim.data import GEO_BUNDLE_FIELDS, GEO_LOCATIONS +from plotsim.data import GEO_BUNDLE_FIELDS, GEO_LOCATIONS, GeoEntry from plotsim.dimensions import ( _assign_geo_bundles, _geo_provider_field, @@ -168,12 +168,12 @@ def _full_geo_dim(name: str = "dim_store") -> Table: ) -def _city_lookup() -> dict[tuple[str, str], dict]: +def _city_lookup() -> dict[tuple[str, str], GeoEntry]: """City+postcode is unique in the dataset; country+city is not (e.g. Newcastle exists in UK and Australia). Key by (city, postcode) to pick the right bundle when verifying coherence on duplicates. """ - out: dict[tuple[str, str], dict] = {} + out: dict[tuple[str, str], GeoEntry] = {} for entry in GEO_LOCATIONS: out[(entry["city"], entry["postcode"])] = entry return out