From f030a66c0de672ed86dc33b9a2dd88a11f3e2e22 Mon Sep 17 00:00:00 2001
From: Javier Cuervo <javier.cuervo@proportione.com>
Date: Sat, 13 Jun 2026 15:24:27 +0200
Subject: [PATCH 1/3] feat(stats): reviewer-proof correlation helpers + prisma
 correlate

Add prisma.stats: fisher_ci, correlation_table (Pearson + Fisher-z CI +
Spearman, overall and per stratum), partial_correlation, bootstrap_ci,
missingness_compare (Mann-Whitney selection check), and mixed_model_icc
(random-intercept clustering + ICC). New 'prisma correlate' CLI command.
scipy becomes a core dependency; statsmodels is an optional [stats] extra.
Extracted while hardening the statistical rigour of the 20-60-20
AI-permitted-assessment study (Cuervo, 2026, Universidade de Aveiro).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md                     |  27 ++++++
 pyproject.toml                   |   6 +-
 src/prisma/cli.py                |  35 +++++++
 src/prisma/stats/__init__.py     |  25 +++++
 src/prisma/stats/correlations.py | 152 +++++++++++++++++++++++++++++++
 tests/test_stats.py              |  62 +++++++++++++
 6 files changed, 306 insertions(+), 1 deletion(-)
 create mode 100644 CHANGELOG.md
 create mode 100644 src/prisma/stats/__init__.py
 create mode 100644 src/prisma/stats/correlations.py
 create mode 100644 tests/test_stats.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..0b5ab21
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,27 @@
+# Changelog
+
+All notable changes to `proportione-prisma` are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+
+- `prisma.stats`: reviewer-proof correlation/robustness helpers — `fisher_ci`, `correlation_table` (Pearson + Fisher-z CI + Spearman, overall and per stratum), `partial_correlation`, `bootstrap_ci` (small-cell fragility), `missingness_compare` (selection-bias check via Mann-Whitney), and `mixed_model_icc` (random-intercept clustering + ICC; optional `statsmodels` extra). New `prisma correlate` CLI command. Adds `scipy` as a core dependency and a `[stats]` optional extra. Extracted while hardening the statistical rigour of the 20-60-20 AI-permitted-assessment study (Cuervo, 2026, Universidade de Aveiro).
+- Pub1-Fusion (Cuervo & Marques, 2026, *From search queries to strategic decisions: a hybrid systematic and bibliometric review of digital signals in business forecasting*) submitted to International Journal of Information Management — citation slot reserved for the next release once the editorial decision lands.
+
+## [0.1.0] — 2026-04-29
+
+### Added
+
+- Initial public release.
+- `prisma.ingest`: OpenAlex search, Unpaywall PDF discovery, RIS I/O, cross-source deduplication (DOI matching plus rapidfuzz title-author similarity).
+- `prisma.screening`: two-tier rule engine (hard exclusion + multi-group inclusion), YAML-defined, full audit log.
+- `prisma.extraction`: PyMuPDF text extraction, section detection, taxonomy-driven field extraction.
+- `prisma.quality`: MMAT 2018 quantitative-descriptive heuristic scoring (Q1–Q5, High/Medium/Low).
+- `prisma.reporting`: PRISMA 2020 flow diagram from `PRISMACounts` dataclass.
+- `prisma.bibliometrics`: VOSviewer `.net` loader, Louvain communities, co-occurrence matrix.
+- `prisma.viz`: matplotlib config with the Proportione brand palette.
+- Streamlit demo (`streamlit_app/`) walking through the full pipeline.
+- CLI (`prisma`) covering ingest / screen / extract / quality / report.
+- Zenodo archive with DOI [`10.5281/zenodo.19883809`](https://doi.org/10.5281/zenodo.19883809).
+- Cited in: Pub3-v2 (Cuervo & Marques, 2026, Ibero-American bibliometric mapping, under review at CIDEMA II McGraw Hill).
diff --git a/pyproject.toml b/pyproject.toml
index aa260f3..14134da 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ classifiers = [
 dependencies = [
     "pandas>=2.0",
     "numpy>=1.24",
+    "scipy>=1.10",
     "rapidfuzz>=3.0",
     "requests>=2.31",
     "pymupdf>=1.23",
@@ -54,13 +55,16 @@ streamlit = [
     "streamlit>=1.30",
     "plotly>=5.18",
 ]
+stats = [
+    "statsmodels>=0.14",
+]
 dev = [
     "pytest>=7.4",
     "pytest-cov>=4.1",
     "ruff>=0.3",
     "mypy>=1.8",
 ]
-all = ["proportione-prisma[streamlit,dev]"]
+all = ["proportione-prisma[streamlit,stats,dev]"]
 
 [project.urls]
 Homepage = "https://github.com/Proportione/prisma"
diff --git a/src/prisma/cli.py b/src/prisma/cli.py
index 95c4c4b..c5d3944 100644
--- a/src/prisma/cli.py
+++ b/src/prisma/cli.py
@@ -8,6 +8,7 @@
 - extract           Extract fields from PDFs using a YAML taxonomy.
 - quality           MMAT 2018 quality assessment from PDF text.
 - report            Render a PRISMA 2020 flow diagram from a JSON counts file.
+- correlate         Pearson (+Fisher-z CI) and Spearman correlations, overall/per stratum.
 """
 from __future__ import annotations
 
@@ -187,5 +188,39 @@ def report(counts: str, output: str, title: str, note: str) -> None:
     click.echo(f"Wrote {out_path}")
 
 
+@main.command()
+@click.option("--in", "csv_path", required=True, type=click.Path(exists=True, dir_okay=False),
+              help="CSV with one row per record.")
+@click.option("--x", required=True, help="First numeric column.")
+@click.option("--y", required=True, help="Second numeric column.")
+@click.option("--group", default=None, help="Optional column to stratify by.")
+@click.option("--partial", "covar", default=None, help="Optional covariate for a partial correlation r(x,y|covar).")
+@click.option("--bootstrap", is_flag=True, help="Add a percentile bootstrap CI (useful for small cells).")
+@click.option("--out", "output_csv", default=None, type=click.Path(dir_okay=False),
+              help="Optional CSV to write the correlation table to.")
+def correlate(csv_path: str, x: str, y: str, group: str | None, covar: str | None,
+              bootstrap: bool, output_csv: str | None) -> None:
+    """Pearson (+Fisher-z CI) and Spearman correlations, overall and per stratum.
+
+    Reviewer-proof by design: every estimate carries n, a 95% CI and a rank-based
+    robustness check; optional partial correlation and bootstrap CI.
+    """
+    import pandas as pd
+    from prisma.stats import bootstrap_ci, correlation_table, partial_correlation
+
+    df = pd.read_csv(csv_path)
+    table = correlation_table(df, x, y, group=group)
+    click.echo(table.to_string(index=False))
+    if output_csv:  # only the correlation table is written; the results below are echoed only
+        out = Path(output_csv)
+        out.parent.mkdir(parents=True, exist_ok=True)
+        table.to_csv(out, index=False)
+        click.echo(f"Wrote {out}")
+    if covar:  # computed on all rows pooled; --group is not applied here
+        click.echo(f"\nPartial r({x},{y}|{covar}): {partial_correlation(df, x, y, covar)}")
+    if bootstrap:  # likewise pooled over all rows, not per --group stratum
+        click.echo(f"Bootstrap CI: {bootstrap_ci(df, x, y)}")
+
+
 if __name__ == "__main__":
     main()
diff --git a/src/prisma/stats/__init__.py b/src/prisma/stats/__init__.py
new file mode 100644
index 0000000..fd83038
--- /dev/null
+++ b/src/prisma/stats/__init__.py
@@ -0,0 +1,25 @@
+"""Statistical helpers for transparent, reviewer-proof SLR / education-research analysis.
+
+Reusable methods extracted while hardening the statistical rigour of the
+20-60-20 AI-permitted-assessment study (Cuervo, 2026): within-stratum
+correlations with Fisher-z confidence intervals, Spearman robustness,
+partial correlation, bootstrap CIs for small cells, missingness/selection
+checks, and a clustered (mixed-effects) model with ICC.
+"""
+from .correlations import (
+    fisher_ci,
+    correlation_table,
+    partial_correlation,
+    bootstrap_ci,
+    missingness_compare,
+    mixed_model_icc,
+)
+
+__all__ = [
+    "fisher_ci",
+    "correlation_table",
+    "partial_correlation",
+    "bootstrap_ci",
+    "missingness_compare",
+    "mixed_model_icc",
+]
diff --git a/src/prisma/stats/correlations.py b/src/prisma/stats/correlations.py
new file mode 100644
index 0000000..bbb0e61
--- /dev/null
+++ b/src/prisma/stats/correlations.py
@@ -0,0 +1,152 @@
+"""Correlation and robustness statistics for education / SLR datasets.
+
+Design goals: every estimate ships with the information a sceptical reviewer
+asks for — sample size, a confidence interval, a rank-based robustness check,
+and (where relevant) controls for mechanical overlap, small-cell fragility,
+selection bias and clustering. Pure functions over pandas DataFrames.
+"""
+from __future__ import annotations
+
+import math
+from typing import Iterable, Sequence
+
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+
+def fisher_ci(r: float, n: int, alpha: float = 0.05) -> tuple[float, float]:
+    """Fisher z-transform confidence interval for a Pearson correlation.
+
+    Returns (low, high) at level 1 - alpha, or (nan, nan) when undefined (n < 4 or |r| >= 1).
+    """
+    if n < 4 or abs(r) >= 1.0:  # n - 3 must be positive and atanh(r) finite
+        return (float("nan"), float("nan"))
+    z = math.atanh(r)  # Fisher z-transform of r
+    se = 1.0 / math.sqrt(n - 3)  # standard error on the z scale
+    crit = stats.norm.ppf(1 - alpha / 2)  # two-sided normal critical value
+    return (math.tanh(z - crit * se), math.tanh(z + crit * se))  # back-transform to the r scale
+
+
+def _pair(df: pd.DataFrame, x: str, y: str) -> pd.DataFrame:
+    return df[[x, y]].apply(pd.to_numeric, errors="coerce").dropna()  # numeric, complete rows only
+
+
+def correlation_table(
+    df: pd.DataFrame,
+    x: str,
+    y: str,
+    group: str | None = None,
+    alpha: float = 0.05,
+) -> pd.DataFrame:
+    """Pearson r (+ Fisher-z CI) and Spearman rho with n, overall and per group.
+
+    The Spearman column is the headline robustness check: a Pearson/Spearman gap
+    flags outlier- or spread-driven associations.
+    """
+    def one(sub: pd.DataFrame, label: str) -> dict | None:
+        d = _pair(sub, x, y)
+        if len(d) < 4:
+            return None
+        pr, pp = stats.pearsonr(d[x], d[y])
+        sr, sp = stats.spearmanr(d[x], d[y])
+        lo, hi = fisher_ci(pr, len(d), alpha)
+        return {
+            "stratum": label, "n": len(d),
+            "pearson_r": round(pr, 3), "ci_low": round(lo, 3), "ci_high": round(hi, 3),
+            "pearson_p": pp, "spearman_rho": round(sr, 3), "spearman_p": sp,
+        }
+
+    rows = [one(df, "ALL")]
+    if group is not None:
+        for g, sub in df.groupby(group):
+            rows.append(one(sub, str(g)))
+    return pd.DataFrame([r for r in rows if r is not None])
+
+
+def partial_correlation(df: pd.DataFrame, x: str, y: str, covar: str) -> dict:
+    """First-order partial correlation r(x, y | covar).
+
+    Answers 'does the x–y association survive controlling for a confounder
+    such as general ability?'.
+    """
+    d = df[[x, y, covar]].apply(pd.to_numeric, errors="coerce").dropna()
+    if len(d) < 5:  # too few complete rows: report n with NaN estimates
+        return {"n": len(d), "partial_r": float("nan"), "raw_r": float("nan")}
+    rxy = stats.pearsonr(d[x], d[y])[0]
+    rxz = stats.pearsonr(d[x], d[covar])[0]
+    ryz = stats.pearsonr(d[y], d[covar])[0]
+    denom = math.sqrt((1 - rxz ** 2) * (1 - ryz ** 2))  # zero if covar correlates perfectly with x or y
+    pr = (rxy - rxz * ryz) / denom if denom else float("nan")  # NaN instead of dividing by zero
+    return {"n": len(d), "partial_r": round(pr, 3), "raw_r": round(rxy, 3)}
+
+
+def bootstrap_ci(
+    df: pd.DataFrame, x: str, y: str, n_boot: int = 5000, seed: int = 0, alpha: float = 0.05
+) -> dict:
+    """Percentile bootstrap CI for a Pearson r — for small/fragile cells."""
+    d = _pair(df, x, y)
+    if len(d) < 10:
+        return {"n": len(d), "r": float("nan"), "ci_low": float("nan"), "ci_high": float("nan")}
+    arr = d.to_numpy()
+    rng = np.random.default_rng(seed)
+    boot = []
+    for _ in range(n_boot):
+        s = arr[rng.integers(0, len(arr), len(arr))]
+        if s[:, 0].std() > 0 and s[:, 1].std() > 0:
+            boot.append(np.corrcoef(s[:, 0], s[:, 1])[0, 1])
+    return {
+        "n": len(d), "r": round(float(np.corrcoef(arr[:, 0], arr[:, 1])[0, 1]), 3),
+        "ci_low": round(float(np.percentile(boot, 100 * alpha / 2)), 3),
+        "ci_high": round(float(np.percentile(boot, 100 * (1 - alpha / 2))), 3),
+        "n_boot": len(boot),
+    }
+
+
+def missingness_compare(df: pd.DataFrame, indicator: str, by: Sequence[str]) -> pd.DataFrame:
+    """Compare rows where `indicator` is present vs missing on `by` variables.
+
+    Surfaces selection bias from listwise deletion (Mann-Whitney U, non-parametric).
+    """
+    present = df[df[indicator].notna()]
+    missing = df[df[indicator].isna()]
+    out = []
+    for var in by:
+        h = pd.to_numeric(present[var], errors="coerce").dropna()  # var where indicator is present
+        m = pd.to_numeric(missing[var], errors="coerce").dropna()  # var where indicator is missing
+        rec = {"variable": var, "n_present": len(h), "n_missing": len(m),
+               "mean_present": round(h.mean(), 2) if len(h) else float("nan"),
+               "mean_missing": round(m.mean(), 2) if len(m) else float("nan"), "mw_p": float("nan")}
+        if len(h) > 3 and len(m) > 3:  # mw_p stays NaN unless both sides have more than 3 values
+            rec["mw_p"] = stats.mannwhitneyu(h, m).pvalue
+        out.append(rec)
+    return pd.DataFrame(out)  # one row per variable in `by`
+
+
+def mixed_model_icc(df: pd.DataFrame, outcome: str, predictor: str, group: str) -> dict:
+    """Random-intercept mixed model `outcome ~ predictor + (1|group)` with ICC.
+
+    Accounts for clustering (students nested in cohorts/courses). Reports the
+    cluster-adjusted fixed effect, the naive OLS effect, and the intraclass
+    correlation. Requires the optional `statsmodels` dependency.
+    """
+    try:
+        import statsmodels.formula.api as smf
+    except ImportError as exc:  # pragma: no cover
+        raise ImportError(
+            "mixed_model_icc requires statsmodels: pip install 'proportione-prisma[stats]'"
+        ) from exc
+    d = df[[outcome, predictor, group]].apply(
+        lambda c: pd.to_numeric(c, errors="coerce") if c.name != group else c
+    ).dropna()  # outcome/predictor coerced to numeric, group labels kept as-is, incomplete rows dropped
+    md = smf.mixedlm(f"{outcome} ~ {predictor}", d, groups=d[group]).fit(reml=False)
+    ols = smf.ols(f"{outcome} ~ {predictor}", d).fit()  # same formula without the grouping
+    gv = float(md.cov_re.iloc[0, 0])  # presumably the random-intercept variance — confirm vs statsmodels
+    rv = float(md.scale)  # presumably the residual variance — confirm vs statsmodels
+    icc = gv / (gv + rv) if (gv + rv) else float("nan")  # NaN instead of dividing by zero
+    return {
+        "n": len(d), "n_groups": int(d[group].nunique()),
+        "coef": round(float(md.params.get(predictor)), 4), "p": float(md.pvalues.get(predictor)),
+        "icc": round(icc, 3),
+        "ols_coef": round(float(ols.params[predictor]), 4), "ols_p": float(ols.pvalues[predictor]),
+    }
diff --git a/tests/test_stats.py b/tests/test_stats.py
new file mode 100644
index 0000000..ca361ea
--- /dev/null
+++ b/tests/test_stats.py
@@ -0,0 +1,62 @@
+"""Tests for prisma.stats correlation helpers."""
+import math
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from prisma.stats import (
+    bootstrap_ci,
+    correlation_table,
+    fisher_ci,
+    missingness_compare,
+    partial_correlation,
+)
+
+
+@pytest.fixture
+def df():
+    rng = np.random.default_rng(7)
+    n = 120
+    x = rng.normal(size=n)
+    y = 0.6 * x + rng.normal(scale=0.8, size=n)
+    z = 0.5 * x + rng.normal(scale=0.9, size=n)
+    g = ["A"] * 60 + ["B"] * 60
+    d = pd.DataFrame({"x": x, "y": y, "z": z, "g": g})
+    # introduce some missingness in y
+    d.loc[d.index[:20], "y"] = np.nan
+    return d
+
+
+def test_fisher_ci_orders_and_bounds():
+    lo, hi = fisher_ci(0.5, 100)
+    assert lo < 0.5 < hi
+    assert -1 <= lo <= 1 and -1 <= hi <= 1
+    assert all(math.isnan(v) for v in fisher_ci(0.5, 3))  # n too small
+
+
+def test_correlation_table_overall_and_grouped(df):
+    t = correlation_table(df, "x", "y", group="g")
+    assert set(["stratum", "n", "pearson_r", "ci_low", "ci_high", "spearman_rho"]).issubset(t.columns)
+    assert t.iloc[0]["stratum"] == "ALL"
+    assert {"A", "B"}.issubset(set(t["stratum"]))
+    row = t.iloc[0]
+    assert row["ci_low"] < row["pearson_r"] < row["ci_high"]
+    assert row["pearson_r"] > 0  # positive by construction
+
+
+def test_partial_correlation_reduces_when_confounded(df):
+    p = partial_correlation(df, "x", "y", "z")
+    assert p["n"] > 5
+    assert abs(p["partial_r"]) <= abs(p["raw_r"]) + 1e-9  # fixture-specific: suppressors can inflate
+
+
+def test_bootstrap_ci_brackets_point(df):
+    b = bootstrap_ci(df, "x", "y", n_boot=500, seed=1)
+    assert b["ci_low"] < b["r"] < b["ci_high"]
+
+
+def test_missingness_compare_flags_columns(df):
+    m = missingness_compare(df, indicator="y", by=["x", "z"])
+    assert list(m["variable"]) == ["x", "z"]
+    assert (m["n_missing"] == 20).all()

From 3dae65defd6d2a30039459e759a08b2a13411052 Mon Sep 17 00:00:00 2001
From: Javier Cuervo <javier.cuervo@proportione.com>
Date: Thu, 25 Jun 2026 12:11:10 +0100
Subject: [PATCH 2/3] fix(stats): resuelve lint ruff en correlations.py (#2)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/prisma/stats/correlations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/prisma/stats/correlations.py b/src/prisma/stats/correlations.py
index bbb0e61..e955fbc 100644
--- a/src/prisma/stats/correlations.py
+++ b/src/prisma/stats/correlations.py
@@ -8,7 +8,7 @@
 from __future__ import annotations
 
 import math
-from typing import Iterable, Sequence
+from collections.abc import Sequence
 
 import numpy as np
 import pandas as pd
@@ -67,7 +67,7 @@ def one(sub: pd.DataFrame, label: str) -> dict | None:
 def partial_correlation(df: pd.DataFrame, x: str, y: str, covar: str) -> dict:
     """First-order partial correlation r(x, y | covar).
 
-    Answers 'does the x–y association survive controlling for a confounder
+    Answers 'does the x-y association survive controlling for a confounder
     such as general ability?'.
     """
     d = df[[x, y, covar]].apply(pd.to_numeric, errors="coerce").dropna()

From 0b4b5be0cd10b742a7e7d3ce7cf7b308bb0d3190 Mon Sep 17 00:00:00 2001
From: Javier Cuervo <javier.cuervo@proportione.com>
Date: Thu, 25 Jun 2026 12:58:39 +0100
Subject: [PATCH 3/3] fix(lint): ordena imports e __all__ en cli.py y stats
 (#2)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/prisma/cli.py            |  1 +
 src/prisma/stats/__init__.py | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/prisma/cli.py b/src/prisma/cli.py
index c5d3944..c3fee72 100644
--- a/src/prisma/cli.py
+++ b/src/prisma/cli.py
@@ -206,6 +206,7 @@ def correlate(csv_path: str, x: str, y: str, group: str | None, covar: str | Non
     robustness check; optional partial correlation and bootstrap CI.
     """
     import pandas as pd
+
     from prisma.stats import bootstrap_ci, correlation_table, partial_correlation
 
     df = pd.read_csv(csv_path)
diff --git a/src/prisma/stats/__init__.py b/src/prisma/stats/__init__.py
index fd83038..d7cfa76 100644
--- a/src/prisma/stats/__init__.py
+++ b/src/prisma/stats/__init__.py
@@ -7,19 +7,19 @@
 checks, and a clustered (mixed-effects) model with ICC.
 """
 from .correlations import (
-    fisher_ci,
-    correlation_table,
-    partial_correlation,
     bootstrap_ci,
+    correlation_table,
+    fisher_ci,
     missingness_compare,
     mixed_model_icc,
+    partial_correlation,
 )
 
 __all__ = [
-    "fisher_ci",
-    "correlation_table",
-    "partial_correlation",
     "bootstrap_ci",
+    "correlation_table",
+    "fisher_ci",
     "missingness_compare",
     "mixed_model_icc",
+    "partial_correlation",
 ]