From f030a66c0de672ed86dc33b9a2dd88a11f3e2e22 Mon Sep 17 00:00:00 2001 From: Javier Cuervo Date: Sat, 13 Jun 2026 15:24:27 +0200 Subject: [PATCH 1/3] feat(stats): reviewer-proof correlation helpers + prisma correlate Add prisma.stats: fisher_ci, correlation_table (Pearson + Fisher-z CI + Spearman, overall and per stratum), partial_correlation, bootstrap_ci, missingness_compare (Mann-Whitney selection check), and mixed_model_icc (random-intercept clustering + ICC). New 'prisma correlate' CLI command. scipy becomes a core dependency; statsmodels is an optional [stats] extra. Extracted while hardening the statistical rigour of the 20-60-20 AI-permitted-assessment study (Cuervo, 2026, Universidade de Aveiro). Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 27 ++++++ pyproject.toml | 6 +- src/prisma/cli.py | 35 +++++++ src/prisma/stats/__init__.py | 25 +++++ src/prisma/stats/correlations.py | 152 +++++++++++++++++++++++++++++++ tests/test_stats.py | 62 +++++++++++++ 6 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 CHANGELOG.md create mode 100644 src/prisma/stats/__init__.py create mode 100644 src/prisma/stats/correlations.py create mode 100644 tests/test_stats.py diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..0b5ab21 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,27 @@ +# Changelog + +All notable changes to `proportione-prisma` are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- `prisma.stats`: reviewer-proof correlation/robustness helpers — `fisher_ci`, `correlation_table` (Pearson + Fisher-z CI + Spearman, overall and per stratum), `partial_correlation`, `bootstrap_ci` (small-cell fragility), `missingness_compare` (selection-bias check via Mann-Whitney), and `mixed_model_icc` (random-intercept clustering + ICC; optional `statsmodels` extra). 
New `prisma correlate` CLI command. Adds `scipy` as a core dependency and a `[stats]` optional extra. Extracted while hardening the statistical rigour of the 20-60-20 AI-permitted-assessment study (Cuervo, 2026, Universidade de Aveiro). +- Pub1-Fusion (Cuervo & Marques, 2026, *From search queries to strategic decisions: a hybrid systematic and bibliometric review of digital signals in business forecasting*) submitted to International Journal of Information Management — citation slot reserved for the next release once the editorial decision lands. + +## [0.1.0] — 2026-04-29 + +### Added + +- Initial public release. +- `prisma.ingest`: OpenAlex search, Unpaywall PDF discovery, RIS I/O, cross-source deduplication (DOI matching plus rapidfuzz title-author similarity). +- `prisma.screening`: two-tier rule engine (hard exclusion + multi-group inclusion), YAML-defined, full audit log. +- `prisma.extraction`: PyMuPDF text extraction, section detection, taxonomy-driven field extraction. +- `prisma.quality`: MMAT 2018 quantitative-descriptive heuristic scoring (Q1–Q5, High/Medium/Low). +- `prisma.reporting`: PRISMA 2020 flow diagram from `PRISMACounts` dataclass. +- `prisma.bibliometrics`: VOSviewer `.net` loader, Louvain communities, co-occurrence matrix. +- `prisma.viz`: matplotlib config with the Proportione brand palette. +- Streamlit demo (`streamlit_app/`) walking through the full pipeline. +- CLI (`prisma`) covering ingest / screen / extract / quality / report. +- Zenodo archive with DOI [`10.5281/zenodo.19883809`](https://doi.org/10.5281/zenodo.19883809). +- Cited in: Pub3-v2 (Cuervo & Marques, 2026, Ibero-American bibliometric mapping, under review at CIDEMA II McGraw Hill). 
diff --git a/pyproject.toml b/pyproject.toml index aa260f3..14134da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ classifiers = [ dependencies = [ "pandas>=2.0", "numpy>=1.24", + "scipy>=1.10", "rapidfuzz>=3.0", "requests>=2.31", "pymupdf>=1.23", @@ -54,13 +55,16 @@ streamlit = [ "streamlit>=1.30", "plotly>=5.18", ] +stats = [ + "statsmodels>=0.14", +] dev = [ "pytest>=7.4", "pytest-cov>=4.1", "ruff>=0.3", "mypy>=1.8", ] -all = ["proportione-prisma[streamlit,dev]"] +all = ["proportione-prisma[streamlit,stats,dev]"] [project.urls] Homepage = "https://github.com/Proportione/prisma" diff --git a/src/prisma/cli.py b/src/prisma/cli.py index 95c4c4b..c5d3944 100644 --- a/src/prisma/cli.py +++ b/src/prisma/cli.py @@ -8,6 +8,7 @@ - extract Extract fields from PDFs using a YAML taxonomy. - quality MMAT 2018 quality assessment from PDF text. - report Render a PRISMA 2020 flow diagram from a JSON counts file. +- correlate Pearson (+Fisher-z CI) and Spearman correlations, overall/per stratum. 
""" from __future__ import annotations @@ -187,5 +188,39 @@ def report(counts: str, output: str, title: str, note: str) -> None: click.echo(f"Wrote {out_path}") +@main.command() +@click.option("--in", "csv_path", required=True, type=click.Path(exists=True, dir_okay=False), + help="CSV with one row per record.") +@click.option("--x", required=True, help="First numeric column.") +@click.option("--y", required=True, help="Second numeric column.") +@click.option("--group", default=None, help="Optional column to stratify by.") +@click.option("--partial", "covar", default=None, help="Optional covariate for a partial correlation r(x,y|covar).") +@click.option("--bootstrap", is_flag=True, help="Add a percentile bootstrap CI (useful for small cells).") +@click.option("--out", "output_csv", default=None, type=click.Path(dir_okay=False), + help="Optional CSV to write the correlation table to.") +def correlate(csv_path: str, x: str, y: str, group: str | None, covar: str | None, + bootstrap: bool, output_csv: str | None) -> None: + """Pearson (+Fisher-z CI) and Spearman correlations, overall and per stratum. + + Reviewer-proof by design: every estimate carries n, a 95% CI and a rank-based + robustness check; optional partial correlation and bootstrap CI. 
+ """ + import pandas as pd + from prisma.stats import bootstrap_ci, correlation_table, partial_correlation + + df = pd.read_csv(csv_path) + table = correlation_table(df, x, y, group=group) + click.echo(table.to_string(index=False)) + if output_csv: + out = Path(output_csv) + out.parent.mkdir(parents=True, exist_ok=True) + table.to_csv(out, index=False) + click.echo(f"Wrote {out}") + if covar: + click.echo(f"\nPartial r({x},{y}|{covar}): {partial_correlation(df, x, y, covar)}") + if bootstrap: + click.echo(f"Bootstrap CI: {bootstrap_ci(df, x, y)}") + + if __name__ == "__main__": main() diff --git a/src/prisma/stats/__init__.py b/src/prisma/stats/__init__.py new file mode 100644 index 0000000..fd83038 --- /dev/null +++ b/src/prisma/stats/__init__.py @@ -0,0 +1,25 @@ +"""Statistical helpers for transparent, reviewer-proof SLR / education-research analysis. + +Reusable methods extracted while hardening the statistical rigour of the +20-60-20 AI-permitted-assessment study (Cuervo, 2026): within-stratum +correlations with Fisher-z confidence intervals, Spearman robustness, +partial correlation, bootstrap CIs for small cells, missingness/selection +checks, and a clustered (mixed-effects) model with ICC. +""" +from .correlations import ( + fisher_ci, + correlation_table, + partial_correlation, + bootstrap_ci, + missingness_compare, + mixed_model_icc, +) + +__all__ = [ + "fisher_ci", + "correlation_table", + "partial_correlation", + "bootstrap_ci", + "missingness_compare", + "mixed_model_icc", +] diff --git a/src/prisma/stats/correlations.py b/src/prisma/stats/correlations.py new file mode 100644 index 0000000..bbb0e61 --- /dev/null +++ b/src/prisma/stats/correlations.py @@ -0,0 +1,152 @@ +"""Correlation and robustness statistics for education / SLR datasets. 
+ +Design goals: every estimate ships with the information a sceptical reviewer +asks for — sample size, a confidence interval, a rank-based robustness check, +and (where relevant) controls for mechanical overlap, small-cell fragility, +selection bias and clustering. Pure functions over pandas DataFrames. +""" +from __future__ import annotations + +import math +from typing import Iterable, Sequence + +import numpy as np +import pandas as pd +from scipy import stats + + +def fisher_ci(r: float, n: int, alpha: float = 0.05) -> tuple[float, float]: + """Fisher z-transform confidence interval for a Pearson correlation. + + Returns (nan, nan) when undefined (n < 4 or |r| >= 1). + """ + if n < 4 or abs(r) >= 1.0: + return (float("nan"), float("nan")) + z = math.atanh(r) + se = 1.0 / math.sqrt(n - 3) + crit = stats.norm.ppf(1 - alpha / 2) + return (math.tanh(z - crit * se), math.tanh(z + crit * se)) + + +def _pair(df: pd.DataFrame, x: str, y: str) -> pd.DataFrame: + return df[[x, y]].apply(pd.to_numeric, errors="coerce").dropna() + + +def correlation_table( + df: pd.DataFrame, + x: str, + y: str, + group: str | None = None, + alpha: float = 0.05, +) -> pd.DataFrame: + """Pearson r (+ Fisher-z CI) and Spearman rho with n, overall and per group. + + The Spearman column is the headline robustness check: a Pearson/Spearman gap + flags outlier- or spread-driven associations. 
+ """ + def one(sub: pd.DataFrame, label: str) -> dict | None: + d = _pair(sub, x, y) + if len(d) < 4: + return None + pr, pp = stats.pearsonr(d[x], d[y]) + sr, sp = stats.spearmanr(d[x], d[y]) + lo, hi = fisher_ci(pr, len(d), alpha) + return { + "stratum": label, "n": len(d), + "pearson_r": round(pr, 3), "ci_low": round(lo, 3), "ci_high": round(hi, 3), + "pearson_p": pp, "spearman_rho": round(sr, 3), "spearman_p": sp, + } + + rows = [one(df, "ALL")] + if group is not None: + for g, sub in df.groupby(group): + rows.append(one(sub, str(g))) + return pd.DataFrame([r for r in rows if r is not None]) + + +def partial_correlation(df: pd.DataFrame, x: str, y: str, covar: str) -> dict: + """First-order partial correlation r(x, y | covar). + + Answers 'does the x–y association survive controlling for a confounder + such as general ability?'. + """ + d = df[[x, y, covar]].apply(pd.to_numeric, errors="coerce").dropna() + if len(d) < 5: + return {"n": len(d), "partial_r": float("nan"), "raw_r": float("nan")} + rxy = stats.pearsonr(d[x], d[y])[0] + rxz = stats.pearsonr(d[x], d[covar])[0] + ryz = stats.pearsonr(d[y], d[covar])[0] + denom = math.sqrt((1 - rxz ** 2) * (1 - ryz ** 2)) + pr = (rxy - rxz * ryz) / denom if denom else float("nan") + return {"n": len(d), "partial_r": round(pr, 3), "raw_r": round(rxy, 3)} + + +def bootstrap_ci( + df: pd.DataFrame, x: str, y: str, n_boot: int = 5000, seed: int = 0, alpha: float = 0.05 +) -> dict: + """Percentile bootstrap CI for a Pearson r — for small/fragile cells.""" + d = _pair(df, x, y) + if len(d) < 10: + return {"n": len(d), "r": float("nan"), "ci_low": float("nan"), "ci_high": float("nan")} + arr = d.to_numpy() + rng = np.random.default_rng(seed) + boot = [] + for _ in range(n_boot): + s = arr[rng.integers(0, len(arr), len(arr))] + if s[:, 0].std() > 0 and s[:, 1].std() > 0: + boot.append(np.corrcoef(s[:, 0], s[:, 1])[0, 1]) + return { + "n": len(d), "r": round(float(np.corrcoef(arr[:, 0], arr[:, 1])[0, 1]), 3), + "ci_low": 
round(float(np.percentile(boot, 100 * alpha / 2)), 3), + "ci_high": round(float(np.percentile(boot, 100 * (1 - alpha / 2))), 3), + "n_boot": len(boot), + } + + +def missingness_compare(df: pd.DataFrame, indicator: str, by: Sequence[str]) -> pd.DataFrame: + """Compare rows where `indicator` is present vs missing on `by` variables. + + Surfaces selection bias from listwise deletion (Mann-Whitney U, non-parametric). + """ + present = df[df[indicator].notna()] + missing = df[df[indicator].isna()] + out = [] + for var in by: + h = pd.to_numeric(present[var], errors="coerce").dropna() + m = pd.to_numeric(missing[var], errors="coerce").dropna() + rec = {"variable": var, "n_present": len(h), "n_missing": len(m), + "mean_present": round(h.mean(), 2) if len(h) else float("nan"), + "mean_missing": round(m.mean(), 2) if len(m) else float("nan"), "mw_p": float("nan")} + if len(h) > 3 and len(m) > 3: + rec["mw_p"] = stats.mannwhitneyu(h, m).pvalue + out.append(rec) + return pd.DataFrame(out) + + +def mixed_model_icc(df: pd.DataFrame, outcome: str, predictor: str, group: str) -> dict: + """Random-intercept mixed model `outcome ~ predictor + (1|group)` with ICC. + + Accounts for clustering (students nested in cohorts/courses). Reports the + cluster-adjusted fixed effect, the naive OLS effect, and the intraclass + correlation. Requires the optional `statsmodels` dependency. 
+ """ + try: + import statsmodels.formula.api as smf + except ImportError as exc: # pragma: no cover + raise ImportError( + "mixed_model_icc requires statsmodels: pip install 'proportione-prisma[stats]'" + ) from exc + d = df[[outcome, predictor, group]].apply( + lambda c: pd.to_numeric(c, errors="coerce") if c.name != group else c + ).dropna() + md = smf.mixedlm(f"{outcome} ~ {predictor}", d, groups=d[group]).fit(reml=False) + ols = smf.ols(f"{outcome} ~ {predictor}", d).fit() + gv = float(md.cov_re.iloc[0, 0]) + rv = float(md.scale) + icc = gv / (gv + rv) if (gv + rv) else float("nan") + return { + "n": len(d), "n_groups": int(d[group].nunique()), + "coef": round(float(md.params.get(predictor)), 4), "p": float(md.pvalues.get(predictor)), + "icc": round(icc, 3), + "ols_coef": round(float(ols.params[predictor]), 4), "ols_p": float(ols.pvalues[predictor]), + } diff --git a/tests/test_stats.py b/tests/test_stats.py new file mode 100644 index 0000000..ca361ea --- /dev/null +++ b/tests/test_stats.py @@ -0,0 +1,62 @@ +"""Tests for prisma.stats correlation helpers.""" +import math + +import numpy as np +import pandas as pd +import pytest + +from prisma.stats import ( + bootstrap_ci, + correlation_table, + fisher_ci, + missingness_compare, + partial_correlation, +) + + +@pytest.fixture +def df(): + rng = np.random.default_rng(7) + n = 120 + x = rng.normal(size=n) + y = 0.6 * x + rng.normal(scale=0.8, size=n) + z = 0.5 * x + rng.normal(scale=0.9, size=n) + g = ["A"] * 60 + ["B"] * 60 + d = pd.DataFrame({"x": x, "y": y, "z": z, "g": g}) + # introduce some missingness in y + d.loc[d.index[:20], "y"] = np.nan + return d + + +def test_fisher_ci_orders_and_bounds(): + lo, hi = fisher_ci(0.5, 100) + assert lo < 0.5 < hi + assert -1 <= lo <= 1 and -1 <= hi <= 1 + assert all(math.isnan(v) for v in fisher_ci(0.5, 3)) # n too small + + +def test_correlation_table_overall_and_grouped(df): + t = correlation_table(df, "x", "y", group="g") + assert set(["stratum", "n", "pearson_r", 
"ci_low", "ci_high", "spearman_rho"]).issubset(t.columns) + assert t.iloc[0]["stratum"] == "ALL" + assert {"A", "B"}.issubset(set(t["stratum"])) + row = t.iloc[0] + assert row["ci_low"] < row["pearson_r"] < row["ci_high"] + assert row["pearson_r"] > 0 # positive by construction + + +def test_partial_correlation_reduces_when_confounded(df): + p = partial_correlation(df, "x", "y", "z") + assert p["n"] > 5 + assert abs(p["partial_r"]) <= abs(p["raw_r"]) + 1e-9 # controlling a correlate cannot inflate + + +def test_bootstrap_ci_brackets_point(df): + b = bootstrap_ci(df, "x", "y", n_boot=500, seed=1) + assert b["ci_low"] < b["r"] < b["ci_high"] + + +def test_missingness_compare_flags_columns(df): + m = missingness_compare(df, indicator="y", by=["x", "z"]) + assert list(m["variable"]) == ["x", "z"] + assert (m["n_missing"] == 20).all() From 3dae65defd6d2a30039459e759a08b2a13411052 Mon Sep 17 00:00:00 2001 From: Javier Cuervo Date: Thu, 25 Jun 2026 12:11:10 +0100 Subject: [PATCH 2/3] fix(stats): resuelve lint ruff en correlations.py (#2) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/prisma/stats/correlations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/prisma/stats/correlations.py b/src/prisma/stats/correlations.py index bbb0e61..e955fbc 100644 --- a/src/prisma/stats/correlations.py +++ b/src/prisma/stats/correlations.py @@ -8,7 +8,7 @@ from __future__ import annotations import math -from typing import Iterable, Sequence +from collections.abc import Sequence import numpy as np import pandas as pd @@ -67,7 +67,7 @@ def one(sub: pd.DataFrame, label: str) -> dict | None: def partial_correlation(df: pd.DataFrame, x: str, y: str, covar: str) -> dict: """First-order partial correlation r(x, y | covar). - Answers 'does the x–y association survive controlling for a confounder + Answers 'does the x-y association survive controlling for a confounder such as general ability?'. 
""" d = df[[x, y, covar]].apply(pd.to_numeric, errors="coerce").dropna() From 0b4b5be0cd10b742a7e7d3ce7cf7b308bb0d3190 Mon Sep 17 00:00:00 2001 From: Javier Cuervo Date: Thu, 25 Jun 2026 12:58:39 +0100 Subject: [PATCH 3/3] fix(lint): ordena imports e __all__ en cli.py y stats (#2) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/prisma/cli.py | 1 + src/prisma/stats/__init__.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/prisma/cli.py b/src/prisma/cli.py index c5d3944..c3fee72 100644 --- a/src/prisma/cli.py +++ b/src/prisma/cli.py @@ -206,6 +206,7 @@ def correlate(csv_path: str, x: str, y: str, group: str | None, covar: str | Non robustness check; optional partial correlation and bootstrap CI. """ import pandas as pd + from prisma.stats import bootstrap_ci, correlation_table, partial_correlation df = pd.read_csv(csv_path) diff --git a/src/prisma/stats/__init__.py b/src/prisma/stats/__init__.py index fd83038..d7cfa76 100644 --- a/src/prisma/stats/__init__.py +++ b/src/prisma/stats/__init__.py @@ -7,19 +7,19 @@ checks, and a clustered (mixed-effects) model with ICC. """ from .correlations import ( - fisher_ci, - correlation_table, - partial_correlation, bootstrap_ci, + correlation_table, + fisher_ci, missingness_compare, mixed_model_icc, + partial_correlation, ) __all__ = [ - "fisher_ci", - "correlation_table", - "partial_correlation", "bootstrap_ci", + "correlation_table", + "fisher_ci", "missingness_compare", "mixed_model_icc", + "partial_correlation", ]