Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Changelog

All notable changes to `proportione-prisma` are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- `prisma.stats`: reviewer-proof correlation/robustness helpers — `fisher_ci`, `correlation_table` (Pearson + Fisher-z CI + Spearman, overall and per stratum), `partial_correlation`, `bootstrap_ci` (small-cell fragility), `missingness_compare` (selection-bias check via Mann-Whitney), and `mixed_model_icc` (random-intercept clustering + ICC; optional `statsmodels` extra). New `prisma correlate` CLI command. Adds `scipy` as a core dependency and a `[stats]` optional extra. Extracted while hardening the statistical rigour of the 20-60-20 AI-permitted-assessment study (Cuervo, 2026, Universidade de Aveiro).
- Pub1-Fusion (Cuervo & Marques, 2026, *From search queries to strategic decisions: a hybrid systematic and bibliometric review of digital signals in business forecasting*) submitted to the *International Journal of Information Management* — citation slot reserved for the next release once the editorial decision lands.

## [0.1.0] — 2026-04-29

### Added

- Initial public release.
- `prisma.ingest`: OpenAlex search, Unpaywall PDF discovery, RIS I/O, cross-source deduplication (DOI matching plus rapidfuzz title-author similarity).
- `prisma.screening`: two-tier rule engine (hard exclusion + multi-group inclusion), YAML-defined, full audit log.
- `prisma.extraction`: PyMuPDF text extraction, section detection, taxonomy-driven field extraction.
- `prisma.quality`: MMAT 2018 quantitative-descriptive heuristic scoring (Q1–Q5, High/Medium/Low).
- `prisma.reporting`: PRISMA 2020 flow diagram from `PRISMACounts` dataclass.
- `prisma.bibliometrics`: VOSviewer `.net` loader, Louvain communities, co-occurrence matrix.
- `prisma.viz`: matplotlib config with the Proportione brand palette.
- Streamlit demo (`streamlit_app/`) walking through the full pipeline.
- CLI (`prisma`) covering ingest / screen / extract / quality / report.
- Zenodo archive with DOI [`10.5281/zenodo.19883809`](https://doi.org/10.5281/zenodo.19883809).
- Cited in: Pub3-v2 (Cuervo & Marques, 2026, Ibero-American bibliometric mapping, under review at CIDEMA II McGraw Hill).
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ classifiers = [
dependencies = [
"pandas>=2.0",
"numpy>=1.24",
"scipy>=1.10",
"rapidfuzz>=3.0",
"requests>=2.31",
"pymupdf>=1.23",
Expand All @@ -54,13 +55,16 @@ streamlit = [
"streamlit>=1.30",
"plotly>=5.18",
]
stats = [
"statsmodels>=0.14",
]
dev = [
"pytest>=7.4",
"pytest-cov>=4.1",
"ruff>=0.3",
"mypy>=1.8",
]
all = ["proportione-prisma[streamlit,dev]"]
all = ["proportione-prisma[streamlit,stats,dev]"]

[project.urls]
Homepage = "https://github.com/Proportione/prisma"
Expand Down
36 changes: 36 additions & 0 deletions src/prisma/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- extract Extract fields from PDFs using a YAML taxonomy.
- quality MMAT 2018 quality assessment from PDF text.
- report Render a PRISMA 2020 flow diagram from a JSON counts file.
- correlate Pearson (+Fisher-z CI) and Spearman correlations, overall/per stratum.
"""
from __future__ import annotations

Expand Down Expand Up @@ -187,5 +188,40 @@ def report(counts: str, output: str, title: str, note: str) -> None:
click.echo(f"Wrote {out_path}")


@main.command()
@click.option("--in", "csv_path", required=True, type=click.Path(exists=True, dir_okay=False),
              help="CSV with one row per record.")
@click.option("--x", required=True, help="First numeric column.")
@click.option("--y", required=True, help="Second numeric column.")
@click.option("--group", default=None, help="Optional column to stratify by.")
@click.option("--partial", "covar", default=None, help="Optional covariate for a partial correlation r(x,y|covar).")
@click.option("--bootstrap", is_flag=True, help="Add a percentile bootstrap CI (useful for small cells).")
@click.option("--out", "output_csv", default=None, type=click.Path(dir_okay=False),
              help="Optional CSV to write the correlation table to.")
def correlate(csv_path: str, x: str, y: str, group: str | None, covar: str | None,
              bootstrap: bool, output_csv: str | None) -> None:
    """Pearson (+Fisher-z CI) and Spearman correlations, overall and per stratum.

    Reviewer-proof by design: every estimate carries n, a 95% CI and a rank-based
    robustness check; optional partial correlation and bootstrap CI.
    """
    # NOTE: the docstring above doubles as the command's --help text, so it is
    # user-facing output rather than a free-form comment.

    # Imported lazily so the other sub-commands do not pay for pandas/scipy.
    import pandas as pd

    from prisma.stats import bootstrap_ci, correlation_table, partial_correlation

    frame = pd.read_csv(csv_path)

    # 1) The correlation table is always computed and printed.
    result = correlation_table(frame, x, y, group=group)
    click.echo(result.to_string(index=False))

    # 2) Optionally persist the same table, creating parent directories on demand.
    if output_csv:
        destination = Path(output_csv)
        destination.parent.mkdir(parents=True, exist_ok=True)
        result.to_csv(destination, index=False)
        click.echo(f"Wrote {destination}")

    # 3) Optional extras are printed only; they are not written to the CSV.
    if covar:
        partial = partial_correlation(frame, x, y, covar)
        click.echo(f"\nPartial r({x},{y}|{covar}): {partial}")

    if bootstrap:
        interval = bootstrap_ci(frame, x, y)
        click.echo(f"Bootstrap CI: {interval}")


# Invoke the click command group when this module is executed as a script.
if __name__ == "__main__":
    main()
25 changes: 25 additions & 0 deletions src/prisma/stats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Statistical helpers for transparent, reviewer-proof SLR / education-research analysis.

Reusable methods extracted while hardening the statistical rigour of the
20-60-20 AI-permitted-assessment study (Cuervo, 2026): within-stratum
correlations with Fisher-z confidence intervals, Spearman robustness,
partial correlation, bootstrap CIs for small cells, missingness/selection
checks, and a clustered (mixed-effects) model with ICC.
"""
from .correlations import (
bootstrap_ci,
correlation_table,
fisher_ci,
missingness_compare,
mixed_model_icc,
partial_correlation,
)

# Public API of the package: exactly the helpers imported from `.correlations`
# in this module, listed alphabetically.
__all__ = [
    "bootstrap_ci",
    "correlation_table",
    "fisher_ci",
    "missingness_compare",
    "mixed_model_icc",
    "partial_correlation",
]
152 changes: 152 additions & 0 deletions src/prisma/stats/correlations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""Correlation and robustness statistics for education / SLR datasets.

Design goals: every estimate ships with the information a sceptical reviewer
asks for — sample size, a confidence interval, a rank-based robustness check,
and (where relevant) controls for mechanical overlap, small-cell fragility,
selection bias and clustering. Pure functions over pandas DataFrames.
"""
from __future__ import annotations

import math
from collections.abc import Sequence

import numpy as np
import pandas as pd
from scipy import stats


def fisher_ci(r: float, n: int, alpha: float = 0.05) -> tuple[float, float]:
    """Confidence interval for a Pearson correlation via the Fisher z-transform.

    Args:
        r: Sample Pearson correlation.
        n: Number of paired observations behind ``r``.
        alpha: Two-sided significance level; the interval has coverage ``1 - alpha``.

    Returns:
        ``(low, high)`` on the correlation scale, or ``(nan, nan)`` when the
        interval is undefined (``n < 4`` or ``|r| >= 1``).
    """
    if n < 4 or abs(r) >= 1.0:
        undefined = float("nan")
        return (undefined, undefined)

    # Work on the z scale, where the standard error is 1 / sqrt(n - 3).
    centre = math.atanh(r)
    half_width = stats.norm.ppf(1 - alpha / 2) * (1.0 / math.sqrt(n - 3))

    # Map both end points back to the correlation scale.
    lower = math.tanh(centre - half_width)
    upper = math.tanh(centre + half_width)
    return (lower, upper)


def _pair(df: pd.DataFrame, x: str, y: str) -> pd.DataFrame:
    """Return columns ``x`` and ``y`` coerced to numeric, keeping complete rows only.

    Values that cannot be parsed as numbers become NaN and the rows containing
    them are dropped; the original index labels of surviving rows are kept.
    """
    numeric = df[[x, y]].apply(pd.to_numeric, errors="coerce")
    return numeric.dropna()


def correlation_table(
    df: pd.DataFrame,
    x: str,
    y: str,
    group: str | None = None,
    alpha: float = 0.05,
) -> pd.DataFrame:
    """Pearson r (+ Fisher-z CI) and Spearman rho with n, overall and per group.

    The Spearman column is the headline robustness check: a Pearson/Spearman gap
    flags outlier- or spread-driven associations.

    Args:
        df: Input records, one row per observation.
        x: Name of the first column (coerced to numeric).
        y: Name of the second column (coerced to numeric).
        group: Optional column to stratify by; one extra row is produced per
            group yielded by ``df.groupby(group)``.
        alpha: Two-sided level for the Fisher-z interval.

    Returns:
        A DataFrame with columns ``stratum, n, pearson_r, ci_low, ci_high,
        pearson_p, spearman_rho, spearman_p``. The first row (label ``"ALL"``)
        covers the whole frame. Any stratum with fewer than 4 complete pairs is
        omitted. The columns are present even when every stratum is omitted,
        so downstream printing / CSV export always sees the same schema.
    """
    columns = [
        "stratum", "n", "pearson_r", "ci_low", "ci_high",
        "pearson_p", "spearman_rho", "spearman_p",
    ]

    def one(sub: pd.DataFrame, label: str) -> dict | None:
        d = _pair(sub, x, y)
        if len(d) < 4:
            # Too few complete pairs for a Fisher-z interval: skip the stratum.
            return None
        pr, pp = stats.pearsonr(d[x], d[y])
        sr, sp = stats.spearmanr(d[x], d[y])
        lo, hi = fisher_ci(pr, len(d), alpha)
        return {
            "stratum": label, "n": len(d),
            "pearson_r": round(pr, 3), "ci_low": round(lo, 3), "ci_high": round(hi, 3),
            "pearson_p": pp, "spearman_rho": round(sr, 3), "spearman_p": sp,
        }

    rows = [one(df, "ALL")]
    if group is not None:
        rows.extend(one(sub, str(g)) for g, sub in df.groupby(group))

    kept = [r for r in rows if r is not None]
    # Passing `columns` explicitly keeps the schema stable when `kept` is empty;
    # for non-empty input the order matches the dict insertion order above.
    return pd.DataFrame(kept, columns=columns)


def partial_correlation(df: pd.DataFrame, x: str, y: str, covar: str) -> dict:
    """Compute the first-order partial correlation r(x, y | covar).

    Useful for asking whether the x-y association survives once a third
    variable (e.g. general ability) is held constant.

    Args:
        df: Input records.
        x: First column name.
        y: Second column name.
        covar: Column to control for.

    Returns:
        ``{"n", "partial_r", "raw_r"}`` where ``n`` is the number of rows with
        all three values numeric, and both correlations are rounded to 3
        decimals. Both are NaN when fewer than 5 complete rows remain;
        ``partial_r`` is also NaN when the denominator is zero.
    """
    nan = float("nan")
    clean = df[[x, y, covar]].apply(pd.to_numeric, errors="coerce").dropna()
    n = len(clean)
    if n < 5:
        return {"n": n, "partial_r": nan, "raw_r": nan}

    def pearson(a: str, b: str) -> float:
        return stats.pearsonr(clean[a], clean[b])[0]

    r_xy = pearson(x, y)
    r_xz = pearson(x, covar)
    r_yz = pearson(y, covar)

    # Standard first-order formula: (r_xy - r_xz r_yz) / sqrt((1-r_xz^2)(1-r_yz^2)).
    scale = math.sqrt((1 - r_xz ** 2) * (1 - r_yz ** 2))
    if scale:
        partial = (r_xy - r_xz * r_yz) / scale
    else:
        partial = nan
    return {"n": n, "partial_r": round(partial, 3), "raw_r": round(r_xy, 3)}


def bootstrap_ci(
    df: pd.DataFrame, x: str, y: str, n_boot: int = 5000, seed: int = 0, alpha: float = 0.05
) -> dict:
    """Percentile bootstrap CI for a Pearson r — for small/fragile cells.

    Args:
        df: Input records.
        x: First column name (coerced to numeric).
        y: Second column name (coerced to numeric).
        n_boot: Number of bootstrap resamples to draw.
        seed: Seed for ``numpy.random.default_rng``; fixed by default so the
            interval is reproducible.
        alpha: Two-sided level; the interval spans the ``alpha/2`` and
            ``1 - alpha/2`` percentiles of the resampled correlations.

    Returns:
        ``{"n", "r", "ci_low", "ci_high", "n_boot"}``. ``n`` is the number of
        complete pairs and ``n_boot`` the number of resamples actually used
        (resamples where either column has zero spread are discarded).
        ``r``, ``ci_low`` and ``ci_high`` are NaN with ``n_boot == 0`` when there
        are fewer than 10 complete pairs or either column has no variance;
        the interval alone is NaN when no usable resample was obtained.
    """
    nan = float("nan")
    d = _pair(df, x, y)
    n = len(d)
    arr = d.to_numpy()

    # Correlation is undefined for tiny samples and for zero-variance columns.
    # `n` is tested first so the spread is never computed on an empty array.
    if n < 10 or not (arr[:, 0].std() > 0 and arr[:, 1].std() > 0):
        return {"n": n, "r": nan, "ci_low": nan, "ci_high": nan, "n_boot": 0}

    rng = np.random.default_rng(seed)
    boot = []
    for _ in range(n_boot):
        # Resample rows with replacement, keeping (x, y) pairs intact.
        s = arr[rng.integers(0, n, n)]
        if s[:, 0].std() > 0 and s[:, 1].std() > 0:
            boot.append(np.corrcoef(s[:, 0], s[:, 1])[0, 1])

    point = round(float(np.corrcoef(arr[:, 0], arr[:, 1])[0, 1]), 3)
    if not boot:
        # e.g. n_boot <= 0, or every resample degenerate: never hand an empty
        # sample to np.percentile.
        return {"n": n, "r": point, "ci_low": nan, "ci_high": nan, "n_boot": 0}

    return {
        "n": n, "r": point,
        "ci_low": round(float(np.percentile(boot, 100 * alpha / 2)), 3),
        "ci_high": round(float(np.percentile(boot, 100 * (1 - alpha / 2))), 3),
        "n_boot": len(boot),
    }


def missingness_compare(df: pd.DataFrame, indicator: str, by: Sequence[str]) -> pd.DataFrame:
    """Contrast rows with and without a value in ``indicator`` on the ``by`` variables.

    Intended to surface selection bias introduced by listwise deletion, using
    the non-parametric Mann-Whitney U test.

    Args:
        df: Input records.
        indicator: Column whose missingness splits the rows into two groups.
        by: Columns to compare between the two groups (coerced to numeric).

    Returns:
        One row per variable with ``variable, n_present, n_missing,
        mean_present, mean_missing, mw_p``. Means are rounded to 2 decimals and
        are NaN for an empty group; ``mw_p`` is NaN unless both groups have
        more than 3 numeric values.
    """
    nan = float("nan")
    is_missing = df[indicator].isna()
    observed = df[~is_missing]
    absent = df[is_missing]

    records = []
    for var in by:
        have = pd.to_numeric(observed[var], errors="coerce").dropna()
        lack = pd.to_numeric(absent[var], errors="coerce").dropna()

        # Only test when both sides have more than 3 usable values.
        p_value = nan
        if len(have) > 3 and len(lack) > 3:
            p_value = stats.mannwhitneyu(have, lack).pvalue

        records.append({
            "variable": var,
            "n_present": len(have),
            "n_missing": len(lack),
            "mean_present": round(have.mean(), 2) if len(have) else nan,
            "mean_missing": round(lack.mean(), 2) if len(lack) else nan,
            "mw_p": p_value,
        })
    return pd.DataFrame(records)


def mixed_model_icc(df: pd.DataFrame, outcome: str, predictor: str, group: str) -> dict:
    """Fit ``outcome ~ predictor + (1|group)`` and report the intraclass correlation.

    Handles clustered data (e.g. students nested in cohorts/courses) by fitting
    a random-intercept model with maximum likelihood (``reml=False``) next to a
    plain OLS fit of the same formula, so the two slopes can be compared.

    Args:
        df: Input records.
        outcome: Dependent-variable column (coerced to numeric).
        predictor: Fixed-effect column (coerced to numeric).
        group: Clustering column; used as-is, not coerced.

    Returns:
        ``{"n", "n_groups", "coef", "p", "icc", "ols_coef", "ols_p"}`` — the
        mixed-model slope and p-value, the ICC (group variance over group plus
        residual variance, NaN if that sum is zero), and the OLS slope and
        p-value. Coefficients are rounded to 4 decimals, the ICC to 3.

    Raises:
        ImportError: If the optional ``statsmodels`` dependency is missing.
    """
    try:
        import statsmodels.formula.api as smf
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "mixed_model_icc requires statsmodels: pip install 'proportione-prisma[stats]'"
        ) from exc

    def numeric_unless_group(col: pd.Series) -> pd.Series:
        # The grouping column keeps its labels; everything else becomes numeric.
        if col.name == group:
            return col
        return pd.to_numeric(col, errors="coerce")

    data = df[[outcome, predictor, group]].apply(numeric_unless_group).dropna()

    formula = f"{outcome} ~ {predictor}"
    mixed = smf.mixedlm(formula, data, groups=data[group]).fit(reml=False)
    naive = smf.ols(formula, data).fit()

    between = float(mixed.cov_re.iloc[0, 0])  # random-intercept variance
    within = float(mixed.scale)               # residual variance
    total = between + within
    icc = between / total if total else float("nan")

    return {
        "n": len(data),
        "n_groups": int(data[group].nunique()),
        "coef": round(float(mixed.params.get(predictor)), 4),
        "p": float(mixed.pvalues.get(predictor)),
        "icc": round(icc, 3),
        "ols_coef": round(float(naive.params[predictor]), 4),
        "ols_p": float(naive.pvalues[predictor]),
    }
62 changes: 62 additions & 0 deletions tests/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Tests for prisma.stats correlation helpers."""
import math

import numpy as np
import pandas as pd
import pytest

from prisma.stats import (
bootstrap_ci,
correlation_table,
fisher_ci,
missingness_compare,
partial_correlation,
)


@pytest.fixture
def df():
    # Synthetic 120-row frame shared by the tests below. The seed and the order
    # of the rng draws fix the exact values, so keep both unchanged.
    rng = np.random.default_rng(7)
    n = 120
    x = rng.normal(size=n)
    # y and z are both noisy linear functions of x.
    y = 0.6 * x + rng.normal(scale=0.8, size=n)
    z = 0.5 * x + rng.normal(scale=0.9, size=n)
    # Two equally sized strata: first 60 rows "A", last 60 rows "B".
    g = ["A"] * 60 + ["B"] * 60
    d = pd.DataFrame({"x": x, "y": y, "z": z, "g": g})
    # introduce some missingness in y (first 20 rows, all inside stratum "A")
    d.loc[d.index[:20], "y"] = np.nan
    return d


def test_fisher_ci_orders_and_bounds():
    """The interval brackets r, stays within [-1, 1], and is NaN for tiny n."""
    lower, upper = fisher_ci(0.5, 100)
    assert lower < 0.5 < upper
    assert -1 <= lower <= 1 and -1 <= upper <= 1

    undefined = fisher_ci(0.5, 3)  # n too small
    assert all(math.isnan(bound) for bound in undefined)


def test_correlation_table_overall_and_grouped(df):
    """The table leads with the ALL row, includes each stratum, and has a sane CI."""
    table = correlation_table(df, "x", "y", group="g")

    expected_columns = {"stratum", "n", "pearson_r", "ci_low", "ci_high", "spearman_rho"}
    assert expected_columns.issubset(table.columns)

    overall = table.iloc[0]
    assert overall["stratum"] == "ALL"
    assert {"A", "B"}.issubset(set(table["stratum"]))

    assert overall["ci_low"] < overall["pearson_r"] < overall["ci_high"]
    assert overall["pearson_r"] > 0  # positive by construction


def test_partial_correlation_reduces_when_confounded(df):
    """Partialling out z does not strengthen the x-y correlation on the fixture data."""
    result = partial_correlation(df, "x", "y", "z")
    assert result["n"] > 5
    # Expected for this fixture, where z is a noisy copy of x. Not a general law:
    # suppressor variables can make a partial correlation larger than the raw one.
    assert abs(result["partial_r"]) <= abs(result["raw_r"]) + 1e-9


def test_bootstrap_ci_brackets_point(df):
    """The percentile interval contains the full-sample point estimate."""
    result = bootstrap_ci(df, "x", "y", n_boot=500, seed=1)
    low, point, high = result["ci_low"], result["r"], result["ci_high"]
    assert low < point < high


def test_missingness_compare_flags_columns(df):
    """One output row per requested variable, each seeing the 20 rows missing y."""
    report = missingness_compare(df, indicator="y", by=["x", "z"])
    assert list(report["variable"]) == ["x", "z"]
    assert (report["n_missing"] == 20).all()
Loading