From b8c208eabf679c814dcbdf59233e3f93cbddd6c1 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 24 May 2026 15:57:33 +0000 Subject: [PATCH 1/3] =?UTF-8?q?feat:=20add=20B-Cubed=20(B=C2=B3)=20metrics?= =?UTF-8?q?=20and=20confusion=20matrix=20to=20ER=20scoring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit score_er_tier now also reports cluster-level B-Cubed precision/recall/F1 (clusters built from the pair graph via connected components over all rows) and the full pair-level confusion matrix (TP/FP/FN/TN), shown in the ER report (rich table + JSON). These are diagnostic only — the headline DQBench ER Score stays pair-F1-weighted and unchanged, so published leaderboard entries don't move. https://claude.ai/code/session_01KjxRYsnVFPVJ3aUBmNm7vB --- AGENTS.md | 4 +-- CHANGELOG.md | 3 +- CLAUDE.md | 3 +- README.md | 2 +- dqbench/er_scorer.py | 77 +++++++++++++++++++++++++++++++++++++++-- dqbench/models.py | 7 ++++ dqbench/report.py | 31 ++++++++++++++--- tests/test_er_scorer.py | 47 +++++++++++++++++++++++++ 8 files changed, 163 insertions(+), 11 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 1bad9c6..b17bf42 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,7 +6,7 @@ The standard benchmark for data quality and validation tools — five categories ```bash pip install -e ".[dev]" # Dev install -pytest --tb=short -v # Run tests (242 passing) +pytest --tb=short -v # Run tests (247 passing) ruff check . # Lint dqbench run # Run benchmark (records result on the local leaderboard) dqbench run all # Head-to-head comparison @@ -93,7 +93,7 @@ The adapter interface is the primary extension point. Each adapter implements a ## Performance & Testing -- Always run `pytest --tb=short -v` before committing. All 242 tests must pass. +- Always run `pytest --tb=short -v` before committing. All 247 tests must pass. - Always run `ruff check .` for linting. - Tier generators use a local `random.Random(42)` instance for deterministic output. 
- Do not use numpy or any external RNG; stick to stdlib `random.Random(42)`. diff --git a/CHANGELOG.md b/CHANGELOG.md index a422b8e..2b76172 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Unreleased ### Added +- **ER B³ (BCubed) metrics + confusion matrix** — `score_er_tier` now also reports cluster-level B-Cubed precision/recall/F1 (built from the pair graph via connected components over all rows) and the full pair-level confusion matrix (TP/FP/FN/TN), surfaced in the ER report (rich + JSON). These are **diagnostic only** — the headline DQBench ER Score stays pair-F1-weighted and unchanged, so published entries don't move. - **Third-party OSS tools on the leaderboard** — new adapters and reproducible, version-pinned entries: **Splink** (ER, 87.14 — probabilistic Fellegi-Sunter, seeded), **recordlinkage** (ER, 80.28 — blocking + Jaro-Winkler), **cuallee** (Detect, 30.56 — rule-based DQ checks), **frictionless** (Detect, 2.22 — inferred-schema validation), and a **pandas cleaning baseline** (Transform, 100.0). **Great Expectations** is now included too (Detect: best-effort 21.68, auto-profiled 21.29, zero-config 0.0) — its earlier non-determinism turned out to be dev-environment contamination; in an isolated env with pinned deps it reproduces exactly. Each entry runs in its own isolated CI job and passes `dqbench verify`. - **Published leaderboard with a reproducibility gate** — a version-controlled, community-submittable board where **results are only accepted if a GitHub Action can reproduce them**. Each entry is backed by a manifest under `leaderboard/submissions/` (tool, category, adapter, pinned packages). New commands: `dqbench reproduce [--write]` (run the manifest, optionally record it), `dqbench verify ` (reproduce and confirm the committed numbers match), `dqbench publish [--check]` (regenerate/verify `LEADERBOARD.md`), and `dqbench leaderboard --source repo`. `dqbench run --adapter` now also accepts a `module:Class` reference. 
- CI: `.github/workflows/leaderboard.yml` gates PRs with `dqbench publish --check` (structural) and `dqbench verify` on each changed manifest (reproduction); `.github/workflows/leaderboard-refresh.yml` audits all manifests on a schedule. @@ -19,7 +20,7 @@ ### Changed - `ensure_er_datasets()` in `dqbench/runner.py` is now per-tier idempotent — users with an existing T1-T3 cache pick up T4 without needing `dqbench generate --force`. - Default ER tier list extended to `[1, 2, 3, 4]`; existing callers passing explicit `tiers=` are unaffected. -- Full test suite: 242 passing (was 161). +- Full test suite: 247 passing (was 161). ## v1.1.0 — 2026-03-29 diff --git a/CLAUDE.md b/CLAUDE.md index bb9ee80..93e5645 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ The standard benchmark for data quality and validation tools — five categories ```bash pip install -e ".[dev]" # Dev install -pytest --tb=short -v # Run tests (242 passing) +pytest --tb=short -v # Run tests (247 passing) ruff check . # Lint dqbench run # Run benchmark (records result on the local leaderboard) dqbench run all # Head-to-head comparison @@ -76,6 +76,7 @@ T4 has `weights.get(tier, 0) == 0` in `ERScorecard.dqbench_er_score` — reporte ### ER - **Pair-level P/R/F1** against `ERGroundTruth.duplicate_pairs` (pairs normalised to `(min, max)`) +- **B³ (BCubed) P/R/F1 + confusion matrix** (TP/FP/FN/TN) also reported per tier — diagnostic only, do NOT affect the composite - **DQBench ER Score**: `T1_F1 × 20% + T2_F1 × 40% + T3_F1 × 40%` — T4 weight is 0 - A perfect adapter on `tiers=[1, 2, 3]` scores 100; on `tiers=[4]` it scores 1.0 F1 but 0.0 composite diff --git a/README.md b/README.md index 4fd8b4d..f1f6954 100644 --- a/README.md +++ b/README.md @@ -287,7 +287,7 @@ dqbench run --adapter my_er_adapter.py | **Pipeline** | 3 | End-to-end pipeline orchestration | | **OCR Company** | 3 | OCR company-name confidence and correction | -Full suite: 242 tests passing across all five categories. 
+Full suite: 247 tests passing across all five categories. ## OCR Company Benchmark diff --git a/dqbench/er_scorer.py b/dqbench/er_scorer.py index 8d547d5..88afdcb 100644 --- a/dqbench/er_scorer.py +++ b/dqbench/er_scorer.py @@ -1,8 +1,8 @@ """Scoring logic for ER benchmarks.""" from __future__ import annotations -from dqbench.models import ERTierResult from dqbench.er_ground_truth import ERGroundTruth +from dqbench.models import ERTierResult def _normalize_pairs(pairs: list[tuple[int, int]]) -> set[tuple[int, int]]: @@ -10,6 +10,67 @@ def _normalize_pairs(pairs: list[tuple[int, int]]) -> set[tuple[int, int]]: return {(min(a, b), max(a, b)) for a, b in pairs} +def _clusters_from_pairs(pairs: set[tuple[int, int]], n: int) -> tuple[list[int], dict[int, set[int]]]: + """Union-find over rows 0..n-1; records not in any pair are singletons. + + Returns (root_of_element, members_by_root). + """ + parent = list(range(n)) + + def find(x: int) -> int: + root = x + while parent[root] != root: + root = parent[root] + while parent[x] != root: # path compression + parent[x], x = root, parent[x] + return root + + for a, b in pairs: + if a < n and b < n: + ra, rb = find(a), find(b) + if ra != rb: + parent[max(ra, rb)] = min(ra, rb) + + roots = [find(i) for i in range(n)] + members: dict[int, set[int]] = {} + for i, r in enumerate(roots): + members.setdefault(r, set()).add(i) + return roots, members + + +def _bcubed(pred_pairs: set[tuple[int, int]], true_pairs: set[tuple[int, int]], n: int) -> tuple[float, float, float]: + """B-Cubed (B³) precision/recall/F1, averaged over all n elements. + + For each element e: precision = |pred_cluster(e) ∩ true_cluster(e)| / |pred_cluster(e)|, + recall = the same intersection / |true_cluster(e)|. 
+ """ + if n == 0: + return 0.0, 0.0, 0.0 + + pred_root, pred_members = _clusters_from_pairs(pred_pairs, n) + true_root, true_members = _clusters_from_pairs(true_pairs, n) + + # Cache intersection size per (pred_root, true_root) pair to avoid recomputation. + inter_cache: dict[tuple[int, int], int] = {} + p_sum = 0.0 + r_sum = 0.0 + for e in range(n): + pc = pred_members[pred_root[e]] + tc = true_members[true_root[e]] + key = (pred_root[e], true_root[e]) + inter = inter_cache.get(key) + if inter is None: + inter = len(pc & tc) + inter_cache[key] = inter + p_sum += inter / len(pc) + r_sum += inter / len(tc) + + precision = p_sum / n + recall = r_sum / n + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 + return precision, recall, f1 + + def score_er_tier( predictions: list[tuple[int, int]], ground_truth: ERGroundTruth, @@ -17,7 +78,7 @@ def score_er_tier( time_seconds: float, memory_mb: float, ) -> ERTierResult: - """Score ER predictions against ground truth using pair-level P/R/F1.""" + """Score ER predictions with pair-level P/R/F1, a confusion matrix, and B³.""" true_pairs = _normalize_pairs(ground_truth.duplicate_pairs) pred_pairs = _normalize_pairs(predictions) @@ -29,6 +90,11 @@ def score_er_tier( fp = len(false_positives) fn = len(false_negatives) + # True negatives: all candidate pairs that are correctly NOT linked. 
+ n = ground_truth.rows + total_possible = n * (n - 1) // 2 + tn = max(0, total_possible - tp - fp - fn) + if tp == 0: precision = 0.0 recall = 0.0 @@ -38,6 +104,8 @@ def score_er_tier( recall = tp / (tp + fn) f1 = 2 * precision * recall / (precision + recall) + bc_p, bc_r, bc_f1 = _bcubed(pred_pairs, true_pairs, n) + return ERTierResult( tier=tier, precision=precision, @@ -47,4 +115,9 @@ def score_er_tier( false_negatives=fn, time_seconds=time_seconds, memory_mb=memory_mb, + true_positives=tp, + true_negatives=tn, + bcubed_precision=bc_p, + bcubed_recall=bc_r, + bcubed_f1=bc_f1, ) diff --git a/dqbench/models.py b/dqbench/models.py index 8aea98e..f946d6b 100644 --- a/dqbench/models.py +++ b/dqbench/models.py @@ -96,6 +96,13 @@ class ERTierResult: false_negatives: int time_seconds: float memory_mb: float + # Confusion matrix (pair-level). fp/fn above are the off-diagonal counts. + true_positives: int = 0 + true_negatives: int = 0 + # B-Cubed (B³) cluster-level metrics — diagnostic, do not affect the composite. 
+ bcubed_precision: float = 0.0 + bcubed_recall: float = 0.0 + bcubed_f1: float = 0.0 @dataclass diff --git a/dqbench/report.py b/dqbench/report.py index 99e1fd6..175dd57 100644 --- a/dqbench/report.py +++ b/dqbench/report.py @@ -261,8 +261,6 @@ def report_er_rich(scorecard: ERScorecard) -> None: table.add_column("Precision", style="green") table.add_column("Recall", style="green") table.add_column("F1", style="green") - table.add_column("FP", style="red") - table.add_column("FN", style="red") table.add_column("Time", style="dim") table.add_column("Memory", style="dim") @@ -272,8 +270,6 @@ def report_er_rich(scorecard: ERScorecard) -> None: f"{t.precision:.1%}", f"{t.recall:.1%}", f"{t.f1:.1%}", - str(t.false_positives), - str(t.false_negatives), f"{t.time_seconds:.2f}s", f"{t.memory_mb:.1f} MB", ) @@ -283,6 +279,33 @@ def report_er_rich(scorecard: ERScorecard) -> None: f"\n[bold]DQBench ER Score: {scorecard.dqbench_er_score:.2f} / 100[/bold]\n" ) + # Confusion matrix (pair-level) + B-Cubed cluster metrics (diagnostic). 
+ cm_table = Table( + title="Confusion Matrix (pairs) & B³ (clusters)", + box=box.ROUNDED, show_header=True, header_style="bold yellow", + ) + cm_table.add_column("Tier", style="cyan") + cm_table.add_column("TP", justify="right", style="green") + cm_table.add_column("FP", justify="right", style="red") + cm_table.add_column("FN", justify="right", style="red") + cm_table.add_column("TN", justify="right", style="dim") + cm_table.add_column("B³ Prec", justify="right", style="green") + cm_table.add_column("B³ Rec", justify="right", style="green") + cm_table.add_column("B³ F1", justify="right", style="bold green") + for t in scorecard.tiers: + cm_table.add_row( + f"T{t.tier}", + str(t.true_positives), + str(t.false_positives), + str(t.false_negatives), + str(t.true_negatives), + f"{t.bcubed_precision:.1%}", + f"{t.bcubed_recall:.1%}", + f"{t.bcubed_f1:.1%}", + ) + console.print(cm_table) + console.print() + # Real dataset results if present if scorecard.real_datasets: real_table = Table( diff --git a/tests/test_er_scorer.py b/tests/test_er_scorer.py index f651df0..9207fa3 100644 --- a/tests/test_er_scorer.py +++ b/tests/test_er_scorer.py @@ -93,3 +93,50 @@ def test_f1_harmonic_mean(self, ground_truth): r = 2 / 5 expected_f1 = 2 * p * r / (p + r) assert result.f1 == pytest.approx(expected_f1) + + +class TestConfusionMatrix: + def test_confusion_counts_sum_to_total_pairs(self, ground_truth): + predictions = [(0, 50), (1, 51), (2, 52), (3, 53), (4, 54), (10, 60), (20, 70)] + result = score_er_tier(predictions, ground_truth, tier=1, + time_seconds=0.0, memory_mb=0.0) + n = ground_truth.rows + total = n * (n - 1) // 2 + assert result.true_positives == 5 + assert result.false_positives == 2 + assert result.false_negatives == 0 + assert result.true_negatives == total - 5 - 2 - 0 + assert (result.true_positives + result.false_positives + + result.false_negatives + result.true_negatives) == total + + def test_perfect_has_no_off_diagonal(self, ground_truth): + preds = [(0, 50), 
(1, 51), (2, 52), (3, 53), (4, 54)] + result = score_er_tier(preds, ground_truth, tier=1, time_seconds=0.0, memory_mb=0.0) + assert result.false_positives == 0 and result.false_negatives == 0 + assert result.true_positives == 5 + + +class TestBCubed: + def test_perfect_clustering_is_one(self, ground_truth): + preds = [(0, 50), (1, 51), (2, 52), (3, 53), (4, 54)] + result = score_er_tier(preds, ground_truth, tier=1, time_seconds=0.0, memory_mb=0.0) + assert result.bcubed_precision == pytest.approx(1.0) + assert result.bcubed_recall == pytest.approx(1.0) + assert result.bcubed_f1 == pytest.approx(1.0) + + def test_empty_predictions_recall_below_one(self, ground_truth): + # No links: every singleton predicted. Precision is perfect (each predicted + # cluster of size 1 is "pure"), but recall misses the true 2-member clusters. + result = score_er_tier([], ground_truth, tier=1, time_seconds=0.0, memory_mb=0.0) + assert result.bcubed_precision == pytest.approx(1.0) + assert result.bcubed_recall < 1.0 + assert result.bcubed_f1 < 1.0 + + def test_over_merging_drops_precision(self): + # One giant predicted cluster {0,1,2,3} but truth has two separate pairs. + gt = ERGroundTruth(tier=1, version="1.0.0", rows=4, + duplicate_pairs=[(0, 1), (2, 3)], total_duplicates=2, difficulty="easy") + result = score_er_tier([(0, 1), (1, 2), (2, 3)], gt, tier=1, time_seconds=0.0, memory_mb=0.0) + # All 4 in one predicted cluster: each element's pred cluster has 2/4 correct. + assert result.bcubed_precision == pytest.approx(0.5) + assert result.bcubed_recall == pytest.approx(1.0) From 1e1ef73a765caa6b41f4d39840d990d533a3a6e5 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 24 May 2026 19:31:37 +0000 Subject: [PATCH 2/3] feat: ungated reference board for auto-config (GoldenMatch / GoldenSuite) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Auto-config tools learn/sample and aren't reproducible, so they can't sit on the gated board. 
Add a separate "Reference — auto-config (not gate-verified)" section instead. - Manifests marked "gated": false route to leaderboard/reference/ — no manifest-linkage required, skipped by the CI verify matrix and the refresh audit. publish/check_published render + validate (schema only). - New GoldenMatchAutoConfigAdapter (goldenmatch-auto) via auto_configure_df. - Seeded reference entries: GoldenMatch (auto-config) ER (~92 this cold run; observed 57-92, non-deterministic) and GoldenSuite (zero-config) Pipeline (~33.85). Both clearly flagged non-reproducible with their range. https://claude.ai/code/session_01KjxRYsnVFPVJ3aUBmNm7vB --- .github/workflows/leaderboard-refresh.yml | 10 +- .github/workflows/leaderboard.yml | 12 +- AGENTS.md | 4 +- CHANGELOG.md | 3 +- CLAUDE.md | 2 +- LEADERBOARD.md | 16 +++ README.md | 2 +- docs/leaderboard.md | 9 ++ dqbench/adapters/goldenmatch_adapter.py | 42 +++++++ dqbench/cli.py | 1 + dqbench/submission.py | 114 ++++++++++++++---- leaderboard/reference/er.json | 20 +++ leaderboard/reference/pipeline.json | 19 +++ .../submissions/er-goldenmatch-auto.json | 15 +++ .../pipeline-goldensuite-zero.json | 18 +++ tests/test_submission.py | 33 +++++ 16 files changed, 290 insertions(+), 30 deletions(-) create mode 100644 leaderboard/reference/er.json create mode 100644 leaderboard/reference/pipeline.json create mode 100644 leaderboard/submissions/er-goldenmatch-auto.json create mode 100644 leaderboard/submissions/pipeline-goldensuite-zero.json diff --git a/.github/workflows/leaderboard-refresh.yml b/.github/workflows/leaderboard-refresh.yml index 79bcec4..2d23b1e 100644 --- a/.github/workflows/leaderboard-refresh.yml +++ b/.github/workflows/leaderboard-refresh.yml @@ -17,11 +17,17 @@ jobs: run: | set -uo pipefail files=$(ls leaderboard/submissions/*.json 2>/dev/null || true) - if [ -z "$files" ]; then + gated="" + for f in $files; do + if python -c "import json,sys; sys.exit(0 if json.load(open('$f')).get('gated', True) else 1)"; then + 
gated="$gated $f" + fi + done + if [ -z "$(echo $gated)" ]; then echo "any=false" >> "$GITHUB_OUTPUT" echo "manifests=[]" >> "$GITHUB_OUTPUT" else - arr=$(printf '%s\n' $files | jq -R . | jq -cs .) + arr=$(printf '%s\n' $gated | jq -R . | jq -cs .) echo "any=true" >> "$GITHUB_OUTPUT" echo "manifests=$arr" >> "$GITHUB_OUTPUT" fi diff --git a/.github/workflows/leaderboard.yml b/.github/workflows/leaderboard.yml index 13fdc2b..c520a10 100644 --- a/.github/workflows/leaderboard.yml +++ b/.github/workflows/leaderboard.yml @@ -44,11 +44,19 @@ jobs: base="${{ github.base_ref }}" git fetch origin "$base" files=$(git diff --name-only --diff-filter=ACMR "origin/$base...HEAD" -- leaderboard/submissions/ | grep '\.json$' || true) - if [ -z "$files" ]; then + # Only gate manifests with "gated" != false (ungated reference runs are not verified). + gated="" + for f in $files; do + [ -f "$f" ] || continue + if python -c "import json,sys; sys.exit(0 if json.load(open('$f')).get('gated', True) else 1)"; then + gated="$gated $f" + fi + done + if [ -z "$(echo $gated)" ]; then echo "any=false" >> "$GITHUB_OUTPUT" echo "manifests=[]" >> "$GITHUB_OUTPUT" else - arr=$(printf '%s\n' $files | jq -R . | jq -cs .) + arr=$(printf '%s\n' $gated | jq -R . | jq -cs .) echo "any=true" >> "$GITHUB_OUTPUT" echo "manifests=$arr" >> "$GITHUB_OUTPUT" fi diff --git a/AGENTS.md b/AGENTS.md index b17bf42..237956c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,7 +6,7 @@ The standard benchmark for data quality and validation tools — five categories ```bash pip install -e ".[dev]" # Dev install -pytest --tb=short -v # Run tests (247 passing) +pytest --tb=short -v # Run tests (251 passing) ruff check . # Lint dqbench run # Run benchmark (records result on the local leaderboard) dqbench run all # Head-to-head comparison @@ -93,7 +93,7 @@ The adapter interface is the primary extension point. Each adapter implements a ## Performance & Testing -- Always run `pytest --tb=short -v` before committing. 
All 247 tests must pass. +- Always run `pytest --tb=short -v` before committing. All 251 tests must pass. - Always run `ruff check .` for linting. - Tier generators use a local `random.Random(42)` instance for deterministic output. - Do not use numpy or any external RNG; stick to stdlib `random.Random(42)`. diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b76172..0ec783e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Unreleased ### Added +- **Ungated "reference" board for auto-config runs** — non-deterministic auto-config results (which can't pass the reproducibility gate) can now be published in a separate "Reference — auto-config (not gate-verified)" section of `LEADERBOARD.md`. Manifests marked `"gated": false` route to `leaderboard/reference/` (no manifest-linkage required, skipped by the CI verify matrix). Seeded with **GoldenMatch (auto-config)** (ER) and **GoldenSuite (zero-config)** (Pipeline), each flagged non-reproducible with its observed range. New `GoldenMatchAutoConfigAdapter` (`goldenmatch-auto`). - **ER B³ (BCubed) metrics + confusion matrix** — `score_er_tier` now also reports cluster-level B-Cubed precision/recall/F1 (built from the pair graph via connected components over all rows) and the full pair-level confusion matrix (TP/FP/FN/TN), surfaced in the ER report (rich + JSON). These are **diagnostic only** — the headline DQBench ER Score stays pair-F1-weighted and unchanged, so published entries don't move. - **Third-party OSS tools on the leaderboard** — new adapters and reproducible, version-pinned entries: **Splink** (ER, 87.14 — probabilistic Fellegi-Sunter, seeded), **recordlinkage** (ER, 80.28 — blocking + Jaro-Winkler), **cuallee** (Detect, 30.56 — rule-based DQ checks), **frictionless** (Detect, 2.22 — inferred-schema validation), and a **pandas cleaning baseline** (Transform, 100.0). 
**Great Expectations** is now included too (Detect: best-effort 21.68, auto-profiled 21.29, zero-config 0.0) — its earlier non-determinism turned out to be dev-environment contamination; in an isolated env with pinned deps it reproduces exactly. Each entry runs in its own isolated CI job and passes `dqbench verify`. - **Published leaderboard with a reproducibility gate** — a version-controlled, community-submittable board where **results are only accepted if a GitHub Action can reproduce them**. Each entry is backed by a manifest under `leaderboard/submissions/` (tool, category, adapter, pinned packages). New commands: `dqbench reproduce [--write]` (run the manifest, optionally record it), `dqbench verify ` (reproduce and confirm the committed numbers match), `dqbench publish [--check]` (regenerate/verify `LEADERBOARD.md`), and `dqbench leaderboard --source repo`. `dqbench run --adapter` now also accepts a `module:Class` reference. @@ -20,7 +21,7 @@ ### Changed - `ensure_er_datasets()` in `dqbench/runner.py` is now per-tier idempotent — users with an existing T1-T3 cache pick up T4 without needing `dqbench generate --force`. - Default ER tier list extended to `[1, 2, 3, 4]`; existing callers passing explicit `tiers=` are unaffected. -- Full test suite: 247 passing (was 161). +- Full test suite: 251 passing (was 161). ## v1.1.0 — 2026-03-29 diff --git a/CLAUDE.md b/CLAUDE.md index 93e5645..c0dd501 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ The standard benchmark for data quality and validation tools — five categories ```bash pip install -e ".[dev]" # Dev install -pytest --tb=short -v # Run tests (247 passing) +pytest --tb=short -v # Run tests (251 passing) ruff check . 
# Lint dqbench run # Run benchmark (records result on the local leaderboard) dqbench run all # Head-to-head comparison diff --git a/LEADERBOARD.md b/LEADERBOARD.md index 0ea73de..e6eaa01 100644 --- a/LEADERBOARD.md +++ b/LEADERBOARD.md @@ -42,3 +42,19 @@ Published results across all five categories. Higher is better; the score is the | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | 1 | GoldenSuite (tuned) | 1.2.0 | 80.0% | 81.7% | 67.3% | 75.59 | DQBench maintainers | reproduced | 2026-05-24 | | 2 | GoldenPipe | 1.2.0 | 80.0% | 81.7% | 56.8% | 71.38 | DQBench maintainers | reproduced | 2026-05-24 | + +# Reference — auto-config (not gate-verified) + +> ⚠️ These runs are **not reproducible** and are **not enforced by CI** — auto-config tools learn/sample and produce different numbers across runs. Shown for reference only; see each entry's notes for the observed range. + +## ER + +| Rank | Tool | Version | T1 | T2 | T3 | T4 | Score | Submitter | Source | Date | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 1 | GoldenMatch (auto-config) | 1.18.1 | 89.3% | 97.8% | 88.4% | 82.3% | 92.36 | DQBench maintainers | auto-config | 2026-05-24 | + +## Pipeline + +| Rank | Tool | Version | T1 | T2 | T3 | Score | Submitter | Source | Date | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 1 | GoldenSuite (zero-config) | 1.2.0 | 49.8% | 28.9% | 30.9% | 33.85 | DQBench maintainers | auto-config | 2026-05-24 | diff --git a/README.md b/README.md index f1f6954..3c7c429 100644 --- a/README.md +++ b/README.md @@ -287,7 +287,7 @@ dqbench run --adapter my_er_adapter.py | **Pipeline** | 3 | End-to-end pipeline orchestration | | **OCR Company** | 3 | OCR company-name confidence and correction | -Full suite: 247 tests passing across all five categories. +Full suite: 251 tests passing across all five categories. 
## OCR Company Benchmark diff --git a/docs/leaderboard.md b/docs/leaderboard.md index 702103a..c689493 100644 --- a/docs/leaderboard.md +++ b/docs/leaderboard.md @@ -82,6 +82,15 @@ invocations (e.g. sampling-based profilers) cannot be accepted until their adapt made deterministic — the reproducibility gate will reject them. CI reproduces runs on **Python 3.11**; pin your `install` versions so numbers don't drift. +### Reference board (ungated) + +Auto-config tools that learn/sample (e.g. GoldenMatch's `auto_configure_df`) are +genuinely non-reproducible, so they can't sit on the gated board. They can still be +shown for context in a separate **"Reference — auto-config (not gate-verified)"** +section. Mark the manifest `"gated": false` — it routes to `leaderboard/reference/`, +needs no manifest-linkage, and is skipped by the CI verify matrix. Reference entries +should document their observed run-to-run range in `notes`. + ## Result sources | `--result-source` / `source` | Meaning | diff --git a/dqbench/adapters/goldenmatch_adapter.py b/dqbench/adapters/goldenmatch_adapter.py index 7cc13fa..dcfcbbb 100644 --- a/dqbench/adapters/goldenmatch_adapter.py +++ b/dqbench/adapters/goldenmatch_adapter.py @@ -9,6 +9,48 @@ from dqbench.adapters.base import EntityResolutionAdapter +def _pairs_from_clusters(result) -> list[tuple[int, int]]: + """All within-cluster pairs from a GoldenMatch DedupeResult.""" + pairs: list[tuple[int, int]] = [] + if result.clusters: + for cluster in result.clusters.values(): + members = sorted(cluster["members"]) + for i in range(len(members)): + for j in range(i + 1, len(members)): + pairs.append((members[i], members[j])) + return pairs + + +class GoldenMatchAutoConfigAdapter(EntityResolutionAdapter): + """GoldenMatch in auto-config mode (no hand-tuned config). 
+ + NOTE: GoldenMatch's auto-config learns/samples and persists state to + `.goldenmatch/memory.db`, so results are NOT reproducible run-to-run — this + adapter is for the ungated *reference* board only, never the gated leaderboard. + """ + + @property + def name(self) -> str: + return "GoldenMatch (auto-config)" + + @property + def version(self) -> str: + try: + import goldenmatch + return goldenmatch.__version__ + except ImportError: + return "not-installed" + + def deduplicate(self, csv_path: Path) -> list[tuple[int, int]]: + import goldenmatch + import polars as pl + + df = pl.read_csv(csv_path) + config = goldenmatch.auto_configure_df(df) + result = goldenmatch.dedupe_df(df, config=config) + return _pairs_from_clusters(result) + + class GoldenMatchAdapter(EntityResolutionAdapter): @property def name(self) -> str: diff --git a/dqbench/cli.py b/dqbench/cli.py index 4592125..356e880 100644 --- a/dqbench/cli.py +++ b/dqbench/cli.py @@ -36,6 +36,7 @@ "frictionless": "dqbench.adapters.frictionless_adapter:FrictionlessAdapter", # GoldenMatch (ER) "goldenmatch": "dqbench.adapters.goldenmatch_adapter:GoldenMatchAdapter", + "goldenmatch-auto": "dqbench.adapters.goldenmatch_adapter:GoldenMatchAutoConfigAdapter", # recordlinkage (ER, third-party) "recordlinkage": "dqbench.adapters.recordlinkage_adapter:RecordLinkageAdapter", # Splink (ER, third-party) diff --git a/dqbench/submission.py b/dqbench/submission.py index 41ae66f..dc156c2 100644 --- a/dqbench/submission.py +++ b/dqbench/submission.py @@ -31,6 +31,7 @@ # Repo-relative locations RESULTS_SUBDIR = Path("leaderboard") / "results" SUBMISSIONS_SUBDIR = Path("leaderboard") / "submissions" +REFERENCE_SUBDIR = Path("leaderboard") / "reference" LEADERBOARD_MD = Path("LEADERBOARD.md") # Reproduced numbers are rounded (score 2dp, tier 4dp); this absorbs float repr only. 
@@ -39,7 +40,7 @@ # Map a run-JSON score key back to its category (run JSON does not name the category) SCORE_KEY_TO_CATEGORY = {meta["score_attr"]: cat for cat, meta in CATEGORY_META.items()} -VALID_SOURCES = {"reproduced", "vendor-reported", "third-party"} +VALID_SOURCES = {"reproduced", "vendor-reported", "third-party", "auto-config"} @dataclass @@ -216,6 +217,37 @@ def add_submission(submission: Submission, root: Path) -> Path: return path +def _reference_path(root: Path, category: str) -> Path: + return root / REFERENCE_SUBDIR / f"{category}.json" + + +def load_reference(root: Path, category: str | None = None) -> list[Submission]: + """Load ungated reference entries (auto-config / non-deterministic, not gate-verified).""" + cats = [category] if category else CATEGORY_ORDER + out: list[Submission] = [] + for cat in cats: + path = _reference_path(root, cat) + if not path.exists(): + continue + out.extend(submission_from_dict(d) for d in json.loads(path.read_text())) + return out + + +def add_reference(submission: Submission, root: Path) -> Path: + """Merge an ungated reference entry (one per tool@version). 
Not verified by CI.""" + errors = validate_submission(submission.to_dict()) + if errors: + raise ValueError("Invalid reference entry:\n - " + "\n - ".join(errors)) + path = _reference_path(root, submission.category) + path.parent.mkdir(parents=True, exist_ok=True) + existing: list[dict] = json.loads(path.read_text()) if path.exists() else [] + merged = [s for s in map(submission_from_dict, existing) if s.key() != submission.key()] + merged.append(submission) + merged.sort(key=lambda s: (-s.score, s.tool.lower())) + path.write_text(json.dumps([s.to_dict() for s in merged], indent=2) + "\n") + return path + + def validate_store(root: Path) -> list[str]: """Validate every entry in the published store; returns a flat error list.""" errors: list[str] = [] @@ -237,6 +269,20 @@ def validate_store(root: Path) -> list[str]: if entry.get("category") not in (None, cat): errors.append(f"{path}[{i}]: category {entry.get('category')!r} does not match file {cat!r}") + # Reference entries are ungated (not verified), but must still be schema-valid. + for cat in CATEGORY_ORDER: + path = _reference_path(root, cat) + if not path.exists(): + continue + try: + data = json.loads(path.read_text()) + except json.JSONDecodeError as e: + errors.append(f"{path}: invalid JSON ({e})") + continue + for i, entry in enumerate(data if isinstance(data, list) else []): + for err in validate_submission(entry): + errors.append(f"{path}[{i}]: {err}") + errors.extend(_validate_manifest_linkage(root)) return errors @@ -287,7 +333,9 @@ def _validate_manifest_linkage(root: Path) -> list[str]: if merrs: errors.append(f"manifest {m.get('id', m.get('tool', '?'))}: " + "; ".join(merrs)) continue - manifest_keys.add((m["category"], m["tool"])) + # Only gated manifests satisfy the gated-entry linkage requirement. 
+ if m.get("gated", True): + manifest_keys.add((m["category"], m["tool"])) for cat in CATEGORY_ORDER: path = _results_path(root, cat) @@ -367,15 +415,24 @@ def submission_from_manifest(manifest: dict, run_data: dict) -> Submission: def reproduce_and_write(manifest: dict, root: Path) -> Submission: - """Reproduce a manifest's run and merge the result into the published store.""" + """Reproduce a manifest's run and merge the result into the gated or reference store. + + Manifests with ``"gated": false`` go to the ungated reference store (not + verified by CI) — used for non-deterministic auto-config runs. + """ submission = submission_from_manifest(manifest, reproduce(manifest, root)) - add_submission(submission, root) + if manifest.get("gated", True): + add_submission(submission, root) + else: + add_reference(submission, root) publish(root) return submission def verify(manifest: dict, root: Path) -> list[str]: """Reproduce a manifest and confirm the committed entry matches. Empty = ok.""" + if not manifest.get("gated", True): + return [] # ungated reference entries are not gate-verified errs = validate_manifest(manifest) if errs: return [f"manifest invalid: {e}" for e in errs] @@ -415,23 +472,12 @@ def verify(manifest: dict, root: Path) -> list[str]: # --------------------------------------------------------------------------- -def render_markdown(submissions: list[Submission]) -> str: - """Render the published board as Markdown for LEADERBOARD.md.""" +def _render_category_tables(lines: list[str], submissions: list[Submission]) -> bool: + """Append a per-category ranked table for each non-empty category. Returns True if any.""" by_cat: dict[str, list[Submission]] = {} for s in submissions: by_cat.setdefault(s.category, []).append(s) - lines: list[str] = [ - "# DQBench Leaderboard", - "", - "Published results across all five categories. 
Higher is better; the score is " - "the tier-weighted composite (0-100).", - "", - "> Generated by `dqbench publish` from `leaderboard/results/`. Do not edit by hand — " - "see [how to submit](docs/leaderboard.md).", - "", - ] - any_rows = False for cat in CATEGORY_ORDER: group = by_cat.get(cat) @@ -458,17 +504,43 @@ def render_markdown(submissions: list[Submission]) -> str: row.append(s.date or "—") lines.append("| " + " | ".join(row) + " |") lines.append("") + return any_rows - if not any_rows: + +def render_markdown(submissions: list[Submission], reference: list[Submission] | None = None) -> str: + """Render the published board as Markdown for LEADERBOARD.md.""" + lines: list[str] = [ + "# DQBench Leaderboard", + "", + "Published results across all five categories. Higher is better; the score is " + "the tier-weighted composite (0-100).", + "", + "> Generated by `dqbench publish` from `leaderboard/results/`. Do not edit by hand — " + "see [how to submit](docs/leaderboard.md).", + "", + ] + + if not _render_category_tables(lines, submissions): lines.append("_No results published yet._") lines.append("") + if reference: + lines.append("# Reference — auto-config (not gate-verified)") + lines.append("") + lines.append( + "> ⚠️ These runs are **not reproducible** and are **not enforced by CI** — " + "auto-config tools learn/sample and produce different numbers across runs. " + "Shown for reference only; see each entry's notes for the observed range." + ) + lines.append("") + _render_category_tables(lines, reference) + return "\n".join(lines).rstrip() + "\n" def publish(root: Path) -> Path: - """Regenerate LEADERBOARD.md from the repo store. Returns the written path.""" - md = render_markdown(load_store(root)) + """Regenerate LEADERBOARD.md from the gated + reference stores. 
Returns the written path.""" + md = render_markdown(load_store(root), load_reference(root)) path = root / LEADERBOARD_MD path.write_text(md) return path @@ -480,7 +552,7 @@ def check_published(root: Path) -> list[str]: if errors: return errors - expected = render_markdown(load_store(root)) + expected = render_markdown(load_store(root), load_reference(root)) path = root / LEADERBOARD_MD current = path.read_text() if path.exists() else "" if current != expected: diff --git a/leaderboard/reference/er.json b/leaderboard/reference/er.json new file mode 100644 index 0000000..90600a7 --- /dev/null +++ b/leaderboard/reference/er.json @@ -0,0 +1,20 @@ +[ + { + "category": "er", + "tool": "GoldenMatch (auto-config)", + "tool_version": "1.18.1", + "score": 92.36, + "tier_scores": { + "1": 0.8929, + "2": 0.9783, + "3": 0.8842, + "4": 0.8235 + }, + "submitter": "DQBench maintainers", + "date": "2026-05-24", + "adapter": "goldenmatch-auto", + "dqbench_version": "1.0.0", + "source": "auto-config", + "notes": "Auto-config (auto_configure_df). NOT reproducible: persists a learning store (.goldenmatch/memory.db), so the composite swings widely across runs (~57-92 observed). Value shown is one representative cold run \u2014 reference only, never gate-verified." + } +] diff --git a/leaderboard/reference/pipeline.json b/leaderboard/reference/pipeline.json new file mode 100644 index 0000000..0a38c6b --- /dev/null +++ b/leaderboard/reference/pipeline.json @@ -0,0 +1,19 @@ +[ + { + "category": "pipeline", + "tool": "GoldenSuite (zero-config)", + "tool_version": "1.2.0", + "score": 33.85, + "tier_scores": { + "1": 0.4981, + "2": 0.2887, + "3": 0.3086 + }, + "submitter": "DQBench maintainers", + "date": "2026-05-24", + "adapter": "goldensuite-zero", + "dqbench_version": "1.0.0", + "source": "auto-config", + "notes": "Full Golden suite via goldenpipe.run (load->check->flow->dedupe), zero-config. NOT reproducible: auto-config varies run-to-run (~33.8 +/- a row)." 
+ } +] diff --git a/leaderboard/submissions/er-goldenmatch-auto.json b/leaderboard/submissions/er-goldenmatch-auto.json new file mode 100644 index 0000000..74ddd56 --- /dev/null +++ b/leaderboard/submissions/er-goldenmatch-auto.json @@ -0,0 +1,15 @@ +{ + "id": "er-goldenmatch-auto", + "category": "er", + "tool": "GoldenMatch (auto-config)", + "adapter": "goldenmatch-auto", + "install": [ + "goldenmatch==1.18.1", + "jellyfish==1.2.1", + "RapidFuzz==3.14.5" + ], + "submitter": "DQBench maintainers", + "source": "auto-config", + "gated": false, + "notes": "Auto-config (auto_configure_df). NOT reproducible: persists a learning store (.goldenmatch/memory.db), so the composite swings widely across runs (~57-92 observed). Value shown is one representative cold run \u2014 reference only, never gate-verified." +} diff --git a/leaderboard/submissions/pipeline-goldensuite-zero.json b/leaderboard/submissions/pipeline-goldensuite-zero.json new file mode 100644 index 0000000..209d05e --- /dev/null +++ b/leaderboard/submissions/pipeline-goldensuite-zero.json @@ -0,0 +1,18 @@ +{ + "id": "pipeline-goldensuite-zero", + "category": "pipeline", + "tool": "GoldenSuite (zero-config)", + "adapter": "goldensuite-zero", + "install": [ + "goldencheck==1.2.0", + "goldenflow==1.1.6", + "goldenmatch==1.18.1", + "goldenpipe==1.2.0", + "jellyfish==1.2.1", + "RapidFuzz==3.14.5" + ], + "submitter": "DQBench maintainers", + "source": "auto-config", + "gated": false, + "notes": "Full Golden suite via goldenpipe.run (load->check->flow->dedupe), zero-config. NOT reproducible: auto-config varies run-to-run (~33.8 +/- a row)." 
+} diff --git a/tests/test_submission.py b/tests/test_submission.py index fa25707..508c8d1 100644 --- a/tests/test_submission.py +++ b/tests/test_submission.py @@ -10,6 +10,7 @@ add_submission, check_published, infer_category, + load_reference, load_store, publish, render_markdown, @@ -262,6 +263,38 @@ def test_verify_missing_entry(tmp_path): assert any("no committed entry" in e for e in errors) +def test_ungated_manifest_routes_to_reference_store(tmp_path): + manifest = _mini_manifest(tmp_path) + manifest["gated"] = False + reproduce_and_write(manifest, root=tmp_path) + # Goes to the ungated reference store, not the gated results store. + assert load_store(tmp_path, category="detect") == [] + ref = load_reference(tmp_path, category="detect") + assert len(ref) == 1 and ref[0].tool == "MiniTool" + + +def test_verify_skips_ungated_manifest(tmp_path): + manifest = _mini_manifest(tmp_path) + manifest["gated"] = False + assert verify(manifest, root=tmp_path) == [] # not gate-verified + + +def test_reference_entry_needs_no_manifest(tmp_path): + # A reference entry with no submissions/ manifest must still pass validate_store. 
+ from dqbench.submission import add_reference + add_reference(submission_from_run(_detect_run("Auto", score=99.0), submitter="Me"), root=tmp_path) + assert validate_store(tmp_path) == [] + + +def test_render_markdown_reference_section(): + gated = [submission_from_run(_detect_run("GatedTool", score=50.0), submitter="Me")] + ref = [submission_from_run(_detect_run("AutoTool", score=99.0), submitter="Me")] + md = render_markdown(gated, ref) + assert "not gate-verified" in md + assert md.index("GatedTool") < md.index("Reference") # gated board first + assert "AutoTool" in md + + @pytest.mark.parametrize("category,runner_attr", [ ("detect", "run_benchmark"), ("transform", "run_transform_benchmark"), From 294a26d1d13901a469b98065d1ff7332f147e331 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 24 May 2026 20:17:25 +0000 Subject: [PATCH 3/3] feat: make GoldenMatch auto-config deterministic and gate it (ER 92.36) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GoldenMatch auto-config's run-to-run drift was its persisted cross-run learning store (~/.goldenmatch/autoconfig_memory.db), not randomness — the profiling sample is already seeded. Disabling that store makes auto-config reproduce exactly, so it graduates from the ungated reference board to the gated ER leaderboard, topping it at 92.36 (vs Splink 87.14, tuned 76.91). - GoldenMatchAutoConfigAdapter sets GOLDENMATCH_AUTOCONFIG_MEMORY=0 (and LLM=0) before importing goldenmatch — the flag is read once at import. - er-goldenmatch-auto manifest flipped to gated; entry moved to the gated results store; dqbench verify confirms it reproduces. - Reference board now seeds only GoldenSuite (zero-config) (Pipeline). 
https://claude.ai/code/session_01KjxRYsnVFPVJ3aUBmNm7vB --- CHANGELOG.md | 3 ++- LEADERBOARD.md | 13 ++++-------- docs/leaderboard.md | 11 ++++++++-- dqbench/adapters/goldenmatch_adapter.py | 17 +++++++++++++--- leaderboard/reference/er.json | 20 ------------------- leaderboard/results/er.json | 18 +++++++++++++++++ .../submissions/er-goldenmatch-auto.json | 3 +-- 7 files changed, 48 insertions(+), 37 deletions(-) delete mode 100644 leaderboard/reference/er.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ec783e..91c589b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,8 @@ ## Unreleased ### Added -- **Ungated "reference" board for auto-config runs** — non-deterministic auto-config results (which can't pass the reproducibility gate) can now be published in a separate "Reference — auto-config (not gate-verified)" section of `LEADERBOARD.md`. Manifests marked `"gated": false` route to `leaderboard/reference/` (no manifest-linkage required, skipped by the CI verify matrix). Seeded with **GoldenMatch (auto-config)** (ER) and **GoldenSuite (zero-config)** (Pipeline), each flagged non-reproducible with its observed range. New `GoldenMatchAutoConfigAdapter` (`goldenmatch-auto`). +- **GoldenMatch auto-config on the leaderboard (ER, 92.36)** — `auto_configure_df` with zero hand-tuning, now the top ER entry (vs Splink 87.14, recordlinkage 80.28, GoldenMatch tuned 76.91). Its run-to-run drift turned out to be GoldenMatch's persisted cross-run learning store (`~/.goldenmatch/autoconfig_memory.db`), not randomness — the profiling sample is already seeded. The new `GoldenMatchAutoConfigAdapter` (`goldenmatch-auto`) disables that store (`GOLDENMATCH_AUTOCONFIG_MEMORY=0`, set before import since the flag is read once at import time), so it reproduces exactly and passes `dqbench verify`. +- **Ungated "reference" board** — for genuinely non-reproducible auto-config runs that can't pass the gate. 
Manifests marked `"gated": false` route to `leaderboard/reference/` (no manifest-linkage required, skipped by the CI verify matrix and the refresh audit) and render in a separate "Reference — auto-config (not gate-verified)" section of `LEADERBOARD.md`. Seeded with **GoldenSuite (zero-config)** (Pipeline, ~33.85). - **ER B³ (BCubed) metrics + confusion matrix** — `score_er_tier` now also reports cluster-level B-Cubed precision/recall/F1 (built from the pair graph via connected components over all rows) and the full pair-level confusion matrix (TP/FP/FN/TN), surfaced in the ER report (rich + JSON). These are **diagnostic only** — the headline DQBench ER Score stays pair-F1-weighted and unchanged, so published entries don't move. - **Third-party OSS tools on the leaderboard** — new adapters and reproducible, version-pinned entries: **Splink** (ER, 87.14 — probabilistic Fellegi-Sunter, seeded), **recordlinkage** (ER, 80.28 — blocking + Jaro-Winkler), **cuallee** (Detect, 30.56 — rule-based DQ checks), **frictionless** (Detect, 2.22 — inferred-schema validation), and a **pandas cleaning baseline** (Transform, 100.0). **Great Expectations** is now included too (Detect: best-effort 21.68, auto-profiled 21.29, zero-config 0.0) — its earlier non-determinism turned out to be dev-environment contamination; in an isolated env with pinned deps it reproduces exactly. Each entry runs in its own isolated CI job and passes `dqbench verify`. - **Published leaderboard with a reproducibility gate** — a version-controlled, community-submittable board where **results are only accepted if a GitHub Action can reproduce them**. Each entry is backed by a manifest under `leaderboard/submissions/` (tool, category, adapter, pinned packages). 
New commands: `dqbench reproduce <manifest> [--write]` (run the manifest, optionally record it), `dqbench verify <manifest>` (reproduce and confirm the committed numbers match), `dqbench publish [--check]` (regenerate/verify `LEADERBOARD.md`), and `dqbench leaderboard --source repo`. `dqbench run --adapter` now also accepts a `module:Class` reference. diff --git a/LEADERBOARD.md b/LEADERBOARD.md index e6eaa01..8fb8476 100644 --- a/LEADERBOARD.md +++ b/LEADERBOARD.md @@ -32,9 +32,10 @@ Published results across all five categories. Higher is better; the score is the | Rank | Tool | Version | T1 | T2 | T3 | T4 | Score | Submitter | Source | Date | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| 1 | Splink | 4.0.16 | 66.7% | 99.9% | 84.6% | 66.7% | 87.14 | DQBench maintainers | reproduced | 2026-05-24 | -| 2 | recordlinkage | 0.16 | 80.8% | 83.8% | 76.5% | 33.3% | 80.28 | DQBench maintainers | reproduced | 2026-05-24 | -| 3 | GoldenMatch | 1.18.1 | 87.0% | 81.0% | 67.8% | 67.8% | 76.91 | DQBench maintainers | reproduced | 2026-05-24 | +| 1 | GoldenMatch (auto-config) | 1.18.1 | 89.3% | 97.8% | 88.4% | 82.3% | 92.36 | DQBench maintainers | auto-config | 2026-05-24 | +| 2 | Splink | 4.0.16 | 66.7% | 99.9% | 84.6% | 66.7% | 87.14 | DQBench maintainers | reproduced | 2026-05-24 | +| 3 | recordlinkage | 0.16 | 80.8% | 83.8% | 76.5% | 33.3% | 80.28 | DQBench maintainers | reproduced | 2026-05-24 | +| 4 | GoldenMatch | 1.18.1 | 87.0% | 81.0% | 67.8% | 67.8% | 76.91 | DQBench maintainers | reproduced | 2026-05-24 | ## Pipeline @@ -47,12 +48,6 @@ Published results across all five categories. Higher is better; the score is the > ⚠️ These runs are **not reproducible** and are **not enforced by CI** — auto-config tools learn/sample and produce different numbers across runs. Shown for reference only; see each entry's notes for the observed range.
-## ER - -| Rank | Tool | Version | T1 | T2 | T3 | T4 | Score | Submitter | Source | Date | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| 1 | GoldenMatch (auto-config) | 1.18.1 | 89.3% | 97.8% | 88.4% | 82.3% | 92.36 | DQBench maintainers | auto-config | 2026-05-24 | - ## Pipeline | Rank | Tool | Version | T1 | T2 | T3 | Score | Submitter | Source | Date | diff --git a/docs/leaderboard.md b/docs/leaderboard.md index c689493..460f3f1 100644 --- a/docs/leaderboard.md +++ b/docs/leaderboard.md @@ -84,13 +84,20 @@ made deterministic — the reproducibility gate will reject them. CI reproduces ### Reference board (ungated) -Auto-config tools that learn/sample (e.g. GoldenMatch's `auto_configure_df`) are -genuinely non-reproducible, so they can't sit on the gated board. They can still be +Some auto-config tools learn/sample across runs and are genuinely non-reproducible +(e.g. GoldenPipe's zero-config engine). They can't sit on the gated board, but can be shown for context in a separate **"Reference — auto-config (not gate-verified)"** section. Mark the manifest `"gated": false` — it routes to `leaderboard/reference/`, needs no manifest-linkage, and is skipped by the CI verify matrix. Reference entries should document their observed run-to-run range in `notes`. +Before reaching for the reference board, check whether the non-determinism is just +*persisted state* rather than true randomness. GoldenMatch's `auto_configure_df`, for +example, looked non-reproducible only because it caches configs in +`~/.goldenmatch/autoconfig_memory.db` and seeds each run from the last; disabling that +store (`GOLDENMATCH_AUTOCONFIG_MEMORY=0`) leaves only seeded sampling, so it reproduces +exactly and earns a place on the *gated* board. 
+ ## Result sources | `--result-source` / `source` | Meaning | diff --git a/dqbench/adapters/goldenmatch_adapter.py b/dqbench/adapters/goldenmatch_adapter.py index dcfcbbb..ec040d1 100644 --- a/dqbench/adapters/goldenmatch_adapter.py +++ b/dqbench/adapters/goldenmatch_adapter.py @@ -24,9 +24,11 @@ def _pairs_from_clusters(result) -> list[tuple[int, int]]: class GoldenMatchAutoConfigAdapter(EntityResolutionAdapter): """GoldenMatch in auto-config mode (no hand-tuned config). - NOTE: GoldenMatch's auto-config learns/samples and persists state to - `.goldenmatch/memory.db`, so results are NOT reproducible run-to-run — this - adapter is for the ungated *reference* board only, never the gated leaderboard. + GoldenMatch's ``auto_configure_df`` normally persists a cross-run learning + store (``~/.goldenmatch/autoconfig_memory.db``) and seeds each run from the + last, which makes results drift run-to-run. This adapter disables that store + so the seeded heuristic + refit loop runs deterministically — making the + auto-config result reproducible and gate-verifiable. """ @property @@ -42,6 +44,15 @@ def version(self) -> str: return "not-installed" def deduplicate(self, csv_path: Path) -> list[tuple[int, int]]: + import os + + # The memory/LLM flags are read once at import (goldenmatch.core.autoconfig), + # so they must be set BEFORE goldenmatch is imported. Disabling the cross-run + # memory store removes the only source of non-determinism (the underlying + # profiling sample is already seeded), so the run reproduces exactly. 
+ os.environ["GOLDENMATCH_AUTOCONFIG_MEMORY"] = "0" + os.environ["GOLDENMATCH_AUTOCONFIG_LLM"] = "0" + import goldenmatch import polars as pl diff --git a/leaderboard/reference/er.json b/leaderboard/reference/er.json deleted file mode 100644 index 90600a7..0000000 --- a/leaderboard/reference/er.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "category": "er", - "tool": "GoldenMatch (auto-config)", - "tool_version": "1.18.1", - "score": 92.36, - "tier_scores": { - "1": 0.8929, - "2": 0.9783, - "3": 0.8842, - "4": 0.8235 - }, - "submitter": "DQBench maintainers", - "date": "2026-05-24", - "adapter": "goldenmatch-auto", - "dqbench_version": "1.0.0", - "source": "auto-config", - "notes": "Auto-config (auto_configure_df). NOT reproducible: persists a learning store (.goldenmatch/memory.db), so the composite swings widely across runs (~57-92 observed). Value shown is one representative cold run \u2014 reference only, never gate-verified." - } -] diff --git a/leaderboard/results/er.json b/leaderboard/results/er.json index 5f73714..13353ea 100644 --- a/leaderboard/results/er.json +++ b/leaderboard/results/er.json @@ -1,4 +1,22 @@ [ + { + "category": "er", + "tool": "GoldenMatch (auto-config)", + "tool_version": "1.18.1", + "score": 92.36, + "tier_scores": { + "1": 0.8929, + "2": 0.9783, + "3": 0.8842, + "4": 0.8235 + }, + "submitter": "DQBench maintainers", + "date": "2026-05-24", + "adapter": "goldenmatch-auto", + "dqbench_version": "1.0.0", + "source": "auto-config", + "notes": "Zero-tuning auto-config (auto_configure_df). Deterministic: the adapter sets GOLDENMATCH_AUTOCONFIG_MEMORY=0 to disable GoldenMatch's cross-run learning store, leaving only seeded sampling, so it reproduces exactly." 
+ }, { "category": "er", "tool": "Splink", diff --git a/leaderboard/submissions/er-goldenmatch-auto.json b/leaderboard/submissions/er-goldenmatch-auto.json index 74ddd56..de47b3c 100644 --- a/leaderboard/submissions/er-goldenmatch-auto.json +++ b/leaderboard/submissions/er-goldenmatch-auto.json @@ -10,6 +10,5 @@ ], "submitter": "DQBench maintainers", "source": "auto-config", - "gated": false, - "notes": "Auto-config (auto_configure_df). NOT reproducible: persists a learning store (.goldenmatch/memory.db), so the composite swings widely across runs (~57-92 observed). Value shown is one representative cold run \u2014 reference only, never gate-verified." + "notes": "Zero-tuning auto-config (auto_configure_df). Deterministic: the adapter sets GOLDENMATCH_AUTOCONFIG_MEMORY=0 to disable GoldenMatch's cross-run learning store, leaving only seeded sampling, so it reproduces exactly." }