benseverndev-oss · Ben Severn (benzsevern) · May 24, 2026 · May 24, 2026 · May 24, 2026 · May 24, 2026
diff --git a/.github/workflows/leaderboard-refresh.yml b/.github/workflows/leaderboard-refresh.yml
@@ -17,11 +17,17 @@ jobs:
         run: |
           set -uo pipefail
           files=$(ls leaderboard/submissions/*.json 2>/dev/null || true)
-          if [ -z "$files" ]; then
+          gated=""  # manifests that stay on the verify matrix
+          for f in $files; do  # the path is passed via argv, never spliced into the Python source
+            if ! python -c "import json,sys; sys.exit(0 if not json.load(open(sys.argv[1])).get('gated', True) else 1)" "$f"; then
+              gated="$gated $f"  # gated, or unreadable: fail closed so verify reports it
+            fi
+          done
+          if [ -z "$(echo $gated)" ]; then
             echo "any=false" >> "$GITHUB_OUTPUT"
             echo "manifests=[]" >> "$GITHUB_OUTPUT"
           else
-            arr=$(printf '%s\n' $files | jq -R . | jq -cs .)
+            arr=$(printf '%s\n' $gated | jq -R . | jq -cs .)
             echo "any=true" >> "$GITHUB_OUTPUT"
             echo "manifests=$arr" >> "$GITHUB_OUTPUT"
           fi

diff --git a/.github/workflows/leaderboard.yml b/.github/workflows/leaderboard.yml
@@ -44,11 +44,19 @@ jobs:
           base="${{ github.base_ref }}"
           git fetch origin "$base"
           files=$(git diff --name-only --diff-filter=ACMR "origin/$base...HEAD" -- leaderboard/submissions/ | grep '\.json$' || true)
-          if [ -z "$files" ]; then
+          # Verify every manifest unless "gated" is falsy; PR-supplied paths go in via argv, and unreadable JSON stays gated (fail closed).
+          gated=""
+          for f in $files; do
+            [ -f "$f" ] || continue
+            if ! python -c "import json,sys; sys.exit(0 if not json.load(open(sys.argv[1])).get('gated', True) else 1)" "$f"; then
+              gated="$gated $f"
+            fi
+          done
+          if [ -z "$(echo $gated)" ]; then
             echo "any=false" >> "$GITHUB_OUTPUT"
             echo "manifests=[]" >> "$GITHUB_OUTPUT"
           else
-            arr=$(printf '%s\n' $files | jq -R . | jq -cs .)
+            arr=$(printf '%s\n' $gated | jq -R . | jq -cs .)
             echo "any=true" >> "$GITHUB_OUTPUT"
             echo "manifests=$arr" >> "$GITHUB_OUTPUT"
           fi

diff --git a/AGENTS.md b/AGENTS.md
@@ -6,7 +6,7 @@ The standard benchmark for data quality and validation tools — five categories
 
 ```bash
 pip install -e ".[dev]"          # Dev install
-pytest --tb=short -v             # Run tests (242 passing)
+pytest --tb=short -v             # Run tests (251 passing)
 ruff check .                     # Lint
 dqbench run <adapter>            # Run benchmark (records result on the local leaderboard)
 dqbench run all                  # Head-to-head comparison
@@ -93,7 +93,7 @@ The adapter interface is the primary extension point. Each adapter implements a
 
 ## Performance & Testing
 
-- Always run `pytest --tb=short -v` before committing. All 242 tests must pass.
+- Always run `pytest --tb=short -v` before committing. All 251 tests must pass.
 - Always run `ruff check .` for linting.
 - Tier generators use a local `random.Random(42)` instance for deterministic output.
 - Do not use numpy or any external RNG; stick to stdlib `random.Random(42)`.

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,9 @@
 ## Unreleased
 
 ### Added
+- **GoldenMatch auto-config on the leaderboard (ER, 92.36)** — `auto_configure_df` with zero hand-tuning, now the top ER entry (vs Splink 87.14, recordlinkage 80.28, GoldenMatch tuned 76.91). Its run-to-run drift turned out to be GoldenMatch's persisted cross-run learning store (`~/.goldenmatch/autoconfig_memory.db`), not randomness — the profiling sample is already seeded. The new `GoldenMatchAutoConfigAdapter` (`goldenmatch-auto`) disables that store (`GOLDENMATCH_AUTOCONFIG_MEMORY=0`, set before import since the flag is read once at import time), so it reproduces exactly and passes `dqbench verify`.
+- **Ungated "reference" board** — for genuinely non-reproducible auto-config runs that can't pass the gate. Manifests marked `"gated": false` route to `leaderboard/reference/` (no manifest-linkage required, skipped by the CI verify matrix and the refresh audit) and render in a separate "Reference — auto-config (not gate-verified)" section of `LEADERBOARD.md`. Seeded with **GoldenSuite (zero-config)** (Pipeline, ~33.85).
+- **ER B³ (BCubed) metrics + confusion matrix** — `score_er_tier` now also reports cluster-level B-Cubed precision/recall/F1 (built from the pair graph via connected components over all rows) and the full pair-level confusion matrix (TP/FP/FN/TN), surfaced in the ER report (rich + JSON). These are **diagnostic only** — the headline DQBench ER Score stays pair-F1-weighted and unchanged, so published entries don't move.
 - **Third-party OSS tools on the leaderboard** — new adapters and reproducible, version-pinned entries: **Splink** (ER, 87.14 — probabilistic Fellegi-Sunter, seeded), **recordlinkage** (ER, 80.28 — blocking + Jaro-Winkler), **cuallee** (Detect, 30.56 — rule-based DQ checks), **frictionless** (Detect, 2.22 — inferred-schema validation), and a **pandas cleaning baseline** (Transform, 100.0). **Great Expectations** is now included too (Detect: best-effort 21.68, auto-profiled 21.29, zero-config 0.0) — its earlier non-determinism turned out to be dev-environment contamination; in an isolated env with pinned deps it reproduces exactly. Each entry runs in its own isolated CI job and passes `dqbench verify`.
 - **Published leaderboard with a reproducibility gate** — a version-controlled, community-submittable board where **results are only accepted if a GitHub Action can reproduce them**. Each entry is backed by a manifest under `leaderboard/submissions/` (tool, category, adapter, pinned packages). New commands: `dqbench reproduce <manifest> [--write]` (run the manifest, optionally record it), `dqbench verify <manifest>` (reproduce and confirm the committed numbers match), `dqbench publish [--check]` (regenerate/verify `LEADERBOARD.md`), and `dqbench leaderboard --source repo`. `dqbench run --adapter` now also accepts a `module:Class` reference.
 - CI: `.github/workflows/leaderboard.yml` gates PRs with `dqbench publish --check` (structural) and `dqbench verify` on each changed manifest (reproduction); `.github/workflows/leaderboard-refresh.yml` audits all manifests on a schedule.
@@ -19,7 +22,7 @@
 ### Changed
 - `ensure_er_datasets()` in `dqbench/runner.py` is now per-tier idempotent — users with an existing T1-T3 cache pick up T4 without needing `dqbench generate --force`.
 - Default ER tier list extended to `[1, 2, 3, 4]`; existing callers passing explicit `tiers=` are unaffected.
-- Full test suite: 242 passing (was 161).
+- Full test suite: 251 passing (was 161).
 
 ## v1.1.0 — 2026-03-29
 

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -6,7 +6,7 @@ The standard benchmark for data quality and validation tools — five categories
 
 ```bash
 pip install -e ".[dev]"          # Dev install
-pytest --tb=short -v             # Run tests (242 passing)
+pytest --tb=short -v             # Run tests (251 passing)
 ruff check .                     # Lint
 dqbench run <adapter>            # Run benchmark (records result on the local leaderboard)
 dqbench run all                  # Head-to-head comparison
@@ -76,6 +76,7 @@ T4 has `weights.get(tier, 0) == 0` in `ERScorecard.dqbench_er_score` — reporte
 ### ER
 
 - **Pair-level P/R/F1** against `ERGroundTruth.duplicate_pairs` (pairs normalised to `(min, max)`)
+- **B³ (BCubed) P/R/F1 + confusion matrix** (TP/FP/FN/TN) also reported per tier — diagnostic only, do NOT affect the composite
 - **DQBench ER Score**: `T1_F1 × 20% + T2_F1 × 40% + T3_F1 × 40%` — T4 weight is 0
 - A perfect adapter on `tiers=[1, 2, 3]` scores 100; on `tiers=[4]` it scores 1.0 F1 but 0.0 composite
 

diff --git a/LEADERBOARD.md b/LEADERBOARD.md
@@ -32,13 +32,24 @@ Published results across all five categories. Higher is better; the score is the
 
 | Rank | Tool | Version | T1 | T2 | T3 | T4 | Score | Submitter | Source | Date |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| 1 | Splink | 4.0.16 | 66.7% | 99.9% | 84.6% | 66.7% | 87.14 | DQBench maintainers | reproduced | 2026-05-24 |
-| 2 | recordlinkage | 0.16 | 80.8% | 83.8% | 76.5% | 33.3% | 80.28 | DQBench maintainers | reproduced | 2026-05-24 |
-| 3 | GoldenMatch | 1.18.1 | 87.0% | 81.0% | 67.8% | 67.8% | 76.91 | DQBench maintainers | reproduced | 2026-05-24 |
+| 1 | GoldenMatch (auto-config) | 1.18.1 | 89.3% | 97.8% | 88.4% | 82.3% | 92.36 | DQBench maintainers | auto-config | 2026-05-24 |
+| 2 | Splink | 4.0.16 | 66.7% | 99.9% | 84.6% | 66.7% | 87.14 | DQBench maintainers | reproduced | 2026-05-24 |
+| 3 | recordlinkage | 0.16 | 80.8% | 83.8% | 76.5% | 33.3% | 80.28 | DQBench maintainers | reproduced | 2026-05-24 |
+| 4 | GoldenMatch | 1.18.1 | 87.0% | 81.0% | 67.8% | 67.8% | 76.91 | DQBench maintainers | reproduced | 2026-05-24 |
 
 ## Pipeline
 
 | Rank | Tool | Version | T1 | T2 | T3 | Score | Submitter | Source | Date |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
 | 1 | GoldenSuite (tuned) | 1.2.0 | 80.0% | 81.7% | 67.3% | 75.59 | DQBench maintainers | reproduced | 2026-05-24 |
 | 2 | GoldenPipe | 1.2.0 | 80.0% | 81.7% | 56.8% | 71.38 | DQBench maintainers | reproduced | 2026-05-24 |
+
+# Reference — auto-config (not gate-verified)
+
+> ⚠️ These runs are **not reproducible** and are **not enforced by CI** — auto-config tools learn/sample and produce different numbers across runs. Shown for reference only; see each entry's notes for the observed range.
+
+## Pipeline
+
+| Rank | Tool | Version | T1 | T2 | T3 | Score | Submitter | Source | Date |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| 1 | GoldenSuite (zero-config) | 1.2.0 | 49.8% | 28.9% | 30.9% | 33.85 | DQBench maintainers | auto-config | 2026-05-24 |
diff --git a/README.md b/README.md
@@ -287,7 +287,7 @@ dqbench run --adapter my_er_adapter.py
 | **Pipeline** | 3 | End-to-end pipeline orchestration |
 | **OCR Company** | 3 | OCR company-name confidence and correction |
 
-Full suite: 242 tests passing across all five categories.
+Full suite: 251 tests passing across all five categories.
 
 ## OCR Company Benchmark
 

diff --git a/docs/leaderboard.md b/docs/leaderboard.md
@@ -82,6 +82,22 @@ invocations (e.g. sampling-based profilers) cannot be accepted until their adapt
 made deterministic — the reproducibility gate will reject them. CI reproduces runs on
 **Python 3.11**; pin your `install` versions so numbers don't drift.
 
+### Reference board (ungated)
+
+Some auto-config tools learn/sample across runs and are genuinely non-reproducible
+(e.g. GoldenPipe's zero-config engine). They can't sit on the gated board, but can be
+shown for context in a separate **"Reference — auto-config (not gate-verified)"**
+section. Mark the manifest `"gated": false` — it routes to `leaderboard/reference/`,
+needs no manifest-linkage, and is skipped by both the PR verify matrix and the scheduled
+refresh audit. Reference entries should document their observed run-to-run range in `notes`.
+
+Before reaching for the reference board, check whether the non-determinism is just
+*persisted state* rather than true randomness. GoldenMatch's `auto_configure_df`, for
+example, looked non-reproducible only because it caches configs in
+`~/.goldenmatch/autoconfig_memory.db` and seeds each run from the last; disabling that
+store (`GOLDENMATCH_AUTOCONFIG_MEMORY=0`) leaves only seeded sampling, so it reproduces
+exactly and earns a place on the *gated* board.
+
 ## Result sources
 
 | `--result-source` / `source` | Meaning |

diff --git a/dqbench/adapters/goldenmatch_adapter.py b/dqbench/adapters/goldenmatch_adapter.py
@@ -9,6 +9,59 @@
 from dqbench.adapters.base import EntityResolutionAdapter
 
 
+def _pairs_from_clusters(result) -> list[tuple[int, int]]:
+    """All within-cluster pairs from a GoldenMatch DedupeResult, smaller member first."""
+    from itertools import combinations  # function-scope import, like this module's lazy imports
+
+    # ``clusters`` may be None or empty when nothing was linked; sorting orders each pair.
+    return [
+        pair
+        for cluster in (result.clusters or {}).values()
+        for pair in combinations(sorted(cluster["members"]), 2)
+    ]
+
+
+class GoldenMatchAutoConfigAdapter(EntityResolutionAdapter):
+    """GoldenMatch in auto-config mode (no hand-tuned config).
+
+    GoldenMatch's ``auto_configure_df`` normally persists a cross-run learning
+    store (``~/.goldenmatch/autoconfig_memory.db``) and seeds each run from the
+    last, so results drift run-to-run. This adapter disables that store so the
+    seeded heuristic + refit loop is reproducible and gate-verifiable.
+    """
+
+    @staticmethod
+    def _import_goldenmatch():
+        """Import goldenmatch with the cross-run memory and LLM flags switched off."""
+        import os
+        # Read once at import (goldenmatch.core.autoconfig): set BEFORE every import,
+        # ``version`` included -- it may be read before ``deduplicate`` ever runs.
+        os.environ["GOLDENMATCH_AUTOCONFIG_MEMORY"] = "0"
+        os.environ["GOLDENMATCH_AUTOCONFIG_LLM"] = "0"
+        import goldenmatch
+        return goldenmatch
+
+    @property
+    def name(self) -> str:
+        return "GoldenMatch (auto-config)"
+
+    @property
+    def version(self) -> str:
+        try:
+            return self._import_goldenmatch().__version__
+        except ImportError:
+            return "not-installed"
+
+    def deduplicate(self, csv_path: Path) -> list[tuple[int, int]]:
+        """Auto-configure on the CSV, dedupe it, and return all within-cluster pairs."""
+        goldenmatch = self._import_goldenmatch()
+        import polars as pl
+        df = pl.read_csv(csv_path)
+        config = goldenmatch.auto_configure_df(df)
+        result = goldenmatch.dedupe_df(df, config=config)
+        return _pairs_from_clusters(result)
+
+
 class GoldenMatchAdapter(EntityResolutionAdapter):
     @property
     def name(self) -> str:

diff --git a/dqbench/cli.py b/dqbench/cli.py
@@ -36,6 +36,7 @@
     "frictionless": "dqbench.adapters.frictionless_adapter:FrictionlessAdapter",
     # GoldenMatch (ER)
     "goldenmatch": "dqbench.adapters.goldenmatch_adapter:GoldenMatchAdapter",
+    "goldenmatch-auto": "dqbench.adapters.goldenmatch_adapter:GoldenMatchAutoConfigAdapter",
     # recordlinkage (ER, third-party)
     "recordlinkage": "dqbench.adapters.recordlinkage_adapter:RecordLinkageAdapter",
     # Splink (ER, third-party)

diff --git a/dqbench/er_scorer.py b/dqbench/er_scorer.py
@@ -1,23 +1,84 @@
 """Scoring logic for ER benchmarks."""
 from __future__ import annotations
 
-from dqbench.models import ERTierResult
 from dqbench.er_ground_truth import ERGroundTruth
+from dqbench.models import ERTierResult
 
 
 def _normalize_pairs(pairs: list[tuple[int, int]]) -> set[tuple[int, int]]:
     """Normalize pairs to (min, max) for symmetric matching."""
     return {(min(a, b), max(a, b)) for a, b in pairs}
 
 
+def _clusters_from_pairs(pairs: set[tuple[int, int]], n: int) -> tuple[list[int], dict[int, set[int]]]:
+    """Union-find over rows 0..n-1; rows in no pair are singletons, pairs outside 0..n-1 are ignored.
+
+    Returns (root_of_element, members_by_root); each root is the smallest row index of its cluster.
+    """
+    parent = list(range(n))
+
+    def find(x: int) -> int:
+        root = x
+        while parent[root] != root:
+            root = parent[root]
+        while parent[x] != root:  # path compression
+            parent[x], x = root, parent[x]
+        return root
+
+    for a, b in pairs:
+        if 0 <= a < n and 0 <= b < n:  # a negative id must not wrap around to the last rows
+            ra, rb = find(a), find(b)
+            if ra != rb:
+                parent[max(ra, rb)] = min(ra, rb)  # the smaller index becomes the root
+
+    roots = [find(i) for i in range(n)]
+    members: dict[int, set[int]] = {}
+    for i, r in enumerate(roots):
+        members.setdefault(r, set()).add(i)
+    return roots, members
+
+
+def _bcubed(pred_pairs: set[tuple[int, int]], true_pairs: set[tuple[int, int]], n: int) -> tuple[float, float, float]:
+    """Return B-Cubed (B³) ``(precision, recall, f1)``, averaged over all n elements.
+
+    Per element: precision = overlap / size of its predicted cluster and
+    recall = overlap / size of its true cluster, where overlap counts rows in both clusters.
+    """
+    if n == 0:
+        return 0.0, 0.0, 0.0
+
+    pred_root, pred_members = _clusters_from_pairs(pred_pairs, n)
+    true_root, true_members = _clusters_from_pairs(true_pairs, n)
+
+    # The overlap depends only on the (predicted root, true root) pair, so size it once per pair.
+    overlap_by_roots: dict[tuple[int, int], int] = {}
+    precision_total = 0.0
+    recall_total = 0.0
+    for roots in zip(pred_root, true_root):
+        predicted = pred_members[roots[0]]
+        actual = true_members[roots[1]]
+        if roots not in overlap_by_roots:
+            overlap_by_roots[roots] = len(predicted & actual)
+        overlap = overlap_by_roots[roots]
+        # Each element contributes its own per-element precision and recall.
+        precision_total += overlap / len(predicted)
+        recall_total += overlap / len(actual)
+
+    precision = precision_total / n
+    recall = recall_total / n
+    denominator = precision + recall
+    f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0
+    return precision, recall, f1
+
+
 def score_er_tier(
     predictions: list[tuple[int, int]],
     ground_truth: ERGroundTruth,
     tier: int,
     time_seconds: float,
     memory_mb: float,
 ) -> ERTierResult:
-    """Score ER predictions against ground truth using pair-level P/R/F1."""
+    """Score ER predictions with pair-level P/R/F1, a confusion matrix, and B³."""
     true_pairs = _normalize_pairs(ground_truth.duplicate_pairs)
     pred_pairs = _normalize_pairs(predictions)
 
@@ -29,6 +90,11 @@ def score_er_tier(
     fp = len(false_positives)
     fn = len(false_negatives)
 
+    # True negatives: all candidate pairs that are correctly NOT linked.
+    n = ground_truth.rows
+    total_possible = n * (n - 1) // 2
+    tn = max(0, total_possible - tp - fp - fn)
+
     if tp == 0:
         precision = 0.0
         recall = 0.0
@@ -38,6 +104,8 @@ def score_er_tier(
         recall = tp / (tp + fn)
         f1 = 2 * precision * recall / (precision + recall)
 
+    bc_p, bc_r, bc_f1 = _bcubed(pred_pairs, true_pairs, n)
+
     return ERTierResult(
         tier=tier,
         precision=precision,
@@ -47,4 +115,9 @@ def score_er_tier(
         false_negatives=fn,
         time_seconds=time_seconds,
         memory_mb=memory_mb,
+        true_positives=tp,
+        true_negatives=tn,
+        bcubed_precision=bc_p,
+        bcubed_recall=bc_r,
+        bcubed_f1=bc_f1,
     )
diff --git a/dqbench/models.py b/dqbench/models.py
@@ -96,6 +96,13 @@ class ERTierResult:
     false_negatives: int
     time_seconds: float
     memory_mb: float
+    # Confusion matrix (pair-level). fp/fn above are the off-diagonal counts.
+    true_positives: int = 0
+    true_negatives: int = 0
+    # B-Cubed (B³) cluster-level metrics — diagnostic, do not affect the composite.
+    bcubed_precision: float = 0.0
+    bcubed_recall: float = 0.0
+    bcubed_f1: float = 0.0
 
 
 @dataclass