AI-SDC · jim-smith · May 26, 2026 · May 25, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,14 @@ Changes:
     `"member_prob"` (the attack classifier's membership probability) and `"member"`
     (the ground truth label), matching the per-record output convention used by
     `LIRAAttack` and `QMIAAttack`. Arrays are sized to the attack-model test slice.
+*   Tests: Added a cross-attack canary detection test
+    (`tests/attacks/test_canary_predictions.py`), parametrised over `QMIAAttack` and
+    `LIRAAttack`, replacing the QMIA-only `test_qmia_predicts_canaries`. It uses a new
+    `canary_target` fixture in `tests/conftest.py` that builds a target with
+    label-flipped boundary rows and a `bootstrap=False` RandomForest so the model
+    memorises them. `WorstCaseAttack` is not yet included because its per-record output
+    indexes into an internal `train_test_split` that does not align with the original
+    training-set indices; a follow-up will add a dedicated WorstCase canary check.
 
 ## Version 1.4.3 (Jan 29, 2026)
 

diff --git a/tests/attacks/test_canary_predictions.py b/tests/attacks/test_canary_predictions.py
@@ -0,0 +1,97 @@
+"""Cross-attack canary detection tests.
+
+Each membership inference attack (MIA) should flag deliberately memorised
+"canary" training rows as members with higher confidence than genuine
+non-members. The fixture ``canary_target`` (in ``tests/conftest.py``)
+builds a target whose training set contains label-flipped rows near the
+decision boundary; a RandomForestClassifier with ``bootstrap=False``
+memorises them, amplifying their per-record MIA signal.
+
+WorstCaseAttack is intentionally excluded for now. Its per-record output
+indexes into an internal ``train_test_split`` of the combined train+test
+predictions, not into the original training set, so canary indices do
+not map directly. A WorstCase canary test is a follow-up.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+from sklearn.metrics import roc_auc_score
+
+from sacroml.attacks.likelihood_attack import LIRAAttack
+from sacroml.attacks.qmia_attack import QMIAAttack
+
+# (attack class, ctor kwargs, score key, min AUC, min canaries above test p90).
+# LiRA's shadow-model scores vary more per canary than QMIA's, hence looser bounds.
+CANARY_PARAMS = [
+    pytest.param(
+        QMIAAttack,
+        {"random_state": 0},
+        "member_prob",
+        0.90,
+        7,
+        id="qmia",
+    ),
+    pytest.param(
+        LIRAAttack,
+        {"n_shadow_models": 20},
+        "score",
+        0.85,
+        6,
+        id="lira",
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    ("attack_cls", "attack_kwargs", "score_field", "auc_threshold", "canary_threshold"),
+    CANARY_PARAMS,
+)
+def test_attack_predicts_canaries(
+    attack_cls,
+    attack_kwargs,
+    score_field,
+    auc_threshold,
+    canary_threshold,
+    canary_target,
+    tmp_path,
+):
+    """Attack scores memorised canaries above genuine non-members."""
+    target, canary_idx, n_train = canary_target
+    n_canaries = len(canary_idx)
+
+    attack_obj = attack_cls(
+        output_dir=str(tmp_path / f"canary_{attack_cls.__name__}"),
+        write_report=False,
+        report_individual=True,
+        **attack_kwargs,
+    )
+    output = attack_obj.attack(target)
+
+    instances = output["attack_experiment_logger"]["attack_instance_logger"]
+    # score_field differs per attack ("member_prob" for QMIA, "score" for LiRA).
+    scores = np.asarray(instances["instance_0"]["individual"][score_field])
+
+    # Slicing assumes the output lists the n_train training rows first, then test.
+    canary_scores = scores[canary_idx]
+    test_scores = scores[n_train:]
+
+    # AUC of canaries (positives) vs genuine non-members (negatives); an AUC
+    # above auc_threshold confirms the attack ranks memorised rows higher.
+    y_score = np.concatenate([canary_scores, test_scores])
+    y_true = np.concatenate([np.ones_like(canary_scores), np.zeros_like(test_scores)])
+    canary_vs_test_auc = roc_auc_score(y_true, y_score)
+    assert canary_vs_test_auc > auc_threshold, (
+        f"{attack_cls.__name__} failed canary AUC: "
+        f"AUC={canary_vs_test_auc:.3f} (threshold {auc_threshold})"
+    )
+
+    # At least canary_threshold canaries must beat the test 90th percentile.
+    test_p90 = np.percentile(test_scores, 90)
+    n_above_p90 = int((canary_scores > test_p90).sum())
+    assert n_above_p90 >= canary_threshold, (
+        f"{attack_cls.__name__}: only {n_above_p90}/{n_canaries} canaries "
+        f"exceed test 90th percentile ({test_p90:.3f}), need {canary_threshold}; "
+        f"canary scores: {sorted(canary_scores.tolist())}"
+    )
diff --git a/tests/attacks/test_qmia_attack.py b/tests/attacks/test_qmia_attack.py
@@ -12,9 +12,7 @@
 from sklearn.datasets import make_classification
 from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestClassifier
-from sklearn.metrics import roc_auc_score
 from sklearn.model_selection import train_test_split
-from sklearn.neighbors import KNeighborsClassifier
 
 from sacroml.attacks import utils
 from sacroml.attacks.qmia_attack import QMIAAttack
@@ -338,92 +336,9 @@ def test_qmia_attack_signal_direction(qmia_binary_target, tmp_path):
     assert instance["AUC"] > 0.5
 
 
-def test_qmia_predicts_canaries(tmp_path):
-    """QMIA should flag label-flipped 'canary' training rows as members.
-
-    Selects training rows nearest a decision boundary (lowest 9-NN
-    same-class confidence) and flips their labels. With ``bootstrap=False``
-    every tree fits every row, so the model memorises these mislabeled
-    rows and their hinge scores blow up. The attack should then assign
-    them member_probs well above genuine non-members (the test set).
-    Default RF with bootstrap=True only shows each row to ~63% of trees,
-    which dilutes the canary signal — bootstrap=False is what makes the
-    memorisation visible.
-    """
-    n_canaries = 8
-    X, y = make_classification(
-        n_samples=400,
-        n_features=10,
-        n_informative=6,
-        n_redundant=0,
-        n_classes=2,
-        class_sep=1.0,
-        random_state=0,
-    )
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.4, stratify=y, random_state=0
-    )
-
-    knn = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)
-    own_class_proba = knn.predict_proba(X_train)[np.arange(len(y_train)), y_train]
-    canary_idx = np.argsort(own_class_proba)[:n_canaries]
-
-    y_train_flipped = y_train.copy()
-    y_train_flipped[canary_idx] = 1 - y_train_flipped[canary_idx]
-
-    model = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=0)
-    model.fit(X_train, y_train_flipped)
-
-    target = Target(
-        model=model,
-        dataset_name="qmia_canaries",
-        X_train=X_train,
-        y_train=y_train_flipped,
-        X_test=X_test,
-        y_test=y_test,
-        X_train_orig=X_train,
-        y_train_orig=y_train_flipped,
-        X_test_orig=X_test,
-        y_test_orig=y_test,
-    )
-    for idx in range(X.shape[1]):
-        target.add_feature(f"V{idx}", [idx], "float")
-
-    attack_obj = QMIAAttack(
-        output_dir=str(tmp_path / "qmia_canaries"),
-        write_report=False,
-        report_individual=True,
-        random_state=0,
-    )
-    output = attack_obj.attack(target)
-
-    assert output["status"] == "success"
-    individual = output["attack_experiment_logger"]["attack_instance_logger"][
-        "instance_0"
-    ]["individual"]
-    member_prob = np.asarray(individual["member_prob"])
-
-    n_train = len(y_train_flipped)
-    canary_mp = member_prob[canary_idx]
-    test_mp = member_prob[n_train:]
-
-    # AUC of canaries (positives) vs genuine non-members (negatives).
-    # >> 0.5 confirms QMIA flags the deliberately memorised rows correctly.
-    y_score = np.concatenate([canary_mp, test_mp])
-    y_true = np.concatenate([np.ones_like(canary_mp), np.zeros_like(test_mp)])
-    canary_vs_test_auc = roc_auc_score(y_true, y_score)
-    assert canary_vs_test_auc > 0.9, (
-        f"QMIA failed to distinguish memorised canaries from non-members: "
-        f"AUC={canary_vs_test_auc:.3f}"
-    )
-
-    # Most canaries should land above the 90th percentile of test scores.
-    test_p90 = np.percentile(test_mp, 90)
-    n_above_p90 = int((canary_mp > test_p90).sum())
-    assert n_above_p90 >= n_canaries - 1, (
-        f"Only {n_above_p90}/{n_canaries} canaries exceed the test 90th "
-        f"percentile ({test_p90:.3f}); canary scores: {sorted(canary_mp.tolist())}"
-    )
+# Cross-attack canary test lives in tests/attacks/test_canary_predictions.py
+# (parametrised across QMIA and LiRA, sharing the canary_target fixture in
+# tests/conftest.py). The earlier QMIA-only test was removed to avoid drift.
 
 
 # ---------------------------------------------------------------------------

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -198,6 +198,74 @@ def get_target(request) -> Target:
     return target
 
 
+@pytest.fixture(name="canary_target")
+def fixture_canary_target() -> tuple[Target, np.ndarray, int]:
+    """Return a Target with deliberately memorised canary training rows.
+
+    Selects the 8 training rows with the lowest 9-NN own-class probability
+    (nearest the decision boundary) and flips their binary labels. With
+    ``bootstrap=False`` every tree fits every row, so the model memorises
+    these mislabelled rows and amplifies their MIA signal. Use for
+    cross-attack canary tests that check whether an attack flags
+    deliberately memorised rows above genuine non-members.
+
+    Returns
+    -------
+    target : Target
+        Wrapped target with the canary-poisoned training set and trained
+        RandomForestClassifier.
+    canary_idx : np.ndarray
+        Indices of the canaries within the training set, so callers can
+        slice the per-record attack output (probability or score).
+    n_train : int
+        Size of the training set, so callers can compute the test-record
+        slice as ``member_prob[n_train:]``.
+    """
+    # Local import: sklearn.neighbors is imported here only when the fixture runs.
+    from sklearn.neighbors import KNeighborsClassifier  # noqa: PLC0415
+
+    n_canaries = 8
+    X, y = make_classification(
+        n_samples=400,
+        n_features=10,
+        n_informative=6,
+        n_redundant=0,
+        n_classes=2,
+        class_sep=1.0,
+        random_state=0,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.4, stratify=y, random_state=0
+    )
+
+    knn = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)
+    own_class_proba = knn.predict_proba(X_train)[np.arange(len(y_train)), y_train]
+    canary_idx = np.argsort(own_class_proba)[:n_canaries]
+
+    y_train_flipped = y_train.copy()
+    y_train_flipped[canary_idx] = 1 - y_train_flipped[canary_idx]
+
+    model = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=0)
+    model.fit(X_train, y_train_flipped)
+
+    target = Target(
+        model=model,
+        dataset_name="canary_target",
+        X_train=X_train,
+        y_train=y_train_flipped,
+        X_test=X_test,
+        y_test=y_test,
+        X_train_orig=X_train,
+        y_train_orig=y_train_flipped,
+        X_test_orig=X_test,
+        y_test_orig=y_test,
+    )
+    for idx in range(X.shape[1]):
+        target.add_feature(f"V{idx}", [idx], "float")
+
+    return target, canary_idx, len(y_train_flipped)
+
+
 @pytest.fixture
 def get_target_multiclass() -> Target:
     """