From 726d1bc4d9979734031607017d28badfa50f06ec Mon Sep 17 00:00:00 2001 From: shamykyzer Date: Mon, 25 May 2026 09:50:04 +0300 Subject: [PATCH] test(canary): parametrise canary detection across QMIA and LiRA --- CHANGELOG.md | 8 ++ tests/attacks/test_canary_predictions.py | 97 ++++++++++++++++++++++++ tests/attacks/test_qmia_attack.py | 91 +--------------------- tests/conftest.py | 68 +++++++++++++++++ 4 files changed, 176 insertions(+), 88 deletions(-) create mode 100644 tests/attacks/test_canary_predictions.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 389fb015..fe73c6cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,14 @@ Changes: `"member_prob"` (the attack classifier's membership probability) and `"member"` (the ground truth label), matching the per-record output convention used by `LIRAAttack` and `QMIAAttack`. Arrays are sized to the attack-model test slice. +* Tests: Cross-attack canary detection test (`tests/attacks/test_canary_predictions.py`) + parametrised over `QMIAAttack` and `LIRAAttack`. Replaces the QMIA-only + `test_qmia_predicts_canaries`. Shares a `canary_target` fixture in + `tests/conftest.py` that builds a target with label-flipped boundary rows and a + `bootstrap=False` RandomForest so the model memorises them. WorstCase is not yet + included because its per-record output indexes into an internal `train_test_split` + that does not align with original training-set indices; a follow-up will add a + dedicated WorstCase canary check. ## Version 1.4.3 (Jan 29, 2026) diff --git a/tests/attacks/test_canary_predictions.py b/tests/attacks/test_canary_predictions.py new file mode 100644 index 00000000..ec9e60e4 --- /dev/null +++ b/tests/attacks/test_canary_predictions.py @@ -0,0 +1,97 @@ +"""Cross-attack canary detection tests. + +Each MIA attack should flag deliberately memorised "canary" training rows +as members at higher confidence than genuine non-members. The fixture +``canary_target`` (defined in ``tests/conftest.py``) builds a target whose +training set contains label-flipped rows that sit on the decision +boundary; a non-bagging RandomForestClassifier memorises them, blowing +up their per-record MIA signal. + +WorstCaseAttack is intentionally excluded for now. Its per-record output +indexes into an internal ``train_test_split`` of the combined train+test +predictions, not into the original training set, so canary indices do +not map directly. A WorstCase canary test is a follow-up. +""" + +from __future__ import annotations + +import numpy as np +import pytest +from sklearn.metrics import roc_auc_score + +from sacroml.attacks.likelihood_attack import LIRAAttack +from sacroml.attacks.qmia_attack import QMIAAttack + +# Per-attack thresholds. Shadow-model attacks (LiRA) need looser bounds +# than QMIA's quantile regression because per-canary variance is higher. +CANARY_PARAMS = [ + pytest.param( + QMIAAttack, + {"random_state": 0}, + "member_prob", + 0.90, + 7, + id="qmia", + ), + pytest.param( + LIRAAttack, + {"n_shadow_models": 20}, + "score", + 0.85, + 6, + id="lira", + ), +] + + +@pytest.mark.parametrize( + ("attack_cls", "attack_kwargs", "score_field", "auc_threshold", "canary_threshold"), + CANARY_PARAMS, +) +def test_attack_predicts_canaries( + attack_cls, + attack_kwargs, + score_field, + auc_threshold, + canary_threshold, + canary_target, + tmp_path, +): + """Attack flags memorised canaries above genuine non-members.""" + target, canary_idx, n_train = canary_target + n_canaries = len(canary_idx) + + attack_obj = attack_cls( + output_dir=str(tmp_path / f"canary_{attack_cls.__name__}"), + write_report=False, + report_individual=True, + **attack_kwargs, + ) + output = attack_obj.attack(target) + + individual = output["attack_experiment_logger"]["attack_instance_logger"][ + "instance_0" + ]["individual"] + member_prob = np.asarray(individual[score_field]) + + canary_mp = member_prob[canary_idx] + test_mp = member_prob[n_train:] + + # AUC of canaries (positives) vs genuine non-members (negatives). + # > auc_threshold confirms the attack flags memorised rows correctly. + y_score = np.concatenate([canary_mp, test_mp]) + y_true = np.concatenate([np.ones_like(canary_mp), np.zeros_like(test_mp)]) + canary_vs_test_auc = roc_auc_score(y_true, y_score) + assert canary_vs_test_auc > auc_threshold, ( + f"{attack_cls.__name__} failed canary AUC: " + f"AUC={canary_vs_test_auc:.3f} (threshold {auc_threshold})" + ) + + # Most canaries should land above the 90th percentile of test scores. + test_p90 = np.percentile(test_mp, 90) + n_above_p90 = int((canary_mp > test_p90).sum()) + assert n_above_p90 >= canary_threshold, ( + f"{attack_cls.__name__}: only {n_above_p90}/{n_canaries} canaries " + f"exceed test 90th percentile ({test_p90:.3f}); " + f"canary scores: {sorted(canary_mp.tolist())}" + ) diff --git a/tests/attacks/test_qmia_attack.py b/tests/attacks/test_qmia_attack.py index 6261dbbb..8db78726 100644 --- a/tests/attacks/test_qmia_attack.py +++ b/tests/attacks/test_qmia_attack.py @@ -12,9 +12,7 @@ from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestClassifier -from sklearn.metrics import roc_auc_score from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier from sacroml.attacks import utils from sacroml.attacks.qmia_attack import QMIAAttack @@ -338,92 +336,9 @@ def test_qmia_attack_signal_direction(qmia_binary_target, tmp_path): assert instance["AUC"] > 0.5 -def test_qmia_predicts_canaries(tmp_path): - """QMIA should flag label-flipped 'canary' training rows as members. - - Selects training rows nearest a decision boundary (lowest 9-NN - same-class confidence) and flips their labels. With ``bootstrap=False`` - every tree fits every row, so the model memorises these mislabeled - rows and their hinge scores blow up. The attack should then assign - them member_probs well above genuine non-members (the test set). - Default RF with bootstrap=True only shows each row to ~63% of trees, - which dilutes the canary signal — bootstrap=False is what makes the - memorisation visible. - """ - n_canaries = 8 - X, y = make_classification( - n_samples=400, - n_features=10, - n_informative=6, - n_redundant=0, - n_classes=2, - class_sep=1.0, - random_state=0, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.4, stratify=y, random_state=0 - ) - - knn = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train) - own_class_proba = knn.predict_proba(X_train)[np.arange(len(y_train)), y_train] - canary_idx = np.argsort(own_class_proba)[:n_canaries] - - y_train_flipped = y_train.copy() - y_train_flipped[canary_idx] = 1 - y_train_flipped[canary_idx] - - model = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=0) - model.fit(X_train, y_train_flipped) - - target = Target( - model=model, - dataset_name="qmia_canaries", - X_train=X_train, - y_train=y_train_flipped, - X_test=X_test, - y_test=y_test, - X_train_orig=X_train, - y_train_orig=y_train_flipped, - X_test_orig=X_test, - y_test_orig=y_test, - ) - for idx in range(X.shape[1]): - target.add_feature(f"V{idx}", [idx], "float") - - attack_obj = QMIAAttack( - output_dir=str(tmp_path / "qmia_canaries"), - write_report=False, - report_individual=True, - random_state=0, - ) - output = attack_obj.attack(target) - - assert output["status"] == "success" - individual = output["attack_experiment_logger"]["attack_instance_logger"][ - "instance_0" - ]["individual"] - member_prob = np.asarray(individual["member_prob"]) - - n_train = len(y_train_flipped) - canary_mp = member_prob[canary_idx] - test_mp = member_prob[n_train:] - - # AUC of canaries (positives) vs genuine non-members (negatives). - # >> 0.5 confirms QMIA flags the deliberately memorised rows correctly. - y_score = np.concatenate([canary_mp, test_mp]) - y_true = np.concatenate([np.ones_like(canary_mp), np.zeros_like(test_mp)]) - canary_vs_test_auc = roc_auc_score(y_true, y_score) - assert canary_vs_test_auc > 0.9, ( - f"QMIA failed to distinguish memorised canaries from non-members: " - f"AUC={canary_vs_test_auc:.3f}" - ) - - # Most canaries should land above the 90th percentile of test scores. - test_p90 = np.percentile(test_mp, 90) - n_above_p90 = int((canary_mp > test_p90).sum()) - assert n_above_p90 >= n_canaries - 1, ( - f"Only {n_above_p90}/{n_canaries} canaries exceed the test 90th " - f"percentile ({test_p90:.3f}); canary scores: {sorted(canary_mp.tolist())}" - ) +# Cross-attack canary test lives in tests/attacks/test_canary_predictions.py +# (parametrised across QMIA and LiRA, sharing the canary_target fixture in +# tests/conftest.py). The earlier QMIA-only test was removed to avoid drift. # --------------------------------------------------------------------------- diff --git a/tests/conftest.py b/tests/conftest.py index 7efca252..cf085fbf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -198,6 +198,74 @@ def get_target(request) -> Target: return target +@pytest.fixture(name="canary_target") +def fixture_canary_target() -> tuple[Target, np.ndarray, int]: + """Return a Target with deliberately memorised canary training rows. + + Selects training rows nearest a decision boundary (lowest 9-NN + same-class confidence) and flips their labels. With ``bootstrap=False`` + every tree fits every row, so the model memorises these mislabeled + rows and their MIA signal blows up. Use for cross-attack canary tests + that check whether an attack flags deliberately memorised rows above + genuine non-members. + + Returns + ------- + target : Target + Wrapped target with the canary-poisoned training set and trained + RandomForestClassifier. + canary_idx : np.ndarray + Indices of the canaries within the training set, so callers can + slice the per-record member probability output. + n_train : int + Size of the training set, so callers can compute the test-record + slice as ``member_prob[n_train:]``. + """ + # local import keeps the fixture module-light when not used + from sklearn.neighbors import KNeighborsClassifier # noqa: PLC0415 + + n_canaries = 8 + X, y = make_classification( + n_samples=400, + n_features=10, + n_informative=6, + n_redundant=0, + n_classes=2, + class_sep=1.0, + random_state=0, + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.4, stratify=y, random_state=0 + ) + + knn = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train) + own_class_proba = knn.predict_proba(X_train)[np.arange(len(y_train)), y_train] + canary_idx = np.argsort(own_class_proba)[:n_canaries] + + y_train_flipped = y_train.copy() + y_train_flipped[canary_idx] = 1 - y_train_flipped[canary_idx] + + model = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=0) + model.fit(X_train, y_train_flipped) + + target = Target( + model=model, + dataset_name="canary_target", + X_train=X_train, + y_train=y_train_flipped, + X_test=X_test, + y_test=y_test, + X_train_orig=X_train, + y_train_orig=y_train_flipped, + X_test_orig=X_test, + y_test_orig=y_test, + ) + for idx in range(X.shape[1]): + target.add_feature(f"V{idx}", [idx], "float") + + return target, canary_idx, len(y_train_flipped) + + @pytest.fixture def get_target_multiclass() -> Target: """