Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ Changes:
`"member_prob"` (the attack classifier's membership probability) and `"member"`
(the ground truth label), matching the per-record output convention used by
`LIRAAttack` and `QMIAAttack`. Arrays are sized to the attack-model test slice.
* Tests: Cross-attack canary detection test (`tests/attacks/test_canary_predictions.py`)
parametrised over `QMIAAttack` and `LIRAAttack`. Replaces the QMIA-only
`test_qmia_predicts_canaries`. Shares a `canary_target` fixture in
`tests/conftest.py` that builds a target with label-flipped boundary rows and a
`bootstrap=False` RandomForest so the model memorises them. WorstCase is not yet
included because its per-record output indexes into an internal `train_test_split`
that does not align with original training-set indices; a follow-up will add a
dedicated WorstCase canary check.

## Version 1.4.3 (Jan 29, 2026)

Expand Down
97 changes: 97 additions & 0 deletions tests/attacks/test_canary_predictions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Cross-attack canary detection tests.

Each MIA attack should flag deliberately memorised "canary" training rows
as members at higher confidence than genuine non-members. The fixture
``canary_target`` (defined in ``tests/conftest.py``) builds a target whose
training set contains label-flipped rows that sit on the decision
boundary; a non-bagging RandomForestClassifier memorises them, blowing
up their per-record MIA signal.

WorstCaseAttack is intentionally excluded for now. Its per-record output
indexes into an internal ``train_test_split`` of the combined train+test
predictions, not into the original training set, so canary indices do
not map directly. A WorstCase canary test is a follow-up.
"""

from __future__ import annotations

import numpy as np
import pytest
from sklearn.metrics import roc_auc_score

from sacroml.attacks.likelihood_attack import LIRAAttack
from sacroml.attacks.qmia_attack import QMIAAttack

# Per-attack expectations for the canary test. Each case row holds, in order:
#   test id, attack class, constructor kwargs, name of the per-record score
#   field, the AUC that canaries-vs-test must exceed, and the minimum number
#   of canaries that must score above the test-set 90th percentile.
# LiRA (shadow models) gets looser bounds than QMIA (quantile regression)
# because its per-canary variance is higher.
CANARY_PARAMS = [
    pytest.param(cls, kwargs, field, min_auc, min_canaries, id=case_id)
    for case_id, cls, kwargs, field, min_auc, min_canaries in (
        ("qmia", QMIAAttack, {"random_state": 0}, "member_prob", 0.90, 7),
        ("lira", LIRAAttack, {"n_shadow_models": 20}, "score", 0.85, 6),
    )
]


@pytest.mark.parametrize(
    ("attack_cls", "attack_kwargs", "score_field", "auc_threshold", "canary_threshold"),
    CANARY_PARAMS,
)
def test_attack_predicts_canaries(
    attack_cls,
    attack_kwargs,
    score_field,
    auc_threshold,
    canary_threshold,
    canary_target,
    tmp_path,
):
    """Check that an attack scores memorised canaries above non-members.

    The attack is run on the ``canary_target`` fixture and its per-record
    ``score_field`` array is read from the first attack instance. Two checks
    follow: the canary-vs-test-record AUC must exceed ``auc_threshold``, and
    at least ``canary_threshold`` canaries must score above the 90th
    percentile of the test-record scores.

    The slicing relies on the per-record array listing the ``n_train``
    training rows first, followed by the test rows.
    """
    target, canary_idx, n_train = canary_target

    # One output directory per attack class so parametrised runs don't collide.
    run_dir = tmp_path / f"canary_{attack_cls.__name__}"
    attack = attack_cls(
        output_dir=str(run_dir),
        write_report=False,
        report_individual=True,
        **attack_kwargs,
    )
    result = attack.attack(target)

    instance = result["attack_experiment_logger"]["attack_instance_logger"][
        "instance_0"
    ]
    scores = np.asarray(instance["individual"][score_field])

    canary_mp = scores[canary_idx]  # scores of the planted canaries
    nonmember_scores = scores[n_train:]  # scores of the held-out test rows
    n_canaries = len(canary_idx)

    # Canaries are the positive class, held-out rows the negative class.
    labels = np.concatenate(
        [np.ones_like(canary_mp), np.zeros_like(nonmember_scores)]
    )
    ranked = np.concatenate([canary_mp, nonmember_scores])
    canary_vs_test_auc = roc_auc_score(labels, ranked)
    assert canary_vs_test_auc > auc_threshold, (
        f"{attack_cls.__name__} failed canary AUC: "
        f"AUC={canary_vs_test_auc:.3f} (threshold {auc_threshold})"
    )

    # Count the canaries that beat the top decile of non-member scores.
    test_p90 = np.percentile(nonmember_scores, 90)
    n_above_p90 = int(np.count_nonzero(canary_mp > test_p90))
    assert n_above_p90 >= canary_threshold, (
        f"{attack_cls.__name__}: only {n_above_p90}/{n_canaries} canaries "
        f"exceed test 90th percentile ({test_p90:.3f}); "
        f"canary scores: {sorted(canary_mp.tolist())}"
    )
91 changes: 3 additions & 88 deletions tests/attacks/test_qmia_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sacroml.attacks import utils
from sacroml.attacks.qmia_attack import QMIAAttack
Expand Down Expand Up @@ -338,92 +336,9 @@ def test_qmia_attack_signal_direction(qmia_binary_target, tmp_path):
assert instance["AUC"] > 0.5


def test_qmia_predicts_canaries(tmp_path):
    """Check that QMIA flags label-flipped 'canary' training rows as members.

    The eight training rows with the lowest 9-NN own-class probability are
    treated as boundary rows and have their binary labels flipped. A
    RandomForest with ``bootstrap=False`` is then fitted on the flipped
    labels so that every tree sees every row, which is what makes the
    memorisation of the mislabeled rows visible to the attack (with the
    default bootstrapping each row is only shown to a subset of the trees).
    The attack's ``member_prob`` for the canaries is compared against that
    of the held-out test rows.
    """
    n_canaries = 8
    X, y = make_classification(
        n_samples=400,
        n_features=10,
        n_informative=6,
        n_redundant=0,
        n_classes=2,
        class_sep=1.0,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, stratify=y, random_state=0
    )

    # Rank training rows by how strongly their 9 neighbours agree with their
    # own label; the least-supported rows become the canaries.
    knn = KNeighborsClassifier(n_neighbors=9)
    knn.fit(X_train, y_train)
    neighbour_proba = knn.predict_proba(X_train)
    row_ids = np.arange(len(y_train))
    own_class_proba = neighbour_proba[row_ids, y_train]
    canary_idx = np.argsort(own_class_proba)[:n_canaries]

    y_train_flipped = y_train.copy()
    y_train_flipped[canary_idx] = 1 - y_train_flipped[canary_idx]
    n_train = len(y_train_flipped)

    model = RandomForestClassifier(
        n_estimators=100, bootstrap=False, random_state=0
    ).fit(X_train, y_train_flipped)

    target = Target(
        model=model,
        dataset_name="qmia_canaries",
        X_train=X_train,
        y_train=y_train_flipped,
        X_test=X_test,
        y_test=y_test,
        X_train_orig=X_train,
        y_train_orig=y_train_flipped,
        X_test_orig=X_test,
        y_test_orig=y_test,
    )
    for idx in range(X.shape[1]):
        target.add_feature(f"V{idx}", [idx], "float")

    attack = QMIAAttack(
        output_dir=str(tmp_path / "qmia_canaries"),
        write_report=False,
        report_individual=True,
        random_state=0,
    )
    output = attack.attack(target)

    assert output["status"] == "success"
    instance = output["attack_experiment_logger"]["attack_instance_logger"][
        "instance_0"
    ]
    scores = np.asarray(instance["individual"]["member_prob"])

    canary_mp = scores[canary_idx]  # planted canaries
    nonmember_scores = scores[n_train:]  # held-out test rows

    # Canaries are the positive class, held-out rows the negative class;
    # an AUC well above 0.5 means the canaries are ranked as members.
    labels = np.concatenate(
        [np.ones_like(canary_mp), np.zeros_like(nonmember_scores)]
    )
    ranked = np.concatenate([canary_mp, nonmember_scores])
    canary_vs_test_auc = roc_auc_score(labels, ranked)
    assert canary_vs_test_auc > 0.9, (
        f"QMIA failed to distinguish memorised canaries from non-members: "
        f"AUC={canary_vs_test_auc:.3f}"
    )

    # All but at most one canary must beat the top decile of test scores.
    test_p90 = np.percentile(nonmember_scores, 90)
    n_above_p90 = int(np.count_nonzero(canary_mp > test_p90))
    assert n_above_p90 >= n_canaries - 1, (
        f"Only {n_above_p90}/{n_canaries} canaries exceed the test 90th "
        f"percentile ({test_p90:.3f}); canary scores: {sorted(canary_mp.tolist())}"
    )
# Cross-attack canary test lives in tests/attacks/test_canary_predictions.py
# (parametrised across QMIA and LiRA, sharing the canary_target fixture in
# tests/conftest.py). The earlier QMIA-only test was removed to avoid drift.


# ---------------------------------------------------------------------------
Expand Down
68 changes: 68 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,74 @@ def get_target(request) -> Target:
return target


@pytest.fixture(name="canary_target")
def fixture_canary_target() -> tuple[Target, np.ndarray, int]:
    """Build a Target whose training set contains memorised canary rows.

    A synthetic binary classification problem is split 60/40 into train and
    test. The eight training rows with the lowest 9-NN own-class probability
    (i.e. the rows nearest a decision boundary) have their labels flipped,
    and a RandomForestClassifier with ``bootstrap=False`` is fitted on the
    flipped labels so that every tree is trained on every row. Intended for
    cross-attack tests that check whether an attack ranks these deliberately
    memorised rows above genuine non-members.

    Returns
    -------
    target : Target
        Target wrapping the fitted RandomForestClassifier, the
        canary-poisoned training set and the untouched test set.
    canary_idx : np.ndarray
        Positions of the canary rows within the training set.
    n_train : int
        Number of training rows; callers use it to slice off the
        test-record part of a per-record output as ``member_prob[n_train:]``.
    """
    # local import keeps the fixture module-light when not used
    from sklearn.neighbors import KNeighborsClassifier  # noqa: PLC0415

    n_canaries = 8
    X, y = make_classification(
        n_samples=400,
        n_features=10,
        n_informative=6,
        n_redundant=0,
        n_classes=2,
        class_sep=1.0,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, stratify=y, random_state=0
    )

    # Score each training row by how much its 9 neighbours support its own
    # label; the least-supported rows are picked as canaries.
    knn = KNeighborsClassifier(n_neighbors=9)
    knn.fit(X_train, y_train)
    neighbour_proba = knn.predict_proba(X_train)
    row_ids = np.arange(len(y_train))
    own_class_proba = neighbour_proba[row_ids, y_train]
    canary_idx = np.argsort(own_class_proba)[:n_canaries]

    # Flip the binary label of every canary row.
    y_train_flipped = y_train.copy()
    y_train_flipped[canary_idx] = 1 - y_train_flipped[canary_idx]
    n_train = len(y_train_flipped)

    model = RandomForestClassifier(
        n_estimators=100, bootstrap=False, random_state=0
    ).fit(X_train, y_train_flipped)

    target = Target(
        model=model,
        dataset_name="canary_target",
        X_train=X_train,
        y_train=y_train_flipped,
        X_test=X_test,
        y_test=y_test,
        X_train_orig=X_train,
        y_train_orig=y_train_flipped,
        X_test_orig=X_test,
        y_test_orig=y_test,
    )
    # Register one float feature per input column.
    for idx in range(X.shape[1]):
        target.add_feature(f"V{idx}", [idx], "float")

    return target, canary_idx, n_train


@pytest.fixture
def get_target_multiclass() -> Target:
"""
Expand Down