# Bloom-Filter-Text-Classification/spam_experiment.py at main · tejfaster/Bloom-Filter-Text-Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import csv
import os
import pandas as pd

from bloom_filter import Bloom_Filter
from features import ext_feat, compute_feature_mut_inf, slt_feat
from inference import post_mbshp_prob, cond_ent
from plots import plot_query_results, plot_mi_threshold_analysis


# Experiment configuration, named once so every stage uses the same values.
MI_THRESHOLD = 0.01                      # MI cut-off used for the main run
SWEEP_THRESHOLDS = (0.001, 0.01, 0.05, 0.1)  # MI cut-offs for the sensitivity sweep
# Value written to the "prior" column of the summary CSV.
# NOTE(review): presumably the prior post_mbshp_prob uses by default — confirm
# against inference.py.
PRIOR = 0.1
BF_CAPACITY = 2500                       # `n` passed to Bloom_Filter
BF_TARGET_FPR = 0.01                     # `fpr` passed to Bloom_Filter
PLOT_SAMPLE_SIZE = 10                    # ham messages analysed individually
ENTROPY_SAMPLE_SIZE = 100                # ham messages averaged per sweep step
RESULTS_DIR = "results/experiments"


def safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero.

    A rate over an empty population is undefined, so NaN is reported instead
    of raising ``ZeroDivisionError`` (e.g. a test split with no ham rows).
    """
    if not denominator:
        return float("nan")
    return numerator / denominator


def mean_or_nan(values):
    """Return the arithmetic mean of *values*, or NaN when it is empty.

    The divisor is the number of values actually supplied, so a sample that
    is shorter than requested is still averaged correctly.
    """
    collected = list(values)
    return safe_ratio(sum(collected), len(collected))


def is_filter_hit(selected, contains):
    """Tell whether a message's selected features all pass the filter.

    Args:
        selected: iterable of the features selected for one message.
        contains: callable returning a truthy value when a single feature is
            reported present (e.g. the ``contains`` method of a filter).

    Returns:
        ``True`` only when *selected* is non-empty and every feature passes
        *contains*. An empty selection is never a hit: ``all()`` over an
        empty iterable is ``True``, which would otherwise flag every message
        without any selected feature.
    """
    features = list(selected)
    if not features:
        return False
    return all(contains(feature) for feature in features)


def write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path* as a CSV file.

    The file is written as UTF-8 so that non-ASCII text in a row does not
    depend on the platform's default encoding.
    """
    with open(path, mode="w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(rows)


def _select(msg, mi_scores, threshold):
    """Extract the features of *msg* and keep those selected at *threshold*."""
    return slt_feat(ext_feat(msg), mi_scores, threshold=threshold)


def _build_filter(spam_messages, mi_scores, threshold):
    """Create a Bloom filter and add the selected features of each spam message."""
    bloom = Bloom_Filter(n=BF_CAPACITY, fpr=BF_TARGET_FPR)
    for msg in spam_messages:
        bloom.add(_select(msg, mi_scores, threshold))
    return bloom


def _count_false_positives(bloom, ham_messages, mi_scores, threshold):
    """Count ham messages whose selected features are all reported present."""
    return sum(
        1 for msg in ham_messages
        if is_filter_hit(_select(msg, mi_scores, threshold), bloom.contains)
    )


def main():
    """Run the spam Bloom-filter experiment end to end.

    Steps: load ``spam.csv``, split it 80/20, compute mutual-information
    scores on the training part, build a Bloom filter from the spam training
    messages, measure the empirical false positive rate on the ham test
    messages, repeat the measurement for several MI thresholds, plot the
    results and save two CSV reports under ``results/experiments``.
    """
    # Load SMS Spam Dataset (tab-separated, no header row)
    df = pd.read_csv("spam.csv", sep="\t", names=["label", "text"])

    # Convert labels to binary
    df["y"] = df["label"].map({"spam": 1, "ham": 0})
    df["features"] = df["text"].apply(ext_feat)

    # Train / test split (80/20); fixed seed keeps the split reproducible
    train_df = df.sample(frac=0.8, random_state=42)
    test_df = df.drop(train_df.index)

    # Compute Mutual Information on Training Data
    print("Computing Mutual Information on training data...")
    mi_scores = compute_feature_mut_inf(
        train_df["features"].tolist(),
        train_df["y"].tolist()
    )

    spam_train = train_df[train_df["y"] == 1]["text"]
    ham_test = test_df[test_df["y"] == 0]
    ham_texts = ham_test["text"]

    # Build Bloom Filter (Spam Only)
    bf = _build_filter(spam_train, mi_scores, MI_THRESHOLD)

    # Per-Message Analysis (for plots)
    results = []
    for msg in ham_texts.head(PLOT_SAMPLE_SIZE):
        post = post_mbshp_prob(bf, msg, mi_scores, threshold=MI_THRESHOLD)
        results.append({
            "query": msg[:20],
            "posterior": post,
            "entropy": cond_ent(post)
        })

    plot_query_results(results)

    # Empirical False Positive Rate: ham messages the filter fully matches
    fp_count = _count_false_positives(bf, ham_texts, mi_scores, MI_THRESHOLD)
    total_ham = len(ham_test)
    actual_fpr = safe_ratio(fp_count, total_ham)

    print("\n--- Bloom Filter False Positive Analysis ---\n")

    print("Theoretical Configuration:")
    print(f"  Target Bloom Filter FPR (per feature): {bf.fpr}\n")

    print("Experimental Evaluation on Real Data:")
    print(f"  Total ham messages tested: {total_ham}")
    print(f"  Ham messages flagged as spam: {fp_count}\n")

    print("Results:")
    print(f"  Actual False Positive Rate: {actual_fpr:.4f} ({actual_fpr*100:.2f}%)\n")

    print("Interpretation:")
    print("  Bloom filter guarantees apply to single-element queries.")
    print("  Text messages contain multiple overlapping features,")
    print("  which violates independence assumptions and inflates FPR.\n")

    # MI Threshold Sensitivity Analysis
    print("\n--- MI Threshold Sensitivity Analysis ---\n")

    thresholds = list(SWEEP_THRESHOLDS)
    fprs = []
    entropies = []

    for t in thresholds:
        # A fresh filter per threshold: the selected features depend on t
        bf_t = _build_filter(spam_train, mi_scores, t)

        # Empirical FPR
        fpr_t = safe_ratio(
            _count_false_positives(bf_t, ham_texts, mi_scores, t),
            total_ham
        )
        fprs.append(fpr_t)

        # Average conditional entropy (subset for speed); averaged over the
        # messages actually available, which may be fewer than the sample size
        avg_entropy = mean_or_nan(
            cond_ent(post_mbshp_prob(bf_t, msg, mi_scores, threshold=t))
            for msg in ham_texts.head(ENTROPY_SAMPLE_SIZE)
        )
        entropies.append(avg_entropy)

        print(f"MI={t:<6} | FPR={fpr_t:.4f} | Avg Entropy={avg_entropy:.4f}")

    # Plot MI threshold analysis graphs
    plot_mi_threshold_analysis(thresholds, fprs, entropies)

    # Save Experiment Summary
    os.makedirs(RESULTS_DIR, exist_ok=True)

    summary_file = f"{RESULTS_DIR}/experiment_summary.csv"
    write_csv(
        summary_file,
        [
            "target_fpr",
            "actual_fpr",
            "total_ham_messages",
            "false_positives",
            "mi_threshold",
            "prior",
            "bf_n",
            "bf_k",
            "bf_m"
        ],
        [[
            bf.fpr,
            actual_fpr,
            total_ham,
            fp_count,
            MI_THRESHOLD,
            PRIOR,
            bf.n,
            bf.k,
            bf.m
        ]]
    )

    print(f"\nExperiment summary saved to {summary_file}")

    # Save Per-Message Results
    details_file = f"{RESULTS_DIR}/per_message_results.csv"
    write_csv(
        details_file,
        [
            "message_preview",
            "posterior_probability",
            "conditional_entropy"
        ],
        [[r["query"], r["posterior"], r["entropy"]] for r in results]
    )

    print(f"Per-message results saved to {details_file}")


if __name__ == "__main__":
    main()