-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathretrain.py
More file actions
142 lines (116 loc) · 5.08 KB
/
retrain.py
File metadata and controls
142 lines (116 loc) · 5.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# retrain.py
"""
Train an unsupervised Isolation Forest for NIDS.
- Loads features from data/flows_features.csv
- Trains IsolationForest on BENIGN samples only
- Tests on a held-out set (benign + attacks)
- Saves model + scaler in models/
- Saves metrics + eval results in models/ (instead of reports/)
The default paths are hard-coded for this project:
- Input: C:/Users/LENOVO/Documents/Projects/Bridge/AI/Anomaly/data/flows_features.csv
- Output: C:/Users/LENOVO/Documents/Projects/Bridge/AI/Anomaly/models
"""
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib
# --- Hardcoded paths ---
# Both locations sit under one project folder; plain string concatenation keeps
# the backslash-separated Windows paths exactly as written.
_PROJECT_ROOT = r"C:\Users\LENOVO\Documents\Projects\Bridge\AI\Anomaly"
INPUT_PATH = _PROJECT_ROOT + r"\data\flows_features.csv"
OUTPUT_DIR = _PROJECT_ROOT + r"\models"

# Create the output folder at import time (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)
def get_numeric_feature_columns(df):
    """Return the names of the numeric feature columns of *df*, in column order.

    The label columns ``attack_label`` and ``is_attack`` are never returned,
    and boolean columns are not treated as numeric.

    Dtypes are classified with pandas' own helpers instead of
    ``np.issubdtype``, which raises ``TypeError`` when handed a pandas
    extension dtype (categorical, nullable integer, string dtype, ...).

    Args:
        df: DataFrame holding the flow features and, optionally, label columns.

    Returns:
        list: Labels of the numeric, non-label columns.
    """
    exclude = {'attack_label', 'is_attack'}
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype
    # Iterate df.dtypes rather than df[c] so every column is inspected exactly
    # once by position, without an extra column lookup per name.
    return [
        col
        for col, dtype in df.dtypes.items()
        if col not in exclude and is_numeric(dtype) and not is_bool(dtype)
    ]
def _stratified_subsample(df, sample_size, rs):
    """Return a shuffled subsample of roughly *sample_size* rows, stratified by ``attack_label``."""
    frac = sample_size / len(df)
    # GroupBy.sample keeps the grouping column in the result. The previous
    # groupby(...).apply(lambda g: g.sample(...)) form depended on apply()
    # operating on the grouping column, which pandas has deprecated; without
    # it 'attack_label' disappears and the label mapping below fails.
    sampled = df.groupby('attack_label').sample(frac=frac, random_state=rs)
    # Shuffle so rows are no longer ordered by label, then renumber the index.
    return sampled.sample(frac=1, random_state=rs).reset_index(drop=True)


def main(input_path=None, output_dir=None, contamination=0.4, sample_size=200000, rs=42):
    """Train an IsolationForest on benign flows, evaluate it and save the artifacts.

    Steps: load the feature CSV, optionally subsample it (stratified by
    ``attack_label``), fit a StandardScaler and an IsolationForest on 60% of
    the benign rows, evaluate on every remaining row (held-out benign plus all
    attacks), then write ``eval_results.csv``, ``isoforest.pkl``,
    ``scaler.pkl`` and ``metrics_summary.csv`` into the output directory.

    Args:
        input_path: CSV file to load. Defaults to the module-level INPUT_PATH.
        output_dir: Directory for all outputs. Defaults to the module-level
            OUTPUT_DIR. Created if missing.
        contamination: Passed to IsolationForest as ``contamination``.
        sample_size: Target row count for the optional subsample; falsy,
            non-positive, or >= the number of rows disables subsampling.
        rs: Random seed used for sampling, the train split and the model.

    Raises:
        ValueError: If there are no numeric feature columns or no benign rows
            to train on.
    """
    input_path = INPUT_PATH if input_path is None else input_path
    output_dir = OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(output_dir, exist_ok=True)

    print(f"Loading data from: {input_path}")
    df = pd.read_csv(input_path)
    print("Total rows in CSV:", len(df))

    # Optional subsample
    if sample_size and 0 < sample_size < len(df):
        print(f"Sampling {sample_size} rows (stratified by label)...")
        df = _stratified_subsample(df, sample_size, rs)
        print("Rows after sampling:", len(df))

    # Map labels: anything that is not BENIGN counts as an attack. Surrounding
    # whitespace is stripped so a padded ' BENIGN' is not mistaken for an attack.
    labels = df['attack_label'].str.strip().str.upper()
    df['is_attack'] = (labels != 'BENIGN').astype(int)

    # Features
    feature_cols = get_numeric_feature_columns(df)
    print("Numeric feature columns used:", feature_cols)
    if not feature_cols:
        raise ValueError("No numeric feature columns found in the input data.")
    # StandardScaler rejects infinite values, so +/-inf is treated like a
    # missing value and replaced by 0 together with NaN.
    X = (
        df[feature_cols]
        .astype(float)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

    # Train/test split
    benign_mask = (df['is_attack'] == 0)
    X_benign = X[benign_mask]
    n_benign = len(X_benign)
    print("Benign rows available for training:", n_benign)
    train_frac = 0.6
    n_train = int(train_frac * n_benign)
    if n_train == 0:
        raise ValueError("Not enough BENIGN rows to train on.")
    rng = np.random.RandomState(rs)
    train_idx = rng.choice(n_benign, size=n_train, replace=False)
    X_benign_train = X_benign.iloc[train_idx]

    # Test set = remaining benign + all attacks
    test_df = df.drop(index=X_benign_train.index)
    X_test = X.loc[test_df.index]
    y_test = test_df['is_attack'].values
    print("Train (benign-only) shape:", X_benign_train.shape)
    print("Test shape (benign+attacks):", X_test.shape)

    # --------- Scale features -------------
    # The scaler is fitted on the training rows only and reused for the test set.
    scaler = StandardScaler()
    scaler.fit(X_benign_train)
    X_train_scaled = scaler.transform(X_benign_train)
    X_test_scaled = scaler.transform(X_test)

    # --------- Train IsolationForest -------------
    print(f"Training IsolationForest (contamination={contamination}) ...")
    iso = IsolationForest(contamination=contamination, random_state=rs, n_jobs=-1)
    iso.fit(X_train_scaled)
    print("Model trained.")

    # --------- Predict ------------
    # predict() returns -1 for anomalies; map that to 1 (= attack).
    preds = iso.predict(X_test_scaled)
    y_pred = (preds == -1).astype(int)

    # --------- Evaluate -----------
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    # Fix the label set so the matrix is always 2x2, even if a class is absent.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print("Evaluation on test set:")
    print("Confusion matrix (rows=true, cols=pred):\n", cm)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # --------- Save eval results ----------
    test_scores = iso.decision_function(X_test_scaled)
    out_eval_df = test_df.assign(
        pred_anomaly=y_pred,
        score=test_scores,
    )
    eval_csv = os.path.join(output_dir, 'eval_results.csv')
    out_eval_df.to_csv(eval_csv, index=False)
    print("Saved per-flow evaluation CSV to:", eval_csv)

    # ------------ Save model + scaler -------------
    joblib.dump(iso, os.path.join(output_dir, 'isoforest.pkl'))
    joblib.dump(scaler, os.path.join(output_dir, 'scaler.pkl'))
    print("Saved model and scaler in:", output_dir)

    # Save summary
    summary = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'n_train_benign': int(X_benign_train.shape[0]),
        'n_test': int(X_test.shape[0]),
    }
    summary_df = pd.DataFrame([summary])
    summary_df.to_csv(os.path.join(output_dir, 'metrics_summary.csv'), index=False)
    print("Saved metrics summary.")
    print("\nClassification report (anomaly=1):")
    print(classification_report(y_test, y_pred, zero_division=0))
if __name__ == '__main__':
main()