-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
54 lines (47 loc) · 1.67 KB
/
preprocess.py
File metadata and controls
54 lines (47 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# ============= Preprocess Dataset ==============
import pandas as pd
import os
print("pandas imported successfully")

# Input and output files sit side by side in the project's data directory,
# located under the current user's home directory.
ROOT = os.path.expanduser("~")
_DATA_DIR_PARTS = ("Documents", "Projects", "Bridge", "AI", "Anomaly", "data")

# Raw flow dataset (parquet) that the script reads.
data_path = os.path.join(ROOT, *_DATA_DIR_PARTS, "CICIDS_Flow.parquet")
# Cleaned subset (CSV) that the script writes.
output_path = os.path.join(ROOT, *_DATA_DIR_PARTS, "flows_clean.csv")
# Source column name -> snake_case name written to the cleaned CSV.
# The keys also define which columns are kept and in what order, so the
# selection and the rename cannot drift apart. "protocol" and "attack_label"
# are already snake_case and therefore map to themselves.
column_renames = {
    "Flow Duration": "flow_duration",
    "Total Fwd Packets": "total_fwd_packets",
    "Total Backward Packets": "total_backward_packets",
    "Total Length of Fwd Packets": "total_length_fwd_packets",
    "Total Length of Bwd Packets": "total_length_bwd_packets",
    "protocol": "protocol",
    "attack_label": "attack_label",
}
selected_cols = list(column_renames)

try:
    # Load only the columns that are kept instead of reading the whole file
    # and discarding the rest afterwards. Names must match the file exactly;
    # a missing column raises and is reported by the handler below.
    df = pd.read_parquet(data_path, columns=selected_cols)
    print("Dataset loaded successfully")

    # Re-select to pin the column order, then rename. rename() returns a new
    # frame, so neither an explicit copy nor in-place mutation is needed.
    df_clean = df[selected_cols].rename(columns=column_renames)

    # Save to CSV (the index is not written).
    df_clean.to_csv(output_path, index=False)
    print(f"✅ Clean dataset saved at: {output_path}")
    print("Shape:", df_clean.shape)
    print("Preview:\n", df_clean.head())
except Exception as e:
    # Top-level boundary of the script: any failure while loading, selecting
    # or saving is reported as a single message rather than a traceback.
    print(f"❌ Error preprocessing dataset: {e}")