Skip to content

Commit 2ab3545

Browse files
2 parents 7f0a127 + d824eb3 commit 2ab3545

2 files changed

Lines changed: 184 additions & 58 deletions

File tree

sections/analytics.py

Lines changed: 11 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77

88
from sklearn.preprocessing import StandardScaler
99
from sklearn.preprocessing import OneHotEncoder
10-
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
10+
from sklearn.cluster import KMeans
11+
from sklearn.decomposition import PCA
1112

1213
if "parsed_df" not in st.session_state:
1314
st.session_state.parsed_df = None
@@ -21,7 +22,7 @@
2122
st.stop()
2223

2324
data = st.session_state.parsed_df
24-
data = data.select(["portdest","protocole","regle1","status"])
25+
data = data.select(["portdst","protocole","regle","action"])
2526

2627
# Sélectionner toutes les colonnes numériques
2728
quanti = data.select(pl.col(pl.Int64))
@@ -75,21 +76,13 @@
7576
.groupby("cluster_kmeans", group_keys=False)
7677
.apply(lambda x: x.sample(frac=0.05, random_state=42))
7778
)
78-
# dbscan = DBSCAN(eps=0.5, min_samples=10)
79-
# preds = dbscan.fit_predict(df.to_pandas())
80-
# df = df.with_columns(pl.Series(values=preds, name='cluster_dbscan'))
81-
82-
# agg_clustering = AgglomerativeClustering(n_clusters=2)
83-
# preds = agg_clustering.fit_predict(df.to_pandas())
84-
# df = df.with_columns(pl.Series(values=preds, name='cluster_agg'))
8579

8680
###############################################################
8781
#### Visualisation des clusters ####
8882
###############################################################
8983

9084

91-
# Visualisation des clusters (en 2D avec PCA)
92-
from sklearn.decomposition import PCA
85+
# Visualisation des clusters (en 2D avec PCA)
9386

9487
pca = PCA(n_components=2)
9588
df_pca = pca.fit_transform(df_ech.to_pandas())
@@ -117,27 +110,8 @@
117110
with st.spinner("Performing some more data analysis..."):
118111
try:
119112
data = data.with_columns(pl.Series(name="cluster_kmeans", values=df_clust.select("cluster_kmeans")))
120-
cols = ["protocole","regle1","status"]
121-
for col in cols:
122-
# fig = px.bar(freq_df, x=col, y='frequency',
123-
# title=f'{col} frequency',
124-
# labels={'categorie': 'Category', 'frequence': 'Frequency'},
125-
# color=col)
126-
# fig.update_layout(xaxis_title='Categories', yaxis_title='Frequency')
127-
# st.plotly_chart(fig, use_container_width=True)
128-
129-
# data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
130-
# freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
131-
132-
# fig = px.bar(freq_df, x=col, y='frequency',
133-
# title=f'{col} frequency',
134-
# labels={'categorie': 'Category', 'frequence': 'Frequency'},
135-
# color=col)
136-
# fig.update_layout(xaxis_title='Categories', yaxis_title='Frequency')
137-
# st.plotly_chart(fig, use_container_width=True)
138-
139-
140-
113+
# Analyse des variables qualitatives par cluster
114+
for col in quali.columns: # protocole, regle, action
141115
fig = make_subplots(rows=1, cols=2)
142116

143117
data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
@@ -166,21 +140,23 @@
166140
)
167141
st.plotly_chart(fig, use_container_width=True)
168142

143+
# Analyse de la variable quantitative par cluster
144+
169145
fig = make_subplots(rows=1, cols=2)
170146

171147
data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
172148

173149
# Ajouter le premier histogramme
174150
fig.add_trace(
175-
go.Histogram(x=data_filtered["portdest"], name="Cluster 0", marker_color="rebeccapurple"),
151+
go.Histogram(x=data_filtered["portdst"], name="Cluster 0", marker_color="rebeccapurple"),
176152
row=1, col=1
177153
)
178154

179155
data_filtered = data.filter(pl.col("cluster_kmeans") == 1)
180156

181157
# Ajouter le deuxième histogramme
182158
fig.add_trace(
183-
go.Histogram(x=data_filtered["portdest"], name="Cluster 1", marker_color="gold"),
159+
go.Histogram(x=data_filtered["portdst"], name="Cluster 1", marker_color="gold"),
184160
row=1, col=2
185161
)
186162

@@ -194,19 +170,4 @@
194170
except Exception as e:
195171
st.error(f"An error occured while doing the data analysis : {e}")
196172
else:
197-
st.warning("Please parse the log file first.")
198-
199-
# Choisir le nombre de clusters (méthode du coude)
200-
# inertia = []
201-
# for k in range(1, 11):
202-
# kmeans = KMeans(n_clusters=k, random_state=42)
203-
# kmeans.fit(df_scaled.to_pandas())
204-
# inertia.append(kmeans.inertia_)
205-
206-
# # Tracer la courbe pour la méthode du coude
207-
# plt.plot(range(1, 11), inertia, marker='o')
208-
# plt.title('Méthode du coude')
209-
# plt.xlabel('Nombre de clusters')
210-
# plt.ylabel('Inertie')
211-
# plt.show()
212-
173+
st.warning("Please parse the log file first.")

sections/analyze.py

Lines changed: 173 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
import polars as pl
22
import streamlit as st
3+
import ipaddress
4+
import plotly.express as px
5+
import plotly.graph_objs as go
6+
import pandas as pd
37

48
if "parsed_df" not in st.session_state:
59
st.session_state.parsed_df = None
@@ -14,6 +18,20 @@
1418

1519
data = st.session_state.parsed_df
1620

21+
# Address blocks that make up the university's internal addressing plan.
university_subnets = [
    ipaddress.ip_network("192.168.0.0/16"),
    ipaddress.ip_network("10.79.0.0/16"),
    ipaddress.ip_network("159.84.0.0/16"),
]

def is_university_ip(ip):
    """Return True if *ip* belongs to one of the university subnets.

    Unparsable input (anything ``ipaddress.ip_address`` rejects) is treated
    as non-university and yields False rather than raising.
    """
    try:
        addr = ipaddress.ip_address(ip)
    except ValueError:
        return False
    for subnet in university_subnets:
        if addr in subnet:
            return True
    return False
34+
1735
# Créer les onglets principaux
1836
tab1, tab2, tab3, tab4 = st.tabs(
1937
["Dataviz", "Analysis", "Foreign IP addresses", "Sankey"]
@@ -172,18 +190,165 @@ def set_dynamic():
172190
)
173191
st.dataframe(top_ips, use_container_width=True)
174192

193+
# Graphique
194+
195+
st.write("### 🔴 Analysis of Blocked Attempts")
196+
197+
if "ipsrc" in data.columns and "action" in data.columns:
198+
# Filtrer uniquement les tentatives bloquées
199+
blocked_attempts = data.filter(pl.col("action") == "DENY")
200+
201+
# Compter les occurrences des IP sources bloquées
202+
blocked_ips = (
203+
blocked_attempts
204+
.group_by("ipsrc")
205+
.agg(pl.count("ipsrc").alias("count"))
206+
.sort("count", descending=True)
207+
)
208+
209+
210+
top_n = st.slider(" ", 5, 20, 10, key="top_n_slider")
211+
212+
# Sélectionner le Top N des IP bloquées
213+
top_blocked_ips = blocked_ips.head(top_n)
214+
215+
216+
# ---- GRAPHIQUE AVEC PLOTLY ----
217+
color_palette = px.colors.sequential.Blues
218+
if not top_blocked_ips.is_empty():
219+
fig = px.bar(
220+
top_blocked_ips.to_pandas(), # Convertir en DataFrame Pandas pour Plotly
221+
x="count",
222+
y="ipsrc",
223+
orientation="h",
224+
text="count",
225+
title=f"Top {top_n} Most Blocked IPs",
226+
labels={"ipsrc": "IP Source", "count": "Number of Blocked Attempts"},
227+
color_discrete_sequence=["#3d85c6"]
228+
)
229+
230+
# Amélioration du layout
231+
fig.update_traces(texttemplate='%{text}', textposition='inside')
232+
fig.update_layout(yaxis=dict(categoryorder="total ascending"))
233+
234+
# Afficher le graphique interactif
235+
st.plotly_chart(fig, use_container_width=True)
236+
else:
237+
st.info("No blocked attempts found.")
238+
else:
239+
st.warning("Columns 'ipsrc' or 'action' not found.")
240+
241+
# Graphique de série temporelle des connexions par heure
242+
st.write("### 📊 Hourly Connection Activity")
243+
244+
if "timestamp" in data.columns:
245+
# Extraire uniquement les connexions autorisées (PERMIT) et valider le format datetime
246+
activity_data = (
247+
data
248+
.filter(pl.col("action") == "PERMIT") # Ne garder que les connexions autorisées
249+
.with_columns(pl.col("timestamp").dt.strftime("%Y-%m-%d %H:00:00").alias("hour")) # Normaliser à l'heure
250+
.group_by("hour")
251+
.agg(pl.count("hour").alias("connection_count")) # Compter les connexions par heure
252+
.sort("hour") # Trier chronologiquement
253+
)
254+
255+
# Vérifier si on a des données après filtrage
256+
if not activity_data.is_empty():
257+
# Convertir en DataFrame Pandas pour Plotly
258+
df_activity = activity_data.to_pandas()
259+
df_activity["hour"] = pd.to_datetime(df_activity["hour"]) # Assurer le bon format datetime
260+
261+
# Tracer le graphique
262+
fig = px.line(
263+
df_activity,
264+
x="hour",
265+
y="connection_count",
266+
markers=True, # Ajouter des points pour bien voir les pics
267+
title="Hourly Connection Activity",
268+
labels={"hour": "Hour", "connection_count": "Number of Connections"},
269+
line_shape="spline" # Rendre les courbes lisses
270+
)
271+
272+
# Afficher le graphique
273+
st.plotly_chart(fig, use_container_width=True)
274+
else:
275+
st.info("No connection data found for the selected period.")
276+
else:
277+
st.warning("Column 'timestamp' not found.")
278+
279+
175280

176281
# Onglet Foreign IP addresses
177282
with tab3:
178-
# Afficher ici la liste des accès hors plan d’adressage universitaire
179-
st.write("### 🚫 List of access outside the university network")
180-
external_access = data.filter(
181-
~pl.col("ipdst").cast(pl.Utf8).str.contains(r"^192\.168\.")
182-
& ~pl.col("ipdst").cast(pl.Utf8).str.contains(r"^10\.79\.")
183-
& ~pl.col("ipdst").cast(pl.Utf8).str.contains(r"^159\.84\.")
184-
)
185-
st.dataframe(external_access, use_container_width=True)
283+
st.subheader("🚫 List of access outside the university network")
284+
285+
if "ipsrc" in data.columns and "action" in data.columns:
286+
# Conversion des IPs en chaînes de caractères pour éviter les erreurs de type
287+
data = data.with_columns([
288+
pl.col("ipsrc").cast(pl.Utf8).alias("ipsrc"),
289+
pl.col("action").cast(pl.Utf8).alias("action")
290+
])
291+
292+
# Vérification des IPs avec la fonction is_university_ip
293+
data = data.with_columns([
294+
pl.col("ipsrc").map_elements(is_university_ip, return_dtype=pl.Boolean).alias("is_src_university_ip")
295+
])
296+
297+
# filtrer toutes les connexions impliquant une adresse externe
298+
intrusion_attempts = data.filter(
299+
(~pl.col("is_src_university_ip"))
300+
)
301+
# Ajout d'un filtre par action
302+
selected_action = st.selectbox("Select action type", ["All", "PERMIT", "DENY"])
303+
304+
if selected_action != "All":
305+
intrusion_attempts = intrusion_attempts.filter(
306+
pl.col("action") == selected_action
307+
)
308+
# Affichage des accès externes
309+
st.write(f"### 🔍 External accesses: {intrusion_attempts.shape[0]} entries")
310+
st.dataframe( intrusion_attempts.drop(["is_src_university_ip"]), use_container_width=True)
311+
312+
else:
313+
st.warning("Columns 'ipsrc' not found.")
314+
315+
186316

187317
# Onglet Sankey
188318
with tab4:
189319
st.subheader("Sankey Diagram")
320+
321+
def create_sankey(df, source_col, target_col):
    """Render a Sankey diagram of the flows between two columns of *df*.

    Parameters
    ----------
    df : polars.DataFrame
        Data to aggregate; each (source, target) pair becomes one link.
    source_col, target_col : str
        Column names used as the left and right side of the diagram.
    """
    # Count each (source, target) pair. Use `group_by` (the current polars
    # API, consistent with the rest of this file) — `groupby` is the removed
    # legacy spelling. `.len()` puts the pair counts in a "len" column.
    df_grouped = df.group_by([source_col, target_col]).len().to_pandas()

    # Nodes: one label per distinct value appearing in either column.
    labels = list(pd.concat([df_grouped[source_col], df_grouped[target_col]]).unique())
    label_to_index = {label: i for i, label in enumerate(labels)}

    # Links: map each pair to its node indices, weighted by the pair count.
    sources = df_grouped[source_col].map(label_to_index)
    targets = df_grouped[target_col].map(label_to_index)
    values = df_grouped["len"]

    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15, thickness=20, line=dict(color="black", width=0.5),
            label=labels
        ),
        link=dict(
            source=sources, target=targets, value=values
        )
    ))

    fig.update_layout(title_text=f"Flux entre {source_col} et {target_col}", font_size=10)
    st.plotly_chart(fig, use_container_width=True)
347+
348+
# 🔹 Sankey between source IP and destination IP.
# Column names fixed: the parsed frame uses "ipsrc"/"ipdst"/"portdst"
# (see the filters and selects elsewhere in these files), not
# "ip_source"/"ip_destination"/"port_destination".
create_sankey(data, "ipsrc", "ipdst")

# 🔹 Sankey between source IP and destination port.
# Cast ports to strings on `data` (the frame actually plotted); the old
# code mutated an undefined `df` and then plotted the uncast `data`.
data = data.with_columns(pl.col("portdst").cast(pl.Utf8))
create_sankey(data, "ipsrc", "portdst")
354+

0 commit comments

Comments
 (0)