-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSubstringDistance.py
More file actions
399 lines (330 loc) · 13 KB
/
SubstringDistance.py
File metadata and controls
399 lines (330 loc) · 13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
from PyQt5.QtWidgets import (
QDialog,
QVBoxLayout,
QHBoxLayout,
QComboBox,
QLabel,
QPushButton,
QGroupBox,
QScrollArea,
)
from PyQt5.QtCore import Qt
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.backends.backend_qt5agg import NavigationToolbar2QT as NavigationToolbar
import matplotlib.pyplot as plt
from collections import Counter
import markdown
class SubstringDistance(QDialog):
"""
Substring Distance Analysis - Finds repeating patterns to estimate key length
This tool searches for repeated substrings in the ciphertext and measures the
distances between their occurrences. Key lengths often divide these distances evenly.
"""
def __init__(self, parent):
super(SubstringDistance, self).__init__(parent)
self.parent1 = parent
# Window setup
self.setWindowTitle("Substring Distance Analysis - Key Length Estimation")
self.setMinimumSize(800, 600)
# Create matplotlib figure with better styling
plt.style.use("seaborn-v0_8-darkgrid")
self.figure = plt.figure(figsize=(10, 6))
self.canvas = FigureCanvas(self.figure)
self.toolbar = NavigationToolbar(self.canvas, self)
# Main layout
main_layout = QVBoxLayout()
# Add explanation section
self.add_explanation_section(main_layout)
# Add controls section
self.add_controls_section(main_layout)
# Add toolbar and canvas
main_layout.addWidget(self.toolbar)
main_layout.addWidget(self.canvas)
# Add results section
self.add_results_section(main_layout)
self.setLayout(main_layout)
# Initial plot
self.plot()
def add_explanation_section(self, layout):
"""Add explanation of what this analysis does"""
explanation_group = QGroupBox("ℹ️ How This Analysis Works")
explanation_group.setStyleSheet(
"""
QGroupBox {
font-weight: bold;
border: 2px solid #2196F3;
border-radius: 5px;
margin-top: 10px;
padding-top: 10px;
}
QGroupBox::title {
color: #2196F3;
}
"""
)
explanation_layout = QVBoxLayout()
explanation_text = QLabel(
markdown.markdown(
"""
## What is Substring Distance Analysis?
This tool finds repeating sequences (substrings) in your ciphertext and measures the distances
between their occurrences. In a Vigenère cipher, these distances are often multiples of the key length.
## How to interpret the results:
- **The bar chart** shows how often each distance appears between repeated substrings
- **Tall bars** indicate common distances - these are important!
- **The key length estimates** show which key lengths best explain the pattern of distances
- Look for a key length that appears consistently across different substring lengths
## Example:
If you see many distances at 15, 30, 45, 60... the key length is likely 15 (or possibly 5, 3, or even 1).
## Tips:
- Start with substring length 3-4 (good balance of accuracy and data)
- Try different substring lengths to confirm your findings
- Combine with Friedman Test for best results
- The "Top Key Length Candidates" shows the most likely key lengths
"""
)
)
explanation_text.setWordWrap(True)
explanation_text.setStyleSheet("padding: 10px; font-weight: normal;")
scroll = QScrollArea()
scroll.setWidget(explanation_text)
explanation_layout.addWidget(scroll)
explanation_group.setLayout(explanation_layout)
layout.addWidget(explanation_group)
def add_controls_section(self, layout):
"""Add controls for adjusting the analysis"""
controls_group = QGroupBox("🔧 Analysis Settings")
controls_group.setStyleSheet(
"""
QGroupBox {
font-weight: bold;
border: 2px solid #FF9800;
border-radius: 5px;
margin-top: 10px;
padding-top: 10px;
}
QGroupBox::title {
color: #FF9800;
}
"""
)
controls_layout = QHBoxLayout()
# Substring length selector
length_label = QLabel("Substring Length:")
length_label.setToolTip(
"Length of repeated sequences to search for.\n"
"3-4 is usually best for initial analysis.\n"
"Longer = more specific, but less data.\n"
"Shorter = more data, but less reliable."
)
length_label.setStyleSheet("font-weight: bold;")
controls_layout.addWidget(length_label)
self.lengthbox = QComboBox()
self.lengthbox.addItems([str(x) for x in range(2, 21)])
self.lengthbox.setCurrentText("3") # Default to 3
self.lengthbox.setToolTip("Select the length of substrings to analyze")
self.lengthbox.currentIndexChanged.connect(self.plot)
controls_layout.addWidget(self.lengthbox)
# Info about current selection
self.info_label = QLabel()
self.info_label.setStyleSheet(
"color: #666; font-style: italic; margin-left: 20px;"
)
controls_layout.addWidget(self.info_label)
controls_layout.addStretch()
# Refresh button
refresh_button = QPushButton("🔄 Refresh Analysis")
refresh_button.setToolTip("Re-run the analysis with current settings")
refresh_button.clicked.connect(self.plot)
controls_layout.addWidget(refresh_button)
controls_group.setLayout(controls_layout)
layout.addWidget(controls_group)
def add_results_section(self, layout):
"""Add results display section"""
results_group = QGroupBox("📊 Analysis Results")
results_group.setStyleSheet(
"""
QGroupBox {
font-weight: bold;
border: 2px solid #4CAF50;
border-radius: 5px;
margin-top: 10px;
padding-top: 10px;
}
QGroupBox::title {
color: #4CAF50;
}
"""
)
results_layout = QVBoxLayout()
# Top candidates label
candidates_header = QLabel(
"🎯 Top Key Length Candidates (ranked by likelihood):"
)
candidates_header.setStyleSheet("font-weight: bold; color: #4CAF50;")
results_layout.addWidget(candidates_header)
# Guess label with better formatting
self.guesslabel = QLabel("Run analysis to see results...")
self.guesslabel.setWordWrap(True)
self.guesslabel.setStyleSheet(
"""
QLabel {
padding: 10px;
background-color: #f0f0f0;
border: 1px solid #ccc;
border-radius: 4px;
font-family: 'Courier New', monospace;
}
"""
)
results_layout.addWidget(self.guesslabel)
# Statistics label
self.stats_label = QLabel()
self.stats_label.setStyleSheet(
"color: #666; font-style: italic; margin-top: 5px;"
)
results_layout.addWidget(self.stats_label)
# Interpretation hint
hint_label = QLabel(
"💡 Hint: The first number in each pair is the key length estimate, "
"the second is how many distance patterns support it. Higher is better!"
)
hint_label.setWordWrap(True)
hint_label.setStyleSheet("color: #2196F3; font-style: italic; margin-top: 5px;")
results_layout.addWidget(hint_label)
results_group.setLayout(results_layout)
layout.addWidget(results_group)
def plot(self):
"""Generate and display the substring distance analysis"""
# Get ciphertext
text = self.parent1.ciphertextbox.toPlainText()
if not text:
self.guesslabel.setText(
"⚠️ No ciphertext to analyze. Please enter text in the main window."
)
self.info_label.setText("")
self.stats_label.setText("")
self.figure.clear()
self.canvas.draw()
return
length = int(self.lengthbox.currentText())
# Calculate distances
data = Counter(self.find_substring_distances(text, length))
# Update info label
total_patterns = sum(data.values())
unique_distances = len(data)
self.info_label.setText(
f"Found {total_patterns} repeated patterns with {unique_distances} unique distances"
)
# Clear old figure
self.figure.clear()
if not data:
# No data found
ax = self.figure.add_subplot()
ax.text(
0.5,
0.5,
f"No repeated substrings of length {length} found.\n\n"
f"Try:\n"
f"• Using a shorter substring length\n"
f"• Checking if you have enough ciphertext\n"
f"• Verifying your ciphertext is correct",
horizontalalignment="center",
verticalalignment="center",
transform=ax.transAxes,
fontsize=12,
bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5),
)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis("off")
self.guesslabel.setText("⚠️ No patterns found with current settings.")
self.stats_label.setText("")
self.canvas.draw()
return
# Calculate key length votes
votes = dict()
max_distance = max(data.keys())
for i in range(1, min(max_distance + 1, 50)): # Limit to reasonable key lengths
for k, v in data.items():
if k % i == 0:
votes[i] = votes.get(i, 0) + v
# Format top candidates nicely
top_candidates = sorted(
votes.items(), key=lambda x: (x[1], -x[0]), reverse=True
)[:10]
# Create formatted display
result_lines = []
for rank, (key_len, score) in enumerate(top_candidates, 1):
# Add medals for top 3
if rank == 1:
medal = "🥇"
elif rank == 2:
medal = "🥈"
elif rank == 3:
medal = "🥉"
else:
medal = f"{rank}."
result_lines.append(
f"{medal} Key length {key_len:2d} → Score: {score:4d}"
)
self.guesslabel.setText("\n".join(result_lines))
# Update statistics
if top_candidates:
best_key = top_candidates[0][0]
best_score = top_candidates[0][1]
self.stats_label.setText(
f"Most likely key length: {best_key} (confidence score: {best_score})"
)
# Create the bar chart
ax = self.figure.add_subplot()
x, y = zip(*sorted(data.items()))
# Color bars based on whether they're multiples of the best candidate
if top_candidates:
best_key = top_candidates[0][0]
colors = ["#4CAF50" if dist % best_key == 0 else "#2196F3" for dist in x]
else:
colors = "#2196F3"
bars = ax.bar(x, y, color=colors, edgecolor="black", linewidth=0.5)
# Styling
ax.set_xlabel(
"Distance Between Repeated Substrings", fontsize=11, fontweight="bold"
)
ax.set_ylabel(
"Frequency (Number of Occurrences)", fontsize=11, fontweight="bold"
)
ax.set_title(
f"Substring Distance Distribution (substring length = {length})\n"
f'Green bars are multiples of the top candidate (key length {top_candidates[0][0] if top_candidates else "?"})',
fontsize=12,
fontweight="bold",
)
ax.grid(True, alpha=0.3, linestyle="--")
# Only show every nth tick if there are too many
if len(x) > 30:
ax.set_xticks(ax.get_xticks()[::2])
# Tight layout for better spacing
self.figure.tight_layout()
# Refresh canvas
self.canvas.draw()
def find_substring_distances(self, text, length):
"""
Find distances between repeated substrings
Args:
text: The ciphertext to analyze
length: Length of substrings to search for
Returns:
List of distances between repeated substring occurrences
"""
distances = []
seen = dict()
for i in range(len(text) - (length - 1)):
substring = text[i : i + length]
if substring in seen:
# Found a repeat - calculate distances from all previous occurrences
for previous_position in seen[substring]:
distances.append(i - previous_position)
seen[substring].append(i)
else:
seen[substring] = [i]
return distances