Skip to content

Commit 291a018

Browse files
committed
#42 reduces the number of blocks
1 parent b46f412 commit 291a018

1 file changed

Lines changed: 26 additions & 8 deletions

File tree

TidyObsidian/find-duplicate-blocks.py

Lines changed: 26 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -219,24 +219,34 @@ def find_duplicates(
219219
all_blocks = [item for sublist in block_results for item in sublist]
220220
print(f"Collected {len(all_blocks)} blocks from unique files.\n")
221221

222-
# Step 5: Aggressive, indexed block-level duplicate detection
222+
# Step 5: Aggressive, indexed block-level duplicate detection
223223
canonical_blocks = []
224224
token_index = defaultdict(set)
225225

226-
227226
for file_path, block_text, line_num in all_blocks:
228227
tokens = tokenize(block_text)
229-
length = len(block_text)
228+
if not tokens:
229+
continue
230230

231+
length = len(block_text)
232+
token_count = len(tokens)
231233
sig_tokens = select_signature_tokens(tokens)
234+
235+
# If we have no signature tokens (e.g., all stopwords), just treat as its own canonical block.
232236
if not sig_tokens:
233237
cb_idx = len(canonical_blocks)
234238
canonical_blocks.append(
235-
{"text": block_text, "tokens": tokens, "length": length,
236-
"locations": [(file_path, line_num)]}
239+
{
240+
"text": block_text,
241+
"tokens": tokens,
242+
"token_count": token_count,
243+
"length": length,
244+
"locations": [(file_path, line_num)],
245+
}
237246
)
238247
continue
239248

249+
# Collect candidate canonical block indices via inverted index.
240250
candidate_indices = set()
241251
for t in sig_tokens:
242252
candidate_indices.update(token_index.get(t, ()))
@@ -245,14 +255,16 @@ def find_duplicates(
245255
for idx in candidate_indices:
246256
cb = canonical_blocks[idx]
247257

258+
# Quick length ratio filter (same as before).
248259
shorter = min(cb["length"], length)
249260
longer = max(cb["length"], length)
250261
if longer == 0:
251262
continue
252263
if (longer - shorter) / longer > LENGTH_RATIO_TOLERANCE:
253264
continue
254265

255-
sim = jaccard(tokens, cb["tokens"])
266+
# Faster Jaccard using cached token counts.
267+
sim = jaccard(tokens, cb["tokens"], token_count, cb["token_count"])
256268
if sim >= similarity_threshold:
257269
cb["locations"].append((file_path, line_num))
258270
matched = True
@@ -261,12 +273,18 @@ def find_duplicates(
261273
if not matched:
262274
cb_idx = len(canonical_blocks)
263275
canonical_blocks.append(
264-
{"text": block_text, "tokens": tokens, "length": length,
265-
"locations": [(file_path, line_num)]}
276+
{
277+
"text": block_text,
278+
"tokens": tokens,
279+
"token_count": token_count,
280+
"length": length,
281+
"locations": [(file_path, line_num)],
282+
}
266283
)
267284
for t in sig_tokens:
268285
token_index[t].add(cb_idx)
269286

287+
270288
# Step 6: Keep only blocks appearing in >1 file
271289
duplicates = []
272290
for cb in canonical_blocks:

0 commit comments

Comments
 (0)