@@ -219,24 +219,34 @@ def find_duplicates(
219219 all_blocks = [item for sublist in block_results for item in sublist ]
220220 print (f"Collected { len (all_blocks )} blocks from unique files.\n " )
221221
222- # Step 5: Aggressive, indexed block-level duplicate detection
222+ # Step 5: Aggressive, indexed block-level duplicate detection
223223 canonical_blocks = []
224224 token_index = defaultdict (set )
225225
226-
227226 for file_path , block_text , line_num in all_blocks :
228227 tokens = tokenize (block_text )
229- length = len (block_text )
228+ if not tokens :
229+ continue
230230
231+ length = len (block_text )
232+ token_count = len (tokens )
231233 sig_tokens = select_signature_tokens (tokens )
234+
235+ # If we have no signature tokens (e.g., all stopwords), just treat as its own canonical block.
232236 if not sig_tokens :
233237 cb_idx = len (canonical_blocks )
234238 canonical_blocks .append (
235- {"text" : block_text , "tokens" : tokens , "length" : length ,
236- "locations" : [(file_path , line_num )]}
239+ {
240+ "text" : block_text ,
241+ "tokens" : tokens ,
242+ "token_count" : token_count ,
243+ "length" : length ,
244+ "locations" : [(file_path , line_num )],
245+ }
237246 )
238247 continue
239248
249+ # Collect candidate canonical block indices via inverted index.
240250 candidate_indices = set ()
241251 for t in sig_tokens :
242252 candidate_indices .update (token_index .get (t , ()))
@@ -245,14 +255,16 @@ def find_duplicates(
245255 for idx in candidate_indices :
246256 cb = canonical_blocks [idx ]
247257
258+ # Quick length ratio filter (same as before).
248259 shorter = min (cb ["length" ], length )
249260 longer = max (cb ["length" ], length )
250261 if longer == 0 :
251262 continue
252263 if (longer - shorter ) / longer > LENGTH_RATIO_TOLERANCE :
253264 continue
254265
255- sim = jaccard (tokens , cb ["tokens" ])
266+ # Faster Jaccard using cached token counts.
267+ sim = jaccard (tokens , cb ["tokens" ], token_count , cb ["token_count" ])
256268 if sim >= similarity_threshold :
257269 cb ["locations" ].append ((file_path , line_num ))
258270 matched = True
@@ -261,12 +273,18 @@ def find_duplicates(
261273 if not matched :
262274 cb_idx = len (canonical_blocks )
263275 canonical_blocks .append (
264- {"text" : block_text , "tokens" : tokens , "length" : length ,
265- "locations" : [(file_path , line_num )]}
276+ {
277+ "text" : block_text ,
278+ "tokens" : tokens ,
279+ "token_count" : token_count ,
280+ "length" : length ,
281+ "locations" : [(file_path , line_num )],
282+ }
266283 )
267284 for t in sig_tokens :
268285 token_index [t ].add (cb_idx )
269286
287+
270288 # Step 6: Keep only blocks appearing in >1 file
271289 duplicates = []
272290 for cb in canonical_blocks :