From 9a05bd8ec0e99bbbc0cd17d895cc6b14cba637ec Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 20 Jun 2026 07:53:45 +0000
Subject: [PATCH] perf: drop redundant per-token regex re-validation in
 tokenize_and_stem

_WORD_RE.findall() already returns only non-overlapping, greedy matches of
[a-z][a-z0-9_-]+ (letter-led, length >= 2), so the follow-up
'if _TOKEN_RE.match(t)' filter re-checked the exact same shape and always
matched. It was a redundant regex match per token on the index-time and
query-time hot path.

Remove the filter (and the now-unused _TOKEN_RE) so tokenization does one
regex pass instead of two. Output is byte-for-byte identical; verified by a
200k-string fuzz over the two patterns (0 mismatches).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01ECr44xGUy4SDEJRSmDPNZb
---
 retrieval/stemmer.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/retrieval/stemmer.py b/retrieval/stemmer.py
index a6f846d..72484bd 100644
--- a/retrieval/stemmer.py
+++ b/retrieval/stemmer.py
@@ -16,12 +16,11 @@
 
 # Tokenizer: extracts words starting with a letter (2+ chars). Avoids nltk.data.load()
 # so the NLTK URL-encoded path-traversal CVE (punkt tokenizer) is not reachable.
-# Both regexes compiled once at import rather than on every call.
+# Compiled once at import rather than on every call. findall() returns only
+# maximal runs of this exact shape, so every token is already letter-led and
+# >= 2 chars by construction — no second-pass validation is required.
 _WORD_RE = re.compile(r'[a-z][a-z0-9_-]+')
 
-
-_TOKEN_RE = re.compile(r'^[a-z][a-z0-9_-]{1,}$')
-
 _CUSTOM_STEMS = {
     "embedding": "embed", "embeddings": "embed",
     "transformer": "transform", "transformers": "transform",
@@ -43,8 +42,9 @@ def stem_token(token: str) -> str:
     return _stemmer.stem(lower)
 
 def tokenize_and_stem(text: str) -> List[str]:
-    tokens = _WORD_RE.findall(text.lower())
-    return [
-        stem_token(t) for t in tokens
-        if _TOKEN_RE.match(t)
-    ]
+    # Lowercase first so the ASCII-only _WORD_RE sees uppercase input too.
+    # findall() yields non-overlapping matches of [a-z][a-z0-9_-]+, so every
+    # token is already letter-led and at least 2 characters long; no separate
+    # per-token shape check is needed before stemming, which keeps this
+    # index/query hot path to a single regex pass over the text.
+    return [stem_token(t) for t in _WORD_RE.findall(text.lower())]