From 9a05bd8ec0e99bbbc0cd17d895cc6b14cba637ec Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 07:53:45 +0000 Subject: [PATCH] perf: drop redundant per-token regex re-validation in tokenize_and_stem _WORD_RE.findall() already returns only maximal [a-z][a-z0-9_-]+ runs (letter-led, length >= 2), so the follow-up 'if _TOKEN_RE.match(t)' filter re-checked the exact same shape and always matched. It was a redundant regex match per token on the index-time and query-time hot path. Remove the filter (and the now-unused _TOKEN_RE) so tokenization does one regex pass instead of two. Output is byte-for-byte identical; verified by a 200k-string fuzz over the two patterns (0 mismatches). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01ECr44xGUy4SDEJRSmDPNZb --- retrieval/stemmer.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/retrieval/stemmer.py b/retrieval/stemmer.py index a6f846d..72484bd 100644 --- a/retrieval/stemmer.py +++ b/retrieval/stemmer.py @@ -16,12 +16,11 @@ # Tokenizer: extracts words starting with a letter (2+ chars). Avoids nltk.data.load() # so the NLTK URL-encoded path-traversal CVE (punkt tokenizer) is not reachable. -# Both regexes compiled once at import rather than on every call. +# Compiled once at import rather than on every call. findall() returns only +# maximal runs of this exact shape, so every token is already letter-led and +# >= 2 chars by construction — no second-pass validation is required. 
_WORD_RE = re.compile(r'[a-z][a-z0-9_-]+') - -_TOKEN_RE = re.compile(r'^[a-z][a-z0-9_-]{1,}$') - _CUSTOM_STEMS = { "embedding": "embed", "embeddings": "embed", "transformer": "transform", "transformers": "transform", @@ -43,8 +42,9 @@ def stem_token(token: str) -> str: return _stemmer.stem(lower) def tokenize_and_stem(text: str) -> List[str]: - tokens = _WORD_RE.findall(text.lower()) - return [ - stem_token(t) for t in tokens - if _TOKEN_RE.match(t) - ] + # _WORD_RE.findall() already guarantees each token matches [a-z][a-z0-9_-]+ + # (letter-led, length >= 2). The previous `if _TOKEN_RE.match(t)` filter + # re-validated that exact same shape and therefore always returned True — + # a redundant per-token regex match on the index/query hot path. Dropping it + # produces byte-for-byte identical output with one fewer regex op per token. + return [stem_token(t) for t in _WORD_RE.findall(text.lower())]