diff --git a/scripts/__tests__/session-summarizer.test.ts b/scripts/__tests__/session-summarizer.test.ts index d53d46a..3a6117e 100644 --- a/scripts/__tests__/session-summarizer.test.ts +++ b/scripts/__tests__/session-summarizer.test.ts @@ -179,9 +179,11 @@ describe("keywords extraction", () => { ); expect(result.keywords.length).toBeGreaterThan(0); expect(result.keywords.length).toBeLessThanOrEqual(5); - // "refactor" and "authentication" appear in all prompts, should be top keywords + // "refactor" and "authentication" appear in all prompts, should be top + // keywords. Issue #30 introduced Porter stemming, so `authentication` + // surfaces as the stem `authent`. expect(result.keywords).toContain("refactor"); - expect(result.keywords).toContain("authentication"); + expect(result.keywords).toContain("authent"); // Stop words should not appear for (const kw of result.keywords) { expect(kw.trim().length).toBeGreaterThan(0); diff --git a/scripts/__tests__/tokenizer.test.ts b/scripts/__tests__/tokenizer.test.ts index 78e1894..7ba451e 100644 --- a/scripts/__tests__/tokenizer.test.ts +++ b/scripts/__tests__/tokenizer.test.ts @@ -91,6 +91,30 @@ describe("isNoiseToken", () => { it("returns false for short hex-like strings (<6 chars)", () => { expect(isNoiseToken("abc")).toBe(false); }); + + // Issue #30: HEX_PATTERN now requires at least one digit. Without that + // guard the regex matched plain English words built only from a-f. + it("returns false for English words built from a-f without digits", () => { + expect(isNoiseToken("decade")).toBe(false); + expect(isNoiseToken("facade")).toBe(false); + expect(isNoiseToken("effect")).toBe(false); + expect(isNoiseToken("defaced")).toBe(false); + }); + + it("returns false for 6+ char hex-only words with no digits (Issue #30)", () => { + // `cafebabe` is a famous magic number, but as a standalone token without + // a digit there is no way to distinguish it from a regular English word. 
+ // We err on the side of keeping it; sessions almost never refer to it + // anyway. The previous `^[0-9a-f]{6,}$` would have flagged this; the + // updated pattern does not. + expect(isNoiseToken("cafebabe")).toBe(false); + }); + + it("returns true for hex strings that contain a digit (Issue #30)", () => { + expect(isNoiseToken("abc123def")).toBe(true); + expect(isNoiseToken("0xabc123")).toBe(false); // contains 'x', not pure hex + expect(isNoiseToken("deadbeef0")).toBe(true); + }); }); describe("tokenize", () => { @@ -128,14 +152,51 @@ describe("tokenize", () => { it("handles kebab-case and snake_case", () => { const tokens = tokenize("my-component some_variable"); - expect(tokens).toContain("component"); + // After Issue #30 the non-CJK pipeline stems the surviving tokens with + // Porter, so `component`/`variable` collapse to their stems. + expect(tokens).toContain("compon"); + expect(tokens).toContain("variabl"); // "some" is a stop word, so it's excluded expect(tokens).not.toContain("some"); - expect(tokens).toContain("variable"); }); - it("handles Japanese text without crashing", () => { - expect(() => tokenize("セッションの分析を実行する")).not.toThrow(); + it("segments Japanese text into meaningful word-ish units (Issue #29)", () => { + // Before this fix, the entire Japanese run collapsed into one token + // because `\s+` cannot split text without whitespace. With + // Intl.Segmenter, common kanji compounds like 分析 / 実行 surface as + // individual tokens. + const tokens = tokenize("セッションの分析を実行する"); + expect(tokens).toContain("セッション"); + expect(tokens).toContain("分析"); + expect(tokens).toContain("実行"); + // The whole sentence should NOT survive as one giant token. 
+ expect(tokens).not.toContain("セッションの分析を実行する"); + }); + + it("segments mixed Japanese / English text on both sides", () => { + const tokens = tokenize("TypeScriptの型エラーを修正"); + // English side: lowercased CamelCase split + expect(tokens).toContain("type"); + expect(tokens).toContain("script"); + // Japanese side: 2-char kanji compounds preserved + expect(tokens).toContain("エラー"); + expect(tokens).toContain("修正"); + }); + + it("does not collapse a long Japanese paragraph into a single oversized token (Issue #29 regression)", () => { + // Reproduces the exact bug from #29: the issue's example string used + // to land in the vocabulary as one token. After the fix every token + // should be word-sized, not paragraph-sized. + const text = + "セッションの分析を実行する。次にテストを書く。" + + "実装が完了したらリファクタリングを行い、最後にコードレビューを依頼する。"; + const tokens = tokenize(text); + expect(tokens.length).toBeGreaterThan(3); + for (const t of tokens) { + expect(t.length).toBeLessThanOrEqual(20); + } + // The whole paragraph must not survive as a single token. + expect(tokens).not.toContain(text); }); it("filters out noise tokens", () => { @@ -154,6 +215,71 @@ describe("tokenize", () => { expect(tokens).not.toContain("to"); expect(tokens).not.toContain("go"); }); + + // Issue #30: stemming collapses inflected forms. + it("stems inflected forms to a shared stem", () => { + const tokens = tokenize("running runs ran"); + // `running` and `runs` both reduce to `run` under Porter step1a/1b. + // `ran` is irregular and Porter does not handle it; we accept that + // limitation (Porter is rule-based, not lexicon-based). + expect(tokens).toContain("run"); + expect(tokens.filter((t) => t === "run").length).toBeGreaterThanOrEqual(2); + // The unstemmed surface forms must NOT be in the output. 
+ expect(tokens).not.toContain("running"); + expect(tokens).not.toContain("runs"); + }); + + it("collapses other inflected pairs (test/tested/testing, walk/walked/walking)", () => { + const tested = tokenize("tested testing"); + expect(tested.length).toBe(2); + expect(tested.every((t) => t === "test")).toBe(true); + // Use `walk` rather than `fix` — `fix` is a project-specific stop word + // (constants.ts), so `tokenize("fixed fixing")` would return [] and the + // `.every()` assertion would pass vacuously, hiding any real regression. + const walked = tokenize("walked walking"); + expect(walked.length).toBe(2); + expect(walked.every((t) => t === "walk")).toBe(true); + }); + + // Issue #30: words built only from a-f used to be filtered as hex noise. + it("keeps English words built from a-f as real tokens", () => { + const tokens = tokenize("decade facade effect"); + // After Porter stemming `decade` -> `decad`, `facade` -> `facad`, + // `effect` -> `effect`. The point of this test is that NONE of them + // disappear into the noise filter. + expect(tokens).toContain("decad"); + expect(tokens).toContain("facad"); + expect(tokens).toContain("effect"); + }); + + // Issue #30: the expanded NLTK-parity stop-word list drops several common + // filler words that the old hand-rolled list let through. + it("drops newly added NLTK-parity stop words", () => { + const tokens = tokenize( + "really back even ever say seem tell yeah right thing the test" + ); + // Sanity: substantive vocab survives. + expect(tokens).toContain("test"); + // The new entries must be filtered. Note: `actually` is intentionally NOT + // listed — it is not a member of STOP_WORDS. The literal surface form is + // absent from the output only because Porter stems it to `actual`. + // Asserting `not.toContain("actually")` would therefore pass even without + // the new stop-word list, hiding regressions. 
+ for (const w of [ + "really", + "back", + "even", + "ever", + "say", + "seem", + "tell", + "yeah", + "right", + "thing", + ]) { + expect(tokens).not.toContain(w); + } + }); }); describe("tokenize - large input regression (Issue #18)", () => { diff --git a/scripts/knowledge-graph/constants.ts b/scripts/knowledge-graph/constants.ts index 04f31e9..ac6a8f0 100644 --- a/scripts/knowledge-graph/constants.ts +++ b/scripts/knowledge-graph/constants.ts @@ -5,24 +5,36 @@ // ─── Stop words ───────────────────────────────────────────────────────────── export const STOP_WORDS = new Set([ - // English - "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", - "have", "has", "had", "do", "does", "did", "will", "would", "could", - "should", "may", "might", "shall", "can", "need", "must", "ought", - "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", - "us", "them", "my", "your", "his", "its", "our", "their", "mine", - "yours", "hers", "ours", "theirs", "this", "that", "these", "those", - "what", "which", "who", "whom", "whose", "when", "where", "why", "how", - "all", "each", "every", "both", "few", "more", "most", "other", "some", - "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", - "very", "just", "because", "as", "until", "while", "of", "at", "by", - "for", "with", "about", "against", "between", "through", "during", - "before", "after", "above", "below", "to", "from", "up", "down", "in", - "out", "on", "off", "over", "under", "again", "further", "then", "once", - "here", "there", "and", "but", "or", "if", "else", "also", "like", - "please", "thanks", "thank", "yes", "no", "ok", "okay", "sure", "let", - "make", "use", "using", "used", "want", "see", "look", "try", "get", - "got", "think", "know", "now", "new", "way", "well", "back", "still", + // English — roughly NLTK English stop word parity (~180 entries) plus a + // small project-specific filler set ("file", "code", "change") that NLTK + // does not include but which carry 
no signal in this corpus. + // Issue #30: expanded from ~100 entries to NLTK parity to keep IDF weights + // tight on substantive vocabulary. + "a", "about", "above", "after", "again", "against", "ain", "all", "also", + "am", "an", "and", "any", "are", "aren", "as", "at", "back", "be", + "because", "been", "before", "being", "below", "between", "both", "but", + "by", "can", "could", "couldn", "did", "didn", "do", "does", "doesn", + "doing", "don", "down", "during", "each", "else", "even", "ever", "every", + "few", "for", "from", "further", "get", "give", "go", "going", "got", + "had", "hadn", "has", "hasn", "have", "haven", "having", "he", "her", + "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", + "in", "into", "is", "isn", "it", "its", "itself", "just", "know", "let", + "like", "look", "ma", "make", "many", "may", "me", "might", "mightn", + "mine", "more", "most", "much", "must", "mustn", "my", "myself", "need", + "needn", "new", "no", "nor", "not", "now", "of", "off", "ok", "okay", + "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", + "out", "over", "own", "please", "really", "right", "s", "same", "say", + "see", "seem", "seen", "shall", "shan", "she", "should", "shouldn", "so", + "some", "still", "such", "sure", "t", "tell", "than", "thank", "thanks", + "that", "the", "their", "theirs", "them", "themselves", "then", "there", + "these", "they", "thing", "think", "this", "those", "through", "to", + "too", "try", "under", "until", "up", "us", "use", "used", "using", + "very", "want", "was", "wasn", "way", "we", "well", "were", "weren", + "what", "when", "where", "which", "while", "who", "whom", "whose", "why", + "will", "with", "won", "would", "wouldn", "y", "yeah", "yes", "you", + "your", "yours", "yourself", "yourselves", + // Project-specific filler tokens (not part of NLTK). They appear in nearly + // every Claude Code session and dilute the TF-IDF signal. 
"file", "code", "change", "changes", "add", "update", "fix", "set", // Japanese particles and common words "の", "に", "は", "を", "が", "で", "と", "も", "か", "な", "だ", @@ -31,12 +43,22 @@ export const STOP_WORDS = new Set([ "こと", "もの", "ため", "よう", "から", "まで", "より", "ほど", "など", "ので", "けど", "でも", "しかし", "また", "そして", "って", "という", "ください", "お願い", "確認", + // Verb conjugation fragments and connective auxiliaries that + // `Intl.Segmenter('ja')` emits as standalone segments. These carry no + // standalone signal for TF-IDF / clustering. Content-bearing stems such + // as 行う / 書く / 修正 are intentionally NOT listed here. + // See https://github.com/chigichan24/crune/issues/29 + "ている", "てい", "しない", "次に", "に従って", ]); // ─── Noise token patterns ─────────────────────────────────────────────────── export const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; -export const HEX_PATTERN = /^[0-9a-f]{6,}$/i; +// Hex literal must contain at least one digit. Without the lookahead this +// regex would also match plain English words built only from a-f (e.g. +// "decade", "facade", "effect", "defaced") and incorrectly drop them as +// noise. See https://github.com/chigichan24/crune/issues/30. +export const HEX_PATTERN = /^(?=[0-9a-f]*[0-9])[0-9a-f]{6,}$/i; export const NUM_PATTERN = /^\d+$/; // ─── Structural features ──────────────────────────────────────────────────── diff --git a/scripts/knowledge-graph/porter-stemmer.ts b/scripts/knowledge-graph/porter-stemmer.ts new file mode 100644 index 0000000..92a7e9c --- /dev/null +++ b/scripts/knowledge-graph/porter-stemmer.ts @@ -0,0 +1,258 @@ +/** + * Porter stemmer (Porter 1980 — original algorithm). + * + * Inline implementation, ~70 LoC of substantive logic. We deliberately avoid + * pulling in `natural` / `stemmer` to keep the dependency footprint flat + * (the data pipeline is shipped as a CLI binary; transitive deps cost startup + * time and lockfile noise). + * + * Reference: M.F. 
/**
 * Porter stemmer (Porter 1980 — original algorithm, plus the common
 * `bli -> ble` and `logi -> log` step-2 refinements).
 *
 * Inline implementation so the data pipeline does not need a third-party
 * stemming dependency.
 *
 * Reference: M.F. Porter, "An algorithm for suffix stripping",
 * Program 14(3), 130-137 (1980).
 *
 * Usage caveats inside this codebase:
 *   - Apply only to lowercase ASCII alphabetic tokens. Tokens containing CJK
 *     characters or digits must not be passed through the stemmer.
 *   - Inputs of length <= 2 are returned unchanged, so the module is safe to
 *     call standalone.
 */

const VOWELS = new Set(["a", "e", "i", "o", "u"]);

/**
 * True when `word[i]` is a consonant in Porter's sense: any letter other than
 * a/e/i/o/u, where `y` counts as a consonant only at the start of the word or
 * directly after a vowel.
 */
function isConsonant(word: string, i: number): boolean {
  const ch = word[i];
  if (VOWELS.has(ch)) return false;
  if (ch === "y") {
    if (i === 0) return true;
    return !isConsonant(word, i - 1);
  }
  return true;
}

/**
 * Measure `m` of a stem: the number of (vowel-sequence)(consonant-sequence)
 * groups, ignoring leading consonants and trailing vowels. This is the
 * standard Porter `[C](VC){m}[V]` count.
 */
function measure(stem: string): number {
  let m = 0;
  let i = 0;
  const n = stem.length;
  // Skip the optional leading consonant run.
  while (i < n && isConsonant(stem, i)) i++;
  while (i < n) {
    // At a vowel: consume the whole vowel run.
    while (i < n && !isConsonant(stem, i)) i++;
    // Trailing vowels do not complete a VC pair.
    if (i >= n) break;
    // At a consonant: one VC pair is complete.
    m++;
    while (i < n && isConsonant(stem, i)) i++;
  }
  return m;
}

/** True when the stem contains at least one vowel (Porter's `*v*`). */
function hasVowel(stem: string): boolean {
  for (let i = 0; i < stem.length; i++) {
    if (!isConsonant(stem, i)) return true;
  }
  return false;
}

/** True when the stem ends in the same consonant twice (Porter's `*d`). */
function endsWithDoubleConsonant(stem: string): boolean {
  const n = stem.length;
  if (n < 2) return false;
  if (stem[n - 1] !== stem[n - 2]) return false;
  return isConsonant(stem, n - 1);
}

/** CVC pattern at end where final C is not w/x/y (Porter's `*o`). Used by step1b/step5a. */
function endsCvc(stem: string): boolean {
  const n = stem.length;
  if (n < 3) return false;
  if (!isConsonant(stem, n - 1)) return false;
  if (isConsonant(stem, n - 2)) return false;
  if (!isConsonant(stem, n - 3)) return false;
  const last = stem[n - 1];
  return last !== "w" && last !== "x" && last !== "y";
}

/** Suffix test that never matches a suffix longer than the word. */
function endsWith(word: string, suffix: string): boolean {
  return word.length >= suffix.length && word.endsWith(suffix);
}

/** Replace a trailing `suffix` (assumed present) with `replacement`. */
function replaceSuffix(word: string, suffix: string, replacement: string): string {
  return word.slice(0, word.length - suffix.length) + replacement;
}

/** Step 1a: plurals (`sses -> ss`, `ies -> i`, `ss -> ss`, `s -> ""`). */
function step1a(word: string): string {
  if (endsWith(word, "sses")) return replaceSuffix(word, "sses", "ss");
  if (endsWith(word, "ies")) return replaceSuffix(word, "ies", "i");
  if (endsWith(word, "ss")) return word;
  if (endsWith(word, "s")) return word.slice(0, -1);
  return word;
}

/** Step 1b: past tense / -ing, including the post-strip clean-up rules. */
function step1b(word: string): string {
  if (endsWith(word, "eed")) {
    // (m>0) EED -> EE; an `eed` word never falls through to the `ed` rule.
    const stem = word.slice(0, -3);
    return measure(stem) > 0 ? stem + "ee" : word;
  }

  let stripped: string | null = null;
  if (endsWith(word, "ed")) {
    const candidate = word.slice(0, -2);
    if (hasVowel(candidate)) stripped = candidate;
  } else if (endsWith(word, "ing")) {
    const candidate = word.slice(0, -3);
    if (hasVowel(candidate)) stripped = candidate;
  }
  if (stripped === null) return word;

  // Post-processing after a successful -ed / -ing strip:
  // add `e` for at/bl/iz; collapse double consonants except l/s/z;
  // restore `e` when the stem is short (m = 1) and ends CVC.
  if (
    endsWith(stripped, "at") ||
    endsWith(stripped, "bl") ||
    endsWith(stripped, "iz")
  ) {
    return stripped + "e";
  }
  if (endsWithDoubleConsonant(stripped)) {
    const last = stripped[stripped.length - 1];
    if (last !== "l" && last !== "s" && last !== "z") {
      return stripped.slice(0, -1);
    }
    return stripped;
  }
  if (measure(stripped) === 1 && endsCvc(stripped)) {
    return stripped + "e";
  }
  return stripped;
}

/** Step 1c: y -> i when there is a vowel in the stem. */
function step1c(word: string): string {
  if (endsWith(word, "y") && word.length > 1 && hasVowel(word.slice(0, -1))) {
    return word.slice(0, -1) + "i";
  }
  return word;
}

// Step 2 rules, applied when the remaining stem has m > 0. Longer suffixes
// that contain a shorter one are listed first (e.g. `ization` before `ation`).
const STEP2_RULES: [string, string][] = [
  ["ational", "ate"],
  ["tional", "tion"],
  ["enci", "ence"],
  ["anci", "ance"],
  ["izer", "ize"],
  ["bli", "ble"],
  ["alli", "al"],
  ["entli", "ent"],
  ["eli", "e"],
  ["ousli", "ous"],
  ["ization", "ize"],
  ["ation", "ate"],
  ["ator", "ate"],
  ["alism", "al"],
  ["iveness", "ive"],
  ["fulness", "ful"],
  ["ousness", "ous"],
  ["aliti", "al"],
  ["iviti", "ive"],
  ["biliti", "ble"],
  ["logi", "log"],
];

// Step 3 rules, applied when the remaining stem has m > 0.
const STEP3_RULES: [string, string][] = [
  ["icate", "ic"],
  ["ative", ""],
  ["alize", "al"],
  ["iciti", "ic"],
  ["ical", "ic"],
  ["ful", ""],
  ["ness", ""],
];

// Step 4 suffixes, removed when the remaining stem has m > 1. `ion` carries
// an extra condition (see step4). `ement` / `ment` / `ent` are ordered
// longest-first so the longest match wins.
const STEP4_SUFFIXES = [
  "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment",
  "ent", "ion", "ou", "ism", "ate", "iti", "ous", "ive", "ize",
];

/**
 * Apply the first rule whose suffix matches. The replacement happens only if
 * the stem before the suffix has m > 0; otherwise the word is returned as-is
 * and no further rule is tried.
 */
function applyRules(word: string, rules: [string, string][]): string {
  for (const [suffix, replacement] of rules) {
    if (endsWith(word, suffix)) {
      const stem = word.slice(0, word.length - suffix.length);
      return measure(stem) > 0 ? stem + replacement : word;
    }
  }
  return word;
}

/**
 * Step 4: strip a residual suffix when the remaining stem has m > 1.
 *
 * Porter's rule for `ion` is `(m>1 and (*S or *T)) ION ->`: the `s` / `t`
 * is the LAST LETTER OF THE STEM that remains after removing `ion`
 * (`adoption -> adopt`, `discussion -> discuss`). Matching `sion` / `tion`
 * as the suffix and then inspecting the letter before it would leave
 * `adoption` unstemmed and turn `discussion` into `discus`.
 */
function step4(word: string): string {
  for (const suffix of STEP4_SUFFIXES) {
    if (!endsWith(word, suffix)) continue;
    const stem = word.slice(0, word.length - suffix.length);
    if (measure(stem) <= 1) return word;
    if (suffix === "ion") {
      const last = stem[stem.length - 1];
      return last === "s" || last === "t" ? stem : word;
    }
    return stem;
  }
  return word;
}

/** Step 5a: drop a final `e` when m > 1, or when m = 1 and the stem is not CVC. */
function step5a(word: string): string {
  if (!endsWith(word, "e")) return word;
  const stem = word.slice(0, -1);
  const m = measure(stem);
  if (m > 1) return stem;
  if (m === 1 && !endsCvc(stem)) return stem;
  return word;
}

/** Step 5b: collapse a final `ll` to `l` when m > 1. */
function step5b(word: string): string {
  if (
    measure(word) > 1 &&
    endsWithDoubleConsonant(word) &&
    word.endsWith("l")
  ) {
    return word.slice(0, -1);
  }
  return word;
}

/**
 * Stem a single English token.
 *
 * @param word Lowercase ASCII alphabetic token.
 * @returns The Porter stem. Tokens shorter than 3 characters are returned
 *   unchanged (Porter is unstable on very short words).
 */
export function porterStem(word: string): string {
  if (word.length <= 2) return word;
  let w = word;
  w = step1a(w);
  w = step1b(w);
  w = step1c(w);
  w = applyRules(w, STEP2_RULES);
  w = applyRules(w, STEP3_RULES);
  w = step4(w);
  w = step5a(w);
  w = step5b(w);
  return w;
}
// Shared word-granularity segmenter for the "ja" locale; created on first
// use and reused for every later call.
let jaSegmenter: Intl.Segmenter | null = null;

/**
 * Return the shared segmenter, building it on the first call.
 * Yields `null` when the runtime exposes no `Intl.Segmenter`.
 */
function getJaSegmenter(): Intl.Segmenter | null {
  if (jaSegmenter === null) {
    const supported =
      typeof Intl !== "undefined" && typeof Intl.Segmenter !== "undefined";
    if (!supported) return null;
    jaSegmenter = new Intl.Segmenter("ja", { granularity: "word" });
  }
  return jaSegmenter;
}

/**
 * Split a CJK run into its word-like segments, discarding segments the
 * segmenter does not flag as word-like (punctuation, whitespace).
 *
 * @param run Text to segment.
 * @returns The word-like segments in order, or `[run]` unchanged when
 *   `Intl.Segmenter` is unavailable.
 */
export function segmentJapanese(run: string): string[] {
  const segmenter = getJaSegmenter();
  if (segmenter === null) return [run];
  return Array.from(segmenter.segment(run))
    .filter((piece) => piece.isWordLike)
    .map((piece) => piece.segment);
}
+ // See https://github.com/chigichan24/crune/issues/29 + if (CJK_CHAR_RE.test(part)) { + let cursor = 0; + CJK_RUN_RE.lastIndex = 0; + let m: RegExpExecArray | null; + while ((m = CJK_RUN_RE.exec(part)) !== null) { + if (m.index > cursor) { + const nonCjk = part.slice(cursor, m.index); + for (const sub of splitCamelCase(nonCjk)) pushClean(sub, tokens); + } + for (const seg of segmentJapanese(m[0])) pushClean(seg, tokens); + cursor = m.index + m[0].length; + } + if (cursor < part.length) { + const tail = part.slice(cursor); + for (const sub of splitCamelCase(tail)) pushClean(sub, tokens); } + continue; } + + // Pure non-CJK part: keep the existing CamelCase splitting path. + for (const sub of splitCamelCase(part)) pushClean(sub, tokens); } } return tokens; } + +/** + * Push a token through the standard post-processing pipeline: + * lowercase, length filter, Porter stemming (non-CJK only), STOP_WORDS + * lookup, isNoiseToken check. + * + * The cleaning regex strips any leftover punctuation while preserving + * ASCII alphanumerics and the CJK ranges we care about. + * + * Length rule: + * - non-CJK tokens: length > 2 (matches the legacy English-side behavior; + * drops "to", "is", "am", etc.) + * - pure-CJK tokens: length >= 2 (Japanese compounds such as + * bunseki / jissou / shusei are 2-character kanji words and must survive). + * + * Stemming rule (Issue #30): + * - Apply stop-word filtering on the cleaned surface form first, then apply + * Porter stemming, then re-check stop words on the stemmed form. + * This drops natural surface forms such as `using` before stemming while + * still letting inflections like `running`/`runs` collapse into `run`. + * - Skip stemming for tokens containing CJK characters \u2014 those came from + * `Intl.Segmenter` and Porter would corrupt them. + * - Skip stemming for tokens containing digits (e.g. `abc123`, `v2`) since + * Porter is undefined on alphanumerics. 
/**
 * Normalise one raw token and append it to `sink` if it survives filtering.
 *
 * Pipeline, in order:
 *   1. Lowercase and strip every character outside `a-z`, `0-9` and
 *      U+3040–U+9FFF.
 *   2. Enforce the length floor: 2 for tokens containing a CJK character
 *      (per `CJK_CHAR_RE`), 3 otherwise.
 *   3. Reject the surface form if it is a stop word or a noise token. Doing
 *      this before stemming drops entries such as "was" / "using" by their
 *      natural spelling, before Porter can reduce them to fragments.
 *   4. Stem with Porter, but only when the token is purely `a-z`; CJK tokens
 *      and tokens containing digits are passed through untouched.
 *   5. Reject the stemmed form if it is a stop word, a noise token, or has
 *      fallen below the length floor (e.g. "ate" -> "at").
 *
 * @param token Raw token fragment (may still carry case and punctuation).
 * @param sink  Output array; receives at most one normalised token.
 */
function pushClean(token: string, sink: string[]): void {
  const surface = token.toLowerCase().replace(/[^a-z0-9\u3040-\u9fff]/g, "");

  const containsCjk = CJK_CHAR_RE.test(surface);
  const floor = containsCjk ? 2 : 3;
  // Also covers the empty string left after stripping punctuation.
  if (surface.length < floor) return;

  if (STOP_WORDS.has(surface) || isNoiseToken(surface)) return;

  const stemmable = !containsCjk && /^[a-z]+$/.test(surface);
  const stemmed = stemmable ? porterStem(surface) : surface;

  const rejected =
    STOP_WORDS.has(stemmed) ||
    isNoiseToken(stemmed) ||
    stemmed.length < floor;
  if (!rejected) sink.push(stemmed);
}