chigichan24 · chigichan24 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
diff --git a/scripts/__tests__/session-summarizer.test.ts b/scripts/__tests__/session-summarizer.test.ts
@@ -179,9 +179,11 @@ describe("keywords extraction", () => {
     );
     expect(result.keywords.length).toBeGreaterThan(0);
     expect(result.keywords.length).toBeLessThanOrEqual(5);
-    // "refactor" and "authentication" appear in all prompts, should be top keywords
+    // "refactor" and "authentication" appear in all prompts, should be top
+    // keywords. Issue #30 introduced Porter stemming, so `authentication`
+    // surfaces as the stem `authent`.
     expect(result.keywords).toContain("refactor");
-    expect(result.keywords).toContain("authentication");
+    expect(result.keywords).toContain("authent");
     // Stop words should not appear
     for (const kw of result.keywords) {
       expect(kw.trim().length).toBeGreaterThan(0);

diff --git a/scripts/__tests__/tokenizer.test.ts b/scripts/__tests__/tokenizer.test.ts
@@ -91,6 +91,30 @@ describe("isNoiseToken", () => {
   it("returns false for short hex-like strings (<6 chars)", () => {
     expect(isNoiseToken("abc")).toBe(false);
   });
+
+  // Issue #30: HEX_PATTERN now requires at least one digit. Every sample is
+  // 6+ letters from a-f only, so the old digit-less pattern flagged it.
+  it("returns false for English words built from a-f without digits", () => {
+    expect(isNoiseToken("decade")).toBe(false);
+    expect(isNoiseToken("facade")).toBe(false);
+    expect(isNoiseToken("efface")).toBe(false); // replaces "effect": its `t` is outside a-f
+    expect(isNoiseToken("defaced")).toBe(false);
+  });
+
+  it("returns false for 6+ char hex-only words with no digits (Issue #30)", () => {
+    // `cafebabe` is a well-known magic number, but a standalone token made
+    // only of the letters a-f cannot be told apart from an ordinary word.
+    // We err on the side of keeping it; sessions almost never mention it
+    // anyway. The previous `^[0-9a-f]{6,}$` would have flagged this; the
+    // digit-requiring pattern does not.
+    expect(isNoiseToken("cafebabe")).toBe(false);
+  });
+
+  it("returns true for hex strings that contain a digit (Issue #30)", () => {
+    expect(isNoiseToken("abc123def")).toBe(true); // 9 hex chars, digits in the middle
+    expect(isNoiseToken("0xabc123")).toBe(false); // 'x' is not a hex digit, so this is not pure hex
+    expect(isNoiseToken("deadbeef0")).toBe(true); // a single digit is enough
+  });
 });
 
 describe("tokenize", () => {
@@ -128,14 +152,51 @@ describe("tokenize", () => {
 
   it("handles kebab-case and snake_case", () => {
     const tokens = tokenize("my-component some_variable");
-    expect(tokens).toContain("component");
+    // After Issue #30 the non-CJK pipeline stems the surviving tokens with
+    // Porter, so `component`/`variable` collapse to their stems.
+    expect(tokens).toContain("compon");
+    expect(tokens).toContain("variabl");
     // "some" is a stop word, so it's excluded
     expect(tokens).not.toContain("some");
-    expect(tokens).toContain("variable");
   });
 
-  it("handles Japanese text without crashing", () => {
-    expect(() => tokenize("セッションの分析を実行する")).not.toThrow();
+  it("segments Japanese text into meaningful word-ish units (Issue #29)", () => {
+    // Before this fix, the entire Japanese run collapsed into one token
+    // because splitting on `\s+` finds no boundary in unspaced text. With
+    // Intl.Segmenter, the katakana word セッション and kanji compounds like
+    // 分析 / 実行 surface as individual tokens.
+    const tokens = tokenize("セッションの分析を実行する");
+    expect(tokens).toContain("セッション");
+    expect(tokens).toContain("分析");
+    expect(tokens).toContain("実行");
+    // The whole sentence should NOT survive as one giant token.
+    expect(tokens).not.toContain("セッションの分析を実行する");
+  });
+
+  it("segments mixed Japanese / English text on both sides", () => {
+    const tokens = tokenize("TypeScriptの型エラーを修正");
+    // English side: CamelCase is split and lowercased
+    expect(tokens).toContain("type");
+    expect(tokens).toContain("script");
+    // Japanese side: katakana loanword and 2-char kanji compound preserved
+    expect(tokens).toContain("エラー");
+    expect(tokens).toContain("修正");
+  });
+
+  it("does not collapse a long Japanese paragraph into a single oversized token (Issue #29 regression)", () => {
+    // Reproduces the exact bug from #29: the issue's example string used
+    // to land in the vocabulary as one token. After the fix every token
+    // should be word-sized, not paragraph-sized.
+    const text =
+      "セッションの分析を実行する。次にテストを書く。" +
+      "実装が完了したらリファクタリングを行い、最後にコードレビューを依頼する。";
+    const tokens = tokenize(text);
+    expect(tokens.length).toBeGreaterThan(3);
+    for (const t of tokens) {
+      expect(t.length).toBeLessThanOrEqual(20); // the paragraph itself is far longer than 20 chars
+    }
+    // The whole paragraph must not survive as a single token.
+    expect(tokens).not.toContain(text);
   });
 
   it("filters out noise tokens", () => {
@@ -154,6 +215,71 @@ describe("tokenize", () => {
     expect(tokens).not.toContain("to");
     expect(tokens).not.toContain("go");
   });
+
+  // Issue #30: stemming collapses inflected forms.
+  it("stems inflected forms to a shared stem", () => {
+    const tokens = tokenize("running runs ran");
+    // Porter maps `runs` -> `run` (step 1a, plural -s) and `running` -> `run`
+    // (step 1b, -ing plus consonant undoubling). `ran` is irregular; Porter
+    // is rule-based, not lexicon-based, so we accept that it is not merged.
+    expect(tokens).toContain("run");
+    expect(tokens.filter((t) => t === "run").length).toBeGreaterThanOrEqual(2);
+    // The unstemmed surface forms must NOT be in the output.
+    expect(tokens).not.toContain("running");
+    expect(tokens).not.toContain("runs");
+  });
+
+  it("collapses other inflected pairs (test/tested/testing, walk/walked/walking)", () => {
+    const tested = tokenize("tested testing");
+    expect(tested.length).toBe(2); // guards the .every() below against an empty array
+    expect(tested.every((t) => t === "test")).toBe(true);
+    // Use `walk` rather than `fix` — `fix` is a project-specific stop word
+    // (constants.ts), so `tokenize("fixed fixing")` would return [] and the
+    // `.every()` assertion would pass vacuously, hiding any real regression.
+    const walked = tokenize("walked walking");
+    expect(walked.length).toBe(2); // same guard: both forms must survive filtering
+    expect(walked.every((t) => t === "walk")).toBe(true);
+  });
+
+  // Issue #30: words built only from a-f used to be filtered as hex noise.
+  it("keeps English words built from a-f as real tokens", () => {
+    const tokens = tokenize("decade facade effect");
+    // Porter stems `decade` -> `decad` and `facade` -> `facad`; both are pure
+    // a-f and must survive the noise filter. `effect` contains `t`, so it was
+    // never hex-like — it only serves as a control that stays unchanged.
+    expect(tokens).toContain("decad");
+    expect(tokens).toContain("facad");
+    expect(tokens).toContain("effect");
+  });
+
+  // Issue #30: the expanded NLTK-parity stop-word list drops several common
+  // filler words that the old hand-rolled list let through.
+  it("drops newly added NLTK-parity stop words", () => {
+    const tokens = tokenize(
+      "really back even ever say seem tell yeah right thing the test"
+    );
+    // Sanity: substantive vocab survives.
+    expect(tokens).toContain("test");
+    // The new entries must be filtered. `actually` is intentionally NOT
+    // listed: it is not in STOP_WORDS and its surface form vanishes only
+    // because Porter stems it to `actual`, so the check would be vacuous.
+    // NOTE(review): `really` probably stems to `realli`, which would make its
+    // check vacuous too — confirm. `back` was already a stop word before #30.
+    for (const w of [
+      "really",
+      "back",
+      "even",
+      "ever",
+      "say",
+      "seem",
+      "tell",
+      "yeah",
+      "right",
+      "thing",
+    ]) {
+      expect(tokens).not.toContain(w);
+    }
+  });
 });
 
 describe("tokenize - large input regression (Issue #18)", () => {

diff --git a/scripts/knowledge-graph/constants.ts b/scripts/knowledge-graph/constants.ts
@@ -5,24 +5,36 @@
 // ─── Stop words ─────────────────────────────────────────────────────────────
 
 export const STOP_WORDS = new Set([
-  // English
-  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
-  "have", "has", "had", "do", "does", "did", "will", "would", "could",
-  "should", "may", "might", "shall", "can", "need", "must", "ought",
-  "i", "you", "he", "she", "it", "we", "they", "me", "him", "her",
-  "us", "them", "my", "your", "his", "its", "our", "their", "mine",
-  "yours", "hers", "ours", "theirs", "this", "that", "these", "those",
-  "what", "which", "who", "whom", "whose", "when", "where", "why", "how",
-  "all", "each", "every", "both", "few", "more", "most", "other", "some",
-  "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
-  "very", "just", "because", "as", "until", "while", "of", "at", "by",
-  "for", "with", "about", "against", "between", "through", "during",
-  "before", "after", "above", "below", "to", "from", "up", "down", "in",
-  "out", "on", "off", "over", "under", "again", "further", "then", "once",
-  "here", "there", "and", "but", "or", "if", "else", "also", "like",
-  "please", "thanks", "thank", "yes", "no", "ok", "okay", "sure", "let",
-  "make", "use", "using", "used", "want", "see", "look", "try", "get",
-  "got", "think", "know", "now", "new", "way", "well", "back", "still",
+  // English — roughly NLTK English stop word parity (~180 entries), plus
+  // conversational filler NLTK lacks ("yeah", "really", "okay") and a small
+  // project-specific set ("file", "code", "change") with no signal here.
+  // Issue #30: expanded from ~150 entries to ~200 to keep IDF weights tight
+  // on substantive vocabulary.
+  "a", "about", "above", "after", "again", "against", "ain", "all", "also",
+  "am", "an", "and", "any", "are", "aren", "as", "at", "back", "be",
+  "because", "been", "before", "being", "below", "between", "both", "but",
+  "by", "can", "could", "couldn", "did", "didn", "do", "does", "doesn",
+  "doing", "don", "down", "during", "each", "else", "even", "ever", "every",
+  "few", "for", "from", "further", "get", "give", "go", "going", "got",
+  "had", "hadn", "has", "hasn", "have", "haven", "having", "he", "her",
+  "here", "hers", "herself", "him", "himself", "his", "how", "i", "if",
+  "in", "into", "is", "isn", "it", "its", "itself", "just", "know", "let",
+  "like", "look", "ma", "make", "many", "may", "me", "might", "mightn",
+  "mine", "more", "most", "much", "must", "mustn", "my", "myself", "need",
+  "needn", "new", "no", "nor", "not", "now", "of", "off", "ok", "okay",
+  "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves",
+  "out", "over", "own", "please", "really", "right", "s", "same", "say",
+  "see", "seem", "seen", "shall", "shan", "she", "should", "shouldn", "so",
+  "some", "still", "such", "sure", "t", "tell", "than", "thank", "thanks",
+  "that", "the", "their", "theirs", "them", "themselves", "then", "there",
+  "these", "they", "thing", "think", "this", "those", "through", "to",
+  "too", "try", "under", "until", "up", "us", "use", "used", "using",
+  "very", "want", "was", "wasn", "way", "we", "well", "were", "weren",
+  "what", "when", "where", "which", "while", "who", "whom", "whose", "why",
+  "will", "with", "won", "would", "wouldn", "y", "yeah", "yes", "you",
+  "your", "yours", "yourself", "yourselves",
+  // Project-specific filler tokens (not part of NLTK). They appear in nearly
+  // every Claude Code session and dilute the TF-IDF signal.
   "file", "code", "change", "changes", "add", "update", "fix", "set",
   // Japanese particles and common words
   "の", "に", "は", "を", "が", "で", "と", "も", "か", "な", "だ",
@@ -31,12 +43,22 @@ export const STOP_WORDS = new Set([
   "こと", "もの", "ため", "よう", "から", "まで", "より", "ほど",
   "など", "ので", "けど", "でも", "しかし", "また", "そして",
   "って", "という", "ください", "お願い", "確認",
+  // Verb conjugation fragments and connective auxiliaries that
+  // `Intl.Segmenter('ja')` emits as standalone segments. These carry no
+  // standalone signal for TF-IDF / clustering. Content-bearing stems such
+  // as 行う / 書く / 修正 are intentionally NOT listed here.
+  // See https://github.com/chigichan24/crune/issues/29
+  "ている", "てい", "しない", "次に", "に従って",
 ]);
 
 // ─── Noise token patterns ───────────────────────────────────────────────────
 
 export const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
-export const HEX_PATTERN = /^[0-9a-f]{6,}$/i;
+// Hex literal must contain at least one digit. Without the lookahead this
+// regex would also match plain English words built only from a-f (e.g.
+// "decade", "facade", "defaced") and incorrectly drop them as
+// noise. See https://github.com/chigichan24/crune/issues/30.
+export const HEX_PATTERN = /^(?=[0-9a-f]*[0-9])[0-9a-f]{6,}$/i;
 export const NUM_PATTERN = /^\d+$/;
 
 // ─── Structural features ────────────────────────────────────────────────────