diff --git a/scripts/__tests__/session-summarizer.test.ts b/scripts/__tests__/session-summarizer.test.ts
index d53d46a..3a6117e 100644
--- a/scripts/__tests__/session-summarizer.test.ts
+++ b/scripts/__tests__/session-summarizer.test.ts
@@ -179,9 +179,11 @@ describe("keywords extraction", () => {
     );
     expect(result.keywords.length).toBeGreaterThan(0);
     expect(result.keywords.length).toBeLessThanOrEqual(5);
-    // "refactor" and "authentication" appear in all prompts, should be top keywords
+    // "refactor" and "authentication" appear in all prompts, should be top
+    // keywords. Issue #30 introduced Porter stemming, so `authentication`
+    // surfaces as the stem `authent`.
     expect(result.keywords).toContain("refactor");
-    expect(result.keywords).toContain("authentication");
+    expect(result.keywords).toContain("authent");
     // Stop words should not appear
     for (const kw of result.keywords) {
       expect(kw.trim().length).toBeGreaterThan(0);
diff --git a/scripts/__tests__/tokenizer.test.ts b/scripts/__tests__/tokenizer.test.ts
index 78e1894..7ba451e 100644
--- a/scripts/__tests__/tokenizer.test.ts
+++ b/scripts/__tests__/tokenizer.test.ts
@@ -91,6 +91,30 @@ describe("isNoiseToken", () => {
   it("returns false for short hex-like strings (<6 chars)", () => {
     expect(isNoiseToken("abc")).toBe(false);
   });
+
+  // Issue #30: HEX_PATTERN now requires at least one digit. Every word below
+  // is 6+ letters drawn only from a-f, so the old pattern flagged all of them.
+  it("returns false for English words built from a-f without digits", () => {
+    expect(isNoiseToken("decade")).toBe(false);
+    expect(isNoiseToken("facade")).toBe(false);
+    expect(isNoiseToken("efface")).toBe(false);
+    expect(isNoiseToken("defaced")).toBe(false);
+  });
+
+  it("returns false for 6+ char hex-only words with no digits (Issue #30)", () => {
+    // `cafebabe` is a famous magic number, but as a standalone token without
+    // a digit there is no way to distinguish it from a regular English word.
+    // We err on the side of keeping it; sessions almost never refer to it
+    // anyway. The previous `^[0-9a-f]{6,}$` would have flagged this; the
+    // updated pattern does not.
+    expect(isNoiseToken("cafebabe")).toBe(false);
+  });
+
+  it("returns true for hex strings that contain a digit (Issue #30)", () => {
+    expect(isNoiseToken("abc123def")).toBe(true);
+    expect(isNoiseToken("0xabc123")).toBe(false); // contains 'x', not pure hex
+    expect(isNoiseToken("deadbeef0")).toBe(true);
+  });
 });
 
 describe("tokenize", () => {
@@ -128,14 +152,51 @@ describe("tokenize", () => {
 
   it("handles kebab-case and snake_case", () => {
     const tokens = tokenize("my-component some_variable");
-    expect(tokens).toContain("component");
+    // After Issue #30 the non-CJK pipeline stems the surviving tokens with
+    // Porter, so `component`/`variable` collapse to their stems.
+    expect(tokens).toContain("compon");
+    expect(tokens).toContain("variabl");
     // "some" is a stop word, so it's excluded
     expect(tokens).not.toContain("some");
-    expect(tokens).toContain("variable");
   });
 
-  it("handles Japanese text without crashing", () => {
-    expect(() => tokenize("セッションの分析を実行する")).not.toThrow();
+  it("segments Japanese text into meaningful word-ish units (Issue #29)", () => {
+    // Before this fix, the entire Japanese run collapsed into one token
+    // because `\s+` cannot split text without whitespace. With
+    // Intl.Segmenter, common kanji compounds like 分析 / 実行 surface as
+    // individual tokens.
+    const tokens = tokenize("セッションの分析を実行する");
+    expect(tokens).toContain("セッション");
+    expect(tokens).toContain("分析");
+    expect(tokens).toContain("実行");
+    // The whole sentence should NOT survive as one giant token.
+    expect(tokens).not.toContain("セッションの分析を実行する");
+  });
+
+  it("segments mixed Japanese / English text on both sides", () => {
+    const tokens = tokenize("TypeScriptの型エラーを修正");
+    // English side: lowercased CamelCase split
+    expect(tokens).toContain("type");
+    expect(tokens).toContain("script");
+    // Japanese side: the katakana word and the 2-char kanji compound survive
+    expect(tokens).toContain("エラー");
+    expect(tokens).toContain("修正");
+  });
+
+  it("does not collapse a long Japanese paragraph into a single oversized token (Issue #29 regression)", () => {
+    // Reproduces the exact bug from #29: the issue's example string used
+    // to land in the vocabulary as one token. After the fix every token
+    // should be word-sized, not paragraph-sized.
+    const text =
+      "セッションの分析を実行する。次にテストを書く。" +
+      "実装が完了したらリファクタリングを行い、最後にコードレビューを依頼する。";
+    const tokens = tokenize(text);
+    expect(tokens.length).toBeGreaterThan(3);
+    for (const t of tokens) {
+      expect(t.length).toBeLessThanOrEqual(20);
+    }
+    // The whole paragraph must not survive as a single token.
+    expect(tokens).not.toContain(text);
   });
 
   it("filters out noise tokens", () => {
@@ -154,6 +215,71 @@ describe("tokenize", () => {
     expect(tokens).not.toContain("to");
     expect(tokens).not.toContain("go");
   });
+
+  // Issue #30: stemming collapses inflected forms.
+  it("stems inflected forms to a shared stem", () => {
+    const tokens = tokenize("running runs ran");
+    // `running` and `runs` both reduce to `run` under Porter step1a/1b.
+    // `ran` is irregular and Porter does not handle it; we accept that
+    // limitation (Porter is rule-based, not lexicon-based).
+    expect(tokens).toContain("run");
+    expect(tokens.filter((t) => t === "run").length).toBeGreaterThanOrEqual(2);
+    // The unstemmed surface forms must NOT be in the output.
+    expect(tokens).not.toContain("running");
+    expect(tokens).not.toContain("runs");
+  });
+
+  it("collapses other inflected pairs (test/tested/testing, walk/walked/walking)", () => {
+    const tested = tokenize("tested testing");
+    expect(tested.length).toBe(2);
+    expect(tested.every((t) => t === "test")).toBe(true);
+    // Use `walk` rather than `fix` — `fix` is a project-specific stop word
+    // (constants.ts), so `tokenize("fixed fixing")` would return [] and the
+    // `.every()` assertion would pass vacuously, hiding any real regression.
+    const walked = tokenize("walked walking");
+    expect(walked.length).toBe(2);
+    expect(walked.every((t) => t === "walk")).toBe(true);
+  });
+
+  // Issue #30: words built only from a-f used to be filtered as hex noise.
+  it("keeps English words built from a-f as real tokens", () => {
+    const tokens = tokenize("decade facade efface");
+    // After Porter stemming `decade` -> `decad`, `facade` -> `facad`,
+    // `efface` -> `effac`. All three inputs matched the old hex pattern, so
+    // the point of this test is that NONE of them hit the noise filter.
+    expect(tokens).toContain("decad");
+    expect(tokens).toContain("facad");
+    expect(tokens).toContain("effac");
+  });
+
+  // Issue #30: the expanded NLTK-parity stop-word list drops several common
+  // filler words that the old hand-rolled list let through.
+  it("drops newly added NLTK-parity stop words", () => {
+    const tokens = tokenize(
+      "really back even ever say seem tell yeah right thing the test"
+    );
+    // Sanity: substantive vocab survives.
+    expect(tokens).toContain("test");
+    // The new entries must be filtered. Note: `actually` is intentionally NOT
+    // listed — it is not a member of STOP_WORDS. The literal surface form is
+    // absent from the output only because Porter stems it to `actual`.
+    // Asserting `not.toContain("actually")` would therefore pass even without
+    // the new stop-word list, hiding regressions.
+    for (const w of [
+      "really",
+      "back",
+      "even",
+      "ever",
+      "say",
+      "seem",
+      "tell",
+      "yeah",
+      "right",
+      "thing",
+    ]) {
+      expect(tokens).not.toContain(w);
+    }
+  });
 });
 
 describe("tokenize - large input regression (Issue #18)", () => {
diff --git a/scripts/knowledge-graph/constants.ts b/scripts/knowledge-graph/constants.ts
index 04f31e9..ac6a8f0 100644
--- a/scripts/knowledge-graph/constants.ts
+++ b/scripts/knowledge-graph/constants.ts
@@ -5,24 +5,36 @@
 // ─── Stop words ─────────────────────────────────────────────────────────────
 
 export const STOP_WORDS = new Set([
-  // English
-  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
-  "have", "has", "had", "do", "does", "did", "will", "would", "could",
-  "should", "may", "might", "shall", "can", "need", "must", "ought",
-  "i", "you", "he", "she", "it", "we", "they", "me", "him", "her",
-  "us", "them", "my", "your", "his", "its", "our", "their", "mine",
-  "yours", "hers", "ours", "theirs", "this", "that", "these", "those",
-  "what", "which", "who", "whom", "whose", "when", "where", "why", "how",
-  "all", "each", "every", "both", "few", "more", "most", "other", "some",
-  "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
-  "very", "just", "because", "as", "until", "while", "of", "at", "by",
-  "for", "with", "about", "against", "between", "through", "during",
-  "before", "after", "above", "below", "to", "from", "up", "down", "in",
-  "out", "on", "off", "over", "under", "again", "further", "then", "once",
-  "here", "there", "and", "but", "or", "if", "else", "also", "like",
-  "please", "thanks", "thank", "yes", "no", "ok", "okay", "sure", "let",
-  "make", "use", "using", "used", "want", "see", "look", "try", "get",
-  "got", "think", "know", "now", "new", "way", "well", "back", "still",
+  // English — roughly NLTK English stop word parity (~180 entries) plus a
+  // small project-specific filler set ("file", "code", "change") that NLTK
+  // does not include but which carry no signal in this corpus.
+  // Issue #30: expanded from ~100 entries to NLTK parity to keep IDF weights
+  // tight on substantive vocabulary.
+  "a", "about", "above", "after", "again", "against", "ain", "all", "also",
+  "am", "an", "and", "any", "are", "aren", "as", "at", "back", "be",
+  "because", "been", "before", "being", "below", "between", "both", "but",
+  "by", "can", "could", "couldn", "did", "didn", "do", "does", "doesn",
+  "doing", "don", "down", "during", "each", "else", "even", "ever", "every",
+  "few", "for", "from", "further", "get", "give", "go", "going", "got",
+  "had", "hadn", "has", "hasn", "have", "haven", "having", "he", "her",
+  "here", "hers", "herself", "him", "himself", "his", "how", "i", "if",
+  "in", "into", "is", "isn", "it", "its", "itself", "just", "know", "let",
+  "like", "look", "ma", "make", "many", "may", "me", "might", "mightn",
+  "mine", "more", "most", "much", "must", "mustn", "my", "myself", "need",
+  "needn", "new", "no", "nor", "not", "now", "of", "off", "ok", "okay",
+  "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves",
+  "out", "over", "own", "please", "really", "right", "s", "same", "say",
+  "see", "seem", "seen", "shall", "shan", "she", "should", "shouldn", "so",
+  "some", "still", "such", "sure", "t", "tell", "than", "thank", "thanks",
+  "that", "the", "their", "theirs", "them", "themselves", "then", "there",
+  "these", "they", "thing", "think", "this", "those", "through", "to",
+  "too", "try", "under", "until", "up", "us", "use", "used", "using",
+  "very", "want", "was", "wasn", "way", "we", "well", "were", "weren",
+  "what", "when", "where", "which", "while", "who", "whom", "whose", "why",
+  "will", "with", "won", "would", "wouldn", "y", "yeah", "yes", "you",
+  "your", "yours", "yourself", "yourselves",
+  // Project-specific filler tokens (not part of NLTK). They appear in nearly
+  // every Claude Code session and dilute the TF-IDF signal.
   "file", "code", "change", "changes", "add", "update", "fix", "set",
   // Japanese particles and common words
   "の", "に", "は", "を", "が", "で", "と", "も", "か", "な", "だ",
@@ -31,12 +43,22 @@ export const STOP_WORDS = new Set([
   "こと", "もの", "ため", "よう", "から", "まで", "より", "ほど",
   "など", "ので", "けど", "でも", "しかし", "また", "そして",
   "って", "という", "ください", "お願い", "確認",
+  // Verb conjugation fragments and connective auxiliaries that
+  // `Intl.Segmenter('ja')` emits as standalone segments. These carry no
+  // standalone signal for TF-IDF / clustering. Content-bearing stems such
+  // as 行う / 書く / 修正 are intentionally NOT listed here.
+  // See https://github.com/chigichan24/crune/issues/29
+  "ている", "てい", "しない", "次に", "に従って",
 ]);
 
 // ─── Noise token patterns ───────────────────────────────────────────────────
 
 export const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
-export const HEX_PATTERN = /^[0-9a-f]{6,}$/i;
+// Hex literal must contain at least one digit. Without the lookahead this
+// regex would also match plain English words built only from a-f (e.g.
+// "decade", "facade", "efface", "defaced") and incorrectly drop them as
+// noise. See https://github.com/chigichan24/crune/issues/30.
+export const HEX_PATTERN = /^(?=[0-9a-f]*[0-9])[0-9a-f]{6,}$/i;
 export const NUM_PATTERN = /^\d+$/;
 
 // ─── Structural features ────────────────────────────────────────────────────
diff --git a/scripts/knowledge-graph/porter-stemmer.ts b/scripts/knowledge-graph/porter-stemmer.ts
new file mode 100644
index 0000000..92a7e9c
--- /dev/null
+++ b/scripts/knowledge-graph/porter-stemmer.ts
@@ -0,0 +1,258 @@
+/**
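+ * Porter stemmer (Porter 1980, with the two Step 2 revisions from Porter's
+ * later reference implementation: `bli` -> `ble` and the added `logi` -> `log`).
+ * Inline implementation, roughly 250 lines with comments. We deliberately avoid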
+ * pulling in `natural` / `stemmer` to keep the dependency footprint flat
+ * (the data pipeline is shipped as a CLI binary; transitive deps cost startup
+ * time and lockfile noise).
+ *
+ * Reference: M.F. Porter, "An algorithm for suffix stripping",
+ * Program 14(3), 130-137 (1980).
+ *
+ * Usage caveats inside this codebase:
+ *  - Apply only to non-CJK tokens. Tokens containing CJK characters come from
+ *    `Intl.Segmenter` and stemming would corrupt them.
+ *  - The tokenizer already drops tokens of length <= 2, so very short inputs
+ *    like `"by"` never reach this function. We still guard with `length > 2`
+ *    here so the module is safe to call standalone.
+ */
+
+const VOWELS = new Set(["a", "e", "i", "o", "u"]);
+
+/**
+ * Porter consonant test for `word[i]`: `a e i o u` are vowels, and `y` is a
+ * vowel only when the letter before it is a consonant.
+ */
+function isConsonant(word: string, i: number): boolean {
+  const letter = word[i];
+  if (letter !== "y") return !VOWELS.has(letter);
+  return i === 0 || !isConsonant(word, i - 1);
+}
+
+/**
+ * Porter measure `m` of a stem, i.e. the `m` in the form `[C](VC){m}[V]`,
+ * where `C` / `V` are maximal runs of consonants / vowels.
+ *
+ * Each `VC` group ends exactly where a consonant directly follows a vowel,
+ * so counting those vowel-to-consonant boundaries yields `m`. Leading
+ * consonants and trailing vowels never create such a boundary.
+ *
+ * Examples: `tree` -> 0, `trouble` -> 1, `troubles` -> 2.
+ */
+function measure(stem: string): number {
+  let m = 0;
+  let afterVowel = false;
+  for (let pos = 0; pos < stem.length; pos++) {
+    const consonant = isConsonant(stem, pos);
+    // A consonant right after a vowel closes one VC group.
+    if (consonant && afterVowel) m++;
+    afterVowel = !consonant;
+  }
+  return m;
+}
+
+/** Porter `*v*` condition: true when any position of `stem` is a vowel. */
+function hasVowel(stem: string): boolean {
+  let pos = 0;
+  while (pos < stem.length && isConsonant(stem, pos)) pos++;
+  return pos < stem.length;
+}
+
+/** Porter `*d` condition: the stem ends in a doubled consonant (`-tt`, `-ss`). */
+function endsWithDoubleConsonant(stem: string): boolean {
+  const last = stem.length - 1;
+  const doubled = last >= 1 && stem[last] === stem[last - 1];
+  return doubled && isConsonant(stem, last);
+}
+
+/**
+ * Porter `*o` condition: the stem ends consonant-vowel-consonant and the
+ * final consonant is not `w`, `x` or `y`. Used by step 1b and step 5a.
+ */
+function endsCvc(stem: string): boolean {
+  const end = stem.length - 1;
+  // Too short for CVC, or a final `w` / `x` / `y`, which the rule excludes.
+  if (end < 2 || "wxy".includes(stem.charAt(end))) return false;
+  const vowelInMiddle = !isConsonant(stem, end - 1);
+  return vowelInMiddle && isConsonant(stem, end) && isConsonant(stem, end - 2);
+}
+
+function endsWith(word: string, suffix: string): boolean {
+  return word.length >= suffix.length && word.endsWith(suffix); // a longer suffix never matches
+}
+
+function replaceSuffix(word: string, suffix: string, replacement: string): string {
+  return word.slice(0, word.length - suffix.length) + replacement; // suffix is not re-checked here
+}
+
+/** Step 1a: plural endings (`sses` -> `ss`, `ies` -> `i`, `ss` kept, `s` dropped). */
+function step1a(word: string): string {
+  if (!endsWith(word, "s")) return word;
+  // `sses` -> `ss` and `ies` -> `i` both amount to dropping two letters.
+  if (endsWith(word, "sses") || endsWith(word, "ies")) return word.slice(0, -2);
+  // A lone trailing `s` goes; a trailing `ss` is left alone.
+  return endsWith(word, "ss") ? word : word.slice(0, -1);
+}
+
+/** Step 1b: strip -eed / -ed / -ing, then repair the stem that is left. */
+function step1b(word: string): string {
+  if (endsWith(word, "eed")) {
+    const stem = word.slice(0, -3);
+    if (measure(stem) > 0) return stem + "ee"; // (m>0) EED -> EE
+    return word; // -eed matched but m == 0: stop here, -ed is not tried
+  }
+  let stem: string | null = null; // stays null when nothing is stripped
+  let stripped = word;
+  if (endsWith(word, "ed")) {
+    const candidate = word.slice(0, -2);
+    if (hasVowel(candidate)) { // (*v*) ED ->
+      stem = candidate;
+      stripped = candidate;
+    }
+  } else if (endsWith(word, "ing")) {
+    const candidate = word.slice(0, -3);
+    if (hasVowel(candidate)) { // (*v*) ING ->
+      stem = candidate;
+      stripped = candidate;
+    }
+  }
+  if (stem === null) return word;
+  // Repair: at/bl/iz regain `e`; a doubled final consonant other than l/s/z
+  // is halved; a stem with m == 1 that ends CVC (see endsCvc) regains `e`.
+  if (
+    endsWith(stripped, "at") ||
+    endsWith(stripped, "bl") ||
+    endsWith(stripped, "iz")
+  ) {
+    return stripped + "e";
+  }
+  if (endsWithDoubleConsonant(stripped)) {
+    const last = stripped[stripped.length - 1];
+    if (last !== "l" && last !== "s" && last !== "z") {
+      return stripped.slice(0, -1);
+    }
+    return stripped;
+  }
+  if (measure(stripped) === 1 && endsCvc(stripped)) {
+    return stripped + "e";
+  }
+  return stripped;
+}
+
+/** Step 1c: a final `y` becomes `i` when the rest of the word has a vowel. */
+function step1c(word: string): string {
+  if (word.length <= 1 || !endsWith(word, "y")) return word;
+  const head = word.slice(0, -1);
+  // Words such as `sky` have no vowel before the `y` and stay as they are.
+  return hasVowel(head) ? head + "i" : word;
+}
+
+const STEP2_RULES: [string, string][] = [ // [suffix, replacement] pairs, tried in order by applyRules
+  ["ational", "ate"], // listed before "tional", which it also ends with
+  ["tional", "tion"],
+  ["enci", "ence"],
+  ["anci", "ance"],
+  ["izer", "ize"],
+  ["bli", "ble"], // revised rule; the 1980 paper has "abli" -> "able"
+  ["alli", "al"],
+  ["entli", "ent"],
+  ["eli", "e"],
+  ["ousli", "ous"],
+  ["ization", "ize"], // listed before "ation", which it also ends with
+  ["ation", "ate"],
+  ["ator", "ate"],
+  ["alism", "al"],
+  ["iveness", "ive"],
+  ["fulness", "ful"],
+  ["ousness", "ous"],
+  ["aliti", "al"],
+  ["iviti", "ive"],
+  ["biliti", "ble"],
+  ["logi", "log"], // later addition, not in the 1980 paper
+];
+
+const STEP3_RULES: [string, string][] = [ // same pair shape; "" deletes the suffix
+  ["icate", "ic"],
+  ["ative", ""],
+  ["alize", "al"],
+  ["iciti", "ic"],
+  ["ical", "ic"],
+  ["ful", ""],
+  ["ness", ""],
+];
+
+const STEP4_SUFFIXES = [ // scanned in order by step4; "ement" / "ment" precede "ent"
+  "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment",
+  "ent", "sion", "tion", "ou", "ism", "ate", "iti", "ous", "ive", "ize", // "sion" / "tion" get extra handling in step4
+];
+
+/**
+ * Apply the first rule in `rules` whose suffix ends `word`. The suffix is
+ * swapped for its replacement only when the remaining stem has measure > 0.
+ */
+function applyRules(word: string, rules: [string, string][]): string {
+  const hit = rules.find(([suffix]) => endsWith(word, suffix));
+  if (hit === undefined) return word;
+  const [suffix, replacement] = hit;
+  const stem = word.slice(0, word.length - suffix.length);
+  // Whatever the measure test says, no further rule is tried.
+  return measure(stem) > 0 ? stem + replacement : word;
+}
+
+/**
+ * Step 4: strip one residual suffix when the remaining stem has measure > 1.
+ * The first entry of STEP4_SUFFIXES that ends `word` decides the outcome.
+ */
+function step4(word: string): string {
+  for (const suffix of STEP4_SUFFIXES) {
+    if (endsWith(word, suffix)) {
+      // Porter's rule is `(m>1 and (*S or *T)) ION ->`: only `ion` goes and
+      // the `s` / `t` stays in the stem (`adoption` -> `adopt`). The table
+      // spells that rule as `sion` / `tion`, so cut just three characters.
+      const isIon = suffix === "sion" || suffix === "tion";
+      const stem = word.slice(0, word.length - (isIon ? 3 : suffix.length));
+      // A failed measure test ends the step; no shorter suffix is retried.
+      return measure(stem) > 1 ? stem : word;
+    }
+  }
+  return word;
+}
+
+/** Step 5a: drop a final `e` when m > 1, or when m == 1 and the stem is not CVC. */
+function step5a(word: string): string {
+  if (!endsWith(word, "e")) return word;
+  const stem = word.slice(0, -1);
+  const m = measure(stem);
+  const removable = m > 1 || (m === 1 && !endsCvc(stem));
+  return removable ? stem : word;
+}
+
+/**
+ * Step 5b: shorten a final `ll` to `l` when the word's measure exceeds 1.
+ */
+function step5b(word: string): string {
+  // Guard clauses, cheapest first; all three conditions must hold.
+  if (!word.endsWith("l")) return word;
+  if (!endsWithDoubleConsonant(word)) return word;
+  if (measure(word) <= 1) return word;
+  return word.slice(0, -1);
+}
+
+/**
+ * Stem a single English token by running Porter steps 1a through 5b in
+ * order. Expects lowercase ASCII letters; inputs of one or two characters
+ * come back untouched (Porter is unstable on very short words and our
+ * pipeline already filters them).
+ */
+export function porterStem(word: string): string {
+  if (word.length < 3) return word;
+  // Step 1: plurals, -ed / -ing, final y.
+  const inflectionFree = step1c(step1b(step1a(word)));
+  // Steps 2 and 3: rewrite or drop derivational suffixes (m > 0).
+  const simplified = applyRules(
+    applyRules(inflectionFree, STEP2_RULES),
+    STEP3_RULES
+  );
+  // Step 4 strips remaining suffixes (m > 1); step 5 tidies a final e / ll.
+  return step5b(step5a(step4(simplified)));
+}
diff --git a/scripts/knowledge-graph/tokenizer.ts b/scripts/knowledge-graph/tokenizer.ts
index edb513e..bbfdb5f 100644
--- a/scripts/knowledge-graph/tokenizer.ts
+++ b/scripts/knowledge-graph/tokenizer.ts
@@ -3,6 +3,41 @@
  */
 
 import { STOP_WORDS, UUID_PATTERN, HEX_PATTERN, NUM_PATTERN } from "./constants.js";
+import { porterStem } from "./porter-stemmer.js";
+
+// CJK character ranges:
+//   U+3040–U+309F  Hiragana
+//   U+30A0–U+30FF  Katakana
+//   U+4E00–U+9FFF  CJK Unified Ideographs
+const CJK_CHAR_RE = /[぀-ゟ゠-ヿ一-鿿]/;
+const CJK_RUN_RE = /[぀-ゟ゠-ヿ一-鿿]+/g;
+
+// Shared Japanese word segmenter, built on first use. Stays null on runtimes
+// without `Intl.Segmenter` (it is built into Node 22+, the project's CI target).
+let jaSegmenter: Intl.Segmenter | null = null;
+function getJaSegmenter(): Intl.Segmenter | null {
+  const supported =
+    typeof Intl !== "undefined" && typeof Intl.Segmenter !== "undefined";
+  if (jaSegmenter === null && supported) {
+    jaSegmenter = new Intl.Segmenter("ja", { granularity: "word" });
+  }
+  return jaSegmenter;
+}
+
+/**
+ * Break a Japanese (or other CJK) run into its word-like segments.
+ *
+ * When `Intl.Segmenter` is unavailable the run comes back whole, as a
+ * single-element array.
+ */
+export function segmentJapanese(run: string): string[] {
+  const segmenter = getJaSegmenter();
+  if (segmenter === null) return [run];
+  // Keep only the segments the segmenter flags as word-like.
+  return Array.from(segmenter.segment(run))
+    .filter((piece) => piece.isWordLike)
+    .map((piece) => piece.segment);
+}
 
 export function splitCamelCase(word: string): string[] {
   return word
@@ -64,20 +99,87 @@ export function tokenize(text: string): string[] {
     // Handle kebab-case and snake_case
     const parts = word.split(/[-_]/).filter(Boolean);
     for (const part of parts) {
-      // Split CamelCase
-      const subTokens = splitCamelCase(part);
-      for (const t of subTokens) {
-        const clean = t.toLowerCase().replace(/[^a-z0-9\u3040-\u9fff]/g, "");
-        if (
-          clean.length > 2 &&
-          !STOP_WORDS.has(clean) &&
-          !isNoiseToken(clean)
-        ) {
-          tokens.push(clean);
+      // If the part contains CJK characters, segment CJK runs with
+      // Intl.Segmenter while keeping the existing English-side splitting
+      // (CamelCase / kebab / snake) for the non-CJK portions. This fixes
+      // the case where a Japanese paragraph collapses into a single token.
+      // See https://github.com/chigichan24/crune/issues/29
+      if (CJK_CHAR_RE.test(part)) {
+        let cursor = 0;
+        CJK_RUN_RE.lastIndex = 0;
+        let m: RegExpExecArray | null;
+        while ((m = CJK_RUN_RE.exec(part)) !== null) {
+          if (m.index > cursor) {
+            const nonCjk = part.slice(cursor, m.index);
+            for (const sub of splitCamelCase(nonCjk)) pushClean(sub, tokens);
+          }
+          for (const seg of segmentJapanese(m[0])) pushClean(seg, tokens);
+          cursor = m.index + m[0].length;
+        }
+        if (cursor < part.length) {
+          const tail = part.slice(cursor);
+          for (const sub of splitCamelCase(tail)) pushClean(sub, tokens);
         }
+        continue;
       }
+
+      // Pure non-CJK part: keep the existing CamelCase splitting path.
+      for (const sub of splitCamelCase(part)) pushClean(sub, tokens);
     }
   }
 
   return tokens;
 }
+
+/**
+ * Push a token through the standard post-processing pipeline: lowercase and
+ * strip, length filter, STOP_WORDS / isNoiseToken checks, Porter stemming
+ * (non-CJK only), then the same checks again on the stemmed form.
+ *
+ * The cleaning regex strips any leftover punctuation while preserving
+ * ASCII alphanumerics and the CJK ranges we care about.
+ *
+ * Length rule:
+ *  - non-CJK tokens: length > 2 (matches the legacy English-side behavior;
+ *    drops "to", "is", "am", etc.)
+ *  - tokens containing CJK: length >= 2 (Japanese compounds such as
+ *    bunseki / jissou / shusei are 2-character kanji words and must survive).
+ *
+ * Stemming rule (Issue #30):
+ *  - Apply stop-word filtering on the cleaned surface form first, then apply
+ *    Porter stemming, then re-check stop words on the stemmed form.
+ *    This drops natural surface forms such as `using` before stemming while
+ *    still letting inflections like `running`/`runs` collapse into `run`.
+ *  - Skip stemming for tokens containing CJK characters; those came from
+ *    `Intl.Segmenter` and Porter would corrupt them.
+ *  - Skip stemming for tokens containing digits (e.g. `abc123`, `v2`) since
+ *    Porter is undefined on alphanumerics.
+ */
+function pushClean(token: string, sink: string[]): void {
+  const clean = token.toLowerCase().replace(/[^a-z0-9\u3040-\u9fff]/g, "");
+  if (clean.length === 0) return;
+  const isCjk = CJK_CHAR_RE.test(clean);
+  const minLen = isCjk ? 2 : 3;
+  if (clean.length < minLen) return;
+  // Stop-word check is done on the pre-stem form so that NLTK-style entries
+  // like "was", "using", "used" are dropped via their natural surface
+  // form. Otherwise Porter would map "was" -> "wa", "using" -> "us" and
+  // those fragments would slip through the filter.
+  if (STOP_WORDS.has(clean)) return;
+  if (isNoiseToken(clean)) return;
+  // Apply Porter stemming only to pure-ASCII alphabetic tokens.
+  // Skip CJK (would corrupt segmenter output) and tokens with digits
+  // (Porter is undefined on `abc123` etc.).
+  const stemmed =
+    !isCjk && /^[a-z]+$/.test(clean) ? porterStem(clean) : clean;
+  // Re-check stop words on the stemmed form to catch inflections whose stem
+  // is itself a stop word (e.g. "needed" -> "need").
+  if (STOP_WORDS.has(stemmed)) return;
+  // Re-check noise on the final form. Only all-letter tokens are stemmed, so
+  // this rarely changes the outcome, but the guard is cheap.
+  if (isNoiseToken(stemmed)) return;
+  // After stemming a 3-char token could shrink below the length floor
+  // (e.g. "ate" -> "at"). Drop those rather than push fragments.
+  if (stemmed.length < minLen) return;
+  sink.push(stemmed);
+}