Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions scripts/__tests__/session-summarizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,11 @@ describe("keywords extraction", () => {
);
expect(result.keywords.length).toBeGreaterThan(0);
expect(result.keywords.length).toBeLessThanOrEqual(5);
// "refactor" and "authentication" appear in all prompts, should be top keywords
// "refactor" and "authentication" appear in all prompts, should be top
// keywords. Issue #30 introduced Porter stemming, so `authentication`
// surfaces as the stem `authent`.
expect(result.keywords).toContain("refactor");
expect(result.keywords).toContain("authentication");
expect(result.keywords).toContain("authent");
// Stop words should not appear
for (const kw of result.keywords) {
expect(kw.trim().length).toBeGreaterThan(0);
Expand Down
134 changes: 130 additions & 4 deletions scripts/__tests__/tokenizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,30 @@ describe("isNoiseToken", () => {
it("returns false for short hex-like strings (<6 chars)", () => {
expect(isNoiseToken("abc")).toBe(false);
});

// Issue #30: HEX_PATTERN now requires at least one digit. Without that
// guard the regex matched plain English words built only from a-f.
it("returns false for English words built from a-f without digits", () => {
  // Every sample uses only the letters a-f and is at least six characters
  // long, i.e. it has the shape that a digit-free `^[0-9a-f]{6,}$` pattern
  // would accept. A word such as `effect` does not belong here: it contains
  // a `t`, so it can never reach the hex check and would pass vacuously.
  expect(isNoiseToken("decade")).toBe(false);
  expect(isNoiseToken("facade")).toBe(false);
  expect(isNoiseToken("efface")).toBe(false);
  expect(isNoiseToken("defaced")).toBe(false);
});

it("returns false for 6+ char hex-only words with no digits (Issue #30)", () => {
  // `cafebabe` is a well-known magic number, yet a bare token with no digit
  // in it cannot be told apart from an ordinary English word. The choice
  // made here is to keep such tokens. The earlier `^[0-9a-f]{6,}$` pattern
  // would have classified this one as noise; the digit-requiring pattern
  // leaves it alone.
  const digitFreeHexWord = "cafebabe";
  const flaggedAsNoise = isNoiseToken(digitFreeHexWord);
  expect(flaggedAsNoise).toBe(false);
});

it("returns true for hex strings that contain a digit (Issue #30)", () => {
  // Both samples consist solely of [0-9a-f], are longer than six characters
  // and include at least one digit, so they count as hex noise.
  expect(isNoiseToken("abc123def")).toBe(true);
  expect(isNoiseToken("deadbeef0")).toBe(true);
});

it("returns false for 0x-prefixed literals, which are not pure hex (Issue #30)", () => {
  // Kept as its own case so that a test titled "returns true" does not also
  // carry a negative assertion. The `x` lies outside [0-9a-f], so the token
  // is not a pure hex run even though it contains digits.
  expect(isNoiseToken("0xabc123")).toBe(false);
});
});

describe("tokenize", () => {
Expand Down Expand Up @@ -128,14 +152,51 @@ describe("tokenize", () => {

it("handles kebab-case and snake_case", () => {
const tokens = tokenize("my-component some_variable");
expect(tokens).toContain("component");
// After Issue #30 the non-CJK pipeline stems the surviving tokens with
// Porter, so `component`/`variable` collapse to their stems.
expect(tokens).toContain("compon");
expect(tokens).toContain("variabl");
// "some" is a stop word, so it's excluded
expect(tokens).not.toContain("some");
expect(tokens).toContain("variable");
});

it("handles Japanese text without crashing", () => {
expect(() => tokenize("セッションの分析を実行する")).not.toThrow();
it("segments Japanese text into meaningful word-ish units (Issue #29)", () => {
  // Japanese has no whitespace between words, so splitting on `\s+` used to
  // leave the entire run as a single token. With Intl.Segmenter in the
  // pipeline, the katakana word and the kanji compounds are expected to come
  // out as separate tokens.
  const sentence = "セッションの分析を実行する";
  const tokens = tokenize(sentence);
  for (const expectedWord of ["セッション", "分析", "実行"]) {
    expect(tokens).toContain(expectedWord);
  }
  // The unsplit sentence itself must not appear in the output.
  expect(tokens).not.toContain(sentence);
});

it("segments mixed Japanese / English text on both sides", () => {
  const tokens = tokenize("TypeScriptの型エラーを修正");
  // English side: the CamelCase identifier is split and lowercased.
  const englishParts = ["type", "script"];
  // Japanese side: the katakana loanword and the two-character kanji
  // compound each surface as their own token.
  const japaneseParts = ["エラー", "修正"];
  for (const part of [...englishParts, ...japaneseParts]) {
    expect(tokens).toContain(part);
  }
});

it("does not collapse a long Japanese paragraph into a single oversized token (Issue #29 regression)", () => {
  // Regression guard for #29: a multi-sentence Japanese paragraph used to
  // enter the vocabulary as one token. Every token produced now has to be
  // word-sized rather than paragraph-sized.
  const paragraph = [
    "セッションの分析を実行する。次にテストを書く。",
    "実装が完了したらリファクタリングを行い、最後にコードレビューを依頼する。",
  ].join("");
  const tokens = tokenize(paragraph);
  expect(tokens.length).toBeGreaterThan(3);
  tokens.forEach((token) => {
    expect(token.length).toBeLessThanOrEqual(20);
  });
  // The paragraph as a whole must never be emitted as a token.
  expect(tokens).not.toContain(paragraph);
});

it("filters out noise tokens", () => {
Expand All @@ -154,6 +215,71 @@ describe("tokenize", () => {
expect(tokens).not.toContain("to");
expect(tokens).not.toContain("go");
});

// Issue #30: stemming collapses inflected forms.
it("stems inflected forms to a shared stem", () => {
  const tokens = tokenize("running runs ran");
  // Porter reduces both `running` and `runs` to `run`. The irregular form
  // `ran` is out of reach for a rule-based stemmer, and that limitation is
  // accepted here.
  expect(tokens).toContain("run");
  const runCount = tokens.reduce((count, t) => (t === "run" ? count + 1 : count), 0);
  expect(runCount).toBeGreaterThanOrEqual(2);
  // Neither original surface form may remain in the output.
  for (const surfaceForm of ["running", "runs"]) {
    expect(tokens).not.toContain(surfaceForm);
  }
});

it("collapses other inflected pairs (test/tested/testing, walk/walked/walking)", () => {
  // `walk` is chosen over `fix` on purpose: `fix` is a project-specific stop
  // word (constants.ts), so `tokenize("fixed fixing")` would yield [] and an
  // `.every()` check on an empty array passes vacuously, masking regressions.
  const cases: Array<[string, string]> = [
    ["tested testing", "test"],
    ["walked walking", "walk"],
  ];
  for (const [input, expectedStem] of cases) {
    const stems = tokenize(input);
    expect(stems.length).toBe(2);
    expect(stems.every((t) => t === expectedStem)).toBe(true);
  }
});

// Issue #30: words built only from a-f used to be filtered as hex noise.
it("keeps English words built from a-f as real tokens", () => {
  // All three inputs use only the letters a-f and are six characters long,
  // so each one has the shape of a digit-free hex run. (`effect` is not
  // suitable here: it contains a `t` and was never at risk of being
  // filtered as hex.)
  const tokens = tokenize("decade facade efface");
  // After Porter stemming `decade` -> `decad`, `facade` -> `facad`,
  // `efface` -> `effac` (trailing `e` dropped in each case). The point of
  // this test is that NONE of them disappear into the noise filter.
  expect(tokens).toContain("decad");
  expect(tokens).toContain("facad");
  expect(tokens).toContain("effac");
});

// Issue #30: the expanded NLTK-parity stop-word list drops several common
// filler words that the old hand-rolled list let through.
it("drops newly added NLTK-parity stop words", () => {
  // Stop-word entries that must be absent from the output. `actually` is
  // deliberately left out: it is not a member of STOP_WORDS, and its literal
  // form is missing from the output only because Porter stems it to
  // `actual`. A `not.toContain("actually")` assertion would therefore pass
  // even without the stop-word list and could hide regressions.
  // NOTE(review): the same caveat may apply to `really` (classic Porter
  // yields `realli`) — confirm this entry is exercised by the stop-word list
  // and not only by stemming.
  const fillerWords = [
    "really",
    "back",
    "even",
    "ever",
    "say",
    "seem",
    "tell",
    "yeah",
    "right",
    "thing",
  ];
  const tokens = tokenize(
    "really back even ever say seem tell yeah right thing the test"
  );
  // Sanity check: the one substantive word in the input survives.
  expect(tokens).toContain("test");
  fillerWords.forEach((filler) => {
    expect(tokens).not.toContain(filler);
  });
});
});

describe("tokenize - large input regression (Issue #18)", () => {
Expand Down
60 changes: 41 additions & 19 deletions scripts/knowledge-graph/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,36 @@
// ─── Stop words ─────────────────────────────────────────────────────────────

export const STOP_WORDS = new Set([
// English
"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
"have", "has", "had", "do", "does", "did", "will", "would", "could",
"should", "may", "might", "shall", "can", "need", "must", "ought",
"i", "you", "he", "she", "it", "we", "they", "me", "him", "her",
"us", "them", "my", "your", "his", "its", "our", "their", "mine",
"yours", "hers", "ours", "theirs", "this", "that", "these", "those",
"what", "which", "who", "whom", "whose", "when", "where", "why", "how",
"all", "each", "every", "both", "few", "more", "most", "other", "some",
"such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
"very", "just", "because", "as", "until", "while", "of", "at", "by",
"for", "with", "about", "against", "between", "through", "during",
"before", "after", "above", "below", "to", "from", "up", "down", "in",
"out", "on", "off", "over", "under", "again", "further", "then", "once",
"here", "there", "and", "but", "or", "if", "else", "also", "like",
"please", "thanks", "thank", "yes", "no", "ok", "okay", "sure", "let",
"make", "use", "using", "used", "want", "see", "look", "try", "get",
"got", "think", "know", "now", "new", "way", "well", "back", "still",
// English — roughly NLTK English stop word parity (~180 entries) plus a
// small project-specific filler set ("file", "code", "change") that NLTK
// does not include but which carry no signal in this corpus.
// Issue #30: expanded from ~100 entries to NLTK parity to keep IDF weights
// tight on substantive vocabulary.
"a", "about", "above", "after", "again", "against", "ain", "all", "also",
"am", "an", "and", "any", "are", "aren", "as", "at", "back", "be",
"because", "been", "before", "being", "below", "between", "both", "but",
"by", "can", "could", "couldn", "did", "didn", "do", "does", "doesn",
"doing", "don", "down", "during", "each", "else", "even", "ever", "every",
"few", "for", "from", "further", "get", "give", "go", "going", "got",
"had", "hadn", "has", "hasn", "have", "haven", "having", "he", "her",
"here", "hers", "herself", "him", "himself", "his", "how", "i", "if",
"in", "into", "is", "isn", "it", "its", "itself", "just", "know", "let",
"like", "look", "ma", "make", "many", "may", "me", "might", "mightn",
"mine", "more", "most", "much", "must", "mustn", "my", "myself", "need",
"needn", "new", "no", "nor", "not", "now", "of", "off", "ok", "okay",
"on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves",
"out", "over", "own", "please", "really", "right", "s", "same", "say",
"see", "seem", "seen", "shall", "shan", "she", "should", "shouldn", "so",
"some", "still", "such", "sure", "t", "tell", "than", "thank", "thanks",
"that", "the", "their", "theirs", "them", "themselves", "then", "there",
"these", "they", "thing", "think", "this", "those", "through", "to",
"too", "try", "under", "until", "up", "us", "use", "used", "using",
"very", "want", "was", "wasn", "way", "we", "well", "were", "weren",
"what", "when", "where", "which", "while", "who", "whom", "whose", "why",
"will", "with", "won", "would", "wouldn", "y", "yeah", "yes", "you",
"your", "yours", "yourself", "yourselves",
// Project-specific filler tokens (not part of NLTK). They appear in nearly
// every Claude Code session and dilute the TF-IDF signal.
"file", "code", "change", "changes", "add", "update", "fix", "set",
// Japanese particles and common words
"の", "に", "は", "を", "が", "で", "と", "も", "か", "な", "だ",
Expand All @@ -31,12 +43,22 @@ export const STOP_WORDS = new Set([
"こと", "もの", "ため", "よう", "から", "まで", "より", "ほど",
"など", "ので", "けど", "でも", "しかし", "また", "そして",
"って", "という", "ください", "お願い", "確認",
// Verb conjugation fragments and connective auxiliaries that
// `Intl.Segmenter('ja')` emits as standalone segments. These carry no
// standalone signal for TF-IDF / clustering. Content-bearing stems such
// as 行う / 書く / 修正 are intentionally NOT listed here.
// See https://github.com/chigichan24/crune/issues/29
"ている", "てい", "しない", "次に", "に従って",
]);

// ─── Noise token patterns ───────────────────────────────────────────────────

export const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
export const HEX_PATTERN = /^[0-9a-f]{6,}$/i;
// Hex literal must contain at least one digit. Without the lookahead this
// regex would also match plain English words built only from a-f (e.g.
// "decade", "facade", "efface", "defaced") and incorrectly drop them as
// noise. See https://github.com/chigichan24/crune/issues/30.
export const HEX_PATTERN = /^(?=[0-9a-f]*[0-9])[0-9a-f]{6,}$/i;
export const NUM_PATTERN = /^\d+$/;

// ─── Structural features ────────────────────────────────────────────────────
Expand Down
Loading
Loading