From a57810d01a8b1e9da029d1e7297148eeae52a0a8 Mon Sep 17 00:00:00 2001 From: "Aaron K. Clark" Date: Wed, 20 May 2026 16:53:41 -0500 Subject: [PATCH] =?UTF-8?q?bugfix(patterns):=20word-boundary=20slug=20matc?= =?UTF-8?q?h=20=E2=80=94=20fix=20false=20positive=20on=20short=20slugs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_slug_in_project` did a naive substring search: if slug.lower() in file_text.lower(): return True That false-positives on any short slug whose letters appear inside a longer word. Concrete cases: slug "auth" matches "author", "authentic", "authority" slug "api" matches "apiary", "rapidly", "capital" slug "db" matches "feedback", "handbook" slug "validate-numbers" matches "validate-numbers-attempt" When that happens, the pattern is reported as "used elsewhere" and silently does NOT appear in the UNUSED findings — masking genuinely unused patterns and undermining the whole point of the report. Fix: compile a word-boundary regex with custom boundary chars (`\w` + `-`) so kebab-case slugs match as complete tokens but not as prefixes/suffixes of longer kebab identifiers, and short slugs don't match inside longer words. (?<![\w-])SLUG(?![\w-]) Regression tests: - slug `auth` in a project that contains `author` -> still UNUSED - slug `validate-numbers` in a project that contains `validate-numbers-attempt` -> still UNUSED - positive control: exact `validate-numbers` mention -> NOT unused 150/150 tests pass; ruff + mypy clean. --- src/socrates120x/patterns.py | 26 +++++++++++- tests/test_patterns.py | 78 ++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/socrates120x/patterns.py b/src/socrates120x/patterns.py index 7a35e36..a49cf78 100644 --- a/src/socrates120x/patterns.py +++ b/src/socrates120x/patterns.py @@ -212,13 +212,35 @@ def _compute_usage_map( def _slug_in_project(slug: str, project_dir: Path) -> bool: + """True if *slug* appears as a complete token in any .md file under + *project_dir*. 
+ + Naive substring match (the previous implementation) false-positived on + short slugs: ``auth`` matched ``author`` / ``authentic`` / ``authority``, + ``api`` matched ``apiary`` / ``rapidly``. The "unused candidate" report + silently hid genuinely unused patterns whenever such a short slug + happened to be a substring of a real word in any project's planning + files. + + Word-boundary regex with custom boundary chars (``\\w`` plus ``-``) + handles kebab-case slugs correctly: ``validate-numbers`` won't match + inside ``validate-numbers-attempt`` (longer kebab identifier), and + ``auth`` won't match inside ``author``. + """ needle = slug.lower() + # (? None: refreshed = json.loads(cache_path.read_text()) assert refreshed["version"] == 2 assert "phantom-from-v1" not in str(refreshed) + + +def test_patterns_review_no_false_positive_on_short_slug_substring(company: Path) -> None: + """A pattern slug like `auth` must NOT match `author` in another project. + + Bug: previous naive substring match `auth in author` -> True -> the + pattern was reported as 'used elsewhere' and never flagged as unused, + silently masking genuinely unused short-slug patterns. + """ + alpha = _make_build(company, "alpha") + beta = _make_build(company, "beta") + # alpha (source) mentions the slug somewhere — establishes the source. + (alpha / "planning" / "STATE.md").write_text( + (alpha / "planning" / "STATE.md").read_text() + "\nThe `auth` pattern.\n" + ) + # beta mentions `author` (longer word containing `auth`). Pre-fix this + # would have looked like a usage of the `auth` slug and the pattern + # would NOT have been flagged unused. 
+ (beta / "planning" / "STATE.md").write_text( + (beta / "planning" / "STATE.md").read_text() + + "\nThe author of this is unknown.\n" + ) + today = _dt.date.today().isoformat() + (company / "patterns" / "CANDIDATE-auth.md").write_text( + _pattern(today, "alpha", "auth") + ) + report = review_patterns(company) + unused = [f for f in report.findings if f.kind == FindingKind.UNUSED] + # The pattern IS unused; substring-into-`author` should not save it. + assert any("auth" in f.path.name for f in unused), ( + "regression: short slug 'auth' was matched as a substring of 'author'" + ) + + +def test_patterns_review_no_false_positive_on_kebab_subset(company: Path) -> None: + """A pattern slug `validate-numbers` must NOT match `validate-numbers-attempt` + (a longer kebab-case identifier that happens to start with the slug). + """ + alpha = _make_build(company, "alpha") + beta = _make_build(company, "beta") + (alpha / "planning" / "STATE.md").write_text( + (alpha / "planning" / "STATE.md").read_text() + + "\nThe `validate-numbers` pattern was extracted here.\n" + ) + # beta has a DIFFERENT kebab identifier that contains the slug as a prefix. 
+ (beta / "planning" / "STATE.md").write_text( + (beta / "planning" / "STATE.md").read_text() + + "\nWe tried validate-numbers-attempt instead, see #42.\n" + ) + today = _dt.date.today().isoformat() + (company / "patterns" / "CANDIDATE-validate-numbers.md").write_text( + _pattern(today, "alpha", "validate-numbers") + ) + report = review_patterns(company) + unused = [f for f in report.findings if f.kind == FindingKind.UNUSED] + assert any("validate-numbers" in f.path.name for f in unused), ( + "regression: kebab slug matched as prefix of a longer kebab identifier" + ) + + +def test_patterns_review_still_matches_exact_kebab_slug(company: Path) -> None: + """Positive control: an exact slug mention in another project still counts.""" + alpha = _make_build(company, "alpha") + beta = _make_build(company, "beta") + (alpha / "planning" / "STATE.md").write_text( + (alpha / "planning" / "STATE.md").read_text() + "\nThe `validate-numbers` pattern.\n" + ) + (beta / "planning" / "STATE.md").write_text( + (beta / "planning" / "STATE.md").read_text() + + "\nWe reused validate-numbers from alpha.\n" + ) + today = _dt.date.today().isoformat() + (company / "patterns" / "CANDIDATE-validate-numbers.md").write_text( + _pattern(today, "alpha", "validate-numbers") + ) + report = review_patterns(company) + unused_paths = [f.path.name for f in report.findings if f.kind == FindingKind.UNUSED] + assert "CANDIDATE-validate-numbers.md" not in unused_paths