broomva · broomva · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,40 @@ All notable changes to `role-x` are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.6.0] — 2026-06-05
+
+Adds the **resolver-eval harness** and fixes **BRO-1338** (multi-word keyword
+scoring). Origin: `/checkit` on Garry Tan's "skillify" essay surfaced that
+bstack tests its code and governs skill *promotion* but never *tests its skills
+as artifacts* — specifically, no test asserts that a lens's trigger actually
+routes. This is skillify step 7 ("resolver eval"), built bstack-native (BRO-1411
+slice 1).
+
+### Fixed
+
+- **BRO-1338 — multi-word / punctuated `prompt_keywords` scored zero.**
+  `_score_lens` matched keywords only via single-token set membership, so phrase
+  triggers like `"check this out"`, `"/checkit"`, `"let's research this"`,
+  `"last 30 days"` could never match (the token set never *contains* a phrase).
+  New `_kw_matches` substring-matches punctuated/multi-word keywords against the
+  raw lowercased prompt; clean single tokens keep the original word-boundary-safe
+  semantics, so existing single-word lens behavior is unchanged.
+
+### Added
+
+- **`role-x.py eval`** subcommand — runs `roles/<name>.eval.yaml` fixtures
+  (`should_fire` / `should_not_fire` intents, optional `touched_files`/`branch`
+  per case) through `_select_lenses` and asserts the declared lens's selection.
+  Exits 1 on any failure (CI-gateable), 2 if the roles dir or the `--lens`
+  fixture is missing. Flags: `--json`, `--verbose`, `--lens`, `--active-only`.
+- **`include_statuses` param on `_select_lenses`** (default `("active",)`) — the
+  eval passes `("active","candidate")` so a lens's routing is testable *before*
+  promotion, closing the gap where a candidate lens ships with a broken trigger
+  nobody catches until it goes live.
+- 13 tests (`tests/test_eval.py`): unit coverage for the phrase fix +
+  `include_statuses`, integration coverage for the `eval` subcommand
+  (pass / false-negative / false-positive / json / no-fixtures / missing-dir).
+
 ## [0.5.0] — 2026-06-01
 
 Adds **task-relevant entity auto-loading** to intake (BRO-1295). Until now the

diff --git a/SKILL.md b/SKILL.md
@@ -105,8 +105,9 @@ The intake reflex routes prompts in real-time. The meta-progression discipline e
 
 - `roles/_meta.md` — always-loaded base lens (the workspace's implicit "bstack-aware autonomous senior engineer" contract made addressable). Lives in the consuming workspace, not in this skill repo.
 - `roles/<name>.md` — per-domain lenses. Live in the consuming workspace.
+- `roles/<name>.eval.yaml` — per-lens resolver-eval fixtures (`should_fire` / `should_not_fire` intents) asserting that each lens's trigger actually routes. Run via `role-x.py eval`; gate in CI. This is the skillify "resolver eval" step: a trigger that says "phrase X selects lens Y" is only trustworthy once a test proves it. Live in the consuming workspace alongside the lens.
 - `roles/_index.md` — auto-generated discovery index.
-- `scripts/role-x.py` — CLI helpers.
+- `scripts/role-x.py` — CLI helpers (`validate`, `list`, `index`, `intake`, `coverage`, `suggest`, `init`, `eval`).
 - `references/*.md` — schema + algorithm reference docs.
 - `~/.config/broomva/role/events.jsonl` — telemetry log (M2).
 - `~/.config/broomva/role/status.json` — per-lens stats cache (M2).

diff --git a/scripts/role-x.py b/scripts/role-x.py
@@ -396,12 +396,38 @@ def _resolve_threshold(lens: dict) -> int:
     return DEFAULT_THRESHOLD
 
 
-def _score_lens(lens: dict, branch: str, touched_files: list[str], prompt_tokens: set[str]) -> dict:
+def _kw_matches(kw_lc: str, prompt_tokens: set[str], prompt_lc: str) -> bool:
+    """Match one declared prompt_keyword against the prompt.
+
+    BRO-1338 fix: multi-word / punctuated keywords (e.g. "check this out",
+    "/checkit", "let's research this", "last 30 days") are matched as
+    substrings against the raw lowercased prompt, because the single-token set
+    built by _tokenize_prompt() can never *contain* a phrase — so before this
+    fix every multi-word keyword silently scored zero. Clean single tokens
+    (only [a-z0-9-]) keep the original word-boundary-safe set-membership
+    semantics, so existing single-word lens behavior is unchanged.
+    """
+    if re.search(r"[^a-z0-9-]", kw_lc):  # space, slash, apostrophe, underscore, etc.
+        return bool(prompt_lc) and kw_lc in prompt_lc
+    return kw_lc in prompt_tokens
+
+
+def _score_lens(
+    lens: dict,
+    branch: str,
+    touched_files: list[str],
+    prompt_tokens: set[str],
+    prompt_lc: str = "",
+) -> dict:
     """Score a lens's frontmatter against the current signals.
 
     Returns breakdown dict with raw match counts (for backward-compat event
     schema) plus weighted total. Per-lens `signals.weights` (v0.3.0) multiplies
     raw counts when computing the total used for threshold comparison.
+
+    `prompt_lc` (raw lowercased prompt) enables phrase matching for multi-word
+    keywords (BRO-1338). With the default "", only clean single-token keywords
+    can match (via the token set); phrase/punctuated keywords score zero.
     """
     signals = lens.get("signals", {}) or {}
     paths = signals.get("paths") or []
@@ -414,7 +440,7 @@ def _score_lens(lens: dict, branch: str, touched_files: list[str], prompt_tokens
     )
     keyword_hits = sum(
         1 for kw in prompt_keywords
-        if str(kw).lower() in prompt_tokens
+        if _kw_matches(str(kw).lower(), prompt_tokens, prompt_lc)
     )
     branch_hits = sum(
         1 for pat in branch_patterns
@@ -472,31 +498,39 @@ def _select_lenses(
     signals: dict,
     prompt: str,
     threshold: int = DEFAULT_THRESHOLD,
+    include_statuses: tuple[str, ...] = ("active",),
 ) -> dict:
     """Score all lenses; return selection dict with selected, mode, signals.
 
     v0.3.0: each lens can override `threshold` (top-level) and
     `signals.weights.<type>` (nested) — the global `threshold` argument here
     is only used as the default when a lens doesn't declare its own.
+
+    `include_statuses` controls which lens `status:` values are scored. Live
+    intake passes the default ("active",); the resolver-eval harness (BRO-1411)
+    passes ("active", "candidate") so a lens can be routing-tested *before* it
+    is promoted to active — closing the gap where a candidate lens ships with a
+    broken trigger nobody can catch until it goes live.
     """
     lenses: dict[str, dict] = {}
     for path in discover_lenses(roles_dir):
         lens = _load_lens(path)
         if not lens:
             continue
-        if lens.get("status") != "active":
+        if lens.get("status") not in include_statuses:
             continue
         lenses[lens["name"]] = lens
 
     prompt_tokens = _tokenize_prompt(prompt)
+    prompt_lc = prompt.lower()
     branch = signals.get("branch", "")
     touched = signals.get("touched_files", [])
 
     scored: list[tuple[str, int, dict, int]] = []
     for name, lens in lenses.items():
         if name == "_meta":
             continue  # _meta is always applied as the base, not scored
-        breakdown = _score_lens(lens, branch, touched, prompt_tokens)
+        breakdown = _score_lens(lens, branch, touched, prompt_tokens, prompt_lc)
         per_lens_threshold = _resolve_threshold(lens) if "threshold" in lens else threshold
         scored.append((name, breakdown["total"], breakdown, per_lens_threshold))
     scored.sort(key=lambda x: x[1], reverse=True)
@@ -1426,6 +1460,143 @@ def _yaml_list(items: list[str]) -> str:
 ### Argparse + main ###
 
 
+def _normalize_eval_case(item) -> dict | None:
+    """Normalize one eval case.
+
+    Accepts either a bare intent string, or a dict
+    `{intent, touched_files?, branch?}` for cases that exercise path/branch
+    routing as well as prompt keywords. Returns None for malformed entries.
+    """
+    if isinstance(item, str):
+        return {"intent": item, "touched_files": [], "branch": ""}
+    if isinstance(item, dict) and item.get("intent"):
+        return {
+            "intent": str(item["intent"]),
+            "touched_files": [str(f) for f in (item.get("touched_files") or [])],
+            "branch": str(item.get("branch") or ""),
+        }
+    return None
+
+
+def _normalize_eval_cases(items) -> tuple[list[dict], int]:
+    """Normalize a list of eval cases; return (valid_cases, n_malformed).
+
+    Malformed entries are counted, never silently dropped — a broken fixture
+    must fail the eval loudly, not quietly shrink the assertion count and report
+    a false green (CodeRabbit, role-x#7).
+    """
+    valid: list[dict] = []
+    malformed = 0
+    for item in (items or []):
+        case = _normalize_eval_case(item)
+        if case is None:
+            malformed += 1
+        else:
+            valid.append(case)
+    return valid, malformed
+
+
+def _load_eval_specs(roles_dir: Path, only_lens: str | None) -> list[dict]:
+    """Load roles/<lens>.eval.yaml fixtures into normalized spec dicts."""
+    specs: list[dict] = []
+    for path in sorted(roles_dir.glob("*.eval.yaml")):
+        try:
+            data = yaml.safe_load(path.read_text(encoding="utf-8"))
+        except (yaml.YAMLError, OSError) as exc:
+            specs.append({"_path": path, "_error": str(exc)})
+            continue
+        if data is None:
+            data = {}
+        if not isinstance(data, dict):
+            specs.append({
+                "_path": path,
+                "_error": f"eval fixture root must be a mapping, got {type(data).__name__}",
+            })
+            continue
+        lens = data.get("lens") or path.name[: -len(".eval.yaml")]
+        if only_lens and lens != only_lens:
+            continue
+        fire, fire_bad = _normalize_eval_cases(data.get("should_fire"))
+        nofire, nofire_bad = _normalize_eval_cases(data.get("should_not_fire"))
+        specs.append({
+            "_path": path,
+            "lens": lens,
+            "should_fire": fire,
+            "should_not_fire": nofire,
+            "_malformed": fire_bad + nofire_bad,
+        })
+    return specs
+
+
+def cmd_eval(args: argparse.Namespace) -> int:
+    """(BRO-1411) Resolver-eval — assert that intents route to the expected lens.
+
+    Skillify step 7: a resolver *trigger* says "phrase X should select lens Y";
+    a resolver *eval* proves it does. Each roles/<lens>.eval.yaml declares
+    `should_fire` / `should_not_fire` intents; each runs through _select_lenses()
+    and the declared lens's presence in the selection is asserted. Exit 0 when all
+    pass, 1 on any failure (CI-gateable), 2 on missing roles dir / --lens fixture.
+    """
+    roles_dir = Path(args.roles_dir)
+    if not roles_dir.is_dir():
+        print(f"[eval] roles dir not found: {roles_dir}", file=sys.stderr)
+        return 2
+    specs = _load_eval_specs(roles_dir, args.lens)
+    if not specs:
+        msg = f"[eval] no *.eval.yaml fixtures in {roles_dir}"
+        if args.lens:
+            print(msg + f" for lens '{args.lens}'", file=sys.stderr)
+            return 2
+        print(msg + " (nothing to check)")
+        return 0
+
+    include = ("active", "candidate") if args.include_candidates else ("active",)
+    log = sys.stderr if args.json else sys.stdout  # keep --json stdout machine-clean
+    total = passed = failed = 0
+    results: list[dict] = []
+    for spec in specs:
+        if spec.get("_error"):
+            print(f"[eval] FAIL parse {spec['_path'].name}: {spec['_error']}", file=log)
+            failed += 1
+            total += 1
+            continue
+        if spec.get("_malformed"):
+            n = spec["_malformed"]
+            print(f"[eval] FAIL {spec['_path'].name}: {n} malformed eval case(s) "
+                  f"(each must be a string or a mapping with an 'intent' key)", file=log)
+            failed += n
+            total += n
+        lens = spec["lens"]
+        for expect, cases in (("fire", spec["should_fire"]), ("no-fire", spec["should_not_fire"])):
+            for case in cases:
+                total += 1
+                signals = {"branch": case["branch"], "touched_files": case["touched_files"]}
+                sel = _select_lenses(roles_dir, signals, case["intent"], include_statuses=include)
+                fired = lens in sel["lenses_selected"]
+                ok = fired == (expect == "fire")
+                score = (sel["per_lens_scores"].get(lens) or {}).get("total", 0)
+                thr = sel["per_lens_thresholds"].get(lens, "?")
+                if ok:
+                    passed += 1
+                else:
+                    failed += 1
+                results.append({
+                    "lens": lens, "intent": case["intent"], "expect": expect,
+                    "fired": fired, "score": score, "threshold": thr, "ok": ok,
+                })
+                if not ok or args.verbose:
+                    flag = "PASS" if ok else "FAIL"
+                    print(f"[eval] {flag} [{lens}] expect={expect} fired={fired} "
+                          f"score={score}/{thr} :: {case['intent'][:70]}", file=log)
+
+    if args.json:
+        print(json.dumps({"total": total, "passed": passed, "failed": failed, "results": results}, indent=2))
+    else:
+        n_lenses = len({s["lens"] for s in specs if not s.get("_error")})
+        print(f"[eval] {passed}/{total} passed, {failed} failed across {n_lenses} lens(es)")
+    return 1 if failed else 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         prog="role-x",
@@ -1529,6 +1700,20 @@ def build_parser() -> argparse.ArgumentParser:
     p_init.add_argument("--force", action="store_true", help="overwrite if file already exists")
     p_init.set_defaults(func=cmd_init)
 
+    p_eval = sub.add_parser(
+        "eval",
+        help="(BRO-1411) resolver-eval: assert intent->lens routing from roles/*.eval.yaml fixtures",
+    )
+    p_eval.add_argument("--roles-dir", default="roles", help="path to roles directory (default: roles)")
+    p_eval.add_argument("--lens", default=None, help="eval only this lens (default: all *.eval.yaml)")
+    p_eval.add_argument(
+        "--active-only", dest="include_candidates", action="store_false",
+        help="only score status:active lenses (default also scores candidates so pre-promotion lenses are testable)",
+    )
+    p_eval.add_argument("--json", action="store_true", help="emit machine-readable JSON results")
+    p_eval.add_argument("--verbose", action="store_true", help="print PASS lines too, not just failures")
+    p_eval.set_defaults(func=cmd_eval, include_candidates=True)
+
     return parser