diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9265015..fa31187 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,40 @@ All notable changes to `role-x` are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.6.0] — 2026-06-05
+
+Adds the **resolver-eval harness** and fixes **BRO-1338** (multi-word keyword
+scoring). Origin: `/checkit` on Garry Tan's "skillify" essay surfaced that
+bstack tests its code and governs skill *promotion* but never *tests its skills
+as artifacts* — specifically, no test asserts that a lens's trigger actually
+routes. This is skillify step 7 ("resolver eval"), built bstack-native (BRO-1411
+slice 1).
+
+### Fixed
+
+- **BRO-1338 — multi-word / punctuated `prompt_keywords` scored zero.**
+  `_score_lens` matched keywords only via single-token set membership, so phrase
+  triggers like `"check this out"`, `"/checkit"`, `"let's research this"`,
+  `"last 30 days"` could never match (the token set never *contains* a phrase).
+  New `_kw_matches` substring-matches punctuated/multi-word keywords against the
+  raw lowercased prompt; clean single tokens keep the original word-boundary-safe
+  semantics, so existing single-word lens behavior is unchanged.
+
+### Added
+
+- **`role-x.py eval`** subcommand — runs `roles/<name>.eval.yaml` fixtures
+  (`should_fire` / `should_not_fire` intents, optional `touched_files`/`branch`
+  per case) through `_select_lenses` and asserts the declared lens's selection.
+  Exit 1 on any failure (CI-gateable); `--json`, `--verbose`, `--lens`,
+  `--active-only`.
+- **`include_statuses` param on `_select_lenses`** (default `("active",)`) — the
+  eval passes `("active","candidate")` so a lens's routing is testable *before*
+  promotion, closing the gap where a candidate lens ships with a broken trigger
+  nobody catches until it goes live.
+- 13 tests (`tests/test_eval.py`): unit coverage for the phrase fix +
+  `include_statuses`, integration coverage for the `eval` subcommand
+  (pass / false-negative / false-positive / json / no-fixtures / missing-dir).
+
 ## [0.5.0] — 2026-06-01
 
 Adds **task-relevant entity auto-loading** to intake (BRO-1295). Until now the
diff --git a/SKILL.md b/SKILL.md
index d836e31..d1d3bd2 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -105,8 +105,9 @@ The intake reflex routes prompts in real-time. The meta-progression discipline e
 
 - `roles/_meta.md` — always-loaded base lens (the workspace's implicit "bstack-aware autonomous senior engineer" contract made addressable). Lives in the consuming workspace, not in this skill repo.
 - `roles/<name>.md` — per-domain lenses. Live in the consuming workspace.
+- `roles/<name>.eval.yaml` — resolver-eval fixture (`should_fire` / `should_not_fire` intents) asserting the lens's trigger actually routes. Run via `role-x.py eval`; gate in CI. The skillify "resolver eval" step — a trigger that says "phrase X selects lens Y" is only trustworthy once a test proves it. Live in the consuming workspace alongside the lens.
 - `roles/_index.md` — auto-generated discovery index.
-- `scripts/role-x.py` — CLI helpers.
+- `scripts/role-x.py` — CLI helpers (`validate`, `list`, `index`, `intake`, `coverage`, `suggest`, `init`, `eval`).
 - `references/*.md` — schema + algorithm reference docs.
 - `~/.config/broomva/role/events.jsonl` — telemetry log (M2).
 - `~/.config/broomva/role/status.json` — per-lens stats cache (M2).
diff --git a/scripts/role-x.py b/scripts/role-x.py
index 6bfcaf4..296061d 100644
--- a/scripts/role-x.py
+++ b/scripts/role-x.py
@@ -396,12 +396,38 @@ def _resolve_threshold(lens: dict) -> int:
     return DEFAULT_THRESHOLD
 
 
-def _score_lens(lens: dict, branch: str, touched_files: list[str], prompt_tokens: set[str]) -> dict:
+def _kw_matches(kw_lc: str, prompt_tokens: set[str], prompt_lc: str) -> bool:
+    """Match one declared prompt_keyword against the prompt.
+
+    BRO-1338 fix: multi-word / punctuated keywords (e.g. "check this out",
+    "/checkit", "let's research this", "last 30 days") are matched as
+    substrings against the raw lowercased prompt, because the single-token set
+    built by _tokenize_prompt() can never *contain* a phrase — so before this
+    fix every multi-word keyword silently scored zero. Clean single tokens
+    (only [a-z0-9-]) keep the original word-boundary-safe set-membership
+    semantics, so existing single-word lens behavior is unchanged.
+    """
+    if re.search(r"[^a-z0-9-]", kw_lc):  # space, slash, apostrophe, digit-sep, etc.
+        return bool(prompt_lc) and kw_lc in prompt_lc
+    return kw_lc in prompt_tokens
+
+
+def _score_lens(
+    lens: dict,
+    branch: str,
+    touched_files: list[str],
+    prompt_tokens: set[str],
+    prompt_lc: str = "",
+) -> dict:
     """Score a lens's frontmatter against the current signals.
 
     Returns breakdown dict with raw match counts (for backward-compat event
     schema) plus weighted total. Per-lens `signals.weights` (v0.3.0) multiplies
     raw counts when computing the total used for threshold comparison.
+
+    `prompt_lc` (raw lowercased prompt) enables phrase matching for multi-word
+    keywords (BRO-1338). It defaults to "" so direct callers that pass only the
+    token set keep the pre-fix single-token behavior.
     """
     signals = lens.get("signals", {}) or {}
     paths = signals.get("paths") or []
@@ -414,7 +440,7 @@ def _score_lens(lens: dict, branch: str, touched_files: list[str], prompt_tokens
     )
     keyword_hits = sum(
         1 for kw in prompt_keywords
-        if str(kw).lower() in prompt_tokens
+        if _kw_matches(str(kw).lower(), prompt_tokens, prompt_lc)
     )
     branch_hits = sum(
         1 for pat in branch_patterns
@@ -472,23 +498,31 @@ def _select_lenses(
     signals: dict,
     prompt: str,
     threshold: int = DEFAULT_THRESHOLD,
+    include_statuses: tuple[str, ...] = ("active",),
 ) -> dict:
     """Score all lenses; return selection dict with selected, mode, signals.
 
     v0.3.0: each lens can override `threshold` (top-level) and
     `signals.weights.<type>` (nested) — the global `threshold` argument here
     is only used as the default when a lens doesn't declare its own.
+
+    `include_statuses` controls which lens `status:` values are scored. Live
+    intake passes the default ("active",); the resolver-eval harness (BRO-1411)
+    passes ("active", "candidate") so a lens can be routing-tested *before* it
+    is promoted to active — closing the gap where a candidate lens ships with a
+    broken trigger nobody can catch until it goes live.
     """
     lenses: dict[str, dict] = {}
     for path in discover_lenses(roles_dir):
         lens = _load_lens(path)
         if not lens:
             continue
-        if lens.get("status") != "active":
+        if lens.get("status") not in include_statuses:
             continue
         lenses[lens["name"]] = lens
 
     prompt_tokens = _tokenize_prompt(prompt)
+    prompt_lc = prompt.lower()
     branch = signals.get("branch", "")
     touched = signals.get("touched_files", [])
 
@@ -496,7 +530,7 @@ def _select_lenses(
     for name, lens in lenses.items():
         if name == "_meta":
             continue  # _meta is always applied as the base, not scored
-        breakdown = _score_lens(lens, branch, touched, prompt_tokens)
+        breakdown = _score_lens(lens, branch, touched, prompt_tokens, prompt_lc)
         per_lens_threshold = _resolve_threshold(lens) if "threshold" in lens else threshold
         scored.append((name, breakdown["total"], breakdown, per_lens_threshold))
     scored.sort(key=lambda x: x[1], reverse=True)
@@ -1426,6 +1460,143 @@ def _yaml_list(items: list[str]) -> str:
 ### Argparse + main ###
 
 
+def _normalize_eval_case(item) -> dict | None:
+    """Normalize one eval case.
+
+    Accepts either a bare intent string, or a dict
+    `{intent, touched_files?, branch?}` for cases that exercise path/branch
+    routing as well as prompt keywords. Returns None for malformed entries.
+    """
+    if isinstance(item, str):
+        return {"intent": item, "touched_files": [], "branch": ""}
+    if isinstance(item, dict) and item.get("intent"):
+        return {
+            "intent": str(item["intent"]),
+            "touched_files": [str(f) for f in (item.get("touched_files") or [])],
+            "branch": str(item.get("branch") or ""),
+        }
+    return None
+
+
+def _normalize_eval_cases(items) -> tuple[list[dict], int]:
+    """Normalize a list of eval cases; return (valid_cases, n_malformed).
+
+    Malformed entries are counted, never silently dropped — a broken fixture
+    must fail the eval loudly, not quietly shrink the assertion count and report
+    a false green (CodeRabbit, role-x#7).
+    """
+    valid: list[dict] = []
+    malformed = 0
+    for item in (items or []):
+        case = _normalize_eval_case(item)
+        if case is None:
+            malformed += 1
+        else:
+            valid.append(case)
+    return valid, malformed
+
+
+def _load_eval_specs(roles_dir: Path, only_lens: str | None) -> list[dict]:
+    """Load roles/<lens>.eval.yaml fixtures into normalized spec dicts."""
+    specs: list[dict] = []
+    for path in sorted(roles_dir.glob("*.eval.yaml")):
+        try:
+            data = yaml.safe_load(path.read_text(encoding="utf-8"))
+        except (yaml.YAMLError, OSError) as exc:
+            specs.append({"_path": path, "_error": str(exc)})
+            continue
+        if data is None:
+            data = {}
+        if not isinstance(data, dict):
+            specs.append({
+                "_path": path,
+                "_error": f"eval fixture root must be a mapping, got {type(data).__name__}",
+            })
+            continue
+        lens = data.get("lens") or path.name[: -len(".eval.yaml")]
+        if only_lens and lens != only_lens:
+            continue
+        fire, fire_bad = _normalize_eval_cases(data.get("should_fire"))
+        nofire, nofire_bad = _normalize_eval_cases(data.get("should_not_fire"))
+        specs.append({
+            "_path": path,
+            "lens": lens,
+            "should_fire": fire,
+            "should_not_fire": nofire,
+            "_malformed": fire_bad + nofire_bad,
+        })
+    return specs
+
+
+def cmd_eval(args: argparse.Namespace) -> int:
+    """(BRO-1411) Resolver-eval — assert that intents route to the expected lens.
+
+    Skillify step 7: a resolver *trigger* says "phrase X should select lens Y";
+    a resolver *eval* proves it actually does. Each roles/<lens>.eval.yaml
+    declares `should_fire` / `should_not_fire` intents; every intent is run
+    through _select_lenses() and the declared lens's presence in the selection
+    is asserted. Exit 1 on any failure (CI-gateable), 0 when all pass.
+    """
+    roles_dir = Path(args.roles_dir)
+    if not roles_dir.is_dir():
+        print(f"[eval] roles dir not found: {roles_dir}", file=sys.stderr)
+        return 2
+    specs = _load_eval_specs(roles_dir, args.lens)
+    if not specs:
+        msg = f"[eval] no *.eval.yaml fixtures in {roles_dir}"
+        if args.lens:
+            print(msg + f" for lens '{args.lens}'", file=sys.stderr)
+            return 2
+        print(msg + " (nothing to check)")
+        return 0
+
+    include = ("active", "candidate") if args.include_candidates else ("active",)
+    log = sys.stderr if args.json else sys.stdout  # keep --json stdout machine-clean
+    total = passed = failed = 0
+    results: list[dict] = []
+    for spec in specs:
+        if spec.get("_error"):
+            print(f"[eval] FAIL parse {spec['_path'].name}: {spec['_error']}", file=log)
+            failed += 1
+            total += 1
+            continue
+        if spec.get("_malformed"):
+            n = spec["_malformed"]
+            print(f"[eval] FAIL {spec['_path'].name}: {n} malformed eval case(s) "
+                  f"(each must be a string or a mapping with an 'intent' key)", file=log)
+            failed += n
+            total += n
+        lens = spec["lens"]
+        for expect, cases in (("fire", spec["should_fire"]), ("no-fire", spec["should_not_fire"])):
+            for case in cases:
+                total += 1
+                signals = {"branch": case["branch"], "touched_files": case["touched_files"]}
+                sel = _select_lenses(roles_dir, signals, case["intent"], include_statuses=include)
+                fired = lens in sel["lenses_selected"]
+                ok = fired == (expect == "fire")
+                score = (sel["per_lens_scores"].get(lens) or {}).get("total", 0)
+                thr = sel["per_lens_thresholds"].get(lens, "?")
+                if ok:
+                    passed += 1
+                else:
+                    failed += 1
+                results.append({
+                    "lens": lens, "intent": case["intent"], "expect": expect,
+                    "fired": fired, "score": score, "threshold": thr, "ok": ok,
+                })
+                if not ok or args.verbose:
+                    flag = "PASS" if ok else "FAIL"
+                    print(f"[eval] {flag} [{lens}] expect={expect} fired={fired} "
+                          f"score={score}/{thr} :: {case['intent'][:70]}", file=log)
+
+    if args.json:
+        print(json.dumps({"total": total, "passed": passed, "failed": failed, "results": results}, indent=2))
+    else:
+        n_lenses = len({s["lens"] for s in specs if not s.get("_error")})
+        print(f"[eval] {passed}/{total} passed, {failed} failed across {n_lenses} lens(es)")
+    return 1 if failed else 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         prog="role-x",
@@ -1529,6 +1700,20 @@ def build_parser() -> argparse.ArgumentParser:
     p_init.add_argument("--force", action="store_true", help="overwrite if file already exists")
     p_init.set_defaults(func=cmd_init)
 
+    p_eval = sub.add_parser(
+        "eval",
+        help="(BRO-1411) resolver-eval: assert intent->lens routing from roles/*.eval.yaml fixtures",
+    )
+    p_eval.add_argument("--roles-dir", default="roles", help="path to roles directory (default: roles)")
+    p_eval.add_argument("--lens", default=None, help="eval only this lens (default: all *.eval.yaml)")
+    p_eval.add_argument(
+        "--active-only", dest="include_candidates", action="store_false",
+        help="only score status:active lenses (default also scores candidates so pre-promotion lenses are testable)",
+    )
+    p_eval.add_argument("--json", action="store_true", help="emit machine-readable JSON results")
+    p_eval.add_argument("--verbose", action="store_true", help="print PASS lines too, not just failures")
+    p_eval.set_defaults(func=cmd_eval, include_candidates=True)
+
     return parser
 
 
diff --git a/tests/test_eval.py b/tests/test_eval.py
new file mode 100644
index 0000000..d5fc0e4
--- /dev/null
+++ b/tests/test_eval.py
@@ -0,0 +1,218 @@
+"""Tests for the resolver-eval harness + BRO-1338 phrase-matching fix.
+
+Two layers (skillify steps 3 + 4):
+  - unit  : _kw_matches / _score_lens / _select_lenses pure functions
+  - integ : the `eval` subcommand end-to-end via subprocess over a seeded
+            roles/ dir with .eval.yaml fixtures
+"""
+from __future__ import annotations
+
+import importlib.util
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+SCRIPT = Path(__file__).parent.parent / "scripts" / "role-x.py"
+
+# --- load the hyphenated module for pure-function unit tests ---
+_spec = importlib.util.spec_from_file_location("role_x_mod", SCRIPT)
+role_x = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(role_x)
+
+
+# ──────────────────────────────────────────────────────────────────────────
+# Unit — BRO-1338: multi-word / punctuated keywords must match (skillify §3)
+# ──────────────────────────────────────────────────────────────────────────
+
+def test_kw_matches_phrase_substring():
+    # phrase keyword matches as substring of the raw prompt
+    assert role_x._kw_matches("check this out", set(), "yo check this out dude")
+    assert not role_x._kw_matches("check this out", {"check", "this", "out"}, "")
+
+
+def test_kw_matches_slash_token():
+    # "/checkit" tokenizes to "checkit" (slash stripped) — must match via prompt_lc
+    assert role_x._kw_matches("/checkit", {"checkit"}, "please run /checkit now")
+
+
+def test_kw_matches_apostrophe_phrase():
+    assert role_x._kw_matches("let's research this", set(), "ok let's research this paper")
+
+
+def test_kw_matches_clean_token_unchanged():
+    # clean single token keeps word-boundary set-membership semantics (no regression)
+    assert role_x._kw_matches("rust", {"rust", "tokio"}, "rust tokio")
+    assert not role_x._kw_matches("rust", {"trust"}, "i trust you")  # no false substring hit
+
+
+def test_score_lens_phrase_via_prompt_lc():
+    lens = {"signals": {"prompt_keywords": ["check this out", "wdyt"]}}
+    tokens = role_x._tokenize_prompt("check this out wow")
+    b = role_x._score_lens(lens, "", [], tokens, "check this out wow")
+    assert b["prompt_keywords"] == 1  # the phrase hit
+    # without prompt_lc (legacy direct call) the phrase silently scores 0
+    b0 = role_x._score_lens(lens, "", [], tokens)
+    assert b0["prompt_keywords"] == 0
+
+
+# ──────────────────────────────────────────────────────────────────────────
+# Unit — _select_lenses honors include_statuses (candidate lenses testable)
+# ──────────────────────────────────────────────────────────────────────────
+
+_META = """---
+name: _meta
+status: active
+default_mode: augment
+---
+# meta
+"""
+
+_CANDIDATE_LENS = """---
+name: probe
+status: candidate
+extends: _meta
+threshold: 1
+signals:
+  prompt_keywords:
+    - "check this out"
+    - "wdyt"
+---
+# probe lens
+"""
+
+
+def _seed_roles(tmp_path: Path) -> Path:
+    roles = tmp_path / "roles"
+    roles.mkdir()
+    (roles / "_meta.md").write_text(_META, encoding="utf-8")
+    (roles / "probe.md").write_text(_CANDIDATE_LENS, encoding="utf-8")
+    return roles
+
+
+def test_select_excludes_candidate_by_default(tmp_path):
+    roles = _seed_roles(tmp_path)
+    sel = role_x._select_lenses(roles, {"branch": "", "touched_files": []}, "check this out")
+    assert "probe" not in sel["lenses_selected"]  # candidate not scored by default
+
+
+def test_select_includes_candidate_when_asked(tmp_path):
+    roles = _seed_roles(tmp_path)
+    sel = role_x._select_lenses(
+        roles, {"branch": "", "touched_files": []}, "check this out",
+        include_statuses=("active", "candidate"),
+    )
+    assert "probe" in sel["lenses_selected"]  # phrase fix + status override → fires
+
+
+# ──────────────────────────────────────────────────────────────────────────
+# Integration — the `eval` subcommand end-to-end
+# ──────────────────────────────────────────────────────────────────────────
+
+def _run(*args: str, cwd: Path) -> tuple[int, str, str]:
+    r = subprocess.run(
+        [sys.executable, str(SCRIPT), *args],
+        capture_output=True, text=True, cwd=str(cwd),
+        env={**os.environ},
+    )
+    return r.returncode, r.stdout, r.stderr
+
+
+def _seed_eval_workspace(tmp_path: Path, fixture: str) -> Path:
+    roles = _seed_roles(tmp_path)
+    (roles / "probe.eval.yaml").write_text(fixture, encoding="utf-8")
+    return roles
+
+
+def test_eval_passes_when_routing_correct(tmp_path):
+    roles = _seed_eval_workspace(tmp_path, (
+        "lens: probe\n"
+        "should_fire:\n"
+        "  - \"check this out https://x.y\"\n"
+        "  - \"wdyt about this\"\n"
+        "should_not_fire:\n"
+        "  - \"summarize this pdf in 3 bullets\"\n"
+    ))
+    rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path)
+    assert rc == 0, f"expected pass, rc={rc}\n{out}\n{err}"
+    assert "0 failed" in out
+
+
+def test_eval_fails_on_false_negative(tmp_path):
+    # declares a should_fire intent with NO matching keyword → must FAIL (rc 1)
+    roles = _seed_eval_workspace(tmp_path, (
+        "lens: probe\n"
+        "should_fire:\n"
+        "  - \"totally unrelated request about the weather\"\n"
+    ))
+    rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path)
+    assert rc == 1, f"expected fail rc=1, got {rc}\n{out}"
+    assert "FAIL" in out
+
+
+def test_eval_fails_on_false_positive(tmp_path):
+    roles = _seed_eval_workspace(tmp_path, (
+        "lens: probe\n"
+        "should_not_fire:\n"
+        "  - \"check this out\"\n"  # this DOES fire → should_not_fire violated
+    ))
+    rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path)
+    assert rc == 1, f"expected fail rc=1, got {rc}\n{out}"
+
+
+def test_eval_json_output(tmp_path):
+    roles = _seed_eval_workspace(tmp_path, (
+        "lens: probe\n"
+        "should_fire:\n  - \"check this out\"\n"
+    ))
+    rc, out, err = _run("eval", "--roles-dir", str(roles), "--json", cwd=tmp_path)
+    assert rc == 0
+    payload = json.loads(out)
+    assert payload["passed"] == 1 and payload["failed"] == 0
+    assert payload["results"][0]["lens"] == "probe"
+
+
+def test_eval_no_fixtures_is_clean(tmp_path):
+    roles = _seed_roles(tmp_path)  # no .eval.yaml
+    rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path)
+    assert rc == 0
+
+
+def test_eval_missing_roles_dir(tmp_path):
+    rc, out, err = _run("eval", "--roles-dir", str(tmp_path / "nope"), cwd=tmp_path)
+    assert rc == 2
+
+
+# --- CodeRabbit role-x#7: malformed cases must fail loudly, not silently drop ---
+
+def test_eval_fails_on_malformed_case(tmp_path):
+    roles = _seed_eval_workspace(tmp_path, (
+        "lens: probe\n"
+        "should_fire:\n"
+        "  - 42\n"                       # int → not a valid case → must FAIL, not drop
+        "  - \"check this out\"\n"
+    ))
+    rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path)
+    assert rc == 1, f"malformed case must fail, got rc={rc}\n{out}\n{err}"
+    assert "malformed" in (out + err).lower()
+
+
+def test_eval_fails_on_non_mapping_root(tmp_path):
+    roles = _seed_roles(tmp_path)
+    (roles / "probe.eval.yaml").write_text("- a\n- b\n", encoding="utf-8")  # list, not mapping
+    rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path)
+    assert rc == 1, f"non-mapping root must fail, got rc={rc}\n{out}\n{err}"
+    assert "mapping" in (out + err).lower()
+
+
+def test_eval_json_stdout_clean_on_failure(tmp_path):
+    # a failing case with --json must keep stdout pure JSON (log lines → stderr)
+    roles = _seed_eval_workspace(tmp_path, (
+        "lens: probe\n"
+        "should_fire:\n  - \"totally unrelated weather request\"\n"  # will FAIL
+    ))
+    rc, out, err = _run("eval", "--roles-dir", str(roles), "--json", cwd=tmp_path)
+    assert rc == 1
+    payload = json.loads(out)  # must parse despite the failure
+    assert payload["failed"] == 1