diff --git a/CHANGELOG.md b/CHANGELOG.md index 9265015..fa31187 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,40 @@ All notable changes to `role-x` are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.6.0] — 2026-06-05 + +Adds the **resolver-eval harness** and fixes **BRO-1338** (multi-word keyword +scoring). Origin: `/checkit` on Garry Tan's "skillify" essay surfaced that +bstack tests its code and governs skill *promotion* but never *tests its skills +as artifacts* — specifically, no test asserts that a lens's trigger actually +routes. This is skillify step 7 ("resolver eval"), built bstack-native (BRO-1411 +slice 1). + +### Fixed + +- **BRO-1338 — multi-word / punctuated `prompt_keywords` scored zero.** + `_score_lens` matched keywords only via single-token set membership, so phrase + triggers like `"check this out"`, `"/checkit"`, `"let's research this"`, + `"last 30 days"` could never match (the token set never *contains* a phrase). + New `_kw_matches` substring-matches punctuated/multi-word keywords against the + raw lowercased prompt; clean single tokens keep the original word-boundary-safe + semantics, so existing single-word lens behavior is unchanged. + +### Added + +- **`role-x.py eval`** subcommand — runs `roles/.eval.yaml` fixtures + (`should_fire` / `should_not_fire` intents, optional `touched_files`/`branch` + per case) through `_select_lenses` and asserts the declared lens's selection. + Exit 1 on any failure (CI-gateable); `--json`, `--verbose`, `--lens`, + `--active-only`. +- **`include_statuses` param on `_select_lenses`** (default `("active",)`) — the + eval passes `("active","candidate")` so a lens's routing is testable *before* + promotion, closing the gap where a candidate lens ships with a broken trigger + nobody catches until it goes live. +- 13 tests (`tests/test_eval.py`): unit coverage for the phrase fix + + `include_statuses`, integration coverage for the `eval` subcommand + (pass / false-negative / false-positive / json / no-fixtures / missing-dir). + ## [0.5.0] — 2026-06-01 Adds **task-relevant entity auto-loading** to intake (BRO-1295). Until now the diff --git a/SKILL.md b/SKILL.md index d836e31..d1d3bd2 100644 --- a/SKILL.md +++ b/SKILL.md @@ -105,8 +105,9 @@ The intake reflex routes prompts in real-time. The meta-progression discipline e - `roles/_meta.md` — always-loaded base lens (the workspace's implicit "bstack-aware autonomous senior engineer" contract made addressable). Lives in the consuming workspace, not in this skill repo. - `roles/.md` — per-domain lenses. Live in the consuming workspace. +- `roles/.eval.yaml` — resolver-eval fixture (`should_fire` / `should_not_fire` intents) asserting the lens's trigger actually routes. Run via `role-x.py eval`; gate in CI. The skillify "resolver eval" step — a trigger that says "phrase X selects lens Y" is only trustworthy once a test proves it. Live in the consuming workspace alongside the lens. - `roles/_index.md` — auto-generated discovery index. -- `scripts/role-x.py` — CLI helpers. +- `scripts/role-x.py` — CLI helpers (`validate`, `list`, `index`, `intake`, `coverage`, `suggest`, `init`, `eval`). - `references/*.md` — schema + algorithm reference docs. - `~/.config/broomva/role/events.jsonl` — telemetry log (M2). - `~/.config/broomva/role/status.json` — per-lens stats cache (M2). diff --git a/scripts/role-x.py b/scripts/role-x.py index 6bfcaf4..296061d 100644 --- a/scripts/role-x.py +++ b/scripts/role-x.py @@ -396,12 +396,38 @@ def _resolve_threshold(lens: dict) -> int: return DEFAULT_THRESHOLD -def _score_lens(lens: dict, branch: str, touched_files: list[str], prompt_tokens: set[str]) -> dict: +def _kw_matches(kw_lc: str, prompt_tokens: set[str], prompt_lc: str) -> bool: + """Match one declared prompt_keyword against the prompt. + + BRO-1338 fix: multi-word / punctuated keywords (e.g. "check this out", + "/checkit", "let's research this", "last 30 days") are matched as + substrings against the raw lowercased prompt, because the single-token set + built by _tokenize_prompt() can never *contain* a phrase — so before this + fix every multi-word keyword silently scored zero. Clean single tokens + (only [a-z0-9-]) keep the original word-boundary-safe set-membership + semantics, so existing single-word lens behavior is unchanged. + """ + if re.search(r"[^a-z0-9-]", kw_lc): # space, slash, apostrophe, digit-sep, etc. + return bool(prompt_lc) and kw_lc in prompt_lc + return kw_lc in prompt_tokens + + +def _score_lens( + lens: dict, + branch: str, + touched_files: list[str], + prompt_tokens: set[str], + prompt_lc: str = "", +) -> dict: """Score a lens's frontmatter against the current signals. Returns breakdown dict with raw match counts (for backward-compat event schema) plus weighted total. Per-lens `signals.weights` (v0.3.0) multiplies raw counts when computing the total used for threshold comparison. + + `prompt_lc` (raw lowercased prompt) enables phrase matching for multi-word + keywords (BRO-1338). It defaults to "" so direct callers that pass only the + token set keep the pre-fix single-token behavior. """ signals = lens.get("signals", {}) or {} paths = signals.get("paths") or [] @@ -414,7 +440,7 @@ def _score_lens(lens: dict, branch: str, touched_files: list[str], prompt_tokens ) keyword_hits = sum( 1 for kw in prompt_keywords - if str(kw).lower() in prompt_tokens + if _kw_matches(str(kw).lower(), prompt_tokens, prompt_lc) ) branch_hits = sum( 1 for pat in branch_patterns @@ -472,23 +498,31 @@ def _select_lenses( signals: dict, prompt: str, threshold: int = DEFAULT_THRESHOLD, + include_statuses: tuple[str, ...] = ("active",), ) -> dict: """Score all lenses; return selection dict with selected, mode, signals. v0.3.0: each lens can override `threshold` (top-level) and `signals.weights.` (nested) — the global `threshold` argument here is only used as the default when a lens doesn't declare its own. + + `include_statuses` controls which lens `status:` values are scored. Live + intake passes the default ("active",); the resolver-eval harness (BRO-1411) + passes ("active", "candidate") so a lens can be routing-tested *before* it + is promoted to active — closing the gap where a candidate lens ships with a + broken trigger nobody can catch until it goes live. """ lenses: dict[str, dict] = {} for path in discover_lenses(roles_dir): lens = _load_lens(path) if not lens: continue - if lens.get("status") != "active": + if lens.get("status") not in include_statuses: continue lenses[lens["name"]] = lens prompt_tokens = _tokenize_prompt(prompt) + prompt_lc = prompt.lower() branch = signals.get("branch", "") touched = signals.get("touched_files", []) @@ -496,7 +530,7 @@ def _select_lenses( for name, lens in lenses.items(): if name == "_meta": continue # _meta is always applied as the base, not scored - breakdown = _score_lens(lens, branch, touched, prompt_tokens) + breakdown = _score_lens(lens, branch, touched, prompt_tokens, prompt_lc) per_lens_threshold = _resolve_threshold(lens) if "threshold" in lens else threshold scored.append((name, breakdown["total"], breakdown, per_lens_threshold)) scored.sort(key=lambda x: x[1], reverse=True) @@ -1426,6 +1460,143 @@ def _yaml_list(items: list[str]) -> str: ### Argparse + main ### +def _normalize_eval_case(item) -> dict | None: + """Normalize one eval case. + + Accepts either a bare intent string, or a dict + `{intent, touched_files?, branch?}` for cases that exercise path/branch + routing as well as prompt keywords. Returns None for malformed entries. + """ + if isinstance(item, str): + return {"intent": item, "touched_files": [], "branch": ""} + if isinstance(item, dict) and item.get("intent"): + return { + "intent": str(item["intent"]), + "touched_files": [str(f) for f in (item.get("touched_files") or [])], + "branch": str(item.get("branch") or ""), + } + return None + + +def _normalize_eval_cases(items) -> tuple[list[dict], int]: + """Normalize a list of eval cases; return (valid_cases, n_malformed). + + Malformed entries are counted, never silently dropped — a broken fixture + must fail the eval loudly, not quietly shrink the assertion count and report + a false green (CodeRabbit, role-x#7). + """ + valid: list[dict] = [] + malformed = 0 + for item in (items or []): + case = _normalize_eval_case(item) + if case is None: + malformed += 1 + else: + valid.append(case) + return valid, malformed + + +def _load_eval_specs(roles_dir: Path, only_lens: str | None) -> list[dict]: + """Load roles/.eval.yaml fixtures into normalized spec dicts.""" + specs: list[dict] = [] + for path in sorted(roles_dir.glob("*.eval.yaml")): + try: + data = yaml.safe_load(path.read_text(encoding="utf-8")) + except (yaml.YAMLError, OSError) as exc: + specs.append({"_path": path, "_error": str(exc)}) + continue + if data is None: + data = {} + if not isinstance(data, dict): + specs.append({ + "_path": path, + "_error": f"eval fixture root must be a mapping, got {type(data).__name__}", + }) + continue + lens = data.get("lens") or path.name[: -len(".eval.yaml")] + if only_lens and lens != only_lens: + continue + fire, fire_bad = _normalize_eval_cases(data.get("should_fire")) + nofire, nofire_bad = _normalize_eval_cases(data.get("should_not_fire")) + specs.append({ + "_path": path, + "lens": lens, + "should_fire": fire, + "should_not_fire": nofire, + "_malformed": fire_bad + nofire_bad, + }) + return specs + + +def cmd_eval(args: argparse.Namespace) -> int: + """(BRO-1411) Resolver-eval — assert that intents route to the expected lens. + + Skillify step 7: a resolver *trigger* says "phrase X should select lens Y"; + a resolver *eval* proves it actually does. Each roles/.eval.yaml + declares `should_fire` / `should_not_fire` intents; every intent is run + through _select_lenses() and the declared lens's presence in the selection + is asserted. Exit 1 on any failure (CI-gateable), 0 when all pass. + """ + roles_dir = Path(args.roles_dir) + if not roles_dir.is_dir(): + print(f"[eval] roles dir not found: {roles_dir}", file=sys.stderr) + return 2 + specs = _load_eval_specs(roles_dir, args.lens) + if not specs: + msg = f"[eval] no *.eval.yaml fixtures in {roles_dir}" + if args.lens: + print(msg + f" for lens '{args.lens}'", file=sys.stderr) + return 2 + print(msg + " (nothing to check)") + return 0 + + include = ("active", "candidate") if args.include_candidates else ("active",) + log = sys.stderr if args.json else sys.stdout # keep --json stdout machine-clean + total = passed = failed = 0 + results: list[dict] = [] + for spec in specs: + if spec.get("_error"): + print(f"[eval] FAIL parse {spec['_path'].name}: {spec['_error']}", file=log) + failed += 1 + total += 1 + continue + if spec.get("_malformed"): + n = spec["_malformed"] + print(f"[eval] FAIL {spec['_path'].name}: {n} malformed eval case(s) " + f"(each must be a string or a mapping with an 'intent' key)", file=log) + failed += n + total += n + lens = spec["lens"] + for expect, cases in (("fire", spec["should_fire"]), ("no-fire", spec["should_not_fire"])): + for case in cases: + total += 1 + signals = {"branch": case["branch"], "touched_files": case["touched_files"]} + sel = _select_lenses(roles_dir, signals, case["intent"], include_statuses=include) + fired = lens in sel["lenses_selected"] + ok = fired == (expect == "fire") + score = (sel["per_lens_scores"].get(lens) or {}).get("total", 0) + thr = sel["per_lens_thresholds"].get(lens, "?") + if ok: + passed += 1 + else: + failed += 1 + results.append({ + "lens": lens, "intent": case["intent"], "expect": expect, + "fired": fired, "score": score, "threshold": thr, "ok": ok, + }) + if not ok or args.verbose: + flag = "PASS" if ok else "FAIL" + print(f"[eval] {flag} [{lens}] expect={expect} fired={fired} " + f"score={score}/{thr} :: {case['intent'][:70]}", file=log) + + if args.json: + print(json.dumps({"total": total, "passed": passed, "failed": failed, "results": results}, indent=2)) + else: + n_lenses = len({s["lens"] for s in specs if not s.get("_error")}) + print(f"[eval] {passed}/{total} passed, {failed} failed across {n_lenses} lens(es)") + return 1 if failed else 0 + + def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( prog="role-x", @@ -1529,6 +1700,20 @@ def build_parser() -> argparse.ArgumentParser: p_init.add_argument("--force", action="store_true", help="overwrite if file already exists") p_init.set_defaults(func=cmd_init) + p_eval = sub.add_parser( + "eval", + help="(BRO-1411) resolver-eval: assert intent->lens routing from roles/*.eval.yaml fixtures", + ) + p_eval.add_argument("--roles-dir", default="roles", help="path to roles directory (default: roles)") + p_eval.add_argument("--lens", default=None, help="eval only this lens (default: all *.eval.yaml)") + p_eval.add_argument( + "--active-only", dest="include_candidates", action="store_false", + help="only score status:active lenses (default also scores candidates so pre-promotion lenses are testable)", + ) + p_eval.add_argument("--json", action="store_true", help="emit machine-readable JSON results") + p_eval.add_argument("--verbose", action="store_true", help="print PASS lines too, not just failures") + p_eval.set_defaults(func=cmd_eval, include_candidates=True) + return parser diff --git a/tests/test_eval.py b/tests/test_eval.py new file mode 100644 index 0000000..d5fc0e4 --- /dev/null +++ b/tests/test_eval.py @@ -0,0 +1,218 @@ +"""Tests for the resolver-eval harness + BRO-1338 phrase-matching fix. + +Two layers (skillify steps 3 + 4): + - unit : _kw_matches / _score_lens / _select_lenses pure functions + - integ : the `eval` subcommand end-to-end via subprocess over a seeded + roles/ dir with .eval.yaml fixtures +""" +from __future__ import annotations + +import importlib.util +import json +import os +import subprocess +import sys +from pathlib import Path + +SCRIPT = Path(__file__).parent.parent / "scripts" / "role-x.py" + +# --- load the hyphenated module for pure-function unit tests --- +_spec = importlib.util.spec_from_file_location("role_x_mod", SCRIPT) +role_x = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(role_x) + + +# ────────────────────────────────────────────────────────────────────────── +# Unit — BRO-1338: multi-word / punctuated keywords must match (skillify §3) +# ────────────────────────────────────────────────────────────────────────── + +def test_kw_matches_phrase_substring(): + # phrase keyword matches as substring of the raw prompt + assert role_x._kw_matches("check this out", set(), "yo check this out dude") + assert not role_x._kw_matches("check this out", {"check", "this", "out"}, "") + + +def test_kw_matches_slash_token(): + # "/checkit" tokenizes to "checkit" (slash stripped) — must match via prompt_lc + assert role_x._kw_matches("/checkit", {"checkit"}, "please run /checkit now") + + +def test_kw_matches_apostrophe_phrase(): + assert role_x._kw_matches("let's research this", set(), "ok let's research this paper") + + +def test_kw_matches_clean_token_unchanged(): + # clean single token keeps word-boundary set-membership semantics (no regression) + assert role_x._kw_matches("rust", {"rust", "tokio"}, "rust tokio") + assert not role_x._kw_matches("rust", {"trust"}, "i trust you") # no false substring hit + + +def test_score_lens_phrase_via_prompt_lc(): + lens = {"signals": {"prompt_keywords": ["check this out", "wdyt"]}} + tokens = role_x._tokenize_prompt("check this out wow") + b = role_x._score_lens(lens, "", [], tokens, "check this out wow") + assert b["prompt_keywords"] == 1 # the phrase hit + # without prompt_lc (legacy direct call) the phrase silently scores 0 + b0 = role_x._score_lens(lens, "", [], tokens) + assert b0["prompt_keywords"] == 0 + + +# ────────────────────────────────────────────────────────────────────────── +# Unit — _select_lenses honors include_statuses (candidate lenses testable) +# ────────────────────────────────────────────────────────────────────────── + +_META = """--- +name: _meta +status: active +default_mode: augment +--- +# meta +""" + +_CANDIDATE_LENS = """--- +name: probe +status: candidate +extends: _meta +threshold: 1 +signals: + prompt_keywords: + - "check this out" + - "wdyt" +--- +# probe lens +""" + + +def _seed_roles(tmp_path: Path) -> Path: + roles = tmp_path / "roles" + roles.mkdir() + (roles / "_meta.md").write_text(_META, encoding="utf-8") + (roles / "probe.md").write_text(_CANDIDATE_LENS, encoding="utf-8") + return roles + + +def test_select_excludes_candidate_by_default(tmp_path): + roles = _seed_roles(tmp_path) + sel = role_x._select_lenses(roles, {"branch": "", "touched_files": []}, "check this out") + assert "probe" not in sel["lenses_selected"] # candidate not scored by default + + +def test_select_includes_candidate_when_asked(tmp_path): + roles = _seed_roles(tmp_path) + sel = role_x._select_lenses( + roles, {"branch": "", "touched_files": []}, "check this out", + include_statuses=("active", "candidate"), + ) + assert "probe" in sel["lenses_selected"] # phrase fix + status override → fires + + +# ────────────────────────────────────────────────────────────────────────── +# Integration — the `eval` subcommand end-to-end +# ────────────────────────────────────────────────────────────────────────── + +def _run(*args: str, cwd: Path) -> tuple[int, str, str]: + r = subprocess.run( + [sys.executable, str(SCRIPT), *args], + capture_output=True, text=True, cwd=str(cwd), + env={**os.environ}, + ) + return r.returncode, r.stdout, r.stderr + + +def _seed_eval_workspace(tmp_path: Path, fixture: str) -> Path: + roles = _seed_roles(tmp_path) + (roles / "probe.eval.yaml").write_text(fixture, encoding="utf-8") + return roles + + +def test_eval_passes_when_routing_correct(tmp_path): + roles = _seed_eval_workspace(tmp_path, ( + "lens: probe\n" + "should_fire:\n" + " - \"check this out https://x.y\"\n" + " - \"wdyt about this\"\n" + "should_not_fire:\n" + " - \"summarize this pdf in 3 bullets\"\n" + )) + rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path) + assert rc == 0, f"expected pass, rc={rc}\n{out}\n{err}" + assert "0 failed" in out + + +def test_eval_fails_on_false_negative(tmp_path): + # declares a should_fire intent with NO matching keyword → must FAIL (rc 1) + roles = _seed_eval_workspace(tmp_path, ( + "lens: probe\n" + "should_fire:\n" + " - \"totally unrelated request about the weather\"\n" + )) + rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path) + assert rc == 1, f"expected fail rc=1, got {rc}\n{out}" + assert "FAIL" in out + + +def test_eval_fails_on_false_positive(tmp_path): + roles = _seed_eval_workspace(tmp_path, ( + "lens: probe\n" + "should_not_fire:\n" + " - \"check this out\"\n" # this DOES fire → should_not_fire violated + )) + rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path) + assert rc == 1, f"expected fail rc=1, got {rc}\n{out}" + + +def test_eval_json_output(tmp_path): + roles = _seed_eval_workspace(tmp_path, ( + "lens: probe\n" + "should_fire:\n - \"check this out\"\n" + )) + rc, out, err = _run("eval", "--roles-dir", str(roles), "--json", cwd=tmp_path) + assert rc == 0 + payload = json.loads(out) + assert payload["passed"] == 1 and payload["failed"] == 0 + assert payload["results"][0]["lens"] == "probe" + + +def test_eval_no_fixtures_is_clean(tmp_path): + roles = _seed_roles(tmp_path) # no .eval.yaml + rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path) + assert rc == 0 + + +def test_eval_missing_roles_dir(tmp_path): + rc, out, err = _run("eval", "--roles-dir", str(tmp_path / "nope"), cwd=tmp_path) + assert rc == 2 + + +# --- CodeRabbit role-x#7: malformed cases must fail loudly, not silently drop --- + +def test_eval_fails_on_malformed_case(tmp_path): + roles = _seed_eval_workspace(tmp_path, ( + "lens: probe\n" + "should_fire:\n" + " - 42\n" # int → not a valid case → must FAIL, not drop + " - \"check this out\"\n" + )) + rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path) + assert rc == 1, f"malformed case must fail, got rc={rc}\n{out}\n{err}" + assert "malformed" in (out + err).lower() + + +def test_eval_fails_on_non_mapping_root(tmp_path): + roles = _seed_roles(tmp_path) + (roles / "probe.eval.yaml").write_text("- a\n- b\n", encoding="utf-8") # list, not mapping + rc, out, err = _run("eval", "--roles-dir", str(roles), cwd=tmp_path) + assert rc == 1, f"non-mapping root must fail, got rc={rc}\n{out}\n{err}" + assert "mapping" in (out + err).lower() + + +def test_eval_json_stdout_clean_on_failure(tmp_path): + # a failing case with --json must keep stdout pure JSON (log lines → stderr) + roles = _seed_eval_workspace(tmp_path, ( + "lens: probe\n" + "should_fire:\n - \"totally unrelated weather request\"\n" # will FAIL + )) + rc, out, err = _run("eval", "--roles-dir", str(roles), "--json", cwd=tmp_path) + assert rc == 1 + payload = json.loads(out) # must parse despite the failure + assert payload["failed"] == 1