diff --git a/recipes/auto-enrich.md b/recipes/auto-enrich.md new file mode 100644 index 000000000..196001b32 --- /dev/null +++ b/recipes/auto-enrich.md @@ -0,0 +1,74 @@ +--- +id: auto-enrich +name: Auto-Enrichment Recipe +version: 0.1.0 +description: Nightly Cal dispatch that enriches sparse/orphan brain pages with research, citations, and cross-links via claude-haiku-4-5. +category: sense +requires: [] +secrets: [] +health_checks: + - type: env_exists + name: HOME + label: Heartbeat directory reachable +setup_time: 30 min +cost_estimate: "$3-5/mo (claude-haiku-4-5, nightly cadence)" +--- + +# Auto-Enrichment Recipe + +Nightly sensor that detects sparse, orphan, or stale entity pages in your brain and (in later phases) dispatches a Cal subagent to research and enrich them. Phase 1 ships the sensor and the recipe scaffold only. Research, quality gate, merge, and cron registration arrive in Phases 2 and 3. + +## What this is for + +Your brain accumulates pages that start as stubs: a person mentioned once, a company with a name and nothing else, a concept that never got fleshed out. Auto-enrich finds those pages on a regular cadence, ranks them by how sparse they are, and (in Phase 3) feeds a research artifact back through a quality gate before merging. + +Phase 1 (this PR) delivers: + +- Recipe manifest discoverable via `gbrain integrations list` +- Sensor (`scripts/detect_sparse.py`) that ranks candidate pages via CLI composition (`gbrain list`, `gbrain get`, `gbrain backlinks`) +- Heartbeat logging to `~/.gbrain/integrations/auto-enrich/heartbeat.jsonl` +- TDD test suite for the sensor + +## Usage (Phase 1, sensor only) + +Run the sensor against your live brain: + +```bash +bash recipes/auto-enrich/scripts/run_sensor.sh +``` + +Or directly: + +```bash +python3 recipes/auto-enrich/scripts/detect_sparse.py --limit 5 +``` + +Output is JSON to stdout: a ranked list of `{slug, score, reason, page_type}` records. 
The script also appends one heartbeat line per run to `~/.gbrain/integrations/auto-enrich/heartbeat.jsonl`. + +## Ranking signal + +For each candidate page the sensor computes: + +- `body_length_penalty` from the body text length returned by `gbrain get <slug>` (target: 1500 chars) +- `link_starvation_penalty` from the inbound edge count returned by `gbrain backlinks <slug>` (target: 3 inbound links) +- `enrichment_age_penalty` from the `last_enriched` frontmatter field if present, otherwise treated as never enriched (target: 90 days) + +The score is a weighted sum (defaults: 0.4 / 0.3 / 0.3) clamped to [0, 1]. Higher scores rank first. + +## Schedule + +Phase 1 has no cron registration. The cron is installed by `scripts/register-cron.sh` in Phase 3 (nightly at 03:00 local, delivered via Hermes cron, no Telegram noise on success). + +## Config + +See `config.yaml` for ranking weights, target thresholds, and runtime paths. Defaults are intentionally conservative; tune after the first dry-run sees real candidate distributions. + +## Files + +- `auto-enrich.md` (this manifest, flat at `recipes/` per integrations discovery contract) +- `auto-enrich/README.md` (human-readable extended docs) +- `auto-enrich/config.yaml` (tunables) +- `auto-enrich/scripts/detect_sparse.py` (sensor) +- `auto-enrich/scripts/auto_enrich_lib.py` (Heartbeat + subprocess wrapper) +- `auto-enrich/scripts/run_sensor.sh` (one-shot invoker) +- `auto-enrich/tests/` (pytest suite) diff --git a/recipes/auto-enrich/README.md b/recipes/auto-enrich/README.md new file mode 100644 index 000000000..46673b038 --- /dev/null +++ b/recipes/auto-enrich/README.md @@ -0,0 +1,107 @@ +# Auto-Enrichment Recipe + +Phase 1 (sensor + scaffold). 
+ +## What this directory contains + +``` +recipes/auto-enrich.md # discoverable manifest (flat per integrations contract) +recipes/auto-enrich/ + README.md # this file + config.yaml # tunables: weights, thresholds, paths + scripts/ + auto_enrich_lib.py # Heartbeat class + gbrain subprocess wrapper + detect_sparse.py # sensor: ranks sparse/orphan/stale pages + run_sensor.sh # one-shot bash entry point + tests/ + test_detect_sparse.py # TDD coverage for the sensor +``` + +Runtime state lives at `~/.gbrain/integrations/auto-enrich/` (not committed): + +- `heartbeat.jsonl` (append-only health log, pruned to 30 days by `gbrain integrations`) +- `metrics.jsonl` (Phase 3) +- `escalations.jsonl` (Phase 3) + +## Running the sensor + +```bash +python3 recipes/auto-enrich/scripts/detect_sparse.py --limit 5 +``` + +CLI flags: + +- `--limit N`: maximum candidates to return after ranking (default: from config, 5) +- `--config PATH`: alternate config.yaml location +- `--output PATH`: write JSON to file instead of stdout +- `--types T1,T2`: comma-separated page types to scan (default: concept,entity,person,company) +- `--candidate-pool N`: how many oldest-updated pages to inspect per type before scoring (default: 50) + +Exit codes: + +- 0: success, ranked JSON on stdout (or written to `--output`) +- 1: gbrain subprocess error +- 2: config parse error + +## How the sensor works + +1. Enumerate candidates per page type using `gbrain list --type <type> --sort updated_asc --limit <N>`. The TSV columns are `slug, type, date, title`. +2. For each candidate, `gbrain get <slug>` returns the markdown. Parse the YAML frontmatter (between the first two `---` fences) with `yaml.safe_load`. The body length is `len(body_string)`, computed client-side. +3. For each candidate, `gbrain backlinks <slug>` returns a JSON edge array. The inbound link count is the array length. +4. 
Score each candidate: + + ``` + score = w_body * clamp(1 - body_length / target_body_length, 0, 1) + + w_links * clamp(1 - inbound_count / target_inbound_links, 0, 1) + + w_age * clamp(days_since(last_enriched) / max_age_days, 0, 1) + ``` + + When `last_enriched` is absent from the frontmatter (the common case until Phase 3 starts writing it), the age penalty maxes out at 1.0. + + ### Bootstrap mode + + Until any page in the brain carries `last_enriched`, every candidate hits the maximum age penalty and the term degenerates to a constant 0.3 baseline. That floor washes out body/link signal and ranks a well-developed 8K-char concept page identically to a 200-char stub. To avoid that, the sensor inspects the candidate pool first: if NO page has `last_enriched`, the run flips to `bootstrap_mode=True`, the age term is zeroed, and the body/links weights renormalize (0.4/0.7 and 0.3/0.7) so the score still spans [0, 1]. Once any page in the pool has `last_enriched`, scoring reverts to the original three-term formula. The flag is per-run and visible on each emitted candidate as `bootstrap_mode`. +5. Sort descending, truncate to `--limit`, emit JSON. + +## Discovery contract + +Integrations discovery in `src/commands/integrations.ts::loadAllRecipes` walks `recipes/*.md` (flat .md files only, no subdirectory recursion). The manifest is therefore at `recipes/auto-enrich.md`, not `recipes/auto-enrich/recipe.md`. The supporting tree under `recipes/auto-enrich/` is for code, not discovery. + +## Heartbeat contract + +`auto_enrich_lib.Heartbeat.emit(event, status, details)` appends one JSON line per call to `~/.gbrain/integrations/auto-enrich/heartbeat.jsonl`. 
Shape matches the format `gbrain integrations` consumes: + +```json +{"ts": "2026-05-20T03:00:00Z", "event": "sensor_run", "source_version": "0.1.0", "status": "ok", "details": {"candidates_scanned": 50, "candidates_returned": 5}} +``` + +`gbrain integrations show auto-enrich` and `gbrain integrations status auto-enrich` read this file and surface the most recent entry. + +## Tests + +```bash +cd recipes/auto-enrich +python3 -m pytest tests/ -v +``` + +The test suite mocks the `gbrain` subprocess boundary so it does not touch the live brain. Live verification is documented in the deliverable report. + +## Development + +Install the recipe's Python deps (scoped to this recipe; gbrain core is TypeScript): + +```bash +pip install -r recipes/auto-enrich/requirements.txt +``` + +Override the gbrain binary for local iteration with the `GBRAIN_BIN` env var. When unset, the sensor calls `gbrain` on `PATH`. + +```bash +GBRAIN_BIN=/Users/me/gbrain/bin/gbrain.js python3 recipes/auto-enrich/scripts/detect_sparse.py --limit 5 +``` + +## Phase boundaries + +- Phase 1 (this PR): sensor + recipe scaffold + heartbeat. No writes. +- Phase 2: research strategy + Cal dispatch + research artifact schema. +- Phase 3: quality gate + synthesize merge + cron registration + live smoke test. 
diff --git a/recipes/auto-enrich/config.yaml b/recipes/auto-enrich/config.yaml new file mode 100644 index 000000000..34cf8ebb5 --- /dev/null +++ b/recipes/auto-enrich/config.yaml @@ -0,0 +1,32 @@ +# Auto-Enrichment config (Phase 1 sensor only) +sensor: + max_candidates_per_run: 5 + candidate_pool_per_type: 50 + page_types: + - concept + - entity + - person + - company + target_body_length: 1500 + target_inbound_links: 3 + max_enrichment_age_days: 90 + ranking_weights: + body: 0.4 + links: 0.3 + age: 0.3 + +# Phase 2 + 3 knobs (kept here so the file is the single tunables surface) +research: + cal_subagent_timeout_seconds: 300 + max_parallel_research: 2 + cal_model: claude-haiku-4-5 + +quality_gate: + require_citations: true + reject_on_destructive_merge: true + lint_synthesized_draft: true + +runtime: + heartbeat_path: ~/.gbrain/integrations/auto-enrich/heartbeat.jsonl + metrics_path: ~/.gbrain/integrations/auto-enrich/metrics.jsonl + escalations_path: ~/.gbrain/integrations/auto-enrich/escalations.jsonl diff --git a/recipes/auto-enrich/requirements.txt b/recipes/auto-enrich/requirements.txt new file mode 100644 index 000000000..13c648d31 --- /dev/null +++ b/recipes/auto-enrich/requirements.txt @@ -0,0 +1,2 @@ +pyyaml>=6.0 +pytest>=7.0 diff --git a/recipes/auto-enrich/scripts/__init__.py b/recipes/auto-enrich/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/recipes/auto-enrich/scripts/auto_enrich_lib.py b/recipes/auto-enrich/scripts/auto_enrich_lib.py new file mode 100644 index 000000000..d3879607a --- /dev/null +++ b/recipes/auto-enrich/scripts/auto_enrich_lib.py @@ -0,0 +1,125 @@ +"""Shared helpers for the auto-enrich recipe. + +Subprocess wrapper around the gbrain CLI and a Heartbeat class that writes +JSONL lines to ~/.gbrain/integrations/auto-enrich/heartbeat.jsonl in the +shape that `gbrain integrations show / status` consumes. + +No Python client for gbrain exists; everything is subprocess-only. 
# Recipe identity. RECIPE_VERSION is stamped into every heartbeat entry as
# "source_version".
RECIPE_ID = "auto-enrich"
RECIPE_VERSION = "0.1.0"

DEFAULT_HEARTBEAT_PATH = Path.home() / ".gbrain" / "integrations" / RECIPE_ID / "heartbeat.jsonl"

# Synthetic return codes for failures where the child never produced an exit
# status of its own. They follow the usual shell convention.
_RC_TIMEOUT = 124
_RC_CANNOT_EXECUTE = 126
_RC_NOT_FOUND = 127


class GBrainCLIError(RuntimeError):
    """Raised when a gbrain subprocess fails.

    Covers a non-zero exit as well as the cases where the process could not be
    started or did not finish in time, so callers need exactly one ``except``
    clause to map any CLI failure to the sensor's exit code 1.

    Attributes:
        argv: Full argument vector, including the binary at index 0.
        returncode: Child exit status, or a synthetic 124/126/127.
        stdout: Captured standard output (may be empty).
        stderr: Captured standard error, or a description of the failure.
    """

    def __init__(self, argv: list[str], returncode: int, stdout: str, stderr: str):
        self.argv = argv
        self.returncode = returncode
        self.stdout = stdout
        self.stderr = stderr
        super().__init__(
            f"gbrain {' '.join(argv[1:])} exited {returncode}: {stderr.strip() or stdout.strip()}"
        )


def _to_text(stream: str | bytes | None) -> str:
    """Coerce captured subprocess output to ``str``.

    ``TimeoutExpired`` may carry ``None`` or raw bytes even when the process
    was started in text mode.
    """
    if stream is None:
        return ""
    if isinstance(stream, bytes):
        return stream.decode("utf-8", errors="replace")
    return stream


def run_gbrain(args: list[str], *, timeout: int = 60) -> str:
    """Invoke ``gbrain <args>`` and return its stdout.

    The binary is taken from the ``GBRAIN_BIN`` environment variable and
    defaults to ``gbrain`` on ``PATH``.

    Args:
        args: Arguments passed to the gbrain binary (without the binary name).
        timeout: Seconds to wait for the process before giving up.

    Returns:
        The captured standard output as text.

    Raises:
        GBrainCLIError: On non-zero exit, when the binary is missing (127),
            cannot be executed (126), or exceeds ``timeout`` (124).
    """
    argv = [os.environ.get("GBRAIN_BIN", "gbrain"), *args]
    try:
        result = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except FileNotFoundError as exc:
        raise GBrainCLIError(argv, _RC_NOT_FOUND, "", str(exc)) from exc
    except subprocess.TimeoutExpired as exc:
        # Without this, a hung gbrain surfaces as an unhandled traceback
        # instead of the documented "exit 1: gbrain subprocess error".
        raise GBrainCLIError(
            argv, _RC_TIMEOUT, _to_text(exc.stdout), f"timed out after {timeout}s"
        ) from exc
    except OSError as exc:
        # e.g. PermissionError when GBRAIN_BIN points at a non-executable file.
        raise GBrainCLIError(argv, _RC_CANNOT_EXECUTE, "", str(exc)) from exc
    if result.returncode != 0:
        raise GBrainCLIError(argv, result.returncode, result.stdout, result.stderr)
    return result.stdout


@dataclass
class Heartbeat:
    """Append-only JSONL heartbeat log. One line per recipe-run event.

    Each entry has the keys ``ts``, ``event``, ``source_version`` and
    ``status``, plus optional ``details`` and ``error``. The shape is intended
    to match what `gbrain integrations` reads in
    src/commands/integrations.ts::readHeartbeat (HeartbeatEntry).
    """

    # Destination file; a leading "~" is expanded when an entry is written.
    path: Path = DEFAULT_HEARTBEAT_PATH
    recipe_id: str = RECIPE_ID
    source_version: str = RECIPE_VERSION

    def emit(
        self,
        event: str,
        status: str = "ok",
        details: dict[str, Any] | None = None,
        error: str | None = None,
    ) -> None:
        """Append one JSON line, creating parent directories on first write.

        Args:
            event: Event name, e.g. ``"sensor_run"``.
            status: Outcome label, ``"ok"`` by default.
            details: Optional JSON-serializable payload; omitted when ``None``.
            error: Optional error message; omitted when ``None``.
        """
        entry: dict[str, Any] = {
            "ts": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "event": event,
            "source_version": self.source_version,
            "status": status,
        }
        if details is not None:
            entry["details"] = details
        if error is not None:
            entry["error"] = error
        # Accept str paths and "~/..." values such as the ones in config.yaml.
        target = Path(self.path).expanduser()
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("a", encoding="utf-8") as fh:
            fh.write(json.dumps(entry) + "\n")


def parse_frontmatter(markdown: str, *, slug: str | None = None) -> tuple[dict[str, Any], str]:
    """Split a page returned by `gbrain get` into (frontmatter_dict, body_str).

    The page format is:
        ---\\n<yaml>\\n---\\n<body>

    A fence is a line consisting solely of ``---``. Matching whole lines
    (rather than the first ``---`` substring) keeps frontmatter values that
    contain ``---`` intact.

    Args:
        markdown: Full page text.
        slug: Optional page slug, used only to label the warning on YAML errors.

    Returns:
        ``(frontmatter, body)``. Pages without a leading fence, without a
        closing fence, with malformed YAML, or whose frontmatter is not a
        mapping return ``({}, markdown)`` so one bad page cannot crash the
        sensor. YAML errors additionally log a warning to stderr.
    """
    lines = markdown.splitlines(keepends=True)
    if not lines or lines[0].rstrip() != "---":
        return {}, markdown
    closing = next(
        (idx for idx, line in enumerate(lines[1:], start=1) if line.rstrip() == "---"),
        None,
    )
    if closing is None:
        return {}, markdown
    fm_text = "".join(lines[1:closing])
    body = "".join(lines[closing + 1:])

    # Imported only once YAML is actually needed: keeps the module import
    # side-effect free and lets fence-less pages parse without PyYAML.
    import yaml

    try:
        data = yaml.safe_load(fm_text) or {}
    except yaml.YAMLError as exc:
        slug_tag = slug or "<unknown>"
        print(
            f"auto-enrich: parse_frontmatter YAML error on slug={slug_tag}: {exc}",
            file=sys.stderr,
        )
        return {}, markdown
    if not isinstance(data, dict):
        return {}, markdown
    return data, body.lstrip("\n")
DEFAULT_CONFIG_PATH = Path(__file__).resolve().parent.parent / "config.yaml"


@dataclass
class SensorConfig:
    """Tunables for the sparse-page sensor (the ``sensor:`` section of config.yaml)."""

    # Page types enumerated via `gbrain list --type`.
    page_types: list[str] = field(
        default_factory=lambda: ["concept", "entity", "person", "company"]
    )
    # How many oldest-updated pages to inspect per type before scoring.
    candidate_pool_per_type: int = 50
    # Targets at which the corresponding penalty reaches 0 (body, links) or 1 (age).
    target_body_length: int = 1500
    target_inbound_links: int = 3
    max_enrichment_age_days: int = 90
    # Ranking weights for the three penalties.
    w_body: float = 0.4
    w_links: float = 0.3
    w_age: float = 0.3
    # Default for --limit.
    max_candidates_per_run: int = 5

    @classmethod
    def from_yaml(cls, path: Path) -> "SensorConfig":
        """Build a config from the ``sensor`` mapping of a YAML file.

        Missing keys fall back to the dataclass defaults. Malformed values
        raise (e.g. ``ValueError``/``TypeError``); ``main()`` maps any such
        failure to exit code 2.
        """
        import yaml

        with path.open("r", encoding="utf-8") as fh:
            data = yaml.safe_load(fh) or {}
        sensor = (data.get("sensor") or {}) if isinstance(data, dict) else {}
        weights = sensor.get("ranking_weights") or {}
        # Single source of truth for fallbacks instead of repeating literals.
        defaults = cls()
        return cls(
            page_types=list(sensor.get("page_types") or defaults.page_types),
            candidate_pool_per_type=int(
                sensor.get("candidate_pool_per_type", defaults.candidate_pool_per_type)
            ),
            target_body_length=int(sensor.get("target_body_length", defaults.target_body_length)),
            target_inbound_links=int(
                sensor.get("target_inbound_links", defaults.target_inbound_links)
            ),
            max_enrichment_age_days=int(
                sensor.get("max_enrichment_age_days", defaults.max_enrichment_age_days)
            ),
            w_body=float(weights.get("body", defaults.w_body)),
            w_links=float(weights.get("links", defaults.w_links)),
            w_age=float(weights.get("age", defaults.w_age)),
            max_candidates_per_run=int(
                sensor.get("max_candidates_per_run", defaults.max_candidates_per_run)
            ),
        )


def _clamp01(x: float) -> float:
    """Clamp ``x`` to the closed interval [0, 1]."""
    if x < 0.0:
        return 0.0
    if x > 1.0:
        return 1.0
    return x


def _parse_iso(ts: str | None) -> datetime | None:
    """Parse an ISO-8601 timestamp into a timezone-aware datetime.

    A trailing ``Z`` is rewritten to ``+00:00`` and naive values are assumed
    to be UTC. Non-string inputs are stringified first, so ``date`` /
    ``datetime`` objects also parse. Returns ``None`` for empty or
    unparseable input.
    """
    if not ts:
        return None
    s = str(ts).strip()
    if s.endswith("Z"):
        s = s[:-1] + "+00:00"
    try:
        dt = datetime.fromisoformat(s)
    except ValueError:
        return None
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt


def compute_score(
    *,
    body_length: int,
    inbound_count: int,
    last_enriched: str | None,
    cfg: SensorConfig,
    now_iso: str | None = None,
    bootstrap_mode: bool = False,
) -> float:
    """Return the sparseness score of one page, in [0, 1] (higher = sparser).

    Args:
        body_length: Length of the page body in characters.
        inbound_count: Number of inbound links.
        last_enriched: ISO timestamp of the last enrichment, or ``None``.
            Missing or unparseable values get the maximum age penalty.
        cfg: Targets and weights.
        now_iso: Optional ISO timestamp used as "now" (for deterministic tests).
        bootstrap_mode: When true, the age term is dropped and the body/link
            weights are renormalized to sum to 1.

    Returns:
        The weighted sum of the penalties, clamped to [0, 1] so that custom
        weights summing to more than 1 cannot push the score out of range.
    """
    body_penalty = _clamp01(1.0 - body_length / max(1, cfg.target_body_length))
    link_penalty = _clamp01(1.0 - inbound_count / max(1, cfg.target_inbound_links))

    if bootstrap_mode:
        # No page in the corpus has last_enriched yet. The age term would be a
        # constant 1.0 across the pool, adding the same baseline to every
        # candidate and washing out body/link signal. Zero it and renormalize
        # the remaining weights so the score still spans [0, 1].
        denom = cfg.w_body + cfg.w_links
        if denom <= 0:
            return 0.0
        return _clamp01(
            (cfg.w_body / denom) * body_penalty + (cfg.w_links / denom) * link_penalty
        )

    enriched_dt = _parse_iso(last_enriched)
    if enriched_dt is None:
        age_penalty = 1.0
    else:
        now = _parse_iso(now_iso) if now_iso else datetime.now(timezone.utc)
        if now is None:
            now = datetime.now(timezone.utc)
        delta_days = max(0.0, (now - enriched_dt).total_seconds() / 86400.0)
        age_penalty = _clamp01(delta_days / max(1, cfg.max_enrichment_age_days))

    return _clamp01(
        cfg.w_body * body_penalty + cfg.w_links * link_penalty + cfg.w_age * age_penalty
    )


def _parse_list_tsv(tsv: str) -> list[dict[str, str]]:
    """Parse `gbrain list` TSV output. Columns: slug, type, date, title.

    Blank lines, rows with fewer than two columns, and rows with an empty
    slug are skipped. Missing ``date`` / ``title`` columns become ``""``.
    """
    rows: list[dict[str, str]] = []
    for line in tsv.splitlines():
        if not line.strip():
            continue
        cols = line.split("\t")
        if len(cols) < 2 or not cols[0].strip():
            continue
        rows.append(
            {
                "slug": cols[0],
                "type": cols[1],
                "date": cols[2] if len(cols) > 2 else "",
                "title": cols[3] if len(cols) > 3 else "",
            }
        )
    return rows


def _normalize_last_enriched(value: Any) -> str | None:
    """Return ``last_enriched`` as a JSON-safe ISO string, or ``None``.

    YAML parses unquoted timestamps into ``date`` / ``datetime`` objects,
    which ``json.dumps`` cannot serialize. Blank values count as missing.
    """
    if value is None:
        return None
    if isinstance(value, str):
        return value if value.strip() else None
    isoformat = getattr(value, "isoformat", None)
    if callable(isoformat):
        return isoformat()
    return str(value)


def _inspect_candidate(slug: str, page_type: str, cfg: SensorConfig) -> dict[str, Any] | None:
    """Gather raw signal for one candidate.

    Scoring happens in detect() once the corpus-level bootstrap flag is known.

    Returns:
        A JSON-serializable record (slug, page_type, body_length,
        inbound_link_count, last_enriched, reason), or ``None`` when
        `gbrain get` fails for this slug so the caller can skip it without
        aborting the run.
    """
    try:
        page = run_gbrain(["get", slug])
    except GBrainCLIError:
        return None
    fm, body = parse_frontmatter(page, slug=slug)
    body_length = len(body)

    # Best effort: a failing or malformed backlinks call counts as zero links
    # rather than dropping the candidate.
    try:
        backlinks_json = run_gbrain(["backlinks", slug])
        backlinks = json.loads(backlinks_json) if backlinks_json.strip() else []
        if not isinstance(backlinks, list):
            backlinks = []
    except (GBrainCLIError, json.JSONDecodeError):
        backlinks = []

    raw_enriched = fm.get("last_enriched") if isinstance(fm, dict) else None
    last_enriched = _normalize_last_enriched(raw_enriched)

    reason_parts = []
    if body_length < cfg.target_body_length:
        reason_parts.append(f"body={body_length}<{cfg.target_body_length}")
    if len(backlinks) < cfg.target_inbound_links:
        reason_parts.append(f"links={len(backlinks)}<{cfg.target_inbound_links}")
    if last_enriched is None:
        reason_parts.append("never_enriched")
    return {
        "slug": slug,
        "page_type": page_type,
        "body_length": body_length,
        "inbound_link_count": len(backlinks),
        "last_enriched": last_enriched,
        "reason": ", ".join(reason_parts) or "above_thresholds",
    }


def detect(*, cfg: SensorConfig, limit: int) -> list[dict[str, Any]]:
    """Enumerate candidates, score them, return top `limit` sorted desc by score.

    Two-pass: gather raw signal for the full candidate set, decide whether the
    run is in bootstrap mode (no page in the pool has `last_enriched`), then
    score with that flag. Bootstrap detection happens at the sensor level so
    every candidate in a given run uses the same scoring regime.

    Raises:
        GBrainCLIError: When `gbrain list` fails. Per-slug failures are
            skipped instead (see ``_inspect_candidate``).
    """
    raw: list[dict[str, Any]] = []
    seen: set[str] = set()
    for page_type in cfg.page_types:
        # A failing `list` propagates on purpose so main() can map it to exit
        # code 1 instead of silently returning an empty result.
        tsv = run_gbrain(
            [
                "list",
                "--type",
                page_type,
                "--sort",
                "updated_asc",
                "--limit",
                str(cfg.candidate_pool_per_type),
            ]
        )
        for row in _parse_list_tsv(tsv):
            slug = row["slug"]
            if slug in seen:
                continue
            seen.add(slug)
            entry = _inspect_candidate(slug, row.get("type", page_type), cfg)
            if entry is not None:
                raw.append(entry)

    bootstrap_mode = bool(raw) and all(r.get("last_enriched") in (None, "") for r in raw)

    candidates: list[dict[str, Any]] = []
    for entry in raw:
        score = compute_score(
            body_length=entry["body_length"],
            inbound_count=entry["inbound_link_count"],
            last_enriched=entry["last_enriched"],
            cfg=cfg,
            bootstrap_mode=bootstrap_mode,
        )
        scored = dict(entry)
        scored["score"] = round(score, 6)
        scored["bootstrap_mode"] = bootstrap_mode
        candidates.append(scored)

    # Stable sort: ties keep enumeration order.
    candidates.sort(key=lambda r: r["score"], reverse=True)
    # A negative limit would otherwise slice from the end of the list.
    return candidates[: max(0, limit)]


def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Returns 0 on success. Exits via ``SystemExit`` with code 1 on a gbrain CLI
    error (after writing an error heartbeat) and with code 2 when the config
    file cannot be parsed.
    """
    parser = argparse.ArgumentParser(description="Detect sparse / orphan / stale brain pages.")
    parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG_PATH)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--output", type=Path, default=None)
    parser.add_argument(
        "--types",
        type=str,
        default=None,
        help="Comma-separated page types (default: from config.yaml)",
    )
    parser.add_argument("--candidate-pool", type=int, default=None)
    args = parser.parse_args(argv)

    # Load config
    if args.config.exists():
        try:
            cfg = SensorConfig.from_yaml(args.config)
        except Exception as exc:  # noqa: BLE001
            print(f"config parse error: {exc}", file=sys.stderr)
            sys.exit(2)
    else:
        if args.config != DEFAULT_CONFIG_PATH:
            # An explicitly requested file that is missing is most likely a
            # typo; say so instead of silently running on defaults.
            print(
                f"config not found: {args.config}; using built-in defaults",
                file=sys.stderr,
            )
        cfg = SensorConfig()

    if args.types:
        cfg.page_types = [t.strip() for t in args.types.split(",") if t.strip()]
    if args.candidate_pool is not None:
        cfg.candidate_pool_per_type = args.candidate_pool

    limit = args.limit if args.limit is not None else cfg.max_candidates_per_run

    # Read the path through the module attribute so it can be redirected.
    hb = Heartbeat(path=auto_enrich_lib.DEFAULT_HEARTBEAT_PATH)
    try:
        results = detect(cfg=cfg, limit=limit)
    except GBrainCLIError as exc:
        hb.emit("sensor_run", status="error", error=str(exc))
        print(f"gbrain CLI error: {exc}", file=sys.stderr)
        sys.exit(1)

    payload = json.dumps(results, indent=2)
    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(payload + "\n", encoding="utf-8")
    else:
        print(payload)

    hb.emit(
        "sensor_run",
        status="ok",
        details={
            "candidates_returned": len(results),
            "page_types": cfg.page_types,
            "pool_per_type": cfg.candidate_pool_per_type,
        },
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())
# --- Fixtures ---

# Stub page: no last_enriched, very short body.
SPARSE_PAGE_MD = """---
type: person
title: Sparse Person
---

# Sparse Person

Just a stub.
"""

# Well-developed page: carries last_enriched and a long body.
FULL_PAGE_MD = (
    """---
type: person
title: Full Person
last_enriched: '2026-05-15T00:00:00Z'
---

# Full Person

"""
    + ("Detailed biography paragraph. " * 200)
)

LIST_TSV_FIVE_ROWS = "\n".join(
    [
        "people/p1\tperson\t2026-01-01\tP1",
        "people/p2\tperson\t2026-01-02\tP2",
        "people/p3\tperson\t2026-01-03\tP3",
        "people/p4\tperson\t2026-01-04\tP4",
        "people/p5\tperson\t2026-01-05\tP5",
    ]
)


def fake_gbrain_factory(pages: dict[str, str], backlinks: dict[str, list[dict]], list_output: str):
    """Build a fake run_gbrain that dispatches by subcommand.

    `list` returns ``list_output``, `get` returns the page for the slug (or
    raises GBrainCLIError for unknown slugs), `backlinks` returns the JSON
    edge list for the slug (empty when unknown).
    """

    def fake_run(args: list[str], *, timeout: int = 60) -> str:
        if args[0] == "list":
            return list_output + "\n"
        if args[0] == "get":
            slug = args[1]
            if slug not in pages:
                raise auto_enrich_lib.GBrainCLIError(
                    ["gbrain", *args], 1, "", f"unknown slug {slug}"
                )
            return pages[slug]
        if args[0] == "backlinks":
            slug = args[1]
            return json.dumps(backlinks.get(slug, []))
        raise AssertionError(f"unexpected gbrain call: {args}")

    return fake_run


def _redirect_heartbeat(monkeypatch, tmp_path):
    """Point main()'s heartbeat at a temp file so tests never touch ~/.gbrain."""
    hb_path = tmp_path / "heartbeat.jsonl"
    # main() reads the path through detect_sparse.auto_enrich_lib at call time.
    monkeypatch.setattr(detect_sparse.auto_enrich_lib, "DEFAULT_HEARTBEAT_PATH", hb_path)
    return hb_path


# --- Tests ---


def test_parse_frontmatter_extracts_yaml_block():
    fm, body = auto_enrich_lib.parse_frontmatter(SPARSE_PAGE_MD)
    assert fm["type"] == "person"
    assert fm["title"] == "Sparse Person"
    assert body.startswith("# Sparse Person")


def test_parse_frontmatter_no_fence_returns_empty_dict():
    fm, body = auto_enrich_lib.parse_frontmatter("# Just a body\n\nNo frontmatter.\n")
    assert fm == {}
    assert body.startswith("# Just a body")


def test_score_sparse_page_is_higher_than_full_page():
    cfg = detect_sparse.SensorConfig()
    sparse_score = detect_sparse.compute_score(
        body_length=20, inbound_count=0, last_enriched=None, cfg=cfg
    )
    full_score = detect_sparse.compute_score(
        body_length=5000,
        inbound_count=10,
        last_enriched="2026-05-15T00:00:00Z",
        cfg=cfg,
        now_iso="2026-05-20T00:00:00Z",
    )
    assert sparse_score > full_score
    assert sparse_score > 0.99


def test_score_is_deterministic():
    cfg = detect_sparse.SensorConfig()
    a = detect_sparse.compute_score(body_length=300, inbound_count=1, last_enriched=None, cfg=cfg)
    b = detect_sparse.compute_score(body_length=300, inbound_count=1, last_enriched=None, cfg=cfg)
    assert a == b


def test_missing_last_enriched_max_age_penalty():
    cfg = detect_sparse.SensorConfig()
    score = detect_sparse.compute_score(
        body_length=1500, inbound_count=3, last_enriched=None, cfg=cfg
    )
    # Body + links perfect, only age penalty contributes (weight 0.3, value 1.0)
    assert score == pytest.approx(0.3, abs=1e-6)


def test_sensor_flags_sparse_fixture_not_full_fixture():
    pages = {
        "people/sparse": SPARSE_PAGE_MD,
        "people/full": FULL_PAGE_MD,
    }
    backlinks = {"people/sparse": [], "people/full": [{"f": "x"}] * 5}
    list_tsv = (
        "people/sparse\tperson\t2026-01-01\tSparse Person\n"
        "people/full\tperson\t2026-01-02\tFull Person"
    )

    fake = fake_gbrain_factory(pages, backlinks, list_tsv)
    with patch.object(detect_sparse, "run_gbrain", side_effect=fake):
        results = detect_sparse.detect(
            cfg=detect_sparse.SensorConfig(page_types=["person"], candidate_pool_per_type=5),
            limit=10,
        )

    # limit=10 exceeds the pool, so both pages must be present; asserting it
    # outright keeps the ranking checks below from being silently skipped.
    slugs = [r["slug"] for r in results]
    assert sorted(slugs) == ["people/full", "people/sparse"]
    # Sparse must rank above full
    assert slugs.index("people/sparse") < slugs.index("people/full")
    # Sparse score must exceed full score
    scores = {r["slug"]: r["score"] for r in results}
    assert scores["people/sparse"] > scores["people/full"]


def test_limit_truncates_results():
    pages = {f"people/p{i}": SPARSE_PAGE_MD for i in range(1, 6)}
    backlinks = {f"people/p{i}": [] for i in range(1, 6)}

    fake = fake_gbrain_factory(pages, backlinks, LIST_TSV_FIVE_ROWS)
    with patch.object(detect_sparse, "run_gbrain", side_effect=fake):
        results = detect_sparse.detect(
            cfg=detect_sparse.SensorConfig(page_types=["person"], candidate_pool_per_type=5),
            limit=3,
        )
    assert len(results) == 3


def test_cli_subprocess_error_exits_1(tmp_path, monkeypatch, capsys):
    def boom(args, *, timeout: int = 60):
        raise auto_enrich_lib.GBrainCLIError(["gbrain", *args], 1, "", "engine down")

    # main() writes an error heartbeat before exiting; keep it out of $HOME.
    hb_path = _redirect_heartbeat(monkeypatch, tmp_path)
    monkeypatch.setattr(detect_sparse, "run_gbrain", boom)
    with pytest.raises(SystemExit) as exc:
        detect_sparse.main(["--limit", "1"])
    assert exc.value.code == 1
    assert "engine down" in capsys.readouterr().err
    entry = json.loads(hb_path.read_text(encoding="utf-8").splitlines()[-1])
    assert entry["event"] == "sensor_run"
    assert entry["status"] == "error"


# --- Bootstrap-mode scoring (Ryan Ayers external review #1) ---


def test_bootstrap_mode_zeroes_age_term_and_renormalizes():
    """When no candidate has last_enriched, age weight collapses to zero and
    body/links weights renormalize to sum to 1."""
    cfg = detect_sparse.SensorConfig()
    # Perfect body + perfect links: bootstrap score must be 0.0
    bootstrap_zero = detect_sparse.compute_score(
        body_length=10_000,
        inbound_count=10,
        last_enriched=None,
        cfg=cfg,
        bootstrap_mode=True,
    )
    assert bootstrap_zero == pytest.approx(0.0, abs=1e-6)

    # Worst body + worst links: bootstrap score must be 1.0 (full renormalization)
    bootstrap_max = detect_sparse.compute_score(
        body_length=0,
        inbound_count=0,
        last_enriched=None,
        cfg=cfg,
        bootstrap_mode=True,
    )
    assert bootstrap_max == pytest.approx(1.0, abs=1e-6)


def test_non_bootstrap_uses_original_formula():
    """bootstrap_mode=False preserves the original three-term score so existing
    callers and golden values are unchanged."""
    cfg = detect_sparse.SensorConfig()
    score = detect_sparse.compute_score(
        body_length=1500,
        inbound_count=3,
        last_enriched=None,
        cfg=cfg,
        bootstrap_mode=False,
    )
    # Body + links perfect, age 1.0 contributes 0.3
    assert score == pytest.approx(0.3, abs=1e-6)


def test_bootstrap_ranks_developed_concept_below_stub():
    """Ryan's fix verified: a long concept page with no inbound links scores
    LOWER than a true stub during bootstrap, because the age term no longer
    floors every score at 0.3."""
    cfg = detect_sparse.SensorConfig()
    developed = detect_sparse.compute_score(
        body_length=8821,
        inbound_count=0,
        last_enriched=None,
        cfg=cfg,
        bootstrap_mode=True,
    )
    stub = detect_sparse.compute_score(
        body_length=200,
        inbound_count=0,
        last_enriched=None,
        cfg=cfg,
        bootstrap_mode=True,
    )
    assert developed < stub


def test_detect_flags_bootstrap_when_no_page_has_last_enriched():
    """All sparse pages with no last_enriched -> detect() marks bootstrap_mode=True."""
    pages = {f"people/p{i}": SPARSE_PAGE_MD for i in range(1, 4)}
    backlinks = {f"people/p{i}": [] for i in range(1, 4)}
    list_tsv = "\n".join(
        f"people/p{i}\tperson\t2026-01-0{i}\tP{i}" for i in range(1, 4)
    )
    fake = fake_gbrain_factory(pages, backlinks, list_tsv)
    with patch.object(detect_sparse, "run_gbrain", side_effect=fake):
        results = detect_sparse.detect(
            cfg=detect_sparse.SensorConfig(page_types=["person"], candidate_pool_per_type=5),
            limit=10,
        )
    # Guard against a vacuous all() over an empty result list.
    assert len(results) == 3
    assert all(r["bootstrap_mode"] is True for r in results)


def test_detect_skips_bootstrap_when_any_page_has_last_enriched():
    """Mixed pool: at least one candidate has last_enriched -> normal scoring."""
    pages = {"people/sparse": SPARSE_PAGE_MD, "people/full": FULL_PAGE_MD}
    backlinks = {"people/sparse": [], "people/full": []}
    list_tsv = "people/sparse\tperson\t2026-01-01\tSparse\npeople/full\tperson\t2026-01-02\tFull"
    fake = fake_gbrain_factory(pages, backlinks, list_tsv)
    with patch.object(detect_sparse, "run_gbrain", side_effect=fake):
        results = detect_sparse.detect(
            cfg=detect_sparse.SensorConfig(page_types=["person"], candidate_pool_per_type=5),
            limit=10,
        )
    assert len(results) == 2
    assert all(r["bootstrap_mode"] is False for r in results)


# --- YAML error logging (Grant code-review S-3) ---


def test_yaml_parse_error_logs_to_stderr(capsys):
    """parse_frontmatter should log YAML errors instead of swallowing them.

    Includes the slug (when provided) so cron output points at the offending
    page.
    """
    broken = "---\nkey: : bad\n  also: : bad\n---\nbody\n"
    fm, _body = auto_enrich_lib.parse_frontmatter(broken, slug="entity/broken")
    assert fm == {}
    err = capsys.readouterr().err
    assert "entity/broken" in err
    assert "YAML" in err


def test_main_writes_output_file(tmp_path, monkeypatch):
    pages = {"people/p1": SPARSE_PAGE_MD}
    backlinks = {"people/p1": []}
    list_tsv = "people/p1\tperson\t2026-01-01\tP1"
    fake = fake_gbrain_factory(pages, backlinks, list_tsv)
    monkeypatch.setattr(detect_sparse, "run_gbrain", fake)

    # Avoid touching the real heartbeat path during tests
    hb_path = _redirect_heartbeat(monkeypatch, tmp_path)

    out_path = tmp_path / "out.json"
    detect_sparse.main(["--limit", "1", "--output", str(out_path), "--types", "person"])
    data = json.loads(out_path.read_text(encoding="utf-8"))
    assert isinstance(data, list)
    assert data[0]["slug"] == "people/p1"
    # Heartbeat written
    assert hb_path.exists()
    line = hb_path.read_text(encoding="utf-8").strip().splitlines()[-1]
    entry = json.loads(line)
    assert entry["event"] == "sensor_run"
    assert entry["status"] == "ok"