Tracer-Cloud · devankitjuneja · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/app/agent/prompt.py b/app/agent/prompt.py
@@ -101,6 +101,31 @@
 )
 
 
+def _build_runbook_section(matched_runbook: dict[str, Any] | None) -> str:
+    """Render the matched runbook as a markdown section for the alert context.
+
+    Returns "" when there is no runbook or its slug is blank. A missing or
+    ``None`` slug/body counts as empty, and bodies over 2000 characters are
+    cut at the last newline inside that window and marked as truncated.
+    """
+    if not matched_runbook:
+        return ""
+    # ``or ""`` keeps an explicit None from being rendered as the text "None".
+    slug = str(matched_runbook.get("slug") or "").strip()
+    if not slug:
+        return ""
+    body = str(matched_runbook.get("body") or "")
+    if len(body) > 2000:
+        cut = body[:2000].rfind("\n")
+        body = body[: cut if cut > 0 else 2000] + "\n…(truncated)"
+    return (
+        f"\n## Relevant team runbook ({slug})\n\n"
+        f"{body}\n\n"
+        "When proposing remediation steps, prefer actions described in this runbook. "
+        f"Cite [{slug}] in any step you draw from it.\n"
+    )
+
+
 def build_system_prompt(state: dict[str, Any]) -> str:
     alert_source = _get_alert_source(state)
     root_cause_category_instruction = _DEFAULT_ROOT_CAUSE_CATEGORY_INSTRUCTION
@@ -143,15 +168,20 @@ def format_alert_context(state: dict[str, Any]) -> str:
     start_guidance = _build_start_guidance(alert_source, alert_name, tools_by_source)
     tools_section = _format_tools_by_source(tools_by_source)
 
-    return _ALERT_CONTEXT_TEMPLATE.format(
-        alert_name=alert_name,
-        alert_source=alert_source or "unknown",
-        pipeline_name=pipeline_name,
-        severity=severity,
-        extra=extra,
-        connected_integrations=connected_integrations,
-        start_guidance=start_guidance,
-        tools_by_source=tools_section,
+    runbook_section = _build_runbook_section(state.get("matched_runbook"))
+
+    return (
+        _ALERT_CONTEXT_TEMPLATE.format(
+            alert_name=alert_name,
+            alert_source=alert_source or "unknown",
+            pipeline_name=pipeline_name,
+            severity=severity,
+            extra=extra,
+            connected_integrations=connected_integrations,
+            start_guidance=start_guidance,
+            tools_by_source=tools_section,
+        )
+        + runbook_section
     )
 
 

diff --git a/app/cli/commands/__init__.py b/app/cli/commands/__init__.py
@@ -22,6 +22,7 @@
 from app.cli.commands.messaging import messaging
 from app.cli.commands.onboard import onboard
 from app.cli.commands.remote import remote
+from app.cli.commands.runbook import runbook
 from app.cli.commands.tests import tests
 from app.cli.commands.watchdog import watchdog_command
 
@@ -38,6 +39,7 @@
     hermes_command,
     cron_command,
     watchdog_command,
+    runbook,
     debug_command,
     health_command,
     doctor_command,

diff --git a/app/cli/commands/runbook.py b/app/cli/commands/runbook.py
@@ -0,0 +1,58 @@
+"""``opensre runbook`` CLI group — manage local diagnosis runbooks."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import click
+
+from app.runbooks.store import (
+    RUNBOOK_DIR,
+    RunbookValidationError,
+    load_all,
+    remove,
+    save,
+)
+
+
+@click.group(name="runbook")
+def runbook() -> None:  # no body: only anchors the add/list/remove subcommands
+    """Manage local runbooks that ground diagnosis remediation steps."""
+
+
+@runbook.command("add")
+@click.argument("path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+def runbook_add(path: Path) -> None:
+    """Copy a markdown runbook into ~/.opensre/runbooks/."""
+    try:
+        stored = save(path)  # save() comes from app.runbooks.store
+    except RunbookValidationError as exc:
+        raise click.ClickException(str(exc)) from exc  # report as a CLI error, keep the cause
+    click.echo(f"✓ Saved runbook '{stored.slug}' to {stored.path}")
+
+
+@runbook.command("list")
+def runbook_list() -> None:
+    """List runbooks currently in the local store."""
+    stored = load_all()
+    if not stored:
+        click.echo(f"No runbooks found in {RUNBOOK_DIR}")
+        return
+    # One line per runbook; "-" stands in for a runbook with no service set.
+    for entry in stored:
+        joined = ", ".join(entry.triggers)
+        click.echo(f"{entry.slug}  service={entry.service or '-'}  triggers=[{joined}]")
+
+
+@runbook.command("remove")
+@click.argument("slug")
+def runbook_remove(slug: str) -> None:
+    """Delete a runbook by slug (the filename without .md)."""
+    try:
+        removed = remove(slug)
+    except (ValueError, RunbookValidationError) as err:
+        raise click.ClickException(str(err)) from err
+    # A falsy result from remove() is reported as a CLI error, not a success.
+    if not removed:
+        raise click.ClickException(f"no runbook with slug '{slug}' in {RUNBOOK_DIR}")
+    click.echo(f"✓ Removed runbook '{slug}'")
diff --git a/app/cli/interactive_shell/command_registry/cli_parity.py b/app/cli/interactive_shell/command_registry/cli_parity.py
@@ -272,6 +272,10 @@ def _cmd_watchdog(session: ReplSession, console: Console, args: list[str]) -> bo
     return run_cli_command(console, ["watchdog", *args])
 
 
+def _cmd_runbook(session: ReplSession, console: Console, args: list[str]) -> bool:  # noqa: ARG001
+    return run_cli_command(console, ["runbook", *args])  # mirrors _cmd_watchdog / _cmd_debug
+
+
 def _cmd_debug(session: ReplSession, console: Console, args: list[str]) -> bool:  # noqa: ARG001
     return run_cli_command(console, ["debug", *args])
 
@@ -376,4 +380,11 @@ def _cmd_debug(session: ReplSession, console: Console, args: list[str]) -> bool:
         _cmd_debug,
         execution_tier=ExecutionTier.SAFE,
     ),
+    SlashCommand(
+        "/runbook",
+        "Manage local runbooks that ground diagnosis remediation steps.",
+        _cmd_runbook,
+        usage=("/runbook add <path>", "/runbook list", "/runbook remove <slug>"),
+        execution_tier=ExecutionTier.SAFE,
+    ),
 ]
diff --git a/app/cli/interactive_shell/command_registry/slash_catalog.py b/app/cli/interactive_shell/command_registry/slash_catalog.py
@@ -284,6 +284,11 @@ def _mcp(
         "User asks to run a debug check or diagnostic",
         anti_examples=("User asks a general debugging or troubleshooting question",),
     ),
+    "/runbook": _mcp(
+        "Manage local markdown runbooks that ground investigation remediation steps.",
+        "User wants to add, list, or remove a local runbook",
+        "User asks which runbooks are stored locally",
+    ),
 }
 
 

diff --git a/app/delivery/publish_findings/formatters/report.py b/app/delivery/publish_findings/formatters/report.py
@@ -502,6 +502,9 @@ def format_slack_message(ctx: ReportContext) -> str:
             + "\n".join(f"• {_sanitize_for_slack(s)}" for s in remediation_steps)
             + "\n"
         )
+        runbook_provenance = ctx.get("runbook_provenance")
+        if runbook_provenance and runbook_provenance.get("slug"):
+            remediation_block += f"_Source: runbooks/{runbook_provenance['slug']}.md_\n"
 
     trace_steps = build_investigation_trace(ctx)
     trace_block = (
@@ -767,6 +770,19 @@ def _add(block: "dict[str, Any] | None") -> None:
             }
         )
         _add(_mrkdwn_section("\n".join(f"• {_sanitize_for_slack(s)}" for s in remediation_steps)))
+        runbook_provenance = ctx.get("runbook_provenance")
+        if runbook_provenance and runbook_provenance.get("slug"):
+            blocks.append(
+                {
+                    "type": "context",
+                    "elements": [
+                        {
+                            "type": "mrkdwn",
+                            "text": f"_Source: runbooks/{runbook_provenance['slug']}.md_",
+                        }
+                    ],
+                }
+            )
 
     # ── Investigation Trace ──
     trace_steps = build_investigation_trace(ctx)

diff --git a/app/delivery/publish_findings/report_context.py b/app/delivery/publish_findings/report_context.py
@@ -95,6 +95,10 @@ class ReportContext(TypedDict, total=False):
     # Alert severity (e.g. critical, high) for channel-specific formatting (Telegram, etc.)
     severity: str | None
 
+    # Runbook provenance: top-1 runbook used to ground remediation_steps
+    # (populated by pipeline → matched_runbook).
+    runbook_provenance: dict[str, str] | None
+
     kube_pod_name: str | None
     kube_container_name: str | None
     kube_namespace: str | None
@@ -897,6 +901,16 @@ def build_report_context(state: InvestigationState) -> ReportContext:
     """
     ns = _NormalizedState(state)
     source_provenance = _build_source_provenance(ns.available_sources)
+    matched_runbook = state.get("matched_runbook")
+    runbook_provenance: dict[str, str] | None = (
+        {
+            "slug": str(matched_runbook.get("slug", "")),
+            "title": str(matched_runbook.get("title") or matched_runbook.get("slug", "")),
+            "path": str(matched_runbook.get("path", "")),
+        }
+        if isinstance(matched_runbook, dict) and matched_runbook.get("slug")
+        else None
+    )
     catalog, source_to_id = _build_evidence_catalog(ns)
     # Add provenance summaries to evidence entries when possible.
     for source_name, entry_id in source_to_id.items():
@@ -955,6 +969,7 @@ def build_report_context(state: InvestigationState) -> ReportContext:
         "datadog_site": ns.datadog_site,
         "source_provenance": source_provenance,
         "severity": (state.get("severity") or None),
+        "runbook_provenance": runbook_provenance,
         # Kubernetes pod details — from Datadog evidence first, then alert annotations
         "kube_pod_name": (
             ns.evidence.get("datadog_pod_name")

diff --git a/app/pipeline/pipeline.py b/app/pipeline/pipeline.py
@@ -155,6 +155,8 @@ def run_connected_investigation(state: AgentState) -> AgentState:
         if state_any.get("is_noise"):
             return cast(AgentState, state_any)
 
+        _merge(state_any, {"matched_runbook": _retrieve_runbook(state_any)})
+
         _merge(state_any, ConnectedInvestigationAgent().run(state_any))
         _merge(
             state_any,
@@ -192,6 +194,37 @@ def run_chat(state: AgentState) -> AgentState:
     return cast(AgentState, state_any)
 
 
+def _retrieve_runbook(state: dict[str, Any]) -> dict[str, Any] | None:
+    """Top-1 runbook match for the current alert, or None.
+
+    Never raises: a bad runbook store (even one that fails to import) is logged
+    and treated as "no match" so it cannot block the investigation.
+    """
+    import logging
+    import string
+
+    try:
+        from app.runbooks.retrieval import retrieve_matching_runbook
+        from app.runbooks.store import load_all as load_runbooks
+
+        text = f"{state.get('alert_name') or ''} {state.get('problem_md') or ''}".lower()
+        # Keep raw tokens and add punctuation-stripped forms so "cpu," matches "cpu".
+        tokens = [t for raw in text.split() for t in (raw, raw.strip(string.punctuation))]
+        alert_json = state.get("alert_json")
+        alert = alert_json if isinstance(alert_json, dict) else {}
+        service = (alert.get("commonLabels") or {}).get("service") or alert.get("service")
+        matched = retrieve_matching_runbook(
+            runbooks=load_runbooks(),
+            keywords=[t for t in tokens if len(t) >= 3],
+            service=str(service) if service else None,
+            pipeline_name=state.get("pipeline_name"),
+        )
+        return matched.to_dict() if matched else None
+    except Exception:  # noqa: BLE001
+        logging.getLogger(__name__).warning("runbook retrieval failed", exc_info=True)
+        return None
+
+
 def _merge(state: dict[str, Any], updates: dict[str, Any]) -> None:
     if not updates:
         return

diff --git a/app/runbooks/__init__.py b/app/runbooks/__init__.py
@@ -0,0 +1,21 @@
+"""Local runbook store + retrieval used to ground diagnosis remediation steps."""
+
+from app.runbooks.retrieval import retrieve_matching_runbook
+from app.runbooks.store import (
+    RUNBOOK_DIR,
+    Runbook,
+    RunbookValidationError,
+    load_all,
+    remove,
+    save,
+)
+
+__all__ = [
+    "RUNBOOK_DIR",
+    "Runbook",
+    "RunbookValidationError",
+    "load_all",
+    "remove",
+    "retrieve_matching_runbook",
+    "save",
+]
diff --git a/app/runbooks/retrieval.py b/app/runbooks/retrieval.py
@@ -0,0 +1,71 @@
+"""Deterministic top-1 runbook retrieval.
+
+Pure scoring — no disk I/O, no LLM calls. The caller is responsible for
+loading the candidate runbooks (see ``app.runbooks.store.load_all``) and for
+producing keyword/service inputs from the current alert state.
+"""
+
+from __future__ import annotations
+
+from app.runbooks.store import Runbook
+
+
+def _score(
+    runbook: Runbook,
+    keyword_set: frozenset[str],
+    service: str | None,
+    pipeline_name: str | None,
+) -> int:
+    """Return the match score of one runbook for the current alert.
+
+    A case-insensitive match between ``runbook.service`` and the alert service
+    or pipeline name is worth 2; every trigger whose words all appear in
+    ``keyword_set`` adds 1. Blank triggers contribute nothing.
+    """
+    total = 0
+    if runbook.service:
+        wanted = runbook.service.lower()
+        candidates = (service, pipeline_name)
+        if any(name and name.lower() == wanted for name in candidates):
+            total = 2
+
+    for trigger in runbook.triggers:
+        words = trigger.lower().split()
+        # Multi-word triggers only count when every word is present.
+        if words and keyword_set.issuperset(words):
+            total += 1
+    return total
+
+
+def retrieve_matching_runbook(
+    runbooks: list[Runbook],
+    keywords: list[str],
+    service: str | None,
+    pipeline_name: str | None,
+) -> Runbook | None:
+    """Pick the single best-scoring runbook for an alert.
+
+    Args:
+        runbooks: Candidates, already loaded by the caller.
+        keywords: Alert keywords; lower-cased here, empty entries dropped.
+        service: Alert service name, if known.
+        pipeline_name: Alert pipeline name, if known.
+
+    Returns:
+        The runbook with the highest positive score, or ``None`` when the
+        candidate list is empty or nothing scores above zero. Equal scores
+        go to the lexicographically smallest slug; if slugs also collide the
+        earlier candidate wins.
+    """
+    if not runbooks:
+        return None
+
+    lookup = frozenset(word.lower() for word in keywords if word)
+    scored = [(_score(rb, lookup, service, pipeline_name), rb) for rb in runbooks]
+    positive = [(points, rb) for points, rb in scored if points > 0]
+    if not positive:
+        return None
+
+    # min() over (-score, slug) ranks by score descending, then slug ascending.
+    _, best = min(positive, key=lambda pair: (-pair[0], pair[1].slug))
+    return best