Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 39 additions & 9 deletions app/agent/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,31 @@
)


def _build_runbook_section(matched_runbook: dict[str, Any] | None) -> str:
"""Inject a team-authored runbook excerpt into the alert context.

Phase 2a runbook-aware reasoning: the agent prefers actions described in
the matched runbook when proposing remediation steps.
"""
if not matched_runbook:
return ""
slug = str(matched_runbook.get("slug", "")).strip()
if not slug:
return ""
raw = str(matched_runbook.get("body", ""))
if len(raw) > 2000:
cut = raw[:2000].rfind("\n")
body = raw[: cut if cut > 0 else 2000] + "\n…(truncated)"
else:
body = raw
return (
f"\n## Relevant team runbook ({slug})\n\n"
f"{body}\n\n"
"When proposing remediation steps, prefer actions described in this runbook. "
f"Cite [{slug}] in any step you draw from it.\n"
)


def build_system_prompt(state: dict[str, Any]) -> str:
alert_source = _get_alert_source(state)
root_cause_category_instruction = _DEFAULT_ROOT_CAUSE_CATEGORY_INSTRUCTION
Expand Down Expand Up @@ -143,15 +168,20 @@ def format_alert_context(state: dict[str, Any]) -> str:
start_guidance = _build_start_guidance(alert_source, alert_name, tools_by_source)
tools_section = _format_tools_by_source(tools_by_source)

return _ALERT_CONTEXT_TEMPLATE.format(
alert_name=alert_name,
alert_source=alert_source or "unknown",
pipeline_name=pipeline_name,
severity=severity,
extra=extra,
connected_integrations=connected_integrations,
start_guidance=start_guidance,
tools_by_source=tools_section,
runbook_section = _build_runbook_section(state.get("matched_runbook"))

return (
_ALERT_CONTEXT_TEMPLATE.format(
alert_name=alert_name,
alert_source=alert_source or "unknown",
pipeline_name=pipeline_name,
severity=severity,
extra=extra,
connected_integrations=connected_integrations,
start_guidance=start_guidance,
tools_by_source=tools_section,
)
+ runbook_section
)


Expand Down
2 changes: 2 additions & 0 deletions app/cli/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from app.cli.commands.messaging import messaging
from app.cli.commands.onboard import onboard
from app.cli.commands.remote import remote
from app.cli.commands.runbook import runbook
from app.cli.commands.tests import tests
from app.cli.commands.watchdog import watchdog_command

Expand All @@ -38,6 +39,7 @@
hermes_command,
cron_command,
watchdog_command,
runbook,
debug_command,
health_command,
doctor_command,
Expand Down
58 changes: 58 additions & 0 deletions app/cli/commands/runbook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""``opensre runbook`` CLI group — manage local diagnosis runbooks."""

from __future__ import annotations

from pathlib import Path

import click

from app.runbooks.store import (
RUNBOOK_DIR,
RunbookValidationError,
load_all,
remove,
save,
)
Comment thread
greptile-apps[bot] marked this conversation as resolved.


# Root of the ``opensre runbook`` command group. The subcommands defined
# further down attach themselves through ``@runbook.command(...)``.
# The docstring is left untouched because it sits in the position click
# reads the help text from.
@click.group(name="runbook")
def runbook() -> None:
    """Manage local runbooks that ground diagnosis remediation steps."""


@runbook.command("add")
@click.argument("path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
def runbook_add(path: Path) -> None:
    """Copy a markdown runbook into ~/.opensre/runbooks/."""
    # Validation problems reported by the store are surfaced as a regular
    # click error; anything else propagates unchanged.
    try:
        saved = save(path)
    except RunbookValidationError as err:
        raise click.ClickException(str(err)) from err
    else:
        confirmation = f"✓ Saved runbook '{saved.slug}' to {saved.path}"
        click.echo(confirmation)


@runbook.command("list")
def runbook_list() -> None:
    """List runbooks currently in the local store."""
    entries = load_all()
    if entries:
        # One line per runbook; a missing service is shown as "-".
        for entry in entries:
            service_label = entry.service if entry.service else "-"
            trigger_list = ", ".join(entry.triggers)
            click.echo(f"{entry.slug} service={service_label} triggers=[{trigger_list}]")
    else:
        click.echo(f"No runbooks found in {RUNBOOK_DIR}")


@runbook.command("remove")
@click.argument("slug")
def runbook_remove(slug: str) -> None:
    """Delete a runbook by slug (the filename without .md)."""
    try:
        was_removed = remove(slug)
    except (ValueError, RunbookValidationError) as err:
        # Errors raised by the store become a user-facing click error.
        raise click.ClickException(str(err)) from err

    # A falsy result from the store means nothing was deleted for this slug.
    if not was_removed:
        raise click.ClickException(f"no runbook with slug '{slug}' in {RUNBOOK_DIR}")
    click.echo(f"✓ Removed runbook '{slug}'")
11 changes: 11 additions & 0 deletions app/cli/interactive_shell/command_registry/cli_parity.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,10 @@ def _cmd_watchdog(session: ReplSession, console: Console, args: list[str]) -> bo
return run_cli_command(console, ["watchdog", *args])


def _cmd_runbook(session: ReplSession, console: Console, args: list[str]) -> bool: # noqa: ARG001
    """Run ``runbook <args...>`` through ``run_cli_command``.

    ``args`` is appended verbatim after the ``runbook`` command name, and the
    return value is whatever ``run_cli_command`` returns. ``session`` is not
    used here (hence the ARG001 suppression).
    """
    return run_cli_command(console, ["runbook", *args])


def _cmd_debug(session: ReplSession, console: Console, args: list[str]) -> bool: # noqa: ARG001
    """Run ``debug <args...>`` through ``run_cli_command``.

    ``args`` is appended verbatim after the ``debug`` command name, and the
    return value is whatever ``run_cli_command`` returns. ``session`` is not
    used here (hence the ARG001 suppression).
    """
    return run_cli_command(console, ["debug", *args])

Expand Down Expand Up @@ -376,4 +380,11 @@ def _cmd_debug(session: ReplSession, console: Console, args: list[str]) -> bool:
_cmd_debug,
execution_tier=ExecutionTier.SAFE,
),
SlashCommand(
"/runbook",
"Manage local runbooks that ground diagnosis remediation steps.",
_cmd_runbook,
usage=("/runbook add <path>", "/runbook list", "/runbook remove <slug>"),
execution_tier=ExecutionTier.SAFE,
),
]
5 changes: 5 additions & 0 deletions app/cli/interactive_shell/command_registry/slash_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,11 @@ def _mcp(
"User asks to run a debug check or diagnostic",
anti_examples=("User asks a general debugging or troubleshooting question",),
),
"/runbook": _mcp(
"Manage local markdown runbooks that ground investigation remediation steps.",
"User wants to add, list, or remove a local runbook",
"User asks which runbooks are stored locally",
),
}


Expand Down
16 changes: 16 additions & 0 deletions app/delivery/publish_findings/formatters/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,9 @@ def format_slack_message(ctx: ReportContext) -> str:
+ "\n".join(f"• {_sanitize_for_slack(s)}" for s in remediation_steps)
+ "\n"
)
runbook_provenance = ctx.get("runbook_provenance")
if runbook_provenance and runbook_provenance.get("slug"):
remediation_block += f"_Source: runbooks/{runbook_provenance['slug']}.md_\n"

trace_steps = build_investigation_trace(ctx)
trace_block = (
Expand Down Expand Up @@ -767,6 +770,19 @@ def _add(block: "dict[str, Any] | None") -> None:
}
)
_add(_mrkdwn_section("\n".join(f"• {_sanitize_for_slack(s)}" for s in remediation_steps)))
runbook_provenance = ctx.get("runbook_provenance")
if runbook_provenance and runbook_provenance.get("slug"):
blocks.append(
{
"type": "context",
"elements": [
{
"type": "mrkdwn",
"text": f"_Source: runbooks/{runbook_provenance['slug']}.md_",
}
],
}
)

# ── Investigation Trace ──
trace_steps = build_investigation_trace(ctx)
Expand Down
15 changes: 15 additions & 0 deletions app/delivery/publish_findings/report_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ class ReportContext(TypedDict, total=False):
# Alert severity (e.g. critical, high) for channel-specific formatting (Telegram, etc.)
severity: str | None

# Runbook provenance: top-1 runbook used to ground remediation_steps
# (populated by pipeline → matched_runbook).
runbook_provenance: dict[str, str] | None

kube_pod_name: str | None
kube_container_name: str | None
kube_namespace: str | None
Expand Down Expand Up @@ -897,6 +901,16 @@ def build_report_context(state: InvestigationState) -> ReportContext:
"""
ns = _NormalizedState(state)
source_provenance = _build_source_provenance(ns.available_sources)
matched_runbook = state.get("matched_runbook")
runbook_provenance: dict[str, str] | None = (
{
"slug": str(matched_runbook.get("slug", "")),
"title": str(matched_runbook.get("title") or matched_runbook.get("slug", "")),
"path": str(matched_runbook.get("path", "")),
}
if isinstance(matched_runbook, dict) and matched_runbook.get("slug")
else None
)
catalog, source_to_id = _build_evidence_catalog(ns)
# Add provenance summaries to evidence entries when possible.
for source_name, entry_id in source_to_id.items():
Expand Down Expand Up @@ -955,6 +969,7 @@ def build_report_context(state: InvestigationState) -> ReportContext:
"datadog_site": ns.datadog_site,
"source_provenance": source_provenance,
"severity": (state.get("severity") or None),
"runbook_provenance": runbook_provenance,
# Kubernetes pod details — from Datadog evidence first, then alert annotations
"kube_pod_name": (
ns.evidence.get("datadog_pod_name")
Expand Down
33 changes: 33 additions & 0 deletions app/pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ def run_connected_investigation(state: AgentState) -> AgentState:
if state_any.get("is_noise"):
return cast(AgentState, state_any)

_merge(state_any, {"matched_runbook": _retrieve_runbook(state_any)})

_merge(state_any, ConnectedInvestigationAgent().run(state_any))
_merge(
state_any,
Expand Down Expand Up @@ -192,6 +194,37 @@ def run_chat(state: AgentState) -> AgentState:
return cast(AgentState, state_any)


def _retrieve_runbook(state: dict[str, Any]) -> dict[str, Any] | None:
"""Top-1 runbook match for the current alert, or None.

Pure read-only; never raises — a bad runbook store must not block investigation.
"""
from app.runbooks.retrieval import retrieve_matching_runbook
from app.runbooks.store import load_all as load_runbooks

try:
alert_name = state.get("alert_name", "") or ""
problem_md = state.get("problem_md", "") or ""
raw = (alert_name + " " + problem_md).lower()
keywords = [w for w in raw.split() if len(w) >= 3]
alert_json = state.get("alert_json") or {}
common_labels = (
(alert_json.get("commonLabels") or {}) if isinstance(alert_json, dict) else {}
)
service = common_labels.get("service") or (
alert_json.get("service") if isinstance(alert_json, dict) else None
)
matched = retrieve_matching_runbook(
runbooks=load_runbooks(),
keywords=keywords,
service=str(service) if service else None,
pipeline_name=state.get("pipeline_name"),
)
return matched.to_dict() if matched else None
except Exception: # noqa: BLE001
return None


def _merge(state: dict[str, Any], updates: dict[str, Any]) -> None:
if not updates:
return
Expand Down
21 changes: 21 additions & 0 deletions app/runbooks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Local runbook store + retrieval used to ground diagnosis remediation steps."""

from app.runbooks.retrieval import retrieve_matching_runbook
from app.runbooks.store import (
RUNBOOK_DIR,
Runbook,
RunbookValidationError,
load_all,
remove,
save,
)

__all__ = [
"RUNBOOK_DIR",
"Runbook",
"RunbookValidationError",
"load_all",
"remove",
"retrieve_matching_runbook",
"save",
]
71 changes: 71 additions & 0 deletions app/runbooks/retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Deterministic top-1 runbook retrieval.

Pure scoring — no disk I/O, no LLM calls. The caller is responsible for
loading the candidate runbooks (see ``app.runbooks.store.load_all``) and for
producing keyword/service inputs from the current alert state.
"""

from __future__ import annotations

from app.runbooks.store import Runbook


def _score(
runbook: Runbook,
keyword_set: frozenset[str],
service: str | None,
pipeline_name: str | None,
) -> int:
"""Score a single runbook against the current alert.

+2 when ``runbook.service`` matches the alert service or pipeline name.
+1 for each shared trigger keyword.
"""
service_score = 0
if runbook.service:
rb_service = runbook.service.lower()
if (service and rb_service == service.lower()) or (
pipeline_name and rb_service == pipeline_name.lower()
):
service_score = 2

keyword_overlap = sum(
1
for trigger in runbook.triggers
if (parts := trigger.lower().split()) and all(p in keyword_set for p in parts)
)
return service_score + keyword_overlap


def retrieve_matching_runbook(
    runbooks: list[Runbook],
    keywords: list[str],
    service: str | None,
    pipeline_name: str | None,
) -> Runbook | None:
    """Pick the single best-scoring runbook for an alert.

    Runbooks that score zero or less are never returned. Among the rest the
    highest score wins; equal scores are resolved in favour of the
    lexicographically smallest slug so the result is deterministic. Returns
    ``None`` when there are no candidates at all.
    """
    if not runbooks:
        return None

    keyword_set = frozenset(word.lower() for word in keywords if word)

    scored = []
    for entry in runbooks:
        points = _score(entry, keyword_set, service, pipeline_name)
        if points > 0:
            scored.append((points, entry))

    if not scored:
        return None

    # Negating the score makes "highest score, then smallest slug" a plain
    # minimum; ``min`` keeps the earliest entry when score and slug both tie.
    _, chosen = min(scored, key=lambda pair: (-pair[0], pair[1].slug))
    return chosen
Loading