diff --git a/app/agent/investigation.py b/app/agent/investigation.py index cd1ddfc11..e40bf8745 100644 --- a/app/agent/investigation.py +++ b/app/agent/investigation.py @@ -10,7 +10,7 @@ from typing import Any from app.agent.prompt import build_system_prompt, format_alert_context -from app.agent.result import InvestigationResult, parse_diagnosis +from app.agent.result import InvestigationResult, check_sufficiency, parse_diagnosis from app.cli.support.output import debug_print, get_tracker from app.constants.investigation import MAX_INVESTIGATION_LOOPS from app.services.agent_llm_client import ToolCall, get_agent_llm @@ -201,6 +201,9 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None: "root_cause": error_msg, "validity_score": 0.0, "root_cause_category": "Configuration Error", + "confidence_band": "low", + "ranked_hypotheses": [], + "missing_evidence": [], }, ) updates = { @@ -211,6 +214,9 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None: "non_validated_claims": [], "remediation_steps": remediation_steps, "validity_score": 0.0, + "confidence_band": "low", + "ranked_hypotheses": [], + "missing_evidence": [], "investigation_recommendations": [], "evidence": evidence, "evidence_entries": [e.model_dump() for e in evidence_entries], @@ -272,12 +278,23 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None: result.evidence_entries = [e.model_dump() for e in evidence_entries] result.agent_messages = messages + if check_sufficiency(result): + if result.root_cause.startswith("Most likely:"): + result.root_cause = result.root_cause[len("Most likely:") :].lstrip() + else: + if not result.root_cause.startswith("Most likely"): + result.root_cause = f"Most likely: {result.root_cause}" + # Override band set by classify_confidence_band in result.py: thin evidence means LOW + # regardless of LLM-reported score (a high score with zero claims is still insufficient). 
+ result.confidence_band = "low" + _emit( "agent_end", { "root_cause": result.root_cause, "validity_score": result.validity_score, "root_cause_category": result.root_cause_category, + "confidence_band": result.confidence_band, }, ) @@ -605,6 +622,9 @@ def _result_to_state(result: InvestigationResult) -> dict[str, Any]: "non_validated_claims": result.non_validated_claims, "remediation_steps": result.remediation_steps, "validity_score": result.validity_score, + "confidence_band": result.confidence_band, + "ranked_hypotheses": result.ranked_hypotheses, + "missing_evidence": result.missing_evidence, "investigation_recommendations": result.investigation_recommendations, "evidence": result.evidence, "evidence_entries": result.evidence_entries, diff --git a/app/agent/prompt.py b/app/agent/prompt.py index a86dbdcfb..1e1a18d2f 100644 --- a/app/agent/prompt.py +++ b/app/agent/prompt.py @@ -36,6 +36,10 @@ - **Non-validated claims**: Hypotheses you could not confirm - **Remediation steps**: Ordered, concrete actions to fix the issue - **Validity score**: 0.0–1.0 reflecting your confidence based on evidence quality +- **Confidence band**: `high` (strong evidence from multiple sources), `medium` (partial evidence, some gaps), or `low` (thin or conflicting evidence) +- **Ranked hypotheses**: If confidence is medium or low, list alternative explanations in order of likelihood (most likely first) +- **Missing evidence**: List specific data sources or queries that would confirm or refute the diagnosis but were unavailable +- If evidence is thin or conflicting, begin your root cause statement with "Most likely: " to signal uncertainty """ _ALERT_CONTEXT_TEMPLATE = """## Alert diff --git a/app/agent/result.py b/app/agent/result.py index 6aa6239b4..c22f3154d 100644 --- a/app/agent/result.py +++ b/app/agent/result.py @@ -4,7 +4,7 @@ import logging from dataclasses import dataclass, field -from typing import Any, TypedDict, cast +from typing import Any, Literal, TypedDict, cast from 
def classify_confidence_band(score: float) -> Literal["high", "medium", "low"]:
    """Map a 0.0-1.0 validity score onto a coarse confidence band.

    Scores of 0.75 and above are "high", scores from 0.40 up to (but not
    including) 0.75 are "medium", and everything else is "low". A NaN score
    fails both comparisons and therefore lands in "low".
    """
    if score >= 0.40:
        return "high" if score >= 0.75 else "medium"
    return "low"


def check_sufficiency(result: InvestigationResult) -> bool:
    """Decide whether the evidence behind *result* supports a definitive diagnosis.

    Returns True when the root cause may be stated without hedging:

    * the category is "healthy" or "unknown" (always treated as sufficient), or
    * the score is at least 0.75 and there is at least one validated claim, or
    * the score is at least 0.40 and there are at least two validated claims.

    Otherwise returns False.
    """
    if result.root_cause_category in {"healthy", "unknown"}:
        return True

    score = result.validity_score
    if score >= 0.40:
        # A high score needs a single validated claim; a medium score needs two.
        required_claims = 1 if score >= 0.75 else 2
        return len(result.validated_claims) >= required_claims
    return False
c["evidence_sources"]} if c.get("evidence_sources") else {} + ), + } + for c in schema["validated_claims"] + if c.get("claim") + ], non_validated_claims=_to_claim_dicts(schema["non_validated_claims"], "not_validated"), remediation_steps=schema["remediation_steps"], validity_score=schema["validity_score"], + confidence_band=classify_confidence_band(schema["validity_score"]), + ranked_hypotheses=schema["ranked_hypotheses"], + missing_evidence=schema["missing_evidence"], ) @@ -207,6 +263,7 @@ def _parse_via_legacy( ], remediation_steps=rr.remediation_steps, validity_score=0.5, + confidence_band=classify_confidence_band(0.5), ) except Exception as err: logger.warning("Legacy parse_root_cause also failed: %s", err) diff --git a/app/cli/investigation/investigate.py b/app/cli/investigation/investigate.py index f398c8414..7dd771951 100644 --- a/app/cli/investigation/investigate.py +++ b/app/cli/investigation/investigate.py @@ -140,6 +140,7 @@ def run_investigation_cli( "root_cause": state["root_cause"], "is_noise": state.get("is_noise", False), "validity_score": state.get("validity_score", 0.0), + "confidence_band": state.get("confidence_band", ""), } if state.get("evidence_entries"): out["tool_calls"] = state["evidence_entries"] diff --git a/app/delivery/publish_findings/formatters/report.py b/app/delivery/publish_findings/formatters/report.py index 886a8cf86..5efdfb29a 100644 --- a/app/delivery/publish_findings/formatters/report.py +++ b/app/delivery/publish_findings/formatters/report.py @@ -466,6 +466,12 @@ def format_slack_message(ctx: ReportContext) -> str: if top_log: conclusion_block += f"`{top_log}`\n" + confidence_band = ctx.get("confidence_band", "") + validity_score_val = ctx.get("validity_score") + if confidence_band: + pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else "" + conclusion_block += f"*Confidence:* {confidence_band.upper()}{pct}\n" + validated_lines, non_validated_lines = _render_claim_lines(ctx) if 
validated_lines: # Use a larger markdown heading so that "Findings" stands out as a section. @@ -475,6 +481,21 @@ def format_slack_message(ctx: ReportContext) -> str: "\n*Non-Validated Claims (Inferred):*\n" + "\n".join(non_validated_lines) + "\n" ) + ranked_hypotheses = ctx.get("ranked_hypotheses") or [] + if ranked_hypotheses: + conclusion_block += ( + "\n*Alternative hypotheses:*\n" + + "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses) + + "\n" + ) + missing_evidence_list = ctx.get("missing_evidence") or [] + if missing_evidence_list: + conclusion_block += ( + "\n*Missing evidence:*\n" + + "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list) + + "\n" + ) + correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx) if correlation_signal_lines or correlation_driver_lines: conclusion_block += "\n## Upstream Correlation\n" @@ -556,12 +577,33 @@ def format_telegram_message(ctx: ReportContext) -> str: rc += "\n" + html.escape(top_log) + "" parts.append(rc) + confidence_band = ctx.get("confidence_band", "") + validity_score_val = ctx.get("validity_score") + if confidence_band: + pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else "" + parts.append(f"Confidence: {html.escape(confidence_band.upper())}{pct}") + validated_lines, non_validated_lines = _render_claim_lines_telegram(ctx) if validated_lines: parts.append("Findings\n" + "\n".join(validated_lines)) if non_validated_lines: parts.append("Non-Validated Claims (Inferred)\n" + "\n".join(non_validated_lines)) + ranked_hypotheses = ctx.get("ranked_hypotheses") or [] + if ranked_hypotheses: + hyp = "\n".join( + "• " + _to_telegram_html_body(_sanitize_for_slack(str(h))) for h in ranked_hypotheses + ) + parts.append("Alternative hypotheses\n" + hyp) + + missing_evidence_list = ctx.get("missing_evidence") or [] + if missing_evidence_list: + me = "\n".join( + "• " + _to_telegram_html_body(_sanitize_for_slack(str(e))) + for e 
in missing_evidence_list + ) + parts.append("Missing evidence\n" + me) + provenance_lines = _format_provenance_lines(ctx) if provenance_lines: prov = "\n".join( @@ -695,6 +737,13 @@ def _add(block: "dict[str, Any] | None") -> None: rc_text += f"\n`{top_log}`" _add(_mrkdwn_section(rc_text)) + # ── Confidence band ── + confidence_band = ctx.get("confidence_band", "") + validity_score_val = ctx.get("validity_score") + if confidence_band: + pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else "" + _add(_mrkdwn_section(f"*Confidence:* {confidence_band.upper()}{pct}")) + # ── Failed Pods ── datadog_site = ctx.get("datadog_site", "datadoghq.com") all_pods = get_failed_pods(ctx) @@ -727,6 +776,28 @@ def _add(block: "dict[str, Any] | None") -> None: if non_validated_lines: _add(_mrkdwn_section("*Inferred (not yet validated)*\n" + "\n".join(non_validated_lines))) + # ── Alternative Hypotheses ── + ranked_hypotheses = ctx.get("ranked_hypotheses") or [] + if ranked_hypotheses: + blocks.append({"type": "divider"}) + _add( + _mrkdwn_section( + "*Alternative hypotheses:*\n" + + "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses) + ) + ) + + # ── Missing Evidence ── + missing_evidence_list = ctx.get("missing_evidence") or [] + if missing_evidence_list: + blocks.append({"type": "divider"}) + _add( + _mrkdwn_section( + "*Missing evidence:*\n" + + "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list) + ) + ) + correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx) if correlation_signal_lines or correlation_driver_lines: blocks.append({"type": "divider"}) diff --git a/app/delivery/publish_findings/node.py b/app/delivery/publish_findings/node.py index 0d865239c..f3c39ba59 100644 --- a/app/delivery/publish_findings/node.py +++ b/app/delivery/publish_findings/node.py @@ -48,7 +48,14 @@ def generate_report(state: InvestigationState) -> dict: all_blocks = build_slack_blocks(ctx) + 
def _filter_confidence_sections(lines: list[str]) -> list[str]:
    """Return *lines* without the confidence content that is rendered separately.

    Three kinds of content are removed so that _render_rich_confidence_block
    (or the plain-text equivalent) does not print them a second time:

    * the inline "Confidence:" line,
    * the "Alternative hypotheses" section (label line and everything under it),
    * the "Missing evidence" section (label line and everything under it).

    A skipped section ends at the next heading matched by _HEADING_RE or at the
    next "*Label:*"-style line that is not one of the two labels above; that
    terminating line is then processed normally.
    """
    kept: list[str] = []
    skipping = False
    for raw in lines:
        text = raw.strip()
        is_block_label = text in _CONFIDENCE_BLOCK_LABELS

        if skipping:
            looks_like_label = text.startswith("*") and text.endswith(":*")
            ends_section = bool(_HEADING_RE.match(text)) or (
                looks_like_label and not is_block_label
            )
            if not ends_section:
                continue
            # The terminating line belongs to the next section: evaluate it below.
            skipping = False

        if is_block_label:
            skipping = True
        elif not _CONFIDENCE_LINE_RE.match(text):
            kept.append(raw)
    return kept
def _format_confidence_label(confidence_band: str, validity_score: float | None) -> str:
    """Build the text shown after "Confidence:" for terminal output.

    Returns the upper-cased band followed by the score as a rounded percentage,
    e.g. "LOW (29%)". Either part is omitted when unavailable; the result is an
    empty string when there is nothing to show.

    The percentage uses the ".0%" format so the terminal shows the same rounded
    value as the Slack/Telegram formatters (int(score * 100) would truncate
    0.29 to 28%). Non-numeric, NaN and infinite scores are ignored instead of
    raising.
    """
    band = confidence_band.upper() if confidence_band else ""
    has_score = isinstance(validity_score, (int, float)) and math.isfinite(validity_score)
    if not has_score:
        return band
    percent = f"({validity_score:.0%})"
    return f"{band} {percent}" if band else percent


def _render_rich_confidence_block(
    console: Console,
    confidence_band: str,
    validity_score: float | None,
    ranked_hypotheses: list[str],
    missing_evidence: list[str],
) -> None:
    """Print the confidence line and the hypothesis / missing-evidence lists.

    Args:
        console: Rich console to print to.
        confidence_band: Band name ("high", "medium", "low") or "" when unknown.
        validity_score: Score to show as a percentage, or None when unknown.
        ranked_hypotheses: Alternative hypotheses, one bullet each.
        missing_evidence: Missing evidence items, one bullet each.

    The confidence line is skipped when neither a band nor a usable score is
    available. The two lists are rendered regardless, because the caller has
    already removed those sections from the message body and they would
    otherwise be lost.
    """
    label = _format_confidence_label(confidence_band, validity_score)
    if label:
        console.print()
        band_upper = confidence_band.upper() if confidence_band else ""
        band_style = {"HIGH": "bold green", "MEDIUM": "bold yellow", "LOW": "bold red"}.get(
            band_upper, f"bold {TEXT}"
        )
        t = Text(" Confidence: ")
        t.append(label, style=band_style)
        console.print(t)

    if ranked_hypotheses:
        _render_rich_section_heading(console, "Alternative hypotheses")
        for h in ranked_hypotheses:
            _render_rich_bullet(console, h)

    if missing_evidence:
        _render_rich_section_heading(console, "Missing evidence")
        for item in missing_evidence:
            _render_rich_bullet(console, item)


def _render_plain_report(
    slack_message: str,
    root_cause_category: str | None = None,
    confidence_band: str = "",
    validity_score: float | None = None,
    ranked_hypotheses: list[str] | None = None,
    missing_evidence: list[str] | None = None,
) -> None:
    """Print the report as plain text, followed by the confidence summary.

    The confidence line and the hypothesis / missing-evidence sections are
    removed from *slack_message* first and printed from the structured
    arguments instead, so they appear exactly once.
    """
    _ = root_cause_category
    print()
    filtered = "\n".join(_filter_confidence_sections(slack_message.splitlines()))
    clean = _strip_slack_links(_strip_mrkdwn(filtered))
    print(clean)

    label = _format_confidence_label(confidence_band, validity_score)
    if label:
        # Leading newline separates the summary from the report body, matching
        # the sections printed below.
        print(f"\nConfidence: {label}")

    if ranked_hypotheses:
        print("\nAlternative hypotheses:")
        for h in ranked_hypotheses:
            print(f" - {h}")

    if missing_evidence:
        print("\nMissing evidence:")
        for item in missing_evidence:
            print(f" - {item}")
state.get("confidence_band", ""), + "ranked_hypotheses": state.get("ranked_hypotheses", []), + "missing_evidence": state.get("missing_evidence", []), "investigation_recommendations": state.get("investigation_recommendations", []), "remediation_steps": state.get("remediation_steps", []), "correlation": state.get("correlation", {}), diff --git a/app/pipeline/runners.py b/app/pipeline/runners.py index c4e6d18f6..7b8a384d1 100644 --- a/app/pipeline/runners.py +++ b/app/pipeline/runners.py @@ -276,6 +276,9 @@ def _run_pipeline() -> None: "root_cause": state_any.get("root_cause", ""), "root_cause_category": state_any.get("root_cause_category", ""), "validity_score": state_any.get("validity_score"), + "confidence_band": state_any.get("confidence_band", ""), + "ranked_hypotheses": state_any.get("ranked_hypotheses", []), + "missing_evidence": state_any.get("missing_evidence", []), "report": state_any.get("report", ""), "slack_message": state_any.get("slack_message", ""), "problem_md": state_any.get("problem_md", ""), diff --git a/app/remote/renderer.py b/app/remote/renderer.py index 1488f7625..f04e0ebf6 100644 --- a/app/remote/renderer.py +++ b/app/remote/renderer.py @@ -758,6 +758,9 @@ def _build_node_message(self, node: str) -> str | None: return f"Resolved: {names}" if node in {"diagnose", "diagnose_root_cause"}: pct = _validity_score_percent(self._final_state.get("validity_score")) + band = self._final_state.get("confidence_band", "") + if pct and band: + return f"validity:{band.upper()}({pct})" if pct: return f"validity:{pct}" return None @@ -781,7 +784,14 @@ def _print_report(self) -> None: from app.delivery.publish_findings.renderers.terminal import render_report as _render - _render(slack_message, root_cause_category=root_cause_category) + _render( + slack_message, + root_cause_category=root_cause_category, + confidence_band=self._final_state.get("confidence_band", ""), + validity_score=self._final_state.get("validity_score"), + 
ranked_hypotheses=self._final_state.get("ranked_hypotheses") or [], + missing_evidence=self._final_state.get("missing_evidence") or [], + ) def _canonical_node_name(name: str) -> str: diff --git a/app/state/agent_state.py b/app/state/agent_state.py index 87f69f5a7..1c17ac16b 100644 --- a/app/state/agent_state.py +++ b/app/state/agent_state.py @@ -77,6 +77,9 @@ class AgentState(TypedDict, total=False): validated_claims: list[dict[str, Any]] non_validated_claims: list[dict[str, Any]] validity_score: float + confidence_band: str + ranked_hypotheses: list[str] + missing_evidence: list[str] investigation_recommendations: list[str] remediation_steps: list[str] investigation_loop_count: int @@ -185,6 +188,9 @@ class AgentStateModel(StrictConfigModel): validated_claims: list[dict[str, Any]] = Field(default_factory=list) non_validated_claims: list[dict[str, Any]] = Field(default_factory=list) validity_score: float = 0.0 + confidence_band: str = "" + ranked_hypotheses: list[str] = Field(default_factory=list) + missing_evidence: list[str] = Field(default_factory=list) investigation_recommendations: list[str] = Field(default_factory=list) remediation_steps: list[str] = Field(default_factory=list) investigation_loop_count: int = 0 diff --git a/app/state/factory.py b/app/state/factory.py index 9b9cc2b11..d64fc48c4 100644 --- a/app/state/factory.py +++ b/app/state/factory.py @@ -36,6 +36,9 @@ "validated_claims": [], "non_validated_claims": [], "validity_score": 0.0, + "confidence_band": "", + "ranked_hypotheses": [], + "missing_evidence": [], "investigation_recommendations": [], "remediation_steps": [], "investigation_loop_count": 0, diff --git a/app/utils/ingest_delivery.py b/app/utils/ingest_delivery.py index 1638c6cec..73e1107a5 100644 --- a/app/utils/ingest_delivery.py +++ b/app/utils/ingest_delivery.py @@ -69,6 +69,7 @@ def build_ingest_payload(state: InvestigationState) -> dict[str, Any]: "root_cause": state.get("root_cause") or "", "confidence": state.get("validity_score") 
or 0, "validity_score": state.get("validity_score") or 0, + "confidence_band": state.get("confidence_band") or "", "planned_actions": planned_actions, "problem_md": state.get("problem_md") or "", "investigation_recommendations": state.get("investigation_recommendations") or [], diff --git a/app/utils/openclaw_delivery.py b/app/utils/openclaw_delivery.py index 1d9024e0d..4bca853f4 100644 --- a/app/utils/openclaw_delivery.py +++ b/app/utils/openclaw_delivery.py @@ -33,8 +33,10 @@ def _report_body(state: InvestigationState, report: str) -> str: sections.append(f"Remediation steps:\n{rendered_steps}") validity_score = state.get("validity_score") + confidence_band = state.get("confidence_band", "") if isinstance(validity_score, (int, float)): - sections.append(f"Confidence: {validity_score:.0%}") + band_str = f" [{confidence_band.upper()}]" if confidence_band else "" + sections.append(f"Confidence: {validity_score:.0%}{band_str}") return "\n\n".join(section for section in sections if section).strip() diff --git a/docs/investigation-overview.mdx b/docs/investigation-overview.mdx index 8324d8a57..414585784 100644 --- a/docs/investigation-overview.mdx +++ b/docs/investigation-overview.mdx @@ -56,6 +56,19 @@ Each run captures: - tool outputs collected from connected integrations - final diagnosis and recommended remediation steps +### Confidence and Evidence Sufficiency + +When a diagnosis is reached, OpenSRE evaluates the available evidence and assigns a **Confidence Band**: +- **HIGH** (Score ≥ 0.75, at least 1 validated finding): Strong evidence backing the diagnosis. +- **MEDIUM** (Score 0.40–0.74, at least 2 validated findings): Partial evidence with some gaps. +- **LOW** (Score < 0.40, or insufficient validated findings): Thin or conflicting evidence. + +When evidence is insufficient — a low score or fewer validated findings than the threshold — the system prefixes the root cause with *"Most likely:"* to signal uncertainty. 
+ +When the investigation identifies gaps, the report exposes: +- **Alternative hypotheses** — ranked by likelihood (shown when confidence is medium or low) +- **Missing evidence** — specific data sources that would confirm or refute the diagnosis + ## Chat For local binary usage, the primary workflow is file-based (`problem.md`, `report.md`, and optional JSON output). diff --git a/tests/agent/test_confidence_gating.py b/tests/agent/test_confidence_gating.py new file mode 100644 index 000000000..efe5b572a --- /dev/null +++ b/tests/agent/test_confidence_gating.py @@ -0,0 +1,189 @@ +"""Tests for confidence band classification and evidence sufficiency gating.""" + +from __future__ import annotations + +import pytest + +from app.agent.result import ( + InvestigationResult, + check_sufficiency, + classify_confidence_band, +) + +# --------------------------------------------------------------------------- +# classify_confidence_band +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "score, expected", + [ + (1.0, "high"), + (0.75, "high"), + (0.74, "medium"), + (0.40, "medium"), + (0.39, "low"), + (0.0, "low"), + ], +) +def test_classify_confidence_band_thresholds(score: float, expected: str) -> None: + assert classify_confidence_band(score) == expected + + +# --------------------------------------------------------------------------- +# InvestigationResult factory classmethods set correct band +# --------------------------------------------------------------------------- + + +def test_unknown_result_has_low_band() -> None: + result = InvestigationResult.unknown("test-alert") + assert result.confidence_band == "low" + assert result.validity_score == 0.0 + + +def test_noise_result_has_high_band() -> None: + result = InvestigationResult.noise() + assert result.confidence_band == "high" + assert result.validity_score == 1.0 + + +# --------------------------------------------------------------------------- +# 
check_sufficiency — three scenarios from the issue +# --------------------------------------------------------------------------- + + +def test_sufficient_evidence_passes_gate() -> None: + """High score + multiple validated claims → definitive, no prefix needed.""" + result = InvestigationResult( + root_cause="DB connection pool exhausted due to query buildup.", + root_cause_category="database", + validity_score=0.85, + confidence_band="high", + validated_claims=[ + {"claim": "Connection pool at 100%", "validation_status": "validated"}, + {"claim": "Query latency spiked at 14:32 UTC", "validation_status": "validated"}, + ], + ) + assert check_sufficiency(result) is True + + +def test_weak_evidence_fails_gate() -> None: + """Low score + no validated claims → gate fires, root cause should be prefixed.""" + result = InvestigationResult( + root_cause="Suspected memory leak in worker process.", + root_cause_category="performance", + validity_score=0.30, + confidence_band="low", + validated_claims=[], + ) + assert check_sufficiency(result) is False + # Simulate gate behaviour applied in investigation.py + if not result.root_cause.startswith("Most likely"): + result.root_cause = f"Most likely: {result.root_cause}" + assert result.root_cause.startswith("Most likely:") + + +def test_conflicting_evidence_medium_band_fails_gate() -> None: + """Medium score with only one validated claim — gate fires.""" + result = InvestigationResult( + root_cause="Possible network partition or config drift after deployment.", + root_cause_category="network", + validity_score=0.55, + confidence_band="medium", + validated_claims=[ + {"claim": "Packet loss observed on inter-AZ traffic", "validation_status": "validated"}, + ], + ranked_hypotheses=["Network partition between AZs", "Config drift after last deploy"], + missing_evidence=[ + "VPC flow logs for the affected subnets", + "Deployment history for the last 2 hours", + ], + ) + assert classify_confidence_band(result.validity_score) == 
"medium" + assert check_sufficiency(result) is False + assert len(result.ranked_hypotheses) == 2 + assert len(result.missing_evidence) == 2 + + +def test_medium_score_with_two_validated_claims_passes_gate() -> None: + """Medium score but 2+ validated claims is considered sufficient.""" + result = InvestigationResult( + root_cause="High CPU due to unindexed query on orders table.", + root_cause_category="database", + validity_score=0.60, + confidence_band="medium", + validated_claims=[ + {"claim": "CPU at 95% on RDS instance", "validation_status": "validated"}, + {"claim": "Slow query log shows full-table scan", "validation_status": "validated"}, + ], + ) + assert check_sufficiency(result) is True + + +def test_high_score_with_no_validated_claims_fails_gate() -> None: + """High validity_score but zero validated claims must not pass — LLM self-report alone insufficient.""" + result = InvestigationResult( + root_cause="DB connection pool exhausted.", + root_cause_category="database", + validity_score=0.85, + confidence_band="high", + validated_claims=[], + ) + assert check_sufficiency(result) is False + + +def test_healthy_category_always_passes_gate() -> None: + """Healthy findings are always definitive regardless of score.""" + result = InvestigationResult( + root_cause="All systems operating normally — no incident detected.", + root_cause_category="healthy", + validity_score=0.20, + confidence_band="low", + ) + assert check_sufficiency(result) is True + + +def test_unknown_category_passes_gate() -> None: + """Unknown results must not receive a 'Most likely:' prefix — already communicate uncertainty.""" + result = InvestigationResult.unknown("MyAlert") + assert check_sufficiency(result) is True + assert result.root_cause_category == "unknown" + + +def test_gate_downgrades_band_to_low_when_fired() -> None: + """Gate always sets band to low — regardless of starting band — to stay consistent with prefix.""" + for starting_band, validity_score, validated_claims in [ + 
def test_gate_strips_llm_most_likely_prefix_when_sufficient() -> None:
    """When gate passes, any pre-existing LLM-generated 'Most likely:' prefix must be stripped."""
    result = InvestigationResult(
        root_cause="Most likely: DB auth failure due to expired credentials.",
        root_cause_category="database",
        validity_score=0.80,
        confidence_band="high",
        validated_claims=[{"claim": "Auth error in logs", "validation_status": "validated"}],
    )
    assert check_sufficiency(result) is True
    # Simulate gate behaviour applied in investigation.py
    if result.root_cause.startswith("Most likely:"):
        result.root_cause = result.root_cause[len("Most likely:") :].lstrip()
    # Pin the exact remaining text, not just the absence of the prefix, so a
    # stripping bug that eats part of the statement is caught too.
    assert result.root_cause == "DB auth failure due to expired credentials."