diff --git a/app/agent/investigation.py b/app/agent/investigation.py
index cd1ddfc11..e40bf8745 100644
--- a/app/agent/investigation.py
+++ b/app/agent/investigation.py
@@ -10,7 +10,7 @@
from typing import Any
from app.agent.prompt import build_system_prompt, format_alert_context
-from app.agent.result import InvestigationResult, parse_diagnosis
+from app.agent.result import InvestigationResult, check_sufficiency, parse_diagnosis
from app.cli.support.output import debug_print, get_tracker
from app.constants.investigation import MAX_INVESTIGATION_LOOPS
from app.services.agent_llm_client import ToolCall, get_agent_llm
@@ -201,6 +201,9 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
"root_cause": error_msg,
"validity_score": 0.0,
"root_cause_category": "Configuration Error",
+ "confidence_band": "low",
+ "ranked_hypotheses": [],
+ "missing_evidence": [],
},
)
updates = {
@@ -211,6 +214,9 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
"non_validated_claims": [],
"remediation_steps": remediation_steps,
"validity_score": 0.0,
+ "confidence_band": "low",
+ "ranked_hypotheses": [],
+ "missing_evidence": [],
"investigation_recommendations": [],
"evidence": evidence,
"evidence_entries": [e.model_dump() for e in evidence_entries],
@@ -272,12 +278,23 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
result.evidence_entries = [e.model_dump() for e in evidence_entries]
result.agent_messages = messages
+ if check_sufficiency(result):
+ if result.root_cause.startswith("Most likely:"):
+ result.root_cause = result.root_cause[len("Most likely:") :].lstrip()
+ else:
+ if not result.root_cause.startswith("Most likely"):
+ result.root_cause = f"Most likely: {result.root_cause}"
+ # Override band set by classify_confidence_band in result.py: thin evidence means LOW
+ # regardless of LLM-reported score (a high score with zero claims is still insufficient).
+ result.confidence_band = "low"
+
_emit(
"agent_end",
{
"root_cause": result.root_cause,
"validity_score": result.validity_score,
"root_cause_category": result.root_cause_category,
+ "confidence_band": result.confidence_band,
},
)
@@ -605,6 +622,9 @@ def _result_to_state(result: InvestigationResult) -> dict[str, Any]:
"non_validated_claims": result.non_validated_claims,
"remediation_steps": result.remediation_steps,
"validity_score": result.validity_score,
+ "confidence_band": result.confidence_band,
+ "ranked_hypotheses": result.ranked_hypotheses,
+ "missing_evidence": result.missing_evidence,
"investigation_recommendations": result.investigation_recommendations,
"evidence": result.evidence,
"evidence_entries": result.evidence_entries,
diff --git a/app/agent/prompt.py b/app/agent/prompt.py
index a86dbdcfb..1e1a18d2f 100644
--- a/app/agent/prompt.py
+++ b/app/agent/prompt.py
@@ -36,6 +36,10 @@
- **Non-validated claims**: Hypotheses you could not confirm
- **Remediation steps**: Ordered, concrete actions to fix the issue
- **Validity score**: 0.0–1.0 reflecting your confidence based on evidence quality
+- **Confidence band**: `high` (strong evidence from multiple sources), `medium` (partial evidence, some gaps), or `low` (thin or conflicting evidence)
+- **Ranked hypotheses**: If confidence is medium or low, list alternative explanations in order of likelihood (most likely first)
+- **Missing evidence**: List specific data sources or queries that would confirm or refute the diagnosis but were unavailable
+- If evidence is thin or conflicting, begin your root cause statement with "Most likely: " to signal uncertainty
"""
_ALERT_CONTEXT_TEMPLATE = """## Alert
diff --git a/app/agent/result.py b/app/agent/result.py
index 6aa6239b4..c22f3154d 100644
--- a/app/agent/result.py
+++ b/app/agent/result.py
@@ -4,7 +4,7 @@
import logging
from dataclasses import dataclass, field
-from typing import Any, TypedDict, cast
+from typing import Any, Literal, TypedDict, cast
from pydantic import BaseModel, Field
@@ -17,6 +17,17 @@
logger = logging.getLogger(__name__)
+class _ValidatedClaimSchema(BaseModel):
+ """Structured-output schema for one validated claim and the evidence keys backing it."""
+
+ # Text of the claim itself.
+ claim: str = Field(description="The validated claim statement")
+ # Evidence keys that support this particular claim; defaults to an empty list.
+ evidence_sources: list[str] = Field(
+ default_factory=list,
+ description=(
+ "Subset of the collected evidence keys that directly support this specific claim. "
+ "Only include keys that actually informed this claim."
+ ),
+ )
+
+
@dataclass
class InvestigationResult:
root_cause: str
@@ -26,6 +37,9 @@ class InvestigationResult:
non_validated_claims: list[dict] = field(default_factory=list)
remediation_steps: list[str] = field(default_factory=list)
validity_score: float = 0.0
+ confidence_band: str = ""
+ ranked_hypotheses: list[str] = field(default_factory=list)
+ missing_evidence: list[str] = field(default_factory=list)
evidence: dict[str, Any] = field(default_factory=dict)
evidence_entries: list[dict] = field(default_factory=list)
agent_messages: list[dict] = field(default_factory=list)
@@ -37,6 +51,7 @@ def unknown(cls, alert_name: str = "Unknown alert") -> InvestigationResult:
root_cause=f"{alert_name}: Unable to determine root cause — insufficient evidence.",
root_cause_category="unknown",
validity_score=0.0,
+ confidence_band="low",
non_validated_claims=[
{
"claim": "Insufficient evidence available",
@@ -51,9 +66,26 @@ def noise(cls) -> InvestigationResult:
root_cause="Message classified as noise — no investigation needed.",
root_cause_category="healthy",
validity_score=1.0,
+ confidence_band="high",
)
+def classify_confidence_band(score: float) -> Literal["high", "medium", "low"]:
+    """Map a validity score onto a coarse confidence band.
+
+    Scores of 0.75 and above are ``"high"``, scores from 0.40 up to (but not
+    including) 0.75 are ``"medium"``, and every other value is ``"low"``.
+    """
+    band: Literal["high", "medium", "low"] = "low"
+    if score >= 0.40:
+        band = "medium"
+    if score >= 0.75:
+        band = "high"
+    return band
+
+
+def check_sufficiency(result: InvestigationResult) -> bool:
+    """Return True when the diagnosis is backed well enough to be stated definitively.
+
+    Results categorized as ``healthy`` or ``unknown`` always pass. Otherwise a
+    score of at least 0.75 needs one validated claim, and a score of at least
+    0.40 needs two; anything below 0.40 fails.
+    """
+    if result.root_cause_category in {"healthy", "unknown"}:
+        return True
+    score = result.validity_score
+    # A strong score needs a single validated claim; a middling one needs two.
+    required_claims = 1 if score >= 0.75 else 2
+    return score >= 0.40 and len(result.validated_claims) >= required_claims
+
+
def parse_diagnosis(
messages: list[dict[str, Any]],
evidence: dict[str, Any],
@@ -121,8 +153,9 @@ class DiagnosisSchema(BaseModel):
causal_chain: list[str] = Field(
default_factory=list, description="Ordered steps leading to the failure"
)
- validated_claims: list[str] = Field(
- default_factory=list, description="Claims supported by tool evidence"
+ validated_claims: list[_ValidatedClaimSchema] = Field(
+ default_factory=list,
+ description="Claims supported by tool evidence, each with their specific supporting evidence keys",
)
non_validated_claims: list[str] = Field(
default_factory=list, description="Claims not yet confirmed by evidence"
@@ -133,6 +166,14 @@ class DiagnosisSchema(BaseModel):
validity_score: float = Field(
default=0.0, description="0.0–1.0 confidence in the diagnosis"
)
+ ranked_hypotheses: list[str] = Field(
+ default_factory=list,
+ description="Alternative hypotheses ranked by likelihood (most to least likely)",
+ )
+ missing_evidence: list[str] = Field(
+ default_factory=list,
+ description="Evidence that would confirm or refute the diagnosis but was unavailable",
+ )
return DiagnosisSchema
@@ -157,10 +198,12 @@ class _DiagnosisPayload(TypedDict):
root_cause: str
root_cause_category: str
causal_chain: list[str]
- validated_claims: list[str]
+ validated_claims: list[dict]
non_validated_claims: list[str]
remediation_steps: list[str]
validity_score: float
+ ranked_hypotheses: list[str]
+ missing_evidence: list[str]
llm = get_llm_for_reasoning()
schema_model = _build_diagnosis_schema(_taxonomy_categories_for_alert_source(alert_source))
@@ -181,10 +224,23 @@ def _to_claim_dicts(claims: list[str], status: str) -> list[dict]:
root_cause=schema["root_cause"],
root_cause_category=schema["root_cause_category"],
causal_chain=schema["causal_chain"],
- validated_claims=_to_claim_dicts(schema["validated_claims"], "validated"),
+ validated_claims=[
+ {
+ "claim": c["claim"],
+ "validation_status": "validated",
+ **(
+ {"evidence_sources": c["evidence_sources"]} if c.get("evidence_sources") else {}
+ ),
+ }
+ for c in schema["validated_claims"]
+ if c.get("claim")
+ ],
non_validated_claims=_to_claim_dicts(schema["non_validated_claims"], "not_validated"),
remediation_steps=schema["remediation_steps"],
validity_score=schema["validity_score"],
+ confidence_band=classify_confidence_band(schema["validity_score"]),
+ ranked_hypotheses=schema["ranked_hypotheses"],
+ missing_evidence=schema["missing_evidence"],
)
@@ -207,6 +263,7 @@ def _parse_via_legacy(
],
remediation_steps=rr.remediation_steps,
validity_score=0.5,
+ confidence_band=classify_confidence_band(0.5),
)
except Exception as err:
logger.warning("Legacy parse_root_cause also failed: %s", err)
diff --git a/app/cli/investigation/investigate.py b/app/cli/investigation/investigate.py
index f398c8414..7dd771951 100644
--- a/app/cli/investigation/investigate.py
+++ b/app/cli/investigation/investigate.py
@@ -140,6 +140,7 @@ def run_investigation_cli(
"root_cause": state["root_cause"],
"is_noise": state.get("is_noise", False),
"validity_score": state.get("validity_score", 0.0),
+ "confidence_band": state.get("confidence_band", ""),
}
if state.get("evidence_entries"):
out["tool_calls"] = state["evidence_entries"]
diff --git a/app/delivery/publish_findings/formatters/report.py b/app/delivery/publish_findings/formatters/report.py
index 886a8cf86..5efdfb29a 100644
--- a/app/delivery/publish_findings/formatters/report.py
+++ b/app/delivery/publish_findings/formatters/report.py
@@ -466,6 +466,12 @@ def format_slack_message(ctx: ReportContext) -> str:
if top_log:
conclusion_block += f"`{top_log}`\n"
+ confidence_band = ctx.get("confidence_band", "")
+ validity_score_val = ctx.get("validity_score")
+ if confidence_band:
+ pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
+ conclusion_block += f"*Confidence:* {confidence_band.upper()}{pct}\n"
+
validated_lines, non_validated_lines = _render_claim_lines(ctx)
if validated_lines:
# Use a larger markdown heading so that "Findings" stands out as a section.
@@ -475,6 +481,21 @@ def format_slack_message(ctx: ReportContext) -> str:
"\n*Non-Validated Claims (Inferred):*\n" + "\n".join(non_validated_lines) + "\n"
)
+ ranked_hypotheses = ctx.get("ranked_hypotheses") or []
+ if ranked_hypotheses:
+ conclusion_block += (
+ "\n*Alternative hypotheses:*\n"
+ + "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
+ + "\n"
+ )
+ missing_evidence_list = ctx.get("missing_evidence") or []
+ if missing_evidence_list:
+ conclusion_block += (
+ "\n*Missing evidence:*\n"
+ + "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
+ + "\n"
+ )
+
correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
if correlation_signal_lines or correlation_driver_lines:
conclusion_block += "\n## Upstream Correlation\n"
@@ -556,12 +577,33 @@ def format_telegram_message(ctx: ReportContext) -> str:
rc += "\n" + html.escape(top_log) + ""
parts.append(rc)
+ confidence_band = ctx.get("confidence_band", "")
+ validity_score_val = ctx.get("validity_score")
+ if confidence_band:
+ pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
+ parts.append(f"Confidence: {html.escape(confidence_band.upper())}{pct}")
+
validated_lines, non_validated_lines = _render_claim_lines_telegram(ctx)
if validated_lines:
parts.append("Findings\n" + "\n".join(validated_lines))
if non_validated_lines:
parts.append("Non-Validated Claims (Inferred)\n" + "\n".join(non_validated_lines))
+ ranked_hypotheses = ctx.get("ranked_hypotheses") or []
+ if ranked_hypotheses:
+ hyp = "\n".join(
+ "• " + _to_telegram_html_body(_sanitize_for_slack(str(h))) for h in ranked_hypotheses
+ )
+ parts.append("Alternative hypotheses\n" + hyp)
+
+ missing_evidence_list = ctx.get("missing_evidence") or []
+ if missing_evidence_list:
+ me = "\n".join(
+ "• " + _to_telegram_html_body(_sanitize_for_slack(str(e)))
+ for e in missing_evidence_list
+ )
+ parts.append("Missing evidence\n" + me)
+
provenance_lines = _format_provenance_lines(ctx)
if provenance_lines:
prov = "\n".join(
@@ -695,6 +737,13 @@ def _add(block: "dict[str, Any] | None") -> None:
rc_text += f"\n`{top_log}`"
_add(_mrkdwn_section(rc_text))
+ # ── Confidence band ──
+ confidence_band = ctx.get("confidence_band", "")
+ validity_score_val = ctx.get("validity_score")
+ if confidence_band:
+ pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
+ _add(_mrkdwn_section(f"*Confidence:* {confidence_band.upper()}{pct}"))
+
# ── Failed Pods ──
datadog_site = ctx.get("datadog_site", "datadoghq.com")
all_pods = get_failed_pods(ctx)
@@ -727,6 +776,28 @@ def _add(block: "dict[str, Any] | None") -> None:
if non_validated_lines:
_add(_mrkdwn_section("*Inferred (not yet validated)*\n" + "\n".join(non_validated_lines)))
+ # ── Alternative Hypotheses ──
+ ranked_hypotheses = ctx.get("ranked_hypotheses") or []
+ if ranked_hypotheses:
+ blocks.append({"type": "divider"})
+ _add(
+ _mrkdwn_section(
+ "*Alternative hypotheses:*\n"
+ + "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
+ )
+ )
+
+ # ── Missing Evidence ──
+ missing_evidence_list = ctx.get("missing_evidence") or []
+ if missing_evidence_list:
+ blocks.append({"type": "divider"})
+ _add(
+ _mrkdwn_section(
+ "*Missing evidence:*\n"
+ + "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
+ )
+ )
+
correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
if correlation_signal_lines or correlation_driver_lines:
blocks.append({"type": "divider"})
diff --git a/app/delivery/publish_findings/node.py b/app/delivery/publish_findings/node.py
index 0d865239c..f3c39ba59 100644
--- a/app/delivery/publish_findings/node.py
+++ b/app/delivery/publish_findings/node.py
@@ -48,7 +48,14 @@ def generate_report(state: InvestigationState) -> dict:
all_blocks = build_slack_blocks(ctx) + build_action_blocks(investigation_url, investigation_id)
all_blocks = masking_ctx.unmask_value(all_blocks)
- render_report(slack_message, root_cause_category=state.get("root_cause_category"))
+ render_report(
+ slack_message,
+ root_cause_category=state.get("root_cause_category"),
+ confidence_band=state.get("confidence_band", ""),
+ validity_score=state.get("validity_score"),
+ ranked_hypotheses=state.get("ranked_hypotheses", []),
+ missing_evidence=state.get("missing_evidence", []),
+ )
open_in_editor(slack_message)
slack_ctx = state.get("slack_context", {})
diff --git a/app/delivery/publish_findings/renderers/terminal.py b/app/delivery/publish_findings/renderers/terminal.py
index 4b455567e..c85f9e43c 100644
--- a/app/delivery/publish_findings/renderers/terminal.py
+++ b/app/delivery/publish_findings/renderers/terminal.py
@@ -1,5 +1,6 @@
"""Terminal rendering for RCA reports — Claude-style output."""
+import math
import re
from rich.console import Console
@@ -70,6 +71,41 @@ def _strip_mrkdwn(text: str) -> str:
_BOLD_RE = re.compile(r"\*\*?([^*]+)\*\*?")
+_CONFIDENCE_LINE_RE = re.compile(r"^\*?Confidence:\*?\s+\w+", re.IGNORECASE)
+_CONFIDENCE_BLOCK_LABELS = frozenset({"*Alternative hypotheses:*", "*Missing evidence:*"})
+
+
+def _filter_confidence_sections(lines: list[str]) -> list[str]:
+ """Drop lines that the terminal renderers print separately from the message body.
+
+ Removes the inline Confidence: line and the Alternative hypotheses /
+ Missing evidence sections (header + bullets) so they are not printed twice.
+ Exits a skip-section on any line matching _HEADING_RE or any other *Label:* section header.
+ """
+ result: list[str] = []
+ # True while we are inside a section whose header is in _CONFIDENCE_BLOCK_LABELS.
+ in_skip = False
+ for line in lines:
+ stripped = line.strip()
+ if in_skip:
+ # Only a heading or a different *Label:* header ends the skipped section.
+ # NOTE(review): every other line (blank lines and any plain text after the
+ # bullets included) is dropped while skipping; confirm the message always
+ # starts a new heading/label after these sections.
+ is_heading = bool(_HEADING_RE.match(stripped))
+ is_other_label = (
+ stripped.startswith("*")
+ and stripped.endswith(":*")
+ and stripped not in _CONFIDENCE_BLOCK_LABELS
+ )
+ if is_heading or is_other_label:
+ in_skip = False # fall through
+ else:
+ continue
+ # Header of a section that is rendered separately: drop it and skip its body.
+ if stripped in _CONFIDENCE_BLOCK_LABELS:
+ in_skip = True
+ continue
+ # Single inline "Confidence: BAND" line: drop just this line.
+ if _CONFIDENCE_LINE_RE.match(stripped):
+ continue
+ result.append(line)
+ return result
+
+
def _render_rich_section_heading(console: Console, title: str) -> None:
console.print()
t = Text()
@@ -111,7 +147,14 @@ def _render_rich_evidence_item(console: Console, line: str) -> None:
# ─────────────────────────────────────────────────────────────────────────────
-def render_report(slack_message: str, root_cause_category: str | None = None) -> None:
+def render_report(
+ slack_message: str,
+ root_cause_category: str | None = None,
+ confidence_band: str = "",
+ validity_score: float | None = None,
+ ranked_hypotheses: list[str] | None = None,
+ missing_evidence: list[str] | None = None,
+) -> None:
"""Render the final RCA report to terminal."""
from app.cli.support.output import stop_display
@@ -128,17 +171,38 @@ def render_report(slack_message: str, root_cause_category: str | None = None) ->
return
if fmt == "rich":
- _render_rich_report(slack_message, root_cause_category=root_cause_category)
+ _render_rich_report(
+ slack_message,
+ root_cause_category=root_cause_category,
+ confidence_band=confidence_band,
+ validity_score=validity_score,
+ ranked_hypotheses=ranked_hypotheses or [],
+ missing_evidence=missing_evidence or [],
+ )
else:
- _render_plain_report(slack_message, root_cause_category=root_cause_category)
-
-
-def _render_rich_report(slack_message: str, root_cause_category: str | None = None) -> None:
+ _render_plain_report(
+ slack_message,
+ root_cause_category=root_cause_category,
+ confidence_band=confidence_band,
+ validity_score=validity_score,
+ ranked_hypotheses=ranked_hypotheses or [],
+ missing_evidence=missing_evidence or [],
+ )
+
+
+def _render_rich_report(
+ slack_message: str,
+ root_cause_category: str | None = None,
+ confidence_band: str = "",
+ validity_score: float | None = None,
+ ranked_hypotheses: list[str] | None = None,
+ missing_evidence: list[str] | None = None,
+) -> None:
_ = root_cause_category
console = Console()
console.print()
- lines = slack_message.splitlines()
+ lines = _filter_confidence_sections(slack_message.splitlines())
in_evidence = False
for line in lines:
@@ -197,11 +261,81 @@ def _render_rich_report(slack_message: str, root_cause_category: str | None = No
t.append_text(_rich_line_with_links(stripped))
console.print(t)
+ _render_rich_confidence_block(
+ console,
+ confidence_band=confidence_band,
+ validity_score=validity_score,
+ ranked_hypotheses=ranked_hypotheses or [],
+ missing_evidence=missing_evidence or [],
+ )
console.print()
-def _render_plain_report(slack_message: str, root_cause_category: str | None = None) -> None:
+def _render_rich_confidence_block(
+    console: Console,
+    confidence_band: str,
+    validity_score: float | None,
+    ranked_hypotheses: list[str],
+    missing_evidence: list[str],
+) -> None:
+    """Print the confidence line and the optional hypothesis / missing-evidence sections.
+
+    Nothing is printed when there is neither a band nor a score.
+
+    Args:
+        console: Rich console to print to.
+        confidence_band: Band label; upper-cased for display, may be empty.
+        validity_score: Score shown as a percentage, or None to omit it.
+        ranked_hypotheses: Items listed under "Alternative hypotheses".
+        missing_evidence: Items listed under "Missing evidence".
+    """
+    if not confidence_band and validity_score is None:
+        return
+
+    console.print()
+    band_upper = confidence_band.upper() if confidence_band else ""
+    band_style = {"HIGH": "bold green", "MEDIUM": "bold yellow", "LOW": "bold red"}.get(
+        band_upper, f"bold {TEXT}"
+    )
+    # Use the rounding ``:.0%`` format instead of ``int(score * 100)``: truncation
+    # turns 0.29 (28.999999999999996 after multiplying) into 28%, which disagrees
+    # with the ``:.0%`` output of the report formatters. ``isfinite`` rejects NaN
+    # and also infinities, which made ``int()`` raise OverflowError.
+    score_str = (
+        f" ({validity_score:.0%})"
+        if validity_score is not None and math.isfinite(validity_score)
+        else ""
+    )
+
+    t = Text("  Confidence: ")
+    # Without a band, show the bare percentage with its leading space removed.
+    t.append(f"{band_upper}{score_str}" if band_upper else score_str.strip(), style=band_style)
+    console.print(t)
+
+    if ranked_hypotheses:
+        _render_rich_section_heading(console, "Alternative hypotheses")
+        for h in ranked_hypotheses:
+            _render_rich_bullet(console, h)
+
+    if missing_evidence:
+        _render_rich_section_heading(console, "Missing evidence")
+        for item in missing_evidence:
+            _render_rich_bullet(console, item)
+
+
+def _render_plain_report(
+    slack_message: str,
+    root_cause_category: str | None = None,
+    confidence_band: str = "",
+    validity_score: float | None = None,
+    ranked_hypotheses: list[str] | None = None,
+    missing_evidence: list[str] | None = None,
+) -> None:
+    """Print the report as plain text, followed by a confidence summary.
+
+    The confidence line and the hypothesis / missing-evidence sections are
+    filtered out of ``slack_message`` first and then printed from the explicit
+    arguments, so they appear exactly once.
+
+    Args:
+        slack_message: Slack-formatted report text.
+        root_cause_category: Accepted for signature parity; unused here.
+        confidence_band: Band label; upper-cased for display, may be empty.
+        validity_score: Score shown as a percentage, or None to omit it.
+        ranked_hypotheses: Items listed under "Alternative hypotheses:".
+        missing_evidence: Items listed under "Missing evidence:".
+    """
     _ = root_cause_category
     print()
-    clean = _strip_slack_links(_strip_mrkdwn(slack_message))
+    filtered = "\n".join(_filter_confidence_sections(slack_message.splitlines()))
+    clean = _strip_slack_links(_strip_mrkdwn(filtered))
     print(clean)
+
+    if confidence_band or validity_score is not None:
+        band_str = confidence_band.upper() if confidence_band else ""
+        # ``:.0%`` rounds like the report formatters do (``int(score * 100)``
+        # truncated 0.29 to 28%); ``isfinite`` also keeps infinities away from formatting.
+        score_str = (
+            f"({validity_score:.0%})"
+            if validity_score is not None and math.isfinite(validity_score)
+            else ""
+        )
+        # Join only the parts that exist so a missing band does not leave a double space.
+        label = " ".join(part for part in (band_str, score_str) if part)
+        # Emit the separator as its own print: calling ``.strip()`` on a string that
+        # starts with "\n" silently removed the intended blank line.
+        print()
+        print(f"Confidence: {label}".rstrip())
+
+    if ranked_hypotheses:
+        print("\nAlternative hypotheses:")
+        for h in ranked_hypotheses:
+            print(f"  - {h}")
+
+    if missing_evidence:
+        print("\nMissing evidence:")
+        for item in missing_evidence:
+            print(f"  - {item}")
diff --git a/app/delivery/publish_findings/report_context.py b/app/delivery/publish_findings/report_context.py
index da5c16335..dbae4fdf4 100644
--- a/app/delivery/publish_findings/report_context.py
+++ b/app/delivery/publish_findings/report_context.py
@@ -46,6 +46,9 @@ class ReportContext(TypedDict, total=False):
validated_claims: list[dict]
non_validated_claims: list[dict]
validity_score: float
+ confidence_band: str
+ ranked_hypotheses: list[str]
+ missing_evidence: list[str]
investigation_recommendations: list[str]
remediation_steps: list[str]
correlation: dict[str, Any]
@@ -918,6 +921,9 @@ def build_report_context(state: InvestigationState) -> ReportContext:
"validated_claims": validated_claims,
"non_validated_claims": non_validated_claims,
"validity_score": state.get("validity_score", 0.0),
+ "confidence_band": state.get("confidence_band", ""),
+ "ranked_hypotheses": state.get("ranked_hypotheses", []),
+ "missing_evidence": state.get("missing_evidence", []),
"investigation_recommendations": state.get("investigation_recommendations", []),
"remediation_steps": state.get("remediation_steps", []),
"correlation": state.get("correlation", {}),
diff --git a/app/pipeline/runners.py b/app/pipeline/runners.py
index c4e6d18f6..7b8a384d1 100644
--- a/app/pipeline/runners.py
+++ b/app/pipeline/runners.py
@@ -276,6 +276,9 @@ def _run_pipeline() -> None:
"root_cause": state_any.get("root_cause", ""),
"root_cause_category": state_any.get("root_cause_category", ""),
"validity_score": state_any.get("validity_score"),
+ "confidence_band": state_any.get("confidence_band", ""),
+ "ranked_hypotheses": state_any.get("ranked_hypotheses", []),
+ "missing_evidence": state_any.get("missing_evidence", []),
"report": state_any.get("report", ""),
"slack_message": state_any.get("slack_message", ""),
"problem_md": state_any.get("problem_md", ""),
diff --git a/app/remote/renderer.py b/app/remote/renderer.py
index 1488f7625..f04e0ebf6 100644
--- a/app/remote/renderer.py
+++ b/app/remote/renderer.py
@@ -758,6 +758,9 @@ def _build_node_message(self, node: str) -> str | None:
return f"Resolved: {names}"
if node in {"diagnose", "diagnose_root_cause"}:
pct = _validity_score_percent(self._final_state.get("validity_score"))
+ band = self._final_state.get("confidence_band", "")
+ if pct and band:
+ return f"validity:{band.upper()}({pct})"
if pct:
return f"validity:{pct}"
return None
@@ -781,7 +784,14 @@ def _print_report(self) -> None:
from app.delivery.publish_findings.renderers.terminal import render_report as _render
- _render(slack_message, root_cause_category=root_cause_category)
+ _render(
+ slack_message,
+ root_cause_category=root_cause_category,
+ confidence_band=self._final_state.get("confidence_band", ""),
+ validity_score=self._final_state.get("validity_score"),
+ ranked_hypotheses=self._final_state.get("ranked_hypotheses") or [],
+ missing_evidence=self._final_state.get("missing_evidence") or [],
+ )
def _canonical_node_name(name: str) -> str:
diff --git a/app/state/agent_state.py b/app/state/agent_state.py
index 87f69f5a7..1c17ac16b 100644
--- a/app/state/agent_state.py
+++ b/app/state/agent_state.py
@@ -77,6 +77,9 @@ class AgentState(TypedDict, total=False):
validated_claims: list[dict[str, Any]]
non_validated_claims: list[dict[str, Any]]
validity_score: float
+ confidence_band: str
+ ranked_hypotheses: list[str]
+ missing_evidence: list[str]
investigation_recommendations: list[str]
remediation_steps: list[str]
investigation_loop_count: int
@@ -185,6 +188,9 @@ class AgentStateModel(StrictConfigModel):
validated_claims: list[dict[str, Any]] = Field(default_factory=list)
non_validated_claims: list[dict[str, Any]] = Field(default_factory=list)
validity_score: float = 0.0
+ confidence_band: str = ""
+ ranked_hypotheses: list[str] = Field(default_factory=list)
+ missing_evidence: list[str] = Field(default_factory=list)
investigation_recommendations: list[str] = Field(default_factory=list)
remediation_steps: list[str] = Field(default_factory=list)
investigation_loop_count: int = 0
diff --git a/app/state/factory.py b/app/state/factory.py
index 9b9cc2b11..d64fc48c4 100644
--- a/app/state/factory.py
+++ b/app/state/factory.py
@@ -36,6 +36,9 @@
"validated_claims": [],
"non_validated_claims": [],
"validity_score": 0.0,
+ "confidence_band": "",
+ "ranked_hypotheses": [],
+ "missing_evidence": [],
"investigation_recommendations": [],
"remediation_steps": [],
"investigation_loop_count": 0,
diff --git a/app/utils/ingest_delivery.py b/app/utils/ingest_delivery.py
index 1638c6cec..73e1107a5 100644
--- a/app/utils/ingest_delivery.py
+++ b/app/utils/ingest_delivery.py
@@ -69,6 +69,7 @@ def build_ingest_payload(state: InvestigationState) -> dict[str, Any]:
"root_cause": state.get("root_cause") or "",
"confidence": state.get("validity_score") or 0,
"validity_score": state.get("validity_score") or 0,
+ "confidence_band": state.get("confidence_band") or "",
"planned_actions": planned_actions,
"problem_md": state.get("problem_md") or "",
"investigation_recommendations": state.get("investigation_recommendations") or [],
diff --git a/app/utils/openclaw_delivery.py b/app/utils/openclaw_delivery.py
index 1d9024e0d..4bca853f4 100644
--- a/app/utils/openclaw_delivery.py
+++ b/app/utils/openclaw_delivery.py
@@ -33,8 +33,10 @@ def _report_body(state: InvestigationState, report: str) -> str:
sections.append(f"Remediation steps:\n{rendered_steps}")
validity_score = state.get("validity_score")
+ confidence_band = state.get("confidence_band", "")
if isinstance(validity_score, (int, float)):
- sections.append(f"Confidence: {validity_score:.0%}")
+ band_str = f" [{confidence_band.upper()}]" if confidence_band else ""
+ sections.append(f"Confidence: {validity_score:.0%}{band_str}")
return "\n\n".join(section for section in sections if section).strip()
diff --git a/docs/investigation-overview.mdx b/docs/investigation-overview.mdx
index 8324d8a57..414585784 100644
--- a/docs/investigation-overview.mdx
+++ b/docs/investigation-overview.mdx
@@ -56,6 +56,19 @@ Each run captures:
- tool outputs collected from connected integrations
- final diagnosis and recommended remediation steps
+### Confidence and Evidence Sufficiency
+
+When a diagnosis is reached, OpenSRE evaluates the available evidence and assigns a **Confidence Band**:
+- **HIGH** (score ≥ 0.75 with at least 1 validated finding): Strong evidence backing the diagnosis.
+- **MEDIUM** (score from 0.40 up to, but not including, 0.75, with at least 2 validated findings): Partial evidence with some gaps.
+- **LOW** (score < 0.40, or fewer validated findings than the band requires): Thin or conflicting evidence.
+
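+When evidence is insufficient — a low score or fewer validated findings than the threshold — the system reports the band as **LOW** and prefixes the root cause with *"Most likely:"* to signal uncertainty. Diagnoses categorized as `healthy` or `unknown` are exempt from this check.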
+
+When the investigation identifies gaps, the report exposes:
+- **Alternative hypotheses** — ranked by likelihood (shown when confidence is medium or low)
+- **Missing evidence** — specific data sources that would confirm or refute the diagnosis
+
## Chat
For local binary usage, the primary workflow is file-based (`problem.md`, `report.md`, and optional JSON output).
diff --git a/tests/agent/test_confidence_gating.py b/tests/agent/test_confidence_gating.py
new file mode 100644
index 000000000..efe5b572a
--- /dev/null
+++ b/tests/agent/test_confidence_gating.py
@@ -0,0 +1,189 @@
+"""Tests for confidence band classification and evidence sufficiency gating."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agent.result import (
+ InvestigationResult,
+ check_sufficiency,
+ classify_confidence_band,
+)
+
+# ---------------------------------------------------------------------------
+# classify_confidence_band
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "score, expected",
+    [
+        (1.0, "high"),
+        (0.75, "high"),
+        (0.74, "medium"),
+        (0.40, "medium"),
+        (0.39, "low"),
+        (0.0, "low"),
+    ],
+)
+def test_classify_confidence_band_thresholds(score: float, expected: str) -> None:
+    """Scores on either side of each threshold map to the expected band."""
+    band = classify_confidence_band(score)
+    assert band == expected
+
+
+# ---------------------------------------------------------------------------
+# InvestigationResult factory classmethods set correct band
+# ---------------------------------------------------------------------------
+
+
+def test_unknown_result_has_low_band() -> None:
+    """The ``unknown`` factory reports the low band with a zero score."""
+    unknown = InvestigationResult.unknown("test-alert")
+    assert unknown.confidence_band == "low"
+    assert unknown.validity_score == 0.0
+
+
+def test_noise_result_has_high_band() -> None:
+    """The ``noise`` factory reports the high band with a full score."""
+    noise = InvestigationResult.noise()
+    assert noise.confidence_band == "high"
+    assert noise.validity_score == 1.0
+
+
+# ---------------------------------------------------------------------------
+# check_sufficiency — three scenarios from the issue
+# ---------------------------------------------------------------------------
+
+
+def test_sufficient_evidence_passes_gate() -> None:
+    """A high score backed by two validated claims clears the gate."""
+    claims = [
+        {"claim": "Connection pool at 100%", "validation_status": "validated"},
+        {"claim": "Query latency spiked at 14:32 UTC", "validation_status": "validated"},
+    ]
+    well_supported = InvestigationResult(
+        root_cause="DB connection pool exhausted due to query buildup.",
+        root_cause_category="database",
+        validity_score=0.85,
+        confidence_band="high",
+        validated_claims=claims,
+    )
+    assert check_sufficiency(well_supported) is True
+
+
+def test_weak_evidence_fails_gate() -> None:
+    """A low score with no validated claims trips the gate and earns the prefix."""
+    weak = InvestigationResult(
+        root_cause="Suspected memory leak in worker process.",
+        root_cause_category="performance",
+        validity_score=0.30,
+        confidence_band="low",
+        validated_claims=[],
+    )
+    assert check_sufficiency(weak) is False
+    # Mirror the prefixing step that investigation.py applies when the gate fires.
+    already_prefixed = weak.root_cause.startswith("Most likely")
+    if not already_prefixed:
+        weak.root_cause = f"Most likely: {weak.root_cause}"
+    assert weak.root_cause.startswith("Most likely:")
+
+
+def test_conflicting_evidence_medium_band_fails_gate() -> None:
+    """A medium score backed by a single validated claim is not sufficient."""
+    hypotheses = ["Network partition between AZs", "Config drift after last deploy"]
+    gaps = [
+        "VPC flow logs for the affected subnets",
+        "Deployment history for the last 2 hours",
+    ]
+    contested = InvestigationResult(
+        root_cause="Possible network partition or config drift after deployment.",
+        root_cause_category="network",
+        validity_score=0.55,
+        confidence_band="medium",
+        validated_claims=[
+            {"claim": "Packet loss observed on inter-AZ traffic", "validation_status": "validated"},
+        ],
+        ranked_hypotheses=hypotheses,
+        missing_evidence=gaps,
+    )
+    assert classify_confidence_band(contested.validity_score) == "medium"
+    assert check_sufficiency(contested) is False
+    assert len(contested.ranked_hypotheses) == 2
+    assert len(contested.missing_evidence) == 2
+
+
+def test_medium_score_with_two_validated_claims_passes_gate() -> None:
+    """Two validated claims are enough to carry a medium score through the gate."""
+    claims = [
+        {"claim": "CPU at 95% on RDS instance", "validation_status": "validated"},
+        {"claim": "Slow query log shows full-table scan", "validation_status": "validated"},
+    ]
+    corroborated = InvestigationResult(
+        root_cause="High CPU due to unindexed query on orders table.",
+        root_cause_category="database",
+        validity_score=0.60,
+        confidence_band="medium",
+        validated_claims=claims,
+    )
+    assert check_sufficiency(corroborated) is True
+
+
+def test_high_score_with_no_validated_claims_fails_gate() -> None:
+    """A self-reported high score without any validated claim does not pass."""
+    unbacked = InvestigationResult(
+        root_cause="DB connection pool exhausted.",
+        root_cause_category="database",
+        validity_score=0.85,
+        confidence_band="high",
+        validated_claims=[],
+    )
+    passed = check_sufficiency(unbacked)
+    assert passed is False
+
+
+def test_healthy_category_always_passes_gate() -> None:
+    """The ``healthy`` category passes the gate even with a low score."""
+    healthy = InvestigationResult(
+        root_cause="All systems operating normally — no incident detected.",
+        root_cause_category="healthy",
+        validity_score=0.20,
+        confidence_band="low",
+    )
+    passed = check_sufficiency(healthy)
+    assert passed is True
+
+
+def test_unknown_category_passes_gate() -> None:
+    """Unknown results pass the gate, so they never pick up a 'Most likely:' prefix."""
+    unknown = InvestigationResult.unknown("MyAlert")
+    passed = check_sufficiency(unknown)
+    assert passed is True
+    assert unknown.root_cause_category == "unknown"
+
+
+def test_gate_downgrades_band_to_low_when_fired() -> None:
+    """Whatever band a result starts with, a fired gate leaves it prefixed and low."""
+    scenarios = [
+        ("high", 0.85, []),
+        ("medium", 0.55, [{"claim": "c1", "validation_status": "validated"}]),
+    ]
+    for band, score, claims in scenarios:
+        gated = InvestigationResult(
+            root_cause="DB connection pool exhausted.",
+            root_cause_category="database",
+            validity_score=score,
+            confidence_band=band,
+            validated_claims=claims,
+        )
+        assert check_sufficiency(gated) is False
+        # Mirror the gate's effect as applied in investigation.py.
+        needs_prefix = not gated.root_cause.startswith("Most likely")
+        if needs_prefix:
+            gated.root_cause = f"Most likely: {gated.root_cause}"
+        gated.confidence_band = "low"
+        assert gated.root_cause.startswith("Most likely:")
+        assert gated.confidence_band == "low"
+
+
+def test_gate_strips_llm_most_likely_prefix_when_sufficient() -> None:
+    """When gate passes, any pre-existing LLM-generated 'Most likely:' prefix must be stripped."""
+    # check_sufficiency comes from the module-level import; the function-local
+    # re-import was redundant and has been removed.
+    result = InvestigationResult(
+        root_cause="Most likely: DB auth failure due to expired credentials.",
+        root_cause_category="database",
+        validity_score=0.80,
+        confidence_band="high",
+        validated_claims=[{"claim": "Auth error in logs", "validation_status": "validated"}],
+    )
+    assert check_sufficiency(result) is True
+    # Mirror the stripping step that investigation.py applies when the gate passes.
+    if result.root_cause.startswith("Most likely:"):
+        result.root_cause = result.root_cause[len("Most likely:") :].lstrip()
+    assert not result.root_cause.startswith("Most likely")
diff --git a/tests/cli/test_investigate.py b/tests/cli/test_investigate.py
index e577decb7..38bd8b29b 100644
--- a/tests/cli/test_investigate.py
+++ b/tests/cli/test_investigate.py
@@ -109,6 +109,7 @@ def fake_run_investigation(
"root_cause": "bad deploy",
"is_noise": False,
"validity_score": 0.0,
+ "confidence_band": "",
}