diff --git a/app/agent/investigation.py b/app/agent/investigation.py
index cd1ddfc11..e40bf8745 100644
--- a/app/agent/investigation.py
+++ b/app/agent/investigation.py
@@ -10,7 +10,7 @@
 from typing import Any
 
 from app.agent.prompt import build_system_prompt, format_alert_context
-from app.agent.result import InvestigationResult, parse_diagnosis
+from app.agent.result import InvestigationResult, check_sufficiency, parse_diagnosis
 from app.cli.support.output import debug_print, get_tracker
 from app.constants.investigation import MAX_INVESTIGATION_LOOPS
 from app.services.agent_llm_client import ToolCall, get_agent_llm
@@ -201,6 +201,9 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
                         "root_cause": error_msg,
                         "validity_score": 0.0,
                         "root_cause_category": "Configuration Error",
+                        "confidence_band": "low",
+                        "ranked_hypotheses": [],
+                        "missing_evidence": [],
                     },
                 )
                 updates = {
@@ -211,6 +214,9 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
                     "non_validated_claims": [],
                     "remediation_steps": remediation_steps,
                     "validity_score": 0.0,
+                    "confidence_band": "low",
+                    "ranked_hypotheses": [],
+                    "missing_evidence": [],
                     "investigation_recommendations": [],
                     "evidence": evidence,
                     "evidence_entries": [e.model_dump() for e in evidence_entries],
@@ -272,12 +278,23 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
         result.evidence_entries = [e.model_dump() for e in evidence_entries]
         result.agent_messages = messages
 
+        if check_sufficiency(result):
+            if result.root_cause.startswith("Most likely:"):
+                result.root_cause = result.root_cause[len("Most likely:") :].lstrip()
+        else:
+            if not result.root_cause.startswith("Most likely"):
+                result.root_cause = f"Most likely: {result.root_cause}"
+            # Override band set by classify_confidence_band in result.py: thin evidence means LOW
+            # regardless of LLM-reported score (a high score with zero claims is still insufficient).
+            result.confidence_band = "low"
+
         _emit(
             "agent_end",
             {
                 "root_cause": result.root_cause,
                 "validity_score": result.validity_score,
                 "root_cause_category": result.root_cause_category,
+                "confidence_band": result.confidence_band,
             },
         )
 
@@ -605,6 +622,9 @@ def _result_to_state(result: InvestigationResult) -> dict[str, Any]:
         "non_validated_claims": result.non_validated_claims,
         "remediation_steps": result.remediation_steps,
         "validity_score": result.validity_score,
+        "confidence_band": result.confidence_band,
+        "ranked_hypotheses": result.ranked_hypotheses,
+        "missing_evidence": result.missing_evidence,
         "investigation_recommendations": result.investigation_recommendations,
         "evidence": result.evidence,
         "evidence_entries": result.evidence_entries,
diff --git a/app/agent/prompt.py b/app/agent/prompt.py
index a86dbdcfb..1e1a18d2f 100644
--- a/app/agent/prompt.py
+++ b/app/agent/prompt.py
@@ -36,6 +36,10 @@
 - **Non-validated claims**: Hypotheses you could not confirm
 - **Remediation steps**: Ordered, concrete actions to fix the issue
 - **Validity score**: 0.0–1.0 reflecting your confidence based on evidence quality
+- **Confidence band**: `high` (strong evidence from multiple sources), `medium` (partial evidence, some gaps), or `low` (thin or conflicting evidence)
+- **Ranked hypotheses**: If confidence is medium or low, list alternative explanations in order of likelihood (most likely first)
+- **Missing evidence**: List specific data sources or queries that would confirm or refute the diagnosis but were unavailable
+- If evidence is thin or conflicting, begin your root cause statement with "Most likely: " to signal uncertainty
 """
 
 _ALERT_CONTEXT_TEMPLATE = """## Alert
diff --git a/app/agent/result.py b/app/agent/result.py
index 6aa6239b4..c22f3154d 100644
--- a/app/agent/result.py
+++ b/app/agent/result.py
@@ -4,7 +4,7 @@
 
 import logging
 from dataclasses import dataclass, field
-from typing import Any, TypedDict, cast
+from typing import Any, Literal, TypedDict, cast
 
 from pydantic import BaseModel, Field
 
@@ -17,6 +17,17 @@
 logger = logging.getLogger(__name__)
 
 
+class _ValidatedClaimSchema(BaseModel):  # one validated claim plus the evidence keys backing it
+    claim: str = Field(description="The validated claim statement")
+    evidence_sources: list[str] = Field(  # optional; defaults to an empty list when omitted
+        default_factory=list,
+        description=(
+            "Subset of the collected evidence keys that directly support this specific claim. "
+            "Only include keys that actually informed this claim."
+        ),
+    )
+
+
 @dataclass
 class InvestigationResult:
     root_cause: str
@@ -26,6 +37,9 @@ class InvestigationResult:
     non_validated_claims: list[dict] = field(default_factory=list)
     remediation_steps: list[str] = field(default_factory=list)
     validity_score: float = 0.0
+    confidence_band: str = ""
+    ranked_hypotheses: list[str] = field(default_factory=list)
+    missing_evidence: list[str] = field(default_factory=list)
     evidence: dict[str, Any] = field(default_factory=dict)
     evidence_entries: list[dict] = field(default_factory=list)
     agent_messages: list[dict] = field(default_factory=list)
@@ -37,6 +51,7 @@ def unknown(cls, alert_name: str = "Unknown alert") -> InvestigationResult:
             root_cause=f"{alert_name}: Unable to determine root cause — insufficient evidence.",
             root_cause_category="unknown",
             validity_score=0.0,
+            confidence_band="low",
             non_validated_claims=[
                 {
                     "claim": "Insufficient evidence available",
@@ -51,9 +66,26 @@ def noise(cls) -> InvestigationResult:
             root_cause="Message classified as noise — no investigation needed.",
             root_cause_category="healthy",
             validity_score=1.0,
+            confidence_band="high",
         )
 
 
+def classify_confidence_band(score: float) -> Literal["high", "medium", "low"]:
+    """Map a validity score to a band: >= 0.75 is high, >= 0.40 is medium, anything else low."""
+    # Both checks use ">=", so a NaN score (every comparison False) falls through to "low".
+    if score >= 0.75:
+        return "high"
+    return "medium" if score >= 0.40 else "low"
+
+
+def check_sufficiency(result: InvestigationResult) -> bool:
+    """True when healthy/unknown, or score >= 0.75 with 1+ validated claims, or >= 0.40 with 2+."""
+    if result.root_cause_category in {"healthy", "unknown"}:
+        return True
+    score, claims = result.validity_score, result.validated_claims
+    return (score >= 0.75 and len(claims) >= 1) or (score >= 0.40 and len(claims) >= 2)
+
+
 def parse_diagnosis(
     messages: list[dict[str, Any]],
     evidence: dict[str, Any],
@@ -121,8 +153,9 @@ class DiagnosisSchema(BaseModel):
         causal_chain: list[str] = Field(
             default_factory=list, description="Ordered steps leading to the failure"
         )
-        validated_claims: list[str] = Field(
-            default_factory=list, description="Claims supported by tool evidence"
+        validated_claims: list[_ValidatedClaimSchema] = Field(
+            default_factory=list,
+            description="Claims supported by tool evidence, each with their specific supporting evidence keys",
         )
         non_validated_claims: list[str] = Field(
             default_factory=list, description="Claims not yet confirmed by evidence"
@@ -133,6 +166,14 @@ class DiagnosisSchema(BaseModel):
         validity_score: float = Field(
             default=0.0, description="0.0–1.0 confidence in the diagnosis"
         )
+        ranked_hypotheses: list[str] = Field(
+            default_factory=list,
+            description="Alternative hypotheses ranked by likelihood (most to least likely)",
+        )
+        missing_evidence: list[str] = Field(
+            default_factory=list,
+            description="Evidence that would confirm or refute the diagnosis but was unavailable",
+        )
 
     return DiagnosisSchema
 
@@ -157,10 +198,12 @@ class _DiagnosisPayload(TypedDict):
         root_cause: str
         root_cause_category: str
         causal_chain: list[str]
-        validated_claims: list[str]
+        validated_claims: list[dict]
         non_validated_claims: list[str]
         remediation_steps: list[str]
         validity_score: float
+        ranked_hypotheses: list[str]
+        missing_evidence: list[str]
 
     llm = get_llm_for_reasoning()
     schema_model = _build_diagnosis_schema(_taxonomy_categories_for_alert_source(alert_source))
@@ -181,10 +224,23 @@ def _to_claim_dicts(claims: list[str], status: str) -> list[dict]:
         root_cause=schema["root_cause"],
         root_cause_category=schema["root_cause_category"],
         causal_chain=schema["causal_chain"],
-        validated_claims=_to_claim_dicts(schema["validated_claims"], "validated"),
+        validated_claims=[
+            {
+                "claim": c["claim"],
+                "validation_status": "validated",
+                **(
+                    {"evidence_sources": c["evidence_sources"]} if c.get("evidence_sources") else {}
+                ),
+            }
+            for c in schema["validated_claims"]
+            if c.get("claim")
+        ],
         non_validated_claims=_to_claim_dicts(schema["non_validated_claims"], "not_validated"),
         remediation_steps=schema["remediation_steps"],
         validity_score=schema["validity_score"],
+        confidence_band=classify_confidence_band(schema["validity_score"]),
+        ranked_hypotheses=schema["ranked_hypotheses"],
+        missing_evidence=schema["missing_evidence"],
     )
 
 
@@ -207,6 +263,7 @@ def _parse_via_legacy(
             ],
             remediation_steps=rr.remediation_steps,
             validity_score=0.5,
+            confidence_band=classify_confidence_band(0.5),
         )
     except Exception as err:
         logger.warning("Legacy parse_root_cause also failed: %s", err)
diff --git a/app/cli/investigation/investigate.py b/app/cli/investigation/investigate.py
index f398c8414..7dd771951 100644
--- a/app/cli/investigation/investigate.py
+++ b/app/cli/investigation/investigate.py
@@ -140,6 +140,7 @@ def run_investigation_cli(
         "root_cause": state["root_cause"],
         "is_noise": state.get("is_noise", False),
         "validity_score": state.get("validity_score", 0.0),
+        "confidence_band": state.get("confidence_band", ""),
     }
     if state.get("evidence_entries"):
         out["tool_calls"] = state["evidence_entries"]
diff --git a/app/delivery/publish_findings/formatters/report.py b/app/delivery/publish_findings/formatters/report.py
index 886a8cf86..5efdfb29a 100644
--- a/app/delivery/publish_findings/formatters/report.py
+++ b/app/delivery/publish_findings/formatters/report.py
@@ -466,6 +466,12 @@ def format_slack_message(ctx: ReportContext) -> str:
     if top_log:
         conclusion_block += f"`{top_log}`\n"
 
+    confidence_band = ctx.get("confidence_band", "")
+    validity_score_val = ctx.get("validity_score")
+    if confidence_band:
+        pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
+        conclusion_block += f"*Confidence:* {confidence_band.upper()}{pct}\n"
+
     validated_lines, non_validated_lines = _render_claim_lines(ctx)
     if validated_lines:
         # Use a larger markdown heading so that "Findings" stands out as a section.
@@ -475,6 +481,21 @@ def format_slack_message(ctx: ReportContext) -> str:
             "\n*Non-Validated Claims (Inferred):*\n" + "\n".join(non_validated_lines) + "\n"
         )
 
+    ranked_hypotheses = ctx.get("ranked_hypotheses") or []
+    if ranked_hypotheses:
+        conclusion_block += (
+            "\n*Alternative hypotheses:*\n"
+            + "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
+            + "\n"
+        )
+    missing_evidence_list = ctx.get("missing_evidence") or []
+    if missing_evidence_list:
+        conclusion_block += (
+            "\n*Missing evidence:*\n"
+            + "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
+            + "\n"
+        )
+
     correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
     if correlation_signal_lines or correlation_driver_lines:
         conclusion_block += "\n## Upstream Correlation\n"
@@ -556,12 +577,33 @@ def format_telegram_message(ctx: ReportContext) -> str:
             rc += "\n<code>" + html.escape(top_log) + "</code>"
         parts.append(rc)
 
+    confidence_band = ctx.get("confidence_band", "")
+    validity_score_val = ctx.get("validity_score")
+    if confidence_band:
+        pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
+        parts.append(f"<b>Confidence:</b> {html.escape(confidence_band.upper())}{pct}")
+
     validated_lines, non_validated_lines = _render_claim_lines_telegram(ctx)
     if validated_lines:
         parts.append("<b>Findings</b>\n" + "\n".join(validated_lines))
     if non_validated_lines:
         parts.append("<b>Non-Validated Claims (Inferred)</b>\n" + "\n".join(non_validated_lines))
 
+    ranked_hypotheses = ctx.get("ranked_hypotheses") or []
+    if ranked_hypotheses:
+        hyp = "\n".join(
+            "• " + _to_telegram_html_body(_sanitize_for_slack(str(h))) for h in ranked_hypotheses
+        )
+        parts.append("<b>Alternative hypotheses</b>\n" + hyp)
+
+    missing_evidence_list = ctx.get("missing_evidence") or []
+    if missing_evidence_list:
+        me = "\n".join(
+            "• " + _to_telegram_html_body(_sanitize_for_slack(str(e)))
+            for e in missing_evidence_list
+        )
+        parts.append("<b>Missing evidence</b>\n" + me)
+
     provenance_lines = _format_provenance_lines(ctx)
     if provenance_lines:
         prov = "\n".join(
@@ -695,6 +737,13 @@ def _add(block: "dict[str, Any] | None") -> None:
         rc_text += f"\n`{top_log}`"
     _add(_mrkdwn_section(rc_text))
 
+    # ── Confidence band ──
+    confidence_band = ctx.get("confidence_band", "")
+    validity_score_val = ctx.get("validity_score")
+    if confidence_band:
+        pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
+        _add(_mrkdwn_section(f"*Confidence:* {confidence_band.upper()}{pct}"))
+
     # ── Failed Pods ──
     datadog_site = ctx.get("datadog_site", "datadoghq.com")
     all_pods = get_failed_pods(ctx)
@@ -727,6 +776,28 @@ def _add(block: "dict[str, Any] | None") -> None:
     if non_validated_lines:
         _add(_mrkdwn_section("*Inferred (not yet validated)*\n" + "\n".join(non_validated_lines)))
 
+    # ── Alternative Hypotheses ──
+    ranked_hypotheses = ctx.get("ranked_hypotheses") or []
+    if ranked_hypotheses:
+        blocks.append({"type": "divider"})
+        _add(
+            _mrkdwn_section(
+                "*Alternative hypotheses:*\n"
+                + "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
+            )
+        )
+
+    # ── Missing Evidence ──
+    missing_evidence_list = ctx.get("missing_evidence") or []
+    if missing_evidence_list:
+        blocks.append({"type": "divider"})
+        _add(
+            _mrkdwn_section(
+                "*Missing evidence:*\n"
+                + "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
+            )
+        )
+
     correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
     if correlation_signal_lines or correlation_driver_lines:
         blocks.append({"type": "divider"})
diff --git a/app/delivery/publish_findings/node.py b/app/delivery/publish_findings/node.py
index 0d865239c..f3c39ba59 100644
--- a/app/delivery/publish_findings/node.py
+++ b/app/delivery/publish_findings/node.py
@@ -48,7 +48,14 @@ def generate_report(state: InvestigationState) -> dict:
 
     all_blocks = build_slack_blocks(ctx) + build_action_blocks(investigation_url, investigation_id)
     all_blocks = masking_ctx.unmask_value(all_blocks)
-    render_report(slack_message, root_cause_category=state.get("root_cause_category"))
+    render_report(
+        slack_message,
+        root_cause_category=state.get("root_cause_category"),
+        confidence_band=state.get("confidence_band", ""),
+        validity_score=state.get("validity_score"),
+        ranked_hypotheses=state.get("ranked_hypotheses", []),
+        missing_evidence=state.get("missing_evidence", []),
+    )
     open_in_editor(slack_message)
 
     slack_ctx = state.get("slack_context", {})
diff --git a/app/delivery/publish_findings/renderers/terminal.py b/app/delivery/publish_findings/renderers/terminal.py
index 4b455567e..c85f9e43c 100644
--- a/app/delivery/publish_findings/renderers/terminal.py
+++ b/app/delivery/publish_findings/renderers/terminal.py
@@ -1,5 +1,6 @@
 """Terminal rendering for RCA reports — Claude-style output."""
 
+import math
 import re
 
 from rich.console import Console
@@ -70,6 +71,41 @@ def _strip_mrkdwn(text: str) -> str:
 _BOLD_RE = re.compile(r"\*\*?([^*]+)\*\*?")
 
 
+_CONFIDENCE_LINE_RE = re.compile(r"^\*?Confidence:\*?\s+\w+", re.IGNORECASE)
+_CONFIDENCE_BLOCK_LABELS = frozenset({"*Alternative hypotheses:*", "*Missing evidence:*"})
+
+
+def _filter_confidence_sections(lines: list[str]) -> list[str]:
+    """Return *lines* without the parts that _render_rich_confidence_block prints itself.
+
+    Two things are removed so nothing is shown twice:
+    - the inline ``Confidence: ...`` line, and
+    - the Alternative hypotheses / Missing evidence sections, header and body alike.
+
+    A removed section ends at the next line matching _HEADING_RE or at any *Label:* header
+    that is not one of those two; that terminating line is then handled like any other.
+    """
+    kept: list[str] = []
+    skipping = False
+    for raw in lines:
+        text = raw.strip()
+        if skipping:
+            starts_new_section = bool(_HEADING_RE.match(text)) or (
+                text.startswith("*")
+                and text.endswith(":*")
+                and text not in _CONFIDENCE_BLOCK_LABELS
+            )
+            if not starts_new_section:
+                continue  # still inside a dropped section
+            skipping = False
+        # A section header starts a dropped run; the inline Confidence line is dropped alone.
+        if text in _CONFIDENCE_BLOCK_LABELS:
+            skipping = True
+        elif not _CONFIDENCE_LINE_RE.match(text):
+            kept.append(raw)
+    return kept
+
+
 def _render_rich_section_heading(console: Console, title: str) -> None:
     console.print()
     t = Text()
@@ -111,7 +147,14 @@ def _render_rich_evidence_item(console: Console, line: str) -> None:
 # ─────────────────────────────────────────────────────────────────────────────
 
 
-def render_report(slack_message: str, root_cause_category: str | None = None) -> None:
+def render_report(
+    slack_message: str,
+    root_cause_category: str | None = None,
+    confidence_band: str = "",
+    validity_score: float | None = None,
+    ranked_hypotheses: list[str] | None = None,
+    missing_evidence: list[str] | None = None,
+) -> None:
     """Render the final RCA report to terminal."""
     from app.cli.support.output import stop_display
 
@@ -128,17 +171,38 @@ def render_report(slack_message: str, root_cause_category: str | None = None) ->
         return
 
     if fmt == "rich":
-        _render_rich_report(slack_message, root_cause_category=root_cause_category)
+        _render_rich_report(
+            slack_message,
+            root_cause_category=root_cause_category,
+            confidence_band=confidence_band,
+            validity_score=validity_score,
+            ranked_hypotheses=ranked_hypotheses or [],
+            missing_evidence=missing_evidence or [],
+        )
     else:
-        _render_plain_report(slack_message, root_cause_category=root_cause_category)
-
-
-def _render_rich_report(slack_message: str, root_cause_category: str | None = None) -> None:
+        _render_plain_report(
+            slack_message,
+            root_cause_category=root_cause_category,
+            confidence_band=confidence_band,
+            validity_score=validity_score,
+            ranked_hypotheses=ranked_hypotheses or [],
+            missing_evidence=missing_evidence or [],
+        )
+
+
+def _render_rich_report(
+    slack_message: str,
+    root_cause_category: str | None = None,
+    confidence_band: str = "",
+    validity_score: float | None = None,
+    ranked_hypotheses: list[str] | None = None,
+    missing_evidence: list[str] | None = None,
+) -> None:
     _ = root_cause_category
     console = Console()
     console.print()
 
-    lines = slack_message.splitlines()
+    lines = _filter_confidence_sections(slack_message.splitlines())
     in_evidence = False
 
     for line in lines:
@@ -197,11 +261,81 @@ def _render_rich_report(slack_message: str, root_cause_category: str | None = No
         t.append_text(_rich_line_with_links(stripped))
         console.print(t)
 
+    _render_rich_confidence_block(
+        console,
+        confidence_band=confidence_band,
+        validity_score=validity_score,
+        ranked_hypotheses=ranked_hypotheses or [],
+        missing_evidence=missing_evidence or [],
+    )
     console.print()
 
 
-def _render_plain_report(slack_message: str, root_cause_category: str | None = None) -> None:
+def _render_rich_confidence_block(
+    console: Console,
+    confidence_band: str,
+    validity_score: float | None,
+    ranked_hypotheses: list[str],
+    missing_evidence: list[str],
+) -> None:
+    """Print the styled confidence line, then alternative hypotheses and missing evidence.
+
+    The two lists are printed whenever they are non-empty, even with no band or score:
+    _render_rich_report strips them from the Slack text, so skipping them here loses them.
+    """
+    band_upper = confidence_band.upper() if confidence_band else ""
+    parts = [band_upper] if band_upper else []
+    if validity_score is not None and math.isfinite(validity_score):
+        # Round like the Slack "{score:.0%}" line; int(score * 100) truncated 0.29 to 28%.
+        parts.append(f"({validity_score:.0%})")
+    # No band and no usable score: omit the confidence line instead of printing a bare label.
+    if parts:
+        palette = {"HIGH": "bold green", "MEDIUM": "bold yellow", "LOW": "bold red"}
+        console.print()
+        line = Text("  Confidence: ")
+        line.append(" ".join(parts), style=palette.get(band_upper, f"bold {TEXT}"))
+        console.print(line)
+
+    if ranked_hypotheses:
+        _render_rich_section_heading(console, "Alternative hypotheses")
+        for h in ranked_hypotheses:
+            _render_rich_bullet(console, h)
+
+    if missing_evidence:
+        _render_rich_section_heading(console, "Missing evidence")
+        for item in missing_evidence:
+            _render_rich_bullet(console, item)
+
+
+def _render_plain_report(
+    slack_message: str,
+    root_cause_category: str | None = None,
+    confidence_band: str = "",
+    validity_score: float | None = None,
+    ranked_hypotheses: list[str] | None = None,
+    missing_evidence: list[str] | None = None,
+) -> None:
+    """Print the plain-text report, then the confidence line, hypotheses and missing evidence."""
     _ = root_cause_category
     print()
-    clean = _strip_slack_links(_strip_mrkdwn(slack_message))
+    filtered = "\n".join(_filter_confidence_sections(slack_message.splitlines()))
+    clean = _strip_slack_links(_strip_mrkdwn(filtered))
     print(clean)
+
+    summary = [confidence_band.upper()] if confidence_band else []
+    if validity_score is not None and math.isfinite(validity_score):
+        # Round like the Slack "{score:.0%}" line; int(score * 100) truncated 0.29 to 28%.
+        summary.append(f"({validity_score:.0%})")
+    if summary:
+        # Joining the parts keeps the leading blank line and avoids a doubled space.
+        print("\nConfidence: " + " ".join(summary))
+
+    if ranked_hypotheses:
+        print("\nAlternative hypotheses:")
+        for h in ranked_hypotheses:
+            print(f"  - {h}")
+
+    if missing_evidence:
+        print("\nMissing evidence:")
+        for item in missing_evidence:
+            print(f"  - {item}")
diff --git a/app/delivery/publish_findings/report_context.py b/app/delivery/publish_findings/report_context.py
index da5c16335..dbae4fdf4 100644
--- a/app/delivery/publish_findings/report_context.py
+++ b/app/delivery/publish_findings/report_context.py
@@ -46,6 +46,9 @@ class ReportContext(TypedDict, total=False):
     validated_claims: list[dict]
     non_validated_claims: list[dict]
     validity_score: float
+    confidence_band: str
+    ranked_hypotheses: list[str]
+    missing_evidence: list[str]
     investigation_recommendations: list[str]
     remediation_steps: list[str]
     correlation: dict[str, Any]
@@ -918,6 +921,9 @@ def build_report_context(state: InvestigationState) -> ReportContext:
         "validated_claims": validated_claims,
         "non_validated_claims": non_validated_claims,
         "validity_score": state.get("validity_score", 0.0),
+        "confidence_band": state.get("confidence_band", ""),
+        "ranked_hypotheses": state.get("ranked_hypotheses", []),
+        "missing_evidence": state.get("missing_evidence", []),
         "investigation_recommendations": state.get("investigation_recommendations", []),
         "remediation_steps": state.get("remediation_steps", []),
         "correlation": state.get("correlation", {}),
diff --git a/app/pipeline/runners.py b/app/pipeline/runners.py
index c4e6d18f6..7b8a384d1 100644
--- a/app/pipeline/runners.py
+++ b/app/pipeline/runners.py
@@ -276,6 +276,9 @@ def _run_pipeline() -> None:
                             "root_cause": state_any.get("root_cause", ""),
                             "root_cause_category": state_any.get("root_cause_category", ""),
                             "validity_score": state_any.get("validity_score"),
+                            "confidence_band": state_any.get("confidence_band", ""),
+                            "ranked_hypotheses": state_any.get("ranked_hypotheses", []),
+                            "missing_evidence": state_any.get("missing_evidence", []),
                             "report": state_any.get("report", ""),
                             "slack_message": state_any.get("slack_message", ""),
                             "problem_md": state_any.get("problem_md", ""),
diff --git a/app/remote/renderer.py b/app/remote/renderer.py
index 1488f7625..f04e0ebf6 100644
--- a/app/remote/renderer.py
+++ b/app/remote/renderer.py
@@ -758,6 +758,9 @@ def _build_node_message(self, node: str) -> str | None:
                 return f"Resolved: {names}"
         if node in {"diagnose", "diagnose_root_cause"}:
             pct = _validity_score_percent(self._final_state.get("validity_score"))
+            band = self._final_state.get("confidence_band", "")
+            if pct and band:
+                return f"validity:{band.upper()}({pct})"
             if pct:
                 return f"validity:{pct}"
         return None
@@ -781,7 +784,14 @@ def _print_report(self) -> None:
 
         from app.delivery.publish_findings.renderers.terminal import render_report as _render
 
-        _render(slack_message, root_cause_category=root_cause_category)
+        _render(
+            slack_message,
+            root_cause_category=root_cause_category,
+            confidence_band=self._final_state.get("confidence_band", ""),
+            validity_score=self._final_state.get("validity_score"),
+            ranked_hypotheses=self._final_state.get("ranked_hypotheses") or [],
+            missing_evidence=self._final_state.get("missing_evidence") or [],
+        )
 
 
 def _canonical_node_name(name: str) -> str:
diff --git a/app/state/agent_state.py b/app/state/agent_state.py
index 87f69f5a7..1c17ac16b 100644
--- a/app/state/agent_state.py
+++ b/app/state/agent_state.py
@@ -77,6 +77,9 @@ class AgentState(TypedDict, total=False):
     validated_claims: list[dict[str, Any]]
     non_validated_claims: list[dict[str, Any]]
     validity_score: float
+    confidence_band: str
+    ranked_hypotheses: list[str]
+    missing_evidence: list[str]
     investigation_recommendations: list[str]
     remediation_steps: list[str]
     investigation_loop_count: int
@@ -185,6 +188,9 @@ class AgentStateModel(StrictConfigModel):
     validated_claims: list[dict[str, Any]] = Field(default_factory=list)
     non_validated_claims: list[dict[str, Any]] = Field(default_factory=list)
     validity_score: float = 0.0
+    confidence_band: str = ""
+    ranked_hypotheses: list[str] = Field(default_factory=list)
+    missing_evidence: list[str] = Field(default_factory=list)
     investigation_recommendations: list[str] = Field(default_factory=list)
     remediation_steps: list[str] = Field(default_factory=list)
     investigation_loop_count: int = 0
diff --git a/app/state/factory.py b/app/state/factory.py
index 9b9cc2b11..d64fc48c4 100644
--- a/app/state/factory.py
+++ b/app/state/factory.py
@@ -36,6 +36,9 @@
     "validated_claims": [],
     "non_validated_claims": [],
     "validity_score": 0.0,
+    "confidence_band": "",
+    "ranked_hypotheses": [],
+    "missing_evidence": [],
     "investigation_recommendations": [],
     "remediation_steps": [],
     "investigation_loop_count": 0,
diff --git a/app/utils/ingest_delivery.py b/app/utils/ingest_delivery.py
index 1638c6cec..73e1107a5 100644
--- a/app/utils/ingest_delivery.py
+++ b/app/utils/ingest_delivery.py
@@ -69,6 +69,7 @@ def build_ingest_payload(state: InvestigationState) -> dict[str, Any]:
         "root_cause": state.get("root_cause") or "",
         "confidence": state.get("validity_score") or 0,
         "validity_score": state.get("validity_score") or 0,
+        "confidence_band": state.get("confidence_band") or "",
         "planned_actions": planned_actions,
         "problem_md": state.get("problem_md") or "",
         "investigation_recommendations": state.get("investigation_recommendations") or [],
diff --git a/app/utils/openclaw_delivery.py b/app/utils/openclaw_delivery.py
index 1d9024e0d..4bca853f4 100644
--- a/app/utils/openclaw_delivery.py
+++ b/app/utils/openclaw_delivery.py
@@ -33,8 +33,10 @@ def _report_body(state: InvestigationState, report: str) -> str:
             sections.append(f"Remediation steps:\n{rendered_steps}")
 
     validity_score = state.get("validity_score")
+    confidence_band = state.get("confidence_band", "")
     if isinstance(validity_score, (int, float)):
-        sections.append(f"Confidence: {validity_score:.0%}")
+        band_str = f" [{confidence_band.upper()}]" if confidence_band else ""
+        sections.append(f"Confidence: {validity_score:.0%}{band_str}")
 
     return "\n\n".join(section for section in sections if section).strip()
 
diff --git a/docs/investigation-overview.mdx b/docs/investigation-overview.mdx
index 8324d8a57..414585784 100644
--- a/docs/investigation-overview.mdx
+++ b/docs/investigation-overview.mdx
@@ -56,6 +56,19 @@ Each run captures:
 - tool outputs collected from connected integrations
 - final diagnosis and recommended remediation steps
 
+### Confidence and Evidence Sufficiency
+
+When a diagnosis is reached, OpenSRE evaluates the available evidence and assigns a **Confidence Band**:
+- **HIGH** (Score ≥ 0.75, at least 1 validated finding): Strong evidence backing the diagnosis.
+- **MEDIUM** (Score 0.40–0.74, at least 2 validated findings): Partial evidence with some gaps.
+- **LOW** (Score < 0.40, or insufficient validated findings): Thin or conflicting evidence.
+
+When evidence is insufficient — a low score or fewer validated findings than the threshold — the system prefixes the root cause with *"Most likely:"* to signal uncertainty and reports the band as LOW. Diagnoses categorized as healthy or unknown skip this check.
+
+When the investigation identifies gaps, the report exposes:
+- **Alternative hypotheses** — ranked by likelihood, most likely first (requested from the model when confidence is medium or low; shown whenever any are returned)
+- **Missing evidence** — specific data sources that would confirm or refute the diagnosis
+
 ## Chat
 
 For local binary usage, the primary workflow is file-based (`problem.md`, `report.md`, and optional JSON output).
diff --git a/tests/agent/test_confidence_gating.py b/tests/agent/test_confidence_gating.py
new file mode 100644
index 000000000..efe5b572a
--- /dev/null
+++ b/tests/agent/test_confidence_gating.py
@@ -0,0 +1,189 @@
+"""Tests for confidence band classification and evidence sufficiency gating."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agent.result import (
+    InvestigationResult,
+    check_sufficiency,
+    classify_confidence_band,
+)
+
+# ---------------------------------------------------------------------------
+# classify_confidence_band
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "score, expected",
+    [
+        (1.0, "high"),
+        (0.75, "high"),
+        (0.74, "medium"),
+        (0.40, "medium"),
+        (0.39, "low"),
+        (0.0, "low"),
+    ],
+)
+def test_classify_confidence_band_thresholds(score: float, expected: str) -> None:
+    assert classify_confidence_band(score) == expected
+
+
+# ---------------------------------------------------------------------------
+# InvestigationResult factory classmethods set correct band
+# ---------------------------------------------------------------------------
+
+
+def test_unknown_result_has_low_band() -> None:
+    result = InvestigationResult.unknown("test-alert")
+    assert result.confidence_band == "low"
+    assert result.validity_score == 0.0
+
+
+def test_noise_result_has_high_band() -> None:
+    result = InvestigationResult.noise()
+    assert result.confidence_band == "high"
+    assert result.validity_score == 1.0
+
+
+# ---------------------------------------------------------------------------
+# check_sufficiency — the three scenarios from the issue, plus edge cases
+# ---------------------------------------------------------------------------
+
+
+def test_sufficient_evidence_passes_gate() -> None:
+    """High score + multiple validated claims → definitive, no prefix needed."""
+    result = InvestigationResult(
+        root_cause="DB connection pool exhausted due to query buildup.",
+        root_cause_category="database",
+        validity_score=0.85,
+        confidence_band="high",
+        validated_claims=[
+            {"claim": "Connection pool at 100%", "validation_status": "validated"},
+            {"claim": "Query latency spiked at 14:32 UTC", "validation_status": "validated"},
+        ],
+    )
+    assert check_sufficiency(result) is True
+
+
+def test_weak_evidence_fails_gate() -> None:
+    """Low score + no validated claims → gate fires, root cause should be prefixed."""
+    result = InvestigationResult(
+        root_cause="Suspected memory leak in worker process.",
+        root_cause_category="performance",
+        validity_score=0.30,
+        confidence_band="low",
+        validated_claims=[],
+    )
+    assert check_sufficiency(result) is False
+    # Simulate gate behaviour applied in investigation.py
+    if not result.root_cause.startswith("Most likely"):
+        result.root_cause = f"Most likely: {result.root_cause}"
+    assert result.root_cause.startswith("Most likely:")
+
+
+def test_conflicting_evidence_medium_band_fails_gate() -> None:
+    """Medium score with only one validated claim — gate fires."""
+    result = InvestigationResult(
+        root_cause="Possible network partition or config drift after deployment.",
+        root_cause_category="network",
+        validity_score=0.55,
+        confidence_band="medium",
+        validated_claims=[
+            {"claim": "Packet loss observed on inter-AZ traffic", "validation_status": "validated"},
+        ],
+        ranked_hypotheses=["Network partition between AZs", "Config drift after last deploy"],
+        missing_evidence=[
+            "VPC flow logs for the affected subnets",
+            "Deployment history for the last 2 hours",
+        ],
+    )
+    assert classify_confidence_band(result.validity_score) == "medium"
+    assert check_sufficiency(result) is False
+    assert len(result.ranked_hypotheses) == 2
+    assert len(result.missing_evidence) == 2
+
+
+def test_medium_score_with_two_validated_claims_passes_gate() -> None:
+    """Medium score but 2+ validated claims is considered sufficient."""
+    result = InvestigationResult(
+        root_cause="High CPU due to unindexed query on orders table.",
+        root_cause_category="database",
+        validity_score=0.60,
+        confidence_band="medium",
+        validated_claims=[
+            {"claim": "CPU at 95% on RDS instance", "validation_status": "validated"},
+            {"claim": "Slow query log shows full-table scan", "validation_status": "validated"},
+        ],
+    )
+    assert check_sufficiency(result) is True
+
+
+def test_high_score_with_no_validated_claims_fails_gate() -> None:
+    """High validity_score but zero validated claims must not pass — LLM self-report alone insufficient."""
+    result = InvestigationResult(
+        root_cause="DB connection pool exhausted.",
+        root_cause_category="database",
+        validity_score=0.85,
+        confidence_band="high",
+        validated_claims=[],
+    )
+    assert check_sufficiency(result) is False
+
+
+def test_healthy_category_always_passes_gate() -> None:
+    """Healthy findings are always definitive regardless of score."""
+    result = InvestigationResult(
+        root_cause="All systems operating normally — no incident detected.",
+        root_cause_category="healthy",
+        validity_score=0.20,
+        confidence_band="low",
+    )
+    assert check_sufficiency(result) is True
+
+
+def test_unknown_category_passes_gate() -> None:
+    """Unknown results must not receive a 'Most likely:' prefix — already communicate uncertainty."""
+    result = InvestigationResult.unknown("MyAlert")
+    assert check_sufficiency(result) is True
+    assert result.root_cause_category == "unknown"
+
+
+def test_gate_downgrades_band_to_low_when_fired() -> None:
+    """Gate always sets band to low — regardless of starting band — to stay consistent with prefix."""
+    for starting_band, validity_score, validated_claims in [
+        ("high", 0.85, []),
+        ("medium", 0.55, [{"claim": "c1", "validation_status": "validated"}]),
+    ]:
+        result = InvestigationResult(
+            root_cause="DB connection pool exhausted.",
+            root_cause_category="database",
+            validity_score=validity_score,
+            confidence_band=starting_band,
+            validated_claims=validated_claims,
+        )
+        assert check_sufficiency(result) is False
+        # Simulate gate behaviour applied in investigation.py
+        if not result.root_cause.startswith("Most likely"):
+            result.root_cause = f"Most likely: {result.root_cause}"
+        result.confidence_band = "low"
+        assert result.root_cause.startswith("Most likely:")
+        assert result.confidence_band == "low"
+
+
+def test_gate_strips_llm_most_likely_prefix_when_sufficient() -> None:
+    """When gate passes, any pre-existing LLM-generated 'Most likely:' prefix must be stripped."""
+    from app.agent.result import check_sufficiency
+
+    result = InvestigationResult(
+        root_cause="Most likely: DB auth failure due to expired credentials.",
+        root_cause_category="database",
+        validity_score=0.80,
+        confidence_band="high",
+        validated_claims=[{"claim": "Auth error in logs", "validation_status": "validated"}],
+    )
+    assert check_sufficiency(result) is True
+    if result.root_cause.startswith("Most likely:"):
+        result.root_cause = result.root_cause[len("Most likely:") :].lstrip()
+    assert not result.root_cause.startswith("Most likely")
diff --git a/tests/cli/test_investigate.py b/tests/cli/test_investigate.py
index e577decb7..38bd8b29b 100644
--- a/tests/cli/test_investigate.py
+++ b/tests/cli/test_investigate.py
@@ -109,6 +109,7 @@ def fake_run_investigation(
         "root_cause": "bad deploy",
         "is_noise": False,
         "validity_score": 0.0,
+        "confidence_band": "",
     }