Skip to content
22 changes: 21 additions & 1 deletion app/agent/investigation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing import Any

from app.agent.prompt import build_system_prompt, format_alert_context
from app.agent.result import InvestigationResult, parse_diagnosis
from app.agent.result import InvestigationResult, check_sufficiency, parse_diagnosis
from app.cli.support.output import debug_print, get_tracker
from app.constants.investigation import MAX_INVESTIGATION_LOOPS
from app.services.agent_llm_client import ToolCall, get_agent_llm
Expand Down Expand Up @@ -201,6 +201,9 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
"root_cause": error_msg,
"validity_score": 0.0,
"root_cause_category": "Configuration Error",
"confidence_band": "low",
"ranked_hypotheses": [],
"missing_evidence": [],
},
)
updates = {
Expand All @@ -211,6 +214,9 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
"non_validated_claims": [],
"remediation_steps": remediation_steps,
"validity_score": 0.0,
"confidence_band": "low",
"ranked_hypotheses": [],
"missing_evidence": [],
"investigation_recommendations": [],
"evidence": evidence,
"evidence_entries": [e.model_dump() for e in evidence_entries],
Expand Down Expand Up @@ -272,12 +278,23 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
result.evidence_entries = [e.model_dump() for e in evidence_entries]
result.agent_messages = messages

if check_sufficiency(result):
if result.root_cause.startswith("Most likely:"):
result.root_cause = result.root_cause[len("Most likely:") :].lstrip()
else:
if not result.root_cause.startswith("Most likely"):
result.root_cause = f"Most likely: {result.root_cause}"
# Override band set by classify_confidence_band in result.py: thin evidence means LOW
# regardless of LLM-reported score (a high score with zero claims is still insufficient).
result.confidence_band = "low"

_emit(
"agent_end",
{
"root_cause": result.root_cause,
"validity_score": result.validity_score,
"root_cause_category": result.root_cause_category,
"confidence_band": result.confidence_band,
},
)

Expand Down Expand Up @@ -605,6 +622,9 @@ def _result_to_state(result: InvestigationResult) -> dict[str, Any]:
"non_validated_claims": result.non_validated_claims,
"remediation_steps": result.remediation_steps,
"validity_score": result.validity_score,
"confidence_band": result.confidence_band,
"ranked_hypotheses": result.ranked_hypotheses,
"missing_evidence": result.missing_evidence,
"investigation_recommendations": result.investigation_recommendations,
"evidence": result.evidence,
"evidence_entries": result.evidence_entries,
Expand Down
4 changes: 4 additions & 0 deletions app/agent/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
- **Non-validated claims**: Hypotheses you could not confirm
- **Remediation steps**: Ordered, concrete actions to fix the issue
- **Validity score**: 0.0–1.0 reflecting your confidence based on evidence quality
- **Confidence band**: `high` (strong evidence from multiple sources), `medium` (partial evidence, some gaps), or `low` (thin or conflicting evidence)
- **Ranked hypotheses**: If confidence is medium or low, list alternative explanations in order of likelihood (most likely first)
- **Missing evidence**: List specific data sources or queries that would confirm or refute the diagnosis but were unavailable
- If evidence is thin or conflicting, begin your root cause statement with "Most likely: " to signal uncertainty
"""

_ALERT_CONTEXT_TEMPLATE = """## Alert
Expand Down
67 changes: 62 additions & 5 deletions app/agent/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import logging
from dataclasses import dataclass, field
from typing import Any, TypedDict, cast
from typing import Any, Literal, TypedDict, cast

from pydantic import BaseModel, Field

Expand All @@ -17,6 +17,17 @@
logger = logging.getLogger(__name__)


# Structured-output shape for a single validated claim: the claim text plus the
# evidence keys that back it. Used as the element type of the diagnosis schema's
# `validated_claims` list.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — pydantic appears to surface model docstrings in the generated JSON
# schema, which would change what the LLM sees; confirm before converting.
class _ValidatedClaimSchema(BaseModel):
    # Required: the claim statement itself.
    claim: str = Field(description="The validated claim statement")
    # Optional: defaults to a fresh empty list when no sources are supplied.
    evidence_sources: list[str] = Field(
        default_factory=list,
        description=(
            "Subset of the collected evidence keys that directly support this specific claim. "
            "Only include keys that actually informed this claim."
        ),
    )


@dataclass
class InvestigationResult:
root_cause: str
Expand All @@ -26,6 +37,9 @@ class InvestigationResult:
non_validated_claims: list[dict] = field(default_factory=list)
remediation_steps: list[str] = field(default_factory=list)
validity_score: float = 0.0
confidence_band: str = ""
ranked_hypotheses: list[str] = field(default_factory=list)
missing_evidence: list[str] = field(default_factory=list)
evidence: dict[str, Any] = field(default_factory=dict)
evidence_entries: list[dict] = field(default_factory=list)
agent_messages: list[dict] = field(default_factory=list)
Expand All @@ -37,6 +51,7 @@ def unknown(cls, alert_name: str = "Unknown alert") -> InvestigationResult:
root_cause=f"{alert_name}: Unable to determine root cause — insufficient evidence.",
root_cause_category="unknown",
validity_score=0.0,
confidence_band="low",
non_validated_claims=[
{
"claim": "Insufficient evidence available",
Expand All @@ -51,9 +66,26 @@ def noise(cls) -> InvestigationResult:
root_cause="Message classified as noise — no investigation needed.",
root_cause_category="healthy",
validity_score=1.0,
confidence_band="high",
)


def classify_confidence_band(score: float) -> Literal["high", "medium", "low"]:
    """Map a numeric validity score onto a coarse confidence band.

    Args:
        score: Validity score of a diagnosis.

    Returns:
        ``"high"`` when ``score >= 0.75``, ``"medium"`` when ``score >= 0.40``,
        and ``"low"`` for everything else.
    """
    band: Literal["high", "medium", "low"]
    # Only ``>=`` comparisons are used, highest tier first, so a score that
    # compares false against both thresholds (e.g. NaN) ends up in "low".
    if score >= 0.75:
        band = "high"
    elif score >= 0.40:
        band = "medium"
    else:
        band = "low"
    return band


def check_sufficiency(result: InvestigationResult) -> bool:
    """Decide whether a diagnosis is backed by enough validated evidence.

    Args:
        result: The investigation result to evaluate. Reads
            ``root_cause_category``, ``validity_score`` and ``validated_claims``.

    Returns:
        ``True`` when the category is ``"healthy"`` or ``"unknown"``, when the
        score is at least 0.75 with one or more validated claims, or when the
        score is at least 0.40 with two or more validated claims; ``False``
        otherwise.
    """
    if result.root_cause_category in {"healthy", "unknown"}:
        return True

    score = result.validity_score
    # The claim list is only inspected once a score threshold is met, so a
    # low-scoring result never touches ``validated_claims``.
    if score >= 0.75:
        # At this tier a single validated claim suffices (two or more also
        # satisfies the lower tier's requirement, so no fall-through is needed).
        return len(result.validated_claims) >= 1
    if score >= 0.40:
        return len(result.validated_claims) >= 2
    return False
Comment thread
greptile-apps[bot] marked this conversation as resolved.


def parse_diagnosis(
messages: list[dict[str, Any]],
evidence: dict[str, Any],
Expand Down Expand Up @@ -121,8 +153,9 @@ class DiagnosisSchema(BaseModel):
causal_chain: list[str] = Field(
default_factory=list, description="Ordered steps leading to the failure"
)
validated_claims: list[str] = Field(
default_factory=list, description="Claims supported by tool evidence"
validated_claims: list[_ValidatedClaimSchema] = Field(
default_factory=list,
description="Claims supported by tool evidence, each with their specific supporting evidence keys",
)
non_validated_claims: list[str] = Field(
default_factory=list, description="Claims not yet confirmed by evidence"
Expand All @@ -133,6 +166,14 @@ class DiagnosisSchema(BaseModel):
validity_score: float = Field(
default=0.0, description="0.0–1.0 confidence in the diagnosis"
)
ranked_hypotheses: list[str] = Field(
default_factory=list,
description="Alternative hypotheses ranked by likelihood (most to least likely)",
)
missing_evidence: list[str] = Field(
default_factory=list,
description="Evidence that would confirm or refute the diagnosis but was unavailable",
)

return DiagnosisSchema

Expand All @@ -157,10 +198,12 @@ class _DiagnosisPayload(TypedDict):
root_cause: str
root_cause_category: str
causal_chain: list[str]
validated_claims: list[str]
validated_claims: list[dict]
non_validated_claims: list[str]
remediation_steps: list[str]
validity_score: float
ranked_hypotheses: list[str]
missing_evidence: list[str]

llm = get_llm_for_reasoning()
schema_model = _build_diagnosis_schema(_taxonomy_categories_for_alert_source(alert_source))
Expand All @@ -181,10 +224,23 @@ def _to_claim_dicts(claims: list[str], status: str) -> list[dict]:
root_cause=schema["root_cause"],
root_cause_category=schema["root_cause_category"],
causal_chain=schema["causal_chain"],
validated_claims=_to_claim_dicts(schema["validated_claims"], "validated"),
validated_claims=[
{
"claim": c["claim"],
"validation_status": "validated",
**(
{"evidence_sources": c["evidence_sources"]} if c.get("evidence_sources") else {}
),
}
for c in schema["validated_claims"]
if c.get("claim")
],
non_validated_claims=_to_claim_dicts(schema["non_validated_claims"], "not_validated"),
remediation_steps=schema["remediation_steps"],
validity_score=schema["validity_score"],
confidence_band=classify_confidence_band(schema["validity_score"]),
ranked_hypotheses=schema["ranked_hypotheses"],
missing_evidence=schema["missing_evidence"],
)


Expand All @@ -207,6 +263,7 @@ def _parse_via_legacy(
],
remediation_steps=rr.remediation_steps,
validity_score=0.5,
confidence_band=classify_confidence_band(0.5),
)
except Exception as err:
logger.warning("Legacy parse_root_cause also failed: %s", err)
Expand Down
1 change: 1 addition & 0 deletions app/cli/investigation/investigate.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def run_investigation_cli(
"root_cause": state["root_cause"],
"is_noise": state.get("is_noise", False),
"validity_score": state.get("validity_score", 0.0),
"confidence_band": state.get("confidence_band", ""),
}
if state.get("evidence_entries"):
out["tool_calls"] = state["evidence_entries"]
Expand Down
71 changes: 71 additions & 0 deletions app/delivery/publish_findings/formatters/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,12 @@ def format_slack_message(ctx: ReportContext) -> str:
if top_log:
conclusion_block += f"`{top_log}`\n"

confidence_band = ctx.get("confidence_band", "")
validity_score_val = ctx.get("validity_score")
if confidence_band:
pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
conclusion_block += f"*Confidence:* {confidence_band.upper()}{pct}\n"

validated_lines, non_validated_lines = _render_claim_lines(ctx)
if validated_lines:
# Use a larger markdown heading so that "Findings" stands out as a section.
Expand All @@ -475,6 +481,21 @@ def format_slack_message(ctx: ReportContext) -> str:
"\n*Non-Validated Claims (Inferred):*\n" + "\n".join(non_validated_lines) + "\n"
)

ranked_hypotheses = ctx.get("ranked_hypotheses") or []
if ranked_hypotheses:
conclusion_block += (
"\n*Alternative hypotheses:*\n"
+ "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
+ "\n"
)
missing_evidence_list = ctx.get("missing_evidence") or []
if missing_evidence_list:
conclusion_block += (
"\n*Missing evidence:*\n"
+ "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
+ "\n"
)

correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
if correlation_signal_lines or correlation_driver_lines:
conclusion_block += "\n## Upstream Correlation\n"
Expand Down Expand Up @@ -556,12 +577,33 @@ def format_telegram_message(ctx: ReportContext) -> str:
rc += "\n<code>" + html.escape(top_log) + "</code>"
parts.append(rc)

confidence_band = ctx.get("confidence_band", "")
validity_score_val = ctx.get("validity_score")
if confidence_band:
pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
parts.append(f"<b>Confidence:</b> {html.escape(confidence_band.upper())}{pct}")

validated_lines, non_validated_lines = _render_claim_lines_telegram(ctx)
if validated_lines:
parts.append("<b>Findings</b>\n" + "\n".join(validated_lines))
if non_validated_lines:
parts.append("<b>Non-Validated Claims (Inferred)</b>\n" + "\n".join(non_validated_lines))

ranked_hypotheses = ctx.get("ranked_hypotheses") or []
if ranked_hypotheses:
hyp = "\n".join(
"• " + _to_telegram_html_body(_sanitize_for_slack(str(h))) for h in ranked_hypotheses
)
parts.append("<b>Alternative hypotheses</b>\n" + hyp)

missing_evidence_list = ctx.get("missing_evidence") or []
if missing_evidence_list:
me = "\n".join(
"• " + _to_telegram_html_body(_sanitize_for_slack(str(e)))
for e in missing_evidence_list
)
parts.append("<b>Missing evidence</b>\n" + me)

provenance_lines = _format_provenance_lines(ctx)
if provenance_lines:
prov = "\n".join(
Expand Down Expand Up @@ -695,6 +737,13 @@ def _add(block: "dict[str, Any] | None") -> None:
rc_text += f"\n`{top_log}`"
_add(_mrkdwn_section(rc_text))

# ── Confidence band ──
confidence_band = ctx.get("confidence_band", "")
validity_score_val = ctx.get("validity_score")
if confidence_band:
pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
_add(_mrkdwn_section(f"*Confidence:* {confidence_band.upper()}{pct}"))

# ── Failed Pods ──
datadog_site = ctx.get("datadog_site", "datadoghq.com")
all_pods = get_failed_pods(ctx)
Expand Down Expand Up @@ -727,6 +776,28 @@ def _add(block: "dict[str, Any] | None") -> None:
if non_validated_lines:
_add(_mrkdwn_section("*Inferred (not yet validated)*\n" + "\n".join(non_validated_lines)))

# ── Alternative Hypotheses ──
ranked_hypotheses = ctx.get("ranked_hypotheses") or []
if ranked_hypotheses:
blocks.append({"type": "divider"})
_add(
_mrkdwn_section(
"*Alternative hypotheses:*\n"
+ "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
)
)

# ── Missing Evidence ──
missing_evidence_list = ctx.get("missing_evidence") or []
if missing_evidence_list:
blocks.append({"type": "divider"})
_add(
_mrkdwn_section(
"*Missing evidence:*\n"
+ "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
)
)
Comment thread
greptile-apps[bot] marked this conversation as resolved.

correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
if correlation_signal_lines or correlation_driver_lines:
blocks.append({"type": "divider"})
Expand Down
9 changes: 8 additions & 1 deletion app/delivery/publish_findings/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,14 @@ def generate_report(state: InvestigationState) -> dict:

all_blocks = build_slack_blocks(ctx) + build_action_blocks(investigation_url, investigation_id)
all_blocks = masking_ctx.unmask_value(all_blocks)
render_report(slack_message, root_cause_category=state.get("root_cause_category"))
render_report(
slack_message,
root_cause_category=state.get("root_cause_category"),
confidence_band=state.get("confidence_band", ""),
validity_score=state.get("validity_score"),
ranked_hypotheses=state.get("ranked_hypotheses", []),
missing_evidence=state.get("missing_evidence", []),
)
open_in_editor(slack_message)

slack_ctx = state.get("slack_context", {})
Expand Down
Loading