From 74e2d406a1fda90a8b04df3bb5518debbef71c3e Mon Sep 17 00:00:00 2001
From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com>
Date: Thu, 11 Jun 2026 00:13:36 +0300
Subject: [PATCH 1/4] feat(llm): response_format=json_schema support

---
 cyberai/core/llm_client.py | 112 +++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/cyberai/core/llm_client.py b/cyberai/core/llm_client.py
index 6ab8f37..e33127b 100644
--- a/cyberai/core/llm_client.py
+++ b/cyberai/core/llm_client.py
@@ -220,6 +220,118 @@ def _call_tools_anthropic(
             stop_reason=getattr(response, "stop_reason", None),
         )
 
+    # ── structured output (sync) ──────────────────────────────────────
+
+    def structured_call(
+        self,
+        messages: List[Dict],
+        schema: Dict[str, Any],
+        schema_name: str = "response",
+        description: str = "",
+        system: Optional[str] = None,
+        agent_name: str = "unknown",
+        cacheable_system: bool = False,
+    ) -> Dict[str, Any]:
+        """Force the model to return JSON matching `schema`; returns parsed dict.
+
+        OpenAI: response_format=json_schema. Anthropic: a single forced tool
+        whose input_schema is `schema` — the tool_use input IS the structured
+        output. Ollama is unsupported. Caller validates via pydantic.
+        """
+        if self.config.provider == "openai":
+            return self._structured_openai(
+                messages, schema, schema_name, description, system, agent_name
+            )
+        elif self.config.provider == "anthropic":
+            return self._structured_anthropic(
+                messages,
+                schema,
+                schema_name,
+                description,
+                system,
+                agent_name,
+                cacheable_system,
+            )
+        else:
+            raise ValueError(f"Structured output unsupported for provider: {self.config.provider}")
+
+    def _structured_openai(
+        self, messages, schema, schema_name, description, system, agent_name="unknown"
+    ):
+        """OpenAI path: request JSON via response_format=json_schema and parse it."""
+        import openai
+
+        client = openai.OpenAI(api_key=self.config.api_key)
+        full_messages = []
+        if system:
+            full_messages.append({"role": "system", "content": system})
+        full_messages.extend(messages)
+        response = client.chat.completions.create(
+            model=self.config.model,
+            messages=full_messages,
+            max_tokens=self.config.max_tokens,
+            temperature=self.config.temperature,
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": schema_name,
+                    "description": description or f"Return a structured {schema_name}.",
+                    "schema": schema,
+                    "strict": False,
+                },
+            },
+        )
+        self._record_usage(
+            agent_name,
+            getattr(response, "model", self.config.model),
+            getattr(response.usage, "prompt_tokens", 0),
+            getattr(response.usage, "completion_tokens", 0),
+        )
+        content = response.choices[0].message.content or "{}"
+        return json.loads(content)
+
+    def _structured_anthropic(
+        self,
+        messages,
+        schema,
+        schema_name,
+        description,
+        system,
+        agent_name="unknown",
+        cacheable_system=False,
+    ):
+        """Anthropic path: force a single tool call and return its input as the result."""
+        import anthropic
+
+        client = anthropic.Anthropic(api_key=self.config.api_key)
+        tool = {
+            "name": schema_name,
+            "description": description or f"Return a structured {schema_name}.",
+            "input_schema": schema,
+        }
+        kwargs: Dict[str, Any] = dict(
+            model=self.config.model,
+            max_tokens=self.config.max_tokens,
+            messages=messages,
+            tools=[tool],
+            tool_choice={"type": "tool", "name": schema_name},
+        )
+        if system:
+            kwargs["system"] = _wrap_cacheable(system) if cacheable_system else system
+        response = client.messages.create(**kwargs)
+        self._record_usage(
+            agent_name,
+            getattr(response, "model", self.config.model),
+            getattr(response.usage, "input_tokens", 0),
+            getattr(response.usage, "output_tokens", 0),
+            cache_creation_tokens=getattr(response.usage, "cache_creation_input_tokens", 0) or 0,
+            cache_read_tokens=getattr(response.usage, "cache_read_input_tokens", 0) or 0,
+        )
+        for block in response.content:
+            if block.type == "tool_use":
+                return dict(block.input)
+        return {}
+
     # ── async API ─────────────────────────────────────────────────────
 
     async def acall(

From c9f98259493767b6f0f3715ab9520b865475cf5b Mon Sep 17 00:00:00 2001
From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com>
Date: Thu, 11 Jun 2026 00:15:02 +0300
Subject: [PATCH 2/4] feat(report): Pydantic-validated report from LLM

---
 cyberai/agents/report/agent.py | 49 ++++++++++++++++++++++++++++++++++
 cyberai/core/types.py          | 27 ++++++++++++++++++-
 2 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/cyberai/agents/report/agent.py b/cyberai/agents/report/agent.py
index d628f35..b654876 100644
--- a/cyberai/agents/report/agent.py
+++ b/cyberai/agents/report/agent.py
@@ -6,7 +6,10 @@
 from pathlib import Path
 from typing import Any, Dict, Optional
 
+import json
+
 from cyberai.core.base_agent import BaseAgent, Tool
+from cyberai.core.types import ReportSection
 
 from .json_exporter import export_json
 from .markdown_renderer import render_markdown
@@ -58,9 +61,55 @@ def run(self, target: str, context: Optional[Dict[str, Any]] = None) -> Dict[str
         self.kb.set("report.markdown_path", md_path, agent=self.AGENT_NAME)
         self.kb.set("report.json_path", json_path, agent=self.AGENT_NAME)
 
+        # Flag-gated: LLM-generated structured executive section.
+        if getattr(self.config, "use_llm_summary", False) and self.llm is not None:
+            section = self._structured_summary(target)
+            if section is not None:
+                self.kb.set("report.section", section.model_dump(), agent=self.AGENT_NAME)
+
         return {
             "status": "done",
             "markdown": md_path,
             "json": json_path,
             "total_findings": len(self.session.findings),
         }
+
+    def _structured_summary(self, target: str) -> Optional[ReportSection]:
+        """Flag-gated: ask the LLM for a Pydantic-validated ReportSection.
+
+        Uses LLMClient.structured_call with ReportSection's JSON Schema; the
+        provider returns JSON, which we validate. Returns None on any failure
+        so the deterministic report is never blocked.
+        """
+        if self.llm is None:
+            return None
+        system = (
+            "You are a penetration-test report writer. Summarize the findings "
+            "into one executive ReportSection: a concise title, the highest "
+            "applicable severity, key findings, concrete recommendations, and "
+            "a short business impact statement."
+        )
+        # Reading findings and serializing the prompt can raise too (malformed
+        # finding objects, non-JSON values), so they sit inside the try as well.
+        try:
+            findings = [
+                {
+                    "title": f.title,
+                    "severity": getattr(f.severity, "value", str(f.severity)),
+                    "description": f.description,
+                }
+                for f in self.session.findings
+            ]
+            prompt = f"Target: {target}\nFindings JSON:\n{json.dumps(findings, indent=2)}"
+            raw = self.llm.structured_call(
+                [{"role": "user", "content": prompt}],
+                schema=ReportSection.model_json_schema(),
+                schema_name="report_section",
+                description="Executive pentest report section.",
+                system=system,
+                agent_name=self.AGENT_NAME,
+            )
+            return ReportSection.model_validate(raw)
+        except Exception as exc:  # noqa: BLE001 — report must never hard-fail
+            self._log(f"LLM structured summary failed: {exc}")
+            return None
diff --git a/cyberai/core/types.py b/cyberai/core/types.py
index 2a4652d..d670471 100644
--- a/cyberai/core/types.py
+++ b/cyberai/core/types.py
@@ -6,7 +6,7 @@
 from typing import Any, Union
 from pathlib import Path
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 
 # Target types
 Target = str  # IP, CIDR, or domain
@@ -105,6 +105,31 @@ class ExploitResult(BaseModel):
 ReportPath = Path
 ReportFormat = str  # "markdown" | "html" | "json" | "pdf"
 
+_VALID_SEVERITIES = {"CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"}
+
+
+class ReportSection(BaseModel):
+    """LLM-generated structured report section (day 20 structured outputs).
+
+    `impact` is included for HackerOne-style export; not in the original
+    plan column but required by the H1 template.
+    """
+
+    title: str
+    severity: str = "INFO"
+    findings: list[str] = Field(default_factory=list)
+    recommendations: list[str] = Field(default_factory=list)
+    impact: str = ""
+
+    # mode="before": normalize ahead of str validation so a null or non-string
+    # severity from the LLM degrades to INFO instead of rejecting the section.
+    @field_validator("severity", mode="before")
+    @classmethod
+    def _norm_severity(cls, v: Any) -> str:
+        up = str(v or "INFO").strip().upper()
+        return up if up in _VALID_SEVERITIES else "INFO"
+
+
 # Pipeline
 PipelineInput = Target
 PipelineOutput = dict[str, AgentOutput]

From d2b0c810a92ea187753401d025d2fe39fd90bf76 Mon Sep 17 00:00:00 2001
From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com>
Date: Thu, 11 Jun 2026 00:16:01 +0300
Subject: [PATCH 3/4] feat(report): HackerOne-compatible export

---
 cyberai/agents/report/h1_exporter.py | 42 ++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 cyberai/agents/report/h1_exporter.py

diff --git a/cyberai/agents/report/h1_exporter.py b/cyberai/agents/report/h1_exporter.py
new file mode 100644
index 0000000..9435fcf
--- /dev/null
+++ b/cyberai/agents/report/h1_exporter.py
@@ -0,0 +1,42 @@
+"""HackerOne-compatible Markdown export for a ReportSection (day 20)."""
+
+from __future__ import annotations
+
+from cyberai.core.types import ReportSection
+
+# Map internal severity to HackerOne's severity vocabulary.
+_H1_SEVERITY = {
+    "CRITICAL": "Critical",
+    "HIGH": "High",
+    "MEDIUM": "Medium",
+    "LOW": "Low",
+    "INFO": "None",
+}
+
+
+def _bullets(items: list[str]) -> str:
+    """Render a list as Markdown bullets; placeholder if empty."""
+    if not items:
+        return "_None provided._"
+    return "\n".join(f"- {it}" for it in items)
+
+
+def export_hackerone(section: ReportSection) -> str:
+    """Render a ReportSection as a HackerOne-style Markdown submission.
+
+    Sections follow the H1 report template: Title, Severity, Steps to
+    Reproduce, Impact, Recommendation. `findings` map to reproduction
+    steps; `recommendations` to the Recommendation block.
+    """
+    severity = _H1_SEVERITY.get(section.severity.upper(), "None")
+    impact = section.impact.strip() or "_Impact not specified._"
+    return (
+        f"# {section.title}\n\n"
+        f"**Severity:** {severity}\n\n"
+        f"## Steps to Reproduce\n\n"
+        f"{_bullets(section.findings)}\n\n"
+        f"## Impact\n\n"
+        f"{impact}\n\n"
+        f"## Recommendation\n\n"
+        f"{_bullets(section.recommendations)}\n"
+    )

From a2b06046bd4e450dded09cd7e31489404d13bff1 Mon Sep 17 00:00:00 2001
From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com>
Date: Thu, 11 Jun 2026 00:17:32 +0300
Subject: [PATCH 4/4] test(report): structured output roundtrip

---
 tests/unit/test_structured_report.py | 209 +++++++++++++++++++++++++++
 1 file changed, 209 insertions(+)
 create mode 100644 tests/unit/test_structured_report.py

diff --git a/tests/unit/test_structured_report.py b/tests/unit/test_structured_report.py
new file mode 100644
index 0000000..f8c75ac
--- /dev/null
+++ b/tests/unit/test_structured_report.py
@@ -0,0 +1,209 @@
+"""Day 20 — structured outputs: ReportSection, structured_call, H1 export."""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import MagicMock
+
+import pytest
+
+from cyberai.agents.report.h1_exporter import export_hackerone
+from cyberai.core.llm_client import LLMClient
+from cyberai.core.types import ReportSection
+
+
+# ── ReportSection model ───────────────────────────────────────────────
+
+
+def test_section_severity_normalized():
+    s = ReportSection(title="t", severity="critical")
+    assert s.severity == "CRITICAL"
+
+
+def test_section_severity_invalid_falls_back_info():
+    s = ReportSection(title="t", severity="bogus")
+    assert s.severity == "INFO"
+
+
+def test_section_severity_none_falls_back_info():
+    # LLMs occasionally emit `"severity": null`; it must degrade, not raise.
+    s = ReportSection(title="t", severity=None)
+    assert s.severity == "INFO"
+
+
+def test_section_defaults():
+    s = ReportSection(title="t")
+    assert s.severity == "INFO"
+    assert s.findings == []
+    assert s.recommendations == []
+    assert s.impact == ""
+
+
+# ── HackerOne export ──────────────────────────────────────────────────
+
+
+def _sample_section() -> ReportSection:
+    return ReportSection(
+        title="SQL Injection in login",
+        severity="HIGH",
+        findings=["Send ' OR 1=1-- in username", "Observe auth bypass"],
+        recommendations=["Use parameterized queries"],
+        impact="Full auth bypass, account takeover.",
+    )
+
+
+def test_h1_export_contains_sections():
+    md = export_hackerone(_sample_section())
+    assert "# SQL Injection in login" in md
+    assert "**Severity:** High" in md
+    assert "## Steps to Reproduce" in md
+    assert "## Impact" in md
+    assert "## Recommendation" in md
+
+
+def test_h1_export_info_maps_to_none():
+    md = export_hackerone(ReportSection(title="x", severity="INFO"))
+    assert "**Severity:** None" in md
+
+
+def test_h1_export_empty_lists_placeholder():
+    md = export_hackerone(ReportSection(title="x", severity="LOW"))
+    assert "_None provided._" in md
+    assert "_Impact not specified._" in md
+
+
+def test_h1_roundtrip_steps_present():
+    section = _sample_section()
+    md = export_hackerone(section)
+    for step in section.findings:
+        assert step in md
+    for rec in section.recommendations:
+        assert rec in md
+
+
+# ── structured_call provider branches (mocked SDK) ────────────────────
+
+
+def _client(provider: str) -> LLMClient:
+    cfg = MagicMock()
+    cfg.provider = provider
+    cfg.api_key = "x"
+    cfg.model = "test-model"
+    cfg.max_tokens = 1024
+    cfg.temperature = 0.0
+    return LLMClient(cfg)
+
+
+SCHEMA = ReportSection.model_json_schema()
+PAYLOAD = {"title": "t", "severity": "HIGH", "findings": ["a"]}
+
+
+def test_structured_call_openai(monkeypatch):
+    client = _client("openai")
+    fake = MagicMock()
+    msg = MagicMock()
+    msg.content = json.dumps(PAYLOAD)
+    fake.choices = [MagicMock(message=msg)]
+    fake.usage = MagicMock(prompt_tokens=10, completion_tokens=5)
+    fake.model = "test-model"
+
+    import openai
+
+    inst = MagicMock()
+    inst.chat.completions.create.return_value = fake
+    monkeypatch.setattr(openai, "OpenAI", lambda **kw: inst)
+
+    out = client.structured_call(
+        [{"role": "user", "content": "go"}], schema=SCHEMA, schema_name="rs"
+    )
+    assert out["title"] == "t"
+    # response_format must carry json_schema
+    kwargs = inst.chat.completions.create.call_args.kwargs
+    assert kwargs["response_format"]["type"] == "json_schema"
+    # an empty description falls back to a generated one
+    assert kwargs["response_format"]["json_schema"]["description"]
+
+
+def test_structured_call_anthropic(monkeypatch):
+    client = _client("anthropic")
+    block = MagicMock()
+    block.type = "tool_use"
+    block.input = PAYLOAD
+    fake = MagicMock()
+    fake.content = [block]
+    fake.usage = MagicMock(
+        input_tokens=10,
+        output_tokens=5,
+        cache_creation_input_tokens=0,
+        cache_read_input_tokens=0,
+    )
+    fake.model = "test-model"
+
+    import anthropic
+
+    inst = MagicMock()
+    inst.messages.create.return_value = fake
+    monkeypatch.setattr(anthropic, "Anthropic", lambda **kw: inst)
+
+    out = client.structured_call(
+        [{"role": "user", "content": "go"}], schema=SCHEMA, schema_name="rs"
+    )
+    assert out["severity"] == "HIGH"
+    # forced single-tool choice
+    kwargs = inst.messages.create.call_args.kwargs
+    assert kwargs["tool_choice"] == {"type": "tool", "name": "rs"}
+
+
+def test_structured_call_ollama_unsupported():
+    client = _client("ollama")
+    with pytest.raises(ValueError):
+        client.structured_call([], schema=SCHEMA)
+
+
+# ── ReportAgent._structured_summary (mocked) ──────────────────────────
+
+
+def _report_agent(provider="anthropic"):
+    from cyberai.agents.report.agent import ReportAgent
+
+    agent = ReportAgent.__new__(ReportAgent)
+    agent.AGENT_NAME = "report"
+    agent.llm = MagicMock()
+    agent.llm.config.provider = provider
+    session = MagicMock()
+    session.findings = [
+        MagicMock(title="Log4Shell", severity="CRITICAL", description="rce"),
+    ]
+    agent.session = session
+    return agent
+
+
+def test_structured_summary_validates():
+    agent = _report_agent()
+    agent.llm.structured_call.return_value = {
+        "title": "Exec summary",
+        "severity": "critical",
+        "findings": ["Log4Shell RCE"],
+        "recommendations": ["Patch log4j"],
+        "impact": "RCE on host.",
+    }
+    section = agent._structured_summary("testhost")
+    assert isinstance(section, ReportSection)
+    assert section.severity == "CRITICAL"  # normalized
+    md = export_hackerone(section)
+    assert "Log4Shell RCE" in md
+
+
+def test_structured_summary_failsafe_returns_none():
+    agent = _report_agent()
+    agent.llm.structured_call.side_effect = RuntimeError("api down")
+    agent._log = MagicMock()
+    assert agent._structured_summary("testhost") is None
+
+
+def test_structured_summary_failsafe_on_malformed_finding():
+    agent = _report_agent()
+    agent.session.findings = [object()]  # lacks title/severity/description
+    agent._log = MagicMock()
+    assert agent._structured_summary("testhost") is None
+    agent.llm.structured_call.assert_not_called()