oracle-samples · fede-kamel · Apr 30, 2026 · Apr 30, 2026
diff --git a/src/locus/agent/agent.py b/src/locus/agent/agent.py
@@ -1047,6 +1047,15 @@ async def _run() -> AgentResult:
                     # using ``result.message`` still see a schema-valid string.
                     structured_message = parsed_obj.model_dump_json()
 
+            # Run GSAR judgment when configured. Single-pass v1: judge
+            # the final answer, surface the result on AgentResult.
+            # Full Algorithm-1 outer loop (regenerate / replan) lives in
+            # locus.reasoning.gsar_evaluator and can be wired
+            # explicitly when the caller wants the loop dynamics.
+            gsar_judgment, gsar_score_value, gsar_decision = await self._run_gsar_judgment(
+                state, structured_message or final_message
+            )
+
             elapsed_ms = (datetime.now(UTC) - started_at).total_seconds() * 1000
             metrics = ExecutionMetrics(
                 iterations=state.iteration,
@@ -1066,6 +1075,9 @@ async def _run() -> AgentResult:
                 parsed=parsed_obj,
                 parse_error=parse_error_msg,
                 message=structured_message,
+                gsar_judgment=gsar_judgment,
+                gsar_score=gsar_score_value,
+                gsar_decision=gsar_decision,
             )
 
         try:
@@ -1899,6 +1911,89 @@ def _build_fallback_summary(state: AgentState) -> str:
                 parts.append(f"- {execution.tool_name}: {preview}")
         return "\n".join(parts)
 
+    async def _run_gsar_judgment(
+        self,
+        state: AgentState,
+        final_message: str,
+    ) -> tuple[Any, float | None, str | None]:
+        """Run the GSAR judge over the agent's final answer + tool history.
+
+        Args:
+            state: Final agent state; its ``tool_executions`` form the evidence.
+            final_message: The answer text handed to the judge.
+
+        Returns:
+            ``(judgment, score, decision_value)``: the judge's output, the
+            scalar ``S`` recomputed from its partition under the configured
+            weight map and contradiction penalty, and the string value of
+            :class:`~locus.reasoning.gsar.Decision` (``"abstain"`` when the
+            judge abstained). ``(None, None, None)`` when ``self.config.gsar``
+            is unset or when the judge call raised (the failure is logged).
+        """
+        cfg = self.config.gsar
+        if cfg is None:
+            return None, None, None
+
+        import logging
+
+        from locus.reasoning.gsar import (
+            EvidenceType,
+            GSARThresholds,
+            decide,
+            gsar_score,
+        )
+        from locus.reasoning.gsar_judge import StructuredOutputGSARJudge
+
+        # Default judge: a StructuredOutputGSARJudge over the agent's
+        # primary model. Documented as "almost never what you want for
+        # production" — the paper recommends a different judge model
+        # from the generator.
+        judge = cfg.judge
+        if judge is None:
+            judge = StructuredOutputGSARJudge(model=self._model)
+
+        # Evidence corpus: one ``[tool=NAME args=…] result`` line per
+        # non-errored tool execution. Only a ``None`` result renders as
+        # empty; falsy results (``0``, ``False``, ``[]``) are kept verbatim.
+        evidence_lines = [
+            f"[tool={ex.tool_name} args={ex.arguments}] "
+            f"{'' if ex.result is None else ex.result}"
+            for ex in state.tool_executions
+            if not ex.error
+        ]
+        evidence_corpus = "\n".join(evidence_lines) or "(no tool executions)"
+
+        # Translate optional weight_map (str-keyed) into the typed map.
+        weight_map: dict[EvidenceType, float] | None = None
+        if cfg.weight_map is not None:
+            weight_map = {EvidenceType(k): v for k, v in cfg.weight_map.items()}
+
+        try:
+            judgment = await judge.judge(
+                report_synthesis=final_message,
+                evidence_corpus=evidence_corpus,
+            )
+        except Exception:  # noqa: BLE001 — paper §6 "Robustness": never
+            # let a judge failure crash the agent. Log it (so the failure is
+            # not silent) and surface ``None`` for the caller to act on.
+            logging.getLogger(__name__).warning("GSAR judge failed", exc_info=True)
+            return None, None, None
+
+        partition = judgment.to_partition()
+        score = gsar_score(
+            partition,
+            weight_map=weight_map,
+            contradiction_penalty=cfg.contradiction_penalty,
+        )
+
+        if judgment.abstained:
+            decision_value = "abstain"
+        else:
+            thresholds = GSARThresholds(proceed=cfg.tau_proceed, regenerate=cfg.tau_regenerate)
+            decision_value = decide(score, thresholds=thresholds).value
+
+        return judgment, score, decision_value
+
     # Hook lifecycle dispatch is delegated to HookOrchestrator; these
     # thin wrappers preserve the original method names so internal
     # callers don't need to change.

diff --git a/src/locus/agent/config.py b/src/locus/agent/config.py
@@ -36,6 +36,89 @@ class GroundingConfig(BaseModel):
     model_config = {"extra": "forbid"}
 
 
+class GSARConfig(BaseModel):
+    """Configuration for the GSAR typed-grounding layer (`arXiv:2604.23366`).
+
+    When set on :class:`AgentConfig`, the agent runs the configured judge
+    once over its final assistant message + tool-execution history after
+    the loop completes and surfaces the resulting
+    :class:`~locus.reasoning.gsar_judge.JudgeOutput`, scalar score ``S`` and
+    decision ``δ`` on :class:`~locus.agent.result.AgentResult`.
+
+    Single-pass v1: the full Algorithm-1 outer loop (regenerate / replan)
+    lives in :mod:`locus.reasoning.gsar_evaluator`; wire it explicitly.
+    """
+
+    judge: Any = Field(
+        default=None,
+        description=(
+            "A :class:`~locus.reasoning.gsar_judge.BaseGSARJudge` "
+            "instance. When ``None`` the agent constructs a default "
+            "``StructuredOutputGSARJudge`` over the agent's primary "
+            "model — that's almost never what you want for production "
+            "(the paper recommends a different model from the generator), "
+            "so prefer to pass an explicit judge."
+        ),
+    )
+
+    contradiction_penalty: float = Field(
+        default=0.5,
+        ge=0.0,
+        le=1.0,
+        description="``ρ`` from Eq. 2. Default 0.5 (Appendix B reference).",
+    )
+
+    tau_proceed: float = Field(
+        default=0.80,
+        ge=0.0,
+        le=1.0,
+        description="``τ_proceed`` from Eq. 3. Default 0.80 (Appendix B).",
+    )
+
+    # validate_default: run ``_ordered`` even when the 0.65 default is used.
+    tau_regenerate: float = Field(
+        default=0.65,
+        ge=0.0,
+        le=1.0,
+        validate_default=True,
+        description="``τ_regenerate`` from Eq. 3. Default 0.65 (Appendix B).",
+    )
+
+    weight_map: dict[str, float] | None = Field(
+        default=None,
+        description=(
+            "Optional override of the Appendix-B reference weights. "
+            "Keys must be ``EvidenceType`` enum values "
+            "(``'tool_match'`` etc.). ``None`` uses the defaults."
+        ),
+    )
+
+    # NOTE(review): nothing visible in this change reads this flag — confirm it is enforced.
+    fail_on_low_score: bool = Field(
+        default=False,
+        description=(
+            "When True, an ``AgentResult`` whose GSAR decision is not "
+            "``proceed`` raises a ``GSARValidationError`` instead of "
+            "returning. Useful for pipelines that should refuse to "
+            "ship un-grounded summaries; off by default so callers "
+            "can inspect the judgment and decide."
+        ),
+    )
+
+    model_config = {"arbitrary_types_allowed": True, "extra": "forbid"}
+
+    @field_validator("tau_regenerate")
+    @classmethod
+    def _ordered(cls, v: float, info: Any) -> float:
+        """Reject ``tau_regenerate >= tau_proceed``."""
+        proceed = getattr(info, "data", {}).get("tau_proceed", 0.80)
+        if v >= proceed:
+            raise ValueError(
+                f"tau_regenerate ({v}) must be strictly less than tau_proceed ({proceed})."
+            )
+        return v
+
+
 class AgentConfig(BaseModel):
     """
     Configuration for an Agent instance.
@@ -119,6 +202,20 @@ class AgentConfig(BaseModel):
         description="Grounding evaluation configuration (None to disable)",
     )
 
+    gsar: GSARConfig | None = Field(
+        default=None,
+        description=(
+            "GSAR typed-grounding layer config (`arXiv:2604.23366`). "
+            "When set, the agent runs the configured judge over its "
+            "final answer + tool-execution history after the loop "
+            "completes and surfaces the JudgeOutput / score / decision "
+            "on ``AgentResult``. Use for safety-critical pipelines "
+            "where typed-evidence partitioning earns its keep over the "
+            "binary ``grounding=True`` path. ``None`` (default) "
+            "disables GSAR."
+        ),
+    )
+
     # Planning
     planning: bool = Field(
         default=False,

diff --git a/src/locus/agent/result.py b/src/locus/agent/result.py
@@ -127,6 +127,41 @@ class AgentResult(BaseModel):
         description="Claims that couldn't be grounded",
     )
 
+    # GSAR info (if AgentConfig.gsar was set). The framework lives in
+    # locus.reasoning.gsar — see arXiv:2604.23366. ``gsar_judgment`` is
+    # typed ``Any`` to keep ``locus.agent`` import-light (at runtime it is
+    # a ``JudgeOutput``); ``gsar_decision`` is the string value of
+    # ``Decision`` (or ``"abstain"``), not the enum member.
+    gsar_judgment: Any = Field(
+        default=None,
+        description=(
+            "The :class:`~locus.reasoning.gsar_judge.JudgeOutput` "
+            "produced by the configured GSAR judge over the agent's "
+            "final message + tool-execution history. ``None`` when "
+            "``AgentConfig.gsar`` is unset."
+        ),
+    )
+
+    gsar_score: float | None = Field(
+        default=None,
+        ge=0.0,
+        le=1.0,
+        description=(
+            "Scalar score ``S`` from Eq. 2, recomputed from the "
+            "judgment partition under the configured weight map and "
+            "contradiction penalty. ``None`` when GSAR is unset."
+        ),
+    )
+
+    gsar_decision: str | None = Field(
+        default=None,
+        description=(
+            "The :class:`~locus.reasoning.gsar.Decision` (``proceed``, "
+            "``regenerate``, ``replan``, ``abstain``) for ``gsar_score`` "
+            "under the configured thresholds. ``None`` when GSAR is unset."
+        ),
+    )
+
     # Structured output (if Agent was configured with output_schema)
     parsed: BaseModel | None = Field(
         default=None,
@@ -226,6 +261,9 @@ def from_state(
         parsed: BaseModel | None = None,
         parse_error: str | None = None,
         message: str | None = None,
+        gsar_judgment: Any = None,
+        gsar_score: float | None = None,
+        gsar_decision: str | None = None,
     ) -> AgentResult:
         """
         Create a result from final state.
@@ -254,6 +292,9 @@ def from_state(
             ungrounded_claims=ungrounded_claims or [],
             parsed=parsed,
             parse_error=parse_error,
+            gsar_judgment=gsar_judgment,
+            gsar_score=gsar_score,
+            gsar_decision=gsar_decision,
         )