Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions src/locus/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,15 @@ async def _run() -> AgentResult:
# using ``result.message`` still see a schema-valid string.
structured_message = parsed_obj.model_dump_json()

# Run GSAR judgment when configured. Single-pass v1: judge
# the final answer, surface the result on AgentResult.
# Full Algorithm-1 outer loop (regenerate / replan) lives in
# locus.reasoning.gsar_evaluator and can be wired
# explicitly when the caller wants the loop dynamics.
gsar_judgment, gsar_score_value, gsar_decision = await self._run_gsar_judgment(
state, structured_message or final_message
)

elapsed_ms = (datetime.now(UTC) - started_at).total_seconds() * 1000
metrics = ExecutionMetrics(
iterations=state.iteration,
Expand All @@ -1066,6 +1075,9 @@ async def _run() -> AgentResult:
parsed=parsed_obj,
parse_error=parse_error_msg,
message=structured_message,
gsar_judgment=gsar_judgment,
gsar_score=gsar_score_value,
gsar_decision=gsar_decision,
)

try:
Expand Down Expand Up @@ -1899,6 +1911,89 @@ def _build_fallback_summary(state: AgentState) -> str:
parts.append(f"- {execution.tool_name}: {preview}")
return "\n".join(parts)

async def _run_gsar_judgment(
    self,
    state: AgentState,
    final_message: str,
) -> tuple[Any, float | None, str | None]:
    """Run the GSAR judge over the agent's final answer + tool history.

    The judge is called once with ``final_message`` as the report and a
    text corpus built from the non-errored tool executions on ``state``.
    The score is then recomputed locally from the judgment's partition.

    Args:
        state: Final agent state; its ``tool_executions`` supply the
            evidence corpus handed to the judge.
        final_message: The answer text passed to the judge as
            ``report_synthesis``.

    Returns:
        ``(judgment, score, decision_value)`` where:

        - ``judgment`` is whatever the judge returned (a ``JudgeOutput``
          for the default judge).
        - ``score`` is the scalar ``S`` recomputed from the judgment's
          partition under the configured weight map and contradiction
          penalty.
        - ``decision_value`` is the string form of
          :class:`~locus.reasoning.gsar.Decision` (``"proceed"``, etc.),
          or ``"abstain"`` when the judge abstained.

        ``(None, None, None)`` is returned both when ``self.config.gsar``
        is unset and when the judge call raises; the latter case is
        logged as a warning so the two are distinguishable in logs.

    Raises:
        ValueError: Presumably, when a ``weight_map`` key is not a valid
            ``EvidenceType`` value — the conversion runs before the judge
            call and is deliberately not swallowed (it is a config error,
            not a judge failure).
    """
    if self.config.gsar is None:
        return None, None, None

    # Function-scope imports keep ``locus.agent`` import-light when GSAR
    # is not configured.
    import logging

    from locus.reasoning.gsar import (
        EvidenceType,
        GSARThresholds,
        decide,
        gsar_score,
    )
    from locus.reasoning.gsar_judge import StructuredOutputGSARJudge

    cfg = self.config.gsar

    # Default judge: a StructuredOutputGSARJudge over the agent's
    # primary model. Documented as "almost never what you want for
    # production" — the paper recommends a different judge model
    # from the generator.
    judge = cfg.judge
    if judge is None:
        judge = StructuredOutputGSARJudge(model=self._model)

    # Build the evidence corpus from tool executions on the final
    # state: one ``[tool=NAME args=…] result`` line per execution.
    # Only executions that recorded an error are skipped.
    evidence_lines: list[str] = [
        f"[tool={ex.tool_name} args={ex.arguments}] {ex.result or ''}"
        for ex in state.tool_executions
        if not ex.error
    ]
    evidence_corpus = "\n".join(evidence_lines) or "(no tool executions)"

    # Translate optional weight_map (str-keyed) into the typed map.
    weight_map: dict[EvidenceType, float] | None = None
    if cfg.weight_map is not None:
        weight_map = {EvidenceType(k): v for k, v in cfg.weight_map.items()}

    try:
        judgment = await judge.judge(
            report_synthesis=final_message,
            evidence_corpus=evidence_corpus,
        )
    except Exception:  # noqa: BLE001 — paper §6 "Robustness": never
        # let a judge failure crash the agent. Surface ``None`` so
        # the caller can decide whether to ship or replan, but leave a
        # trace: without it a failed judge is indistinguishable from
        # GSAR being disabled.
        logging.getLogger(__name__).warning(
            "GSAR judge raised; returning no judgment.",
            exc_info=True,
        )
        return None, None, None

    partition = judgment.to_partition()
    score = gsar_score(
        partition,
        weight_map=weight_map,
        contradiction_penalty=cfg.contradiction_penalty,
    )

    if judgment.abstained:
        # An abstaining judge overrides the threshold-based decision;
        # the recomputed score is still returned alongside it.
        decision_value = "abstain"
    else:
        thresholds = GSARThresholds(proceed=cfg.tau_proceed, regenerate=cfg.tau_regenerate)
        decision_value = decide(score, thresholds=thresholds).value

    return judgment, score, decision_value

# Hook lifecycle dispatch is delegated to HookOrchestrator; these
# thin wrappers preserve the original method names so internal
# callers don't need to change.
Expand Down
97 changes: 97 additions & 0 deletions src/locus/agent/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,89 @@ class GroundingConfig(BaseModel):
model_config = {"extra": "forbid"}


class GSARConfig(BaseModel):
    """Configuration for the GSAR typed-grounding layer.

    Wires the framework from `arXiv:2604.23366` onto an ``Agent``. When
    set on :class:`AgentConfig`, the agent runs the configured judge
    over its final assistant message + tool-execution history after
    the loop completes; the resulting :class:`~locus.reasoning.gsar_judge.JudgeOutput`,
    scalar score ``S``, and decision ``δ`` are surfaced on
    :class:`~locus.agent.result.AgentResult`.

    This is a single-pass v1 — the agent produces an answer, the judge
    scores it, and the result is exposed for the caller to act on. The
    full Algorithm-1 outer loop with regenerate / replan callbacks
    lives separately in :mod:`locus.reasoning.gsar_evaluator`; wire it
    explicitly when you want the loop dynamics.

    Validation enforces ``tau_regenerate < tau_proceed``, including when
    ``tau_regenerate`` is left at its default.
    """

    judge: Any = Field(
        default=None,
        description=(
            "A :class:`~locus.reasoning.gsar_judge.BaseGSARJudge` "
            "instance. When ``None`` the agent constructs a default "
            "``StructuredOutputGSARJudge`` over the agent's primary "
            "model — that's almost never what you want for production "
            "(the paper recommends a different model from the generator), "
            "so prefer to pass an explicit judge."
        ),
    )

    contradiction_penalty: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="``ρ`` from Eq. 2. Default 0.5 (Appendix B reference).",
    )

    tau_proceed: float = Field(
        default=0.80,
        ge=0.0,
        le=1.0,
        description="``τ_proceed`` from Eq. 3. Default 0.80 (Appendix B).",
    )

    # ``validate_default=True`` is load-bearing: Pydantic skips field
    # validators for values that come from the default, so without it
    # ``GSARConfig(tau_proceed=0.5)`` would silently keep
    # ``tau_regenerate=0.65`` and invert the threshold ordering.
    tau_regenerate: float = Field(
        default=0.65,
        ge=0.0,
        le=1.0,
        validate_default=True,
        description="``τ_regenerate`` from Eq. 3. Default 0.65 (Appendix B).",
    )

    weight_map: dict[str, float] | None = Field(
        default=None,
        description=(
            "Optional override of the Appendix-B reference weights. "
            "Keys must be ``EvidenceType`` enum values "
            "(``'tool_match'`` etc.). ``None`` uses the defaults."
        ),
    )

    fail_on_low_score: bool = Field(
        default=False,
        description=(
            "When True, an ``AgentResult`` whose GSAR decision is not "
            "``proceed`` raises a ``GSARValidationError`` instead of "
            "returning. Useful for pipelines that should refuse to "
            "ship un-grounded summaries; off by default so callers "
            "can inspect the judgment and decide."
        ),
    )

    model_config = {"arbitrary_types_allowed": True, "extra": "forbid"}

    @field_validator("tau_regenerate")
    @classmethod
    def _ordered(cls, v: float, info: Any) -> float:
        """Reject configurations where ``tau_regenerate >= tau_proceed``.

        ``tau_proceed`` is declared before ``tau_regenerate``, so it is
        already present in ``info.data`` when this runs — unless its own
        validation failed, in which case that error is reported on its
        own and the ordering check is skipped rather than compared
        against a made-up value.
        """
        validated = getattr(info, "data", {})
        if "tau_proceed" not in validated:
            return v
        proceed = validated["tau_proceed"]
        if v >= proceed:
            raise ValueError(
                f"tau_regenerate ({v}) must be strictly less than tau_proceed ({proceed})."
            )
        return v


class AgentConfig(BaseModel):
"""
Configuration for an Agent instance.
Expand Down Expand Up @@ -119,6 +202,20 @@ class AgentConfig(BaseModel):
description="Grounding evaluation configuration (None to disable)",
)

gsar: GSARConfig | None = Field(
default=None,
description=(
"GSAR typed-grounding layer config (`arXiv:2604.23366`). "
"When set, the agent runs the configured judge over its "
"final answer + tool-execution history after the loop "
"completes and surfaces the JudgeOutput / score / decision "
"on ``AgentResult``. Use for safety-critical pipelines "
"where typed-evidence partitioning earns its keep over the "
"binary ``grounding=True`` path. ``None`` (default) "
"disables GSAR."
),
)

# Planning
planning: bool = Field(
default=False,
Expand Down
41 changes: 41 additions & 0 deletions src/locus/agent/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,41 @@ class AgentResult(BaseModel):
description="Claims that couldn't be grounded",
)

# GSAR info (if AgentConfig.gsar was set). The framework lives in
# locus.reasoning.gsar — see arXiv:2604.23366. ``gsar_judgment`` is
# typed as ``Any`` here to keep ``locus.agent`` import-light; at runtime
# it holds a ``JudgeOutput``. ``gsar_score`` is a plain ``float`` and
# ``gsar_decision`` is the string value of a ``Decision`` (or ``"abstain"``).
gsar_judgment: Any = Field(
default=None,
description=(
"The :class:`~locus.reasoning.gsar_judge.JudgeOutput` "
"produced by the configured GSAR judge over the agent's "
"final message + tool-execution history. ``None`` when "
"``AgentConfig.gsar`` is unset."
),
)

gsar_score: float | None = Field(
default=None,
ge=0.0,
le=1.0,
description=(
"Scalar score ``S`` from Eq. 2, recomputed from the "
"judgment partition under the configured weight map and "
"contradiction penalty. ``None`` when GSAR is unset."
),
)

gsar_decision: str | None = Field(
default=None,
description=(
"The :class:`~locus.reasoning.gsar.Decision` (``proceed``, "
"``regenerate``, ``replan``, ``abstain``) for ``gsar_score`` "
"under the configured thresholds. ``None`` when GSAR is unset."
),
)

# Structured output (if Agent was configured with output_schema)
parsed: BaseModel | None = Field(
default=None,
Expand Down Expand Up @@ -226,6 +261,9 @@ def from_state(
parsed: BaseModel | None = None,
parse_error: str | None = None,
message: str | None = None,
gsar_judgment: Any = None,
gsar_score: float | None = None,
gsar_decision: str | None = None,
) -> AgentResult:
"""
Create a result from final state.
Expand Down Expand Up @@ -254,6 +292,9 @@ def from_state(
ungrounded_claims=ungrounded_claims or [],
parsed=parsed,
parse_error=parse_error,
gsar_judgment=gsar_judgment,
gsar_score=gsar_score,
gsar_decision=gsar_decision,
)


Expand Down
Loading