Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/main/java/dev/talos/cli/modes/ReadEvidenceHandoff.java
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ static Result readEvidenceRecoveryForPartialTargetsIfNeeded(
safePlan,
workspace,
ctx == null ? null : ctx.cfg());
if (obligation != EvidenceObligation.READ_TARGET_REQUIRED) {
if (obligation != EvidenceObligation.READ_TARGET_REQUIRED
&& obligation != EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED) {
return new Result(answer, null, null);
}
if (contract.mutationRequested() || contract.mutationAllowed()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ private static String missingEvidenceContainmentMessage(
"I did not inspect the required workspace target this turn, so I cannot "
+ "answer from its contents or propose grounded changes yet."
+ targetSentence(plan);
case PATH_EXISTENCE_EVIDENCE_REQUIRED ->
"I did not gather directory or target-read evidence for the requested path "
+ "existence check, so I cannot answer whether those files exist yet."
+ targetSentence(plan);
case LIST_DIRECTORY_ONLY ->
"I did not complete a directory-list-only evidence path this turn. "
+ "I cannot answer with file contents or derived file claims from "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,9 @@ private static String promptPreview(String value) {
private static String evidenceGuidance(EvidenceObligation evidence) {
return switch (evidence) {
case READ_TARGET_REQUIRED -> "Evidence: read the named target before answering.";
case PATH_EXISTENCE_EVIDENCE_REQUIRED ->
"Evidence: verify path existence with talos.list_dir for the parent directory "
+ "or talos.read_file for each named target before answering.";
case PROTECTED_READ_APPROVAL_REQUIRED ->
"Evidence: the named target is protected. "
+ "Call talos.read_file for the protected target; runtime will request approval. "
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/dev/talos/runtime/policy/EvidenceGate.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ public static EvidenceObligation selectObligation(CurrentTurnPlan plan, Path wor

/**
 * Reports whether the given obligation is one of those routed through the read-evidence handoff.
 *
 * @param obligation the obligation to classify; {@code null} yields {@code false}
 * @return {@code true} for {@code READ_TARGET_REQUIRED}, {@code PATH_EXISTENCE_EVIDENCE_REQUIRED},
 *     {@code PROTECTED_READ_APPROVAL_REQUIRED} and {@code UNSUPPORTED_CAPABILITY_CHECK_REQUIRED};
 *     {@code false} for every other value
 */
public static boolean requiresReadEvidenceHandoff(EvidenceObligation obligation) {
    // A switch on a null enum would throw, whereas the handoff check must simply answer "no".
    if (obligation == null) {
        return false;
    }
    return switch (obligation) {
        case READ_TARGET_REQUIRED,
                PATH_EXISTENCE_EVIDENCE_REQUIRED,
                PROTECTED_READ_APPROVAL_REQUIRED,
                UNSUPPORTED_CAPABILITY_CHECK_REQUIRED -> true;
        default -> false;
    };
}
Expand Down Expand Up @@ -70,7 +71,8 @@ public static List<String> handoffTargets(
} else if (obligation == EvidenceObligation.UNSUPPORTED_CAPABILITY_CHECK_REQUIRED
&& isUnsupportedExpectedTarget(target, cfg)) {
targets.add(target);
} else if (obligation == EvidenceObligation.READ_TARGET_REQUIRED && !protectedTarget) {
} else if ((obligation == EvidenceObligation.READ_TARGET_REQUIRED
|| obligation == EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED) && !protectedTarget) {
targets.add(target);
Comment on lines +74 to 76

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Use directory evidence for existence handoff

For existence-only prompts where the model initially gathers no evidence, this branch now sends the target through the read-evidence handoff path; `ReadEvidenceHandoff` maps each returned target to `talos.read_file`. That means a question like "Does large.log exist?" can read the whole file content into the turn just to prove existence, even though the new verifier accepts parent `talos.list_dir` evidence. Return parent directories/list-dir handoff calls for `PATH_EXISTENCE_EVIDENCE_REQUIRED` to avoid unnecessary content exposure and large reads.

Useful? React with 👍 / 👎.

}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ public enum EvidenceObligation {
NONE,
LIST_DIRECTORY_ONLY,
READ_TARGET_REQUIRED,
PATH_EXISTENCE_EVIDENCE_REQUIRED,
PROTECTED_READ_APPROVAL_REQUIRED,
WORKSPACE_INSPECTION_REQUIRED,
STATIC_WEB_DIAGNOSIS_REQUIRED,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ public static EvidenceObligation derive(
if (!contract.mutationAllowed() && hasProtectedExpectedTarget(contract, workspace)) {
return EvidenceObligation.PROTECTED_READ_APPROVAL_REQUIRED;
}
if (hasReadOnlyPathExistenceObligation(contract)) {
return EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED;
}
if (hasStaticWebDiagnosisObligation(contract, type)) {
return EvidenceObligation.STATIC_WEB_DIAGNOSIS_REQUIRED;
}
Expand Down Expand Up @@ -128,6 +131,23 @@ private static boolean hasStaticWebDiagnosisObligation(TaskContract contract, Ta
|| lower.contains("button");
}

private static boolean hasReadOnlyPathExistenceObligation(TaskContract contract) {
if (contract == null || contract.mutationAllowed() || contract.expectedTargets().isEmpty()) {
return false;
}
String request = contract.originalUserRequest();
if (request == null || request.isBlank()) return false;
String lower = request.toLowerCase(Locale.ROOT);
boolean asksExistence = lower.contains("exists")
|| lower.contains("exist")
|| lower.contains("present")
|| lower.contains("is there")
|| lower.contains("are there");
Comment on lines +144 to +145

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Narrow existence detection to explicit path questions

With prompts such as Is there a bug in script.js?, TaskContractResolver still extracts script.js, but this new substring match classifies the turn as PATH_EXISTENCE_EVIDENCE_REQUIRED before the later static-web/read-target obligations run. Because that obligation can be satisfied by only listing the parent directory, the evidence gate no longer requires reading script.js before a bug/diagnosis answer. Restrict this branch to explicit file/path existence wording rather than generic is there/are there phrasing.

Useful? React with 👍 / 👎.

boolean asksPathStatus = lower.contains("path")
&& (lower.contains("check") || lower.contains("verify") || lower.contains("whether"));
return asksExistence || asksPathStatus;
}

private static boolean isStaticWebTarget(String target) {
if (target == null || target.isBlank()) return false;
String lower = target.replace('\\', '/').toLowerCase(Locale.ROOT);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ public static Result verify(
case NONE -> Result.satisfied("No workspace evidence was required.");
case LIST_DIRECTORY_ONLY -> verifyListDirectoryOnly(safeOutcomes);
case READ_TARGET_REQUIRED -> verifyReadTargets(targets, safeOutcomes, false);
case PATH_EXISTENCE_EVIDENCE_REQUIRED -> verifyPathExistenceTargets(targets, safeOutcomes);
case PROTECTED_READ_APPROVAL_REQUIRED -> verifyProtectedRead(targets, safeOutcomes);
case STATIC_WEB_DIAGNOSIS_REQUIRED -> verifyStaticWebDiagnosis(targets, safeOutcomes, workspace);
case WORKSPACE_INSPECTION_REQUIRED, VERIFY_FROM_TRACE_OR_EVIDENCE ->
Expand Down Expand Up @@ -190,6 +191,46 @@ private static Result verifyProtectedRead(Set<String> expectedTargets, List<Tool
return verifyReadTargets(expectedTargets, outcomes, true);
}

/**
 * Verifies path-existence evidence for every expected target.
 *
 * <p>With no tool outcomes at all the obligation is reported as unsatisfied straight away;
 * otherwise each target is checked individually and the per-target results are combined by
 * {@code aggregateTargetResults}.
 *
 * @param expectedTargets the targets whose existence the turn must have evidence for
 * @param outcomes the tool outcomes recorded for the turn
 * @return the combined verification result
 */
private static Result verifyPathExistenceTargets(
        Set<String> expectedTargets,
        List<ToolCallLoop.ToolOutcome> outcomes
) {
    boolean anyOutcomeRecorded = !outcomes.isEmpty();
    if (anyOutcomeRecorded) {
        return aggregateTargetResults(
                expectedTargets,
                candidate -> verifyPathExistenceTarget(candidate, outcomes),
                "Path existence evidence was gathered.");
    }
    return Result.unsatisfied("Path existence evidence was not gathered.");
}

/**
 * Verifies that existence evidence was gathered for a single expected target.
 *
 * <p>Evidence is accepted from either of two sources, checked in this order:
 * <ol>
 *   <li>a {@code talos.read_file} outcome whose path hint matches the target. Any such
 *       non-denied attempt counts, whether or not the read succeeded, because a failed read
 *       also tells the caller about the path;</li>
 *   <li>a successful {@code talos.list_dir} outcome for the target's parent directory.</li>
 * </ol>
 *
 * <p>A denied outcome only blocks the result when it concerns this target (or its parent
 * directory). A denied listing of an unrelated directory is ignored, matching how unrelated
 * reads are ignored in the first pass.
 *
 * @param expectedTarget the target path whose existence must be evidenced
 * @param outcomes the tool outcomes recorded for the turn
 * @return satisfied, blocked, or unsatisfied for this target
 */
private static Result verifyPathExistenceTarget(
        String expectedTarget,
        List<ToolCallLoop.ToolOutcome> outcomes
) {
    String expected = normalizePath(expectedTarget);
    for (ToolCallLoop.ToolOutcome outcome : outcomes) {
        if (!"talos.read_file".equals(canonicalToolName(outcome.toolName()))) {
            continue;
        }
        if (!expected.equals(normalizePath(outcome.pathHint()))) {
            continue;
        }
        if (outcome.denied()) {
            return Result.blocked("Path existence read was blocked by approval.");
        }
        return Result.satisfied("Path existence evidence was gathered.");
    }
    String expectedParent = parentDirectory(expected);
    for (ToolCallLoop.ToolOutcome outcome : outcomes) {
        if (!"talos.list_dir".equals(canonicalToolName(outcome.toolName()))) {
            continue;
        }
        // Match the directory first so that a denial elsewhere cannot block this target.
        if (!expectedParent.equals(normalizeDirectory(outcome.pathHint()))) {
            continue;
        }
        if (outcome.denied()) {
            return Result.blocked("Path existence directory listing was blocked by approval.");
        }
        if (outcome.success()) {
            return Result.satisfied("Path existence evidence was gathered.");
        }
    }
    return Result.unsatisfied("Path existence evidence was not gathered for " + expectedTarget + ".");
}

private static Result verifyReadTarget(
String expectedTarget,
List<ToolCallLoop.ToolOutcome> outcomes,
Expand Down Expand Up @@ -490,6 +531,19 @@ private static String normalizePath(String path) {
return normalized;
}

/**
 * Normalizes a directory path, mapping a blank normalized result to {@code "."}.
 *
 * @param path the raw directory path
 * @return the output of {@code normalizePath}, or {@code "."} when that output is blank
 */
private static String normalizeDirectory(String path) {
    String cleaned = normalizePath(path);
    if (cleaned.isBlank()) {
        return ".";
    }
    return cleaned;
}

/**
 * Returns the parent directory of a path, using {@code "."} when there is none.
 *
 * <p>The input is passed through {@code normalizePath} again, and the parent is everything
 * before the last {@code '/'}. A path without a slash, or one whose prefix is blank, yields
 * {@code "."}.
 *
 * @param normalizedPath the path whose parent is wanted
 * @return the parent directory, or {@code "."}
 */
private static String parentDirectory(String normalizedPath) {
    String cleaned = normalizePath(normalizedPath);
    int lastSeparator = cleaned.lastIndexOf('/');
    String parent = lastSeparator < 0 ? "" : cleaned.substring(0, lastSeparator);
    return parent.isBlank() ? "." : parent;
}

private static String canonicalToolName(String toolName) {
ToolAliasPolicy.Decision decision = ToolAliasPolicy.resolve(toolName);
if (decision.accepted() && decision.canonicalToolName() != null && !decision.canonicalToolName().isBlank()) {
Expand Down
54 changes: 54 additions & 0 deletions src/test/java/dev/talos/cli/modes/ReadEvidenceHandoffTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,60 @@ void partialTargetRecoveryDoesNotRetryAfterDeniedEvidenceTarget(@TempDir Path wo
assertNull(result.extraSummary());
}

@Test
void pathExistenceRecoveryRunsAfterIrrelevantReadEvidence(@TempDir Path workspace) throws Exception {
    // Only scripts.js and styles.css are written; script.js is never created.
    Files.writeString(workspace.resolve("scripts.js"), "console.log('present');\n");
    Files.writeString(workspace.resolve("styles.css"), "body { color: red; }\n");

    final String request =
            "Check whether scripts.js exists and whether script.js exists. Do not change anything.";
    final String fabricatedAnswer = "scripts.js does not exist.";
    final String handoffAnswer = "Path existence answer after deterministic handoff.";

    Context ctx = context(workspace, handoffAnswer);
    List<ChatMessage> messages = messages(request);
    TaskContract contract = new TaskContract(
            TaskType.DIAGNOSE_ONLY,
            false,
            false,
            false,
            Set.of("scripts.js", "script.js"),
            Set.of(),
            request);
    CurrentTurnPlan plan = plan(contract, EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED);

    // The only evidence gathered so far is a read of a file that is not one of the targets.
    ToolCallLoop.ToolOutcome stylesRead = new ToolCallLoop.ToolOutcome(
            "talos.read_file",
            "styles.css",
            true,
            false,
            false,
            "body { color: red; }",
            "");
    ToolCallLoop.LoopResult irrelevantRead = new ToolCallLoop.LoopResult(
            fabricatedAnswer,
            1,
            1,
            List.of("talos.read_file"),
            messages,
            1,
            0,
            false,
            0,
            List.of("styles.css"),
            0,
            0,
            0,
            0,
            List.of(stylesRead));

    ReadEvidenceHandoff.Result result = ReadEvidenceHandoff.readEvidenceRecoveryForPartialTargetsIfNeeded(
            fabricatedAnswer,
            messages,
            plan,
            irrelevantRead,
            workspace,
            ctx);

    assertNotNull(result.loopResult(), "path existence should recover from irrelevant read evidence");
    assertEquals(handoffAnswer, result.answer());
    assertTrue(result.extraSummary().contains("talos.read_file"), result.extraSummary());
}

private static CurrentTurnPlan plan(TaskContract contract, EvidenceObligation obligation) {
return new CurrentTurnPlan(
contract,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,27 @@ I did not inspect the required workspace target this turn, so I cannot answer fr
assertFalse(answer.contains("Proposed change"), answer);
}

@Test
void pathExistenceMissingEvidenceSuppressesFabricatedExistenceAnswer() {
    // A model answer that asserts existence facts without any gathered evidence.
    String fabricated = "scripts.js does not exist and script.js exists.";
    EvidenceObligationVerifier.Result missingEvidence = EvidenceObligationVerifier.Result.unsatisfied(
            "Path existence evidence was not gathered for scripts.js.");

    String answer = EvidenceContainmentAnswerGuard.containMissingEvidence(
            fabricated,
            pathExistencePlan(),
            EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED,
            missingEvidence,
            MARKERS);

    // The contained answer carries the missing-evidence wording and the required targets...
    assertTrue(answer.startsWith(EvidenceObligationVerifier.MISSING_EVIDENCE_PREFIX), answer);
    assertTrue(
            answer.contains(
                    "I did not gather directory or target-read evidence for the requested path existence check"),
            answer);
    assertTrue(answer.contains("Required target(s):"), answer);
    assertTrue(answer.contains("scripts.js"), answer);
    assertTrue(answer.contains("script.js"), answer);
    // ...and none of the fabricated existence claims.
    assertFalse(answer.contains("scripts.js does not exist"), answer);
    assertFalse(answer.contains("script.js exists"), answer);
}

@Test
void protectedReadNotAttemptedSuppressesFabricatedProtectedBody() {
String answer = EvidenceContainmentAnswerGuard.containMissingEvidence(
Expand Down Expand Up @@ -164,4 +185,21 @@ private static CurrentTurnPlan readTargetPlan(String target) {
List.of("talos.read_file"),
List.of());
}

/**
 * Builds an INSPECT-phase plan for a read-only request that asks whether {@code scripts.js}
 * and {@code script.js} exist, with {@code talos.list_dir} and {@code talos.read_file} passed
 * for both tool lists.
 */
private static CurrentTurnPlan pathExistencePlan() {
    final String request =
            "Check whether scripts.js exists and whether script.js exists. Do not change anything.";
    return CurrentTurnPlan.create(
            new TaskContract(
                    TaskType.DIAGNOSE_ONLY,
                    false,
                    false,
                    false,
                    Set.of("scripts.js", "script.js"),
                    Set.of(),
                    request),
            ExecutionPhase.INSPECT,
            List.of("talos.list_dir", "talos.read_file"),
            List.of("talos.list_dir", "talos.read_file"),
            List.of());
}
}
21 changes: 21 additions & 0 deletions src/test/java/dev/talos/runtime/policy/EvidenceGateTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,27 @@ void readTargetHandoffSkipsProtectedTargets(@TempDir Path workspace) {
assertFalse(targets.contains(".env"), targets.toString());
}

@Test
void pathExistenceHandoffUsesNamedNonProtectedTargets(@TempDir Path workspace) {
    final EvidenceObligation obligation = EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED;
    final Set<String> namedTargets = Set.of("scripts.js", "script.js");
    TaskContract contract = new TaskContract(
            TaskType.DIAGNOSE_ONLY,
            false,
            false,
            false,
            namedTargets,
            Set.of(),
            "Check whether scripts.js exists and whether script.js exists. Do not change anything.");

    // The obligation must be routed through the handoff at all...
    assertTrue(EvidenceGate.requiresReadEvidenceHandoff(obligation));

    // ...and the handoff must cover exactly the named targets, in any order.
    List<String> handoff = EvidenceGate.handoffTargets(contract, obligation, workspace);
    assertEquals(Set.of("scripts.js", "script.js"), Set.copyOf(handoff));
}

@Test
void protectedReadHandoffRequiresExplicitReadIntent(@TempDir Path workspace) {
TaskContract readEnv = new TaskContract(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@ void staticWebDiagnosisRequiresStaticWebDiagnosisEvidence() {
EvidenceObligationPolicy.derive(contract, ExecutionPhase.INSPECT, WORKSPACE));
}

@Test
void fileExistenceQuestionRequiresPathExistenceEvidenceBeforeStaticWebDiagnosis() {
    // The targets are .js files, yet the request only asks whether they exist.
    String request =
            "Check whether scripts.js exists and whether script.js exists. Do not change anything.";
    TaskContract contract = TaskContractResolver.fromUserRequest(request);

    EvidenceObligation derived =
            EvidenceObligationPolicy.derive(contract, ExecutionPhase.INSPECT, WORKSPACE);

    assertEquals(EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED, derived);
}

@Test
void extractableDocumentTargetRequiresNormalReadEvidence() {
TaskContract contract = TaskContractResolver.fromUserRequest("Read report.docx and summarize it.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,46 @@ void listOnlyRejectsRetrieve() {
assertEquals(EvidenceObligationVerifier.Status.UNSATISFIED, result.status());
}

@Test
void pathExistenceRejectsIrrelevantReadEvidence() {
    // A successful read of a file that is not among the expected targets.
    ToolCallLoop.ToolOutcome unrelatedRead = new ToolCallLoop.ToolOutcome(
            "talos.read_file", "styles.css", true, false, false,
            "body { color: red; }", "");

    EvidenceObligationVerifier.Result result = EvidenceObligationVerifier.verify(
            EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED,
            Set.of("scripts.js", "script.js"),
            List.of(unrelatedRead));

    assertEquals(EvidenceObligationVerifier.Status.UNSATISFIED, result.status());
}

@Test
void pathExistenceAcceptsParentDirectoryListingEvidence() {
    // A successful listing of "." — the parent directory of both slash-free targets.
    ToolCallLoop.ToolOutcome rootListing = new ToolCallLoop.ToolOutcome(
            "talos.list_dir", ".", true, false, false,
            "index.html\nscripts.js\nstyles.css\n", "");

    EvidenceObligationVerifier.Result result = EvidenceObligationVerifier.verify(
            EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED,
            Set.of("scripts.js", "script.js"),
            List.of(rootListing));

    assertEquals(EvidenceObligationVerifier.Status.SATISFIED, result.status());
}

@Test
void pathExistenceAcceptsDirectTargetReadAttempts() {
    // One target is read successfully; the other read fails with NOT_FOUND.
    ToolCallLoop.ToolOutcome foundRead = new ToolCallLoop.ToolOutcome(
            "talos.read_file", "scripts.js", true, false, false,
            "console.log('ok');", "");
    ToolCallLoop.ToolOutcome notFoundRead = new ToolCallLoop.ToolOutcome(
            "talos.read_file", "script.js", false, false, false,
            "", "script.js was not found.", null, ToolError.NOT_FOUND);

    EvidenceObligationVerifier.Result result = EvidenceObligationVerifier.verify(
            EvidenceObligation.PATH_EXISTENCE_EVIDENCE_REQUIRED,
            Set.of("scripts.js", "script.js"),
            List.of(foundRead, notFoundRead));

    // Both attempts count as existence evidence, including the failed one.
    assertEquals(EvidenceObligationVerifier.Status.SATISFIED, result.status());
}

@Test
void staticWebDiagnosisRejectsDirectoryListingOnlyWhenIndexIsPresent() {
var result = EvidenceObligationVerifier.verify(
Expand Down