Apra-Labs · kumaakh · May 28, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/.github/e2e/extract-results.mjs b/.github/e2e/extract-results.mjs
@@ -26,6 +26,46 @@ function processRawFile(filePath, provider) {
 
   const content = readFileSync(filePath, 'utf8');
 
+  if (provider === 'agy') {
+    // The raw file contains the stdout of the agy invocation. After agy exits,
+    // fleet appends the transcript JSONL wrapped in FLEET_TRANSCRIPT_START/END markers.
+    // We extract text from PLANNER_RESPONSE entries in the JSONL so that CHECKPOINT lines
+    // embedded in the agent's responses can be detected.
+    const startMarker = 'FLEET_TRANSCRIPT_START';
+    const endMarker = 'FLEET_TRANSCRIPT_END';
+    const startIdx = content.indexOf(startMarker);
+    const endIdx = content.indexOf(endMarker);
+    if (startIdx !== -1 && endIdx !== -1) {
+      const section = content.substring(startIdx + startMarker.length, endIdx);
+      let extracted = '';
+      for (const line of section.split('\n')) {
+        const trimmed = line.trim();
+        if (!trimmed) continue;
+        try {
+          const entry = JSON.parse(trimmed);
+          if (entry.type === 'PLANNER_RESPONSE' && entry.status === 'DONE' && typeof entry.content === 'string' && entry.content.trim()) {
+            extracted += '\n' + entry.content.trim();
+          }
+        } catch { /* skip malformed lines */ }
+      }
+      return {
+        assistantText: extracted || content,
+        tokensIn: 0,
+        tokensOut: 0,
+        cacheCreate: 0,
+        cacheRead: 0,
+      };
+    }
+    // No markers: treat raw content as plain text (fallback for empty or unexpected output)
+    return {
+      assistantText: content,
+      tokensIn: 0,
+      tokensOut: 0,
+      cacheCreate: 0,
+      cacheRead: 0,
+    };
+  }
+
   for (const line of content.split('\n')) {
     const trimmed = line.trim();
     if (!trimmed) continue;
@@ -115,7 +155,7 @@ function sumMemberLogs(role) {
 telemetry.push({ role: 'doer', ...sumMemberLogs('doer') });
 telemetry.push({ role: 'reviewer', ...sumMemberLogs('reviewer') });
 
-// Extract checkpoints: one JSON object per "CHECKPOINT:" line
+// Extract checkpoints: one JSON object per "CHECKPOINT:" line (text-based, legacy)
 let checkpoints = [];
 const regex = /CHECKPOINT:\s*(\{[\s\S]*?\})/g;
 let match;
@@ -130,6 +170,24 @@ while ((match = regex.exec(allAssistantText)) !== null) {
   } catch {}
 }
 
+// Also read file-based checkpoints that the PM appends via Add-Content (agy-specific approach), one JSON object per line.
+// These are more reliable: the PM records them through tool calls, so no plain-text reply (which risks agy exiting) is needed.
+// File-based entries take precedence: an entry whose id matches a text-based checkpoint replaces it; new ids are appended.
+const checkpointFile = join(runDir, 'checkpoints.json');
+if (existsSync(checkpointFile)) {
+  for (const line of readFileSync(checkpointFile, 'utf8').split('\n')) {
+    if (!line.trim()) continue; // ignore blank lines
+    try {
+      const cp = JSON.parse(line.trim());
+      if (cp && cp.id) { // entries without an id are ignored
+        const existing = checkpoints.findIndex(c => c.id === cp.id);
+        if (existing >= 0) checkpoints[existing] = cp;
+        else checkpoints.push(cp);
+      }
+    } catch {} // a line that is not valid JSON is skipped
+  }
+}
+
 // A phase passes only if its terminal checkpoint was emitted.
 const TERMINALS = { setup: 'T2-done', sprint: 'T3-done' };
 const requiredTerminals = [];

diff --git a/.github/e2e/setup-script.md b/.github/e2e/setup-script.md
@@ -6,6 +6,9 @@ PM: {{PM_OS}} / {{PM_PROVIDER}} | VCS: {{VCS}} | Toy: {{TOY_PROJECT_URL}}
 
 Do all work yourself in this conversation -- no sub-agents. If a step fails, move on to the next one.
 
+> [!IMPORTANT]
+> If you are running in non-interactive print mode (e.g. through a CI runner or using the `-p` / `--print` flags): after printing each CHECKPOINT line, you MUST immediately execute a tool call (e.g. `fleet_status`) and then continue to the next task without stopping. Never return a text-only response until you have printed ALL checkpoints including the final `-done` one.
+
 ## Checkpoints
 
 When you finish a step, print one line, exactly like this, as plain text (no code block, no backticks):
@@ -63,7 +66,7 @@ grep -q 'HOME/bin' ~/.profile 2>/dev/null || echo 'export PATH=$HOME/bin:$PATH'
 
 CHECKPOINT: {"id":"T1","status":"PASS","notes":"..."}
 
----
+After printing that line, immediately call `fleet_status` and then start T2 without stopping.
 
 ## T2: Basic Execution
 
@@ -75,6 +78,8 @@ Write any scratch files into the run directory (the current working directory),
 
 CHECKPOINT: {"id":"T2","status":"PASS","notes":"..."}
 
+After printing that line, immediately call `fleet_status` and then print T2-done without stopping.
+
 ### Done
 
 Print this only after T1 and T2 are done:

diff --git a/.github/e2e/sprint-script.md b/.github/e2e/sprint-script.md
@@ -9,16 +9,22 @@ PM: {{PM_OS}} / {{PM_PROVIDER}} | VCS: {{VCS}} | Toy: {{TOY_PROJECT_URL}}
 - **doer** (name: `doer`, provider: {{DOER_PROVIDER}})
 - **reviewer** (name: `reviewer`, provider: {{REVIEWER_PROVIDER}})
 
+> [!IMPORTANT]
+> Do NOT print CHECKPOINT lines as plain text -- that causes agy to exit. Instead, record each checkpoint by running a command (see below). After recording, immediately continue to the next task without any text-only response.
+
 ## Checkpoints
 
-When you finish a step, print one line, exactly like this, as plain text (no code block, no backticks):
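+Record each checkpoint by running this PowerShell command (replace id, status, notes). If PowerShell is not available (e.g. on a Linux or macOS PM), append the same JSON line to `checkpoints.json` with your shell instead, e.g. `echo '{"id":"T3-repo-setup","status":"PASS","notes":"one short note"}' >> checkpoints.json`: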
 
-  CHECKPOINT: {"id":"T3-repo-setup","status":"PASS","notes":"one short note"}
+```powershell
+Add-Content -Path checkpoints.json -Value '{"id":"T3-repo-setup","status":"PASS","notes":"one short note"}'
+```
 
-- One line per step. One JSON object, not an array. Print it once.
-- If a step fails, print it with `"status":"FAIL"` and move on to the next step.
+- One JSON object per line appended to `checkpoints.json` in the current working directory.
+- If a step fails, write `"status":"FAIL"` and continue to the next step.
 - The steps are: `T3-repo-setup`, `T3-discover`, `T3-sprint`, `T3-pr-verified`, `T3-done`.
-- Print `T3-done` last, only after the other four. If `T3-done` is missing, the phase failed.
+- After writing each checkpoint, immediately continue to the next task -- no pausing, no text summary.
+- Write `T3-done` last. If it is missing from `checkpoints.json` after the session, the phase failed.
 
 ---
 
@@ -30,17 +36,29 @@ Run a full sprint on the toy repo using the pm skill. Do all of it yourself in t
 
 On the doer: clone {{TOY_PROJECT_URL}} into its work folder if needed, then `git fetch origin && git checkout main && git pull`. Provision {{VCS}} auth.
 
-CHECKPOINT: {"id":"T3-repo-setup","status":"PASS","notes":"..."}
+Record checkpoint:
+```powershell
+Add-Content -Path checkpoints.json -Value '{"id":"T3-repo-setup","status":"PASS","notes":"...your note..."}'
+```
+Then immediately continue to T3.2.
 
 ### T3.2 Pick the work
 
 Run `bd ready` on the doer. Pick 3 P1 issues. Write `requirements.md` for them into the current working directory.
 
-CHECKPOINT: {"id":"T3-discover","status":"PASS","notes":"..."}
+Record checkpoint:
+```powershell
+Add-Content -Path checkpoints.json -Value '{"id":"T3-discover","status":"PASS","notes":"...your note..."}'
+```
+Then immediately continue to T3.3.
 
 ### T3.3 Run the sprint
 
-Activate the pm skill, then run:
+Activate the pm skill.
+
+Before running any `/pm` commands, read `projects.md` in the current directory:
+- If `fleet-e2e-toy` already appears in the table with Phase 1 or later and status "active", **skip `/pm init`, `/pm pair`, and `/pm plan`** -- the plan is already done. Go directly to `/pm start doer`.
+- If `fleet-e2e-toy` is not present, run the full sequence:
 
 ```
 /pm init fleet-e2e-toy
@@ -51,21 +69,33 @@ Activate the pm skill, then run:
 
 Branch prefix: `{{BRANCH_PREFIX}}`.
 
-The pm skill runs the doer/reviewer loop. After `/pm start doer`, keep driving that loop yourself: when the doer reaches review, dispatch the reviewer; when the reviewer asks for changes, dispatch the doer again. Repeat until the reviewer approves, then run `/pm cleanup fleet-e2e-toy`. Do not stop until the sprint is approved.
+The pm skill runs the doer/reviewer loop. Drive it yourself:
+1. Dispatch the doer with `execute_prompt`. Wait for its response.
+2. Read the doer's response (it will be in the `execute_prompt` result). If it says VERIFY or requests review, dispatch the reviewer.
+3. Read the reviewer's response. If the reviewer requests changes, dispatch the doer again. Repeat until the reviewer explicitly approves.
+4. A reviewer approval means the reviewer's response contains words like "approved", "LGTM", or "no changes needed". If `execute_prompt` returns empty or an error, re-dispatch.
+5. Once approved, run `/pm cleanup fleet-e2e-toy`.
+
+Do NOT record T3-sprint PASS until you have confirmed a reviewer approval response in the execute_prompt result (not just dispatched -- you must read the response).
 
-CHECKPOINT: {"id":"T3-sprint","status":"PASS","notes":"..."}
+Record checkpoint:
+```powershell
+Add-Content -Path checkpoints.json -Value '{"id":"T3-sprint","status":"PASS","notes":"...your note..."}'
+```
+Then immediately continue to T3.4.
 
 ### T3.4 Check the result
 
 Confirm a branch with prefix `{{BRANCH_PREFIX}}` exists on origin and a PR was raised.
 
-CHECKPOINT: {"id":"T3-pr-verified","status":"PASS","notes":"..."}
-
-### Done
-
-Print this only after the four steps above are done:
-
-CHECKPOINT: {"id":"T3-done","status":"PASS","notes":"sprint phase finished"}
+Record checkpoint:
+```powershell
+Add-Content -Path checkpoints.json -Value '{"id":"T3-pr-verified","status":"PASS","notes":"...your note..."}'
+```
+Then, only after the four checkpoints above have been recorded, record T3-done:
+```powershell
+Add-Content -Path checkpoints.json -Value '{"id":"T3-done","status":"PASS","notes":"sprint phase finished"}'
+```
 
 ---
 

diff --git a/.github/e2e/suites.json b/.github/e2e/suites.json
@@ -71,6 +71,24 @@
       "doer":     { "os": "local_doer_macos",     "provider": "gemini", "type": "local" },
       "reviewer": { "os": "local_reviewer_macos", "provider": "gemini", "type": "local" },
       "vcs": "github"
+    },
+    "s8.1": {
+      "pm":       { "os": "windows", "provider": "agy", "runner": "fleet-windows" },
+      "doer":     { "os": "local_doer_windows",     "provider": "agy", "type": "local" },
+      "reviewer": { "os": "local_reviewer_windows", "provider": "agy", "type": "local" },
+      "vcs": "github"
+    },
+    "s8.2": {
+      "pm":       { "os": "linux",   "provider": "agy", "runner": "fleet-linux" },
+      "doer":     { "os": "local_doer_linux",     "provider": "agy", "type": "local" },
+      "reviewer": { "os": "local_reviewer_linux", "provider": "agy", "type": "local" },
+      "vcs": "github"
+    },
+    "s8.3": {
+      "pm":       { "os": "macos",   "provider": "agy", "runner": "fleet-macos" },
+      "doer":     { "os": "local_doer_macos",     "provider": "agy", "type": "local" },
+      "reviewer": { "os": "local_reviewer_macos", "provider": "agy", "type": "local" },
+      "vcs": "github"
     }
   }
 }