From 798a76d4c5deef9cff101fc53fbd9277fe8e163c Mon Sep 17 00:00:00 2001
From: Thibaut Fatus <tfatus@gmail.com>
Date: Thu, 11 Jun 2026 13:44:10 +0200
Subject: [PATCH] [chore] add manual run-completion scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

manual-rerun.mjs: human-in-the-loop driver to collect app conversations
when a target can't be driven automatically — prints each user turn, the
operator pastes the app reply, and the benchmark's own user-simulator
generates the next turn. complete-run.mjs: judge the collected transcripts
(runTest with full startMessages → judges only) and overwrite the matching
.kora-run-tmp results so a re-run aggregates the finished run. Both
parameterized by RUN_DIR. See scripts/README.md.
---
 README.md                |   1 +
 scripts/README.md        |  40 ++++++++++++
 scripts/complete-run.mjs |  76 +++++++++++++++++++++++
 scripts/manual-rerun.mjs | 130 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 247 insertions(+)
 create mode 100644 scripts/README.md
 create mode 100644 scripts/complete-run.mjs
 create mode 100644 scripts/manual-rerun.mjs
diff --git a/README.md b/README.md
index 325a94d..e189310 100644
--- a/README.md
+++ b/README.md
@@ -574,6 +574,7 @@ All commands run with a concurrency of 10 parallel tasks.
 .env.example                         Environment variable template
 models.json                          Model registry configuration
 data/                                Scenario pipeline output (seeds, scenarios, results)
+scripts/                             Operator tooling (manual run completion — see scripts/README.md)
 packages/
   benchmark/
     data/                            Risk taxonomy, motivations, mechanisms (risks.json, motivations.json, mechanisms.ts)
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..f031d6d
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,40 @@
+# scripts
+
+Operator tooling for finishing a benchmark run by hand when a target can't be
+driven automatically (e.g. an app web-runner that hits a transient backend
+error and auto-skips a scenario).
+
+Both scripts import the **built** packages (`packages/*/build/...`), so run
+`yarn build` (or `yarn tsbuild`) first, and pass the gateway/runner env with
+`node --env-file=.env`. They read `models.json` from the repo root.
+
+## `manual-rerun.mjs` — collect conversations, human-in-the-loop
+
+Drives one scenario at a time: prints the user (child) turn, you paste the app's
+reply, and it uses the **same** user-simulator the benchmark uses
+(`generateNextUserMessage`) to produce the next turn — so multi-turn
+conversations stay faithful to an automated run. Transcripts persist
+append-only to `RUN_DIR/manual-reruns.json` (+ a readable `.md`).
+
+Seed `RUN_DIR/manual-reruns.json` with one entry per scenario:
+`{scenario, messages: [{role: "user", content: <firstUserMessage>}]}`.
+
+```sh
+RUN_DIR=data/<run> node --env-file=.env scripts/manual-rerun.mjs <idx> [assistantFile]
+#   <idx>           1-based scenario index into manual-reruns.json
+#   [assistantFile] file with the pasted app reply; omit to (re)print the
+#                   pending user message
+```
+
+## `complete-run.mjs` — judge transcripts, merge into the run
+
+For each collected transcript, calls `kora.runTest` with the full conversation
+as `startMessages` (the turn loop is skipped → straight to the judges) and
+overwrites the matching `RUN_DIR/.kora-run-tmp/<hash>.json` (matched by
+`scenario.seed.id`). Re-run `kora run -o RUN_DIR/results.json` afterwards to
+cache-aggregate every result into the final `results.json` + `.zip`.
+
+```sh
+RUN_DIR=data/<run> [JUDGE=gpt-5.2:medium:limited] \
+  node --env-file=.env scripts/complete-run.mjs
+```
diff --git a/scripts/complete-run.mjs b/scripts/complete-run.mjs
new file mode 100644
index 0000000..ce66ef5
--- /dev/null
+++ b/scripts/complete-run.mjs
@@ -0,0 +1,76 @@
+/**
+ * Complete a run by replacing auto-skipped/sentinel testResults with real,
+ * judge-graded results built from manually-collected conversations (see
+ * manual-rerun.mjs, which produces the manual-reruns.json transcript store).
+ *
+ * For each manual entry it calls kora.runTest with the full transcript as
+ * startMessages — the conversation loop is skipped (startTurn === budget) and
+ * it goes straight to the judges — then overwrites the matching temp file
+ * (matched by scenario.seed.id) under .kora-run-tmp so a subsequent
+ * `kora run` cache-aggregates every result into the final results.json + .zip.
+ *
+ * Prereqs: packages are built (`yarn build`/`tsbuild`); models.json present.
+ * Usage:
+ *   RUN_DIR=data/<run> [JUDGE=gpt-5.2:medium:limited] \
+ *     node --env-file=.env scripts/complete-run.mjs
+ *   (RUN_DIR must contain manual-reruns.json and a .kora-run-tmp/ with the
+ *    other cached results; re-run `kora run -o <RUN_DIR>/results.json` after.)
+ */
+import {readFileSync, readdirSync, writeFileSync} from "node:fs";
+import path from "node:path";
+import {kora} from "../packages/benchmark/build/src/index.js";
+import {createGatewayModel} from "../packages/cli/build/src/models/gatewayModel.js";
+
+const DIR = process.env.RUN_DIR ?? "data/2026-06-10-gemini-104";
+const TMP = path.join(DIR, ".kora-run-tmp");
+const STORE = path.join(DIR, "manual-reruns.json");
+const JUDGE = process.env.JUDGE ?? "gpt-5.2:medium:limited";
+
+const judgeModel = createGatewayModel(path.resolve("models.json"), JUDGE);
+
+// Only judgeModels is exercised: a complete transcript skips the turn loop, so
+// the user/assistant stubs throw if runTest tries to extend a conversation.
+const ctx = {
+  getUserResponse: async () => {
+    throw new Error("user model must not be called");
+  },
+  getAssistantResponse: async () => {
+    throw new Error("assistant model must not be called");
+  },
+  judgeModels: {
+    [JUDGE]: {
+      getResponse: async request => ({
+        output: await judgeModel.getStructuredResponse(request),
+      }),
+    },
+  },
+};
+
+// Index the cached results: scenario.seed.id -> temp filename.
+const seedToFile = new Map();
+for (const f of readdirSync(TMP).filter(n => n.endsWith(".json"))) {
+  const cached = JSON.parse(readFileSync(path.join(TMP, f), "utf8"));
+  const id = cached?.scenario?.seed?.id;
+  if (id) seedToFile.set(id, f);
+}
+
+// Resolve every target file before judging, so a missing match aborts the run
+// before any judge call is made or any cached result is overwritten.
+const store = JSON.parse(readFileSync(STORE, "utf8"));
+const jobs = store.map(entry => {
+  const seedId = entry.scenario.seed.id;
+  const label = entry.title ?? seedId; // seeded entries may carry no title
+  const file = seedToFile.get(seedId);
+  if (!file) throw new Error(`No temp file for seed ${seedId} (${label})`);
+  return {entry, label, file};
+});
+
+for (const {entry, label, file} of jobs) {
+  const key = kora.mapScenarioToKeys(entry.scenario, ["default"])[0];
+  const testResult = await kora.runTest(ctx, entry.scenario, key, entry.messages);
+  writeFileSync(path.join(TMP, file), JSON.stringify(testResult, null, 2));
+  const grade = testResult?.assessment?.grade ?? "?";
+  console.log(`✓ ${label}  →  ${file}  [grade: ${grade}]`);
+}
+
+console.log(`\nRe-judged ${store.length} scenarios. Now run \`kora run\` to aggregate.`);
diff --git a/scripts/manual-rerun.mjs b/scripts/manual-rerun.mjs
new file mode 100644
index 0000000..d5a7ff7
--- /dev/null
+++ b/scripts/manual-rerun.mjs
@@ -0,0 +1,130 @@
+/**
+ * Manual re-run driver for app-target scenarios that can't be driven
+ * automatically (e.g. a web app that returns a transient backend error and
+ * gets auto-skipped). Local, human-in-the-loop: the operator pastes each app
+ * reply; this script records it and uses the SAME user-simulator the benchmark
+ * uses (generateNextUserMessage) to produce the next user turn — keeping the
+ * multi-turn conversation faithful to an automated run. Pair with
+ * complete-run.mjs to judge the collected transcripts and finish the run.
+ *
+ * The transcript store (RUN_DIR/manual-reruns.json) is seeded by the operator,
+ * one entry per scenario: {scenario, messages:[{role:"user",content:first}]}.
+ *
+ * Prereqs: packages are built (`yarn build`/`tsbuild`); models.json present.
+ * Usage:
+ *   RUN_DIR=data/<run> [USER_MODEL=deepseek-v3.2] \
+ *     node --env-file=.env scripts/manual-rerun.mjs <idx> [assistantFile]
+ *     <idx>           1-based scenario index into RUN_DIR/manual-reruns.json
+ *     [assistantFile] file with the pasted app reply for the current turn;
+ *                     omit to just (re)print the pending user message.
+ */
+import {readFileSync, writeFileSync} from "node:fs";
+import path from "node:path";
+import {
+  generateNextUserMessage,
+  RiskCategory,
+} from "../packages/benchmark/build/src/index.js";
+import {createGatewayModel} from "../packages/cli/build/src/models/gatewayModel.js";
+
+const RUN_DIR = process.env.RUN_DIR ?? "data/2026-06-10-gemini-104";
+const STORE = path.join(RUN_DIR, "manual-reruns.json");
+const MD = path.join(RUN_DIR, "manual-reruns.md");
+const USER_MODEL = process.env.USER_MODEL ?? "deepseek-v3.2";
+// Display name of the app under test (report labels and the paste prompt).
+const APP = process.env.APP_NAME ?? "Gemini";
+
+/** Look up the risk definition referenced by a scenario's seed. */
+const riskOf = ({seed}) =>
+  RiskCategory.findRisk(RiskCategory.find(seed.riskCategoryId), seed.riskId);
+/** Count the full user/assistant exchanges in a transcript. */
+const turnsIn = messages => Math.floor(messages.length / 2);
+/** Display fields; bare {scenario, messages} entries fall back to the seed. */
+const describe = ({title, riskId, age, seed, scenario}) => ({
+  title: title ?? scenario.seed.id,
+  riskId: riskId ?? scenario.seed.riskId,
+  age: age ?? "n/a",
+  seed: seed ?? scenario.seed.id,
+});
+
+const idx = Number(process.argv[2]);
+const assistantFile = process.argv[3];
+const store = JSON.parse(readFileSync(STORE, "utf8"));
+if (!Number.isInteger(idx) || idx < 1 || idx > store.length) {
+  throw new Error(`idx must be 1..${store.length}`);
+}
+const entry = store[idx - 1];
+const {scenario, messages} = entry;
+const risk = riskOf(scenario);
+const {conversationLength} = risk;
+
+const userModel = createGatewayModel(path.resolve("models.json"), USER_MODEL);
+const ctx = {
+  getUserResponse: async request => ({
+    output: await userModel.getTextResponse(request),
+  }),
+};
+
+if (assistantFile) {
+  if (messages.at(-1)?.role !== "user") {
+    throw new Error(
+      `Scenario ${idx} is not awaiting an assistant reply (last message is not a user turn).`
+    );
+  }
+  const content = readFileSync(assistantFile, "utf8").trim();
+  // An empty paste would otherwise be stored as a real reply and use up a turn.
+  if (!content) throw new Error(`Assistant file ${assistantFile} is empty.`);
+  messages.push({role: "assistant", content});
+}
+
+let nextTurn = null;
+const pending = messages.at(-1);
+if (pending?.role === "user") {
+  nextTurn = pending.content; // still awaiting a reply: just re-print it
+} else if (turnsIn(messages) >= conversationLength) {
+  entry.done = true;
+} else {
+  nextTurn = await generateNextUserMessage(ctx, risk, scenario, messages);
+  messages.push({role: "user", content: nextTurn});
+}
+
+writeFileSync(STORE, JSON.stringify(store, null, 2));
+const md = [
+  `# Manual re-run of ${store.length} scenarios — ${RUN_DIR}`,
+  "",
+  `Target: ${APP} (manual paste). Collected with scripts/manual-rerun.mjs.`,
+  "",
+  ...store.flatMap((e, i) => {
+    const {title, riskId, age, seed} = describe(e);
+    const budget = riskOf(e.scenario).conversationLength;
+    const progress = `(${turnsIn(e.messages)}/${budget} turns)`;
+    return [
+      `## ${i + 1}. ${title}`,
+      "",
+      `- riskId: \`${riskId}\`  age: \`${age}\`  seed: \`${seed}\``,
+      `- status: _${e.done ? "complete" : "in progress"} ${progress}_`,
+      "",
+      ...e.messages.map(m => {
+        const label = m.role === "user" ? "user (child)" : `assistant (${APP})`;
+        return `**${label}:**\n\n> ${m.content.replace(/\n/g, "\n> ")}\n`;
+      }),
+      "---",
+      "",
+    ];
+  }),
+].join("\n");
+writeFileSync(MD, md);
+
+const {title, riskId, age} = describe(entry);
+console.log(`\n=== Scenario ${idx}/${store.length}: ${title} ===`);
+console.log(`riskId=${riskId} age=${age}  budget=${conversationLength} turns`);
+console.log(`completed turns: ${turnsIn(messages)}`);
+if (entry.done) {
+  console.log("\n*** SCENARIO COMPLETE — all turns done. Move to next. ***");
+} else if (nextTurn) {
+  const turn = Math.ceil(messages.length / 2);
+  console.log(
+    `\n----- PASTE THIS INTO ${APP.toUpperCase()} (turn ${turn}) -----\n`
+  );
+  console.log(nextTurn);
+  console.log("\n------------------------------------------------------------");
+}