From 798a76d4c5deef9cff101fc53fbd9277fe8e163c Mon Sep 17 00:00:00 2001 From: Thibaut Fatus Date: Thu, 11 Jun 2026 13:44:10 +0200 Subject: [PATCH] [chore] add manual run-completion scripts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit manual-rerun.mjs: human-in-the-loop driver to collect app conversations when a target can't be driven automatically — prints each user turn, the operator pastes the app reply, and the benchmark's own user-simulator generates the next turn. complete-run.mjs: judge the collected transcripts (runTest with full startMessages → judges only) and overwrite the matching .kora-run-tmp results so a re-run aggregates the finished run. Both parameterized by RUN_DIR. See scripts/README.md. --- README.md | 1 + scripts/README.md | 40 ++++++++++++ scripts/complete-run.mjs | 76 +++++++++++++++++++++++ scripts/manual-rerun.mjs | 130 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 247 insertions(+) create mode 100644 scripts/README.md create mode 100644 scripts/complete-run.mjs create mode 100644 scripts/manual-rerun.mjs diff --git a/README.md b/README.md index 325a94d..e189310 100644 --- a/README.md +++ b/README.md @@ -574,6 +574,7 @@ All commands run with a concurrency of 10 parallel tasks. .env.example Environment variable template models.json Model registry configuration data/ Scenario pipeline output (seeds, scenarios, results) +scripts/ Operator tooling (manual run completion — see scripts/README.md) packages/ benchmark/ data/ Risk taxonomy, motivations, mechanisms (risks.json, motivations.json, mechanisms.ts) diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..f031d6d --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,40 @@ +# scripts + +Operator tooling for finishing a benchmark run by hand when a target can't be +driven automatically (e.g. an app web-runner that hits a transient backend +error and auto-skips a scenario). 
+
+Both scripts import the **built** packages (`packages/*/build/...`), so run
+`yarn build` (or `yarn tsbuild`) first, and pass the gateway/runner env with
+`node --env-file=.env`. They read `models.json` from the repo root.
+
+## `manual-rerun.mjs` — collect conversations, human-in-the-loop
+
+Drives one scenario at a time: prints the user (child) turn, you paste the app's
+reply, and it uses the **same** user-simulator the benchmark uses
+(`generateNextUserMessage`) to produce the next turn — so multi-turn
+conversations stay faithful to an automated run. Transcripts persist
+append-only to `RUN_DIR/manual-reruns.json` (+ a readable `.md`).
+
+Seed `RUN_DIR/manual-reruns.json` with one entry per scenario:
+`{scenario, messages: [{role: "user", content: <first user message>}]}`.
+
+```sh
+RUN_DIR=data/<run> node --env-file=.env scripts/manual-rerun.mjs <idx> [assistantFile]
+#   <idx>           1-based scenario index into manual-reruns.json
+#   [assistantFile] file with the pasted app reply; omit to (re)print the
+#                   pending user message
+```
+
+## `complete-run.mjs` — judge transcripts, merge into the run
+
+For each collected transcript, calls `kora.runTest` with the full conversation
+as `startMessages` (the turn loop is skipped → straight to the judges) and
+overwrites the matching `RUN_DIR/.kora-run-tmp/<file>.json` (matched by
+`scenario.seed.id`). Re-run `kora run -o RUN_DIR/results.json` afterwards to
+cache-aggregate every result into the final `results.json` + `.zip`.
+
+```sh
+RUN_DIR=data/<run> [JUDGE=gpt-5.2:medium:limited] \
+  node --env-file=.env scripts/complete-run.mjs
+```
diff --git a/scripts/complete-run.mjs b/scripts/complete-run.mjs
new file mode 100644
index 0000000..ce66ef5
--- /dev/null
+++ b/scripts/complete-run.mjs
@@ -0,0 +1,76 @@
+/**
+ * Complete a run by replacing auto-skipped/sentinel testResults with real,
+ * judge-graded results built from manually-collected conversations (see
+ * manual-rerun.mjs, which produces the manual-reruns.json transcript store).
+ *
+ * For each manual entry it calls kora.runTest with the full transcript as
+ * startMessages — the conversation loop is skipped (startTurn === budget) and
+ * it goes straight to the judges — then overwrites the matching temp file
+ * (matched by scenario.seed.id) under .kora-run-tmp so a subsequent
+ * `kora run` cache-aggregates every result into the final results.json + .zip.
+ *
+ * Prereqs: packages are built (`yarn build`/`tsbuild`); models.json present.
+ * Usage:
+ *   RUN_DIR=data/<run> [JUDGE=gpt-5.2:medium:limited] \
+ *     node --env-file=.env scripts/complete-run.mjs
+ * (RUN_DIR must contain manual-reruns.json and a .kora-run-tmp/ with the
+ * other cached results; re-run `kora run -o <RUN_DIR>/results.json` after.)
+ */
+import {readFileSync, readdirSync, writeFileSync} from "node:fs";
+import path from "node:path";
+import {kora} from "../packages/benchmark/build/src/index.js";
+import {createGatewayModel} from "../packages/cli/build/src/models/gatewayModel.js";
+
+const DIR = process.env.RUN_DIR ?? "data/2026-06-10-gemini-104";
+const TMP = path.join(DIR, ".kora-run-tmp");
+const STORE = path.join(DIR, "manual-reruns.json");
+const JUDGE = process.env.JUDGE ?? "gpt-5.2:medium:limited";
+
+const modelsJsonPath = path.resolve("models.json");
+const judgeModel = createGatewayModel(modelsJsonPath, JUDGE);
+
+// Context: only judgeModels is exercised (the conversation loop is skipped).
+const ctx = {
+  getUserResponse: async () => {
+    throw new Error("user model must not be called");
+  },
+  getAssistantResponse: async () => {
+    throw new Error("assistant model must not be called");
+  },
+  judgeModels: {
+    [JUDGE]: {
+      getResponse: async request => ({
+        output: await judgeModel.getStructuredResponse(request),
+      }),
+    },
+  },
+};
+
+// Map seed.id -> temp filename for every cached result under .kora-run-tmp.
+const seedToFile = new Map();
+for (const f of readdirSync(TMP).filter(n => n.endsWith(".json"))) {
+  const d = JSON.parse(readFileSync(path.join(TMP, f), "utf8"));
+  const id = d?.scenario?.seed?.id;
+  if (id) seedToFile.set(id, f);
+}
+
+const store = JSON.parse(readFileSync(STORE, "utf8"));
+
+// Resolve every target file first so a bad entry aborts before anything is overwritten.
+const jobs = store.map(entry => {
+  const seedId = entry.scenario.seed.id;
+  const label = entry.title ?? seedId; // title is optional in the seeded store
+  const file = seedToFile.get(seedId);
+  if (!file) throw new Error(`No temp file for seed ${seedId} (${label})`);
+  return {entry, label, file};
+});
+
+for (const {entry, label, file} of jobs) {
+  const key = kora.mapScenarioToKeys(entry.scenario, ["default"])[0];
+  const testResult = await kora.runTest(ctx, entry.scenario, key, entry.messages);
+  writeFileSync(path.join(TMP, file), JSON.stringify(testResult, null, 2));
+  const grade = testResult?.assessment?.grade ?? "?";
+  console.log(`✓ ${label} → ${file} [grade: ${grade}]`);
+}
+
+console.log(`\nRe-judged ${store.length} scenarios. Now run \`kora run\` to aggregate.`);
diff --git a/scripts/manual-rerun.mjs b/scripts/manual-rerun.mjs
new file mode 100644
index 0000000..d5a7ff7
--- /dev/null
+++ b/scripts/manual-rerun.mjs
@@ -0,0 +1,130 @@
+/**
+ * Manual re-run driver for app-target scenarios that can't be driven
+ * automatically (e.g. a web app that returns a transient backend error and
+ * gets auto-skipped). Local, human-in-the-loop: the operator pastes each app
+ * reply; this script records it and uses the SAME user-simulator the benchmark
+ * uses (generateNextUserMessage) to produce the next user turn — keeping the
+ * multi-turn conversation faithful to an automated run. Pair with
+ * complete-run.mjs to judge the collected transcripts and finish the run.
+ *
+ * The transcript store (RUN_DIR/manual-reruns.json) is seeded by the operator,
+ * one entry per scenario: {scenario, messages:[{role:"user",content:first}]}.
+ *
+ * Prereqs: packages are built (`yarn build`/`tsbuild`); models.json present.
+ * Usage:
+ *   RUN_DIR=data/<run> [USER_MODEL=deepseek-v3.2] \
+ *     node --env-file=.env scripts/manual-rerun.mjs <idx> [assistantFile]
+ *   <idx>            1-based scenario index into RUN_DIR/manual-reruns.json
+ *   [assistantFile]  file with the pasted app reply for the current turn;
+ *                    omit to just (re)print the pending user message.
+ */
+import {readFileSync, writeFileSync} from "node:fs";
+import path from "node:path";
+import {
+  generateNextUserMessage,
+  RiskCategory,
+} from "../packages/benchmark/build/src/index.js";
+import {createGatewayModel} from "../packages/cli/build/src/models/gatewayModel.js";
+
+const RUN_DIR = process.env.RUN_DIR ?? "data/2026-06-10-gemini-104";
+const STORE = path.join(RUN_DIR, "manual-reruns.json");
+const MD = path.join(RUN_DIR, "manual-reruns.md");
+const USER_MODEL = process.env.USER_MODEL ?? "deepseek-v3.2";
+
+const idx = Number(process.argv[2]);
+const assistantFile = process.argv[3];
+const store = JSON.parse(readFileSync(STORE, "utf8"));
+if (!Number.isInteger(idx) || idx < 1 || idx > store.length) {
+  throw new Error(`idx must be 1..${store.length}`);
+}
+const entry = store[idx - 1];
+
+const category = RiskCategory.find(entry.scenario.seed.riskCategoryId);
+const risk = RiskCategory.findRisk(category, entry.scenario.seed.riskId);
+const conversationLength = risk.conversationLength;
+
+const modelsJsonPath = path.resolve("models.json");
+const userModel = createGatewayModel(modelsJsonPath, USER_MODEL);
+const ctx = {
+  getUserResponse: async request => ({
+    output: await userModel.getTextResponse(request),
+  }),
+};
+
+if (assistantFile) {
+  const content = readFileSync(assistantFile, "utf8").trim();
+  // An empty paste would be recorded as a blank assistant turn; refuse it.
+  if (!content) throw new Error(`Assistant reply file ${assistantFile} is empty.`);
+  if (entry.messages.at(-1)?.role !== "user") {
+    throw new Error(
+      `Scenario ${idx} is not awaiting an assistant reply (last message is not a user turn).`
+    );
+  }
+  entry.messages.push({role: "assistant", content});
+}
+
+const completedTurns = Math.floor(entry.messages.length / 2);
+const lastMessage = entry.messages.at(-1);
+let nextUserMessage = null;
+
+if (lastMessage?.role === "user") {
+  // A user turn is already pending: (re)print it instead of generating a new one.
+  nextUserMessage = lastMessage.content;
+} else if (completedTurns >= conversationLength) {
+  entry.done = true;
+} else {
+  nextUserMessage = await generateNextUserMessage(ctx, risk, entry.scenario, entry.messages);
+  entry.messages.push({role: "user", content: nextUserMessage});
+}
+
+writeFileSync(STORE, JSON.stringify(store, null, 2));
+
+// NOTE(review): heading and intro are hard-coded for one specific run (count, date, error code).
+const md = [
+  "# Gemini 104 — manual re-run of 8 auto-skipped (1099) scenarios",
+  "",
+  "Date: 2026-06-11. Target: Gemini (manual paste). These persistently returned",
+  "backend error 1099 during the automated run and were auto-skipped via sentinel.",
+  "",
+  ...store.flatMap((e, i) => {
+    const cl = RiskCategory.findRisk(
+      RiskCategory.find(e.scenario.seed.riskCategoryId),
+      e.scenario.seed.riskId
+    ).conversationLength;
+    const turns = Math.floor(e.messages.length / 2);
+    const status = e.done
+      ? `complete (${turns}/${cl} turns)`
+      : `in progress (${turns}/${cl} turns)`;
+    return [
+      `## ${i + 1}. ${e.title}`,
+      "",
+      `- riskId: \`${e.riskId}\` age: \`${e.age}\` seed: \`${e.seed}\``,
+      `- status: _${status}_`,
+      "",
+      ...e.messages.map(m => {
+        const label = m.role === "user" ? "user (child)" : "assistant (Gemini)";
+        return `**${label}:**\n\n> ${m.content.replace(/\n/g, "\n> ")}\n`;
+      }),
+      "---",
+      "",
+    ];
+  }),
+].join("\n");
+writeFileSync(MD, md);
+
+console.log(`\n=== Scenario ${idx}/${store.length}: ${entry.title} ===`);
+console.log(
+  `riskId=${entry.riskId} age=${entry.age} budget=${conversationLength} turns`
+);
+console.log(`completed turns: ${Math.floor(entry.messages.length / 2)}`);
+if (entry.done) {
+  console.log("\n*** SCENARIO COMPLETE — all turns done. Move to next. ***");
+} else if (nextUserMessage) {
+  console.log(
+    `\n----- PASTE THIS INTO GEMINI (turn ${Math.ceil(
+      entry.messages.length / 2
+    )}) -----\n`
+  );
+  console.log(nextUserMessage);
+  console.log("\n------------------------------------------------------------");
+}