Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,7 @@ All commands run with a concurrency of 10 parallel tasks.
.env.example Environment variable template
models.json Model registry configuration
data/ Scenario pipeline output (seeds, scenarios, results)
scripts/ Operator tooling (manual run completion — see scripts/README.md)
packages/
benchmark/
data/ Risk taxonomy, motivations, mechanisms (risks.json, motivations.json, mechanisms.ts)
Expand Down
40 changes: 40 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# scripts

Operator tooling for finishing a benchmark run by hand when a target can't be
driven automatically (e.g. an app web-runner that hits a transient backend
error and auto-skips a scenario).

Both scripts import the **built** packages (`packages/*/build/...`), so run
`yarn build` (or `yarn tsbuild`) first, and pass the gateway/runner env with
`node --env-file=.env`. They read `models.json` from the repo root.

## `manual-rerun.mjs` — collect conversations, human-in-the-loop

Drives one scenario at a time: prints the user (child) turn, you paste the app's
reply, and it uses the **same** user-simulator the benchmark uses
(`generateNextUserMessage`) to produce the next turn — so multi-turn
conversations stay faithful to an automated run. Transcripts persist
append-only to `RUN_DIR/manual-reruns.json` (+ a readable `.md`).

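Seed `RUN_DIR/manual-reruns.json` with a JSON array holding one entry per scenario:
`{scenario, messages: [{role: "user", content: <firstUserMessage>}]}`.
Entries may also carry `title`, `riskId`, `age`, and `seed` fields, which the
scripts use for display.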

```sh
RUN_DIR=data/<run> node --env-file=.env scripts/manual-rerun.mjs <idx> [assistantFile]
# <idx> 1-based scenario index into manual-reruns.json
# [assistantFile] file with the pasted app reply; omit to (re)print the
# pending user message
```

## `complete-run.mjs` — judge transcripts, merge into the run

For each collected transcript, calls `kora.runTest` with the full conversation
as `startMessages` (the turn loop is skipped → straight to the judges) and
overwrites the matching `RUN_DIR/.kora-run-tmp/<hash>.json` (matched by
`scenario.seed.id`). Re-run `kora run -o RUN_DIR/results.json` afterwards to
cache-aggregate every result into the final `results.json` + `.zip`.

```sh
RUN_DIR=data/<run> [JUDGE=gpt-5.2:medium:limited] \
node --env-file=.env scripts/complete-run.mjs
```
76 changes: 76 additions & 0 deletions scripts/complete-run.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/**
* Complete a run by replacing auto-skipped/sentinel testResults with real,
* judge-graded results built from manually-collected conversations (see
* manual-rerun.mjs, which produces the manual-reruns.json transcript store).
*
* For each manual entry it calls kora.runTest with the full transcript as
* startMessages — the conversation loop is skipped (startTurn === budget) and
* it goes straight to the judges — then overwrites the matching temp file
* (matched by scenario.seed.id) under .kora-run-tmp so a subsequent
* `kora run` cache-aggregates every result into the final results.json + .zip.
*
* Prerequisites: packages are built (`yarn build`/`tsbuild`); models.json present.
* Usage:
* RUN_DIR=data/<run> [JUDGE=gpt-5.2:medium:limited] \
* node --env-file=.env scripts/complete-run.mjs
* (RUN_DIR must contain manual-reruns.json and a .kora-run-tmp/ with the
* other cached results; re-run `kora run -o <RUN_DIR>/results.json` after.)
*/
import {readFileSync, readdirSync, writeFileSync} from "node:fs";
import path from "node:path";
import {kora} from "../packages/benchmark/build/src/index.js";
import {createGatewayModel} from "../packages/cli/build/src/models/gatewayModel.js";

// Run configuration, overridable through the environment (RUN_DIR, JUDGE).
const {
  RUN_DIR: DIR = "data/2026-06-10-gemini-104",
  JUDGE = "gpt-5.2:medium:limited",
} = process.env;

// Files inside the run directory: the cached per-scenario results and the
// store of manually collected transcripts.
const inRunDir = name => path.join(DIR, name);
const TMP = inRunDir(".kora-run-tmp");
const STORE = inRunDir("manual-reruns.json");

// models.json is resolved against the current working directory.
const modelsJsonPath = path.resolve("models.json");
const judgeModel = createGatewayModel(modelsJsonPath, JUDGE);

// Runner context handed to kora.runTest. Only judgeModels is expected to be
// exercised (the conversation loop is skipped), so the two conversation
// callbacks reject loudly if they are ever invoked.
const rejectModelCall = who => async () => {
  throw new Error(`${who} model must not be called`);
};

const ctx = {
  getUserResponse: rejectModelCall("user"),
  getAssistantResponse: rejectModelCall("assistant"),
  judgeModels: {
    [JUDGE]: {
      // Wrap the judge's structured response in the {output} envelope.
      async getResponse(request) {
        const output = await judgeModel.getStructuredResponse(request);
        return {output};
      },
    },
  },
};

// Index the cached results under TMP as scenario.seed.id -> temp filename, so
// each manual transcript can be matched to the cached result it replaces.
// A null-prototype object is used so that an id such as "constructor" or
// "toString" can never resolve to an inherited Object.prototype member.
const seedToFile = Object.create(null);
for (const f of readdirSync(TMP).filter(n => n.endsWith(".json"))) {
  const cachedPath = path.join(TMP, f);
  let d;
  try {
    d = JSON.parse(readFileSync(cachedPath, "utf8"));
  } catch (err) {
    // Name the offending file: a bare read/parse error does not say which
    // cache entry is broken.
    throw new Error(`Cannot read cached result ${cachedPath}`, {cause: err});
  }
  const id = d?.scenario?.seed?.id;
  if (!id) continue; // no seed id in this file — nothing to match against
  if (seedToFile[id]) {
    // Matching is by seed id only, so the later file wins; make that visible.
    console.warn(`Duplicate seed ${id}: ${seedToFile[id]} superseded by ${f}`);
  }
  seedToFile[id] = f;
}

// Transcript store: a JSON array of {scenario, messages, ...} entries.
const store = JSON.parse(readFileSync(STORE, "utf8"));
if (!Array.isArray(store)) {
  throw new Error(`${STORE} must contain a JSON array of transcript entries`);
}

// Judge every collected transcript and overwrite the cached result of the same
// scenario (matched by scenario.seed.id).
for (const entry of store) {
  const seedId = entry.scenario.seed.id;
  // `title` is an optional display field; fall back to the seed id so errors
  // and logs never read "undefined".
  const label = entry.title ?? seedId;

  // Only a filename counts as a match; anything else (e.g. an inherited object
  // member picked up for an id like "constructor") means there is no cached
  // file for this seed.
  const file = seedToFile[seedId];
  if (typeof file !== "string" || file === "") {
    throw new Error(`No temp file for seed ${seedId} (${label})`);
  }

  // The full transcript is passed as the starting messages of the test.
  const key = kora.mapScenarioToKeys(entry.scenario, ["default"])[0];
  const testResult = await kora.runTest(ctx, entry.scenario, key, entry.messages);

  writeFileSync(path.join(TMP, file), JSON.stringify(testResult, null, 2));

  const grade = testResult?.assessment?.grade ?? "?";
  console.log(`✓ ${label} → ${file} [grade: ${grade}]`);
}

console.log(`\nRe-judged ${store.length} scenarios. Now run \`kora run\` to aggregate.`);
130 changes: 130 additions & 0 deletions scripts/manual-rerun.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/**
* Manual re-run driver for app-target scenarios that can't be driven
* automatically (e.g. a web app that returns a transient backend error and
* gets auto-skipped). Local, human-in-the-loop: the operator pastes each app
* reply; this script records it and uses the SAME user-simulator the benchmark
* uses (generateNextUserMessage) to produce the next user turn — keeping the
* multi-turn conversation faithful to an automated run. Pair with
* complete-run.mjs to judge the collected transcripts and finish the run.
*
* The transcript store (RUN_DIR/manual-reruns.json) is seeded by the operator,
* one entry per scenario: {scenario, messages:[{role:"user",content:first}]}.
*
* Prerequisites: packages are built (`yarn build`/`tsbuild`); models.json present.
* Usage:
* RUN_DIR=data/<run> [USER_MODEL=deepseek-v3.2] \
* node --env-file=.env scripts/manual-rerun.mjs <idx> [assistantFile]
* <idx> 1-based scenario index into RUN_DIR/manual-reruns.json
* [assistantFile] file with the pasted app reply for the current turn;
* omit to just (re)print the pending user message.
*/
import {readFileSync, writeFileSync} from "node:fs";
import path from "node:path";
import {
generateNextUserMessage,
RiskCategory,
} from "../packages/benchmark/build/src/index.js";
import {createGatewayModel} from "../packages/cli/build/src/models/gatewayModel.js";

// Run configuration, overridable through the environment (RUN_DIR, USER_MODEL).
const RUN_DIR = process.env.RUN_DIR ?? "data/2026-06-10-gemini-104";
// Transcript store and its readable Markdown rendering, both inside RUN_DIR.
// Built with path.join (as complete-run.mjs does) rather than string
// interpolation, so separators are normalized.
const STORE = path.join(RUN_DIR, "manual-reruns.json");
const MD = path.join(RUN_DIR, "manual-reruns.md");
const USER_MODEL = process.env.USER_MODEL ?? "deepseek-v3.2";

// CLI arguments: <idx> (1-based scenario index) and an optional [assistantFile].
const [rawIdx, assistantFile] = process.argv.slice(2);
const idx = Number(rawIdx);

// Load the transcript store; its length bounds the valid index range.
const store = JSON.parse(readFileSync(STORE, "utf8"));

const isValidIdx = Number.isInteger(idx) && idx >= 1 && idx <= store.length;
if (!isValidIdx) {
  throw new Error(`idx must be 1..${store.length}`);
}

// The scenario entry this invocation operates on.
const entry = store[idx - 1];

// Look up the risk definition for this scenario's seed; its conversationLength
// is the turn budget the transcript is collected up to.
const scenarioSeed = entry.scenario.seed;
const category = RiskCategory.find(scenarioSeed.riskCategoryId);
const risk = RiskCategory.findRisk(category, scenarioSeed.riskId);
const {conversationLength} = risk;

// User-simulator model (models.json is resolved against the current working
// directory) and the context object passed to generateNextUserMessage.
const modelsJsonPath = path.resolve("models.json");
const userModel = createGatewayModel(modelsJsonPath, USER_MODEL);

const ctx = {
  // Wrap the model's text response in the {output} envelope.
  async getUserResponse(request) {
    const output = await userModel.getTextResponse(request);
    return {output};
  },
};

// When a reply file is given, record its contents as the assistant turn that
// answers the pending user message.
if (assistantFile) {
  // Check the transcript state first so misuse is reported before any file I/O.
  if (entry.messages.at(-1)?.role !== "user") {
    throw new Error(
      `Scenario ${idx} is not awaiting an assistant reply (last message is not a user turn).`
    );
  }

  const content = readFileSync(assistantFile, "utf8").trim();
  // Refuse a blank reply: it would put an empty assistant turn into the stored
  // transcript and feed it to the user simulator for the next turn.
  if (content === "") {
    throw new Error(`Assistant reply file ${assistantFile} is empty.`);
  }

  entry.messages.push({role: "assistant", content});
}

// Decide what happens next for this scenario:
//   - a user turn is still pending  -> show it again;
//   - the turn budget is not used up -> ask the user simulator for the next turn
//     and append it to the transcript;
//   - otherwise                      -> mark the scenario as done.
const completedTurns = Math.floor(entry.messages.length / 2);
const transcriptTail = entry.messages.at(-1);
let nextUserMessage = null;

if (transcriptTail?.role === "user") {
  nextUserMessage = transcriptTail.content;
} else if (completedTurns < conversationLength) {
  nextUserMessage = await generateNextUserMessage(ctx, risk, entry.scenario, entry.messages);
  entry.messages.push({role: "user", content: nextUserMessage});
} else {
  entry.done = true;
}

// Persist the updated transcripts first, then regenerate the readable report.
writeFileSync(STORE, JSON.stringify(store, null, 2));

// Report metadata is derived from the run instead of being hard-coded, so the
// Markdown stays accurate for any RUN_DIR and any number of scenarios.
// TARGET_NAME is an optional environment variable (default "Gemini") naming the
// app under test in the transcript labels.
const targetName = process.env.TARGET_NAME ?? "Gemini";
const runName = path.basename(path.resolve(RUN_DIR));
const generatedOn = new Date().toISOString().slice(0, 10); // UTC calendar date

const md = [
  `# ${runName} — manual re-run of ${store.length} scenario(s)`,
  "",
  `Last updated: ${generatedOn}. Target: ${targetName} (manual paste). Conversations`,
  "collected by hand for scenarios the automated run could not complete.",
  "",
  ...store.flatMap((e, i) => {
    const {seed} = e.scenario;
    const cl = RiskCategory.findRisk(
      RiskCategory.find(seed.riskCategoryId),
      seed.riskId
    ).conversationLength;
    const turns = Math.floor(e.messages.length / 2);
    const status = e.done
      ? `complete (${turns}/${cl} turns)`
      : `in progress (${turns}/${cl} turns)`;

    // title/riskId/age/seed are optional display fields on a store entry; fall
    // back to the scenario's own seed data so the report never shows "undefined".
    const title = e.title ?? seed.id;
    const riskId = e.riskId ?? seed.riskId;
    const age = e.age ?? "n/a";
    const seedLabel = e.seed ?? seed.id;

    return [
      `## ${i + 1}. ${title}`,
      "",
      `- riskId: \`${riskId}\` age: \`${age}\` seed: \`${seedLabel}\``,
      `- status: _${status}_`,
      "",
      ...e.messages.map(m => {
        const label = m.role === "user" ? "user (child)" : `assistant (${targetName})`;
        // Prefix every line of the message so it renders as one blockquote.
        return `**${label}:**\n\n> ${m.content.replaceAll("\n", "\n> ")}\n`;
      }),
      "---",
      "",
    ];
  }),
].join("\n");
writeFileSync(MD, md);

// Console summary for the operator.
// title/riskId/age are optional display fields on a store entry; fall back to
// the scenario's seed data so the summary never prints "undefined".
const scenarioLabel = entry.title ?? entry.scenario.seed.id;
const scenarioRiskId = entry.riskId ?? entry.scenario.seed.riskId;
const scenarioAge = entry.age ?? "n/a";
// TARGET_NAME is an optional environment variable (default "Gemini") naming the
// app the operator pastes the message into.
const pasteTarget = (process.env.TARGET_NAME ?? "Gemini").toUpperCase();

console.log(`\n=== Scenario ${idx}/${store.length}: ${scenarioLabel} ===`);
console.log(
  `riskId=${scenarioRiskId} age=${scenarioAge} budget=${conversationLength} turns`
);
console.log(`completed turns: ${Math.floor(entry.messages.length / 2)}`);

if (entry.done) {
  console.log("\n*** SCENARIO COMPLETE — all turns done. Move to next. ***");
} else if (nextUserMessage) {
  // The transcript ends with the pending user message here, so rounding up
  // gives the 1-based number of the turn being played.
  const turnNumber = Math.ceil(entry.messages.length / 2);
  console.log(`\n----- PASTE THIS INTO ${pasteTarget} (turn ${turnNumber}) -----\n`);
  console.log(nextUserMessage);
  console.log("\n------------------------------------------------------------");
}
Loading