From 9e3e5b618d1acb59efc3c93c05a21371560df360 Mon Sep 17 00:00:00 2001 From: Elron Bandel Date: Sun, 31 May 2026 16:01:35 +0300 Subject: [PATCH] run: scope non-canonical --mode job overlay patches to the eval Job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The synthesized Kustomize overlay for non-canonical `--mode job` runs patched the runner/gateway env and pod-template labels with an unscoped `target: { kind: Job }`. On a benchmark that ships a bespoke second Job in the same manifest — only tau-bench today (a user-sim `harness` Job beside the eval `tau-bench-task-0` Job) — those strategic-merge patches leaked into the harness Job too, injecting imageless `runner` and `gateway` containers. The result is an invalid Job spec, so `run tau-bench --mode job --task-id N --agent X` produced a manifest that fails admission. Scope all three patches (runner env, pod labels, gateway env) to the canonical Job by name, and move the rename patch LAST so the name-scoped patches still match `-task-0` before the rename changes it. Verified by rendering the actual generated overlay: - tau-bench: harness Job now has only [harness] (was [gateway, runner, harness], two of them imageless); the task-N Job is unchanged. - single-Job benchmarks (mmlu-pro, ...): render identically — no regression. Co-Authored-By: Claude Opus 4.8 --- src/run.rs | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/src/run.rs b/src/run.rs index b570f22e..628a93a8 100644 --- a/src/run.rs +++ b/src/run.rs @@ -559,6 +559,7 @@ fn run_job( format!( r#" - target: kind: Job + name: {benchmark}-task-{canonical_task} patch: |- apiVersion: batch/v1 kind: Job @@ -590,17 +591,31 @@ images: task: "{want_task}" includeSelectors: false patches: - # Rename the Job so multiple tasks can be applied concurrently. + # Every patch below targets the canonical Job BY NAME. Some benchmarks + # (e.g. tau-bench) ship a bespoke second Job — a user-sim harness — in + # the same manifest. An unscoped `kind: Job` target strategic-merges the + # runner/gateway container into that Job too, producing an imageless + # container that fails admission. Name-scoping confines us to the eval Job. + # Override runner env vars. - target: kind: Job name: {benchmark}-task-{canonical_task} patch: |- - - op: replace - path: /metadata/name - value: {benchmark}-task-{want_task} + apiVersion: batch/v1 + kind: Job + metadata: + name: {benchmark}-task-{canonical_task} + spec: + template: + spec: + containers: + - name: runner + env: +{runner_env_block} # Sync pod-template labels so `kubectl get pods -l agent=…` works. - target: kind: Job + name: {benchmark}-task-{canonical_task} patch: |- apiVersion: batch/v1 kind: Job @@ -612,22 +627,16 @@ patches: labels: agent: {want_agent} task: "{want_task}" - # Override runner env vars. +{gateway_patch} # Rename the Job LAST — the name-scoped patches above must match the + # canonical name before the rename changes it. Lets concurrent tasks coexist. - target: kind: Job + name: {benchmark}-task-{canonical_task} patch: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {benchmark}-task-{canonical_task} - spec: - template: - spec: - containers: - - name: runner - env: -{runner_env_block} -{gateway_patch}"#, + - op: replace + path: /metadata/name + value: {benchmark}-task-{want_task} +"#, rel_base = rel_base.display(), ); let kustomization_path = tmp_dir.join("kustomization.yaml");