From 9e3e5b618d1acb59efc3c93c05a21371560df360 Mon Sep 17 00:00:00 2001
From: Elron Bandel <elron.bandel@ibm.com>
Date: Sun, 31 May 2026 16:01:35 +0300
Subject: [PATCH] run: scope non-canonical --mode job overlay patches to the
 eval Job
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For non-canonical `--mode job` runs, the synthesized Kustomize overlay
patched the runner/gateway env and pod-template labels with an unscoped
`target: { kind: Job }`. On a benchmark that ships a bespoke second Job in
the same manifest — only tau-bench today (a user-sim `harness` Job beside
the eval `tau-bench-task-0` Job) — those strategic-merge patches leaked
into the harness Job too, injecting imageless `runner` and `gateway`
containers. The result was an invalid Job spec, so
`run tau-bench --mode job --task-id N --agent X` produced a manifest that
failed admission.

Scope all three patches (runner env, pod labels, gateway env) to the
canonical Job by name, and move the rename patch LAST so the name-scoped
patches still match `<bench>-task-0` before the rename changes it.

Verified by rendering the actual generated overlay:
- tau-bench: harness Job now has only [harness] (was [gateway, runner,
  harness], two of them imageless); the task-N Job is unchanged.
- single-Job benchmarks (mmlu-pro, ...): render identically — no regression.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/run.rs | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)
diff --git a/src/run.rs b/src/run.rs
index b570f22e..628a93a8 100644
--- a/src/run.rs
+++ b/src/run.rs
@@ -559,6 +559,7 @@ fn run_job(
         format!(
             r#"  - target:
       kind: Job
+      name: {benchmark}-task-{canonical_task}
     patch: |-
       apiVersion: batch/v1
       kind: Job
@@ -590,17 +591,31 @@ images:
       task: "{want_task}"
     includeSelectors: false
 patches:
-  # Rename the Job so multiple tasks can be applied concurrently.
+  # Every patch below targets the canonical Job BY NAME. Some benchmarks
+  # (e.g. tau-bench) ship a bespoke second Job — a user-sim harness — in
+  # the same manifest. An unscoped `kind: Job` target strategic-merges the
+  # runner/gateway container into that Job too, producing an imageless
+  # container that fails admission. Name-scoping confines us to the eval Job.
+  # Override runner env vars.
   - target:
       kind: Job
       name: {benchmark}-task-{canonical_task}
     patch: |-
-      - op: replace
-        path: /metadata/name
-        value: {benchmark}-task-{want_task}
+      apiVersion: batch/v1
+      kind: Job
+      metadata:
+        name: {benchmark}-task-{canonical_task}
+      spec:
+        template:
+          spec:
+            containers:
+              - name: runner
+                env:
+{runner_env_block}
   # Sync pod-template labels so `kubectl get pods -l agent=…` works.
   - target:
       kind: Job
+      name: {benchmark}-task-{canonical_task}
     patch: |-
       apiVersion: batch/v1
       kind: Job
@@ -612,22 +627,16 @@ patches:
             labels:
               agent: {want_agent}
               task: "{want_task}"
-  # Override runner env vars.
+{gateway_patch}  # Rename the Job LAST — the name-scoped patches above must match the
+  # canonical name before the rename changes it. Lets concurrent tasks coexist.
   - target:
       kind: Job
+      name: {benchmark}-task-{canonical_task}
     patch: |-
-      apiVersion: batch/v1
-      kind: Job
-      metadata:
-        name: {benchmark}-task-{canonical_task}
-      spec:
-        template:
-          spec:
-            containers:
-              - name: runner
-                env:
-{runner_env_block}
-{gateway_patch}"#,
+      - op: replace
+        path: /metadata/name
+        value: {benchmark}-task-{want_task}
+"#,
         rel_base = rel_base.display(),
     );
     let kustomization_path = tmp_dir.join("kustomization.yaml");