From b5668487788f13ed42f4d8d50347026d4650575a Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 20:55:51 +0000
Subject: [PATCH 1/6] test: add mock-LLM E2E test for /model slash command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add tests/e2e/mock-llm/mock-llm-model-switch.spec.ts exercising the
full /model mid-conversation profile switch flow against the real
agent-server with a scripted mock LLM backend.

Step 1 — setup:
  - ensureMockLLMProfile() configures agent_settings.llm (proven pattern)
  - POST /api/profiles/{name} creates profile B as the switch target
  - Registers a 3-entry trajectory: padding for the internal LLM call,
    INITIAL_REPLY_TOKEN, POST_SWITCH_REPLY_TOKEN

Step 2 — conversation + /model switch:
  - Starts conversation from home page, waits for agent reply
  - Types '/model model-switch-profile-b' and submits
  - Verifies 'Switched to profile' confirmation in data-testid=model-messages
  - Verifies POST /api/conversations/{id}/switch_profile was intercepted
  - Sends follow-up, verifies agent responds (post-switch continuity)
  - Verifies no error banners

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../mock-llm/mock-llm-model-switch.spec.ts    | 291 ++++++++++++++++++
 1 file changed, 291 insertions(+)
 create mode 100644 tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
diff --git a/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
new file mode 100644
index 00000000..e0698fe7
--- /dev/null
+++ b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
@@ -0,0 +1,291 @@
+/**
+ * Mock-LLM E2E test: /model slash command — mid-conversation LLM profile switching.
+ *
+ * Exercises the full /model flow end-to-end against the real agent-server:
+ *
+ *   1. Setup: configure the active LLM settings via ensureMockLLMProfile
+ *      (the proven pattern), create a named profile B as the switch target
+ *      via the profiles API, and register a trajectory with text replies.
+ *
+ *   2. Conversation + switch: start a conversation from the home page,
+ *      wait for the agent to reply, then type `/model <profile-B>` in the
+ *      chat input. Verify the "Switched to profile" confirmation renders in
+ *      the chat UI. Verify the switch_profile POST was made to the agent-server.
+ *
+ *   3. Post-switch verification: send another message after the switch
+ *      and verify the agent responds, proving the conversation continues
+ *      working under the new profile.
+ */
+
+import { test, expect } from "@playwright/test";
+import {
+  BACKEND_URL,
+  SESSION_API_KEY,
+  MOCK_LLM_AGENT_URL,
+  seedLocalStorage,
+  routeSessionApiKey,
+  dismissAnalyticsModal,
+  waitForTestId,
+  waitForPath,
+  getConversationIdFromURL,
+  waitForNonUserMessageText,
+  deleteConversation,
+  registerTrajectory,
+  activateTrajectory,
+  resetMockLLM,
+  ensureMockLLMProfile,
+} from "./utils/mock-llm-helpers";
+
+/** Profile B is the switch target — created via the profiles API. */
+const PROFILE_B_NAME = "model-switch-profile-b";
+const MODEL_B = "openai/mock-model-beta";
+
+const INITIAL_REPLY_TOKEN = "MODEL_SWITCH_INITIAL_REPLY_OK";
+const POST_SWITCH_REPLY_TOKEN = "MODEL_SWITCH_POST_SWITCH_REPLY_OK";
+
+/** Create a named LLM profile via the agent-server profiles API. */
+async function saveProfile(
+  request: import("@playwright/test").APIRequestContext,
+  name: string,
+  model: string,
+) {
+  // ProfilesClient.saveProfile() uses POST /api/profiles/{name}
+  const resp = await request.post(
+    `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
+    {
+      headers: {
+        "X-Session-API-Key": SESSION_API_KEY,
+        "Content-Type": "application/json",
+      },
+      data: {
+        llm: {
+          model,
+          api_key: "mock-api-key-for-testing",
+          base_url: MOCK_LLM_AGENT_URL,
+        },
+      },
+    },
+  );
+  expect(
+    resp.ok(),
+    `POST /api/profiles/${name} returned ${resp.status()}`,
+  ).toBe(true);
+}
+
+/** Delete a profile (best-effort cleanup). */
+async function deleteProfile(
+  request: import("@playwright/test").APIRequestContext,
+  name: string,
+) {
+  await request.delete(
+    `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
+    { headers: { "X-Session-API-Key": SESSION_API_KEY } },
+  );
+}
+
+/**
+ * Set contentEditable chat input text and dispatch an input event.
+ *
+ * contentEditable divs don't respond reliably to Playwright's .fill() or
+ * .type(), so we set the text programmatically via evaluate.
+ */
+async function setChatInput(
+  page: import("@playwright/test").Page,
+  text: string,
+) {
+  await page.evaluate(
+    ({ testId, inputText }) => {
+      const el = document.querySelector(`[data-testid="${testId}"]`);
+      if (!(el instanceof HTMLElement))
+        throw new Error("Chat input not found");
+      el.focus();
+      el.textContent = inputText;
+      el.dispatchEvent(
+        new InputEvent("input", {
+          bubbles: true,
+          data: inputText,
+          inputType: "insertText",
+        }),
+      );
+    },
+    { testId: "chat-input", inputText: text },
+  );
+}
+
+test.describe.configure({ mode: "serial" });
+
+test.describe("mock-LLM /model slash command", () => {
+  const conversationIds = new Set<string>();
+
+  test.beforeEach(async ({ page }) => {
+    await seedLocalStorage(page);
+  });
+
+  test.afterEach(async ({ request }) => {
+    for (const id of Array.from(conversationIds)) {
+      try {
+        await deleteConversation(request, id);
+        conversationIds.delete(id);
+      } catch {
+        // best-effort cleanup
+      }
+    }
+  });
+
+  test.afterAll(async ({ request }) => {
+    // Clean up the profile we created and reset mock LLM
+    try {
+      await deleteProfile(request, PROFILE_B_NAME);
+    } catch {
+      // best-effort
+    }
+    try {
+      await resetMockLLM(request);
+    } catch {
+      // best-effort
+    }
+  });
+
+  // ── Step 1: Configure LLM + create switch-target profile + register trajectory
+
+  test("step 1: configure LLM, create switch-target profile, register trajectory", async ({
+    request,
+  }) => {
+    // Use the proven ensureMockLLMProfile helper to set agent_settings.llm
+    // so the agent-server can reach the mock LLM. This is the same approach
+    // used by mock-llm-conversation.spec.ts.
+    await ensureMockLLMProfile(request);
+
+    // Create profile B as the switch target — it has a different model name
+    // but the same mock LLM base_url so post-switch inference still works.
+    await saveProfile(request, PROFILE_B_NAME, MODEL_B);
+
+    // Verify profile B was created
+    const profilesResp = await request.get(`${BACKEND_URL}/api/profiles`, {
+      headers: { "X-Session-API-Key": SESSION_API_KEY },
+    });
+    expect(profilesResp.ok()).toBe(true);
+    const profiles = await profilesResp.json();
+    const profileNames: string[] = profiles.profiles.map(
+      (p: { name: string }) => p.name,
+    );
+    expect(profileNames).toContain(PROFILE_B_NAME);
+
+    // Register a trajectory with THREE entries:
+    //   Turn 0: padding for the agent-server's internal LLM call
+    //           (condenser/skill-analysis) that runs before the main loop
+    //   Turn 1: actual reply to the initial user message
+    //   Turn 2: reply to the post-switch follow-up message
+    await registerTrajectory(request, "model-switch", [
+      { text: "" },
+      { text: INITIAL_REPLY_TOKEN },
+      { text: POST_SWITCH_REPLY_TOKEN },
+    ]);
+    await activateTrajectory(request, "model-switch");
+  });
+
+  // ── Step 2: Conversation + /model switch + post-switch verification ─
+
+  test("step 2: start conversation, switch profile via /model, verify switch", async ({
+    page,
+    request,
+  }) => {
+    test.setTimeout(120_000);
+
+    // Track whether the switch_profile POST was intercepted.
+    let switchProfileCalled = false;
+    let switchProfileBody: Record<string, unknown> | null = null;
+    page.on("request", (req) => {
+      const url = new URL(req.url());
+      // The switch_profile endpoint is POST /api/conversations/{id}/switch_profile
+      if (
+        req.method() === "POST" &&
+        url.pathname.match(/\/api\/conversations\/[^/]+\/switch_profile/)
+      ) {
+        switchProfileCalled = true;
+        try {
+          switchProfileBody = req.postDataJSON();
+        } catch {
+          // non-JSON body
+        }
+      }
+    });
+
+    await routeSessionApiKey(page);
+    await page.goto("/", { waitUntil: "domcontentloaded" });
+    await dismissAnalyticsModal(page);
+    await waitForTestId(page, "home-chat-launcher");
+
+    // ── Send initial message and wait for agent reply ──
+
+    await test.step("send initial message", async () => {
+      await setChatInput(page, "Hello, please respond briefly.");
+      await page.getByTestId("submit-button").click();
+      await waitForPath(page, /\/conversations\/.+/, 30_000);
+    });
+
+    const conversationId = getConversationIdFromURL(page);
+    conversationIds.add(conversationId);
+
+    await test.step("wait for initial agent reply", async () => {
+      await waitForNonUserMessageText(page, INITIAL_REPLY_TOKEN, 30_000);
+    });
+
+    // ── Type /model <profile-B> to switch ──
+
+    await test.step("type /model command to switch to profile B", async () => {
+      // Wait for the chat input to be available (it should be focused
+      // after the agent reply completes).
+      await waitForTestId(page, "chat-input");
+
+      await setChatInput(page, `/model ${PROFILE_B_NAME}`);
+      await page.getByTestId("submit-button").click();
+    });
+
+    // ── Verify: "Switched to profile" message appears in chat UI ──
+
+    await test.step("verify 'Switched to profile' message in chat", async () => {
+      // The ModelMessages component renders with data-testid="model-messages"
+      // and contains "Switched to profile <name>"
+      await waitForNonUserMessageText(page, PROFILE_B_NAME, 15_000);
+
+      // Also verify the specific model-messages container is rendered
+      const modelMessages = page.getByTestId("model-messages");
+      await expect(modelMessages.first()).toBeVisible({ timeout: 5_000 });
+    });
+
+    // ── Verify: the switch_profile POST was made ──
+
+    await test.step("verify switch_profile API was called", async () => {
+      expect(
+        switchProfileCalled,
+        "POST /switch_profile should have been called",
+      ).toBe(true);
+      // The body should contain the profile name
+      expect(switchProfileBody).toBeTruthy();
+      const bodyStr = JSON.stringify(switchProfileBody);
+      expect(
+        bodyStr.includes(PROFILE_B_NAME),
+        `switch_profile body should contain "${PROFILE_B_NAME}", got: ${bodyStr}`,
+      ).toBe(true);
+    });
+
+    // ── Send a follow-up message to verify conversation still works ──
+
+    await test.step("send follow-up message after switch", async () => {
+      await setChatInput(page, "Confirm the model switch worked.");
+      await page.getByTestId("submit-button").click();
+    });
+
+    await test.step("verify post-switch agent reply", async () => {
+      await waitForNonUserMessageText(page, POST_SWITCH_REPLY_TOKEN, 30_000);
+    });
+
+    // ── Verify: no error banners ──
+
+    await test.step("verify no error banners", async () => {
+      const errorBanner = page.getByTestId("error-message-banner");
+      await expect(errorBanner).not.toBeVisible({ timeout: 2_000 });
+    });
+  });
+});

From 7f8b45474973dd0b71a039b7be2781349ccdb0b2 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 21:13:09 +0000
Subject: [PATCH 2/6] fix(ci): prevent main-branch Docker E2E from posting
 comments to PRs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the Docker workflow completes on main, it triggers
mock-llm-docker-e2e.yml via workflow_run. GitHub's API can populate
workflow_run.pull_requests[] with unrelated open PRs (stale association
from shared commit ancestry). This caused main-branch Docker E2E
failures to post ❌ comments on random PRs.

Fix: skip PR number extraction when the triggering workflow ran on
main/master. Main-branch runs still execute (validating the published
image) but no longer post misleading comments to PRs. PR-specific runs
via the pull_request trigger are unaffected.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/mock-llm-docker-e2e.yml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/mock-llm-docker-e2e.yml b/.github/workflows/mock-llm-docker-e2e.yml
index 1915e629..424b30f2 100644
--- a/.github/workflows/mock-llm-docker-e2e.yml
+++ b/.github/workflows/mock-llm-docker-e2e.yml
@@ -62,8 +62,17 @@ jobs:
           if [ "${{ github.event_name }}" = "workflow_run" ]; then
             echo "sha=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
             echo "ref=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
-            PR_NUMBER=$(echo '${{ toJSON(github.event.workflow_run.pull_requests) }}' \
-              | jq -r '.[0].number // empty')
+            # Only associate with a PR when the triggering Docker workflow
+            # ran on a PR branch.  Main-branch workflow_run events can have
+            # a stale pull_requests[] entry that points at an unrelated PR,
+            # causing failure comments to land on the wrong PR.
+            HEAD_BRANCH="${{ github.event.workflow_run.head_branch }}"
+            if [ "$HEAD_BRANCH" = "main" ] || [ "$HEAD_BRANCH" = "master" ]; then
+              PR_NUMBER=""
+            else
+              PR_NUMBER=$(echo '${{ toJSON(github.event.workflow_run.pull_requests) }}' \
+                | jq -r '.[0].number // empty')
+            fi
             echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
           elif [ "${{ github.event_name }}" = "pull_request" ]; then
             echo "sha=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"

From 35d8db8d047af850a1effc5b6a93a2045e51cafb Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 21:27:45 +0000
Subject: [PATCH 3/6] refactor: address review comments on mock-LLM
 model-switch test

- Import APIRequestContext from top-level @playwright/test import
  instead of inline import() type references (review #1)
- Extract setChatInput to shared mock-llm-helpers.ts and reuse it
  in both mock-llm-conversation and mock-llm-model-switch (review #2)
- Add upstream reference for the padding trajectory entry coupling
  to CondensationMixin (review #3)
- Use direct field assertion on switchProfileBody.profile_name
  instead of loose JSON.stringify substring match (review #4)
- Remove redundant toBeVisible check after waitForNonUserMessageText
  already polls model-messages elements (review #5)

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../mock-llm/mock-llm-conversation.spec.ts    | 20 +-----
 .../mock-llm/mock-llm-model-switch.spec.ts    | 68 +++++--------------
 tests/e2e/mock-llm/utils/mock-llm-helpers.ts  | 26 +++++++
 3 files changed, 46 insertions(+), 68 deletions(-)

diff --git a/tests/e2e/mock-llm/mock-llm-conversation.spec.ts b/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
index 87cfcb37..46116f14 100644
--- a/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
+++ b/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
@@ -37,6 +37,7 @@ import {
   waitForSuccessfulBashObservation,
   deleteConversation,
   resetMockLLM,
+  setChatInput,
 } from "./utils/mock-llm-helpers";
 
 const PROFILE_NAME = "mock-llm-e2e";
@@ -291,24 +292,7 @@ test.describe("mock-LLM agent-server conversation", () => {
     // Keeping the tokens out of the user bubble lets us assert they appear
     // *only* in agent output.
 
-    // Set contenteditable text via evaluate (contentEditable divs don't
-    // respond reliably to Playwright's .fill() or .type()).
-    await page.evaluate(
-      ({ testId, text }) => {
-        const el = document.querySelector(`[data-testid="${testId}"]`);
-        if (!(el instanceof HTMLElement)) throw new Error("Chat input not found");
-        el.focus();
-        el.textContent = text;
-        el.dispatchEvent(
-          new InputEvent("input", {
-            bubbles: true,
-            data: text,
-            inputType: "insertText",
-          }),
-        );
-      },
-      { testId: "chat-input", text: USER_MESSAGE },
-    );
+    await setChatInput(page, USER_MESSAGE);
 
     // Click the submit button — this triggers conversation creation
     await page.getByTestId("submit-button").click();
diff --git a/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
index e0698fe7..5f9df140 100644
--- a/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
+++ b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
@@ -17,7 +17,7 @@
  *      working under the new profile.
  */
 
-import { test, expect } from "@playwright/test";
+import { test, expect, type APIRequestContext } from "@playwright/test";
 import {
   BACKEND_URL,
   SESSION_API_KEY,
@@ -34,6 +34,7 @@ import {
   activateTrajectory,
   resetMockLLM,
   ensureMockLLMProfile,
+  setChatInput,
 } from "./utils/mock-llm-helpers";
 
 /** Profile B is the switch target — created via the profiles API. */
@@ -45,11 +46,10 @@ const POST_SWITCH_REPLY_TOKEN = "MODEL_SWITCH_POST_SWITCH_REPLY_OK";
 
 /** Create a named LLM profile via the agent-server profiles API. */
 async function saveProfile(
-  request: import("@playwright/test").APIRequestContext,
+  request: APIRequestContext,
   name: string,
   model: string,
 ) {
-  // ProfilesClient.saveProfile() uses POST /api/profiles/{name}
   const resp = await request.post(
     `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
     {
@@ -73,45 +73,13 @@ async function saveProfile(
 }
 
 /** Delete a profile (best-effort cleanup). */
-async function deleteProfile(
-  request: import("@playwright/test").APIRequestContext,
-  name: string,
-) {
+async function deleteProfile(request: APIRequestContext, name: string) {
   await request.delete(
     `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
     { headers: { "X-Session-API-Key": SESSION_API_KEY } },
   );
 }
 
-/**
- * Set contentEditable chat input text and dispatch an input event.
- *
- * contentEditable divs don't respond reliably to Playwright's .fill() or
- * .type(), so we set the text programmatically via evaluate.
- */
-async function setChatInput(
-  page: import("@playwright/test").Page,
-  text: string,
-) {
-  await page.evaluate(
-    ({ testId, inputText }) => {
-      const el = document.querySelector(`[data-testid="${testId}"]`);
-      if (!(el instanceof HTMLElement))
-        throw new Error("Chat input not found");
-      el.focus();
-      el.textContent = inputText;
-      el.dispatchEvent(
-        new InputEvent("input", {
-          bubbles: true,
-          data: inputText,
-          inputType: "insertText",
-        }),
-      );
-    },
-    { testId: "chat-input", inputText: text },
-  );
-}
-
 test.describe.configure({ mode: "serial" });
 
 test.describe("mock-LLM /model slash command", () => {
@@ -172,12 +140,17 @@ test.describe("mock-LLM /model slash command", () => {
     expect(profileNames).toContain(PROFILE_B_NAME);
 
     // Register a trajectory with THREE entries:
-    //   Turn 0: padding for the agent-server's internal LLM call
-    //           (condenser/skill-analysis) that runs before the main loop
+    //   Turn 0: padding — the agent-server makes an internal LLM call
+    //           (condenser/skill-analysis) before the agent's main loop.
+    //           This consumes one trajectory response.  If the SDK removes
+    //           that internal call, this padding entry will cause an
+    //           off-by-one; delete it at that point.
+    //           Ref: same pattern in mock-llm-automation.spec.ts;
+    //           upstream SDK code: openhands-sdk CondensationMixin.
     //   Turn 1: actual reply to the initial user message
     //   Turn 2: reply to the post-switch follow-up message
     await registerTrajectory(request, "model-switch", [
-      { text: "" },
+      { text: "" }, // padding for internal LLM call (see comment above)
       { text: INITIAL_REPLY_TOKEN },
       { text: POST_SWITCH_REPLY_TOKEN },
     ]);
@@ -245,13 +218,9 @@ test.describe("mock-LLM /model slash command", () => {
     // ── Verify: "Switched to profile" message appears in chat UI ──
 
     await test.step("verify 'Switched to profile' message in chat", async () => {
-      // The ModelMessages component renders with data-testid="model-messages"
-      // and contains "Switched to profile <name>"
+      // waitForNonUserMessageText already polls data-testid="model-messages"
+      // elements, so a successful wait proves the container is visible.
       await waitForNonUserMessageText(page, PROFILE_B_NAME, 15_000);
-
-      // Also verify the specific model-messages container is rendered
-      const modelMessages = page.getByTestId("model-messages");
-      await expect(modelMessages.first()).toBeVisible({ timeout: 5_000 });
     });
 
     // ── Verify: the switch_profile POST was made ──
@@ -261,13 +230,12 @@ test.describe("mock-LLM /model slash command", () => {
         switchProfileCalled,
         "POST /switch_profile should have been called",
       ).toBe(true);
-      // The body should contain the profile name
       expect(switchProfileBody).toBeTruthy();
-      const bodyStr = JSON.stringify(switchProfileBody);
+      // The switch_profile API uses { profile_name: "..." }
       expect(
-        bodyStr.includes(PROFILE_B_NAME),
-        `switch_profile body should contain "${PROFILE_B_NAME}", got: ${bodyStr}`,
-      ).toBe(true);
+        (switchProfileBody as Record<string, unknown>)?.profile_name,
+        `switch_profile body.profile_name should be "${PROFILE_B_NAME}"`,
+      ).toBe(PROFILE_B_NAME);
     });
 
     // ── Send a follow-up message to verify conversation still works ──
diff --git a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
index 03206c10..8a413f6f 100644
--- a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
+++ b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
@@ -380,5 +380,31 @@ export async function resetMockLLM(request: APIRequestContext) {
   expect(resp.ok(), `Reset mock LLM: ${resp.status()}`).toBe(true);
 }
 
+/**
+ * Set contentEditable chat input text and dispatch an input event.
+ *
+ * contentEditable divs don't respond reliably to Playwright's .fill() or
+ * .type(), so we set the text programmatically via page.evaluate().
+ */
+export async function setChatInput(page: Page, text: string) {
+  await page.evaluate(
+    ({ testId, inputText }) => {
+      const el = document.querySelector(`[data-testid="${testId}"]`);
+      if (!(el instanceof HTMLElement))
+        throw new Error("Chat input not found");
+      el.focus();
+      el.textContent = inputText;
+      el.dispatchEvent(
+        new InputEvent("input", {
+          bubbles: true,
+          data: inputText,
+          inputType: "insertText",
+        }),
+      );
+    },
+    { testId: "chat-input", inputText: text },
+  );
+}
+
 // Mock automation helpers removed — the automation test now hits the real
 // automation backend running inside the bin/agent-canvas.mjs stack.

From 1ab0495419b07f32e813c801d2798d86422125e2 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 21:41:53 +0000
Subject: [PATCH 4/6] fix(ci): deduplicate Docker E2E workflow_run triggers +
 add retry

Two Docker E2E reliability fixes:

1. Concurrency group: workflow_run triggers used the unique
   workflow_run.id as the group key, so multiple Docker builds
   completing on main each spawned their own E2E run (they never
   cancelled each other). Changed to key by the triggering
   workflow's head_branch, so main-branch workflow_run triggers
   share the group 'wr-main' and only the latest run survives.
   pull_request triggers still key by PR number (unchanged).

2. Retry: Docker Playwright config now uses retries:1 in CI to
   handle transient container startup ECONNREFUSED failures. The
   webServer health-check confirms the stack is up, but occasional
   races between container readiness and the first test request
   can still cause failures.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/mock-llm-docker-e2e.yml | 37 ++++++++++++++---------
 playwright.mock-llm-docker.config.ts      |  5 ++-
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/mock-llm-docker-e2e.yml b/.github/workflows/mock-llm-docker-e2e.yml
index 424b30f2..03348ad4 100644
--- a/.github/workflows/mock-llm-docker-e2e.yml
+++ b/.github/workflows/mock-llm-docker-e2e.yml
@@ -25,8 +25,18 @@ on:
         type: string
         default: ""
 
+# Concurrency: deduplicate runs for the same logical trigger.
+# - pull_request: keyed by PR number (pushes to the same PR cancel earlier runs)
+# - workflow_run: keyed by the triggering workflow's branch (e.g. "wr-main"),
+#   so multiple Docker builds completing on main don't pile up separate E2E runs
+# - workflow_dispatch / fallback: keyed by ref
 concurrency:
-  group: mock-llm-docker-e2e-${{ github.event.workflow_run.id || github.event.pull_request.number || github.ref }}
+  group: >-
+    mock-llm-docker-e2e-${{
+      github.event.pull_request.number ||
+      (github.event.workflow_run.id && format('wr-{0}', github.event.workflow_run.head_branch)) ||
+      github.ref
+    }}
   cancel-in-progress: true
 
 permissions:
@@ -37,13 +47,18 @@ permissions:
 
 jobs:
   mock-llm-docker-e2e:
-    # workflow_run: only run if the Docker build succeeded.
+    # workflow_run: only for main/master — validates the published image.
+    #   PR branches are already covered by the pull_request trigger below;
+    #   without this guard, both triggers fire for e2e-tests PRs, producing
+    #   duplicate (and potentially contradictory) comment pairs.
     # pull_request: only run with the 'e2e-tests' label, skip fork PRs (no GHCR push).
     # workflow_dispatch: always run.
     if: >-
       github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'workflow_run' &&
-       github.event.workflow_run.conclusion == 'success') ||
+       github.event.workflow_run.conclusion == 'success' &&
+       (github.event.workflow_run.head_branch == 'main' ||
+        github.event.workflow_run.head_branch == 'master')) ||
       (github.event_name == 'pull_request' &&
        contains(github.event.pull_request.labels.*.name, 'e2e-tests') &&
        !github.event.pull_request.head.repo.fork)
@@ -60,20 +75,12 @@ jobs:
         id: ctx
         run: |
           if [ "${{ github.event_name }}" = "workflow_run" ]; then
+            # workflow_run only fires for main (job-level `if` guard), so
+            # this path always tests the main-branch Docker image. Never
+            # post PR comments — results go to the step summary only.
             echo "sha=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
             echo "ref=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
-            # Only associate with a PR when the triggering Docker workflow
-            # ran on a PR branch.  Main-branch workflow_run events can have
-            # a stale pull_requests[] entry that points at an unrelated PR,
-            # causing failure comments to land on the wrong PR.
-            HEAD_BRANCH="${{ github.event.workflow_run.head_branch }}"
-            if [ "$HEAD_BRANCH" = "main" ] || [ "$HEAD_BRANCH" = "master" ]; then
-              PR_NUMBER=""
-            else
-              PR_NUMBER=$(echo '${{ toJSON(github.event.workflow_run.pull_requests) }}' \
-                | jq -r '.[0].number // empty')
-            fi
-            echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
+            echo "pr_number=" >> "$GITHUB_OUTPUT"
           elif [ "${{ github.event_name }}" = "pull_request" ]; then
             echo "sha=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
             echo "ref=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
diff --git a/playwright.mock-llm-docker.config.ts b/playwright.mock-llm-docker.config.ts
index a7c4e682..e1fad78a 100644
--- a/playwright.mock-llm-docker.config.ts
+++ b/playwright.mock-llm-docker.config.ts
@@ -86,7 +86,10 @@ export default defineConfig({
   testMatch: /.*\.spec\.ts/,
   fullyParallel: false,
   forbidOnly: !!process.env.CI,
-  retries: 0,
+  // One retry for transient Docker container startup failures (ECONNREFUSED).
+  // The container health-check (webServer.url) confirms the stack is up, but
+  // occasional races can still cause the first request to fail.
+  retries: process.env.CI ? 1 : 0,
   workers: 1,
   timeout: 60_000,
   globalTimeout: process.env.CI ? 600_000 : 0, // 10 min hard cap in CI

From ba7b2aebaa93afe4503b63339b0098fc9c6d5dc3 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 22:13:52 +0000
Subject: [PATCH 5/6] refactor: address second round of review comments

- Make saveProfile idempotent with best-effort delete-first so stale
  profiles from crashed CI runs don't cause persistent 409 failures
- Increase switch-confirmation timeout from 15s to 30s for consistency
  with all other waitForNonUserMessageText calls in this test
- Add waitForTestId guard before post-switch setChatInput to handle
  potential UI disable during profile switch settling

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/e2e/mock-llm/mock-llm-model-switch.spec.ts | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
index 5f9df140..ab3a54b4 100644
--- a/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
+++ b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
@@ -44,12 +44,21 @@ const MODEL_B = "openai/mock-model-beta";
 const INITIAL_REPLY_TOKEN = "MODEL_SWITCH_INITIAL_REPLY_OK";
 const POST_SWITCH_REPLY_TOKEN = "MODEL_SWITCH_POST_SWITCH_REPLY_OK";
 
-/** Create a named LLM profile via the agent-server profiles API. */
+/**
+ * Create (or overwrite) a named LLM profile via the agent-server profiles API.
+ * Deletes first so setup is idempotent across re-runs — if a previous test
+ * crashed before afterAll cleanup, the stale profile won't cause a 409.
+ */
 async function saveProfile(
   request: APIRequestContext,
   name: string,
   model: string,
 ) {
+  // Best-effort delete so a leftover profile doesn't block creation.
+  await request.delete(
+    `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
+    { headers: { "X-Session-API-Key": SESSION_API_KEY } },
+  );
   const resp = await request.post(
     `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
     {
@@ -220,7 +229,7 @@ test.describe("mock-LLM /model slash command", () => {
     await test.step("verify 'Switched to profile' message in chat", async () => {
       // waitForNonUserMessageText already polls data-testid="model-messages"
       // elements, so a successful wait proves the container is visible.
-      await waitForNonUserMessageText(page, PROFILE_B_NAME, 15_000);
+      await waitForNonUserMessageText(page, PROFILE_B_NAME, 30_000);
     });
 
     // ── Verify: the switch_profile POST was made ──
@@ -241,6 +250,9 @@ test.describe("mock-LLM /model slash command", () => {
     // ── Send a follow-up message to verify conversation still works ──
 
     await test.step("send follow-up message after switch", async () => {
+      // Wait for the chat input to be ready — the UI may briefly disable
+      // it while the profile switch settles.
+      await waitForTestId(page, "chat-input");
       await setChatInput(page, "Confirm the model switch worked.");
       await page.getByTestId("submit-button").click();
     });

From b5562647de4143341d67eb1da958cde6466ecf37 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 22:28:03 +0000
Subject: [PATCH 6/6] =?UTF-8?q?refactor:=20minor=20nits=20=E2=80=94=20non-?=
 =?UTF-8?q?null=20assertion=20+=20testId=20in=20error=20message?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Use switchProfileBody!.profile_name after toBeTruthy guard instead
  of optional chain (the assertion guarantees non-null)
- Include testId in setChatInput error message for easier diagnosis;
  accept optional testId parameter for future reuse

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/e2e/mock-llm/mock-llm-model-switch.spec.ts |  2 +-
 tests/e2e/mock-llm/utils/mock-llm-helpers.ts     | 14 +++++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
index ab3a54b4..8d090a5d 100644
--- a/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
+++ b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
@@ -242,7 +242,7 @@ test.describe("mock-LLM /model slash command", () => {
       expect(switchProfileBody).toBeTruthy();
       // The switch_profile API uses { profile_name: "..." }
       expect(
-        (switchProfileBody as Record<string, unknown>)?.profile_name,
+        switchProfileBody!.profile_name,
         `switch_profile body.profile_name should be "${PROFILE_B_NAME}"`,
       ).toBe(PROFILE_B_NAME);
     });
diff --git a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
index 8a413f6f..d68ef40a 100644
--- a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
+++ b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
@@ -386,12 +386,16 @@ export async function resetMockLLM(request: APIRequestContext) {
  * contentEditable divs don't respond reliably to Playwright's .fill() or
  * .type(), so we set the text programmatically via page.evaluate().
  */
-export async function setChatInput(page: Page, text: string) {
+export async function setChatInput(
+  page: Page,
+  text: string,
+  testId = "chat-input",
+) {
   await page.evaluate(
-    ({ testId, inputText }) => {
-      const el = document.querySelector(`[data-testid="${testId}"]`);
+    ({ tid, inputText }) => {
+      const el = document.querySelector(`[data-testid="${tid}"]`);
       if (!(el instanceof HTMLElement))
-        throw new Error("Chat input not found");
+        throw new Error(`Chat input [data-testid="${tid}"] not found`);
       el.focus();
       el.textContent = inputText;
       el.dispatchEvent(
@@ -402,7 +406,7 @@ export async function setChatInput(page: Page, text: string) {
         }),
       );
     },
-    { testId: "chat-input", inputText: text },
+    { tid: testId, inputText: text },
   );
 }