OpenHands · malhotra5 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/.github/workflows/mock-llm-docker-e2e.yml b/.github/workflows/mock-llm-docker-e2e.yml
@@ -25,8 +25,18 @@ on:
         type: string
         default: ""
 
+# Concurrency: deduplicate runs for the same logical trigger.
+# - pull_request: keyed by PR number (pushes to the same PR cancel earlier runs)
+# - workflow_run: keyed by the triggering workflow's branch (e.g. "wr-main"),
+#   so multiple Docker builds completing on main don't pile up separate E2E runs
+# - workflow_dispatch / fallback: keyed by ref
 concurrency:
-  group: mock-llm-docker-e2e-${{ github.event.workflow_run.id || github.event.pull_request.number || github.ref }}
+  group: >-
+    mock-llm-docker-e2e-${{
+      github.event.pull_request.number ||
+      (github.event.workflow_run.id && format('wr-{0}', github.event.workflow_run.head_branch)) ||
+      github.ref
+    }}
   cancel-in-progress: true
 
 permissions:
@@ -37,13 +47,18 @@ permissions:
 
 jobs:
   mock-llm-docker-e2e:
-    # workflow_run: only run if the Docker build succeeded.
+    # workflow_run: only for main/master — validates the published image.
+    #   PR branches are already covered by the pull_request trigger below;
+    #   without this guard, both triggers fire for e2e-tests PRs, producing
+    #   duplicate (and potentially contradictory) comment pairs.
     # pull_request: only run with the 'e2e-tests' label, skip fork PRs (no GHCR push).
     # workflow_dispatch: always run.
     if: >-
       github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'workflow_run' &&
-       github.event.workflow_run.conclusion == 'success') ||
+       github.event.workflow_run.conclusion == 'success' &&
+       (github.event.workflow_run.head_branch == 'main' ||
+        github.event.workflow_run.head_branch == 'master')) ||
       (github.event_name == 'pull_request' &&
        contains(github.event.pull_request.labels.*.name, 'e2e-tests') &&
        !github.event.pull_request.head.repo.fork)
@@ -60,11 +75,12 @@ jobs:
         id: ctx
         run: |
           if [ "${{ github.event_name }}" = "workflow_run" ]; then
+            # workflow_run only fires for main (job-level `if` guard), so
+            # this path always tests the main-branch Docker image. Never
+            # post PR comments — results go to the step summary only.
             echo "sha=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
             echo "ref=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
-            PR_NUMBER=$(echo '${{ toJSON(github.event.workflow_run.pull_requests) }}' \
-              | jq -r '.[0].number // empty')
-            echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
+            echo "pr_number=" >> "$GITHUB_OUTPUT"
           elif [ "${{ github.event_name }}" = "pull_request" ]; then
             echo "sha=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
             echo "ref=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"

diff --git a/playwright.mock-llm-docker.config.ts b/playwright.mock-llm-docker.config.ts
@@ -86,7 +86,10 @@ export default defineConfig({
   testMatch: /.*\.spec\.ts/,
   fullyParallel: false,
   forbidOnly: !!process.env.CI,
-  retries: 0,
+  // One retry for transient Docker container startup failures (ECONNREFUSED).
+  // The container health-check (webServer.url) confirms the stack is up, but
+  // occasional races can still cause the first request to fail.
+  retries: process.env.CI ? 1 : 0,
   workers: 1,
   timeout: 60_000,
   globalTimeout: process.env.CI ? 600_000 : 0, // 10 min hard cap in CI

diff --git a/tests/e2e/mock-llm/mock-llm-conversation.spec.ts b/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
@@ -37,6 +37,7 @@ import {
   waitForSuccessfulBashObservation,
   deleteConversation,
   resetMockLLM,
+  setChatInput,
 } from "./utils/mock-llm-helpers";
 
 const PROFILE_NAME = "mock-llm-e2e";
@@ -291,24 +292,7 @@ test.describe("mock-LLM agent-server conversation", () => {
     // Keeping the tokens out of the user bubble lets us assert they appear
     // *only* in agent output.
 
-    // Set contenteditable text via evaluate (contentEditable divs don't
-    // respond reliably to Playwright's .fill() or .type()).
-    await page.evaluate(
-      ({ testId, text }) => {
-        const el = document.querySelector(`[data-testid="${testId}"]`);
-        if (!(el instanceof HTMLElement)) throw new Error("Chat input not found");
-        el.focus();
-        el.textContent = text;
-        el.dispatchEvent(
-          new InputEvent("input", {
-            bubbles: true,
-            data: text,
-            inputType: "insertText",
-          }),
-        );
-      },
-      { testId: "chat-input", text: USER_MESSAGE },
-    );
+    await setChatInput(page, USER_MESSAGE);
 
     // Click the submit button — this triggers conversation creation
     await page.getByTestId("submit-button").click();

diff --git a/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
@@ -0,0 +1,271 @@
+/**
+ * Mock-LLM E2E test: /model slash command — mid-conversation LLM profile switching.
+ *
+ * Exercises the full /model flow end-to-end against the real agent-server:
+ *
+ *   1. Setup: configure the active LLM settings via ensureMockLLMProfile
+ *      (the proven pattern), create a named profile B as the switch target
+ *      via the profiles API, and register a trajectory with text replies.
+ *
+ *   2. Conversation + switch: start a conversation from the home page,
+ *      wait for the agent to reply, then type `/model <profile-B>` in the
+ *      chat input. Verify the "Switched to profile" confirmation renders in
+ *      the chat UI. Verify the switch_profile POST was made to the agent-server.
+ *
+ *   3. Post-switch verification: send another message after the switch
+ *      and verify the agent responds, proving the conversation continues
+ *      working under the new profile.
+ */
+
+import { test, expect, type APIRequestContext } from "@playwright/test";
+import {
+  BACKEND_URL,
+  SESSION_API_KEY,
+  MOCK_LLM_AGENT_URL,
+  seedLocalStorage,
+  routeSessionApiKey,
+  dismissAnalyticsModal,
+  waitForTestId,
+  waitForPath,
+  getConversationIdFromURL,
+  waitForNonUserMessageText,
+  deleteConversation,
+  registerTrajectory,
+  activateTrajectory,
+  resetMockLLM,
+  ensureMockLLMProfile,
+  setChatInput,
+} from "./utils/mock-llm-helpers";
+
+/** Profile B is the switch target — created via the profiles API. */
+const PROFILE_B_NAME = "model-switch-profile-b";
+const MODEL_B = "openai/mock-model-beta";
+
+const INITIAL_REPLY_TOKEN = "MODEL_SWITCH_INITIAL_REPLY_OK";
+const POST_SWITCH_REPLY_TOKEN = "MODEL_SWITCH_POST_SWITCH_REPLY_OK";
+
+/**
+ * Create (or overwrite) a named LLM profile via the agent-server profiles API.
+ * Deletes first so setup is idempotent across re-runs — if a previous test
+ * crashed before afterAll cleanup, the stale profile won't cause a 409.
+ */
+async function saveProfile(
+  request: APIRequestContext,
+  name: string,
+  model: string,
+) {
-) {
+import { test, expect, type APIRequestContext, type Page } from "@playwright/test";
-) {
+import { test, expect, type APIRequestContext, type Page } from "@playwright/test";
+  // Best-effort delete so a leftover profile doesn't block creation.
+  await request.delete(
+    `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
+    { headers: { "X-Session-API-Key": SESSION_API_KEY } },
+  );
+  const resp = await request.post(
+    `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
+    {
+      headers: {
+        "X-Session-API-Key": SESSION_API_KEY,
+        "Content-Type": "application/json",
+      },
+      data: {
+        llm: {
+          model,
+          api_key: "mock-api-key-for-testing",
+          base_url: MOCK_LLM_AGENT_URL,
+        },
+      },
+    },
+  );
+  expect(
+    resp.ok(),
+    `POST /api/profiles/${name} returned ${resp.status()}`,
+  ).toBe(true);
+}
+
+/** Delete a profile (best-effort cleanup). */
+async function deleteProfile(request: APIRequestContext, name: string) {
+  await request.delete(
+    `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
+    { headers: { "X-Session-API-Key": SESSION_API_KEY } },
+  );
+}
+
+test.describe.configure({ mode: "serial" });
+
+test.describe("mock-LLM /model slash command", () => {
+  const conversationIds = new Set<string>();
+
+  test.beforeEach(async ({ page }) => {
+    await seedLocalStorage(page);
+  });
+
+  test.afterEach(async ({ request }) => {
+    for (const id of Array.from(conversationIds)) {
+      try {
+        await deleteConversation(request, id);
+        conversationIds.delete(id);
+      } catch {
+        // best-effort cleanup
+      }
+    }
+  });
+
+  test.afterAll(async ({ request }) => {
+    // Clean up the profile we created and reset mock LLM
+    try {
+      await deleteProfile(request, PROFILE_B_NAME);
+    } catch {
+      // best-effort
+    }
+    try {
+      await resetMockLLM(request);
+    } catch {
+      // best-effort
+    }
+  });
+
+  // ── Step 1: Configure LLM + create switch-target profile + register trajectory
+
+  test("step 1: configure LLM, create switch-target profile, register trajectory", async ({
+    request,
+  }) => {
+    // Use the proven ensureMockLLMProfile helper to set agent_settings.llm
+    // so the agent-server can reach the mock LLM. This is the same approach
+    // used by mock-llm-conversation.spec.ts.
+    await ensureMockLLMProfile(request);
+
+    // Create profile B as the switch target — it has a different model name
+    // but the same mock LLM base_url so post-switch inference still works.
+    await saveProfile(request, PROFILE_B_NAME, MODEL_B);
+
+    // Verify profile B was created
+    const profilesResp = await request.get(`${BACKEND_URL}/api/profiles`, {
+      headers: { "X-Session-API-Key": SESSION_API_KEY },
+    });
+    expect(profilesResp.ok()).toBe(true);
+    const profiles = await profilesResp.json();
+    const profileNames: string[] = profiles.profiles.map(
+      (p: { name: string }) => p.name,
+    );
+    expect(profileNames).toContain(PROFILE_B_NAME);
+
+    // Register a trajectory with THREE entries:
+    //   Turn 0: padding — the agent-server makes an internal LLM call
+    //           (condenser/skill-analysis) before the agent's main loop.
+    //           This consumes one trajectory response.  If the SDK removes
+    //           that internal call, this padding entry will cause an
+    //           off-by-one; delete it at that point.
+    //           Ref: same pattern in mock-llm-automation.spec.ts;
+    //           upstream SDK code: openhands-sdk CondensationMixin.
+    //   Turn 1: actual reply to the initial user message
+    //   Turn 2: reply to the post-switch follow-up message
+    await registerTrajectory(request, "model-switch", [
+      { text: "" }, // padding for internal LLM call (see comment above)
+      { text: INITIAL_REPLY_TOKEN },
+      { text: POST_SWITCH_REPLY_TOKEN },
+    ]);
+    await activateTrajectory(request, "model-switch");
+  });
+
+  // ── Step 2: Conversation + /model switch + post-switch verification ─
+
+  test("step 2: start conversation, switch profile via /model, verify switch", async ({
+    page,
+    request,
+  }) => {
+    test.setTimeout(120_000);
+
+    // Track whether the switch_profile POST was intercepted.
+    let switchProfileCalled = false;
+    let switchProfileBody: Record<string, unknown> | null = null;
+    page.on("request", (req) => {
+      const url = new URL(req.url());
+      // The switch_profile endpoint is POST /api/conversations/{id}/switch_profile
+      if (
+        req.method() === "POST" &&
+        url.pathname.match(/\/api\/conversations\/[^/]+\/switch_profile/)
+      ) {
+        switchProfileCalled = true;
+        try {
+          switchProfileBody = req.postDataJSON();
+        } catch {
+          // non-JSON body
+        }
+      }
+    });
+
+    await routeSessionApiKey(page);
+    await page.goto("/", { waitUntil: "domcontentloaded" });
+    await dismissAnalyticsModal(page);
+    await waitForTestId(page, "home-chat-launcher");
+
+    // ── Send initial message and wait for agent reply ──
+
+    await test.step("send initial message", async () => {
+      await setChatInput(page, "Hello, please respond briefly.");
+      await page.getByTestId("submit-button").click();
+      await waitForPath(page, /\/conversations\/.+/, 30_000);
+    });
+
+    const conversationId = getConversationIdFromURL(page);
+    conversationIds.add(conversationId);
+
+    await test.step("wait for initial agent reply", async () => {
+      await waitForNonUserMessageText(page, INITIAL_REPLY_TOKEN, 30_000);
+    });
+
+    // ── Type /model <profile-B> to switch ──
+
+    await test.step("type /model command to switch to profile B", async () => {
+      // Wait for the chat input to be available (it should be focused
+      // after the agent reply completes).
+      await waitForTestId(page, "chat-input");
+
+      await setChatInput(page, `/model ${PROFILE_B_NAME}`);
+      await page.getByTestId("submit-button").click();
+    });
+
+    // ── Verify: "Switched to profile" message appears in chat UI ──
+
+    await test.step("verify 'Switched to profile' message in chat", async () => {
+      // waitForNonUserMessageText already polls data-testid="model-messages"
+      // elements, so a successful wait proves the container is visible.
+      await waitForNonUserMessageText(page, PROFILE_B_NAME, 30_000);
+    });
+
+    // ── Verify: the switch_profile POST was made ──
+
+    await test.step("verify switch_profile API was called", async () => {
+      expect(
+        switchProfileCalled,
+        "POST /switch_profile should have been called",
+      ).toBe(true);
+      expect(switchProfileBody).toBeTruthy();
+      // The switch_profile API uses { profile_name: "..." }
+      expect(
+        switchProfileBody!.profile_name,
+        `switch_profile body.profile_name should be "${PROFILE_B_NAME}"`,
+      ).toBe(PROFILE_B_NAME);
+    });
+
+    // ── Send a follow-up message to verify conversation still works ──
+
+    await test.step("send follow-up message after switch", async () => {
+      // Wait for the chat input to be ready — the UI may briefly disable
+      // it while the profile switch settles.
+      await waitForTestId(page, "chat-input");
+      await setChatInput(page, "Confirm the model switch worked.");
+      await page.getByTestId("submit-button").click();
+    });
+
+    await test.step("verify post-switch agent reply", async () => {
+      await waitForNonUserMessageText(page, POST_SWITCH_REPLY_TOKEN, 30_000);
+    });
+
+    // ── Verify: no error banners ──
+
+    await test.step("verify no error banners", async () => {
+      const errorBanner = page.getByTestId("error-message-banner");
+      await expect(errorBanner).not.toBeVisible({ timeout: 2_000 });
+    });
+  });
+});