diff --git a/.github/workflows/mock-llm-docker-e2e.yml b/.github/workflows/mock-llm-docker-e2e.yml index 1915e6293..03348ad41 100644 --- a/.github/workflows/mock-llm-docker-e2e.yml +++ b/.github/workflows/mock-llm-docker-e2e.yml @@ -25,8 +25,18 @@ on: type: string default: "" +# Concurrency: deduplicate runs for the same logical trigger. +# - pull_request: keyed by PR number (pushes to the same PR cancel earlier runs) +# - workflow_run: keyed by the triggering workflow's branch (e.g. "wr-main"), +# so multiple Docker builds completing on main don't pile up separate E2E runs +# - workflow_dispatch / fallback: keyed by ref concurrency: - group: mock-llm-docker-e2e-${{ github.event.workflow_run.id || github.event.pull_request.number || github.ref }} + group: >- + mock-llm-docker-e2e-${{ + github.event.pull_request.number || + (github.event.workflow_run.id && format('wr-{0}', github.event.workflow_run.head_branch)) || + github.ref + }} cancel-in-progress: true permissions: @@ -37,13 +47,18 @@ permissions: jobs: mock-llm-docker-e2e: - # workflow_run: only run if the Docker build succeeded. + # workflow_run: only for main/master — validates the published image. + # PR branches are already covered by the pull_request trigger below; + # without this guard, both triggers fire for e2e-tests PRs, producing + # duplicate (and potentially contradictory) comment pairs. # pull_request: only run with the 'e2e-tests' label, skip fork PRs (no GHCR push). # workflow_dispatch: always run. if: >- github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && - github.event.workflow_run.conclusion == 'success') || + github.event.workflow_run.conclusion == 'success' && + (github.event.workflow_run.head_branch == 'main' || + github.event.workflow_run.head_branch == 'master')) || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'e2e-tests') && !github.event.pull_request.head.repo.fork) @@ -60,11 +75,12 @@ jobs: id: ctx run: | if [ "${{ github.event_name }}" = "workflow_run" ]; then + # workflow_run only fires for main (job-level `if` guard), so + # this path always tests the main-branch Docker image. Never + # post PR comments — results go to the step summary only. echo "sha=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT" echo "ref=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT" - PR_NUMBER=$(echo '${{ toJSON(github.event.workflow_run.pull_requests) }}' \ - | jq -r '.[0].number // empty') - echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT" + echo "pr_number=" >> "$GITHUB_OUTPUT" elif [ "${{ github.event_name }}" = "pull_request" ]; then echo "sha=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT" echo "ref=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT" diff --git a/playwright.mock-llm-docker.config.ts b/playwright.mock-llm-docker.config.ts index a7c4e682e..e1fad78a2 100644 --- a/playwright.mock-llm-docker.config.ts +++ b/playwright.mock-llm-docker.config.ts @@ -86,7 +86,10 @@ export default defineConfig({ testMatch: /.*\.spec\.ts/, fullyParallel: false, forbidOnly: !!process.env.CI, - retries: 0, + // One retry for transient Docker container startup failures (ECONNREFUSED). + // The container health-check (webServer.url) confirms the stack is up, but + // occasional races can still cause the first request to fail. + retries: process.env.CI ? 1 : 0, workers: 1, timeout: 60_000, globalTimeout: process.env.CI ? 600_000 : 0, // 10 min hard cap in CI diff --git a/tests/e2e/mock-llm/mock-llm-conversation.spec.ts b/tests/e2e/mock-llm/mock-llm-conversation.spec.ts index 87cfcb37b..46116f142 100644 --- a/tests/e2e/mock-llm/mock-llm-conversation.spec.ts +++ b/tests/e2e/mock-llm/mock-llm-conversation.spec.ts @@ -37,6 +37,7 @@ import { waitForSuccessfulBashObservation, deleteConversation, resetMockLLM, + setChatInput, } from "./utils/mock-llm-helpers"; const PROFILE_NAME = "mock-llm-e2e"; @@ -291,24 +292,7 @@ test.describe("mock-LLM agent-server conversation", () => { // Keeping the tokens out of the user bubble lets us assert they appear // *only* in agent output. - // Set contenteditable text via evaluate (contentEditable divs don't - // respond reliably to Playwright's .fill() or .type()). - await page.evaluate( - ({ testId, text }) => { - const el = document.querySelector(`[data-testid="${testId}"]`); - if (!(el instanceof HTMLElement)) throw new Error("Chat input not found"); - el.focus(); - el.textContent = text; - el.dispatchEvent( - new InputEvent("input", { - bubbles: true, - data: text, - inputType: "insertText", - }), - ); - }, - { testId: "chat-input", text: USER_MESSAGE }, - ); + await setChatInput(page, USER_MESSAGE); // Click the submit button — this triggers conversation creation await page.getByTestId("submit-button").click(); diff --git a/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts new file mode 100644 index 000000000..8d090a5de --- /dev/null +++ b/tests/e2e/mock-llm/mock-llm-model-switch.spec.ts @@ -0,0 +1,271 @@ +/** + * Mock-LLM E2E test: /model slash command — mid-conversation LLM profile switching. + * + * Exercises the full /model flow end-to-end against the real agent-server: + * + * 1. Setup: configure the active LLM settings via ensureMockLLMProfile + * (the proven pattern), create a named profile B as the switch target + * via the profiles API, and register a trajectory with text replies. + * + * 2. Conversation + switch: start a conversation from the home page, + * wait for the agent to reply, then type `/model ` in the + * chat input. Verify the "Switched to profile" confirmation renders in + * the chat UI. Verify the switch_profile POST was made to the agent-server. + * + * 3. Post-switch verification: send another message after the switch + * and verify the agent responds, proving the conversation continues + * working under the new profile. + */ + +import { test, expect, type APIRequestContext } from "@playwright/test"; +import { + BACKEND_URL, + SESSION_API_KEY, + MOCK_LLM_AGENT_URL, + seedLocalStorage, + routeSessionApiKey, + dismissAnalyticsModal, + waitForTestId, + waitForPath, + getConversationIdFromURL, + waitForNonUserMessageText, + deleteConversation, + registerTrajectory, + activateTrajectory, + resetMockLLM, + ensureMockLLMProfile, + setChatInput, +} from "./utils/mock-llm-helpers"; + +/** Profile B is the switch target — created via the profiles API. */ +const PROFILE_B_NAME = "model-switch-profile-b"; +const MODEL_B = "openai/mock-model-beta"; + +const INITIAL_REPLY_TOKEN = "MODEL_SWITCH_INITIAL_REPLY_OK"; +const POST_SWITCH_REPLY_TOKEN = "MODEL_SWITCH_POST_SWITCH_REPLY_OK"; + +/** + * Create (or overwrite) a named LLM profile via the agent-server profiles API. + * Deletes first so setup is idempotent across re-runs — if a previous test + * crashed before afterAll cleanup, the stale profile won't cause a 409. + */ +async function saveProfile( + request: APIRequestContext, + name: string, + model: string, +) { + // Best-effort delete so a leftover profile doesn't block creation. + await request.delete( + `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`, + { headers: { "X-Session-API-Key": SESSION_API_KEY } }, + ); + const resp = await request.post( + `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`, + { + headers: { + "X-Session-API-Key": SESSION_API_KEY, + "Content-Type": "application/json", + }, + data: { + llm: { + model, + api_key: "mock-api-key-for-testing", + base_url: MOCK_LLM_AGENT_URL, + }, + }, + }, + ); + expect( + resp.ok(), + `POST /api/profiles/${name} returned ${resp.status()}`, + ).toBe(true); +} + +/** Delete a profile (best-effort cleanup). */ +async function deleteProfile(request: APIRequestContext, name: string) { + await request.delete( + `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`, + { headers: { "X-Session-API-Key": SESSION_API_KEY } }, + ); +} + +test.describe.configure({ mode: "serial" }); + +test.describe("mock-LLM /model slash command", () => { + const conversationIds = new Set(); + + test.beforeEach(async ({ page }) => { + await seedLocalStorage(page); + }); + + test.afterEach(async ({ request }) => { + for (const id of Array.from(conversationIds)) { + try { + await deleteConversation(request, id); + conversationIds.delete(id); + } catch { + // best-effort cleanup + } + } + }); + + test.afterAll(async ({ request }) => { + // Clean up the profile we created and reset mock LLM + try { + await deleteProfile(request, PROFILE_B_NAME); + } catch { + // best-effort + } + try { + await resetMockLLM(request); + } catch { + // best-effort + } + }); + + // ── Step 1: Configure LLM + create switch-target profile + register trajectory + + test("step 1: configure LLM, create switch-target profile, register trajectory", async ({ + request, + }) => { + // Use the proven ensureMockLLMProfile helper to set agent_settings.llm + // so the agent-server can reach the mock LLM. This is the same approach + // used by mock-llm-conversation.spec.ts. + await ensureMockLLMProfile(request); + + // Create profile B as the switch target — it has a different model name + // but the same mock LLM base_url so post-switch inference still works. + await saveProfile(request, PROFILE_B_NAME, MODEL_B); + + // Verify profile B was created + const profilesResp = await request.get(`${BACKEND_URL}/api/profiles`, { + headers: { "X-Session-API-Key": SESSION_API_KEY }, + }); + expect(profilesResp.ok()).toBe(true); + const profiles = await profilesResp.json(); + const profileNames: string[] = profiles.profiles.map( + (p: { name: string }) => p.name, + ); + expect(profileNames).toContain(PROFILE_B_NAME); + + // Register a trajectory with THREE entries: + // Turn 0: padding — the agent-server makes an internal LLM call + // (condenser/skill-analysis) before the agent's main loop. + // This consumes one trajectory response. If the SDK removes + // that internal call, this padding entry will cause an + // off-by-one; delete it at that point. + // Ref: same pattern in mock-llm-automation.spec.ts; + // upstream SDK code: openhands-sdk CondensationMixin. + // Turn 1: actual reply to the initial user message + // Turn 2: reply to the post-switch follow-up message + await registerTrajectory(request, "model-switch", [ + { text: "" }, // padding for internal LLM call (see comment above) + { text: INITIAL_REPLY_TOKEN }, + { text: POST_SWITCH_REPLY_TOKEN }, + ]); + await activateTrajectory(request, "model-switch"); + }); + + // ── Step 2: Conversation + /model switch + post-switch verification ─ + + test("step 2: start conversation, switch profile via /model, verify switch", async ({ + page, + request, + }) => { + test.setTimeout(120_000); + + // Track whether the switch_profile POST was intercepted. + let switchProfileCalled = false; + let switchProfileBody: Record | null = null; + page.on("request", (req) => { + const url = new URL(req.url()); + // The switch_profile endpoint is POST /api/conversations/{id}/switch_profile + if ( + req.method() === "POST" && + url.pathname.match(/\/api\/conversations\/[^/]+\/switch_profile/) + ) { + switchProfileCalled = true; + try { + switchProfileBody = req.postDataJSON(); + } catch { + // non-JSON body + } + } + }); + + await routeSessionApiKey(page); + await page.goto("/", { waitUntil: "domcontentloaded" }); + await dismissAnalyticsModal(page); + await waitForTestId(page, "home-chat-launcher"); + + // ── Send initial message and wait for agent reply ── + + await test.step("send initial message", async () => { + await setChatInput(page, "Hello, please respond briefly."); + await page.getByTestId("submit-button").click(); + await waitForPath(page, /\/conversations\/.+/, 30_000); + }); + + const conversationId = getConversationIdFromURL(page); + conversationIds.add(conversationId); + + await test.step("wait for initial agent reply", async () => { + await waitForNonUserMessageText(page, INITIAL_REPLY_TOKEN, 30_000); + }); + + // ── Type /model to switch ── + + await test.step("type /model command to switch to profile B", async () => { + // Wait for the chat input to be available (it should be focused + // after the agent reply completes). + await waitForTestId(page, "chat-input"); + + await setChatInput(page, `/model ${PROFILE_B_NAME}`); + await page.getByTestId("submit-button").click(); + }); + + // ── Verify: "Switched to profile" message appears in chat UI ── + + await test.step("verify 'Switched to profile' message in chat", async () => { + // waitForNonUserMessageText already polls data-testid="model-messages" + // elements, so a successful wait proves the container is visible. + await waitForNonUserMessageText(page, PROFILE_B_NAME, 30_000); + }); + + // ── Verify: the switch_profile POST was made ── + + await test.step("verify switch_profile API was called", async () => { + expect( + switchProfileCalled, + "POST /switch_profile should have been called", + ).toBe(true); + expect(switchProfileBody).toBeTruthy(); + // The switch_profile API uses { profile_name: "..." } + expect( + switchProfileBody!.profile_name, + `switch_profile body.profile_name should be "${PROFILE_B_NAME}"`, + ).toBe(PROFILE_B_NAME); + }); + + // ── Send a follow-up message to verify conversation still works ── + + await test.step("send follow-up message after switch", async () => { + // Wait for the chat input to be ready — the UI may briefly disable + // it while the profile switch settles. + await waitForTestId(page, "chat-input"); + await setChatInput(page, "Confirm the model switch worked."); + await page.getByTestId("submit-button").click(); + }); + + await test.step("verify post-switch agent reply", async () => { + await waitForNonUserMessageText(page, POST_SWITCH_REPLY_TOKEN, 30_000); + }); + + // ── Verify: no error banners ── + + await test.step("verify no error banners", async () => { + const errorBanner = page.getByTestId("error-message-banner"); + await expect(errorBanner).not.toBeVisible({ timeout: 2_000 }); + }); + }); +}); diff --git a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts index 03206c105..d68ef40a6 100644 --- a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts +++ b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts @@ -380,5 +380,35 @@ export async function resetMockLLM(request: APIRequestContext) { expect(resp.ok(), `Reset mock LLM: ${resp.status()}`).toBe(true); } +/** + * Set contentEditable chat input text and dispatch an input event. + * + * contentEditable divs don't respond reliably to Playwright's .fill() or + * .type(), so we set the text programmatically via page.evaluate(). + */ +export async function setChatInput( + page: Page, + text: string, + testId = "chat-input", +) { + await page.evaluate( + ({ tid, inputText }) => { + const el = document.querySelector(`[data-testid="${tid}"]`); + if (!(el instanceof HTMLElement)) + throw new Error(`Chat input [data-testid="${tid}"] not found`); + el.focus(); + el.textContent = inputText; + el.dispatchEvent( + new InputEvent("input", { + bubbles: true, + data: inputText, + inputType: "insertText", + }), + ); + }, + { tid: testId, inputText: text }, + ); +} + // Mock automation helpers removed — the automation test now hits the real // automation backend running inside the bin/agent-canvas.mjs stack.