Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions .github/workflows/mock-llm-docker-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,18 @@ on:
type: string
default: ""

# Concurrency: deduplicate runs for the same logical trigger.
# - pull_request: keyed by PR number (pushes to the same PR cancel earlier runs)
# - workflow_run: keyed by the triggering workflow's branch (e.g. "wr-main"),
# so multiple Docker builds completing on main don't pile up separate E2E runs
# - workflow_dispatch / fallback: keyed by ref
concurrency:
group: mock-llm-docker-e2e-${{ github.event.workflow_run.id || github.event.pull_request.number || github.ref }}
group: >-
mock-llm-docker-e2e-${{
github.event.pull_request.number ||
(github.event.workflow_run.id && format('wr-{0}', github.event.workflow_run.head_branch)) ||
github.ref
}}
cancel-in-progress: true

permissions:
Expand All @@ -37,13 +47,18 @@ permissions:

jobs:
mock-llm-docker-e2e:
# workflow_run: only run if the Docker build succeeded.
# workflow_run: only for main/master — validates the published image.
# PR branches are already covered by the pull_request trigger below;
# without this guard, both triggers fire for e2e-tests PRs, producing
# duplicate (and potentially contradictory) comment pairs.
# pull_request: only run with the 'e2e-tests' label, skip fork PRs (no GHCR push).
# workflow_dispatch: always run.
if: >-
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'workflow_run' &&
github.event.workflow_run.conclusion == 'success') ||
github.event.workflow_run.conclusion == 'success' &&
(github.event.workflow_run.head_branch == 'main' ||
github.event.workflow_run.head_branch == 'master')) ||
(github.event_name == 'pull_request' &&
contains(github.event.pull_request.labels.*.name, 'e2e-tests') &&
!github.event.pull_request.head.repo.fork)
Expand All @@ -60,11 +75,12 @@ jobs:
id: ctx
run: |
if [ "${{ github.event_name }}" = "workflow_run" ]; then
# workflow_run only runs for main/master (job-level `if` guard), so
# this path always tests a main/master-branch Docker image. Never
# post PR comments — results go to the step summary only.
echo "sha=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
echo "ref=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
PR_NUMBER=$(echo '${{ toJSON(github.event.workflow_run.pull_requests) }}' \
| jq -r '.[0].number // empty')
echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
echo "pr_number=" >> "$GITHUB_OUTPUT"
elif [ "${{ github.event_name }}" = "pull_request" ]; then
echo "sha=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
echo "ref=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
Expand Down
5 changes: 4 additions & 1 deletion playwright.mock-llm-docker.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,10 @@ export default defineConfig({
testMatch: /.*\.spec\.ts/,
fullyParallel: false,
forbidOnly: !!process.env.CI,
retries: 0,
// One retry for transient Docker container startup failures (ECONNREFUSED).
// The container health-check (webServer.url) confirms the stack is up, but
// occasional races can still cause the first request to fail.
retries: process.env.CI ? 1 : 0,
workers: 1,
timeout: 60_000,
globalTimeout: process.env.CI ? 600_000 : 0, // 10 min hard cap in CI
Expand Down
20 changes: 2 additions & 18 deletions tests/e2e/mock-llm/mock-llm-conversation.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import {
waitForSuccessfulBashObservation,
deleteConversation,
resetMockLLM,
setChatInput,
} from "./utils/mock-llm-helpers";

const PROFILE_NAME = "mock-llm-e2e";
Expand Down Expand Up @@ -291,24 +292,7 @@ test.describe("mock-LLM agent-server conversation", () => {
// Keeping the tokens out of the user bubble lets us assert they appear
// *only* in agent output.

// Set contenteditable text via evaluate (contentEditable divs don't
// respond reliably to Playwright's .fill() or .type()).
await page.evaluate(
({ testId, text }) => {
const el = document.querySelector(`[data-testid="${testId}"]`);
if (!(el instanceof HTMLElement)) throw new Error("Chat input not found");
el.focus();
el.textContent = text;
el.dispatchEvent(
new InputEvent("input", {
bubbles: true,
data: text,
inputType: "insertText",
}),
);
},
{ testId: "chat-input", text: USER_MESSAGE },
);
await setChatInput(page, USER_MESSAGE);

// Click the submit button — this triggers conversation creation
await page.getByTestId("submit-button").click();
Expand Down
271 changes: 271 additions & 0 deletions tests/e2e/mock-llm/mock-llm-model-switch.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
/**
* Mock-LLM E2E test: /model slash command — mid-conversation LLM profile switching.
*
* Exercises the full /model flow end-to-end against the real agent-server:
*
* 1. Setup: configure the active LLM settings via ensureMockLLMProfile
* (the proven pattern), create a named profile B as the switch target
* via the profiles API, and register a trajectory with text replies.
*
* 2. Conversation + switch: start a conversation from the home page,
* wait for the agent to reply, then type `/model <profile-B>` in the
* chat input. Verify the "Switched to profile" confirmation renders in
* the chat UI. Verify the switch_profile POST was made to the agent-server.
*
* 3. Post-switch verification: send another message after the switch
* and verify the agent responds, proving the conversation continues
* working under the new profile.
*/

import { test, expect, type APIRequestContext } from "@playwright/test";
import {
BACKEND_URL,
SESSION_API_KEY,
MOCK_LLM_AGENT_URL,
seedLocalStorage,
routeSessionApiKey,
dismissAnalyticsModal,
waitForTestId,
waitForPath,
getConversationIdFromURL,
waitForNonUserMessageText,
deleteConversation,
registerTrajectory,
activateTrajectory,
resetMockLLM,
ensureMockLLMProfile,
setChatInput,
} from "./utils/mock-llm-helpers";

/** Profile B is the switch target — created via the profiles API. */
const PROFILE_B_NAME = "model-switch-profile-b";
const MODEL_B = "openai/mock-model-beta";

const INITIAL_REPLY_TOKEN = "MODEL_SWITCH_INITIAL_REPLY_OK";
const POST_SWITCH_REPLY_TOKEN = "MODEL_SWITCH_POST_SWITCH_REPLY_OK";

/**
 * Create (or overwrite) a named LLM profile via the agent-server profiles API.
 * Deletes first so setup is idempotent across re-runs — if a previous test
 * crashed before afterAll cleanup, the stale profile won't cause a 409.
 *
 * Issues two calls against `${BACKEND_URL}/api/profiles/<name>`: a DELETE
 * whose response is discarded, then a POST carrying the `llm` settings.
 * Both send the `X-Session-API-Key` header.
 *
 * @param request - Playwright API request context used for both HTTP calls.
 * @param name - Profile name; URL-encoded before being placed in the path.
 * @param model - Model identifier written to the profile's `llm.model` field.
 * @throws Fails the calling test (via `expect`) when the POST response is not OK.
 */
async function saveProfile(
request: APIRequestContext,
name: string,
model: string,
) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Suggestion: Inline import("@playwright/test") type references are verbose and inconsistent with the file's top-level import. Add APIRequestContext and Page to the existing import statement instead:

Suggested change
) {
import { test, expect, type APIRequestContext, type Page } from "@playwright/test";

Then replace all three occurrences:

  • request: import("@playwright/test").APIRequestContextrequest: APIRequestContext
  • page: import("@playwright/test").Pagepage: Page

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done — imported APIRequestContext from top-level @playwright/test import; removed all inline import() type references. setChatInput moved to shared helpers so the Page type import is no longer needed locally either. (commit 35d8db8)

This comment was posted by an AI agent (OpenHands) on behalf of the user.

// Best-effort delete so a leftover profile doesn't block creation.
// The response is discarded, so its status code is never inspected here.
await request.delete(
`${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
{ headers: { "X-Session-API-Key": SESSION_API_KEY } },
);
// Create the profile under the same URL-encoded name.
const resp = await request.post(
`${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`,
{
headers: {
"X-Session-API-Key": SESSION_API_KEY,
"Content-Type": "application/json",
},
data: {
llm: {
model,
// Placeholder key; base_url points the profile at the mock LLM endpoint.
api_key: "mock-api-key-for-testing",
base_url: MOCK_LLM_AGENT_URL,
},
},
},
);
// Include the HTTP status in the assertion message so a failure is diagnosable.
expect(
resp.ok(),
Comment thread
malhotra5 marked this conversation as resolved.
`POST /api/profiles/${name} returned ${resp.status()}`,
).toBe(true);
}

/**
 * Best-effort removal of a named profile.
 *
 * Sends a DELETE to the profiles endpoint for `name` (URL-encoded) with the
 * session API key header. The response is not inspected.
 */
async function deleteProfile(request: APIRequestContext, name: string) {
  const profileUrl = `${BACKEND_URL}/api/profiles/${encodeURIComponent(name)}`;
  const headers = { "X-Session-API-Key": SESSION_API_KEY };
  await request.delete(profileUrl, { headers });
}

test.describe.configure({ mode: "serial" });

test.describe("mock-LLM /model slash command", () => {
// Ids of conversations created by tests in this suite that still need deleting.
const conversationIds = new Set<string>();

// Run the shared seedLocalStorage helper against the page before every test.
test.beforeEach(async ({ page }) => {
  await seedLocalStorage(page);
});

// After every test, attempt to delete each tracked conversation. An id is
// dropped from the set only once its deletion succeeded; ids whose deletion
// threw stay tracked and are attempted again after the next test.
test.afterEach(async ({ request }) => {
  // Iterate over a snapshot so removing entries cannot disturb the loop.
  const pendingIds = [...conversationIds];
  for (const pendingId of pendingIds) {
    try {
      await deleteConversation(request, pendingId);
    } catch {
      // best-effort cleanup
      continue;
    }
    conversationIds.delete(pendingId);
  }
});

test.afterAll(async ({ request }) => {
  // Suite teardown: remove the profile this suite created, then reset the
  // mock LLM. The steps run in order and independently — a failure in one
  // is swallowed so the other still runs.
  const cleanupSteps = [
    () => deleteProfile(request, PROFILE_B_NAME),
    () => resetMockLLM(request),
  ];
  for (const runCleanup of cleanupSteps) {
    try {
      await runCleanup();
    } catch {
      // best-effort
    }
  }
});

// ── Step 1: Configure LLM + create switch-target profile + register trajectory

// Setup-only test: it talks to the backend through the `request` fixture and
// never opens a page. The profile and trajectory created here are consumed by
// step 2 (this file is configured to run in serial mode).
test("step 1: configure LLM, create switch-target profile, register trajectory", async ({
request,
}) => {
// Use the proven ensureMockLLMProfile helper to set agent_settings.llm
// so the agent-server can reach the mock LLM. This is the same approach
// used by mock-llm-conversation.spec.ts.
await ensureMockLLMProfile(request);

// Create profile B as the switch target — it has a different model name
// but the same mock LLM base_url so post-switch inference still works.
await saveProfile(request, PROFILE_B_NAME, MODEL_B);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟠 Important: The empty { text: "" } padding turn exists to absorb an internal agent-server LLM call (condenser/skill-analysis) that runs before the main loop. This tightly couples the test's trajectory shape to an implementation detail of the agent-server internals.

If that internal call is ever removed, merged, or reordered, this test will fail with a confusing off-by-one trajectory mismatch rather than a clear assertion error. Consider adding a reference to the issue or agent-server code path (e.g. a link to the relevant code or issue number) in the comment so future maintainers know exactly why this padding exists and can find the upstream source.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done — expanded the comment to explain the coupling risk and added a cross-reference to mock-llm-automation.spec.ts (which uses the same pattern) and the upstream SDK's CondensationMixin. (commit 35d8db8)

This comment was posted by an AI agent (OpenHands) on behalf of the user.

// Verify profile B was created
const profilesResp = await request.get(`${BACKEND_URL}/api/profiles`, {
headers: { "X-Session-API-Key": SESSION_API_KEY },
});
expect(profilesResp.ok()).toBe(true);
// The accesses below read the listing as { profiles: [{ name }, ...] };
// the response body is not otherwise validated.
const profiles = await profilesResp.json();
const profileNames: string[] = profiles.profiles.map(
(p: { name: string }) => p.name,
);
expect(profileNames).toContain(PROFILE_B_NAME);

// Register a trajectory with THREE entries:
// Turn 0: padding — the agent-server makes an internal LLM call
// (condenser/skill-analysis) before the agent's main loop.
// This consumes one trajectory response. If the SDK removes
// that internal call, this padding entry will cause an
// off-by-one; delete it at that point.
// Ref: same pattern in mock-llm-automation.spec.ts;
// upstream SDK code: openhands-sdk CondensationMixin.
// Turn 1: actual reply to the initial user message
// Turn 2: reply to the post-switch follow-up message
await registerTrajectory(request, "model-switch", [
{ text: "" }, // padding for internal LLM call (see comment above)
{ text: INITIAL_REPLY_TOKEN },
{ text: POST_SWITCH_REPLY_TOKEN },
]);
// Registration alone is not enough: the trajectory is activated by name
// as a separate call.
await activateTrajectory(request, "model-switch");
});

// ── Step 2: Conversation + /model switch + post-switch verification ─

// Browser-driven test: starts a conversation from the home page, issues
// `/model <profile B>` in the chat input, asserts that the confirmation text
// and the switch_profile POST both occurred, then sends a follow-up message
// to confirm the conversation still produces replies.
// NOTE(review): `request` is destructured but not referenced in this body.
test("step 2: start conversation, switch profile via /model, verify switch", async ({
page,
request,
}) => {
// Raise this test's timeout to 120 s; the waits below each allow up to 30 s.
test.setTimeout(120_000);

// Track whether the switch_profile POST was intercepted.
// The listener is attached before navigation so it observes every request
// the page issues from the first load onward.
let switchProfileCalled = false;
let switchProfileBody: Record<string, unknown> | null = null;
page.on("request", (req) => {
const url = new URL(req.url());
// The switch_profile endpoint is POST /api/conversations/{id}/switch_profile
// NOTE(review): the pattern is unanchored, so any path containing this
// segment sequence matches.
if (
req.method() === "POST" &&
url.pathname.match(/\/api\/conversations\/[^/]+\/switch_profile/)
) {
switchProfileCalled = true;
try {
switchProfileBody = req.postDataJSON();
} catch {
// non-JSON body
}
}
});

await routeSessionApiKey(page);
await page.goto("/", { waitUntil: "domcontentloaded" });
await dismissAnalyticsModal(page);
await waitForTestId(page, "home-chat-launcher");

// ── Send initial message and wait for agent reply ──

await test.step("send initial message", async () => {
await setChatInput(page, "Hello, please respond briefly.");
await page.getByTestId("submit-button").click();
// Submitting is expected to navigate to /conversations/<id>.
await waitForPath(page, /\/conversations\/.+/, 30_000);
});

// Record the id so the afterEach hook can delete this conversation.
const conversationId = getConversationIdFromURL(page);
conversationIds.add(conversationId);

await test.step("wait for initial agent reply", async () => {
await waitForNonUserMessageText(page, INITIAL_REPLY_TOKEN, 30_000);
});

// ── Type /model <profile-B> to switch ──

await test.step("type /model command to switch to profile B", async () => {
// Wait for the chat input to be available before typing the command.
await waitForTestId(page, "chat-input");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Suggestion: The waitForNonUserMessageText call above already scans [data-testid="model-messages"] elements for PROFILE_B_NAME — if that wait succeeds, the element must already be visible. The subsequent expect(modelMessages.first()).toBeVisible({ timeout: 5_000 }) check is effectively redundant. You can safely drop it, or replace the whole block with a single assertion on the element's text content if you want to keep the container check.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done — removed the redundant toBeVisible check. waitForNonUserMessageText already polls [data-testid="model-messages"] elements, so a successful wait proves the element exists and contains the expected text. (commit 35d8db8)

This comment was posted by an AI agent (OpenHands) on behalf of the user.


await setChatInput(page, `/model ${PROFILE_B_NAME}`);
await page.getByTestId("submit-button").click();
});

// ── Verify: "Switched to profile" message appears in chat UI ──

await test.step("verify 'Switched to profile' message in chat", async () => {
// waitForNonUserMessageText already polls data-testid="model-messages"
// elements, so a successful wait proves the container is visible.
// The wait matches on the profile name itself, not on the literal
// "Switched to profile" wording.
await waitForNonUserMessageText(page, PROFILE_B_NAME, 30_000);
});

// ── Verify: the switch_profile POST was made ──

await test.step("verify switch_profile API was called", async () => {
expect(
switchProfileCalled,
"POST /switch_profile should have been called",
).toBe(true);
expect(switchProfileBody).toBeTruthy();
// The switch_profile API uses { profile_name: "..." }
// The non-null assertion below is backed by the toBeTruthy() check above.
expect(
switchProfileBody!.profile_name,
`switch_profile body.profile_name should be "${PROFILE_B_NAME}"`,
).toBe(PROFILE_B_NAME);
});

// ── Send a follow-up message to verify conversation still works ──

await test.step("send follow-up message after switch", async () => {
// Wait for the chat input to be ready — the UI may briefly disable
// it while the profile switch settles.
await waitForTestId(page, "chat-input");
await setChatInput(page, "Confirm the model switch worked.");
Comment thread
malhotra5 marked this conversation as resolved.
await page.getByTestId("submit-button").click();
});

await test.step("verify post-switch agent reply", async () => {
await waitForNonUserMessageText(page, POST_SWITCH_REPLY_TOKEN, 30_000);
});

// ── Verify: no error banners ──

await test.step("verify no error banners", async () => {
// The assertion is given up to 2 s for the banner to be absent or hidden.
const errorBanner = page.getByTestId("error-message-banner");
await expect(errorBanner).not.toBeVisible({ timeout: 2_000 });
});
});
});
Loading
Loading