From 3a61aea09139f58669e9fdb688678ff00e317c45 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 13:01:40 -0600
Subject: [PATCH 1/6] chore: bump version to 1.0.0-beta.1

---
 README.md         | 4 ++--
 package-lock.json | 4 ++--
 package.json      | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 96073282..906104c2 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ agent-canvas
 ### Option 2: With a Docker Sandbox
 
 ```sh
-docker pull ghcr.io/openhands/agent-canvas:1.0.0-alpha.10
+docker pull ghcr.io/openhands/agent-canvas:1.0.0-beta.1
 
 export PROJECTS_PATH=~/projects  # directory containing your project folders
 
@@ -69,7 +69,7 @@ docker run -it --rm \
   -p 8000:8000 \
   -v ~/.openhands:/home/openhands/.openhands \
   -v ${PROJECTS_PATH}:/projects \
-  ghcr.io/openhands/agent-canvas:1.0.0-alpha.10
+  ghcr.io/openhands/agent-canvas:1.0.0-beta.1
 ```
 
 The agent will be able to access any project under `PROJECTS_PATH`.
diff --git a/package-lock.json b/package-lock.json
index 630524a9..a9b54018 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@openhands/agent-canvas",
-  "version": "1.0.0-alpha.10",
+  "version": "1.0.0-beta.1",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@openhands/agent-canvas",
-      "version": "1.0.0-alpha.10",
+      "version": "1.0.0-beta.1",
       "license": "MIT",
       "dependencies": {
         "@heroui/react": "2.8.10",
diff --git a/package.json b/package.json
index 3ce4579d..4010a61b 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@openhands/agent-canvas",
-  "version": "1.0.0-alpha.10",
+  "version": "1.0.0-beta.1",
   "description": "Agent Canvas UI for OpenHands - run AI coding agents with a visual interface",
   "license": "MIT",
   "private": false,

From bb02177a5a7c8a132565c9fb322f11ffef13d336 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 13:23:52 -0600
Subject: [PATCH 2/6] chore: publish beta and rc versions as 'latest' dist-tag

---
 .github/workflows/npm-publish.yml | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/npm-publish.yml b/.github/workflows/npm-publish.yml
index bae15a48..54354d1d 100644
--- a/.github/workflows/npm-publish.yml
+++ b/.github/workflows/npm-publish.yml
@@ -92,9 +92,11 @@ jobs:
 
       # Resolve the npm dist-tag from the version's pre-release identifier:
       #   alpha  →  --tag alpha   (e.g. 1.0.0-alpha.1)
-      #   beta   →  --tag beta    (e.g. 1.0.0-beta.1)
-      #   rc     →  --tag rc      (e.g. 1.0.0-rc.1)
+      #   beta   →  --tag latest  (e.g. 1.0.0-beta.1)
+      #   rc     →  --tag latest  (e.g. 1.0.0-rc.1)
       #   stable →  --tag latest  (e.g. 1.0.0)
+      # Beta and RC publish as `latest` so `npm install @openhands/agent-canvas`
+      # resolves to the most recent release until the first stable version ships.
       # Note: OIDC trusted-publishing tokens cover only the `npm publish` call
       # itself; a separate `npm dist-tag add` would fail with E401, so the tag
       # is resolved and passed directly in one step.
@@ -104,10 +106,6 @@ jobs:
           VERSION=$(node -p "require('./package.json').version")
           if [[ "$VERSION" == *-alpha* ]]; then
             DIST_TAG="alpha"
-          elif [[ "$VERSION" == *-beta* ]]; then
-            DIST_TAG="beta"
-          elif [[ "$VERSION" == *-rc* ]]; then
-            DIST_TAG="rc"
           else
             DIST_TAG="latest"
           fi

From 660efffe7f6c675cb09d23cfaf2d0b07dd00b2ed Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 13:29:38 -0600
Subject: [PATCH 3/6] chore: bump version to 1.0.0-beta.2

---
 README.md         | 4 ++--
 package-lock.json | 4 ++--
 package.json      | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 906104c2..4c4daa29 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ agent-canvas
 ### Option 2: With a Docker Sandbox
 
 ```sh
-docker pull ghcr.io/openhands/agent-canvas:1.0.0-beta.1
+docker pull ghcr.io/openhands/agent-canvas:1.0.0-beta.2
 
 export PROJECTS_PATH=~/projects  # directory containing your project folders
 
@@ -69,7 +69,7 @@ docker run -it --rm \
   -p 8000:8000 \
   -v ~/.openhands:/home/openhands/.openhands \
   -v ${PROJECTS_PATH}:/projects \
-  ghcr.io/openhands/agent-canvas:1.0.0-beta.1
+  ghcr.io/openhands/agent-canvas:1.0.0-beta.2
 ```
 
 The agent will be able to access any project under `PROJECTS_PATH`.
diff --git a/package-lock.json b/package-lock.json
index a9b54018..3e1ef92a 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@openhands/agent-canvas",
-  "version": "1.0.0-beta.1",
+  "version": "1.0.0-beta.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@openhands/agent-canvas",
-      "version": "1.0.0-beta.1",
+      "version": "1.0.0-beta.2",
       "license": "MIT",
       "dependencies": {
         "@heroui/react": "2.8.10",
diff --git a/package.json b/package.json
index 4010a61b..ef48b205 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@openhands/agent-canvas",
-  "version": "1.0.0-beta.1",
+  "version": "1.0.0-beta.2",
   "description": "Agent Canvas UI for OpenHands - run AI coding agents with a visual interface",
   "license": "MIT",
   "private": false,

From 20894d9baea5b61567b53f305e1aab67d3ef691f Mon Sep 17 00:00:00 2001
From: Tim O'Farrell <tofarr@gmail.com>
Date: Mon, 1 Jun 2026 15:10:30 -0600
Subject: [PATCH 4/6] fix: use X-Session-API-Key for local automation auth in
 prompts and RUNTIME_SERVICES (#999)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #980

The agent prompt in recommended-automations-launcher and the
RUNTIME_SERVICES block in agent-server-adapter both advertised
X-API-Key as the auth header for the local automation backend.
The automation service (openhands-automation) does not accept
X-API-Key — it accepts Authorization: Bearer and X-Session-API-Key.

X-Session-API-Key is the established local convention: the agent
server uses it, the frontend automation API client uses it (with an
explicit comment that both backends share the same header), and
auth.py describes it as matching that convention. Update both call
sites and the corresponding test assertion to use X-Session-API-Key.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 AGENTS.md                                                 | 8 ++++----
 __tests__/api/agent-server-adapter.test.ts                | 3 ++-
 scripts/dev-safe.mjs                                      | 2 +-
 src/api/agent-server-adapter.ts                           | 4 +++-
 .../automations/recommended-automations-launcher.tsx      | 2 +-
 5 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index fcded59e..1717463b 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -25,7 +25,7 @@
 - The block lists URLs **from the agent's point of view**:
   - The Agent Server is always reachable as `http://localhost:<port>` from inside the sandbox — but that is _you_, not the automation backend.
   - Host-side services (ingress, Vite, automation) are reachable as `http://localhost:<port>`.
-- Agents should treat the `<RUNTIME_SERVICES>` block as authoritative: don't hardcode `localhost:8000` for "the automation server", and don't probe random ports trying to discover services. If the block says automation is not running, skip `/api/automation` calls; otherwise use the listed `url_from_agent` + `api_prefix` (default `/api/automation`) and the `X-API-Key: $OPENHANDS_AUTOMATION_API_KEY` header.
+- Agents should treat the `<RUNTIME_SERVICES>` block as authoritative: don't hardcode `localhost:8000` for "the automation server", and don't probe random ports trying to discover services. If the block says automation is not running, skip `/api/automation` calls; otherwise use the listed `url_from_agent` + `api_prefix` (default `/api/automation`) and the `X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY` header.
 - The launcher → frontend → suffix plumbing is:
   - `scripts/dev-safe.mjs::buildRuntimeServicesInfo()` — pure helper that constructs the info object.
   - `scripts/dev-with-automation.mjs::buildAutomationRuntimeServicesInfo()` — wraps it with automation details; called from both Vite spawn (`startVite`) and the static build (`static-build.mjs`).
@@ -53,7 +53,7 @@ The env var is a JSON string of:
       "url_from_agent": "http://localhost:3001"
     },
     "automation": {
-      "description": "OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.",
+      "description": "OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.",
       "url_from_agent": "http://localhost:18001",
       "api_prefix": "/api/automation",
       "docs_url": "http://localhost:18001/api/automation/docs",
@@ -81,10 +81,10 @@ from your point of view (i.e., as you should curl/fetch them).
 * Frontend: http://localhost:3001
     Vite dev server hosting the agent-canvas frontend.
 * Automation backend: http://localhost:18001
-    OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.
+    OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.
     Docs:    http://localhost:18001/api/automation/docs
     OpenAPI: http://localhost:18001/api/automation/openapi.json
-    Auth:    header 'X-API-Key: $OPENHANDS_AUTOMATION_API_KEY'
+    Auth:    header 'X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY'
 
 Trust this block over guessing: do not assume any other URLs are running.
 In particular, http://localhost:18000 inside your sandbox is the Agent Server
diff --git a/__tests__/api/agent-server-adapter.test.ts b/__tests__/api/agent-server-adapter.test.ts
index c82028b5..f628d539 100644
--- a/__tests__/api/agent-server-adapter.test.ts
+++ b/__tests__/api/agent-server-adapter.test.ts
@@ -795,7 +795,8 @@ describe("buildRuntimeServicesSystemSuffix", () => {
     expect(suffix).toContain("http://localhost:18000");
     expect(suffix).toContain("http://localhost:18001");
     expect(suffix).toContain("http://localhost:18001/api/automation/docs");
-    expect(suffix).toContain("X-API-Key: $OPENHANDS_AUTOMATION_API_KEY");
+    expect(suffix).toContain("X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY");
+    expect(suffix).not.toContain("X-API-Key: $OPENHANDS_AUTOMATION_API_KEY");
     expect(suffix).toContain("</RUNTIME_SERVICES>");
     // The "don't guess" line should reference the actual agent-server URL
     // for this stack, not a hardcoded port. The assertion anchors on the URL
diff --git a/scripts/dev-safe.mjs b/scripts/dev-safe.mjs
index 322a63d1..c85d40fc 100644
--- a/scripts/dev-safe.mjs
+++ b/scripts/dev-safe.mjs
@@ -782,7 +782,7 @@ export function buildRuntimeServicesInfo(options) {
       description:
         "OpenHands Automations service. All routes are mounted under " +
         `'${apiPrefix}'. Authenticate with header ` +
-        `'X-API-Key: $${authEnvVar}'.`,
+        `'X-Session-API-Key: $${authEnvVar}'.`,
       url_from_agent: baseUrl,
       api_prefix: apiPrefix,
       docs_url: `${baseUrl}${apiPrefix}/docs`,
diff --git a/src/api/agent-server-adapter.ts b/src/api/agent-server-adapter.ts
index 075cf394..d8ee2e36 100644
--- a/src/api/agent-server-adapter.ts
+++ b/src/api/agent-server-adapter.ts
@@ -199,8 +199,10 @@ export function buildRuntimeServicesSystemSuffix(): string | undefined {
       lines.push(`    OpenAPI: ${automation.openapi_url}`);
     }
     if (automation.auth_env_var) {
+      // X-Session-API-Key is the local convention shared by the agent-server
+      // and automation backend (see openhands-automation auth.py).
       lines.push(
-        `    Auth:    header 'X-API-Key: $${automation.auth_env_var}'`,
+        `    Auth:    header 'X-Session-API-Key: $${automation.auth_env_var}'`,
       );
     }
   } else {
diff --git a/src/components/features/automations/recommended-automations-launcher.tsx b/src/components/features/automations/recommended-automations-launcher.tsx
index 5ab90710..9d457d58 100644
--- a/src/components/features/automations/recommended-automations-launcher.tsx
+++ b/src/components/features/automations/recommended-automations-launcher.tsx
@@ -73,7 +73,7 @@ export function buildAutomationPrompt(
     "**Which API to use:** Create this automation using the **local** OpenHands Automations API that is running alongside this agent.",
     "- Read the Automation backend URL from the `<RUNTIME_SERVICES>` block in your system context.",
     "- Endpoint path: `POST /api/automation/v1/preset/prompt`",
-    "- Auth: `X-API-Key: $OPENHANDS_AUTOMATION_API_KEY`",
+    "- Auth: `X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY`",
     "- If no local Automation backend is listed in `<RUNTIME_SERVICES>`, stop and ask me to start the full local automation stack instead of using any remote/cloud automation API.",
   ].join("\n");
 }

From 0f860f5577b98dcc9ffd73304bba8a3e3e69265a Mon Sep 17 00:00:00 2001
From: Rohit Malhotra <rohitvinodmalhotra@gmail.com>
Date: Mon, 1 Jun 2026 15:09:01 -0400
Subject: [PATCH 5/6] feat: reuse mock-LLM E2E tests for Docker image
 validation (#992)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: reuse mock-LLM E2E tests for Docker image validation

Add a Docker-specific Playwright config (playwright.mock-llm-docker.config.ts)
that runs the exact same test specs and helpers against the agent-canvas Docker
image instead of the npm build path (bin/agent-canvas.mjs + uvx).

Key changes:

- Split MOCK_LLM_BASE_URL into two constants in mock-llm-helpers.ts:
  - MOCK_LLM_BASE_URL: always host-local, used by tests for admin API
  - MOCK_LLM_AGENT_URL: env-overridable, used when configuring the LLM
    profile (the URL the agent-server uses for inference). Defaults to
    MOCK_LLM_BASE_URL for backward compatibility with the npm path.

- New playwright.mock-llm-docker.config.ts:
  - Starts the mock LLM server on the host (same as npm path)
  - Runs the Docker container with --network host (Linux CI)
  - Points to the same testDir (tests/e2e/mock-llm/) and specs
  - Separate output dirs to avoid collision with npm path results

- New CI workflow (.github/workflows/mock-llm-docker-e2e.yml):
  - Builds the Docker image from current code (or uses a pre-built image)
  - Runs the same specs against the container
  - Posts PR comment with differentiated report title

- render-mock-llm-report.mjs: accept --title flag for Docker vs npm reports
- npm run test:e2e:mock-llm:docker script added
- .gitignore updated for docker test output dirs

The npm path (test:e2e:mock-llm) is fully backward-compatible — no env var
override needed since MOCK_LLM_AGENT_URL defaults to MOCK_LLM_BASE_URL.

Co-authored-by: openhands <openhands@all-hands.dev>

* refactor: chain Docker E2E off existing Docker CI via workflow_run

Instead of rebuilding the Docker image in the E2E workflow (duplicating
~10-15 min of Docker build time), use workflow_run to trigger automatically
after the existing 'Docker' workflow completes successfully.

The workflow now:
- Triggers on: workflow_run (Docker completed) + workflow_dispatch (manual)
- Derives the image tag from the Docker build's commit SHA
  (ghcr.io/openhands/agent-canvas:sha-<short>-amd64)
- Pulls the already-built image from GHCR — no rebuild needed
- Checks out code at the same SHA as the Docker build
- Extracts PR number from workflow_run.pull_requests[] for comments

Removed: Docker build steps, Buildx setup, build-arg resolution.
All image building stays in docker.yml where it belongs.

Co-authored-by: openhands <openhands@all-hands.dev>

* fix: replace flaky 1s timeout with polling for Active badge assertion

The 'Active badge' check in step 2 used a hardcoded 1-second
waitForTimeout before reloading. On a loaded CI runner the profile
activation mutation may not persist in time, causing the reload to
show stale state. This is a pre-existing flake (identical test code
passed on the first push and failed on the second).

Replace with expect.poll() that retries the reload+check cycle with
increasing intervals (1s, 2s, 3s) up to 15 seconds total.

Co-authored-by: openhands <openhands@all-hands.dev>

* fix: add pull_request trigger for Docker E2E (workflow_run bootstrap)

workflow_run only fires when the workflow file exists on the default
branch (main). Since mock-llm-docker-e2e.yml is new and only on the
PR branch, GitHub doesn't recognize it as a workflow_run listener yet.

Add pull_request trigger (gated by 'e2e-tests' label, skip forks) that
polls the Docker workflow via gh API until it completes for the PR's
head SHA, then pulls the already-built image from GHCR and runs tests.

After merge, workflow_run takes over as the primary automatic trigger.
The pull_request path remains as a fallback for label-gated runs.

Co-authored-by: openhands <openhands@all-hands.dev>

* fix: add FILE_STORE, AUTOMATION_BASE_URL, AUTOMATION_WORKSPACE_BASE to Docker entrypoint

The Docker entrypoint was missing several environment variables that the npm
path (dev-with-automation.mjs) sets for the automation backend:

- FILE_STORE=local — without this, the automation backend may fall back to
  cloud storage (S3/GCS) which fails without credentials, causing tarball-
  based presets (preset/prompt, preset/plugin) to silently error
- LOCAL_STORAGE_PATH — where to store files on the local filesystem
- AUTOMATION_BASE_URL — publicly-reachable base URL for callback URLs
- AUTOMATION_WORKSPACE_BASE — where automation runs unpack tarballs

This explains the Docker E2E failure: the agent's curl to create an automation
via /api/automation/v1/preset/prompt returned an error (likely 500 from missing
storage config), but the mock LLM doesn't care about terminal output and
proceeded to return the scripted final reply. The test then found 0 automations.

Co-authored-by: openhands <openhands@all-hands.dev>

* fix: exclude auth-modes spec from Docker E2E tests

The mock-llm-auth-modes.spec.ts tests npm-binary-specific --auth-required
behaviour (a second static-server instance on port 18301). The Docker image
doesn't provide this second server — it has its own auth handling. Exclude
the spec from the Docker test run via testIgnore.

Co-authored-by: openhands <openhands@all-hands.dev>

* feat: run auth-modes tests inside Docker via PUBLIC_MODE_PORT

Instead of excluding the auth-modes spec from the Docker E2E run or
spinning up a host-side static server with a duplicate build/ directory,
the Docker entrypoint now supports an optional PUBLIC_MODE_PORT env var.

When set, entrypoint.sh starts a second static-server instance from the
same baked-in frontend assets with --auth-required (no session key
injected). This tests the actual Docker image's auth gate behaviour —
not a host-side approximation.

The Playwright Docker config passes -e PUBLIC_MODE_PORT=18301 to the
container and exports MOCK_LLM_PUBLIC_MODE_URL so the auth-modes spec
can reach it. With --network host the port is accessible from the host.

Co-authored-by: openhands <openhands@all-hands.dev>

* address review feedback: drop unlabeled trigger, improve error messages, document env vars

- Drop 'unlabeled' from pull_request trigger types to avoid wasted
  workflow runs when any label is removed (the job-level if: condition
  would skip immediately anyway)
- Distinguish 'no Docker run found' vs 'didn't complete in time' in
  the polling loop's final error message
- Add comment explaining /api/automation/v1 probe returns 200 without
  auth so the readiness check won't spin for 180s
- Document FILE_STORE, LOCAL_STORAGE_PATH, AUTOMATION_BASE_URL, and
  AUTOMATION_WORKSPACE_BASE in the entrypoint header — these affect
  production deployments, not just E2E tests

Co-authored-by: openhands <openhands@all-hands.dev>

---------

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/mock-llm-docker-e2e.yml     | 321 ++++++++++++++++++
 .gitignore                                    |   2 +
 AGENTS.md                                     |  11 +
 docker/entrypoint.sh                          |  59 +++-
 package.json                                  |   1 +
 playwright.mock-llm-docker.config.ts          | 171 ++++++++++
 .../mock-llm/mock-llm-conversation.spec.ts    |  63 ++--
 .../scripts/render-mock-llm-report.mjs        |   5 +-
 tests/e2e/mock-llm/utils/mock-llm-helpers.ts  |  18 +-
 9 files changed, 615 insertions(+), 36 deletions(-)
 create mode 100644 .github/workflows/mock-llm-docker-e2e.yml
 create mode 100644 playwright.mock-llm-docker.config.ts

diff --git a/.github/workflows/mock-llm-docker-e2e.yml b/.github/workflows/mock-llm-docker-e2e.yml
new file mode 100644
index 00000000..1915e629
--- /dev/null
+++ b/.github/workflows/mock-llm-docker-e2e.yml
@@ -0,0 +1,321 @@
+name: Mock-LLM Docker E2E Tests
+
+# Runs the same mock-LLM E2E test specs as mock-llm-e2e.yml, but against
+# the Docker image instead of the npm build path (bin/agent-canvas.mjs).
+#
+# Trigger chain:
+#   1. workflow_run — fires automatically after any "Docker" workflow run
+#      completes (no branch filter). The image is already built/pushed to GHCR.
+#   2. pull_request — fires on PRs with the 'e2e-tests' label. Waits for
+#      the Docker workflow to finish, then pulls the image from GHCR.
+#      (workflow_run doesn't fire for new workflow files until they're on
+#      the default branch, so pull_request is needed for first-run PRs.)
+#   3. workflow_dispatch — manual trigger with a custom image tag.
+
+on:
+  workflow_run:
+    workflows: ["Docker"]
+    types: [completed]
+  pull_request:
+    types: [opened, synchronize, reopened, labeled]
+  workflow_dispatch:
+    inputs:
+      docker_image:
+        description: 'Docker image to test (e.g., ghcr.io/openhands/agent-canvas:sha-abc1234-amd64)'
+        type: string
+        default: ""
+
+concurrency:
+  group: mock-llm-docker-e2e-${{ github.event.workflow_run.id || github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  packages: read
+  pull-requests: write
+  actions: read
+
+jobs:
+  mock-llm-docker-e2e:
+    # workflow_run: only run if the Docker build succeeded.
+    # pull_request: only run with the 'e2e-tests' label, skip fork PRs (no GHCR push).
+    # workflow_dispatch: always run.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'workflow_run' &&
+       github.event.workflow_run.conclusion == 'success') ||
+      (github.event_name == 'pull_request' &&
+       contains(github.event.pull_request.labels.*.name, 'e2e-tests') &&
+       !github.event.pull_request.head.repo.fork)
+    runs-on: ubuntu-24.04
+    timeout-minutes: 30  # must exceed the 15-min Docker wait loop (pull_request path) plus setup and the 5-min test window
+
+    env:
+      MOCK_LLM_REPORT_PATH: mock-llm-docker-report.md
+      MOCK_LLM_WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+    steps:
+      # ── Resolve which commit / PR to test ──────────────────────────────
+      - name: Resolve source context
+        id: ctx
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_run" ]; then
+            echo "sha=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
+            echo "ref=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
+            PR_NUMBER=$(echo '${{ toJSON(github.event.workflow_run.pull_requests) }}' \
+              | jq -r '.[0].number // empty')
+            echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
+          elif [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "sha=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
+            echo "ref=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
+            echo "pr_number=${{ github.event.pull_request.number }}" >> "$GITHUB_OUTPUT"
+          else
+            echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT"
+            echo "ref=${{ github.ref }}" >> "$GITHUB_OUTPUT"
+            echo "pr_number=" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Check out repository
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ steps.ctx.outputs.ref }}
+
+      - name: Read defaults from config/defaults.json
+        id: defaults
+        run: |
+          echo "agent_server_version=$(node -p "require('./config/defaults.json').versions.agentServer")" >> "$GITHUB_OUTPUT"
+
+      # ── Wait for Docker workflow (pull_request trigger only) ────────────
+      # When triggered by pull_request, the Docker image may still be
+      # building. Poll the Docker workflow until it completes for this SHA.
+      - name: Wait for Docker workflow to complete
+        if: github.event_name == 'pull_request'
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: |
+          SHA="${{ steps.ctx.outputs.sha }}"
+          echo "Waiting for Docker workflow to complete for SHA ${SHA}..."
+
+          for i in $(seq 1 60); do
+            # Find the Docker workflow run for this exact commit
+            RUN=$(gh api \
+              "/repos/${{ github.repository }}/actions/workflows/docker.yml/runs?head_sha=${SHA}&per_page=1" \
+              --jq '.workflow_runs[0] // empty' 2>/dev/null || echo "")
+
+            if [ -z "$RUN" ]; then
+              echo "  Attempt $i: No Docker workflow run found yet for ${SHA}..."
+              sleep 15
+              continue
+            fi
+
+            STATUS=$(echo "$RUN" | jq -r '.status')
+            CONCLUSION=$(echo "$RUN" | jq -r '.conclusion // empty')
+            RUN_URL=$(echo "$RUN" | jq -r '.html_url')
+
+            if [ "$STATUS" = "completed" ]; then
+              if [ "$CONCLUSION" = "success" ]; then
+                echo "Docker workflow completed successfully: $RUN_URL"
+                break
+              else
+                echo "::error::Docker workflow finished with conclusion '$CONCLUSION': $RUN_URL"
+                exit 1
+              fi
+            fi
+
+            echo "  Attempt $i: Docker workflow status=$STATUS (${RUN_URL})"
+            sleep 15
+          done
+
+          # Final check — if we exhausted retries
+          if [ -z "${STATUS:-}" ]; then
+            echo "::error::No Docker workflow run found for SHA ${SHA} after 15 minutes"
+            exit 1
+          elif [ "$STATUS" != "completed" ]; then
+            echo "::error::Docker workflow did not complete within 15 minutes (last status: $STATUS)"
+            exit 1
+          fi
+
+      # ── Resolve Docker image tag ───────────────────────────────────────
+      - name: Resolve Docker image
+        id: image
+        run: |
+          if [ -n "${{ inputs.docker_image }}" ]; then
+            echo "tag=${{ inputs.docker_image }}" >> "$GITHUB_OUTPUT"
+          else
+            SHORT_SHA=$(echo "${{ steps.ctx.outputs.sha }}" | cut -c1-7)
+            # Use the amd64-specific tag (always pushed by the Docker workflow).
+            echo "tag=ghcr.io/openhands/agent-canvas:sha-${SHORT_SHA}-amd64" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Pull Docker image
+        run: |
+          echo "Pulling ${{ steps.image.outputs.tag }}..."
+          docker pull "${{ steps.image.outputs.tag }}"
+
+      # ── Test infrastructure setup ──────────────────────────────────────
+      - name: Set up Node.js
+        uses: actions/setup-node@v6
+        with:
+          # Pin to 24.15.x — Node 24.16.0 has a zip-extraction regression
+          # (nodejs/node#63487) that hangs `playwright install` for Playwright
+          # < 1.60.0. Remove this pin after upgrading to Playwright >= 1.60.0.
+          node-version: "24.15"
+          cache: npm
+
+      - name: Install npm dependencies
+        run: npm ci
+
+      - name: Get Playwright version
+        id: pw_version
+        run: echo "version=$(npx playwright --version | awk '{print $2}')" >> "$GITHUB_OUTPUT"
+
+      - name: Cache Playwright browsers
+        id: pw_cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ms-playwright
+          key: playwright-${{ runner.os }}-${{ steps.pw_version.outputs.version }}
+
+      - name: Install Playwright Chromium
+        if: steps.pw_cache.outputs.cache-hit != 'true'
+        run: npx playwright install chromium
+
+      - name: Install Playwright system deps
+        run: npx playwright install-deps chromium
+
+      - name: Install uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      - name: Install openhands-sdk (for mock LLM server)
+        run: |
+          uv venv .mock-llm-venv
+          uv pip install -p .mock-llm-venv openhands-sdk==${{ steps.defaults.outputs.agent_server_version }}
+
+      - name: Verify mock LLM server starts
+        run: |
+          .mock-llm-venv/bin/python3 tests/e2e/mock-llm/scripts/mock-llm-server.py --port 9998 &
+          SERVER_PID=$!
+          for i in $(seq 1 30); do
+            if curl -sf http://127.0.0.1:9998/v1/chat/completions \
+              -H "Content-Type: application/json" \
+              -d '{"model":"test","messages":[]}' > /dev/null 2>&1; then
+              echo "Mock LLM server responded on attempt $i"
+              break
+            fi
+            sleep 1
+          done
+          curl -sf http://127.0.0.1:9998/v1/chat/completions \
+            -H "Content-Type: application/json" \
+            -d '{"model":"test","messages":[]}' | python3 -m json.tool
+          kill $SERVER_PID
+
+      # ── Run tests ──────────────────────────────────────────────────────
+      - name: Run mock-LLM Docker E2E tests
+        id: run_tests
+        env:
+          MOCK_LLM_PYTHON: .mock-llm-venv/bin/python3
+          MOCK_LLM_DOCKER_IMAGE: ${{ steps.image.outputs.tag }}
+        run: |
+          set +e
+          MARKER_DIR=".mock-llm-markers"
+          DONE_MARKER="$MARKER_DIR/.tests-done"
+          PASS_MARKER="$MARKER_DIR/.all-passed"
+          rm -rf "$MARKER_DIR"
+
+          # Run Playwright in background so our shell survives if we have
+          # to kill it (the Docker container teardown can hang).
+          npm run test:e2e:mock-llm:docker &
+          PW_PID=$!
+
+          # Wait up to 5 min for tests to complete.
+          deadline=$((SECONDS + 300))
+          while [ "$SECONDS" -lt "$deadline" ]; do
+            if ! kill -0 "$PW_PID" 2>/dev/null; then
+              break
+            fi
+            if [ -f "$DONE_MARKER" ]; then
+              echo "Tests completed: $(cat "$DONE_MARKER")"
+              break
+            fi
+            sleep 2
+          done
+
+          # If Playwright is still running (teardown hang), give it 5s
+          # grace then force-kill.
+          if kill -0 "$PW_PID" 2>/dev/null; then
+            sleep 5
+            if kill -0 "$PW_PID" 2>/dev/null; then
+              echo "::warning::Killing lingering Playwright process (teardown hung)"
+              kill "$PW_PID" 2>/dev/null
+              sleep 5
+              kill -9 "$PW_PID" 2>/dev/null
+            fi
+            wait "$PW_PID" 2>/dev/null
+            pw_exit=124
+          else
+            wait "$PW_PID"
+            pw_exit=$?
+          fi
+
+          echo "Playwright exited with code $pw_exit"
+
+          # When killed during teardown, the exit code is non-zero but
+          # tests may have passed.
+          if [ "$pw_exit" -ne 0 ] && [ -f "$PASS_MARKER" ]; then
+            echo "::notice::All tests passed (marker file present); non-zero exit was teardown-related"
+            pw_exit=0
+          fi
+
+          # Clean up the Docker container (belt-and-suspenders)
+          docker ps -q --filter "name=agent-canvas-mock-llm" | xargs -r docker stop 2>/dev/null || true
+
+          echo "exit_code=$pw_exit" >> "$GITHUB_OUTPUT"
+          exit 0
+
+      # ── Reporting ──────────────────────────────────────────────────────
+      - name: Upload test artifacts
+        id: upload_artifacts
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: mock-llm-docker-e2e-results
+          if-no-files-found: ignore
+          retention-days: 14
+          path: |
+            playwright-report-mock-llm-docker/
+            test-results-mock-llm-docker/
+
+      - name: Render test report
+        if: always()
+        run: |
+          node tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs \
+            --results "test-results-mock-llm-docker/results.json" \
+            --output "$MOCK_LLM_REPORT_PATH" \
+            --workflow-url "$MOCK_LLM_WORKFLOW_URL" \
+            --commit "${{ steps.ctx.outputs.sha }}" \
+            --artifact-url "${{ steps.upload_artifacts.outputs.artifact-url || '' }}" \
+            --title "Mock-LLM Docker E2E Test Results"
+          cat "$MOCK_LLM_REPORT_PATH" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Post PR comment
+        if: always() && steps.ctx.outputs.pr_number
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: |
+          gh pr comment "${{ steps.ctx.outputs.pr_number }}" \
+            --body-file "$MOCK_LLM_REPORT_PATH"
+
+      - name: Fail job when tests fail
+        if: always()
+        run: |
+          exit_code="${{ steps.run_tests.outputs.exit_code }}"
+          exit "${exit_code:-1}"
diff --git a/.gitignore b/.gitignore
index 97c97dcb..ea10421a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,10 +15,12 @@ __pycache__
 /test-results/
 /test-results-live/
 /test-results-mock-llm/
+/test-results-mock-llm-docker/
 /.mock-llm-markers/
 /playwright-report/
 /playwright-report-live/
 /playwright-report-mock-llm/
+/playwright-report-mock-llm-docker/
 /blob-report/
 /playwright/.cache/
 # Snapshot baselines are stored as GitHub Actions artifacts — not in git.
diff --git a/AGENTS.md b/AGENTS.md
index 1717463b..52d123b1 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -164,6 +164,17 @@ you are running inside of — NOT the automation backend.
 - CI workflow: `.github/workflows/mock-llm-e2e.yml` runs on PRs with the `e2e-tests` label or on manual dispatch. It builds the frontend, starts the mock LLM server, runs the tests, and posts a PR comment with results.
 - The custom `DoneMarkerReporter` writes `.mock-llm-markers/.tests-done` after all tests complete (before webServer teardown) so the CI wrapper can detect completion and kill the lingering teardown process.
 
+### Docker Image Testing (Shared Specs)
+
+- The same test specs and helpers are reused to validate the Docker image via `playwright.mock-llm-docker.config.ts`. Run locally with `npm run test:e2e:mock-llm:docker` (requires Docker daemon and a built image).
+- **Architecture**: The Docker config replaces the npm path's `bin/agent-canvas.mjs` webServer with a `docker run --network host` command. The mock LLM server still runs on the host. On Linux (including CI), `--network host` lets the container share the host's network stack so all `127.0.0.1` URLs work identically. On macOS/Windows Docker Desktop (bridge networking), set `MOCK_LLM_AGENT_URL=http://host.docker.internal:<port>` so the agent-server inside Docker can reach the host-side mock LLM server.
+- **URL split**: `mock-llm-helpers.ts` exports two mock LLM URL constants:
+  - `MOCK_LLM_BASE_URL` — always `http://127.0.0.1:<port>`, used by tests for the mock LLM admin API (register/activate/reset trajectories).
+  - `MOCK_LLM_AGENT_URL` — defaults to `MOCK_LLM_BASE_URL`, overridable via `MOCK_LLM_AGENT_URL` env var. Used when configuring the LLM profile (`base_url` field) — this is the URL the agent-server uses for inference calls. The npm path and Docker-with-`--network host` path use the same value; Docker on macOS needs the override.
+- **Docker image**: Set `MOCK_LLM_DOCKER_IMAGE` to the image tag (default: `ghcr.io/openhands/agent-canvas:latest`). The container is started with `--rm --network host` and a unique `--name` for cleanup.
+- **State isolation**: The Docker container uses its internal state directory (no host mount needed for tests). Each test run starts a fresh container.
+- CI workflow: `.github/workflows/mock-llm-docker-e2e.yml` has three triggers — all pull the already-built image from GHCR (no rebuild): (1) `workflow_run` fires automatically after the `Docker` workflow completes on main; (2) `pull_request` with the `e2e-tests` label polls the Docker workflow until it finishes for the PR's head SHA, then pulls the image (needed because `workflow_run` only fires for workflow files already on the default branch); (3) `workflow_dispatch` accepts a custom `docker_image` input. The image tag is derived from the commit SHA (`ghcr.io/openhands/agent-canvas:sha-<short>-amd64`). Fork PRs are skipped (no GHCR push). Report artifacts go to `test-results-mock-llm-docker/` and `playwright-report-mock-llm-docker/`.
+
 ## Additional Notes
 
 - **Published binary auth fix**: When users install the npm package globally (`npm install -g @openhands/agent-canvas`) and run `agent-canvas`, the pre-built static frontend has a `VITE_SESSION_API_KEY` baked in at publish time that differs from the user's persisted runtime key (`~/.openhands/agent-canvas/session-api-key.txt`). The fix is to inject the runtime session key into `index.html` responses at serve time (not build time). `scripts/static-server.mjs` accepts a `--session-api-key <key>` flag and injects a tiny inline `<script>` before `</head>` that seeds the key into `localStorage['openhands-agent-server-config'].sessionApiKey` — only if no key is already stored (preserving user-set overrides). `scripts/dev-with-automation.mjs` and `scripts/dev-static.mjs` both pass `--session-api-key ${config.sessionApiKey}` when starting the static server.
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index e02a2855..b641161c 100644
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -2,17 +2,22 @@
 # ═══════════════════════════════════════════════════════════════════════════════
 # agent-canvas all-in-one entrypoint
 #
-# Starts three services:
+# Starts three services (plus an optional fourth):
 #   1. Agent Server   on port $AGENT_SERVER_PORT  (default 18000)
 #   2. Automation     on port $AUTOMATION_PORT     (default 18001)
 #   3. Static server  on port $PORT               (default 8000)
 #      Routes /api/automation/* → automation, /api/* → agent-server,
 #      and serves the frontend static build for everything else.
+#   4. (Optional) Public-mode static server on $PUBLIC_MODE_PORT
+#      Same frontend, but with --auth-required (no baked session key).
+#      Used by auth-mode E2E tests. Only started when PUBLIC_MODE_PORT is set.
 #
 # Environment variables:
 #   PORT                 – Unified entry point port (default: 8000)
 #   AGENT_SERVER_PORT    – Internal agent-server port (default: 18000)
 #   AUTOMATION_PORT      – Internal automation port (default: 18001)
+#   PUBLIC_MODE_PORT     – If set, starts a second static server on this port
+#                          with --auth-required (no session key injected)
 #   OH_SECRET_KEY        – Secret key for settings encryption (auto-generated
 #                          and persisted if not provided)
 #   OPENHANDS_AUTOMATION_API_KEY – Override automation backend auth key
@@ -23,6 +28,16 @@
 #                          Setting this enables local-mode auth so the session
 #                          API key is validated internally instead of against the
 #                          OpenHands cloud API.
+#   FILE_STORE             – Storage backend for automation tarballs (default: local).
+#                          Without this the automation backend may fall back to
+#                          S3/GCS which fails without cloud credentials.
+#   LOCAL_STORAGE_PATH     – Directory for local file storage (default: ~/.openhands/storage)
+#   AUTOMATION_BASE_URL    – Publicly-reachable base URL for the automation
+#                          service, used in callback URLs and injected into
+#                          sandboxes (default: http://127.0.0.1:$PORT).
+#                          Override in production when the external URL differs.
+#   AUTOMATION_WORKSPACE_BASE – Directory for automation run workspaces
+#                          (default: ~/.openhands/workspaces)
 #   Any agent-server or automation env vars are passed through.
 # ═══════════════════════════════════════════════════════════════════════════════
 set -uo pipefail
@@ -146,6 +161,23 @@ log "Starting automation server on port $AUTOMATION_PORT..."
 # Disable the automation's own frontend — agent-canvas provides the UI.
 export AUTOMATION_FRONTEND_DIR=""
 
+# File storage — use local filesystem unless the user has configured cloud
+# storage.  Without FILE_STORE=local the automation backend may fall back
+# to a cloud provider (S3/GCS) which will fail without credentials, causing
+# tarball-based presets (preset/prompt, preset/plugin) to silently error.
+export FILE_STORE="${FILE_STORE:-local}"
+export LOCAL_STORAGE_PATH="${LOCAL_STORAGE_PATH:-${OPENHANDS_DIR}/storage}"
+mkdir -p "$LOCAL_STORAGE_PATH"
+
+# AUTOMATION_BASE_URL — the publicly-reachable base URL for the automation
+# service.  Appended to callback URLs and injected into each sandbox as
+# AUTOMATION_API_URL.  Defaults to the unified ingress.
+export AUTOMATION_BASE_URL="${AUTOMATION_BASE_URL:-http://127.0.0.1:${PORT}}"
+
+# AUTOMATION_WORKSPACE_BASE — where automation runs unpack tarballs.
+export AUTOMATION_WORKSPACE_BASE="${AUTOMATION_WORKSPACE_BASE:-${OPENHANDS_DIR}/workspaces}"
+mkdir -p "$AUTOMATION_WORKSPACE_BASE"
+
 # Default to SQLite so the automation server works out of the box without
 # an external PostgreSQL instance. Users can override AUTOMATION_DB_URL to
 # point at a real Postgres for production deployments.
@@ -213,6 +245,31 @@ node /opt/agent-canvas/static-server.mjs \
   --route "/openapi.json=http://127.0.0.1:${AGENT_SERVER_PORT}" &
 PIDS+=($!)
 
+# ── 5. (Optional) Public-mode static server ─────────────────────────────────
+# When PUBLIC_MODE_PORT is set, start a second static-server instance that
+# serves the same frontend WITHOUT injecting the session key into the HTML
+# (--auth-required). This is used by auth-mode E2E tests to verify the
+# ApiKeyEntryScreen gate, key rotation recovery, etc.
+if [ -n "${PUBLIC_MODE_PORT:-}" ]; then
+  log "Starting public-mode frontend on port $PUBLIC_MODE_PORT (--auth-required)..."
+  node /opt/agent-canvas/static-server.mjs \
+    --port "$PUBLIC_MODE_PORT" \
+    --host 0.0.0.0 \
+    --dir /opt/agent-canvas/frontend \
+    --auth-required \
+    --route "/api/automation=http://127.0.0.1:${AUTOMATION_PORT}" \
+    --route "/api=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/server_info=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/sockets=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/alive=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/health=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/ready=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/docs=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/redoc=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/openapi.json=http://127.0.0.1:${AGENT_SERVER_PORT}" &
+  PIDS+=($!)
+fi
+
 log "All services started. Unified entry point: http://0.0.0.0:${PORT}/"
 
 # Wait for any child to exit. If one dies, the trap will clean up the rest.
diff --git a/package.json b/package.json
index ef48b205..2b954024 100644
--- a/package.json
+++ b/package.json
@@ -81,6 +81,7 @@
     "test:e2e": "playwright test --pass-with-no-tests",
     "test:e2e:live": "node --env-file-if-exists=.env tests/e2e/live/scripts/run-live-e2e.mjs",
     "test:e2e:mock-llm": "playwright test --config=playwright.mock-llm.config.ts",
+    "test:e2e:mock-llm:docker": "playwright test --config=playwright.mock-llm-docker.config.ts",
     "test:e2e:snapshots": "playwright test tests/e2e/snapshots --project=chromium --retries=0",
     "test:e2e:snapshots:update": "playwright test tests/e2e/snapshots --project=chromium --update-snapshots",
     "test:coverage": "npm run make-i18n && vitest run --coverage",
diff --git a/playwright.mock-llm-docker.config.ts b/playwright.mock-llm-docker.config.ts
new file mode 100644
index 00000000..a7c4e682
--- /dev/null
+++ b/playwright.mock-llm-docker.config.ts
@@ -0,0 +1,171 @@
+/**
+ * Playwright config for mock-LLM E2E tests against a Docker container.
+ *
+ * Reuses the same test specs as playwright.mock-llm.config.ts but launches
+ * the agent-canvas stack inside a Docker container instead of via
+ * bin/agent-canvas.mjs + uvx.
+ *
+ * Starts two processes:
+ *   1. Mock LLM server (Python on the host, using openhands-sdk TestLLM)
+ *   2. Docker container running the agent-canvas all-in-one image
+ *      (agent-server + automation backend + static frontend + proxy)
+ *      The container also starts a second static-server instance on
+ *      PUBLIC_MODE_PORT with --auth-required for auth-mode E2E tests.
+ *
+ * Networking:
+ *   Uses --network host on Linux so the container shares the host's network
+ *   stack. This means the agent-server inside Docker can reach the mock LLM
+ *   server at 127.0.0.1:<port> — identical to the npm path.
+ *
+ *   For macOS/Windows (Docker Desktop with bridge networking), set
+ *   MOCK_LLM_AGENT_URL=http://host.docker.internal:<port> so the
+ *   agent-server can reach the host-side mock LLM server.
+ *
+ * Required:
+ *   - A built Docker image. Set MOCK_LLM_DOCKER_IMAGE to the image tag
+ *     (default: ghcr.io/openhands/agent-canvas:latest).
+ *   - Docker daemon must be running.
+ */
+
+import { defineConfig, devices } from "@playwright/test";
+import { randomBytes } from "node:crypto";
+
+// ── Docker image ────────────────────────────────────────────────────────
+const DOCKER_IMAGE =
+  process.env.MOCK_LLM_DOCKER_IMAGE ?? "ghcr.io/openhands/agent-canvas:latest";
+
+// Container name for cleanup — unique per run to avoid collisions.
+const CONTAINER_NAME =
+  process.env.MOCK_LLM_CONTAINER_NAME ??
+  `agent-canvas-mock-llm-${randomBytes(4).toString("hex")}`;
+
+// ── Port allocation (separate from live E2E / dev to avoid collisions) ─
+const MOCK_LLM_PORT = process.env.MOCK_LLM_PORT ?? "9999";
+
+// The Docker container exposes a single port for the unified ingress.
+// With --network host this is accessible at localhost directly.
+const INGRESS_PORT = process.env.MOCK_LLM_INGRESS_PORT ?? "18300";
+
+// Public-mode static server — runs inside the Docker container when
+// PUBLIC_MODE_PORT is set (see docker/entrypoint.sh). With --network host
+// the port is accessible from the host at localhost directly.
+const PUBLIC_MODE_PORT = process.env.MOCK_LLM_PUBLIC_MODE_PORT ?? "18301";
+
+// ── Session API key ────────────────────────────────────────────────────
+const sessionApiKey =
+  process.env.MOCK_LLM_SESSION_API_KEY?.trim() ||
+  randomBytes(32).toString("hex");
+process.env.MOCK_LLM_SESSION_API_KEY = sessionApiKey;
+
+// ── URLs ───────────────────────────────────────────────────────────────
+const INGRESS_URL = `http://localhost:${INGRESS_PORT}/`;
+const MOCK_LLM_URL = `http://127.0.0.1:${MOCK_LLM_PORT}`;
+
+// Python binary for the mock server — defaults to "python3" but CI can
+// point this at a venv (e.g. ".mock-llm-venv/bin/python3") to avoid
+// PEP 668 "externally managed" errors on Ubuntu 24.04+.
+const MOCK_LLM_PYTHON = process.env.MOCK_LLM_PYTHON ?? "python3";
+
+// Export for the test helpers — BACKEND_URL points to the ingress (API
+// calls are proxied to the agent-server, so no direct backend port needed).
+process.env.MOCK_LLM_BACKEND_URL = `http://localhost:${INGRESS_PORT}`;
+process.env.MOCK_LLM_PORT = MOCK_LLM_PORT;
+process.env.MOCK_LLM_PUBLIC_MODE_URL = `http://localhost:${PUBLIC_MODE_PORT}`;
+process.env.VITE_SESSION_API_KEY = sessionApiKey;
+
+// MOCK_LLM_AGENT_URL — the URL the agent-server inside Docker uses to
+// call the mock LLM for inference. With --network host on Linux the
+// agent-server can reach 127.0.0.1 directly. For macOS/Windows Docker
+// Desktop, override this to http://host.docker.internal:<port>.
+if (!process.env.MOCK_LLM_AGENT_URL) {
+  process.env.MOCK_LLM_AGENT_URL = MOCK_LLM_URL;
+}
+
+export default defineConfig({
+  testDir: "./tests/e2e/mock-llm",
+  testMatch: /.*\.spec\.ts/,
+  fullyParallel: false,
+  forbidOnly: !!process.env.CI,
+  retries: 0,
+  workers: 1,
+  timeout: 60_000,
+  globalTimeout: process.env.CI ? 600_000 : 0, // 10 min hard cap in CI
+  reporter: [
+    ["line"],
+    [
+      "json",
+      { outputFile: "test-results-mock-llm-docker/results.json" },
+    ],
+    [
+      "html",
+      {
+        outputFolder: "playwright-report-mock-llm-docker",
+        open: "never",
+      },
+    ],
+    ["./tests/e2e/mock-llm/reporters/done-marker-reporter.ts"],
+  ],
+  outputDir: "test-results-mock-llm-docker",
+  use: {
+    baseURL: INGRESS_URL,
+    screenshot: "only-on-failure",
+    trace: "retain-on-failure", // retries is 0, so "on-first-retry" never records
+    video: "on",
+  },
+  projects: [
+    {
+      name: "chromium",
+      use: { ...devices["Desktop Chrome"] },
+    },
+  ],
+  webServer: [
+    // 1. Mock LLM server (Python, on the host)
+    {
+      command: `${MOCK_LLM_PYTHON} tests/e2e/mock-llm/scripts/mock-llm-server.py --port ${MOCK_LLM_PORT}`,
+      url: MOCK_LLM_URL,
+      timeout: 30_000,
+      reuseExistingServer: !process.env.CI,
+      stdout: "pipe",
+      stderr: "pipe",
+    },
+    // 2. Docker container running the agent-canvas all-in-one image
+    //
+    // Uses --network host so the container shares the host's network:
+    //   - The ingress port is available at localhost:<INGRESS_PORT>
+    //   - The agent-server can reach the mock LLM at 127.0.0.1:<MOCK_LLM_PORT>
+    //
+    // The container is started with --rm for auto-cleanup. It is named so a
+    // leftover instance can be found and removed/stopped by name afterwards.
+    //
+    // Note: --network host is Linux-only. On macOS/Windows Docker Desktop,
+    // use -p port mapping and set MOCK_LLM_AGENT_URL=http://host.docker.internal:<port>.
+    {
+      command: [
+        // Stop any leftover container from a previous failed run
+        `docker rm -f ${CONTAINER_NAME} 2>/dev/null;`,
+        "exec docker run",
+        "--rm",
+        `--name ${CONTAINER_NAME}`,
+        "--network host",
+        `-e PORT=${INGRESS_PORT}`,
+        `-e SESSION_API_KEY=${sessionApiKey}`,
+        `-e OH_SESSION_API_KEYS_0=${sessionApiKey}`,
+        `-e PUBLIC_MODE_PORT=${PUBLIC_MODE_PORT}`,
+        "-e VITE_DO_NOT_TRACK=1",
+        "-e VITE_ENABLE_BROWSER_TOOLS=false",
+        DOCKER_IMAGE,
+      ].join(" "),
+      // Probe the automation list endpoint through the ingress to ensure
+      // the FULL stack (agent-server + automation backend + ingress) is
+      // up before tests start. GET /api/automation/v1 returns 200 (empty
+      // list) without auth — the automation backend does not enforce
+      // session-key auth on the list endpoint.
+      url: `http://localhost:${INGRESS_PORT}/api/automation/v1`,
+      timeout: 180_000, // Docker pull + all services startup
+      reuseExistingServer: !process.env.CI,
+    },
+  ],
+  // NOTE: no globalTeardown is configured here. Cleanup relies on Playwright
+  // terminating the webServer command on exit and on `docker run --rm`
+  // removing the container once it stops; CI also runs `docker stop` afterwards.
+});
diff --git a/tests/e2e/mock-llm/mock-llm-conversation.spec.ts b/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
index b4f460e6..87cfcb37 100644
--- a/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
+++ b/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
@@ -24,7 +24,7 @@ import {
   BASH_TOKEN,
   REPLY_TOKEN,
   waitForAgentMessageContaining,
-  MOCK_LLM_BASE_URL,
+  MOCK_LLM_AGENT_URL,
   BACKEND_URL,
   SESSION_API_KEY,
   seedLocalStorage,
@@ -117,10 +117,12 @@ test.describe("mock-LLM agent-server conversation", () => {
     await modelInput.click();
     await modelInput.fill(MOCK_MODEL);
 
-    // Fill in base URL pointing to our mock server
+    // Fill in base URL pointing to our mock server.
+    // Use MOCK_LLM_AGENT_URL — the URL the agent-server will use for
+    // inference calls. In Docker this may differ from the host-local URL.
     const baseUrlInput = page.getByTestId("base-url-input");
     await baseUrlInput.click();
-    await baseUrlInput.fill(MOCK_LLM_BASE_URL);
+    await baseUrlInput.fill(MOCK_LLM_AGENT_URL);
 
     // Fill in a fake API key (mock server doesn't validate it)
     const apiKeyInput = page.getByTestId("llm-api-key-input");
@@ -174,31 +176,32 @@ test.describe("mock-LLM agent-server conversation", () => {
     // Click "Set as active"
     await page.getByTestId("profile-set-active").click();
 
-    // Verify the "Active" badge appears on our profile
-    // Re-find the row after the state change
-    await page.waitForTimeout(1_000); // wait for the mutation to settle
-
-    // Reload to see the persisted state
-    await page.goto("/settings/llm", { waitUntil: "domcontentloaded" });
-    await waitForTestId(page, "add-llm-profile");
-
-    const updatedRows = page.getByTestId("profile-row");
-    const updatedCount = await updatedRows.count();
-    let foundActiveBadge = false;
-
-    for (let i = 0; i < updatedCount; i++) {
-      const row = updatedRows.nth(i);
-      const text = await row.textContent();
-      if (text?.includes(PROFILE_NAME)) {
-        const badge = row.getByTestId("profile-active-badge");
-        foundActiveBadge = (await badge.count()) > 0;
-        break;
-      }
-    }
-    expect(
-      foundActiveBadge,
-      `Profile "${PROFILE_NAME}" should have an "Active" badge`,
-    ).toBe(true);
+    // Verify the "Active" badge appears on our profile.
+    // Poll with reload instead of a fixed timeout — the mutation may take
+    // more than 1s to persist on a loaded CI runner.
+    await expect
+      .poll(
+        async () => {
+          await page.goto("/settings/llm", { waitUntil: "domcontentloaded" });
+          await waitForTestId(page, "add-llm-profile");
+          const rows = page.getByTestId("profile-row");
+          const count = await rows.count();
+          for (let i = 0; i < count; i++) {
+            const row = rows.nth(i);
+            const text = await row.textContent();
+            if (text?.includes(PROFILE_NAME)) {
+              return (await row.getByTestId("profile-active-badge").count()) > 0;
+            }
+          }
+          return false;
+        },
+        {
+          message: `Profile "${PROFILE_NAME}" should have an "Active" badge`,
+          timeout: 15_000,
+          intervals: [1_000, 2_000, 3_000],
+        },
+      )
+      .toBe(true);
 
     // Verify the settings API now reflects the activated profile's LLM config
     await test.step("verify settings API reflects the active profile's model", async () => {
@@ -219,8 +222,8 @@ test.describe("mock-LLM agent-server conversation", () => {
       const llmBaseUrl = settings?.agent_settings?.llm?.base_url;
       expect(
         llmBaseUrl,
-        `Expected settings llm.base_url="${MOCK_LLM_BASE_URL}" but got "${llmBaseUrl}"`,
-      ).toBe(MOCK_LLM_BASE_URL);
+        `Expected settings llm.base_url="${MOCK_LLM_AGENT_URL}" but got "${llmBaseUrl}"`,
+      ).toBe(MOCK_LLM_AGENT_URL);
     });
   });
 
diff --git a/tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs b/tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs
index 4cb1eac4..e8c3e420 100644
--- a/tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs
+++ b/tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs
@@ -141,7 +141,7 @@ function overallIcon(status) {
 
 // ── Report rendering ───────────────────────────────────────────────────
 
-function renderReport({ tests, workflowUrl, commit, artifactUrl }) {
+function renderReport({ tests, workflowUrl, commit, artifactUrl, title }) {
   const status = overallStatus(tests);
   const icon = overallIcon(status);
   const passed = tests.filter((t) => t.status === "passed").length;
@@ -154,7 +154,7 @@ function renderReport({ tests, workflowUrl, commit, artifactUrl }) {
   const lines = [];
 
   // Header
-  lines.push(`## ${icon} Mock-LLM E2E Tests`);
+  lines.push(`## ${icon} ${title || "Mock-LLM E2E Tests"}`);
   lines.push("");
 
   // Summary line
@@ -274,6 +274,7 @@ const report = renderReport({
   workflowUrl: args.workflow_url || "",
   commit: args.commit || "",
   artifactUrl: args.artifact_url || "",
+  title: args.title || "",
 });
 
 writeFileSync(outputPath, report);
diff --git a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
index 2bcbfdce..03206c10 100644
--- a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
+++ b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
@@ -16,7 +16,17 @@ export const BASH_COMMAND = `printf '${BASH_TOKEN}\\n'`;
 // The agent-canvas binary exposes a single ingress port; API calls are proxied
 // through it, so BACKEND_URL = ingress URL (no separate backend port).
 export const MOCK_LLM_PORT = process.env.MOCK_LLM_PORT ?? "9999";
+
+// URL tests use to hit the mock LLM admin API (always on the host).
 export const MOCK_LLM_BASE_URL = `http://127.0.0.1:${MOCK_LLM_PORT}`;
+
+// URL the agent-server uses to reach the mock LLM for inference calls.
+// In the npm path both run on the host, so this equals MOCK_LLM_BASE_URL;
+// Docker with --network host on Linux also works as-is. For Docker on macOS
+// (bridge networking) set MOCK_LLM_AGENT_URL=http://host.docker.internal:<port>.
+// `||` rather than `??`: an empty env var must fall back to the default too.
+export const MOCK_LLM_AGENT_URL =
+  process.env.MOCK_LLM_AGENT_URL || MOCK_LLM_BASE_URL;
 export const BACKEND_URL =
   process.env.MOCK_LLM_BACKEND_URL ?? "http://localhost:18300";
 // Public-mode static server (--auth-required, no session key injected).
@@ -283,7 +293,9 @@ export async function ensureMockLLMProfile(
   request: APIRequestContext,
   model = "openai/mock-test-model",
 ) {
-  // Check if the current profile already has the mock LLM settings
+  // Check if the current profile already has the mock LLM settings.
+  // Use MOCK_LLM_AGENT_URL — this is the URL the agent-server will use to
+  // reach the mock LLM, which may differ from MOCK_LLM_BASE_URL in Docker.
   const settingsResp = await request.get(`${BACKEND_URL}/api/settings`, {
     headers: {
       "X-Session-API-Key": SESSION_API_KEY,
@@ -294,7 +306,7 @@ export async function ensureMockLLMProfile(
   if (settingsResp.ok()) {
     const settings = await settingsResp.json();
     const llm = settings?.agent_settings?.llm;
-    if (llm?.model === model && llm?.base_url === MOCK_LLM_BASE_URL) {
+    if (llm?.model === model && llm?.base_url === MOCK_LLM_AGENT_URL) {
       return; // Already configured
     }
   }
@@ -310,7 +322,7 @@ export async function ensureMockLLMProfile(
         llm: {
           model,
           api_key: "mock-api-key-for-testing",
-          base_url: MOCK_LLM_BASE_URL,
+          base_url: MOCK_LLM_AGENT_URL,
         },
       },
     },

From a9c643d07762dd03ecf0a384219c774e68da43f4 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 1 Jun 2026 15:20:04 -0600
Subject: [PATCH 6/6] chore: bump version to 1.0.0-beta.3

---
 package-lock.json | 4 ++--
 package.json      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 3e1ef92a..82f9febf 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@openhands/agent-canvas",
-  "version": "1.0.0-beta.2",
+  "version": "1.0.0-beta.3",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@openhands/agent-canvas",
-      "version": "1.0.0-beta.2",
+      "version": "1.0.0-beta.3",
       "license": "MIT",
       "dependencies": {
         "@heroui/react": "2.8.10",
diff --git a/package.json b/package.json
index 2b954024..b1f41184 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@openhands/agent-canvas",
-  "version": "1.0.0-beta.2",
+  "version": "1.0.0-beta.3",
   "description": "Agent Canvas UI for OpenHands - run AI coding agents with a visual interface",
   "license": "MIT",
   "private": false,