diff --git a/.github/workflows/mock-llm-docker-e2e.yml b/.github/workflows/mock-llm-docker-e2e.yml
new file mode 100644
index 00000000..1915e629
--- /dev/null
+++ b/.github/workflows/mock-llm-docker-e2e.yml
@@ -0,0 +1,321 @@
+name: Mock-LLM Docker E2E Tests
+
+# Runs the same mock-LLM E2E test specs as mock-llm-e2e.yml, but against
+# the Docker image instead of the npm build path (bin/agent-canvas.mjs).
+#
+# Trigger chain:
+#   1. workflow_run — fires automatically after any "Docker" workflow run
+#      completes (no branch filter). The image is already built/pushed to GHCR.
+#   2. pull_request — fires on PRs with the 'e2e-tests' label. Waits for
+#      the Docker workflow to finish, then pulls the image from GHCR.
+#      (workflow_run doesn't fire for new workflow files until they're on
+#      the default branch, so pull_request is needed for first-run PRs.)
+#   3. workflow_dispatch — manual trigger with a custom image tag.
+
+on:
+  workflow_run:
+    workflows: ["Docker"]
+    types: [completed]
+  pull_request:
+    types: [opened, synchronize, reopened, labeled]
+  workflow_dispatch:
+    inputs:
+      docker_image:
+        description: 'Docker image to test (e.g., ghcr.io/openhands/agent-canvas:sha-abc1234-amd64)'
+        type: string
+        default: ""
+
+concurrency:
+  group: mock-llm-docker-e2e-${{ github.event.workflow_run.id || github.event.pull_request.number || github.ref }}  # key: triggering Docker run id, else PR number, else ref
+  cancel-in-progress: true
+
+permissions:
+  contents: read  # check out the repository
+  packages: read  # pull the image from GHCR
+  pull-requests: write  # post the report as a PR comment
+  actions: read  # query Docker workflow runs via the API
+
+jobs:
+  mock-llm-docker-e2e:
+    # workflow_run: only run if the Docker build succeeded.
+    # pull_request: only run with the 'e2e-tests' label, skip fork PRs (no GHCR push).
+    # workflow_dispatch: always run.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'workflow_run' &&
+       github.event.workflow_run.conclusion == 'success') ||
+      (github.event_name == 'pull_request' &&
+       contains(github.event.pull_request.labels.*.name, 'e2e-tests') &&
+       !github.event.pull_request.head.repo.fork)
+    runs-on: ubuntu-24.04
+    timeout-minutes: 30  # the Docker-wait step alone can poll for 15 min (60 x 15 s); leave room for setup and the 5-min test window
+
+    env:
+      MOCK_LLM_REPORT_PATH: mock-llm-docker-report.md
+      MOCK_LLM_WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+    steps:
+      # ── Resolve which commit / PR to test ──────────────────────────────
+      - name: Resolve source context
+        id: ctx
+        env:
+          # Values that embed branch names (the pull_requests JSON, the ref)
+          # are passed via env so they are never templated into shell source.
+          WORKFLOW_RUN_PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
+          DISPATCH_REF: ${{ github.ref }}
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_run" ]; then
+            SHA="${{ github.event.workflow_run.head_sha }}"; REF="$SHA"
+            PR_NUMBER=$(jq -r '.[0].number // empty' <<< "$WORKFLOW_RUN_PRS")
+          elif [ "${{ github.event_name }}" = "pull_request" ]; then
+            SHA="${{ github.event.pull_request.head.sha }}"; REF="$SHA"
+            PR_NUMBER="${{ github.event.pull_request.number }}"
+          else
+            SHA="${{ github.sha }}"; REF="$DISPATCH_REF"; PR_NUMBER=""
+          fi
+          printf '%s\n' "sha=${SHA}" "ref=${REF}" "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
+
+      - name: Check out repository
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ steps.ctx.outputs.ref }}
+
+      - name: Read defaults from config/defaults.json
+        id: defaults
+        run: |
+          echo "agent_server_version=$(node -p "require('./config/defaults.json').versions.agentServer")" >> "$GITHUB_OUTPUT"
+
+      # ── Wait for Docker workflow (pull_request trigger only) ────────────
+      # When triggered by pull_request, the Docker image may still be
+      # building. Poll the Docker workflow until it completes for this SHA.
+      - name: Wait for Docker workflow to complete
+        if: github.event_name == 'pull_request'
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: |
+          SHA="${{ steps.ctx.outputs.sha }}"
+          echo "Waiting for Docker workflow to complete for SHA ${SHA}..."
+
+          for i in $(seq 1 60); do
+            # Find the Docker workflow run for this exact commit
+            RUN=$(gh api \
+              "/repos/${{ github.repository }}/actions/workflows/docker.yml/runs?head_sha=${SHA}&per_page=1" \
+              --jq '.workflow_runs[0] // empty' 2>/dev/null || echo "")
+
+            if [ -z "$RUN" ]; then
+              echo "  Attempt $i: No Docker workflow run found yet for ${SHA}..."
+              sleep 15
+              continue
+            fi
+
+            STATUS=$(echo "$RUN" | jq -r '.status')
+            CONCLUSION=$(echo "$RUN" | jq -r '.conclusion // empty')
+            RUN_URL=$(echo "$RUN" | jq -r '.html_url')
+
+            if [ "$STATUS" = "completed" ]; then
+              if [ "$CONCLUSION" = "success" ]; then
+                echo "Docker workflow completed successfully: $RUN_URL"
+                break
+              else
+                echo "::error::Docker workflow finished with conclusion '$CONCLUSION': $RUN_URL"
+                exit 1
+              fi
+            fi
+
+            echo "  Attempt $i: Docker workflow status=$STATUS (${RUN_URL})"
+            sleep 15
+          done
+
+          # Final check — if we exhausted retries
+          if [ -z "${STATUS:-}" ]; then
+            echo "::error::No Docker workflow run found for SHA ${SHA} after 15 minutes"
+            exit 1
+          elif [ "$STATUS" != "completed" ]; then
+            echo "::error::Docker workflow did not complete within 15 minutes (last status: $STATUS)"
+            exit 1
+          fi
+
+      # ── Resolve Docker image tag ───────────────────────────────────────
+      - name: Resolve Docker image
+        id: image
+        env:
+          # Passed via env so a dispatch-supplied tag is never templated into the script.
+          DOCKER_IMAGE_INPUT: ${{ inputs.docker_image }}
+          SOURCE_SHA: ${{ steps.ctx.outputs.sha }}
+        run: |
+          # Explicit input wins; otherwise use the amd64-specific tag (always pushed by the Docker workflow).
+          TAG="${DOCKER_IMAGE_INPUT:-ghcr.io/openhands/agent-canvas:sha-${SOURCE_SHA:0:7}-amd64}"
+          echo "tag=${TAG}" >> "$GITHUB_OUTPUT"
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Pull Docker image
+        env:  # the tag may originate from a workflow_dispatch input; keep it out of the script text
+          DOCKER_IMAGE: ${{ steps.image.outputs.tag }}
+        run: echo "Pulling ${DOCKER_IMAGE}..." && docker pull "${DOCKER_IMAGE}"
+
+      # ── Test infrastructure setup ──────────────────────────────────────
+      - name: Set up Node.js
+        uses: actions/setup-node@v6
+        with:
+          # Pin to 24.15.x — Node 24.16.0 has a zip-extraction regression
+          # (nodejs/node#63487) that hangs `playwright install` for Playwright
+          # < 1.60.0. Remove this pin after upgrading to Playwright >= 1.60.0.
+          node-version: "24.15"
+          cache: npm
+
+      - name: Install npm dependencies
+        run: npm ci
+
+      - name: Get Playwright version
+        id: pw_version
+        run: echo "version=$(npx playwright --version | awk '{print $2}')" >> "$GITHUB_OUTPUT"
+
+      - name: Cache Playwright browsers
+        id: pw_cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ms-playwright
+          key: playwright-${{ runner.os }}-${{ steps.pw_version.outputs.version }}
+
+      - name: Install Playwright Chromium
+        if: steps.pw_cache.outputs.cache-hit != 'true'
+        run: npx playwright install chromium
+
+      - name: Install Playwright system deps
+        run: npx playwright install-deps chromium
+
+      - name: Install uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      - name: Install openhands-sdk (for mock LLM server)
+        run: |
+          uv venv .mock-llm-venv
+          uv pip install -p .mock-llm-venv openhands-sdk==${{ steps.defaults.outputs.agent_server_version }}
+
+      - name: Verify mock LLM server starts
+        run: |
+          .mock-llm-venv/bin/python3 tests/e2e/mock-llm/scripts/mock-llm-server.py --port 9998 &
+          SERVER_PID=$!
+          for i in $(seq 1 30); do
+            if curl -sf http://127.0.0.1:9998/v1/chat/completions \
+              -H "Content-Type: application/json" \
+              -d '{"model":"test","messages":[]}' > /dev/null 2>&1; then
+              echo "Mock LLM server responded on attempt $i"
+              break
+            fi
+            sleep 1
+          done
+          curl -sf http://127.0.0.1:9998/v1/chat/completions \
+            -H "Content-Type: application/json" \
+            -d '{"model":"test","messages":[]}' | python3 -m json.tool
+          kill $SERVER_PID
+
+      # ── Run tests ──────────────────────────────────────────────────────
+      - name: Run mock-LLM Docker E2E tests
+        id: run_tests
+        env:
+          MOCK_LLM_PYTHON: .mock-llm-venv/bin/python3
+          MOCK_LLM_DOCKER_IMAGE: ${{ steps.image.outputs.tag }}
+        run: |
+          set +e
+          MARKER_DIR=".mock-llm-markers"
+          DONE_MARKER="$MARKER_DIR/.tests-done"
+          PASS_MARKER="$MARKER_DIR/.all-passed"
+          rm -rf "$MARKER_DIR"
+
+          # Run Playwright in background so our shell survives if we have
+          # to kill it (the Docker container teardown can hang).
+          npm run test:e2e:mock-llm:docker &
+          PW_PID=$!
+
+          # Wait up to 5 min for tests to complete.
+          deadline=$((SECONDS + 300))
+          while [ "$SECONDS" -lt "$deadline" ]; do
+            if ! kill -0 "$PW_PID" 2>/dev/null; then
+              break
+            fi
+            if [ -f "$DONE_MARKER" ]; then
+              echo "Tests completed: $(cat "$DONE_MARKER")"
+              break
+            fi
+            sleep 2
+          done
+
+          # If Playwright is still running (teardown hang), give it 5s
+          # grace then force-kill.
+          if kill -0 "$PW_PID" 2>/dev/null; then
+            sleep 5
+            if kill -0 "$PW_PID" 2>/dev/null; then
+              echo "::warning::Killing lingering Playwright process (teardown hung)"
+              kill "$PW_PID" 2>/dev/null
+              sleep 5
+              kill -9 "$PW_PID" 2>/dev/null
+            fi
+            wait "$PW_PID" 2>/dev/null
+            pw_exit=124
+          else
+            wait "$PW_PID"
+            pw_exit=$?
+          fi
+
+          echo "Playwright exited with code $pw_exit"
+
+          # When killed during teardown, the exit code is non-zero but
+          # tests may have passed.
+          if [ "$pw_exit" -ne 0 ] && [ -f "$PASS_MARKER" ]; then
+            echo "::notice::All tests passed (marker file present); non-zero exit was teardown-related"
+            pw_exit=0
+          fi
+
+          # Clean up the Docker container (belt-and-suspenders)
+          docker ps -q --filter "name=agent-canvas-mock-llm" | xargs -r docker stop 2>/dev/null || true
+
+          echo "exit_code=$pw_exit" >> "$GITHUB_OUTPUT"
+          exit 0
+
+      # ── Reporting ──────────────────────────────────────────────────────
+      - name: Upload test artifacts
+        id: upload_artifacts
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: mock-llm-docker-e2e-results
+          if-no-files-found: ignore
+          retention-days: 14
+          path: |
+            playwright-report-mock-llm-docker/
+            test-results-mock-llm-docker/
+
+      - name: Render test report
+        if: always()
+        run: |
+          node tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs \
+            --results "test-results-mock-llm-docker/results.json" \
+            --output "$MOCK_LLM_REPORT_PATH" \
+            --workflow-url "$MOCK_LLM_WORKFLOW_URL" \
+            --commit "${{ steps.ctx.outputs.sha }}" \
+            --artifact-url "${{ steps.upload_artifacts.outputs.artifact-url || '' }}" \
+            --title "Mock-LLM Docker E2E Test Results"
+          cat "$MOCK_LLM_REPORT_PATH" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Post PR comment
+        if: always() && steps.ctx.outputs.pr_number
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: |
+          gh pr comment "${{ steps.ctx.outputs.pr_number }}" \
+            --body-file "$MOCK_LLM_REPORT_PATH"
+
+      - name: Fail job when tests fail
+        if: always()
+        run: |
+          exit_code="${{ steps.run_tests.outputs.exit_code }}"
+          exit "${exit_code:-1}"
diff --git a/.github/workflows/npm-publish.yml b/.github/workflows/npm-publish.yml
index bae15a48..54354d1d 100644
--- a/.github/workflows/npm-publish.yml
+++ b/.github/workflows/npm-publish.yml
@@ -92,9 +92,11 @@ jobs:
 
       # Resolve the npm dist-tag from the version's pre-release identifier:
       #   alpha  →  --tag alpha   (e.g. 1.0.0-alpha.1)
-      #   beta   →  --tag beta    (e.g. 1.0.0-beta.1)
-      #   rc     →  --tag rc      (e.g. 1.0.0-rc.1)
+      #   beta   →  --tag latest  (e.g. 1.0.0-beta.1)
+      #   rc     →  --tag latest  (e.g. 1.0.0-rc.1)
       #   stable →  --tag latest  (e.g. 1.0.0)
+      # Beta and RC publish as `latest` so `npm install @openhands/agent-canvas`
+      # resolves to the most recent release until the first stable version ships.
       # Note: OIDC trusted-publishing tokens cover only the `npm publish` call
       # itself; a separate `npm dist-tag add` would fail with E401, so the tag
       # is resolved and passed directly in one step.
@@ -104,10 +106,6 @@ jobs:
           VERSION=$(node -p "require('./package.json').version")
           if [[ "$VERSION" == *-alpha* ]]; then
             DIST_TAG="alpha"
-          elif [[ "$VERSION" == *-beta* ]]; then
-            DIST_TAG="beta"
-          elif [[ "$VERSION" == *-rc* ]]; then
-            DIST_TAG="rc"
           else
             DIST_TAG="latest"
           fi
diff --git a/.gitignore b/.gitignore
index 97c97dcb..ea10421a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,10 +15,12 @@ __pycache__
 /test-results/
 /test-results-live/
 /test-results-mock-llm/
+/test-results-mock-llm-docker/
 /.mock-llm-markers/
 /playwright-report/
 /playwright-report-live/
 /playwright-report-mock-llm/
+/playwright-report-mock-llm-docker/
 /blob-report/
 /playwright/.cache/
 # Snapshot baselines are stored as GitHub Actions artifacts — not in git.
diff --git a/AGENTS.md b/AGENTS.md
index fcded59e..52d123b1 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -25,7 +25,7 @@
 - The block lists URLs **from the agent's point of view**:
   - The Agent Server is always reachable as `http://localhost:<port>` from inside the sandbox — but that is _you_, not the automation backend.
   - Host-side services (ingress, Vite, automation) are reachable as `http://localhost:<port>`.
-- Agents should treat the `<RUNTIME_SERVICES>` block as authoritative: don't hardcode `localhost:8000` for "the automation server", and don't probe random ports trying to discover services. If the block says automation is not running, skip `/api/automation` calls; otherwise use the listed `url_from_agent` + `api_prefix` (default `/api/automation`) and the `X-API-Key: $OPENHANDS_AUTOMATION_API_KEY` header.
+- Agents should treat the `<RUNTIME_SERVICES>` block as authoritative: don't hardcode `localhost:8000` for "the automation server", and don't probe random ports trying to discover services. If the block says automation is not running, skip `/api/automation` calls; otherwise use the listed `url_from_agent` + `api_prefix` (default `/api/automation`) and the `X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY` header.
 - The launcher → frontend → suffix plumbing is:
   - `scripts/dev-safe.mjs::buildRuntimeServicesInfo()` — pure helper that constructs the info object.
   - `scripts/dev-with-automation.mjs::buildAutomationRuntimeServicesInfo()` — wraps it with automation details; called from both Vite spawn (`startVite`) and the static build (`static-build.mjs`).
@@ -53,7 +53,7 @@ The env var is a JSON string of:
       "url_from_agent": "http://localhost:3001"
     },
     "automation": {
-      "description": "OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.",
+      "description": "OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.",
       "url_from_agent": "http://localhost:18001",
       "api_prefix": "/api/automation",
       "docs_url": "http://localhost:18001/api/automation/docs",
@@ -81,10 +81,10 @@ from your point of view (i.e., as you should curl/fetch them).
 * Frontend: http://localhost:3001
     Vite dev server hosting the agent-canvas frontend.
 * Automation backend: http://localhost:18001
-    OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.
+    OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.
     Docs:    http://localhost:18001/api/automation/docs
     OpenAPI: http://localhost:18001/api/automation/openapi.json
-    Auth:    header 'X-API-Key: $OPENHANDS_AUTOMATION_API_KEY'
+    Auth:    header 'X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY'
 
 Trust this block over guessing: do not assume any other URLs are running.
 In particular, http://localhost:18000 inside your sandbox is the Agent Server
@@ -164,6 +164,17 @@ you are running inside of — NOT the automation backend.
 - CI workflow: `.github/workflows/mock-llm-e2e.yml` runs on PRs with the `e2e-tests` label or on manual dispatch. It builds the frontend, starts the mock LLM server, runs the tests, and posts a PR comment with results.
 - The custom `DoneMarkerReporter` writes `.mock-llm-markers/.tests-done` after all tests complete (before webServer teardown) so the CI wrapper can detect completion and kill the lingering teardown process.
 
+### Docker Image Testing (Shared Specs)
+
+- The same test specs and helpers are reused to validate the Docker image via `playwright.mock-llm-docker.config.ts`. Run locally with `npm run test:e2e:mock-llm:docker` (requires Docker daemon and a built image).
+- **Architecture**: The Docker config replaces the npm path's `bin/agent-canvas.mjs` webServer with a `docker run --network host` command. The mock LLM server still runs on the host. On Linux (including CI), `--network host` lets the container share the host's network stack so all `127.0.0.1` URLs work identically. On macOS/Windows Docker Desktop (bridge networking), set `MOCK_LLM_AGENT_URL=http://host.docker.internal:<port>` so the agent-server inside Docker can reach the host-side mock LLM server.
+- **URL split**: `mock-llm-helpers.ts` exports two mock LLM URL constants:
+  - `MOCK_LLM_BASE_URL` — always `http://127.0.0.1:<port>`, used by tests for the mock LLM admin API (register/activate/reset trajectories).
+  - `MOCK_LLM_AGENT_URL` — defaults to `MOCK_LLM_BASE_URL`, overridable via `MOCK_LLM_AGENT_URL` env var. Used when configuring the LLM profile (`base_url` field) — this is the URL the agent-server uses for inference calls. The npm path and Docker-with-`--network host` path use the same value; Docker on macOS needs the override.
+- **Docker image**: Set `MOCK_LLM_DOCKER_IMAGE` to the image tag (default: `ghcr.io/openhands/agent-canvas:latest`). The container is started with `--rm --network host` and a unique `--name` for cleanup.
+- **State isolation**: The Docker container uses its internal state directory (no host mount needed for tests). Each test run starts a fresh container.
+- CI workflow: `.github/workflows/mock-llm-docker-e2e.yml` has three triggers — all pull the already-built image from GHCR (no rebuild): (1) `workflow_run` fires automatically after the `Docker` workflow completes on main; (2) `pull_request` with the `e2e-tests` label polls the Docker workflow until it finishes for the PR's head SHA, then pulls the image (needed because `workflow_run` only fires for workflow files already on the default branch); (3) `workflow_dispatch` accepts a custom `docker_image` input. The image tag is derived from the commit SHA (`ghcr.io/openhands/agent-canvas:sha-<short>-amd64`). Fork PRs are skipped (no GHCR push). Report artifacts go to `test-results-mock-llm-docker/` and `playwright-report-mock-llm-docker/`.
+
 ## Additional Notes
 
 - **Published binary auth fix**: When users install the npm package globally (`npm install -g @openhands/agent-canvas`) and run `agent-canvas`, the pre-built static frontend has a `VITE_SESSION_API_KEY` baked in at publish time that differs from the user's persisted runtime key (`~/.openhands/agent-canvas/session-api-key.txt`). The fix is to inject the runtime session key into `index.html` responses at serve time (not build time). `scripts/static-server.mjs` accepts a `--session-api-key <key>` flag and injects a tiny inline `<script>` before `</head>` that seeds the key into `localStorage['openhands-agent-server-config'].sessionApiKey` — only if no key is already stored (preserving user-set overrides). `scripts/dev-with-automation.mjs` and `scripts/dev-static.mjs` both pass `--session-api-key ${config.sessionApiKey}` when starting the static server.
diff --git a/README.md b/README.md
index 96073282..4c4daa29 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ agent-canvas
 ### Option 2: With a Docker Sandbox
 
 ```sh
-docker pull ghcr.io/openhands/agent-canvas:1.0.0-alpha.10
+docker pull ghcr.io/openhands/agent-canvas:1.0.0-beta.2
 
 export PROJECTS_PATH=~/projects  # directory containing your project folders
 
@@ -69,7 +69,7 @@ docker run -it --rm \
   -p 8000:8000 \
   -v ~/.openhands:/home/openhands/.openhands \
   -v ${PROJECTS_PATH}:/projects \
-  ghcr.io/openhands/agent-canvas:1.0.0-alpha.10
+  ghcr.io/openhands/agent-canvas:1.0.0-beta.2
 ```
 
 The agent will be able to access any project under `PROJECTS_PATH`.
diff --git a/__tests__/api/agent-server-adapter.test.ts b/__tests__/api/agent-server-adapter.test.ts
index c82028b5..f628d539 100644
--- a/__tests__/api/agent-server-adapter.test.ts
+++ b/__tests__/api/agent-server-adapter.test.ts
@@ -795,7 +795,8 @@ describe("buildRuntimeServicesSystemSuffix", () => {
     expect(suffix).toContain("http://localhost:18000");
     expect(suffix).toContain("http://localhost:18001");
     expect(suffix).toContain("http://localhost:18001/api/automation/docs");
-    expect(suffix).toContain("X-API-Key: $OPENHANDS_AUTOMATION_API_KEY");
+    expect(suffix).toContain("X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY");
+    expect(suffix).not.toContain("X-API-Key: $OPENHANDS_AUTOMATION_API_KEY");
     expect(suffix).toContain("</RUNTIME_SERVICES>");
     // The "don't guess" line should reference the actual agent-server URL
     // for this stack, not a hardcoded port. The assertion anchors on the URL
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index e02a2855..b641161c 100644
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -2,17 +2,22 @@
 # ═══════════════════════════════════════════════════════════════════════════════
 # agent-canvas all-in-one entrypoint
 #
-# Starts three services:
+# Starts three services (plus an optional fourth):
 #   1. Agent Server   on port $AGENT_SERVER_PORT  (default 18000)
 #   2. Automation     on port $AUTOMATION_PORT     (default 18001)
 #   3. Static server  on port $PORT               (default 8000)
 #      Routes /api/automation/* → automation, /api/* → agent-server,
 #      and serves the frontend static build for everything else.
+#   4. (Optional) Public-mode static server on $PUBLIC_MODE_PORT
+#      Same frontend, but with --auth-required (no baked session key).
+#      Used by auth-mode E2E tests. Only started when PUBLIC_MODE_PORT is set.
 #
 # Environment variables:
 #   PORT                 – Unified entry point port (default: 8000)
 #   AGENT_SERVER_PORT    – Internal agent-server port (default: 18000)
 #   AUTOMATION_PORT      – Internal automation port (default: 18001)
+#   PUBLIC_MODE_PORT     – If set, starts a second static server on this port
+#                          with --auth-required (no session key injected)
 #   OH_SECRET_KEY        – Secret key for settings encryption (auto-generated
 #                          and persisted if not provided)
 #   OPENHANDS_AUTOMATION_API_KEY – Override automation backend auth key
@@ -23,6 +28,16 @@
 #                          Setting this enables local-mode auth so the session
 #                          API key is validated internally instead of against the
 #                          OpenHands cloud API.
+#   FILE_STORE             – Storage backend for automation tarballs (default: local).
+#                          Without this the automation backend may fall back to
+#                          S3/GCS which fails without cloud credentials.
+#   LOCAL_STORAGE_PATH     – Directory for local file storage (default: ~/.openhands/storage)
+#   AUTOMATION_BASE_URL    – Publicly-reachable base URL for the automation
+#                          service, used in callback URLs and injected into
+#                          sandboxes (default: http://127.0.0.1:$PORT).
+#                          Override in production when the external URL differs.
+#   AUTOMATION_WORKSPACE_BASE – Directory for automation run workspaces
+#                          (default: ~/.openhands/workspaces)
 #   Any agent-server or automation env vars are passed through.
 # ═══════════════════════════════════════════════════════════════════════════════
 set -uo pipefail
@@ -146,6 +161,23 @@ log "Starting automation server on port $AUTOMATION_PORT..."
 # Disable the automation's own frontend — agent-canvas provides the UI.
 export AUTOMATION_FRONTEND_DIR=""
 
+# File storage — use local filesystem unless the user has configured cloud
+# storage.  Without FILE_STORE=local the automation backend may fall back
+# to a cloud provider (S3/GCS) which will fail without credentials, causing
+# tarball-based presets (preset/prompt, preset/plugin) to silently error.
+export FILE_STORE="${FILE_STORE:-local}"
+export LOCAL_STORAGE_PATH="${LOCAL_STORAGE_PATH:-${OPENHANDS_DIR}/storage}"
+mkdir -p "$LOCAL_STORAGE_PATH"
+
+# AUTOMATION_BASE_URL — the publicly-reachable base URL for the automation
+# service.  Appended to callback URLs and injected into each sandbox as
+# AUTOMATION_API_URL.  Defaults to the unified ingress.
+export AUTOMATION_BASE_URL="${AUTOMATION_BASE_URL:-http://127.0.0.1:${PORT}}"
+
+# AUTOMATION_WORKSPACE_BASE — where automation runs unpack tarballs.
+export AUTOMATION_WORKSPACE_BASE="${AUTOMATION_WORKSPACE_BASE:-${OPENHANDS_DIR}/workspaces}"
+mkdir -p "$AUTOMATION_WORKSPACE_BASE"
+
 # Default to SQLite so the automation server works out of the box without
 # an external PostgreSQL instance. Users can override AUTOMATION_DB_URL to
 # point at a real Postgres for production deployments.
@@ -213,6 +245,31 @@ node /opt/agent-canvas/static-server.mjs \
   --route "/openapi.json=http://127.0.0.1:${AGENT_SERVER_PORT}" &
 PIDS+=($!)
 
+# ── 5. (Optional) Public-mode static server ─────────────────────────────────
+# When PUBLIC_MODE_PORT is set, start a second static-server instance that
+# serves the same frontend WITHOUT injecting the session key into the HTML
+# (--auth-required). This is used by auth-mode E2E tests to verify the
+# ApiKeyEntryScreen gate, key rotation recovery, etc.
+if [ -n "${PUBLIC_MODE_PORT:-}" ]; then
+  log "Starting public-mode frontend on port $PUBLIC_MODE_PORT (--auth-required)..."
+  node /opt/agent-canvas/static-server.mjs \
+    --port "$PUBLIC_MODE_PORT" \
+    --host 0.0.0.0 \
+    --dir /opt/agent-canvas/frontend \
+    --auth-required \
+    --route "/api/automation=http://127.0.0.1:${AUTOMATION_PORT}" \
+    --route "/api=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/server_info=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/sockets=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/alive=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/health=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/ready=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/docs=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/redoc=http://127.0.0.1:${AGENT_SERVER_PORT}" \
+    --route "/openapi.json=http://127.0.0.1:${AGENT_SERVER_PORT}" &
+  PIDS+=($!)
+fi
+
 log "All services started. Unified entry point: http://0.0.0.0:${PORT}/"
 
 # Wait for any child to exit. If one dies, the trap will clean up the rest.
diff --git a/package-lock.json b/package-lock.json
index 630524a9..82f9febf 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@openhands/agent-canvas",
-  "version": "1.0.0-alpha.10",
+  "version": "1.0.0-beta.3",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@openhands/agent-canvas",
-      "version": "1.0.0-alpha.10",
+      "version": "1.0.0-beta.3",
       "license": "MIT",
       "dependencies": {
         "@heroui/react": "2.8.10",
diff --git a/package.json b/package.json
index 3ce4579d..b1f41184 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@openhands/agent-canvas",
-  "version": "1.0.0-alpha.10",
+  "version": "1.0.0-beta.3",
   "description": "Agent Canvas UI for OpenHands - run AI coding agents with a visual interface",
   "license": "MIT",
   "private": false,
@@ -81,6 +81,7 @@
     "test:e2e": "playwright test --pass-with-no-tests",
     "test:e2e:live": "node --env-file-if-exists=.env tests/e2e/live/scripts/run-live-e2e.mjs",
     "test:e2e:mock-llm": "playwright test --config=playwright.mock-llm.config.ts",
+    "test:e2e:mock-llm:docker": "playwright test --config=playwright.mock-llm-docker.config.ts",
     "test:e2e:snapshots": "playwright test tests/e2e/snapshots --project=chromium --retries=0",
     "test:e2e:snapshots:update": "playwright test tests/e2e/snapshots --project=chromium --update-snapshots",
     "test:coverage": "npm run make-i18n && vitest run --coverage",
diff --git a/playwright.mock-llm-docker.config.ts b/playwright.mock-llm-docker.config.ts
new file mode 100644
index 00000000..a7c4e682
--- /dev/null
+++ b/playwright.mock-llm-docker.config.ts
@@ -0,0 +1,171 @@
+/**
+ * Playwright config for mock-LLM E2E tests against a Docker container.
+ *
+ * Reuses the same test specs as playwright.mock-llm.config.ts but launches
+ * the agent-canvas stack inside a Docker container instead of via
+ * bin/agent-canvas.mjs + uvx.
+ *
+ * Starts two processes:
+ *   1. Mock LLM server (Python on the host, using openhands-sdk TestLLM)
+ *   2. Docker container running the agent-canvas all-in-one image
+ *      (agent-server + automation backend + static frontend + proxy)
+ *      The container also starts a second static-server instance on
+ *      PUBLIC_MODE_PORT with --auth-required for auth-mode E2E tests.
+ *
+ * Networking:
+ *   Uses --network host on Linux so the container shares the host's network
+ *   stack. This means the agent-server inside Docker can reach the mock LLM
+ *   server at 127.0.0.1:<port> — identical to the npm path.
+ *
+ *   For macOS/Windows (Docker Desktop with bridge networking), set
+ *   MOCK_LLM_AGENT_URL=http://host.docker.internal:<port> so the
+ *   agent-server can reach the host-side mock LLM server.
+ *
+ * Required:
+ *   - A built Docker image. Set MOCK_LLM_DOCKER_IMAGE to the image tag
+ *     (default: ghcr.io/openhands/agent-canvas:latest).
+ *   - Docker daemon must be running.
+ */
+
+import { defineConfig, devices } from "@playwright/test";
+import { randomBytes } from "node:crypto";
+
+// ── Docker image (env defaults use `||`: an empty value counts as unset) ─
+const DOCKER_IMAGE =
+  process.env.MOCK_LLM_DOCKER_IMAGE || "ghcr.io/openhands/agent-canvas:latest";
+
+// Container name for cleanup — unique per run to avoid collisions.
+const CONTAINER_NAME =
+  process.env.MOCK_LLM_CONTAINER_NAME ||
+  `agent-canvas-mock-llm-${randomBytes(4).toString("hex")}`;
+
+// ── Port allocation (separate from live E2E / dev to avoid collisions) ─
+const MOCK_LLM_PORT = process.env.MOCK_LLM_PORT || "9999";
+
+// The Docker container exposes a single port for the unified ingress.
+// With --network host this is accessible at localhost directly.
+const INGRESS_PORT = process.env.MOCK_LLM_INGRESS_PORT || "18300";
+
+// Public-mode static server — runs inside the Docker container when
+// PUBLIC_MODE_PORT is set (see docker/entrypoint.sh). With --network host
+// the port is accessible from the host at localhost directly.
+const PUBLIC_MODE_PORT = process.env.MOCK_LLM_PUBLIC_MODE_PORT || "18301";
+
+// ── Session API key (reused from the environment, else random per run) ─
+const sessionApiKey =
+  process.env.MOCK_LLM_SESSION_API_KEY?.trim() ||
+  randomBytes(32).toString("hex");
+process.env.MOCK_LLM_SESSION_API_KEY = sessionApiKey;
+
+// ── URLs ───────────────────────────────────────────────────────────────
+const INGRESS_URL = `http://localhost:${INGRESS_PORT}/`;
+const MOCK_LLM_URL = `http://127.0.0.1:${MOCK_LLM_PORT}`;
+
+// Python binary for the mock server — defaults to "python3" but CI can
+// point this at a venv (e.g. ".mock-llm-venv/bin/python3") to avoid
+// PEP 668 "externally managed" errors on Ubuntu 24.04+.
+const MOCK_LLM_PYTHON = process.env.MOCK_LLM_PYTHON || "python3";
+
+// Export for the test helpers — BACKEND_URL points to the ingress (API
+// calls are proxied to the agent-server, so no direct backend port needed).
+process.env.MOCK_LLM_BACKEND_URL = `http://localhost:${INGRESS_PORT}`;
+process.env.MOCK_LLM_PORT = MOCK_LLM_PORT;
+process.env.MOCK_LLM_PUBLIC_MODE_URL = `http://localhost:${PUBLIC_MODE_PORT}`;
+process.env.VITE_SESSION_API_KEY = sessionApiKey;
+
+// MOCK_LLM_AGENT_URL — the URL the agent-server inside Docker uses to
+// call the mock LLM for inference. With --network host on Linux the
+// agent-server can reach 127.0.0.1 directly. For macOS/Windows Docker
+// Desktop, override this to http://host.docker.internal:<port>.
+if (!process.env.MOCK_LLM_AGENT_URL) {
+  process.env.MOCK_LLM_AGENT_URL = MOCK_LLM_URL;
+}
+
+export default defineConfig({
+  testDir: "./tests/e2e/mock-llm",
+  testMatch: /.*\.spec\.ts/,
+  fullyParallel: false,
+  forbidOnly: !!process.env.CI,
+  retries: 0,
+  workers: 1,
+  timeout: 60_000,
+  globalTimeout: process.env.CI ? 600_000 : 0, // 10 min hard cap in CI
+  reporter: [
+    ["line"],
+    [
+      "json",
+      { outputFile: "test-results-mock-llm-docker/results.json" },
+    ],
+    [
+      "html",
+      {
+        outputFolder: "playwright-report-mock-llm-docker",
+        open: "never",
+      },
+    ],
+    ["./tests/e2e/mock-llm/reporters/done-marker-reporter.ts"],
+  ],
+  outputDir: "test-results-mock-llm-docker",
+  use: {
+    baseURL: INGRESS_URL,
+    screenshot: "only-on-failure",
+    trace: "retain-on-failure", // "on-first-retry" never fires: retries is 0
+    video: "on",
+  },
+  projects: [
+    {
+      name: "chromium",
+      use: { ...devices["Desktop Chrome"] },
+    },
+  ],
+  webServer: [
+    // 1. Mock LLM server (Python, on the host)
+    {
+      command: `${MOCK_LLM_PYTHON} tests/e2e/mock-llm/scripts/mock-llm-server.py --port ${MOCK_LLM_PORT}`,
+      url: MOCK_LLM_URL,
+      timeout: 30_000,
+      reuseExistingServer: !process.env.CI,
+      stdout: "pipe",
+      stderr: "pipe",
+    },
+    // 2. Docker container running the agent-canvas all-in-one image
+    //
+    // Uses --network host so the container shares the host's network:
+    //   - The ingress port is available at localhost:<INGRESS_PORT>
+    //   - The agent-server can reach the mock LLM at 127.0.0.1:<MOCK_LLM_PORT>
+    //
+    // The container is started with --rm so it is removed once it stops. It
+    // is named so a leftover can be found and removed with `docker rm -f`.
+    //
+    // Note: --network host is Linux-only. On macOS/Windows Docker Desktop,
+    // use -p port mapping and set MOCK_LLM_AGENT_URL=http://host.docker.internal:<port>.
+    {
+      command: [
+        // Remove any leftover container that still holds this name
+        `docker rm -f ${CONTAINER_NAME} 2>/dev/null;`,
+        "exec docker run",
+        "--rm",
+        `--name ${CONTAINER_NAME}`,
+        "--network host",
+        `-e PORT=${INGRESS_PORT}`,
+        `-e SESSION_API_KEY=${sessionApiKey}`,
+        `-e OH_SESSION_API_KEYS_0=${sessionApiKey}`,
+        `-e PUBLIC_MODE_PORT=${PUBLIC_MODE_PORT}`,
+        "-e VITE_DO_NOT_TRACK=1",
+        "-e VITE_ENABLE_BROWSER_TOOLS=false",
+        DOCKER_IMAGE,
+      ].join(" "),
+      // Probe the automation list endpoint through the ingress to ensure
+      // the FULL stack (agent-server + automation backend + ingress) is
+      // up before tests start. GET /api/automation/v1 returns 200 (empty
+      // list) without auth — the automation backend does not enforce
+      // session-key auth on the list endpoint.
+      url: `http://localhost:${INGRESS_PORT}/api/automation/v1`,
+      timeout: 180_000, // Docker pull + all services startup
+      reuseExistingServer: !process.env.CI,
+    },
+  ],
+  // No globalTeardown is configured. Cleanup relies on Playwright stopping
+  // the webServer process (the `exec`ed `docker run`) together with --rm.
+  // NOTE(review): a forced SIGKILL is not proxied to the container — confirm.
+});
diff --git a/scripts/dev-safe.mjs b/scripts/dev-safe.mjs
index 322a63d1..c85d40fc 100644
--- a/scripts/dev-safe.mjs
+++ b/scripts/dev-safe.mjs
@@ -782,7 +782,7 @@ export function buildRuntimeServicesInfo(options) {
       description:
         "OpenHands Automations service. All routes are mounted under " +
         `'${apiPrefix}'. Authenticate with header ` +
-        `'X-API-Key: $${authEnvVar}'.`,
+        `'X-Session-API-Key: $${authEnvVar}'.`,
       url_from_agent: baseUrl,
       api_prefix: apiPrefix,
       docs_url: `${baseUrl}${apiPrefix}/docs`,
diff --git a/src/api/agent-server-adapter.ts b/src/api/agent-server-adapter.ts
index 075cf394..d8ee2e36 100644
--- a/src/api/agent-server-adapter.ts
+++ b/src/api/agent-server-adapter.ts
@@ -199,8 +199,10 @@ export function buildRuntimeServicesSystemSuffix(): string | undefined {
       lines.push(`    OpenAPI: ${automation.openapi_url}`);
     }
     if (automation.auth_env_var) {
+      // X-Session-API-Key is the local convention shared by the agent-server
+      // and automation backend (see openhands-automation auth.py).
       lines.push(
-        `    Auth:    header 'X-API-Key: $${automation.auth_env_var}'`,
+        `    Auth:    header 'X-Session-API-Key: $${automation.auth_env_var}'`,
       );
     }
   } else {
diff --git a/src/components/features/automations/recommended-automations-launcher.tsx b/src/components/features/automations/recommended-automations-launcher.tsx
index 5ab90710..9d457d58 100644
--- a/src/components/features/automations/recommended-automations-launcher.tsx
+++ b/src/components/features/automations/recommended-automations-launcher.tsx
@@ -73,7 +73,7 @@ export function buildAutomationPrompt(
     "**Which API to use:** Create this automation using the **local** OpenHands Automations API that is running alongside this agent.",
     "- Read the Automation backend URL from the `<RUNTIME_SERVICES>` block in your system context.",
     "- Endpoint path: `POST /api/automation/v1/preset/prompt`",
-    "- Auth: `X-API-Key: $OPENHANDS_AUTOMATION_API_KEY`",
+    "- Auth: `X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY`",
     "- If no local Automation backend is listed in `<RUNTIME_SERVICES>`, stop and ask me to start the full local automation stack instead of using any remote/cloud automation API.",
   ].join("\n");
 }
diff --git a/tests/e2e/mock-llm/mock-llm-conversation.spec.ts b/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
index b4f460e6..87cfcb37 100644
--- a/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
+++ b/tests/e2e/mock-llm/mock-llm-conversation.spec.ts
@@ -24,7 +24,7 @@ import {
   BASH_TOKEN,
   REPLY_TOKEN,
   waitForAgentMessageContaining,
-  MOCK_LLM_BASE_URL,
+  MOCK_LLM_AGENT_URL,
   BACKEND_URL,
   SESSION_API_KEY,
   seedLocalStorage,
@@ -117,10 +117,12 @@ test.describe("mock-LLM agent-server conversation", () => {
     await modelInput.click();
     await modelInput.fill(MOCK_MODEL);
 
-    // Fill in base URL pointing to our mock server
+    // Fill in base URL pointing to our mock server.
+    // Use MOCK_LLM_AGENT_URL — the URL the agent-server will use for
+    // inference calls. In Docker this may differ from the host-local URL.
     const baseUrlInput = page.getByTestId("base-url-input");
     await baseUrlInput.click();
-    await baseUrlInput.fill(MOCK_LLM_BASE_URL);
+    await baseUrlInput.fill(MOCK_LLM_AGENT_URL);
 
     // Fill in a fake API key (mock server doesn't validate it)
     const apiKeyInput = page.getByTestId("llm-api-key-input");
@@ -174,31 +176,32 @@ test.describe("mock-LLM agent-server conversation", () => {
     // Click "Set as active"
     await page.getByTestId("profile-set-active").click();
 
-    // Verify the "Active" badge appears on our profile
-    // Re-find the row after the state change
-    await page.waitForTimeout(1_000); // wait for the mutation to settle
-
-    // Reload to see the persisted state
-    await page.goto("/settings/llm", { waitUntil: "domcontentloaded" });
-    await waitForTestId(page, "add-llm-profile");
-
-    const updatedRows = page.getByTestId("profile-row");
-    const updatedCount = await updatedRows.count();
-    let foundActiveBadge = false;
-
-    for (let i = 0; i < updatedCount; i++) {
-      const row = updatedRows.nth(i);
-      const text = await row.textContent();
-      if (text?.includes(PROFILE_NAME)) {
-        const badge = row.getByTestId("profile-active-badge");
-        foundActiveBadge = (await badge.count()) > 0;
-        break;
-      }
-    }
-    expect(
-      foundActiveBadge,
-      `Profile "${PROFILE_NAME}" should have an "Active" badge`,
-    ).toBe(true);
+    // Verify the "Active" badge appears on our profile.
+    // Poll with reload instead of a fixed timeout — the mutation may take
+    // more than 1s to persist on a loaded CI runner.
+    await expect
+      .poll(
+        async () => {
+          await page.goto("/settings/llm", { waitUntil: "domcontentloaded" });
+          await waitForTestId(page, "add-llm-profile");
+          const rows = page.getByTestId("profile-row");
+          const count = await rows.count();
+          for (let i = 0; i < count; i++) {
+            const row = rows.nth(i);
+            const text = await row.textContent();
+            if (text?.includes(PROFILE_NAME)) {
+              return (await row.getByTestId("profile-active-badge").count()) > 0;
+            }
+          }
+          return false;
+        },
+        {
+          message: `Profile "${PROFILE_NAME}" should have an "Active" badge`,
+          timeout: 15_000,
+          intervals: [1_000, 2_000, 3_000],
+        },
+      )
+      .toBe(true);
 
     // Verify the settings API now reflects the activated profile's LLM config
     await test.step("verify settings API reflects the active profile's model", async () => {
@@ -219,8 +222,8 @@ test.describe("mock-LLM agent-server conversation", () => {
       const llmBaseUrl = settings?.agent_settings?.llm?.base_url;
       expect(
         llmBaseUrl,
-        `Expected settings llm.base_url="${MOCK_LLM_BASE_URL}" but got "${llmBaseUrl}"`,
-      ).toBe(MOCK_LLM_BASE_URL);
+        `Expected settings llm.base_url="${MOCK_LLM_AGENT_URL}" but got "${llmBaseUrl}"`,
+      ).toBe(MOCK_LLM_AGENT_URL);
     });
   });
 
diff --git a/tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs b/tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs
index 4cb1eac4..e8c3e420 100644
--- a/tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs
+++ b/tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs
@@ -141,7 +141,7 @@ function overallIcon(status) {
 
 // ── Report rendering ───────────────────────────────────────────────────
 
-function renderReport({ tests, workflowUrl, commit, artifactUrl }) {
+function renderReport({ tests, workflowUrl, commit, artifactUrl, title }) {
   const status = overallStatus(tests);
   const icon = overallIcon(status);
   const passed = tests.filter((t) => t.status === "passed").length;
@@ -154,7 +154,7 @@ function renderReport({ tests, workflowUrl, commit, artifactUrl }) {
   const lines = [];
 
   // Header
-  lines.push(`## ${icon} Mock-LLM E2E Tests`);
+  lines.push(`## ${icon} ${title || "Mock-LLM E2E Tests"}`);
   lines.push("");
 
   // Summary line
@@ -274,6 +274,7 @@ const report = renderReport({
   workflowUrl: args.workflow_url || "",
   commit: args.commit || "",
   artifactUrl: args.artifact_url || "",
+  title: args.title || "",
 });
 
 writeFileSync(outputPath, report);
diff --git a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
index 2bcbfdce..03206c10 100644
--- a/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
+++ b/tests/e2e/mock-llm/utils/mock-llm-helpers.ts
@@ -16,7 +16,17 @@ export const BASH_COMMAND = `printf '${BASH_TOKEN}\\n'`;
 // The agent-canvas binary exposes a single ingress port; API calls are proxied
 // through it, so BACKEND_URL = ingress URL (no separate backend port).
 export const MOCK_LLM_PORT = process.env.MOCK_LLM_PORT ?? "9999";
+
+// URL tests use to hit the mock LLM admin API (always on the host).
 export const MOCK_LLM_BASE_URL = `http://127.0.0.1:${MOCK_LLM_PORT}`;
+
+// URL the agent-server uses to reach the mock LLM for inference calls.
+// In the npm path both run on the host, so this equals MOCK_LLM_BASE_URL;
+// Docker with --network host on Linux works the same way. For Docker on
+// macOS (bridge networking), set MOCK_LLM_AGENT_URL to
+// http://host.docker.internal:<port>. An empty value counts as unset.
+export const MOCK_LLM_AGENT_URL =
+  process.env.MOCK_LLM_AGENT_URL || MOCK_LLM_BASE_URL;
 export const BACKEND_URL =
   process.env.MOCK_LLM_BACKEND_URL ?? "http://localhost:18300";
 // Public-mode static server (--auth-required, no session key injected).
@@ -283,7 +293,9 @@ export async function ensureMockLLMProfile(
   request: APIRequestContext,
   model = "openai/mock-test-model",
 ) {
-  // Check if the current profile already has the mock LLM settings
+  // Check if the current profile already has the mock LLM settings.
+  // Use MOCK_LLM_AGENT_URL — this is the URL the agent-server will use to
+  // reach the mock LLM, which may differ from MOCK_LLM_BASE_URL in Docker.
   const settingsResp = await request.get(`${BACKEND_URL}/api/settings`, {
     headers: {
       "X-Session-API-Key": SESSION_API_KEY,
@@ -294,7 +306,7 @@ export async function ensureMockLLMProfile(
   if (settingsResp.ok()) {
     const settings = await settingsResp.json();
     const llm = settings?.agent_settings?.llm;
-    if (llm?.model === model && llm?.base_url === MOCK_LLM_BASE_URL) {
+    if (llm?.model === model && llm?.base_url === MOCK_LLM_AGENT_URL) {
       return; // Already configured
     }
   }
@@ -310,7 +322,7 @@ export async function ensureMockLLMProfile(
         llm: {
           model,
           api_key: "mock-api-key-for-testing",
-          base_url: MOCK_LLM_BASE_URL,
+          base_url: MOCK_LLM_AGENT_URL,
         },
       },
     },