OpenHands · tofarr · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/.github/workflows/mock-llm-docker-e2e.yml b/.github/workflows/mock-llm-docker-e2e.yml
@@ -0,0 +1,321 @@
+name: Mock-LLM Docker E2E Tests
+
+# Runs the same mock-LLM E2E test specs as mock-llm-e2e.yml, but against
+# the Docker image instead of the npm build path (bin/agent-canvas.mjs).
+#
+# Trigger chain:
+#   1. workflow_run — fires automatically after the "Docker" workflow
+#      completes on main. The image is already built/pushed to GHCR.
+#   2. pull_request — fires on PRs with the 'e2e-tests' label. Waits for
+#      the Docker workflow to finish, then pulls the image from GHCR.
+#      (workflow_run doesn't fire for new workflow files until they're on
+#      the default branch, so pull_request is needed for first-run PRs.)
+#   3. workflow_dispatch — manual trigger with a custom image tag.
+
+on:
+  # Fires whenever a run of the "Docker" workflow completes.
+  # NOTE(review): there is no `branches:` filter here, so this is not limited
+  # to main even though the header comment says "on main" — confirm the intent.
+  workflow_run:
+    workflows:
+      - Docker
+    types:
+      - completed
+  # PR events; the job-level `if` additionally requires the 'e2e-tests' label.
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - labeled
+  # Manual run, optionally against an explicitly named image.
+  workflow_dispatch:
+    inputs:
+      docker_image:
+        type: string
+        default: ''
+        description: "Docker image to test (e.g., ghcr.io/openhands/agent-canvas:sha-abc1234-amd64)"
+
+# One live run per triggering Docker run / PR / ref: a newer run in the same
+# group cancels the one still in progress.
+concurrency:
+  cancel-in-progress: true
+  group: mock-llm-docker-e2e-${{ github.event.workflow_run.id || github.event.pull_request.number || github.ref }}
+
+# Token scopes needed by the steps of this workflow.
+permissions:
+  actions: read         # look up Docker workflow runs with `gh api`
+  contents: read        # check out the repository
+  packages: read        # pull the image from GHCR
+  pull-requests: write  # post the report as a PR comment
+
+jobs:
+  mock-llm-docker-e2e:
+    # workflow_run: only run if the Docker build succeeded.
+    # pull_request: only run with the 'e2e-tests' label, skip fork PRs (no GHCR push).
+    # workflow_dispatch: always run.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'workflow_run' &&
+       github.event.workflow_run.conclusion == 'success') ||
+      (github.event_name == 'pull_request' &&
+       contains(github.event.pull_request.labels.*.name, 'e2e-tests') &&
+       !github.event.pull_request.head.repo.fork)
+    runs-on: ubuntu-24.04
+    # Budget: on the pull_request path the job may poll the Docker workflow
+    # for about 15 minutes (60 x 15 s) and then still has to pull the image,
+    # install dependencies and wait up to 5 minutes for the tests. With a
+    # 15-minute limit the job was cancelled before the poll loop's own
+    # timeout error could be reported.
+    timeout-minutes: 30
+
+    env:
+      # Markdown report file written by the render step and posted to the PR.
+      MOCK_LLM_REPORT_PATH: mock-llm-docker-report.md
+      # URL of this workflow run, passed to the report renderer.
+      MOCK_LLM_WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+    steps:
+      # ── Resolve which commit / PR to test ──────────────────────────────
+      # Outputs: sha (commit under test), ref (what to check out) and
+      # pr_number (empty when no PR is associated).
+      #
+      # Event data reaches the script through env vars instead of `${{ }}`
+      # interpolation. Expressions are expanded before the shell parses the
+      # script, so a quote inside the payload (for example a branch name in
+      # the pull_requests JSON) would otherwise terminate the string and be
+      # executed as shell code.
+      - name: Resolve source context
+        id: ctx
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          RUN_HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
+          RUN_PULL_REQUESTS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
+          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          DISPATCH_SHA: ${{ github.sha }}
+          DISPATCH_REF: ${{ github.ref }}
+        run: |
+          case "$EVENT_NAME" in
+            workflow_run)
+              sha="$RUN_HEAD_SHA"
+              ref="$RUN_HEAD_SHA"
+              # Number of the first PR attached to the triggering run, if any.
+              pr_number=$(jq -r '.[0].number // empty' <<<"$RUN_PULL_REQUESTS")
+              ;;
+            pull_request)
+              sha="$PR_HEAD_SHA"
+              ref="$PR_HEAD_SHA"
+              pr_number="$PR_NUMBER"
+              ;;
+            *)
+              # workflow_dispatch: test the ref the run was started from.
+              sha="$DISPATCH_SHA"
+              ref="$DISPATCH_REF"
+              pr_number=""
+              ;;
+          esac
+
+          {
+            echo "sha=${sha}"
+            echo "ref=${ref}"
+            echo "pr_number=${pr_number}"
+          } >> "$GITHUB_OUTPUT"
+
+      # Check out the ref resolved by the `ctx` step.
+      - uses: actions/checkout@v6
+        name: Check out repository
+        with:
+          ref: "${{ steps.ctx.outputs.ref }}"
+
+      # Expose versions.agentServer from config/defaults.json as the
+      # `agent_server_version` step output.
+      - name: Read defaults from config/defaults.json
+        id: defaults
+        run: |
+          # Assign first so a node failure aborts the step under `bash -e`.
+          # Inside `echo "...$(...)"` the failure was ignored and an empty
+          # version was written to the output.
+          agent_server_version=$(node -p "require('./config/defaults.json').versions.agentServer")
+          # `node -p` prints "undefined" when the key is absent.
+          if [ -z "$agent_server_version" ] || [ "$agent_server_version" = "undefined" ]; then
+            echo "::error::versions.agentServer is missing from config/defaults.json"
+            exit 1
+          fi
+          echo "agent_server_version=${agent_server_version}" >> "$GITHUB_OUTPUT"
+
+      # ── Wait for Docker workflow (pull_request trigger only) ────────────
+      # When triggered by pull_request, the Docker image may still be
+      # building. Poll the Docker workflow until it completes for this SHA.
+      # Fails the step when that run concludes with anything but success, or
+      # when no completed run is seen within the polling window.
+      - name: Wait for Docker workflow to complete
+        if: github.event_name == 'pull_request'
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+          SHA: ${{ steps.ctx.outputs.sha }}
+          REPO: ${{ github.repository }}
+        run: |
+          echo "Waiting for Docker workflow to complete for SHA ${SHA}..."
+
+          # Set explicitly so the final check does not depend on whether the
+          # loop body ever assigned it.
+          STATUS=""
+
+          # 60 attempts with a 15 s pause each: roughly 15 minutes.
+          for i in $(seq 1 60); do
+            # Find the Docker workflow run for this exact commit. stderr is
+            # left visible so an auth or rate-limit failure is not mistaken
+            # for "no run yet".
+            if ! RUN=$(gh api \
+              "/repos/${REPO}/actions/workflows/docker.yml/runs?head_sha=${SHA}&per_page=1" \
+              --jq '.workflow_runs[0] // empty'); then
+              echo "::warning::Attempt $i: GitHub API request failed; retrying..."
+              sleep 15
+              continue
+            fi
+
+            if [ -z "$RUN" ]; then
+              echo "  Attempt $i: No Docker workflow run found yet for ${SHA}..."
+              sleep 15
+              continue
+            fi
+
+            STATUS=$(echo "$RUN" | jq -r '.status')
+            CONCLUSION=$(echo "$RUN" | jq -r '.conclusion // empty')
+            RUN_URL=$(echo "$RUN" | jq -r '.html_url')
+
+            if [ "$STATUS" = "completed" ]; then
+              if [ "$CONCLUSION" = "success" ]; then
+                echo "Docker workflow completed successfully: $RUN_URL"
+                break
+              else
+                echo "::error::Docker workflow finished with conclusion '$CONCLUSION': $RUN_URL"
+                exit 1
+              fi
+            fi
+
+            echo "  Attempt $i: Docker workflow status=$STATUS (${RUN_URL})"
+            sleep 15
+          done
+
+          # Final check — if we exhausted retries
+          if [ -z "$STATUS" ]; then
+            echo "::error::No Docker workflow run found for SHA ${SHA} after 15 minutes"
+            exit 1
+          elif [ "$STATUS" != "completed" ]; then
+            echo "::error::Docker workflow did not complete within 15 minutes (last status: $STATUS)"
+            exit 1
+          fi
+
+      # ── Resolve Docker image tag ───────────────────────────────────────
+      # Output `tag`: the manually supplied image if given, otherwise the
+      # per-commit amd64 tag derived from the resolved SHA.
+      # The user-supplied input is read from an env var, never interpolated
+      # into the script, so it cannot inject shell code.
+      - name: Resolve Docker image
+        id: image
+        env:
+          DOCKER_IMAGE_INPUT: ${{ inputs.docker_image }}
+          SHA: ${{ steps.ctx.outputs.sha }}
+        run: |
+          if [ -n "$DOCKER_IMAGE_INPUT" ]; then
+            tag="$DOCKER_IMAGE_INPUT"
+          else
+            # Use the amd64-specific tag (always pushed by the Docker workflow).
+            # The tag embeds the first 7 characters of the commit SHA.
+            tag="ghcr.io/openhands/agent-canvas:sha-${SHA:0:7}-amd64"
+          fi
+          echo "tag=${tag}" >> "$GITHUB_OUTPUT"
+
+      # Authenticate Docker against ghcr.io with the workflow token so the
+      # image can be pulled.
+      - uses: docker/login-action@v4
+        name: Log in to GHCR
+        with:
+          registry: ghcr.io
+          password: ${{ secrets.GITHUB_TOKEN }}
+          username: ${{ github.actor }}
+
+      # Pull the resolved image. The tag may come from a manual input, so it
+      # is passed through an env var rather than interpolated into the script.
+      - name: Pull Docker image
+        env:
+          IMAGE: ${{ steps.image.outputs.tag }}
+        run: |
+          echo "Pulling ${IMAGE}..."
+          docker pull "$IMAGE"
+
+      # ── Test infrastructure setup ──────────────────────────────────────
+      - uses: actions/setup-node@v6
+        name: Set up Node.js
+        with:
+          cache: npm
+          # Stay on 24.15.x: Node 24.16.0 carries a zip-extraction regression
+          # (nodejs/node#63487) that makes `playwright install` hang for
+          # Playwright < 1.60.0. Drop the pin once Playwright is >= 1.60.0.
+          node-version: '24.15'
+
+      # Install the JavaScript dependencies of the repository.
+      - name: Install npm dependencies
+        run: npm ci
+
+      # The installed Playwright version becomes part of the browser cache key.
+      - id: pw_version
+        name: Get Playwright version
+        run: |
+          printf 'version=%s\n' "$(npx playwright --version | awk '{print $2}')" >> "$GITHUB_OUTPUT"
+
+      # Reuse downloaded browsers across runs, keyed by OS and Playwright version.
+      - id: pw_cache
+        name: Cache Playwright browsers
+        uses: actions/cache@v4
+        with:
+          key: playwright-${{ runner.os }}-${{ steps.pw_version.outputs.version }}
+          path: ~/.cache/ms-playwright
+
+      # Download Chromium only when the cache did not restore it.
+      - name: Install Playwright Chromium
+        if: steps.pw_cache.outputs.cache-hit != 'true'
+        run: |
+          npx playwright install chromium
+
+      # Runs on every job, regardless of whether the browser cache was hit.
+      - name: Install Playwright system deps
+        run: |
+          npx playwright install-deps chromium
+
+      # Install uv through its install script and add ~/.local/bin to the
+      # PATH of the following steps (via $GITHUB_PATH).
+      - name: Install uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      # Create the .mock-llm-venv virtualenv and install openhands-sdk pinned
+      # to the agent-server version exported by the `defaults` step.
+      - name: Install openhands-sdk (for mock LLM server)
+        run: |
+          uv venv .mock-llm-venv
+          uv pip install -p .mock-llm-venv openhands-sdk==${{ steps.defaults.outputs.agent_server_version }}
+
+      # Smoke test: start the mock LLM server on port 9998, wait up to 30
+      # one-second attempts for the chat-completions endpoint to answer,
+      # then require a valid JSON response.
+      - name: Verify mock LLM server starts
+        run: |
+          .mock-llm-venv/bin/python3 tests/e2e/mock-llm/scripts/mock-llm-server.py --port 9998 &
+          SERVER_PID=$!
+          # Always stop the background server, including when the final check
+          # fails and `bash -e` aborts the script before its last line.
+          trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT
+
+          # Single definition of the request used for both polling and the
+          # final verification.
+          probe() {
+            curl -sf http://127.0.0.1:9998/v1/chat/completions \
+              -H "Content-Type: application/json" \
+              -d '{"model":"test","messages":[]}'
+          }
+
+          for i in $(seq 1 30); do
+            if probe > /dev/null 2>&1; then
+              echo "Mock LLM server responded on attempt $i"
+              break
+            fi
+            sleep 1
+          done
+
+          # Fails the step if the server never came up or returned non-JSON.
+          probe | python3 -m json.tool
+
+      # ── Run tests ──────────────────────────────────────────────────────
+      # Runs the Playwright suite against the pulled image and records the
+      # outcome in the `exit_code` output. The step itself always exits 0 so
+      # the reporting steps run; the final step fails the job from exit_code.
+      - name: Run mock-LLM Docker E2E tests
+        id: run_tests
+        env:
+          MOCK_LLM_PYTHON: .mock-llm-venv/bin/python3
+          MOCK_LLM_DOCKER_IMAGE: ${{ steps.image.outputs.tag }}
+        run: |
+          # Errors are handled manually below; do not abort on first failure.
+          set +e
+          MARKER_DIR=".mock-llm-markers"
+          DONE_MARKER="$MARKER_DIR/.tests-done"
+          PASS_MARKER="$MARKER_DIR/.all-passed"
+          rm -rf "$MARKER_DIR"
+
+          # Run Playwright in background so our shell survives if we have
+          # to kill it (the Docker container teardown can hang).
+          npm run test:e2e:mock-llm:docker &
+          PW_PID=$!
+
+          # Wait up to 5 min for tests to complete.
+          deadline=$((SECONDS + 300))
+          tests_done=false
+          while [ "$SECONDS" -lt "$deadline" ]; do
+            if ! kill -0 "$PW_PID" 2>/dev/null; then
+              break
+            fi
+            if [ -f "$DONE_MARKER" ]; then
+              echo "Tests completed: $(cat "$DONE_MARKER")"
+              tests_done=true
+              break
+            fi
+            sleep 2
+          done
+
+          # If Playwright is still running, give it 5s grace to exit by itself.
+          if kill -0 "$PW_PID" 2>/dev/null; then
+            sleep 5
+          fi
+
+          if kill -0 "$PW_PID" 2>/dev/null; then
+            # Still alive after the grace period: force it down. Say which
+            # phase was stuck so a test timeout is not reported as teardown.
+            if [ "$tests_done" = true ]; then
+              echo "::warning::Killing lingering Playwright process (teardown hung)"
+            else
+              echo "::warning::Killing Playwright process (tests did not finish within 5 minutes)"
+            fi
+            kill "$PW_PID" 2>/dev/null
+            sleep 5
+            kill -9 "$PW_PID" 2>/dev/null
+            wait "$PW_PID" 2>/dev/null
+            pw_exit=124
+          else
+            # Exited by itself (possibly during the grace period): keep its
+            # real exit code instead of overwriting it with 124.
+            wait "$PW_PID"
+            pw_exit=$?
+          fi
+
+          echo "Playwright exited with code $pw_exit"
+
+          # When killed during teardown, the exit code is non-zero but
+          # tests may have passed.
+          if [ "$pw_exit" -ne 0 ] && [ -f "$PASS_MARKER" ]; then
+            echo "::notice::All tests passed (marker file present); non-zero exit was teardown-related"
+            pw_exit=0
+          fi
+
+          # Clean up the Docker container (belt-and-suspenders)
+          docker ps -q --filter "name=agent-canvas-mock-llm" | xargs -r docker stop 2>/dev/null || true
+
+          echo "exit_code=$pw_exit" >> "$GITHUB_OUTPUT"
+          exit 0
+
+      # ── Reporting ──────────────────────────────────────────────────────
+      # Keep the Playwright report and raw results for 14 days; a run that
+      # produced neither is not treated as an error.
+      - id: upload_artifacts
+        name: Upload test artifacts
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: mock-llm-docker-e2e-results
+          path: |
+            playwright-report-mock-llm-docker/
+            test-results-mock-llm-docker/
+          retention-days: 14
+          if-no-files-found: ignore
+
+      # Render results.json into the Markdown report and append the report to
+      # the job summary.
+      - name: Render test report
+        if: always()
+        env:
+          REPORT_COMMIT: ${{ steps.ctx.outputs.sha }}
+          REPORT_ARTIFACT_URL: ${{ steps.upload_artifacts.outputs.artifact-url || '' }}
+        run: |
+          render_args=(
+            --results "test-results-mock-llm-docker/results.json"
+            --output "$MOCK_LLM_REPORT_PATH"
+            --workflow-url "$MOCK_LLM_WORKFLOW_URL"
+            --commit "$REPORT_COMMIT"
+            --artifact-url "$REPORT_ARTIFACT_URL"
+            --title "Mock-LLM Docker E2E Test Results"
+          )
+          node tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs "${render_args[@]}"
+          cat "$MOCK_LLM_REPORT_PATH" >> "$GITHUB_STEP_SUMMARY"
+
+      # Post the rendered report on the associated PR; skipped when the `ctx`
+      # step resolved no PR number.
+      - name: Post PR comment
+        if: always() && steps.ctx.outputs.pr_number
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+          TARGET_PR: ${{ steps.ctx.outputs.pr_number }}
+        run: |
+          gh pr comment "$TARGET_PR" --body-file "$MOCK_LLM_REPORT_PATH"
+
+      # Turn the recorded test outcome into the job result. A missing
+      # exit_code (the test step never produced one) counts as failure.
+      - name: Fail job when tests fail
+        if: always()
+        env:
+          RECORDED_EXIT_CODE: ${{ steps.run_tests.outputs.exit_code }}
+        run: |
+          exit "${RECORDED_EXIT_CODE:-1}"
diff --git a/.github/workflows/npm-publish.yml b/.github/workflows/npm-publish.yml
@@ -92,9 +92,11 @@ jobs:
 
       # Resolve the npm dist-tag from the version's pre-release identifier:
       #   alpha  →  --tag alpha   (e.g. 1.0.0-alpha.1)
-      #   beta   →  --tag beta    (e.g. 1.0.0-beta.1)
-      #   rc     →  --tag rc      (e.g. 1.0.0-rc.1)
+      #   beta   →  --tag latest  (e.g. 1.0.0-beta.1)
+      #   rc     →  --tag latest  (e.g. 1.0.0-rc.1)
       #   stable →  --tag latest  (e.g. 1.0.0)
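+      # Beta and RC publish as `latest` so `npm install @openhands/agent-canvas`
+      # resolves to the most recent release until the first stable version ships.
+      # Once a stable version exists, give beta/rc their own dist-tags again;
+      # otherwise a later pre-release would replace the stable one as `latest`.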
       # Note: OIDC trusted-publishing tokens cover only the `npm publish` call
       # itself; a separate `npm dist-tag add` would fail with E401, so the tag
       # is resolved and passed directly in one step.
@@ -104,10 +106,6 @@ jobs:
           VERSION=$(node -p "require('./package.json').version")
           if [[ "$VERSION" == *-alpha* ]]; then
             DIST_TAG="alpha"
-          elif [[ "$VERSION" == *-beta* ]]; then
-            DIST_TAG="beta"
-          elif [[ "$VERSION" == *-rc* ]]; then
-            DIST_TAG="rc"
           else
             DIST_TAG="latest"
           fi

diff --git a/.gitignore b/.gitignore
@@ -15,10 +15,12 @@ __pycache__
 /test-results/
 /test-results-live/
 /test-results-mock-llm/
+/test-results-mock-llm-docker/
 /.mock-llm-markers/
 /playwright-report/
 /playwright-report-live/
 /playwright-report-mock-llm/
+/playwright-report-mock-llm-docker/
 /blob-report/
 /playwright/.cache/
 # Snapshot baselines are stored as GitHub Actions artifacts — not in git.