diff --git a/.github/workflows/mock-llm-docker-e2e.yml b/.github/workflows/mock-llm-docker-e2e.yml new file mode 100644 index 00000000..1915e629 --- /dev/null +++ b/.github/workflows/mock-llm-docker-e2e.yml @@ -0,0 +1,321 @@ +name: Mock-LLM Docker E2E Tests + +# Runs the same mock-LLM E2E test specs as mock-llm-e2e.yml, but against +# the Docker image instead of the npm build path (bin/agent-canvas.mjs). +# +# Trigger chain: +# 1. workflow_run — fires automatically after the "Docker" workflow +# completes on main. The image is already built/pushed to GHCR. +# 2. pull_request — fires on PRs with the 'e2e-tests' label. Waits for +# the Docker workflow to finish, then pulls the image from GHCR. +# (workflow_run doesn't fire for new workflow files until they're on +# the default branch, so pull_request is needed for first-run PRs.) +# 3. workflow_dispatch — manual trigger with a custom image tag. + +on: + workflow_run: + workflows: ["Docker"] + types: [completed] + pull_request: + types: [opened, synchronize, reopened, labeled] + workflow_dispatch: + inputs: + docker_image: + description: 'Docker image to test (e.g., ghcr.io/openhands/agent-canvas:sha-abc1234-amd64)' + type: string + default: "" + +concurrency: + group: mock-llm-docker-e2e-${{ github.event.workflow_run.id || github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + packages: read + pull-requests: write + actions: read + +jobs: + mock-llm-docker-e2e: + # workflow_run: only run if the Docker build succeeded. + # pull_request: only run with the 'e2e-tests' label, skip fork PRs (no GHCR push). + # workflow_dispatch: always run. 
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'workflow_run' &&
+       github.event.workflow_run.conclusion == 'success') ||
+      (github.event_name == 'pull_request' &&
+       contains(github.event.pull_request.labels.*.name, 'e2e-tests') &&
+       !github.event.pull_request.head.repo.fork)
+    runs-on: ubuntu-24.04
+    # The pull_request path can spend up to 15 minutes polling the Docker
+    # workflow before the tests (capped at 5 minutes) even start, so the job
+    # budget must be larger than that wait window; with a 15-minute budget
+    # the poll loop's own timeout errors could never be reached.
+    timeout-minutes: 30
+
+    env:
+      MOCK_LLM_REPORT_PATH: mock-llm-docker-report.md
+      MOCK_LLM_WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+    steps:
+      # ── Resolve which commit / PR to test ──────────────────────────────
+      # Outputs:
+      #   sha       — commit whose Docker image is tested
+      #   ref       — ref handed to actions/checkout
+      #   pr_number — PR to comment on (empty when there is none)
+      # Event data is handed to the script through env vars instead of being
+      # interpolated into the shell source: the workflow_run pull_requests
+      # JSON contains PR branch names, and a branch name may contain a single
+      # quote, which would otherwise break out of the quoted string.
+      - name: Resolve source context
+        id: ctx
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          RUN_HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
+          RUN_PULL_REQUESTS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
+          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+          PR_EVENT_NUMBER: ${{ github.event.pull_request.number }}
+          DEFAULT_SHA: ${{ github.sha }}
+          DEFAULT_REF: ${{ github.ref }}
+        run: |
+          if [ "$EVENT_NAME" = "workflow_run" ]; then
+            sha="$RUN_HEAD_SHA"
+            ref="$RUN_HEAD_SHA"
+            # First associated PR, if any; empty output when the list is empty.
+            pr_number=$(printf '%s' "$RUN_PULL_REQUESTS" | jq -r '.[0].number // empty')
+          elif [ "$EVENT_NAME" = "pull_request" ]; then
+            sha="$PR_HEAD_SHA"
+            ref="$PR_HEAD_SHA"
+            pr_number="$PR_EVENT_NUMBER"
+          else
+            sha="$DEFAULT_SHA"
+            ref="$DEFAULT_REF"
+            pr_number=""
+          fi
+
+          {
+            echo "sha=${sha}"
+            echo "ref=${ref}"
+            echo "pr_number=${pr_number}"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Check out repository
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ steps.ctx.outputs.ref }}
+
+      # The openhands-sdk version installed for the mock LLM server is read
+      # from config/defaults.json (versions.agentServer).
+      - name: Read defaults from config/defaults.json
+        id: defaults
+        run: |
+          echo "agent_server_version=$(node -p "require('./config/defaults.json').versions.agentServer")" >> "$GITHUB_OUTPUT"
+
+      # ── Wait for Docker workflow (pull_request trigger only) ────────────
+      # When triggered by pull_request, the Docker image may still be
+      # building. Poll the Docker workflow until it completes for this SHA.
+      - name: Wait for Docker workflow to complete
+        if: github.event_name == 'pull_request'
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: |
+          SHA="${{ steps.ctx.outputs.sha }}"
+          echo "Waiting for Docker workflow to complete for SHA ${SHA}..."
+
+          # 60 attempts with a 15 s sleep between them: roughly 15 minutes.
+          for i in $(seq 1 60); do
+            # Find the Docker workflow run for this exact commit
+            RUN=$(gh api \
+              "/repos/${{ github.repository }}/actions/workflows/docker.yml/runs?head_sha=${SHA}&per_page=1" \
+              --jq '.workflow_runs[0] // empty' 2>/dev/null || echo "")
+
+            # An API error and "no run yet" both leave RUN empty; retry either way.
+            if [ -z "$RUN" ]; then
+              echo " Attempt $i: No Docker workflow run found yet for ${SHA}..."
+              sleep 15
+              continue
+            fi
+
+            STATUS=$(echo "$RUN" | jq -r '.status')
+            CONCLUSION=$(echo "$RUN" | jq -r '.conclusion // empty')
+            RUN_URL=$(echo "$RUN" | jq -r '.html_url')
+
+            if [ "$STATUS" = "completed" ]; then
+              if [ "$CONCLUSION" = "success" ]; then
+                echo "Docker workflow completed successfully: $RUN_URL"
+                break
+              else
+                echo "::error::Docker workflow finished with conclusion '$CONCLUSION': $RUN_URL"
+                exit 1
+              fi
+            fi
+
+            echo " Attempt $i: Docker workflow status=$STATUS (${RUN_URL})"
+            sleep 15
+          done
+
+          # Final check — if we exhausted retries
+          # STATUS is only ever set once a run has been found, so an unset
+          # STATUS means no run appeared at all.
+          if [ -z "${STATUS:-}" ]; then
+            echo "::error::No Docker workflow run found for SHA ${SHA} after 15 minutes"
+            exit 1
+          elif [ "$STATUS" != "completed" ]; then
+            echo "::error::Docker workflow did not complete within 15 minutes (last status: $STATUS)"
+            exit 1
+          fi
+
+      # ── Resolve Docker image tag ───────────────────────────────────────
+      # Output: tag — the image reference to pull and test.
+      # The workflow_dispatch input is read from an env var and validated
+      # instead of being interpolated into the script, so a crafted value
+      # cannot inject shell code here, add extra step outputs, or reach the
+      # later steps that consume the tag.
+      - name: Resolve Docker image
+        id: image
+        env:
+          DOCKER_IMAGE_INPUT: ${{ inputs.docker_image }}
+        run: |
+          if [ -n "$DOCKER_IMAGE_INPUT" ]; then
+            # Image references only use letters, digits and . _ / : @ -
+            case "$DOCKER_IMAGE_INPUT" in
+              *[!A-Za-z0-9._/:@-]*)
+                echo "::error::docker_image input contains characters that are not valid in an image reference"
+                exit 1
+                ;;
+            esac
+            echo "tag=${DOCKER_IMAGE_INPUT}" >> "$GITHUB_OUTPUT"
+          else
+            SHORT_SHA=$(echo "${{ steps.ctx.outputs.sha }}" | cut -c1-7)
+            # Use the amd64-specific tag (always pushed by the Docker workflow).
+ echo "tag=ghcr.io/openhands/agent-canvas:sha-${SHORT_SHA}-amd64" >> "$GITHUB_OUTPUT" + fi + + - name: Log in to GHCR + uses: docker/login-action@v4 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull Docker image + run: | + echo "Pulling ${{ steps.image.outputs.tag }}..." + docker pull "${{ steps.image.outputs.tag }}" + + # ── Test infrastructure setup ────────────────────────────────────── + - name: Set up Node.js + uses: actions/setup-node@v6 + with: + # Pin to 24.15.x — Node 24.16.0 has a zip-extraction regression + # (nodejs/node#63487) that hangs `playwright install` for Playwright + # < 1.60.0. Remove this pin after upgrading to Playwright >= 1.60.0. + node-version: "24.15" + cache: npm + + - name: Install npm dependencies + run: npm ci + + - name: Get Playwright version + id: pw_version + run: echo "version=$(npx playwright --version | awk '{print $2}')" >> "$GITHUB_OUTPUT" + + - name: Cache Playwright browsers + id: pw_cache + uses: actions/cache@v4 + with: + path: ~/.cache/ms-playwright + key: playwright-${{ runner.os }}-${{ steps.pw_version.outputs.version }} + + - name: Install Playwright Chromium + if: steps.pw_cache.outputs.cache-hit != 'true' + run: npx playwright install chromium + + - name: Install Playwright system deps + run: npx playwright install-deps chromium + + - name: Install uv + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + - name: Install openhands-sdk (for mock LLM server) + run: | + uv venv .mock-llm-venv + uv pip install -p .mock-llm-venv openhands-sdk==${{ steps.defaults.outputs.agent_server_version }} + + - name: Verify mock LLM server starts + run: | + .mock-llm-venv/bin/python3 tests/e2e/mock-llm/scripts/mock-llm-server.py --port 9998 & + SERVER_PID=$! 
+ for i in $(seq 1 30); do + if curl -sf http://127.0.0.1:9998/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"test","messages":[]}' > /dev/null 2>&1; then + echo "Mock LLM server responded on attempt $i" + break + fi + sleep 1 + done + curl -sf http://127.0.0.1:9998/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"test","messages":[]}' | python3 -m json.tool + kill $SERVER_PID + + # ── Run tests ────────────────────────────────────────────────────── + - name: Run mock-LLM Docker E2E tests + id: run_tests + env: + MOCK_LLM_PYTHON: .mock-llm-venv/bin/python3 + MOCK_LLM_DOCKER_IMAGE: ${{ steps.image.outputs.tag }} + run: | + set +e + MARKER_DIR=".mock-llm-markers" + DONE_MARKER="$MARKER_DIR/.tests-done" + PASS_MARKER="$MARKER_DIR/.all-passed" + rm -rf "$MARKER_DIR" + + # Run Playwright in background so our shell survives if we have + # to kill it (the Docker container teardown can hang). + npm run test:e2e:mock-llm:docker & + PW_PID=$! + + # Wait up to 5 min for tests to complete. + deadline=$((SECONDS + 300)) + while [ "$SECONDS" -lt "$deadline" ]; do + if ! kill -0 "$PW_PID" 2>/dev/null; then + break + fi + if [ -f "$DONE_MARKER" ]; then + echo "Tests completed: $(cat "$DONE_MARKER")" + break + fi + sleep 2 + done + + # If Playwright is still running (teardown hang), give it 5s + # grace then force-kill. + if kill -0 "$PW_PID" 2>/dev/null; then + sleep 5 + if kill -0 "$PW_PID" 2>/dev/null; then + echo "::warning::Killing lingering Playwright process (teardown hung)" + kill "$PW_PID" 2>/dev/null + sleep 5 + kill -9 "$PW_PID" 2>/dev/null + fi + wait "$PW_PID" 2>/dev/null + pw_exit=124 + else + wait "$PW_PID" + pw_exit=$? + fi + + echo "Playwright exited with code $pw_exit" + + # When killed during teardown, the exit code is non-zero but + # tests may have passed. 
+ if [ "$pw_exit" -ne 0 ] && [ -f "$PASS_MARKER" ]; then + echo "::notice::All tests passed (marker file present); non-zero exit was teardown-related" + pw_exit=0 + fi + + # Clean up the Docker container (belt-and-suspenders) + docker ps -q --filter "name=agent-canvas-mock-llm" | xargs -r docker stop 2>/dev/null || true + + echo "exit_code=$pw_exit" >> "$GITHUB_OUTPUT" + exit 0 + + # ── Reporting ────────────────────────────────────────────────────── + - name: Upload test artifacts + id: upload_artifacts + if: always() + uses: actions/upload-artifact@v7 + with: + name: mock-llm-docker-e2e-results + if-no-files-found: ignore + retention-days: 14 + path: | + playwright-report-mock-llm-docker/ + test-results-mock-llm-docker/ + + - name: Render test report + if: always() + run: | + node tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs \ + --results "test-results-mock-llm-docker/results.json" \ + --output "$MOCK_LLM_REPORT_PATH" \ + --workflow-url "$MOCK_LLM_WORKFLOW_URL" \ + --commit "${{ steps.ctx.outputs.sha }}" \ + --artifact-url "${{ steps.upload_artifacts.outputs.artifact-url || '' }}" \ + --title "Mock-LLM Docker E2E Test Results" + cat "$MOCK_LLM_REPORT_PATH" >> "$GITHUB_STEP_SUMMARY" + + - name: Post PR comment + if: always() && steps.ctx.outputs.pr_number + env: + GITHUB_TOKEN: ${{ github.token }} + run: | + gh pr comment "${{ steps.ctx.outputs.pr_number }}" \ + --body-file "$MOCK_LLM_REPORT_PATH" + + - name: Fail job when tests fail + if: always() + run: | + exit_code="${{ steps.run_tests.outputs.exit_code }}" + exit "${exit_code:-1}" diff --git a/.github/workflows/npm-publish.yml b/.github/workflows/npm-publish.yml index bae15a48..54354d1d 100644 --- a/.github/workflows/npm-publish.yml +++ b/.github/workflows/npm-publish.yml @@ -92,9 +92,11 @@ jobs: # Resolve the npm dist-tag from the version's pre-release identifier: # alpha → --tag alpha (e.g. 1.0.0-alpha.1) - # beta → --tag beta (e.g. 1.0.0-beta.1) - # rc → --tag rc (e.g. 
1.0.0-rc.1) + # beta → --tag latest (e.g. 1.0.0-beta.1) + # rc → --tag latest (e.g. 1.0.0-rc.1) # stable → --tag latest (e.g. 1.0.0) + # Beta and RC publish as `latest` so `npm install @openhands/agent-canvas` + # resolves to the most recent release until the first stable version ships. # Note: OIDC trusted-publishing tokens cover only the `npm publish` call # itself; a separate `npm dist-tag add` would fail with E401, so the tag # is resolved and passed directly in one step. @@ -104,10 +106,6 @@ jobs: VERSION=$(node -p "require('./package.json').version") if [[ "$VERSION" == *-alpha* ]]; then DIST_TAG="alpha" - elif [[ "$VERSION" == *-beta* ]]; then - DIST_TAG="beta" - elif [[ "$VERSION" == *-rc* ]]; then - DIST_TAG="rc" else DIST_TAG="latest" fi diff --git a/.gitignore b/.gitignore index 97c97dcb..ea10421a 100644 --- a/.gitignore +++ b/.gitignore @@ -15,10 +15,12 @@ __pycache__ /test-results/ /test-results-live/ /test-results-mock-llm/ +/test-results-mock-llm-docker/ /.mock-llm-markers/ /playwright-report/ /playwright-report-live/ /playwright-report-mock-llm/ +/playwright-report-mock-llm-docker/ /blob-report/ /playwright/.cache/ # Snapshot baselines are stored as GitHub Actions artifacts — not in git. diff --git a/AGENTS.md b/AGENTS.md index fcded59e..52d123b1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,7 +25,7 @@ - The block lists URLs **from the agent's point of view**: - The Agent Server is always reachable as `http://localhost:` from inside the sandbox — but that is _you_, not the automation backend. - Host-side services (ingress, Vite, automation) are reachable as `http://localhost:`. -- Agents should treat the `` block as authoritative: don't hardcode `localhost:8000` for "the automation server", and don't probe random ports trying to discover services. 
If the block says automation is not running, skip `/api/automation` calls; otherwise use the listed `url_from_agent` + `api_prefix` (default `/api/automation`) and the `X-API-Key: $OPENHANDS_AUTOMATION_API_KEY` header. +- Agents should treat the `` block as authoritative: don't hardcode `localhost:8000` for "the automation server", and don't probe random ports trying to discover services. If the block says automation is not running, skip `/api/automation` calls; otherwise use the listed `url_from_agent` + `api_prefix` (default `/api/automation`) and the `X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY` header. - The launcher → frontend → suffix plumbing is: - `scripts/dev-safe.mjs::buildRuntimeServicesInfo()` — pure helper that constructs the info object. - `scripts/dev-with-automation.mjs::buildAutomationRuntimeServicesInfo()` — wraps it with automation details; called from both Vite spawn (`startVite`) and the static build (`static-build.mjs`). @@ -53,7 +53,7 @@ The env var is a JSON string of: "url_from_agent": "http://localhost:3001" }, "automation": { - "description": "OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.", + "description": "OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY'.", "url_from_agent": "http://localhost:18001", "api_prefix": "/api/automation", "docs_url": "http://localhost:18001/api/automation/docs", @@ -81,10 +81,10 @@ from your point of view (i.e., as you should curl/fetch them). * Frontend: http://localhost:3001 Vite dev server hosting the agent-canvas frontend. * Automation backend: http://localhost:18001 - OpenHands Automations service. All routes are mounted under '/api/automation'. Authenticate with header 'X-API-Key: $OPENHANDS_AUTOMATION_API_KEY'. + OpenHands Automations service. All routes are mounted under '/api/automation'. 
Authenticate with header 'X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY'. Docs: http://localhost:18001/api/automation/docs OpenAPI: http://localhost:18001/api/automation/openapi.json - Auth: header 'X-API-Key: $OPENHANDS_AUTOMATION_API_KEY' + Auth: header 'X-Session-API-Key: $OPENHANDS_AUTOMATION_API_KEY' Trust this block over guessing: do not assume any other URLs are running. In particular, http://localhost:18000 inside your sandbox is the Agent Server @@ -164,6 +164,17 @@ you are running inside of — NOT the automation backend. - CI workflow: `.github/workflows/mock-llm-e2e.yml` runs on PRs with the `e2e-tests` label or on manual dispatch. It builds the frontend, starts the mock LLM server, runs the tests, and posts a PR comment with results. - The custom `DoneMarkerReporter` writes `.mock-llm-markers/.tests-done` after all tests complete (before webServer teardown) so the CI wrapper can detect completion and kill the lingering teardown process. +### Docker Image Testing (Shared Specs) + +- The same test specs and helpers are reused to validate the Docker image via `playwright.mock-llm-docker.config.ts`. Run locally with `npm run test:e2e:mock-llm:docker` (requires Docker daemon and a built image). +- **Architecture**: The Docker config replaces the npm path's `bin/agent-canvas.mjs` webServer with a `docker run --network host` command. The mock LLM server still runs on the host. On Linux (including CI), `--network host` lets the container share the host's network stack so all `127.0.0.1` URLs work identically. On macOS/Windows Docker Desktop (bridge networking), set `MOCK_LLM_AGENT_URL=http://host.docker.internal:<port>` so the agent-server inside Docker can reach the host-side mock LLM server. +- **URL split**: `mock-llm-helpers.ts` exports two mock LLM URL constants: + - `MOCK_LLM_BASE_URL` — always `http://127.0.0.1:<port>`, used by tests for the mock LLM admin API (register/activate/reset trajectories). 
+ - `MOCK_LLM_AGENT_URL` — defaults to `MOCK_LLM_BASE_URL`, overridable via `MOCK_LLM_AGENT_URL` env var. Used when configuring the LLM profile (`base_url` field) — this is the URL the agent-server uses for inference calls. The npm path and Docker-with-`--network host` path use the same value; Docker Desktop on macOS/Windows (bridge networking) needs the override. +- **Docker image**: Set `MOCK_LLM_DOCKER_IMAGE` to the image tag (default: `ghcr.io/openhands/agent-canvas:latest`). The container is started with `--rm --network host` and a unique `--name` for cleanup. +- **State isolation**: The Docker container uses its internal state directory (no host mount needed for tests). Each test run starts a fresh container. +- CI workflow: `.github/workflows/mock-llm-docker-e2e.yml` has three triggers — all pull the already-built image from GHCR (no rebuild): (1) `workflow_run` fires automatically after the `Docker` workflow completes on main; (2) `pull_request` with the `e2e-tests` label polls the Docker workflow until it finishes for the PR's head SHA, then pulls the image (needed because `workflow_run` only fires for workflow files already on the default branch); (3) `workflow_dispatch` accepts a custom `docker_image` input. The image tag is derived from the commit SHA (`ghcr.io/openhands/agent-canvas:sha-<short-sha>-amd64`, where `<short-sha>` is the first 7 characters of the commit SHA). Fork PRs are skipped (no GHCR push). Report artifacts go to `test-results-mock-llm-docker/` and `playwright-report-mock-llm-docker/`. + ## Additional Notes - **Published binary auth fix**: When users install the npm package globally (`npm install -g @openhands/agent-canvas`) and run `agent-canvas`, the pre-built static frontend has a `VITE_SESSION_API_KEY` baked in at publish time that differs from the user's persisted runtime key (`~/.openhands/agent-canvas/session-api-key.txt`). The fix is to inject the runtime session key into `index.html` responses at serve time (not build time). `scripts/static-server.mjs` accepts a `--session-api-key ` flag and injects a tiny inline `