Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
321 changes: 321 additions & 0 deletions .github/workflows/mock-llm-docker-e2e.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
name: Mock-LLM Docker E2E Tests

# Exercises the same mock-LLM E2E specs as mock-llm-e2e.yml, but the app
# under test is the Docker image rather than the npm build path
# (bin/agent-canvas.mjs).
#
# How a run gets started:
#   1. workflow_run — automatically, once the "Docker" workflow completes on
#      main. At that point the image is already built and pushed to GHCR.
#   2. pull_request — on PRs carrying the 'e2e-tests' label. The job waits
#      for the Docker workflow to finish, then pulls the image from GHCR.
#      (workflow_run does not fire for a new workflow file until that file
#      is on the default branch, so first-run PRs need this trigger.)
#   3. workflow_dispatch — manually, optionally with a custom image tag.

on:
  workflow_run:
    workflows:
      - Docker
    types:
      - completed
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - labeled
  workflow_dispatch:
    inputs:
      docker_image:
        description: "Docker image to test (e.g., ghcr.io/openhands/agent-canvas:sha-abc1234-amd64)"
        type: string
        default: ''

# Runs are grouped by the first non-empty value among: the triggering
# workflow_run id, the pull request number, or the git ref. A newer run in
# the same group cancels the one still in progress.
concurrency:
group: mock-llm-docker-e2e-${{ github.event.workflow_run.id || github.event.pull_request.number || github.ref }}
cancel-in-progress: true

# Token scopes used by the job below:
#   contents: read        — repository checkout
#   packages: read        — pulling the image from ghcr.io
#   pull-requests: write  — posting the report with `gh pr comment`
#   actions: read         — querying Docker workflow runs via `gh api`
permissions:
contents: read
packages: read
pull-requests: write
actions: read

jobs:
  mock-llm-docker-e2e:
    # workflow_run: only run if the Docker build succeeded.
    # pull_request: only run with the 'e2e-tests' label, skip fork PRs (no GHCR push).
    # workflow_dispatch: always run.
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' &&
      github.event.workflow_run.conclusion == 'success') ||
      (github.event_name == 'pull_request' &&
      contains(github.event.pull_request.labels.*.name, 'e2e-tests') &&
      !github.event.pull_request.head.repo.fork)
    runs-on: ubuntu-24.04
    # On the pull_request path the job may poll for the Docker workflow for up
    # to 15 minutes (60 attempts x 15 s) before any test work starts, and the
    # test step itself waits up to 5 minutes. The budget must therefore exceed
    # the polling window, otherwise the job is cancelled before the wait step
    # can report its own timeout or the tests can run.
    timeout-minutes: 30

    # Shared by the reporting steps: where the rendered Markdown report is
    # written, and a link back to this workflow run.
    env:
      MOCK_LLM_REPORT_PATH: mock-llm-docker-report.md
      MOCK_LLM_WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

    steps:
# ── Resolve which commit / PR to test ──────────────────────────────
# Publishes three step outputs: `sha` (commit under test), `ref` (what to
# check out) and `pr_number` (empty when no pull request is associated).
#
# Event data is handed to the script through environment variables instead
# of being spliced into the script text by expression interpolation. The
# workflow_run pull_requests payload carries branch names, which may contain
# quote characters; interpolating it inside a quoted shell string would let
# such a name break out of the quoting and run arbitrary commands.
- name: Resolve source context
  id: ctx
  env:
    EVENT_NAME: ${{ github.event_name }}
    WORKFLOW_RUN_SHA: ${{ github.event.workflow_run.head_sha }}
    WORKFLOW_RUN_PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
    PULL_REQUEST_SHA: ${{ github.event.pull_request.head.sha }}
    PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
    DISPATCH_SHA: ${{ github.sha }}
    DISPATCH_REF: ${{ github.ref }}
  run: |
    if [ "$EVENT_NAME" = "workflow_run" ]; then
      echo "sha=${WORKFLOW_RUN_SHA}" >> "$GITHUB_OUTPUT"
      echo "ref=${WORKFLOW_RUN_SHA}" >> "$GITHUB_OUTPUT"
      # Number of the first associated pull request; empty when there is none.
      PR_NUMBER=$(printf '%s' "$WORKFLOW_RUN_PRS" | jq -r '.[0].number // empty')
      echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
    elif [ "$EVENT_NAME" = "pull_request" ]; then
      echo "sha=${PULL_REQUEST_SHA}" >> "$GITHUB_OUTPUT"
      echo "ref=${PULL_REQUEST_SHA}" >> "$GITHUB_OUTPUT"
      echo "pr_number=${PULL_REQUEST_NUMBER}" >> "$GITHUB_OUTPUT"
    else
      # workflow_dispatch: test the ref the run was started from.
      echo "sha=${DISPATCH_SHA}" >> "$GITHUB_OUTPUT"
      echo "ref=${DISPATCH_REF}" >> "$GITHUB_OUTPUT"
      echo "pr_number=" >> "$GITHUB_OUTPUT"
    fi

# Check out the ref resolved by the `ctx` step (a commit SHA for
# workflow_run / pull_request events, the dispatch ref otherwise).
- name: Check out repository
uses: actions/checkout@v6
with:
ref: ${{ steps.ctx.outputs.ref }}

# Expose `versions.agentServer` from config/defaults.json as the
# `agent_server_version` step output; a later step uses it to pin the
# openhands-sdk version.
- name: Read defaults from config/defaults.json
id: defaults
run: |
echo "agent_server_version=$(node -p "require('./config/defaults.json').versions.agentServer")" >> "$GITHUB_OUTPUT"

# ── Wait for Docker workflow (pull_request trigger only) ────────────
# A pull_request run can start while the image is still being built, so
# poll the Docker workflow for this exact SHA until it has finished.
# Up to 60 polls, 15 s apart; fails the step if the Docker run concludes
# with anything other than success, never appears, or never completes.
- name: Wait for Docker workflow to complete
  if: github.event_name == 'pull_request'
  env:
    GITHUB_TOKEN: ${{ github.token }}
  run: |
    commit="${{ steps.ctx.outputs.sha }}"
    runs_endpoint="/repos/${{ github.repository }}/actions/workflows/docker.yml/runs?head_sha=${commit}&per_page=1"
    echo "Waiting for Docker workflow to complete for SHA ${commit}..."

    attempt=0
    while [ "$attempt" -lt 60 ]; do
      attempt=$((attempt + 1))

      # Latest Docker workflow run for this exact commit; empty when none
      # exists yet or the API call failed.
      run_json=$(gh api "$runs_endpoint" \
        --jq '.workflow_runs[0] // empty' 2>/dev/null || echo "")

      if [ -n "$run_json" ]; then
        run_status=$(jq -r '.status' <<< "$run_json")
        run_conclusion=$(jq -r '.conclusion // empty' <<< "$run_json")
        run_url=$(jq -r '.html_url' <<< "$run_json")

        case "$run_status:$run_conclusion" in
          completed:success)
            echo "Docker workflow completed successfully: $run_url"
            break
            ;;
          completed:*)
            echo "::error::Docker workflow finished with conclusion '$run_conclusion': $run_url"
            exit 1
            ;;
        esac

        echo " Attempt $attempt: Docker workflow status=$run_status (${run_url})"
      else
        echo " Attempt $attempt: No Docker workflow run found yet for ${commit}..."
      fi

      sleep 15
    done

    # Reached either via `break` (status is "completed") or by running out
    # of attempts; only the latter two cases below are failures.
    case "${run_status:-}" in
      "")
        echo "::error::No Docker workflow run found for SHA ${commit} after 15 minutes"
        exit 1
        ;;
      completed)
        ;;
      *)
        echo "::error::Docker workflow did not complete within 15 minutes (last status: $run_status)"
        exit 1
        ;;
    esac

# ── Resolve Docker image tag ───────────────────────────────────────
# Publishes the `tag` step output: the manually supplied image when one was
# given, otherwise the per-commit amd64 tag derived from the resolved SHA.
#
# The dispatch input is free-form text, so it is passed through an
# environment variable rather than interpolated into the script, where
# shell metacharacters in the value would be executed.
- name: Resolve Docker image
  id: image
  env:
    REQUESTED_IMAGE: ${{ inputs.docker_image }}
    SOURCE_SHA: ${{ steps.ctx.outputs.sha }}
  run: |
    if [ -n "$REQUESTED_IMAGE" ]; then
      echo "tag=${REQUESTED_IMAGE}" >> "$GITHUB_OUTPUT"
    else
      # First 7 characters of the commit SHA.
      SHORT_SHA="${SOURCE_SHA:0:7}"
      # Use the amd64-specific tag (always pushed by the Docker workflow).
      echo "tag=ghcr.io/openhands/agent-canvas:sha-${SHORT_SHA}-amd64" >> "$GITHUB_OUTPUT"
    fi

# Authenticate Docker against ghcr.io with the workflow's own token so the
# image can be pulled in the next step.
- name: Log in to GHCR
uses: docker/login-action@v4
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# Pull the image selected by the `image` step. The tag can originate from
# the free-form dispatch input, so it reaches the script through an
# environment variable instead of being interpolated into the script text.
- name: Pull Docker image
  env:
    IMAGE_TAG: ${{ steps.image.outputs.tag }}
  run: |
    echo "Pulling ${IMAGE_TAG}..."
    docker pull "$IMAGE_TAG"

# ── Test infrastructure setup ──────────────────────────────────────
# Node.js toolchain with npm download caching enabled.
- name: Set up Node.js
uses: actions/setup-node@v6
with:
# Pin to 24.15.x — Node 24.16.0 has a zip-extraction regression
# (nodejs/node#63487) that hangs `playwright install` for Playwright
# < 1.60.0. Remove this pin after upgrading to Playwright >= 1.60.0.
node-version: "24.15"
cache: npm

# Install exactly what the lockfile specifies.
- name: Install npm dependencies
run: npm ci

# Publish the installed Playwright version as the `version` step output
# (second whitespace-separated field of `playwright --version`); it is used
# in the browser cache key below.
- name: Get Playwright version
id: pw_version
run: echo "version=$(npx playwright --version | awk '{print $2}')" >> "$GITHUB_OUTPUT"

# Browser binaries are cached per runner OS and Playwright version.
- name: Cache Playwright browsers
id: pw_cache
uses: actions/cache@v4
with:
path: ~/.cache/ms-playwright
key: playwright-${{ runner.os }}-${{ steps.pw_version.outputs.version }}

# Download Chromium only when the cache step did not restore it.
- name: Install Playwright Chromium
if: steps.pw_cache.outputs.cache-hit != 'true'
run: npx playwright install chromium

# Runs unconditionally: only ~/.cache/ms-playwright is cached above, so the
# system packages Chromium needs are installed on every run.
- name: Install Playwright system deps
run: npx playwright install-deps chromium

# Install uv via its install script and add $HOME/.local/bin to PATH for
# the steps that follow.
- name: Install uv
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
echo "$HOME/.local/bin" >> "$GITHUB_PATH"

# Create a dedicated virtualenv for the mock LLM server and install
# openhands-sdk into it, pinned to the version read from config/defaults.json.
- name: Install openhands-sdk (for mock LLM server)
run: |
uv venv .mock-llm-venv
uv pip install -p .mock-llm-venv openhands-sdk==${{ steps.defaults.outputs.agent_server_version }}

# Smoke-test the mock LLM server before the real test run: start it on port
# 9998, poll the chat-completions endpoint for up to 30 attempts (1 s
# apart), then require one more successful request whose body parses as
# JSON.
- name: Verify mock LLM server starts
  run: |
    .mock-llm-venv/bin/python3 tests/e2e/mock-llm/scripts/mock-llm-server.py --port 9998 &
    SERVER_PID=$!
    # Stop the server however the script ends. The step shell aborts on the
    # first failing command, so a trailing `kill` would be skipped whenever
    # the final probe fails, leaving the server running.
    trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT

    # One chat-completions request against the local server; the response
    # body goes to stdout.
    probe() {
      curl -sf http://127.0.0.1:9998/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{"model":"test","messages":[]}'
    }

    for i in $(seq 1 30); do
      if probe > /dev/null 2>&1; then
        echo "Mock LLM server responded on attempt $i"
        break
      fi
      sleep 1
    done

    # Decisive check: fails the step if the server never came up or the
    # response is not valid JSON.
    probe | python3 -m json.tool

# ── Run tests ──────────────────────────────────────────────────────
# Runs the Docker-backed Playwright suite with a watchdog. The step itself
# always exits 0; the outcome is published as the `exit_code` step output
# and turned into the job result by the final step, so the reporting steps
# in between always run.
#
# Marker files under .mock-llm-markers are expected to be produced by the
# test run (the writer is not part of this workflow — confirm in the test
# setup): `.tests-done` when the tests have finished, `.all-passed` when
# every test passed.
- name: Run mock-LLM Docker E2E tests
  id: run_tests
  env:
    MOCK_LLM_PYTHON: .mock-llm-venv/bin/python3
    MOCK_LLM_DOCKER_IMAGE: ${{ steps.image.outputs.tag }}
  run: |
    # Exit codes are handled explicitly below.
    set +e
    MARKER_DIR=".mock-llm-markers"
    DONE_MARKER="$MARKER_DIR/.tests-done"
    PASS_MARKER="$MARKER_DIR/.all-passed"
    rm -rf "$MARKER_DIR"

    # Run Playwright in background so our shell survives if we have
    # to kill it (the Docker container teardown can hang).
    npm run test:e2e:mock-llm:docker &
    PW_PID=$!

    # Wait up to 5 min for tests to complete.
    deadline=$((SECONDS + 300))
    while [ "$SECONDS" -lt "$deadline" ]; do
      if ! kill -0 "$PW_PID" 2>/dev/null; then
        break
      fi
      if [ -f "$DONE_MARKER" ]; then
        echo "Tests completed: $(cat "$DONE_MARKER")"
        break
      fi
      sleep 2
    done

    # If Playwright is still running (teardown hang or overall timeout),
    # give it 5s grace then force-kill.
    if kill -0 "$PW_PID" 2>/dev/null; then
      sleep 5
      if kill -0 "$PW_PID" 2>/dev/null; then
        echo "::warning::Killing lingering Playwright process (teardown hung)"
        kill "$PW_PID" 2>/dev/null
        sleep 5
        kill -9 "$PW_PID" 2>/dev/null
        wait "$PW_PID" 2>/dev/null
        # Conventional "timed out" code; the real status is that of a
        # killed process and carries no test information.
        pw_exit=124
      else
        # Playwright finished on its own during the grace period: report
        # its real exit code instead of pretending it was killed.
        wait "$PW_PID"
        pw_exit=$?
      fi
    else
      wait "$PW_PID"
      pw_exit=$?
    fi

    echo "Playwright exited with code $pw_exit"

    # When killed during teardown, the exit code is non-zero but
    # tests may have passed.
    if [ "$pw_exit" -ne 0 ] && [ -f "$PASS_MARKER" ]; then
      echo "::notice::All tests passed (marker file present); non-zero exit was teardown-related"
      pw_exit=0
    fi

    # Clean up the Docker container (belt-and-suspenders)
    docker ps -q --filter "name=agent-canvas-mock-llm" | xargs -r docker stop 2>/dev/null || true

    echo "exit_code=$pw_exit" >> "$GITHUB_OUTPUT"
    exit 0

# ── Reporting ──────────────────────────────────────────────────────
# Upload the Playwright report and raw results even when earlier steps
# failed. Missing directories are not an error, and artifacts are kept for
# 14 days. The step id lets the report step link to the uploaded artifact.
- name: Upload test artifacts
id: upload_artifacts
if: always()
uses: actions/upload-artifact@v7
with:
name: mock-llm-docker-e2e-results
if-no-files-found: ignore
retention-days: 14
path: |
playwright-report-mock-llm-docker/
test-results-mock-llm-docker/

# Render results.json into a Markdown report at $MOCK_LLM_REPORT_PATH and
# append that report to the job summary. The artifact URL argument is an
# empty string when the upload step produced no URL.
- name: Render test report
if: always()
run: |
node tests/e2e/mock-llm/scripts/render-mock-llm-report.mjs \
--results "test-results-mock-llm-docker/results.json" \
--output "$MOCK_LLM_REPORT_PATH" \
--workflow-url "$MOCK_LLM_WORKFLOW_URL" \
--commit "${{ steps.ctx.outputs.sha }}" \
--artifact-url "${{ steps.upload_artifacts.outputs.artifact-url || '' }}" \
--title "Mock-LLM Docker E2E Test Results"
cat "$MOCK_LLM_REPORT_PATH" >> "$GITHUB_STEP_SUMMARY"

# Post the rendered report as a pull request comment. Skipped when the
# `ctx` step resolved no PR number (e.g. manual dispatch).
- name: Post PR comment
if: always() && steps.ctx.outputs.pr_number
env:
GITHUB_TOKEN: ${{ github.token }}
run: |
gh pr comment "${{ steps.ctx.outputs.pr_number }}" \
--body-file "$MOCK_LLM_REPORT_PATH"

# Turn the exit code recorded by the `run_tests` step into the job result.
# If that step never published an exit code (it was skipped or did not get
# that far), the value is empty and the job fails with exit code 1.
- name: Fail job when tests fail
if: always()
run: |
exit_code="${{ steps.run_tests.outputs.exit_code }}"
exit "${exit_code:-1}"
10 changes: 4 additions & 6 deletions .github/workflows/npm-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,11 @@ jobs:

# Resolve the npm dist-tag from the version's pre-release identifier:
# alpha → --tag alpha (e.g. 1.0.0-alpha.1)
# beta → --tag beta (e.g. 1.0.0-beta.1)
# rc → --tag rc (e.g. 1.0.0-rc.1)
# beta → --tag latest (e.g. 1.0.0-beta.1)
# rc → --tag latest (e.g. 1.0.0-rc.1)
# stable → --tag latest (e.g. 1.0.0)
# Beta and RC publish as `latest` so `npm install @openhands/agent-canvas`
# resolves to the most recent release until the first stable version ships.
# Note: OIDC trusted-publishing tokens cover only the `npm publish` call
# itself; a separate `npm dist-tag add` would fail with E401, so the tag
# is resolved and passed directly in one step.
Expand All @@ -104,10 +106,6 @@ jobs:
VERSION=$(node -p "require('./package.json').version")
if [[ "$VERSION" == *-alpha* ]]; then
DIST_TAG="alpha"
elif [[ "$VERSION" == *-beta* ]]; then
DIST_TAG="beta"
elif [[ "$VERSION" == *-rc* ]]; then
DIST_TAG="rc"
else
DIST_TAG="latest"
fi
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@ __pycache__
/test-results/
/test-results-live/
/test-results-mock-llm/
/test-results-mock-llm-docker/
/.mock-llm-markers/
/playwright-report/
/playwright-report-live/
/playwright-report-mock-llm/
/playwright-report-mock-llm-docker/
/blob-report/
/playwright/.cache/
# Snapshot baselines are stored as GitHub Actions artifacts — not in git.
Expand Down
Loading
Loading