From 57bb7788917ae7803fbb575e9311debe3a8aec8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= Date: Thu, 7 May 2026 17:54:50 +0800 Subject: [PATCH 1/6] ci: add real training jobs and branch dispatch to nightly workflow Replace dummy test with actual gsm8k GRPO training inside Docker containers (dev-sglang/dev-vllm). Add workflow_dispatch input to run arbitrary branches. Key changes: - Pull latest runtime images, run via docker exec in persistent containers - Install AReaL from source (uv pip install -e . --no-deps) - Round-robin training backend (fsdp/megatron/archon) by day - Add 'ref' input for branch/tag/SHA override on manual dispatch --- .github/workflows/nightly.yml | 99 ++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 19 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index cc93470264..522eed0861 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -5,6 +5,12 @@ on: # 00:00 Beijing time (UTC+8) = 16:00 UTC - cron: '0 16 * * *' workflow_dispatch: + inputs: + ref: + description: 'Branch, tag, or SHA to checkout (default: main)' + required: false + type: string + default: 'main' concurrency: group: nightly-ci @@ -123,35 +129,90 @@ jobs: throw new Error(`Timed out waiting for runner ${instanceName} to come online.`); nightly-tests: - name: Run nightly tests + name: Run nightly tests (${{ matrix.variant }}) needs: start-runner runs-on: - self-hosted - areal-nightly + strategy: + fail-fast: false + max-parallel: 1 + matrix: + variant: [sglang, vllm] timeout-minutes: 480 + env: + CONTAINER_IMAGE: ghcr.io/inclusionai/areal-runtime:dev-${{ matrix.variant }} + CONTAINER_NAME: areal-nightly-${{ matrix.variant }} steps: - uses: actions/checkout@v6 + with: + ref: ${{ inputs.ref || github.sha }} + + - name: Log in to GitHub Container Registry + run: echo "${{ secrets.GHCR_TOKEN }}" | docker login ghcr.io -u inclusionai --password-stdin + + - name: Pull latest runtime 
image + run: docker pull "$CONTAINER_IMAGE" + + - name: Start container + run: | + docker rm -f "$CONTAINER_NAME" 2>/dev/null || true + docker run --name "$CONTAINER_NAME" -d \ + --runtime=nvidia --gpus all \ + --net=host \ + --shm-size=54g \ + --ulimit nofile=1048576:1048576 \ + --cap-add=SYS_ADMIN \ + --device=/dev/fuse \ + --security-opt=apparmor:unconfined \ + -e HF_TOKEN="${{ secrets.HF_TOKEN }}" \ + -e TOKENIZERS_PARALLELISM=false \ + -v "${{ github.workspace }}:/workspace" \ + -w /workspace \ + --entrypoint=/bin/bash \ + "$CONTAINER_IMAGE" \ + -lc "trap : TERM INT; sleep infinity & wait" + + - name: Install AReaL from source + run: | + docker exec "$CONTAINER_NAME" bash -lc ' + export PATH=/opt/.venv/bin:$PATH + uv pip install -e . --no-deps + ' - - name: System info + - name: Determine run parameters + id: params + run: | + BACKENDS=("fsdp" "megatron" "archon") + DAY_OF_YEAR=$(date -u +%j) + BACKEND_INDEX=$(( (10#$DAY_OF_YEAR - 1) % 3 )) + echo "experiment_name=nightly-gsm8k-$(date -u +%Y-%m)" >> "$GITHUB_OUTPUT" + echo "trial_name=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT" + echo "train_backend=${BACKENDS[$BACKEND_INDEX]}" >> "$GITHUB_OUTPUT" + + - name: Run training + env: + VARIANT: ${{ matrix.variant }} + EXPERIMENT_NAME: ${{ steps.params.outputs.experiment_name }} + TRIAL_NAME: ${{ steps.params.outputs.trial_name }} + TRAIN_BACKEND: ${{ steps.params.outputs.train_backend }} run: | - echo "=== GPU info ===" - nvidia-smi - echo "" - echo "=== Python ===" - python --version || python3 --version - echo "" - echo "=== Disk ===" - df -h - echo "" - echo "=== Memory ===" - free -h - - - name: Dummy test (placeholder) + docker exec "$CONTAINER_NAME" bash -lc ' + export PATH=/opt/.venv/bin:$PATH + python3 examples/math/gsm8k_rl.py --config examples/math/gsm8k_grpo.yaml \ + experiment_name='"$EXPERIMENT_NAME"' trial_name='"$TRIAL_NAME"' \ + stats_logger.wandb.mode=disabled \ + rollout.backend='"$VARIANT"':d1 actor.backend='"$TRAIN_BACKEND"':d1 \ + 
cluster.n_nodes=1 cluster.n_gpus_per_node=2 \ + actor.path=Qwen/Qwen3-0.6B \ + scheduler.type=local train_dataset.batch_size=64 + ' + + - name: Teardown container + if: always() run: | - echo "Nightly CI running on $(hostname) at $(date -u)" - echo "TODO: Replace with actual long-running tests" - sleep 10 - echo "Dummy test completed successfully." + docker rm -f "$CONTAINER_NAME" 2>/dev/null || true + docker logout ghcr.io 2>/dev/null || true stop-runner: name: Stop areal-nightly instance From 5016da8b9269fc41bb9ccc6ac063e0f3ba02e866 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= Date: Sat, 9 May 2026 15:28:54 +0800 Subject: [PATCH 2/6] ci: fix root-owned file cleanup between matrix jobs Docker containers run as root, creating .pyc files owned by root:root on the bind-mounted workspace. The next matrix job fails when actions/checkout tries to git clean these files. Key changes: - Add sudo rm cleanup before checkout - Clean root-owned files in teardown step --- .github/workflows/nightly.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 522eed0861..adb1cddafc 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -144,6 +144,9 @@ jobs: CONTAINER_IMAGE: ghcr.io/inclusionai/areal-runtime:dev-${{ matrix.variant }} CONTAINER_NAME: areal-nightly-${{ matrix.variant }} steps: + - name: Clean workspace + run: sudo rm -rf "${{ github.workspace }}"/* + - uses: actions/checkout@v6 with: ref: ${{ inputs.ref || github.sha }} @@ -213,6 +216,7 @@ jobs: run: | docker rm -f "$CONTAINER_NAME" 2>/dev/null || true docker logout ghcr.io 2>/dev/null || true + sudo find "${{ github.workspace }}" -not -user "$(id -u)" -delete 2>/dev/null || true stop-runner: name: Stop areal-nightly instance From a1bf2cbecd85f4472a726835994f2da7aace69c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= Date: Sat, 9 May 2026 15:50:46 +0800 Subject: [PATCH 3/6] ci: sequential 
variants, model pre-download, wandb logging Key changes: - Run sglang/vllm sequentially via shared run_variant function - Pre-download model to persistent /opt/hf_cache volume - Enable wandb online logging with nightly environment secret - Trial name includes backend+variant shortcodes (e.g., a.s-2026-05-09) --- .github/workflows/nightly.yml | 116 ++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index adb1cddafc..58ac1e78e9 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -129,24 +129,16 @@ jobs: throw new Error(`Timed out waiting for runner ${instanceName} to come online.`); nightly-tests: - name: Run nightly tests (${{ matrix.variant }}) + name: Run nightly tests needs: start-runner + environment: nightly runs-on: - self-hosted - areal-nightly - strategy: - fail-fast: false - max-parallel: 1 - matrix: - variant: [sglang, vllm] timeout-minutes: 480 env: - CONTAINER_IMAGE: ghcr.io/inclusionai/areal-runtime:dev-${{ matrix.variant }} - CONTAINER_NAME: areal-nightly-${{ matrix.variant }} + IMAGE_REPO: ghcr.io/inclusionai/areal-runtime steps: - - name: Clean workspace - run: sudo rm -rf "${{ github.workspace }}"/* - - uses: actions/checkout@v6 with: ref: ${{ inputs.ref || github.sha }} @@ -154,67 +146,81 @@ jobs: - name: Log in to GitHub Container Registry run: echo "${{ secrets.GHCR_TOKEN }}" | docker login ghcr.io -u inclusionai --password-stdin - - name: Pull latest runtime image - run: docker pull "$CONTAINER_IMAGE" - - - name: Start container - run: | - docker rm -f "$CONTAINER_NAME" 2>/dev/null || true - docker run --name "$CONTAINER_NAME" -d \ - --runtime=nvidia --gpus all \ - --net=host \ - --shm-size=54g \ - --ulimit nofile=1048576:1048576 \ - --cap-add=SYS_ADMIN \ - --device=/dev/fuse \ - --security-opt=apparmor:unconfined \ - -e HF_TOKEN="${{ secrets.HF_TOKEN }}" \ - -e TOKENIZERS_PARALLELISM=false \ - -v "${{
github.workspace }}:/workspace" \ - -w /workspace \ - --entrypoint=/bin/bash \ - "$CONTAINER_IMAGE" \ - -lc "trap : TERM INT; sleep infinity & wait" - - - name: Install AReaL from source + - name: Pull latest runtime images run: | - docker exec "$CONTAINER_NAME" bash -lc ' - export PATH=/opt/.venv/bin:$PATH - uv pip install -e . --no-deps - ' + docker pull "$IMAGE_REPO:dev-sglang" + docker pull "$IMAGE_REPO:dev-vllm" - name: Determine run parameters id: params run: | BACKENDS=("fsdp" "megatron" "archon") + BACKEND_SHORT=("f" "m" "a") DAY_OF_YEAR=$(date -u +%j) BACKEND_INDEX=$(( (10#$DAY_OF_YEAR - 1) % 3 )) echo "experiment_name=nightly-gsm8k-$(date -u +%Y-%m)" >> "$GITHUB_OUTPUT" - echo "trial_name=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT" + echo "trial_date=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT" echo "train_backend=${BACKENDS[$BACKEND_INDEX]}" >> "$GITHUB_OUTPUT" + echo "train_backend_short=${BACKEND_SHORT[$BACKEND_INDEX]}" >> "$GITHUB_OUTPUT" - - name: Run training + - name: Run training (sglang + vllm) env: - VARIANT: ${{ matrix.variant }} EXPERIMENT_NAME: ${{ steps.params.outputs.experiment_name }} - TRIAL_NAME: ${{ steps.params.outputs.trial_name }} + TRIAL_DATE: ${{ steps.params.outputs.trial_date }} TRAIN_BACKEND: ${{ steps.params.outputs.train_backend }} + TRAIN_BACKEND_SHORT: ${{ steps.params.outputs.train_backend_short }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + WORKSPACE: ${{ github.workspace }} + MODEL_NAME: Qwen/Qwen3-0.6B run: | - docker exec "$CONTAINER_NAME" bash -lc ' - export PATH=/opt/.venv/bin:$PATH - python3 examples/math/gsm8k_rl.py --config examples/math/gsm8k_grpo.yaml \ - experiment_name='"$EXPERIMENT_NAME"' trial_name='"$TRIAL_NAME"' \ - stats_logger.wandb.mode=disabled \ - rollout.backend='"$VARIANT"':d1 actor.backend='"$TRAIN_BACKEND"':d1 \ - cluster.n_nodes=1 cluster.n_gpus_per_node=2 \ - actor.path=Qwen/Qwen3-0.6B \ - scheduler.type=local train_dataset.batch_size=64 - ' - - - name: Teardown 
container + MODEL_CACHE=/opt/hf_cache + sudo mkdir -p "$MODEL_CACHE" + sudo chmod 777 "$MODEL_CACHE" + MODEL_LOCAL="$MODEL_CACHE/$MODEL_NAME" + + declare -A VARIANT_SHORT=( [sglang]=s [vllm]=v ) + + run_variant() { + local variant=$1 + local trial_name="${TRAIN_BACKEND_SHORT}.${VARIANT_SHORT[$variant]}-${TRIAL_DATE}" + echo "=== Running variant: $variant (trial: $trial_name) ===" + docker run --rm \ + --runtime=nvidia --gpus all \ + --net=host \ + --shm-size=500g \ + --ulimit nofile=1048576:1048576 \ + --cap-add=SYS_ADMIN \ + --device=/dev/fuse \ + --security-opt=apparmor:unconfined \ + -e HF_TOKEN="$HF_TOKEN" \ + -e WANDB_API_KEY="$WANDB_API_KEY" \ + -e TOKENIZERS_PARALLELISM=false \ + -v "$WORKSPACE:/workspace" \ + -v "$MODEL_CACHE:/model_cache" \ + -w /workspace \ + "$IMAGE_REPO:dev-$variant" \ + bash -lc ' + export PATH=/opt/.venv/bin:$PATH + huggingface-cli download '"$MODEL_NAME"' --local-dir /model_cache/'"$MODEL_NAME"' + uv pip install -e . --no-deps + python3 examples/math/gsm8k_rl.py --config examples/math/gsm8k_grpo.yaml \ + experiment_name='"$EXPERIMENT_NAME"' trial_name='"$trial_name"' \ + stats_logger.wandb.mode=online \ + rollout.backend='"$variant"':d1 actor.backend='"$TRAIN_BACKEND"':d1 \ + cluster.n_nodes=1 cluster.n_gpus_per_node=2 \ + actor.path=/model_cache/'"$MODEL_NAME"' \ + scheduler.type=local train_dataset.batch_size=64 + ' + sudo find "$WORKSPACE" -not -user "$(id -u)" -delete 2>/dev/null || true + } + + run_variant sglang + run_variant vllm + + - name: Cleanup if: always() run: | - docker rm -f "$CONTAINER_NAME" 2>/dev/null || true docker logout ghcr.io 2>/dev/null || true sudo find "${{ github.workspace }}" -not -user "$(id -u)" -delete 2>/dev/null || true From f433798b556f4e466b14c1fe06ae24568b3874e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= Date: Sat, 9 May 2026 15:55:36 +0800 Subject: [PATCH 4/6] ci: add pre-checkout workspace cleanup for root-owned files --- .github/workflows/nightly.yml | 3 +++ 1 file changed, 3 
insertions(+) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 58ac1e78e9..1d437229b2 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -139,6 +139,9 @@ jobs: env: IMAGE_REPO: ghcr.io/inclusionai/areal-runtime steps: + - name: Clean workspace + run: sudo rm -rf "${{ github.workspace }}"/* 2>/dev/null || true + - uses: actions/checkout@v6 with: ref: ${{ inputs.ref || github.sha }} From 6d4ea28d81734b93ab2773d6f1c248f9b8ca9af5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= Date: Sat, 9 May 2026 17:27:52 +0800 Subject: [PATCH 5/6] ci: remove sudo mkdir/chmod for model cache dir Runner user lacks passwordless sudo. The cache dir is pre-created on the instance. --- .github/workflows/nightly.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 1d437229b2..8cf85d396f 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -178,9 +178,6 @@ jobs: MODEL_NAME: Qwen/Qwen3-0.6B run: | MODEL_CACHE=/opt/hf_cache - sudo mkdir -p "$MODEL_CACHE" - sudo chmod 777 "$MODEL_CACHE" - MODEL_LOCAL="$MODEL_CACHE/$MODEL_NAME" declare -A VARIANT_SHORT=( [sglang]=s [vllm]=v ) From 955b4ce7483c06c5cb06e17e3e4327f630a269bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= Date: Sat, 9 May 2026 19:01:21 +0800 Subject: [PATCH 6/6] ci: use HF_HOME for model cache instead of --local-dir Pass model ID to actor.path and let from_pretrained resolve from the HF cache. Pre-download just populates the cache. 
--- .github/workflows/nightly.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 8cf85d396f..ff835c3400 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -194,6 +194,7 @@ jobs: --device=/dev/fuse \ --security-opt=apparmor:unconfined \ -e HF_TOKEN="$HF_TOKEN" \ + -e HF_HOME=/model_cache \ -e WANDB_API_KEY="$WANDB_API_KEY" \ -e TOKENIZERS_PARALLELISM=false \ -v "$WORKSPACE:/workspace" \ @@ -202,14 +203,14 @@ jobs: "$IMAGE_REPO:dev-$variant" \ bash -lc ' export PATH=/opt/.venv/bin:$PATH - huggingface-cli download '"$MODEL_NAME"' --local-dir /model_cache/'"$MODEL_NAME"' + huggingface-cli download '"$MODEL_NAME"' uv pip install -e . --no-deps python3 examples/math/gsm8k_rl.py --config examples/math/gsm8k_grpo.yaml \ experiment_name='"$EXPERIMENT_NAME"' trial_name='"$trial_name"' \ stats_logger.wandb.mode=online \ rollout.backend='"$variant"':d1 actor.backend='"$TRAIN_BACKEND"':d1 \ cluster.n_nodes=1 cluster.n_gpus_per_node=2 \ - actor.path=/model_cache/'"$MODEL_NAME"' \ + actor.path='"$MODEL_NAME"' \ scheduler.type=local train_dataset.batch_size=64 ' sudo find "$WORKSPACE" -not -user "$(id -u)" -delete 2>/dev/null || true