From 57bb7788917ae7803fbb575e9311debe3a8aec8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= <bowei.fw@antgroup.com>
Date: Thu, 7 May 2026 17:54:50 +0800
Subject: [PATCH 1/6] ci: add real training jobs and branch dispatch to nightly
 workflow

Replace dummy test with actual gsm8k GRPO training inside Docker
containers (dev-sglang/dev-vllm). Add workflow_dispatch input to
run arbitrary branches.

Key changes:
- Pull latest runtime images, run via docker exec in persistent containers
- Install AReaL from source (uv pip install -e . --no-deps)
- Round-robin training backend (fsdp/megatron/archon) by day
- Add 'ref' input for branch/tag/SHA override on manual dispatch
---
 .github/workflows/nightly.yml | 99 ++++++++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index cc93470264..522eed0861 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -5,6 +5,12 @@ on:
     # 00:00 Beijing time (UTC+8) = 16:00 UTC
     - cron: '0 16 * * *'
   workflow_dispatch:
+    inputs:
+      ref:
+        description: 'Branch, tag, or SHA to checkout (default: main)'
+        required: false
+        type: string
+        default: 'main'
 
 concurrency:
   group: nightly-ci
@@ -123,35 +129,90 @@ jobs:
             throw new Error(`Timed out waiting for runner ${instanceName} to come online.`);
 
   nightly-tests:
-    name: Run nightly tests
+    name: Run nightly tests (${{ matrix.variant }})
     needs: start-runner
     runs-on:
       - self-hosted
       - areal-nightly
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        variant: [sglang, vllm]
     timeout-minutes: 480
+    env:
+      CONTAINER_IMAGE: ghcr.io/inclusionai/areal-runtime:dev-${{ matrix.variant }}
+      CONTAINER_NAME: areal-nightly-${{ matrix.variant }}
     steps:
       - uses: actions/checkout@v6
+        with:
+          ref: ${{ inputs.ref || github.sha }}
+
+      - name: Log in to GitHub Container Registry
+        run: echo "${{ secrets.GHCR_TOKEN }}" | docker login ghcr.io -u inclusionai --password-stdin
+
+      - name: Pull latest runtime image
+        run: docker pull "$CONTAINER_IMAGE"
+
+      - name: Start container
+        run: |
+          docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
+          docker run --name "$CONTAINER_NAME" -d \
+            --runtime=nvidia --gpus all \
+            --net=host \
+            --shm-size=54g \
+            --ulimit nofile=1048576:1048576 \
+            --cap-add=SYS_ADMIN \
+            --device=/dev/fuse \
+            --security-opt=apparmor:unconfined \
+            -e HF_TOKEN="${{ secrets.HF_TOKEN }}" \
+            -e TOKENIZERS_PARALLELISM=false \
+            -v "${{ github.workspace }}:/workspace" \
+            -w /workspace \
+            --entrypoint=/bin/bash \
+            "$CONTAINER_IMAGE" \
+            -lc "trap : TERM INT; sleep infinity & wait"
+
+      - name: Install AReaL from source
+        run: |
+          docker exec "$CONTAINER_NAME" bash -lc '
+            export PATH=/opt/.venv/bin:$PATH
+            uv pip install -e . --no-deps
+          '
 
-      - name: System info
+      - name: Determine run parameters
+        id: params
+        run: |
+          BACKENDS=("fsdp" "megatron" "archon")
+          DAY_OF_YEAR=$(date -u +%j)
+          BACKEND_INDEX=$(( (10#$DAY_OF_YEAR - 1) % 3 ))
+          echo "experiment_name=nightly-gsm8k-$(date -u +%Y-%m)" >> "$GITHUB_OUTPUT"
+          echo "trial_name=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
+          echo "train_backend=${BACKENDS[$BACKEND_INDEX]}" >> "$GITHUB_OUTPUT"
+
+      - name: Run training
+        env:
+          VARIANT: ${{ matrix.variant }}
+          EXPERIMENT_NAME: ${{ steps.params.outputs.experiment_name }}
+          TRIAL_NAME: ${{ steps.params.outputs.trial_name }}
+          TRAIN_BACKEND: ${{ steps.params.outputs.train_backend }}
         run: |
-          echo "=== GPU info ==="
-          nvidia-smi
-          echo ""
-          echo "=== Python ==="
-          python --version || python3 --version
-          echo ""
-          echo "=== Disk ==="
-          df -h
-          echo ""
-          echo "=== Memory ==="
-          free -h
-
-      - name: Dummy test (placeholder)
+          docker exec "$CONTAINER_NAME" bash -lc '
+            export PATH=/opt/.venv/bin:$PATH
+            python3 examples/math/gsm8k_rl.py --config examples/math/gsm8k_grpo.yaml \
+              experiment_name='"$EXPERIMENT_NAME"' trial_name='"$TRIAL_NAME"' \
+              stats_logger.wandb.mode=disabled \
+              rollout.backend='"$VARIANT"':d1 actor.backend='"$TRAIN_BACKEND"':d1 \
+              cluster.n_nodes=1 cluster.n_gpus_per_node=2 \
+              actor.path=Qwen/Qwen3-0.6B \
+              scheduler.type=local train_dataset.batch_size=64
+          '
+
+      - name: Teardown container
+        if: always()
         run: |
-          echo "Nightly CI running on $(hostname) at $(date -u)"
-          echo "TODO: Replace with actual long-running tests"
-          sleep 10
-          echo "Dummy test completed successfully."
+          docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
+          docker logout ghcr.io 2>/dev/null || true
 
   stop-runner:
     name: Stop areal-nightly instance

From 5016da8b9269fc41bb9ccc6ac063e0f3ba02e866 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= <bowei.fw@antgroup.com>
Date: Sat, 9 May 2026 15:28:54 +0800
Subject: [PATCH 2/6] ci: fix root-owned file cleanup between matrix jobs

Docker containers run as root, creating .pyc files owned by
root:root on the bind-mounted workspace. The next matrix job
fails when actions/checkout tries to git clean these files.

Key changes:
- Add sudo rm cleanup before checkout
- Clean root-owned files in teardown step
---
 .github/workflows/nightly.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 522eed0861..adb1cddafc 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -144,6 +144,9 @@ jobs:
       CONTAINER_IMAGE: ghcr.io/inclusionai/areal-runtime:dev-${{ matrix.variant }}
       CONTAINER_NAME: areal-nightly-${{ matrix.variant }}
     steps:
+      - name: Clean workspace
+        run: sudo rm -rf "${{ github.workspace }}"/*
+
       - uses: actions/checkout@v6
         with:
           ref: ${{ inputs.ref || github.sha }}
@@ -213,6 +216,7 @@ jobs:
         run: |
           docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
           docker logout ghcr.io 2>/dev/null || true
+          sudo find "${{ github.workspace }}" -not -user "$(id -u)" -delete 2>/dev/null || true
 
   stop-runner:
     name: Stop areal-nightly instance

From a1bf2cbecd85f4472a726835994f2da7aace69c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= <bowei.fw@antgroup.com>
Date: Sat, 9 May 2026 15:50:46 +0800
Subject: [PATCH 3/6] ci: sequential variants, model pre-download, wandb
 logging

Key changes:
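- Run sglang/vllm sequentially via shared run_variant function,
  replacing the job matrix and the persistent `docker exec` containers
  with one-shot `docker run --rm` invocations
- Pre-download model to the persistent host directory /opt/hf_cache
  (bind-mounted into the container at /model_cache)
- Enable wandb online logging with nightly environment secret
- Trial name includes backend+variant shortcodes (e.g., m.s-2026-05-09)
- Raise --shm-size from 54g to 500g
- Drop the pre-checkout "Clean workspace" step; root-owned files are
  now removed after each variant and in the final Cleanup step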
---
 .github/workflows/nightly.yml | 116 ++++++++++++++++++----------------
 1 file changed, 61 insertions(+), 55 deletions(-)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index adb1cddafc..58ac1e78e9 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -129,24 +129,16 @@ jobs:
             throw new Error(`Timed out waiting for runner ${instanceName} to come online.`);
 
   nightly-tests:
-    name: Run nightly tests (${{ matrix.variant }})
+    name: Run nightly tests
     needs: start-runner
+    environment: nightly
     runs-on:
       - self-hosted
       - areal-nightly
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        variant: [sglang, vllm]
     timeout-minutes: 480
     env:
-      CONTAINER_IMAGE: ghcr.io/inclusionai/areal-runtime:dev-${{ matrix.variant }}
-      CONTAINER_NAME: areal-nightly-${{ matrix.variant }}
+      IMAGE_REPO: ghcr.io/inclusionai/areal-runtime
     steps:
-      - name: Clean workspace
-        run: sudo rm -rf "${{ github.workspace }}"/*
-
       - uses: actions/checkout@v6
         with:
           ref: ${{ inputs.ref || github.sha }}
@@ -154,67 +146,81 @@ jobs:
       - name: Log in to GitHub Container Registry
         run: echo "${{ secrets.GHCR_TOKEN }}" | docker login ghcr.io -u inclusionai --password-stdin
 
-      - name: Pull latest runtime image
-        run: docker pull "$CONTAINER_IMAGE"
-
-      - name: Start container
-        run: |
-          docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
-          docker run --name "$CONTAINER_NAME" -d \
-            --runtime=nvidia --gpus all \
-            --net=host \
-            --shm-size=54g \
-            --ulimit nofile=1048576:1048576 \
-            --cap-add=SYS_ADMIN \
-            --device=/dev/fuse \
-            --security-opt=apparmor:unconfined \
-            -e HF_TOKEN="${{ secrets.HF_TOKEN }}" \
-            -e TOKENIZERS_PARALLELISM=false \
-            -v "${{ github.workspace }}:/workspace" \
-            -w /workspace \
-            --entrypoint=/bin/bash \
-            "$CONTAINER_IMAGE" \
-            -lc "trap : TERM INT; sleep infinity & wait"
-
-      - name: Install AReaL from source
+      - name: Pull latest runtime images
         run: |
-          docker exec "$CONTAINER_NAME" bash -lc '
-            export PATH=/opt/.venv/bin:$PATH
-            uv pip install -e . --no-deps
-          '
+          docker pull "$IMAGE_REPO:dev-sglang"
+          docker pull "$IMAGE_REPO:dev-vllm"
 
       - name: Determine run parameters
         id: params
         run: |
           BACKENDS=("fsdp" "megatron" "archon")
+          BACKEND_SHORT=("f" "m" "a")
           DAY_OF_YEAR=$(date -u +%j)
           BACKEND_INDEX=$(( (10#$DAY_OF_YEAR - 1) % 3 ))
           echo "experiment_name=nightly-gsm8k-$(date -u +%Y-%m)" >> "$GITHUB_OUTPUT"
-          echo "trial_name=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
+          echo "trial_date=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
           echo "train_backend=${BACKENDS[$BACKEND_INDEX]}" >> "$GITHUB_OUTPUT"
+          echo "train_backend_short=${BACKEND_SHORT[$BACKEND_INDEX]}" >> "$GITHUB_OUTPUT"
 
-      - name: Run training
+      - name: Run training (sglang + vllm)
         env:
-          VARIANT: ${{ matrix.variant }}
           EXPERIMENT_NAME: ${{ steps.params.outputs.experiment_name }}
-          TRIAL_NAME: ${{ steps.params.outputs.trial_name }}
+          TRIAL_DATE: ${{ steps.params.outputs.trial_date }}
           TRAIN_BACKEND: ${{ steps.params.outputs.train_backend }}
+          TRAIN_BACKEND_SHORT: ${{ steps.params.outputs.train_backend_short }}
+          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          WORKSPACE: ${{ github.workspace }}
+          MODEL_NAME: Qwen/Qwen3-0.6B
         run: |
-          docker exec "$CONTAINER_NAME" bash -lc '
-            export PATH=/opt/.venv/bin:$PATH
-            python3 examples/math/gsm8k_rl.py --config examples/math/gsm8k_grpo.yaml \
-              experiment_name='"$EXPERIMENT_NAME"' trial_name='"$TRIAL_NAME"' \
-              stats_logger.wandb.mode=disabled \
-              rollout.backend='"$VARIANT"':d1 actor.backend='"$TRAIN_BACKEND"':d1 \
-              cluster.n_nodes=1 cluster.n_gpus_per_node=2 \
-              actor.path=Qwen/Qwen3-0.6B \
-              scheduler.type=local train_dataset.batch_size=64
-          '
-
-      - name: Teardown container
+          MODEL_CACHE=/opt/hf_cache
+          sudo mkdir -p "$MODEL_CACHE"
+          sudo chmod 777 "$MODEL_CACHE"
+          MODEL_LOCAL="$MODEL_CACHE/$MODEL_NAME"
+
+          declare -A VARIANT_SHORT=( [sglang]=s [vllm]=v )
+
+          run_variant() {
+            local variant=$1
+            local trial_name="${TRAIN_BACKEND_SHORT}.${VARIANT_SHORT[$variant]}-${TRIAL_DATE}"
+            echo "=== Running variant: $variant (trial: $trial_name) ==="
+            docker run --rm \
+              --runtime=nvidia --gpus all \
+              --net=host \
+              --shm-size=500g \
+              --ulimit nofile=1048576:1048576 \
+              --cap-add=SYS_ADMIN \
+              --device=/dev/fuse \
+              --security-opt=apparmor:unconfined \
+              -e HF_TOKEN="$HF_TOKEN" \
+              -e WANDB_API_KEY="$WANDB_API_KEY" \
+              -e TOKENIZERS_PARALLELISM=false \
+              -v "$WORKSPACE:/workspace" \
+              -v "$MODEL_CACHE:/model_cache" \
+              -w /workspace \
+              "$IMAGE_REPO:dev-$variant" \
+              bash -lc '
+                export PATH=/opt/.venv/bin:$PATH
+                huggingface-cli download '"$MODEL_NAME"' --local-dir /model_cache/'"$MODEL_NAME"'
+                uv pip install -e . --no-deps
+                python3 examples/math/gsm8k_rl.py --config examples/math/gsm8k_grpo.yaml \
+                  experiment_name='"$EXPERIMENT_NAME"' trial_name='"$trial_name"' \
+                  stats_logger.wandb.mode=online \
+                  rollout.backend='"$variant"':d1 actor.backend='"$TRAIN_BACKEND"':d1 \
+                  cluster.n_nodes=1 cluster.n_gpus_per_node=2 \
+                  actor.path=/model_cache/'"$MODEL_NAME"' \
+                  scheduler.type=local train_dataset.batch_size=64
+              '
+            sudo find "$WORKSPACE" -not -user "$(id -u)" -delete 2>/dev/null || true
+          }
+
+          run_variant sglang
+          run_variant vllm
+
+      - name: Cleanup
         if: always()
         run: |
-          docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
           docker logout ghcr.io 2>/dev/null || true
           sudo find "${{ github.workspace }}" -not -user "$(id -u)" -delete 2>/dev/null || true
 

From f433798b556f4e466b14c1fe06ae24568b3874e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= <bowei.fw@antgroup.com>
Date: Sat, 9 May 2026 15:55:36 +0800
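Subject: [PATCH 4/6] ci: add pre-checkout workspace cleanup for root-owned
 files

Restore the "Clean workspace" step that the previous patch removed, so
root-owned files left behind by an earlier container run are deleted
before actions/checkout runs. Unlike the original step, errors are
suppressed (2>/dev/null || true), so a failed cleanup does not fail
the job.
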
---
 .github/workflows/nightly.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 58ac1e78e9..1d437229b2 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -139,6 +139,9 @@ jobs:
     env:
       IMAGE_REPO: ghcr.io/inclusionai/areal-runtime
     steps:
+      - name: Clean workspace
+        run: sudo rm -rf "${{ github.workspace }}"/* 2>/dev/null || true
+
       - uses: actions/checkout@v6
         with:
           ref: ${{ inputs.ref || github.sha }}

From 6d4ea28d81734b93ab2773d6f1c248f9b8ca9af5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= <bowei.fw@antgroup.com>
Date: Sat, 9 May 2026 17:27:52 +0800
Subject: [PATCH 5/6] ci: remove sudo mkdir/chmod for model cache dir

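The runner user lacks passwordless sudo, so the sudo mkdir/chmod calls
cannot run. The cache dir is pre-created on the instance instead.
Also drop the MODEL_LOCAL assignment, which nothing in the step reads.
The remaining sudo-based cleanup commands are best-effort (`|| true`)
and are left unchanged.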
---
 .github/workflows/nightly.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 1d437229b2..8cf85d396f 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -178,9 +178,6 @@ jobs:
           MODEL_NAME: Qwen/Qwen3-0.6B
         run: |
           MODEL_CACHE=/opt/hf_cache
-          sudo mkdir -p "$MODEL_CACHE"
-          sudo chmod 777 "$MODEL_CACHE"
-          MODEL_LOCAL="$MODEL_CACHE/$MODEL_NAME"
 
           declare -A VARIANT_SHORT=( [sglang]=s [vllm]=v )
 

From 955b4ce7483c06c5cb06e17e3e4327f630a269bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8D=9A=E6=83=9F?= <bowei.fw@antgroup.com>
Date: Sat, 9 May 2026 19:01:21 +0800
Subject: [PATCH 6/6] ci: use HF_HOME for model cache instead of --local-dir

Pass model ID to actor.path and let from_pretrained resolve
from the HF cache. Pre-download just populates the cache.

Enable `set -e` after the pre-download so that a failed editable
install aborts the run instead of being ignored. The pre-download
itself stays best-effort.
---
 .github/workflows/nightly.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 8cf85d396f..ff835c3400 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -194,6 +194,7 @@ jobs:
               --device=/dev/fuse \
               --security-opt=apparmor:unconfined \
               -e HF_TOKEN="$HF_TOKEN" \
+              -e HF_HOME=/model_cache \
               -e WANDB_API_KEY="$WANDB_API_KEY" \
               -e TOKENIZERS_PARALLELISM=false \
               -v "$WORKSPACE:/workspace" \
@@ -202,14 +203,15 @@ jobs:
               "$IMAGE_REPO:dev-$variant" \
               bash -lc '
                 export PATH=/opt/.venv/bin:$PATH
-                huggingface-cli download '"$MODEL_NAME"' --local-dir /model_cache/'"$MODEL_NAME"'
+                huggingface-cli download '"$MODEL_NAME"'
+                set -e  # abort if the install fails; the download above stays best-effort
                 uv pip install -e . --no-deps
                 python3 examples/math/gsm8k_rl.py --config examples/math/gsm8k_grpo.yaml \
                   experiment_name='"$EXPERIMENT_NAME"' trial_name='"$trial_name"' \
                   stats_logger.wandb.mode=online \
                   rollout.backend='"$variant"':d1 actor.backend='"$TRAIN_BACKEND"':d1 \
                   cluster.n_nodes=1 cluster.n_gpus_per_node=2 \
-                  actor.path=/model_cache/'"$MODEL_NAME"' \
+                  actor.path='"$MODEL_NAME"' \
                   scheduler.type=local train_dataset.batch_size=64
               '
             sudo find "$WORKSPACE" -not -user "$(id -u)" -delete 2>/dev/null || true