Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
f49cf58
feature(pu): adapt to npu device
puyuan1996 Feb 9, 2026
0918623
fix(pu): fix indent type
Feb 9, 2026
4ab9eb8
fix(pu): fix import compatibility when no sglang
puyuan1996 Feb 9, 2026
ce5fd16
fix(pu): fix npu compatibility in strategy_base.py
puyuan1996 Feb 9, 2026
877a5c2
fix(pu): fix npu compatibility in the whole repo
puyuan1996 Feb 9, 2026
179b50e
fix(pu): fix npu backend
puyuan1996 Feb 9, 2026
1b868e3
fix(pu): fix torch.cuda hardcoded to adapt to npu
puyuan1996 Feb 9, 2026
ccda92d
fix(pu): fix cuda hardcoded to adapt to npu
puyuan1996 Feb 9, 2026
61efe44
fix(pu): fix vllm init bug
puyuan1996 Feb 9, 2026
fac17fd
tmp
puyuan1996 Feb 10, 2026
bf02648
fix(pu): adapt vllm WorkerWrap to npu
puyuan1996 Mar 2, 2026
3676fee
test(pu): try to fix vllm inference
puyuan1996 Mar 2, 2026
55af18a
Merge branch 'dev-npu' of https://github.com/opendilab/LightRFT into …
puyuan1996 Mar 2, 2026
5f11088
fix(pu): fix torch.cuda hardcoded to adapt to current_device
puyuan1996 Mar 2, 2026
76f79ac
test(pu): try to fix gen_time.item() runtime error
puyuan1996 Mar 2, 2026
0997809
fix(pu): fix get_current_device
puyuan1996 Mar 2, 2026
65c283a
test(pu): try Synchronize before CPU transfer
puyuan1996 Mar 2, 2026
bde52b2
test(pu): try to fix all_reduce and all_gather in npu setting
puyuan1996 Mar 2, 2026
fa07849
test(pu): try to fix all_reduce and all_gather in npu setting
puyuan1996 Mar 2, 2026
fefa608
test(pu): try to fix all_reduce and all_gather in npu setting
puyuan1996 Mar 2, 2026
0189c9b
test(pu): try to fix all_reduce and all_gather in npu setting
puyuan1996 Mar 2, 2026
6dd1a44
polish(pu): add compilation_config in vllm_ascend
puyuan1996 Mar 3, 2026
2cb6724
fix(pu): fix npu device compatibility in reward_models_utils.py
puyuan1996 Mar 3, 2026
f279ca0
test(pu): add safework_t1 test
Mar 4, 2026
0cd9540
Merge branch 'dev-npu' of https://github.com/opendilab/LightRFT into …
puyuan1996 Mar 4, 2026
7cc96cc
polish(pu): adapt safework_t1 code to npu device
puyuan1996 Mar 4, 2026
e2982f1
sync code
puyuan1996 Mar 4, 2026
a735d32
fix(pu): fix rm use_engine cfg bug
puyuan1996 Mar 4, 2026
8a96ab2
fix(pu): fix get_tokenizer_processor_vl args
puyuan1996 Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions examples/gsm8k_geo3k/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Minimal vLLM offline-inference smoke test.

Generates completions for a small set of fixed prompts and prints them.
The model path can be overridden with the ``VLLM_MODEL_PATH`` environment
variable; it defaults to the original checkpoint directory so existing
usage is unchanged.
"""
import os

from vllm import LLM, SamplingParams
# from vllm_ascend import LLM, SamplingParams

# Default checkpoint location; override via VLLM_MODEL_PATH so the example
# is portable across machines (the original value was machine-specific).
DEFAULT_MODEL_PATH = "/data/puyuan/LightRFT/model/Qwen2.5-0.5B-Instruct/7ae557604adf67be50417f59c2c2f167def9a775/"

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


def main():
    """Run one sampling batch over ``prompts`` and print each completion."""
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Create an LLM from the configured (or default) checkpoint path.
    llm = LLM(model=os.environ.get("VLLM_MODEL_PATH", DEFAULT_MODEL_PATH))

    # Generate texts from the prompts; one output object per prompt.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    main()
6 changes: 3 additions & 3 deletions examples/gsm8k_geo3k/reward_models_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import torch

from lightrft.utils.utils import get_current_device

# ============================================================================
# Reward Recipe Configuration
Expand Down Expand Up @@ -289,7 +289,7 @@ def mix_rewards(
print(f"[mix_rewards] labels: {labels}")
print(f"[mix_rewards] model_scores shape: {model_scores.shape}")

device = model_scores.device if model_scores.numel() > 0 else torch.device('cuda')
device = model_scores.device if model_scores.numel() > 0 else get_current_device()
B = len(labels)

final_reward = torch.zeros(B, dtype=torch.float32, device=device)
Expand Down Expand Up @@ -390,7 +390,7 @@ def reward_fn(
else:
# No neural reward models - create empty placeholder
B = len(labels)
model_scores = torch.zeros(0, B, dtype=torch.float32, device="cuda")
model_scores = torch.zeros(0, B, dtype=torch.float32, device=get_current_device())

# Call mix_rewards to compute final rewards
return mix_rewards(labels, model_scores, label_map, queries, refs)
255 changes: 255 additions & 0 deletions examples/gsm8k_geo3k/run_grpo_gsm8k_qwen2.5_0.5b_2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
#!/bin/bash
#
# LightRFT training script for the GSM8K dataset.
# Fine-tunes a text-only model (e.g. Qwen2.5-Instruct) using the GRPO algorithm.
#
# Key feature:
# This training run uses a PURE RULE-BASED REWARD mechanism, so no separate
# reward model is needed. The reward combines two criteria:
#   - Format correctness (10%): adherence to the required <think>...</think>
#     and \boxed{} output format.
#   - Answer accuracy (90%): correctness of the final mathematical answer.

# --- Conda environment setup ---
# Initialize conda's shell hook (required for non-interactive scripts),
# then activate the target environment.
eval "$(conda shell.bash hook)"
conda activate /mnt/shared-storage-user/puyuan/conda_envs/lightrft_py312

# Make the project root importable by the training entry point.
export PYTHONPATH=/mnt/shared-storage-user/puyuan/code/LightRFT:$PYTHONPATH

################################################################################
#                        Part 1: User Configuration                           #
#  Update the following paths and settings to match your environment.         #
################################################################################

# --- Model and dataset paths ---
# Path to the base model: a Hugging Face model name or a local directory.
# This script is designed for TEXT-ONLY models.
# PATH_TO_YOUR_BASE_MODEL="Qwen/Qwen2.5-0.5B-Instruct"
# PATH_TO_YOUR_BASE_MODEL="Qwen/Qwen2.5-7B-Instruct"   # example for a larger model
PATH_TO_YOUR_BASE_MODEL="/data/puyuan/LightRFT/model/Qwen2.5-0.5B-Instruct"

# Path to the preprocessed GSM8K dataset.
# See "Usage Instructions" at the end of this script for preprocessing steps.
PATH_TO_YOUR_GSM8K_DATASET="/data/puyuan/LightRFT/data/gsm8k"

# --- Experiment and logging ---
# A descriptive name for your experiment; used for organizing logs/checkpoints.
EXPERIMENT_NAME="lightrft-gsm8k-grpo-0127-sleep"

# Weights & Biases credentials.
# SECURITY: never hardcode the API key in a committed script. Export
# WANDB_API_KEY in your shell (or CI secret store) before launching.
# Leave it unset/empty to disable W&B logging.
if [ -z "${WANDB_API_KEY:-}" ]; then
    echo "WARNING: WANDB_API_KEY is not set; W&B logging will be disabled." >&2
fi
export WANDB_API_KEY="${WANDB_API_KEY:-}"
export WANDB_PROJECT="LightRFT-GSM8K-Experiments"


################################################################################
#                      Part 2: Training Hyperparameters                        #
#  These settings control the training process. Adjust them as needed.         #
################################################################################

# --- GRPO settings ---
GROUP_METHOD="normal"
N_SAMPLES=8      # Number of samples per prompt for GRPO (must be > 1).
EPISODE=30       # Total number of training episodes.
WARMUP=0.03      # Learning-rate warmup ratio.

# --- Batch-size configuration ---
# RBS=64                     # Rollout batch size.
# TBS=$((RBS * N_SAMPLES))   # Train batch size derived from RBS and N_SAMPLES.
RBS=128
TBS=128

# --- Learning and model settings ---
KL=0.01                 # KL divergence coefficient.
LR=1e-6                 # Actor learning rate.
MAX_LENGTH=3072         # Max sequence length (prompt + generation).
PROMPT_MAX_LEN=1024     # Max length of the input prompt.
GENERATE_MAX_LEN=2048   # Max length of the generated response.

# --- Evaluation settings ---
EVAL_SPLIT="test"       # Dataset split used for evaluation.
MAX_EVAL_SAMPLES=1319   # 1319 = full evaluation on the GSM8K test set.


################################################################################
#                     Part 3: Distributed Training Setup                       #
#  Configure settings for multi-GPU and multi-node training.                   #
################################################################################

# --- Single-node distributed setup ---
# Update these if you are running in a multi-node environment.
export MLP_WORKER_NUM=1              # Number of nodes.
export MLP_WORKER_GPU=8              # Number of GPUs per node.
export MLP_ROLE_INDEX=0              # Rank of the current node.
export MLP_WORKER_0_HOST="localhost" # IP address of the master node (node 0).
export MLP_WORKER_0_PORT=20090       # Port for the master node.

# --- PyTorch distributed environment variables ---
export MASTER_ADDR=$MLP_WORKER_0_HOST
export MASTER_PORT=$MLP_WORKER_0_PORT
export NNODES=$MLP_WORKER_NUM
export NODE_RANK=$MLP_ROLE_INDEX
export GPUS_PER_NODE=$MLP_WORKER_GPU

# --- vLLM/SGLang engine settings ---
ENGINE_TP=2   # Tensor-parallel size for the inference engine; adjust per model/GPU setup.


################################################################################
#                       Part 4: Execution and Logging                          #
#  This section prepares and launches the training command.                    #
################################################################################

# --- Generate dynamic names and paths ---
current_time=$(date +"%Y%m%d_%H%M%S")
SAVE_MODEL_NAME="${EXPERIMENT_NAME}-ep${EPISODE}-kl${KL}-lr${LR}-${current_time}"
WANDB_RUN_NAME="${EXPERIMENT_NAME}-${current_time}"

# --- Create directories for logs and checkpoints ---
mkdir -p "results/${EXPERIMENT_NAME}/${SAVE_MODEL_NAME}"
mkdir -p "rft_logs/${EXPERIMENT_NAME}"

# --- System and environment optimizations ---
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export NCCL_DEBUG="WARN"
export IGNORE_EOS=0
# export WANDB_MODE="offline"   # Set to "online" for real-time W&B logging.

# --- Echo every command from here on (execution verbosity) ---
set -x


################################################################################
#                        Part 5: Main Training Command                         #
################################################################################

# Alternative flags kept for reference:
#   --engine_type sglang
#   --disable_engine_sleep

torchrun \
    --nnodes $NNODES \
    --nproc-per-node $GPUS_PER_NODE \
    --node_rank $NODE_RANK \
    --master-port $MASTER_PORT \
    --master-addr $MASTER_ADDR \
    examples/gsm8k_geo3k/train_colocate.py \
    --pretrain "${PATH_TO_YOUR_BASE_MODEL}" \
    --save_trajectories \
    --advantage_estimator "group_norm" \
    --fsdp \
    --use_kl_loss \
    --flash_attn \
    --engine_type vllm \
    --enable_engine_sleep \
    --rm_use_engine \
    --reward_pretrain "{}" \
    --save_path "results/${EXPERIMENT_NAME}/${SAVE_MODEL_NAME}" \
    --ckpt_path "results/${EXPERIMENT_NAME}/${SAVE_MODEL_NAME}" \
    --micro_train_batch_size 4 \
    --train_batch_size ${TBS} \
    --micro_rollout_batch_size 4 \
    --rollout_batch_size ${RBS} \
    --max_epochs 1 \
    --num_episodes ${EPISODE} \
    --lr_warmup_ratio ${WARMUP} \
    --n_samples_per_prompt $N_SAMPLES \
    --prompt_max_len $PROMPT_MAX_LEN \
    --generate_max_len $GENERATE_MAX_LEN \
    --zero_stage 3 \
    --bf16 \
    --actor_learning_rate $LR \
    --init_kl_coef $KL \
    --kl_estimator "k3" \
    --prompt_data "${PATH_TO_YOUR_GSM8K_DATASET}" \
    --input_key "prompt" \
    --label_key "label" \
    --eval_steps 20 \
    --eval_split "${EVAL_SPLIT}" \
    --max_eval_samples $MAX_EVAL_SAMPLES \
    --apply_chat_template \
    --gradient_checkpointing \
    --save_steps 20 \
    --max_ckpt_num 3 \
    --engine_mem_util 0.6 \
    --engine_tp_size $ENGINE_TP \
    --system_prompt 'A conversation between the User and Assistant. The User asks a question, and the Assistant provides a solution. The Assistant first thinks through the reasoning process internally with self-reflection and consistency check and then gives the final analysis and answer. The reasoning process should be enclosed within <think></think>, followed directly by the final thought and answer, the final answer MUST BE put in \\boxed{}, like this: <think> reasoning process here </think> final thought and \\boxed{answer} here.' \
    --l2 1.0e-2 \
    --freeze_prefix \
    --adam_offload \
    --text_only \
    --use_wandb "${WANDB_API_KEY}" \
    --wandb_project "${WANDB_PROJECT}" \
    --wandb_run_name "${WANDB_RUN_NAME}" \
    2>&1 | tee "rft_logs/${EXPERIMENT_NAME}/node${NODE_RANK}_${current_time}.log"


################################################################################
#                            Usage Instructions                                #
#                                                                              #
#  Step 1: Preprocess the GSM8K dataset                                        #
#    Run the provided preprocessing script to prepare the dataset.             #
#    Make sure the output directory matches `PATH_TO_YOUR_GSM8K_DATASET`.      #
#                                                                              #
#    `python examples/data_preprocess/gsm8k_lightrft.py \                      #
#        --local_save_dir /path/to/your/preprocessed/gsm8k_dataset`            #
#                                                                              #
#  Step 2: Configure the script                                                #
#    Edit "Part 1: User Configuration" at the top of this file. Set the paths #
#    to your base model and the preprocessed dataset, and export               #
#    WANDB_API_KEY in your environment if you want W&B logging.                #
#                                                                              #
#  Step 3: Run the training script                                             #
#    `bash /path/to/this/script.sh`                                            #
#                                                                              #
#  Key notes for text-only training:                                           #
#    - This script is configured for a text-only task (GSM8K).                 #
#    - The `--text_only` flag is CRITICAL: it ensures the run stays in         #
#      text-only mode and does not expect image data.                          #
################################################################################
Loading
Loading