Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
f49cf58
feature(pu): adapt to npu device
puyuan1996 Feb 9, 2026
0918623
fix(pu): fix indent type
Feb 9, 2026
4ab9eb8
fix(pu): fix import compatibility when no sglang
puyuan1996 Feb 9, 2026
ce5fd16
fix(pu): fix npu compatibility in strategy_base.py
puyuan1996 Feb 9, 2026
877a5c2
fix(pu): fix npu compatibility in the whole repo
puyuan1996 Feb 9, 2026
179b50e
fix(pu): fix npu backend
puyuan1996 Feb 9, 2026
1b868e3
fix(pu): fix torch.cuda hardcoded to adapt to npu
puyuan1996 Feb 9, 2026
ccda92d
fix(pu): fix cuda hardcoded to adapt to npu
puyuan1996 Feb 9, 2026
61efe44
fix(pu): fix vllm init bug
puyuan1996 Feb 9, 2026
fac17fd
tmp
puyuan1996 Feb 10, 2026
bf02648
fix(pu): adapt vllm WorkerWrap to npu
puyuan1996 Mar 2, 2026
3676fee
test(pu): try to fix vllm inference
puyuan1996 Mar 2, 2026
55af18a
Merge branch 'dev-npu' of https://github.com/opendilab/LightRFT into …
puyuan1996 Mar 2, 2026
5f11088
fix(pu): fix torch.cuda hardcoded to adapt to current_device
puyuan1996 Mar 2, 2026
76f79ac
test(pu): try to fix gen_time.item() runtime error
puyuan1996 Mar 2, 2026
0997809
fix(pu): fix get_current_device
puyuan1996 Mar 2, 2026
65c283a
test(pu): try Synchronize before CPU transfer
puyuan1996 Mar 2, 2026
bde52b2
test(pu): try to fix all_reduce and all_gather in npu setting
puyuan1996 Mar 2, 2026
fa07849
test(pu): try to fix all_reduce and all_gather in npu setting
puyuan1996 Mar 2, 2026
fefa608
test(pu): try to fix all_reduce and all_gather in npu setting
puyuan1996 Mar 2, 2026
0189c9b
test(pu): try to fix all_reduce and all_gather in npu setting
puyuan1996 Mar 2, 2026
6dd1a44
polish(pu): add compilation_config in vllm_ascend
puyuan1996 Mar 3, 2026
2cb6724
fix(pu): fix npu device compatibility in reward_models_utils.py
puyuan1996 Mar 3, 2026
f279ca0
test(pu): add safework_t1 test
Mar 4, 2026
0cd9540
Merge branch 'dev-npu' of https://github.com/opendilab/LightRFT into …
puyuan1996 Mar 4, 2026
7cc96cc
polish(pu): adapt safework_t1 code to npu device
puyuan1996 Mar 4, 2026
e2982f1
sync code
puyuan1996 Mar 4, 2026
a735d32
fix(pu): fix rm use_engine cfg bug
puyuan1996 Mar 4, 2026
8a96ab2
fix(pu): fix get_tokenizer_processor_vl args
puyuan1996 Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions examples/gsm8k_geo3k/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Minimal vLLM offline-inference smoke test.

Generates completions for a small set of fixed prompts and prints them.
The model path can be overridden with the ``VLLM_MODEL_PATH`` environment
variable; it defaults to the original checkpoint directory so existing
usage is unchanged.
"""
import os

from vllm import LLM, SamplingParams
# from vllm_ascend import LLM, SamplingParams

# Default checkpoint location; override via VLLM_MODEL_PATH so the example
# is portable across machines (the original value was machine-specific).
DEFAULT_MODEL_PATH = "/data/puyuan/LightRFT/model/Qwen2.5-0.5B-Instruct/7ae557604adf67be50417f59c2c2f167def9a775/"

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


def main():
    """Run one sampling batch over ``prompts`` and print each completion."""
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Create an LLM from the configured (or default) checkpoint path.
    llm = LLM(model=os.environ.get("VLLM_MODEL_PATH", DEFAULT_MODEL_PATH))

    # Generate texts from the prompts; one output object per prompt.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    main()
6 changes: 3 additions & 3 deletions examples/gsm8k_geo3k/reward_models_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import torch

from lightrft.utils.utils import get_current_device

# ============================================================================
# Reward Recipe Configuration
Expand Down Expand Up @@ -289,7 +289,7 @@ def mix_rewards(
print(f"[mix_rewards] labels: {labels}")
print(f"[mix_rewards] model_scores shape: {model_scores.shape}")

device = model_scores.device if model_scores.numel() > 0 else torch.device('cuda')
device = model_scores.device if model_scores.numel() > 0 else get_current_device()
B = len(labels)

final_reward = torch.zeros(B, dtype=torch.float32, device=device)
Expand Down Expand Up @@ -390,7 +390,7 @@ def reward_fn(
else:
# No neural reward models - create empty placeholder
B = len(labels)
model_scores = torch.zeros(0, B, dtype=torch.float32, device="cuda")
model_scores = torch.zeros(0, B, dtype=torch.float32, device=get_current_device())

# Call mix_rewards to compute final rewards
return mix_rewards(labels, model_scores, label_map, queries, refs)
255 changes: 255 additions & 0 deletions examples/gsm8k_geo3k/run_grpo_gsm8k_qwen2.5_0.5b_2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
#!/bin/bash
#
# LightRFT training script for the GSM8K dataset.
# Fine-tunes a text-only model (e.g. Qwen2.5-Instruct) using the GRPO algorithm.
#
# Key feature:
# This training run uses a PURE RULE-BASED REWARD mechanism, so no separate
# reward model is needed. The reward combines two criteria:
#   - Format correctness (10%): adherence to the required <think>...</think>
#     and \boxed{} output format.
#   - Answer accuracy (90%): correctness of the final mathematical answer.

# --- Conda environment setup ---
# Initialize conda's shell hook (required for non-interactive scripts),
# then activate the target environment.
eval "$(conda shell.bash hook)"
conda activate /mnt/shared-storage-user/puyuan/conda_envs/lightrft_py312

# Make the project root importable by the training entry point.
export PYTHONPATH=/mnt/shared-storage-user/puyuan/code/LightRFT:$PYTHONPATH

################################################################################
#                        Part 1: User Configuration                           #
#  Update the following paths and settings to match your environment.         #
################################################################################

# --- Model and dataset paths ---
# Path to the base model: a Hugging Face model name or a local directory.
# This script is designed for TEXT-ONLY models.
# PATH_TO_YOUR_BASE_MODEL="Qwen/Qwen2.5-0.5B-Instruct"
# PATH_TO_YOUR_BASE_MODEL="Qwen/Qwen2.5-7B-Instruct"   # example for a larger model
PATH_TO_YOUR_BASE_MODEL="/data/puyuan/LightRFT/model/Qwen2.5-0.5B-Instruct"

# Path to the preprocessed GSM8K dataset.
# See "Usage Instructions" at the end of this script for preprocessing steps.
PATH_TO_YOUR_GSM8K_DATASET="/data/puyuan/LightRFT/data/gsm8k"

# --- Experiment and logging ---
# A descriptive name for your experiment; used for organizing logs/checkpoints.
EXPERIMENT_NAME="lightrft-gsm8k-grpo-0127-sleep"

# Weights & Biases credentials.
# SECURITY: never hardcode the API key in a committed script. Export
# WANDB_API_KEY in your shell (or CI secret store) before launching.
# Leave it unset/empty to disable W&B logging.
if [ -z "${WANDB_API_KEY:-}" ]; then
    echo "WARNING: WANDB_API_KEY is not set; W&B logging will be disabled." >&2
fi
export WANDB_API_KEY="${WANDB_API_KEY:-}"
export WANDB_PROJECT="LightRFT-GSM8K-Experiments"


################################################################################
#                      Part 2: Training Hyperparameters                        #
#  These settings control the training process. Adjust them as needed.         #
################################################################################

# --- GRPO settings ---
GROUP_METHOD="normal"
N_SAMPLES=8      # Number of samples per prompt for GRPO (must be > 1).
EPISODE=30       # Total number of training episodes.
WARMUP=0.03      # Learning-rate warmup ratio.

# --- Batch-size configuration ---
# RBS=64                     # Rollout batch size.
# TBS=$((RBS * N_SAMPLES))   # Train batch size derived from RBS and N_SAMPLES.
RBS=128
TBS=128

# --- Learning and model settings ---
KL=0.01                 # KL divergence coefficient.
LR=1e-6                 # Actor learning rate.
MAX_LENGTH=3072         # Max sequence length (prompt + generation).
PROMPT_MAX_LEN=1024     # Max length of the input prompt.
GENERATE_MAX_LEN=2048   # Max length of the generated response.

# --- Evaluation settings ---
EVAL_SPLIT="test"       # Dataset split used for evaluation.
MAX_EVAL_SAMPLES=1319   # 1319 = full evaluation on the GSM8K test set.


################################################################################
#                     Part 3: Distributed Training Setup                       #
#  Configure settings for multi-GPU and multi-node training.                   #
################################################################################

# --- Single-node distributed setup ---
# Update these if you are running in a multi-node environment.
export MLP_WORKER_NUM=1              # Number of nodes.
export MLP_WORKER_GPU=8              # Number of GPUs per node.
export MLP_ROLE_INDEX=0              # Rank of the current node.
export MLP_WORKER_0_HOST="localhost" # IP address of the master node (node 0).
export MLP_WORKER_0_PORT=20090       # Port for the master node.

# --- PyTorch distributed environment variables ---
export MASTER_ADDR=$MLP_WORKER_0_HOST
export MASTER_PORT=$MLP_WORKER_0_PORT
export NNODES=$MLP_WORKER_NUM
export NODE_RANK=$MLP_ROLE_INDEX
export GPUS_PER_NODE=$MLP_WORKER_GPU

# --- vLLM/SGLang engine settings ---
ENGINE_TP=2   # Tensor-parallel size for the inference engine; adjust per model/GPU setup.


################################################################################
#                       Part 4: Execution and Logging                          #
#  This section prepares and launches the training command.                    #
################################################################################

# --- Generate dynamic names and paths ---
current_time=$(date +"%Y%m%d_%H%M%S")
SAVE_MODEL_NAME="${EXPERIMENT_NAME}-ep${EPISODE}-kl${KL}-lr${LR}-${current_time}"
WANDB_RUN_NAME="${EXPERIMENT_NAME}-${current_time}"

# --- Create directories for logs and checkpoints ---
mkdir -p "results/${EXPERIMENT_NAME}/${SAVE_MODEL_NAME}"
mkdir -p "rft_logs/${EXPERIMENT_NAME}"

# --- System and environment optimizations ---
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export NCCL_DEBUG="WARN"
export IGNORE_EOS=0
# export WANDB_MODE="offline"   # Set to "online" for real-time W&B logging.

# --- Echo every command from here on (execution verbosity) ---
set -x


################################################################################
#                        Part 5: Main Training Command                         #
################################################################################

# Alternative flags kept for reference:
#   --engine_type sglang
#   --disable_engine_sleep

torchrun \
    --nnodes $NNODES \
    --nproc-per-node $GPUS_PER_NODE \
    --node_rank $NODE_RANK \
    --master-port $MASTER_PORT \
    --master-addr $MASTER_ADDR \
    examples/gsm8k_geo3k/train_colocate.py \
    --pretrain "${PATH_TO_YOUR_BASE_MODEL}" \
    --save_trajectories \
    --advantage_estimator "group_norm" \
    --fsdp \
    --use_kl_loss \
    --flash_attn \
    --engine_type vllm \
    --enable_engine_sleep \
    --rm_use_engine \
    --reward_pretrain "{}" \
    --save_path "results/${EXPERIMENT_NAME}/${SAVE_MODEL_NAME}" \
    --ckpt_path "results/${EXPERIMENT_NAME}/${SAVE_MODEL_NAME}" \
    --micro_train_batch_size 4 \
    --train_batch_size ${TBS} \
    --micro_rollout_batch_size 4 \
    --rollout_batch_size ${RBS} \
    --max_epochs 1 \
    --num_episodes ${EPISODE} \
    --lr_warmup_ratio ${WARMUP} \
    --n_samples_per_prompt $N_SAMPLES \
    --prompt_max_len $PROMPT_MAX_LEN \
    --generate_max_len $GENERATE_MAX_LEN \
    --zero_stage 3 \
    --bf16 \
    --actor_learning_rate $LR \
    --init_kl_coef $KL \
    --kl_estimator "k3" \
    --prompt_data "${PATH_TO_YOUR_GSM8K_DATASET}" \
    --input_key "prompt" \
    --label_key "label" \
    --eval_steps 20 \
    --eval_split "${EVAL_SPLIT}" \
    --max_eval_samples $MAX_EVAL_SAMPLES \
    --apply_chat_template \
    --gradient_checkpointing \
    --save_steps 20 \
    --max_ckpt_num 3 \
    --engine_mem_util 0.6 \
    --engine_tp_size $ENGINE_TP \
    --system_prompt 'A conversation between the User and Assistant. The User asks a question, and the Assistant provides a solution. The Assistant first thinks through the reasoning process internally with self-reflection and consistency check and then gives the final analysis and answer. The reasoning process should be enclosed within <think></think>, followed directly by the final thought and answer, the final answer MUST BE put in \\boxed{}, like this: <think> reasoning process here </think> final thought and \\boxed{answer} here.' \
    --l2 1.0e-2 \
    --freeze_prefix \
    --adam_offload \
    --text_only \
    --use_wandb "${WANDB_API_KEY}" \
    --wandb_project "${WANDB_PROJECT}" \
    --wandb_run_name "${WANDB_RUN_NAME}" \
    2>&1 | tee "rft_logs/${EXPERIMENT_NAME}/node${NODE_RANK}_${current_time}.log"


################################################################################
#                            Usage Instructions                                #
#                                                                              #
#  Step 1: Preprocess the GSM8K dataset                                        #
#    Run the provided preprocessing script to prepare the dataset.             #
#    Make sure the output directory matches `PATH_TO_YOUR_GSM8K_DATASET`.      #
#                                                                              #
#    `python examples/data_preprocess/gsm8k_lightrft.py \                      #
#        --local_save_dir /path/to/your/preprocessed/gsm8k_dataset`            #
#                                                                              #
#  Step 2: Configure the script                                                #
#    Edit "Part 1: User Configuration" at the top of this file. Set the paths #
#    to your base model and the preprocessed dataset, and export               #
#    WANDB_API_KEY in your environment if you want W&B logging.                #
#                                                                              #
#  Step 3: Run the training script                                             #
#    `bash /path/to/this/script.sh`                                            #
#                                                                              #
#  Key notes for text-only training:                                           #
#    - This script is configured for a text-only task (GSM8K).                 #
#    - The `--text_only` flag is CRITICAL: it ensures the run stays in         #
#      text-only mode and does not expect image data.                          #
################################################################################
Loading
Loading