Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions examples/ascend_extras/grpo_trainer/run_qwen3_5_35b_megatron_4k_32k.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#!/usr/bin/env bash
# Launcher for verl.trainer.main_ppo with the Megatron engine.
# Every setting below is a default; export the variable before calling this
# script to override it.
set -xeuo pipefail

########################### Environment ###########################

# vLLM runtime switches: keep the caller's value, otherwise apply the default.
: "${VLLM_USE_V1:=1}"
: "${VLLM_ALLREDUCE_USE_SYMM_MEM:=0}"
export VLLM_USE_V1 VLLM_ALLREDUCE_USE_SYMM_MEM

# vLLM-Ascend feature switches and CPU affinity, same override rule as above.
: "${VLLM_ASCEND_ENABLE_PREFETCH_MLP:=1}"
: "${VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE:=1}"
: "${VLLM_ASCEND_ENABLE_FLASHCOMM1:=1}"
: "${CPU_AFFINITY_CONF:=1}"
export VLLM_ASCEND_ENABLE_PREFETCH_MLP VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE
export VLLM_ASCEND_ENABLE_FLASHCOMM1 CPU_AFFINITY_CONF

########################### Quick Config ###########################

# ---- user-adjustable ----
# Training-side parallel sizes (tensor / pipeline / context / expert).
# They feed the actor and ref megatron.*_parallel_size overrides further down;
# EP is also passed to the rollout as expert_parallel_size.
: "${TP:=2}"
: "${PP:=2}"
: "${CP:=4}"
: "${EP:=8}"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

In this single-node 16-GPU configuration (trainer.nnodes=1, trainer.n_gpus_per_node=16), the data parallel size is DP = World_Size / (TP * PP * CP) = 16 / (2 * 2 * 4) = 1. Since Megatron-Core requires the expert model parallel size (EP) to be less than or equal to the data parallel size (DP), setting EP=8 will cause an assertion failure during model parallel initialization. For a single-node run, EP must be set to 1.

Suggested change
- EP=${EP:-8}
+ EP=${EP:-1}

# Expert tensor-parallel size (actor and ref), rollout tensor-parallel size,
# and the device count passed to trainer.n_gpus_per_node.
ETP=${ETP:-1}
GEN_TP=${GEN_TP:-8}
NDEVICES_PER_NODE=${NDEVICES_PER_NODE:-16}

# One switch for the actor param/optimizer/grad offload and the ref param offload.
ALL_OFFLOAD=${ALL_OFFLOAD:-True}

# Run identity and algorithm selection.
rollout_name=${rollout_name:-vllm}
project_name=${project_name:-verl_grpo_qwen3_5_35b_geo3k}
exp_name=${exp_name:-qwen3_5_35b_megatron_npu_4k_32k}
adv_estimator=${adv_estimator:-grpo}

# Model checkpoint and dataset locations.
HF_MODEL_PATH=${HF_MODEL_PATH:-"${HOME}/models/Qwen3.5-35B-A3B"}
train_path=${train_path:-"${HOME}/data/geo3k-4k/train.parquet"}
test_path=${test_path:-"${HOME}/data/geo3k-4k/test.parquet"}

# Timestamp used in the log file name. A single date call keeps the date and
# time parts from the same instant; two calls could straddle midnight.
start_time=$(date +%Y%m%d_%H%M%S)
# ---- end user-adjustable ----

########################### Parameter Arrays ###########################

# Dataset overrides. The path elements are quoted so that a path containing
# spaces or glob characters stays a single argument.
DATA=(
  "data.train_files=${train_path}"
  "data.val_files=${test_path}"
  data.train_batch_size=16
  data.max_prompt_length=$((1024 * 4))    # 4k prompt tokens
  data.max_response_length=$((1024 * 32)) # 32k response tokens
  data.truncation='error'
  data.filter_overlong_prompts=True
)

# Model overrides. The checkpoint path is quoted so that a path containing
# spaces or glob characters stays a single argument.
MODEL=(
  "actor_rollout_ref.model.path=${HF_MODEL_PATH}"
  actor_rollout_ref.model.trust_remote_code=True
  actor_rollout_ref.model.use_remove_padding=False
)

# Actor (training) overrides. Elements that expand a variable are quoted so
# the expansion can never be word-split or globbed into extra arguments.
ACTOR=(
  # Optimisation and loss settings.
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.ppo_mini_batch_size=16
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096
  actor_rollout_ref.actor.use_dynamic_bsz=False
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.01
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.actor.entropy_coeff=0

  # Megatron engine: parallel layout and offload, driven by the Quick Config
  # variables (TP / PP / CP / EP / ETP / ALL_OFFLOAD).
  actor_rollout_ref.actor.megatron.use_mbridge=True
  actor_rollout_ref.actor.megatron.vanilla_mbridge=False
  actor_rollout_ref.actor.megatron.use_remove_padding=False
  "actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TP}"
  "actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${PP}"
  "actor_rollout_ref.actor.megatron.context_parallel_size=${CP}"
  "actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP}"
  "actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP}"
  "actor_rollout_ref.actor.megatron.param_offload=${ALL_OFFLOAD}"
  "actor_rollout_ref.actor.megatron.optimizer_offload=${ALL_OFFLOAD}"
  "actor_rollout_ref.actor.megatron.grad_offload=${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.dtype=bfloat16

  actor_rollout_ref.actor.checkpoint.strict=False

  # Transformer-config overrides. The leading '+' / '++' are kept exactly as
  # written; NOTE(review): presumably Hydra's append / add-or-override
  # prefixes - confirm against the trainer's config schema.
  ++actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
  +actor_rollout_ref.actor.megatron.override_transformer_config.context_parallel_algo=kvallgather_cp_algo
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm=True
  +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall
  +actor_rollout_ref.actor.megatron.override_transformer_config.use_naive_l2norm=True

  # Optimizer-config overrides (CPU offload of optimizer work).
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
  +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
)

# Rollout (generation) overrides. Elements that expand a variable are quoted
# so the expansion can never be word-split or globbed into extra arguments.
ROLLOUT=(
  "actor_rollout_ref.rollout.name=${rollout_name}"
  "actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP}"
  actor_rollout_ref.rollout.gpu_memory_utilization=0.6
  actor_rollout_ref.rollout.n=5
  actor_rollout_ref.rollout.dtype=bfloat16
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096
  actor_rollout_ref.rollout.calculate_log_probs=True

  actor_rollout_ref.rollout.ignore_eos=False
  actor_rollout_ref.rollout.enforce_eager=False
  actor_rollout_ref.rollout.max_num_batched_tokens=16384
  # Uses the same EP value as the training-side expert parallel size.
  "actor_rollout_ref.rollout.expert_parallel_size=${EP}"
)

# Reference-policy overrides. The parallel layout uses the same TP / PP / CP /
# EP / ETP variables as the actor. Elements that expand a variable are quoted
# so the expansion can never be word-split or globbed into extra arguments.
REF=(
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096
  "actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TP}"
  "actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${PP}"
  "actor_rollout_ref.ref.megatron.context_parallel_size=${CP}"
  "actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP}"
  "actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP}"
  "actor_rollout_ref.ref.megatron.param_offload=${ALL_OFFLOAD}"
)

# Algorithm overrides. The estimator name is quoted so a caller-supplied value
# is always passed as one argument.
ALGORITHM=(
  "algorithm.adv_estimator=${adv_estimator}"
  algorithm.use_kl_in_reward=False
)

# Trainer overrides. Project and experiment names are free-form and may be
# overridden by the caller, so they are quoted to stay one argument each even
# when they contain spaces.
TRAINER=(
  trainer.critic_warmup=0
  trainer.logger='["console"]'
  "trainer.project_name=${project_name}"
  "trainer.experiment_name=${exp_name}"
  "trainer.n_gpus_per_node=${NDEVICES_PER_NODE}"
  trainer.nnodes=1
  trainer.save_freq=-1
  trainer.val_before_train=False
  trainer.test_freq=-1
  trainer.total_training_steps=20
  trainer.total_epochs=15
)

# Remaining top-level overrides: select the Megatron model engine.
EXTRA=(model_engine=megatron)

########################### Launch ###########################

# The log directory must exist before tee opens its output file.
mkdir -p logs

# Timestamped log file for this run.
log_file="logs/qwen3_5_35b_grpo_megatron_npu_4k_32k-${start_time}.log"

# Full argument list: the override groups defined above, followed by any
# extra arguments given to this script.
launch_args=(
  "${DATA[@]}"
  "${ALGORITHM[@]}"
  "${MODEL[@]}"
  "${ROLLOUT[@]}"
  "${ACTOR[@]}"
  "${REF[@]}"
  "${TRAINER[@]}"
  "${EXTRA[@]}"
  "$@"
)

# stdout and stderr of the trainer go both to the console and to the log file.
python3 -m verl.trainer.main_ppo "${launch_args[@]}" 2>&1 | tee "${log_file}"
Loading