From c2ecb1aed5de9ba6025126338042cb69f183155e Mon Sep 17 00:00:00 2001 From: Yanglin Zhang <56758695+lucky9-cyou@users.noreply.github.com> Date: Wed, 27 May 2026 06:37:20 +0000 Subject: [PATCH] fix: apply GDPO batch normalization per response instead of per token --- verl/trainer/ppo/core_algos.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py index 5e6a19a15d1..8cddb32aba5 100644 --- a/verl/trainer/ppo/core_algos.py +++ b/verl/trainer/ppo/core_algos.py @@ -463,7 +463,11 @@ def compute_gdpo_outcome_advantage( else: new_advantage += weights[i] * normalized_score - advantages = verl_F.masked_whiten(new_advantage, response_mask) * response_mask + response_level_advantage = verl_F.masked_mean(new_advantage, response_mask, axis=-1) + response_level_mask = response_mask.sum(dim=-1) > 0 + response_level_advantage = verl_F.masked_whiten(response_level_advantage, response_level_mask) + + advantages = response_level_advantage.unsqueeze(-1) * response_mask return advantages, advantages