diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py index 5e6a19a15d1..8cddb32aba5 100644 --- a/verl/trainer/ppo/core_algos.py +++ b/verl/trainer/ppo/core_algos.py @@ -463,7 +463,11 @@ def compute_gdpo_outcome_advantage( else: new_advantage += weights[i] * normalized_score - advantages = verl_F.masked_whiten(new_advantage, response_mask) * response_mask + response_level_advantage = verl_F.masked_mean(new_advantage, response_mask, axis=-1) + response_level_mask = response_mask.sum(dim=-1) > 0 + response_level_advantage = verl_F.masked_whiten(response_level_advantage, response_level_mask) + + advantages = response_level_advantage.unsqueeze(-1) * response_mask return advantages, advantages