From 9dcf37eeea2cea7a53314a60011e28c922cce9d8 Mon Sep 17 00:00:00 2001 From: imitater-dou Date: Tue, 26 May 2026 19:26:40 +0800 Subject: [PATCH 1/2] [reward_manager] fix: guard against empty responses and None overlong_buffer_cfg Two silent bugs in reward manager __call__ methods: 1. DAPORewardManager.__call__ accessed self.overlong_buffer_cfg.enable without checking for None first. Since overlong_buffer_cfg defaults to None, any call without this argument raised AttributeError at runtime. 2. NaiveRewardManager, DAPORewardManager and PrimeRewardManager all computed reward_tensor[i, valid_response_length - 1] without guarding against valid_response_length == 0. PyTorch/Python negative indexing silently wrote the reward to the last position of the tensor row, corrupting training signal for aborted or fully-padded responses. Both issues are confirmed by static analysis and reproduced with a minimal Python script (no GPU required). Related: #5613 fixes the same index-(-1) pattern in core_algos.py for GDPO; this PR addresses the same class of bug in the three reward manager classes. Co-authored-by: Claude Signed-off-by: imitater-dou --- verl/workers/reward_manager/dapo.py | 5 +++-- verl/workers/reward_manager/naive.py | 3 ++- verl/workers/reward_manager/prime.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/verl/workers/reward_manager/dapo.py b/verl/workers/reward_manager/dapo.py index f44cb3d8f9d..cc62de117bb 100644 --- a/verl/workers/reward_manager/dapo.py +++ b/verl/workers/reward_manager/dapo.py @@ -118,7 +118,7 @@ def __call__(self, data: DataProto, return_dict: bool = False): reward = score - if self.overlong_buffer_cfg.enable: + if self.overlong_buffer_cfg is not None and self.overlong_buffer_cfg.enable: overlong_buffer_len = self.overlong_buffer_cfg.len expected_len = self.max_resp_len - overlong_buffer_len exceed_len = valid_response_length - expected_len @@ -129,7 +129,8 @@ def __call__(self, data: DataProto, return_dict: bool = False): reward_extra_info["overlong_reward"].append(overlong_reward) reward_extra_info["overlong"].append(overlong_reward < 0) - reward_tensor[i, valid_response_length - 1] = reward + if valid_response_length > 0: + reward_tensor[i, valid_response_length - 1] = reward if data_source not in already_print_data_sources: already_print_data_sources[data_source] = 0 diff --git a/verl/workers/reward_manager/naive.py b/verl/workers/reward_manager/naive.py index f3ca122c2b6..b4dd7e68136 100644 --- a/verl/workers/reward_manager/naive.py +++ b/verl/workers/reward_manager/naive.py @@ -97,7 +97,8 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | else: reward = score - reward_tensor[i, valid_response_length - 1] = reward + if valid_response_length > 0: + reward_tensor[i, valid_response_length - 1] = reward if data_source not in already_print_data_sources: already_print_data_sources[data_source] = 0 diff --git a/verl/workers/reward_manager/prime.py b/verl/workers/reward_manager/prime.py index b15ed7c3fcb..6c0c234b64d 100644 --- a/verl/workers/reward_manager/prime.py +++ b/verl/workers/reward_manager/prime.py @@ -174,7 +174,8 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | for i in range(len(data)): data_source = data_sources[i] - reward_tensor[i, valid_response_length[i].item() - 1] = scores[i] + if valid_response_length[i].item() > 0: + reward_tensor[i, valid_response_length[i].item() - 1] = scores[i] if data_source not in already_print_data_sources: already_print_data_sources[data_source] = 0 From 31d143a68e6699e9f50e54d559598083bd8cd2e8 Mon Sep 17 00:00:00 2001 From: imitater-dou Date: Tue, 26 May 2026 19:26:40 +0800 Subject: [PATCH 2/2] [reward_manager] fix: use .item() to avoid implicit CPU-GPU sync in reward placement Address Gemini code-review feedback on PR #6484: valid_response_length is a scalar PyTorch tensor; using it directly in a Python conditional and as a tensor index triggers implicit device-to-host synchronization inside the per-sample loop. Convert to Python int with .item() at the two usage sites in dapo.py and naive.py. prime.py already used .item() in the same pattern; now all three managers are consistent. Co-authored-by: Claude Signed-off-by: imitater-dou --- verl/workers/reward_manager/dapo.py | 4 ++-- verl/workers/reward_manager/naive.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/verl/workers/reward_manager/dapo.py b/verl/workers/reward_manager/dapo.py index cc62de117bb..0c43dc1f45d 100644 --- a/verl/workers/reward_manager/dapo.py +++ b/verl/workers/reward_manager/dapo.py @@ -129,8 +129,8 @@ def __call__(self, data: DataProto, return_dict: bool = False): reward_extra_info["overlong_reward"].append(overlong_reward) reward_extra_info["overlong"].append(overlong_reward < 0) - if valid_response_length > 0: - reward_tensor[i, valid_response_length - 1] = reward + if valid_response_length.item() > 0: + reward_tensor[i, valid_response_length.item() - 1] = reward if data_source not in already_print_data_sources: already_print_data_sources[data_source] = 0 diff --git a/verl/workers/reward_manager/naive.py b/verl/workers/reward_manager/naive.py index b4dd7e68136..dea8e4d4f19 100644 --- a/verl/workers/reward_manager/naive.py +++ b/verl/workers/reward_manager/naive.py @@ -97,8 +97,8 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | else: reward = score - if valid_response_length > 0: - reward_tensor[i, valid_response_length - 1] = reward + if valid_response_length.item() > 0: + reward_tensor[i, valid_response_length.item() - 1] = reward if data_source not in already_print_data_sources: already_print_data_sources[data_source] = 0