diff --git a/verl/workers/reward_manager/dapo.py b/verl/workers/reward_manager/dapo.py index f44cb3d8f9d..0c43dc1f45d 100644 --- a/verl/workers/reward_manager/dapo.py +++ b/verl/workers/reward_manager/dapo.py @@ -118,7 +118,7 @@ def __call__(self, data: DataProto, return_dict: bool = False): reward = score - if self.overlong_buffer_cfg.enable: + if self.overlong_buffer_cfg is not None and self.overlong_buffer_cfg.enable: overlong_buffer_len = self.overlong_buffer_cfg.len expected_len = self.max_resp_len - overlong_buffer_len exceed_len = valid_response_length - expected_len @@ -129,7 +129,8 @@ def __call__(self, data: DataProto, return_dict: bool = False): reward_extra_info["overlong_reward"].append(overlong_reward) reward_extra_info["overlong"].append(overlong_reward < 0) - reward_tensor[i, valid_response_length - 1] = reward + if valid_response_length.item() > 0: + reward_tensor[i, valid_response_length.item() - 1] = reward if data_source not in already_print_data_sources: already_print_data_sources[data_source] = 0 diff --git a/verl/workers/reward_manager/naive.py b/verl/workers/reward_manager/naive.py index f3ca122c2b6..dea8e4d4f19 100644 --- a/verl/workers/reward_manager/naive.py +++ b/verl/workers/reward_manager/naive.py @@ -97,7 +97,8 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | else: reward = score - reward_tensor[i, valid_response_length - 1] = reward + if valid_response_length.item() > 0: + reward_tensor[i, valid_response_length.item() - 1] = reward if data_source not in already_print_data_sources: already_print_data_sources[data_source] = 0 diff --git a/verl/workers/reward_manager/prime.py b/verl/workers/reward_manager/prime.py index b15ed7c3fcb..6c0c234b64d 100644 --- a/verl/workers/reward_manager/prime.py +++ b/verl/workers/reward_manager/prime.py @@ -174,7 +174,8 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | for i in range(len(data)): data_source = data_sources[i] - reward_tensor[i, valid_response_length[i].item() - 1] = scores[i] + if valid_response_length[i].item() > 0: + reward_tensor[i, valid_response_length[i].item() - 1] = scores[i] if data_source not in already_print_data_sources: already_print_data_sources[data_source] = 0