diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 67cf8b39eba..04fa84a83ee 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -251,7 +251,6 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
             "dtype": self.config.dtype,
             "load_format": self.config.load_format,
             "skip_tokenizer_init": False,
-            "distributed_executor_backend": "mp",
             "worker_extension_cls": self._get_worker_extension_cls(),
             "trust_remote_code": self.model_config.trust_remote_code,
             "max_model_len": self.config.max_model_len,
@@ -274,6 +273,20 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
             **engine_kwargs,
         }
 
+        # Set distributed_executor_backend default if not already specified
+        # by the user via engine_kwargs. On ROCm with TP=1, omit it entirely
+        # so vLLM defaults to UniProcExecutor, avoiding a nested WorkerProc
+        # spawn that causes SIGSEGV (HIP runtime issue with two-level
+        # multiprocessing spawn after CUDA/HIP init in a Ray actor).
+        if "distributed_executor_backend" not in args:
+            if not self._should_omit_distributed_executor_backend():
+                args["distributed_executor_backend"] = "mp"
+            else:
+                logger.info(
+                    "ROCm + TP=1 detected: omitting distributed_executor_backend "
+                    "so vLLM uses UniProcExecutor and avoids nested WorkerProc spawn."
+                )
+
         # update profiler args
         profiler_args = build_vllm_profiler_args(
             self.profiler_controller.config, self.profiler_controller.tool_config, self.replica_rank
@@ -886,6 +899,31 @@ def _apply_quantization(self) -> tuple[Optional[str], dict]:
 
         return quantization, hf_overrides
 
+    def _should_omit_distributed_executor_backend(self) -> bool:
+        """Decide whether ``distributed_executor_backend`` should be left unset.
+
+        On ROCm with TP=1 the key is omitted so that vLLM defaults to
+        UniProcExecutor instead of spawning a nested WorkerProc.
+
+        Returns:
+            True only when ``current_platform.is_rocm()`` is true and
+            ``self.config.tensor_model_parallel_size`` equals 1. False
+            otherwise, including when the check itself raises; that failure
+            is logged as a warning.
+        """
+        try:
+            from vllm.platforms import current_platform
+
+            return current_platform.is_rocm() and int(self.config.tensor_model_parallel_size) == 1
+        except Exception as e:
+            # Best-effort check: any failure keeps the "mp" backend.
+            logger.warning(
+                "Failed to check ROCm executor backend condition: %s. "
+                "Falling back to distributed_executor_backend='mp'.",
+                e,
+            )
+            return False
+
     def _get_worker_extension_cls(self) -> str:
         """Return the fully-qualified colocate worker extension class name."""
         return "verl.workers.rollout.vllm_rollout.utils.vLLMColocateWorkerExtension"