From 0f5dc2ba45305cbf439861cf5a74b2b6360c77f8 Mon Sep 17 00:00:00 2001 From: HeShiLie Date: Mon, 25 May 2026 11:28:12 +0800 Subject: [PATCH 1/2] [ROCm] Fix AsyncLLM WorkerProc SIGSEGV by conditionally omitting distributed_executor_backend On ROCm with TP=1, the hardcoded distributed_executor_backend="mp" forces vLLM to use MultiprocExecutor, creating a two-level multiprocessing spawn chain (EngineCore -> MultiprocExecutor -> WorkerProc) inside a Ray actor where CUDA/HIP is already initialized. This causes the inner WorkerProc to SIGSEGV (exitcode=-11) before entering worker_main. By omitting distributed_executor_backend when on ROCm with TP=1, vLLM auto-selects UniProcExecutor which keeps the worker in-process, avoiding the nested spawn. For TP>1, "mp" is still set as MultiprocExecutor is required for inter-process communication. CUDA/NVIDIA and TP>1 behavior are unchanged. Tested on: ROCm 7.0.2 + PyTorch 2.8.0 + vLLM 0.11.0rc2 + verl main Co-Authored-By: Claude Signed-off-by: HeShiLie --- .../rollout/vllm_rollout/vllm_async_server.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 67cf8b39eba..2ed0298fd47 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -251,7 +251,6 @@ async def launch_server(self, master_address: str = None, master_port: int = Non "dtype": self.config.dtype, "load_format": self.config.load_format, "skip_tokenizer_init": False, - "distributed_executor_backend": "mp", "worker_extension_cls": self._get_worker_extension_cls(), "trust_remote_code": self.model_config.trust_remote_code, "max_model_len": self.config.max_model_len, @@ -274,6 +273,18 @@ async def launch_server(self, master_address: str = None, master_port: int = Non **engine_kwargs, } + # On ROCm with TP=1, omit distributed_executor_backend so vLLM defaults + # to UniProcExecutor, avoiding a nested WorkerProc spawn that causes + # SIGSEGV (HIP runtime issue with two-level multiprocessing spawn + # after CUDA/HIP init in a Ray actor). For TP>1, "mp" is still required. + if not self._should_omit_distributed_executor_backend(): + args["distributed_executor_backend"] = "mp" + else: + logger.info( + "ROCm + TP=1 detected: omitting distributed_executor_backend " + "so vLLM uses UniProcExecutor and avoids nested WorkerProc spawn." + ) + # update profiler args profiler_args = build_vllm_profiler_args( self.profiler_controller.config, self.profiler_controller.tool_config, self.replica_rank @@ -886,6 +897,20 @@ def _apply_quantization(self) -> tuple[Optional[str], dict]: return quantization, hf_overrides + def _should_omit_distributed_executor_backend(self) -> bool: + """On ROCm with TP=1, let vLLM default to UniProcExecutor.""" + try: + from vllm.platforms import current_platform + + return current_platform.is_rocm() and int(self.config.tensor_model_parallel_size) == 1 + except Exception as e: + logger.warning( + "Failed to check ROCm executor backend condition: %s. " + "Falling back to distributed_executor_backend='mp'.", + e, + ) + return False + def _get_worker_extension_cls(self) -> str: """Return the fully-qualified colocate worker extension class name.""" return "verl.workers.rollout.vllm_rollout.utils.vLLMColocateWorkerExtension" From 42f2bff2137e01e340bcc90ec2dbcdc4b78d5601 Mon Sep 17 00:00:00 2001 From: HeShiLie Date: Mon, 25 May 2026 15:38:57 +0800 Subject: [PATCH 2/2] Respect user-specified distributed_executor_backend from engine_kwargs If the user already provides distributed_executor_backend via engine_kwargs, do not overwrite it with the default "mp". This preserves the existing precedence where engine_kwargs can override built-in defaults. Addresses review feedback from gemini-code-assist. Co-Authored-By: Claude Signed-off-by: HeShiLie --- .../rollout/vllm_rollout/vllm_async_server.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 2ed0298fd47..04fa84a83ee 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -273,17 +273,19 @@ async def launch_server(self, master_address: str = None, master_port: int = Non **engine_kwargs, } - # On ROCm with TP=1, omit distributed_executor_backend so vLLM defaults - # to UniProcExecutor, avoiding a nested WorkerProc spawn that causes - # SIGSEGV (HIP runtime issue with two-level multiprocessing spawn - # after CUDA/HIP init in a Ray actor). For TP>1, "mp" is still required. - if not self._should_omit_distributed_executor_backend(): - args["distributed_executor_backend"] = "mp" - else: - logger.info( - "ROCm + TP=1 detected: omitting distributed_executor_backend " - "so vLLM uses UniProcExecutor and avoids nested WorkerProc spawn." - ) + # Set distributed_executor_backend default if not already specified + # by the user via engine_kwargs. On ROCm with TP=1, omit it entirely + # so vLLM defaults to UniProcExecutor, avoiding a nested WorkerProc + # spawn that causes SIGSEGV (HIP runtime issue with two-level + # multiprocessing spawn after CUDA/HIP init in a Ray actor). + if "distributed_executor_backend" not in args: + if not self._should_omit_distributed_executor_backend(): + args["distributed_executor_backend"] = "mp" + else: + logger.info( + "ROCm + TP=1 detected: omitting distributed_executor_backend " + "so vLLM uses UniProcExecutor and avoids nested WorkerProc spawn." + ) # update profiler args profiler_args = build_vllm_profiler_args(