From 0f5dc2ba45305cbf439861cf5a74b2b6360c77f8 Mon Sep 17 00:00:00 2001
From: HeShiLie <gzben01@gmail.com>
Date: Mon, 25 May 2026 11:28:12 +0800
Subject: [PATCH 1/2] [ROCm] Fix AsyncLLM WorkerProc SIGSEGV by conditionally
 omitting distributed_executor_backend

On ROCm with TP=1, the hardcoded distributed_executor_backend="mp" forces
vLLM to use MultiprocExecutor, creating a two-level multiprocessing spawn
chain (EngineCore -> MultiprocExecutor -> WorkerProc) inside a Ray actor
where CUDA/HIP is already initialized. This causes the inner WorkerProc
to SIGSEGV (exitcode=-11) before entering worker_main.

When distributed_executor_backend is omitted on ROCm with TP=1, vLLM
auto-selects UniProcExecutor, which keeps the worker in-process and
avoids the nested spawn. For TP>1, "mp" is still set because
MultiprocExecutor is required for inter-process communication.

CUDA/NVIDIA and TP>1 behavior are unchanged.

Tested on: ROCm 7.0.2 + PyTorch 2.8.0 + vLLM 0.11.0rc2 + verl main

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: HeShiLie <gaozhe.gao@alibaba-inc.com>
---
 .../rollout/vllm_rollout/vllm_async_server.py | 27 ++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 67cf8b39eba..2ed0298fd47 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -251,7 +251,6 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
             "dtype": self.config.dtype,
             "load_format": self.config.load_format,
             "skip_tokenizer_init": False,
-            "distributed_executor_backend": "mp",
             "worker_extension_cls": self._get_worker_extension_cls(),
             "trust_remote_code": self.model_config.trust_remote_code,
             "max_model_len": self.config.max_model_len,
@@ -274,6 +273,18 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
             **engine_kwargs,
         }
 
+        # On ROCm with TP=1, omit distributed_executor_backend so vLLM defaults
+        # to UniProcExecutor, avoiding a nested WorkerProc spawn that causes
+        # SIGSEGV (HIP runtime issue with two-level multiprocessing spawn
+        # after CUDA/HIP init in a Ray actor).  For TP>1, "mp" is still required.
+        if not self._should_omit_distributed_executor_backend():
+            args["distributed_executor_backend"] = "mp"
+        else:
+            logger.info(
+                "ROCm + TP=1 detected: omitting distributed_executor_backend "
+                "so vLLM uses UniProcExecutor and avoids nested WorkerProc spawn."
+            )
+
         # update profiler args
         profiler_args = build_vllm_profiler_args(
             self.profiler_controller.config, self.profiler_controller.tool_config, self.replica_rank
@@ -886,6 +897,20 @@ def _apply_quantization(self) -> tuple[Optional[str], dict]:
 
         return quantization, hf_overrides
 
+    def _should_omit_distributed_executor_backend(self) -> bool:
+        """On ROCm with TP=1, let vLLM default to UniProcExecutor."""
+        try:
+            from vllm.platforms import current_platform
+
+            return current_platform.is_rocm() and int(self.config.tensor_model_parallel_size) == 1
+        except Exception as e:
+            logger.warning(
+                "Failed to check ROCm executor backend condition: %s. "
+                "Falling back to distributed_executor_backend='mp'.",
+                e,
+            )
+            return False
+
     def _get_worker_extension_cls(self) -> str:
         """Return the fully-qualified colocate worker extension class name."""
         return "verl.workers.rollout.vllm_rollout.utils.vLLMColocateWorkerExtension"

From 42f2bff2137e01e340bcc90ec2dbcdc4b78d5601 Mon Sep 17 00:00:00 2001
From: HeShiLie <gzben01@gmail.com>
Date: Mon, 25 May 2026 15:38:57 +0800
Subject: [PATCH 2/2] Respect user-specified distributed_executor_backend from
 engine_kwargs

If the user already provides distributed_executor_backend via engine_kwargs,
do not overwrite it with the default "mp". This restores the precedence in
effect before PATCH 1/2, where engine_kwargs can override built-in defaults
(PATCH 1/2 assigned "mp" after engine_kwargs had already been merged).

Addresses review feedback from gemini-code-assist.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: HeShiLie <gaozhe.gao@alibaba-inc.com>
---
 .../rollout/vllm_rollout/vllm_async_server.py | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index 2ed0298fd47..04fa84a83ee 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -273,17 +273,19 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
             **engine_kwargs,
         }
 
-        # On ROCm with TP=1, omit distributed_executor_backend so vLLM defaults
-        # to UniProcExecutor, avoiding a nested WorkerProc spawn that causes
-        # SIGSEGV (HIP runtime issue with two-level multiprocessing spawn
-        # after CUDA/HIP init in a Ray actor).  For TP>1, "mp" is still required.
-        if not self._should_omit_distributed_executor_backend():
-            args["distributed_executor_backend"] = "mp"
-        else:
-            logger.info(
-                "ROCm + TP=1 detected: omitting distributed_executor_backend "
-                "so vLLM uses UniProcExecutor and avoids nested WorkerProc spawn."
-            )
+        # Set distributed_executor_backend default if not already specified
+        # by the user via engine_kwargs.  On ROCm with TP=1, omit it entirely
+        # so vLLM defaults to UniProcExecutor, avoiding a nested WorkerProc
+        # spawn that causes SIGSEGV (HIP runtime issue with two-level
+        # multiprocessing spawn after CUDA/HIP init in a Ray actor).
+        if "distributed_executor_backend" not in args:
+            if not self._should_omit_distributed_executor_backend():
+                args["distributed_executor_backend"] = "mp"
+            else:
+                logger.info(
+                    "ROCm + TP=1 detected: omitting distributed_executor_backend "
+                    "so vLLM uses UniProcExecutor and avoids nested WorkerProc spawn."
+                )
 
         # update profiler args
         profiler_args = build_vllm_profiler_args(