Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion verl/workers/rollout/vllm_rollout/vllm_async_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,6 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
"dtype": self.config.dtype,
"load_format": self.config.load_format,
"skip_tokenizer_init": False,
"distributed_executor_backend": "mp",
"worker_extension_cls": self._get_worker_extension_cls(),
"trust_remote_code": self.model_config.trust_remote_code,
"max_model_len": self.config.max_model_len,
Expand All @@ -274,6 +273,20 @@ async def launch_server(self, master_address: str = None, master_port: int = Non
**engine_kwargs,
}

# Set distributed_executor_backend default if not already specified
# by the user via engine_kwargs. On ROCm with TP=1, omit it entirely
# so vLLM defaults to UniProcExecutor, avoiding a nested WorkerProc
# spawn that causes SIGSEGV (HIP runtime issue with two-level
# multiprocessing spawn after CUDA/HIP init in a Ray actor).
if "distributed_executor_backend" not in args:
if not self._should_omit_distributed_executor_backend():
args["distributed_executor_backend"] = "mp"
else:
logger.info(
"ROCm + TP=1 detected: omitting distributed_executor_backend "
"so vLLM uses UniProcExecutor and avoids nested WorkerProc spawn."
)

# update profiler args
profiler_args = build_vllm_profiler_args(
self.profiler_controller.config, self.profiler_controller.tool_config, self.replica_rank
Expand Down Expand Up @@ -886,6 +899,20 @@ def _apply_quantization(self) -> tuple[Optional[str], dict]:

return quantization, hf_overrides

def _should_omit_distributed_executor_backend(self) -> bool:
    """Decide whether ``distributed_executor_backend`` should be left unset.

    Returns:
        True only when vLLM's ``current_platform`` reports ROCm and
        ``self.config.tensor_model_parallel_size`` converts to the integer 1.
        If anything in the check raises (including the vLLM import), the
        failure is logged as a warning and False is returned, which the
        warning text describes as falling back to the ``'mp'`` backend.
    """
    try:
        # Imported lazily and inside the try so an import failure is
        # handled by the same fallback path as any other error.
        from vllm.platforms import current_platform

        # Guard clause: off ROCm the TP size is never read.
        if not current_platform.is_rocm():
            return False
        return int(self.config.tensor_model_parallel_size) == 1
    except Exception as exc:
        logger.warning(
            "Failed to check ROCm executor backend condition: %s. "
            "Falling back to distributed_executor_backend='mp'.",
            exc,
        )
        return False

def _get_worker_extension_cls(self) -> str:
    """Give the dotted import path of the colocate worker extension class.

    Returns:
        The fully-qualified class name as a string; no instance state is read.
    """
    extension_cls_path = "verl.workers.rollout.vllm_rollout.utils.vLLMColocateWorkerExtension"
    return extension_cls_path
Expand Down