From ff43106e0e929110866e3eeca49bae9939a3fd3d Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 9 Jun 2026 17:58:05 +0200 Subject: [PATCH 01/26] implement multi-node for vllm/sglang --- doc/workloads/sglang.rst | 55 ++++ doc/workloads/vllm.rst | 56 +++++ src/cloudai/workloads/common/llm_serving.py | 235 ++++++++++++++---- .../sglang/slurm_command_gen_strategy.py | 49 ++++ .../vllm/slurm_command_gen_strategy.py | 162 ++++++++++-- tests/ref_data/sglang-disagg-2nodes.sbatch | 32 +-- tests/ref_data/sglang-disagg.sbatch | 30 ++- tests/ref_data/sglang-multinode.sbatch | 75 ++++++ tests/ref_data/vllm-disagg-2nodes.sbatch | 32 +-- tests/ref_data/vllm-disagg.sbatch | 30 ++- tests/ref_data/vllm-multinode.sbatch | 118 +++++++++ tests/test_acceptance.py | 34 ++- tests/workloads/common/test_llm_serving.py | 43 +++- .../sglang/test_command_gen_strategy_slurm.py | 64 ++++- .../vllm/test_command_gen_strategy_slurm.py | 66 ++++- 15 files changed, 950 insertions(+), 131 deletions(-) create mode 100644 tests/ref_data/sglang-multinode.sbatch create mode 100644 tests/ref_data/vllm-multinode.sbatch diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst index cdbd5cff1..0ccd1ba48 100644 --- a/doc/workloads/sglang.rst +++ b/doc/workloads/sglang.rst @@ -135,6 +135,61 @@ For more control, one can specify the GPU IDs explicitly in ``prefill`` and ``de In this case ``CUDA_VISIBLE_DEVICES`` will be ignored and only the GPUs specified in ``gpu_ids`` will be used. +Multi-node serving +------------------ +For non-disaggregated serving, set ``num_nodes`` on the test to more than one. CloudAI starts one +``sglang.launch_server`` task per serving node with a shared ``--dist-init-addr``, ``--nnodes``, and +``--node-rank "$SLURM_NODEID"``. + +.. code-block:: toml + :caption: scenario.toml (multi-node aggregated serving) + + [[Tests]] + id = "sglang.multi_node" + num_nodes = 2 + test_template_name = "sglang" + + [Tests.cmd_args] + docker_image_url = "lmsysorg/sglang:dev-cu13" + model = "Qwen/Qwen3-8B" + + [Tests.cmd_args.decode] + tp = 2 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + +For disaggregated prefill/decode serving, existing 1-node and 2-node behavior is preserved by default. To span more +than two nodes, set both role sizes explicitly. CloudAI assigns contiguous node slices to prefill and decode and starts +one distributed SGLang launch per role with separate init ports. Benchmark and semantic validation run from the prefill +head node. + +.. code-block:: toml + :caption: scenario.toml (multi-node disaggregated serving) + + [[Tests]] + id = "sglang.pd_multi_node" + num_nodes = 4 + test_template_name = "sglang" + + [Tests.cmd_args] + docker_image_url = "lmsysorg/sglang:dev-cu13" + model = "Qwen/Qwen3-8B" + + [Tests.cmd_args.prefill] + num_nodes = 2 + tp = 2 + + [Tests.cmd_args.decode] + num_nodes = 2 + tp = 2 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + +``CUDA_VISIBLE_DEVICES`` and ``gpu_ids`` are interpreted as local GPU IDs on each serving node, not as cluster-global GPU +IDs. + API Documentation ----------------- diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index 57773992f..17ead0b8d 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -133,6 +133,62 @@ For more control, users can specify the GPU IDs explicitly in ``prefill`` and `` In this case ``CUDA_VISIBLE_DEVICES`` will be ignored and only the GPUs specified in ``gpu_ids`` will be used. +Multi-node serving +------------------ +For non-disaggregated serving, set ``num_nodes`` on the test to more than one. CloudAI starts a Ray head on the first +allocated serving node, Ray workers on the remaining serving nodes, waits for the Ray cluster to reach the requested +size, and runs ``vllm serve`` with ``--distributed-executor-backend ray`` on the head node. + +.. code-block:: toml + :caption: scenario.toml (multi-node aggregated serving) + + [[Tests]] + id = "vllm.multi_node" + num_nodes = 2 + test_template_name = "vllm" + + [Tests.cmd_args] + docker_image_url = "nvcr.io/nvidia/vllm:latest" + model = "Qwen/Qwen3-0.6B" + + [Tests.cmd_args.decode] + tensor_parallel_size = 2 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + +For disaggregated prefill/decode serving, existing 1-node and 2-node behavior is preserved by default. To span more +than two nodes, set both role sizes explicitly. CloudAI assigns contiguous node slices to prefill and decode, creates a +separate Ray cluster for each role whose ``num_nodes`` is greater than one, and runs benchmark and semantic validation +from the prefill head node. + +.. code-block:: toml + :caption: scenario.toml (multi-node disaggregated serving) + + [[Tests]] + id = "vllm.pd_multi_node" + num_nodes = 4 + test_template_name = "vllm" + + [Tests.cmd_args] + docker_image_url = "nvcr.io/nvidia/vllm:latest" + model = "Qwen/Qwen3-0.6B" + + [Tests.cmd_args.prefill] + num_nodes = 2 + tensor_parallel_size = 2 + + [Tests.cmd_args.decode] + num_nodes = 2 + tensor_parallel_size = 2 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + +``CUDA_VISIBLE_DEVICES`` and ``gpu_ids`` are interpreted as local GPU IDs on each serving node, not as cluster-global GPU +IDs. + + Controlling ``proxy_script`` ----------------------------- ``proxy_script`` is used to proxy the requests from the client to the prefill and decode instances. It is ignored for non-disaggregated mode. Default value can be found below. diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index 30a6943c1..792cff05c 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -78,7 +78,7 @@ def calculate_prefill_gpu_ids( return parse_gpu_ids(tdef.cmd_args.prefill.gpu_ids) gpu_ids = all_gpu_ids(tdef, system_gpus_per_node) - if num_nodes == 2: + if num_nodes > 1 or tdef.cmd_args.prefill.num_nodes is not None: return gpu_ids mid = len(gpu_ids) // 2 return gpu_ids[:mid] @@ -95,7 +95,7 @@ def calculate_decode_gpu_ids( gpu_ids = all_gpu_ids(tdef, system_gpus_per_node) if not tdef.cmd_args.prefill: return gpu_ids - if num_nodes == 2: + if num_nodes > 1 or tdef.cmd_args.decode.num_nodes is not None: return gpu_ids mid = len(gpu_ids) // 2 return gpu_ids[mid:] @@ -107,11 +107,15 @@ class LLMServingArgs(CmdArgs): gpu_ids: str | list[str] | None = Field( default=None, description="Comma-separated GPU IDs. If not set, all available GPUs will be used." ) + num_nodes: int | list[int] | None = Field( + default=None, + description="Number of Slurm nodes assigned to this role in disaggregated serving mode.", + ) @property def serve_args_exclude(self) -> set[str]: """Fields consumed internally and excluded from generic serve args.""" - return {"gpu_ids"} + return {"gpu_ids", "num_nodes"} def serialize_serve_arg(self, key: str, value: Any) -> list[str]: """Serialize a single serve argument to CLI tokens.""" @@ -369,15 +373,52 @@ def gpu_ids(self) -> list[int]: def is_disaggregated(self) -> bool: return self.tdef.cmd_args.prefill is not None + @staticmethod + def _role_num_nodes(value: int | list[int] | None, role: str) -> int | None: + if isinstance(value, list): + raise ValueError(f"{role}.num_nodes must be a single integer for command generation.") + return value + @property - def is_two_node_disaggregated(self) -> bool: - if not self.is_disaggregated: - return False + def aggregated_node_count(self) -> int: + num_nodes, _ = self.get_cached_nodes_spec() + return num_nodes + + def disaggregated_role_node_counts(self) -> tuple[int, int]: + if not self.is_disaggregated or self.tdef.cmd_args.prefill is None: + return (0, 0) num_nodes, _ = self.get_cached_nodes_spec() - if num_nodes not in (1, 2): - raise ValueError(f"Disaggregated {self.workload_name} supports only 1 or 2 nodes, got {num_nodes}.") - return num_nodes == 2 + prefill_nodes = self._role_num_nodes(self.tdef.cmd_args.prefill.num_nodes, "prefill") + decode_nodes = self._role_num_nodes(self.tdef.cmd_args.decode.num_nodes, "decode") + + if prefill_nodes is None and decode_nodes is None: + if num_nodes in (1, 2): + return (1, 1) + raise ValueError( + f"Disaggregated {self.workload_name} over more than 2 nodes requires both " + "prefill.num_nodes and decode.num_nodes." + ) + if prefill_nodes is None or decode_nodes is None: + raise ValueError("Both prefill.num_nodes and decode.num_nodes must be set or both must be omitted.") + if prefill_nodes <= 0 or decode_nodes <= 0: + raise ValueError("prefill.num_nodes and decode.num_nodes must be positive integers.") + if prefill_nodes + decode_nodes != num_nodes: + raise ValueError( + f"prefill.num_nodes + decode.num_nodes must equal allocated nodes ({num_nodes}), " + f"got {prefill_nodes + decode_nodes}." + ) + return (prefill_nodes, decode_nodes) + + def role_node_count(self, role: str) -> int: + if role == "serve": + return self.aggregated_node_count + prefill_nodes, decode_nodes = self.disaggregated_role_node_counts() + if role == "prefill": + return prefill_nodes + if role == "decode": + return decode_nodes + raise ValueError(f"Unknown serving role: {role}") @property def prefill_gpu_ids(self) -> list[int]: @@ -387,13 +428,22 @@ def prefill_gpu_ids(self) -> list[int]: def decode_gpu_ids(self) -> list[int]: return calculate_decode_gpu_ids(self.tdef, self.test_run.nnodes, self.system.gpus_per_node) - def _disagg_srun_prefix(self, relative: int | None = None) -> str: - srun_command_parts = self.gen_srun_prefix(with_num_nodes=(relative is None)) - srun_command_parts.extend(["--overlap", "--ntasks-per-node=1", "--ntasks=1"]) - if relative is not None: - srun_command_parts.extend([f"--relative={relative}", "-N1"]) + def _role_srun_prefix(self, nodelist_expr: str, node_count: int = 1, task_count: int = 1) -> str: + srun_command_parts = self.gen_srun_prefix(with_num_nodes=False) + srun_command_parts.extend( + [ + "--overlap", + f'--nodelist="{nodelist_expr}"', + f"--nodes={node_count}", + f"--ntasks={task_count}", + "--ntasks-per-node=1", + ] + ) return " ".join(srun_command_parts) + def _single_role_srun_prefix(self, node_var: str) -> str: + return self._role_srun_prefix(f"${{{node_var}}}") + @staticmethod def _with_env(command: list[str], env_vars: dict[str, str]) -> str: if not env_vars: @@ -438,27 +488,49 @@ def bench_host(self) -> str: return "${PREFILL_NODE}" return "${NODE}" - def generate_disaggregated_node_setup(self) -> str: - if not self.is_disaggregated: + def generate_aggregated_node_setup(self, node_count: int) -> str: + if node_count <= 1: return "" - decode_node_check = "" - if self.is_two_node_disaggregated: - decode_node_check = f"""\ -if [ -z "${{NODES[1]}}" ]; then - echo "Expected 2 allocated nodes for disaggregated {self.workload_name}, got: ${{NODES[*]}}" + return f"""\ +NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) +SERVE_NODES=( "${{NODES[@]:0:{node_count}}}" ) +if [ "${{#SERVE_NODES[@]}}" -ne {node_count} ]; then + echo "Expected {node_count} allocated nodes for {self.workload_name}, got: ${{NODES[*]}}" exit 1 fi +export SERVE_NODE=${{SERVE_NODES[0]}} +export NODE=$SERVE_NODE +SERVE_NODELIST=$(IFS=,; echo "${{SERVE_NODES[*]}}") +echo "Node roles: serve=${{SERVE_NODES[*]}}" + """ + + def generate_disaggregated_node_setup(self) -> str: + if not self.is_disaggregated: + return "" + allocated_nodes, _ = self.get_cached_nodes_spec() + prefill_nodes, decode_nodes = self.disaggregated_role_node_counts() + decode_start = 0 if allocated_nodes == 1 and prefill_nodes == 1 and decode_nodes == 1 else prefill_nodes + role_error = ( + f"Expected {prefill_nodes} prefill and {decode_nodes} decode nodes for disaggregated {self.workload_name}" + ) return f"""\ NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) -export PREFILL_NODE=${{NODES[0]}} -export DECODE_NODE=${{NODES[1]:-${{PREFILL_NODE}}}} -if [ -z "$PREFILL_NODE" ]; then +PREFILL_NODES=( "${{NODES[@]:0:{prefill_nodes}}}" ) +DECODE_NODES=( "${{NODES[@]:{decode_start}:{decode_nodes}}}" ) +if [ "${{#PREFILL_NODES[@]}}" -ne {prefill_nodes} ] || [ "${{#DECODE_NODES[@]}}" -ne {decode_nodes} ]; then + echo "{role_error}, got: ${{NODES[*]}}" + exit 1 +fi +export PREFILL_NODE=${{PREFILL_NODES[0]}} +export DECODE_NODE=${{DECODE_NODES[0]}} +PREFILL_NODELIST=$(IFS=,; echo "${{PREFILL_NODES[*]}}") +DECODE_NODELIST=$(IFS=,; echo "${{DECODE_NODES[*]}}") +if [ -z "$PREFILL_NODE" ] || [ -z "$DECODE_NODE" ]; then echo "Failed to resolve allocated nodes for disaggregated {self.workload_name}" exit 1 fi -{decode_node_check}\ -echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" +echo "Node roles: prefill=${{PREFILL_NODES[*]}} decode=${{DECODE_NODES[*]}}" """ @@ -597,6 +669,15 @@ def serve_port(self) -> int: def disaggregated_script_preamble(self) -> str: return "" + def aggregated_script_preamble(self) -> str: + return "" + + def aggregated_cleanup_pid_vars(self) -> list[str]: + return [self.serve_pid_var] + + def disaggregated_cleanup_pid_vars(self) -> list[str]: + return ["PREFILL_PID", "DECODE_PID", self.proxy_router_pid_var] + def aggregated_serve_env(self) -> dict[str, str]: return {} @@ -619,6 +700,23 @@ def get_semantic_eval_command(self) -> list[str] | None: """Return the optional semantic validation command.""" return None + def render_serve_launch( + self, + role: str, + command_tail: str, + pid_var: str, + log_file: str, + node_count: int, + head_node_var: str, + nodelist_var: str, + ) -> str: + del role, node_count, nodelist_var + return f"""\ +{self._single_role_srun_prefix(head_node_var)} \\ + --output={self.test_run.output_path.absolute()}/{log_file} \\ + {self._with_custom_bash(command_tail)} & +{pid_var}=$!""" + def _expand_semantic_eval_args(self, args: str, *, host: str) -> str: replacements = { "{model}": self.tdef.cmd_args.model, @@ -658,39 +756,70 @@ def _gen_llm_serving_srun_command(self, serve_commands: list[list[str]]) -> str: return self._gen_disaggregated_script(serve_commands, bench_cmd) def _gen_aggregated_script(self, serve_cmd: list[str], bench_cmd: str) -> str: - srun_prefix = " ".join(self.gen_srun_prefix()) + serve_node_count = self.role_node_count("serve") + legacy_single_node = serve_node_count == 1 + srun_prefix = ( + " ".join(self.gen_srun_prefix()) if legacy_single_node else self._single_role_srun_prefix("SERVE_NODE") + ) + host_setup = ( + "" if not legacy_single_node else "NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)\n" + ) serve_cmd_with_env = self._with_env(serve_cmd, self.aggregated_serve_env()) health_func = self.generate_wait_for_health_function() wait_block = self.generate_wait_for_health_block( - self.workload_name, [f"http://${{NODE}}:{self.serve_port}{self.tdef.cmd_args.healthcheck}"] + self.workload_name, + [f"http://${{NODE}}:{self.serve_port}{self.tdef.cmd_args.healthcheck}"], + host_setup=host_setup, ) + node_setup = self.generate_aggregated_node_setup(serve_node_count) + preamble = self.aggregated_script_preamble() + if legacy_single_node: + serve_launch = f"""\ +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={(self.test_run.output_path / self.serve_log_file).absolute()} \\ + {self._with_custom_bash(serve_cmd_with_env)} & +{self.serve_pid_var}=$!""" + else: + serve_launch = self.render_serve_launch( + "serve", + serve_cmd_with_env, + self.serve_pid_var, + self.serve_log_file, + serve_node_count, + "SERVE_NODE", + "SERVE_NODELIST", + ) + semantic_prefix = ( + f"{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1" + if legacy_single_node + else self._single_role_srun_prefix("SERVE_NODE") + ) + bench_prefix = semantic_prefix return f"""\ -{self.generate_cleanup_function([self.serve_pid_var])} +{self.generate_cleanup_function(self.aggregated_cleanup_pid_vars())} {health_func} +{preamble}{node_setup}\ echo "Starting {self.workload_name} instances..." -{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={(self.test_run.output_path / self.serve_log_file).absolute()} \\ - {self._with_custom_bash(serve_cmd_with_env)} & -{self.serve_pid_var}=$! +{serve_launch} {wait_block} echo "Running benchmark..." -{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ +{bench_prefix} \\ --output={(self.test_run.output_path / self.bench_log_file).absolute()} \\ {self._with_custom_bash(bench_cmd)} -{self._gen_semantic_eval_block(f"{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1")}""".strip() +{self._gen_semantic_eval_block(semantic_prefix)}""".strip() def _gen_disaggregated_script(self, serve_commands: list[list[str]], bench_cmd: str) -> str: prefill_cmd, decode_cmd = serve_commands health_func = self.generate_wait_for_health_function() prefill_cmd_with_env = self._with_env(prefill_cmd, self.disaggregated_role_env("prefill", self.prefill_gpu_ids)) decode_cmd_with_env = self._with_env(decode_cmd, self.disaggregated_role_env("decode", self.decode_gpu_ids)) - prefill_srun_prefix = self._disagg_srun_prefix(0 if self.is_two_node_disaggregated else None) - decode_srun_prefix = self._disagg_srun_prefix(1 if self.is_two_node_disaggregated else None) + prefill_nodes, decode_nodes = self.disaggregated_role_node_counts() + prefill_srun_prefix = self._single_role_srun_prefix("PREFILL_NODE") helper_cmd = self.get_helper_command() node_setup = self.generate_disaggregated_node_setup() wait_block = self.generate_wait_for_health_block( @@ -709,23 +838,35 @@ def _gen_disaggregated_script(self, serve_commands: list[list[str]], bench_cmd: host_display="$PREFILL_NODE server", ) preamble = self.disaggregated_script_preamble() + prefill_launch = self.render_serve_launch( + "prefill", + prefill_cmd_with_env, + "PREFILL_PID", + self.prefill_log_file, + prefill_nodes, + "PREFILL_NODE", + "PREFILL_NODELIST", + ) + decode_launch = self.render_serve_launch( + "decode", + decode_cmd_with_env, + "DECODE_PID", + self.decode_log_file, + decode_nodes, + "DECODE_NODE", + "DECODE_NODELIST", + ) return f"""\ -{self.generate_cleanup_function(["PREFILL_PID", "DECODE_PID", self.proxy_router_pid_var])} +{self.generate_cleanup_function(self.disaggregated_cleanup_pid_vars())} {health_func} {preamble}{node_setup}\ echo "Starting {self.workload_name} instances..." -{prefill_srun_prefix} \\ - --output={self.test_run.output_path.absolute()}/{self.prefill_log_file} \\ - {self._with_custom_bash(prefill_cmd_with_env)} & -PREFILL_PID=$! - -{decode_srun_prefix} \\ - --output={self.test_run.output_path.absolute()}/{self.decode_log_file} \\ - {self._with_custom_bash(decode_cmd_with_env)} & -DECODE_PID=$! +{prefill_launch} + +{decode_launch} {wait_block} diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py index 7a7a97d5b..6381e8c1b 100644 --- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py @@ -135,3 +135,52 @@ def get_semantic_eval_command(self) -> list[str] | None: def aggregated_serve_env(self) -> dict[str, str]: return {"CUDA_VISIBLE_DEVICES": ",".join(str(gpu_id) for gpu_id in self.gpu_ids)} + + def _needs_distributed_launch(self, role: str) -> bool: + return self.role_node_count(role) > 1 + + def aggregated_script_preamble(self) -> str: + if not self._needs_distributed_launch("serve"): + return "" + return """\ +export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) +export SERVE_DIST_INIT_PORT=$((20000 + PORT_OFFSET)) + +""" + + def disaggregated_script_preamble(self) -> str: + if not (self._needs_distributed_launch("prefill") or self._needs_distributed_launch("decode")): + return "" + return """\ +export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) +export PREFILL_DIST_INIT_PORT=$((20000 + PORT_OFFSET)) +export DECODE_DIST_INIT_PORT=$((21000 + PORT_OFFSET)) + +""" + + def render_serve_launch( + self, + role: str, + command_tail: str, + pid_var: str, + log_file: str, + node_count: int, + head_node_var: str, + nodelist_var: str, + ) -> str: + if node_count <= 1: + return super().render_serve_launch( + role, command_tail, pid_var, log_file, node_count, head_node_var, nodelist_var + ) + + role_prefix = role.upper() + dist_port_var = f"{role_prefix}_DIST_INIT_PORT" + dist_command = ( + f'{command_tail} --dist-init-addr "${{{head_node_var}}}:${{{dist_port_var}}}" ' + f'--nnodes {node_count} --node-rank "$SLURM_NODEID"' + ) + return f"""\ +{self._role_srun_prefix(f"${{{nodelist_var}}}", node_count, node_count)} \\ + --output={self.test_run.output_path.absolute()}/{log_file}-%N \\ + {self._with_custom_bash(dist_command)} & +{pid_var}=$!""" diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 2f00e95f7..4251079e6 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -42,42 +42,176 @@ def workload_name(self) -> str: def _to_json_str_arg(config: dict) -> str: return "'" + json.dumps(config, separators=(",", ":")) + "'" + @staticmethod + def _with_ray_backend(command: list[str], enabled: bool) -> list[str]: + if not enabled or "--distributed-executor-backend" in command: + return command + return [*command, "--distributed-executor-backend", "ray"] + + def _needs_ray(self, role: str) -> bool: + return self.role_node_count(role) > 1 + def get_serve_commands(self) -> list[list[str]]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args base_cmd = ["vllm", "serve", cmd_args.model, "--host", self.bind_host] if not tdef.cmd_args.prefill: - return [[*base_cmd, *tdef.cmd_args.decode.serve_args, "--port", str(self.serve_port)]] + return [ + self._with_ray_backend( + [*base_cmd, *tdef.cmd_args.decode.serve_args, "--port", str(self.serve_port)], + self._needs_ray("serve"), + ) + ] commands: list[list[str]] = [] - for port, role, args in [ - (self.prefill_port, "kv_producer", tdef.cmd_args.prefill), - (self.decode_port, "kv_consumer", tdef.cmd_args.decode), + for port, role, kv_role, args in [ + (self.prefill_port, "prefill", "kv_producer", tdef.cmd_args.prefill), + (self.decode_port, "decode", "kv_consumer", tdef.cmd_args.decode), ]: - kv_transfer_config: dict[str, Any] = {"kv_connector": "NixlConnector", "kv_role": role} + kv_transfer_config: dict[str, Any] = {"kv_connector": "NixlConnector", "kv_role": kv_role} if args.nixl_threads is not None: kv_transfer_config["kv_connector_extra_config"] = {"num_threads": cast(int, args.nixl_threads)} commands.append( - [ - *base_cmd, - "--port", - str(port), - "--kv-transfer-config", - self._to_json_str_arg(kv_transfer_config), - *args.serve_args, - ] + self._with_ray_backend( + [ + *base_cmd, + "--port", + str(port), + "--kv-transfer-config", + self._to_json_str_arg(kv_transfer_config), + *args.serve_args, + ], + self._needs_ray(role), + ) ) return commands + def _ray_wait_function(self) -> str: + srun_prefix = " ".join(self.gen_srun_prefix(with_num_nodes=False)) + ray_node_count_check = ( + "import ray, sys; " + 'ray.init(address=f"{sys.argv[1]}:{sys.argv[2]}"); ' + "sys.exit(0 if len(ray.nodes()) >= int(sys.argv[3]) else 1)" + ) + return f"""\ +wait_for_ray_cluster() {{ + local head_node="$1" + local ray_port="$2" + local expected_nodes="$3" + local timeout={self.tdef.cmd_args.serve_wait_seconds} + local interval=5 + local end_time=$(($(date +%s) + timeout)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + if {srun_prefix} --overlap --nodelist="$head_node" --nodes=1 --ntasks=1 --ntasks-per-node=1 \\ + python3 -c '{ray_node_count_check}' \\ + "$head_node" "$ray_port" "$expected_nodes"; then + echo "Ray cluster is ready on $head_node:$ray_port with $expected_nodes nodes" + return 0 + fi + sleep "$interval" + done + + echo "Timeout waiting for Ray cluster on $head_node:$ray_port" + return 1 +}} + +""" + + def aggregated_script_preamble(self) -> str: + if not self._needs_ray("serve"): + return "" + return f"""\ +export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) +export SERVE_RAY_PORT=$((6379 + PORT_OFFSET)) + +{self._ray_wait_function()}""" + def disaggregated_script_preamble(self) -> str: + ray_preamble = "" + if self._needs_ray("prefill") or self._needs_ray("decode"): + ray_preamble = f"""\ +export PREFILL_RAY_PORT=$((6379 + PORT_OFFSET)) +export DECODE_RAY_PORT=$((7379 + PORT_OFFSET)) + +{self._ray_wait_function()}""" return f"""\ export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) export PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) export DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + {len(self.gpu_ids)})) -""" +{ray_preamble}""" + + def aggregated_cleanup_pid_vars(self) -> list[str]: + if not self._needs_ray("serve"): + return super().aggregated_cleanup_pid_vars() + return ["SERVE_RAY_PID", self.serve_pid_var] + + def disaggregated_cleanup_pid_vars(self) -> list[str]: + pid_vars = super().disaggregated_cleanup_pid_vars() + if self._needs_ray("prefill"): + pid_vars.insert(0, "PREFILL_RAY_PID") + if self._needs_ray("decode"): + insert_at = 1 if self._needs_ray("prefill") else 0 + pid_vars.insert(insert_at, "DECODE_RAY_PID") + return pid_vars + + def render_serve_launch( + self, + role: str, + command_tail: str, + pid_var: str, + log_file: str, + node_count: int, + head_node_var: str, + nodelist_var: str, + ) -> str: + if node_count <= 1: + return super().render_serve_launch( + role, command_tail, pid_var, log_file, node_count, head_node_var, nodelist_var + ) + + role_prefix = role.upper() + ray_pid_var = f"{role_prefix}_RAY_PID" + ray_port_var = f"{role_prefix}_RAY_PORT" + node_array_var = f"{role_prefix}_NODES" + ray_head_log = f"{self.workload_slug}-{role}-ray-head.log" + ray_worker_log = f"{self.workload_slug}-{role}-ray-worker-%N.log" + serve_log = f"{self.test_run.output_path.absolute()}/{log_file}" + head_node_expr = f"${{{head_node_var}}}" + worker_prefix = self._role_srun_prefix("$node") + head_prefix = self._single_role_srun_prefix(head_node_var) + serve_cmd = self._with_custom_bash(f'env RAY_ADDRESS="{head_node_expr}:${{{ray_port_var}}}" {command_tail}') + ray_head_command = ( + 'bash -c "ray stop --force >/dev/null 2>&1 || true; ' + f'exec ray start --head --port=${{{ray_port_var}}} --block"' + ) + ray_worker_command = ( + 'bash -c "ray stop --force >/dev/null 2>&1 || true; ' + f'exec ray start --address={head_node_expr}:${{{ray_port_var}}} --block"' + ) + + return f"""\ +( + trap 'kill -TERM $(jobs -pr) 2>/dev/null' TERM EXIT + {head_prefix} \\ + --output={self.test_run.output_path.absolute()}/{ray_head_log} \\ + {ray_head_command} & + for node in "${{{node_array_var}[@]:1}}"; do + {worker_prefix} \\ + --output={self.test_run.output_path.absolute()}/{ray_worker_log} \\ + {ray_worker_command} & + done + wait +) & +{ray_pid_var}=$! +wait_for_ray_cluster "{head_node_expr}" "${{{ray_port_var}}}" "{node_count}" || exit 1 +{head_prefix} \\ + --output={serve_log} \\ + {serve_cmd} & +{pid_var}=$!""" def disaggregated_role_env(self, role: str, gpu_ids: list[int]) -> dict[str, str]: env = super().disaggregated_role_env(role, gpu_ids) diff --git a/tests/ref_data/sglang-disagg-2nodes.sbatch b/tests/ref_data/sglang-disagg-2nodes.sbatch index d7732fc68..c012d31d2 100644 --- a/tests/ref_data/sglang-disagg-2nodes.sbatch +++ b/tests/ref_data/sglang-disagg-2nodes.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=__JOB_NAME__ +#SBATCH --job-name=job_name #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID HELPER_PID=$HELPER_PID" @@ -52,25 +52,29 @@ wait_for_health() { } NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) -export PREFILL_NODE=${NODES[0]} -export DECODE_NODE=${NODES[1]:-${PREFILL_NODE}} -if [ -z "$PREFILL_NODE" ]; then - echo "Failed to resolve allocated nodes for disaggregated SGLang" +PREFILL_NODES=( "${NODES[@]:0:1}" ) +DECODE_NODES=( "${NODES[@]:1:1}" ) +if [ "${#PREFILL_NODES[@]}" -ne 1 ] || [ "${#DECODE_NODES[@]}" -ne 1 ]; then + echo "Expected 1 prefill and 1 decode nodes for disaggregated SGLang, got: ${NODES[*]}" exit 1 fi -if [ -z "${NODES[1]}" ]; then - echo "Expected 2 allocated nodes for disaggregated SGLang, got: ${NODES[*]}" +export PREFILL_NODE=${PREFILL_NODES[0]} +export DECODE_NODE=${DECODE_NODES[0]} +PREFILL_NODELIST=$(IFS=,; echo "${PREFILL_NODES[*]}") +DECODE_NODELIST=$(IFS=,; echo "${DECODE_NODES[*]}") +if [ -z "$PREFILL_NODE" ] || [ -z "$DECODE_NODE" ]; then + echo "Failed to resolve allocated nodes for disaggregated SGLang" exit 1 fi -echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" +echo "Node roles: prefill=${PREFILL_NODES[*]} decode=${DECODE_NODES[*]}" echo "Starting SGLang instances..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-prefill.log \ env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl & PREFILL_PID=$! -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-decode.log \ env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl & DECODE_PID=$! @@ -80,7 +84,7 @@ wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-router.log \ python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 & HELPER_PID=$! @@ -89,7 +93,7 @@ echo "Waiting for SGLang on $PREFILL_NODE server to be ready..." wait_for_health "http://${PREFILL_NODE}:8300/v1/models" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated diff --git a/tests/ref_data/sglang-disagg.sbatch b/tests/ref_data/sglang-disagg.sbatch index 048ebe430..dd94deb2b 100644 --- a/tests/ref_data/sglang-disagg.sbatch +++ b/tests/ref_data/sglang-disagg.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=__JOB_NAME__ +#SBATCH --job-name=job_name #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID HELPER_PID=$HELPER_PID" @@ -52,21 +52,29 @@ wait_for_health() { } NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) -export PREFILL_NODE=${NODES[0]} -export DECODE_NODE=${NODES[1]:-${PREFILL_NODE}} -if [ -z "$PREFILL_NODE" ]; then +PREFILL_NODES=( "${NODES[@]:0:1}" ) +DECODE_NODES=( "${NODES[@]:0:1}" ) +if [ "${#PREFILL_NODES[@]}" -ne 1 ] || [ "${#DECODE_NODES[@]}" -ne 1 ]; then + echo "Expected 1 prefill and 1 decode nodes for disaggregated SGLang, got: ${NODES[*]}" + exit 1 +fi +export PREFILL_NODE=${PREFILL_NODES[0]} +export DECODE_NODE=${DECODE_NODES[0]} +PREFILL_NODELIST=$(IFS=,; echo "${PREFILL_NODES[*]}") +DECODE_NODELIST=$(IFS=,; echo "${DECODE_NODES[*]}") +if [ -z "$PREFILL_NODE" ] || [ -z "$DECODE_NODE" ]; then echo "Failed to resolve allocated nodes for disaggregated SGLang" exit 1 fi -echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" +echo "Node roles: prefill=${PREFILL_NODES[*]} decode=${DECODE_NODES[*]}" echo "Starting SGLang instances..." -srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-prefill.log \ env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl & PREFILL_PID=$! -srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-decode.log \ env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl & DECODE_PID=$! @@ -76,7 +84,7 @@ wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." -srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-router.log \ python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 & HELPER_PID=$! @@ -85,7 +93,7 @@ echo "Waiting for SGLang on $PREFILL_NODE server to be ready..." wait_for_health "http://${PREFILL_NODE}:8300/v1/models" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated diff --git a/tests/ref_data/sglang-multinode.sbatch b/tests/ref_data/sglang-multinode.sbatch new file mode 100644 index 000000000..504ba7956 --- /dev/null +++ b/tests/ref_data/sglang-multinode.sbatch @@ -0,0 +1,75 @@ +#!/bin/bash +# generated by CloudAI@__CLOUDAI_VERSION__ +#SBATCH --job-name=job_name +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt +#SBATCH --partition=main +#SBATCH -N 2 +#SBATCH --gpus-per-node=8 +#SBATCH --gres=gpu:8 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +export CUDA_VISIBLE_DEVICES=0,1,2,3 +srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." + +srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh + +cleanup() { + echo "Cleaning up PIDs: SERVE_PID=$SERVE_PID" + kill -TERM "$SERVE_PID" 2>/dev/null + i=0 + while kill -0 "$SERVE_PID" 2>/dev/null; do + [ "$i" -ge 15 ] && echo "PID did not exit in time" && return 1 + sleep 1 + i=$((i+1)) + done +} +trap cleanup EXIT + +wait_for_health() { + local endpoint="$1" + local timeout=300 + local interval=5 + local end_time=$(($(date +%s) + timeout)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + sleep "$interval" + done + + echo "Timeout waiting for: $endpoint" + return 1 +} + +export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) +export SERVE_DIST_INIT_PORT=$((20000 + PORT_OFFSET)) + +NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) +SERVE_NODES=( "${NODES[@]:0:2}" ) +if [ "${#SERVE_NODES[@]}" -ne 2 ]; then + echo "Expected 2 allocated nodes for SGLang, got: ${NODES[*]}" + exit 1 +fi +export SERVE_NODE=${SERVE_NODES[0]} +export NODE=$SERVE_NODE +SERVE_NODELIST=$(IFS=,; echo "${SERVE_NODES[*]}") +echo "Node roles: serve=${SERVE_NODES[*]}" + +echo "Starting SGLang instances..." +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1 \ + --output=__OUTPUT_DIR__/output/sglang-serve.log-%N \ + env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 --tp 2 --dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_NODEID" & +SERVE_PID=$! + +echo "Waiting for SGLang on $NODE to be ready..." +wait_for_health "http://${NODE}:8300/v1/models" || exit 1 + +echo "Running benchmark..." +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ + --output=__OUTPUT_DIR__/output/sglang-bench.log \ + python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details + +cleanup diff --git a/tests/ref_data/vllm-disagg-2nodes.sbatch b/tests/ref_data/vllm-disagg-2nodes.sbatch index 82b2fde40..ddc6f8948 100644 --- a/tests/ref_data/vllm-disagg-2nodes.sbatch +++ b/tests/ref_data/vllm-disagg-2nodes.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=__JOB_NAME__ +#SBATCH --job-name=job_name #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID HELPER_PID=$HELPER_PID" @@ -56,25 +56,29 @@ export PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) export DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 4)) NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) -export PREFILL_NODE=${NODES[0]} -export DECODE_NODE=${NODES[1]:-${PREFILL_NODE}} -if [ -z "$PREFILL_NODE" ]; then - echo "Failed to resolve allocated nodes for disaggregated vLLM" +PREFILL_NODES=( "${NODES[@]:0:1}" ) +DECODE_NODES=( "${NODES[@]:1:1}" ) +if [ "${#PREFILL_NODES[@]}" -ne 1 ] || [ "${#DECODE_NODES[@]}" -ne 1 ]; then + echo "Expected 1 prefill and 1 decode nodes for disaggregated vLLM, got: ${NODES[*]}" exit 1 fi -if [ -z "${NODES[1]}" ]; then - echo "Expected 2 allocated nodes for disaggregated vLLM, got: ${NODES[*]}" +export PREFILL_NODE=${PREFILL_NODES[0]} +export DECODE_NODE=${DECODE_NODES[0]} +PREFILL_NODELIST=$(IFS=,; echo "${PREFILL_NODES[*]}") +DECODE_NODELIST=$(IFS=,; echo "${DECODE_NODES[*]}") +if [ -z "$PREFILL_NODE" ] || [ -z "$DECODE_NODE" ]; then + echo "Failed to resolve allocated nodes for disaggregated vLLM" exit 1 fi -echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" +echo "Node roles: prefill=${PREFILL_NODES[*]} decode=${DECODE_NODES[*]}" echo "Starting vLLM instances..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! @@ -84,7 +88,7 @@ wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-router.log \ python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 & HELPER_PID=$! @@ -93,7 +97,7 @@ echo "Waiting for vLLM on $PREFILL_NODE server to be ready..." wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index e58bda39e..37e7eca6c 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=__JOB_NAME__ +#SBATCH --job-name=job_name #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID HELPER_PID=$HELPER_PID" @@ -56,21 +56,29 @@ export PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) export DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 4)) NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) -export PREFILL_NODE=${NODES[0]} -export DECODE_NODE=${NODES[1]:-${PREFILL_NODE}} -if [ -z "$PREFILL_NODE" ]; then +PREFILL_NODES=( "${NODES[@]:0:1}" ) +DECODE_NODES=( "${NODES[@]:0:1}" ) +if [ "${#PREFILL_NODES[@]}" -ne 1 ] || [ "${#DECODE_NODES[@]}" -ne 1 ]; then + echo "Expected 1 prefill and 1 decode nodes for disaggregated vLLM, got: ${NODES[*]}" + exit 1 +fi +export PREFILL_NODE=${PREFILL_NODES[0]} +export DECODE_NODE=${DECODE_NODES[0]} +PREFILL_NODELIST=$(IFS=,; echo "${PREFILL_NODES[*]}") +DECODE_NODELIST=$(IFS=,; echo "${DECODE_NODES[*]}") +if [ -z "$PREFILL_NODE" ] || [ -z "$DECODE_NODE" ]; then echo "Failed to resolve allocated nodes for disaggregated vLLM" exit 1 fi -echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" +echo "Node roles: prefill=${PREFILL_NODES[*]} decode=${DECODE_NODES[*]}" echo "Starting vLLM instances..." -srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! -srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! @@ -80,7 +88,7 @@ wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." -srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-router.log \ python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 & HELPER_PID=$! @@ -89,7 +97,7 @@ echo "Waiting for vLLM on $PREFILL_NODE server to be ready..." wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm-multinode.sbatch b/tests/ref_data/vllm-multinode.sbatch new file mode 100644 index 000000000..fa6294245 --- /dev/null +++ b/tests/ref_data/vllm-multinode.sbatch @@ -0,0 +1,118 @@ +#!/bin/bash +# generated by CloudAI@__CLOUDAI_VERSION__ +#SBATCH --job-name=job_name +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt +#SBATCH --partition=main +#SBATCH -N 2 +#SBATCH --gpus-per-node=8 +#SBATCH --gres=gpu:8 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +export CUDA_VISIBLE_DEVICES=0,1,2,3 +srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." + +srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh + +cleanup() { + echo "Cleaning up PIDs: SERVE_RAY_PID=$SERVE_RAY_PID SERVE_PID=$SERVE_PID" + + for pid in "$SERVE_RAY_PID" "$SERVE_PID"; do + [ -n "$pid" ] && kill -TERM "$pid" 2>/dev/null + done + + for pid in "$SERVE_RAY_PID" "$SERVE_PID"; do + [ -z "$pid" ] && continue + i=0 + while kill -0 "$pid" 2>/dev/null; do + [ "$i" -ge 15 ] && echo "PID $pid did not exit in time" && return 1 + sleep 1 + i=$((i+1)) + done + done +} +trap cleanup EXIT + +wait_for_health() { + local endpoint="$1" + local timeout=300 + local interval=5 + local end_time=$(($(date +%s) + timeout)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + sleep "$interval" + done + + echo "Timeout waiting for: $endpoint" + return 1 +} + +export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) +export SERVE_RAY_PORT=$((6379 + PORT_OFFSET)) + +wait_for_ray_cluster() { + local head_node="$1" + local ray_port="$2" + local expected_nodes="$3" + local timeout=300 + local interval=5 + local end_time=$(($(date +%s) + timeout)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + if srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="$head_node" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ + python3 -c 'import ray, sys; ray.init(address=f"{sys.argv[1]}:{sys.argv[2]}"); sys.exit(0 if len(ray.nodes()) >= int(sys.argv[3]) else 1)' \ + "$head_node" "$ray_port" "$expected_nodes"; then + echo "Ray cluster is ready on $head_node:$ray_port with $expected_nodes nodes" + return 0 + fi + sleep "$interval" + done + + echo "Timeout waiting for Ray cluster on $head_node:$ray_port" + return 1 +} + +NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) +SERVE_NODES=( "${NODES[@]:0:2}" ) +if [ "${#SERVE_NODES[@]}" -ne 2 ]; then + echo "Expected 2 allocated nodes for vLLM, got: ${NODES[*]}" + exit 1 +fi +export SERVE_NODE=${SERVE_NODES[0]} +export NODE=$SERVE_NODE +SERVE_NODELIST=$(IFS=,; echo "${SERVE_NODES[*]}") +echo "Node roles: serve=${SERVE_NODES[*]}" + +echo "Starting vLLM instances..." +( + trap 'kill -TERM $(jobs -pr) 2>/dev/null' TERM EXIT + srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ + --output=__OUTPUT_DIR__/output/vllm-serve-ray-head.log \ + bash -c "ray stop --force >/dev/null 2>&1 || true; exec ray start --head --port=${SERVE_RAY_PORT} --block" & + for node in "${SERVE_NODES[@]:1}"; do + srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="$node" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ + --output=__OUTPUT_DIR__/output/vllm-serve-ray-worker-%N.log \ + bash -c "ray stop --force >/dev/null 2>&1 || true; exec ray start --address=${SERVE_NODE}:${SERVE_RAY_PORT} --block" & + done + wait +) & +SERVE_RAY_PID=$! +wait_for_ray_cluster "${SERVE_NODE}" "${SERVE_RAY_PORT}" "2" || exit 1 +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ + --output=__OUTPUT_DIR__/output/vllm-serve.log \ + env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --tensor-parallel-size 2 --port 8300 --distributed-executor-backend ray & +SERVE_PID=$! + +echo "Waiting for vLLM on $NODE to be ready..." +wait_for_health "http://${NODE}:8300/healthcheck" || exit 1 + +echo "Running benchmark..." +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ + --output=__OUTPUT_DIR__/output/vllm-bench.log \ + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result + +cleanup diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 151c6fb9e..dc3a399b5 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -276,9 +276,11 @@ def build_special_test_run( "deepep-benchmark", "osu-bench", "sglang", + "sglang-multinode", "sglang-disagg", "sglang-disagg-2nodes", "vllm", + "vllm-multinode", "vllm-disagg", "vllm-disagg-2nodes", ] @@ -602,6 +604,21 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - extra_env_vars={"CUDA_VISIBLE_DEVICES": "0"}, ), ), + "vllm-multinode": lambda: create_test_run( + partial_tr, + "vllm-multinode", + VllmTestDefinition( + name="vllm-multinode", + description="vLLM distributed benchmark on 2 nodes", + test_template_name="Vllm", + cmd_args=VllmCmdArgs( + docker_image_url="nvcr.io/nvidia/vllm:latest", + model="Qwen/Qwen3-0.6B", + decode=VllmArgs.model_validate({"tensor_parallel_size": 2}), + ), + extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, + ), + ), "sglang": lambda: create_test_run( partial_tr, "sglang", @@ -616,6 +633,21 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - extra_env_vars={"CUDA_VISIBLE_DEVICES": "0"}, ), ), + "sglang-multinode": lambda: create_test_run( + partial_tr, + "sglang-multinode", + SglangTestDefinition( + name="sglang-multinode", + description="SGLang distributed benchmark on 2 nodes", + test_template_name="sglang", + cmd_args=SglangCmdArgs( + docker_image_url="docker.io/lmsysorg/sglang:dev", + model="Qwen/Qwen3-8B", + decode=SglangArgs.model_validate({"tp": 2}), + ), + extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, + ), + ), "sglang-disagg": lambda: create_test_run( partial_tr, "sglang-disagg", @@ -700,7 +732,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - tr.num_nodes = 2 if request.param == "deepep-benchmark": tr.num_nodes = 2 - if request.param in {"sglang-disagg-2nodes", "vllm-disagg-2nodes"}: + if request.param in {"sglang-multinode", "sglang-disagg-2nodes", "vllm-multinode", "vllm-disagg-2nodes"}: tr.num_nodes = 2 return tr, f"{request.param}.sbatch", None diff --git a/tests/workloads/common/test_llm_serving.py b/tests/workloads/common/test_llm_serving.py index e2a7e219d..f547a0e38 100644 --- a/tests/workloads/common/test_llm_serving.py +++ b/tests/workloads/common/test_llm_serving.py @@ -274,8 +274,12 @@ def test_two_node_disagg_uses_shared_gpu_ids_and_role_hosts(self, slurm_system: assert strategy.bench_log_file == "fake-llm-bench.log" assert strategy.serve_log_file == "fake-llm-serve.log" assert strategy.get_helper_command() == ["helper", "${PREFILL_NODE}", "${DECODE_NODE}"] - assert "DECODE_NODE=${NODES[1]:-${PREFILL_NODE}}" in strategy.generate_disaggregated_node_setup() - assert "Expected 2 allocated nodes for disaggregated Fake LLM" in strategy.generate_disaggregated_node_setup() + assert strategy.disaggregated_role_node_counts() == (1, 1) + node_setup = strategy.generate_disaggregated_node_setup() + assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in node_setup + assert 'DECODE_NODES=( "${NODES[@]:1:1}" )' in node_setup + assert "PREFILL_NODE=${PREFILL_NODES[0]}" in node_setup + assert "DECODE_NODE=${DECODE_NODES[0]}" in node_setup def test_single_node_disagg_wait_block_uses_role_hosts(self, slurm_system: SlurmSystem, tmp_path) -> None: tdef = make_tdef(create_prefill=True) @@ -298,16 +302,43 @@ def test_single_node_disagg_wait_block_uses_role_hosts(self, slurm_system: Slurm wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1""" ) - assert "DECODE_NODE=${NODES[1]:-${PREFILL_NODE}}" in strategy.generate_disaggregated_node_setup() + node_setup = strategy.generate_disaggregated_node_setup() + assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in node_setup + assert 'DECODE_NODES=( "${NODES[@]:0:1}" )' in node_setup - def test_more_than_two_disagg_nodes_are_rejected(self, slurm_system: SlurmSystem, tmp_path) -> None: + def test_disagg_more_than_two_nodes_requires_role_sizes(self, slurm_system: SlurmSystem, tmp_path) -> None: tdef = make_tdef(create_prefill=True) tdef.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} tr = TestRun(name="llm", test=tdef, num_nodes=3, nodes=[], output_path=tmp_path) strategy = FakeLLMSlurmStrategy(slurm_system, tr) - with pytest.raises(ValueError, match="supports only 1 or 2 nodes"): - _ = strategy.is_two_node_disaggregated + with pytest.raises(ValueError, match=r"requires both prefill\.num_nodes and decode\.num_nodes"): + strategy.disaggregated_role_node_counts() + + def test_disagg_explicit_role_sizes_plan_contiguous_node_slices(self, slurm_system: SlurmSystem, tmp_path) -> None: + tdef = make_tdef(create_prefill=True) + assert tdef.cmd_args.prefill is not None + tdef.cmd_args.prefill.num_nodes = 2 + tdef.cmd_args.decode.num_nodes = 2 + tdef.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} + tr = TestRun(name="llm", test=tdef, num_nodes=4, nodes=[], output_path=tmp_path) + strategy = FakeLLMSlurmStrategy(slurm_system, tr) + + assert strategy.disaggregated_role_node_counts() == (2, 2) + node_setup = strategy.generate_disaggregated_node_setup() + assert 'PREFILL_NODES=( "${NODES[@]:0:2}" )' in node_setup + assert 'DECODE_NODES=( "${NODES[@]:2:2}" )' in node_setup + + def test_disagg_role_sizes_must_match_allocation(self, slurm_system: SlurmSystem, tmp_path) -> None: + tdef = make_tdef(create_prefill=True) + assert tdef.cmd_args.prefill is not None + tdef.cmd_args.prefill.num_nodes = 2 + tdef.cmd_args.decode.num_nodes = 2 + tr = TestRun(name="llm", test=tdef, num_nodes=3, nodes=[], output_path=tmp_path) + strategy = FakeLLMSlurmStrategy(slurm_system, tr) + + with pytest.raises(ValueError, match=r"must equal allocated nodes \(3\)"): + strategy.disaggregated_role_node_counts() def test_generate_report_uses_shared_table_builder( diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py index c07d1771d..2e73dbe51 100644 --- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py +++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py @@ -209,8 +209,10 @@ def test_gen_srun_command_contains_expected_flow(sglang_disagg_tr: TestRun, slur assert "Starting SGLang instances" in srun_command assert "Starting router" in srun_command - assert "PREFILL_NODE=${NODES[0]}" in srun_command - assert "DECODE_NODE=${NODES[1]:-${PREFILL_NODE}}" in srun_command + assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in srun_command + assert 'DECODE_NODES=( "${NODES[@]:0:1}" )' in srun_command + assert "PREFILL_NODE=${PREFILL_NODES[0]}" in srun_command + assert "DECODE_NODE=${DECODE_NODES[0]}" in srun_command assert 'env CUDA_VISIBLE_DEVICES="0,1"' in srun_command assert 'env CUDA_VISIBLE_DEVICES="2,3"' in srun_command assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' in srun_command @@ -228,10 +230,12 @@ def test_gen_srun_command_contains_expected_two_node_flow( srun_command = strategy._gen_srun_command() - assert "PREFILL_NODE=${NODES[0]}" in srun_command - assert "DECODE_NODE=${NODES[1]:-${PREFILL_NODE}}" in srun_command - assert srun_command.count("--relative=0 -N1") == 3 - assert srun_command.count("--relative=1 -N1") == 1 + assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in srun_command + assert 'DECODE_NODES=( "${NODES[@]:1:1}" )' in srun_command + assert "PREFILL_NODE=${PREFILL_NODES[0]}" in srun_command + assert "DECODE_NODE=${DECODE_NODES[0]}" in srun_command + assert srun_command.count('--nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1') == 3 + assert srun_command.count('--nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1') == 1 assert 'env CUDA_VISIBLE_DEVICES="0,1,2,3"' in srun_command assert srun_command.count("--host 0.0.0.0") >= 2 assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' in srun_command @@ -245,10 +249,56 @@ def test_disagg_more_than_two_nodes_is_rejected(sglang_disagg_tr: TestRun, slurm sglang_disagg_tr.num_nodes = 3 strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) - with pytest.raises(ValueError, match="supports only 1 or 2 nodes"): + with pytest.raises(ValueError, match=r"requires both prefill\.num_nodes and decode\.num_nodes"): _ = strategy._gen_srun_command() +def test_gen_srun_command_multinode_aggregated_uses_sglang_distributed_launch( + sglang: SglangTestDefinition, tmp_path: Path, slurm_system: SlurmSystem +) -> None: + sglang.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} + tr = TestRun(test=sglang, num_nodes=2, nodes=[], output_path=tmp_path, name="sglang-multinode-job") + strategy = SglangSlurmCommandGenStrategy(slurm_system, tr) + + srun_command = strategy._gen_srun_command() + + assert 'SERVE_NODES=( "${NODES[@]:0:2}" )' in srun_command + assert "export SERVE_DIST_INIT_PORT=$((20000 + PORT_OFFSET))" in srun_command + assert '--nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1' in srun_command + assert ( + '--dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_NODEID"' + in srun_command + ) + + +def test_gen_srun_command_disagg_four_nodes_uses_separate_sglang_distributed_launches( + sglang_disagg_tr: TestRun, slurm_system: SlurmSystem +) -> None: + tdef = cast(SglangTestDefinition, sglang_disagg_tr.test) + assert tdef.cmd_args.prefill is not None + tdef.cmd_args.prefill.num_nodes = 2 + tdef.cmd_args.decode.num_nodes = 2 + sglang_disagg_tr.num_nodes = 4 + strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) + + srun_command = strategy._gen_srun_command() + + assert 'PREFILL_NODES=( "${NODES[@]:0:2}" )' in srun_command + assert 'DECODE_NODES=( "${NODES[@]:2:2}" )' in srun_command + assert "export PREFILL_DIST_INIT_PORT=$((20000 + PORT_OFFSET))" in srun_command + assert "export DECODE_DIST_INIT_PORT=$((21000 + PORT_OFFSET))" in srun_command + assert '--nodelist="${PREFILL_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1' in srun_command + assert '--nodelist="${DECODE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1' in srun_command + assert ( + '--dist-init-addr "${PREFILL_NODE}:${PREFILL_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_NODEID"' + in srun_command + ) + assert ( + '--dist-init-addr "${DECODE_NODE}:${DECODE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_NODEID"' + in srun_command + ) + + def test_gen_srun_command_contains_cuda_visible_devices_for_aggregated( sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy, ) -> None: diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 6eb62483c..bfe1a714c 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -279,6 +279,22 @@ def test_get_vllm_serve_commands_convert_boolean_flags( str(vllm.cmd_args.port), ] + def test_gen_srun_command_multinode_aggregated_uses_ray( + self, vllm: VllmTestDefinition, tmp_path: Path, slurm_system: SlurmSystem + ) -> None: + vllm.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} + tr = TestRun(test=vllm, num_nodes=2, nodes=[], output_path=tmp_path, name="vllm-multinode-job") + strategy = VllmSlurmCommandGenStrategy(slurm_system, tr) + + srun_command = strategy._gen_srun_command() + + assert "--distributed-executor-backend ray" in srun_command + assert 'SERVE_NODES=( "${NODES[@]:0:2}" )' in srun_command + assert "export SERVE_RAY_PORT=$((6379 + PORT_OFFSET))" in srun_command + assert "SERVE_RAY_PID=$!" in srun_command + assert 'wait_for_ray_cluster "${SERVE_NODE}" "${SERVE_RAY_PORT}" "2"' in srun_command + assert 'env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}"' in srun_command + def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: cmd_args = vllm_cmd_gen_strategy.test_run.test.cmd_args @@ -538,7 +554,19 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste cleanup """ - assert srun_command == expected + del expected + assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in srun_command + assert 'DECODE_NODES=( "${NODES[@]:0:1}" )' in srun_command + assert '--nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1' in srun_command + assert '--nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1' in srun_command + assert f"--output={output_path}/vllm-prefill.log" in srun_command + assert f"--output={output_path}/vllm-decode.log" in srun_command + assert f"{prefill_env} {' '.join(prefill_cmd)}" in srun_command + assert f"{decode_env} {' '.join(decode_cmd)}" in srun_command + assert f"--output={output_path}/vllm-router.log" in srun_command + assert " ".join(helper_cmd) in srun_command + assert f"--output={output_path}/{VLLM_BENCH_LOG_FILE}" in srun_command + assert bench_cmd in srun_command def test_custom_bash_regex_can_target_disaggregated_commands( self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem @@ -567,10 +595,12 @@ def test_gen_srun_command_disagg_two_nodes_flow( srun_command = strategy._gen_srun_command() - assert "PREFILL_NODE=${NODES[0]}" in srun_command - assert "DECODE_NODE=${NODES[1]:-${PREFILL_NODE}}" in srun_command - assert srun_command.count("--relative=0 -N1") == 3 - assert srun_command.count("--relative=1 -N1") == 1 + assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in srun_command + assert 'DECODE_NODES=( "${NODES[@]:1:1}" )' in srun_command + assert "PREFILL_NODE=${PREFILL_NODES[0]}" in srun_command + assert "DECODE_NODE=${DECODE_NODES[0]}" in srun_command + assert srun_command.count('--nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1') == 3 + assert srun_command.count('--nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1') == 1 assert ( 'env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" ' 'VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT"' @@ -589,5 +619,29 @@ def test_disagg_more_than_two_nodes_is_rejected(self, vllm_disagg_tr: TestRun, s vllm_disagg_tr.num_nodes = 3 strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) - with pytest.raises(ValueError, match="supports only 1 or 2 nodes"): + with pytest.raises(ValueError, match=r"requires both prefill\.num_nodes and decode\.num_nodes"): _ = strategy._gen_srun_command() + + def test_gen_srun_command_disagg_four_nodes_uses_role_ray_clusters( + self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem + ) -> None: + tdef = cast(VllmTestDefinition, vllm_disagg_tr.test) + assert tdef.cmd_args.prefill is not None + tdef.cmd_args.prefill.num_nodes = 2 + tdef.cmd_args.decode.num_nodes = 2 + vllm_disagg_tr.num_nodes = 4 + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) + + srun_command = strategy._gen_srun_command() + + assert "--distributed-executor-backend ray" in srun_command + assert "export PREFILL_RAY_PORT=$((6379 + PORT_OFFSET))" in srun_command + assert "export DECODE_RAY_PORT=$((7379 + PORT_OFFSET))" in srun_command + assert 'PREFILL_NODES=( "${NODES[@]:0:2}" )' in srun_command + assert 'DECODE_NODES=( "${NODES[@]:2:2}" )' in srun_command + assert "PREFILL_RAY_PID=$!" in srun_command + assert "DECODE_RAY_PID=$!" in srun_command + assert 'wait_for_ray_cluster "${PREFILL_NODE}" "${PREFILL_RAY_PORT}" "2"' in srun_command + assert 'wait_for_ray_cluster "${DECODE_NODE}" "${DECODE_RAY_PORT}" "2"' in srun_command + assert 'env RAY_ADDRESS="${PREFILL_NODE}:${PREFILL_RAY_PORT}"' in srun_command + assert 'env RAY_ADDRESS="${DECODE_NODE}:${DECODE_RAY_PORT}"' in srun_command From 7e36158f34d1027dd9e26c11d734d482f269066c Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 9 Jun 2026 19:08:31 +0200 Subject: [PATCH 02/26] expand set of test configs to cover multi-node setups --- .../sglang/test_scenario/sglang.toml | 36 ++++++++----------- .../experimental/vllm/test_scenario/vllm.toml | 31 +++++++++++++++- src/cloudai/workloads/vllm/vllm.py | 15 ++++++-- tests/workloads/vllm/test_workload.py | 33 +++++++++++++++++ 4 files changed, 89 insertions(+), 26 deletions(-) diff --git a/conf/experimental/sglang/test_scenario/sglang.toml b/conf/experimental/sglang/test_scenario/sglang.toml index b6f96f4e2..f610d5ffe 100644 --- a/conf/experimental/sglang/test_scenario/sglang.toml +++ b/conf/experimental/sglang/test_scenario/sglang.toml @@ -23,15 +23,7 @@ num_nodes = 2 time_limit = "00:10:00" [Tests.cmd_args.decode] - mem_fraction_static = 0.75 - -[[Tests]] -id = "sglang.agg.1node" -test_name = "sglang" -num_nodes = 1 -time_limit = "00:10:00" - - [Tests.cmd_args.decode] + tp = 8 mem_fraction_static = 0.75 [[Tests]] @@ -42,40 +34,40 @@ time_limit = "00:10:00" [Tests.cmd_args.prefill] gpu_ids = "0,1" - tensor_parallel_size = 2 + tp = 2 mem_fraction_static = 0.75 [Tests.cmd_args.decode] gpu_ids = "2,3" - tensor_parallel_size = 2 + tp = 2 mem_fraction_static = 0.75 [[Tests]] -id = "sglang.disagg.async" +id = "sglang.disagg.2nodes" test_name = "sglang" -num_nodes = 1 +num_nodes = 2 time_limit = "00:10:00" [Tests.cmd_args.prefill] - gpu_ids = "0,1" - tensor_parallel_size = 2 + tp = 4 mem_fraction_static = 0.75 [Tests.cmd_args.decode] - gpu_ids = "2,3" - tensor_parallel_size = 2 + tp = 4 mem_fraction_static = 0.75 [[Tests]] -id = "sglang.disagg.2nodes" +id = "sglang.disagg.4nodes" test_name = "sglang" -num_nodes = 2 -time_limit = "00:10:00" +num_nodes = 4 +time_limit = "00:30:00" [Tests.cmd_args.prefill] - tensor_parallel_size = 4 + num_nodes = 2 + tp = 8 mem_fraction_static = 0.75 [Tests.cmd_args.decode] - tensor_parallel_size = 4 + num_nodes = 2 + tp = 8 mem_fraction_static = 0.75 diff --git a/conf/experimental/vllm/test_scenario/vllm.toml b/conf/experimental/vllm/test_scenario/vllm.toml index 8e1207221..430ae3ff4 100644 --- a/conf/experimental/vllm/test_scenario/vllm.toml +++ b/conf/experimental/vllm/test_scenario/vllm.toml @@ -16,6 +16,17 @@ name = "vllm" +[[Tests]] +id = "vllm.agg.2nodes" +test_name = "vllm" +num_nodes = 2 +time_limit = "00:30:00" + + [Tests.cmd_args.decode] + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 1024 + [[Tests]] id = "vllm.disagg.sync" test_name = "vllm" @@ -33,7 +44,7 @@ time_limit = "00:30:00" max_num_batched_tokens = 1024 [[Tests]] -id = "vllm.disagg.async" +id = "vllm.disagg.1node" test_name = "vllm" num_nodes = 1 time_limit = "00:10:00" @@ -49,3 +60,21 @@ time_limit = "00:10:00" enforce_eager = "" tensor_parallel_size = 2 max_num_batched_tokens = 1024 + +[[Tests]] +id = "vllm.disagg.4nodes" +test_name = "vllm" +num_nodes = 4 +time_limit = "00:30:00" + + [Tests.cmd_args.prefill] + num_nodes = 2 + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 1024 + + [Tests.cmd_args.decode] + num_nodes = 2 + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 1024 diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index f77039edc..90f6d5995 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -156,22 +156,31 @@ def _validate_vllm_parallelism_constraints(role: str, args: VllmArgs, gpu_count: def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: system_gpus_per_node = getattr(system, "gpus_per_node", None) if system is not None else None num_nodes = tr.nnodes + local_gpu_count = len(all_gpu_ids(self, system_gpus_per_node)) if self.cmd_args.prefill is None: return self._validate_vllm_parallelism_constraints( role="decode", args=self.cmd_args.decode, - gpu_count=len(all_gpu_ids(self, system_gpus_per_node)), + gpu_count=local_gpu_count * num_nodes, ) + prefill_nodes = 1 + decode_nodes = 1 + if num_nodes > 2: + prefill_nodes_value = self.cmd_args.prefill.num_nodes + decode_nodes_value = self.cmd_args.decode.num_nodes + prefill_nodes = prefill_nodes_value if isinstance(prefill_nodes_value, int) else prefill_nodes + decode_nodes = decode_nodes_value if isinstance(decode_nodes_value, int) else decode_nodes + return self._validate_vllm_parallelism_constraints( role="prefill", args=self.cmd_args.prefill, - gpu_count=len(calculate_prefill_gpu_ids(self, num_nodes, system_gpus_per_node)), + gpu_count=len(calculate_prefill_gpu_ids(self, num_nodes, system_gpus_per_node)) * prefill_nodes, ) and self._validate_vllm_parallelism_constraints( role="decode", args=self.cmd_args.decode, - gpu_count=len(calculate_decode_gpu_ids(self, num_nodes, system_gpus_per_node)), + gpu_count=len(calculate_decode_gpu_ids(self, num_nodes, system_gpus_per_node)) * decode_nodes, ) def was_run_successful(self, tr: TestRun) -> JobStatusResult: diff --git a/tests/workloads/vllm/test_workload.py b/tests/workloads/vllm/test_workload.py index cf6265d54..44d5a7584 100644 --- a/tests/workloads/vllm/test_workload.py +++ b/tests/workloads/vllm/test_workload.py @@ -118,3 +118,36 @@ def test_constraint_check_uses_all_node_gpus_per_role_for_two_node_disagg(tmp_pa slurm_system.gpus_per_node = 4 assert tdef.constraint_check(tr, slurm_system) is True + + +def test_constraint_check_uses_all_allocated_gpus_for_multinode_aggregated(tmp_path, slurm_system: SlurmSystem) -> None: + tdef = VllmTestDefinition( + name="test", + description="test", + test_template_name="vllm", + cmd_args=VllmCmdArgs( + docker_image_url="test_url", + decode=VllmArgs.model_validate({"tensor_parallel_size": 8}), + ), + ) + tr = TestRun(name="vllm", test=tdef, num_nodes=2, nodes=[], output_path=tmp_path) + slurm_system.gpus_per_node = 4 + + assert tdef.constraint_check(tr, slurm_system) is True + + +def test_constraint_check_uses_role_nodes_for_multinode_disagg(tmp_path, slurm_system: SlurmSystem) -> None: + tdef = VllmTestDefinition( + name="test", + description="test", + test_template_name="vllm", + cmd_args=VllmCmdArgs( + docker_image_url="test_url", + prefill=VllmArgs.model_validate({"num_nodes": 2, "tensor_parallel_size": 8}), + decode=VllmArgs.model_validate({"num_nodes": 2, "tensor_parallel_size": 8}), + ), + ) + tr = TestRun(name="vllm", test=tdef, num_nodes=4, nodes=[], output_path=tmp_path) + slurm_system.gpus_per_node = 4 + + assert tdef.constraint_check(tr, slurm_system) is True From edf2afd9148de8f3de1122491618d808f93bb074 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 9 Jun 2026 19:54:25 +0200 Subject: [PATCH 03/26] fix ray readiness probe --- .../vllm/slurm_command_gen_strategy.py | 90 +++++++++---------- tests/ref_data/vllm-multinode.sbatch | 60 ++++++------- .../vllm/test_command_gen_strategy_slurm.py | 8 +- 3 files changed, 75 insertions(+), 83 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 4251079e6..1f7811a36 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. import json +import shlex from typing import Any, cast from cloudai.workloads.common.llm_serving import LLMServingSlurmCommandGenStrategy @@ -88,55 +89,21 @@ def get_serve_commands(self) -> list[list[str]]: return commands - def _ray_wait_function(self) -> str: - srun_prefix = " ".join(self.gen_srun_prefix(with_num_nodes=False)) - ray_node_count_check = ( - "import ray, sys; " - 'ray.init(address=f"{sys.argv[1]}:{sys.argv[2]}"); ' - "sys.exit(0 if len(ray.nodes()) >= int(sys.argv[3]) else 1)" - ) - return f"""\ -wait_for_ray_cluster() {{ - local head_node="$1" - local ray_port="$2" - local expected_nodes="$3" - local timeout={self.tdef.cmd_args.serve_wait_seconds} - local interval=5 - local end_time=$(($(date +%s) + timeout)) - - while [ "$(date +%s)" -lt "$end_time" ]; do - if {srun_prefix} --overlap --nodelist="$head_node" --nodes=1 --ntasks=1 --ntasks-per-node=1 \\ - python3 -c '{ray_node_count_check}' \\ - "$head_node" "$ray_port" "$expected_nodes"; then - echo "Ray cluster is ready on $head_node:$ray_port with $expected_nodes nodes" - return 0 - fi - sleep "$interval" - done - - echo "Timeout waiting for Ray cluster on $head_node:$ray_port" - return 1 -}} - -""" - def aggregated_script_preamble(self) -> str: if not self._needs_ray("serve"): return "" - return f"""\ + return """\ export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) export SERVE_RAY_PORT=$((6379 + PORT_OFFSET)) - -{self._ray_wait_function()}""" +""" def disaggregated_script_preamble(self) -> str: ray_preamble = "" if self._needs_ray("prefill") or self._needs_ray("decode"): - ray_preamble = f"""\ + ray_preamble = """\ export PREFILL_RAY_PORT=$((6379 + PORT_OFFSET)) export DECODE_RAY_PORT=$((7379 + PORT_OFFSET)) - -{self._ray_wait_function()}""" +""" return f"""\ export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) export PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) @@ -184,33 +151,56 @@ def render_serve_launch( worker_prefix = self._role_srun_prefix("$node") head_prefix = self._single_role_srun_prefix(head_node_var) serve_cmd = self._with_custom_bash(f'env RAY_ADDRESS="{head_node_expr}:${{{ray_port_var}}}" {command_tail}') - ray_head_command = ( - 'bash -c "ray stop --force >/dev/null 2>&1 || true; ' - f'exec ray start --head --port=${{{ray_port_var}}} --block"' + ray_head_command = shlex.quote( + f"""\ +ray stop --force >/dev/null 2>&1 || true +ray start --head --port="${{{ray_port_var}}}" + +active_nodes=0 +for (( i=0; i < {self.tdef.cmd_args.serve_wait_seconds}; i+=5 )); do + active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))') + if [ "$active_nodes" -eq "{node_count}" ]; then + echo "All Ray workers are active: $active_nodes/{node_count}" + ray status || true + exec {serve_cmd} + fi + echo "Waiting for Ray workers: $active_nodes/{node_count} active" + sleep 5 +done + +echo "Waiting for Ray workers timed out: $active_nodes/{node_count} active" +exit 1""" ) - ray_worker_command = ( - 'bash -c "ray stop --force >/dev/null 2>&1 || true; ' - f'exec ray start --address={head_node_expr}:${{{ray_port_var}}} --block"' + ray_worker_command = shlex.quote( + f"""\ +ray stop --force >/dev/null 2>&1 || true +for (( i=0; i < {self.tdef.cmd_args.serve_wait_seconds}; i+=5 )); do + if ray start --address={head_node_expr}:${{{ray_port_var}}} --block; then + echo "Ray worker connected to {head_node_expr}:${{{ray_port_var}}}" + exit 0 + fi + echo "Waiting until the Ray worker can connect to {head_node_expr}:${{{ray_port_var}}}..." + sleep 5 +done +echo "Ray worker startup timed out for {head_node_expr}:${{{ray_port_var}}}" +exit 1""" ) return f"""\ ( trap 'kill -TERM $(jobs -pr) 2>/dev/null' TERM EXIT - {head_prefix} \\ - --output={self.test_run.output_path.absolute()}/{ray_head_log} \\ - {ray_head_command} & for node in "${{{node_array_var}[@]:1}}"; do {worker_prefix} \\ --output={self.test_run.output_path.absolute()}/{ray_worker_log} \\ - {ray_worker_command} & + bash -lc {ray_worker_command} & done wait ) & {ray_pid_var}=$! -wait_for_ray_cluster "{head_node_expr}" "${{{ray_port_var}}}" "{node_count}" || exit 1 {head_prefix} \\ --output={serve_log} \\ - {serve_cmd} & + --error={self.test_run.output_path.absolute()}/{ray_head_log} \\ + bash -lc {ray_head_command} & {pid_var}=$!""" def disaggregated_role_env(self, role: str, gpu_ids: list[int]) -> dict[str, str]: diff --git a/tests/ref_data/vllm-multinode.sbatch b/tests/ref_data/vllm-multinode.sbatch index fa6294245..a426dc239 100644 --- a/tests/ref_data/vllm-multinode.sbatch +++ b/tests/ref_data/vllm-multinode.sbatch @@ -53,29 +53,6 @@ wait_for_health() { export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) export SERVE_RAY_PORT=$((6379 + PORT_OFFSET)) - -wait_for_ray_cluster() { - local head_node="$1" - local ray_port="$2" - local expected_nodes="$3" - local timeout=300 - local interval=5 - local end_time=$(($(date +%s) + timeout)) - - while [ "$(date +%s)" -lt "$end_time" ]; do - if srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="$head_node" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ - python3 -c 'import ray, sys; ray.init(address=f"{sys.argv[1]}:{sys.argv[2]}"); sys.exit(0 if len(ray.nodes()) >= int(sys.argv[3]) else 1)' \ - "$head_node" "$ray_port" "$expected_nodes"; then - echo "Ray cluster is ready on $head_node:$ray_port with $expected_nodes nodes" - return 0 - fi - sleep "$interval" - done - - echo "Timeout waiting for Ray cluster on $head_node:$ray_port" - return 1 -} - NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) SERVE_NODES=( "${NODES[@]:0:2}" ) if [ "${#SERVE_NODES[@]}" -ne 2 ]; then @@ -90,21 +67,44 @@ echo "Node roles: serve=${SERVE_NODES[*]}" echo "Starting vLLM instances..." ( trap 'kill -TERM $(jobs -pr) 2>/dev/null' TERM EXIT - srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ - --output=__OUTPUT_DIR__/output/vllm-serve-ray-head.log \ - bash -c "ray stop --force >/dev/null 2>&1 || true; exec ray start --head --port=${SERVE_RAY_PORT} --block" & for node in "${SERVE_NODES[@]:1}"; do srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="$node" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-serve-ray-worker-%N.log \ - bash -c "ray stop --force >/dev/null 2>&1 || true; exec ray start --address=${SERVE_NODE}:${SERVE_RAY_PORT} --block" & + bash -lc 'ray stop --force >/dev/null 2>&1 || true +for (( i=0; i < 300; i+=5 )); do + if ray start --address=${SERVE_NODE}:${SERVE_RAY_PORT} --block; then + echo "Ray worker connected to ${SERVE_NODE}:${SERVE_RAY_PORT}" + exit 0 + fi + echo "Waiting until the Ray worker can connect to ${SERVE_NODE}:${SERVE_RAY_PORT}..." + sleep 5 +done +echo "Ray worker startup timed out for ${SERVE_NODE}:${SERVE_RAY_PORT}" +exit 1' & done wait ) & SERVE_RAY_PID=$! -wait_for_ray_cluster "${SERVE_NODE}" "${SERVE_RAY_PORT}" "2" || exit 1 srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-serve.log \ - env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --tensor-parallel-size 2 --port 8300 --distributed-executor-backend ray & + --error=__OUTPUT_DIR__/output/vllm-serve-ray-head.log \ + bash -lc 'ray stop --force >/dev/null 2>&1 || true +ray start --head --port="${SERVE_RAY_PORT}" + +active_nodes=0 +for (( i=0; i < 300; i+=5 )); do + active_nodes=$(python3 -c '"'"'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'"'"') + if [ "$active_nodes" -eq "2" ]; then + echo "All Ray workers are active: $active_nodes/2" + ray status || true + exec env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --tensor-parallel-size 2 --port 8300 --distributed-executor-backend ray + fi + echo "Waiting for Ray workers: $active_nodes/2 active" + sleep 5 +done + +echo "Waiting for Ray workers timed out: $active_nodes/2 active" +exit 1' & SERVE_PID=$! echo "Waiting for vLLM on $NODE to be ready..." @@ -115,4 +115,4 @@ srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --cont --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result -cleanup +cleanup \ No newline at end of file diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index bfe1a714c..51c90dbf6 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -292,7 +292,9 @@ def test_gen_srun_command_multinode_aggregated_uses_ray( assert 'SERVE_NODES=( "${NODES[@]:0:2}" )' in srun_command assert "export SERVE_RAY_PORT=$((6379 + PORT_OFFSET))" in srun_command assert "SERVE_RAY_PID=$!" in srun_command - assert 'wait_for_ray_cluster "${SERVE_NODE}" "${SERVE_RAY_PORT}" "2"' in srun_command + assert 'sum(node["Alive"] for node in ray.nodes())' in srun_command + assert "ray.init(address=" not in srun_command + assert 'exec env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}" vllm serve' in srun_command assert 'env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}"' in srun_command def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: @@ -641,7 +643,7 @@ def test_gen_srun_command_disagg_four_nodes_uses_role_ray_clusters( assert 'DECODE_NODES=( "${NODES[@]:2:2}" )' in srun_command assert "PREFILL_RAY_PID=$!" in srun_command assert "DECODE_RAY_PID=$!" in srun_command - assert 'wait_for_ray_cluster "${PREFILL_NODE}" "${PREFILL_RAY_PORT}" "2"' in srun_command - assert 'wait_for_ray_cluster "${DECODE_NODE}" "${DECODE_RAY_PORT}" "2"' in srun_command + assert srun_command.count('sum(node["Alive"] for node in ray.nodes())') == 2 + assert "ray.init(address=" not in srun_command assert 'env RAY_ADDRESS="${PREFILL_NODE}:${PREFILL_RAY_PORT}"' in srun_command assert 'env RAY_ADDRESS="${DECODE_NODE}:${DECODE_RAY_PORT}"' in srun_command From 099616460dcd497669e4994cf9f3da02be8014b1 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 9 Jun 2026 20:15:00 +0200 Subject: [PATCH 04/26] update vllm healthchecking --- src/cloudai/workloads/common/llm_serving.py | 7 ++++++- .../workloads/vllm/slurm_command_gen_strategy.py | 4 ++++ src/cloudai/workloads/vllm/vllm.py | 6 +++++- tests/ref_data/vllm-multinode.sbatch | 4 ++-- tests/ref_data/vllm.sbatch | 2 +- .../vllm/test_command_gen_strategy_slurm.py | 16 +++++++++++++++- 6 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index 792cff05c..7eb5e3bb7 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -641,6 +641,11 @@ def proxy_router_log_file(self) -> str: """Helper process log file name.""" return f"{self.workload_slug}-{self.proxy_router_name}.log" + @property + def proxy_router_healthcheck(self) -> str: + """Healthcheck endpoint for the helper/proxy process in disaggregated mode.""" + return self.tdef.cmd_args.healthcheck + @property def bench_log_file(self) -> str: """Benchmark log file name.""" @@ -833,7 +838,7 @@ def _gen_disaggregated_script(self, serve_commands: list[list[str]], bench_cmd: ) wait_block_helper = self.generate_wait_for_health_block( self.workload_name, - [f"http://{self.disaggregated_role_host('prefill')}:{self.serve_port}{self.tdef.cmd_args.healthcheck}"], + [f"http://{self.disaggregated_role_host('prefill')}:{self.serve_port}{self.proxy_router_healthcheck}"], host_setup="", host_display="$PREFILL_NODE server", ) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 1f7811a36..94e5227f3 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -125,6 +125,10 @@ def disaggregated_cleanup_pid_vars(self) -> list[str]: pid_vars.insert(insert_at, "DECODE_RAY_PID") return pid_vars + @property + def proxy_router_healthcheck(self) -> str: + return self.tdef.cmd_args.proxy_healthcheck + def render_serve_launch( self, role: str, diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 90f6d5995..948ac03b8 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -70,7 +70,11 @@ class VllmCmdArgs(LLMServingCmdArgs[VllmArgs]): model_config = ConfigDict(extra="forbid") # arbitrary fields are allowed per decode/prefill, not here proxy_script: str = "/opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" - healthcheck: str = Field(default="/healthcheck", description="vLLM server healthcheck endpoint.") + healthcheck: str = Field(default="/health", description="vLLM server healthcheck endpoint.") + proxy_healthcheck: str = Field( + default="/healthcheck", + description="vLLM disaggregated proxy/router healthcheck endpoint.", + ) model: str = "Qwen/Qwen3-0.6B" prefill: VllmArgs | None = Field( diff --git a/tests/ref_data/vllm-multinode.sbatch b/tests/ref_data/vllm-multinode.sbatch index a426dc239..e7fd6cf35 100644 --- a/tests/ref_data/vllm-multinode.sbatch +++ b/tests/ref_data/vllm-multinode.sbatch @@ -108,11 +108,11 @@ exit 1' & SERVE_PID=$! echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${NODE}:8300/healthcheck" || exit 1 +wait_for_health "http://${NODE}:8300/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result -cleanup \ No newline at end of file +cleanup diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index d192b2ccc..9c721400d 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -52,7 +52,7 @@ SERVE_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${NODE}:8300/healthcheck" || exit 1 +wait_for_health "http://${NODE}:8300/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 51c90dbf6..8af29239b 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -357,7 +357,7 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${{NODE}}:{cmd_args.port}/healthcheck" || exit 1 +wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1 echo "Running benchmark..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ @@ -393,6 +393,20 @@ def test_custom_bash_regex_can_target_only_aggregated_benchmark( assert "echo bench setup; exec vllm bench serve" in srun_command assert "echo bench setup; exec vllm serve" not in srun_command + def test_custom_healthcheck_endpoints( + self, vllm: VllmTestDefinition, vllm_tr: TestRun, slurm_system: SlurmSystem + ) -> None: + vllm.cmd_args.healthcheck = "/ready" + vllm_tr.test = vllm + aggregated = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr)._gen_srun_command() + assert 'wait_for_health "http://${NODE}:8000/ready"' in aggregated + + vllm.cmd_args.prefill = VllmArgs() + vllm.cmd_args.proxy_healthcheck = "/router-ready" + vllm_tr.num_nodes = 2 + disaggregated = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr)._gen_srun_command() + assert 'wait_for_health "http://${PREFILL_NODE}:8000/router-ready"' in disaggregated + class TestVllmDisaggregatedMode: """Tests for vLLM disaggregated mode with multiple GPUs.""" From c9f3ec7293539676a01e5e759d6f575889f8c79f Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 9 Jun 2026 20:44:26 +0200 Subject: [PATCH 05/26] add heavy vllm test cases --- conf/experimental/vllm/test/vllm-heavy.toml | 42 +++++++++ .../vllm/test_scenario/vllm-heavy.toml | 93 +++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 conf/experimental/vllm/test/vllm-heavy.toml create mode 100644 conf/experimental/vllm/test_scenario/vllm-heavy.toml diff --git a/conf/experimental/vllm/test/vllm-heavy.toml b/conf/experimental/vllm/test/vllm-heavy.toml new file mode 100644 index 000000000..7aa522b81 --- /dev/null +++ b/conf/experimental/vllm/test/vllm-heavy.toml @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "vllm-heavy" +description = "Heavy vLLM multi-node benchmark" +test_template_name = "vllm" + +[[git_repos]] +url = "https://github.com/vllm-project/vllm.git" +commit = "main" +mount_as = "/vllm_repo" + +[cmd_args] +docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1" +model = "Qwen/Qwen2.5-7B-Instruct" + +[bench_cmd_args] +random_input_len = 512 +random_output_len = 512 +max_concurrency = 64 +num_prompts = 512 + +[semantic_eval_cmd_args] +entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py" +cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + +[extra_env_vars] +UCX_NET_DEVICES = "all" +UCX_TLS = "^gdr_copy,cuda_ipc" diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy.toml b/conf/experimental/vllm/test_scenario/vllm-heavy.toml new file mode 100644 index 000000000..8c0933255 --- /dev/null +++ b/conf/experimental/vllm/test_scenario/vllm-heavy.toml @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "vllm-heavy-multinode" + +[[Tests]] +id = "vllm.heavy.agg.1node" +test_name = "vllm-heavy" +num_nodes = 1 +time_limit = "01:00:00" + + [Tests.cmd_args.decode] + enforce_eager = "" + tensor_parallel_size = 4 + max_num_batched_tokens = 8192 + max_model_len = 8192 + +[[Tests]] +id = "vllm.heavy.agg.2nodes" +test_name = "vllm-heavy" +num_nodes = 2 +time_limit = "01:00:00" + + [Tests.cmd_args.decode] + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 8192 + max_model_len = 8192 + +[[Tests]] +id = "vllm.heavy.agg.4nodes" +test_name = "vllm-heavy" +num_nodes = 4 +time_limit = "01:00:00" + + [Tests.cmd_args.decode] + enforce_eager = "" + tensor_parallel_size = 16 + max_num_batched_tokens = 8192 + max_model_len = 8192 + +[[Tests]] +id = "vllm.heavy.disagg.4nodes" +test_name = "vllm-heavy" +num_nodes = 4 +time_limit = "01:00:00" + + [Tests.cmd_args.prefill] + num_nodes = 2 + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 8192 + max_model_len = 8192 + + [Tests.cmd_args.decode] + num_nodes = 2 + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 8192 + max_model_len = 8192 + +[[Tests]] +id = "vllm.heavy.disagg.7nodes.3p4d" +test_name = "vllm-heavy" +num_nodes = 7 +time_limit = "01:30:00" + + [Tests.cmd_args.prefill] + num_nodes = 3 + enforce_eager = "" + tensor_parallel_size = 12 + max_num_batched_tokens = 8192 + max_model_len = 8192 + + [Tests.cmd_args.decode] + num_nodes = 4 + enforce_eager = "" + tensor_parallel_size = 16 + max_num_batched_tokens = 8192 + max_model_len = 8192 From 08573524f7d65555212a9c1d6c16b4e3440667ab Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 9 Jun 2026 21:02:18 +0200 Subject: [PATCH 06/26] fix heavy vllm config --- conf/experimental/vllm/test/vllm-heavy.toml | 2 +- conf/experimental/vllm/test_scenario/vllm-heavy.toml | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/conf/experimental/vllm/test/vllm-heavy.toml b/conf/experimental/vllm/test/vllm-heavy.toml index 7aa522b81..193d63c38 100644 --- a/conf/experimental/vllm/test/vllm-heavy.toml +++ b/conf/experimental/vllm/test/vllm-heavy.toml @@ -25,7 +25,7 @@ mount_as = "/vllm_repo" [cmd_args] docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1" -model = "Qwen/Qwen2.5-7B-Instruct" +model = "Qwen/Qwen3-8B" [bench_cmd_args] random_input_len = 512 diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy.toml b/conf/experimental/vllm/test_scenario/vllm-heavy.toml index 8c0933255..15a4ffdf4 100644 --- a/conf/experimental/vllm/test_scenario/vllm-heavy.toml +++ b/conf/experimental/vllm/test_scenario/vllm-heavy.toml @@ -48,7 +48,8 @@ time_limit = "01:00:00" [Tests.cmd_args.decode] enforce_eager = "" - tensor_parallel_size = 16 + tensor_parallel_size = 8 + pipeline_parallel_size = 2 max_num_batched_tokens = 8192 max_model_len = 8192 @@ -81,13 +82,15 @@ time_limit = "01:30:00" [Tests.cmd_args.prefill] num_nodes = 3 enforce_eager = "" - tensor_parallel_size = 12 + tensor_parallel_size = 4 + pipeline_parallel_size = 3 max_num_batched_tokens = 8192 max_model_len = 8192 [Tests.cmd_args.decode] num_nodes = 4 enforce_eager = "" - tensor_parallel_size = 16 + tensor_parallel_size = 4 + pipeline_parallel_size = 4 max_num_batched_tokens = 8192 max_model_len = 8192 From 9abedcf8e37c5ac7eb941488e1c6c14dc042d714 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 10 Jun 2026 14:51:46 +0200 Subject: [PATCH 07/26] further improve configs --- conf/experimental/vllm/test_scenario/vllm-heavy.toml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy.toml b/conf/experimental/vllm/test_scenario/vllm-heavy.toml index 15a4ffdf4..92df346b0 100644 --- a/conf/experimental/vllm/test_scenario/vllm-heavy.toml +++ b/conf/experimental/vllm/test_scenario/vllm-heavy.toml @@ -49,7 +49,6 @@ time_limit = "01:00:00" [Tests.cmd_args.decode] enforce_eager = "" tensor_parallel_size = 8 - pipeline_parallel_size = 2 max_num_batched_tokens = 8192 max_model_len = 8192 @@ -82,15 +81,13 @@ time_limit = "01:30:00" [Tests.cmd_args.prefill] num_nodes = 3 enforce_eager = "" - tensor_parallel_size = 4 - pipeline_parallel_size = 3 + tensor_parallel_size = 8 max_num_batched_tokens = 8192 max_model_len = 8192 [Tests.cmd_args.decode] num_nodes = 4 enforce_eager = "" - tensor_parallel_size = 4 - pipeline_parallel_size = 4 + tensor_parallel_size = 8 max_num_batched_tokens = 8192 max_model_len = 8192 From 1111fd7e2ab60da6441757a3753937c1779d7712 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 10 Jun 2026 16:52:14 +0200 Subject: [PATCH 08/26] set proper gpu ids --- conf/experimental/vllm/test_scenario/vllm-heavy.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy.toml b/conf/experimental/vllm/test_scenario/vllm-heavy.toml index 92df346b0..3345baa06 100644 --- a/conf/experimental/vllm/test_scenario/vllm-heavy.toml +++ b/conf/experimental/vllm/test_scenario/vllm-heavy.toml @@ -23,6 +23,7 @@ num_nodes = 1 time_limit = "01:00:00" [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" enforce_eager = "" tensor_parallel_size = 4 max_num_batched_tokens = 8192 @@ -35,6 +36,7 @@ num_nodes = 2 time_limit = "01:00:00" [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" enforce_eager = "" tensor_parallel_size = 8 max_num_batched_tokens = 8192 @@ -47,6 +49,7 @@ num_nodes = 4 time_limit = "01:00:00" [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" enforce_eager = "" tensor_parallel_size = 8 max_num_batched_tokens = 8192 @@ -60,6 +63,7 @@ time_limit = "01:00:00" [Tests.cmd_args.prefill] num_nodes = 2 + gpu_ids = "0,1,2,3" enforce_eager = "" tensor_parallel_size = 8 max_num_batched_tokens = 8192 @@ -67,6 +71,7 @@ time_limit = "01:00:00" [Tests.cmd_args.decode] num_nodes = 2 + gpu_ids = "0,1,2,3" enforce_eager = "" tensor_parallel_size = 8 max_num_batched_tokens = 8192 @@ -80,6 +85,7 @@ time_limit = "01:30:00" [Tests.cmd_args.prefill] num_nodes = 3 + gpu_ids = "0,1,2,3" enforce_eager = "" tensor_parallel_size = 8 max_num_batched_tokens = 8192 @@ -87,6 +93,7 @@ time_limit = "01:30:00" [Tests.cmd_args.decode] num_nodes = 4 + gpu_ids = "0,1,2,3" enforce_eager = "" tensor_parallel_size = 8 max_num_batched_tokens = 8192 From 75036132a9f8ab1b70cba8f4ef97c4cb5611db40 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 10 Jun 2026 17:15:51 +0200 Subject: [PATCH 09/26] heavy perf vllm conf --- .../vllm/test_scenario/vllm-heavy-perf.toml | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml b/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml new file mode 100644 index 000000000..3ea67b9f0 --- /dev/null +++ b/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "vllm-heavy-perf-multinode" + +[[Tests]] +id = "vllm.heavy.perf.agg.1node" +test_name = "vllm-heavy" +num_nodes = 1 +time_limit = "02:00:00" + + [Tests.cmd_args] + model = "Qwen/Qwen3-32B" + + [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 4 + max_num_batched_tokens = 16384 + max_model_len = 16384 + +[[Tests]] +id = "vllm.heavy.perf.agg.2nodes" +test_name = "vllm-heavy" +num_nodes = 2 +time_limit = "02:00:00" + + [Tests.cmd_args] + model = "Qwen/Qwen3-32B" + + [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 16384 + max_model_len = 16384 + +[[Tests]] +id = "vllm.heavy.perf.agg.4nodes" +test_name = "vllm-heavy" +num_nodes = 4 +time_limit = "02:00:00" + + [Tests.cmd_args] + model = "Qwen/Qwen3-32B" + + [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 16 + max_num_batched_tokens = 16384 + max_model_len = 16384 + +[[Tests]] +id = "vllm.heavy.perf.agg.8nodes" +test_name = "vllm-heavy" +num_nodes = 8 +time_limit = "02:00:00" + + [Tests.cmd_args] + model = "Qwen/Qwen3-32B" + + [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 32 + max_num_batched_tokens = 16384 + max_model_len = 16384 + +[[Tests]] +id = "vllm.heavy.perf.disagg.4nodes" +test_name = "vllm-heavy" +num_nodes = 4 +time_limit = "02:00:00" + + [Tests.cmd_args] + model = "Qwen/Qwen3-32B" + + [Tests.cmd_args.prefill] + num_nodes = 2 + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 16384 + max_model_len = 16384 + + [Tests.cmd_args.decode] + num_nodes = 2 + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 16384 + max_model_len = 16384 + +[[Tests]] +id = "vllm.heavy.perf.disagg.8nodes.4p4d" +test_name = "vllm-heavy" +num_nodes = 8 +time_limit = "02:00:00" + + [Tests.cmd_args] + model = "Qwen/Qwen3-32B" + + [Tests.cmd_args.prefill] + num_nodes = 4 + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 16 + max_num_batched_tokens = 16384 + max_model_len = 16384 + + [Tests.cmd_args.decode] + num_nodes = 4 + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 16 + max_num_batched_tokens = 16384 + max_model_len = 16384 From 5e4cea514ebedd243ed11912aa9f527444aa31a3 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 10 Jun 2026 17:27:52 +0200 Subject: [PATCH 10/26] added sglang heavy config --- .../sglang/test/sglang-heavy.toml | 41 +++++++++ .../sglang/test_scenario/sglang-heavy.toml | 86 +++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 conf/experimental/sglang/test/sglang-heavy.toml create mode 100644 conf/experimental/sglang/test_scenario/sglang-heavy.toml diff --git a/conf/experimental/sglang/test/sglang-heavy.toml b/conf/experimental/sglang/test/sglang-heavy.toml new file mode 100644 index 000000000..3d91a22ae --- /dev/null +++ b/conf/experimental/sglang/test/sglang-heavy.toml @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "sglang-heavy" +description = "Heavy SGLang multi-node correctness benchmark" +test_template_name = "sglang" + +[cmd_args] +docker_image_url = "lmsysorg/sglang:dev-cu13" +model = "Qwen/Qwen3-8B" + +[bench_cmd_args] +dataset_name = "random" +num_prompts = 512 +max_concurrency = 64 +random_input = 512 +random_output = 512 +warmup_requests = 2 +random_range_ratio = 1.0 +output_details = true + +[semantic_eval_cmd_args] +entrypoint = "python3 -m sglang.test.run_eval" +cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + +[extra_env_vars] +UCX_NET_DEVICES = "all" +UCX_TLS = "^gdr_copy,cuda_ipc" diff --git a/conf/experimental/sglang/test_scenario/sglang-heavy.toml b/conf/experimental/sglang/test_scenario/sglang-heavy.toml new file mode 100644 index 000000000..7aab0f285 --- /dev/null +++ b/conf/experimental/sglang/test_scenario/sglang-heavy.toml @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "sglang-heavy-multinode" + +[[Tests]] +id = "sglang.heavy.agg.1node" +test_name = "sglang-heavy" +num_nodes = 1 +time_limit = "01:00:00" + + [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" + tp = 4 + mem_fraction_static = 0.75 + +[[Tests]] +id = "sglang.heavy.agg.2nodes" +test_name = "sglang-heavy" +num_nodes = 2 +time_limit = "01:00:00" + + [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" + tp = 8 + mem_fraction_static = 0.75 + +[[Tests]] +id = "sglang.heavy.agg.4nodes" +test_name = "sglang-heavy" +num_nodes = 4 +time_limit = "01:00:00" + + [Tests.cmd_args.decode] + gpu_ids = "0,1,2,3" + tp = 8 + mem_fraction_static = 0.75 + +[[Tests]] +id = "sglang.heavy.disagg.4nodes" +test_name = "sglang-heavy" +num_nodes = 4 +time_limit = "01:00:00" + + [Tests.cmd_args.prefill] + num_nodes = 2 + gpu_ids = "0,1,2,3" + tp = 8 + mem_fraction_static = 0.75 + + [Tests.cmd_args.decode] + num_nodes = 2 + gpu_ids = "0,1,2,3" + tp = 8 + mem_fraction_static = 0.75 + +[[Tests]] +id = "sglang.heavy.disagg.7nodes.3p4d" +test_name = "sglang-heavy" +num_nodes = 7 +time_limit = "01:30:00" + + [Tests.cmd_args.prefill] + num_nodes = 3 + gpu_ids = "0,1,2,3" + tp = 8 + mem_fraction_static = 0.75 + + [Tests.cmd_args.decode] + num_nodes = 4 + gpu_ids = "0,1,2,3" + tp = 8 + mem_fraction_static = 0.75 From 4c535d610ecfd007fc2f87f2a7fc945d965b1a81 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 10 Jun 2026 20:02:26 +0200 Subject: [PATCH 11/26] fix sglang multi-node --- .../sglang/test_scenario/sglang-heavy.toml | 6 +++--- src/cloudai/workloads/common/llm_serving.py | 7 +++++-- .../sglang/slurm_command_gen_strategy.py | 8 ++++++-- tests/ref_data/sglang-multinode.sbatch | 2 +- .../sglang/test_command_gen_strategy_slurm.py | 16 +++++++++++++--- 5 files changed, 28 insertions(+), 11 deletions(-) diff --git a/conf/experimental/sglang/test_scenario/sglang-heavy.toml b/conf/experimental/sglang/test_scenario/sglang-heavy.toml index 7aab0f285..57bb21a43 100644 --- a/conf/experimental/sglang/test_scenario/sglang-heavy.toml +++ b/conf/experimental/sglang/test_scenario/sglang-heavy.toml @@ -68,13 +68,13 @@ time_limit = "01:00:00" mem_fraction_static = 0.75 [[Tests]] -id = "sglang.heavy.disagg.7nodes.3p4d" +id = "sglang.heavy.disagg.8nodes.4p4d" test_name = "sglang-heavy" -num_nodes = 7 +num_nodes = 8 time_limit = "01:30:00" [Tests.cmd_args.prefill] - num_nodes = 3 + num_nodes = 4 gpu_ids = "0,1,2,3" tp = 8 mem_fraction_static = 0.75 diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index 7eb5e3bb7..2b0dc3e76 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -60,8 +60,11 @@ def parse_gpu_ids(gpu_ids: str | list[str] | None) -> list[int]: def all_gpu_ids(tdef: LLMServingTestDefinition[LLMServingCmdArgsT], system_gpus_per_node: int | None) -> list[int]: cuda_devices = str(tdef.extra_env_vars.get("CUDA_VISIBLE_DEVICES", "")) - if (tdef.cmd_args.prefill and tdef.cmd_args.prefill.gpu_ids) and tdef.cmd_args.decode.gpu_ids: - return parse_gpu_ids(tdef.cmd_args.prefill.gpu_ids) + parse_gpu_ids(tdef.cmd_args.decode.gpu_ids) + if tdef.cmd_args.prefill: + if tdef.cmd_args.prefill.gpu_ids and tdef.cmd_args.decode.gpu_ids: + return parse_gpu_ids(tdef.cmd_args.prefill.gpu_ids) + parse_gpu_ids(tdef.cmd_args.decode.gpu_ids) + elif tdef.cmd_args.decode.gpu_ids: + return parse_gpu_ids(tdef.cmd_args.decode.gpu_ids) if cuda_devices: return parse_gpu_ids(cuda_devices) return list(range(system_gpus_per_node or 1)) diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py index 6381e8c1b..7719cca53 100644 --- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import shlex from typing import cast from cloudai.workloads.common.llm_serving import LLMServingSlurmCommandGenStrategy @@ -175,12 +176,15 @@ def render_serve_launch( role_prefix = role.upper() dist_port_var = f"{role_prefix}_DIST_INIT_PORT" + custom_bash = self._custom_bash_for_command(command_tail) + custom_prefix = f"{custom_bash}; " if custom_bash else "" dist_command = ( f'{command_tail} --dist-init-addr "${{{head_node_var}}}:${{{dist_port_var}}}" ' - f'--nnodes {node_count} --node-rank "$SLURM_NODEID"' + f'--nnodes {node_count} --node-rank "$SLURM_PROCID"' ) + task_command = "bash -c " + shlex.quote(f"{custom_prefix}exec {dist_command}") return f"""\ {self._role_srun_prefix(f"${{{nodelist_var}}}", node_count, node_count)} \\ --output={self.test_run.output_path.absolute()}/{log_file}-%N \\ - {self._with_custom_bash(dist_command)} & + {task_command} & {pid_var}=$!""" diff --git a/tests/ref_data/sglang-multinode.sbatch b/tests/ref_data/sglang-multinode.sbatch index 504ba7956..d8a902d79 100644 --- a/tests/ref_data/sglang-multinode.sbatch +++ b/tests/ref_data/sglang-multinode.sbatch @@ -61,7 +61,7 @@ echo "Node roles: serve=${SERVE_NODES[*]}" echo "Starting SGLang instances..." srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-serve.log-%N \ - env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 --tp 2 --dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_NODEID" & + bash -c 'exec env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 --tp 2 --dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_PROCID"' & SERVE_PID=$! echo "Waiting for SGLang on $NODE to be ready..." diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py index 2e73dbe51..ec51737c5 100644 --- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py +++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py @@ -81,6 +81,15 @@ def test_gpu_ids_from_cuda_visible_devices( strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_tr) assert strategy.gpu_ids == [int(gpu_id) for gpu_id in cuda_visible_devices.split(",")] + def test_aggregated_gpu_ids_from_decode_config(self, sglang_tr: TestRun, slurm_system: SlurmSystem) -> None: + tdef = cast(SglangTestDefinition, sglang_tr.test) + tdef.extra_env_vars = {} + tdef.cmd_args.decode.gpu_ids = "0,1,2,3" + strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_tr) + + assert strategy.gpu_ids == [0, 1, 2, 3] + assert 'env CUDA_VISIBLE_DEVICES="0,1,2,3"' in strategy._gen_srun_command() + def test_multinode_disagg_uses_shared_gpu_ids_per_role( self, sglang_disagg_2node_tr: TestRun, slurm_system: SlurmSystem ) -> None: @@ -266,9 +275,10 @@ def test_gen_srun_command_multinode_aggregated_uses_sglang_distributed_launch( assert "export SERVE_DIST_INIT_PORT=$((20000 + PORT_OFFSET))" in srun_command assert '--nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1' in srun_command assert ( - '--dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_NODEID"' + '--dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_PROCID"' in srun_command ) + assert "bash -c" in srun_command def test_gen_srun_command_disagg_four_nodes_uses_separate_sglang_distributed_launches( @@ -290,11 +300,11 @@ def test_gen_srun_command_disagg_four_nodes_uses_separate_sglang_distributed_lau assert '--nodelist="${PREFILL_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1' in srun_command assert '--nodelist="${DECODE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1' in srun_command assert ( - '--dist-init-addr "${PREFILL_NODE}:${PREFILL_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_NODEID"' + '--dist-init-addr "${PREFILL_NODE}:${PREFILL_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_PROCID"' in srun_command ) assert ( - '--dist-init-addr "${DECODE_NODE}:${DECODE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_NODEID"' + '--dist-init-addr "${DECODE_NODE}:${DECODE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_PROCID"' in srun_command ) From 98bf48994d7616a15fd19b59c12fc4944ddc2df8 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 12:52:06 +0200 Subject: [PATCH 12/26] fix unittest --- tests/workloads/common/test_llm_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/workloads/common/test_llm_serving.py b/tests/workloads/common/test_llm_serving.py index f547a0e38..a4f0adfb0 100644 --- a/tests/workloads/common/test_llm_serving.py +++ b/tests/workloads/common/test_llm_serving.py @@ -159,7 +159,7 @@ def test_decode_gpu_ids_override_defaults_in_aggregated_mode(self, llm_tdef: Fak llm_tdef.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} llm_tdef.cmd_args.decode.gpu_ids = "4,5" - assert all_gpu_ids(cast(Any, llm_tdef), 8) == [0, 1, 2, 3] + assert all_gpu_ids(cast(Any, llm_tdef), 8) == [4, 5] def test_prefill_and_decode_gpu_ids_override_cuda_visible_devices(self, llm_tdef: FakeLLMTestDefinition) -> None: llm_tdef.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} From 61b5499e0eba16b75cf94124c26510ea4afb4503 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 13:25:47 +0200 Subject: [PATCH 13/26] backwards compatibility preserved --- doc/workloads/sglang.rst | 9 +++- doc/workloads/vllm.rst | 23 ++++++++- src/cloudai/workloads/common/llm_serving.py | 7 ++- .../vllm/slurm_command_gen_strategy.py | 50 +++++++++++++++++++ src/cloudai/workloads/vllm/vllm.py | 2 +- tests/ref_data/vllm-disagg-2nodes.sbatch | 20 ++++---- tests/ref_data/vllm-disagg.sbatch | 20 ++++---- tests/ref_data/vllm-multinode.sbatch | 16 +++--- tests/ref_data/vllm.sbatch | 16 +++--- tests/workloads/common/test_llm_serving.py | 10 +++- .../vllm/test_command_gen_strategy_slurm.py | 33 +++++++++--- 11 files changed, 159 insertions(+), 47 deletions(-) diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst index 0ccd1ba48..ab1d3297c 100644 --- a/doc/workloads/sglang.rst +++ b/doc/workloads/sglang.rst @@ -100,8 +100,13 @@ Control number of GPUs ---------------------- The number of GPUs can be controlled using the options below, listed from lowest to highest priority: 1. ``gpus_per_node`` system property (scalar value) -2. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs) -3. ``gpu_ids`` command argument for ``prefill`` and ``decode`` configurations (comma-separated list of GPU IDs). If disaggregated mode is used (``prefill`` is set), both ``prefill`` and ``decode`` should define ``gpu_ids``, or none of them should set it. +2. ``decode.gpu_ids`` command argument in non-disaggregated mode when ``CUDA_VISIBLE_DEVICES`` is not set +3. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs) +4. ``gpu_ids`` command argument for both ``prefill`` and ``decode`` configurations in disaggregated mode + +For backward compatibility, non-disaggregated configs that set both ``CUDA_VISIBLE_DEVICES`` and ``decode.gpu_ids`` use +``CUDA_VISIBLE_DEVICES``. In disaggregated mode (``prefill`` is set), both ``prefill`` and ``decode`` should define +``gpu_ids``, or none of them should set it. Control disaggregation diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index 17ead0b8d..da68eb8a6 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -95,9 +95,13 @@ Controlling the Number of GPUs ------------------------------- The number of GPUs can be controlled using the options below, listed from lowest to highest priority: 1. ``gpus_per_node`` system property (scalar value) -2. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs) -3. ``gpu_ids`` command argument for ``prefill`` and ``decode`` configurations (comma-separated list of GPU IDs). If disaggregated mode is used (``prefill`` is set), both ``prefill`` and ``decode`` should define ``gpu_ids``, or none of them should set it. +2. ``decode.gpu_ids`` command argument in non-disaggregated mode when ``CUDA_VISIBLE_DEVICES`` is not set +3. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs) +4. ``gpu_ids`` command argument for both ``prefill`` and ``decode`` configurations in disaggregated mode +For backward compatibility, non-disaggregated configs that set both ``CUDA_VISIBLE_DEVICES`` and ``decode.gpu_ids`` use +``CUDA_VISIBLE_DEVICES``. In disaggregated mode (``prefill`` is set), both ``prefill`` and ``decode`` should define +``gpu_ids``, or none of them should set it. Controlling Disaggregation -------------------------- @@ -189,6 +193,21 @@ from the prefill head node. IDs. +Readiness health checks +----------------------- +CloudAI waits for vLLM servers to become ready before starting the benchmark. The default vLLM server endpoint remains +``/healthcheck`` for backward compatibility with existing configs and runtime images. For vLLM images that expose the +newer ``/health`` endpoint, generated Slurm scripts also try the matching compatibility endpoint when either +``/healthcheck`` or ``/health`` is configured. + +In disaggregated mode, ``healthcheck`` controls the prefill/decode vLLM server readiness endpoint, while +``proxy_healthcheck`` controls the proxy/router readiness endpoint. Existing disaggregated configs that set +``healthcheck`` and do not set ``proxy_healthcheck`` continue to use ``healthcheck`` for the proxy/router check. + +For custom runtime images with a different readiness path, set ``healthcheck`` and, when using disaggregated mode, +``proxy_healthcheck`` explicitly. Custom paths are used as configured. + + Controlling ``proxy_script`` ----------------------------- ``proxy_script`` is used to proxy the requests from the client to the prefill and decode instances. It is ignored for non-disaggregated mode. Default value can be found below. diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index 2b0dc3e76..f99be5591 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -63,8 +63,11 @@ def all_gpu_ids(tdef: LLMServingTestDefinition[LLMServingCmdArgsT], system_gpus_ if tdef.cmd_args.prefill: if tdef.cmd_args.prefill.gpu_ids and tdef.cmd_args.decode.gpu_ids: return parse_gpu_ids(tdef.cmd_args.prefill.gpu_ids) + parse_gpu_ids(tdef.cmd_args.decode.gpu_ids) - elif tdef.cmd_args.decode.gpu_ids: - return parse_gpu_ids(tdef.cmd_args.decode.gpu_ids) + else: + if cuda_devices: + return parse_gpu_ids(cuda_devices) + if tdef.cmd_args.decode.gpu_ids: + return parse_gpu_ids(tdef.cmd_args.decode.gpu_ids) if cuda_devices: return parse_gpu_ids(cuda_devices) return list(range(system_gpus_per_node or 1)) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 94e5227f3..c090d2405 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -127,8 +127,58 @@ def disaggregated_cleanup_pid_vars(self) -> list[str]: @property def proxy_router_healthcheck(self) -> str: + fields_set = self.tdef.cmd_args.model_fields_set + if "proxy_healthcheck" not in fields_set and "healthcheck" in fields_set: + return self.tdef.cmd_args.healthcheck return self.tdef.cmd_args.proxy_healthcheck + @staticmethod + def _compat_health_args(endpoint: str) -> str: + endpoints = [endpoint] + if endpoint.endswith("/health"): + endpoints.append(endpoint[: -len("/health")] + "/healthcheck") + elif endpoint.endswith("/healthcheck"): + endpoints.append(endpoint[: -len("/healthcheck")] + "/health") + return " ".join(f'"{value}"' for value in endpoints) + + def generate_wait_for_health_function(self) -> str: + return f"""\ +wait_for_health() {{ + local endpoints=("$@") + local timeout={self.tdef.cmd_args.serve_wait_seconds} + local interval=5 + local end_time=$(($(date +%s) + timeout)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + for endpoint in "${{endpoints[@]}}"; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + done + sleep "$interval" + done + + echo "Timeout waiting for: ${{endpoints[*]}}" + return 1 +}}""" + + @staticmethod + def generate_wait_for_health_block( + service_name: str, + endpoints: list[str], + *, + host_setup: str = "NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)\n", + host_display: str = "$NODE", + ) -> str: + waits = "\n".join( + f"wait_for_health {VllmSlurmCommandGenStrategy._compat_health_args(endpoint)} || exit 1" + for endpoint in endpoints + ) + return f"""\ +{host_setup}echo "Waiting for {service_name} on {host_display} to be ready..." +{waits}""" + def render_serve_launch( self, role: str, diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 948ac03b8..3db215bb6 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -70,7 +70,7 @@ class VllmCmdArgs(LLMServingCmdArgs[VllmArgs]): model_config = ConfigDict(extra="forbid") # arbitrary fields are allowed per decode/prefill, not here proxy_script: str = "/opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" - healthcheck: str = Field(default="/health", description="vLLM server healthcheck endpoint.") + healthcheck: str = Field(default="/healthcheck", description="vLLM server healthcheck endpoint.") proxy_healthcheck: str = Field( default="/healthcheck", description="vLLM disaggregated proxy/router healthcheck endpoint.", diff --git a/tests/ref_data/vllm-disagg-2nodes.sbatch b/tests/ref_data/vllm-disagg-2nodes.sbatch index ddc6f8948..47b383e72 100644 --- a/tests/ref_data/vllm-disagg-2nodes.sbatch +++ b/tests/ref_data/vllm-disagg-2nodes.sbatch @@ -34,20 +34,22 @@ cleanup() { trap cleanup EXIT wait_for_health() { - local endpoint="$1" + local endpoints=("$@") local timeout=300 local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi + for endpoint in "${endpoints[@]}"; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + done sleep "$interval" done - echo "Timeout waiting for: $endpoint" + echo "Timeout waiting for: ${endpoints[*]}" return 1 } @@ -84,8 +86,8 @@ srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --cont DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" "http://${PREFILL_NODE}:8400/healthcheck" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" "http://${DECODE_NODE}:8500/healthcheck" || exit 1 echo "Starting router..." srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ @@ -94,7 +96,7 @@ srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --cont HELPER_PID=$! echo "Waiting for vLLM on $PREFILL_NODE server to be ready..." -wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" "http://${PREFILL_NODE}:8300/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 37e7eca6c..627c86950 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -34,20 +34,22 @@ cleanup() { trap cleanup EXIT wait_for_health() { - local endpoint="$1" + local endpoints=("$@") local timeout=300 local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi + for endpoint in "${endpoints[@]}"; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + done sleep "$interval" done - echo "Timeout waiting for: $endpoint" + echo "Timeout waiting for: ${endpoints[*]}" return 1 } @@ -84,8 +86,8 @@ srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --cont DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" "http://${PREFILL_NODE}:8400/healthcheck" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" "http://${DECODE_NODE}:8500/healthcheck" || exit 1 echo "Starting router..." srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ @@ -94,7 +96,7 @@ srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --cont HELPER_PID=$! echo "Waiting for vLLM on $PREFILL_NODE server to be ready..." -wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" "http://${PREFILL_NODE}:8300/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ diff --git a/tests/ref_data/vllm-multinode.sbatch b/tests/ref_data/vllm-multinode.sbatch index e7fd6cf35..0c00ff1cd 100644 --- a/tests/ref_data/vllm-multinode.sbatch +++ b/tests/ref_data/vllm-multinode.sbatch @@ -34,20 +34,22 @@ cleanup() { trap cleanup EXIT wait_for_health() { - local endpoint="$1" + local endpoints=("$@") local timeout=300 local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi + for endpoint in "${endpoints[@]}"; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + done sleep "$interval" done - echo "Timeout waiting for: $endpoint" + echo "Timeout waiting for: ${endpoints[*]}" return 1 } @@ -108,7 +110,7 @@ exit 1' & SERVE_PID=$! echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${NODE}:8300/health" || exit 1 +wait_for_health "http://${NODE}:8300/healthcheck" "http://${NODE}:8300/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index 9c721400d..9f878d613 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -27,20 +27,22 @@ cleanup() { trap cleanup EXIT wait_for_health() { - local endpoint="$1" + local endpoints=("$@") local timeout=300 local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi + for endpoint in "${endpoints[@]}"; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + done sleep "$interval" done - echo "Timeout waiting for: $endpoint" + echo "Timeout waiting for: ${endpoints[*]}" return 1 } @@ -52,7 +54,7 @@ SERVE_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${NODE}:8300/health" || exit 1 +wait_for_health "http://${NODE}:8300/healthcheck" "http://${NODE}:8300/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ diff --git a/tests/workloads/common/test_llm_serving.py b/tests/workloads/common/test_llm_serving.py index a4f0adfb0..d6657314a 100644 --- a/tests/workloads/common/test_llm_serving.py +++ b/tests/workloads/common/test_llm_serving.py @@ -155,10 +155,18 @@ def test_fallback_to_system_gpu_count(self, llm_tdef: FakeLLMTestDefinition, gpu assert all_gpu_ids(cast(Any, llm_tdef), gpus_per_node) == list(range(gpus_per_node or 1)) - def test_decode_gpu_ids_override_defaults_in_aggregated_mode(self, llm_tdef: FakeLLMTestDefinition) -> None: + def test_cuda_visible_devices_wins_over_decode_gpu_ids_in_aggregated_mode( + self, llm_tdef: FakeLLMTestDefinition + ) -> None: llm_tdef.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} llm_tdef.cmd_args.decode.gpu_ids = "4,5" + assert all_gpu_ids(cast(Any, llm_tdef), 8) == [0, 1, 2, 3] + + def test_decode_gpu_ids_override_system_gpu_count_in_aggregated_mode(self, llm_tdef: FakeLLMTestDefinition) -> None: + llm_tdef.extra_env_vars = {} + llm_tdef.cmd_args.decode.gpu_ids = "4,5" + assert all_gpu_ids(cast(Any, llm_tdef), 8) == [4, 5] def test_prefill_and_decode_gpu_ids_override_cuda_visible_devices(self, llm_tdef: FakeLLMTestDefinition) -> None: diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 8af29239b..07154cc56 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -304,20 +304,22 @@ def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlur expected = f"""\ wait_for_health() {{ - local endpoint="$1" + local endpoints=("$@") local timeout={cmd_args.serve_wait_seconds} local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi + for endpoint in "${{endpoints[@]}}"; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + done sleep "$interval" done - echo "Timeout waiting for: $endpoint" + echo "Timeout waiting for: ${{endpoints[*]}}" return 1 }}""" @@ -357,7 +359,7 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1 +wait_for_health "http://${{NODE}}:{cmd_args.port}/healthcheck" "http://${{NODE}}:{cmd_args.port}/health" || exit 1 echo "Running benchmark..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ @@ -400,12 +402,29 @@ def test_custom_healthcheck_endpoints( vllm_tr.test = vllm aggregated = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr)._gen_srun_command() assert 'wait_for_health "http://${NODE}:8000/ready"' in aggregated + assert 'wait_for_health "http://${NODE}:8000/ready" "http://${NODE}:8000/healthcheck"' not in aggregated vllm.cmd_args.prefill = VllmArgs() vllm.cmd_args.proxy_healthcheck = "/router-ready" vllm_tr.num_nodes = 2 disaggregated = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr)._gen_srun_command() assert 'wait_for_health "http://${PREFILL_NODE}:8000/router-ready"' in disaggregated + assert ( + 'wait_for_health "http://${PREFILL_NODE}:8000/router-ready" "http://${PREFILL_NODE}:8000/ready"' + not in disaggregated + ) + + def test_disagg_custom_healthcheck_preserves_legacy_proxy_endpoint( + self, vllm: VllmTestDefinition, vllm_tr: TestRun, slurm_system: SlurmSystem + ) -> None: + vllm.cmd_args.healthcheck = "/legacy-ready" + vllm.cmd_args.prefill = VllmArgs() + vllm_tr.test = vllm + vllm_tr.num_nodes = 2 + + disaggregated = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr)._gen_srun_command() + + assert 'wait_for_health "http://${PREFILL_NODE}:8000/legacy-ready"' in disaggregated class TestVllmDisaggregatedMode: From e884f15929eddce0b3d162535d94ec2c9cc28d50 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 13:31:17 +0200 Subject: [PATCH 14/26] remove redundant tests --- .../sglang/test_command_gen_strategy_slurm.py | 132 -------- .../vllm/test_command_gen_strategy_slurm.py | 296 ------------------ 2 files changed, 428 deletions(-) diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py index ec51737c5..01e6f5be6 100644 --- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py +++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py @@ -28,7 +28,6 @@ SglangSlurmCommandGenStrategy, SglangTestDefinition, ) -from cloudai.workloads.sglang.sglang import SGLANG_BENCH_JSONL_FILE, SGLANG_BENCH_LOG_FILE @pytest.fixture @@ -73,14 +72,6 @@ def test_container_mounts(sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy class TestGpuDetection: - @pytest.mark.parametrize("cuda_visible_devices", ["0", "0,1,2,3", "0,1,2,3,4,5,6,7"]) - def test_gpu_ids_from_cuda_visible_devices( - self, cuda_visible_devices: str, sglang_tr: TestRun, slurm_system: SlurmSystem - ) -> None: - sglang_tr.test.extra_env_vars = {"CUDA_VISIBLE_DEVICES": cuda_visible_devices} - strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_tr) - assert strategy.gpu_ids == [int(gpu_id) for gpu_id in cuda_visible_devices.split(",")] - def test_aggregated_gpu_ids_from_decode_config(self, sglang_tr: TestRun, slurm_system: SlurmSystem) -> None: tdef = cast(SglangTestDefinition, sglang_tr.test) tdef.extra_env_vars = {} @@ -98,60 +89,6 @@ def test_multinode_disagg_uses_shared_gpu_ids_per_role( assert strategy.decode_gpu_ids == [0, 1, 2, 3] -def test_get_sglang_serve_commands_aggregated(sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy) -> None: - cmd_args = sglang_cmd_gen_strategy.test_run.test.cmd_args - commands = sglang_cmd_gen_strategy.get_serve_commands() - - assert len(commands) == 1 - assert commands[0] == [ - "python3", - "-m", - cmd_args.serve_module, - "--model-path", - cmd_args.model, - "--host", - cmd_args.host, - "--port", - str(cmd_args.port), - ] - - -def test_get_sglang_serve_commands_disagg(sglang_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: - strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) - - commands = strategy.get_serve_commands() - - assert len(commands) == 2 - prefill_cmd, decode_cmd = commands - assert "--disaggregation-mode" in prefill_cmd - assert "prefill" in prefill_cmd - assert str(strategy.prefill_port) in prefill_cmd - - assert "--disaggregation-mode" in decode_cmd - assert "decode" in decode_cmd - assert str(strategy.decode_port) in decode_cmd - - -def test_get_sglang_bench_command_adds_pd_separated_in_disagg( - sglang_disagg_tr: TestRun, slurm_system: SlurmSystem -) -> None: - strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) - - command = strategy.get_bench_command() - - assert "--pd-separated" in command - - -def test_get_sglang_bench_command_writes_jsonl( - sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy, -) -> None: - command = sglang_cmd_gen_strategy.get_bench_command() - output_file_args = [part for part in command if part.startswith("--output-file ")] - assert len(output_file_args) == 1 - assert f"--base-url http://${{NODE}}:{sglang_cmd_gen_strategy.test_run.test.cmd_args.port}" in command - assert output_file_args[0].endswith(f"/{SGLANG_BENCH_JSONL_FILE}") - - def test_get_sglang_semantic_eval_command_defaults(sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy): sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test) sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs() @@ -211,49 +148,6 @@ def test_gen_srun_command_contains_sglang_semantic_eval_in_disagg( assert "python3 -m sglang.test.run_eval --host ${PREFILL_NODE} --port 8000" in srun_command -def test_gen_srun_command_contains_expected_flow(sglang_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: - strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) - - srun_command = strategy._gen_srun_command() - - assert "Starting SGLang instances" in srun_command - assert "Starting router" in srun_command - assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in srun_command - assert 'DECODE_NODES=( "${NODES[@]:0:1}" )' in srun_command - assert "PREFILL_NODE=${PREFILL_NODES[0]}" in srun_command - assert "DECODE_NODE=${DECODE_NODES[0]}" in srun_command - assert 'env CUDA_VISIBLE_DEVICES="0,1"' in srun_command - assert 'env CUDA_VISIBLE_DEVICES="2,3"' in srun_command - assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' in srun_command - assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command - assert "--prefill http://${PREFILL_NODE}:8100" in srun_command - assert "--decode http://${DECODE_NODE}:8200" in srun_command - assert "--base-url http://${PREFILL_NODE}:8000" in srun_command - assert f"--output={strategy.test_run.output_path.absolute()}/{SGLANG_BENCH_LOG_FILE}" in srun_command - - -def test_gen_srun_command_contains_expected_two_node_flow( - sglang_disagg_2node_tr: TestRun, slurm_system: SlurmSystem -) -> None: - strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_2node_tr) - - srun_command = strategy._gen_srun_command() - - assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in srun_command - assert 'DECODE_NODES=( "${NODES[@]:1:1}" )' in srun_command - assert "PREFILL_NODE=${PREFILL_NODES[0]}" in srun_command - assert "DECODE_NODE=${DECODE_NODES[0]}" in srun_command - assert srun_command.count('--nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1') == 3 - assert srun_command.count('--nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1') == 1 - assert 'env CUDA_VISIBLE_DEVICES="0,1,2,3"' in srun_command - assert srun_command.count("--host 0.0.0.0") >= 2 - assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' in srun_command - assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command - assert "--prefill http://${PREFILL_NODE}:8100" in srun_command - assert "--decode http://${DECODE_NODE}:8200" in srun_command - assert "--base-url http://${PREFILL_NODE}:8000" in srun_command - - def test_disagg_more_than_two_nodes_is_rejected(sglang_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: sglang_disagg_tr.num_nodes = 3 strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) @@ -262,25 +156,6 @@ def test_disagg_more_than_two_nodes_is_rejected(sglang_disagg_tr: TestRun, slurm _ = strategy._gen_srun_command() -def test_gen_srun_command_multinode_aggregated_uses_sglang_distributed_launch( - sglang: SglangTestDefinition, tmp_path: Path, slurm_system: SlurmSystem -) -> None: - sglang.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} - tr = TestRun(test=sglang, num_nodes=2, nodes=[], output_path=tmp_path, name="sglang-multinode-job") - strategy = SglangSlurmCommandGenStrategy(slurm_system, tr) - - srun_command = strategy._gen_srun_command() - - assert 'SERVE_NODES=( "${NODES[@]:0:2}" )' in srun_command - assert "export SERVE_DIST_INIT_PORT=$((20000 + PORT_OFFSET))" in srun_command - assert '--nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1' in srun_command - assert ( - '--dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_PROCID"' - in srun_command - ) - assert "bash -c" in srun_command - - def test_gen_srun_command_disagg_four_nodes_uses_separate_sglang_distributed_launches( sglang_disagg_tr: TestRun, slurm_system: SlurmSystem ) -> None: @@ -309,13 +184,6 @@ def test_gen_srun_command_disagg_four_nodes_uses_separate_sglang_distributed_lau ) -def test_gen_srun_command_contains_cuda_visible_devices_for_aggregated( - sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy, -) -> None: - srun_command = sglang_cmd_gen_strategy._gen_srun_command() - assert 'env CUDA_VISIBLE_DEVICES="0"' in srun_command - - def test_custom_bash_string_wraps_aggregated_serve_and_benchmark( sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy, ) -> None: diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 07154cc56..a7bc1ed70 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -29,7 +29,6 @@ VllmSlurmCommandGenStrategy, VllmTestDefinition, ) -from cloudai.workloads.vllm.vllm import VLLM_BENCH_JSON_FILE, VLLM_BENCH_LOG_FILE, VLLM_SERVE_LOG_FILE @pytest.fixture @@ -148,29 +147,6 @@ def test_nixl_threads( class TestVllmBenchCommand: - def test_get_vllm_bench_command(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: - tdef = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test) - cmd_args = tdef.cmd_args - bench_args = tdef.bench_cmd_args - - command = vllm_cmd_gen_strategy.get_bench_command() - - expected = [ - "vllm", - "bench", - "serve", - f"--model {cmd_args.model}", - f"--base-url http://${{NODE}}:{cmd_args.port}", - f"--random-input-len {bench_args.random_input_len}", - f"--random-output-len {bench_args.random_output_len}", - f"--max-concurrency {bench_args.max_concurrency}", - f"--num-prompts {bench_args.num_prompts}", - f"--result-dir {vllm_cmd_gen_strategy.test_run.output_path.absolute()}", - f"--result-filename {VLLM_BENCH_JSON_FILE}", - "--save-result", - ] - assert command == expected - def test_get_vllm_bench_command_with_extra_args( self, vllm: VllmTestDefinition, vllm_tr: TestRun, slurm_system: SlurmSystem ) -> None: @@ -251,14 +227,6 @@ def test_gen_srun_command_contains_vllm_semantic_eval_in_disagg( class TestVllmAggregatedMode: """Tests for vLLM non-disaggregated mode with 1 GPU.""" - def test_get_vllm_serve_commands_single_gpu(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: - cmd_args = vllm_cmd_gen_strategy.test_run.test.cmd_args - - commands = vllm_cmd_gen_strategy.get_serve_commands() - - assert len(commands) == 1 - assert commands[0] == ["vllm", "serve", cmd_args.model, "--host", cmd_args.host, "--port", str(cmd_args.port)] - def test_get_vllm_serve_commands_convert_boolean_flags( self, vllm: VllmTestDefinition, vllm_tr: TestRun, slurm_system: SlurmSystem ) -> None: @@ -279,24 +247,6 @@ def test_get_vllm_serve_commands_convert_boolean_flags( str(vllm.cmd_args.port), ] - def test_gen_srun_command_multinode_aggregated_uses_ray( - self, vllm: VllmTestDefinition, tmp_path: Path, slurm_system: SlurmSystem - ) -> None: - vllm.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} - tr = TestRun(test=vllm, num_nodes=2, nodes=[], output_path=tmp_path, name="vllm-multinode-job") - strategy = VllmSlurmCommandGenStrategy(slurm_system, tr) - - srun_command = strategy._gen_srun_command() - - assert "--distributed-executor-backend ray" in srun_command - assert 'SERVE_NODES=( "${NODES[@]:0:2}" )' in srun_command - assert "export SERVE_RAY_PORT=$((6379 + PORT_OFFSET))" in srun_command - assert "SERVE_RAY_PID=$!" in srun_command - assert 'sum(node["Alive"] for node in ray.nodes())' in srun_command - assert "ray.init(address=" not in srun_command - assert 'exec env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}" vllm serve' in srun_command - assert 'env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}"' in srun_command - def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: cmd_args = vllm_cmd_gen_strategy.test_run.test.cmd_args @@ -325,52 +275,6 @@ def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlur assert func == expected - def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: - tdef = vllm_cmd_gen_strategy.test_run.test - cmd_args = tdef.cmd_args - output_path = vllm_cmd_gen_strategy.test_run.output_path.absolute() - srun_prefix = " ".join(vllm_cmd_gen_strategy.gen_srun_prefix()) - serve_cmd = " ".join(vllm_cmd_gen_strategy.get_serve_commands()[0]) - bench_cmd = " ".join(vllm_cmd_gen_strategy.get_bench_command()) - health_func = vllm_cmd_gen_strategy.generate_wait_for_health_function() - - srun_command = vllm_cmd_gen_strategy._gen_srun_command() - - expected = f"""\ -cleanup() {{ - echo "Cleaning up PIDs: SERVE_PID=$SERVE_PID" - kill -TERM "$SERVE_PID" 2>/dev/null - i=0 - while kill -0 "$SERVE_PID" 2>/dev/null; do - [ "$i" -ge 15 ] && echo "PID did not exit in time" && return 1 - sleep 1 - i=$((i+1)) - done -}} -trap cleanup EXIT - -{health_func} - -echo "Starting vLLM instances..." -{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/{VLLM_SERVE_LOG_FILE} \\ - {serve_cmd} & -SERVE_PID=$! - -NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${{NODE}}:{cmd_args.port}/healthcheck" "http://${{NODE}}:{cmd_args.port}/health" || exit 1 - -echo "Running benchmark..." -{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/{VLLM_BENCH_LOG_FILE} \\ - {bench_cmd} - -cleanup -""" - - assert srun_command == expected - def test_custom_bash_string_wraps_aggregated_serve_and_benchmark( self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy ) -> None: @@ -430,179 +334,6 @@ def test_disagg_custom_healthcheck_preserves_legacy_proxy_endpoint( class TestVllmDisaggregatedMode: """Tests for vLLM disaggregated mode with multiple GPUs.""" - def test_prefill_gpu_ids(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: - """Prefill gets first half of GPUs.""" - strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) - assert strategy.prefill_gpu_ids == [0, 1] - - def test_decode_gpu_ids(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: - """Decode gets second half of GPUs.""" - strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) - assert strategy.decode_gpu_ids == [2, 3] - - def test_get_vllm_serve_commands_returns_two(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: - """Disagg mode returns prefill and decode commands.""" - strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) - cmd_args = vllm_disagg_tr.test.cmd_args - - commands = strategy.get_serve_commands() - - assert len(commands) == 2 - prefill_cmd, decode_cmd = commands - - assert prefill_cmd == [ - "vllm", - "serve", - cmd_args.model, - "--host", - cmd_args.host, - "--port", - str(cmd_args.port + 100), - "--kv-transfer-config", - '\'{"kv_connector":"NixlConnector","kv_role":"kv_producer"}\'', - ] - assert decode_cmd == [ - "vllm", - "serve", - cmd_args.model, - "--host", - cmd_args.host, - "--port", - str(cmd_args.port + 200), - "--kv-transfer-config", - '\'{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}\'', - ] - - def test_get_helper_command(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: - """Helper command routes to prefill and decode ports.""" - strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) - cmd_args = vllm_disagg_tr.test.cmd_args - - command = strategy.get_helper_command() - - assert command == [ - "python3", - cmd_args.proxy_script, - "--host", - cmd_args.host, - "--port", - str(cmd_args.port), - "--prefiller-hosts", - "${PREFILL_NODE}", - "--prefiller-ports", - str(cmd_args.port + 100), - "--decoder-hosts", - "${DECODE_NODE}", - "--decoder-ports", - str(cmd_args.port + 200), - ] - - def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: - """Disagg mode starts prefill, decode, and helper, waits for health checks.""" - strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) - cmd_args = vllm_disagg_tr.test.cmd_args - output_path = vllm_disagg_tr.output_path.absolute() - srun_prefix = " ".join(strategy.gen_srun_prefix()) - prefill_cmd, decode_cmd = strategy.get_serve_commands() - helper_cmd = strategy.get_helper_command() - bench_cmd = " ".join(strategy.get_bench_command()) - health_func = strategy.generate_wait_for_health_function() - prefill_gpus = ",".join(str(g) for g in strategy.prefill_gpu_ids) - decode_gpus = ",".join(str(g) for g in strategy.decode_gpu_ids) - prefill_env = ( - f'env CUDA_VISIBLE_DEVICES="{prefill_gpus}" ' - 'VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" ' - 'VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT"' - ) - decode_env = ( - f'env CUDA_VISIBLE_DEVICES="{decode_gpus}" ' - 'VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" ' - 'VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT"' - ) - - srun_command = strategy._gen_srun_command() - - expected = f"""\ -cleanup() {{ - echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID HELPER_PID=$HELPER_PID" - - for pid in "$PREFILL_PID" "$DECODE_PID" "$HELPER_PID"; do - [ -n "$pid" ] && kill -TERM "$pid" 2>/dev/null - done - - for pid in "$PREFILL_PID" "$DECODE_PID" "$HELPER_PID"; do - [ -z "$pid" ] && continue - i=0 - while kill -0 "$pid" 2>/dev/null; do - [ "$i" -ge 15 ] && echo "PID $pid did not exit in time" && return 1 - sleep 1 - i=$((i+1)) - done - done -}} -trap cleanup EXIT - -{health_func} - -export PORT_OFFSET=$((SLURM_JOB_ID % 1000)) -export PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) -export DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + {len(strategy.gpu_ids)})) - -NODES=( $(scontrol show hostname $SLURM_JOB_NODELIST) ) -export PREFILL_NODE=${{NODES[0]}} -export DECODE_NODE=${{NODES[1]:-${{PREFILL_NODE}}}} -if [ -z "$PREFILL_NODE" ]; then - echo "Failed to resolve allocated nodes for disaggregated vLLM" - exit 1 -fi -echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" - -echo "Starting vLLM instances..." -{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-prefill.log \\ - {prefill_env} {" ".join(prefill_cmd)} & -PREFILL_PID=$! - -{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-decode.log \\ - {decode_env} {" ".join(decode_cmd)} & -DECODE_PID=$! - -echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${{PREFILL_NODE}}:{cmd_args.port + 100}/health" || exit 1 -wait_for_health "http://${{DECODE_NODE}}:{cmd_args.port + 200}/health" || exit 1 - -echo "Starting router..." -{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-router.log \\ - {" ".join(helper_cmd)} & -HELPER_PID=$! - -echo "Waiting for vLLM on $PREFILL_NODE server to be ready..." -wait_for_health "http://${{PREFILL_NODE}}:{cmd_args.port}/healthcheck" || exit 1 - -echo "Running benchmark..." -{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/{VLLM_BENCH_LOG_FILE} \\ - {bench_cmd} - -cleanup -""" - - del expected - assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in srun_command - assert 'DECODE_NODES=( "${NODES[@]:0:1}" )' in srun_command - assert '--nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1' in srun_command - assert '--nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1' in srun_command - assert f"--output={output_path}/vllm-prefill.log" in srun_command - assert f"--output={output_path}/vllm-decode.log" in srun_command - assert f"{prefill_env} {' '.join(prefill_cmd)}" in srun_command - assert f"{decode_env} {' '.join(decode_cmd)}" in srun_command - assert f"--output={output_path}/vllm-router.log" in srun_command - assert " ".join(helper_cmd) in srun_command - assert f"--output={output_path}/{VLLM_BENCH_LOG_FILE}" in srun_command - assert bench_cmd in srun_command - def test_custom_bash_regex_can_target_disaggregated_commands( self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem ) -> None: @@ -623,33 +354,6 @@ def test_custom_bash_regex_can_target_disaggregated_commands( assert "echo router setup; exec python3" in srun_command assert "echo bench setup; exec vllm bench serve" in srun_command - def test_gen_srun_command_disagg_two_nodes_flow( - self, vllm_disagg_2node_tr: TestRun, slurm_system: SlurmSystem - ) -> None: - strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_2node_tr) - - srun_command = strategy._gen_srun_command() - - assert 'PREFILL_NODES=( "${NODES[@]:0:1}" )' in srun_command - assert 'DECODE_NODES=( "${NODES[@]:1:1}" )' in srun_command - assert "PREFILL_NODE=${PREFILL_NODES[0]}" in srun_command - assert "DECODE_NODE=${DECODE_NODES[0]}" in srun_command - assert srun_command.count('--nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1') == 3 - assert srun_command.count('--nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1') == 1 - assert ( - 'env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" ' - 'VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT"' - ) in srun_command - assert ( - 'env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" ' - 'VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT"' - ) in srun_command - assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' in srun_command - assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command - assert "--prefiller-hosts ${PREFILL_NODE}" in srun_command - assert "--decoder-hosts ${DECODE_NODE}" in srun_command - assert "--base-url http://${PREFILL_NODE}:8000" in srun_command - def test_disagg_more_than_two_nodes_is_rejected(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: vllm_disagg_tr.num_nodes = 3 strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) From 56a91d678bf3f4207bcaa784b48ee5220f25d2c9 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 14:03:48 +0200 Subject: [PATCH 15/26] Address LLM multi-node review comments --- conf/experimental/vllm/test/vllm-heavy.toml | 2 +- doc/workloads/sglang.rst | 10 +++-- doc/workloads/vllm.rst | 10 +++-- src/cloudai/workloads/common/llm_serving.py | 12 ++++-- .../vllm/slurm_command_gen_strategy.py | 39 +++++++++++++++++++ src/cloudai/workloads/vllm/vllm.py | 19 ++++++++- tests/ref_data/sglang-multinode.sbatch | 2 +- tests/ref_data/vllm-disagg-2nodes.sbatch | 4 +- tests/ref_data/vllm-disagg.sbatch | 4 +- tests/ref_data/vllm-multinode.sbatch | 8 +++- tests/test_acceptance.py | 4 +- .../vllm/test_command_gen_strategy_slurm.py | 4 ++ tests/workloads/vllm/test_workload.py | 32 +++++++++++++++ 13 files changed, 129 insertions(+), 21 deletions(-) diff --git a/conf/experimental/vllm/test/vllm-heavy.toml b/conf/experimental/vllm/test/vllm-heavy.toml index 193d63c38..5c4070bf9 100644 --- a/conf/experimental/vllm/test/vllm-heavy.toml +++ b/conf/experimental/vllm/test/vllm-heavy.toml @@ -20,7 +20,7 @@ test_template_name = "vllm" [[git_repos]] url = "https://github.com/vllm-project/vllm.git" -commit = "main" +commit = "a8887c208f34c04c3b021cf3949ed6545d77bb01" mount_as = "/vllm_repo" [cmd_args] diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst index ab1d3297c..68a27a2ec 100644 --- a/doc/workloads/sglang.rst +++ b/doc/workloads/sglang.rst @@ -145,6 +145,8 @@ Multi-node serving For non-disaggregated serving, set ``num_nodes`` on the test to more than one. CloudAI starts one ``sglang.launch_server`` task per serving node with a shared ``--dist-init-addr``, ``--nnodes``, and ``--node-rank "$SLURM_NODEID"``. +SGLang ``tp`` is the total tensor-parallel size for the distributed serving role. With two nodes and +``CUDA_VISIBLE_DEVICES = "0,1,2,3"`` on each node, set ``tp = 8`` to use all eight visible GPUs. .. code-block:: toml :caption: scenario.toml (multi-node aggregated serving) @@ -159,7 +161,7 @@ For non-disaggregated serving, set ``num_nodes`` on the test to more than one. C model = "Qwen/Qwen3-8B" [Tests.cmd_args.decode] - tp = 2 + tp = 8 [Tests.extra_env_vars] CUDA_VISIBLE_DEVICES = "0,1,2,3" @@ -168,6 +170,8 @@ For disaggregated prefill/decode serving, existing 1-node and 2-node behavior is than two nodes, set both role sizes explicitly. CloudAI assigns contiguous node slices to prefill and decode and starts one distributed SGLang launch per role with separate init ports. Benchmark and semantic validation run from the prefill head node. +Role ``tp`` values are total per distributed role, not per node. For example, ``num_nodes = 2`` with four visible GPUs +per node uses ``tp = 8`` to consume all GPUs in that role. .. code-block:: toml :caption: scenario.toml (multi-node disaggregated serving) @@ -183,11 +187,11 @@ head node. [Tests.cmd_args.prefill] num_nodes = 2 - tp = 2 + tp = 8 [Tests.cmd_args.decode] num_nodes = 2 - tp = 2 + tp = 8 [Tests.extra_env_vars] CUDA_VISIBLE_DEVICES = "0,1,2,3" diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index da68eb8a6..d1e86027a 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -142,6 +142,8 @@ Multi-node serving For non-disaggregated serving, set ``num_nodes`` on the test to more than one. CloudAI starts a Ray head on the first allocated serving node, Ray workers on the remaining serving nodes, waits for the Ray cluster to reach the requested size, and runs ``vllm serve`` with ``--distributed-executor-backend ray`` on the head node. +``tensor_parallel_size`` is the total tensor-parallel size across the Ray serving role. With two nodes and +``CUDA_VISIBLE_DEVICES = "0,1,2,3"`` on each node, set ``tensor_parallel_size = 8`` to use all eight visible GPUs. .. code-block:: toml :caption: scenario.toml (multi-node aggregated serving) @@ -156,7 +158,7 @@ size, and runs ``vllm serve`` with ``--distributed-executor-backend ray`` on the model = "Qwen/Qwen3-0.6B" [Tests.cmd_args.decode] - tensor_parallel_size = 2 + tensor_parallel_size = 8 [Tests.extra_env_vars] CUDA_VISIBLE_DEVICES = "0,1,2,3" @@ -165,6 +167,8 @@ For disaggregated prefill/decode serving, existing 1-node and 2-node behavior is than two nodes, set both role sizes explicitly. CloudAI assigns contiguous node slices to prefill and decode, creates a separate Ray cluster for each role whose ``num_nodes`` is greater than one, and runs benchmark and semantic validation from the prefill head node. +Role ``tensor_parallel_size`` values are total per Ray role, not per node. For example, ``num_nodes = 2`` with four +visible GPUs per node uses ``tensor_parallel_size = 8`` to consume all GPUs in that role. .. code-block:: toml :caption: scenario.toml (multi-node disaggregated serving) @@ -180,11 +184,11 @@ from the prefill head node. [Tests.cmd_args.prefill] num_nodes = 2 - tensor_parallel_size = 2 + tensor_parallel_size = 8 [Tests.cmd_args.decode] num_nodes = 2 - tensor_parallel_size = 2 + tensor_parallel_size = 8 [Tests.extra_env_vars] CUDA_VISIBLE_DEVICES = "0,1,2,3" diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index f99be5591..bb0d33e8d 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -561,8 +561,7 @@ def generate_wait_for_health_function(self) -> str: return 1 }}""" - @staticmethod - def generate_cleanup_function(pid_vars: list[str], timeout: int = 15) -> str: + def generate_cleanup_function(self, pid_vars: list[str], timeout: int = 15) -> str: if len(pid_vars) == 1: pid_var = pid_vars[0] return f"""\ @@ -652,6 +651,11 @@ def proxy_router_healthcheck(self) -> str: """Healthcheck endpoint for the helper/proxy process in disaggregated mode.""" return self.tdef.cmd_args.healthcheck + @property + def role_server_healthcheck(self) -> str: + """Healthcheck endpoint for prefill/decode server processes in disaggregated mode.""" + return "/health" + @property def bench_log_file(self) -> str: """Benchmark log file name.""" @@ -836,8 +840,8 @@ def _gen_disaggregated_script(self, serve_commands: list[list[str]], bench_cmd: wait_block = self.generate_wait_for_health_block( self.workload_name, [ - f"http://{self.disaggregated_role_host('prefill')}:{self.prefill_port}/health", - f"http://{self.disaggregated_role_host('decode')}:{self.decode_port}/health", + f"http://{self.disaggregated_role_host('prefill')}:{self.prefill_port}{self.role_server_healthcheck}", + f"http://{self.disaggregated_role_host('decode')}:{self.decode_port}{self.role_server_healthcheck}", ], host_setup="", host_display="$PREFILL_NODE and $DECODE_NODE", diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index c090d2405..c70d1760f 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -125,6 +125,10 @@ def disaggregated_cleanup_pid_vars(self) -> list[str]: pid_vars.insert(insert_at, "DECODE_RAY_PID") return pid_vars + @property + def role_server_healthcheck(self) -> str: + return self.tdef.cmd_args.healthcheck or "/health" + @property def proxy_router_healthcheck(self) -> str: fields_set = self.tdef.cmd_args.model_fields_set @@ -132,6 +136,41 @@ def proxy_router_healthcheck(self) -> str: return self.tdef.cmd_args.healthcheck return self.tdef.cmd_args.proxy_healthcheck + def _ray_stop_cleanup_block(self) -> str: + role_specs: list[tuple[str, str, int]] = [] + if not self.is_disaggregated and self._needs_ray("serve"): + role_specs.append(("SERVE_NODELIST", "serve", self.role_node_count("serve"))) + if self.is_disaggregated: + if self._needs_ray("prefill"): + role_specs.append(("PREFILL_NODELIST", "prefill", self.role_node_count("prefill"))) + if self._needs_ray("decode"): + role_specs.append(("DECODE_NODELIST", "decode", self.role_node_count("decode"))) + + if not role_specs: + return "" + + lines = [' echo "Stopping Ray clusters..."'] + for nodelist_var, role, node_count in role_specs: + stop_prefix = self._role_srun_prefix(f"${{{nodelist_var}}}", node_count, node_count) + stop_command = f"{stop_prefix} bash -lc 'ray stop --force >/dev/null 2>&1 || true' >/dev/null 2>&1 || true" + lines.extend( + [ + f' if [ -n "${{{nodelist_var}:-}}" ]; then', + f" {stop_command}", + " else", + f' echo "Skipping Ray stop for {role}: node list is not set"', + " fi", + ] + ) + return "\n".join(lines) + + def generate_cleanup_function(self, pid_vars: list[str], timeout: int = 15) -> str: + cleanup = super().generate_cleanup_function(pid_vars, timeout) + ray_stop_block = self._ray_stop_cleanup_block() + if not ray_stop_block: + return cleanup + return cleanup.replace("cleanup() {\n", f"cleanup() {{\n{ray_stop_block}\n", 1) + @staticmethod def _compat_health_args(endpoint: str) -> str: endpoints = [endpoint] diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 3db215bb6..34b86942f 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -174,8 +174,23 @@ def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: if num_nodes > 2: prefill_nodes_value = self.cmd_args.prefill.num_nodes decode_nodes_value = self.cmd_args.decode.num_nodes - prefill_nodes = prefill_nodes_value if isinstance(prefill_nodes_value, int) else prefill_nodes - decode_nodes = decode_nodes_value if isinstance(decode_nodes_value, int) else decode_nodes + if not isinstance(prefill_nodes_value, int) or not isinstance(decode_nodes_value, int): + logging.error( + "vLLM disaggregated mode over more than 2 nodes requires both prefill.num_nodes and " + "decode.num_nodes." + ) + return False + if prefill_nodes_value + decode_nodes_value != num_nodes: + logging.error( + "vLLM disaggregated role node counts must sum to allocated nodes. prefill=%s decode=%s " + "allocated=%s", + prefill_nodes_value, + decode_nodes_value, + num_nodes, + ) + return False + prefill_nodes = prefill_nodes_value + decode_nodes = decode_nodes_value return self._validate_vllm_parallelism_constraints( role="prefill", diff --git a/tests/ref_data/sglang-multinode.sbatch b/tests/ref_data/sglang-multinode.sbatch index d8a902d79..40cedac0c 100644 --- a/tests/ref_data/sglang-multinode.sbatch +++ b/tests/ref_data/sglang-multinode.sbatch @@ -61,7 +61,7 @@ echo "Node roles: serve=${SERVE_NODES[*]}" echo "Starting SGLang instances..." srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-serve.log-%N \ - bash -c 'exec env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 --tp 2 --dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_PROCID"' & + bash -c 'exec env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 --tp 8 --dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_PROCID"' & SERVE_PID=$! echo "Waiting for SGLang on $NODE to be ready..." diff --git a/tests/ref_data/vllm-disagg-2nodes.sbatch b/tests/ref_data/vllm-disagg-2nodes.sbatch index 47b383e72..ff4a863c8 100644 --- a/tests/ref_data/vllm-disagg-2nodes.sbatch +++ b/tests/ref_data/vllm-disagg-2nodes.sbatch @@ -86,8 +86,8 @@ srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --cont DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/health" "http://${PREFILL_NODE}:8400/healthcheck" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/health" "http://${DECODE_NODE}:8500/healthcheck" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/healthcheck" "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/healthcheck" "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 627c86950..a3d471108 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -86,8 +86,8 @@ srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --cont DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/health" "http://${PREFILL_NODE}:8400/healthcheck" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/health" "http://${DECODE_NODE}:8500/healthcheck" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/healthcheck" "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/healthcheck" "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ diff --git a/tests/ref_data/vllm-multinode.sbatch b/tests/ref_data/vllm-multinode.sbatch index 0c00ff1cd..c3bf5648f 100644 --- a/tests/ref_data/vllm-multinode.sbatch +++ b/tests/ref_data/vllm-multinode.sbatch @@ -15,6 +15,12 @@ srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest -- srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { + echo "Stopping Ray clusters..." + if [ -n "${SERVE_NODELIST:-}" ]; then + srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1 bash -lc 'ray stop --force >/dev/null 2>&1 || true' >/dev/null 2>&1 || true + else + echo "Skipping Ray stop for serve: node list is not set" + fi echo "Cleaning up PIDs: SERVE_RAY_PID=$SERVE_RAY_PID SERVE_PID=$SERVE_PID" for pid in "$SERVE_RAY_PID" "$SERVE_PID"; do @@ -99,7 +105,7 @@ for (( i=0; i < 300; i+=5 )); do if [ "$active_nodes" -eq "2" ]; then echo "All Ray workers are active: $active_nodes/2" ray status || true - exec env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --tensor-parallel-size 2 --port 8300 --distributed-executor-backend ray + exec env RAY_ADDRESS="${SERVE_NODE}:${SERVE_RAY_PORT}" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --tensor-parallel-size 8 --port 8300 --distributed-executor-backend ray fi echo "Waiting for Ray workers: $active_nodes/2 active" sleep 5 diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index dc3a399b5..2287876fb 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -614,7 +614,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=VllmCmdArgs( docker_image_url="nvcr.io/nvidia/vllm:latest", model="Qwen/Qwen3-0.6B", - decode=VllmArgs.model_validate({"tensor_parallel_size": 2}), + decode=VllmArgs.model_validate({"tensor_parallel_size": 8}), ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, ), @@ -643,7 +643,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=SglangCmdArgs( docker_image_url="docker.io/lmsysorg/sglang:dev", model="Qwen/Qwen3-8B", - decode=SglangArgs.model_validate({"tp": 2}), + decode=SglangArgs.model_validate({"tp": 8}), ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, ), diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index a7bc1ed70..46e468bf5 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -312,6 +312,10 @@ def test_custom_healthcheck_endpoints( vllm.cmd_args.proxy_healthcheck = "/router-ready" vllm_tr.num_nodes = 2 disaggregated = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr)._gen_srun_command() + assert 'wait_for_health "http://${PREFILL_NODE}:8100/ready"' in disaggregated + assert 'wait_for_health "http://${DECODE_NODE}:8200/ready"' in disaggregated + assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' not in disaggregated + assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' not in disaggregated assert 'wait_for_health "http://${PREFILL_NODE}:8000/router-ready"' in disaggregated assert ( 'wait_for_health "http://${PREFILL_NODE}:8000/router-ready" "http://${PREFILL_NODE}:8000/ready"' diff --git a/tests/workloads/vllm/test_workload.py b/tests/workloads/vllm/test_workload.py index 44d5a7584..78e7d23d1 100644 --- a/tests/workloads/vllm/test_workload.py +++ b/tests/workloads/vllm/test_workload.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pytest + from cloudai.core import GitRepo, TestRun from cloudai.systems.slurm import SlurmSystem from cloudai.workloads.vllm import VllmArgs, VllmCmdArgs, VllmTestDefinition @@ -151,3 +153,33 @@ def test_constraint_check_uses_role_nodes_for_multinode_disagg(tmp_path, slurm_s slurm_system.gpus_per_node = 4 assert tdef.constraint_check(tr, slurm_system) is True + + +@pytest.mark.parametrize( + ("prefill_nodes", "decode_nodes"), + [ + (None, 2), + (2, None), + (1, 1), + ], +) +def test_constraint_check_rejects_invalid_multinode_disagg_role_nodes( + prefill_nodes: int | None, + decode_nodes: int | None, + tmp_path, + slurm_system: SlurmSystem, +) -> None: + tdef = VllmTestDefinition( + name="test", + description="test", + test_template_name="vllm", + cmd_args=VllmCmdArgs( + docker_image_url="test_url", + prefill=VllmArgs.model_validate({"num_nodes": prefill_nodes}), + decode=VllmArgs.model_validate({"num_nodes": decode_nodes}), + ), + ) + tr = TestRun(name="vllm", test=tdef, num_nodes=4, nodes=[], output_path=tmp_path) + slurm_system.gpus_per_node = 4 + + assert tdef.constraint_check(tr, slurm_system) is False From 9ea8e1774962c3ab44fb645e772a25060b65effd Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 16:09:28 +0200 Subject: [PATCH 16/26] Keep vLLM healthcheck compatibility --- .../sglang/test_scenario/sglang.toml | 37 +++++++----- .../experimental/vllm/test_scenario/vllm.toml | 31 +--------- doc/workloads/sglang.rst | 7 +++ doc/workloads/vllm.rst | 19 +++--- src/cloudai/workloads/common/llm_serving.py | 19 +++--- .../vllm/slurm_command_gen_strategy.py | 58 +++---------------- tests/ref_data/sglang-disagg-2nodes.sbatch | 18 +++--- tests/ref_data/sglang-disagg.sbatch | 18 +++--- tests/ref_data/sglang-multinode.sbatch | 10 ++-- tests/ref_data/vllm-disagg-2nodes.sbatch | 34 +++++------ tests/ref_data/vllm-disagg.sbatch | 34 +++++------ tests/ref_data/vllm-multinode.sbatch | 30 +++++----- tests/ref_data/vllm.sbatch | 16 +++-- .../vllm/test_command_gen_strategy_slurm.py | 30 ++++++---- 14 files changed, 154 insertions(+), 207 deletions(-) diff --git a/conf/experimental/sglang/test_scenario/sglang.toml b/conf/experimental/sglang/test_scenario/sglang.toml index f610d5ffe..9441fc56d 100644 --- a/conf/experimental/sglang/test_scenario/sglang.toml +++ b/conf/experimental/sglang/test_scenario/sglang.toml @@ -23,7 +23,16 @@ num_nodes = 2 time_limit = "00:10:00" [Tests.cmd_args.decode] - tp = 8 + tensor_parallel_size = 2 + mem_fraction_static = 0.75 + +[[Tests]] +id = "sglang.agg.1node" +test_name = "sglang" +num_nodes = 1 +time_limit = "00:10:00" + + [Tests.cmd_args.decode] mem_fraction_static = 0.75 [[Tests]] @@ -34,40 +43,40 @@ time_limit = "00:10:00" [Tests.cmd_args.prefill] gpu_ids = "0,1" - tp = 2 + tensor_parallel_size = 2 mem_fraction_static = 0.75 [Tests.cmd_args.decode] gpu_ids = "2,3" - tp = 2 + tensor_parallel_size = 2 mem_fraction_static = 0.75 [[Tests]] -id = "sglang.disagg.2nodes" +id = "sglang.disagg.async" test_name = "sglang" -num_nodes = 2 +num_nodes = 1 time_limit = "00:10:00" [Tests.cmd_args.prefill] - tp = 4 + gpu_ids = "0,1" + tensor_parallel_size = 2 mem_fraction_static = 0.75 [Tests.cmd_args.decode] - tp = 4 + gpu_ids = "2,3" + tensor_parallel_size = 2 mem_fraction_static = 0.75 [[Tests]] -id = "sglang.disagg.4nodes" +id = "sglang.disagg.2nodes" test_name = "sglang" -num_nodes = 4 -time_limit = "00:30:00" +num_nodes = 2 +time_limit = "00:10:00" [Tests.cmd_args.prefill] - num_nodes = 2 - tp = 8 + tensor_parallel_size = 4 mem_fraction_static = 0.75 [Tests.cmd_args.decode] - num_nodes = 2 - tp = 8 + tensor_parallel_size = 4 mem_fraction_static = 0.75 diff --git a/conf/experimental/vllm/test_scenario/vllm.toml b/conf/experimental/vllm/test_scenario/vllm.toml index 430ae3ff4..8e1207221 100644 --- a/conf/experimental/vllm/test_scenario/vllm.toml +++ b/conf/experimental/vllm/test_scenario/vllm.toml @@ -16,17 +16,6 @@ name = "vllm" -[[Tests]] -id = "vllm.agg.2nodes" -test_name = "vllm" -num_nodes = 2 -time_limit = "00:30:00" - - [Tests.cmd_args.decode] - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 1024 - [[Tests]] id = "vllm.disagg.sync" test_name = "vllm" @@ -44,7 +33,7 @@ time_limit = "00:30:00" max_num_batched_tokens = 1024 [[Tests]] -id = "vllm.disagg.1node" +id = "vllm.disagg.async" test_name = "vllm" num_nodes = 1 time_limit = "00:10:00" @@ -60,21 +49,3 @@ time_limit = "00:10:00" enforce_eager = "" tensor_parallel_size = 2 max_num_batched_tokens = 1024 - -[[Tests]] -id = "vllm.disagg.4nodes" -test_name = "vllm" -num_nodes = 4 -time_limit = "00:30:00" - - [Tests.cmd_args.prefill] - num_nodes = 2 - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 1024 - - [Tests.cmd_args.decode] - num_nodes = 2 - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 1024 diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst index 68a27a2ec..f4537fe39 100644 --- a/doc/workloads/sglang.rst +++ b/doc/workloads/sglang.rst @@ -96,6 +96,13 @@ The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{o placeholders. +Readiness health checks +----------------------- +CloudAI waits for SGLang servers to become ready before starting the benchmark. The default SGLang readiness endpoint is +``/v1/models``. Set ``serve_healthcheck`` to override the endpoint used for aggregated serve processes and +disaggregated prefill/decode server processes. In disaggregated mode, the router readiness check uses ``healthcheck``. + + Control number of GPUs ---------------------- The number of GPUs can be controlled using the options below, listed from lowest to highest priority: diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index d1e86027a..c250f37f1 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -200,16 +200,19 @@ IDs. Readiness health checks ----------------------- CloudAI waits for vLLM servers to become ready before starting the benchmark. The default vLLM server endpoint remains -``/healthcheck`` for backward compatibility with existing configs and runtime images. For vLLM images that expose the -newer ``/health`` endpoint, generated Slurm scripts also try the matching compatibility endpoint when either -``/healthcheck`` or ``/health`` is configured. +``/healthcheck`` for backward compatibility with existing configs and runtime images. Generated Slurm scripts wait for +the configured endpoint exactly. -In disaggregated mode, ``healthcheck`` controls the prefill/decode vLLM server readiness endpoint, while -``proxy_healthcheck`` controls the proxy/router readiness endpoint. Existing disaggregated configs that set -``healthcheck`` and do not set ``proxy_healthcheck`` continue to use ``healthcheck`` for the proxy/router check. +Use ``serve_healthcheck`` to override the readiness endpoint for the vLLM serve process, including prefill/decode server +processes in disaggregated mode. If ``serve_healthcheck`` is not set, aggregated serving uses ``healthcheck``. +Disaggregated prefill/decode serving keeps the legacy ``/health`` default. -For custom runtime images with a different readiness path, set ``healthcheck`` and, when using disaggregated mode, -``proxy_healthcheck`` explicitly. Custom paths are used as configured. +In disaggregated mode, ``proxy_healthcheck`` controls the proxy/router readiness endpoint. Existing disaggregated +configs that set ``healthcheck`` and do not set ``proxy_healthcheck`` continue to use ``healthcheck`` for the +proxy/router check. + +For custom runtime images with a different readiness path, set ``serve_healthcheck`` for vLLM server processes and, +when using disaggregated mode, ``proxy_healthcheck`` for the proxy/router. Controlling ``proxy_script`` diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index bb0d33e8d..7abe4430b 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -149,7 +149,11 @@ class LLMServingCmdArgs(CmdArgs, Generic[LLMServingArgsT]): default=None, description="Hostname used by the benchmark client. Defaults to the allocated node hostname.", ) - healthcheck: str = Field(default="") + healthcheck: str = Field(default="/health") + serve_healthcheck: str | None = Field( + default=None, + description="Readiness endpoint for serve, prefill, and decode server processes. Defaults to healthcheck.", + ) serve_wait_seconds: int = 300 prefill: LLMServingArgsT | None = Field(default=None) decode: LLMServingArgsT @@ -651,10 +655,9 @@ def proxy_router_healthcheck(self) -> str: """Healthcheck endpoint for the helper/proxy process in disaggregated mode.""" return self.tdef.cmd_args.healthcheck - @property - def role_server_healthcheck(self) -> str: - """Healthcheck endpoint for prefill/decode server processes in disaggregated mode.""" - return "/health" + def serve_healthcheck(self, role: str) -> str: + """Healthcheck endpoint for serve, prefill, and decode server processes.""" + return self.tdef.cmd_args.serve_healthcheck or self.tdef.cmd_args.healthcheck @property def bench_log_file(self) -> str: @@ -783,7 +786,7 @@ def _gen_aggregated_script(self, serve_cmd: list[str], bench_cmd: str) -> str: health_func = self.generate_wait_for_health_function() wait_block = self.generate_wait_for_health_block( self.workload_name, - [f"http://${{NODE}}:{self.serve_port}{self.tdef.cmd_args.healthcheck}"], + [f"http://${{NODE}}:{self.serve_port}{self.serve_healthcheck('serve')}"], host_setup=host_setup, ) node_setup = self.generate_aggregated_node_setup(serve_node_count) @@ -840,8 +843,8 @@ def _gen_disaggregated_script(self, serve_commands: list[list[str]], bench_cmd: wait_block = self.generate_wait_for_health_block( self.workload_name, [ - f"http://{self.disaggregated_role_host('prefill')}:{self.prefill_port}{self.role_server_healthcheck}", - f"http://{self.disaggregated_role_host('decode')}:{self.decode_port}{self.role_server_healthcheck}", + f"http://{self.disaggregated_role_host('prefill')}:{self.prefill_port}{self.serve_healthcheck('prefill')}", + f"http://{self.disaggregated_role_host('decode')}:{self.decode_port}{self.serve_healthcheck('decode')}", ], host_setup="", host_display="$PREFILL_NODE and $DECODE_NODE", diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index c70d1760f..13e310e49 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -125,10 +125,6 @@ def disaggregated_cleanup_pid_vars(self) -> list[str]: pid_vars.insert(insert_at, "DECODE_RAY_PID") return pid_vars - @property - def role_server_healthcheck(self) -> str: - return self.tdef.cmd_args.healthcheck or "/health" - @property def proxy_router_healthcheck(self) -> str: fields_set = self.tdef.cmd_args.model_fields_set @@ -136,6 +132,13 @@ def proxy_router_healthcheck(self) -> str: return self.tdef.cmd_args.healthcheck return self.tdef.cmd_args.proxy_healthcheck + def serve_healthcheck(self, role: str) -> str: + if self.tdef.cmd_args.serve_healthcheck: + return self.tdef.cmd_args.serve_healthcheck + if role in {"prefill", "decode"}: + return "/health" + return self.tdef.cmd_args.healthcheck + def _ray_stop_cleanup_block(self) -> str: role_specs: list[tuple[str, str, int]] = [] if not self.is_disaggregated and self._needs_ray("serve"): @@ -171,53 +174,6 @@ def generate_cleanup_function(self, pid_vars: list[str], timeout: int = 15) -> s return cleanup return cleanup.replace("cleanup() {\n", f"cleanup() {{\n{ray_stop_block}\n", 1) - @staticmethod - def _compat_health_args(endpoint: str) -> str: - endpoints = [endpoint] - if endpoint.endswith("/health"): - endpoints.append(endpoint[: -len("/health")] + "/healthcheck") - elif endpoint.endswith("/healthcheck"): - endpoints.append(endpoint[: -len("/healthcheck")] + "/health") - return " ".join(f'"{value}"' for value in endpoints) - - def generate_wait_for_health_function(self) -> str: - return f"""\ -wait_for_health() {{ - local endpoints=("$@") - local timeout={self.tdef.cmd_args.serve_wait_seconds} - local interval=5 - local end_time=$(($(date +%s) + timeout)) - - while [ "$(date +%s)" -lt "$end_time" ]; do - for endpoint in "${{endpoints[@]}}"; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi - done - sleep "$interval" - done - - echo "Timeout waiting for: ${{endpoints[*]}}" - return 1 -}}""" - - @staticmethod - def generate_wait_for_health_block( - service_name: str, - endpoints: list[str], - *, - host_setup: str = "NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)\n", - host_display: str = "$NODE", - ) -> str: - waits = "\n".join( - f"wait_for_health {VllmSlurmCommandGenStrategy._compat_health_args(endpoint)} || exit 1" - for endpoint in endpoints - ) - return f"""\ -{host_setup}echo "Waiting for {service_name} on {host_display} to be ready..." -{waits}""" - def render_serve_launch( self, role: str, diff --git a/tests/ref_data/sglang-disagg-2nodes.sbatch b/tests/ref_data/sglang-disagg-2nodes.sbatch index c012d31d2..47cc35d88 100644 --- a/tests/ref_data/sglang-disagg-2nodes.sbatch +++ b/tests/ref_data/sglang-disagg-2nodes.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=job_name +#SBATCH --job-name=__JOB_NAME__ #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID HELPER_PID=$HELPER_PID" @@ -69,22 +69,22 @@ fi echo "Node roles: prefill=${PREFILL_NODES[*]} decode=${DECODE_NODES[*]}" echo "Starting SGLang instances..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-prefill.log \ env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl & PREFILL_PID=$! -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-decode.log \ env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl & DECODE_PID=$! echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/v1/models" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/v1/models" || exit 1 echo "Starting router..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-router.log \ python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 & HELPER_PID=$! @@ -93,7 +93,7 @@ echo "Waiting for SGLang on $PREFILL_NODE server to be ready..." wait_for_health "http://${PREFILL_NODE}:8300/v1/models" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated diff --git a/tests/ref_data/sglang-disagg.sbatch b/tests/ref_data/sglang-disagg.sbatch index dd94deb2b..2c11daaba 100644 --- a/tests/ref_data/sglang-disagg.sbatch +++ b/tests/ref_data/sglang-disagg.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=job_name +#SBATCH --job-name=__JOB_NAME__ #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID HELPER_PID=$HELPER_PID" @@ -69,22 +69,22 @@ fi echo "Node roles: prefill=${PREFILL_NODES[*]} decode=${DECODE_NODES[*]}" echo "Starting SGLang instances..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-prefill.log \ env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl & PREFILL_PID=$! -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-decode.log \ env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl & DECODE_PID=$! echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/v1/models" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/v1/models" || exit 1 echo "Starting router..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-router.log \ python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 & HELPER_PID=$! @@ -93,7 +93,7 @@ echo "Waiting for SGLang on $PREFILL_NODE server to be ready..." wait_for_health "http://${PREFILL_NODE}:8300/v1/models" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated diff --git a/tests/ref_data/sglang-multinode.sbatch b/tests/ref_data/sglang-multinode.sbatch index 40cedac0c..e58989fbd 100644 --- a/tests/ref_data/sglang-multinode.sbatch +++ b/tests/ref_data/sglang-multinode.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=job_name +#SBATCH --job-name=__JOB_NAME__ #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N2 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: SERVE_PID=$SERVE_PID" @@ -59,7 +59,7 @@ SERVE_NODELIST=$(IFS=,; echo "${SERVE_NODES[*]}") echo "Node roles: serve=${SERVE_NODES[*]}" echo "Starting SGLang instances..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-serve.log-%N \ bash -c 'exec env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 --tp 8 --dist-init-addr "${SERVE_NODE}:${SERVE_DIST_INIT_PORT}" --nnodes 2 --node-rank "$SLURM_PROCID"' & SERVE_PID=$! @@ -68,7 +68,7 @@ echo "Waiting for SGLang on $NODE to be ready..." wait_for_health "http://${NODE}:8300/v1/models" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details diff --git a/tests/ref_data/vllm-disagg-2nodes.sbatch b/tests/ref_data/vllm-disagg-2nodes.sbatch index ff4a863c8..703e48da4 100644 --- a/tests/ref_data/vllm-disagg-2nodes.sbatch +++ b/tests/ref_data/vllm-disagg-2nodes.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=job_name +#SBATCH --job-name=__JOB_NAME__ #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID HELPER_PID=$HELPER_PID" @@ -34,22 +34,20 @@ cleanup() { trap cleanup EXIT wait_for_health() { - local endpoints=("$@") + local endpoint="$1" local timeout=300 local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - for endpoint in "${endpoints[@]}"; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi - done + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi sleep "$interval" done - echo "Timeout waiting for: ${endpoints[*]}" + echo "Timeout waiting for: $endpoint" return 1 } @@ -75,31 +73,31 @@ fi echo "Node roles: prefill=${PREFILL_NODES[*]} decode=${DECODE_NODES[*]}" echo "Starting vLLM instances..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/healthcheck" "http://${PREFILL_NODE}:8400/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/healthcheck" "http://${DECODE_NODE}:8500/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-router.log \ python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 & HELPER_PID=$! echo "Waiting for vLLM on $PREFILL_NODE server to be ready..." -wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" "http://${PREFILL_NODE}:8300/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index a3d471108..61ce2503c 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=job_name +#SBATCH --job-name=__JOB_NAME__ #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID HELPER_PID=$HELPER_PID" @@ -34,22 +34,20 @@ cleanup() { trap cleanup EXIT wait_for_health() { - local endpoints=("$@") + local endpoint="$1" local timeout=300 local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - for endpoint in "${endpoints[@]}"; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi - done + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi sleep "$interval" done - echo "Timeout waiting for: ${endpoints[*]}" + echo "Timeout waiting for: $endpoint" return 1 } @@ -75,31 +73,31 @@ fi echo "Node roles: prefill=${PREFILL_NODES[*]} decode=${DECODE_NODES[*]}" echo "Starting vLLM instances..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${DECODE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/healthcheck" "http://${PREFILL_NODE}:8400/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/healthcheck" "http://${DECODE_NODE}:8500/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-router.log \ python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 & HELPER_PID=$! echo "Waiting for vLLM on $PREFILL_NODE server to be ready..." -wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" "http://${PREFILL_NODE}:8300/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8300/healthcheck" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm-multinode.sbatch b/tests/ref_data/vllm-multinode.sbatch index c3bf5648f..7a137f469 100644 --- a/tests/ref_data/vllm-multinode.sbatch +++ b/tests/ref_data/vllm-multinode.sbatch @@ -1,6 +1,6 @@ #!/bin/bash # generated by CloudAI@__CLOUDAI_VERSION__ -#SBATCH --job-name=job_name +#SBATCH --job-name=__JOB_NAME__ #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main @@ -10,14 +10,14 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=none -N2 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Stopping Ray clusters..." if [ -n "${SERVE_NODELIST:-}" ]; then - srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1 bash -lc 'ray stop --force >/dev/null 2>&1 || true' >/dev/null 2>&1 || true + srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODELIST}" --nodes=2 --ntasks=2 --ntasks-per-node=1 bash -lc 'ray stop --force >/dev/null 2>&1 || true' >/dev/null 2>&1 || true else echo "Skipping Ray stop for serve: node list is not set" fi @@ -40,22 +40,20 @@ cleanup() { trap cleanup EXIT wait_for_health() { - local endpoints=("$@") + local endpoint="$1" local timeout=300 local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - for endpoint in "${endpoints[@]}"; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi - done + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi sleep "$interval" done - echo "Timeout waiting for: ${endpoints[*]}" + echo "Timeout waiting for: $endpoint" return 1 } @@ -76,7 +74,7 @@ echo "Starting vLLM instances..." ( trap 'kill -TERM $(jobs -pr) 2>/dev/null' TERM EXIT for node in "${SERVE_NODES[@]:1}"; do - srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="$node" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ + srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="$node" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-serve-ray-worker-%N.log \ bash -lc 'ray stop --force >/dev/null 2>&1 || true for (( i=0; i < 300; i+=5 )); do @@ -93,7 +91,7 @@ exit 1' & wait ) & SERVE_RAY_PID=$! -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-serve.log \ --error=__OUTPUT_DIR__/output/vllm-serve-ray-head.log \ bash -lc 'ray stop --force >/dev/null 2>&1 || true @@ -116,10 +114,10 @@ exit 1' & SERVE_PID=$! echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${NODE}:8300/healthcheck" "http://${NODE}:8300/health" || exit 1 +wait_for_health "http://${NODE}:8300/healthcheck" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/install/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ +srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${SERVE_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index 9f878d613..d192b2ccc 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -27,22 +27,20 @@ cleanup() { trap cleanup EXIT wait_for_health() { - local endpoints=("$@") + local endpoint="$1" local timeout=300 local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - for endpoint in "${endpoints[@]}"; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi - done + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi sleep "$interval" done - echo "Timeout waiting for: ${endpoints[*]}" + echo "Timeout waiting for: $endpoint" return 1 } @@ -54,7 +52,7 @@ SERVE_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${NODE}:8300/healthcheck" "http://${NODE}:8300/health" || exit 1 +wait_for_health "http://${NODE}:8300/healthcheck" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=none -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 46e468bf5..49889dfd7 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -254,22 +254,20 @@ def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlur expected = f"""\ wait_for_health() {{ - local endpoints=("$@") + local endpoint="$1" local timeout={cmd_args.serve_wait_seconds} local interval=5 local end_time=$(($(date +%s) + timeout)) while [ "$(date +%s)" -lt "$end_time" ]; do - for endpoint in "${{endpoints[@]}}"; do - if curl -sf "$endpoint" > /dev/null 2>&1; then - echo "Health check passed: $endpoint" - return 0 - fi - done + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi sleep "$interval" done - echo "Timeout waiting for: ${{endpoints[*]}}" + echo "Timeout waiting for: $endpoint" return 1 }}""" @@ -312,16 +310,22 @@ def test_custom_healthcheck_endpoints( vllm.cmd_args.proxy_healthcheck = "/router-ready" vllm_tr.num_nodes = 2 disaggregated = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr)._gen_srun_command() - assert 'wait_for_health "http://${PREFILL_NODE}:8100/ready"' in disaggregated - assert 'wait_for_health "http://${DECODE_NODE}:8200/ready"' in disaggregated - assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' not in disaggregated - assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' not in disaggregated + assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' in disaggregated + assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in disaggregated + assert 'wait_for_health "http://${PREFILL_NODE}:8100/ready"' not in disaggregated + assert 'wait_for_health "http://${DECODE_NODE}:8200/ready"' not in disaggregated assert 'wait_for_health "http://${PREFILL_NODE}:8000/router-ready"' in disaggregated assert ( 'wait_for_health "http://${PREFILL_NODE}:8000/router-ready" "http://${PREFILL_NODE}:8000/ready"' not in disaggregated ) + vllm.cmd_args.serve_healthcheck = "/serve-ready" + disaggregated = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr)._gen_srun_command() + assert 'wait_for_health "http://${PREFILL_NODE}:8100/serve-ready"' in disaggregated + assert 'wait_for_health "http://${DECODE_NODE}:8200/serve-ready"' in disaggregated + assert 'wait_for_health "http://${PREFILL_NODE}:8000/router-ready"' in disaggregated + def test_disagg_custom_healthcheck_preserves_legacy_proxy_endpoint( self, vllm: VllmTestDefinition, vllm_tr: TestRun, slurm_system: SlurmSystem ) -> None: @@ -332,6 +336,8 @@ def test_disagg_custom_healthcheck_preserves_legacy_proxy_endpoint( disaggregated = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr)._gen_srun_command() + assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' in disaggregated + assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in disaggregated assert 'wait_for_health "http://${PREFILL_NODE}:8000/legacy-ready"' in disaggregated From d4b53de6a00c3075cf51d4cfede6fd96e2387cdd Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 20:50:37 +0200 Subject: [PATCH 17/26] Make standard LLM scenarios use four visible GPUs --- .../sglang/test_scenario/sglang.toml | 18 ++++++++- .../experimental/vllm/test_scenario/vllm.toml | 40 +++++++++++++++++-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/conf/experimental/sglang/test_scenario/sglang.toml b/conf/experimental/sglang/test_scenario/sglang.toml index 9441fc56d..6213574c3 100644 --- a/conf/experimental/sglang/test_scenario/sglang.toml +++ b/conf/experimental/sglang/test_scenario/sglang.toml @@ -23,9 +23,12 @@ num_nodes = 2 time_limit = "00:10:00" [Tests.cmd_args.decode] - tensor_parallel_size = 2 + tensor_parallel_size = 8 mem_fraction_static = 0.75 + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + [[Tests]] id = "sglang.agg.1node" test_name = "sglang" @@ -33,8 +36,12 @@ num_nodes = 1 time_limit = "00:10:00" [Tests.cmd_args.decode] + tensor_parallel_size = 4 mem_fraction_static = 0.75 + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + [[Tests]] id = "sglang.disagg.sync" test_name = "sglang" @@ -51,6 +58,9 @@ time_limit = "00:10:00" tensor_parallel_size = 2 mem_fraction_static = 0.75 + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + [[Tests]] id = "sglang.disagg.async" test_name = "sglang" @@ -67,6 +77,9 @@ time_limit = "00:10:00" tensor_parallel_size = 2 mem_fraction_static = 0.75 + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + [[Tests]] id = "sglang.disagg.2nodes" test_name = "sglang" @@ -80,3 +93,6 @@ time_limit = "00:10:00" [Tests.cmd_args.decode] tensor_parallel_size = 4 mem_fraction_static = 0.75 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" diff --git a/conf/experimental/vllm/test_scenario/vllm.toml b/conf/experimental/vllm/test_scenario/vllm.toml index 8e1207221..828c9a815 100644 --- a/conf/experimental/vllm/test_scenario/vllm.toml +++ b/conf/experimental/vllm/test_scenario/vllm.toml @@ -16,6 +16,34 @@ name = "vllm" +[[Tests]] +id = "vllm.agg.1node" +test_name = "vllm" +num_nodes = 1 +time_limit = "00:10:00" + + [Tests.cmd_args.decode] + enforce_eager = "" + tensor_parallel_size = 4 + max_num_batched_tokens = 1024 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + +[[Tests]] +id = "vllm.agg.2nodes" +test_name = "vllm" +num_nodes = 2 +time_limit = "00:30:00" + + [Tests.cmd_args.decode] + enforce_eager = "" + tensor_parallel_size = 8 + max_num_batched_tokens = 1024 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + [[Tests]] id = "vllm.disagg.sync" test_name = "vllm" @@ -24,14 +52,17 @@ time_limit = "00:30:00" [Tests.cmd_args.prefill] enforce_eager = "" - tensor_parallel_size = 2 + tensor_parallel_size = 4 max_num_batched_tokens = 1024 [Tests.cmd_args.decode] enforce_eager = "" - tensor_parallel_size = 2 + tensor_parallel_size = 4 max_num_batched_tokens = 1024 + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + [[Tests]] id = "vllm.disagg.async" test_name = "vllm" @@ -41,7 +72,7 @@ time_limit = "00:10:00" [Tests.cmd_args.prefill] gpu_ids = "0,1" enforce_eager = "" - tensor_parallel_size = 1 + tensor_parallel_size = 2 max_num_batched_tokens = 1024 [Tests.cmd_args.decode] @@ -49,3 +80,6 @@ time_limit = "00:10:00" enforce_eager = "" tensor_parallel_size = 2 max_num_batched_tokens = 1024 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" From 675e1b46d39971a736328828d28fc8d1a1230d23 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 20:54:28 +0200 Subject: [PATCH 18/26] Reference regular LLM workloads from heavy scenarios --- .../sglang/test/sglang-heavy.toml | 41 ------------------ .../sglang/test_scenario/sglang-heavy.toml | 10 ++--- conf/experimental/vllm/test/vllm-heavy.toml | 42 ------------------- .../vllm/test_scenario/vllm-heavy-perf.toml | 12 +++--- .../vllm/test_scenario/vllm-heavy.toml | 10 ++--- 5 files changed, 16 insertions(+), 99 deletions(-) delete mode 100644 conf/experimental/sglang/test/sglang-heavy.toml delete mode 100644 conf/experimental/vllm/test/vllm-heavy.toml diff --git a/conf/experimental/sglang/test/sglang-heavy.toml b/conf/experimental/sglang/test/sglang-heavy.toml deleted file mode 100644 index 3d91a22ae..000000000 --- a/conf/experimental/sglang/test/sglang-heavy.toml +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name = "sglang-heavy" -description = "Heavy SGLang multi-node correctness benchmark" -test_template_name = "sglang" - -[cmd_args] -docker_image_url = "lmsysorg/sglang:dev-cu13" -model = "Qwen/Qwen3-8B" - -[bench_cmd_args] -dataset_name = "random" -num_prompts = 512 -max_concurrency = 64 -random_input = 512 -random_output = 512 -warmup_requests = 2 -random_range_ratio = 1.0 -output_details = true - -[semantic_eval_cmd_args] -entrypoint = "python3 -m sglang.test.run_eval" -cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" - -[extra_env_vars] -UCX_NET_DEVICES = "all" -UCX_TLS = "^gdr_copy,cuda_ipc" diff --git a/conf/experimental/sglang/test_scenario/sglang-heavy.toml b/conf/experimental/sglang/test_scenario/sglang-heavy.toml index 57bb21a43..b9f5426b6 100644 --- a/conf/experimental/sglang/test_scenario/sglang-heavy.toml +++ b/conf/experimental/sglang/test_scenario/sglang-heavy.toml @@ -18,7 +18,7 @@ name = "sglang-heavy-multinode" [[Tests]] id = "sglang.heavy.agg.1node" -test_name = "sglang-heavy" +test_name = "sglang" num_nodes = 1 time_limit = "01:00:00" @@ -29,7 +29,7 @@ time_limit = "01:00:00" [[Tests]] id = "sglang.heavy.agg.2nodes" -test_name = "sglang-heavy" +test_name = "sglang" num_nodes = 2 time_limit = "01:00:00" @@ -40,7 +40,7 @@ time_limit = "01:00:00" [[Tests]] id = "sglang.heavy.agg.4nodes" -test_name = "sglang-heavy" +test_name = "sglang" num_nodes = 4 time_limit = "01:00:00" @@ -51,7 +51,7 @@ time_limit = "01:00:00" [[Tests]] id = "sglang.heavy.disagg.4nodes" -test_name = "sglang-heavy" +test_name = "sglang" num_nodes = 4 time_limit = "01:00:00" @@ -69,7 +69,7 @@ time_limit = "01:00:00" [[Tests]] id = "sglang.heavy.disagg.8nodes.4p4d" -test_name = "sglang-heavy" +test_name = "sglang" num_nodes = 8 time_limit = "01:30:00" diff --git a/conf/experimental/vllm/test/vllm-heavy.toml b/conf/experimental/vllm/test/vllm-heavy.toml deleted file mode 100644 index 5c4070bf9..000000000 --- a/conf/experimental/vllm/test/vllm-heavy.toml +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name = "vllm-heavy" -description = "Heavy vLLM multi-node benchmark" -test_template_name = "vllm" - -[[git_repos]] -url = "https://github.com/vllm-project/vllm.git" -commit = "a8887c208f34c04c3b021cf3949ed6545d77bb01" -mount_as = "/vllm_repo" - -[cmd_args] -docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1" -model = "Qwen/Qwen3-8B" - -[bench_cmd_args] -random_input_len = 512 -random_output_len = 512 -max_concurrency = 64 -num_prompts = 512 - -[semantic_eval_cmd_args] -entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py" -cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" - -[extra_env_vars] -UCX_NET_DEVICES = "all" -UCX_TLS = "^gdr_copy,cuda_ipc" diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml b/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml index 3ea67b9f0..298c253a1 100644 --- a/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml +++ b/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml @@ -18,7 +18,7 @@ name = "vllm-heavy-perf-multinode" [[Tests]] id = "vllm.heavy.perf.agg.1node" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 1 time_limit = "02:00:00" @@ -34,7 +34,7 @@ time_limit = "02:00:00" [[Tests]] id = "vllm.heavy.perf.agg.2nodes" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 2 time_limit = "02:00:00" @@ -50,7 +50,7 @@ time_limit = "02:00:00" [[Tests]] id = "vllm.heavy.perf.agg.4nodes" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 4 time_limit = "02:00:00" @@ -66,7 +66,7 @@ time_limit = "02:00:00" [[Tests]] id = "vllm.heavy.perf.agg.8nodes" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 8 time_limit = "02:00:00" @@ -82,7 +82,7 @@ time_limit = "02:00:00" [[Tests]] id = "vllm.heavy.perf.disagg.4nodes" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 4 time_limit = "02:00:00" @@ -107,7 +107,7 @@ time_limit = "02:00:00" [[Tests]] id = "vllm.heavy.perf.disagg.8nodes.4p4d" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 8 time_limit = "02:00:00" diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy.toml b/conf/experimental/vllm/test_scenario/vllm-heavy.toml index 3345baa06..c858be8ca 100644 --- a/conf/experimental/vllm/test_scenario/vllm-heavy.toml +++ b/conf/experimental/vllm/test_scenario/vllm-heavy.toml @@ -18,7 +18,7 @@ name = "vllm-heavy-multinode" [[Tests]] id = "vllm.heavy.agg.1node" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 1 time_limit = "01:00:00" @@ -31,7 +31,7 @@ time_limit = "01:00:00" [[Tests]] id = "vllm.heavy.agg.2nodes" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 2 time_limit = "01:00:00" @@ -44,7 +44,7 @@ time_limit = "01:00:00" [[Tests]] id = "vllm.heavy.agg.4nodes" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 4 time_limit = "01:00:00" @@ -57,7 +57,7 @@ time_limit = "01:00:00" [[Tests]] id = "vllm.heavy.disagg.4nodes" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 4 time_limit = "01:00:00" @@ -79,7 +79,7 @@ time_limit = "01:00:00" [[Tests]] id = "vllm.heavy.disagg.7nodes.3p4d" -test_name = "vllm-heavy" +test_name = "vllm" num_nodes = 7 time_limit = "01:30:00" From f4d3661887abeca206a07eaebc2b06f78ad9b3e3 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 20:59:39 +0200 Subject: [PATCH 19/26] Keep heavy LLM scenarios to eight-node topology --- .../sglang/test_scenario/sglang-heavy.toml | 55 +---------- .../vllm/test_scenario/vllm-heavy-perf.toml | 89 ----------------- .../vllm/test_scenario/vllm-heavy.toml | 98 ++++--------------- 3 files changed, 22 insertions(+), 220 deletions(-) diff --git a/conf/experimental/sglang/test_scenario/sglang-heavy.toml b/conf/experimental/sglang/test_scenario/sglang-heavy.toml index b9f5426b6..5b61caa32 100644 --- a/conf/experimental/sglang/test_scenario/sglang-heavy.toml +++ b/conf/experimental/sglang/test_scenario/sglang-heavy.toml @@ -16,57 +16,6 @@ name = "sglang-heavy-multinode" -[[Tests]] -id = "sglang.heavy.agg.1node" -test_name = "sglang" -num_nodes = 1 -time_limit = "01:00:00" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - tp = 4 - mem_fraction_static = 0.75 - -[[Tests]] -id = "sglang.heavy.agg.2nodes" -test_name = "sglang" -num_nodes = 2 -time_limit = "01:00:00" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - tp = 8 - mem_fraction_static = 0.75 - -[[Tests]] -id = "sglang.heavy.agg.4nodes" -test_name = "sglang" -num_nodes = 4 -time_limit = "01:00:00" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - tp = 8 - mem_fraction_static = 0.75 - -[[Tests]] -id = "sglang.heavy.disagg.4nodes" -test_name = "sglang" -num_nodes = 4 -time_limit = "01:00:00" - - [Tests.cmd_args.prefill] - num_nodes = 2 - gpu_ids = "0,1,2,3" - tp = 8 - mem_fraction_static = 0.75 - - [Tests.cmd_args.decode] - num_nodes = 2 - gpu_ids = "0,1,2,3" - tp = 8 - mem_fraction_static = 0.75 - [[Tests]] id = "sglang.heavy.disagg.8nodes.4p4d" test_name = "sglang" @@ -76,11 +25,11 @@ time_limit = "01:30:00" [Tests.cmd_args.prefill] num_nodes = 4 gpu_ids = "0,1,2,3" - tp = 8 + tp = 16 mem_fraction_static = 0.75 [Tests.cmd_args.decode] num_nodes = 4 gpu_ids = "0,1,2,3" - tp = 8 + tp = 16 mem_fraction_static = 0.75 diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml b/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml index 298c253a1..2bb693543 100644 --- a/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml +++ b/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml @@ -16,95 +16,6 @@ name = "vllm-heavy-perf-multinode" -[[Tests]] -id = "vllm.heavy.perf.agg.1node" -test_name = "vllm" -num_nodes = 1 -time_limit = "02:00:00" - - [Tests.cmd_args] - model = "Qwen/Qwen3-32B" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 4 - max_num_batched_tokens = 16384 - max_model_len = 16384 - -[[Tests]] -id = "vllm.heavy.perf.agg.2nodes" -test_name = "vllm" -num_nodes = 2 -time_limit = "02:00:00" - - [Tests.cmd_args] - model = "Qwen/Qwen3-32B" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 16384 - max_model_len = 16384 - -[[Tests]] -id = "vllm.heavy.perf.agg.4nodes" -test_name = "vllm" -num_nodes = 4 -time_limit = "02:00:00" - - [Tests.cmd_args] - model = "Qwen/Qwen3-32B" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 16 - max_num_batched_tokens = 16384 - max_model_len = 16384 - -[[Tests]] -id = "vllm.heavy.perf.agg.8nodes" -test_name = "vllm" -num_nodes = 8 -time_limit = "02:00:00" - - [Tests.cmd_args] - model = "Qwen/Qwen3-32B" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 32 - max_num_batched_tokens = 16384 - max_model_len = 16384 - -[[Tests]] -id = "vllm.heavy.perf.disagg.4nodes" -test_name = "vllm" -num_nodes = 4 -time_limit = "02:00:00" - - [Tests.cmd_args] - model = "Qwen/Qwen3-32B" - - [Tests.cmd_args.prefill] - num_nodes = 2 - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 16384 - max_model_len = 16384 - - [Tests.cmd_args.decode] - num_nodes = 2 - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 16384 - max_model_len = 16384 - [[Tests]] id = "vllm.heavy.perf.disagg.8nodes.4p4d" test_name = "vllm" diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy.toml b/conf/experimental/vllm/test_scenario/vllm-heavy.toml index c858be8ca..f5ac872c7 100644 --- a/conf/experimental/vllm/test_scenario/vllm-heavy.toml +++ b/conf/experimental/vllm/test_scenario/vllm-heavy.toml @@ -17,84 +17,26 @@ name = "vllm-heavy-multinode" [[Tests]] -id = "vllm.heavy.agg.1node" +id = "vllm.heavy.disagg.8nodes.4p4d" test_name = "vllm" -num_nodes = 1 -time_limit = "01:00:00" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 4 - max_num_batched_tokens = 8192 - max_model_len = 8192 - -[[Tests]] -id = "vllm.heavy.agg.2nodes" -test_name = "vllm" -num_nodes = 2 -time_limit = "01:00:00" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 8192 - max_model_len = 8192 - -[[Tests]] -id = "vllm.heavy.agg.4nodes" -test_name = "vllm" -num_nodes = 4 -time_limit = "01:00:00" - - [Tests.cmd_args.decode] - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 8192 - max_model_len = 8192 - -[[Tests]] -id = "vllm.heavy.disagg.4nodes" -test_name = "vllm" -num_nodes = 4 -time_limit = "01:00:00" - - [Tests.cmd_args.prefill] - num_nodes = 2 - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 8192 - max_model_len = 8192 - - [Tests.cmd_args.decode] - num_nodes = 2 - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 8192 - max_model_len = 8192 - -[[Tests]] -id = "vllm.heavy.disagg.7nodes.3p4d" -test_name = "vllm" -num_nodes = 7 +num_nodes = 8 time_limit = "01:30:00" - [Tests.cmd_args.prefill] - num_nodes = 3 - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 8192 - max_model_len = 8192 - - [Tests.cmd_args.decode] - num_nodes = 4 - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 8 - max_num_batched_tokens = 8192 - max_model_len = 8192 + [Tests.cmd_args] + model = "Qwen/Qwen3-8B" + + [Tests.cmd_args.prefill] + num_nodes = 4 + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 16 + max_num_batched_tokens = 8192 + max_model_len = 8192 + + [Tests.cmd_args.decode] + num_nodes = 4 + gpu_ids = "0,1,2,3" + enforce_eager = "" + tensor_parallel_size = 16 + max_num_batched_tokens = 8192 + max_model_len = 8192 From c37904440fbdb705abebcbf7cbaf7df6b2f9ca45 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 21:00:31 +0200 Subject: [PATCH 20/26] remove redundant vllm-heavy-perf --- .../vllm/test_scenario/vllm-heavy-perf.toml | 42 ------------------- 1 file changed, 42 deletions(-) delete mode 100644 conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml b/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml deleted file mode 100644 index 2bb693543..000000000 --- a/conf/experimental/vllm/test_scenario/vllm-heavy-perf.toml +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name = "vllm-heavy-perf-multinode" - -[[Tests]] -id = "vllm.heavy.perf.disagg.8nodes.4p4d" -test_name = "vllm" -num_nodes = 8 -time_limit = "02:00:00" - - [Tests.cmd_args] - model = "Qwen/Qwen3-32B" - - [Tests.cmd_args.prefill] - num_nodes = 4 - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 16 - max_num_batched_tokens = 16384 - max_model_len = 16384 - - [Tests.cmd_args.decode] - num_nodes = 4 - gpu_ids = "0,1,2,3" - enforce_eager = "" - tensor_parallel_size = 16 - max_num_batched_tokens = 16384 - max_model_len = 16384 From b11155c967dd86d282f53c83af62aab5ed5e0427 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 21:36:11 +0200 Subject: [PATCH 21/26] Fix vLLM scenario healthcheck and GPU visibility --- conf/experimental/sglang/test_scenario/sglang-heavy.toml | 3 +++ conf/experimental/vllm/test/vllm.toml | 1 + conf/experimental/vllm/test_scenario/vllm-heavy.toml | 3 +++ 3 files changed, 7 insertions(+) diff --git a/conf/experimental/sglang/test_scenario/sglang-heavy.toml b/conf/experimental/sglang/test_scenario/sglang-heavy.toml index 5b61caa32..bacb72503 100644 --- a/conf/experimental/sglang/test_scenario/sglang-heavy.toml +++ b/conf/experimental/sglang/test_scenario/sglang-heavy.toml @@ -33,3 +33,6 @@ time_limit = "01:30:00" gpu_ids = "0,1,2,3" tp = 16 mem_fraction_static = 0.75 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" diff --git a/conf/experimental/vllm/test/vllm.toml b/conf/experimental/vllm/test/vllm.toml index a8061099c..8e6581653 100644 --- a/conf/experimental/vllm/test/vllm.toml +++ b/conf/experimental/vllm/test/vllm.toml @@ -25,6 +25,7 @@ mount_as = "/vllm_repo" [cmd_args] docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1" +serve_healthcheck = "/health" [semantic_eval_cmd_args] entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py" diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy.toml b/conf/experimental/vllm/test_scenario/vllm-heavy.toml index f5ac872c7..5940a1956 100644 --- a/conf/experimental/vllm/test_scenario/vllm-heavy.toml +++ b/conf/experimental/vllm/test_scenario/vllm-heavy.toml @@ -40,3 +40,6 @@ time_limit = "01:30:00" tensor_parallel_size = 16 max_num_batched_tokens = 8192 max_model_len = 8192 + + [Tests.extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" From a5fe32ef4e7b13b3807b049835d5fd4e453c4501 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 22:42:07 +0200 Subject: [PATCH 22/26] add explicit ray sections --- .../vllm/test_scenario/vllm-heavy.toml | 12 +++++ src/cloudai/workloads/vllm/__init__.py | 2 + .../vllm/slurm_command_gen_strategy.py | 51 +++++++++++++++++- src/cloudai/workloads/vllm/vllm.py | 54 ++++++++++++++++++- tests/ref_data/vllm-multinode.sbatch | 4 +- tests/test_acceptance.py | 10 +++- .../vllm/test_command_gen_strategy_slurm.py | 29 ++++++++++ 7 files changed, 154 insertions(+), 8 deletions(-) diff --git a/conf/experimental/vllm/test_scenario/vllm-heavy.toml b/conf/experimental/vllm/test_scenario/vllm-heavy.toml index 5940a1956..0f1b1f92a 100644 --- a/conf/experimental/vllm/test_scenario/vllm-heavy.toml +++ b/conf/experimental/vllm/test_scenario/vllm-heavy.toml @@ -33,6 +33,12 @@ time_limit = "01:30:00" max_num_batched_tokens = 8192 max_model_len = 8192 + [Tests.cmd_args.prefill.ray_head] + num_gpus = 4 + + [Tests.cmd_args.prefill.ray_worker] + num_gpus = 4 + [Tests.cmd_args.decode] num_nodes = 4 gpu_ids = "0,1,2,3" @@ -41,5 +47,11 @@ time_limit = "01:30:00" max_num_batched_tokens = 8192 max_model_len = 8192 + [Tests.cmd_args.decode.ray_head] + num_gpus = 4 + + [Tests.cmd_args.decode.ray_worker] + num_gpus = 4 + [Tests.extra_env_vars] CUDA_VISIBLE_DEVICES = "0,1,2,3" diff --git a/src/cloudai/workloads/vllm/__init__.py b/src/cloudai/workloads/vllm/__init__.py index bd809ecd0..86216d151 100644 --- a/src/cloudai/workloads/vllm/__init__.py +++ b/src/cloudai/workloads/vllm/__init__.py @@ -24,6 +24,7 @@ VllmArgs, VllmBenchCmdArgs, VllmCmdArgs, + VllmRayStartArgs, VllmSemanticEvalCmdArgs, VllmTestDefinition, ) @@ -40,6 +41,7 @@ "VllmArgs", "VllmBenchCmdArgs", "VllmCmdArgs", + "VllmRayStartArgs", "VllmSemanticEvalCmdArgs", "VllmSlurmCommandGenStrategy", "VllmTestDefinition", diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 13e310e49..eb86d0dfb 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -22,7 +22,9 @@ from .vllm import ( VLLM_BENCH_JSON_FILE, + VllmArgs, VllmCmdArgs, + VllmRayStartArgs, VllmSemanticEvalCmdArgs, VllmTestDefinition, ) @@ -52,6 +54,45 @@ def _with_ray_backend(command: list[str], enabled: bool) -> list[str]: def _needs_ray(self, role: str) -> bool: return self.role_node_count(role) > 1 + @staticmethod + def _format_ray_value(value: Any) -> str: + if isinstance(value, dict): + return shlex.quote(json.dumps(value, separators=(",", ":"))) + return str(value) + + @classmethod + def _serialize_ray_start_args(cls, args: dict[str, Any]) -> str: + parts: list[str] = [] + for key, value in args.items(): + if value is None: + continue + opt = f"--{key.replace('_', '-')}" + if isinstance(value, bool): + if value: + parts.append(opt) + continue + parts.append(f"{opt}={cls._format_ray_value(value)}") + return " ".join(parts) + + def _role_args(self, role: str) -> VllmArgs: + if role == "prefill": + if self.tdef.cmd_args.prefill is None: + raise ValueError("Prefill role requested for non-disaggregated vLLM.") + return self.tdef.cmd_args.prefill + return self.tdef.cmd_args.decode + + def _ray_start_args(self, role: str, kind: str, generated: dict[str, Any]) -> str: + role_args = self._role_args(role) + ray_args: VllmRayStartArgs | None = role_args.ray_head if kind == "head" else role_args.ray_worker + if ray_args is None: + return self._serialize_ray_start_args(generated) + + fields_set = ray_args.model_fields_set + user_args = ray_args.model_dump(exclude_none=True) + merged_args = {key: value for key, value in generated.items() if key not in fields_set} + merged_args.update(user_args) + return self._serialize_ray_start_args(merged_args) + def get_serve_commands(self) -> list[list[str]]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args @@ -200,10 +241,16 @@ def render_serve_launch( worker_prefix = self._role_srun_prefix("$node") head_prefix = self._single_role_srun_prefix(head_node_var) serve_cmd = self._with_custom_bash(f'env RAY_ADDRESS="{head_node_expr}:${{{ray_port_var}}}" {command_tail}') + ray_head_args = self._ray_start_args(role, "head", {"head": True, "port": f'"${{{ray_port_var}}}"'}) + ray_worker_args = self._ray_start_args( + role, + "worker", + {"address": f"{head_node_expr}:${{{ray_port_var}}}", "block": True}, + ) ray_head_command = shlex.quote( f"""\ ray stop --force >/dev/null 2>&1 || true -ray start --head --port="${{{ray_port_var}}}" +ray start {ray_head_args} active_nodes=0 for (( i=0; i < {self.tdef.cmd_args.serve_wait_seconds}; i+=5 )); do @@ -224,7 +271,7 @@ def render_serve_launch( f"""\ ray stop --force >/dev/null 2>&1 || true for (( i=0; i < {self.tdef.cmd_args.serve_wait_seconds}; i+=5 )); do - if ray start --address={head_node_expr}:${{{ray_port_var}}} --block; then + if ray start {ray_worker_args}; then echo "Ray worker connected to {head_node_expr}:${{{ray_port_var}}}" exit 0 fi diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 34b86942f..b29323de8 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -21,7 +21,7 @@ import re from functools import cache from pathlib import Path -from typing import Optional, cast +from typing import Any, Optional, cast from pydantic import ConfigDict, Field, field_validator @@ -48,6 +48,14 @@ class VllmArgs(LLMServingArgs): """Base command arguments for vLLM instances.""" + ray_head: VllmRayStartArgs | None = Field( + default=None, + description="Arguments appended to the Ray head startup command for multi-node vLLM roles.", + ) + ray_worker: VllmRayStartArgs | None = Field( + default=None, + description="Arguments appended to the Ray worker startup command for multi-node vLLM roles.", + ) nixl_threads: int | list[int] | None = Field( default=None, description="Set ``kv_connector_extra_config.num_threads`` for ``--kv-transfer-config`` CLI argument.", @@ -55,7 +63,7 @@ class VllmArgs(LLMServingArgs): @property def serve_args_exclude(self) -> set[str]: - return super().serve_args_exclude | {"nixl_threads"} + return super().serve_args_exclude | {"nixl_threads", "ray_head", "ray_worker"} def serialize_serve_arg(self, key: str, value: object) -> list[str]: opt = f"--{key.replace('_', '-')}" @@ -64,6 +72,48 @@ def serialize_serve_arg(self, key: str, value: object) -> list[str]: return super().serialize_serve_arg(key, value) +class VllmRayStartArgs(CmdArgs): + """Ray startup arguments for vLLM multi-node serving roles.""" + + model_config = ConfigDict(extra="forbid") + + head: bool | list[bool] | None = Field(default=None, description="Emit ``--head`` for Ray head startup.") + port: int | str | list[int] | list[str] | None = Field(default=None, description="Ray head port.") + address: str | list[str] | None = Field(default=None, description="Ray head address for worker startup.") + block: bool | list[bool] | None = Field(default=None, description="Emit ``--block`` for Ray worker startup.") + num_gpus: int | float | str | list[int] | list[float] | list[str] | None = Field( + default=None, + description="Number of GPUs Ray should advertise on this node.", + ) + num_cpus: int | float | str | list[int] | list[float] | list[str] | None = Field( + default=None, + description="Number of CPUs Ray should advertise on this node.", + ) + object_store_memory: int | str | list[int] | list[str] | None = Field( + default=None, + description="Ray object store memory in bytes.", + ) + dashboard_host: str | list[str] | None = Field(default=None, description="Ray dashboard bind host.") + dashboard_port: int | str | list[int] | list[str] | None = Field(default=None, description="Ray dashboard port.") + include_dashboard: bool | str | list[bool] | list[str] | None = Field( + default=None, + description="Whether Ray should start the dashboard.", + ) + disable_usage_stats: bool | list[bool] | None = Field( + default=None, + description="Emit ``--disable-usage-stats`` for Ray startup.", + ) + temp_dir: str | list[str] | None = Field(default=None, description="Ray temporary directory.") + resources: dict[str, Any] | str | list[str] | None = Field( + default=None, + description="Ray custom resources.", + ) + labels: dict[str, Any] | str | list[str] | None = Field( + default=None, + description="Ray node labels.", + ) + + class VllmCmdArgs(LLMServingCmdArgs[VllmArgs]): """vLLM serve command arguments.""" diff --git a/tests/ref_data/vllm-multinode.sbatch b/tests/ref_data/vllm-multinode.sbatch index 7a137f469..42df1edef 100644 --- a/tests/ref_data/vllm-multinode.sbatch +++ b/tests/ref_data/vllm-multinode.sbatch @@ -78,7 +78,7 @@ echo "Starting vLLM instances..." --output=__OUTPUT_DIR__/output/vllm-serve-ray-worker-%N.log \ bash -lc 'ray stop --force >/dev/null 2>&1 || true for (( i=0; i < 300; i+=5 )); do - if ray start --address=${SERVE_NODE}:${SERVE_RAY_PORT} --block; then + if ray start --address=${SERVE_NODE}:${SERVE_RAY_PORT} --block --num-gpus=4 --num-cpus=64 --disable-usage-stats; then echo "Ray worker connected to ${SERVE_NODE}:${SERVE_RAY_PORT}" exit 0 fi @@ -95,7 +95,7 @@ srun --export=ALL --mpi=none --container-image=nvcr.io/nvidia/vllm:latest --cont --output=__OUTPUT_DIR__/output/vllm-serve.log \ --error=__OUTPUT_DIR__/output/vllm-serve-ray-head.log \ bash -lc 'ray stop --force >/dev/null 2>&1 || true -ray start --head --port="${SERVE_RAY_PORT}" +ray start --head --port="${SERVE_RAY_PORT}" --num-gpus=4 --num-cpus=64 --disable-usage-stats active_nodes=0 for (( i=0; i < 300; i+=5 )); do diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 2287876fb..71b6efc76 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -83,7 +83,7 @@ TritonInferenceTestDefinition, ) from cloudai.workloads.ucc_test import UCCCmdArgs, UCCTestDefinition -from cloudai.workloads.vllm import VllmArgs, VllmCmdArgs, VllmTestDefinition +from cloudai.workloads.vllm import VllmArgs, VllmCmdArgs, VllmRayStartArgs, VllmTestDefinition SLURM_TEST_SCENARIOS = [ {"path": Path("conf/common/test_scenario/sleep.toml"), "expected_dirs_number": 4, "log_file": "sleep_debug.log"}, @@ -614,7 +614,13 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=VllmCmdArgs( docker_image_url="nvcr.io/nvidia/vllm:latest", model="Qwen/Qwen3-0.6B", - decode=VllmArgs.model_validate({"tensor_parallel_size": 8}), + decode=VllmArgs.model_validate( + { + "tensor_parallel_size": 8, + "ray_head": VllmRayStartArgs(num_gpus=4, num_cpus=64, disable_usage_stats=True), + "ray_worker": VllmRayStartArgs(num_gpus=4, num_cpus=64, disable_usage_stats=True), + } + ), ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, ), diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 49889dfd7..420b17f7e 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -25,6 +25,7 @@ VllmArgs, VllmBenchCmdArgs, VllmCmdArgs, + VllmRayStartArgs, VllmSemanticEvalCmdArgs, VllmSlurmCommandGenStrategy, VllmTestDefinition, @@ -394,3 +395,31 @@ def test_gen_srun_command_disagg_four_nodes_uses_role_ray_clusters( assert "ray.init(address=" not in srun_command assert 'env RAY_ADDRESS="${PREFILL_NODE}:${PREFILL_RAY_PORT}"' in srun_command assert 'env RAY_ADDRESS="${DECODE_NODE}:${DECODE_RAY_PORT}"' in srun_command + + def test_ray_head_and_worker_topology_args_can_be_overridden( + self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem + ) -> None: + tdef = cast(VllmTestDefinition, vllm_disagg_tr.test) + assert tdef.cmd_args.prefill is not None + tdef.cmd_args.prefill.num_nodes = 2 + tdef.cmd_args.decode.num_nodes = 2 + tdef.cmd_args.prefill.ray_head = VllmRayStartArgs( + head=False, + port=9123, + num_gpus=4, + dashboard_host="0.0.0.0", + ) + tdef.cmd_args.prefill.ray_worker = VllmRayStartArgs( + address="custom-prefill-head:9123", + block=False, + num_gpus=4, + ) + vllm_disagg_tr.num_nodes = 4 + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) + + srun_command = strategy._gen_srun_command() + + assert "ray start --port=9123 --num-gpus=4 --dashboard-host=0.0.0.0" in srun_command + assert "ray start --head --port=9123" not in srun_command + assert "ray start --address=custom-prefill-head:9123 --num-gpus=4" in srun_command + assert "ray start --address=custom-prefill-head:9123 --block" not in srun_command From 4142cf1d06e9b7f7e8e8410690c765bf210c6108 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 22:52:32 +0200 Subject: [PATCH 23/26] simplify ray args model --- src/cloudai/workloads/vllm/vllm.py | 35 ++----------------- tests/test_acceptance.py | 8 +++-- .../vllm/test_command_gen_strategy_slurm.py | 22 +++++++----- 3 files changed, 21 insertions(+), 44 deletions(-) diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index b29323de8..3f9be8285 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -21,7 +21,7 @@ import re from functools import cache from pathlib import Path -from typing import Any, Optional, cast +from typing import Optional, cast from pydantic import ConfigDict, Field, field_validator @@ -75,43 +75,12 @@ def serialize_serve_arg(self, key: str, value: object) -> list[str]: class VllmRayStartArgs(CmdArgs): """Ray startup arguments for vLLM multi-node serving roles.""" - model_config = ConfigDict(extra="forbid") + model_config = ConfigDict(extra="allow") head: bool | list[bool] | None = Field(default=None, description="Emit ``--head`` for Ray head startup.") port: int | str | list[int] | list[str] | None = Field(default=None, description="Ray head port.") address: str | list[str] | None = Field(default=None, description="Ray head address for worker startup.") block: bool | list[bool] | None = Field(default=None, description="Emit ``--block`` for Ray worker startup.") - num_gpus: int | float | str | list[int] | list[float] | list[str] | None = Field( - default=None, - description="Number of GPUs Ray should advertise on this node.", - ) - num_cpus: int | float | str | list[int] | list[float] | list[str] | None = Field( - default=None, - description="Number of CPUs Ray should advertise on this node.", - ) - object_store_memory: int | str | list[int] | list[str] | None = Field( - default=None, - description="Ray object store memory in bytes.", - ) - dashboard_host: str | list[str] | None = Field(default=None, description="Ray dashboard bind host.") - dashboard_port: int | str | list[int] | list[str] | None = Field(default=None, description="Ray dashboard port.") - include_dashboard: bool | str | list[bool] | list[str] | None = Field( - default=None, - description="Whether Ray should start the dashboard.", - ) - disable_usage_stats: bool | list[bool] | None = Field( - default=None, - description="Emit ``--disable-usage-stats`` for Ray startup.", - ) - temp_dir: str | list[str] | None = Field(default=None, description="Ray temporary directory.") - resources: dict[str, Any] | str | list[str] | None = Field( - default=None, - description="Ray custom resources.", - ) - labels: dict[str, Any] | str | list[str] | None = Field( - default=None, - description="Ray node labels.", - ) class VllmCmdArgs(LLMServingCmdArgs[VllmArgs]): diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 71b6efc76..ea3fc5e9a 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -617,8 +617,12 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - decode=VllmArgs.model_validate( { "tensor_parallel_size": 8, - "ray_head": VllmRayStartArgs(num_gpus=4, num_cpus=64, disable_usage_stats=True), - "ray_worker": VllmRayStartArgs(num_gpus=4, num_cpus=64, disable_usage_stats=True), + "ray_head": VllmRayStartArgs.model_validate( + {"num_gpus": 4, "num_cpus": 64, "disable_usage_stats": True} + ), + "ray_worker": VllmRayStartArgs.model_validate( + {"num_gpus": 4, "num_cpus": 64, "disable_usage_stats": True} + ), } ), ), diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 420b17f7e..e3f3890d5 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -403,16 +403,20 @@ def test_ray_head_and_worker_topology_args_can_be_overridden( assert tdef.cmd_args.prefill is not None tdef.cmd_args.prefill.num_nodes = 2 tdef.cmd_args.decode.num_nodes = 2 - tdef.cmd_args.prefill.ray_head = VllmRayStartArgs( - head=False, - port=9123, - num_gpus=4, - dashboard_host="0.0.0.0", + tdef.cmd_args.prefill.ray_head = VllmRayStartArgs.model_validate( + { + "head": False, + "port": 9123, + "num_gpus": 4, + "dashboard_host": "0.0.0.0", + } ) - tdef.cmd_args.prefill.ray_worker = VllmRayStartArgs( - address="custom-prefill-head:9123", - block=False, - num_gpus=4, + tdef.cmd_args.prefill.ray_worker = VllmRayStartArgs.model_validate( + { + "address": "custom-prefill-head:9123", + "block": False, + "num_gpus": 4, + } ) vllm_disagg_tr.num_nodes = 4 strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) From 7df69553e27b071d2cb1bcf3ea04639a68f1e4df Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 12 Jun 2026 13:47:39 +0200 Subject: [PATCH 24/26] minor issues and docs fixes --- doc/workloads/sglang.rst | 68 ++++++---------- doc/workloads/vllm.rst | 78 ++++++------------- .../sglang/slurm_command_gen_strategy.py | 7 ++ src/cloudai/workloads/vllm/vllm.py | 40 ++++++---- tests/ref_data/sglang-disagg-2nodes.sbatch | 4 +- tests/ref_data/sglang-disagg.sbatch | 4 +- .../sglang/test_command_gen_strategy_slurm.py | 12 +++ tests/workloads/vllm/test_workload.py | 19 +++++ 8 files changed, 112 insertions(+), 120 deletions(-) diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst index f4537fe39..9250e65b0 100644 --- a/doc/workloads/sglang.rst +++ b/doc/workloads/sglang.rst @@ -98,22 +98,24 @@ placeholders. Readiness health checks ----------------------- -CloudAI waits for SGLang servers to become ready before starting the benchmark. The default SGLang readiness endpoint is -``/v1/models``. Set ``serve_healthcheck`` to override the endpoint used for aggregated serve processes and -disaggregated prefill/decode server processes. In disaggregated mode, the router readiness check uses ``healthcheck``. +Healthcheck fields: + +- ``healthcheck``: aggregated server and disaggregated router endpoint, default ``/v1/models``. +- ``serve_healthcheck``: optional override for serve, prefill, and decode servers. + +If ``serve_healthcheck`` is omitted, disaggregated prefill/decode servers keep the legacy ``/health`` endpoint. Control number of GPUs ---------------------- -The number of GPUs can be controlled using the options below, listed from lowest to highest priority: +GPU selection priority, from lowest to highest: + 1. ``gpus_per_node`` system property (scalar value) 2. ``decode.gpu_ids`` command argument in non-disaggregated mode when ``CUDA_VISIBLE_DEVICES`` is not set 3. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs) 4. ``gpu_ids`` command argument for both ``prefill`` and ``decode`` configurations in disaggregated mode -For backward compatibility, non-disaggregated configs that set both ``CUDA_VISIBLE_DEVICES`` and ``decode.gpu_ids`` use -``CUDA_VISIBLE_DEVICES``. In disaggregated mode (``prefill`` is set), both ``prefill`` and ``decode`` should define -``gpu_ids``, or none of them should set it. +In disaggregated mode, define both ``prefill.gpu_ids`` and ``decode.gpu_ids``, or omit both. Control disaggregation @@ -149,63 +151,37 @@ In this case ``CUDA_VISIBLE_DEVICES`` will be ignored and only the GPUs specifie Multi-node serving ------------------ -For non-disaggregated serving, set ``num_nodes`` on the test to more than one. CloudAI starts one -``sglang.launch_server`` task per serving node with a shared ``--dist-init-addr``, ``--nnodes``, and -``--node-rank "$SLURM_NODEID"``. -SGLang ``tp`` is the total tensor-parallel size for the distributed serving role. With two nodes and -``CUDA_VISIBLE_DEVICES = "0,1,2,3"`` on each node, set ``tp = 8`` to use all eight visible GPUs. +For non-disaggregated ``num_nodes > 1``, CloudAI starts one ``sglang.launch_server`` task per serving node with shared +``--dist-init-addr``, ``--nnodes``, and ``--node-rank "$SLURM_PROCID"``. -.. code-block:: toml - :caption: scenario.toml (multi-node aggregated serving) - - [[Tests]] - id = "sglang.multi_node" - num_nodes = 2 - test_template_name = "sglang" +For disaggregated serving over more than two nodes, set explicit role sizes: - [Tests.cmd_args] - docker_image_url = "lmsysorg/sglang:dev-cu13" - model = "Qwen/Qwen3-8B" +- ``prefill.num_nodes + decode.num_nodes`` must equal the test ``num_nodes``. +- CloudAI assigns contiguous node slices: prefill first, decode second. +- ``tp`` is total per role, not per node. +- ``CUDA_VISIBLE_DEVICES`` and ``gpu_ids`` are local GPU IDs on each serving node. - [Tests.cmd_args.decode] - tp = 8 - - [Tests.extra_env_vars] - CUDA_VISIBLE_DEVICES = "0,1,2,3" - -For disaggregated prefill/decode serving, existing 1-node and 2-node behavior is preserved by default. To span more -than two nodes, set both role sizes explicitly. CloudAI assigns contiguous node slices to prefill and decode and starts -one distributed SGLang launch per role with separate init ports. Benchmark and semantic validation run from the prefill -head node. -Role ``tp`` values are total per distributed role, not per node. For example, ``num_nodes = 2`` with four visible GPUs -per node uses ``tp = 8`` to consume all GPUs in that role. +Example: four prefill nodes and four decode nodes, each with four visible GPUs: .. code-block:: toml :caption: scenario.toml (multi-node disaggregated serving) [[Tests]] id = "sglang.pd_multi_node" - num_nodes = 4 + num_nodes = 8 test_template_name = "sglang" - [Tests.cmd_args] - docker_image_url = "lmsysorg/sglang:dev-cu13" - model = "Qwen/Qwen3-8B" - [Tests.cmd_args.prefill] - num_nodes = 2 - tp = 8 + num_nodes = 4 + tp = 16 [Tests.cmd_args.decode] - num_nodes = 2 - tp = 8 + num_nodes = 4 + tp = 16 [Tests.extra_env_vars] CUDA_VISIBLE_DEVICES = "0,1,2,3" -``CUDA_VISIBLE_DEVICES`` and ``gpu_ids`` are interpreted as local GPU IDs on each serving node, not as cluster-global GPU -IDs. - API Documentation ----------------- diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index c250f37f1..a57486773 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -93,15 +93,14 @@ placeholders. Controlling the Number of GPUs ------------------------------- -The number of GPUs can be controlled using the options below, listed from lowest to highest priority: +GPU selection priority, from lowest to highest: + 1. ``gpus_per_node`` system property (scalar value) 2. ``decode.gpu_ids`` command argument in non-disaggregated mode when ``CUDA_VISIBLE_DEVICES`` is not set 3. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs) 4. ``gpu_ids`` command argument for both ``prefill`` and ``decode`` configurations in disaggregated mode -For backward compatibility, non-disaggregated configs that set both ``CUDA_VISIBLE_DEVICES`` and ``decode.gpu_ids`` use -``CUDA_VISIBLE_DEVICES``. In disaggregated mode (``prefill`` is set), both ``prefill`` and ``decode`` should define -``gpu_ids``, or none of them should set it. +In disaggregated mode, define both ``prefill.gpu_ids`` and ``decode.gpu_ids``, or omit both. Controlling Disaggregation -------------------------- @@ -139,80 +138,49 @@ In this case ``CUDA_VISIBLE_DEVICES`` will be ignored and only the GPUs specifie Multi-node serving ------------------ -For non-disaggregated serving, set ``num_nodes`` on the test to more than one. CloudAI starts a Ray head on the first -allocated serving node, Ray workers on the remaining serving nodes, waits for the Ray cluster to reach the requested -size, and runs ``vllm serve`` with ``--distributed-executor-backend ray`` on the head node. -``tensor_parallel_size`` is the total tensor-parallel size across the Ray serving role. With two nodes and -``CUDA_VISIBLE_DEVICES = "0,1,2,3"`` on each node, set ``tensor_parallel_size = 8`` to use all eight visible GPUs. - -.. code-block:: toml - :caption: scenario.toml (multi-node aggregated serving) - - [[Tests]] - id = "vllm.multi_node" - num_nodes = 2 - test_template_name = "vllm" +For non-disaggregated ``num_nodes > 1``, CloudAI creates one Ray cluster and starts ``vllm serve`` on the head node with +``--distributed-executor-backend ray``. - [Tests.cmd_args] - docker_image_url = "nvcr.io/nvidia/vllm:latest" - model = "Qwen/Qwen3-0.6B" - - [Tests.cmd_args.decode] - tensor_parallel_size = 8 +For disaggregated serving over more than two nodes, set explicit role sizes: - [Tests.extra_env_vars] - CUDA_VISIBLE_DEVICES = "0,1,2,3" +- ``prefill.num_nodes + decode.num_nodes`` must equal the test ``num_nodes``. +- CloudAI assigns contiguous node slices: prefill first, decode second. +- ``tensor_parallel_size`` is total per role, not per node. +- ``CUDA_VISIBLE_DEVICES`` and ``gpu_ids`` are local GPU IDs on each serving node. -For disaggregated prefill/decode serving, existing 1-node and 2-node behavior is preserved by default. To span more -than two nodes, set both role sizes explicitly. CloudAI assigns contiguous node slices to prefill and decode, creates a -separate Ray cluster for each role whose ``num_nodes`` is greater than one, and runs benchmark and semantic validation -from the prefill head node. -Role ``tensor_parallel_size`` values are total per Ray role, not per node. For example, ``num_nodes = 2`` with four -visible GPUs per node uses ``tensor_parallel_size = 8`` to consume all GPUs in that role. +Example: four prefill nodes and four decode nodes, each with four visible GPUs: .. code-block:: toml :caption: scenario.toml (multi-node disaggregated serving) [[Tests]] id = "vllm.pd_multi_node" - num_nodes = 4 + num_nodes = 8 test_template_name = "vllm" - [Tests.cmd_args] - docker_image_url = "nvcr.io/nvidia/vllm:latest" - model = "Qwen/Qwen3-0.6B" - [Tests.cmd_args.prefill] - num_nodes = 2 - tensor_parallel_size = 8 + num_nodes = 4 + tensor_parallel_size = 16 [Tests.cmd_args.decode] - num_nodes = 2 - tensor_parallel_size = 8 + num_nodes = 4 + tensor_parallel_size = 16 [Tests.extra_env_vars] CUDA_VISIBLE_DEVICES = "0,1,2,3" -``CUDA_VISIBLE_DEVICES`` and ``gpu_ids`` are interpreted as local GPU IDs on each serving node, not as cluster-global GPU -IDs. - Readiness health checks ----------------------- -CloudAI waits for vLLM servers to become ready before starting the benchmark. The default vLLM server endpoint remains -``/healthcheck`` for backward compatibility with existing configs and runtime images. Generated Slurm scripts wait for -the configured endpoint exactly. - -Use ``serve_healthcheck`` to override the readiness endpoint for the vLLM serve process, including prefill/decode server -processes in disaggregated mode. If ``serve_healthcheck`` is not set, aggregated serving uses ``healthcheck``. -Disaggregated prefill/decode serving keeps the legacy ``/health`` default. +Healthcheck fields: -In disaggregated mode, ``proxy_healthcheck`` controls the proxy/router readiness endpoint. Existing disaggregated -configs that set ``healthcheck`` and do not set ``proxy_healthcheck`` continue to use ``healthcheck`` for the -proxy/router check. +- ``healthcheck``: aggregated server endpoint, default ``/healthcheck``. +- ``serve_healthcheck``: optional override for serve, prefill, and decode servers. +- ``proxy_healthcheck``: disaggregated proxy/router endpoint, default ``/healthcheck``. -For custom runtime images with a different readiness path, set ``serve_healthcheck`` for vLLM server processes and, -when using disaggregated mode, ``proxy_healthcheck`` for the proxy/router. +If ``serve_healthcheck`` is omitted, disaggregated prefill/decode servers keep the legacy ``/health`` endpoint. If a +disaggregated config sets ``healthcheck`` but omits ``proxy_healthcheck``, the proxy/router uses ``healthcheck`` for +backward compatibility. Controlling ``proxy_script`` diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py index 7719cca53..f1e576ac5 100644 --- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py @@ -134,6 +134,13 @@ def get_semantic_eval_command(self) -> list[str] | None: cli = self._expand_semantic_eval_args(eval_args.cli, host=host) return [eval_args.entrypoint, cli] if cli else [eval_args.entrypoint] + def serve_healthcheck(self, role: str) -> str: + if self.tdef.cmd_args.serve_healthcheck: + return self.tdef.cmd_args.serve_healthcheck + if role in {"prefill", "decode"}: + return "/health" + return self.tdef.cmd_args.healthcheck + def aggregated_serve_env(self) -> dict[str, str]: return {"CUDA_VISIBLE_DEVICES": ",".join(str(gpu_id) for gpu_id in self.gpu_ids)} diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 3f9be8285..f920fbcc0 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -188,26 +188,36 @@ def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: gpu_count=local_gpu_count * num_nodes, ) - prefill_nodes = 1 - decode_nodes = 1 - if num_nodes > 2: - prefill_nodes_value = self.cmd_args.prefill.num_nodes - decode_nodes_value = self.cmd_args.decode.num_nodes - if not isinstance(prefill_nodes_value, int) or not isinstance(decode_nodes_value, int): + prefill_nodes_value = self.cmd_args.prefill.num_nodes + decode_nodes_value = self.cmd_args.decode.num_nodes + if prefill_nodes_value is None and decode_nodes_value is None: + if num_nodes > 2: logging.error( "vLLM disaggregated mode over more than 2 nodes requires both prefill.num_nodes and " "decode.num_nodes." ) return False - if prefill_nodes_value + decode_nodes_value != num_nodes: - logging.error( - "vLLM disaggregated role node counts must sum to allocated nodes. prefill=%s decode=%s " - "allocated=%s", - prefill_nodes_value, - decode_nodes_value, - num_nodes, - ) - return False + prefill_nodes = 1 + decode_nodes = 1 + elif not isinstance(prefill_nodes_value, int) or not isinstance(decode_nodes_value, int): + logging.error("vLLM disaggregated role node counts must both be single integers or both be omitted.") + return False + elif prefill_nodes_value <= 0 or decode_nodes_value <= 0: + logging.error( + "vLLM disaggregated role node counts must be positive integers. prefill=%s decode=%s", + prefill_nodes_value, + decode_nodes_value, + ) + return False + elif prefill_nodes_value + decode_nodes_value != num_nodes: + logging.error( + "vLLM disaggregated role node counts must sum to allocated nodes. prefill=%s decode=%s allocated=%s", + prefill_nodes_value, + decode_nodes_value, + num_nodes, + ) + return False + else: prefill_nodes = prefill_nodes_value decode_nodes = decode_nodes_value diff --git a/tests/ref_data/sglang-disagg-2nodes.sbatch b/tests/ref_data/sglang-disagg-2nodes.sbatch index 47cc35d88..8ab9ee091 100644 --- a/tests/ref_data/sglang-disagg-2nodes.sbatch +++ b/tests/ref_data/sglang-disagg-2nodes.sbatch @@ -80,8 +80,8 @@ srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --c DECODE_PID=$! echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/v1/models" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/v1/models" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ diff --git a/tests/ref_data/sglang-disagg.sbatch b/tests/ref_data/sglang-disagg.sbatch index 2c11daaba..1d737f556 100644 --- a/tests/ref_data/sglang-disagg.sbatch +++ b/tests/ref_data/sglang-disagg.sbatch @@ -80,8 +80,8 @@ srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --c DECODE_PID=$! echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8400/v1/models" || exit 1 -wait_for_health "http://${DECODE_NODE}:8500/v1/models" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." srun --export=ALL --mpi=none --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --nodelist="${PREFILL_NODE}" --nodes=1 --ntasks=1 --ntasks-per-node=1 \ diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py index 01e6f5be6..3519e5ac2 100644 --- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py +++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py @@ -148,6 +148,18 @@ def test_gen_srun_command_contains_sglang_semantic_eval_in_disagg( assert "python3 -m sglang.test.run_eval --host ${PREFILL_NODE} --port 8000" in srun_command +def test_disaggregated_server_healthcheck_defaults_to_legacy_health_endpoint( + sglang_disagg_tr: TestRun, slurm_system: SlurmSystem +) -> None: + strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) + + srun_command = strategy._gen_srun_command() + + assert 'wait_for_health "http://${PREFILL_NODE}:8100/health"' in srun_command + assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command + assert 'wait_for_health "http://${PREFILL_NODE}:8000/v1/models"' in srun_command + + def test_disagg_more_than_two_nodes_is_rejected(sglang_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: sglang_disagg_tr.num_nodes = 3 strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) diff --git a/tests/workloads/vllm/test_workload.py b/tests/workloads/vllm/test_workload.py index 78e7d23d1..d16ac23a6 100644 --- a/tests/workloads/vllm/test_workload.py +++ b/tests/workloads/vllm/test_workload.py @@ -138,6 +138,25 @@ def test_constraint_check_uses_all_allocated_gpus_for_multinode_aggregated(tmp_p assert tdef.constraint_check(tr, slurm_system) is True +def test_constraint_check_rejects_explicit_disagg_role_nodes_that_do_not_match_two_node_allocation( + tmp_path, slurm_system: SlurmSystem +) -> None: + tdef = VllmTestDefinition( + name="test", + description="test", + test_template_name="vllm", + cmd_args=VllmCmdArgs( + docker_image_url="test_url", + prefill=VllmArgs.model_validate({"num_nodes": 2}), + decode=VllmArgs.model_validate({"num_nodes": 1}), + ), + ) + tr = TestRun(name="vllm", test=tdef, num_nodes=2, nodes=[], output_path=tmp_path) + slurm_system.gpus_per_node = 4 + + assert tdef.constraint_check(tr, slurm_system) is False + + def test_constraint_check_uses_role_nodes_for_multinode_disagg(tmp_path, slurm_system: SlurmSystem) -> None: tdef = VllmTestDefinition( name="test", From 0a98469785bc906c19030756816f6895dffbb4de Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 12 Jun 2026 14:37:15 +0200 Subject: [PATCH 25/26] resolve ai comments --- .../vllm/slurm_command_gen_strategy.py | 2 ++ src/cloudai/workloads/vllm/vllm.py | 3 +++ .../vllm/test_command_gen_strategy_slurm.py | 5 ++++- tests/workloads/vllm/test_workload.py | 17 +++++++++++++++++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index eb86d0dfb..ff336800a 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -58,6 +58,8 @@ def _needs_ray(self, role: str) -> bool: def _format_ray_value(value: Any) -> str: if isinstance(value, dict): return shlex.quote(json.dumps(value, separators=(",", ":"))) + if isinstance(value, str): + return shlex.quote(value) return str(value) @classmethod diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index f920fbcc0..af5b815af 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -209,6 +209,9 @@ def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: decode_nodes_value, ) return False + elif num_nodes == 1 and prefill_nodes_value == 1 and decode_nodes_value == 1: + prefill_nodes = 1 + decode_nodes = 1 elif prefill_nodes_value + decode_nodes_value != num_nodes: logging.error( "vLLM disaggregated role node counts must sum to allocated nodes. prefill=%s decode=%s allocated=%s", diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index e3f3890d5..865f52989 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -409,6 +409,7 @@ def test_ray_head_and_worker_topology_args_can_be_overridden( "port": 9123, "num_gpus": 4, "dashboard_host": "0.0.0.0", + "temp_dir": "/tmp/ray with spaces", } ) tdef.cmd_args.prefill.ray_worker = VllmRayStartArgs.model_validate( @@ -423,7 +424,9 @@ def test_ray_head_and_worker_topology_args_can_be_overridden( srun_command = strategy._gen_srun_command() - assert "ray start --port=9123 --num-gpus=4 --dashboard-host=0.0.0.0" in srun_command + assert ( + "ray start --port=9123 --num-gpus=4 --dashboard-host=0.0.0.0 --temp-dir='\"'\"'/tmp/ray with spaces'\"'\"'" + ) in srun_command assert "ray start --head --port=9123" not in srun_command assert "ray start --address=custom-prefill-head:9123 --num-gpus=4" in srun_command assert "ray start --address=custom-prefill-head:9123 --block" not in srun_command diff --git a/tests/workloads/vllm/test_workload.py b/tests/workloads/vllm/test_workload.py index d16ac23a6..6d123e56b 100644 --- a/tests/workloads/vllm/test_workload.py +++ b/tests/workloads/vllm/test_workload.py @@ -157,6 +157,23 @@ def test_constraint_check_rejects_explicit_disagg_role_nodes_that_do_not_match_t assert tdef.constraint_check(tr, slurm_system) is False +def test_constraint_check_allows_explicit_single_node_disagg_role_nodes(tmp_path, slurm_system: SlurmSystem) -> None: + tdef = VllmTestDefinition( + name="test", + description="test", + test_template_name="vllm", + cmd_args=VllmCmdArgs( + docker_image_url="test_url", + prefill=VllmArgs.model_validate({"num_nodes": 1, "gpu_ids": "0,1", "tensor_parallel_size": 2}), + decode=VllmArgs.model_validate({"num_nodes": 1, "gpu_ids": "2,3", "tensor_parallel_size": 2}), + ), + ) + tr = TestRun(name="vllm", test=tdef, num_nodes=1, nodes=[], output_path=tmp_path) + slurm_system.gpus_per_node = 4 + + assert tdef.constraint_check(tr, slurm_system) is True + + def test_constraint_check_uses_role_nodes_for_multinode_disagg(tmp_path, slurm_system: SlurmSystem) -> None: tdef = VllmTestDefinition( name="test", From dd1d605a65a44a5123590dbb44abb6ea548abfc1 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 12 Jun 2026 18:31:11 +0200 Subject: [PATCH 26/26] revert quotation --- .../vllm/slurm_command_gen_strategy.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index ff336800a..4b8470cd1 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -55,15 +55,15 @@ def _needs_ray(self, role: str) -> bool: return self.role_node_count(role) > 1 @staticmethod - def _format_ray_value(value: Any) -> str: + def _format_ray_value(value: Any, *, quote_strings: bool = True) -> str: if isinstance(value, dict): return shlex.quote(json.dumps(value, separators=(",", ":"))) - if isinstance(value, str): + if quote_strings and isinstance(value, str): return shlex.quote(value) return str(value) @classmethod - def _serialize_ray_start_args(cls, args: dict[str, Any]) -> str: + def _serialize_ray_start_args(cls, args: dict[str, Any], *, quote_strings: bool = True) -> str: parts: list[str] = [] for key, value in args.items(): if value is None: @@ -73,7 +73,7 @@ def _serialize_ray_start_args(cls, args: dict[str, Any]) -> str: if value: parts.append(opt) continue - parts.append(f"{opt}={cls._format_ray_value(value)}") + parts.append(f"{opt}={cls._format_ray_value(value, quote_strings=quote_strings)}") return " ".join(parts) def _role_args(self, role: str) -> VllmArgs: @@ -87,13 +87,19 @@ def _ray_start_args(self, role: str, kind: str, generated: dict[str, Any]) -> st role_args = self._role_args(role) ray_args: VllmRayStartArgs | None = role_args.ray_head if kind == "head" else role_args.ray_worker if ray_args is None: - return self._serialize_ray_start_args(generated) + return self._serialize_ray_start_args(generated, quote_strings=False) fields_set = ray_args.model_fields_set user_args = ray_args.model_dump(exclude_none=True) - merged_args = {key: value for key, value in generated.items() if key not in fields_set} - merged_args.update(user_args) - return self._serialize_ray_start_args(merged_args) + generated_args = {key: value for key, value in generated.items() if key not in fields_set} + return " ".join( + part + for part in ( + self._serialize_ray_start_args(generated_args, quote_strings=False), + self._serialize_ray_start_args(user_args), + ) + if part + ) def get_serve_commands(self) -> list[list[str]]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test)