NVIDIA · podkidyshev · Jun 15, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "sglang-heavy-multinode"
+
+[[Tests]]  # 8-node disaggregated run: 4 prefill nodes + 4 decode nodes
+id = "sglang.heavy.disagg.8nodes.4p4d"
+test_name = "sglang"
+num_nodes = 8  # must equal prefill.num_nodes + decode.num_nodes (4 + 4)
+time_limit = "01:30:00"
+
+  [Tests.cmd_args.prefill]
+  num_nodes = 4
+  gpu_ids = "0,1,2,3"  # local GPU IDs on each prefill node
+  tp = 16  # total for the whole role: 4 nodes x 4 GPUs per node
+  mem_fraction_static = 0.75
+
+  [Tests.cmd_args.decode]
+  num_nodes = 4
+  gpu_ids = "0,1,2,3"  # local GPU IDs on each decode node
+  tp = 16  # total for the whole role: 4 nodes x 4 GPUs per node
+  mem_fraction_static = 0.75
+
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"  # lower priority than the per-role gpu_ids set above
@@ -23,17 +23,25 @@ num_nodes = 2
 time_limit = "00:10:00"
 
   [Tests.cmd_args.decode]
+  tensor_parallel_size = 8
   mem_fraction_static = 0.75
 
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"
+
 [[Tests]]
 id = "sglang.agg.1node"
 test_name = "sglang"
 num_nodes = 1
 time_limit = "00:10:00"
 
   [Tests.cmd_args.decode]
+  tensor_parallel_size = 4
   mem_fraction_static = 0.75
 
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"
+
 [[Tests]]
 id = "sglang.disagg.sync"
 test_name = "sglang"
@@ -50,6 +58,9 @@ time_limit = "00:10:00"
   tensor_parallel_size = 2
   mem_fraction_static = 0.75
 
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"
+
 [[Tests]]
 id = "sglang.disagg.async"
 test_name = "sglang"
@@ -66,6 +77,9 @@ time_limit = "00:10:00"
   tensor_parallel_size = 2
   mem_fraction_static = 0.75
 
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"
+
 [[Tests]]
 id = "sglang.disagg.2nodes"
 test_name = "sglang"
@@ -79,3 +93,6 @@ time_limit = "00:10:00"
   [Tests.cmd_args.decode]
   tensor_parallel_size = 4
   mem_fraction_static = 0.75
+
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"
@@ -25,6 +25,7 @@ mount_as = "/vllm_repo"
 
 [cmd_args]
 docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1"
+serve_healthcheck = "/health"
 
 [semantic_eval_cmd_args]
 entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"

@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "vllm-heavy-multinode"
+
+[[Tests]]  # 8-node disaggregated run: 4 prefill nodes + 4 decode nodes
+id = "vllm.heavy.disagg.8nodes.4p4d"
+test_name = "vllm"
+num_nodes = 8  # must equal prefill.num_nodes + decode.num_nodes (4 + 4)
+time_limit = "01:30:00"
+
+  [Tests.cmd_args]
+  model = "Qwen/Qwen3-8B"
+
+    [Tests.cmd_args.prefill]
+    num_nodes = 4
+    gpu_ids = "0,1,2,3"  # local GPU IDs on each prefill node
+    enforce_eager = ""  # NOTE(review): empty value presumably emits a bare flag with no argument - confirm
+    tensor_parallel_size = 16  # total for the whole role: 4 nodes x 4 GPUs per node
+    max_num_batched_tokens = 8192
+    max_model_len = 8192
+
+      [Tests.cmd_args.prefill.ray_head]
+      num_gpus = 4  # NOTE(review): presumably GPUs offered to Ray on the head node; equals the 4 gpu_ids - confirm
+
+      [Tests.cmd_args.prefill.ray_worker]
+      num_gpus = 4  # NOTE(review): presumably GPUs offered to Ray on each worker node - confirm
+
+    [Tests.cmd_args.decode]
+    num_nodes = 4
+    gpu_ids = "0,1,2,3"  # local GPU IDs on each decode node
+    enforce_eager = ""  # NOTE(review): empty value presumably emits a bare flag with no argument - confirm
+    tensor_parallel_size = 16  # total for the whole role: 4 nodes x 4 GPUs per node
+    max_num_batched_tokens = 8192
+    max_model_len = 8192
+
+      [Tests.cmd_args.decode.ray_head]
+      num_gpus = 4  # NOTE(review): presumably GPUs offered to Ray on the head node; equals the 4 gpu_ids - confirm
+
+      [Tests.cmd_args.decode.ray_worker]
+      num_gpus = 4  # NOTE(review): presumably GPUs offered to Ray on each worker node - confirm
+
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"  # lower priority than the per-role gpu_ids set above
@@ -16,6 +16,34 @@
 
 name = "vllm"
 
+[[Tests]]  # aggregated (no prefill table) single-node run
+id = "vllm.agg.1node"
+test_name = "vllm"
+num_nodes = 1
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.decode]
+  enforce_eager = ""  # NOTE(review): empty value presumably emits a bare flag with no argument - confirm
+  tensor_parallel_size = 4  # equals 1 node x 4 visible GPUs
+  max_num_batched_tokens = 1024
+
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"  # local GPU IDs on the serving node
+
+[[Tests]]  # aggregated (no prefill table) two-node run
+id = "vllm.agg.2nodes"
+test_name = "vllm"
+num_nodes = 2
+time_limit = "00:30:00"
+
+  [Tests.cmd_args.decode]
+  enforce_eager = ""  # NOTE(review): empty value presumably emits a bare flag with no argument - confirm
+  tensor_parallel_size = 8  # equals 2 nodes x 4 visible GPUs
+  max_num_batched_tokens = 1024
+
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"  # local GPU IDs on each serving node
+
 [[Tests]]
 id = "vllm.disagg.sync"
 test_name = "vllm"
@@ -24,14 +52,17 @@ time_limit = "00:30:00"
 
   [Tests.cmd_args.prefill]
   enforce_eager = ""
-  tensor_parallel_size = 2
+  tensor_parallel_size = 4
   max_num_batched_tokens = 1024
 
   [Tests.cmd_args.decode]
   enforce_eager = ""
-  tensor_parallel_size = 2
+  tensor_parallel_size = 4
   max_num_batched_tokens = 1024
 
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"
+
 [[Tests]]
 id = "vllm.disagg.async"
 test_name = "vllm"
@@ -41,11 +72,14 @@ time_limit = "00:10:00"
   [Tests.cmd_args.prefill]
   gpu_ids = "0,1"
   enforce_eager = ""
-  tensor_parallel_size = 1
+  tensor_parallel_size = 2
   max_num_batched_tokens = 1024
 
   [Tests.cmd_args.decode]
   gpu_ids = "2,3"
   enforce_eager = ""
   tensor_parallel_size = 2
   max_num_batched_tokens = 1024
+
+  [Tests.extra_env_vars]
+  CUDA_VISIBLE_DEVICES = "0,1,2,3"
@@ -96,12 +96,26 @@ The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{o
 placeholders.
 
 
+Readiness health checks
+-----------------------
+Healthcheck fields:
+
+- ``healthcheck``: aggregated server and disaggregated router endpoint, default ``/v1/models``.
+- ``serve_healthcheck``: optional override for serve, prefill, and decode servers.
+
+If ``serve_healthcheck`` is omitted, disaggregated prefill/decode servers keep the legacy ``/health`` endpoint.
+
+
 Control number of GPUs
 ----------------------
-The number of GPUs can be controlled using the options below, listed from lowest to highest priority:
+GPU selection priority, from lowest to highest:
+
 1. ``gpus_per_node`` system property (scalar value)
-2. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs)
-3. ``gpu_ids`` command argument for ``prefill`` and ``decode`` configurations (comma-separated list of GPU IDs). If disaggregated mode is used (``prefill`` is set), both ``prefill`` and ``decode`` should define ``gpu_ids``, or none of them should set it.
+2. ``decode.gpu_ids`` command argument in non-disaggregated mode when ``CUDA_VISIBLE_DEVICES`` is not set
+3. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs)
+4. ``gpu_ids`` command argument for both ``prefill`` and ``decode`` configurations in disaggregated mode
+
+In disaggregated mode, define both ``prefill.gpu_ids`` and ``decode.gpu_ids``, or omit both.
 
 
 Control disaggregation
@@ -135,6 +149,39 @@ For more control, one can specify the GPU IDs explicitly in ``prefill`` and ``de
 
 In this case ``CUDA_VISIBLE_DEVICES`` will be ignored and only the GPUs specified in ``gpu_ids`` will be used.
 
+Multi-node serving
+------------------
+For non-disaggregated runs with ``num_nodes > 1``, CloudAI starts one ``sglang.launch_server`` task per serving node,
+all sharing ``--dist-init-addr`` and ``--nnodes`` and each passing ``--node-rank "$SLURM_PROCID"``.
+
+For disaggregated serving over more than two nodes, set explicit role sizes:
+
+- ``prefill.num_nodes + decode.num_nodes`` must equal the test ``num_nodes``.
+- CloudAI assigns contiguous node slices: prefill first, decode second.
+- ``tp`` is the total tensor-parallel size for the whole role (all of its nodes combined), not a per-node value.
+- ``CUDA_VISIBLE_DEVICES`` and ``gpu_ids`` are local GPU IDs on each serving node.
+
+Example: four prefill nodes and four decode nodes, each with four visible GPUs:
+
+.. code-block:: toml
+   :caption: scenario.toml (multi-node disaggregated serving)
+
+   [[Tests]]
+   id = "sglang.pd_multi_node"
+   num_nodes = 8
+   test_template_name = "sglang"
+
+   [Tests.cmd_args.prefill]
+   num_nodes = 4
+   tp = 16
+
+   [Tests.cmd_args.decode]
+   num_nodes = 4
+   tp = 16
+
+   [Tests.extra_env_vars]
+   CUDA_VISIBLE_DEVICES = "0,1,2,3"
+
 API Documentation
 -----------------
 

@@ -93,11 +93,14 @@ placeholders.
 
 Controlling the Number of GPUs
 -------------------------------
-The number of GPUs can be controlled using the options below, listed from lowest to highest priority:
+GPU selection priority, from lowest to highest:
+
 1. ``gpus_per_node`` system property (scalar value)
-2. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs)
-3. ``gpu_ids`` command argument for ``prefill`` and ``decode`` configurations (comma-separated list of GPU IDs). If disaggregated mode is used (``prefill`` is set), both ``prefill`` and ``decode`` should define ``gpu_ids``, or none of them should set it.
+2. ``decode.gpu_ids`` command argument in non-disaggregated mode when ``CUDA_VISIBLE_DEVICES`` is not set
+3. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs)
+4. ``gpu_ids`` command argument for both ``prefill`` and ``decode`` configurations in disaggregated mode
 
+In disaggregated mode, define both ``prefill.gpu_ids`` and ``decode.gpu_ids``, or omit both.
 
 Controlling Disaggregation
 --------------------------
@@ -133,6 +136,53 @@ For more control, users can specify the GPU IDs explicitly in ``prefill`` and ``
 In this case ``CUDA_VISIBLE_DEVICES`` will be ignored and only the GPUs specified in ``gpu_ids`` will be used.
 
 
+Multi-Node Serving
+------------------
+For non-disaggregated runs with ``num_nodes > 1``, CloudAI creates one Ray cluster and starts ``vllm serve`` on the head
+node with ``--distributed-executor-backend ray``.
+
+For disaggregated serving over more than two nodes, set explicit role sizes:
+
+- ``prefill.num_nodes + decode.num_nodes`` must equal the test ``num_nodes``.
+- CloudAI assigns contiguous node slices: prefill first, decode second.
+- ``tensor_parallel_size`` is the total for the whole role (all of its nodes combined), not a per-node value.
+- ``CUDA_VISIBLE_DEVICES`` and ``gpu_ids`` are local GPU IDs on each serving node.
+
+Example: four prefill nodes and four decode nodes, each with four visible GPUs:
+
+.. code-block:: toml
+   :caption: scenario.toml (multi-node disaggregated serving)
+
+   [[Tests]]
+   id = "vllm.pd_multi_node"
+   num_nodes = 8
+   test_template_name = "vllm"
+
+   [Tests.cmd_args.prefill]
+   num_nodes = 4
+   tensor_parallel_size = 16
+
+   [Tests.cmd_args.decode]
+   num_nodes = 4
+   tensor_parallel_size = 16
+
+   [Tests.extra_env_vars]
+   CUDA_VISIBLE_DEVICES = "0,1,2,3"
+
+
+Readiness Health Checks
+-----------------------
+Healthcheck fields:
+
+- ``healthcheck``: aggregated server endpoint, default ``/healthcheck``.
+- ``serve_healthcheck``: optional override for serve, prefill, and decode servers.
+- ``proxy_healthcheck``: disaggregated proxy/router endpoint, default ``/healthcheck``.
+
+If ``serve_healthcheck`` is omitted, disaggregated prefill/decode servers keep the legacy ``/health`` endpoint. If a
+disaggregated config sets ``healthcheck`` but omits ``proxy_healthcheck``, the proxy/router uses ``healthcheck`` for
+backward compatibility.
+
+
 Controlling ``proxy_script``
 -----------------------------
 ``proxy_script`` is used to proxy the requests from the client to the prefill and decode instances. It is ignored for non-disaggregated mode. Default value can be found below.