NVIDIA · podkidyshev · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
@@ -20,8 +20,8 @@ test_template_name = "AIDynamo"
 extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
 
 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
-workloads = "genai_perf.sh"
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.1.1"
+workloads = "aiperf.sh"
 
   [cmd_args.dynamo]
   backend = "sglang"
@@ -32,7 +32,7 @@ workloads = "genai_perf.sh"
     num-nodes = 1
     cmd = 'python3 -m dynamo.sglang'
     extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
-    worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
+    worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
 
       [cmd_args.dynamo.prefill_worker.args]
       page-size = 16
@@ -48,7 +48,7 @@ workloads = "genai_perf.sh"
     num-nodes = 1
     cmd = 'python3 -m dynamo.sglang'
     extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
-    worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
+    worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
 
       [cmd_args.dynamo.decode_worker.args]
       page-size = 16
@@ -94,18 +94,38 @@ workloads = "genai_perf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
+  setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
 
     [cmd_args.aiperf.args]
     concurrency = 2
+    extra-inputs = '{"min_tokens":10}'
+    output-tokens-mean = 500
     request-count = 50
     synthetic-input-tokens-mean = 300
-    output-tokens-mean = 500
+
+  [cmd_args.aiperf_accuracy]
+  entrypoint = "aiperf profile"
+  setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+  cli = '''
+--model {model}
+--url {url}
+--endpoint-type chat
+--streaming
+--artifact-dir {artifact_dir}
+--no-server-metrics
+--accuracy-benchmark mmlu
+--accuracy-n-shots 5
+--accuracy-tasks abstract_algebra
+--concurrency 10
+--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--num-requests 100
+'''
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
-HF_HUB_OFFLINE = "1"
-TRANSFORMERS_OFFLINE = "1"
-HF_DATASETS_OFFLINE = "1"
+HF_HUB_OFFLINE = "0"
+TRANSFORMERS_OFFLINE = "0"
+HF_DATASETS_OFFLINE = "0"
 DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
 UCX_TLS = "all"
 #DYN_LOGGING_JSONL="true"

@@ -20,8 +20,8 @@ test_template_name = "AIDynamo"
 extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
 
 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1"
-workloads = "genai_perf.sh"
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1"
+workloads = "aiperf.sh"
 
   [cmd_args.dynamo]
   backend = "vllm"
@@ -38,6 +38,7 @@ workloads = "genai_perf.sh"
       tensor-parallel-size = 8
       pipeline-parallel-size = 1
       data-parallel-size = 1
+      kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
 
     [cmd_args.dynamo.decode_worker]
     num-nodes = 1
@@ -50,6 +51,7 @@ workloads = "genai_perf.sh"
       tensor-parallel-size = 8
       pipeline-parallel-size = 1
       data-parallel-size = 1
+      kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
 
   [cmd_args.lmcache]
   controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"
@@ -86,17 +88,35 @@ workloads = "genai_perf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
-
     [cmd_args.aiperf.args]
     concurrency = 2
+    extra-inputs = '{"min_tokens":10}'
+    output-tokens-mean = 500
     request-count = 50
     synthetic-input-tokens-mean = 300
-    output-tokens-mean = 500
+
+  [cmd_args.aiperf_accuracy]
+  entrypoint = "aiperf profile"
+  setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+  cli = '''
+--model {model}
+--url {url}
+--endpoint-type chat
+--streaming
+--artifact-dir {artifact_dir}
+--no-server-metrics
+--accuracy-benchmark mmlu
+--accuracy-n-shots 5
+--accuracy-tasks abstract_algebra
+--concurrency 10
+--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--num-requests 100
+'''
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
-HF_HUB_OFFLINE = "1"
-TRANSFORMERS_OFFLINE = "1"
-HF_DATASETS_OFFLINE = "1"
+HF_HUB_OFFLINE = "0"
+TRANSFORMERS_OFFLINE = "0"
+HF_DATASETS_OFFLINE = "0"
 DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
 UCX_TLS = "all"
@@ -15,11 +15,12 @@
 # limitations under the License.
 
 name = "dynamo_sglang"
+job_status_check = false
 
 [[Tests]]
-id = "sglang-Qwen3-0.6B"
+id = "test.disagg.single-node"
 test_name = "sglang"
-time_limit = "00:20:00"
+time_limit = "00:10:00"
 
   [Tests.cmd_args]
     [Tests.cmd_args.dynamo]
@@ -37,3 +38,25 @@ time_limit = "00:20:00"
 
         [Tests.cmd_args.dynamo.decode_worker.args]
         tensor-parallel-size = 1
+
+[[Tests]]
+id = "test.disagg.multinode"
+test_name = "sglang"
+time_limit = "00:10:00"
+
+  [Tests.cmd_args]
+    [Tests.cmd_args.dynamo]
+    model = "Qwen/Qwen3-0.6B"
+    node-setup-cmd = "hostname"
+
+      [Tests.cmd_args.dynamo.prefill_worker]
+      num-nodes = 2
+
+        [Tests.cmd_args.dynamo.prefill_worker.args]
+        tensor-parallel-size = 1
+
+      [Tests.cmd_args.dynamo.decode_worker]
+      num-nodes = 2
+
+        [Tests.cmd_args.dynamo.decode_worker.args]
+        tensor-parallel-size = 1
@@ -22,8 +22,8 @@ test_template_name = "sglang"
 docker_image_url = "lmsysorg/sglang:dev-cu13"
 
 [semantic_eval_cmd_args]
-module = "sglang.test.run_eval"
-args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+entrypoint = "python3 -m sglang.test.run_eval"
+cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
 [extra_env_vars]
 UCX_NET_DEVICES = "all"

@@ -27,8 +27,8 @@ mount_as = "/vllm_repo"
 docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1"
 
 [semantic_eval_cmd_args]
-script = "/vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
-args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
+cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 [extra_env_vars]
 UCX_NET_DEVICES = "all"

@@ -87,14 +87,14 @@ The frontend node will initially wait to allow weight loading on all nodes. Once
 Choosing a Benchmark Tool
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The benchmark tool is controlled by the ``workloads`` field in the test TOML. The default is ``aiperf.sh``:
+The benchmark tool is controlled by the ``workloads`` field in the test TOML. Set it to ``aiperf.sh`` to use AIPerf:
 
 .. code-block:: toml
 
    [cmd_args]
-   workloads = "aiperf.sh"   # default — uses aiperf, writes aiperf_report.csv
+   workloads = "aiperf.sh"   # uses aiperf, writes aiperf_report.csv
 
-To use genai-perf instead, set:
+To use genai-perf, set:
 
 .. code-block:: toml
 
@@ -110,17 +110,88 @@ To use genai-perf instead, set:
      output-tokens-mean = 500
      request-count = 50
 
+Semantic Degradation With AIPerf Accuracy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it with
+``[cmd_args.aiperf_accuracy]``. This runs after the configured performance workload, so it can be used with either
+``aiperf.sh`` or ``genai_perf.sh``:
+
+.. code-block:: toml
+
+   [cmd_args]
+   workloads = "aiperf.sh"
+
+   [cmd_args.aiperf]
+     [cmd_args.aiperf.args]
+     request-count = 50
+     synthetic-input-tokens-mean = 300
+     output-tokens-mean = 500
+     concurrency = 2
+
+   [cmd_args.aiperf_accuracy]
+   entrypoint = "aiperf profile"
+   setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+   cli = '''
+   --model {model}
+   --url {url}
+   --endpoint-type chat
+   --streaming
+   --artifact-dir {artifact_dir}
+   --no-server-metrics
+   --accuracy-benchmark mmlu
+   --accuracy-n-shots 5
+   --accuracy-tasks abstract_algebra
+   --concurrency 10
+   --extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+   --num-requests 100
+   '''
+
+When ``cmd_args.aiperf_accuracy`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes
+the ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt
+and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark.
+
+The ``entrypoint`` and ``cli`` fields form the accuracy command. CloudAI expands ``{model}``, ``{url}``,
+``{endpoint}``, ``{result_dir}``, and ``{artifact_dir}`` in ``cli`` before launching it. The ``setup-cmd`` field is
+optional. It is useful for Dynamo images that include an older system ``aiperf`` build without the accuracy benchmark
+plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``.
+MMLU is loaded from ``lighteval/mmlu``, so either allow Hugging Face dataset access or pre-cache that dataset before
+running with ``HF_HUB_OFFLINE=1`` or ``HF_DATASETS_OFFLINE=1``.
+For Qwen3 models, the example disables thinking mode so short MMLU answers can be parsed as choices.
+
+Custom Accuracy Scripts
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``cmd_args.aiperf_accuracy`` can also launch a custom mounted script instead of AIPerf. Mount the script or its parent
+directory with ``extra_container_mounts`` and set ``entrypoint`` to the in-container command:
+
+.. code-block:: toml
+
+   extra_container_mounts = ["/host/custom_accuracy:/custom_accuracy"]
+
+   [cmd_args.aiperf_accuracy]
+   entrypoint = "python /custom_accuracy/dummy_accuracy.py"
+   cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping"
+
+CloudAI expands placeholders in ``cli`` and runs ``entrypoint`` with that CLI string. The custom command must write
+``accuracy_results.csv`` inside ``{artifact_dir}`` with an ``OVERALL`` row. CloudAI copies that file to the run output
+directory and exposes the same ``accuracy`` metric as AIPerf accuracy mode.
+
 Review Benchmark Results
 ------------------------
 
 After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field:
 
-- ``aiperf.sh`` (default) → ``aiperf_report.csv``
+- ``aiperf.sh`` → ``aiperf_report.csv``
 - ``genai_perf.sh`` → ``genai_perf_report.csv``
+- ``[cmd_args.aiperf_accuracy]`` (when configured, in addition to the workload report) → ``accuracy_results.csv``
+
+If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_accuracy_artifacts/accuracy_results.csv`` to
+``accuracy_results.csv`` in the run output directory and marks the run as failed if that file is not produced.
 
 Navigate to ``./results/<scenario>/<test-id>/0/`` and open the CSV to examine performance metrics.
 
-Example ``aiperf_report.csv`` (default):
+Example ``aiperf_report.csv``:
 
 ::
 

@@ -29,8 +29,8 @@ Test + Scenario example
    num_prompts = 30
 
    [semantic_eval_cmd_args]
-   module = "sglang.test.run_eval"
-   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+   entrypoint = "python3 -m sglang.test.run_eval"
+   cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
 
 .. code-block:: toml
@@ -81,18 +81,19 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva
    :caption: test.toml (semantic validation)
 
    [semantic_eval_cmd_args]
-   module = "sglang.test.run_eval"
-   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+   entrypoint = "python3 -m sglang.test.run_eval"
+   cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
-For images that still use the legacy SGLang GSM8K runner, override the module and raw arguments:
+For images that still use the legacy SGLang GSM8K runner, override the entrypoint and raw CLI:
 
 .. code-block:: toml
 
    [semantic_eval_cmd_args]
-   module = "sglang.test.few_shot_gsm8k"
-   args = "--num-questions 200"
+   entrypoint = "python3 -m sglang.test.few_shot_gsm8k"
+   cli = "--host {host} --port {port} --num-questions 200"
 
-The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}``
+placeholders.
 
 
 Control number of GPUs

@@ -29,8 +29,8 @@ Test and Scenario Examples
    num_prompts = 30
 
    [semantic_eval_cmd_args]
-   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
-   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+   entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 
 .. code-block:: toml
@@ -81,13 +81,14 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva
    :caption: test.toml (semantic validation)
 
    [semantic_eval_cmd_args]
-   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
-   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+   entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 If the runtime image does not contain the eval script, mount a vLLM repository with existing ``git_repos`` support and
-point ``script`` at the mounted path.
+point the script path in ``entrypoint`` at the mounted location (for example, ``python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py``).
 
-The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}``
+placeholders.
 
 
 Controlling the Number of GPUs

@@ -19,6 +19,7 @@
     AIDynamoCmdArgs,
     AIDynamoTestDefinition,
     AIPerf,
+    AIPerfAccuracy,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
@@ -37,6 +38,7 @@
     "AIDynamoSlurmCommandGenStrategy",
     "AIDynamoTestDefinition",
     "AIPerf",
+    "AIPerfAccuracy",
     "GenAIPerf",
     "LMCache",
     "LMCacheArgs",