Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 28 additions & 8 deletions conf/experimental/ai_dynamo/test/sglang.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ test_template_name = "AIDynamo"
extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]

[cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
workloads = "genai_perf.sh"
docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.1.1"
workloads = "aiperf.sh"

[cmd_args.dynamo]
backend = "sglang"
Expand All @@ -32,7 +32,7 @@ workloads = "genai_perf.sh"
num-nodes = 1
cmd = 'python3 -m dynamo.sglang'
extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'

[cmd_args.dynamo.prefill_worker.args]
page-size = 16
Expand All @@ -48,7 +48,7 @@ workloads = "genai_perf.sh"
num-nodes = 1
cmd = 'python3 -m dynamo.sglang'
extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'

[cmd_args.dynamo.decode_worker.args]
page-size = 16
Expand Down Expand Up @@ -94,18 +94,38 @@ workloads = "genai_perf.sh"
concurrency = 2

[cmd_args.aiperf]
setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0"

[cmd_args.aiperf.args]
concurrency = 2
extra-inputs = '{"min_tokens":10}'
output-tokens-mean = 500
request-count = 50
synthetic-input-tokens-mean = 300
output-tokens-mean = 500

[cmd_args.aiperf_accuracy]
entrypoint = "aiperf profile"
setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
cli = '''
--model {model}
--url {url}
--endpoint-type chat
--streaming
--artifact-dir {artifact_dir}
--no-server-metrics
--accuracy-benchmark mmlu
--accuracy-n-shots 5
--accuracy-tasks abstract_algebra
--concurrency 10
--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
--num-requests 100
'''

[extra_env_vars]
UCX_LOG_LEVEL = "warn"
HF_HUB_OFFLINE = "1"
TRANSFORMERS_OFFLINE = "1"
HF_DATASETS_OFFLINE = "1"
HF_HUB_OFFLINE = "0"
TRANSFORMERS_OFFLINE = "0"
HF_DATASETS_OFFLINE = "0"
DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
UCX_TLS = "all"
#DYN_LOGGING_JSONL="true"
Expand Down
34 changes: 27 additions & 7 deletions conf/experimental/ai_dynamo/test/vllm.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ test_template_name = "AIDynamo"
extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]

[cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1"
workloads = "genai_perf.sh"
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1"
workloads = "aiperf.sh"

[cmd_args.dynamo]
backend = "vllm"
Expand All @@ -38,6 +38,7 @@ workloads = "genai_perf.sh"
tensor-parallel-size = 8
pipeline-parallel-size = 1
data-parallel-size = 1
kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'

[cmd_args.dynamo.decode_worker]
num-nodes = 1
Expand All @@ -50,6 +51,7 @@ workloads = "genai_perf.sh"
tensor-parallel-size = 8
pipeline-parallel-size = 1
data-parallel-size = 1
kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'

[cmd_args.lmcache]
controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"
Expand Down Expand Up @@ -86,17 +88,35 @@ workloads = "genai_perf.sh"
concurrency = 2

[cmd_args.aiperf]

[cmd_args.aiperf.args]
concurrency = 2
extra-inputs = '{"min_tokens":10}'
output-tokens-mean = 500
request-count = 50
synthetic-input-tokens-mean = 300
output-tokens-mean = 500

[cmd_args.aiperf_accuracy]
entrypoint = "aiperf profile"
setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
cli = '''
--model {model}
--url {url}
--endpoint-type chat
--streaming
--artifact-dir {artifact_dir}
--no-server-metrics
--accuracy-benchmark mmlu
--accuracy-n-shots 5
--accuracy-tasks abstract_algebra
--concurrency 10
--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
--num-requests 100
'''

[extra_env_vars]
UCX_LOG_LEVEL = "warn"
HF_HUB_OFFLINE = "1"
TRANSFORMERS_OFFLINE = "1"
HF_DATASETS_OFFLINE = "1"
HF_HUB_OFFLINE = "0"
TRANSFORMERS_OFFLINE = "0"
HF_DATASETS_OFFLINE = "0"
DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
UCX_TLS = "all"
27 changes: 25 additions & 2 deletions conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
# limitations under the License.

name = "dynamo_sglang"
job_status_check = false

[[Tests]]
id = "sglang-Qwen3-0.6B"
id = "test.disagg.single-node"
test_name = "sglang"
time_limit = "00:20:00"
time_limit = "00:10:00"

[Tests.cmd_args]
[Tests.cmd_args.dynamo]
Expand All @@ -37,3 +38,25 @@ time_limit = "00:20:00"

[Tests.cmd_args.dynamo.decode_worker.args]
tensor-parallel-size = 1

[[Tests]]
id = "test.disagg.multinode"
test_name = "sglang"
time_limit = "00:10:00"

[Tests.cmd_args]
[Tests.cmd_args.dynamo]
model = "Qwen/Qwen3-0.6B"
node-setup-cmd = "hostname"

[Tests.cmd_args.dynamo.prefill_worker]
num-nodes = 2

[Tests.cmd_args.dynamo.prefill_worker.args]
tensor-parallel-size = 1

[Tests.cmd_args.dynamo.decode_worker]
num-nodes = 2

[Tests.cmd_args.dynamo.decode_worker.args]
tensor-parallel-size = 1
4 changes: 2 additions & 2 deletions conf/experimental/sglang/test/sglang.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ test_template_name = "sglang"
docker_image_url = "lmsysorg/sglang:dev-cu13"

[semantic_eval_cmd_args]
module = "sglang.test.run_eval"
args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
entrypoint = "python3 -m sglang.test.run_eval"
cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"

[extra_env_vars]
UCX_NET_DEVICES = "all"
Expand Down
4 changes: 2 additions & 2 deletions conf/experimental/vllm/test/vllm.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ mount_as = "/vllm_repo"
docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1"

[semantic_eval_cmd_args]
script = "/vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"

[extra_env_vars]
UCX_NET_DEVICES = "all"
Expand Down
81 changes: 76 additions & 5 deletions doc/workloads/ai_dynamo.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,14 @@ The frontend node will initially wait to allow weight loading on all nodes. Once
Choosing a Benchmark Tool
~~~~~~~~~~~~~~~~~~~~~~~~~

The benchmark tool is controlled by the ``workloads`` field in the test TOML. The default is ``aiperf.sh``:
The benchmark tool is controlled by the ``workloads`` field in the test TOML. Set ``aiperf.sh`` to use AIPerf:

.. code-block:: toml

[cmd_args]
workloads = "aiperf.sh" # default — uses aiperf, writes aiperf_report.csv
workloads = "aiperf.sh" # uses aiperf, writes aiperf_report.csv

To use genai-perf instead, set:
To use genai-perf, set:

.. code-block:: toml

Expand All @@ -110,17 +110,88 @@ To use genai-perf instead, set:
output-tokens-mean = 500
request-count = 50

Semantic Degradation With AIPerf Accuracy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it with
``[cmd_args.aiperf_accuracy]``. This runs after the configured performance workload, so it can be used with either
``aiperf.sh`` or ``genai_perf.sh``:

.. code-block:: toml

[cmd_args]
workloads = "aiperf.sh"

[cmd_args.aiperf]
[cmd_args.aiperf.args]
request-count = 50
synthetic-input-tokens-mean = 300
output-tokens-mean = 500
concurrency = 2

[cmd_args.aiperf_accuracy]
entrypoint = "aiperf profile"
setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
cli = '''
--model {model}
--url {url}
--endpoint-type chat
--streaming
--artifact-dir {artifact_dir}
--no-server-metrics
--accuracy-benchmark mmlu
--accuracy-n-shots 5
--accuracy-tasks abstract_algebra
--concurrency 10
--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
--num-requests 100
'''

When ``cmd_args.aiperf_accuracy`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes
the ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt
and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark.

The ``entrypoint`` and ``cli`` fields form the accuracy command. CloudAI expands ``{model}``, ``{url}``,
``{endpoint}``, ``{result_dir}``, and ``{artifact_dir}`` in ``cli`` before launching it. The ``setup-cmd`` field is
optional. It is useful for Dynamo images that include an older system ``aiperf`` build without the accuracy benchmark
plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``.
MMLU is loaded from the ``lighteval/mmlu`` dataset, so either allow Hugging Face dataset access or pre-cache that
dataset before running with ``HF_HUB_OFFLINE``/``HF_DATASETS_OFFLINE`` set to ``1``.
For Qwen3 models, the example disables thinking mode so short MMLU answers can be parsed as choices.

Custom Accuracy Scripts
~~~~~~~~~~~~~~~~~~~~~~~

``cmd_args.aiperf_accuracy`` can also launch a custom mounted script instead of AIPerf. Mount the script or its parent
directory with ``extra_container_mounts`` and set ``entrypoint`` to the in-container command:

.. code-block:: toml

extra_container_mounts = ["/host/custom_accuracy:/custom_accuracy"]

[cmd_args.aiperf_accuracy]
entrypoint = "python /custom_accuracy/dummy_accuracy.py"
cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping"

CloudAI expands placeholders in ``cli`` and runs ``entrypoint`` with that CLI string. The custom command must write
``accuracy_results.csv`` inside ``{artifact_dir}`` with an ``OVERALL`` row. CloudAI copies that file to the run output
directory and exposes the same ``accuracy`` metric as AIPerf accuracy mode.

Review Benchmark Results
------------------------

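After job completion, CloudAI places output logs and result files in the designated results directory. The performance result file name depends on the configured ``workloads`` field; an additional accuracy file is written when ``cmd_args.aiperf_accuracy`` is configured: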

- ``aiperf.sh`` (default) → ``aiperf_report.csv``
- ``aiperf.sh`` → ``aiperf_report.csv``
- ``genai_perf.sh`` → ``genai_perf_report.csv``
- ``cmd_args.aiperf_accuracy`` → ``accuracy_results.csv``

If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_accuracy_artifacts/accuracy_results.csv`` to
``accuracy_results.csv`` in the run output directory and marks the run as failed if that file is not produced.

Navigate to ``./results/<scenario>/<test-id>/0/`` and open the CSV to examine performance metrics.

Example ``aiperf_report.csv`` (default):
Example ``aiperf_report.csv``:

::

Expand Down
17 changes: 9 additions & 8 deletions doc/workloads/sglang.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ Test + Scenario example
num_prompts = 30

[semantic_eval_cmd_args]
module = "sglang.test.run_eval"
args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
entrypoint = "python3 -m sglang.test.run_eval"
cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"


.. code-block:: toml
Expand Down Expand Up @@ -81,18 +81,19 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva
:caption: test.toml (semantic validation)

[semantic_eval_cmd_args]
module = "sglang.test.run_eval"
args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
entrypoint = "python3 -m sglang.test.run_eval"
cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"

For images that still use the legacy SGLang GSM8K runner, override the module and raw arguments:
For images that still use the legacy SGLang GSM8K runner, override the entrypoint and raw CLI:

.. code-block:: toml

[semantic_eval_cmd_args]
module = "sglang.test.few_shot_gsm8k"
args = "--num-questions 200"
entrypoint = "python3 -m sglang.test.few_shot_gsm8k"
cli = "--host {host} --port {port} --num-questions 200"

The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}``
placeholders.


Control number of GPUs
Expand Down
13 changes: 7 additions & 6 deletions doc/workloads/vllm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ Test and Scenario Examples
num_prompts = 30

[semantic_eval_cmd_args]
script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"


.. code-block:: toml
Expand Down Expand Up @@ -81,13 +81,14 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva
:caption: test.toml (semantic validation)

[semantic_eval_cmd_args]
script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"

If the runtime image does not contain the eval script, mount a vLLM repository with existing ``git_repos`` support and
point ``script`` at the mounted path.
reference the mounted script path in ``entrypoint`` (for example, ``python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py``).

The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}``
placeholders.


Controlling the Number of GPUs
Expand Down
2 changes: 2 additions & 0 deletions src/cloudai/workloads/ai_dynamo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
AIDynamoCmdArgs,
AIDynamoTestDefinition,
AIPerf,
AIPerfAccuracy,
GenAIPerf,
LMCache,
LMCacheArgs,
Expand All @@ -37,6 +38,7 @@
"AIDynamoSlurmCommandGenStrategy",
"AIDynamoTestDefinition",
"AIPerf",
"AIPerfAccuracy",
"GenAIPerf",
"LMCache",
"LMCacheArgs",
Expand Down
Loading
Loading