diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 62f8f615d..37b2c392b 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -20,8 +20,8 @@ test_template_name = "AIDynamo" extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"] [cmd_args] -docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0" -workloads = "genai_perf.sh" +docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.1.1" +workloads = "aiperf.sh" [cmd_args.dynamo] backend = "sglang" @@ -32,7 +32,7 @@ workloads = "genai_perf.sh" num-nodes = 1 cmd = 'python3 -m dynamo.sglang' extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics" - worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' + worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' [cmd_args.dynamo.prefill_worker.args] page-size = 16 @@ -48,7 +48,7 @@ workloads = "genai_perf.sh" num-nodes = 1 cmd = 'python3 -m dynamo.sglang' extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics" - worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' + worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' [cmd_args.dynamo.decode_worker.args] page-size = 16 @@ -94,18 +94,38 @@ workloads = "genai_perf.sh" concurrency = 2 [cmd_args.aiperf] + setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0" [cmd_args.aiperf.args] concurrency = 2 + extra-inputs = '{"min_tokens":10}' + output-tokens-mean = 500 request-count = 50 synthetic-input-tokens-mean = 300 - output-tokens-mean = 500 + + [cmd_args.aiperf_accuracy] + entrypoint = "aiperf profile" + setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + cli = ''' +--model {model} +--url {url} +--endpoint-type chat +--streaming +--artifact-dir {artifact_dir} +--no-server-metrics +--accuracy-benchmark mmlu +--accuracy-n-shots 5 +--accuracy-tasks abstract_algebra +--concurrency 10 +--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' +--num-requests 100 +''' [extra_env_vars] UCX_LOG_LEVEL = "warn" -HF_HUB_OFFLINE = "1" -TRANSFORMERS_OFFLINE = "1" -HF_DATASETS_OFFLINE = "1" +HF_HUB_OFFLINE = "0" +TRANSFORMERS_OFFLINE = "0" +HF_DATASETS_OFFLINE = "0" DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')" UCX_TLS = "all" #DYN_LOGGING_JSONL="true" diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 193510728..583d11a88 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -20,8 +20,8 @@ test_template_name = "AIDynamo" extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"] [cmd_args] -docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1" -workloads = "genai_perf.sh" +docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1" +workloads = "aiperf.sh" [cmd_args.dynamo] backend = "vllm" @@ -38,6 +38,7 @@ workloads = "genai_perf.sh" tensor-parallel-size = 8 pipeline-parallel-size = 1 data-parallel-size = 1 + kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' [cmd_args.dynamo.decode_worker] num-nodes = 1 @@ -50,6 +51,7 @@ workloads = "genai_perf.sh" tensor-parallel-size = 8 pipeline-parallel-size = 1 data-parallel-size = 1 + kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' [cmd_args.lmcache] controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001" @@ -86,17 +88,35 @@ workloads = "genai_perf.sh" concurrency = 2 [cmd_args.aiperf] - [cmd_args.aiperf.args] concurrency = 2 + extra-inputs = '{"min_tokens":10}' + output-tokens-mean = 500 request-count = 50 synthetic-input-tokens-mean = 300 - output-tokens-mean = 500 + + [cmd_args.aiperf_accuracy] + entrypoint = "aiperf profile" + setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + cli = ''' +--model {model} +--url {url} +--endpoint-type chat +--streaming +--artifact-dir {artifact_dir} +--no-server-metrics +--accuracy-benchmark mmlu +--accuracy-n-shots 5 +--accuracy-tasks abstract_algebra +--concurrency 10 +--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' +--num-requests 100 +''' [extra_env_vars] UCX_LOG_LEVEL = "warn" -HF_HUB_OFFLINE = "1" -TRANSFORMERS_OFFLINE = "1" -HF_DATASETS_OFFLINE = "1" +HF_HUB_OFFLINE = "0" +TRANSFORMERS_OFFLINE = "0" +HF_DATASETS_OFFLINE = "0" DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')" UCX_TLS = "all" diff --git a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml index 26ed91285..4df1a6d64 100644 --- a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml @@ -15,11 +15,12 @@ # limitations under the License. name = "dynamo_sglang" +job_status_check = false [[Tests]] -id = "sglang-Qwen3-0.6B" +id = "test.disagg.single-node" test_name = "sglang" -time_limit = "00:20:00" +time_limit = "00:10:00" [Tests.cmd_args] [Tests.cmd_args.dynamo] @@ -37,3 +38,25 @@ time_limit = "00:20:00" [Tests.cmd_args.dynamo.decode_worker.args] tensor-parallel-size = 1 + +[[Tests]] +id = "test.disagg.multinode" +test_name = "sglang" +time_limit = "00:10:00" + + [Tests.cmd_args] + [Tests.cmd_args.dynamo] + model = "Qwen/Qwen3-0.6B" + node-setup-cmd = "hostname" + + [Tests.cmd_args.dynamo.prefill_worker] + num-nodes = 2 + + [Tests.cmd_args.dynamo.prefill_worker.args] + tensor-parallel-size = 1 + + [Tests.cmd_args.dynamo.decode_worker] + num-nodes = 2 + + [Tests.cmd_args.dynamo.decode_worker.args] + tensor-parallel-size = 1 diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml index e6d2c09b4..2866f656c 100644 --- a/conf/experimental/sglang/test/sglang.toml +++ b/conf/experimental/sglang/test/sglang.toml @@ -22,8 +22,8 @@ test_template_name = "sglang" docker_image_url = "lmsysorg/sglang:dev-cu13" [semantic_eval_cmd_args] -module = "sglang.test.run_eval" -args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" +entrypoint = "python3 -m sglang.test.run_eval" +cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" [extra_env_vars] UCX_NET_DEVICES = "all" diff --git a/conf/experimental/vllm/test/vllm.toml b/conf/experimental/vllm/test/vllm.toml index 891023201..a8061099c 100644 --- a/conf/experimental/vllm/test/vllm.toml +++ b/conf/experimental/vllm/test/vllm.toml @@ -27,8 +27,8 @@ mount_as = "/vllm_repo" docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1" [semantic_eval_cmd_args] -script = "/vllm_repo/tests/evals/gsm8k/gsm8k_eval.py" -args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" +entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py" +cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" [extra_env_vars] UCX_NET_DEVICES = "all" diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 023d92bf2..c00449681 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -87,14 +87,14 @@ The frontend node will initially wait to allow weight loading on all nodes. Once Choosing a Benchmark Tool ~~~~~~~~~~~~~~~~~~~~~~~~~ -The benchmark tool is controlled by the ``workloads`` field in the test TOML. The default is ``aiperf.sh``: +The benchmark tool is controlled by the ``workloads`` field in the test TOML. Set ``aiperf.sh`` to use AIPerf: .. code-block:: toml [cmd_args] - workloads = "aiperf.sh" # default — uses aiperf, writes aiperf_report.csv + workloads = "aiperf.sh" # uses aiperf, writes aiperf_report.csv -To use genai-perf instead, set: +To use genai-perf, set: .. code-block:: toml @@ -110,17 +110,88 @@ To use genai-perf instead, set: output-tokens-mean = 500 request-count = 50 +Semantic Degradation With AIPerf Accuracy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it with +``[cmd_args.aiperf_accuracy]``. This runs after the configured performance workload, so it can be used with either +``aiperf.sh`` or ``genai_perf.sh``: + +.. code-block:: toml + + [cmd_args] + workloads = "aiperf.sh" + + [cmd_args.aiperf] + [cmd_args.aiperf.args] + request-count = 50 + synthetic-input-tokens-mean = 300 + output-tokens-mean = 500 + concurrency = 2 + + [cmd_args.aiperf_accuracy] + entrypoint = "aiperf profile" + setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + cli = ''' + --model {model} + --url {url} + --endpoint-type chat + --streaming + --artifact-dir {artifact_dir} + --no-server-metrics + --accuracy-benchmark mmlu + --accuracy-n-shots 5 + --accuracy-tasks abstract_algebra + --concurrency 10 + --extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' + --num-requests 100 + ''' + +When ``cmd_args.aiperf_accuracy`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes +the ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt +and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark. + +The ``entrypoint`` and ``cli`` fields form the accuracy command. CloudAI expands ``{model}``, ``{url}``, +``{endpoint}``, ``{result_dir}``, and ``{artifact_dir}`` in ``cli`` before launching it. The ``setup-cmd`` field is +optional. It is useful for Dynamo images that include an older system ``aiperf`` build without the accuracy benchmark +plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``. +MMLU is loaded from ``lighteval/mmlu``, so either allow Hugging Face dataset access or pre-cache that dataset before +running with ``HF_HUB_OFFLINE``/``HF_DATASETS_OFFLINE`` enabled. +For Qwen3 models, the example disables thinking mode so short MMLU answers can be parsed as choices. + +Custom Accuracy Scripts +~~~~~~~~~~~~~~~~~~~~~~~ + +``cmd_args.aiperf_accuracy`` can also launch a custom mounted script instead of AIPerf. Mount the script or its parent +directory with ``extra_container_mounts`` and set ``entrypoint`` to the in-container command: + +.. code-block:: toml + + extra_container_mounts = ["/host/custom_accuracy:/custom_accuracy"] + + [cmd_args.aiperf_accuracy] + entrypoint = "python /custom_accuracy/dummy_accuracy.py" + cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping" + +CloudAI expands placeholders in ``cli`` and runs ``entrypoint`` with that CLI string. The custom command must write +``accuracy_results.csv`` inside ``{artifact_dir}`` with an ``OVERALL`` row. CloudAI copies that file to the run output +directory and exposes the same ``accuracy`` metric as AIPerf accuracy mode. + Review Benchmark Results ------------------------ After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field: -- ``aiperf.sh`` (default) → ``aiperf_report.csv`` +- ``aiperf.sh`` → ``aiperf_report.csv`` - ``genai_perf.sh`` → ``genai_perf_report.csv`` +- ``cmd_args.aiperf_accuracy`` → ``accuracy_results.csv`` + +If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_accuracy_artifacts/accuracy_results.csv`` to +``accuracy_results.csv`` in the run output directory and marks the run failed if that file is not produced. Navigate to ``./results///0/`` and open the CSV to examine performance metrics. -Example ``aiperf_report.csv`` (default): +Example ``aiperf_report.csv``: :: diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst index d0561c773..cdbd5cff1 100644 --- a/doc/workloads/sglang.rst +++ b/doc/workloads/sglang.rst @@ -29,8 +29,8 @@ Test + Scenario example num_prompts = 30 [semantic_eval_cmd_args] - module = "sglang.test.run_eval" - args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + entrypoint = "python3 -m sglang.test.run_eval" + cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" .. code-block:: toml @@ -81,18 +81,19 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva :caption: test.toml (semantic validation) [semantic_eval_cmd_args] - module = "sglang.test.run_eval" - args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + entrypoint = "python3 -m sglang.test.run_eval" + cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" -For images that still use the legacy SGLang GSM8K runner, override the module and raw arguments: +For images that still use the legacy SGLang GSM8K runner, override the entrypoint and raw CLI: .. code-block:: toml [semantic_eval_cmd_args] - module = "sglang.test.few_shot_gsm8k" - args = "--num-questions 200" + entrypoint = "python3 -m sglang.test.few_shot_gsm8k" + cli = "--host {host} --port {port} --num-questions 200" -The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders. +The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}`` +placeholders. Control number of GPUs diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index 930bcf11b..57773992f 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -29,8 +29,8 @@ Test and Scenario Examples num_prompts = 30 [semantic_eval_cmd_args] - script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" - args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" + cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" .. code-block:: toml @@ -81,13 +81,14 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva :caption: test.toml (semantic validation) [semantic_eval_cmd_args] - script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" - args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" + cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" If the runtime image does not contain the eval script, mount a vLLM repository with existing ``git_repos`` support and -point ``script`` at the mounted path. +point ``entrypoint`` at the mounted path. -The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders. +The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}`` +placeholders. Controlling the Number of GPUs diff --git a/src/cloudai/workloads/ai_dynamo/__init__.py b/src/cloudai/workloads/ai_dynamo/__init__.py index 1360ce10d..4aac3fd2c 100644 --- a/src/cloudai/workloads/ai_dynamo/__init__.py +++ b/src/cloudai/workloads/ai_dynamo/__init__.py @@ -19,6 +19,7 @@ AIDynamoCmdArgs, AIDynamoTestDefinition, AIPerf, + AIPerfAccuracy, GenAIPerf, LMCache, LMCacheArgs, @@ -37,6 +38,7 @@ "AIDynamoSlurmCommandGenStrategy", "AIDynamoTestDefinition", "AIPerf", + "AIPerfAccuracy", "GenAIPerf", "LMCache", "LMCacheArgs", diff --git a/src/cloudai/workloads/ai_dynamo/accuracy.sh b/src/cloudai/workloads/ai_dynamo/accuracy.sh new file mode 100644 index 000000000..0e85ee109 --- /dev/null +++ b/src/cloudai/workloads/ai_dynamo/accuracy.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -Eeuo pipefail + +result_dir="" +model="" +url="http://localhost" +port=8000 +endpoint="v1/chat/completions" +entrypoint="" +cli="" +setup_cmd="" +artifact_dir_name="aiperf_accuracy_artifacts" + +log() { + echo "[$(date '+%F %T') $(hostname)]: $*" +} + +process_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --result-dir) result_dir="$2"; shift 2 ;; + --model) model="$2"; shift 2 ;; + --url) url="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --endpoint) endpoint="$2"; shift 2 ;; + --entrypoint) entrypoint="$2"; shift 2 ;; + --cli) cli="$2"; shift 2 ;; + --setup-cmd) setup_cmd="$2"; shift 2 ;; + --artifact-dir-name) artifact_dir_name="$2"; shift 2 ;; + --) shift; break ;; + --*) if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;; + *) shift ;; + esac + done + + log "Parsed args: + result_dir: $result_dir + model: $model + url: $url + port: $port + endpoint: $endpoint + entrypoint: $entrypoint + setup_cmd: ${setup_cmd:-} + artifact_dir: $artifact_dir_name + cli: ${cli:-}" +} + +run_setup_cmd() { + if [[ -z "$setup_cmd" ]]; then + return + fi + + log "Running accuracy setup command: $setup_cmd" + bash -lc "$setup_cmd" + log "Accuracy setup command complete" +} + +expand_cli() { + local artifact_dir="$1" + local full_url="$2" + local expanded="$cli" + + expanded="${expanded//\{model\}/$model}" + expanded="${expanded//\{url\}/$full_url}" + expanded="${expanded//\{endpoint\}/$endpoint}" + expanded="${expanded//\{result_dir\}/$result_dir}" + expanded="${expanded//\{artifact_dir\}/$artifact_dir}" + expanded="${expanded//$'\n'/ }" + + echo "$expanded" +} + +copy_accuracy_results() { + local artifact_dir="$1" + local accuracy_path="$artifact_dir/accuracy_results.csv" + + if [[ ! -s "$accuracy_path" ]]; then + log "ERROR: accuracy benchmark was requested but $accuracy_path was not produced" + exit 1 + fi + + cp "$accuracy_path" "$result_dir/accuracy_results.csv" + log "accuracy report saved to $result_dir/accuracy_results.csv" +} + +main() { + process_args "$@" + + if [[ -z "$result_dir" ]]; then + log "ERROR: --result-dir is required"; exit 1 + fi + if [[ -z "$model" ]]; then + log "ERROR: --model is required"; exit 1 + fi + if [[ -z "$entrypoint" ]]; then + log "ERROR: --entrypoint is required"; exit 1 + fi + + run_setup_cmd + + local full_url="${url}:${port}" + local artifact_dir="$result_dir/$artifact_dir_name" + rm -rf "$artifact_dir" + mkdir -p "$artifact_dir" + + local expanded_cli + expanded_cli="$(expand_cli "$artifact_dir" "$full_url")" + + log "Launching accuracy command: $entrypoint $expanded_cli" + bash -lc "$entrypoint $expanded_cli" + log "accuracy command complete" + + copy_accuracy_results "$artifact_dir" +} + +main "$@" +exit 0 diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 01912f0c1..35da5b782 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import csv import logging from pathlib import Path from typing import Literal, Optional, cast @@ -40,6 +41,10 @@ from cloudai.models.workload import CmdArgs, TestDefinition from cloudai.systems.slurm import SlurmSystem +AIPERF_ARTIFACTS_DIR = "aiperf_artifacts" +AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts" +AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv" + class Args(BaseModel): """Arguments for custom workloads.""" @@ -290,6 +295,11 @@ class AIPerf(Workload): name: str = "aiperf" cmd: str = "aiperf profile" script: File = File(Path(__file__).parent.parent / "ai_dynamo/aiperf.sh") + setup_cmd: str | None = Field( + default=None, + serialization_alias="setup-cmd", + validation_alias=AliasChoices("setup-cmd", "setup_cmd"), + ) report_name: str = Field( default="aiperf_report.csv", serialization_alias="report-name", @@ -301,6 +311,31 @@ def installables(self) -> list[Installable]: return [self.script] +class AIPerfAccuracy(BaseModel): + """Optional accuracy benchmark configuration.""" + + model_config = ConfigDict(extra="forbid", populate_by_name=True) + + name: str = "aiperf_accuracy" + entrypoint: str = "aiperf profile" + cli: str + script: File = File(Path(__file__).parent.parent / "ai_dynamo/accuracy.sh") + setup_cmd: str | None = Field( + default=None, + serialization_alias="setup-cmd", + validation_alias=AliasChoices("setup-cmd", "setup_cmd"), + ) + artifact_dir_name: str = Field( + default=AIPERF_ACCURACY_ARTIFACTS_DIR, + serialization_alias="artifact-dir-name", + validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"), + ) + + @property + def installables(self) -> list[Installable]: + return [self.script] + + class Constraints(BaseModel): """Constraints for validation of AI Dynamo configurations when using DSE.""" @@ -321,6 +356,7 @@ class AIDynamoCmdArgs(CmdArgs): lmcache: LMCache = Field(default_factory=LMCache) genai_perf: GenAIPerf = Field(default_factory=GenAIPerf) aiperf: AIPerf = Field(default_factory=AIPerf) + aiperf_accuracy: AIPerfAccuracy | None = None workloads: str = "genai_perf.sh" @field_validator("workloads", mode="before") @@ -343,6 +379,7 @@ def installables(self) -> list[Installable]: *self.lmcache.installables, *self.genai_perf.installables, *self.aiperf.installables, + *(self.aiperf_accuracy.installables if self.aiperf_accuracy else []), ] @@ -404,10 +441,52 @@ def installables(self) -> list[Installable]: *self.cmd_args.installables, ] + def _has_aiperf_accuracy_results(self, output_path: Path) -> bool: + accuracy = parse_aiperf_accuracy(output_path) + if accuracy is None: + logging.info(f"AIPerf accuracy results not found in {output_path}.") + return False + + logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}") + return True + + def _was_workload_report_produced(self, output_path: Path, workload: str, workload_config: Workload) -> bool: + report_name = workload_config.report_name + if report_name is None: + logging.warning(f"Workload {workload} has no report_name configured") + return False + + workload_csv_file = output_path / report_name + if not workload_csv_file.exists(): + logging.info(f"Result file ({workload_csv_file.absolute()}) not found for workload: {workload}") + return False + + logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}") + return True + + def _was_workload_successful(self, output_path: Path, workload: str, workload_map: dict[str, Workload]) -> bool: + workload_config = workload_map.get(workload) + if workload_config is None: + logging.info(f"Workload {workload} not found in workload map") + return False + + return self._was_workload_report_produced(output_path, workload, workload_config) + + def _were_workloads_successful(self, output_path: Path) -> bool: + workload_map = self.get_workload_map() + result = True + for workload in self.cmd_args.workloads_list: + result = self._was_workload_successful(output_path, workload, workload_map) and result + return result + + def _was_aiperf_accuracy_successful(self, output_path: Path) -> bool: + if self.cmd_args.aiperf_accuracy is None: + return True + + return self._has_aiperf_accuracy_results(output_path) + def was_run_successful(self, tr: TestRun) -> JobStatusResult: output_path = tr.output_path - result = True - workload_map = self.get_workload_map() failure_marker = output_path / self.failure_marker success_marker = output_path / self.success_marker @@ -418,24 +497,9 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: if not success_marker.exists(): return JobStatusResult(False, error_message=f"Success marker file not found: {success_marker.absolute()}") - for workload in self.cmd_args.workloads_list: - if workload not in workload_map: - logging.info(f"Workload {workload} not found in workload map") - result = False - continue - report_name = workload_map[workload].report_name - if report_name is None: - logging.warning(f"Workload {workload} has no report_name configured") - result = False - continue - workload_csv_file = output_path / report_name - if not workload_csv_file.exists(): - logging.info(f"Result file ({workload_csv_file.absolute()}) not found for workload: {workload}") - result = False - else: - logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}") - - return JobStatusResult(result) + workloads_successful = self._were_workloads_successful(output_path) + accuracy_successful = self._was_aiperf_accuracy_successful(output_path) + return JobStatusResult(workloads_successful and accuracy_successful) def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: prefill_worker = tr.test.cmd_args.dynamo.prefill_worker @@ -467,3 +531,84 @@ def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: logging.info("constraint_check passed for: tp_times_pp_le_gpus_per_node") return True + + +def _parse_accuracy_value(value: str | int | float | None) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + accuracy = float(value) + return accuracy / 100 if accuracy > 1 else accuracy + + raw_value = value.strip() + if not raw_value: + return None + + is_percentage = raw_value.endswith("%") + if is_percentage: + raw_value = raw_value[:-1].strip() + + try: + accuracy = float(raw_value) + except ValueError: + return None + + return accuracy / 100 if is_percentage or accuracy > 1 else accuracy + + +def _parse_count_value(value: str | int | float | None) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + try: + return float(value.strip()) + except ValueError: + return None + + +def parse_aiperf_accuracy(output_path: Path) -> float | None: + """ + Parse AIPerf accuracy from accuracy_results.csv. + + Expected CSV format: + Task,Correct,Total,Accuracy + abstract_algebra,35,100,35.00% + OVERALL,8368,14042,59.59% + + AIPerf writes this file under aiperf_artifacts; CloudAI's wrapper also copies + it to the run output directory when present. The returned value is normalized + to a 0.0-1.0 fraction. + """ + candidates = [ + output_path / AIPERF_ACCURACY_RESULTS_CSV, + output_path / AIPERF_ACCURACY_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV, + output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV, + ] + + for csv_file in candidates: + if not csv_file.exists() or csv_file.stat().st_size == 0: + continue + + fallback_accuracy: float | None = None + with csv_file.open(newline="", encoding="utf-8") as f: + for row in csv.DictReader(f): + accuracy = _parse_accuracy_value(row.get("Accuracy") or row.get("accuracy") or row.get("Value")) + if accuracy is None: + correct = _parse_count_value(row.get("Correct") or row.get("correct")) + total = _parse_count_value(row.get("Total") or row.get("total")) + if correct is not None and total: + accuracy = correct / total + if accuracy is None: + continue + + task = (row.get("Task") or row.get("task") or row.get("Metric") or "").strip().upper() + if task == "OVERALL": + return accuracy + if fallback_accuracy is None: + fallback_accuracy = accuracy + + if fallback_accuracy is not None: + return fallback_accuracy + + return None diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index 46f5daa42..5b65db41f 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -37,6 +37,8 @@ declare -A genai_perf_args declare -A genai_perf_config declare -A aiperf_args declare -A aiperf_config +declare -A aiperf_accuracy_args +declare -A aiperf_accuracy_config declare -A dynamo_args dynamo_args["backend"]="vllm" @@ -100,6 +102,10 @@ _resolve_host_ip() { echo "$ip" } +_current_node_ip() { + _resolve_host_ip "$(_current_node_name)" +} + _apply_sglang_dsr1_section_args() { local self="$(_current_node_name)" local gpn="$(_gpus_per_node)" @@ -169,6 +175,10 @@ _parse_cli_pairs() { aiperf_args["--${key#--aiperf-args-}"]="$2" ;; --aiperf-*) aiperf_config["--${key#--aiperf-}"]="$2" ;; + --aiperf_accuracy-args-*) + aiperf_accuracy_args["--${key#--aiperf_accuracy-args-}"]="$2" ;; + --aiperf_accuracy-*) + aiperf_accuracy_config["--${key#--aiperf_accuracy-}"]="$2" ;; --hf-home) HUGGINGFACE_HOME="$2" ;; --storage-cache-dir) @@ -361,6 +371,8 @@ _dump_args() { log "GenAI-Perf args:\n$(arg_array_to_string genai_perf_args)" log "AIPerf config params:\n$(arg_array_to_string aiperf_config)" log "AIPerf args:\n$(arg_array_to_string aiperf_args)" + log "AIPerf accuracy config params:\n$(arg_array_to_string aiperf_accuracy_config)" + log "AIPerf accuracy args:\n$(arg_array_to_string aiperf_accuracy_args)" log "--------------------------------" } @@ -420,6 +432,10 @@ function perform_exit() exit_on_error() { local fatal=$(_detect_fatal_once) + if [ -f "${FATAL_ERROR_MARKER}" ]; then + log "FATAL_ERROR_MARKER found. Terminating." + perform_exit 1 + fi if [ -f "${DONE_MARKER}" ]; then log "DONE_MARKER found. Skipping error check." return @@ -517,6 +533,10 @@ _is_aiperf_workload() { [[ "${dynamo_args["workloads"]}" == *"aiperf.sh"* ]] } +_is_aiperf_accuracy_enabled() { + [[ -n "${aiperf_accuracy_config["--script"]:-}" ]] +} + _init_runtime_env() { if _is_vllm || _is_sglang; then export HF_HOME="${HUGGINGFACE_HOME}" @@ -689,6 +709,13 @@ function mark_done() touch "$DONE_MARKER" } +function mark_failed() +{ + local message="$1" + log "ERROR: ${message}" + printf '%s\n' "${message}" > "${FATAL_ERROR_MARKER}" +} + function launch_etcd() { log "Launching etcd with cmd: ${dynamo_args["etcd-cmd"]} --listen-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]} --advertise-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]}" @@ -733,6 +760,8 @@ function launch_decode() local base_kvbm_pub_port=${DYN_KVBM_LEADER_ZMQ_PUB_PORT:-56001} local base_kvbm_ack_port=${DYN_KVBM_LEADER_ZMQ_ACK_PORT:-56002} local kvbm_port_stride=2 + local side_channel_host + side_channel_host="$(_current_node_ip)" log "Launching $workers_per_node decode worker(s) with unique port ranges" for i in $(seq 0 $(( $workers_per_node - 1 ))); do @@ -754,10 +783,10 @@ function launch_decode() args_arr+=($key "${decode_args[$key]}") done - log "Launching decode worker $i on GPUs $gpu_list (NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)" + log "Launching decode worker $i on GPUs $gpu_list (NIXL host: $side_channel_host, NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)" log "Decode cmd: ${decode_config["cmd"]} ${args_arr[*]} ${decode_config["extra-args"]}" CUDA_VISIBLE_DEVICES=$gpu_list \ - VLLM_NIXL_SIDE_CHANNEL_HOST=$(hostname -I | awk '{print $1}') \ + VLLM_NIXL_SIDE_CHANNEL_HOST="$side_channel_host" \ VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port \ DYN_VLLM_KV_EVENT_PORT=$kv_event_port \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=$kvbm_pub_port \ @@ -788,6 +817,8 @@ function launch_prefill() local base_kvbm_pub_port=${DYN_KVBM_LEADER_ZMQ_PUB_PORT:-56001} local base_kvbm_ack_port=${DYN_KVBM_LEADER_ZMQ_ACK_PORT:-56002} local kvbm_port_stride=2 + local side_channel_host + side_channel_host="$(_current_node_ip)" log "Launching $workers_per_node prefill worker(s) with unique port ranges" for i in $(seq 0 $(( $workers_per_node - 1 ))); do @@ -809,10 +840,10 @@ function launch_prefill() args_arr+=($key "${prefill_args[$key]}") done - log "Launching prefill worker $i on GPUs $gpu_list (NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)" + log "Launching prefill worker $i on GPUs $gpu_list (NIXL host: $side_channel_host, NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)" log "Prefill cmd: ${prefill_config["cmd"]} ${args_arr[*]} ${prefill_config["extra-args"]}" CUDA_VISIBLE_DEVICES=$gpu_list \ - VLLM_NIXL_SIDE_CHANNEL_HOST=$(hostname -I | awk '{print $1}') \ + VLLM_NIXL_SIDE_CHANNEL_HOST="$side_channel_host" \ VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port \ DYN_VLLM_KV_EVENT_PORT=$kv_event_port \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=$kvbm_pub_port \ @@ -1026,6 +1057,11 @@ function launch_workload() --decode-nodes "${decode_config["node-list"]}" \ "${config_arr[@]}" \ -- "${args_arr[@]}" > "${RESULTS_DIR}/$workload_name.log" 2>&1 + local workload_status=$? + if [[ "${workload_status}" -ne 0 ]]; then + mark_failed "Workload ${workload_name} failed with exit code ${workload_status}. See ${RESULTS_DIR}/${workload_name}.log" + return "${workload_status}" + fi log "Done with $workload_name run" } @@ -1035,11 +1071,15 @@ function launch_workloads() wait_for_dynamo_frontend if _is_genai_perf_workload; then - launch_workload genai_perf_config genai_perf_args + launch_workload genai_perf_config genai_perf_args || return $? fi if _is_aiperf_workload; then - launch_workload aiperf_config aiperf_args + launch_workload aiperf_config aiperf_args || return $? + fi + + if _is_aiperf_accuracy_enabled; then + launch_workload aiperf_accuracy_config aiperf_accuracy_args || return $? fi mark_done diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh index 9f5a78b33..15cee3a58 100644 --- a/src/cloudai/workloads/ai_dynamo/aiperf.sh +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -19,7 +19,7 @@ # # Called from ai_dynamo.sh's launch_workload() with: # bash aiperf.sh --result-dir --model --url --port -# [--cmd ] [--report-name ] [--extra-args ] +# [--cmd ] [--report-name ] [--artifact-dir-name ] [--extra-args ] # -- ... # # Context flags (before --) that are recognised and used: @@ -28,7 +28,9 @@ # --url Base URL of the dynamo.frontend (e.g. http://node01). # --port HTTP port the dynamo.frontend is listening on. # --report-name Output CSV name (default: aiperf_report.csv). +# --artifact-dir-name Artifact directory name under --result-dir (default: aiperf_artifacts). # --cmd Full launch command including subcommand (default: "aiperf profile"). +# --setup-cmd Optional shell command run before launching aiperf. # --extra-args Raw string appended verbatim after all other flags. # # All unrecognised flags (--install-dir, --gpus-per-node, etc.) are silently @@ -43,9 +45,11 @@ model="" url="http://localhost" port=8000 report_name="aiperf_report.csv" +artifact_dir_name="aiperf_artifacts" cmd="aiperf profile" +setup_cmd="" declare -a extra_args=() -declare -a aiperf_profile_args=() +declare -a profile_args=() log() { echo "[$(date '+%F %T') $(hostname)]: $*" @@ -54,14 +58,14 @@ log() { _parse_aiperf_args() { while [[ $# -ge 2 ]]; do case "$1" in - --*) aiperf_profile_args+=("$1" "$2"); shift 2 ;; + --*) profile_args+=("$1" "$2"); shift 2 ;; *) shift ;; esac done # Capture a trailing lone boolean flag if present. # Use if/fi — not [[ ]] && — so set -e does not trigger on a false condition. if [[ $# -eq 1 && "$1" == --* ]]; then - aiperf_profile_args+=("$1") + profile_args+=("$1") fi } @@ -73,9 +77,11 @@ process_args() { --url) url="$2"; shift 2 ;; --port) port="$2"; shift 2 ;; --report-name) report_name="$2"; shift 2 ;; - --cmd) cmd="$2"; shift 2 ;; - --extra-args) read -ra extra_args <<< "$2"; shift 2 ;; - --) shift; _parse_aiperf_args "$@"; break ;; + --artifact-dir-name) artifact_dir_name="$2"; shift 2 ;; + --cmd) cmd="$2"; shift 2 ;; + --setup-cmd) setup_cmd="$2"; shift 2 ;; + --extra-args) read -ra extra_args <<< "$2"; shift 2 ;; + --) shift; _parse_aiperf_args "$@"; break ;; --*) if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;; # consume unknown flag; shift 2 only if next arg is a value *) shift ;; esac @@ -87,15 +93,33 @@ process_args() { url: $url port: $port report_name: $report_name + artifact_dir: $artifact_dir_name cmd: $cmd + setup_cmd: ${setup_cmd:-} extra_args: ${extra_args[*]:-} - profile_args: ${aiperf_profile_args[*]:-}" + profile_args: ${profile_args[*]:-}" +} + +run_setup_cmd() { + if [[ -z "$setup_cmd" ]]; then + return + fi + + log "Running AIPerf setup command: $setup_cmd" + bash -lc "$setup_cmd" + log "AIPerf setup command complete" } process_results() { - local artifact_dir="$result_dir/aiperf_artifacts" - local csv_path - csv_path=$(find "$artifact_dir" -name "*.csv" -print -quit 2>/dev/null || true) + local artifact_dir="$result_dir/$artifact_dir_name" + local csv_path="" + + if [[ -f "$artifact_dir/profile_export_aiperf.csv" ]]; then + csv_path="$artifact_dir/profile_export_aiperf.csv" + else + csv_path=$(find "$artifact_dir" -name "*aiperf*.csv" -print -quit 2>/dev/null || true) + fi + if [[ -n "$csv_path" ]]; then cp "$csv_path" "$result_dir/$report_name" log "aiperf report saved to $result_dir/$report_name" @@ -103,6 +127,36 @@ process_results() { log "ERROR: no CSV found in $artifact_dir — aiperf may not have completed" exit 1 fi + +} + +run_aiperf() { + local full_url="$1" + local artifact_dir="$2" + local -a run_cmd=() + read -ra run_cmd <<< "$cmd" + local -a launch_cmd=( + "${run_cmd[@]}" + --model "$model" + --url "$full_url" + --endpoint-type chat + --streaming + --artifact-dir "$artifact_dir" + --no-server-metrics + ) + + log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url" + + if [[ "${#profile_args[@]}" -gt 0 ]]; then + launch_cmd+=("${profile_args[@]}") + fi + if [[ "${#extra_args[@]}" -gt 0 ]]; then + launch_cmd+=("${extra_args[@]}") + fi + + "${launch_cmd[@]}" + + log "aiperf run complete" } main() { @@ -115,27 +169,13 @@ main() { log "ERROR: --model is required"; exit 1 fi + run_setup_cmd + local full_url="${url}:${port}" - local artifact_dir="$result_dir/aiperf_artifacts" + local artifact_dir="$result_dir/$artifact_dir_name" rm -rf "$artifact_dir" - # Split cmd into an array (e.g. "aiperf profile" → ["aiperf", "profile"]) - local -a run_cmd=() - read -ra run_cmd <<< "$cmd" - - log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url" - - "${run_cmd[@]}" \ - --model "$model" \ - --url "$full_url" \ - --endpoint-type chat \ - --streaming \ - --artifact-dir "$artifact_dir" \ - --no-server-metrics \ - "${aiperf_profile_args[@]}" \ - "${extra_args[@]}" - - log "aiperf run complete" + run_aiperf "$full_url" "$artifact_dir" process_results } diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py index a8e4e91b8..a0ef92005 100644 --- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py @@ -21,6 +21,7 @@ from cloudai.core import METRIC_ERROR, MetricValue, ReportGenerationStrategy from cloudai.util.lazy_imports import lazy +from cloudai.workloads.ai_dynamo.ai_dynamo import AIDynamoTestDefinition, parse_aiperf_accuracy class AIDynamoReportGenerationStrategy(ReportGenerationStrategy): @@ -44,6 +45,16 @@ def extract_metric_from_csv(self, csv_file: Path, metric_name: str, metric_type: def get_metric(self, metric: str) -> MetricValue: logging.info(f"Getting metric: {metric}") + + if metric.lower() == "accuracy": + tdef = self.test_run.test + if not isinstance(tdef, AIDynamoTestDefinition): + return METRIC_ERROR + if tdef.cmd_args.aiperf_accuracy is None: + return METRIC_ERROR + accuracy = parse_aiperf_accuracy(self.test_run.output_path) + return accuracy if accuracy is not None else METRIC_ERROR + metric_name = metric metric_type = "avg" diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 17079875c..c1a817853 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. import logging +import shlex from pathlib import Path from typing import List, cast @@ -71,6 +72,8 @@ def _get_toml_args(self, base_model: BaseModel, prefix: str, exclude: List[str] str_v = str(v) if str_v.startswith("{") and str_v.endswith("}"): args.append(f"{prefix}{k} '{str_v}'") + elif any(char in str_v for char in ['"', "'", "\n"]): + args.append(f"{prefix}{k} {shlex.quote(str_v)}") else: args.append(f'{prefix}{k} "{v}"') @@ -118,6 +121,8 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: args.extend(self._get_nested_toml_args(td.cmd_args.lmcache, "--lmcache-")) args.extend(self._get_nested_toml_args(td.cmd_args.genai_perf, "--genai_perf-")) args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-")) + if td.cmd_args.aiperf_accuracy is not None: + args.extend(self._get_nested_toml_args(td.cmd_args.aiperf_accuracy, "--aiperf_accuracy-")) return args diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index 87ad7b3a3..30a6943c1 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -624,7 +624,9 @@ def _expand_semantic_eval_args(self, args: str, *, host: str) -> str: "{model}": self.tdef.cmd_args.model, "{host}": host, "{port}": str(self.serve_port), + "{url}": f"{host}:{self.serve_port}", "{output_path}": str(self.test_run.output_path.absolute()), + "{result_dir}": str(self.test_run.output_path.absolute()), } for placeholder, value in replacements.items(): args = args.replace(placeholder, value) diff --git a/src/cloudai/workloads/sglang/sglang.py b/src/cloudai/workloads/sglang/sglang.py index 338bbfecc..49a7af140 100644 --- a/src/cloudai/workloads/sglang/sglang.py +++ b/src/cloudai/workloads/sglang/sglang.py @@ -92,8 +92,10 @@ class SglangBenchCmdArgs(CmdArgs): class SglangSemanticEvalCmdArgs(CmdArgs): """SGLang semantic validation command arguments.""" - module: str = "sglang.test.run_eval" - args: str = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + model_config = ConfigDict(extra="forbid") + + entrypoint: str = "python3 -m sglang.test.run_eval" + cli: str = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" class SglangTestDefinition(LLMServingTestDefinition[SglangCmdArgs]): diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py index f1c7c741c..7a7a97d5b 100644 --- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py @@ -130,17 +130,8 @@ def get_semantic_eval_command(self) -> list[str] | None: return None host = self.bench_host - command = [ - "python3", - "-m", - eval_args.module, - f"--host {host}", - f"--port {self.serve_port}", - ] - args = self._expand_semantic_eval_args(eval_args.args, host=host) - if args: - command.append(args) - return command + cli = self._expand_semantic_eval_args(eval_args.cli, host=host) + return [eval_args.entrypoint, cli] if cli else [eval_args.entrypoint] def aggregated_serve_env(self) -> dict[str, str]: return {"CUDA_VISIBLE_DEVICES": ",".join(str(gpu_id) for gpu_id in self.gpu_ids)} diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 13d87ad77..2f00e95f7 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -130,13 +130,5 @@ def get_semantic_eval_command(self) -> list[str] | None: host = self.bench_host http_host = host if host.startswith("http://") or host.startswith("https://") else f"http://{host}" - command = [ - "python3", - eval_args.script, - f"--host {http_host}", - f"--port {self.serve_port}", - ] - args = self._expand_semantic_eval_args(eval_args.args, host=http_host) - if args: - command.append(args) - return command + cli = self._expand_semantic_eval_args(eval_args.cli, host=http_host) + return [eval_args.entrypoint, cli] if cli else [eval_args.entrypoint] diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index d2fda3ab5..f77039edc 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -92,8 +92,10 @@ class VllmBenchCmdArgs(CmdArgs): class VllmSemanticEvalCmdArgs(CmdArgs): """vLLM semantic validation command arguments.""" - script: str = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" - args: str = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + model_config = ConfigDict(extra="forbid") + + entrypoint: str = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" + cli: str = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" class VllmTestDefinition(LLMServingTestDefinition[VllmCmdArgs]): diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index a0a028caa..7b036b5a8 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -14,7 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import shlex from pathlib import Path +from typing import cast import pytest @@ -26,6 +28,8 @@ AIDynamoCmdArgs, AIDynamoSlurmCommandGenStrategy, AIDynamoTestDefinition, + AIPerf, + AIPerfAccuracy, GenAIPerf, LMCache, LMCacheArgs, @@ -148,3 +152,78 @@ def test_dynamo_cmd( ) -> None: result = strategy.gen_dynamo_cmd(module, Path(config)) assert result.strip() == expected + + +def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + td.cmd_args.workloads = "aiperf.sh" + setup_cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + cli = ( + "--model {model} " + "--url {url} " + "--endpoint-type chat " + "--streaming " + "--artifact-dir {artifact_dir} " + "--no-server-metrics " + "--accuracy-benchmark mmlu " + "--accuracy-n-shots 5 " + "--accuracy-tasks abstract_algebra " + "--concurrency 10 " + '--extra-inputs \'{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}\' ' + "--num-requests 100" + ) + td.cmd_args.aiperf = AIPerf.model_validate( + { + "args": { + "concurrency": 2, + "request-count": 50, + "synthetic-input-tokens-mean": 300, + "output-tokens-mean": 500, + }, + } + ) + td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate( + { + "setup-cmd": setup_cmd, + "cli": cli, + } + ) + + result = strategy._gen_script_args(td) + + assert '--aiperf-args-request-count "50"' in result + assert '--aiperf-args-synthetic-input-tokens-mean "300"' in result + assert '--aiperf-args-output-tokens-mean "500"' in result + assert f'--aiperf_accuracy-setup-cmd "{setup_cmd}"' in result + assert '--aiperf_accuracy-name "aiperf_accuracy"' in result + assert '--aiperf_accuracy-entrypoint "aiperf profile"' in result + assert '--aiperf_accuracy-artifact-dir-name "aiperf_accuracy_artifacts"' in result + assert f"--aiperf_accuracy-cli {shlex.quote(cli)}" in result + + +def test_gen_script_args_contains_custom_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping" + td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate( + { + "entrypoint": "python /custom_accuracy/dummy_accuracy.py", + "cli": cli, + } + ) + + result = strategy._gen_script_args(td) + + assert '--aiperf_accuracy-entrypoint "python /custom_accuracy/dummy_accuracy.py"' in result + assert f'--aiperf_accuracy-cli "{cli}"' in result + + +def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + td.cmd_args.dynamo.prefill_worker.args = WorkerBaseArgs.model_validate({"kv-transfer-config": config}) + td.cmd_args.dynamo.decode_worker.args = WorkerBaseArgs.model_validate({"kv-transfer-config": config}) + + result = strategy._gen_script_args(td) + + assert f"--prefill-args-kv-transfer-config '{config}'" in result + assert f"--decode-args-kv-transfer-config '{config}'" in result diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py index 0e51c414f..47e214421 100644 --- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py +++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py @@ -26,12 +26,14 @@ AIDynamoCmdArgs, AIDynamoTestDefinition, AIPerf, + AIPerfAccuracy, GenAIPerf, LMCache, LMCacheArgs, WorkerBaseArgs, WorkerConfig, ) +from cloudai.workloads.ai_dynamo.ai_dynamo import parse_aiperf_accuracy from cloudai.workloads.ai_dynamo.report_generation_strategy import AIDynamoReportGenerationStrategy @@ -62,6 +64,14 @@ def get_aiperf_csv_content() -> str: ) +def get_aiperf_accuracy_csv_content() -> str: + return "Task,Correct,Total,Accuracy\nabstract_algebra,35,100,35.00%\nOVERALL,35,100,35.00%\n" + + +def get_aiperf_accuracy_cli() -> str: + return "--model {model} --url {url} --artifact-dir {artifact_dir} --accuracy-benchmark mmlu" + + @pytest.fixture def ai_dynamo_tr(tmp_path: Path) -> TestRun: test = AIDynamoTestDefinition( @@ -70,6 +80,7 @@ def ai_dynamo_tr(tmp_path: Path) -> TestRun: test_template_name="t", cmd_args=AIDynamoCmdArgs( docker_image_url="http://url", + workloads="genai_perf.sh", dynamo=AIDynamoArgs( prefill_worker=WorkerConfig( cmd="python3 -m dynamo.vllm --is-prefill-worker", @@ -119,6 +130,64 @@ def ai_dynamo_aiperf_tr(tmp_path: Path) -> TestRun: return tr +@pytest.fixture +def ai_dynamo_aiperf_with_split_accuracy_tr(tmp_path: Path) -> TestRun: + test = AIDynamoTestDefinition( + name="ai_dynamo_aiperf_with_split_accuracy", + description="desc", + test_template_name="t", + cmd_args=AIDynamoCmdArgs( + docker_image_url="http://url", + workloads="aiperf.sh", + dynamo=AIDynamoArgs( + prefill_worker=WorkerConfig( + cmd="python3 -m dynamo.vllm --is-prefill-worker", + worker_initialized_regex="VllmWorker.*has.been.initialized", + args=WorkerBaseArgs(), + ), + ), + aiperf=AIPerf(), + aiperf_accuracy=AIPerfAccuracy.model_validate({"cli": get_aiperf_accuracy_cli()}), + lmcache=LMCache(args=LMCacheArgs()), + ), + ) + tr = TestRun(name="ai_dynamo_aiperf_with_split_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path) + (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content()) + (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content()) + (tr.output_path / test.success_marker).touch() + return tr + + +@pytest.fixture +def ai_dynamo_genai_perf_with_split_accuracy_tr(tmp_path: Path) -> TestRun: + test = AIDynamoTestDefinition( + name="ai_dynamo_genai_perf_with_split_accuracy", + description="desc", + test_template_name="t", + cmd_args=AIDynamoCmdArgs( + docker_image_url="http://url", + workloads="genai_perf.sh", + dynamo=AIDynamoArgs( + prefill_worker=WorkerConfig( + cmd="python3 -m dynamo.vllm --is-prefill-worker", + worker_initialized_regex="VllmWorker.*has.been.initialized", + args=WorkerBaseArgs(), + ), + ), + genai_perf=GenAIPerf(), + aiperf_accuracy=AIPerfAccuracy.model_validate({"cli": get_aiperf_accuracy_cli()}), + lmcache=LMCache(args=LMCacheArgs()), + ), + ) + tr = TestRun( + name="ai_dynamo_genai_perf_with_split_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path + ) + (tr.output_path / "genai_perf_report.csv").write_text(get_csv_content()) + (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content()) + (tr.output_path / test.success_marker).touch() + return tr + + @pytest.fixture def csv_content() -> str: return get_csv_content() @@ -161,6 +230,23 @@ def test_ai_dynamo_get_metric_aiperf(slurm_system: SlurmSystem, ai_dynamo_aiperf assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47 +def test_ai_dynamo_get_metric_split_aiperf_accuracy( + slurm_system: SlurmSystem, ai_dynamo_aiperf_with_split_accuracy_tr: TestRun +) -> None: + strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_with_split_accuracy_tr) + + assert strategy.get_metric("accuracy") == 0.35 + assert strategy.get_metric("Inter Token Latency (ms)") == 2.83 + + +def test_ai_dynamo_accuracy_metric_requires_aiperf_accuracy_config( + slurm_system: SlurmSystem, ai_dynamo_aiperf_tr: TestRun +) -> None: + strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_tr) + + assert strategy.get_metric("accuracy") == METRIC_ERROR + + def test_ai_dynamo_get_metric_invalid(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) @@ -176,9 +262,56 @@ def test_was_run_successful(ai_dynamo_tr: TestRun) -> None: assert result.is_successful is True +def test_was_run_successful_with_split_aiperf_accuracy( + ai_dynamo_aiperf_with_split_accuracy_tr: TestRun, +) -> None: + test_def = ai_dynamo_aiperf_with_split_accuracy_tr.test + result = test_def.was_run_successful(ai_dynamo_aiperf_with_split_accuracy_tr) + assert result.is_successful is True + + +def test_was_run_successful_with_genai_perf_and_split_aiperf_accuracy( + ai_dynamo_genai_perf_with_split_accuracy_tr: TestRun, +) -> None: + test_def = ai_dynamo_genai_perf_with_split_accuracy_tr.test + result = test_def.was_run_successful(ai_dynamo_genai_perf_with_split_accuracy_tr) + assert result.is_successful is True + + +def test_was_run_successful_requires_split_aiperf_accuracy( + ai_dynamo_aiperf_with_split_accuracy_tr: TestRun, +) -> None: + test_def = ai_dynamo_aiperf_with_split_accuracy_tr.test + (ai_dynamo_aiperf_with_split_accuracy_tr.output_path / "accuracy_results.csv").unlink() + result = test_def.was_run_successful(ai_dynamo_aiperf_with_split_accuracy_tr) + assert result.is_successful is False + + def test_was_run_successful_no_results(ai_dynamo_tr: TestRun, tmp_path: Path) -> None: test_def = ai_dynamo_tr.test ai_dynamo_tr.output_path = tmp_path / "empty_output" ai_dynamo_tr.output_path.mkdir(parents=True, exist_ok=True) result = test_def.was_run_successful(ai_dynamo_tr) assert result.is_successful is False + + +def test_parse_aiperf_accuracy_from_artifact_dir(tmp_path: Path) -> None: + artifact_dir = tmp_path / "aiperf_artifacts" + artifact_dir.mkdir() + (artifact_dir / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content(), encoding="utf-8") + + assert parse_aiperf_accuracy(tmp_path) == 0.35 + + +def test_parse_aiperf_accuracy_from_split_accuracy_artifact_dir(tmp_path: Path) -> None: + artifact_dir = tmp_path / "aiperf_accuracy_artifacts" + artifact_dir.mkdir() + (artifact_dir / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content(), encoding="utf-8") + + assert parse_aiperf_accuracy(tmp_path) == 0.35 + + +def test_parse_aiperf_accuracy_missing_or_invalid(tmp_path: Path) -> None: + (tmp_path / "accuracy_results.csv").write_text("Task,Correct,Total,Accuracy\nOVERALL,n/a,100,n/a\n") + + assert parse_aiperf_accuracy(tmp_path) is None diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py index 7d2812580..c07d1771d 100644 --- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py +++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py @@ -150,28 +150,24 @@ def test_get_sglang_semantic_eval_command_defaults(sglang_cmd_gen_strategy: Sgla command = sglang_cmd_gen_strategy.get_semantic_eval_command() assert command == [ - "python3", - "-m", - "sglang.test.run_eval", - "--host ${NODE}", - "--port 8000", - "--eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B", + "python3 -m sglang.test.run_eval", + "--host ${NODE} --port 8000 --eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B", ] -def test_get_sglang_semantic_eval_command_supports_custom_module_and_args( +def test_get_sglang_semantic_eval_command_supports_custom_entrypoint_and_cli( sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy, ): sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test) sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs( - module="sglang.test.few_shot_gsm8k", - args="--num-questions 200 --data-path {output_path}/gsm8k.jsonl --seen {host}:{port}", + entrypoint="python3 /custom/semantic_eval.py", + cli="--num-questions 200 --data-path {result_dir}/gsm8k.jsonl --seen {url}", ) command = sglang_cmd_gen_strategy.get_semantic_eval_command() assert command is not None - assert command[2] == "sglang.test.few_shot_gsm8k" + assert command[0] == "python3 /custom/semantic_eval.py" assert command[-1] == ( f"--num-questions 200 --data-path {sglang_cmd_gen_strategy.test_run.output_path.absolute()}/gsm8k.jsonl " "--seen ${NODE}:8000" diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 6bd6ada36..6eb62483c 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -193,14 +193,29 @@ def test_get_vllm_semantic_eval_command_defaults(self, vllm_cmd_gen_strategy: Vl command = vllm_cmd_gen_strategy.get_semantic_eval_command() assert command == [ - "python3", - "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py", - "--host http://${NODE}", - "--port 8000", + "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py", + "--host http://${NODE} --port 8000 " "--num-questions 200 --save-results " f"{vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json", ] + def test_get_vllm_semantic_eval_command_supports_custom_entrypoint_and_cli( + self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy + ) -> None: + vllm_test = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test) + vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs( + entrypoint="python3 /custom/eval.py", + cli="--model {model} --api {url} --out {result_dir}/vllm-gsm8k.json", + ) + + command = vllm_cmd_gen_strategy.get_semantic_eval_command() + + assert command == [ + "python3 /custom/eval.py", + f"--model Qwen/Qwen3-0.6B --api http://${{NODE}}:8000 " + f"--out {vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json", + ] + def test_gen_srun_command_contains_vllm_semantic_eval( self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy ) -> None: