From d112a2a95e5e12c54c5aa7ee0ba3daaf26f677d3 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 29 May 2026 11:50:34 -0700 Subject: [PATCH 01/16] multiple aiperf runs --- conf/experimental/ai_dynamo/test/sglang.toml | 15 ++ conf/experimental/ai_dynamo/test/vllm.toml | 15 ++ .../ai_dynamo/test_scenario/vllm_lmcache.toml | 16 +- src/cloudai/workloads/ai_dynamo/__init__.py | 4 + src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 28 ++- src/cloudai/workloads/ai_dynamo/aiperf.sh | 178 +----------------- .../workloads/ai_dynamo/runtime/aiperf.py | 81 ++++++++ .../ai_dynamo/slurm_command_gen_strategy.py | 142 +++++++++++++- .../test_command_gen_strategy_slurm.py | 94 +++++++++ .../ai_dynamo/test_runtime_aiperf.py | 63 +++++++ 10 files changed, 456 insertions(+), 180 deletions(-) create mode 100644 src/cloudai/workloads/ai_dynamo/runtime/aiperf.py create mode 100644 tests/workloads/ai_dynamo/test_runtime_aiperf.py diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 67fc999f0..34bc9cbff 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -18,6 +18,7 @@ name = "sglang" description = "sglang backend" test_template_name = "AIDynamo" extra_container_mounts = ["/run/udev:/run/udev"] +dse_excluded_args = ["cmd_args.aiperf_phases"] [cmd_args] docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.1.1" @@ -88,6 +89,20 @@ workloads = "aiperf.sh" request-count = 50 synthetic-input-tokens-mean = 300 + [[cmd_args.aiperf_phases]] + name = "round_1" + + [cmd_args.aiperf_phases.args] + concurrency = 2 + request-count = 50 + + [[cmd_args.aiperf_phases]] + name = "round_2" + + [cmd_args.aiperf_phases.args] + concurrency = 4 + request-count = 50 + [cmd_args.aiperf_accuracy] entrypoint = "aiperf profile" setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade 
aiperf==0.8.0" diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 8a5f3b939..0667f1cab 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -18,6 +18,7 @@ name = "vLLM" description = "vLLM backend" test_template_name = "AIDynamo" extra_container_mounts = ["/run/udev:/run/udev"] +dse_excluded_args = ["cmd_args.aiperf_phases"] [cmd_args] docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1" @@ -78,6 +79,20 @@ workloads = "aiperf.sh" request-count = 50 synthetic-input-tokens-mean = 300 + [[cmd_args.aiperf_phases]] + name = "round_1" + + [cmd_args.aiperf_phases.args] + concurrency = 2 + request-count = 50 + + [[cmd_args.aiperf_phases]] + name = "round_2" + + [cmd_args.aiperf_phases.args] + concurrency = 4 + request-count = 50 + [cmd_args.aiperf_accuracy] entrypoint = "aiperf profile" setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml index 564311240..f975e784e 100644 --- a/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml @@ -24,7 +24,7 @@ description = "Self-contained AIDynamo scenario wiring vLLM disaggregated infere test_template_name = "AIDynamo" time_limit = "00:10:00" extra_container_mounts = ["/run/udev:/run/udev"] -dse_excluded_args = ["cmd_args.lmcache.lmcache_worker_ports"] +dse_excluded_args = ["cmd_args.lmcache.lmcache_worker_ports", "cmd_args.aiperf_phases"] [Tests.cmd_args] docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1" @@ -90,6 +90,20 @@ dse_excluded_args = ["cmd_args.lmcache.lmcache_worker_ports"] request-count = 50 synthetic-input-tokens-mean = 300 + [[Tests.cmd_args.aiperf_phases]] + name = "round_1" + + [Tests.cmd_args.aiperf_phases.args] + concurrency = 2 + request-count = 
50 + + [[Tests.cmd_args.aiperf_phases]] + name = "round_2" + + [Tests.cmd_args.aiperf_phases.args] + concurrency = 4 + request-count = 50 + [Tests.cmd_args.aiperf_accuracy] entrypoint = "aiperf profile" setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" diff --git a/src/cloudai/workloads/ai_dynamo/__init__.py b/src/cloudai/workloads/ai_dynamo/__init__.py index 5e430068d..57e2eb99e 100644 --- a/src/cloudai/workloads/ai_dynamo/__init__.py +++ b/src/cloudai/workloads/ai_dynamo/__init__.py @@ -15,6 +15,7 @@ # limitations under the License. from .ai_dynamo import ( + AIPERF_COMMANDS_FILE_NAME, LMCACHE_CONFIG_BACKUP_FILE_NAME, LMCACHE_CONFIG_FILE_NAME, AIDynamoArgs, @@ -22,6 +23,7 @@ AIDynamoTestDefinition, AIPerf, AIPerfAccuracy, + AIPerfPhase, GenAIPerf, LMCacheController, WorkerBaseArgs, @@ -32,6 +34,7 @@ from .slurm_command_gen_strategy import AIDynamoSlurmCommandGenStrategy __all__ = [ + "AIPERF_COMMANDS_FILE_NAME", "LMCACHE_CONFIG_BACKUP_FILE_NAME", "LMCACHE_CONFIG_FILE_NAME", "AIDynamoArgs", @@ -42,6 +45,7 @@ "AIDynamoTestDefinition", "AIPerf", "AIPerfAccuracy", + "AIPerfPhase", "GenAIPerf", "LMCacheController", "WorkerBaseArgs", diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 7f8da4165..5c45a149b 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -42,6 +42,7 @@ from cloudai.systems.slurm import SlurmSystem AIPERF_ARTIFACTS_DIR = "aiperf_artifacts" +AIPERF_COMMANDS_FILE_NAME = "aiperf_commands.json" AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts" AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv" LMCACHE_CONFIG_FILE_NAME = "lmcache-config.yaml" @@ -254,6 +255,7 @@ class AIPerf(Workload): name: str = "aiperf" cmd: str = "aiperf profile" script: File = File(Path(__file__).parent.parent / "ai_dynamo/aiperf.sh") + runtime: File = Field(default=File(Path(__file__).parent.parent / 
"ai_dynamo/runtime/aiperf.py"), exclude=True) setup_cmd: str | None = Field( default=None, serialization_alias="setup-cmd", @@ -267,7 +269,13 @@ class AIPerf(Workload): @property def installables(self) -> list[Installable]: - return [self.script] + return [self.script, self.runtime] + + +class AIPerfPhase(AIPerf): + """Named AIPerf phase that overrides the base AIPerf configuration.""" + + name: str = Field(min_length=1, pattern=r"^[A-Za-z0-9_.-]+$") class AIPerfAccuracy(BaseModel): @@ -324,6 +332,7 @@ class AIDynamoCmdArgs(CmdArgs): lmcache_controller: LMCacheController | None = None genai_perf: GenAIPerf = Field(default_factory=GenAIPerf) aiperf: AIPerf = Field(default_factory=AIPerf) + aiperf_phases: list[AIPerfPhase] | None = None aiperf_accuracy: AIPerfAccuracy | None = None workloads: str = "genai_perf.sh" @@ -341,6 +350,23 @@ def validate_workloads(cls, v: str) -> str: def workloads_list(self) -> list[str]: return [w.strip() for w in self.workloads.split(",")] + @model_validator(mode="after") + def validate_aiperf_phases(self) -> "AIDynamoCmdArgs": + """Validate AIPerf phases.""" + if not self.aiperf_phases: + return self + + seen = set() + duplicates = set() + for phase in self.aiperf_phases: + if phase.name in seen: + duplicates.add(phase.name) + seen.add(phase.name) + if duplicates: + raise ValueError(f"AIPerf phase names must be unique. Duplicates: {sorted(duplicates)}") + + return self + @property def installables(self) -> list[Installable]: return [ diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh index 15cee3a58..476ee3062 100644 --- a/src/cloudai/workloads/ai_dynamo/aiperf.sh +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -2,182 +2,8 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES # Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# aiperf.sh — aiperf profile wrapper for ai_dynamo workloads. -# -# Called from ai_dynamo.sh's launch_workload() with: -# bash aiperf.sh --result-dir --model --url --port -# [--cmd ] [--report-name ] [--artifact-dir-name ] [--extra-args ] -# -- ... -# -# Context flags (before --) that are recognised and used: -# --result-dir Directory where artifacts and the final report are written. -# --model HuggingFace model identifier (e.g. Qwen/Qwen3-0.6B). -# --url Base URL of the dynamo.frontend (e.g. http://node01). -# --port HTTP port the dynamo.frontend is listening on. -# --report-name Output CSV name (default: aiperf_report.csv). -# --artifact-dir-name Artifact directory name under --result-dir (default: aiperf_artifacts). -# --cmd Full launch command including subcommand (default: "aiperf profile"). -# --setup-cmd Optional shell command run before launching aiperf. -# --extra-args Raw string appended verbatim after all other flags. -# -# All unrecognised flags (--install-dir, --gpus-per-node, etc.) are silently -# consumed so this script is forward-compatible with launch_workload additions. -# -# Everything after -- is passed directly to the aiperf profile invocation. 
set -Eeuo pipefail -result_dir="" -model="" -url="http://localhost" -port=8000 -report_name="aiperf_report.csv" -artifact_dir_name="aiperf_artifacts" -cmd="aiperf profile" -setup_cmd="" -declare -a extra_args=() -declare -a profile_args=() - -log() { - echo "[$(date '+%F %T') $(hostname)]: $*" -} - -_parse_aiperf_args() { - while [[ $# -ge 2 ]]; do - case "$1" in - --*) profile_args+=("$1" "$2"); shift 2 ;; - *) shift ;; - esac - done - # Capture a trailing lone boolean flag if present. - # Use if/fi — not [[ ]] && — so set -e does not trigger on a false condition. - if [[ $# -eq 1 && "$1" == --* ]]; then - profile_args+=("$1") - fi -} - -process_args() { - while [[ $# -gt 0 ]]; do - case "$1" in - --result-dir) result_dir="$2"; shift 2 ;; - --model) model="$2"; shift 2 ;; - --url) url="$2"; shift 2 ;; - --port) port="$2"; shift 2 ;; - --report-name) report_name="$2"; shift 2 ;; - --artifact-dir-name) artifact_dir_name="$2"; shift 2 ;; - --cmd) cmd="$2"; shift 2 ;; - --setup-cmd) setup_cmd="$2"; shift 2 ;; - --extra-args) read -ra extra_args <<< "$2"; shift 2 ;; - --) shift; _parse_aiperf_args "$@"; break ;; - --*) if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;; # consume unknown flag; shift 2 only if next arg is a value - *) shift ;; - esac - done - - log "Parsed args: - result_dir: $result_dir - model: $model - url: $url - port: $port - report_name: $report_name - artifact_dir: $artifact_dir_name - cmd: $cmd - setup_cmd: ${setup_cmd:-} - extra_args: ${extra_args[*]:-} - profile_args: ${profile_args[*]:-}" -} - -run_setup_cmd() { - if [[ -z "$setup_cmd" ]]; then - return - fi - - log "Running AIPerf setup command: $setup_cmd" - bash -lc "$setup_cmd" - log "AIPerf setup command complete" -} - -process_results() { - local artifact_dir="$result_dir/$artifact_dir_name" - local csv_path="" - - if [[ -f "$artifact_dir/profile_export_aiperf.csv" ]]; then - csv_path="$artifact_dir/profile_export_aiperf.csv" - else - csv_path=$(find "$artifact_dir" 
-name "*aiperf*.csv" -print -quit 2>/dev/null || true) - fi - - if [[ -n "$csv_path" ]]; then - cp "$csv_path" "$result_dir/$report_name" - log "aiperf report saved to $result_dir/$report_name" - else - log "ERROR: no CSV found in $artifact_dir — aiperf may not have completed" - exit 1 - fi - -} - -run_aiperf() { - local full_url="$1" - local artifact_dir="$2" - local -a run_cmd=() - read -ra run_cmd <<< "$cmd" - local -a launch_cmd=( - "${run_cmd[@]}" - --model "$model" - --url "$full_url" - --endpoint-type chat - --streaming - --artifact-dir "$artifact_dir" - --no-server-metrics - ) - - log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url" - - if [[ "${#profile_args[@]}" -gt 0 ]]; then - launch_cmd+=("${profile_args[@]}") - fi - if [[ "${#extra_args[@]}" -gt 0 ]]; then - launch_cmd+=("${extra_args[@]}") - fi - - "${launch_cmd[@]}" - - log "aiperf run complete" -} - -main() { - process_args "$@" - - if [[ -z "$result_dir" ]]; then - log "ERROR: --result-dir is required"; exit 1 - fi - if [[ -z "$model" ]]; then - log "ERROR: --model is required"; exit 1 - fi - - run_setup_cmd - - local full_url="${url}:${port}" - local artifact_dir="$result_dir/$artifact_dir_name" - rm -rf "$artifact_dir" - - run_aiperf "$full_url" "$artifact_dir" - process_results -} - -main "$@" -exit 0 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec python3 "${SCRIPT_DIR}/aiperf.py" "$@" diff --git a/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py b/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py new file mode 100644 index 000000000..b5476d571 --- /dev/null +++ b/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Execute generated AIPerf runtime entries.""" + +from __future__ import annotations + +import argparse +import json +import shlex +import shutil +import subprocess +import sys +from pathlib import Path +from typing import Any + + +def log(message: str) -> None: + print(message, flush=True) + + +def substitute_frontend_url(values: list[str], frontend_url: str) -> list[str]: + return [value.replace("{frontend_url}", frontend_url) for value in values] + + +def run_entry(entry: dict[str, Any], frontend_url: str) -> None: + argv = substitute_frontend_url([*entry["cmd"], *entry.get("cli", [])], frontend_url) + output_folder = entry.get("output_folder") + if output_folder: + shutil.rmtree(output_folder, ignore_errors=True) + + log(f"Running {entry['name']}: {shlex.join(argv)}") + log_file = entry.get("log_file") + if log_file: + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + with log_path.open("w", encoding="utf-8") as fp: + subprocess.run(argv, stdout=fp, stderr=subprocess.STDOUT, check=True) + else: + subprocess.run(argv, check=True) + + report_source = entry.get("report_source") + report_file = entry.get("report_file") + if report_source and report_file: + report_path = Path(report_file) + report_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(report_source, report_path) + log(f"AIPerf report saved to {report_path}") + + final_report_file = entry.get("final_report_file") + if final_report_file and report_file: + shutil.copy2(report_file, final_report_file) + log(f"Final AIPerf report saved to {final_report_file}") + + +def parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--commands-file", required=True) + parser.add_argument("--url", required=True) + args, _ = parser.parse_known_args(argv) + return args + + +def main(argv: list[str]) -> int: + try: + args = parse_args(argv) + with 
Path(args.commands_file).open(encoding="utf-8") as fp: + entries = json.load(fp) + + for entry in entries: + run_entry(entry, args.url) + except Exception as exc: + log(f"ERROR: {exc}") + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 861a4c469..4cbb33823 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -14,10 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import logging import shlex from pathlib import Path -from typing import List, cast +from typing import Any, List, cast import yaml from pydantic import BaseModel, TypeAdapter, ValidationError @@ -25,7 +26,15 @@ from cloudai.core import File, GitRepo from cloudai.systems.slurm import SlurmCommandGenStrategy -from .ai_dynamo import LMCACHE_CONFIG_BACKUP_FILE_NAME, LMCACHE_CONFIG_FILE_NAME, AIDynamoTestDefinition +from .ai_dynamo import ( + AIPERF_ARTIFACTS_DIR, + AIPERF_COMMANDS_FILE_NAME, + LMCACHE_CONFIG_BACKUP_FILE_NAME, + LMCACHE_CONFIG_FILE_NAME, + AIDynamoTestDefinition, + AIPerf, + AIPerfPhase, +) class AIDynamoSlurmCommandGenStrategy(SlurmCommandGenStrategy): @@ -109,8 +118,135 @@ def _prepare_lmcache_config(self): (self.test_run.output_path / LMCACHE_CONFIG_FILE_NAME).write_text(config) (self.test_run.output_path / LMCACHE_CONFIG_BACKUP_FILE_NAME).write_text(config) + def _aiperf_config_dict(self, aiperf: AIPerf, *, exclude_unset: bool = False) -> dict[str, Any]: + return aiperf.model_dump( + by_alias=True, + exclude={"args", "name", "repo", "script", "runtime"}, + exclude_none=True, + exclude_unset=exclude_unset, + ) + + def _aiperf_args_dict(self, aiperf: AIPerf, *, exclude_unset: bool = False) -> dict[str, Any]: + return 
aiperf.args.model_dump(by_alias=True, exclude_none=True, exclude_unset=exclude_unset) + + def _aiperf_args_argv(self, args: dict[str, Any]) -> list[str]: + result = [] + for key, value in args.items(): + result.append(f"--{key}") + if value is not None: + result.append(str(value)) + return result + + def _runtime_result_path(self, path: str) -> str: + if Path(path).is_absolute(): + return path + return f"{self.CONTAINER_MOUNT_OUTPUT}/{path}" + + def _split_extra_args(self, value: Any) -> list[str]: + if value is None: + return [] + if isinstance(value, list): + return [str(item) for item in value] + return shlex.split(str(value)) + + def _aiperf_phase_manifest_entry(self, base: AIPerf, phase: AIPerfPhase, *, single_phase: bool) -> dict[str, Any]: + base_config = self._aiperf_config_dict(base) + phase_config = self._aiperf_config_dict(phase, exclude_unset=True) + config = {**base_config, **phase_config} + + base_args = self._aiperf_args_dict(base) + phase_args = self._aiperf_args_dict(phase, exclude_unset=True) + args = {**base_args, **phase_args} + + if "artifact-dir-name" not in phase_config: + base_artifact_dir = base_config.get("artifact-dir-name", AIPERF_ARTIFACTS_DIR) + config["artifact-dir-name"] = base_artifact_dir if single_phase else f"{base_artifact_dir}/{phase.name}" + if "report-name" not in phase_config: + base_report_name = base_config.get("report-name", "aiperf_report.csv") + config["report-name"] = base_report_name if single_phase else f"aiperf_{phase.name}_report.csv" + + return { + "name": phase.name, + "config": config, + "profile_args": self._aiperf_args_argv(args), + } + + def _aiperf_entries(self) -> list[dict[str, Any]]: + phases = self.td.cmd_args.aiperf_phases or [AIPerfPhase.model_validate({"name": "aiperf"})] + return [ + self._aiperf_phase_manifest_entry( + self.td.cmd_args.aiperf, + phase, + single_phase=len(phases) == 1, + ) + for phase in phases + ] + + def _aiperf_run_entry(self, entry: dict[str, Any], *, write_phase_log: bool, 
is_final: bool) -> dict[str, Any]: + config = entry["config"] + artifact_dir_name = config["artifact-dir-name"] + artifact_dir = self._runtime_result_path(artifact_dir_name) + runtime_entry = { + "name": entry["name"], + "cmd": shlex.split(config["cmd"]), + "cli": [ + "--model", + self.td.cmd_args.dynamo.model, + "--url", + f"{{frontend_url}}:{self.td.cmd_args.dynamo.port}", + "--endpoint-type", + "chat", + "--streaming", + "--artifact-dir", + artifact_dir, + "--no-server-metrics", + *entry["profile_args"], + *self._split_extra_args(config.get("extra-args")), + ], + "output_folder": artifact_dir, + "report_source": f"{artifact_dir}/profile_export_aiperf.csv", + "report_file": self._runtime_result_path(config["report-name"]), + } + if write_phase_log: + runtime_entry["log_file"] = self._runtime_result_path(f"aiperf_{entry['name']}.log") + if is_final: + runtime_entry["final_report_file"] = self._runtime_result_path("aiperf_report.csv") + return runtime_entry + + def _aiperf_setup_entry(self, setup_cmd: str) -> dict[str, Any]: + return { + "name": "aiperf_setup", + "cmd": ["bash", "-lc", setup_cmd], + "cli": [], + } + + def _prepare_aiperf_commands(self) -> str | None: + if "aiperf.sh" not in self.td.cmd_args.workloads_list: + return None + + self.test_run.output_path.mkdir(parents=True, exist_ok=True) + entries = self._aiperf_entries() + runtime_entries = [] + setup_cmd = entries[0]["config"].get("setup-cmd") + if setup_cmd: + runtime_entries.append(self._aiperf_setup_entry(setup_cmd)) + + write_phase_logs = len(entries) > 1 + for idx, entry in enumerate(entries): + runtime_entries.append( + self._aiperf_run_entry( + entry, + write_phase_log=write_phase_logs, + is_final=len(entries) > 1 and idx == len(entries) - 1, + ) + ) + + (self.test_run.output_path / AIPERF_COMMANDS_FILE_NAME).write_text(json.dumps(runtime_entries, indent=2)) + return f"{self.CONTAINER_MOUNT_OUTPUT}/{AIPERF_COMMANDS_FILE_NAME}" + def _gen_script_args(self, td: AIDynamoTestDefinition) -> 
List[str]: self._prepare_lmcache_config() + aiperf_commands_file = self._prepare_aiperf_commands() if not td.repo.installed_path: raise ValueError("Dynamo repo is not installed") args = [ @@ -146,6 +282,8 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: args.extend(self._get_nested_toml_args(td.cmd_args.genai_perf, "--genai_perf-")) args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-")) + if aiperf_commands_file: + args.append(f"--aiperf-commands-file {aiperf_commands_file}") if td.cmd_args.aiperf_accuracy is not None: args.extend(self._get_nested_toml_args(td.cmd_args.aiperf_accuracy, "--aiperf_accuracy-")) diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 0e2f23061..a279297ce 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json import shlex from pathlib import Path from typing import cast @@ -25,6 +26,7 @@ from cloudai.core import GitRepo from cloudai.systems.slurm import SlurmSystem from cloudai.workloads.ai_dynamo import ( + AIPERF_COMMANDS_FILE_NAME, LMCACHE_CONFIG_BACKUP_FILE_NAME, LMCACHE_CONFIG_FILE_NAME, AIDynamoArgs, @@ -33,6 +35,7 @@ AIDynamoTestDefinition, AIPerf, AIPerfAccuracy, + AIPerfPhase, GenAIPerf, LMCacheController, WorkerBaseArgs, @@ -218,6 +221,97 @@ def test_gen_script_args_contains_custom_aiperf_accuracy_args(strategy: AIDynamo assert f'--aiperf_accuracy-cli "{cli}"' in result +def test_gen_script_args_writes_resolved_aiperf_commands(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + td.cmd_args.workloads = "aiperf.sh" + td.cmd_args.aiperf = AIPerf.model_validate( + { + "setup-cmd": "python -m pip install --upgrade aiperf", + "args": { + "concurrency": 2, + "request-count": 50, + "synthetic-input-tokens-mean": 300, + "output-tokens-mean": 500, + }, + } + ) + td.cmd_args.aiperf_phases = [ + AIPerfPhase.model_validate({"name": "round_1", "args": {"concurrency": 1}}), + AIPerfPhase.model_validate({"name": "round_2", "args": {"request-count": 10}}), + ] + + result = strategy._gen_script_args(td) + + assert f"--aiperf-commands-file {strategy.CONTAINER_MOUNT_OUTPUT}/{AIPERF_COMMANDS_FILE_NAME}" in result + entries = json.loads((strategy.test_run.output_path / AIPERF_COMMANDS_FILE_NAME).read_text()) + assert entries[0] == { + "name": "aiperf_setup", + "cmd": ["bash", "-lc", "python -m pip install --upgrade aiperf"], + "cli": [], + } + assert entries[1]["name"] == "round_1" + assert entries[1]["cmd"] == ["aiperf", "profile"] + assert entries[1]["cli"][:9] == [ + "--model", + "model", + "--url", + "{frontend_url}:8000", + "--endpoint-type", + "chat", + "--streaming", + "--artifact-dir", + f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts/round_1", + ] + assert entries[1]["cli"][-8:] == [ + 
"--concurrency", + "1", + "--request-count", + "50", + "--synthetic-input-tokens-mean", + "300", + "--output-tokens-mean", + "500", + ] + assert entries[1]["log_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1.log" + assert entries[1]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1_report.csv" + assert entries[2]["cli"][-8:] == [ + "--concurrency", + "2", + "--request-count", + "10", + "--synthetic-input-tokens-mean", + "300", + "--output-tokens-mean", + "500", + ] + assert entries[2]["final_report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_report.csv" + + +def test_single_aiperf_phase_keeps_legacy_artifact_defaults(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + td.cmd_args.workloads = "aiperf.sh" + td.cmd_args.aiperf_phases = [AIPerfPhase.model_validate({"name": "round_1", "args": {"request-count": 10}})] + + strategy._gen_script_args(td) + + entries = json.loads((strategy.test_run.output_path / AIPERF_COMMANDS_FILE_NAME).read_text()) + assert entries[0]["output_folder"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts" + assert entries[0]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_report.csv" + assert "log_file" not in entries[0] + + +def test_aiperf_phase_names_must_be_unique(cmd_args: AIDynamoCmdArgs) -> None: + with pytest.raises(ValueError, match="AIPerf phase names must be unique"): + AIDynamoCmdArgs( + docker_image_url=cmd_args.docker_image_url, + dynamo=cmd_args.dynamo, + aiperf_phases=[ + AIPerfPhase.model_validate({"name": "round_1"}), + AIPerfPhase.model_validate({"name": "round_1"}), + ], + ) + + def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' diff --git a/tests/workloads/ai_dynamo/test_runtime_aiperf.py 
b/tests/workloads/ai_dynamo/test_runtime_aiperf.py new file mode 100644 index 000000000..18045b2a2 --- /dev/null +++ b/tests/workloads/ai_dynamo/test_runtime_aiperf.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import json +import sys +from pathlib import Path + +from cloudai.workloads.ai_dynamo.runtime import aiperf + + +def _write_fake_aiperf(tmp_path: Path) -> Path: + script = tmp_path / "fake_aiperf.py" + script.write_text( + """ +import sys +from pathlib import Path + +artifact_dir = Path(sys.argv[sys.argv.index("--artifact-dir") + 1]) +url = sys.argv[sys.argv.index("--url") + 1] +artifact_dir.mkdir(parents=True, exist_ok=True) +(artifact_dir / "profile_export_aiperf.csv").write_text(f"url\\n{url}\\n", encoding="utf-8") +""".strip(), + encoding="utf-8", + ) + return script + + +def test_runtime_executes_entries_and_copies_final_report(tmp_path: Path) -> None: + fake_aiperf = _write_fake_aiperf(tmp_path) + commands_file = tmp_path / "aiperf_commands.json" + artifact_dir = tmp_path / "aiperf_artifacts" / "round_1" + report_file = tmp_path / "aiperf_round_1_report.csv" + final_report_file = tmp_path / "aiperf_report.csv" + commands_file.write_text( + json.dumps( + [ + { + "name": "round_1", + "cmd": [sys.executable, str(fake_aiperf)], + "cli": [ + "--url", + "{frontend_url}:8000", + "--artifact-dir", + str(artifact_dir), + ], + "output_folder": str(artifact_dir), + "log_file": str(tmp_path / "aiperf_round_1.log"), + "report_source": str(artifact_dir / "profile_export_aiperf.csv"), + "report_file": str(report_file), + "final_report_file": str(final_report_file), + } + ] + ), + encoding="utf-8", + ) + + result = aiperf.main(["--url", "http://frontend", "--commands-file", str(commands_file)]) + + assert result == 0 + assert report_file.read_text(encoding="utf-8") == "url\nhttp://frontend:8000\n" + assert 
final_report_file.read_text(encoding="utf-8") == "url\nhttp://frontend:8000\n" + assert (tmp_path / "aiperf_round_1.log").is_file() From c3877ca478b5c1ea6508070fcaf956c2856568f2 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 29 May 2026 18:39:20 -0700 Subject: [PATCH 02/16] fix vllm config --- conf/experimental/ai_dynamo/test/vllm.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 0667f1cab..581ecf3e7 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -39,6 +39,7 @@ workloads = "aiperf.sh" tensor-parallel-size = 8 pipeline-parallel-size = 1 data-parallel-size = 1 + kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' [cmd_args.dynamo.decode_worker] num-nodes = 1 @@ -51,6 +52,7 @@ workloads = "aiperf.sh" tensor-parallel-size = 8 pipeline-parallel-size = 1 data-parallel-size = 1 + kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' [cmd_args.lmcache_controller] cmd = "lmcache_controller --host 0.0.0.0 --port 9000 --monitor-port 9001" From c4512dc38a52af050138aaf9a373e8fe212f4137 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 12:08:16 +0200 Subject: [PATCH 03/16] fix filenames for different aiperf iterations --- .../workloads/ai_dynamo/runtime/aiperf.py | 20 +++++++---- .../ai_dynamo/slurm_command_gen_strategy.py | 13 ++++++-- .../test_command_gen_strategy_slurm.py | 1 + .../ai_dynamo/test_runtime_aiperf.py | 33 +++++++++++++++++++ 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py b/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py index b5476d571..d76361cfa 100644 --- a/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py +++ b/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py @@ -24,6 +24,18 @@ def substitute_frontend_url(values: list[str], frontend_url: str) -> list[str]: 
return [value.replace("{frontend_url}", frontend_url) for value in values] +def copy_file(source: str, destination: str, message: str) -> None: + source_path = Path(source) + destination_path = Path(destination) + destination_path.parent.mkdir(parents=True, exist_ok=True) + if source_path.resolve() == destination_path.resolve(): + log(f"{message} {destination_path}") + return + + shutil.copy2(source_path, destination_path) + log(f"{message} {destination_path}") + + def run_entry(entry: dict[str, Any], frontend_url: str) -> None: argv = substitute_frontend_url([*entry["cmd"], *entry.get("cli", [])], frontend_url) output_folder = entry.get("output_folder") @@ -43,15 +55,11 @@ def run_entry(entry: dict[str, Any], frontend_url: str) -> None: report_source = entry.get("report_source") report_file = entry.get("report_file") if report_source and report_file: - report_path = Path(report_file) - report_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(report_source, report_path) - log(f"AIPerf report saved to {report_path}") + copy_file(report_source, report_file, "AIPerf report saved to") final_report_file = entry.get("final_report_file") if final_report_file and report_file: - shutil.copy2(report_file, final_report_file) - log(f"Final AIPerf report saved to {final_report_file}") + copy_file(report_file, final_report_file, "Final AIPerf report saved to") def parse_args(argv: list[str]) -> argparse.Namespace: diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 4cbb33823..567d8d7a5 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -149,6 +149,13 @@ def _split_extra_args(self, value: Any) -> list[str]: return [str(item) for item in value] return shlex.split(str(value)) + def _aiperf_phase_has_explicit_value(self, phase: AIPerfPhase, field_name: str, *extra_aliases: str) -> bool: 
+ if field_name in phase.model_fields_set: + return True + + extra = phase.model_extra or {} + return any(alias in extra for alias in extra_aliases) + def _aiperf_phase_manifest_entry(self, base: AIPerf, phase: AIPerfPhase, *, single_phase: bool) -> dict[str, Any]: base_config = self._aiperf_config_dict(base) phase_config = self._aiperf_config_dict(phase, exclude_unset=True) @@ -158,10 +165,12 @@ def _aiperf_phase_manifest_entry(self, base: AIPerf, phase: AIPerfPhase, *, sing phase_args = self._aiperf_args_dict(phase, exclude_unset=True) args = {**base_args, **phase_args} - if "artifact-dir-name" not in phase_config: + if not self._aiperf_phase_has_explicit_value( + phase, "artifact_dir_name", "artifact-dir-name", "artifact_dir_name" + ): base_artifact_dir = base_config.get("artifact-dir-name", AIPERF_ARTIFACTS_DIR) config["artifact-dir-name"] = base_artifact_dir if single_phase else f"{base_artifact_dir}/{phase.name}" - if "report-name" not in phase_config: + if not self._aiperf_phase_has_explicit_value(phase, "report_name", "report-name", "report_name"): base_report_name = base_config.get("report-name", "aiperf_report.csv") config["report-name"] = base_report_name if single_phase else f"aiperf_{phase.name}_report.csv" diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index a279297ce..e8e0f68d5 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -284,6 +284,7 @@ def test_gen_script_args_writes_resolved_aiperf_commands(strategy: AIDynamoSlurm "--output-tokens-mean", "500", ] + assert entries[2]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_2_report.csv" assert entries[2]["final_report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_report.csv" diff --git a/tests/workloads/ai_dynamo/test_runtime_aiperf.py b/tests/workloads/ai_dynamo/test_runtime_aiperf.py index 
18045b2a2..b54988102 100644 --- a/tests/workloads/ai_dynamo/test_runtime_aiperf.py +++ b/tests/workloads/ai_dynamo/test_runtime_aiperf.py @@ -61,3 +61,36 @@ def test_runtime_executes_entries_and_copies_final_report(tmp_path: Path) -> Non assert report_file.read_text(encoding="utf-8") == "url\nhttp://frontend:8000\n" assert final_report_file.read_text(encoding="utf-8") == "url\nhttp://frontend:8000\n" assert (tmp_path / "aiperf_round_1.log").is_file() + + +def test_runtime_allows_final_report_to_match_report_file(tmp_path: Path) -> None: + fake_aiperf = _write_fake_aiperf(tmp_path) + commands_file = tmp_path / "aiperf_commands.json" + artifact_dir = tmp_path / "aiperf_artifacts" + report_file = tmp_path / "aiperf_report.csv" + commands_file.write_text( + json.dumps( + [ + { + "name": "aiperf", + "cmd": [sys.executable, str(fake_aiperf)], + "cli": [ + "--url", + "{frontend_url}:8000", + "--artifact-dir", + str(artifact_dir), + ], + "output_folder": str(artifact_dir), + "report_source": str(artifact_dir / "profile_export_aiperf.csv"), + "report_file": str(report_file), + "final_report_file": str(report_file), + } + ] + ), + encoding="utf-8", + ) + + result = aiperf.main(["--url", "http://frontend", "--commands-file", str(commands_file)]) + + assert result == 0 + assert report_file.read_text(encoding="utf-8") == "url\nhttp://frontend:8000\n" From 2ab10fae661c2d9130fdabb61ae9d46f465b1a28 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 12:42:37 +0200 Subject: [PATCH 04/16] fix copy crash --- src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 23 +++++++++++++++++-- .../ai_dynamo/slurm_command_gen_strategy.py | 8 +++---- .../test_command_gen_strategy_slurm.py | 21 +++++++++++++++++ 3 files changed, 46 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 5c45a149b..4f08f68d8 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ 
b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -272,10 +272,29 @@ def installables(self) -> list[Installable]: return [self.script, self.runtime] -class AIPerfPhase(AIPerf): +class AIPerfPhase(BaseModel): """Named AIPerf phase that overrides the base AIPerf configuration.""" - name: str = Field(min_length=1, pattern=r"^[A-Za-z0-9_.-]+$") + model_config = ConfigDict(extra="allow", populate_by_name=True) + + name: str = Field(..., min_length=1, pattern=r"^[A-Za-z0-9_.-]+$") + cmd: str | None = None + setup_cmd: str | None = Field( + default=None, + serialization_alias="setup-cmd", + validation_alias=AliasChoices("setup-cmd", "setup_cmd"), + ) + report_name: str | None = Field( + default=None, + serialization_alias="report-name", + validation_alias=AliasChoices("report-name", "report_name"), + ) + args: Args = Field(default_factory=Args) + extra_args: str | list[str] | None = Field( + default=None, + serialization_alias="extra-args", + validation_alias=AliasChoices("extra-args", "extra_args"), + ) class AIPerfAccuracy(BaseModel): diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 567d8d7a5..b1c1b2526 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -118,7 +118,7 @@ def _prepare_lmcache_config(self): (self.test_run.output_path / LMCACHE_CONFIG_FILE_NAME).write_text(config) (self.test_run.output_path / LMCACHE_CONFIG_BACKUP_FILE_NAME).write_text(config) - def _aiperf_config_dict(self, aiperf: AIPerf, *, exclude_unset: bool = False) -> dict[str, Any]: + def _aiperf_config_dict(self, aiperf: AIPerf | AIPerfPhase, *, exclude_unset: bool = False) -> dict[str, Any]: return aiperf.model_dump( by_alias=True, exclude={"args", "name", "repo", "script", "runtime"}, @@ -126,7 +126,7 @@ def _aiperf_config_dict(self, aiperf: AIPerf, *, exclude_unset: bool = False) -> 
exclude_unset=exclude_unset, ) - def _aiperf_args_dict(self, aiperf: AIPerf, *, exclude_unset: bool = False) -> dict[str, Any]: + def _aiperf_args_dict(self, aiperf: AIPerf | AIPerfPhase, *, exclude_unset: bool = False) -> dict[str, Any]: return aiperf.args.model_dump(by_alias=True, exclude_none=True, exclude_unset=exclude_unset) def _aiperf_args_argv(self, args: dict[str, Any]) -> list[str]: @@ -150,11 +150,11 @@ def _split_extra_args(self, value: Any) -> list[str]: return shlex.split(str(value)) def _aiperf_phase_has_explicit_value(self, phase: AIPerfPhase, field_name: str, *extra_aliases: str) -> bool: - if field_name in phase.model_fields_set: + if field_name in phase.model_fields_set and getattr(phase, field_name) is not None: return True extra = phase.model_extra or {} - return any(alias in extra for alias in extra_aliases) + return any(extra.get(alias) is not None for alias in extra_aliases) def _aiperf_phase_manifest_entry(self, base: AIPerf, phase: AIPerfPhase, *, single_phase: bool) -> dict[str, Any]: base_config = self._aiperf_config_dict(base) diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index e8e0f68d5..78ef40f1a 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -288,6 +288,27 @@ def test_gen_script_args_writes_resolved_aiperf_commands(strategy: AIDynamoSlurm assert entries[2]["final_report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_report.csv" +def test_aiperf_phase_roundtrip_does_not_emit_default_report_name(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + td.cmd_args.workloads = "aiperf.sh" + td.cmd_args.aiperf_phases = [ + AIPerfPhase.model_validate({"name": "round_1"}), + AIPerfPhase.model_validate({"name": "round_2"}), + ] + + roundtripped = 
AIDynamoTestDefinition.model_validate(td.model_dump()) + strategy.test_run.test = roundtripped + + assert roundtripped.cmd_args.aiperf_phases is not None + assert [phase.report_name for phase in roundtripped.cmd_args.aiperf_phases] == [None, None] + + strategy._gen_script_args(roundtripped) + + entries = json.loads((strategy.test_run.output_path / AIPERF_COMMANDS_FILE_NAME).read_text()) + assert entries[0]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1_report.csv" + assert entries[1]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_2_report.csv" + + def test_single_aiperf_phase_keeps_legacy_artifact_defaults(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.workloads = "aiperf.sh" From bbd5cf2118b45e7b1dc60893dfc4f943021f19d1 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 15:03:44 +0200 Subject: [PATCH 05/16] refactor and more tests --- src/cloudai/workloads/ai_dynamo/__init__.py | 2 - src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 14 +- src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 1 + src/cloudai/workloads/ai_dynamo/aiperf.sh | 5 +- .../workloads/ai_dynamo/runtime/aiperf.py | 89 --------- .../ai_dynamo/slurm_command_gen_strategy.py | 186 +++++++++--------- tests/ref_data/ai-dynamo-aiperf.sh | 25 +++ tests/ref_data/ai-dynamo.sbatch | 10 +- tests/test_acceptance.py | 24 +++ .../test_command_gen_strategy_slurm.py | 75 ++----- .../ai_dynamo/test_runtime_aiperf.py | 96 --------- 11 files changed, 182 insertions(+), 345 deletions(-) delete mode 100644 src/cloudai/workloads/ai_dynamo/runtime/aiperf.py create mode 100644 tests/ref_data/ai-dynamo-aiperf.sh delete mode 100644 tests/workloads/ai_dynamo/test_runtime_aiperf.py diff --git a/src/cloudai/workloads/ai_dynamo/__init__.py b/src/cloudai/workloads/ai_dynamo/__init__.py index 57e2eb99e..86938dfec 100644 --- a/src/cloudai/workloads/ai_dynamo/__init__.py +++ 
b/src/cloudai/workloads/ai_dynamo/__init__.py @@ -15,7 +15,6 @@ # limitations under the License. from .ai_dynamo import ( - AIPERF_COMMANDS_FILE_NAME, LMCACHE_CONFIG_BACKUP_FILE_NAME, LMCACHE_CONFIG_FILE_NAME, AIDynamoArgs, @@ -34,7 +33,6 @@ from .slurm_command_gen_strategy import AIDynamoSlurmCommandGenStrategy __all__ = [ - "AIPERF_COMMANDS_FILE_NAME", "LMCACHE_CONFIG_BACKUP_FILE_NAME", "LMCACHE_CONFIG_FILE_NAME", "AIDynamoArgs", diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 4f08f68d8..ed04d63ba 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -42,7 +42,6 @@ from cloudai.systems.slurm import SlurmSystem AIPERF_ARTIFACTS_DIR = "aiperf_artifacts" -AIPERF_COMMANDS_FILE_NAME = "aiperf_commands.json" AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts" AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv" LMCACHE_CONFIG_FILE_NAME = "lmcache-config.yaml" @@ -255,7 +254,6 @@ class AIPerf(Workload): name: str = "aiperf" cmd: str = "aiperf profile" script: File = File(Path(__file__).parent.parent / "ai_dynamo/aiperf.sh") - runtime: File = Field(default=File(Path(__file__).parent.parent / "ai_dynamo/runtime/aiperf.py"), exclude=True) setup_cmd: str | None = Field( default=None, serialization_alias="setup-cmd", @@ -266,10 +264,15 @@ class AIPerf(Workload): serialization_alias="report-name", validation_alias=AliasChoices("report-name", "report_name"), ) + artifact_dir_name: str = Field( + default=AIPERF_ARTIFACTS_DIR, + serialization_alias="artifact-dir-name", + validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"), + ) @property def installables(self) -> list[Installable]: - return [self.script, self.runtime] + return [self.script] class AIPerfPhase(BaseModel): @@ -289,6 +292,11 @@ class AIPerfPhase(BaseModel): serialization_alias="report-name", validation_alias=AliasChoices("report-name", "report_name"), ) + 
artifact_dir_name: str | None = Field( + default=None, + serialization_alias="artifact-dir-name", + validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"), + ) args: Args = Field(default_factory=Args) extra_args: str | list[str] | None = Field( default=None, diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index 52e975850..25c4126fc 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -1058,6 +1058,7 @@ function launch_workload() local workload_name="${workload_config_ref["--name"]}" local script="${workload_config_ref["--script"]}" + export FRONTEND_URL="${dynamo_args["url"]}" # Build config and workload args as proper bash arrays to preserve # multi-word values (e.g. --cmd "genai-perf profile") through word splitting. diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh index 476ee3062..22eb0541c 100644 --- a/src/cloudai/workloads/ai_dynamo/aiperf.sh +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -5,5 +5,6 @@ set -Eeuo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -exec python3 "${SCRIPT_DIR}/aiperf.py" "$@" +echo "AIPerf scripts are generated per test run by the AIDynamo Slurm command generator." >&2 +echo "This installable placeholder should be overridden by --aiperf-script /cloudai_run_results/aiperf.sh." >&2 +exit 1 diff --git a/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py b/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py deleted file mode 100644 index d76361cfa..000000000 --- a/src/cloudai/workloads/ai_dynamo/runtime/aiperf.py +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Execute generated AIPerf runtime entries.""" - -from __future__ import annotations - -import argparse -import json -import shlex -import shutil -import subprocess -import sys -from pathlib import Path -from typing import Any - - -def log(message: str) -> None: - print(message, flush=True) - - -def substitute_frontend_url(values: list[str], frontend_url: str) -> list[str]: - return [value.replace("{frontend_url}", frontend_url) for value in values] - - -def copy_file(source: str, destination: str, message: str) -> None: - source_path = Path(source) - destination_path = Path(destination) - destination_path.parent.mkdir(parents=True, exist_ok=True) - if source_path.resolve() == destination_path.resolve(): - log(f"{message} {destination_path}") - return - - shutil.copy2(source_path, destination_path) - log(f"{message} {destination_path}") - - -def run_entry(entry: dict[str, Any], frontend_url: str) -> None: - argv = substitute_frontend_url([*entry["cmd"], *entry.get("cli", [])], frontend_url) - output_folder = entry.get("output_folder") - if output_folder: - shutil.rmtree(output_folder, ignore_errors=True) - - log(f"Running {entry['name']}: {shlex.join(argv)}") - log_file = entry.get("log_file") - if log_file: - log_path = Path(log_file) - log_path.parent.mkdir(parents=True, exist_ok=True) - with log_path.open("w", encoding="utf-8") as fp: - subprocess.run(argv, stdout=fp, stderr=subprocess.STDOUT, check=True) - else: - subprocess.run(argv, check=True) - - report_source = entry.get("report_source") - report_file = entry.get("report_file") - if report_source and report_file: - copy_file(report_source, report_file, "AIPerf report saved to") - - final_report_file = entry.get("final_report_file") - if final_report_file and report_file: - copy_file(report_file, final_report_file, "Final AIPerf report saved to") - - -def parse_args(argv: list[str]) -> argparse.Namespace: - parser = argparse.ArgumentParser() - 
parser.add_argument("--commands-file", required=True) - parser.add_argument("--url", required=True) - args, _ = parser.parse_known_args(argv) - return args - - -def main(argv: list[str]) -> int: - try: - args = parse_args(argv) - with Path(args.commands_file).open(encoding="utf-8") as fp: - entries = json.load(fp) - - for entry in entries: - run_entry(entry, args.url) - except Exception as exc: - log(f"ERROR: {exc}") - return 1 - - return 0 - - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index b1c1b2526..f02104d2d 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import logging import shlex from pathlib import Path @@ -27,8 +26,6 @@ from cloudai.systems.slurm import SlurmCommandGenStrategy from .ai_dynamo import ( - AIPERF_ARTIFACTS_DIR, - AIPERF_COMMANDS_FILE_NAME, LMCACHE_CONFIG_BACKUP_FILE_NAME, LMCACHE_CONFIG_FILE_NAME, AIDynamoTestDefinition, @@ -36,6 +33,8 @@ AIPerfPhase, ) +AIPERF_SCRIPT_FILE_NAME = "aiperf.sh" + class AIDynamoSlurmCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for AI Dynamo on Slurm systems.""" @@ -118,17 +117,6 @@ def _prepare_lmcache_config(self): (self.test_run.output_path / LMCACHE_CONFIG_FILE_NAME).write_text(config) (self.test_run.output_path / LMCACHE_CONFIG_BACKUP_FILE_NAME).write_text(config) - def _aiperf_config_dict(self, aiperf: AIPerf | AIPerfPhase, *, exclude_unset: bool = False) -> dict[str, Any]: - return aiperf.model_dump( - by_alias=True, - exclude={"args", "name", "repo", "script", "runtime"}, - exclude_none=True, - exclude_unset=exclude_unset, - ) - - def _aiperf_args_dict(self, aiperf: AIPerf | AIPerfPhase, *, 
exclude_unset: bool = False) -> dict[str, Any]: - return aiperf.args.model_dump(by_alias=True, exclude_none=True, exclude_unset=exclude_unset) - def _aiperf_args_argv(self, args: dict[str, Any]) -> list[str]: result = [] for key, value in args.items(): @@ -156,106 +144,110 @@ def _aiperf_phase_has_explicit_value(self, phase: AIPerfPhase, field_name: str, extra = phase.model_extra or {} return any(extra.get(alias) is not None for alias in extra_aliases) - def _aiperf_phase_manifest_entry(self, base: AIPerf, phase: AIPerfPhase, *, single_phase: bool) -> dict[str, Any]: - base_config = self._aiperf_config_dict(base) - phase_config = self._aiperf_config_dict(phase, exclude_unset=True) - config = {**base_config, **phase_config} - - base_args = self._aiperf_args_dict(base) - phase_args = self._aiperf_args_dict(phase, exclude_unset=True) - args = {**base_args, **phase_args} - - if not self._aiperf_phase_has_explicit_value( - phase, "artifact_dir_name", "artifact-dir-name", "artifact_dir_name" - ): - base_artifact_dir = base_config.get("artifact-dir-name", AIPERF_ARTIFACTS_DIR) - config["artifact-dir-name"] = base_artifact_dir if single_phase else f"{base_artifact_dir}/{phase.name}" - if not self._aiperf_phase_has_explicit_value(phase, "report_name", "report-name", "report_name"): - base_report_name = base_config.get("report-name", "aiperf_report.csv") - config["report-name"] = base_report_name if single_phase else f"aiperf_{phase.name}_report.csv" - - return { - "name": phase.name, - "config": config, - "profile_args": self._aiperf_args_argv(args), - } - - def _aiperf_entries(self) -> list[dict[str, Any]]: + def _resolve_aiperf_phase(self, phase: AIPerfPhase) -> AIPerf: + resolved = self.td.cmd_args.aiperf.model_copy(deep=True) + resolved.name = phase.name + single_phase = self.td.cmd_args.aiperf_phases is None or len(self.td.cmd_args.aiperf_phases) == 1 + + for field_name in ("cmd", "setup_cmd", "report_name", "artifact_dir_name", "extra_args"): + if 
self._aiperf_phase_has_explicit_value(phase, field_name, field_name.replace("_", "-")): + setattr(resolved, field_name, getattr(phase, field_name)) + + if not self._aiperf_phase_has_explicit_value(phase, "artifact_dir_name", "artifact-dir-name"): + base_artifact_dir = resolved.artifact_dir_name + resolved.artifact_dir_name = base_artifact_dir if single_phase else f"{base_artifact_dir}/{phase.name}" + if not self._aiperf_phase_has_explicit_value(phase, "report_name", "report-name"): + base_report_name = resolved.report_name + resolved.report_name = base_report_name if single_phase else f"aiperf_{phase.name}_report.csv" + + resolved.args = resolved.args.model_copy( + update=phase.args.model_dump(by_alias=True, exclude_none=True, exclude_unset=True) + ) + return resolved + + def _render_aiperf_script(self) -> str: phases = self.td.cmd_args.aiperf_phases or [AIPerfPhase.model_validate({"name": "aiperf"})] - return [ - self._aiperf_phase_manifest_entry( - self.td.cmd_args.aiperf, - phase, - single_phase=len(phases) == 1, - ) - for phase in phases + single_phase = len(phases) == 1 + setup_cmd = self._resolve_aiperf_phase(phases[0]).setup_cmd + lines = [ + "#!/usr/bin/env bash", + "set -Eeuo pipefail", + "", + 'log() { echo "[$(date +%F\\ %T) $(hostname)]: $*"; }', + "", + ': "${FRONTEND_URL:?FRONTEND_URL is not set}"', + "", ] - def _aiperf_run_entry(self, entry: dict[str, Any], *, write_phase_log: bool, is_final: bool) -> dict[str, Any]: - config = entry["config"] - artifact_dir_name = config["artifact-dir-name"] - artifact_dir = self._runtime_result_path(artifact_dir_name) - runtime_entry = { - "name": entry["name"], - "cmd": shlex.split(config["cmd"]), - "cli": [ + if setup_cmd: + setup_argv = ["bash", "-lc", setup_cmd] + lines.extend( + [ + f"log {shlex.quote(f'Running aiperf setup: {shlex.join(setup_argv)}')}", + shlex.join(setup_argv), + "", + ] + ) + + write_phase_logs = not single_phase + for idx, phase in enumerate(phases): + resolved_phase = 
self._resolve_aiperf_phase(phase) + artifact_dir = self._runtime_result_path(resolved_phase.artifact_dir_name) + report_source = f"{artifact_dir}/profile_export_aiperf.csv" + report_file = self._runtime_result_path(resolved_phase.report_name) + argv = [ + *shlex.split(resolved_phase.cmd), "--model", self.td.cmd_args.dynamo.model, - "--url", - f"{{frontend_url}}:{self.td.cmd_args.dynamo.port}", "--endpoint-type", "chat", "--streaming", "--artifact-dir", artifact_dir, "--no-server-metrics", - *entry["profile_args"], - *self._split_extra_args(config.get("extra-args")), - ], - "output_folder": artifact_dir, - "report_source": f"{artifact_dir}/profile_export_aiperf.csv", - "report_file": self._runtime_result_path(config["report-name"]), - } - if write_phase_log: - runtime_entry["log_file"] = self._runtime_result_path(f"aiperf_{entry['name']}.log") - if is_final: - runtime_entry["final_report_file"] = self._runtime_result_path("aiperf_report.csv") - return runtime_entry - - def _aiperf_setup_entry(self, setup_cmd: str) -> dict[str, Any]: - return { - "name": "aiperf_setup", - "cmd": ["bash", "-lc", setup_cmd], - "cli": [], - } - - def _prepare_aiperf_commands(self) -> str | None: + *self._aiperf_args_argv(resolved_phase.args.model_dump(by_alias=True, exclude_none=True)), + *self._split_extra_args(resolved_phase.extra_args), + ] + cmd = f'{shlex.join(argv)} --url "$FRONTEND_URL"' + log_message = f"Running {phase.name}: {cmd}" + lines.append(f"rm -rf {shlex.quote(artifact_dir)}") + lines.append(f"mkdir -p {shlex.quote(artifact_dir)}") + lines.append(f"log {shlex.quote(log_message)}") + if write_phase_logs: + log_file = self._runtime_result_path(f"aiperf_{phase.name}.log") + lines.append(f"{cmd} > {shlex.quote(log_file)} 2>&1") + else: + lines.append(cmd) + + lines.append(f"mkdir -p {shlex.quote(str(Path(report_file).parent))}") + if report_source != report_file: + lines.append(f"cp {shlex.quote(report_source)} {shlex.quote(report_file)}") + lines.append(f"log 
{shlex.quote(f'AIPerf report saved to {report_file}')}") + + if not single_phase and idx == len(phases) - 1: + final_report_file = self._runtime_result_path("aiperf_report.csv") + lines.append(f"mkdir -p {shlex.quote(str(Path(final_report_file).parent))}") + if report_file != final_report_file: + lines.append(f"cp {shlex.quote(report_file)} {shlex.quote(final_report_file)}") + lines.append(f"log {shlex.quote(f'Final AIPerf report saved to {final_report_file}')}") + lines.append("") + + return "\n".join(lines) + + def _prepare_aiperf_script(self) -> str | None: if "aiperf.sh" not in self.td.cmd_args.workloads_list: return None self.test_run.output_path.mkdir(parents=True, exist_ok=True) - entries = self._aiperf_entries() - runtime_entries = [] - setup_cmd = entries[0]["config"].get("setup-cmd") - if setup_cmd: - runtime_entries.append(self._aiperf_setup_entry(setup_cmd)) - - write_phase_logs = len(entries) > 1 - for idx, entry in enumerate(entries): - runtime_entries.append( - self._aiperf_run_entry( - entry, - write_phase_log=write_phase_logs, - is_final=len(entries) > 1 and idx == len(entries) - 1, - ) - ) - (self.test_run.output_path / AIPERF_COMMANDS_FILE_NAME).write_text(json.dumps(runtime_entries, indent=2)) - return f"{self.CONTAINER_MOUNT_OUTPUT}/{AIPERF_COMMANDS_FILE_NAME}" + script_path = self.test_run.output_path / AIPERF_SCRIPT_FILE_NAME + script_path.write_text(self._render_aiperf_script() + "\n") + script_path.chmod(0o755) + return f"{self.CONTAINER_MOUNT_OUTPUT}/{AIPERF_SCRIPT_FILE_NAME}" def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: self._prepare_lmcache_config() - aiperf_commands_file = self._prepare_aiperf_commands() + aiperf_script = self._prepare_aiperf_script() if not td.repo.installed_path: raise ValueError("Dynamo repo is not installed") args = [ @@ -291,8 +283,8 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: args.extend(self._get_nested_toml_args(td.cmd_args.genai_perf, "--genai_perf-")) 
args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-")) - if aiperf_commands_file: - args.append(f"--aiperf-commands-file {aiperf_commands_file}") + if aiperf_script: + args.append(f"--aiperf-script {aiperf_script}") if td.cmd_args.aiperf_accuracy is not None: args.extend(self._get_nested_toml_args(td.cmd_args.aiperf_accuracy, "--aiperf_accuracy-")) diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh new file mode 100644 index 000000000..3fcd013cb --- /dev/null +++ b/tests/ref_data/ai-dynamo-aiperf.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; } + +: "${FRONTEND_URL:?FRONTEND_URL is not set}" + +rm -rf /cloudai_run_results/aiperf_artifacts/round_1 +mkdir -p /cloudai_run_results/aiperf_artifacts/round_1 +log 'Running round_1: aiperf profile --model model --endpoint-type chat --streaming --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --no-server-metrics --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --url "$FRONTEND_URL"' +aiperf profile --model model --endpoint-type chat --streaming --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --no-server-metrics --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --url "$FRONTEND_URL" > /cloudai_run_results/aiperf_round_1.log 2>&1 +mkdir -p /cloudai_run_results +cp /cloudai_run_results/aiperf_artifacts/round_1/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_1_report.csv +log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv' + +rm -rf /cloudai_run_results/aiperf_artifacts/round_2 +mkdir -p /cloudai_run_results/aiperf_artifacts/round_2 +log 'Running round_2: aiperf profile --model model --endpoint-type chat --streaming --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 --no-server-metrics --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 
--output-tokens-mean 500 --url "$FRONTEND_URL"' +aiperf profile --model model --endpoint-type chat --streaming --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 --no-server-metrics --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --url "$FRONTEND_URL" > /cloudai_run_results/aiperf_round_2.log 2>&1 +mkdir -p /cloudai_run_results +cp /cloudai_run_results/aiperf_artifacts/round_2/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_2_report.csv +log 'AIPerf report saved to /cloudai_run_results/aiperf_round_2_report.csv' +mkdir -p /cloudai_run_results +cp /cloudai_run_results/aiperf_round_2_report.csv /cloudai_run_results/aiperf_report.csv +log 'Final AIPerf report saved to /cloudai_run_results/aiperf_report.csv' diff --git a/tests/ref_data/ai-dynamo.sbatch b/tests/ref_data/ai-dynamo.sbatch index 865444b81..0f4a74569 100644 --- a/tests/ref_data/ai-dynamo.sbatch +++ b/tests/ref_data/ai-dynamo.sbatch @@ -32,7 +32,7 @@ srun \ --results-dir /cloudai_run_results \ --dynamo-repo /cloudai_install/dynamo__f7e468c7e8ff0d1426db987564e60572167e8464 \ --hf-home /cloudai_install/huggingface \ - --workloads genai_perf.sh \ + --workloads aiperf.sh \ --failure-marker /cloudai_run_results/failure-marker.txt \ --success-marker /cloudai_run_results/success-marker.txt \ --storage-cache-dir /tmp \ @@ -75,4 +75,10 @@ srun \ --aiperf-name "aiperf" \ --aiperf-cmd "aiperf profile" \ --aiperf-script "/cloudai_install/aiperf.sh" \ - --aiperf-report-name "aiperf_report.csv" + --aiperf-report-name "aiperf_report.csv" \ + --aiperf-artifact-dir-name "aiperf_artifacts" \ + --aiperf-args-concurrency "2" \ + --aiperf-args-request-count "50" \ + --aiperf-args-synthetic-input-tokens-mean "300" \ + --aiperf-args-output-tokens-mean "500" \ + --aiperf-script /cloudai_run_results/aiperf.sh diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index d45416595..78902298e 100644 --- a/tests/test_acceptance.py +++ 
b/tests/test_acceptance.py @@ -34,6 +34,8 @@ AIDynamoArgs, AIDynamoCmdArgs, AIDynamoTestDefinition, + AIPerf, + AIPerfPhase, GenAIPerf, WorkerBaseArgs, WorkerConfig, @@ -493,6 +495,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - ), cmd_args=AIDynamoCmdArgs( docker_image_url="nvcr.io/nvidia/ai-dynamo:24.09", + workloads="aiperf.sh", dynamo=AIDynamoArgs( model="model", backend="vllm", @@ -526,6 +529,20 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - "warmup-request-count": 10, } ), + aiperf=AIPerf.model_validate( + { + "args": { + "concurrency": 2, + "request-count": 50, + "synthetic-input-tokens-mean": 300, + "output-tokens-mean": 500, + } + } + ), + aiperf_phases=[ + AIPerfPhase.model_validate({"name": "round_1", "args": {"concurrency": 1}}), + AIPerfPhase.model_validate({"name": "round_2", "args": {"request-count": 10}}), + ], ), ), ), @@ -745,3 +762,10 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s "__INSTALL_DIR__", str(slurm_system.install_path.absolute()) ) assert curr_launcher == ref_launcher, "nixl-ep-launch.sh does not match reference" + + if test_req[1] == "ai-dynamo.sbatch": + aiperf_script = slurm_system.output_path / "aiperf.sh" + assert aiperf_script.exists(), "aiperf.sh was not generated" + curr_aiperf = aiperf_script.read_text().strip() + ref_aiperf = (Path(__file__).parent / "ref_data" / "ai-dynamo-aiperf.sh").read_text().strip() + assert curr_aiperf == ref_aiperf, "aiperf.sh does not match reference" diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 78ef40f1a..0ece2e800 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import json import shlex from pathlib import Path from typing import cast @@ -26,7 +25,6 @@ from cloudai.core import GitRepo from cloudai.systems.slurm import SlurmSystem from cloudai.workloads.ai_dynamo import ( - AIPERF_COMMANDS_FILE_NAME, LMCACHE_CONFIG_BACKUP_FILE_NAME, LMCACHE_CONFIG_FILE_NAME, AIDynamoArgs, @@ -221,7 +219,7 @@ def test_gen_script_args_contains_custom_aiperf_accuracy_args(strategy: AIDynamo assert f'--aiperf_accuracy-cli "{cli}"' in result -def test_gen_script_args_writes_resolved_aiperf_commands(strategy: AIDynamoSlurmCommandGenStrategy) -> None: +def test_gen_script_args_writes_resolved_aiperf_script(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.workloads = "aiperf.sh" td.cmd_args.aiperf = AIPerf.model_validate( @@ -242,50 +240,19 @@ def test_gen_script_args_writes_resolved_aiperf_commands(strategy: AIDynamoSlurm result = strategy._gen_script_args(td) - assert f"--aiperf-commands-file {strategy.CONTAINER_MOUNT_OUTPUT}/{AIPERF_COMMANDS_FILE_NAME}" in result - entries = json.loads((strategy.test_run.output_path / AIPERF_COMMANDS_FILE_NAME).read_text()) - assert entries[0] == { - "name": "aiperf_setup", - "cmd": ["bash", "-lc", "python -m pip install --upgrade aiperf"], - "cli": [], - } - assert entries[1]["name"] == "round_1" - assert entries[1]["cmd"] == ["aiperf", "profile"] - assert entries[1]["cli"][:9] == [ - "--model", - "model", - "--url", - "{frontend_url}:8000", - "--endpoint-type", - "chat", - "--streaming", - "--artifact-dir", - f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts/round_1", - ] - assert entries[1]["cli"][-8:] == [ - "--concurrency", - "1", - "--request-count", - "50", - "--synthetic-input-tokens-mean", - "300", - "--output-tokens-mean", - "500", - ] - assert entries[1]["log_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1.log" - assert entries[1]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1_report.csv" - 
assert entries[2]["cli"][-8:] == [ - "--concurrency", - "2", - "--request-count", - "10", - "--synthetic-input-tokens-mean", - "300", - "--output-tokens-mean", - "500", - ] - assert entries[2]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_2_report.csv" - assert entries[2]["final_report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_report.csv" + assert f"--aiperf-script {strategy.CONTAINER_MOUNT_OUTPUT}/aiperf.sh" in result + script = (strategy.test_run.output_path / "aiperf.sh").read_text() + assert "bash -lc 'python -m pip install --upgrade aiperf'" in script + assert ': "${FRONTEND_URL:?FRONTEND_URL is not set}"' in script + assert '--url "$FRONTEND_URL"' in script + assert f"--artifact-dir {strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts/round_1" in script + assert f"--artifact-dir {strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts/round_2" in script + assert "--concurrency 1 --request-count 50" in script + assert "--concurrency 2 --request-count 10" in script + assert f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1.log" in script + assert f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1_report.csv" in script + assert f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_2_report.csv" in script + assert f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_report.csv" in script def test_aiperf_phase_roundtrip_does_not_emit_default_report_name(strategy: AIDynamoSlurmCommandGenStrategy) -> None: @@ -304,9 +271,9 @@ def test_aiperf_phase_roundtrip_does_not_emit_default_report_name(strategy: AIDy strategy._gen_script_args(roundtripped) - entries = json.loads((strategy.test_run.output_path / AIPERF_COMMANDS_FILE_NAME).read_text()) - assert entries[0]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1_report.csv" - assert entries[1]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_2_report.csv" + script = (strategy.test_run.output_path / "aiperf.sh").read_text() + assert 
f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1_report.csv" in script + assert f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_2_report.csv" in script def test_single_aiperf_phase_keeps_legacy_artifact_defaults(strategy: AIDynamoSlurmCommandGenStrategy) -> None: @@ -316,10 +283,10 @@ def test_single_aiperf_phase_keeps_legacy_artifact_defaults(strategy: AIDynamoSl strategy._gen_script_args(td) - entries = json.loads((strategy.test_run.output_path / AIPERF_COMMANDS_FILE_NAME).read_text()) - assert entries[0]["output_folder"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts" - assert entries[0]["report_file"] == f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_report.csv" - assert "log_file" not in entries[0] + script = (strategy.test_run.output_path / "aiperf.sh").read_text() + assert f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts" in script + assert f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_report.csv" in script + assert f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_round_1.log" not in script def test_aiperf_phase_names_must_be_unique(cmd_args: AIDynamoCmdArgs) -> None: diff --git a/tests/workloads/ai_dynamo/test_runtime_aiperf.py b/tests/workloads/ai_dynamo/test_runtime_aiperf.py deleted file mode 100644 index b54988102..000000000 --- a/tests/workloads/ai_dynamo/test_runtime_aiperf.py +++ /dev/null @@ -1,96 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import json -import sys -from pathlib import Path - -from cloudai.workloads.ai_dynamo.runtime import aiperf - - -def _write_fake_aiperf(tmp_path: Path) -> Path: - script = tmp_path / "fake_aiperf.py" - script.write_text( - """ -import sys -from pathlib import Path - -artifact_dir = Path(sys.argv[sys.argv.index("--artifact-dir") + 1]) -url = sys.argv[sys.argv.index("--url") + 1] -artifact_dir.mkdir(parents=True, exist_ok=True) -(artifact_dir / "profile_export_aiperf.csv").write_text(f"url\\n{url}\\n", encoding="utf-8") -""".strip(), - encoding="utf-8", - ) - return script - - -def test_runtime_executes_entries_and_copies_final_report(tmp_path: Path) -> None: - fake_aiperf = _write_fake_aiperf(tmp_path) - commands_file = tmp_path / "aiperf_commands.json" - artifact_dir = tmp_path / "aiperf_artifacts" / "round_1" - report_file = tmp_path / "aiperf_round_1_report.csv" - final_report_file = tmp_path / "aiperf_report.csv" - commands_file.write_text( - json.dumps( - [ - { - "name": "round_1", - "cmd": [sys.executable, str(fake_aiperf)], - "cli": [ - "--url", - "{frontend_url}:8000", - "--artifact-dir", - str(artifact_dir), - ], - "output_folder": str(artifact_dir), - "log_file": str(tmp_path / "aiperf_round_1.log"), - "report_source": str(artifact_dir / "profile_export_aiperf.csv"), - "report_file": str(report_file), - "final_report_file": str(final_report_file), - } - ] - ), - encoding="utf-8", - ) - - result = aiperf.main(["--url", "http://frontend", "--commands-file", str(commands_file)]) - - assert result == 0 - assert report_file.read_text(encoding="utf-8") == "url\nhttp://frontend:8000\n" - assert final_report_file.read_text(encoding="utf-8") == "url\nhttp://frontend:8000\n" - assert (tmp_path / "aiperf_round_1.log").is_file() - - -def test_runtime_allows_final_report_to_match_report_file(tmp_path: Path) -> None: - fake_aiperf = _write_fake_aiperf(tmp_path) - commands_file = tmp_path / "aiperf_commands.json" - artifact_dir = 
tmp_path / "aiperf_artifacts" - report_file = tmp_path / "aiperf_report.csv" - commands_file.write_text( - json.dumps( - [ - { - "name": "aiperf", - "cmd": [sys.executable, str(fake_aiperf)], - "cli": [ - "--url", - "{frontend_url}:8000", - "--artifact-dir", - str(artifact_dir), - ], - "output_folder": str(artifact_dir), - "report_source": str(artifact_dir / "profile_export_aiperf.csv"), - "report_file": str(report_file), - "final_report_file": str(report_file), - } - ] - ), - encoding="utf-8", - ) - - result = aiperf.main(["--url", "http://frontend", "--commands-file", str(commands_file)]) - - assert result == 0 - assert report_file.read_text(encoding="utf-8") == "url\nhttp://frontend:8000\n" From dda28ef8d75b20e51d6449a463d502728a5fb255 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 15:10:28 +0200 Subject: [PATCH 06/16] cleaner phases merge --- .../ai_dynamo/slurm_command_gen_strategy.py | 31 +++++-------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index f02104d2d..0de74ccef 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -22,6 +22,7 @@ import yaml from pydantic import BaseModel, TypeAdapter, ValidationError +import cloudai.util from cloudai.core import File, GitRepo from cloudai.systems.slurm import SlurmCommandGenStrategy @@ -137,33 +138,17 @@ def _split_extra_args(self, value: Any) -> list[str]: return [str(item) for item in value] return shlex.split(str(value)) - def _aiperf_phase_has_explicit_value(self, phase: AIPerfPhase, field_name: str, *extra_aliases: str) -> bool: - if field_name in phase.model_fields_set and getattr(phase, field_name) is not None: - return True - - extra = phase.model_extra or {} - return any(extra.get(alias) is not None for alias in extra_aliases) - def 
_resolve_aiperf_phase(self, phase: AIPerfPhase) -> AIPerf: - resolved = self.td.cmd_args.aiperf.model_copy(deep=True) - resolved.name = phase.name + base_data = self.td.cmd_args.aiperf.model_dump(by_alias=True, exclude_none=True) + phase_data = phase.model_dump(by_alias=True, exclude_none=True, exclude_unset=True) single_phase = self.td.cmd_args.aiperf_phases is None or len(self.td.cmd_args.aiperf_phases) == 1 - for field_name in ("cmd", "setup_cmd", "report_name", "artifact_dir_name", "extra_args"): - if self._aiperf_phase_has_explicit_value(phase, field_name, field_name.replace("_", "-")): - setattr(resolved, field_name, getattr(phase, field_name)) + if "artifact-dir-name" not in phase_data and not single_phase: + phase_data["artifact-dir-name"] = f"{self.td.cmd_args.aiperf.artifact_dir_name}/{phase.name}" + if "report-name" not in phase_data and not single_phase: + phase_data["report-name"] = f"aiperf_{phase.name}_report.csv" - if not self._aiperf_phase_has_explicit_value(phase, "artifact_dir_name", "artifact-dir-name"): - base_artifact_dir = resolved.artifact_dir_name - resolved.artifact_dir_name = base_artifact_dir if single_phase else f"{base_artifact_dir}/{phase.name}" - if not self._aiperf_phase_has_explicit_value(phase, "report_name", "report-name"): - base_report_name = resolved.report_name - resolved.report_name = base_report_name if single_phase else f"aiperf_{phase.name}_report.csv" - - resolved.args = resolved.args.model_copy( - update=phase.args.model_dump(by_alias=True, exclude_none=True, exclude_unset=True) - ) - return resolved + return AIPerf.model_validate(cloudai.util.deep_merge(base_data, phase_data)) def _render_aiperf_script(self) -> str: phases = self.td.cmd_args.aiperf_phases or [AIPerfPhase.model_validate({"name": "aiperf"})] From ea08df789de2deac17c5eb787626fa97c726c7b3 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 15:30:50 +0200 Subject: [PATCH 07/16] simplify aiperf handling --- 
.../workloads/ai_dynamo/slurm_command_gen_strategy.py | 4 +++- tests/ref_data/ai-dynamo.sbatch | 8 -------- .../ai_dynamo/test_command_gen_strategy_slurm.py | 7 ++++--- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 0de74ccef..447ca1154 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -267,9 +267,11 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: args.extend(self._get_nested_toml_args(td.cmd_args.dynamo.decode_worker, "--decode-")) args.extend(self._get_nested_toml_args(td.cmd_args.genai_perf, "--genai_perf-")) - args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-")) if aiperf_script: + args.append(f'--aiperf-name "{td.cmd_args.aiperf.name}"') args.append(f"--aiperf-script {aiperf_script}") + else: + args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-")) if td.cmd_args.aiperf_accuracy is not None: args.extend(self._get_nested_toml_args(td.cmd_args.aiperf_accuracy, "--aiperf_accuracy-")) diff --git a/tests/ref_data/ai-dynamo.sbatch b/tests/ref_data/ai-dynamo.sbatch index 0f4a74569..e3384343b 100644 --- a/tests/ref_data/ai-dynamo.sbatch +++ b/tests/ref_data/ai-dynamo.sbatch @@ -73,12 +73,4 @@ srun \ --genai_perf-synthetic-input-tokens-mean "550" \ --genai_perf-warmup-request-count "10" \ --aiperf-name "aiperf" \ - --aiperf-cmd "aiperf profile" \ - --aiperf-script "/cloudai_install/aiperf.sh" \ - --aiperf-report-name "aiperf_report.csv" \ - --aiperf-artifact-dir-name "aiperf_artifacts" \ - --aiperf-args-concurrency "2" \ - --aiperf-args-request-count "50" \ - --aiperf-args-synthetic-input-tokens-mean "300" \ - --aiperf-args-output-tokens-mean "500" \ --aiperf-script /cloudai_run_results/aiperf.sh diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py 
b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 0ece2e800..c1a8b3c64 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -193,9 +193,10 @@ def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoS result = strategy._gen_script_args(td) - assert '--aiperf-args-request-count "50"' in result - assert '--aiperf-args-synthetic-input-tokens-mean "300"' in result - assert '--aiperf-args-output-tokens-mean "500"' in result + script = (strategy.test_run.output_path / "aiperf.sh").read_text() + assert "--request-count 50" in script + assert "--synthetic-input-tokens-mean 300" in script + assert "--output-tokens-mean 500" in script assert f'--aiperf_accuracy-setup-cmd "{setup_cmd}"' in result assert '--aiperf_accuracy-name "aiperf_accuracy"' in result assert '--aiperf_accuracy-entrypoint "aiperf profile"' in result From 10c1a0048cc1e2093dcdcf46dc60b9afa30feec5 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 15:59:15 +0200 Subject: [PATCH 08/16] add remaining fork functionality --- conf/experimental/ai_dynamo/test/vllm.toml | 6 + .../ai_dynamo/test_scenario/vllm_lmcache.toml | 5 +- .../ai_dynamo/test_scenario/vllm_slurm.toml | 30 +++ src/cloudai/workloads/ai_dynamo/__init__.py | 2 + src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 25 +++ src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 38 ++++ .../ai_dynamo/slurm_command_gen_strategy.py | 212 +++++++++++++++--- tests/ref_data/ai-dynamo-aiperf.sh | 57 +++-- .../test_command_gen_strategy_slurm.py | 42 ++++ 9 files changed, 378 insertions(+), 39 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 581ecf3e7..ea2a4552c 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -74,11 +74,16 @@ workloads = "aiperf.sh" concurrency = 2 [cmd_args.aiperf] +
health-check-between-phases = true + continue-on-phase-failure = false [cmd_args.aiperf.args] concurrency = 2 + endpoint-type = "chat" extra-inputs = '{"min_tokens":10}' output-tokens-mean = 500 request-count = 50 + server-metrics = "auto" + streaming = true synthetic-input-tokens-mean = 300 [[cmd_args.aiperf_phases]] @@ -94,6 +99,7 @@ workloads = "aiperf.sh" [cmd_args.aiperf_phases.args] concurrency = 4 request-count = 50 + streaming = false [cmd_args.aiperf_accuracy] entrypoint = "aiperf profile" diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml index f975e784e..c63319b4e 100644 --- a/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml @@ -24,7 +24,10 @@ description = "Self-contained AIDynamo scenario wiring vLLM disaggregated infere test_template_name = "AIDynamo" time_limit = "00:10:00" extra_container_mounts = ["/run/udev:/run/udev"] -dse_excluded_args = ["cmd_args.lmcache.lmcache_worker_ports", "cmd_args.aiperf_phases"] +dse_excluded_args = [ + "cmd_args.lmcache.lmcache_worker_ports", + "cmd_args.aiperf_phases", +] [Tests.cmd_args] docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1" diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml index 45031da3a..7f279ab71 100644 --- a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml @@ -36,12 +36,28 @@ time_limit = "00:10:00" tensor-parallel-size = 4 pipeline-parallel-size = 1 + [[Tests.cmd_args.aiperf_phases]] + name = "round_1" + [Tests.cmd_args.aiperf_phases.args] + concurrency = 2 + request-count = 50 + server-metrics = "auto" + + [[Tests.cmd_args.aiperf_phases]] + name = "round_2" + [Tests.cmd_args.aiperf_phases.args] + concurrency = 4 + request-count = 50 + [[Tests]] id = "test.disagg.multinode" 
test_name = "vLLM" time_limit = "00:10:00" [Tests.cmd_args] + [Tests.cmd_args.dynamo.dcgm_exporter] + enabled = true + [Tests.cmd_args.dynamo.prefill_worker] num-nodes = 2 [Tests.cmd_args.dynamo.prefill_worker.args] @@ -53,3 +69,17 @@ time_limit = "00:10:00" [Tests.cmd_args.dynamo.decode_worker.args] tensor-parallel-size = 4 pipeline-parallel-size = 1 + + [[Tests.cmd_args.aiperf_phases]] + name = "round_1" + [Tests.cmd_args.aiperf_phases.args] + concurrency = 4 + request-count = 50 + server-metrics = "auto" + + [[Tests.cmd_args.aiperf_phases]] + name = "round_2" + [Tests.cmd_args.aiperf_phases.args] + concurrency = 8 + request-count = 50 + server-metrics = "auto" diff --git a/src/cloudai/workloads/ai_dynamo/__init__.py b/src/cloudai/workloads/ai_dynamo/__init__.py index 86938dfec..fc7e2b376 100644 --- a/src/cloudai/workloads/ai_dynamo/__init__.py +++ b/src/cloudai/workloads/ai_dynamo/__init__.py @@ -23,6 +23,7 @@ AIPerf, AIPerfAccuracy, AIPerfPhase, + DCGMExporter, GenAIPerf, LMCacheController, WorkerBaseArgs, @@ -44,6 +45,7 @@ "AIPerf", "AIPerfAccuracy", "AIPerfPhase", + "DCGMExporter", "GenAIPerf", "LMCacheController", "WorkerBaseArgs", diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index ed04d63ba..fcc6f2f27 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -140,6 +140,20 @@ class WorkerConfig(BaseModel): ) +class DCGMExporter(BaseModel): + """Optional DCGM exporter launch configuration.""" + + model_config = ConfigDict(extra="forbid", populate_by_name=True) + + enabled: bool = False + image_url: str = Field( + default="nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless", + serialization_alias="image-url", + validation_alias=AliasChoices("image-url", "image_url"), + ) + port: int = 9401 + + class AIDynamoArgs(BaseModel): """Arguments for AI Dynamo setup.""" @@ -205,6 +219,7 @@ def validate_connector(cls, v: str | list[str] | None) -> str | 
list[str] | None serialization_alias="nats-port", validation_alias=AliasChoices("nats-port", "nats_port"), ) + dcgm_exporter: DCGMExporter = Field(default_factory=DCGMExporter) decode_worker: WorkerConfig = WorkerConfig( cmd="python3 -m dynamo.vllm", @@ -269,6 +284,16 @@ class AIPerf(Workload): serialization_alias="artifact-dir-name", validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"), ) + health_check_between_phases: bool = Field( + default=True, + serialization_alias="health-check-between-phases", + validation_alias=AliasChoices("health-check-between-phases", "health_check_between_phases"), + ) + continue_on_phase_failure: bool = Field( + default=False, + serialization_alias="continue-on-phase-failure", + validation_alias=AliasChoices("continue-on-phase-failure", "continue_on_phase_failure"), + ) @property def installables(self) -> list[Installable]: diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index 25c4126fc..add2cf61b 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -60,6 +60,8 @@ dynamo_args["worker-error-pattern"]="zmq.error.ZMQError:.Address.already.in.use| dynamo_args["sgl-http-port"]=9001 dynamo_args["prefill-port"]=30011 dynamo_args["decode-port"]=30021 +dynamo_args["dcgm-exporter-enabled"]="False" +dynamo_args["dcgm-exporter-port"]=9401 function log() { @@ -892,6 +894,38 @@ _query_frontend() { curl -s -X POST "${dynamo_args["url"]}/v1/chat/completions" -H "Content-Type: application/json" -d @$RESULTS_DIR/curl_cmd.json } +_resolve_aiperf_server_metrics_urls() { + local urls="http://${dynamo_args["frontend-node"]}:${dynamo_args["port"]}/metrics" + local base_system_port=${DYN_SYSTEM_PORT:-9090} + local decode_workers_per_node=${decode_config["workers-per-node"]:-1} + local prefill_workers_per_node=${prefill_config["workers-per-node"]:-1} + local IFS_SAVE="$IFS" + local node i + + IFS=',' + for node in 
${prefill_config["node-list"]:-}; do + for i in $(seq 0 $(( prefill_workers_per_node - 1 ))); do + urls="${urls},http://${node}:$((base_system_port + i))/metrics" + done + done + + for node in ${decode_config["node-list"]:-}; do + for i in $(seq 0 $(( decode_workers_per_node - 1 ))); do + urls="${urls},http://${node}:$((base_system_port + i))/metrics" + done + done + + if [[ "${dynamo_args["dcgm-exporter-enabled"]}" == "True" || "${dynamo_args["dcgm-exporter-enabled"]}" == "true" ]]; then + for node in ${decode_config["node-list"]:-},${prefill_config["node-list"]:-}; do + [[ -z "$node" ]] && continue + urls="${urls},http://${node}:${dynamo_args["dcgm-exporter-port"]}/metrics" + done + fi + IFS="$IFS_SAVE" + + echo "$urls" +} + function setup_cufile() { export CUFILE_ENV_PATH_JSON="$RESULTS_DIR/cufile.json" @@ -1059,6 +1093,10 @@ function launch_workload() local workload_name="${workload_config_ref["--name"]}" local script="${workload_config_ref["--script"]}" export FRONTEND_URL="${dynamo_args["url"]}" + export AIPERF_MODEL="${dynamo_args["model"]}" + export AIPERF_ENDPOINT="${dynamo_args["endpoint"]}" + export AIPERF_FAILURE_MARKER="${FATAL_ERROR_MARKER}" + export AIPERF_SERVER_METRICS_URLS="$(_resolve_aiperf_server_metrics_urls)" # Build config and workload args as proper bash arrays to preserve # multi-word values (e.g. --cmd "genai-perf profile") through word splitting. 
diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 447ca1154..32f962af3 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -118,13 +118,23 @@ def _prepare_lmcache_config(self): (self.test_run.output_path / LMCACHE_CONFIG_FILE_NAME).write_text(config) (self.test_run.output_path / LMCACHE_CONFIG_BACKUP_FILE_NAME).write_text(config) - def _aiperf_args_argv(self, args: dict[str, Any]) -> list[str]: - result = [] + def _render_aiperf_args(self, args: dict[str, Any]) -> str: + parts: list[str] = [] for key, value in args.items(): - result.append(f"--{key}") - if value is not None: - result.append(str(value)) - return result + if value is None or value is False: + continue + + parts.append(f"--{key}") + if value is True: + continue + + values = [",".join(str(item) for item in value)] if isinstance(value, list) else [str(value)] + for rendered_value in values: + if rendered_value in {"$FRONTEND_URL", "$AIPERF_SERVER_METRICS_URLS"}: + parts.append(f'"{rendered_value}"') + else: + parts.append(shlex.quote(rendered_value)) + return " ".join(parts) def _runtime_result_path(self, path: str) -> str: if Path(path).is_absolute(): @@ -138,6 +148,23 @@ def _split_extra_args(self, value: Any) -> list[str]: return [str(item) for item in value] return shlex.split(str(value)) + def _aiperf_phase_args(self, resolved_phase: AIPerf, artifact_dir: str) -> dict[str, Any]: + args: dict[str, Any] = { + "model": self.td.cmd_args.dynamo.model, + "endpoint-type": "chat", + "streaming": True, + "url": "$FRONTEND_URL", + } + args.update(resolved_phase.args.model_dump(by_alias=True, exclude_none=True)) + args["artifact-dir"] = artifact_dir + + if args.get("server-metrics") == "auto": + args["server-metrics"] = "$AIPERF_SERVER_METRICS_URLS" + if "server-metrics" not in args and "no-server-metrics" not in args: + 
args["no-server-metrics"] = True + + return args + def _resolve_aiperf_phase(self, phase: AIPerfPhase) -> AIPerf: base_data = self.td.cmd_args.aiperf.model_dump(by_alias=True, exclude_none=True) phase_data = phase.model_dump(by_alias=True, exclude_none=True, exclude_unset=True) @@ -161,6 +188,9 @@ def _render_aiperf_script(self) -> str: 'log() { echo "[$(date +%F\\ %T) $(hostname)]: $*"; }', "", ': "${FRONTEND_URL:?FRONTEND_URL is not set}"', + f': "${{AIPERF_MODEL:={self.td.cmd_args.dynamo.model}}}"', + f': "${{AIPERF_ENDPOINT:={self.td.cmd_args.dynamo.endpoint}}}"', + f': "${{AIPERF_FAILURE_MARKER:={self.CONTAINER_MOUNT_OUTPUT}/{self.td.failure_marker}}}"', "", ] @@ -180,41 +210,63 @@ def _render_aiperf_script(self) -> str: artifact_dir = self._runtime_result_path(resolved_phase.artifact_dir_name) report_source = f"{artifact_dir}/profile_export_aiperf.csv" report_file = self._runtime_result_path(resolved_phase.report_name) - argv = [ - *shlex.split(resolved_phase.cmd), - "--model", - self.td.cmd_args.dynamo.model, - "--endpoint-type", - "chat", - "--streaming", - "--artifact-dir", - artifact_dir, - "--no-server-metrics", - *self._aiperf_args_argv(resolved_phase.args.model_dump(by_alias=True, exclude_none=True)), - *self._split_extra_args(resolved_phase.extra_args), + cmd_parts = [ + shlex.join(shlex.split(resolved_phase.cmd)), + self._render_aiperf_args(self._aiperf_phase_args(resolved_phase, artifact_dir)), + shlex.join(self._split_extra_args(resolved_phase.extra_args)), ] - cmd = f'{shlex.join(argv)} --url "$FRONTEND_URL"' + cmd = " ".join(part for part in cmd_parts if part) log_message = f"Running {phase.name}: {cmd}" lines.append(f"rm -rf {shlex.quote(artifact_dir)}") lines.append(f"mkdir -p {shlex.quote(artifact_dir)}") lines.append(f"log {shlex.quote(log_message)}") + lines.append("phase_status=0") if write_phase_logs: log_file = self._runtime_result_path(f"aiperf_{phase.name}.log") + lines.append("set +e") lines.append(f"{cmd} > {shlex.quote(log_file)} 
2>&1") + lines.append("phase_status=$?") + lines.append("set -e") else: + lines.append("set +e") lines.append(cmd) + lines.append("phase_status=$?") + lines.append("set -e") - lines.append(f"mkdir -p {shlex.quote(str(Path(report_file).parent))}") + lines.append('if [[ "$phase_status" -ne 0 ]]; then') + lines.append(f" log {shlex.quote(f'AIPerf phase {phase.name} failed')}") + if not resolved_phase.continue_on_phase_failure: + lines.append(' exit "$phase_status"') + lines.append("fi") + lines.append('if [[ "$phase_status" -eq 0 ]]; then') + + lines.append(f" mkdir -p {shlex.quote(str(Path(report_file).parent))}") if report_source != report_file: - lines.append(f"cp {shlex.quote(report_source)} {shlex.quote(report_file)}") - lines.append(f"log {shlex.quote(f'AIPerf report saved to {report_file}')}") + lines.append(f" cp {shlex.quote(report_source)} {shlex.quote(report_file)}") + lines.append(f" log {shlex.quote(f'AIPerf report saved to {report_file}')}") if not single_phase and idx == len(phases) - 1: final_report_file = self._runtime_result_path("aiperf_report.csv") - lines.append(f"mkdir -p {shlex.quote(str(Path(final_report_file).parent))}") + lines.append(f" mkdir -p {shlex.quote(str(Path(final_report_file).parent))}") if report_file != final_report_file: - lines.append(f"cp {shlex.quote(report_file)} {shlex.quote(final_report_file)}") - lines.append(f"log {shlex.quote(f'Final AIPerf report saved to {final_report_file}')}") + lines.append(f" cp {shlex.quote(report_file)} {shlex.quote(final_report_file)}") + lines.append(f" log {shlex.quote(f'Final AIPerf report saved to {final_report_file}')}") + if not single_phase and idx < len(phases) - 1 and resolved_phase.health_check_between_phases: + lines.append(' if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then') + lines.append(" log 'FATAL: failure marker found between AIPerf phases'") + lines.append(" exit 1") + lines.append(" fi") + lines.append( + ' if ! 
curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" ' + "-H 'Content-Type: application/json' " + '-d "{\\"model\\":\\"${AIPERF_MODEL}\\",\\"messages\\":[{\\"role\\":\\"user\\",' + '\\"content\\":\\"ping\\"}],\\"stream\\":false,\\"max_tokens\\":1}" ' + ">/dev/null; then" + ) + lines.append(" log 'FATAL: frontend health probe failed between AIPerf phases'") + lines.append(" exit 1") + lines.append(" fi") + lines.append("fi") lines.append("") return "\n".join(lines) @@ -258,9 +310,14 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: exclude=[ "prefill_worker", "decode_worker", + "dcgm_exporter", + "dcgm-exporter", ], ) ) + if td.cmd_args.dynamo.dcgm_exporter.enabled: + args.append('--dynamo-dcgm-exporter-enabled "True"') + args.append(f'--dynamo-dcgm-exporter-port "{td.cmd_args.dynamo.dcgm_exporter.port}"') if td.cmd_args.dynamo.prefill_worker: args.extend(self._get_nested_toml_args(td.cmd_args.dynamo.prefill_worker, "--prefill-")) @@ -298,6 +355,111 @@ def _gen_srun_command(self) -> str: srun_cmd.extend(self._gen_script_args(self.td)) return " \\\n ".join(srun_cmd) + "\n" + def _gen_dcgm_launcher_block(self) -> list[str]: + if not self.td.cmd_args.dynamo.dcgm_exporter.enabled: + return [] + + num_nodes, node_list = self.get_cached_nodes_spec() + out_dir = self.test_run.output_path.absolute() + port = self.td.cmd_args.dynamo.dcgm_exporter.port + image_url = self.td.cmd_args.dynamo.dcgm_exporter.image_url + wrapper_body = [ + "#!/bin/bash", + "set -e", + "nohup docker run --rm --user root --gpus all --cap-add SYS_ADMIN \\", + f" -e DCGM_EXPORTER_LISTEN=:{port} -p {port}:{port} \\", + ' -v "${RESULTS_DIR}:/cloudai_run_results" \\', + ' "${DCGM_IMAGE}" dcgm-exporter \\', + ' >> "${RESULTS_DIR}/dcgm_exporter_node${SLURM_NODEID:-0}.log" 2>&1 &', + "disown", + "exit 0", + ] + srun_parts = [ + "srun", + "--export=ALL", + f"-N{num_nodes}", + *([] if not node_list else [f"--nodelist={','.join(node_list)}"]), + f"--ntasks={num_nodes}", + 
"--ntasks-per-node=1", + f"--output={out_dir / 'dcgm-node-%n-stdout.txt'}", + f"--error={out_dir / 'dcgm-node-%n-stderr.txt'}", + "bash", + str(out_dir / "run_dcgm.sh"), + ] + + block = [ + "# Start DCGM exporter via Docker on each node.", + f"export RESULTS_DIR={out_dir}", + f"export DCGM_IMAGE={shlex.quote(image_url)}", + "cat > \"$RESULTS_DIR/run_dcgm.sh\" << 'WRAPPER_DCGM_EOF'", + *wrapper_body, + "WRAPPER_DCGM_EOF", + 'chmod +x "$RESULTS_DIR/run_dcgm.sh"', + " ".join(srun_parts), + "sleep 5", + ] + if node_list: + block.extend( + [ + "echo 'DCGM endpoints:' > \"$RESULTS_DIR/dcgm_endpoints.txt\"", + "for n in " + + " ".join(node_list) + + f'; do echo " http://$n:{port}/metrics" >> "$RESULTS_DIR/dcgm_endpoints.txt"; done', + "", + ] + ) + return block + + def _gen_dcgm_cleanup_command(self) -> str | None: + if not self.td.cmd_args.dynamo.dcgm_exporter.enabled: + return None + + num_nodes, node_list = self.get_cached_nodes_spec() + kill_cmd = 'docker ps -q -f ancestor="$DCGM_IMAGE" 2>/dev/null | xargs -r docker kill 2>/dev/null || true' + parts = [ + "srun", + "--export=ALL", + f"-N{num_nodes}", + *([] if not node_list else [f"--nodelist={','.join(node_list)}"]), + f"--ntasks={num_nodes}", + "--ntasks-per-node=1", + "bash", + "-c", + shlex.quote(kill_cmd), + ] + return " ".join(parts) + + def gen_exec_command(self) -> str: + srun_command = self._gen_srun_command() + command_list = [] + indent = "" + + if self.test_run.pre_test: + pre_test_command = self.gen_pre_test(self.test_run.pre_test, self.test_run.output_path) + command_list.extend([pre_test_command, "if [ $PRE_TEST_SUCCESS -eq 1 ]; then"]) + indent = " " + + dcgm_block = self._gen_dcgm_launcher_block() + if dcgm_block: + command_list.extend(f"{indent}{line}" for line in dcgm_block) + + command_list.append(f"{indent}{srun_command}") + + dcgm_cleanup = self._gen_dcgm_cleanup_command() + if dcgm_cleanup: + command_list.append(f"{indent}# Kill DCGM exporter containers when test finishes") + 
command_list.append(f"{indent}{dcgm_cleanup}") + + if self.test_run.post_test: + post_test_command = self.gen_post_test(self.test_run.post_test, self.test_run.output_path) + command_list.append(f"{indent}{post_test_command}") + + if self.test_run.pre_test: + command_list.append("fi") + + full_command = "\n".join(command_list).strip() + return self._write_sbatch_script(full_command) + def _validate_worker_nodes( self, node_list: list[str], worker_nodes: str | None, num_nodes: int, worker_type: str ) -> None: diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh index 3fcd013cb..bd73f2ab7 100644 --- a/tests/ref_data/ai-dynamo-aiperf.sh +++ b/tests/ref_data/ai-dynamo-aiperf.sh @@ -4,22 +4,53 @@ set -Eeuo pipefail log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; } : "${FRONTEND_URL:?FRONTEND_URL is not set}" +: "${AIPERF_MODEL:=model}" +: "${AIPERF_ENDPOINT:=v1/chat/completions}" +: "${AIPERF_FAILURE_MARKER:=/cloudai_run_results/failure-marker.txt}" rm -rf /cloudai_run_results/aiperf_artifacts/round_1 mkdir -p /cloudai_run_results/aiperf_artifacts/round_1 -log 'Running round_1: aiperf profile --model model --endpoint-type chat --streaming --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --no-server-metrics --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --url "$FRONTEND_URL"' -aiperf profile --model model --endpoint-type chat --streaming --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --no-server-metrics --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --url "$FRONTEND_URL" > /cloudai_run_results/aiperf_round_1.log 2>&1 -mkdir -p /cloudai_run_results -cp /cloudai_run_results/aiperf_artifacts/round_1/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_1_report.csv -log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv' +log 'Running round_1: aiperf profile --model model --endpoint-type chat 
--streaming --url "$FRONTEND_URL" --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --no-server-metrics' +phase_status=0 +set +e +aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --no-server-metrics > /cloudai_run_results/aiperf_round_1.log 2>&1 +phase_status=$? +set -e +if [[ "$phase_status" -ne 0 ]]; then + log 'AIPerf phase round_1 failed' + exit "$phase_status" +fi +if [[ "$phase_status" -eq 0 ]]; then + mkdir -p /cloudai_run_results + cp /cloudai_run_results/aiperf_artifacts/round_1/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_1_report.csv + log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv' + if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then + log 'FATAL: failure marker found between AIPerf phases' + exit 1 + fi + if ! 
curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" -H 'Content-Type: application/json' -d "{\"model\":\"${AIPERF_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" >/dev/null; then + log 'FATAL: frontend health probe failed between AIPerf phases' + exit 1 + fi +fi rm -rf /cloudai_run_results/aiperf_artifacts/round_2 mkdir -p /cloudai_run_results/aiperf_artifacts/round_2 -log 'Running round_2: aiperf profile --model model --endpoint-type chat --streaming --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 --no-server-metrics --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --url "$FRONTEND_URL"' -aiperf profile --model model --endpoint-type chat --streaming --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 --no-server-metrics --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --url "$FRONTEND_URL" > /cloudai_run_results/aiperf_round_2.log 2>&1 -mkdir -p /cloudai_run_results -cp /cloudai_run_results/aiperf_artifacts/round_2/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_2_report.csv -log 'AIPerf report saved to /cloudai_run_results/aiperf_round_2_report.csv' -mkdir -p /cloudai_run_results -cp /cloudai_run_results/aiperf_round_2_report.csv /cloudai_run_results/aiperf_report.csv -log 'Final AIPerf report saved to /cloudai_run_results/aiperf_report.csv' +log 'Running round_2: aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 --no-server-metrics' +phase_status=0 +set +e +aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 
--no-server-metrics > /cloudai_run_results/aiperf_round_2.log 2>&1 +phase_status=$? +set -e +if [[ "$phase_status" -ne 0 ]]; then + log 'AIPerf phase round_2 failed' + exit "$phase_status" +fi +if [[ "$phase_status" -eq 0 ]]; then + mkdir -p /cloudai_run_results + cp /cloudai_run_results/aiperf_artifacts/round_2/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_2_report.csv + log 'AIPerf report saved to /cloudai_run_results/aiperf_round_2_report.csv' + mkdir -p /cloudai_run_results + cp /cloudai_run_results/aiperf_round_2_report.csv /cloudai_run_results/aiperf_report.csv + log 'Final AIPerf report saved to /cloudai_run_results/aiperf_report.csv' +fi diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index c1a8b3c64..999fa0b60 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -256,6 +256,48 @@ def test_gen_script_args_writes_resolved_aiperf_script(strategy: AIDynamoSlurmCo assert f"{strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_report.csv" in script +def test_generated_aiperf_script_supports_core_overrides_and_server_metrics_auto( + strategy: AIDynamoSlurmCommandGenStrategy, +) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + td.cmd_args.workloads = "aiperf.sh" + td.cmd_args.aiperf = AIPerf.model_validate( + { + "args": { + "model": "custom-model", + "endpoint-type": "completions", + "streaming": False, + "server-metrics": "auto", + "request-count": 10, + }, + } + ) + + strategy._gen_script_args(td) + + script = (strategy.test_run.output_path / "aiperf.sh").read_text() + assert "--model custom-model" in script + assert "--endpoint-type completions" in script + assert "--streaming" not in script + assert '--server-metrics "$AIPERF_SERVER_METRICS_URLS"' in script + assert "--no-server-metrics" not in script + + +def 
test_dcgm_exporter_generates_launcher_and_runtime_flags(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + td.cmd_args.dynamo.dcgm_exporter.enabled = True + td.cmd_args.dynamo.dcgm_exporter.image_url = "nvcr.io/test/dcgm:latest" + td.cmd_args.dynamo.dcgm_exporter.port = 9501 + + args = strategy._gen_script_args(td) + block = strategy._gen_dcgm_launcher_block() + + assert '--dynamo-dcgm-exporter-enabled "True"' in args + assert '--dynamo-dcgm-exporter-port "9501"' in args + assert any("nvcr.io/test/dcgm:latest" in line for line in block) + assert any("DCGM_EXPORTER_LISTEN=:9501" in line for line in block) + + def test_aiperf_phase_roundtrip_does_not_emit_default_report_name(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.workloads = "aiperf.sh" From 08d8e0a6623e5b1aca97a7be820359e8d1fcefce Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 16:09:13 +0200 Subject: [PATCH 09/16] fix dcgm endpoint url --- src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index add2cf61b..5697a78ea 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -916,7 +916,8 @@ _resolve_aiperf_server_metrics_urls() { done if [[ "${dynamo_args["dcgm-exporter-enabled"]}" == "True" || "${dynamo_args["dcgm-exporter-enabled"]}" == "true" ]]; then - for node in ${decode_config["node-list"]:-},${prefill_config["node-list"]:-}; do + local dcgm_nodes="${decode_config["node-list"]:-},${prefill_config["node-list"]:-}" + for node in $dcgm_nodes; do [[ -z "$node" ]] && continue urls="${urls},http://${node}:${dynamo_args["dcgm-exporter-port"]}/metrics" done From 16e9639d8c680d804d4ab5680ea07f31fa1b88c3 Mon Sep 17 00:00:00 2001 From: Ivan 
Podkidyshev Date: Mon, 1 Jun 2026 16:21:00 +0200 Subject: [PATCH 10/16] switch dcgm to use enroot to run the image --- .../ai_dynamo/test_scenario/vllm_slurm.toml | 1 + src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 22 +++++- .../ai_dynamo/slurm_command_gen_strategy.py | 74 +++++++++---------- .../test_command_gen_strategy_slurm.py | 20 ++++- 4 files changed, 71 insertions(+), 46 deletions(-) diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml index 7f279ab71..decfead3d 100644 --- a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml @@ -57,6 +57,7 @@ time_limit = "00:10:00" [Tests.cmd_args] [Tests.cmd_args.dynamo.dcgm_exporter] enabled = true + docker-image-url = "nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless" [Tests.cmd_args.dynamo.prefill_worker] num-nodes = 2 diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index fcc6f2f27..3f72ed1c7 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -146,10 +146,10 @@ class DCGMExporter(BaseModel): model_config = ConfigDict(extra="forbid", populate_by_name=True) enabled: bool = False - image_url: str = Field( + docker_image_url: str = Field( default="nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless", - serialization_alias="image-url", - validation_alias=AliasChoices("image-url", "image_url"), + serialization_alias="docker-image-url", + validation_alias=AliasChoices("docker-image-url", "docker_image_url", "image-url", "image_url"), ) port: int = 9401 @@ -434,6 +434,7 @@ class AIDynamoTestDefinition(TestDefinition): model_config = ConfigDict(extra="forbid") cmd_args: AIDynamoCmdArgs _docker_image: Optional[DockerImage] = None + _dcgm_exporter_image: Optional[DockerImage] = None script: File = File(Path(__file__).parent.parent / 
"ai_dynamo/ai_dynamo.sh") repo: GitRepo = GitRepo( url="https://github.com/ai-dynamo/dynamo.git", commit="f7e468c7e8ff0d1426db987564e60572167e8464" @@ -467,6 +468,16 @@ def docker_image(self) -> DockerImage: self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) return self._docker_image + @property + def dcgm_exporter_image(self) -> DockerImage | None: + if not self.cmd_args.dynamo.dcgm_exporter.enabled: + return None + + image_url = self.cmd_args.dynamo.dcgm_exporter.docker_image_url + if not self._dcgm_exporter_image or self._dcgm_exporter_image.url != image_url: + self._dcgm_exporter_image = DockerImage(url=image_url) + return self._dcgm_exporter_image + @property def hf_model(self) -> HFModel: if not self._hf_model: @@ -477,13 +488,16 @@ def hf_model(self) -> HFModel: @property def installables(self) -> list[Installable]: """Get all installables for this test definition.""" - return [ + installables = [ self.docker_image, self.repo, self.script, self.hf_model, *self.cmd_args.installables, ] + if self.dcgm_exporter_image: + installables.append(self.dcgm_exporter_image) + return installables def _has_aiperf_accuracy_results(self, output_path: Path) -> bool: accuracy = parse_aiperf_accuracy(output_path) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 32f962af3..50486e6f1 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -23,7 +23,7 @@ from pydantic import BaseModel, TypeAdapter, ValidationError import cloudai.util -from cloudai.core import File, GitRepo +from cloudai.core import File, GitRepo, System, TestRun from cloudai.systems.slurm import SlurmCommandGenStrategy from .ai_dynamo import ( @@ -40,6 +40,10 @@ class AIDynamoSlurmCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for AI Dynamo on Slurm systems.""" + def __init__(self, 
system: System, test_run: TestRun) -> None: + super().__init__(system, test_run) + self._current_image_path: str | None = None + @property def td(self) -> AIDynamoTestDefinition: return cast(AIDynamoTestDefinition, self.test_run.test) @@ -65,10 +69,20 @@ def final_env_vars(self, value: dict[str, str | list[str]]) -> None: self._final_env_vars = value def image_path(self) -> str | None: + if self._current_image_path: + return self._current_image_path if self.td.docker_image and self.td.docker_image.installed_path: return str(self.td.docker_image.installed_path) return None + def _gen_srun_prefix_for_image(self, image_path: str) -> list[str]: + current_image_path = self._current_image_path + self._current_image_path = image_path + try: + return self.gen_srun_prefix(with_num_nodes=False) + finally: + self._current_image_path = current_image_path + def _get_toml_args(self, base_model: BaseModel, prefix: str, exclude: List[str] | None = None) -> List[str]: args = [] exclude = exclude or [] @@ -356,27 +370,17 @@ def _gen_srun_command(self) -> str: return " \\\n ".join(srun_cmd) + "\n" def _gen_dcgm_launcher_block(self) -> list[str]: - if not self.td.cmd_args.dynamo.dcgm_exporter.enabled: + dcgm_image = self.td.dcgm_exporter_image + if not dcgm_image: return [] num_nodes, node_list = self.get_cached_nodes_spec() out_dir = self.test_run.output_path.absolute() port = self.td.cmd_args.dynamo.dcgm_exporter.port - image_url = self.td.cmd_args.dynamo.dcgm_exporter.image_url - wrapper_body = [ - "#!/bin/bash", - "set -e", - "nohup docker run --rm --user root --gpus all --cap-add SYS_ADMIN \\", - f" -e DCGM_EXPORTER_LISTEN=:{port} -p {port}:{port} \\", - ' -v "${RESULTS_DIR}:/cloudai_run_results" \\', - ' "${DCGM_IMAGE}" dcgm-exporter \\', - ' >> "${RESULTS_DIR}/dcgm_exporter_node${SLURM_NODEID:-0}.log" 2>&1 &', - "disown", - "exit 0", - ] + dcgm_cmd = f"DCGM_EXPORTER_LISTEN=:{port} dcgm-exporter" srun_parts = [ - "srun", - "--export=ALL", + 
*self._gen_srun_prefix_for_image(str(dcgm_image.installed_path)), + "--overlap", f"-N{num_nodes}", *([] if not node_list else [f"--nodelist={','.join(node_list)}"]), f"--ntasks={num_nodes}", @@ -384,18 +388,16 @@ def _gen_dcgm_launcher_block(self) -> list[str]: f"--output={out_dir / 'dcgm-node-%n-stdout.txt'}", f"--error={out_dir / 'dcgm-node-%n-stderr.txt'}", "bash", - str(out_dir / "run_dcgm.sh"), + "-lc", + shlex.quote(dcgm_cmd), ] block = [ - "# Start DCGM exporter via Docker on each node.", - f"export RESULTS_DIR={out_dir}", - f"export DCGM_IMAGE={shlex.quote(image_url)}", - "cat > \"$RESULTS_DIR/run_dcgm.sh\" << 'WRAPPER_DCGM_EOF'", - *wrapper_body, - "WRAPPER_DCGM_EOF", - 'chmod +x "$RESULTS_DIR/run_dcgm.sh"', - " ".join(srun_parts), + "# Start DCGM exporter on each node.", + 'echo "Starting DCGM exporter..."', + " ".join(srun_parts) + " &", + "DCGM_EXPORTER_SRUN_PID=$!", + 'echo "DCGM exporter srun PID: ${DCGM_EXPORTER_SRUN_PID}"', "sleep 5", ] if node_list: @@ -414,20 +416,12 @@ def _gen_dcgm_cleanup_command(self) -> str | None: if not self.td.cmd_args.dynamo.dcgm_exporter.enabled: return None - num_nodes, node_list = self.get_cached_nodes_spec() - kill_cmd = 'docker ps -q -f ancestor="$DCGM_IMAGE" 2>/dev/null | xargs -r docker kill 2>/dev/null || true' - parts = [ - "srun", - "--export=ALL", - f"-N{num_nodes}", - *([] if not node_list else [f"--nodelist={','.join(node_list)}"]), - f"--ntasks={num_nodes}", - "--ntasks-per-node=1", - "bash", - "-c", - shlex.quote(kill_cmd), - ] - return " ".join(parts) + return ( + 'if [[ -n "${DCGM_EXPORTER_SRUN_PID:-}" ]]; then ' + 'kill "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true; ' + 'wait "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true; ' + "fi" + ) def gen_exec_command(self) -> str: srun_command = self._gen_srun_command() @@ -447,7 +441,7 @@ def gen_exec_command(self) -> str: dcgm_cleanup = self._gen_dcgm_cleanup_command() if dcgm_cleanup: - command_list.append(f"{indent}# Kill DCGM exporter containers when test 
finishes") + command_list.append(f"{indent}# Stop DCGM exporter when test finishes") command_list.append(f"{indent}{dcgm_cleanup}") if self.test_run.post_test: diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 999fa0b60..a6c1ba7a3 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -286,7 +286,7 @@ def test_generated_aiperf_script_supports_core_overrides_and_server_metrics_auto def test_dcgm_exporter_generates_launcher_and_runtime_flags(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.dynamo.dcgm_exporter.enabled = True - td.cmd_args.dynamo.dcgm_exporter.image_url = "nvcr.io/test/dcgm:latest" + td.cmd_args.dynamo.dcgm_exporter.docker_image_url = "nvcr.io/test/dcgm:latest" td.cmd_args.dynamo.dcgm_exporter.port = 9501 args = strategy._gen_script_args(td) @@ -294,8 +294,24 @@ def test_dcgm_exporter_generates_launcher_and_runtime_flags(strategy: AIDynamoSl assert '--dynamo-dcgm-exporter-enabled "True"' in args assert '--dynamo-dcgm-exporter-port "9501"' in args - assert any("nvcr.io/test/dcgm:latest" in line for line in block) + assert any("--container-image=nvcr.io/test/dcgm:latest" in line for line in block) assert any("DCGM_EXPORTER_LISTEN=:9501" in line for line in block) + assert not any("docker run" in line for line in block) + + +def test_dcgm_exporter_adds_configured_docker_image_installable(cmd_args: AIDynamoCmdArgs) -> None: + cmd_args.dynamo.dcgm_exporter.enabled = True + cmd_args.dynamo.dcgm_exporter.docker_image_url = "nvcr.io/test/dcgm:latest" + tdef = AIDynamoTestDefinition( + name="test", + description="desc", + test_template_name="template", + cmd_args=cmd_args, + ) + + assert tdef.dcgm_exporter_image is not None + assert tdef.dcgm_exporter_image.url == "nvcr.io/test/dcgm:latest" + assert 
tdef.dcgm_exporter_image in tdef.installables def test_aiperf_phase_roundtrip_does_not_emit_default_report_name(strategy: AIDynamoSlurmCommandGenStrategy) -> None: From baa04d95db6e650876a643caa0de42ca7cf62cd0 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 16:27:35 +0200 Subject: [PATCH 11/16] remove state --- .../ai_dynamo/slurm_command_gen_strategy.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 50486e6f1..2d377805b 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -23,7 +23,7 @@ from pydantic import BaseModel, TypeAdapter, ValidationError import cloudai.util -from cloudai.core import File, GitRepo, System, TestRun +from cloudai.core import File, GitRepo from cloudai.systems.slurm import SlurmCommandGenStrategy from .ai_dynamo import ( @@ -40,10 +40,6 @@ class AIDynamoSlurmCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for AI Dynamo on Slurm systems.""" - def __init__(self, system: System, test_run: TestRun) -> None: - super().__init__(system, test_run) - self._current_image_path: str | None = None - @property def td(self) -> AIDynamoTestDefinition: return cast(AIDynamoTestDefinition, self.test_run.test) @@ -69,19 +65,22 @@ def final_env_vars(self, value: dict[str, str | list[str]]) -> None: self._final_env_vars = value def image_path(self) -> str | None: - if self._current_image_path: - return self._current_image_path if self.td.docker_image and self.td.docker_image.installed_path: return str(self.td.docker_image.installed_path) return None - def _gen_srun_prefix_for_image(self, image_path: str) -> list[str]: - current_image_path = self._current_image_path - self._current_image_path = image_path - try: - return 
self.gen_srun_prefix(with_num_nodes=False) - finally: - self._current_image_path = current_image_path + def _gen_dcgm_srun_prefix(self, image_path: str) -> list[str]: + srun_parts = ["srun", "--export=ALL", f"--mpi={self.mpi}", f"--container-image={image_path}"] + mounts = self.container_mounts() + if mounts: + srun_parts.append(f"--container-mounts={','.join(mounts)}") + if not self.system.container_mount_home: + srun_parts.append("--no-container-mount-home") + if self.system.extra_srun_args: + srun_parts.append(self.system.extra_srun_args) + if self.test_run.extra_srun_args: + srun_parts.append(self.test_run.extra_srun_args) + return srun_parts def _get_toml_args(self, base_model: BaseModel, prefix: str, exclude: List[str] | None = None) -> List[str]: args = [] @@ -379,7 +378,7 @@ def _gen_dcgm_launcher_block(self) -> list[str]: port = self.td.cmd_args.dynamo.dcgm_exporter.port dcgm_cmd = f"DCGM_EXPORTER_LISTEN=:{port} dcgm-exporter" srun_parts = [ - *self._gen_srun_prefix_for_image(str(dcgm_image.installed_path)), + *self._gen_dcgm_srun_prefix(str(dcgm_image.installed_path)), "--overlap", f"-N{num_nodes}", *([] if not node_list else [f"--nodelist={','.join(node_list)}"]), From b749cb35b02ce02624d58f0b09a4b34f36510348 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 16:37:48 +0200 Subject: [PATCH 12/16] fail early if dcgm fails --- .../ai_dynamo/slurm_command_gen_strategy.py | 38 ++++++++++++++++--- .../test_command_gen_strategy_slurm.py | 3 ++ 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 2d377805b..c36408ca9 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -398,17 +398,45 @@ def _gen_dcgm_launcher_block(self) -> list[str]: "DCGM_EXPORTER_SRUN_PID=$!", 'echo "DCGM exporter srun PID: 
${DCGM_EXPORTER_SRUN_PID}"', "sleep 5", + 'echo "Checking DCGM exporter metrics endpoints..."', + "DCGM_EXPORTER_STARTUP_TIMEOUT=${DCGM_EXPORTER_STARTUP_TIMEOUT:-60}", ] if node_list: block.extend( [ - "echo 'DCGM endpoints:' > \"$RESULTS_DIR/dcgm_endpoints.txt\"", - "for n in " - + " ".join(node_list) - + f'; do echo " http://$n:{port}/metrics" >> "$RESULTS_DIR/dcgm_endpoints.txt"; done', - "", + "dcgm_nodes=(" + " ".join(shlex.quote(node) for node in node_list) + ")", ] ) + else: + block.append('mapfile -t dcgm_nodes < <(scontrol show hostnames "$SLURM_JOB_NODELIST")') + endpoints_file = shlex.quote(str(out_dir / "dcgm_endpoints.txt")) + block.extend( + [ + f": > {endpoints_file}", + "dcgm_failed=0", + 'for node in "${dcgm_nodes[@]}"; do', + f' dcgm_url="http://${{node}}:{port}/metrics"', + f' echo " ${{dcgm_url}}" >> {endpoints_file}', + " deadline=$((SECONDS + DCGM_EXPORTER_STARTUP_TIMEOUT))", + ' until curl -fsS --max-time 2 "${dcgm_url}" >/dev/null; do', + " if (( SECONDS >= deadline )); then", + ' echo "FATAL: DCGM exporter metrics endpoint is unreachable: ${dcgm_url}" >&2', + " dcgm_failed=1", + " break", + " fi", + " sleep 2", + " done", + " if (( dcgm_failed != 0 )); then break; fi", + ' echo "DCGM exporter reachable: ${dcgm_url}"', + "done", + "if (( dcgm_failed != 0 )); then", + ' kill "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true', + ' wait "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true', + " exit 1", + "fi", + "", + ] + ) return block def _gen_dcgm_cleanup_command(self) -> str | None: diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index a6c1ba7a3..533b059b0 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -296,6 +296,9 @@ def test_dcgm_exporter_generates_launcher_and_runtime_flags(strategy: AIDynamoSl assert '--dynamo-dcgm-exporter-port "9501"' in args assert 
any("--container-image=nvcr.io/test/dcgm:latest" in line for line in block) assert any("DCGM_EXPORTER_LISTEN=:9501" in line for line in block) + assert any("DCGM_EXPORTER_STARTUP_TIMEOUT" in line for line in block) + assert any('curl -fsS --max-time 2 "${dcgm_url}"' in line for line in block) + assert any("FATAL: DCGM exporter metrics endpoint is unreachable" in line for line in block) assert not any("docker run" in line for line in block) From f39797abd376b892e783b5d07046a840cc0eb52b Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 16:49:31 +0200 Subject: [PATCH 13/16] cleanup hardcoded env vars escaping --- .../ai_dynamo/slurm_command_gen_strategy.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index c36408ca9..563fd7054 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -143,10 +143,7 @@ def _render_aiperf_args(self, args: dict[str, Any]) -> str: values = [",".join(str(item) for item in value)] if isinstance(value, list) else [str(value)] for rendered_value in values: - if rendered_value in {"$FRONTEND_URL", "$AIPERF_SERVER_METRICS_URLS"}: - parts.append(f'"{rendered_value}"') - else: - parts.append(shlex.quote(rendered_value)) + parts.append(shlex.quote(rendered_value)) return " ".join(parts) def _runtime_result_path(self, path: str) -> str: @@ -166,18 +163,35 @@ def _aiperf_phase_args(self, resolved_phase: AIPerf, artifact_dir: str) -> dict[ "model": self.td.cmd_args.dynamo.model, "endpoint-type": "chat", "streaming": True, - "url": "$FRONTEND_URL", } args.update(resolved_phase.args.model_dump(by_alias=True, exclude_none=True)) args["artifact-dir"] = artifact_dir - if args.get("server-metrics") == "auto": - args["server-metrics"] = "$AIPERF_SERVER_METRICS_URLS" if "server-metrics" 
not in args and "no-server-metrics" not in args: args["no-server-metrics"] = True return args + def _render_aiperf_phase_args(self, resolved_phase: AIPerf, artifact_dir: str) -> str: + args = self._aiperf_phase_args(resolved_phase, artifact_dir) + url = args.pop("url", None) + server_metrics_auto = args.get("server-metrics") == "auto" + if server_metrics_auto: + args.pop("server-metrics") + + parts = [] + for key in ("model", "endpoint-type", "streaming"): + if key in args: + parts.append(self._render_aiperf_args({key: args.pop(key)})) + if url is None: + parts.append('--url "$FRONTEND_URL"') + else: + parts.append(self._render_aiperf_args({"url": url})) + parts.append(self._render_aiperf_args(args)) + if server_metrics_auto: + parts.append('--server-metrics "$AIPERF_SERVER_METRICS_URLS"') + return " ".join(part for part in parts if part) + def _resolve_aiperf_phase(self, phase: AIPerfPhase) -> AIPerf: base_data = self.td.cmd_args.aiperf.model_dump(by_alias=True, exclude_none=True) phase_data = phase.model_dump(by_alias=True, exclude_none=True, exclude_unset=True) @@ -225,7 +239,7 @@ def _render_aiperf_script(self) -> str: report_file = self._runtime_result_path(resolved_phase.report_name) cmd_parts = [ shlex.join(shlex.split(resolved_phase.cmd)), - self._render_aiperf_args(self._aiperf_phase_args(resolved_phase, artifact_dir)), + self._render_aiperf_phase_args(resolved_phase, artifact_dir), shlex.join(self._split_extra_args(resolved_phase.extra_args)), ] cmd = " ".join(part for part in cmd_parts if part) From 9042f60ac6a46e6d17c8589aa19d2d546b5b1eb0 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 18:42:22 +0200 Subject: [PATCH 14/16] replace kill with scancel --- src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 8 ++- .../ai_dynamo/slurm_command_gen_strategy.py | 49 ++++++++++------- tests/ref_data/ai-dynamo-aiperf.sh | 8 +-- tests/ref_data/ai-dynamo.sbatch | 52 +++++++++++++++++++ tests/test_acceptance.py | 6 ++- 
.../test_command_gen_strategy_slurm.py | 20 +++++++ 6 files changed, 118 insertions(+), 25 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 3f72ed1c7..b85b35d9a 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -299,6 +299,12 @@ class AIPerf(Workload): def installables(self) -> list[Installable]: return [self.script] + @model_validator(mode="after") + def validate_extra_args(self) -> "AIPerf": + if isinstance(self.extra_args, list): + raise ValueError("AIPerf extra_args must be a string with explicit CLI syntax") + return self + class AIPerfPhase(BaseModel): """Named AIPerf phase that overrides the base AIPerf configuration.""" @@ -323,7 +329,7 @@ class AIPerfPhase(BaseModel): validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"), ) args: Args = Field(default_factory=Args) - extra_args: str | list[str] | None = Field( + extra_args: str | None = Field( default=None, serialization_alias="extra-args", validation_alias=AliasChoices("extra-args", "extra_args"), diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 563fd7054..29eb9fd3c 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -136,14 +136,17 @@ def _render_aiperf_args(self, args: dict[str, Any]) -> str: for key, value in args.items(): if value is None or value is False: continue + if isinstance(value, list | dict): + raise ValueError( + f"AIPerf argument {key!r} must be a scalar value. " + "Use a string with AIPerf CLI syntax for multi-value arguments." 
+ ) parts.append(f"--{key}") if value is True: continue - values = [",".join(str(item) for item in value)] if isinstance(value, list) else [str(value)] - for rendered_value in values: - parts.append(shlex.quote(rendered_value)) + parts.append(shlex.quote(str(value))) return " ".join(parts) def _runtime_result_path(self, path: str) -> str: @@ -151,13 +154,6 @@ def _runtime_result_path(self, path: str) -> str: return path return f"{self.CONTAINER_MOUNT_OUTPUT}/{path}" - def _split_extra_args(self, value: Any) -> list[str]: - if value is None: - return [] - if isinstance(value, list): - return [str(item) for item in value] - return shlex.split(str(value)) - def _aiperf_phase_args(self, resolved_phase: AIPerf, artifact_dir: str) -> dict[str, Any]: args: dict[str, Any] = { "model": self.td.cmd_args.dynamo.model, @@ -237,10 +233,12 @@ def _render_aiperf_script(self) -> str: artifact_dir = self._runtime_result_path(resolved_phase.artifact_dir_name) report_source = f"{artifact_dir}/profile_export_aiperf.csv" report_file = self._runtime_result_path(resolved_phase.report_name) + if isinstance(resolved_phase.extra_args, list): + raise ValueError("AIPerf extra_args must be a string with explicit CLI syntax") cmd_parts = [ shlex.join(shlex.split(resolved_phase.cmd)), self._render_aiperf_phase_args(resolved_phase, artifact_dir), - shlex.join(self._split_extra_args(resolved_phase.extra_args)), + resolved_phase.extra_args or "", ] cmd = " ".join(part for part in cmd_parts if part) log_message = f"Running {phase.name}: {cmd}" @@ -391,9 +389,11 @@ def _gen_dcgm_launcher_block(self) -> list[str]: out_dir = self.test_run.output_path.absolute() port = self.td.cmd_args.dynamo.dcgm_exporter.port dcgm_cmd = f"DCGM_EXPORTER_LISTEN=:{port} dcgm-exporter" + dcgm_step_name = "cloudai-dcgm-exporter" srun_parts = [ *self._gen_dcgm_srun_prefix(str(dcgm_image.installed_path)), "--overlap", + f"--job-name={dcgm_step_name}", f"-N{num_nodes}", *([] if not node_list else 
[f"--nodelist={','.join(node_list)}"]), f"--ntasks={num_nodes}", @@ -411,6 +411,23 @@ def _gen_dcgm_launcher_block(self) -> list[str]: " ".join(srun_parts) + " &", "DCGM_EXPORTER_SRUN_PID=$!", 'echo "DCGM exporter srun PID: ${DCGM_EXPORTER_SRUN_PID}"', + "DCGM_EXPORTER_STEP_ID=", + "for _ in {1..10}; do", + ' DCGM_EXPORTER_STEP_ID=$(squeue --noheader --steps --job "$SLURM_JOB_ID" ' + f'--format="%i %j" | awk \'$2 == "{dcgm_step_name}" {{ print $1; exit }}\')', + ' if [[ -n "${DCGM_EXPORTER_STEP_ID}" ]]; then break; fi', + " sleep 1", + "done", + 'echo "DCGM exporter step ID: ${DCGM_EXPORTER_STEP_ID:-unknown}"', + "function stop_dcgm_exporter()", + "{", + ' if [[ -n "${DCGM_EXPORTER_STEP_ID:-}" ]]; then', + ' scancel --signal=TERM "${DCGM_EXPORTER_STEP_ID}" 2>/dev/null || true', + " fi", + ' if [[ -n "${DCGM_EXPORTER_SRUN_PID:-}" ]]; then', + ' wait "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true', + " fi", + "}", "sleep 5", 'echo "Checking DCGM exporter metrics endpoints..."', "DCGM_EXPORTER_STARTUP_TIMEOUT=${DCGM_EXPORTER_STARTUP_TIMEOUT:-60}", @@ -444,8 +461,7 @@ def _gen_dcgm_launcher_block(self) -> list[str]: ' echo "DCGM exporter reachable: ${dcgm_url}"', "done", "if (( dcgm_failed != 0 )); then", - ' kill "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true', - ' wait "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true', + " stop_dcgm_exporter", " exit 1", "fi", "", @@ -457,12 +473,7 @@ def _gen_dcgm_cleanup_command(self) -> str | None: if not self.td.cmd_args.dynamo.dcgm_exporter.enabled: return None - return ( - 'if [[ -n "${DCGM_EXPORTER_SRUN_PID:-}" ]]; then ' - 'kill "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true; ' - 'wait "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true; ' - "fi" - ) + return "stop_dcgm_exporter" def gen_exec_command(self) -> str: srun_command = self._gen_srun_command() diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh index bd73f2ab7..60798ef8b 100644 --- a/tests/ref_data/ai-dynamo-aiperf.sh +++ 
b/tests/ref_data/ai-dynamo-aiperf.sh @@ -10,10 +10,10 @@ log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; } rm -rf /cloudai_run_results/aiperf_artifacts/round_1 mkdir -p /cloudai_run_results/aiperf_artifacts/round_1 -log 'Running round_1: aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --no-server-metrics' +log 'Running round_1: aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --server-metrics "$AIPERF_SERVER_METRICS_URLS" --server-metrics-formats json csv' phase_status=0 set +e -aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --no-server-metrics > /cloudai_run_results/aiperf_round_1.log 2>&1 +aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 1 --request-count 50 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_1 --server-metrics "$AIPERF_SERVER_METRICS_URLS" --server-metrics-formats json csv > /cloudai_run_results/aiperf_round_1.log 2>&1 phase_status=$? 
set -e if [[ "$phase_status" -ne 0 ]]; then @@ -36,10 +36,10 @@ fi rm -rf /cloudai_run_results/aiperf_artifacts/round_2 mkdir -p /cloudai_run_results/aiperf_artifacts/round_2 -log 'Running round_2: aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 --no-server-metrics' +log 'Running round_2: aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 --server-metrics "$AIPERF_SERVER_METRICS_URLS" --server-metrics-formats json csv' phase_status=0 set +e -aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 --no-server-metrics > /cloudai_run_results/aiperf_round_2.log 2>&1 +aiperf profile --model model --endpoint-type chat --streaming --url "$FRONTEND_URL" --concurrency 2 --request-count 10 --synthetic-input-tokens-mean 300 --output-tokens-mean 500 --artifact-dir /cloudai_run_results/aiperf_artifacts/round_2 --server-metrics "$AIPERF_SERVER_METRICS_URLS" --server-metrics-formats json csv > /cloudai_run_results/aiperf_round_2.log 2>&1 phase_status=$? 
set -e if [[ "$phase_status" -ne 0 ]]; then diff --git a/tests/ref_data/ai-dynamo.sbatch b/tests/ref_data/ai-dynamo.sbatch index e3384343b..c00906d40 100644 --- a/tests/ref_data/ai-dynamo.sbatch +++ b/tests/ref_data/ai-dynamo.sbatch @@ -14,6 +14,53 @@ srun --export=ALL --mpi=pmix -N2 --container-image=nvcr.io/nvidia/ai-dynamo:24.0 srun --export=ALL --mpi=pmix -N2 --container-image=nvcr.io/nvidia/ai-dynamo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/cloudai_install/huggingface,/tmp:/tmp --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +# Start DCGM exporter on each node. +echo "Starting DCGM exporter..." +srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/cloudai_install/huggingface,/tmp:/tmp --overlap --job-name=cloudai-dcgm-exporter -N2 --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/dcgm-node-%n-stdout.txt --error=__OUTPUT_DIR__/output/dcgm-node-%n-stderr.txt bash -lc 'DCGM_EXPORTER_LISTEN=:9501 dcgm-exporter' & +DCGM_EXPORTER_SRUN_PID=$! 
+echo "DCGM exporter srun PID: ${DCGM_EXPORTER_SRUN_PID}" +DCGM_EXPORTER_STEP_ID= +for _ in {1..10}; do + DCGM_EXPORTER_STEP_ID=$(squeue --noheader --steps --job "$SLURM_JOB_ID" --format="%i %j" | awk '$2 == "cloudai-dcgm-exporter" { print $1; exit }') + if [[ -n "${DCGM_EXPORTER_STEP_ID}" ]]; then break; fi + sleep 1 +done +echo "DCGM exporter step ID: ${DCGM_EXPORTER_STEP_ID:-unknown}" +function stop_dcgm_exporter() +{ + if [[ -n "${DCGM_EXPORTER_STEP_ID:-}" ]]; then + scancel --signal=TERM "${DCGM_EXPORTER_STEP_ID}" 2>/dev/null || true + fi + if [[ -n "${DCGM_EXPORTER_SRUN_PID:-}" ]]; then + wait "${DCGM_EXPORTER_SRUN_PID}" 2>/dev/null || true + fi +} +sleep 5 +echo "Checking DCGM exporter metrics endpoints..." +DCGM_EXPORTER_STARTUP_TIMEOUT=${DCGM_EXPORTER_STARTUP_TIMEOUT:-60} +mapfile -t dcgm_nodes < <(scontrol show hostnames "$SLURM_JOB_NODELIST") +: > __OUTPUT_DIR__/output/dcgm_endpoints.txt +dcgm_failed=0 +for node in "${dcgm_nodes[@]}"; do + dcgm_url="http://${node}:9501/metrics" + echo " ${dcgm_url}" >> __OUTPUT_DIR__/output/dcgm_endpoints.txt + deadline=$((SECONDS + DCGM_EXPORTER_STARTUP_TIMEOUT)) + until curl -fsS --max-time 2 "${dcgm_url}" >/dev/null; do + if (( SECONDS >= deadline )); then + echo "FATAL: DCGM exporter metrics endpoint is unreachable: ${dcgm_url}" >&2 + dcgm_failed=1 + break + fi + sleep 2 + done + if (( dcgm_failed != 0 )); then break; fi + echo "DCGM exporter reachable: ${dcgm_url}" +done +if (( dcgm_failed != 0 )); then + stop_dcgm_exporter + exit 1 +fi + srun \ --export=ALL \ --mpi=pmix \ @@ -47,6 +94,8 @@ srun \ --dynamo-etcd-port "2379" \ --dynamo-nats-cmd "nats-server -js" \ --dynamo-nats-port "4222" \ + --dynamo-dcgm-exporter-enabled "True" \ + --dynamo-dcgm-exporter-port "9501" \ --prefill-cmd "python3 -m dynamo.vllm --is-prefill-worker" \ --prefill-worker-initialized-regex "VllmWorker.*has.been.initialized" \ --prefill-multiple-workers-per-node "False" \ @@ -74,3 +123,6 @@ srun \ --genai_perf-warmup-request-count "10" \ 
--aiperf-name "aiperf" \ --aiperf-script /cloudai_run_results/aiperf.sh + +# Stop DCGM exporter when test finishes +stop_dcgm_exporter diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 78902298e..151c6fb9e 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -36,6 +36,7 @@ AIDynamoTestDefinition, AIPerf, AIPerfPhase, + DCGMExporter, GenAIPerf, WorkerBaseArgs, WorkerConfig, @@ -501,6 +502,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - backend="vllm", endpoint="v1/chat/completions", workspace_path="/workspace", + dcgm_exporter=DCGMExporter(enabled=True, port=9501), prefill_worker=WorkerConfig( cmd="python3 -m dynamo.vllm --is-prefill-worker", worker_initialized_regex="VllmWorker.*has.been.initialized", @@ -531,12 +533,14 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - ), aiperf=AIPerf.model_validate( { + "extra-args": "--server-metrics-formats json csv", "args": { "concurrency": 2, "request-count": 50, "synthetic-input-tokens-mean": 300, "output-tokens-mean": 500, - } + "server-metrics": "auto", + }, } ), aiperf_phases=[ diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 533b059b0..d55741311 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -283,6 +283,23 @@ def test_generated_aiperf_script_supports_core_overrides_and_server_metrics_auto assert "--no-server-metrics" not in script +def test_generated_aiperf_script_rejects_list_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + td.cmd_args.workloads = "aiperf.sh" + td.cmd_args.aiperf = AIPerf.model_validate({"args": {"server-metrics-formats": ["json", "csv"]}}) + + with pytest.raises(ValueError, match="AIPerf argument 'server-metrics-formats' must be a 
scalar value"): + strategy._gen_script_args(td) + + +def test_aiperf_extra_args_must_be_string() -> None: + with pytest.raises(ValueError): + AIPerf.model_validate({"extra-args": ["--server-metrics-formats", "json"]}) + + with pytest.raises(ValueError): + AIPerfPhase.model_validate({"name": "round_1", "extra-args": ["--server-metrics-formats", "json"]}) + + def test_dcgm_exporter_generates_launcher_and_runtime_flags(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.dynamo.dcgm_exporter.enabled = True @@ -299,7 +316,10 @@ def test_dcgm_exporter_generates_launcher_and_runtime_flags(strategy: AIDynamoSl assert any("DCGM_EXPORTER_STARTUP_TIMEOUT" in line for line in block) assert any('curl -fsS --max-time 2 "${dcgm_url}"' in line for line in block) assert any("FATAL: DCGM exporter metrics endpoint is unreachable" in line for line in block) + assert any('scancel --signal=TERM "${DCGM_EXPORTER_STEP_ID}"' in line for line in block) + assert strategy._gen_dcgm_cleanup_command() == "stop_dcgm_exporter" assert not any("docker run" in line for line in block) + assert not any('kill "${DCGM_EXPORTER_SRUN_PID}"' in line for line in block) def test_dcgm_exporter_adds_configured_docker_image_installable(cmd_args: AIDynamoCmdArgs) -> None: From a1f225d0092d1ff0422285d497e4058e018ad965 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 18:59:50 +0200 Subject: [PATCH 15/16] reformat aiperf script cmd generation --- .../ai_dynamo/slurm_command_gen_strategy.py | 129 ++++++++++-------- 1 file changed, 75 insertions(+), 54 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 29eb9fd3c..5a9d3320b 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -16,6 +16,7 @@ import logging import shlex +import 
textwrap from pathlib import Path from typing import Any, List, cast @@ -204,27 +205,31 @@ def _render_aiperf_script(self) -> str: phases = self.td.cmd_args.aiperf_phases or [AIPerfPhase.model_validate({"name": "aiperf"})] single_phase = len(phases) == 1 setup_cmd = self._resolve_aiperf_phase(phases[0]).setup_cmd - lines = [ - "#!/usr/bin/env bash", - "set -Eeuo pipefail", - "", - 'log() { echo "[$(date +%F\\ %T) $(hostname)]: $*"; }', - "", - ': "${FRONTEND_URL:?FRONTEND_URL is not set}"', - f': "${{AIPERF_MODEL:={self.td.cmd_args.dynamo.model}}}"', - f': "${{AIPERF_ENDPOINT:={self.td.cmd_args.dynamo.endpoint}}}"', - f': "${{AIPERF_FAILURE_MARKER:={self.CONTAINER_MOUNT_OUTPUT}/{self.td.failure_marker}}}"', - "", + blocks = [ + textwrap.dedent( + f"""\ + #!/usr/bin/env bash + set -Eeuo pipefail + + log() {{ echo "[$(date +%F\\ %T) $(hostname)]: $*"; }} + + : "${{FRONTEND_URL:?FRONTEND_URL is not set}}" + : "${{AIPERF_MODEL:={self.td.cmd_args.dynamo.model}}}" + : "${{AIPERF_ENDPOINT:={self.td.cmd_args.dynamo.endpoint}}}" + : "${{AIPERF_FAILURE_MARKER:={self.CONTAINER_MOUNT_OUTPUT}/{self.td.failure_marker}}}" + """ + ).rstrip() ] if setup_cmd: setup_argv = ["bash", "-lc", setup_cmd] - lines.extend( - [ - f"log {shlex.quote(f'Running aiperf setup: {shlex.join(setup_argv)}')}", - shlex.join(setup_argv), - "", - ] + blocks.append( + textwrap.dedent( + f"""\ + log {shlex.quote(f"Running aiperf setup: {shlex.join(setup_argv)}")} + {shlex.join(setup_argv)} + """ + ).rstrip() ) write_phase_logs = not single_phase @@ -241,60 +246,76 @@ def _render_aiperf_script(self) -> str: resolved_phase.extra_args or "", ] cmd = " ".join(part for part in cmd_parts if part) - log_message = f"Running {phase.name}: {cmd}" - lines.append(f"rm -rf {shlex.quote(artifact_dir)}") - lines.append(f"mkdir -p {shlex.quote(artifact_dir)}") - lines.append(f"log {shlex.quote(log_message)}") - lines.append("phase_status=0") if write_phase_logs: log_file = 
self._runtime_result_path(f"aiperf_{phase.name}.log") - lines.append("set +e") - lines.append(f"{cmd} > {shlex.quote(log_file)} 2>&1") - lines.append("phase_status=$?") - lines.append("set -e") + run_cmd = f"{cmd} > {shlex.quote(log_file)} 2>&1" else: - lines.append("set +e") - lines.append(cmd) - lines.append("phase_status=$?") - lines.append("set -e") - - lines.append('if [[ "$phase_status" -ne 0 ]]; then') - lines.append(f" log {shlex.quote(f'AIPerf phase {phase.name} failed')}") + run_cmd = cmd + log_message = f"Running {phase.name}: {cmd}" + phase_lines = [ + textwrap.dedent( + f"""\ + rm -rf {shlex.quote(artifact_dir)} + mkdir -p {shlex.quote(artifact_dir)} + log {shlex.quote(log_message)} + phase_status=0 + set +e + {run_cmd} + phase_status=$? + set -e + if [[ "$phase_status" -ne 0 ]]; then + log {shlex.quote(f"AIPerf phase {phase.name} failed")} + """ + ).rstrip() + ] if not resolved_phase.continue_on_phase_failure: - lines.append(' exit "$phase_status"') - lines.append("fi") - lines.append('if [[ "$phase_status" -eq 0 ]]; then') - - lines.append(f" mkdir -p {shlex.quote(str(Path(report_file).parent))}") + phase_lines.append(' exit "$phase_status"') + phase_lines.extend( + [ + "fi", + textwrap.dedent( + f"""\ + if [[ "$phase_status" -eq 0 ]]; then + mkdir -p {shlex.quote(str(Path(report_file).parent))} + """ + ).rstrip(), + ] + ) if report_source != report_file: - lines.append(f" cp {shlex.quote(report_source)} {shlex.quote(report_file)}") - lines.append(f" log {shlex.quote(f'AIPerf report saved to {report_file}')}") + phase_lines.append(f" cp {shlex.quote(report_source)} {shlex.quote(report_file)}") + phase_lines.append(f" log {shlex.quote(f'AIPerf report saved to {report_file}')}") if not single_phase and idx == len(phases) - 1: final_report_file = self._runtime_result_path("aiperf_report.csv") - lines.append(f" mkdir -p {shlex.quote(str(Path(final_report_file).parent))}") + phase_lines.append(f" mkdir -p 
{shlex.quote(str(Path(final_report_file).parent))}") if report_file != final_report_file: - lines.append(f" cp {shlex.quote(report_file)} {shlex.quote(final_report_file)}") - lines.append(f" log {shlex.quote(f'Final AIPerf report saved to {final_report_file}')}") + phase_lines.append(f" cp {shlex.quote(report_file)} {shlex.quote(final_report_file)}") + phase_lines.append(f" log {shlex.quote(f'Final AIPerf report saved to {final_report_file}')}") + if not single_phase and idx < len(phases) - 1 and resolved_phase.health_check_between_phases: - lines.append(' if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then') - lines.append(" log 'FATAL: failure marker found between AIPerf phases'") - lines.append(" exit 1") - lines.append(" fi") - lines.append( + health_probe_cmd = ( ' if ! curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" ' "-H 'Content-Type: application/json' " '-d "{\\"model\\":\\"${AIPERF_MODEL}\\",\\"messages\\":[{\\"role\\":\\"user\\",' '\\"content\\":\\"ping\\"}],\\"stream\\":false,\\"max_tokens\\":1}" ' ">/dev/null; then" ) - lines.append(" log 'FATAL: frontend health probe failed between AIPerf phases'") - lines.append(" exit 1") - lines.append(" fi") - lines.append("fi") - lines.append("") + phase_lines.extend( + [ + ' if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then', + " log 'FATAL: failure marker found between AIPerf phases'", + " exit 1", + " fi", + health_probe_cmd, + " log 'FATAL: frontend health probe failed between AIPerf phases'", + " exit 1", + " fi", + ] + ) + phase_lines.append("fi") + blocks.append("\n".join(phase_lines)) - return "\n".join(lines) + return "\n\n".join(blocks) def _prepare_aiperf_script(self) -> str | None: if "aiperf.sh" not in self.td.cmd_args.workloads_list: From 2113080e426b37c04ddbbcff97ef81e2c7b55eca Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 20:10:28 +0200 Subject: [PATCH 16/16] honor per-phase AIPerf setup commands --- .../ai_dynamo/slurm_command_gen_strategy.py | 32 +++++++++++-------- 
.../test_command_gen_strategy_slurm.py | 12 ++++++- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 5a9d3320b..cc3b51273 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -201,10 +201,23 @@ def _resolve_aiperf_phase(self, phase: AIPerfPhase) -> AIPerf: return AIPerf.model_validate(cloudai.util.deep_merge(base_data, phase_data)) + def _render_aiperf_setup_blocks(self, log_message: str, setup_cmd: str | None) -> list[str]: + if not setup_cmd: + return [] + + setup_argv = ["bash", "-lc", setup_cmd] + return [ + textwrap.dedent( + f"""\ + log {shlex.quote(f"{log_message}: {shlex.join(setup_argv)}")} + {shlex.join(setup_argv)} + """ + ).rstrip() + ] + def _render_aiperf_script(self) -> str: phases = self.td.cmd_args.aiperf_phases or [AIPerfPhase.model_validate({"name": "aiperf"})] single_phase = len(phases) == 1 - setup_cmd = self._resolve_aiperf_phase(phases[0]).setup_cmd blocks = [ textwrap.dedent( f"""\ @@ -221,16 +234,7 @@ def _render_aiperf_script(self) -> str: ).rstrip() ] - if setup_cmd: - setup_argv = ["bash", "-lc", setup_cmd] - blocks.append( - textwrap.dedent( - f"""\ - log {shlex.quote(f"Running aiperf setup: {shlex.join(setup_argv)}")} - {shlex.join(setup_argv)} - """ - ).rstrip() - ) + blocks.extend(self._render_aiperf_setup_blocks("Running aiperf setup", self.td.cmd_args.aiperf.setup_cmd)) write_phase_logs = not single_phase for idx, phase in enumerate(phases): @@ -252,7 +256,9 @@ def _render_aiperf_script(self) -> str: else: run_cmd = cmd log_message = f"Running {phase.name}: {cmd}" - phase_lines = [ + phase_setup = phase.setup_cmd if "setup_cmd" in phase.model_fields_set else None + phase_lines = self._render_aiperf_setup_blocks(f"Running AIPerf phase setup for {phase.name}", phase_setup) + phase_lines.append( 
textwrap.dedent( f"""\ rm -rf {shlex.quote(artifact_dir)} @@ -267,7 +273,7 @@ def _render_aiperf_script(self) -> str: log {shlex.quote(f"AIPerf phase {phase.name} failed")} """ ).rstrip() - ] + ) if not resolved_phase.continue_on_phase_failure: phase_lines.append(' exit "$phase_status"') phase_lines.extend( diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 2c353dc91..46a10906b 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -236,14 +236,24 @@ def test_gen_script_args_writes_resolved_aiperf_script(strategy: AIDynamoSlurmCo ) td.cmd_args.aiperf_phases = [ AIPerfPhase.model_validate({"name": "round_1", "args": {"concurrency": 1}}), - AIPerfPhase.model_validate({"name": "round_2", "args": {"request-count": 10}}), + AIPerfPhase.model_validate( + { + "name": "round_2", + "setup-cmd": "python -m pip install --upgrade another-aiperf-plugin", + "args": {"request-count": 10}, + } + ), ] result = strategy._gen_script_args(td) assert f"--aiperf-script {strategy.CONTAINER_MOUNT_OUTPUT}/aiperf.sh" in result script = (strategy.test_run.output_path / "aiperf.sh").read_text() + assert script.count("Running aiperf setup:") == 1 assert "bash -lc 'python -m pip install --upgrade aiperf'" in script + assert "Running AIPerf phase setup for round_1" not in script + assert "Running AIPerf phase setup for round_2" in script + assert "bash -lc 'python -m pip install --upgrade another-aiperf-plugin'" in script assert ': "${FRONTEND_URL:?FRONTEND_URL is not set}"' in script assert '--url "$FRONTEND_URL"' in script assert f"--artifact-dir {strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts/round_1" in script