From 27291976d331249e0cd164e9aba3479c0e0b7b20 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 26 May 2026 19:04:13 -0700 Subject: [PATCH 01/16] implement semantic degradation for aidynamo using aiperf --- conf/experimental/ai_dynamo/test/sglang.toml | 8 +- conf/experimental/ai_dynamo/test/vllm.toml | 8 +- doc/workloads/ai_dynamo.rst | 33 +++++- src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 101 ++++++++++++++++++ src/cloudai/workloads/ai_dynamo/aiperf.sh | 16 ++- .../ai_dynamo/report_generation_strategy.py | 9 ++ .../test_command_gen_strategy_slurm.py | 22 ++++ .../ai_dynamo/test_report_gen_strategy.py | 74 +++++++++++++ 8 files changed, 260 insertions(+), 11 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 62f8f615d..28ec71d60 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -20,8 +20,8 @@ test_template_name = "AIDynamo" extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"] [cmd_args] -docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0" -workloads = "genai_perf.sh" +docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.1.1" +workloads = "aiperf.sh" [cmd_args.dynamo] backend = "sglang" @@ -96,7 +96,11 @@ workloads = "genai_perf.sh" [cmd_args.aiperf] [cmd_args.aiperf.args] + accuracy-benchmark = "mmlu" + accuracy-n-shots = 5 + accuracy-tasks = "abstract_algebra" concurrency = 2 + extra-inputs = '{"temperature":0,"stop":["\n"]}' request-count = 50 synthetic-input-tokens-mean = 300 output-tokens-mean = 500 diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 193510728..91f12b08b 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -20,8 +20,8 @@ test_template_name = "AIDynamo" extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"] [cmd_args] -docker_image_url = 
"nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1" -workloads = "genai_perf.sh" +docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1" +workloads = "aiperf.sh" [cmd_args.dynamo] backend = "vllm" @@ -88,7 +88,11 @@ workloads = "genai_perf.sh" [cmd_args.aiperf] [cmd_args.aiperf.args] + accuracy-benchmark = "mmlu" + accuracy-n-shots = 5 + accuracy-tasks = "abstract_algebra" concurrency = 2 + extra-inputs = '{"temperature":0,"stop":["\n"]}' request-count = 50 synthetic-input-tokens-mean = 300 output-tokens-mean = 500 diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 023d92bf2..b266b0d77 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -87,14 +87,14 @@ The frontend node will initially wait to allow weight loading on all nodes. Once Choosing a Benchmark Tool ~~~~~~~~~~~~~~~~~~~~~~~~~ -The benchmark tool is controlled by the ``workloads`` field in the test TOML. The default is ``aiperf.sh``: +The benchmark tool is controlled by the ``workloads`` field in the test TOML. Set ``aiperf.sh`` to use AIPerf: .. code-block:: toml [cmd_args] - workloads = "aiperf.sh" # default — uses aiperf, writes aiperf_report.csv + workloads = "aiperf.sh" # uses aiperf, writes aiperf_report.csv -To use genai-perf instead, set: +To use genai-perf, set: .. code-block:: toml @@ -110,17 +110,40 @@ To use genai-perf instead, set: output-tokens-mean = 500 request-count = 50 +Semantic Degradation With AIPerf Accuracy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it by adding AIPerf accuracy flags under +``[cmd_args.aiperf.args]`` and running the ``aiperf.sh`` workload: + +.. 
code-block:: toml + + [cmd_args] + workloads = "aiperf.sh" + + [cmd_args.aiperf.args] + accuracy-benchmark = "mmlu" + accuracy-n-shots = 5 + accuracy-tasks = "abstract_algebra" + extra-inputs = '{"temperature":0,"stop":["\n"]}' + +When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the +``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. + Review Benchmark Results ------------------------ After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field: -- ``aiperf.sh`` (default) → ``aiperf_report.csv`` +- ``aiperf.sh`` → ``aiperf_report.csv`` - ``genai_perf.sh`` → ``genai_perf_report.csv`` +If AIPerf accuracy mode is enabled, CloudAI also copies ``aiperf_artifacts/accuracy_results.csv`` to +``accuracy_results.csv`` in the run output directory. + Navigate to ``./results///0/`` and open the CSV to examine performance metrics. -Example ``aiperf_report.csv`` (default): +Example ``aiperf_report.csv``: :: diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 01912f0c1..7016fc42c 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import csv import logging from pathlib import Path from typing import Literal, Optional, cast @@ -40,6 +41,89 @@ from cloudai.models.workload import CmdArgs, TestDefinition from cloudai.systems.slurm import SlurmSystem +AIPERF_ARTIFACTS_DIR = "aiperf_artifacts" +AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv" + + +def _parse_accuracy_value(value: str | int | float | None) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + accuracy = float(value) + return accuracy / 100 if accuracy > 1 else accuracy + + raw_value = value.strip() + if not raw_value: + return None + + is_percentage = raw_value.endswith("%") + if is_percentage: + raw_value = raw_value[:-1].strip() + + try: + accuracy = float(raw_value) + except ValueError: + return None + + return accuracy / 100 if is_percentage or accuracy > 1 else accuracy + + +def _parse_count_value(value: str | int | float | None) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + try: + return float(value.strip()) + except ValueError: + return None + + +def parse_aiperf_accuracy(output_path: Path) -> float | None: + """ + Parse AIPerf accuracy from accuracy_results.csv. + + Expected CSV format: + Task,Correct,Total,Accuracy + abstract_algebra,35,100,35.00% + OVERALL,8368,14042,59.59% + + AIPerf writes this file under aiperf_artifacts; CloudAI's wrapper also copies + it to the run output directory when present. The returned value is normalized + to a 0.0-1.0 fraction. 
+ """ + candidates = [ + output_path / AIPERF_ACCURACY_RESULTS_CSV, + output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV, + ] + + for csv_file in candidates: + if not csv_file.exists() or csv_file.stat().st_size == 0: + continue + + fallback_accuracy: float | None = None + with csv_file.open(newline="", encoding="utf-8") as f: + for row in csv.DictReader(f): + accuracy = _parse_accuracy_value(row.get("Accuracy") or row.get("accuracy") or row.get("Value")) + if accuracy is None: + correct = _parse_count_value(row.get("Correct") or row.get("correct")) + total = _parse_count_value(row.get("Total") or row.get("total")) + if correct is not None and total: + accuracy = correct / total + if accuracy is None: + continue + + task = (row.get("Task") or row.get("task") or row.get("Metric") or "").strip().upper() + if task == "OVERALL": + return accuracy + if fallback_accuracy is None: + fallback_accuracy = accuracy + + if fallback_accuracy is not None: + return fallback_accuracy + + return None + class Args(BaseModel): """Arguments for custom workloads.""" @@ -300,6 +384,17 @@ class AIPerf(Workload): def installables(self) -> list[Installable]: return [self.script] + @property + def has_accuracy_benchmark(self) -> bool: + args_extra = getattr(self.args, "model_extra", {}) or {} + if args_extra.get("accuracy-benchmark") or args_extra.get("accuracy_benchmark"): + return True + + extra_args = self.extra_args or "" + if isinstance(extra_args, list): + return "--accuracy-benchmark" in extra_args + return "--accuracy-benchmark" in extra_args + class Constraints(BaseModel): """Constraints for validation of AI Dynamo configurations when using DSE.""" @@ -435,6 +530,12 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: else: logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}") + if workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark: + accuracy = parse_aiperf_accuracy(output_path) + 
if accuracy is None: + logging.info(f"AIPerf accuracy results not found in {output_path}.") + result = False + return JobStatusResult(result) def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh index 9f5a78b33..f0826686a 100644 --- a/src/cloudai/workloads/ai_dynamo/aiperf.sh +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -94,8 +94,15 @@ process_args() { process_results() { local artifact_dir="$result_dir/aiperf_artifacts" - local csv_path - csv_path=$(find "$artifact_dir" -name "*.csv" -print -quit 2>/dev/null || true) + local csv_path="" + local accuracy_path="$artifact_dir/accuracy_results.csv" + + if [[ -f "$artifact_dir/profile_export_aiperf.csv" ]]; then + csv_path="$artifact_dir/profile_export_aiperf.csv" + else + csv_path=$(find "$artifact_dir" -name "*aiperf*.csv" -print -quit 2>/dev/null || true) + fi + if [[ -n "$csv_path" ]]; then cp "$csv_path" "$result_dir/$report_name" log "aiperf report saved to $result_dir/$report_name" @@ -103,6 +110,11 @@ process_results() { log "ERROR: no CSV found in $artifact_dir — aiperf may not have completed" exit 1 fi + + if [[ -f "$accuracy_path" ]]; then + cp "$accuracy_path" "$result_dir/accuracy_results.csv" + log "aiperf accuracy report saved to $result_dir/accuracy_results.csv" + fi } main() { diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py index a8e4e91b8..a665b0d6a 100644 --- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py @@ -21,6 +21,7 @@ from cloudai.core import METRIC_ERROR, MetricValue, ReportGenerationStrategy from cloudai.util.lazy_imports import lazy +from cloudai.workloads.ai_dynamo.ai_dynamo import AIDynamoTestDefinition, parse_aiperf_accuracy class AIDynamoReportGenerationStrategy(ReportGenerationStrategy): @@ 
-44,6 +45,14 @@ def extract_metric_from_csv(self, csv_file: Path, metric_name: str, metric_type: def get_metric(self, metric: str) -> MetricValue: logging.info(f"Getting metric: {metric}") + + if metric.lower() == "accuracy": + tdef = self.test_run.test + if not isinstance(tdef, AIDynamoTestDefinition) or not tdef.cmd_args.aiperf.has_accuracy_benchmark: + return METRIC_ERROR + accuracy = parse_aiperf_accuracy(self.test_run.output_path) + return accuracy if accuracy is not None else METRIC_ERROR + metric_name = metric metric_type = "avg" diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index a0a028caa..034576f89 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -15,6 +15,7 @@ # limitations under the License. from pathlib import Path +from typing import cast import pytest @@ -26,6 +27,7 @@ AIDynamoCmdArgs, AIDynamoSlurmCommandGenStrategy, AIDynamoTestDefinition, + AIPerf, GenAIPerf, LMCache, LMCacheArgs, @@ -148,3 +150,23 @@ def test_dynamo_cmd( ) -> None: result = strategy.gen_dynamo_cmd(module, Path(config)) assert result.strip() == expected + + +def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + td.cmd_args.workloads = "aiperf.sh" + td.cmd_args.aiperf = AIPerf.model_validate( + { + "args": { + "accuracy-benchmark": "mmlu", + "accuracy-n-shots": 5, + "accuracy-tasks": "abstract_algebra", + } + } + ) + + result = strategy._gen_script_args(td) + + assert '--aiperf-args-accuracy-benchmark "mmlu"' in result + assert '--aiperf-args-accuracy-n-shots "5"' in result + assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py index 
0e51c414f..0ba90024d 100644 --- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py +++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py @@ -32,6 +32,7 @@ WorkerBaseArgs, WorkerConfig, ) +from cloudai.workloads.ai_dynamo.ai_dynamo import parse_aiperf_accuracy from cloudai.workloads.ai_dynamo.report_generation_strategy import AIDynamoReportGenerationStrategy @@ -62,6 +63,10 @@ def get_aiperf_csv_content() -> str: ) +def get_aiperf_accuracy_csv_content() -> str: + return "Task,Correct,Total,Accuracy\nabstract_algebra,35,100,35.00%\nOVERALL,35,100,35.00%\n" + + @pytest.fixture def ai_dynamo_tr(tmp_path: Path) -> TestRun: test = AIDynamoTestDefinition( @@ -70,6 +75,7 @@ def ai_dynamo_tr(tmp_path: Path) -> TestRun: test_template_name="t", cmd_args=AIDynamoCmdArgs( docker_image_url="http://url", + workloads="genai_perf.sh", dynamo=AIDynamoArgs( prefill_worker=WorkerConfig( cmd="python3 -m dynamo.vllm --is-prefill-worker", @@ -119,6 +125,33 @@ def ai_dynamo_aiperf_tr(tmp_path: Path) -> TestRun: return tr +@pytest.fixture +def ai_dynamo_aiperf_accuracy_tr(tmp_path: Path) -> TestRun: + test = AIDynamoTestDefinition( + name="ai_dynamo_aiperf_accuracy", + description="desc", + test_template_name="t", + cmd_args=AIDynamoCmdArgs( + docker_image_url="http://url", + workloads="aiperf.sh", + dynamo=AIDynamoArgs( + prefill_worker=WorkerConfig( + cmd="python3 -m dynamo.vllm --is-prefill-worker", + worker_initialized_regex="VllmWorker.*has.been.initialized", + args=WorkerBaseArgs(), + ), + ), + aiperf=AIPerf.model_validate({"args": {"accuracy-benchmark": "mmlu"}}), + lmcache=LMCache(args=LMCacheArgs()), + ), + ) + tr = TestRun(name="ai_dynamo_aiperf_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path) + (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content()) + (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content()) + (tr.output_path / test.success_marker).touch() + return tr + + @pytest.fixture def 
csv_content() -> str: return get_csv_content() @@ -161,6 +194,20 @@ def test_ai_dynamo_get_metric_aiperf(slurm_system: SlurmSystem, ai_dynamo_aiperf assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47 +def test_ai_dynamo_get_metric_aiperf_accuracy(slurm_system: SlurmSystem, ai_dynamo_aiperf_accuracy_tr: TestRun) -> None: + strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_accuracy_tr) + + assert strategy.get_metric("accuracy") == 0.35 + + +def test_ai_dynamo_accuracy_metric_requires_aiperf_accuracy_config( + slurm_system: SlurmSystem, ai_dynamo_aiperf_tr: TestRun +) -> None: + strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_tr) + + assert strategy.get_metric("accuracy") == METRIC_ERROR + + def test_ai_dynamo_get_metric_invalid(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) @@ -176,9 +223,36 @@ def test_was_run_successful(ai_dynamo_tr: TestRun) -> None: assert result.is_successful is True +def test_was_run_successful_with_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None: + test_def = ai_dynamo_aiperf_accuracy_tr.test + result = test_def.was_run_successful(ai_dynamo_aiperf_accuracy_tr) + assert result.is_successful is True + + +def test_was_run_successful_requires_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None: + test_def = ai_dynamo_aiperf_accuracy_tr.test + (ai_dynamo_aiperf_accuracy_tr.output_path / "accuracy_results.csv").unlink() + result = test_def.was_run_successful(ai_dynamo_aiperf_accuracy_tr) + assert result.is_successful is False + + def test_was_run_successful_no_results(ai_dynamo_tr: TestRun, tmp_path: Path) -> None: test_def = ai_dynamo_tr.test ai_dynamo_tr.output_path = tmp_path / "empty_output" ai_dynamo_tr.output_path.mkdir(parents=True, exist_ok=True) result = test_def.was_run_successful(ai_dynamo_tr) assert result.is_successful is False + + +def 
test_parse_aiperf_accuracy_from_artifact_dir(tmp_path: Path) -> None: + artifact_dir = tmp_path / "aiperf_artifacts" + artifact_dir.mkdir() + (artifact_dir / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content(), encoding="utf-8") + + assert parse_aiperf_accuracy(tmp_path) == 0.35 + + +def test_parse_aiperf_accuracy_missing_or_invalid(tmp_path: Path) -> None: + (tmp_path / "accuracy_results.csv").write_text("Task,Correct,Total,Accuracy\nOVERALL,n/a,100,n/a\n") + + assert parse_aiperf_accuracy(tmp_path) is None From 72c8ef2a70adfe4c0fbeabe27571745a69fc13e3 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 26 May 2026 20:34:31 -0700 Subject: [PATCH 02/16] update conf fix nixl connector --- conf/experimental/ai_dynamo/test/vllm.toml | 2 ++ .../ai_dynamo/test_command_gen_strategy_slurm.py | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 91f12b08b..8c044dba6 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -38,6 +38,7 @@ workloads = "aiperf.sh" tensor-parallel-size = 8 pipeline-parallel-size = 1 data-parallel-size = 1 + kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' [cmd_args.dynamo.decode_worker] num-nodes = 1 @@ -50,6 +51,7 @@ workloads = "aiperf.sh" tensor-parallel-size = 8 pipeline-parallel-size = 1 data-parallel-size = 1 + kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' [cmd_args.lmcache] controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001" diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 034576f89..aab622578 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -170,3 +170,15 @@ def 
test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo assert '--aiperf-args-accuracy-benchmark "mmlu"' in result assert '--aiperf-args-accuracy-n-shots "5"' in result assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result + + +def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + td.cmd_args.dynamo.prefill_worker.args = WorkerBaseArgs.model_validate({"kv-transfer-config": config}) + td.cmd_args.dynamo.decode_worker.args = WorkerBaseArgs.model_validate({"kv-transfer-config": config}) + + result = strategy._gen_script_args(td) + + assert f"--prefill-args-kv-transfer-config '{config}'" in result + assert f"--decode-args-kv-transfer-config '{config}'" in result From b023c34c3e0ca7ef15a711a5f312117240c1261a Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 26 May 2026 20:56:11 -0700 Subject: [PATCH 03/16] accuracy fixes --- conf/experimental/ai_dynamo/test/sglang.toml | 6 ++--- conf/experimental/ai_dynamo/test/vllm.toml | 6 ++--- doc/workloads/ai_dynamo.rst | 11 +++++--- src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 16 +++++++----- src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 16 +++++++++--- src/cloudai/workloads/ai_dynamo/aiperf.sh | 25 ++++++++++++++++--- .../test_command_gen_strategy_slurm.py | 4 +++ .../ai_dynamo/test_report_gen_strategy.py | 1 - 8 files changed, 58 insertions(+), 27 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 28ec71d60..adf9552af 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -99,11 +99,9 @@ workloads = "aiperf.sh" accuracy-benchmark = "mmlu" accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" - concurrency = 2 + concurrency = 10 extra-inputs = '{"temperature":0,"stop":["\n"]}' - 
request-count = 50 - synthetic-input-tokens-mean = 300 - output-tokens-mean = 500 + num-requests = 100 [extra_env_vars] UCX_LOG_LEVEL = "warn" diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 8c044dba6..76b8c3c6e 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -93,11 +93,9 @@ workloads = "aiperf.sh" accuracy-benchmark = "mmlu" accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" - concurrency = 2 + concurrency = 10 extra-inputs = '{"temperature":0,"stop":["\n"]}' - request-count = 50 - synthetic-input-tokens-mean = 300 - output-tokens-mean = 500 + num-requests = 100 [extra_env_vars] UCX_LOG_LEVEL = "warn" diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index b266b0d77..1e077b7c1 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -125,21 +125,24 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it accuracy-benchmark = "mmlu" accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" + concurrency = 10 extra-inputs = '{"temperature":0,"stop":["\n"]}' + num-requests = 100 When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the -``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. +``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt and +token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark. Review Benchmark Results ------------------------ After job completion, CloudAI places output logs and result files in the designated results directory. 
The result file name depends on the configured ``workloads`` field: -- ``aiperf.sh`` → ``aiperf_report.csv`` +- ``aiperf.sh`` → ``aiperf_report.csv`` for performance mode, ``accuracy_results.csv`` for accuracy mode - ``genai_perf.sh`` → ``genai_perf_report.csv`` -If AIPerf accuracy mode is enabled, CloudAI also copies ``aiperf_artifacts/accuracy_results.csv`` to -``accuracy_results.csv`` in the run output directory. +If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_artifacts/accuracy_results.csv`` to ``accuracy_results.csv`` +in the run output directory and marks the run failed if that file is not produced. Navigate to ``./results///0/`` and open the CSV to examine performance metrics. diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 7016fc42c..2d0e5e8fd 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -518,6 +518,16 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: logging.info(f"Workload {workload} not found in workload map") result = False continue + + if workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark: + accuracy = parse_aiperf_accuracy(output_path) + if accuracy is None: + logging.info(f"AIPerf accuracy results not found in {output_path}.") + result = False + else: + logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}") + continue + report_name = workload_map[workload].report_name if report_name is None: logging.warning(f"Workload {workload} has no report_name configured") @@ -530,12 +540,6 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: else: logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}") - if workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark: - accuracy = parse_aiperf_accuracy(output_path) - if accuracy is None: - logging.info(f"AIPerf 
accuracy results not found in {output_path}.") - result = False - return JobStatusResult(result) def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index 46f5daa42..8cb84f7d9 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -100,6 +100,10 @@ _resolve_host_ip() { echo "$ip" } +_current_node_ip() { + _resolve_host_ip "$(_current_node_name)" +} + _apply_sglang_dsr1_section_args() { local self="$(_current_node_name)" local gpn="$(_gpus_per_node)" @@ -733,6 +737,8 @@ function launch_decode() local base_kvbm_pub_port=${DYN_KVBM_LEADER_ZMQ_PUB_PORT:-56001} local base_kvbm_ack_port=${DYN_KVBM_LEADER_ZMQ_ACK_PORT:-56002} local kvbm_port_stride=2 + local side_channel_host + side_channel_host="$(_current_node_ip)" log "Launching $workers_per_node decode worker(s) with unique port ranges" for i in $(seq 0 $(( $workers_per_node - 1 ))); do @@ -754,10 +760,10 @@ function launch_decode() args_arr+=($key "${decode_args[$key]}") done - log "Launching decode worker $i on GPUs $gpu_list (NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)" + log "Launching decode worker $i on GPUs $gpu_list (NIXL host: $side_channel_host, NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)" log "Decode cmd: ${decode_config["cmd"]} ${args_arr[*]} ${decode_config["extra-args"]}" CUDA_VISIBLE_DEVICES=$gpu_list \ - VLLM_NIXL_SIDE_CHANNEL_HOST=$(hostname -I | awk '{print $1}') \ + VLLM_NIXL_SIDE_CHANNEL_HOST="$side_channel_host" \ VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port \ DYN_VLLM_KV_EVENT_PORT=$kv_event_port \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=$kvbm_pub_port \ @@ -788,6 +794,8 @@ function launch_prefill() local base_kvbm_pub_port=${DYN_KVBM_LEADER_ZMQ_PUB_PORT:-56001} local 
base_kvbm_ack_port=${DYN_KVBM_LEADER_ZMQ_ACK_PORT:-56002} local kvbm_port_stride=2 + local side_channel_host + side_channel_host="$(_current_node_ip)" log "Launching $workers_per_node prefill worker(s) with unique port ranges" for i in $(seq 0 $(( $workers_per_node - 1 ))); do @@ -809,10 +817,10 @@ function launch_prefill() args_arr+=($key "${prefill_args[$key]}") done - log "Launching prefill worker $i on GPUs $gpu_list (NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)" + log "Launching prefill worker $i on GPUs $gpu_list (NIXL host: $side_channel_host, NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)" log "Prefill cmd: ${prefill_config["cmd"]} ${args_arr[*]} ${prefill_config["extra-args"]}" CUDA_VISIBLE_DEVICES=$gpu_list \ - VLLM_NIXL_SIDE_CHANNEL_HOST=$(hostname -I | awk '{print $1}') \ + VLLM_NIXL_SIDE_CHANNEL_HOST="$side_channel_host" \ VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port \ DYN_VLLM_KV_EVENT_PORT=$kv_event_port \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=$kvbm_pub_port \ diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh index f0826686a..62298dc97 100644 --- a/src/cloudai/workloads/ai_dynamo/aiperf.sh +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -65,6 +65,16 @@ _parse_aiperf_args() { fi } +has_accuracy_benchmark() { + local arg + for arg in "${aiperf_profile_args[@]}" "${extra_args[@]}"; do + if [[ "$arg" == "--accuracy-benchmark" ]]; then + return 0 + fi + done + return 1 +} + process_args() { while [[ $# -gt 0 ]]; do case "$1" in @@ -97,6 +107,17 @@ process_results() { local csv_path="" local accuracy_path="$artifact_dir/accuracy_results.csv" + if has_accuracy_benchmark; then + if [[ ! 
-s "$accuracy_path" ]]; then + log "ERROR: AIPerf accuracy benchmark was requested but $accuracy_path was not produced" + exit 1 + fi + + cp "$accuracy_path" "$result_dir/accuracy_results.csv" + log "aiperf accuracy report saved to $result_dir/accuracy_results.csv" + return 0 + fi + if [[ -f "$artifact_dir/profile_export_aiperf.csv" ]]; then csv_path="$artifact_dir/profile_export_aiperf.csv" else @@ -111,10 +132,6 @@ process_results() { exit 1 fi - if [[ -f "$accuracy_path" ]]; then - cp "$accuracy_path" "$result_dir/accuracy_results.csv" - log "aiperf accuracy report saved to $result_dir/accuracy_results.csv" - fi } main() { diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index aab622578..146694734 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -161,6 +161,8 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo "accuracy-benchmark": "mmlu", "accuracy-n-shots": 5, "accuracy-tasks": "abstract_algebra", + "concurrency": 10, + "num-requests": 100, } } ) @@ -170,6 +172,8 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo assert '--aiperf-args-accuracy-benchmark "mmlu"' in result assert '--aiperf-args-accuracy-n-shots "5"' in result assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result + assert '--aiperf-args-concurrency "10"' in result + assert '--aiperf-args-num-requests "100"' in result def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py index 0ba90024d..1c9cbb013 100644 --- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py +++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py @@ -146,7 +146,6 @@ def 
ai_dynamo_aiperf_accuracy_tr(tmp_path: Path) -> TestRun: ), ) tr = TestRun(name="ai_dynamo_aiperf_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path) - (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content()) (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content()) (tr.output_path / test.success_marker).touch() return tr From 0e20d7493370d947da8edf4d863aee7e33154f3d Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 26 May 2026 21:43:39 -0700 Subject: [PATCH 04/16] add aiperf setup for accuracy test --- conf/experimental/ai_dynamo/test/sglang.toml | 1 + conf/experimental/ai_dynamo/test/vllm.toml | 1 + doc/workloads/ai_dynamo.rst | 20 ++++++++++++------- src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 5 +++++ src/cloudai/workloads/ai_dynamo/aiperf.sh | 16 +++++++++++++++ .../test_command_gen_strategy_slurm.py | 5 ++++- 6 files changed, 40 insertions(+), 8 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index adf9552af..be393b829 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -94,6 +94,7 @@ workloads = "aiperf.sh" concurrency = 2 [cmd_args.aiperf] + setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'" [cmd_args.aiperf.args] accuracy-benchmark = "mmlu" diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 76b8c3c6e..a072b5a5e 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -88,6 +88,7 @@ workloads = "aiperf.sh" concurrency = 2 [cmd_args.aiperf] + setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'" [cmd_args.aiperf.args] accuracy-benchmark = "mmlu" diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 1e077b7c1..d9912fb06 100644 --- 
a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -121,18 +121,24 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it [cmd_args] workloads = "aiperf.sh" - [cmd_args.aiperf.args] - accuracy-benchmark = "mmlu" - accuracy-n-shots = 5 - accuracy-tasks = "abstract_algebra" - concurrency = 10 - extra-inputs = '{"temperature":0,"stop":["\n"]}' - num-requests = 100 + [cmd_args.aiperf] + setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'" + + [cmd_args.aiperf.args] + accuracy-benchmark = "mmlu" + accuracy-n-shots = 5 + accuracy-tasks = "abstract_algebra" + concurrency = 10 + extra-inputs = '{"temperature":0,"stop":["\n"]}' + num-requests = 100 When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark. +The ``setup-cmd`` field is optional. It is useful for Dynamo images that include ``aiperf`` without its accuracy extra; +CloudAI runs it immediately before launching ``aiperf profile``. 
+ Review Benchmark Results ------------------------ diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 2d0e5e8fd..a28d84f4b 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -374,6 +374,11 @@ class AIPerf(Workload): name: str = "aiperf" cmd: str = "aiperf profile" script: File = File(Path(__file__).parent.parent / "ai_dynamo/aiperf.sh") + setup_cmd: str | None = Field( + default=None, + serialization_alias="setup-cmd", + validation_alias=AliasChoices("setup-cmd", "setup_cmd"), + ) report_name: str = Field( default="aiperf_report.csv", serialization_alias="report-name", diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh index 62298dc97..ac3131b50 100644 --- a/src/cloudai/workloads/ai_dynamo/aiperf.sh +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -29,6 +29,7 @@ # --port HTTP port the dynamo.frontend is listening on. # --report-name Output CSV name (default: aiperf_report.csv). # --cmd Full launch command including subcommand (default: "aiperf profile"). +# --setup-cmd Optional shell command run before launching aiperf. # --extra-args Raw string appended verbatim after all other flags. # # All unrecognised flags (--install-dir, --gpus-per-node, etc.) 
are silently @@ -44,6 +45,7 @@ url="http://localhost" port=8000 report_name="aiperf_report.csv" cmd="aiperf profile" +setup_cmd="" declare -a extra_args=() declare -a aiperf_profile_args=() @@ -84,6 +86,7 @@ process_args() { --port) port="$2"; shift 2 ;; --report-name) report_name="$2"; shift 2 ;; --cmd) cmd="$2"; shift 2 ;; + --setup-cmd) setup_cmd="$2"; shift 2 ;; --extra-args) read -ra extra_args <<< "$2"; shift 2 ;; --) shift; _parse_aiperf_args "$@"; break ;; --*) if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;; # consume unknown flag; shift 2 only if next arg is a value @@ -98,10 +101,21 @@ process_args() { port: $port report_name: $report_name cmd: $cmd + setup_cmd: ${setup_cmd:-} extra_args: ${extra_args[*]:-} profile_args: ${aiperf_profile_args[*]:-}" } +run_setup_cmd() { + if [[ -z "$setup_cmd" ]]; then + return + fi + + log "Running AIPerf setup command: $setup_cmd" + bash -lc "$setup_cmd" + log "AIPerf setup command complete" +} + process_results() { local artifact_dir="$result_dir/aiperf_artifacts" local csv_path="" @@ -144,6 +158,8 @@ main() { log "ERROR: --model is required"; exit 1 fi + run_setup_cmd + local full_url="${url}:${port}" local artifact_dir="$result_dir/aiperf_artifacts" rm -rf "$artifact_dir" diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 146694734..23132aef6 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -155,20 +155,23 @@ def test_dynamo_cmd( def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.workloads = "aiperf.sh" + setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'" td.cmd_args.aiperf = AIPerf.model_validate( { + "setup-cmd": setup_cmd, "args": { 
"accuracy-benchmark": "mmlu", "accuracy-n-shots": 5, "accuracy-tasks": "abstract_algebra", "concurrency": 10, "num-requests": 100, - } + }, } ) result = strategy._gen_script_args(td) + assert f'--aiperf-setup-cmd "{setup_cmd}"' in result assert '--aiperf-args-accuracy-benchmark "mmlu"' in result assert '--aiperf-args-accuracy-n-shots "5"' in result assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result From b6c798277a9afc160917155c3a5e26e356d893e2 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 26 May 2026 21:52:27 -0700 Subject: [PATCH 05/16] hard bump aiperf --- conf/experimental/ai_dynamo/test/sglang.toml | 2 +- conf/experimental/ai_dynamo/test/vllm.toml | 2 +- doc/workloads/ai_dynamo.rst | 6 +++--- .../workloads/ai_dynamo/test_command_gen_strategy_slurm.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index be393b829..df62ee649 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -94,7 +94,7 @@ workloads = "aiperf.sh" concurrency = 2 [cmd_args.aiperf] - setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'" + setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'" [cmd_args.aiperf.args] accuracy-benchmark = "mmlu" diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index a072b5a5e..e1f65daa3 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -88,7 +88,7 @@ workloads = "aiperf.sh" concurrency = 2 [cmd_args.aiperf] - setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'" + setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'" [cmd_args.aiperf.args] accuracy-benchmark = "mmlu" diff --git 
a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index d9912fb06..66864bd54 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -122,7 +122,7 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it workloads = "aiperf.sh" [cmd_args.aiperf] - setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'" + setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'" [cmd_args.aiperf.args] accuracy-benchmark = "mmlu" @@ -136,8 +136,8 @@ When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``a ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark. -The ``setup-cmd`` field is optional. It is useful for Dynamo images that include ``aiperf`` without its accuracy extra; -CloudAI runs it immediately before launching ``aiperf profile``. +The ``setup-cmd`` field is optional. It is useful for Dynamo images that include an older system ``aiperf`` build without +the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``. 
Review Benchmark Results ------------------------ diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 23132aef6..f9ca68008 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -155,7 +155,7 @@ def test_dynamo_cmd( def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.workloads = "aiperf.sh" - setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'" + setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'" td.cmd_args.aiperf = AIPerf.model_validate( { "setup-cmd": setup_cmd, From de1110ed0dd75be69b29f4b779cea20622fc6a6e Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 26 May 2026 22:02:43 -0700 Subject: [PATCH 06/16] enable hf online --- conf/experimental/ai_dynamo/test/sglang.toml | 6 +++--- conf/experimental/ai_dynamo/test/vllm.toml | 6 +++--- doc/workloads/ai_dynamo.rst | 2 ++ src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 20 ++++++++++++++++++-- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index df62ee649..f73984d58 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -106,9 +106,9 @@ workloads = "aiperf.sh" [extra_env_vars] UCX_LOG_LEVEL = "warn" -HF_HUB_OFFLINE = "1" -TRANSFORMERS_OFFLINE = "1" -HF_DATASETS_OFFLINE = "1" +HF_HUB_OFFLINE = "0" +TRANSFORMERS_OFFLINE = "0" +HF_DATASETS_OFFLINE = "0" DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')" UCX_TLS = "all" #DYN_LOGGING_JSONL="true" diff --git a/conf/experimental/ai_dynamo/test/vllm.toml 
b/conf/experimental/ai_dynamo/test/vllm.toml index e1f65daa3..196595529 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -100,8 +100,8 @@ workloads = "aiperf.sh" [extra_env_vars] UCX_LOG_LEVEL = "warn" -HF_HUB_OFFLINE = "1" -TRANSFORMERS_OFFLINE = "1" -HF_DATASETS_OFFLINE = "1" +HF_HUB_OFFLINE = "0" +TRANSFORMERS_OFFLINE = "0" +HF_DATASETS_OFFLINE = "0" DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')" UCX_TLS = "all" diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 66864bd54..63e617b73 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -138,6 +138,8 @@ token-length flags out of this mode; the benchmark dataset should come from AIPe The ``setup-cmd`` field is optional. It is useful for Dynamo images that include an older system ``aiperf`` build without the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``. +MMLU is loaded from ``lighteval/mmlu``, so either allow Hugging Face dataset access or pre-cache that dataset before +running with ``HF_HUB_OFFLINE``/``HF_DATASETS_OFFLINE`` enabled. Review Benchmark Results ------------------------ diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index 8cb84f7d9..26a412647 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -424,6 +424,10 @@ function perform_exit() exit_on_error() { local fatal=$(_detect_fatal_once) + if [ -f "${FATAL_ERROR_MARKER}" ]; then + log "FATAL_ERROR_MARKER found. Terminating." + perform_exit 1 + fi if [ -f "${DONE_MARKER}" ]; then log "DONE_MARKER found. Skipping error check." 
return @@ -693,6 +697,13 @@ function mark_done() touch "$DONE_MARKER" } +function mark_failed() +{ + local message="$1" + log "ERROR: ${message}" + printf '%s\n' "${message}" > "${FATAL_ERROR_MARKER}" +} + function launch_etcd() { log "Launching etcd with cmd: ${dynamo_args["etcd-cmd"]} --listen-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]} --advertise-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]}" @@ -1034,6 +1045,11 @@ function launch_workload() --decode-nodes "${decode_config["node-list"]}" \ "${config_arr[@]}" \ -- "${args_arr[@]}" > "${RESULTS_DIR}/$workload_name.log" 2>&1 + local workload_status=$? + if [[ "${workload_status}" -ne 0 ]]; then + mark_failed "Workload ${workload_name} failed with exit code ${workload_status}. See ${RESULTS_DIR}/${workload_name}.log" + return "${workload_status}" + fi log "Done with $workload_name run" } @@ -1043,11 +1059,11 @@ function launch_workloads() wait_for_dynamo_frontend if _is_genai_perf_workload; then - launch_workload genai_perf_config genai_perf_args + launch_workload genai_perf_config genai_perf_args || return $? fi if _is_aiperf_workload; then - launch_workload aiperf_config aiperf_args + launch_workload aiperf_config aiperf_args || return $? 
fi mark_done From 632e8f52bd008cac2ad1a37394098e0791dc6292 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 26 May 2026 22:12:22 -0700 Subject: [PATCH 07/16] remove first token conf --- conf/experimental/ai_dynamo/test/sglang.toml | 2 +- conf/experimental/ai_dynamo/test/vllm.toml | 2 +- doc/workloads/ai_dynamo.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index f73984d58..36f8d7cc9 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -101,7 +101,7 @@ workloads = "aiperf.sh" accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" concurrency = 10 - extra-inputs = '{"temperature":0,"stop":["\n"]}' + extra-inputs = '{"temperature":0}' num-requests = 100 [extra_env_vars] diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 196595529..634f127b8 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -95,7 +95,7 @@ workloads = "aiperf.sh" accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" concurrency = 10 - extra-inputs = '{"temperature":0,"stop":["\n"]}' + extra-inputs = '{"temperature":0}' num-requests = 100 [extra_env_vars] diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 63e617b73..bdf2b8f70 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -129,7 +129,7 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. 
Enable it accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" concurrency = 10 - extra-inputs = '{"temperature":0,"stop":["\n"]}' + extra-inputs = '{"temperature":0}' num-requests = 100 When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the From b8217ec0304f2dd264de141edc882e16beb07bbb Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 26 May 2026 22:21:07 -0700 Subject: [PATCH 08/16] disable qwen thinking --- conf/experimental/ai_dynamo/test/sglang.toml | 2 +- conf/experimental/ai_dynamo/test/vllm.toml | 2 +- doc/workloads/ai_dynamo.rst | 3 ++- tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py | 3 +++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 36f8d7cc9..28e049c8a 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -101,7 +101,7 @@ workloads = "aiperf.sh" accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" concurrency = 10 - extra-inputs = '{"temperature":0}' + extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' num-requests = 100 [extra_env_vars] diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 634f127b8..fdae52847 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -95,7 +95,7 @@ workloads = "aiperf.sh" accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" concurrency = 10 - extra-inputs = '{"temperature":0}' + extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' num-requests = 100 [extra_env_vars] diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index bdf2b8f70..3de4a5523 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -129,7 +129,7 @@ AIDynamo uses AIPerf accuracy mode 
as its semantic degradation signal. Enable it accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" concurrency = 10 - extra-inputs = '{"temperature":0}' + extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' num-requests = 100 When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the @@ -140,6 +140,7 @@ The ``setup-cmd`` field is optional. It is useful for Dynamo images that include the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``. MMLU is loaded from ``lighteval/mmlu``, so either allow Hugging Face dataset access or pre-cache that dataset before running with ``HF_HUB_OFFLINE``/``HF_DATASETS_OFFLINE`` enabled. +For Qwen3 models, the example disables thinking mode so short MMLU answers can be parsed as choices. Review Benchmark Results ------------------------ diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index f9ca68008..8667cf0d2 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -156,6 +156,7 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.workloads = "aiperf.sh" setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'" + extra_inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' td.cmd_args.aiperf = AIPerf.model_validate( { "setup-cmd": setup_cmd, @@ -164,6 +165,7 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo "accuracy-n-shots": 5, "accuracy-tasks": "abstract_algebra", "concurrency": 10, + "extra-inputs": extra_inputs, "num-requests": 100, }, } @@ -176,6 +178,7 @@ def 
test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo assert '--aiperf-args-accuracy-n-shots "5"' in result assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result assert '--aiperf-args-concurrency "10"' in result + assert f"--aiperf-args-extra-inputs '{extra_inputs}'" in result assert '--aiperf-args-num-requests "100"' in result From 3c05fb515a8066ac10dd6c94b6f8a92d3ded79f8 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 27 May 2026 08:39:47 -0700 Subject: [PATCH 09/16] run both perf and accuracy tests --- conf/experimental/ai_dynamo/test/sglang.toml | 12 +- conf/experimental/ai_dynamo/test/vllm.toml | 12 +- doc/workloads/ai_dynamo.rst | 30 +++-- src/cloudai/workloads/ai_dynamo/__init__.py | 2 + src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 106 +++++++++++++----- src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 16 +++ src/cloudai/workloads/ai_dynamo/aiperf.sh | 18 +-- .../ai_dynamo/report_generation_strategy.py | 4 +- .../ai_dynamo/slurm_command_gen_strategy.py | 2 + .../test_command_gen_strategy_slurm.py | 46 ++++++++ .../ai_dynamo/test_report_gen_strategy.py | 101 +++++++++++++++++ 11 files changed, 297 insertions(+), 52 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 28e049c8a..7d9930ecd 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -94,9 +94,17 @@ workloads = "aiperf.sh" concurrency = 2 [cmd_args.aiperf] - setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'" - [cmd_args.aiperf.args] + concurrency = 2 + extra-inputs = '{"min_tokens":10}' + output-tokens-mean = 500 + request-count = 50 + synthetic-input-tokens-mean = 300 + + [cmd_args.aiperf_accuracy] + setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + + [cmd_args.aiperf_accuracy.args] accuracy-benchmark = "mmlu" accuracy-n-shots = 5 accuracy-tasks = 
"abstract_algebra" diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index fdae52847..e314fe743 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -88,9 +88,17 @@ workloads = "aiperf.sh" concurrency = 2 [cmd_args.aiperf] - setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'" - [cmd_args.aiperf.args] + concurrency = 2 + extra-inputs = '{"min_tokens":10}' + output-tokens-mean = 500 + request-count = 50 + synthetic-input-tokens-mean = 300 + + [cmd_args.aiperf_accuracy] + setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + + [cmd_args.aiperf_accuracy.args] accuracy-benchmark = "mmlu" accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 3de4a5523..b0e077f9f 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -113,8 +113,9 @@ To use genai-perf, set: Semantic Degradation With AIPerf Accuracy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it by adding AIPerf accuracy flags under -``[cmd_args.aiperf.args]`` and running the ``aiperf.sh`` workload: +AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it with +``[cmd_args.aiperf_accuracy]``. This runs after the configured performance workload, so it can be used with either +``aiperf.sh`` or ``genai_perf.sh``: .. code-block:: toml @@ -122,9 +123,16 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. 
Enable it workloads = "aiperf.sh" [cmd_args.aiperf] - setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'" + [cmd_args.aiperf.args] + request-count = 50 + synthetic-input-tokens-mean = 300 + output-tokens-mean = 500 + concurrency = 2 + + [cmd_args.aiperf_accuracy] + setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" - [cmd_args.aiperf.args] + [cmd_args.aiperf_accuracy.args] accuracy-benchmark = "mmlu" accuracy-n-shots = 5 accuracy-tasks = "abstract_algebra" @@ -132,9 +140,9 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' num-requests = 100 -When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the -``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt and -token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark. +When ``cmd_args.aiperf_accuracy`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes +the ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt +and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark. The ``setup-cmd`` field is optional. It is useful for Dynamo images that include an older system ``aiperf`` build without the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``. @@ -147,11 +155,13 @@ Review Benchmark Results After job completion, CloudAI places output logs and result files in the designated results directory. 
The result file name depends on the configured ``workloads`` field: -- ``aiperf.sh`` → ``aiperf_report.csv`` for performance mode, ``accuracy_results.csv`` for accuracy mode +- ``aiperf.sh`` → ``aiperf_report.csv`` - ``genai_perf.sh`` → ``genai_perf_report.csv`` +- ``cmd_args.aiperf_accuracy`` → ``accuracy_results.csv`` -If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_artifacts/accuracy_results.csv`` to ``accuracy_results.csv`` -in the run output directory and marks the run failed if that file is not produced. +If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_accuracy_artifacts/accuracy_results.csv`` to +``accuracy_results.csv`` in the run output directory and marks the run failed if that file is not produced. The older +one-shot form that puts ``accuracy-benchmark`` under ``cmd_args.aiperf.args`` remains supported for compatibility. Navigate to ``./results///0/`` and open the CSV to examine performance metrics. diff --git a/src/cloudai/workloads/ai_dynamo/__init__.py b/src/cloudai/workloads/ai_dynamo/__init__.py index 1360ce10d..4aac3fd2c 100644 --- a/src/cloudai/workloads/ai_dynamo/__init__.py +++ b/src/cloudai/workloads/ai_dynamo/__init__.py @@ -19,6 +19,7 @@ AIDynamoCmdArgs, AIDynamoTestDefinition, AIPerf, + AIPerfAccuracy, GenAIPerf, LMCache, LMCacheArgs, @@ -37,6 +38,7 @@ "AIDynamoSlurmCommandGenStrategy", "AIDynamoTestDefinition", "AIPerf", + "AIPerfAccuracy", "GenAIPerf", "LMCache", "LMCacheArgs", diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index a28d84f4b..ffe34fa82 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -42,6 +42,7 @@ from cloudai.systems.slurm import SlurmSystem AIPERF_ARTIFACTS_DIR = "aiperf_artifacts" +AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts" AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv" @@ -94,6 +95,7 @@ def parse_aiperf_accuracy(output_path: Path) -> float | None: """ 
candidates = [ output_path / AIPERF_ACCURACY_RESULTS_CSV, + output_path / AIPERF_ACCURACY_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV, output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV, ] @@ -401,6 +403,22 @@ def has_accuracy_benchmark(self) -> bool: return "--accuracy-benchmark" in extra_args +class AIPerfAccuracy(AIPerf): + """Optional AIPerf accuracy benchmark configuration.""" + + name: str = "aiperf_accuracy" + report_name: str = Field( + default="aiperf_accuracy_report.csv", + serialization_alias="report-name", + validation_alias=AliasChoices("report-name", "report_name"), + ) + artifact_dir_name: str = Field( + default=AIPERF_ACCURACY_ARTIFACTS_DIR, + serialization_alias="artifact-dir-name", + validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"), + ) + + class Constraints(BaseModel): """Constraints for validation of AI Dynamo configurations when using DSE.""" @@ -421,6 +439,7 @@ class AIDynamoCmdArgs(CmdArgs): lmcache: LMCache = Field(default_factory=LMCache) genai_perf: GenAIPerf = Field(default_factory=GenAIPerf) aiperf: AIPerf = Field(default_factory=AIPerf) + aiperf_accuracy: AIPerfAccuracy | None = None workloads: str = "genai_perf.sh" @field_validator("workloads", mode="before") @@ -443,6 +462,7 @@ def installables(self) -> list[Installable]: *self.lmcache.installables, *self.genai_perf.installables, *self.aiperf.installables, + *(self.aiperf_accuracy.installables if self.aiperf_accuracy else []), ] @@ -471,6 +491,9 @@ def workload_scripts(self) -> "AIDynamoTestDefinition": if workload not in workload_map: raise ValueError(f"Invalid workload: {workload}. 
Available workloads: {list(workload_map.keys())}") + if self.cmd_args.aiperf_accuracy is not None and not self.cmd_args.aiperf_accuracy.has_accuracy_benchmark: + raise ValueError("cmd_args.aiperf_accuracy must configure an AIPerf --accuracy-benchmark argument") + return self def get_workload_map(self) -> dict[str, Workload]: @@ -504,10 +527,58 @@ def installables(self) -> list[Installable]: *self.cmd_args.installables, ] + def _has_aiperf_accuracy_results(self, output_path: Path) -> bool: + accuracy = parse_aiperf_accuracy(output_path) + if accuracy is None: + logging.info(f"AIPerf accuracy results not found in {output_path}.") + return False + + logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}") + return True + + def _is_legacy_aiperf_accuracy_workload(self, workload: str) -> bool: + return workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark + + def _was_workload_report_produced(self, output_path: Path, workload: str, workload_config: Workload) -> bool: + report_name = workload_config.report_name + if report_name is None: + logging.warning(f"Workload {workload} has no report_name configured") + return False + + workload_csv_file = output_path / report_name + if not workload_csv_file.exists(): + logging.info(f"Result file ({workload_csv_file.absolute()}) not found for workload: {workload}") + return False + + logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}") + return True + + def _was_workload_successful(self, output_path: Path, workload: str, workload_map: dict[str, Workload]) -> bool: + workload_config = workload_map.get(workload) + if workload_config is None: + logging.info(f"Workload {workload} not found in workload map") + return False + + if self._is_legacy_aiperf_accuracy_workload(workload): + return self._has_aiperf_accuracy_results(output_path) + + return self._was_workload_report_produced(output_path, workload, workload_config) + + def 
_were_workloads_successful(self, output_path: Path) -> bool: + workload_map = self.get_workload_map() + result = True + for workload in self.cmd_args.workloads_list: + result = self._was_workload_successful(output_path, workload, workload_map) and result + return result + + def _was_aiperf_accuracy_successful(self, output_path: Path) -> bool: + if self.cmd_args.aiperf_accuracy is None: + return True + + return self._has_aiperf_accuracy_results(output_path) + def was_run_successful(self, tr: TestRun) -> JobStatusResult: output_path = tr.output_path - result = True - workload_map = self.get_workload_map() failure_marker = output_path / self.failure_marker success_marker = output_path / self.success_marker @@ -518,34 +589,9 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: if not success_marker.exists(): return JobStatusResult(False, error_message=f"Success marker file not found: {success_marker.absolute()}") - for workload in self.cmd_args.workloads_list: - if workload not in workload_map: - logging.info(f"Workload {workload} not found in workload map") - result = False - continue - - if workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark: - accuracy = parse_aiperf_accuracy(output_path) - if accuracy is None: - logging.info(f"AIPerf accuracy results not found in {output_path}.") - result = False - else: - logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}") - continue - - report_name = workload_map[workload].report_name - if report_name is None: - logging.warning(f"Workload {workload} has no report_name configured") - result = False - continue - workload_csv_file = output_path / report_name - if not workload_csv_file.exists(): - logging.info(f"Result file ({workload_csv_file.absolute()}) not found for workload: {workload}") - result = False - else: - logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}") - - return JobStatusResult(result) + 
workloads_successful = self._were_workloads_successful(output_path) + accuracy_successful = self._was_aiperf_accuracy_successful(output_path) + return JobStatusResult(workloads_successful and accuracy_successful) def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: prefill_worker = tr.test.cmd_args.dynamo.prefill_worker diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index 26a412647..5b65db41f 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -37,6 +37,8 @@ declare -A genai_perf_args declare -A genai_perf_config declare -A aiperf_args declare -A aiperf_config +declare -A aiperf_accuracy_args +declare -A aiperf_accuracy_config declare -A dynamo_args dynamo_args["backend"]="vllm" @@ -173,6 +175,10 @@ _parse_cli_pairs() { aiperf_args["--${key#--aiperf-args-}"]="$2" ;; --aiperf-*) aiperf_config["--${key#--aiperf-}"]="$2" ;; + --aiperf_accuracy-args-*) + aiperf_accuracy_args["--${key#--aiperf_accuracy-args-}"]="$2" ;; + --aiperf_accuracy-*) + aiperf_accuracy_config["--${key#--aiperf_accuracy-}"]="$2" ;; --hf-home) HUGGINGFACE_HOME="$2" ;; --storage-cache-dir) @@ -365,6 +371,8 @@ _dump_args() { log "GenAI-Perf args:\n$(arg_array_to_string genai_perf_args)" log "AIPerf config params:\n$(arg_array_to_string aiperf_config)" log "AIPerf args:\n$(arg_array_to_string aiperf_args)" + log "AIPerf accuracy config params:\n$(arg_array_to_string aiperf_accuracy_config)" + log "AIPerf accuracy args:\n$(arg_array_to_string aiperf_accuracy_args)" log "--------------------------------" } @@ -525,6 +533,10 @@ _is_aiperf_workload() { [[ "${dynamo_args["workloads"]}" == *"aiperf.sh"* ]] } +_is_aiperf_accuracy_enabled() { + [[ -n "${aiperf_accuracy_config["--script"]:-}" ]] +} + _init_runtime_env() { if _is_vllm || _is_sglang; then export HF_HOME="${HUGGINGFACE_HOME}" @@ -1066,6 +1078,10 @@ function launch_workloads() launch_workload aiperf_config 
aiperf_args || return $? fi + if _is_aiperf_accuracy_enabled; then + launch_workload aiperf_accuracy_config aiperf_accuracy_args || return $? + fi + mark_done } diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh index ac3131b50..bf08f869d 100644 --- a/src/cloudai/workloads/ai_dynamo/aiperf.sh +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -19,7 +19,7 @@ # # Called from ai_dynamo.sh's launch_workload() with: # bash aiperf.sh --result-dir --model --url --port -# [--cmd ] [--report-name ] [--extra-args ] +# [--cmd ] [--report-name ] [--artifact-dir-name ] [--extra-args ] # -- ... # # Context flags (before --) that are recognised and used: @@ -28,6 +28,7 @@ # --url Base URL of the dynamo.frontend (e.g. http://node01). # --port HTTP port the dynamo.frontend is listening on. # --report-name Output CSV name (default: aiperf_report.csv). +# --artifact-dir-name Artifact directory name under --result-dir (default: aiperf_artifacts). # --cmd Full launch command including subcommand (default: "aiperf profile"). # --setup-cmd Optional shell command run before launching aiperf. # --extra-args Raw string appended verbatim after all other flags. 
@@ -44,6 +45,7 @@ model="" url="http://localhost" port=8000 report_name="aiperf_report.csv" +artifact_dir_name="aiperf_artifacts" cmd="aiperf profile" setup_cmd="" declare -a extra_args=() @@ -85,10 +87,11 @@ process_args() { --url) url="$2"; shift 2 ;; --port) port="$2"; shift 2 ;; --report-name) report_name="$2"; shift 2 ;; - --cmd) cmd="$2"; shift 2 ;; - --setup-cmd) setup_cmd="$2"; shift 2 ;; - --extra-args) read -ra extra_args <<< "$2"; shift 2 ;; - --) shift; _parse_aiperf_args "$@"; break ;; + --artifact-dir-name) artifact_dir_name="$2"; shift 2 ;; + --cmd) cmd="$2"; shift 2 ;; + --setup-cmd) setup_cmd="$2"; shift 2 ;; + --extra-args) read -ra extra_args <<< "$2"; shift 2 ;; + --) shift; _parse_aiperf_args "$@"; break ;; --*) if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;; # consume unknown flag; shift 2 only if next arg is a value *) shift ;; esac @@ -100,6 +103,7 @@ process_args() { url: $url port: $port report_name: $report_name + artifact_dir: $artifact_dir_name cmd: $cmd setup_cmd: ${setup_cmd:-} extra_args: ${extra_args[*]:-} @@ -117,7 +121,7 @@ run_setup_cmd() { } process_results() { - local artifact_dir="$result_dir/aiperf_artifacts" + local artifact_dir="$result_dir/$artifact_dir_name" local csv_path="" local accuracy_path="$artifact_dir/accuracy_results.csv" @@ -161,7 +165,7 @@ main() { run_setup_cmd local full_url="${url}:${port}" - local artifact_dir="$result_dir/aiperf_artifacts" + local artifact_dir="$result_dir/$artifact_dir_name" rm -rf "$artifact_dir" # Split cmd into an array (e.g. 
"aiperf profile" → ["aiperf", "profile"]) diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py index a665b0d6a..4b46cf5dc 100644 --- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py @@ -48,7 +48,9 @@ def get_metric(self, metric: str) -> MetricValue: if metric.lower() == "accuracy": tdef = self.test_run.test - if not isinstance(tdef, AIDynamoTestDefinition) or not tdef.cmd_args.aiperf.has_accuracy_benchmark: + if not isinstance(tdef, AIDynamoTestDefinition): + return METRIC_ERROR + if tdef.cmd_args.aiperf_accuracy is None and not tdef.cmd_args.aiperf.has_accuracy_benchmark: return METRIC_ERROR accuracy = parse_aiperf_accuracy(self.test_run.output_path) return accuracy if accuracy is not None else METRIC_ERROR diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 17079875c..51e704102 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -118,6 +118,8 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: args.extend(self._get_nested_toml_args(td.cmd_args.lmcache, "--lmcache-")) args.extend(self._get_nested_toml_args(td.cmd_args.genai_perf, "--genai_perf-")) args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-")) + if td.cmd_args.aiperf_accuracy is not None: + args.extend(self._get_nested_toml_args(td.cmd_args.aiperf_accuracy, "--aiperf_accuracy-")) return args diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 8667cf0d2..6a9274b8c 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -28,6 +28,7 @@ 
AIDynamoSlurmCommandGenStrategy, AIDynamoTestDefinition, AIPerf, + AIPerfAccuracy, GenAIPerf, LMCache, LMCacheArgs, @@ -182,6 +183,51 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo assert '--aiperf-args-num-requests "100"' in result +def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + td.cmd_args.workloads = "aiperf.sh" + setup_cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + extra_inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' + td.cmd_args.aiperf = AIPerf.model_validate( + { + "args": { + "concurrency": 2, + "request-count": 50, + "synthetic-input-tokens-mean": 300, + "output-tokens-mean": 500, + }, + } + ) + td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate( + { + "setup-cmd": setup_cmd, + "args": { + "accuracy-benchmark": "mmlu", + "accuracy-n-shots": 5, + "accuracy-tasks": "abstract_algebra", + "concurrency": 10, + "extra-inputs": extra_inputs, + "num-requests": 100, + }, + } + ) + + result = strategy._gen_script_args(td) + + assert '--aiperf-args-request-count "50"' in result + assert '--aiperf-args-synthetic-input-tokens-mean "300"' in result + assert '--aiperf-args-output-tokens-mean "500"' in result + assert f'--aiperf_accuracy-setup-cmd "{setup_cmd}"' in result + assert '--aiperf_accuracy-name "aiperf_accuracy"' in result + assert '--aiperf_accuracy-artifact-dir-name "aiperf_accuracy_artifacts"' in result + assert '--aiperf_accuracy-args-accuracy-benchmark "mmlu"' in result + assert '--aiperf_accuracy-args-accuracy-n-shots "5"' in result + assert '--aiperf_accuracy-args-accuracy-tasks "abstract_algebra"' in result + assert '--aiperf_accuracy-args-concurrency "10"' in result + assert f"--aiperf_accuracy-args-extra-inputs '{extra_inputs}'" in result + assert '--aiperf_accuracy-args-num-requests "100"' in result + + def 
test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py index 1c9cbb013..1235ab3e6 100644 --- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py +++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py @@ -26,6 +26,7 @@ AIDynamoCmdArgs, AIDynamoTestDefinition, AIPerf, + AIPerfAccuracy, GenAIPerf, LMCache, LMCacheArgs, @@ -151,6 +152,64 @@ def ai_dynamo_aiperf_accuracy_tr(tmp_path: Path) -> TestRun: return tr +@pytest.fixture +def ai_dynamo_aiperf_with_split_accuracy_tr(tmp_path: Path) -> TestRun: + test = AIDynamoTestDefinition( + name="ai_dynamo_aiperf_with_split_accuracy", + description="desc", + test_template_name="t", + cmd_args=AIDynamoCmdArgs( + docker_image_url="http://url", + workloads="aiperf.sh", + dynamo=AIDynamoArgs( + prefill_worker=WorkerConfig( + cmd="python3 -m dynamo.vllm --is-prefill-worker", + worker_initialized_regex="VllmWorker.*has.been.initialized", + args=WorkerBaseArgs(), + ), + ), + aiperf=AIPerf(), + aiperf_accuracy=AIPerfAccuracy.model_validate({"args": {"accuracy-benchmark": "mmlu"}}), + lmcache=LMCache(args=LMCacheArgs()), + ), + ) + tr = TestRun(name="ai_dynamo_aiperf_with_split_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path) + (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content()) + (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content()) + (tr.output_path / test.success_marker).touch() + return tr + + +@pytest.fixture +def ai_dynamo_genai_perf_with_split_accuracy_tr(tmp_path: Path) -> TestRun: + test = AIDynamoTestDefinition( + name="ai_dynamo_genai_perf_with_split_accuracy", + description="desc", + test_template_name="t", + cmd_args=AIDynamoCmdArgs( + 
docker_image_url="http://url", + workloads="genai_perf.sh", + dynamo=AIDynamoArgs( + prefill_worker=WorkerConfig( + cmd="python3 -m dynamo.vllm --is-prefill-worker", + worker_initialized_regex="VllmWorker.*has.been.initialized", + args=WorkerBaseArgs(), + ), + ), + genai_perf=GenAIPerf(), + aiperf_accuracy=AIPerfAccuracy.model_validate({"args": {"accuracy-benchmark": "mmlu"}}), + lmcache=LMCache(args=LMCacheArgs()), + ), + ) + tr = TestRun( + name="ai_dynamo_genai_perf_with_split_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path + ) + (tr.output_path / "genai_perf_report.csv").write_text(get_csv_content()) + (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content()) + (tr.output_path / test.success_marker).touch() + return tr + + @pytest.fixture def csv_content() -> str: return get_csv_content() @@ -199,6 +258,15 @@ def test_ai_dynamo_get_metric_aiperf_accuracy(slurm_system: SlurmSystem, ai_dyna assert strategy.get_metric("accuracy") == 0.35 +def test_ai_dynamo_get_metric_split_aiperf_accuracy( + slurm_system: SlurmSystem, ai_dynamo_aiperf_with_split_accuracy_tr: TestRun +) -> None: + strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_with_split_accuracy_tr) + + assert strategy.get_metric("accuracy") == 0.35 + assert strategy.get_metric("Inter Token Latency (ms)") == 2.83 + + def test_ai_dynamo_accuracy_metric_requires_aiperf_accuracy_config( slurm_system: SlurmSystem, ai_dynamo_aiperf_tr: TestRun ) -> None: @@ -228,6 +296,22 @@ def test_was_run_successful_with_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: T assert result.is_successful is True +def test_was_run_successful_with_split_aiperf_accuracy( + ai_dynamo_aiperf_with_split_accuracy_tr: TestRun, +) -> None: + test_def = ai_dynamo_aiperf_with_split_accuracy_tr.test + result = test_def.was_run_successful(ai_dynamo_aiperf_with_split_accuracy_tr) + assert result.is_successful is True + + +def 
test_was_run_successful_with_genai_perf_and_split_aiperf_accuracy( + ai_dynamo_genai_perf_with_split_accuracy_tr: TestRun, +) -> None: + test_def = ai_dynamo_genai_perf_with_split_accuracy_tr.test + result = test_def.was_run_successful(ai_dynamo_genai_perf_with_split_accuracy_tr) + assert result.is_successful is True + + def test_was_run_successful_requires_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None: test_def = ai_dynamo_aiperf_accuracy_tr.test (ai_dynamo_aiperf_accuracy_tr.output_path / "accuracy_results.csv").unlink() @@ -235,6 +319,15 @@ def test_was_run_successful_requires_aiperf_accuracy(ai_dynamo_aiperf_accuracy_t assert result.is_successful is False +def test_was_run_successful_requires_split_aiperf_accuracy( + ai_dynamo_aiperf_with_split_accuracy_tr: TestRun, +) -> None: + test_def = ai_dynamo_aiperf_with_split_accuracy_tr.test + (ai_dynamo_aiperf_with_split_accuracy_tr.output_path / "accuracy_results.csv").unlink() + result = test_def.was_run_successful(ai_dynamo_aiperf_with_split_accuracy_tr) + assert result.is_successful is False + + def test_was_run_successful_no_results(ai_dynamo_tr: TestRun, tmp_path: Path) -> None: test_def = ai_dynamo_tr.test ai_dynamo_tr.output_path = tmp_path / "empty_output" @@ -251,6 +344,14 @@ def test_parse_aiperf_accuracy_from_artifact_dir(tmp_path: Path) -> None: assert parse_aiperf_accuracy(tmp_path) == 0.35 +def test_parse_aiperf_accuracy_from_split_accuracy_artifact_dir(tmp_path: Path) -> None: + artifact_dir = tmp_path / "aiperf_accuracy_artifacts" + artifact_dir.mkdir() + (artifact_dir / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content(), encoding="utf-8") + + assert parse_aiperf_accuracy(tmp_path) == 0.35 + + def test_parse_aiperf_accuracy_missing_or_invalid(tmp_path: Path) -> None: (tmp_path / "accuracy_results.csv").write_text("Task,Correct,Total,Accuracy\nOVERALL,n/a,100,n/a\n") From 92d4c89392aaaedce263aae8cf26518d09787a24 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: 
Wed, 27 May 2026 09:05:53 -0700 Subject: [PATCH 10/16] refactor --- doc/workloads/ai_dynamo.rst | 3 +- src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 168 +++++++++--------- .../ai_dynamo/report_generation_strategy.py | 2 +- .../test_command_gen_strategy_slurm.py | 30 ---- .../ai_dynamo/test_report_gen_strategy.py | 45 ----- 5 files changed, 83 insertions(+), 165 deletions(-) diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index b0e077f9f..54aa4c252 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -160,8 +160,7 @@ After job completion, CloudAI places output logs and result files in the designa - ``cmd_args.aiperf_accuracy`` → ``accuracy_results.csv`` If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_accuracy_artifacts/accuracy_results.csv`` to -``accuracy_results.csv`` in the run output directory and marks the run failed if that file is not produced. The older -one-shot form that puts ``accuracy-benchmark`` under ``cmd_args.aiperf.args`` remains supported for compatibility. +``accuracy_results.csv`` in the run output directory and marks the run failed if that file is not produced. Navigate to ``./results///0/`` and open the CSV to examine performance metrics. 
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index ffe34fa82..8c5dcaad1 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -46,87 +46,6 @@ AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv" -def _parse_accuracy_value(value: str | int | float | None) -> float | None: - if value is None: - return None - if isinstance(value, (int, float)): - accuracy = float(value) - return accuracy / 100 if accuracy > 1 else accuracy - - raw_value = value.strip() - if not raw_value: - return None - - is_percentage = raw_value.endswith("%") - if is_percentage: - raw_value = raw_value[:-1].strip() - - try: - accuracy = float(raw_value) - except ValueError: - return None - - return accuracy / 100 if is_percentage or accuracy > 1 else accuracy - - -def _parse_count_value(value: str | int | float | None) -> float | None: - if value is None: - return None - if isinstance(value, (int, float)): - return float(value) - try: - return float(value.strip()) - except ValueError: - return None - - -def parse_aiperf_accuracy(output_path: Path) -> float | None: - """ - Parse AIPerf accuracy from accuracy_results.csv. - - Expected CSV format: - Task,Correct,Total,Accuracy - abstract_algebra,35,100,35.00% - OVERALL,8368,14042,59.59% - - AIPerf writes this file under aiperf_artifacts; CloudAI's wrapper also copies - it to the run output directory when present. The returned value is normalized - to a 0.0-1.0 fraction. 
- """ - candidates = [ - output_path / AIPERF_ACCURACY_RESULTS_CSV, - output_path / AIPERF_ACCURACY_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV, - output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV, - ] - - for csv_file in candidates: - if not csv_file.exists() or csv_file.stat().st_size == 0: - continue - - fallback_accuracy: float | None = None - with csv_file.open(newline="", encoding="utf-8") as f: - for row in csv.DictReader(f): - accuracy = _parse_accuracy_value(row.get("Accuracy") or row.get("accuracy") or row.get("Value")) - if accuracy is None: - correct = _parse_count_value(row.get("Correct") or row.get("correct")) - total = _parse_count_value(row.get("Total") or row.get("total")) - if correct is not None and total: - accuracy = correct / total - if accuracy is None: - continue - - task = (row.get("Task") or row.get("task") or row.get("Metric") or "").strip().upper() - if task == "OVERALL": - return accuracy - if fallback_accuracy is None: - fallback_accuracy = accuracy - - if fallback_accuracy is not None: - return fallback_accuracy - - return None - - class Args(BaseModel): """Arguments for custom workloads.""" @@ -536,9 +455,6 @@ def _has_aiperf_accuracy_results(self, output_path: Path) -> bool: logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}") return True - def _is_legacy_aiperf_accuracy_workload(self, workload: str) -> bool: - return workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark - def _was_workload_report_produced(self, output_path: Path, workload: str, workload_config: Workload) -> bool: report_name = workload_config.report_name if report_name is None: @@ -559,9 +475,6 @@ def _was_workload_successful(self, output_path: Path, workload: str, workload_ma logging.info(f"Workload {workload} not found in workload map") return False - if self._is_legacy_aiperf_accuracy_workload(workload): - return self._has_aiperf_accuracy_results(output_path) - return 
self._was_workload_report_produced(output_path, workload, workload_config) def _were_workloads_successful(self, output_path: Path) -> bool: @@ -623,3 +536,84 @@ def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool: logging.info("constraint_check passed for: tp_times_pp_le_gpus_per_node") return True + + +def _parse_accuracy_value(value: str | int | float | None) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + accuracy = float(value) + return accuracy / 100 if accuracy > 1 else accuracy + + raw_value = value.strip() + if not raw_value: + return None + + is_percentage = raw_value.endswith("%") + if is_percentage: + raw_value = raw_value[:-1].strip() + + try: + accuracy = float(raw_value) + except ValueError: + return None + + return accuracy / 100 if is_percentage or accuracy > 1 else accuracy + + +def _parse_count_value(value: str | int | float | None) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + try: + return float(value.strip()) + except ValueError: + return None + + +def parse_aiperf_accuracy(output_path: Path) -> float | None: + """ + Parse AIPerf accuracy from accuracy_results.csv. + + Expected CSV format: + Task,Correct,Total,Accuracy + abstract_algebra,35,100,35.00% + OVERALL,8368,14042,59.59% + + AIPerf writes this file under aiperf_artifacts; CloudAI's wrapper also copies + it to the run output directory when present. The returned value is normalized + to a 0.0-1.0 fraction. 
+ """ + candidates = [ + output_path / AIPERF_ACCURACY_RESULTS_CSV, + output_path / AIPERF_ACCURACY_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV, + output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV, + ] + + for csv_file in candidates: + if not csv_file.exists() or csv_file.stat().st_size == 0: + continue + + fallback_accuracy: float | None = None + with csv_file.open(newline="", encoding="utf-8") as f: + for row in csv.DictReader(f): + accuracy = _parse_accuracy_value(row.get("Accuracy") or row.get("accuracy") or row.get("Value")) + if accuracy is None: + correct = _parse_count_value(row.get("Correct") or row.get("correct")) + total = _parse_count_value(row.get("Total") or row.get("total")) + if correct is not None and total: + accuracy = correct / total + if accuracy is None: + continue + + task = (row.get("Task") or row.get("task") or row.get("Metric") or "").strip().upper() + if task == "OVERALL": + return accuracy + if fallback_accuracy is None: + fallback_accuracy = accuracy + + if fallback_accuracy is not None: + return fallback_accuracy + + return None diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py index 4b46cf5dc..a0ef92005 100644 --- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py @@ -50,7 +50,7 @@ def get_metric(self, metric: str) -> MetricValue: tdef = self.test_run.test if not isinstance(tdef, AIDynamoTestDefinition): return METRIC_ERROR - if tdef.cmd_args.aiperf_accuracy is None and not tdef.cmd_args.aiperf.has_accuracy_benchmark: + if tdef.cmd_args.aiperf_accuracy is None: return METRIC_ERROR accuracy = parse_aiperf_accuracy(self.test_run.output_path) return accuracy if accuracy is not None else METRIC_ERROR diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 
6a9274b8c..9b0f695d5 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -153,36 +153,6 @@ def test_dynamo_cmd( assert result.strip() == expected -def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: - td = cast(AIDynamoTestDefinition, strategy.test_run.test) - td.cmd_args.workloads = "aiperf.sh" - setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'" - extra_inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' - td.cmd_args.aiperf = AIPerf.model_validate( - { - "setup-cmd": setup_cmd, - "args": { - "accuracy-benchmark": "mmlu", - "accuracy-n-shots": 5, - "accuracy-tasks": "abstract_algebra", - "concurrency": 10, - "extra-inputs": extra_inputs, - "num-requests": 100, - }, - } - ) - - result = strategy._gen_script_args(td) - - assert f'--aiperf-setup-cmd "{setup_cmd}"' in result - assert '--aiperf-args-accuracy-benchmark "mmlu"' in result - assert '--aiperf-args-accuracy-n-shots "5"' in result - assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result - assert '--aiperf-args-concurrency "10"' in result - assert f"--aiperf-args-extra-inputs '{extra_inputs}'" in result - assert '--aiperf-args-num-requests "100"' in result - - def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.workloads = "aiperf.sh" diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py index 1235ab3e6..9afdd6e72 100644 --- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py +++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py @@ -126,32 +126,6 @@ def ai_dynamo_aiperf_tr(tmp_path: Path) -> TestRun: return tr -@pytest.fixture -def ai_dynamo_aiperf_accuracy_tr(tmp_path: Path) 
-> TestRun: - test = AIDynamoTestDefinition( - name="ai_dynamo_aiperf_accuracy", - description="desc", - test_template_name="t", - cmd_args=AIDynamoCmdArgs( - docker_image_url="http://url", - workloads="aiperf.sh", - dynamo=AIDynamoArgs( - prefill_worker=WorkerConfig( - cmd="python3 -m dynamo.vllm --is-prefill-worker", - worker_initialized_regex="VllmWorker.*has.been.initialized", - args=WorkerBaseArgs(), - ), - ), - aiperf=AIPerf.model_validate({"args": {"accuracy-benchmark": "mmlu"}}), - lmcache=LMCache(args=LMCacheArgs()), - ), - ) - tr = TestRun(name="ai_dynamo_aiperf_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path) - (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content()) - (tr.output_path / test.success_marker).touch() - return tr - - @pytest.fixture def ai_dynamo_aiperf_with_split_accuracy_tr(tmp_path: Path) -> TestRun: test = AIDynamoTestDefinition( @@ -252,12 +226,6 @@ def test_ai_dynamo_get_metric_aiperf(slurm_system: SlurmSystem, ai_dynamo_aiperf assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47 -def test_ai_dynamo_get_metric_aiperf_accuracy(slurm_system: SlurmSystem, ai_dynamo_aiperf_accuracy_tr: TestRun) -> None: - strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_accuracy_tr) - - assert strategy.get_metric("accuracy") == 0.35 - - def test_ai_dynamo_get_metric_split_aiperf_accuracy( slurm_system: SlurmSystem, ai_dynamo_aiperf_with_split_accuracy_tr: TestRun ) -> None: @@ -290,12 +258,6 @@ def test_was_run_successful(ai_dynamo_tr: TestRun) -> None: assert result.is_successful is True -def test_was_run_successful_with_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None: - test_def = ai_dynamo_aiperf_accuracy_tr.test - result = test_def.was_run_successful(ai_dynamo_aiperf_accuracy_tr) - assert result.is_successful is True - - def test_was_run_successful_with_split_aiperf_accuracy( ai_dynamo_aiperf_with_split_accuracy_tr: TestRun, ) -> 
None: @@ -312,13 +274,6 @@ def test_was_run_successful_with_genai_perf_and_split_aiperf_accuracy( assert result.is_successful is True -def test_was_run_successful_requires_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None: - test_def = ai_dynamo_aiperf_accuracy_tr.test - (ai_dynamo_aiperf_accuracy_tr.output_path / "accuracy_results.csv").unlink() - result = test_def.was_run_successful(ai_dynamo_aiperf_accuracy_tr) - assert result.is_successful is False - - def test_was_run_successful_requires_split_aiperf_accuracy( ai_dynamo_aiperf_with_split_accuracy_tr: TestRun, ) -> None: From 6ecf52e0180131b23575525612dff71464d0dccd Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 27 May 2026 09:14:27 -0700 Subject: [PATCH 11/16] udpate sglang config --- conf/experimental/ai_dynamo/test/sglang.toml | 4 +-- .../ai_dynamo/test_scenario/sglang_slurm.toml | 27 +++++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 7d9930ecd..18e1681dd 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -32,7 +32,7 @@ workloads = "aiperf.sh" num-nodes = 1 cmd = 'python3 -m dynamo.sglang' extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics" - worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' + worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' [cmd_args.dynamo.prefill_worker.args] page-size = 16 @@ -48,7 +48,7 @@ workloads = "aiperf.sh" num-nodes = 1 cmd = 'python3 -m dynamo.sglang' extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics" - worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' + worker-initialized-regex = 
'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' [cmd_args.dynamo.decode_worker.args] page-size = 16 diff --git a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml index 26ed91285..4df1a6d64 100644 --- a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml @@ -15,11 +15,12 @@ # limitations under the License. name = "dynamo_sglang" +job_status_check = false [[Tests]] -id = "sglang-Qwen3-0.6B" +id = "test.disagg.single-node" test_name = "sglang" -time_limit = "00:20:00" +time_limit = "00:10:00" [Tests.cmd_args] [Tests.cmd_args.dynamo] @@ -37,3 +38,25 @@ time_limit = "00:20:00" [Tests.cmd_args.dynamo.decode_worker.args] tensor-parallel-size = 1 + +[[Tests]] +id = "test.disagg.multinode" +test_name = "sglang" +time_limit = "00:10:00" + + [Tests.cmd_args] + [Tests.cmd_args.dynamo] + model = "Qwen/Qwen3-0.6B" + node-setup-cmd = "hostname" + + [Tests.cmd_args.dynamo.prefill_worker] + num-nodes = 2 + + [Tests.cmd_args.dynamo.prefill_worker.args] + tensor-parallel-size = 1 + + [Tests.cmd_args.dynamo.decode_worker] + num-nodes = 2 + + [Tests.cmd_args.dynamo.decode_worker.args] + tensor-parallel-size = 1 From e07a44f213ea6bcd2da0e6fab83f6b50756635fb Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 27 May 2026 09:35:42 -0700 Subject: [PATCH 12/16] trying to fix missing aiperf for sgalng --- conf/experimental/ai_dynamo/test/sglang.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 18e1681dd..b04deb92a 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -94,6 +94,8 @@ workloads = "aiperf.sh" concurrency = 2 [cmd_args.aiperf] + setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + 
[cmd_args.aiperf.args] concurrency = 2 extra-inputs = '{"min_tokens":10}' From a3e70921e29b5dfb902a4f85ae7727cf5d87cf6d Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 27 May 2026 09:46:49 -0700 Subject: [PATCH 13/16] fixing sglang --- conf/experimental/ai_dynamo/test/sglang.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index b04deb92a..904c4a3ad 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -94,7 +94,7 @@ workloads = "aiperf.sh" concurrency = 2 [cmd_args.aiperf] - setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0" [cmd_args.aiperf.args] concurrency = 2 @@ -104,7 +104,7 @@ workloads = "aiperf.sh" synthetic-input-tokens-mean = 300 [cmd_args.aiperf_accuracy] - setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" + setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0" [cmd_args.aiperf_accuracy.args] accuracy-benchmark = "mmlu" From 333b272ddeca51420fd3b720313c08782646ef01 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 27 May 2026 11:27:45 -0700 Subject: [PATCH 14/16] allowing custom scripts --- conf/experimental/ai_dynamo/test/sglang.toml | 23 +-- conf/experimental/ai_dynamo/test/vllm.toml | 23 +-- doc/workloads/ai_dynamo.rst | 47 +++++-- src/cloudai/workloads/ai_dynamo/accuracy.sh | 133 ++++++++++++++++++ src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 33 ++--- src/cloudai/workloads/ai_dynamo/aiperf.sh | 77 +++++----- .../ai_dynamo/slurm_command_gen_strategy.py | 3 + .../ai_dynamo/test_accuracy_script.py | 123 
++++++++++++++++ .../test_command_gen_strategy_slurm.py | 49 +++++-- .../ai_dynamo/test_report_gen_strategy.py | 8 +- 10 files changed, 414 insertions(+), 105 deletions(-) create mode 100644 src/cloudai/workloads/ai_dynamo/accuracy.sh create mode 100644 tests/workloads/ai_dynamo/test_accuracy_script.py diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 904c4a3ad..37b2c392b 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -104,15 +104,22 @@ workloads = "aiperf.sh" synthetic-input-tokens-mean = 300 [cmd_args.aiperf_accuracy] + entrypoint = "aiperf profile" setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0" - - [cmd_args.aiperf_accuracy.args] - accuracy-benchmark = "mmlu" - accuracy-n-shots = 5 - accuracy-tasks = "abstract_algebra" - concurrency = 10 - extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' - num-requests = 100 + cli = ''' +--model {model} +--url {url} +--endpoint-type chat +--streaming +--artifact-dir {artifact_dir} +--no-server-metrics +--accuracy-benchmark mmlu +--accuracy-n-shots 5 +--accuracy-tasks abstract_algebra +--concurrency 10 +--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' +--num-requests 100 +''' [extra_env_vars] UCX_LOG_LEVEL = "warn" diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index e314fe743..583d11a88 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -96,15 +96,22 @@ workloads = "aiperf.sh" synthetic-input-tokens-mean = 300 [cmd_args.aiperf_accuracy] + entrypoint = "aiperf profile" setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" - - [cmd_args.aiperf_accuracy.args] - accuracy-benchmark = 
"mmlu" - accuracy-n-shots = 5 - accuracy-tasks = "abstract_algebra" - concurrency = 10 - extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' - num-requests = 100 + cli = ''' +--model {model} +--url {url} +--endpoint-type chat +--streaming +--artifact-dir {artifact_dir} +--no-server-metrics +--accuracy-benchmark mmlu +--accuracy-n-shots 5 +--accuracy-tasks abstract_algebra +--concurrency 10 +--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' +--num-requests 100 +''' [extra_env_vars] UCX_LOG_LEVEL = "warn" diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 54aa4c252..c00449681 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -130,26 +130,53 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it concurrency = 2 [cmd_args.aiperf_accuracy] + entrypoint = "aiperf profile" setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" - - [cmd_args.aiperf_accuracy.args] - accuracy-benchmark = "mmlu" - accuracy-n-shots = 5 - accuracy-tasks = "abstract_algebra" - concurrency = 10 - extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' - num-requests = 100 + cli = ''' + --model {model} + --url {url} + --endpoint-type chat + --streaming + --artifact-dir {artifact_dir} + --no-server-metrics + --accuracy-benchmark mmlu + --accuracy-n-shots 5 + --accuracy-tasks abstract_algebra + --concurrency 10 + --extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' + --num-requests 100 + ''' When ``cmd_args.aiperf_accuracy`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark. 
-The ``setup-cmd`` field is optional. It is useful for Dynamo images that include an older system ``aiperf`` build without -the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``. +The ``entrypoint`` and ``cli`` fields form the accuracy command. CloudAI expands ``{model}``, ``{url}``, +``{endpoint}``, ``{result_dir}``, and ``{artifact_dir}`` in ``cli`` before launching it. The ``setup-cmd`` field is +optional. It is useful for Dynamo images that include an older system ``aiperf`` build without the accuracy benchmark +plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``. MMLU is loaded from ``lighteval/mmlu``, so either allow Hugging Face dataset access or pre-cache that dataset before running with ``HF_HUB_OFFLINE``/``HF_DATASETS_OFFLINE`` enabled. For Qwen3 models, the example disables thinking mode so short MMLU answers can be parsed as choices. +Custom Accuracy Scripts +~~~~~~~~~~~~~~~~~~~~~~~ + +``cmd_args.aiperf_accuracy`` can also launch a custom mounted script instead of AIPerf. Mount the script or its parent +directory with ``extra_container_mounts`` and set ``entrypoint`` to the in-container command: + +.. code-block:: toml + + extra_container_mounts = ["/host/custom_accuracy:/custom_accuracy"] + + [cmd_args.aiperf_accuracy] + entrypoint = "python /custom_accuracy/dummy_accuracy.py" + cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping" + +CloudAI expands placeholders in ``cli`` and runs ``entrypoint`` with that CLI string. The custom command must write +``accuracy_results.csv`` inside ``{artifact_dir}`` with an ``OVERALL`` row. CloudAI copies that file to the run output +directory and exposes the same ``accuracy`` metric as AIPerf accuracy mode. 
+ Review Benchmark Results ------------------------ diff --git a/src/cloudai/workloads/ai_dynamo/accuracy.sh b/src/cloudai/workloads/ai_dynamo/accuracy.sh new file mode 100644 index 000000000..0e85ee109 --- /dev/null +++ b/src/cloudai/workloads/ai_dynamo/accuracy.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -Eeuo pipefail + +result_dir="" +model="" +url="http://localhost" +port=8000 +endpoint="v1/chat/completions" +entrypoint="" +cli="" +setup_cmd="" +artifact_dir_name="aiperf_accuracy_artifacts" + +log() { + echo "[$(date '+%F %T') $(hostname)]: $*" +} + +process_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --result-dir) result_dir="$2"; shift 2 ;; + --model) model="$2"; shift 2 ;; + --url) url="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --endpoint) endpoint="$2"; shift 2 ;; + --entrypoint) entrypoint="$2"; shift 2 ;; + --cli) cli="$2"; shift 2 ;; + --setup-cmd) setup_cmd="$2"; shift 2 ;; + --artifact-dir-name) artifact_dir_name="$2"; shift 2 ;; + --) shift; break ;; + --*) if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;; + *) shift ;; + esac + done + + log "Parsed args: + result_dir: $result_dir + model: $model + url: $url + port: $port + endpoint: $endpoint + entrypoint: $entrypoint + setup_cmd: ${setup_cmd:-} + artifact_dir: $artifact_dir_name + cli: ${cli:-}" +} + +run_setup_cmd() { + if [[ -z "$setup_cmd" ]]; then + return + fi + + log "Running accuracy setup command: $setup_cmd" + bash -lc "$setup_cmd" + log "Accuracy setup command complete" +} + +expand_cli() { + local artifact_dir="$1" + local full_url="$2" + local expanded="$cli" + + expanded="${expanded//\{model\}/$model}" + expanded="${expanded//\{url\}/$full_url}" + expanded="${expanded//\{endpoint\}/$endpoint}" + expanded="${expanded//\{result_dir\}/$result_dir}" + expanded="${expanded//\{artifact_dir\}/$artifact_dir}" + expanded="${expanded//$'\n'/ }" + + echo "$expanded" +} + +copy_accuracy_results() { + local artifact_dir="$1" + local accuracy_path="$artifact_dir/accuracy_results.csv" + + if [[ ! 
-s "$accuracy_path" ]]; then + log "ERROR: accuracy benchmark was requested but $accuracy_path was not produced" + exit 1 + fi + + cp "$accuracy_path" "$result_dir/accuracy_results.csv" + log "accuracy report saved to $result_dir/accuracy_results.csv" +} + +main() { + process_args "$@" + + if [[ -z "$result_dir" ]]; then + log "ERROR: --result-dir is required"; exit 1 + fi + if [[ -z "$model" ]]; then + log "ERROR: --model is required"; exit 1 + fi + if [[ -z "$entrypoint" ]]; then + log "ERROR: --entrypoint is required"; exit 1 + fi + + run_setup_cmd + + local full_url="${url}:${port}" + local artifact_dir="$result_dir/$artifact_dir_name" + rm -rf "$artifact_dir" + mkdir -p "$artifact_dir" + + local expanded_cli + expanded_cli="$(expand_cli "$artifact_dir" "$full_url")" + + log "Launching accuracy command: $entrypoint $expanded_cli" + bash -lc "$entrypoint $expanded_cli" + log "accuracy command complete" + + copy_accuracy_results "$artifact_dir" +} + +main "$@" +exit 0 diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 8c5dcaad1..35da5b782 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -310,26 +310,20 @@ class AIPerf(Workload): def installables(self) -> list[Installable]: return [self.script] - @property - def has_accuracy_benchmark(self) -> bool: - args_extra = getattr(self.args, "model_extra", {}) or {} - if args_extra.get("accuracy-benchmark") or args_extra.get("accuracy_benchmark"): - return True - extra_args = self.extra_args or "" - if isinstance(extra_args, list): - return "--accuracy-benchmark" in extra_args - return "--accuracy-benchmark" in extra_args +class AIPerfAccuracy(BaseModel): + """Optional accuracy benchmark configuration.""" - -class AIPerfAccuracy(AIPerf): - """Optional AIPerf accuracy benchmark configuration.""" + model_config = ConfigDict(extra="forbid", populate_by_name=True) name: str = "aiperf_accuracy" - 
report_name: str = Field( - default="aiperf_accuracy_report.csv", - serialization_alias="report-name", - validation_alias=AliasChoices("report-name", "report_name"), + entrypoint: str = "aiperf profile" + cli: str + script: File = File(Path(__file__).parent.parent / "ai_dynamo/accuracy.sh") + setup_cmd: str | None = Field( + default=None, + serialization_alias="setup-cmd", + validation_alias=AliasChoices("setup-cmd", "setup_cmd"), ) artifact_dir_name: str = Field( default=AIPERF_ACCURACY_ARTIFACTS_DIR, @@ -337,6 +331,10 @@ class AIPerfAccuracy(AIPerf): validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"), ) + @property + def installables(self) -> list[Installable]: + return [self.script] + class Constraints(BaseModel): """Constraints for validation of AI Dynamo configurations when using DSE.""" @@ -410,9 +408,6 @@ def workload_scripts(self) -> "AIDynamoTestDefinition": if workload not in workload_map: raise ValueError(f"Invalid workload: {workload}. Available workloads: {list(workload_map.keys())}") - if self.cmd_args.aiperf_accuracy is not None and not self.cmd_args.aiperf_accuracy.has_accuracy_benchmark: - raise ValueError("cmd_args.aiperf_accuracy must configure an AIPerf --accuracy-benchmark argument") - return self def get_workload_map(self) -> dict[str, Workload]: diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh index bf08f869d..15cee3a58 100644 --- a/src/cloudai/workloads/ai_dynamo/aiperf.sh +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -49,7 +49,7 @@ artifact_dir_name="aiperf_artifacts" cmd="aiperf profile" setup_cmd="" declare -a extra_args=() -declare -a aiperf_profile_args=() +declare -a profile_args=() log() { echo "[$(date '+%F %T') $(hostname)]: $*" @@ -58,27 +58,17 @@ log() { _parse_aiperf_args() { while [[ $# -ge 2 ]]; do case "$1" in - --*) aiperf_profile_args+=("$1" "$2"); shift 2 ;; + --*) profile_args+=("$1" "$2"); shift 2 ;; *) shift ;; esac done # Capture a trailing lone 
boolean flag if present. # Use if/fi — not [[ ]] && — so set -e does not trigger on a false condition. if [[ $# -eq 1 && "$1" == --* ]]; then - aiperf_profile_args+=("$1") + profile_args+=("$1") fi } -has_accuracy_benchmark() { - local arg - for arg in "${aiperf_profile_args[@]}" "${extra_args[@]}"; do - if [[ "$arg" == "--accuracy-benchmark" ]]; then - return 0 - fi - done - return 1 -} - process_args() { while [[ $# -gt 0 ]]; do case "$1" in @@ -107,7 +97,7 @@ process_args() { cmd: $cmd setup_cmd: ${setup_cmd:-} extra_args: ${extra_args[*]:-} - profile_args: ${aiperf_profile_args[*]:-}" + profile_args: ${profile_args[*]:-}" } run_setup_cmd() { @@ -123,18 +113,6 @@ run_setup_cmd() { process_results() { local artifact_dir="$result_dir/$artifact_dir_name" local csv_path="" - local accuracy_path="$artifact_dir/accuracy_results.csv" - - if has_accuracy_benchmark; then - if [[ ! -s "$accuracy_path" ]]; then - log "ERROR: AIPerf accuracy benchmark was requested but $accuracy_path was not produced" - exit 1 - fi - - cp "$accuracy_path" "$result_dir/accuracy_results.csv" - log "aiperf accuracy report saved to $result_dir/accuracy_results.csv" - return 0 - fi if [[ -f "$artifact_dir/profile_export_aiperf.csv" ]]; then csv_path="$artifact_dir/profile_export_aiperf.csv" @@ -152,6 +130,35 @@ process_results() { } +run_aiperf() { + local full_url="$1" + local artifact_dir="$2" + local -a run_cmd=() + read -ra run_cmd <<< "$cmd" + local -a launch_cmd=( + "${run_cmd[@]}" + --model "$model" + --url "$full_url" + --endpoint-type chat + --streaming + --artifact-dir "$artifact_dir" + --no-server-metrics + ) + + log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url" + + if [[ "${#profile_args[@]}" -gt 0 ]]; then + launch_cmd+=("${profile_args[@]}") + fi + if [[ "${#extra_args[@]}" -gt 0 ]]; then + launch_cmd+=("${extra_args[@]}") + fi + + "${launch_cmd[@]}" + + log "aiperf run complete" +} + main() { process_args "$@" @@ -168,23 +175,7 @@ main() { local 
artifact_dir="$result_dir/$artifact_dir_name" rm -rf "$artifact_dir" - # Split cmd into an array (e.g. "aiperf profile" → ["aiperf", "profile"]) - local -a run_cmd=() - read -ra run_cmd <<< "$cmd" - - log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url" - - "${run_cmd[@]}" \ - --model "$model" \ - --url "$full_url" \ - --endpoint-type chat \ - --streaming \ - --artifact-dir "$artifact_dir" \ - --no-server-metrics \ - "${aiperf_profile_args[@]}" \ - "${extra_args[@]}" - - log "aiperf run complete" + run_aiperf "$full_url" "$artifact_dir" process_results } diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 51e704102..c1a817853 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. import logging +import shlex from pathlib import Path from typing import List, cast @@ -71,6 +72,8 @@ def _get_toml_args(self, base_model: BaseModel, prefix: str, exclude: List[str] str_v = str(v) if str_v.startswith("{") and str_v.endswith("}"): args.append(f"{prefix}{k} '{str_v}'") + elif any(char in str_v for char in ['"', "'", "\n"]): + args.append(f"{prefix}{k} {shlex.quote(str_v)}") else: args.append(f'{prefix}{k} "{v}"') diff --git a/tests/workloads/ai_dynamo/test_accuracy_script.py b/tests/workloads/ai_dynamo/test_accuracy_script.py new file mode 100644 index 000000000..a6e3b8246 --- /dev/null +++ b/tests/workloads/ai_dynamo/test_accuracy_script.py @@ -0,0 +1,123 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import subprocess +import sys +from pathlib import Path + +ACCURACY_SCRIPT = Path("src/cloudai/workloads/ai_dynamo/accuracy.sh") + + +def test_accuracy_script_runs_custom_accuracy_command(tmp_path: Path) -> None: + custom_script = tmp_path / "custom_accuracy.py" + custom_script.write_text( + """ +import argparse +import csv +import json +from pathlib import Path + +parser = argparse.ArgumentParser() +parser.add_argument("--model", required=True) +parser.add_argument("--url", required=True) +parser.add_argument("--endpoint", required=True) +parser.add_argument("--result-dir", required=True) +parser.add_argument("--artifact-dir", required=True) +parser.add_argument("--prompt", required=True) +args = parser.parse_args() + +artifact_dir = Path(args.artifact_dir) +artifact_dir.mkdir(parents=True, exist_ok=True) +(artifact_dir / "args.json").write_text(json.dumps(vars(args)), encoding="utf-8") +with (artifact_dir / "accuracy_results.csv").open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["task", "correct", "total", "accuracy"]) + writer.writerow(["OVERALL", 1, 1, "100.00%"]) +""", + encoding="utf-8", + ) + + result = subprocess.run( + [ + "bash", + str(ACCURACY_SCRIPT), + "--result-dir", + str(tmp_path), + "--model", + "Qwen/Qwen3-0.6B", + "--url", + "http://frontend", + "--port", + "8000", + "--endpoint", + "v1/chat/completions", + "--entrypoint", + f"{sys.executable} {custom_script}", + "--cli", + ( + "--model {model} --url {url} --endpoint {endpoint} " + "--result-dir {result_dir} --artifact-dir 
{artifact_dir} --prompt ping" + ), + "--artifact-dir-name", + "custom_accuracy_artifacts", + ], + check=False, + capture_output=True, + text=True, + ) + + assert result.returncode == 0, result.stderr + result.stdout + assert (tmp_path / "accuracy_results.csv").read_text(encoding="utf-8").splitlines()[-1] == "OVERALL,1,1,100.00%" + args = json.loads((tmp_path / "custom_accuracy_artifacts" / "args.json").read_text(encoding="utf-8")) + assert args == { + "model": "Qwen/Qwen3-0.6B", + "url": "http://frontend:8000", + "endpoint": "v1/chat/completions", + "result_dir": str(tmp_path), + "artifact_dir": str(tmp_path / "custom_accuracy_artifacts"), + "prompt": "ping", + } + + +def test_accuracy_script_fails_custom_accuracy_without_accuracy_csv(tmp_path: Path) -> None: + custom_script = tmp_path / "custom_accuracy.py" + custom_script.write_text("from pathlib import Path\nPath(__file__).exists()\n", encoding="utf-8") + + result = subprocess.run( + [ + "bash", + str(ACCURACY_SCRIPT), + "--result-dir", + str(tmp_path), + "--model", + "Qwen/Qwen3-0.6B", + "--url", + "http://frontend", + "--port", + "8000", + "--entrypoint", + f"{sys.executable} {custom_script}", + "--cli", + "--artifact-dir {artifact_dir}", + ], + check=False, + capture_output=True, + text=True, + ) + + assert result.returncode == 1 + assert "accuracy benchmark was requested" in result.stdout diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 9b0f695d5..7b036b5a8 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import shlex from pathlib import Path from typing import cast @@ -157,7 +158,20 @@ def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoS td = cast(AIDynamoTestDefinition, strategy.test_run.test) td.cmd_args.workloads = "aiperf.sh" setup_cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0" - extra_inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' + cli = ( + "--model {model} " + "--url {url} " + "--endpoint-type chat " + "--streaming " + "--artifact-dir {artifact_dir} " + "--no-server-metrics " + "--accuracy-benchmark mmlu " + "--accuracy-n-shots 5 " + "--accuracy-tasks abstract_algebra " + "--concurrency 10 " + '--extra-inputs \'{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}\' ' + "--num-requests 100" + ) td.cmd_args.aiperf = AIPerf.model_validate( { "args": { @@ -171,14 +185,7 @@ def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoS td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate( { "setup-cmd": setup_cmd, - "args": { - "accuracy-benchmark": "mmlu", - "accuracy-n-shots": 5, - "accuracy-tasks": "abstract_algebra", - "concurrency": 10, - "extra-inputs": extra_inputs, - "num-requests": 100, - }, + "cli": cli, } ) @@ -189,13 +196,25 @@ def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoS assert '--aiperf-args-output-tokens-mean "500"' in result assert f'--aiperf_accuracy-setup-cmd "{setup_cmd}"' in result assert '--aiperf_accuracy-name "aiperf_accuracy"' in result + assert '--aiperf_accuracy-entrypoint "aiperf profile"' in result assert '--aiperf_accuracy-artifact-dir-name "aiperf_accuracy_artifacts"' in result - assert '--aiperf_accuracy-args-accuracy-benchmark "mmlu"' in result - assert '--aiperf_accuracy-args-accuracy-n-shots "5"' in result - assert '--aiperf_accuracy-args-accuracy-tasks "abstract_algebra"' in result - assert '--aiperf_accuracy-args-concurrency "10"' in result - assert 
f"--aiperf_accuracy-args-extra-inputs '{extra_inputs}'" in result - assert '--aiperf_accuracy-args-num-requests "100"' in result + assert f"--aiperf_accuracy-cli {shlex.quote(cli)}" in result + + +def test_gen_script_args_contains_custom_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: + td = cast(AIDynamoTestDefinition, strategy.test_run.test) + cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping" + td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate( + { + "entrypoint": "python /custom_accuracy/dummy_accuracy.py", + "cli": cli, + } + ) + + result = strategy._gen_script_args(td) + + assert '--aiperf_accuracy-entrypoint "python /custom_accuracy/dummy_accuracy.py"' in result + assert f'--aiperf_accuracy-cli "{cli}"' in result def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None: diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py index 9afdd6e72..47e214421 100644 --- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py +++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py @@ -68,6 +68,10 @@ def get_aiperf_accuracy_csv_content() -> str: return "Task,Correct,Total,Accuracy\nabstract_algebra,35,100,35.00%\nOVERALL,35,100,35.00%\n" +def get_aiperf_accuracy_cli() -> str: + return "--model {model} --url {url} --artifact-dir {artifact_dir} --accuracy-benchmark mmlu" + + @pytest.fixture def ai_dynamo_tr(tmp_path: Path) -> TestRun: test = AIDynamoTestDefinition( @@ -143,7 +147,7 @@ def ai_dynamo_aiperf_with_split_accuracy_tr(tmp_path: Path) -> TestRun: ), ), aiperf=AIPerf(), - aiperf_accuracy=AIPerfAccuracy.model_validate({"args": {"accuracy-benchmark": "mmlu"}}), + aiperf_accuracy=AIPerfAccuracy.model_validate({"cli": get_aiperf_accuracy_cli()}), lmcache=LMCache(args=LMCacheArgs()), ), ) @@ -171,7 +175,7 @@ def ai_dynamo_genai_perf_with_split_accuracy_tr(tmp_path: Path) 
-> TestRun: ), ), genai_perf=GenAIPerf(), - aiperf_accuracy=AIPerfAccuracy.model_validate({"args": {"accuracy-benchmark": "mmlu"}}), + aiperf_accuracy=AIPerfAccuracy.model_validate({"cli": get_aiperf_accuracy_cli()}), lmcache=LMCache(args=LMCacheArgs()), ), ) From 1c99d609195007c5ada53b79276045f98bb278a3 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 27 May 2026 11:58:55 -0700 Subject: [PATCH 15/16] remove redundant test --- .../ai_dynamo/test_accuracy_script.py | 123 ------------------ 1 file changed, 123 deletions(-) delete mode 100644 tests/workloads/ai_dynamo/test_accuracy_script.py diff --git a/tests/workloads/ai_dynamo/test_accuracy_script.py b/tests/workloads/ai_dynamo/test_accuracy_script.py deleted file mode 100644 index a6e3b8246..000000000 --- a/tests/workloads/ai_dynamo/test_accuracy_script.py +++ /dev/null @@ -1,123 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import subprocess -import sys -from pathlib import Path - -ACCURACY_SCRIPT = Path("src/cloudai/workloads/ai_dynamo/accuracy.sh") - - -def test_accuracy_script_runs_custom_accuracy_command(tmp_path: Path) -> None: - custom_script = tmp_path / "custom_accuracy.py" - custom_script.write_text( - """ -import argparse -import csv -import json -from pathlib import Path - -parser = argparse.ArgumentParser() -parser.add_argument("--model", required=True) -parser.add_argument("--url", required=True) -parser.add_argument("--endpoint", required=True) -parser.add_argument("--result-dir", required=True) -parser.add_argument("--artifact-dir", required=True) -parser.add_argument("--prompt", required=True) -args = parser.parse_args() - -artifact_dir = Path(args.artifact_dir) -artifact_dir.mkdir(parents=True, exist_ok=True) -(artifact_dir / "args.json").write_text(json.dumps(vars(args)), encoding="utf-8") -with (artifact_dir / "accuracy_results.csv").open("w", newline="", encoding="utf-8") as f: - writer = csv.writer(f) - writer.writerow(["task", "correct", "total", "accuracy"]) - writer.writerow(["OVERALL", 1, 1, "100.00%"]) -""", - encoding="utf-8", - ) - - result = subprocess.run( - [ - "bash", - str(ACCURACY_SCRIPT), - "--result-dir", - str(tmp_path), - "--model", - "Qwen/Qwen3-0.6B", - "--url", - "http://frontend", - "--port", - "8000", - "--endpoint", - "v1/chat/completions", - "--entrypoint", - f"{sys.executable} {custom_script}", - "--cli", - ( - "--model {model} --url {url} --endpoint {endpoint} " - "--result-dir {result_dir} --artifact-dir {artifact_dir} --prompt ping" - ), - "--artifact-dir-name", - "custom_accuracy_artifacts", - ], - check=False, - capture_output=True, - text=True, - ) - - assert result.returncode == 0, result.stderr + result.stdout - assert (tmp_path / "accuracy_results.csv").read_text(encoding="utf-8").splitlines()[-1] == "OVERALL,1,1,100.00%" - args = json.loads((tmp_path / "custom_accuracy_artifacts" / 
"args.json").read_text(encoding="utf-8")) - assert args == { - "model": "Qwen/Qwen3-0.6B", - "url": "http://frontend:8000", - "endpoint": "v1/chat/completions", - "result_dir": str(tmp_path), - "artifact_dir": str(tmp_path / "custom_accuracy_artifacts"), - "prompt": "ping", - } - - -def test_accuracy_script_fails_custom_accuracy_without_accuracy_csv(tmp_path: Path) -> None: - custom_script = tmp_path / "custom_accuracy.py" - custom_script.write_text("from pathlib import Path\nPath(__file__).exists()\n", encoding="utf-8") - - result = subprocess.run( - [ - "bash", - str(ACCURACY_SCRIPT), - "--result-dir", - str(tmp_path), - "--model", - "Qwen/Qwen3-0.6B", - "--url", - "http://frontend", - "--port", - "8000", - "--entrypoint", - f"{sys.executable} {custom_script}", - "--cli", - "--artifact-dir {artifact_dir}", - ], - check=False, - capture_output=True, - text=True, - ) - - assert result.returncode == 1 - assert "accuracy benchmark was requested" in result.stdout From 1d09c7d845b7036c28c6d80302af846eba845088 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 27 May 2026 12:06:07 -0700 Subject: [PATCH 16/16] support custom scripts for vllm and sglang --- conf/experimental/sglang/test/sglang.toml | 4 ++-- conf/experimental/vllm/test/vllm.toml | 4 ++-- doc/workloads/sglang.rst | 17 +++++++------- doc/workloads/vllm.rst | 13 ++++++----- src/cloudai/workloads/common/llm_serving.py | 2 ++ src/cloudai/workloads/sglang/sglang.py | 6 +++-- .../sglang/slurm_command_gen_strategy.py | 13 ++--------- .../vllm/slurm_command_gen_strategy.py | 12 ++-------- src/cloudai/workloads/vllm/vllm.py | 6 +++-- .../sglang/test_command_gen_strategy_slurm.py | 16 +++++-------- .../vllm/test_command_gen_strategy_slurm.py | 23 +++++++++++++++---- 11 files changed, 59 insertions(+), 57 deletions(-) diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml index e6d2c09b4..2866f656c 100644 --- a/conf/experimental/sglang/test/sglang.toml +++ 
b/conf/experimental/sglang/test/sglang.toml @@ -22,8 +22,8 @@ test_template_name = "sglang" docker_image_url = "lmsysorg/sglang:dev-cu13" [semantic_eval_cmd_args] -module = "sglang.test.run_eval" -args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" +entrypoint = "python3 -m sglang.test.run_eval" +cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" [extra_env_vars] UCX_NET_DEVICES = "all" diff --git a/conf/experimental/vllm/test/vllm.toml b/conf/experimental/vllm/test/vllm.toml index 891023201..a8061099c 100644 --- a/conf/experimental/vllm/test/vllm.toml +++ b/conf/experimental/vllm/test/vllm.toml @@ -27,8 +27,8 @@ mount_as = "/vllm_repo" docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1" [semantic_eval_cmd_args] -script = "/vllm_repo/tests/evals/gsm8k/gsm8k_eval.py" -args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" +entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py" +cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" [extra_env_vars] UCX_NET_DEVICES = "all" diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst index d0561c773..cdbd5cff1 100644 --- a/doc/workloads/sglang.rst +++ b/doc/workloads/sglang.rst @@ -29,8 +29,8 @@ Test + Scenario example num_prompts = 30 [semantic_eval_cmd_args] - module = "sglang.test.run_eval" - args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + entrypoint = "python3 -m sglang.test.run_eval" + cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" .. 
code-block:: toml @@ -81,18 +81,19 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva :caption: test.toml (semantic validation) [semantic_eval_cmd_args] - module = "sglang.test.run_eval" - args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + entrypoint = "python3 -m sglang.test.run_eval" + cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" -For images that still use the legacy SGLang GSM8K runner, override the module and raw arguments: +For images that still use the legacy SGLang GSM8K runner, override the entrypoint and raw CLI: .. code-block:: toml [semantic_eval_cmd_args] - module = "sglang.test.few_shot_gsm8k" - args = "--num-questions 200" + entrypoint = "python3 -m sglang.test.few_shot_gsm8k" + cli = "--host {host} --port {port} --num-questions 200" -The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders. +The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}`` +placeholders. Control number of GPUs diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index 930bcf11b..57773992f 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -29,8 +29,8 @@ Test and Scenario Examples num_prompts = 30 [semantic_eval_cmd_args] - script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" - args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" + cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" .. 
code-block:: toml @@ -81,13 +81,14 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva :caption: test.toml (semantic validation) [semantic_eval_cmd_args] - script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" - args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" + cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" If the runtime image does not contain the eval script, mount a vLLM repository with existing ``git_repos`` support and -point ``script`` at the mounted path. +point ``entrypoint`` at the mounted path. -The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders. +The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}`` +placeholders. Controlling the Number of GPUs diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index 87ad7b3a3..30a6943c1 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -624,7 +624,9 @@ def _expand_semantic_eval_args(self, args: str, *, host: str) -> str: "{model}": self.tdef.cmd_args.model, "{host}": host, "{port}": str(self.serve_port), + "{url}": f"{host}:{self.serve_port}", "{output_path}": str(self.test_run.output_path.absolute()), + "{result_dir}": str(self.test_run.output_path.absolute()), } for placeholder, value in replacements.items(): args = args.replace(placeholder, value) diff --git a/src/cloudai/workloads/sglang/sglang.py b/src/cloudai/workloads/sglang/sglang.py index 338bbfecc..49a7af140 100644 --- a/src/cloudai/workloads/sglang/sglang.py +++ b/src/cloudai/workloads/sglang/sglang.py @@ -92,8 +92,10 @@ class SglangBenchCmdArgs(CmdArgs): class SglangSemanticEvalCmdArgs(CmdArgs): """SGLang semantic validation command arguments.""" - module: str = 
"sglang.test.run_eval" - args: str = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + model_config = ConfigDict(extra="forbid") + + entrypoint: str = "python3 -m sglang.test.run_eval" + cli: str = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" class SglangTestDefinition(LLMServingTestDefinition[SglangCmdArgs]): diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py index f1c7c741c..7a7a97d5b 100644 --- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py @@ -130,17 +130,8 @@ def get_semantic_eval_command(self) -> list[str] | None: return None host = self.bench_host - command = [ - "python3", - "-m", - eval_args.module, - f"--host {host}", - f"--port {self.serve_port}", - ] - args = self._expand_semantic_eval_args(eval_args.args, host=host) - if args: - command.append(args) - return command + cli = self._expand_semantic_eval_args(eval_args.cli, host=host) + return [eval_args.entrypoint, cli] if cli else [eval_args.entrypoint] def aggregated_serve_env(self) -> dict[str, str]: return {"CUDA_VISIBLE_DEVICES": ",".join(str(gpu_id) for gpu_id in self.gpu_ids)} diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 13d87ad77..2f00e95f7 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -130,13 +130,5 @@ def get_semantic_eval_command(self) -> list[str] | None: host = self.bench_host http_host = host if host.startswith("http://") or host.startswith("https://") else f"http://{host}" - command = [ - "python3", - eval_args.script, - f"--host {http_host}", - f"--port {self.serve_port}", - ] - args = self._expand_semantic_eval_args(eval_args.args, host=http_host) - if args: - 
command.append(args) - return command + cli = self._expand_semantic_eval_args(eval_args.cli, host=http_host) + return [eval_args.entrypoint, cli] if cli else [eval_args.entrypoint] diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index d2fda3ab5..f77039edc 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -92,8 +92,10 @@ class VllmBenchCmdArgs(CmdArgs): class VllmSemanticEvalCmdArgs(CmdArgs): """vLLM semantic validation command arguments.""" - script: str = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" - args: str = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + model_config = ConfigDict(extra="forbid") + + entrypoint: str = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" + cli: str = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json" class VllmTestDefinition(LLMServingTestDefinition[VllmCmdArgs]): diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py index 7d2812580..c07d1771d 100644 --- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py +++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py @@ -150,28 +150,24 @@ def test_get_sglang_semantic_eval_command_defaults(sglang_cmd_gen_strategy: Sgla command = sglang_cmd_gen_strategy.get_semantic_eval_command() assert command == [ - "python3", - "-m", - "sglang.test.run_eval", - "--host ${NODE}", - "--port 8000", - "--eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B", + "python3 -m sglang.test.run_eval", + "--host ${NODE} --port 8000 --eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B", ] -def test_get_sglang_semantic_eval_command_supports_custom_module_and_args( +def test_get_sglang_semantic_eval_command_supports_custom_entrypoint_and_cli( sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy, ): sglang_test = 
cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test) sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs( - module="sglang.test.few_shot_gsm8k", - args="--num-questions 200 --data-path {output_path}/gsm8k.jsonl --seen {host}:{port}", + entrypoint="python3 /custom/semantic_eval.py", + cli="--num-questions 200 --data-path {result_dir}/gsm8k.jsonl --seen {url}", ) command = sglang_cmd_gen_strategy.get_semantic_eval_command() assert command is not None - assert command[2] == "sglang.test.few_shot_gsm8k" + assert command[0] == "python3 /custom/semantic_eval.py" assert command[-1] == ( f"--num-questions 200 --data-path {sglang_cmd_gen_strategy.test_run.output_path.absolute()}/gsm8k.jsonl " "--seen ${NODE}:8000" diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 6bd6ada36..6eb62483c 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -193,14 +193,29 @@ def test_get_vllm_semantic_eval_command_defaults(self, vllm_cmd_gen_strategy: Vl command = vllm_cmd_gen_strategy.get_semantic_eval_command() assert command == [ - "python3", - "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py", - "--host http://${NODE}", - "--port 8000", + "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py", + "--host http://${NODE} --port 8000 " "--num-questions 200 --save-results " f"{vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json", ] + def test_get_vllm_semantic_eval_command_supports_custom_entrypoint_and_cli( + self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy + ) -> None: + vllm_test = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test) + vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs( + entrypoint="python3 /custom/eval.py", + cli="--model {model} --api {url} --out {result_dir}/vllm-gsm8k.json", + ) + + command = vllm_cmd_gen_strategy.get_semantic_eval_command() + + assert 
command == [ + "python3 /custom/eval.py", + f"--model Qwen/Qwen3-0.6B --api http://${{NODE}}:8000 " + f"--out {vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json", + ] + def test_gen_srun_command_contains_vllm_semantic_eval( self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy ) -> None: