From 27291976d331249e0cd164e9aba3479c0e0b7b20 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 26 May 2026 19:04:13 -0700
Subject: [PATCH 01/16] implement semantic degradation for aidynamo using
 aiperf

---
 conf/experimental/ai_dynamo/test/sglang.toml  |   8 +-
 conf/experimental/ai_dynamo/test/vllm.toml    |   8 +-
 doc/workloads/ai_dynamo.rst                   |  33 +++++-
 src/cloudai/workloads/ai_dynamo/ai_dynamo.py  | 101 ++++++++++++++++++
 src/cloudai/workloads/ai_dynamo/aiperf.sh     |  16 ++-
 .../ai_dynamo/report_generation_strategy.py   |   9 ++
 .../test_command_gen_strategy_slurm.py        |  22 ++++
 .../ai_dynamo/test_report_gen_strategy.py     |  74 +++++++++++++
 8 files changed, 260 insertions(+), 11 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index 62f8f615d..28ec71d60 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -20,8 +20,8 @@ test_template_name = "AIDynamo"
 extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
 
 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
-workloads = "genai_perf.sh"
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.1.1"
+workloads = "aiperf.sh"
 
   [cmd_args.dynamo]
   backend = "sglang"
@@ -96,7 +96,11 @@ workloads = "genai_perf.sh"
   [cmd_args.aiperf]
 
     [cmd_args.aiperf.args]
+    accuracy-benchmark = "mmlu"
+    accuracy-n-shots = 5
+    accuracy-tasks = "abstract_algebra"
     concurrency = 2
+    extra-inputs = '{"temperature":0,"stop":["\n"]}'
     request-count = 50
     synthetic-input-tokens-mean = 300
     output-tokens-mean = 500
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index 193510728..91f12b08b 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -20,8 +20,8 @@ test_template_name = "AIDynamo"
 extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
 
 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1"
-workloads = "genai_perf.sh"
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1"
+workloads = "aiperf.sh"
 
   [cmd_args.dynamo]
   backend = "vllm"
@@ -88,7 +88,11 @@ workloads = "genai_perf.sh"
   [cmd_args.aiperf]
 
     [cmd_args.aiperf.args]
+    accuracy-benchmark = "mmlu"
+    accuracy-n-shots = 5
+    accuracy-tasks = "abstract_algebra"
     concurrency = 2
+    extra-inputs = '{"temperature":0,"stop":["\n"]}'
     request-count = 50
     synthetic-input-tokens-mean = 300
     output-tokens-mean = 500
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 023d92bf2..b266b0d77 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -87,14 +87,14 @@ The frontend node will initially wait to allow weight loading on all nodes. Once
 Choosing a Benchmark Tool
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The benchmark tool is controlled by the ``workloads`` field in the test TOML. The default is ``aiperf.sh``:
+The benchmark tool is controlled by the ``workloads`` field in the test TOML. Set ``aiperf.sh`` to use AIPerf:
 
 .. code-block:: toml
 
    [cmd_args]
-   workloads = "aiperf.sh"   # default — uses aiperf, writes aiperf_report.csv
+   workloads = "aiperf.sh"   # uses aiperf, writes aiperf_report.csv
 
-To use genai-perf instead, set:
+To use genai-perf, set:
 
 .. code-block:: toml
 
@@ -110,17 +110,40 @@ To use genai-perf instead, set:
      output-tokens-mean = 500
      request-count = 50
 
+Semantic Degradation With AIPerf Accuracy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it by adding AIPerf accuracy flags under
+``[cmd_args.aiperf.args]`` and running the ``aiperf.sh`` workload:
+
+.. code-block:: toml
+
+   [cmd_args]
+   workloads = "aiperf.sh"
+
+     [cmd_args.aiperf.args]
+     accuracy-benchmark = "mmlu"
+     accuracy-n-shots = 5
+     accuracy-tasks = "abstract_algebra"
+     extra-inputs = '{"temperature":0,"stop":["\n"]}'
+
+When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the
+``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction.
+
 Review Benchmark Results
 ------------------------
 
 After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field:
 
-- ``aiperf.sh`` (default) → ``aiperf_report.csv``
+- ``aiperf.sh`` → ``aiperf_report.csv``
 - ``genai_perf.sh`` → ``genai_perf_report.csv``
 
+If AIPerf accuracy mode is enabled, CloudAI also copies ``aiperf_artifacts/accuracy_results.csv`` to
+``accuracy_results.csv`` in the run output directory.
+
 Navigate to ``./results/<scenario>/<test-id>/0/`` and open the CSV to examine performance metrics.
 
-Example ``aiperf_report.csv`` (default):
+Example ``aiperf_report.csv``:
 
 ::
 
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index 01912f0c1..7016fc42c 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import csv
 import logging
 from pathlib import Path
 from typing import Literal, Optional, cast
@@ -40,6 +41,89 @@
 from cloudai.models.workload import CmdArgs, TestDefinition
 from cloudai.systems.slurm import SlurmSystem
 
+AIPERF_ARTIFACTS_DIR = "aiperf_artifacts"
+AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv"
+
+
+def _parse_accuracy_value(value: str | int | float | None) -> float | None:
+    if value is None:
+        return None
+    if isinstance(value, (int, float)):
+        accuracy = float(value)
+        return accuracy / 100 if accuracy > 1 else accuracy
+
+    raw_value = value.strip()
+    if not raw_value:
+        return None
+
+    is_percentage = raw_value.endswith("%")
+    if is_percentage:
+        raw_value = raw_value[:-1].strip()
+
+    try:
+        accuracy = float(raw_value)
+    except ValueError:
+        return None
+
+    return accuracy / 100 if is_percentage or accuracy > 1 else accuracy
+
+
+def _parse_count_value(value: str | int | float | None) -> float | None:
+    if value is None:
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    try:
+        return float(value.strip())
+    except ValueError:
+        return None
+
+
+def parse_aiperf_accuracy(output_path: Path) -> float | None:
+    """
+    Parse AIPerf accuracy from accuracy_results.csv.
+
+    Expected CSV format:
+        Task,Correct,Total,Accuracy
+        abstract_algebra,35,100,35.00%
+        OVERALL,8368,14042,59.59%
+
+    AIPerf writes this file under aiperf_artifacts; CloudAI's wrapper also copies
+    it to the run output directory when present. The returned value is normalized
+    to a 0.0-1.0 fraction.
+    """
+    candidates = [
+        output_path / AIPERF_ACCURACY_RESULTS_CSV,
+        output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV,
+    ]
+
+    for csv_file in candidates:
+        if not csv_file.exists() or csv_file.stat().st_size == 0:
+            continue
+
+        fallback_accuracy: float | None = None
+        with csv_file.open(newline="", encoding="utf-8") as f:
+            for row in csv.DictReader(f):
+                accuracy = _parse_accuracy_value(row.get("Accuracy") or row.get("accuracy") or row.get("Value"))
+                if accuracy is None:
+                    correct = _parse_count_value(row.get("Correct") or row.get("correct"))
+                    total = _parse_count_value(row.get("Total") or row.get("total"))
+                    if correct is not None and total:
+                        accuracy = correct / total
+                if accuracy is None:
+                    continue
+
+                task = (row.get("Task") or row.get("task") or row.get("Metric") or "").strip().upper()
+                if task == "OVERALL":
+                    return accuracy
+                if fallback_accuracy is None:
+                    fallback_accuracy = accuracy
+
+        if fallback_accuracy is not None:
+            return fallback_accuracy
+
+    return None
+
 
 class Args(BaseModel):
     """Arguments for custom workloads."""
@@ -300,6 +384,17 @@ class AIPerf(Workload):
     def installables(self) -> list[Installable]:
         return [self.script]
 
+    @property
+    def has_accuracy_benchmark(self) -> bool:
+        args_extra = getattr(self.args, "model_extra", {}) or {}
+        if args_extra.get("accuracy-benchmark") or args_extra.get("accuracy_benchmark"):
+            return True
+
+        extra_args = self.extra_args or ""
+        if isinstance(extra_args, list):
+            return "--accuracy-benchmark" in extra_args
+        return "--accuracy-benchmark" in extra_args
+
 
 class Constraints(BaseModel):
     """Constraints for validation of AI Dynamo configurations when using DSE."""
@@ -435,6 +530,12 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
             else:
                 logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}")
 
+            if workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark:
+                accuracy = parse_aiperf_accuracy(output_path)
+                if accuracy is None:
+                    logging.info(f"AIPerf accuracy results not found in {output_path}.")
+                    result = False
+
         return JobStatusResult(result)
 
     def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool:
diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh
index 9f5a78b33..f0826686a 100644
--- a/src/cloudai/workloads/ai_dynamo/aiperf.sh
+++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh
@@ -94,8 +94,15 @@ process_args() {
 
 process_results() {
   local artifact_dir="$result_dir/aiperf_artifacts"
-  local csv_path
-  csv_path=$(find "$artifact_dir" -name "*.csv" -print -quit 2>/dev/null || true)
+  local csv_path=""
+  local accuracy_path="$artifact_dir/accuracy_results.csv"
+
+  if [[ -f "$artifact_dir/profile_export_aiperf.csv" ]]; then
+    csv_path="$artifact_dir/profile_export_aiperf.csv"
+  else
+    csv_path=$(find "$artifact_dir" -name "*aiperf*.csv" -print -quit 2>/dev/null || true)
+  fi
+
   if [[ -n "$csv_path" ]]; then
     cp "$csv_path" "$result_dir/$report_name"
     log "aiperf report saved to $result_dir/$report_name"
@@ -103,6 +110,11 @@ process_results() {
     log "ERROR: no CSV found in $artifact_dir — aiperf may not have completed"
     exit 1
   fi
+
+  if [[ -f "$accuracy_path" ]]; then
+    cp "$accuracy_path" "$result_dir/accuracy_results.csv"
+    log "aiperf accuracy report saved to $result_dir/accuracy_results.csv"
+  fi
 }
 
 main() {
diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
index a8e4e91b8..a665b0d6a 100644
--- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
@@ -21,6 +21,7 @@
 
 from cloudai.core import METRIC_ERROR, MetricValue, ReportGenerationStrategy
 from cloudai.util.lazy_imports import lazy
+from cloudai.workloads.ai_dynamo.ai_dynamo import AIDynamoTestDefinition, parse_aiperf_accuracy
 
 
 class AIDynamoReportGenerationStrategy(ReportGenerationStrategy):
@@ -44,6 +45,14 @@ def extract_metric_from_csv(self, csv_file: Path, metric_name: str, metric_type:
 
     def get_metric(self, metric: str) -> MetricValue:
         logging.info(f"Getting metric: {metric}")
+
+        if metric.lower() == "accuracy":
+            tdef = self.test_run.test
+            if not isinstance(tdef, AIDynamoTestDefinition) or not tdef.cmd_args.aiperf.has_accuracy_benchmark:
+                return METRIC_ERROR
+            accuracy = parse_aiperf_accuracy(self.test_run.output_path)
+            return accuracy if accuracy is not None else METRIC_ERROR
+
         metric_name = metric
         metric_type = "avg"
 
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index a0a028caa..034576f89 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 
 from pathlib import Path
+from typing import cast
 
 import pytest
 
@@ -26,6 +27,7 @@
     AIDynamoCmdArgs,
     AIDynamoSlurmCommandGenStrategy,
     AIDynamoTestDefinition,
+    AIPerf,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
@@ -148,3 +150,23 @@ def test_dynamo_cmd(
 ) -> None:
     result = strategy.gen_dynamo_cmd(module, Path(config))
     assert result.strip() == expected
+
+
+def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
+    td = cast(AIDynamoTestDefinition, strategy.test_run.test)
+    td.cmd_args.workloads = "aiperf.sh"
+    td.cmd_args.aiperf = AIPerf.model_validate(
+        {
+            "args": {
+                "accuracy-benchmark": "mmlu",
+                "accuracy-n-shots": 5,
+                "accuracy-tasks": "abstract_algebra",
+            }
+        }
+    )
+
+    result = strategy._gen_script_args(td)
+
+    assert '--aiperf-args-accuracy-benchmark "mmlu"' in result
+    assert '--aiperf-args-accuracy-n-shots "5"' in result
+    assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result
diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
index 0e51c414f..0ba90024d 100644
--- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py
+++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
@@ -32,6 +32,7 @@
     WorkerBaseArgs,
     WorkerConfig,
 )
+from cloudai.workloads.ai_dynamo.ai_dynamo import parse_aiperf_accuracy
 from cloudai.workloads.ai_dynamo.report_generation_strategy import AIDynamoReportGenerationStrategy
 
 
@@ -62,6 +63,10 @@ def get_aiperf_csv_content() -> str:
     )
 
 
+def get_aiperf_accuracy_csv_content() -> str:
+    return "Task,Correct,Total,Accuracy\nabstract_algebra,35,100,35.00%\nOVERALL,35,100,35.00%\n"
+
+
 @pytest.fixture
 def ai_dynamo_tr(tmp_path: Path) -> TestRun:
     test = AIDynamoTestDefinition(
@@ -70,6 +75,7 @@ def ai_dynamo_tr(tmp_path: Path) -> TestRun:
         test_template_name="t",
         cmd_args=AIDynamoCmdArgs(
             docker_image_url="http://url",
+            workloads="genai_perf.sh",
             dynamo=AIDynamoArgs(
                 prefill_worker=WorkerConfig(
                     cmd="python3 -m dynamo.vllm --is-prefill-worker",
@@ -119,6 +125,33 @@ def ai_dynamo_aiperf_tr(tmp_path: Path) -> TestRun:
     return tr
 
 
+@pytest.fixture
+def ai_dynamo_aiperf_accuracy_tr(tmp_path: Path) -> TestRun:
+    test = AIDynamoTestDefinition(
+        name="ai_dynamo_aiperf_accuracy",
+        description="desc",
+        test_template_name="t",
+        cmd_args=AIDynamoCmdArgs(
+            docker_image_url="http://url",
+            workloads="aiperf.sh",
+            dynamo=AIDynamoArgs(
+                prefill_worker=WorkerConfig(
+                    cmd="python3 -m dynamo.vllm --is-prefill-worker",
+                    worker_initialized_regex="VllmWorker.*has.been.initialized",
+                    args=WorkerBaseArgs(),
+                ),
+            ),
+            aiperf=AIPerf.model_validate({"args": {"accuracy-benchmark": "mmlu"}}),
+            lmcache=LMCache(args=LMCacheArgs()),
+        ),
+    )
+    tr = TestRun(name="ai_dynamo_aiperf_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path)
+    (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content())
+    (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content())
+    (tr.output_path / test.success_marker).touch()
+    return tr
+
+
 @pytest.fixture
 def csv_content() -> str:
     return get_csv_content()
@@ -161,6 +194,20 @@ def test_ai_dynamo_get_metric_aiperf(slurm_system: SlurmSystem, ai_dynamo_aiperf
     assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47
 
 
+def test_ai_dynamo_get_metric_aiperf_accuracy(slurm_system: SlurmSystem, ai_dynamo_aiperf_accuracy_tr: TestRun) -> None:
+    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_accuracy_tr)
+
+    assert strategy.get_metric("accuracy") == 0.35
+
+
+def test_ai_dynamo_accuracy_metric_requires_aiperf_accuracy_config(
+    slurm_system: SlurmSystem, ai_dynamo_aiperf_tr: TestRun
+) -> None:
+    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_tr)
+
+    assert strategy.get_metric("accuracy") == METRIC_ERROR
+
+
 def test_ai_dynamo_get_metric_invalid(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
     strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
 
@@ -176,9 +223,36 @@ def test_was_run_successful(ai_dynamo_tr: TestRun) -> None:
     assert result.is_successful is True
 
 
+def test_was_run_successful_with_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None:
+    test_def = ai_dynamo_aiperf_accuracy_tr.test
+    result = test_def.was_run_successful(ai_dynamo_aiperf_accuracy_tr)
+    assert result.is_successful is True
+
+
+def test_was_run_successful_requires_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None:
+    test_def = ai_dynamo_aiperf_accuracy_tr.test
+    (ai_dynamo_aiperf_accuracy_tr.output_path / "accuracy_results.csv").unlink()
+    result = test_def.was_run_successful(ai_dynamo_aiperf_accuracy_tr)
+    assert result.is_successful is False
+
+
 def test_was_run_successful_no_results(ai_dynamo_tr: TestRun, tmp_path: Path) -> None:
     test_def = ai_dynamo_tr.test
     ai_dynamo_tr.output_path = tmp_path / "empty_output"
     ai_dynamo_tr.output_path.mkdir(parents=True, exist_ok=True)
     result = test_def.was_run_successful(ai_dynamo_tr)
     assert result.is_successful is False
+
+
+def test_parse_aiperf_accuracy_from_artifact_dir(tmp_path: Path) -> None:
+    artifact_dir = tmp_path / "aiperf_artifacts"
+    artifact_dir.mkdir()
+    (artifact_dir / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content(), encoding="utf-8")
+
+    assert parse_aiperf_accuracy(tmp_path) == 0.35
+
+
+def test_parse_aiperf_accuracy_missing_or_invalid(tmp_path: Path) -> None:
+    (tmp_path / "accuracy_results.csv").write_text("Task,Correct,Total,Accuracy\nOVERALL,n/a,100,n/a\n")
+
+    assert parse_aiperf_accuracy(tmp_path) is None

From 72c8ef2a70adfe4c0fbeabe27571745a69fc13e3 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 26 May 2026 20:34:31 -0700
Subject: [PATCH 02/16] update conf fix nixl connector

---
 conf/experimental/ai_dynamo/test/vllm.toml           |  2 ++
 .../ai_dynamo/test_command_gen_strategy_slurm.py     | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index 91f12b08b..8c044dba6 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -38,6 +38,7 @@ workloads = "aiperf.sh"
       tensor-parallel-size = 8
       pipeline-parallel-size = 1
       data-parallel-size = 1
+      kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
 
     [cmd_args.dynamo.decode_worker]
     num-nodes = 1
@@ -50,6 +51,7 @@ workloads = "aiperf.sh"
       tensor-parallel-size = 8
       pipeline-parallel-size = 1
       data-parallel-size = 1
+      kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
 
   [cmd_args.lmcache]
   controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index 034576f89..aab622578 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -170,3 +170,15 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo
     assert '--aiperf-args-accuracy-benchmark "mmlu"' in result
     assert '--aiperf-args-accuracy-n-shots "5"' in result
     assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result
+
+
+def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
+    td = cast(AIDynamoTestDefinition, strategy.test_run.test)
+    config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+    td.cmd_args.dynamo.prefill_worker.args = WorkerBaseArgs.model_validate({"kv-transfer-config": config})
+    td.cmd_args.dynamo.decode_worker.args = WorkerBaseArgs.model_validate({"kv-transfer-config": config})
+
+    result = strategy._gen_script_args(td)
+
+    assert f"--prefill-args-kv-transfer-config '{config}'" in result
+    assert f"--decode-args-kv-transfer-config '{config}'" in result

From b023c34c3e0ca7ef15a711a5f312117240c1261a Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 26 May 2026 20:56:11 -0700
Subject: [PATCH 03/16] accuracy fixes

---
 conf/experimental/ai_dynamo/test/sglang.toml  |  6 ++---
 conf/experimental/ai_dynamo/test/vllm.toml    |  6 ++---
 doc/workloads/ai_dynamo.rst                   | 11 +++++---
 src/cloudai/workloads/ai_dynamo/ai_dynamo.py  | 16 +++++++-----
 src/cloudai/workloads/ai_dynamo/ai_dynamo.sh  | 16 +++++++++---
 src/cloudai/workloads/ai_dynamo/aiperf.sh     | 25 ++++++++++++++++---
 .../test_command_gen_strategy_slurm.py        |  4 +++
 .../ai_dynamo/test_report_gen_strategy.py     |  1 -
 8 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index 28ec71d60..adf9552af 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -99,11 +99,9 @@ workloads = "aiperf.sh"
     accuracy-benchmark = "mmlu"
     accuracy-n-shots = 5
     accuracy-tasks = "abstract_algebra"
-    concurrency = 2
+    concurrency = 10
     extra-inputs = '{"temperature":0,"stop":["\n"]}'
-    request-count = 50
-    synthetic-input-tokens-mean = 300
-    output-tokens-mean = 500
+    num-requests = 100
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index 8c044dba6..76b8c3c6e 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -93,11 +93,9 @@ workloads = "aiperf.sh"
     accuracy-benchmark = "mmlu"
     accuracy-n-shots = 5
     accuracy-tasks = "abstract_algebra"
-    concurrency = 2
+    concurrency = 10
     extra-inputs = '{"temperature":0,"stop":["\n"]}'
-    request-count = 50
-    synthetic-input-tokens-mean = 300
-    output-tokens-mean = 500
+    num-requests = 100
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index b266b0d77..1e077b7c1 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -125,21 +125,24 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it
      accuracy-benchmark = "mmlu"
      accuracy-n-shots = 5
      accuracy-tasks = "abstract_algebra"
+     concurrency = 10
      extra-inputs = '{"temperature":0,"stop":["\n"]}'
+     num-requests = 100
 
 When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the
-``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction.
+``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt and
+token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark.
 
 Review Benchmark Results
 ------------------------
 
 After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field:
 
-- ``aiperf.sh`` → ``aiperf_report.csv``
+- ``aiperf.sh`` → ``aiperf_report.csv`` for performance mode, ``accuracy_results.csv`` for accuracy mode
 - ``genai_perf.sh`` → ``genai_perf_report.csv``
 
-If AIPerf accuracy mode is enabled, CloudAI also copies ``aiperf_artifacts/accuracy_results.csv`` to
-``accuracy_results.csv`` in the run output directory.
+If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_artifacts/accuracy_results.csv`` to ``accuracy_results.csv``
+in the run output directory and marks the run failed if that file is not produced.
 
 Navigate to ``./results/<scenario>/<test-id>/0/`` and open the CSV to examine performance metrics.
 
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index 7016fc42c..2d0e5e8fd 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -518,6 +518,16 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
                 logging.info(f"Workload {workload} not found in workload map")
                 result = False
                 continue
+
+            if workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark:
+                accuracy = parse_aiperf_accuracy(output_path)
+                if accuracy is None:
+                    logging.info(f"AIPerf accuracy results not found in {output_path}.")
+                    result = False
+                else:
+                    logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}")
+                continue
+
             report_name = workload_map[workload].report_name
             if report_name is None:
                 logging.warning(f"Workload {workload} has no report_name configured")
@@ -530,12 +540,6 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
             else:
                 logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}")
 
-            if workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark:
-                accuracy = parse_aiperf_accuracy(output_path)
-                if accuracy is None:
-                    logging.info(f"AIPerf accuracy results not found in {output_path}.")
-                    result = False
-
         return JobStatusResult(result)
 
     def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool:
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
index 46f5daa42..8cb84f7d9 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
@@ -100,6 +100,10 @@ _resolve_host_ip() {
   echo "$ip"
 }
 
+_current_node_ip() {
+  _resolve_host_ip "$(_current_node_name)"
+}
+
 _apply_sglang_dsr1_section_args() {
   local self="$(_current_node_name)"
   local gpn="$(_gpus_per_node)"
@@ -733,6 +737,8 @@ function launch_decode()
   local base_kvbm_pub_port=${DYN_KVBM_LEADER_ZMQ_PUB_PORT:-56001}
   local base_kvbm_ack_port=${DYN_KVBM_LEADER_ZMQ_ACK_PORT:-56002}
   local kvbm_port_stride=2
+  local side_channel_host
+  side_channel_host="$(_current_node_ip)"
   log "Launching $workers_per_node decode worker(s) with unique port ranges"
 
   for i in $(seq 0 $(( $workers_per_node - 1 ))); do
@@ -754,10 +760,10 @@ function launch_decode()
       args_arr+=($key "${decode_args[$key]}")
     done
 
-    log "Launching decode worker $i on GPUs $gpu_list (NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)"
+    log "Launching decode worker $i on GPUs $gpu_list (NIXL host: $side_channel_host, NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)"
     log "Decode cmd: ${decode_config["cmd"]} ${args_arr[*]} ${decode_config["extra-args"]}"
     CUDA_VISIBLE_DEVICES=$gpu_list \
-      VLLM_NIXL_SIDE_CHANNEL_HOST=$(hostname -I | awk '{print $1}') \
+      VLLM_NIXL_SIDE_CHANNEL_HOST="$side_channel_host" \
       VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port \
       DYN_VLLM_KV_EVENT_PORT=$kv_event_port \
       DYN_KVBM_LEADER_ZMQ_PUB_PORT=$kvbm_pub_port \
@@ -788,6 +794,8 @@ function launch_prefill()
   local base_kvbm_pub_port=${DYN_KVBM_LEADER_ZMQ_PUB_PORT:-56001}
   local base_kvbm_ack_port=${DYN_KVBM_LEADER_ZMQ_ACK_PORT:-56002}
   local kvbm_port_stride=2
+  local side_channel_host
+  side_channel_host="$(_current_node_ip)"
   log "Launching $workers_per_node prefill worker(s) with unique port ranges"
 
   for i in $(seq 0 $(( $workers_per_node - 1 ))); do
@@ -809,10 +817,10 @@ function launch_prefill()
       args_arr+=($key "${prefill_args[$key]}")
     done
 
-    log "Launching prefill worker $i on GPUs $gpu_list (NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)"
+    log "Launching prefill worker $i on GPUs $gpu_list (NIXL host: $side_channel_host, NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)"
     log "Prefill cmd: ${prefill_config["cmd"]} ${args_arr[*]} ${prefill_config["extra-args"]}"
     CUDA_VISIBLE_DEVICES=$gpu_list \
-      VLLM_NIXL_SIDE_CHANNEL_HOST=$(hostname -I | awk '{print $1}') \
+      VLLM_NIXL_SIDE_CHANNEL_HOST="$side_channel_host" \
       VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port \
       DYN_VLLM_KV_EVENT_PORT=$kv_event_port \
       DYN_KVBM_LEADER_ZMQ_PUB_PORT=$kvbm_pub_port \
diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh
index f0826686a..62298dc97 100644
--- a/src/cloudai/workloads/ai_dynamo/aiperf.sh
+++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh
@@ -65,6 +65,16 @@ _parse_aiperf_args() {
   fi
 }
 
+has_accuracy_benchmark() {
+  local arg
+  for arg in "${aiperf_profile_args[@]}" "${extra_args[@]}"; do
+    if [[ "$arg" == "--accuracy-benchmark" ]]; then
+      return 0
+    fi
+  done
+  return 1
+}
+
 process_args() {
   while [[ $# -gt 0 ]]; do
     case "$1" in
@@ -97,6 +107,17 @@ process_results() {
   local csv_path=""
   local accuracy_path="$artifact_dir/accuracy_results.csv"
 
+  if has_accuracy_benchmark; then
+    if [[ ! -s "$accuracy_path" ]]; then
+      log "ERROR: AIPerf accuracy benchmark was requested but $accuracy_path was not produced"
+      exit 1
+    fi
+
+    cp "$accuracy_path" "$result_dir/accuracy_results.csv"
+    log "aiperf accuracy report saved to $result_dir/accuracy_results.csv"
+    return 0
+  fi
+
   if [[ -f "$artifact_dir/profile_export_aiperf.csv" ]]; then
     csv_path="$artifact_dir/profile_export_aiperf.csv"
   else
@@ -111,10 +132,6 @@ process_results() {
     exit 1
   fi
 
-  if [[ -f "$accuracy_path" ]]; then
-    cp "$accuracy_path" "$result_dir/accuracy_results.csv"
-    log "aiperf accuracy report saved to $result_dir/accuracy_results.csv"
-  fi
 }
 
 main() {
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index aab622578..146694734 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -161,6 +161,8 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo
                 "accuracy-benchmark": "mmlu",
                 "accuracy-n-shots": 5,
                 "accuracy-tasks": "abstract_algebra",
+                "concurrency": 10,
+                "num-requests": 100,
             }
         }
     )
@@ -170,6 +172,8 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo
     assert '--aiperf-args-accuracy-benchmark "mmlu"' in result
     assert '--aiperf-args-accuracy-n-shots "5"' in result
     assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result
+    assert '--aiperf-args-concurrency "10"' in result
+    assert '--aiperf-args-num-requests "100"' in result
 
 
 def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
index 0ba90024d..1c9cbb013 100644
--- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py
+++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
@@ -146,7 +146,6 @@ def ai_dynamo_aiperf_accuracy_tr(tmp_path: Path) -> TestRun:
         ),
     )
     tr = TestRun(name="ai_dynamo_aiperf_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path)
-    (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content())
     (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content())
     (tr.output_path / test.success_marker).touch()
     return tr

From 0e20d7493370d947da8edf4d863aee7e33154f3d Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 26 May 2026 21:43:39 -0700
Subject: [PATCH 04/16] add aiperf setup for accuracy test

---
 conf/experimental/ai_dynamo/test/sglang.toml  |  1 +
 conf/experimental/ai_dynamo/test/vllm.toml    |  1 +
 doc/workloads/ai_dynamo.rst                   | 20 ++++++++++++-------
 src/cloudai/workloads/ai_dynamo/ai_dynamo.py  |  5 +++++
 src/cloudai/workloads/ai_dynamo/aiperf.sh     | 16 +++++++++++++++
 .../test_command_gen_strategy_slurm.py        |  5 ++++-
 6 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index adf9552af..be393b829 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -94,6 +94,7 @@ workloads = "aiperf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
+  setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'"
 
     [cmd_args.aiperf.args]
     accuracy-benchmark = "mmlu"
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index 76b8c3c6e..a072b5a5e 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -88,6 +88,7 @@ workloads = "aiperf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
+  setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'"
 
     [cmd_args.aiperf.args]
     accuracy-benchmark = "mmlu"
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 1e077b7c1..d9912fb06 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -121,18 +121,24 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it
    [cmd_args]
    workloads = "aiperf.sh"
 
-     [cmd_args.aiperf.args]
-     accuracy-benchmark = "mmlu"
-     accuracy-n-shots = 5
-     accuracy-tasks = "abstract_algebra"
-     concurrency = 10
-     extra-inputs = '{"temperature":0,"stop":["\n"]}'
-     num-requests = 100
+   [cmd_args.aiperf]
+   setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'"
+
+   [cmd_args.aiperf.args]
+   accuracy-benchmark = "mmlu"
+   accuracy-n-shots = 5
+   accuracy-tasks = "abstract_algebra"
+   concurrency = 10
+   extra-inputs = '{"temperature":0,"stop":["\n"]}'
+   num-requests = 100
 
 When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the
 ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt and
 token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark.
 
+The ``setup-cmd`` field is optional. It is useful for Dynamo images that include ``aiperf`` without its accuracy extra;
+CloudAI runs it immediately before launching ``aiperf profile``.
+
 Review Benchmark Results
 ------------------------
 
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index 2d0e5e8fd..a28d84f4b 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -374,6 +374,11 @@ class AIPerf(Workload):
     name: str = "aiperf"
     cmd: str = "aiperf profile"
     script: File = File(Path(__file__).parent.parent / "ai_dynamo/aiperf.sh")
+    setup_cmd: str | None = Field(
+        default=None,
+        serialization_alias="setup-cmd",
+        validation_alias=AliasChoices("setup-cmd", "setup_cmd"),
+    )
     report_name: str = Field(
         default="aiperf_report.csv",
         serialization_alias="report-name",
diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh
index 62298dc97..ac3131b50 100644
--- a/src/cloudai/workloads/ai_dynamo/aiperf.sh
+++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh
@@ -29,6 +29,7 @@
 #   --port          HTTP port the dynamo.frontend is listening on.
 #   --report-name   Output CSV name (default: aiperf_report.csv).
 #   --cmd           Full launch command including subcommand (default: "aiperf profile").
+#   --setup-cmd     Optional shell command run before launching aiperf.
 #   --extra-args    Raw string appended verbatim after all other flags.
 #
 # All unrecognised flags (--install-dir, --gpus-per-node, etc.) are silently
@@ -44,6 +45,7 @@ url="http://localhost"
 port=8000
 report_name="aiperf_report.csv"
 cmd="aiperf profile"
+setup_cmd=""
 declare -a extra_args=()
 declare -a aiperf_profile_args=()
 
@@ -84,6 +86,7 @@ process_args() {
       --port)         port="$2";        shift 2 ;;
       --report-name)  report_name="$2"; shift 2 ;;
       --cmd)          cmd="$2";         shift 2 ;;
+      --setup-cmd)    setup_cmd="$2";   shift 2 ;;
       --extra-args)   read -ra extra_args <<< "$2"; shift 2 ;;
       --)             shift; _parse_aiperf_args "$@"; break ;;
       --*)            if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;;  # consume unknown flag; shift 2 only if next arg is a value
@@ -98,10 +101,21 @@ process_args() {
     port:         $port
     report_name:  $report_name
     cmd:          $cmd
+    setup_cmd:    ${setup_cmd:-}
     extra_args:   ${extra_args[*]:-}
     profile_args: ${aiperf_profile_args[*]:-}"
 }
 
+run_setup_cmd() {
+  # Run the optional --setup-cmd in a login shell; abort the workload if it fails.
+  [[ -n "$setup_cmd" ]] || return 0
+  log "Running AIPerf setup command: $setup_cmd"
+  if ! bash -lc "$setup_cmd"; then
+    log "ERROR: AIPerf setup command failed: $setup_cmd"; exit 1
+  fi
+  log "AIPerf setup command complete"
+}
+
 process_results() {
   local artifact_dir="$result_dir/aiperf_artifacts"
   local csv_path=""
@@ -144,6 +158,8 @@ main() {
     log "ERROR: --model is required"; exit 1
   fi
 
+  run_setup_cmd
+
   local full_url="${url}:${port}"
   local artifact_dir="$result_dir/aiperf_artifacts"
   rm -rf "$artifact_dir"
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index 146694734..23132aef6 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -155,20 +155,23 @@ def test_dynamo_cmd(
 def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
     td = cast(AIDynamoTestDefinition, strategy.test_run.test)
     td.cmd_args.workloads = "aiperf.sh"
+    setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'"
     td.cmd_args.aiperf = AIPerf.model_validate(
         {
+            "setup-cmd": setup_cmd,
             "args": {
                 "accuracy-benchmark": "mmlu",
                 "accuracy-n-shots": 5,
                 "accuracy-tasks": "abstract_algebra",
                 "concurrency": 10,
                 "num-requests": 100,
-            }
+            },
         }
     )
 
     result = strategy._gen_script_args(td)
 
+    assert f'--aiperf-setup-cmd "{setup_cmd}"' in result
     assert '--aiperf-args-accuracy-benchmark "mmlu"' in result
     assert '--aiperf-args-accuracy-n-shots "5"' in result
     assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result

From b6c798277a9afc160917155c3a5e26e356d893e2 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 26 May 2026 21:52:27 -0700
Subject: [PATCH 05/16] hard bump aiperf

---
 conf/experimental/ai_dynamo/test/sglang.toml                | 2 +-
 conf/experimental/ai_dynamo/test/vllm.toml                  | 2 +-
 doc/workloads/ai_dynamo.rst                                 | 6 +++---
 .../workloads/ai_dynamo/test_command_gen_strategy_slurm.py  | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index be393b829..df62ee649 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -94,7 +94,7 @@ workloads = "aiperf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
-  setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'"
+  setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'"
 
     [cmd_args.aiperf.args]
     accuracy-benchmark = "mmlu"
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index a072b5a5e..e1f65daa3 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -88,7 +88,7 @@ workloads = "aiperf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
-  setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'"
+  setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'"
 
     [cmd_args.aiperf.args]
     accuracy-benchmark = "mmlu"
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index d9912fb06..66864bd54 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -122,7 +122,7 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it
    workloads = "aiperf.sh"
 
    [cmd_args.aiperf]
-   setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'"
+   setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'"
 
    [cmd_args.aiperf.args]
    accuracy-benchmark = "mmlu"
@@ -136,8 +136,8 @@ When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``a
 ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt and
 token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark.
 
-The ``setup-cmd`` field is optional. It is useful for Dynamo images that include ``aiperf`` without its accuracy extra;
-CloudAI runs it immediately before launching ``aiperf profile``.
+The ``setup-cmd`` field is optional. It is useful for Dynamo images that include an older system ``aiperf`` build without
+the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``.
 
 Review Benchmark Results
 ------------------------
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index 23132aef6..f9ca68008 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -155,7 +155,7 @@ def test_dynamo_cmd(
 def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
     td = cast(AIDynamoTestDefinition, strategy.test_run.test)
     td.cmd_args.workloads = "aiperf.sh"
-    setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.6.0.post1'"
+    setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'"
     td.cmd_args.aiperf = AIPerf.model_validate(
         {
             "setup-cmd": setup_cmd,

From de1110ed0dd75be69b29f4b779cea20622fc6a6e Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 26 May 2026 22:02:43 -0700
Subject: [PATCH 06/16] enable hf online

---
 conf/experimental/ai_dynamo/test/sglang.toml |  6 +++---
 conf/experimental/ai_dynamo/test/vllm.toml   |  6 +++---
 doc/workloads/ai_dynamo.rst                  |  2 ++
 src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 20 ++++++++++++++++++--
 4 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index df62ee649..f73984d58 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -106,9 +106,9 @@ workloads = "aiperf.sh"
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
-HF_HUB_OFFLINE = "1"
-TRANSFORMERS_OFFLINE = "1"
-HF_DATASETS_OFFLINE = "1"
+HF_HUB_OFFLINE = "0"
+TRANSFORMERS_OFFLINE = "0"
+HF_DATASETS_OFFLINE = "0"
 DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
 UCX_TLS = "all"
 #DYN_LOGGING_JSONL="true"
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index e1f65daa3..196595529 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -100,8 +100,8 @@ workloads = "aiperf.sh"
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
-HF_HUB_OFFLINE = "1"
-TRANSFORMERS_OFFLINE = "1"
-HF_DATASETS_OFFLINE = "1"
+HF_HUB_OFFLINE = "0"
+TRANSFORMERS_OFFLINE = "0"
+HF_DATASETS_OFFLINE = "0"
 DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
 UCX_TLS = "all"
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 66864bd54..63e617b73 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -138,6 +138,8 @@ token-length flags out of this mode; the benchmark dataset should come from AIPe
 
 The ``setup-cmd`` field is optional. It is useful for Dynamo images that include an older system ``aiperf`` build without
 the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``.
+MMLU is loaded from ``lighteval/mmlu``, so either allow Hugging Face dataset access or pre-cache that dataset before
+running with ``HF_HUB_OFFLINE``/``HF_DATASETS_OFFLINE`` enabled.
 
 Review Benchmark Results
 ------------------------
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
index 8cb84f7d9..26a412647 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
@@ -424,6 +424,10 @@ function perform_exit()
 
 exit_on_error() {
   local fatal=$(_detect_fatal_once)
+  if [ -f "${FATAL_ERROR_MARKER}" ]; then
+    log "FATAL_ERROR_MARKER found. Terminating."
+    perform_exit 1
+  fi
   if [ -f "${DONE_MARKER}" ]; then
     log "DONE_MARKER found. Skipping error check."
     return
@@ -693,6 +697,13 @@ function mark_done()
   touch "$DONE_MARKER"
 }
 
+function mark_failed()
+{
+  local reason="$1"  # human-readable failure description supplied by the caller
+  log "ERROR: ${reason}"
+  printf '%s\n' "${reason}" > "${FATAL_ERROR_MARKER}"  # exit_on_error tests for this marker file
+}
+
 function launch_etcd()
 {
   log "Launching etcd with cmd: ${dynamo_args["etcd-cmd"]} --listen-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]} --advertise-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]}"
@@ -1034,6 +1045,11 @@ function launch_workload()
     --decode-nodes "${decode_config["node-list"]}" \
     "${config_arr[@]}" \
     -- "${args_arr[@]}" > "${RESULTS_DIR}/$workload_name.log" 2>&1
+  local workload_status=$?
+  if [[ "${workload_status}" -ne 0 ]]; then
+    mark_failed "Workload ${workload_name} failed with exit code ${workload_status}. See ${RESULTS_DIR}/${workload_name}.log"
+    return "${workload_status}"
+  fi
 
   log "Done with $workload_name run"
 }
@@ -1043,11 +1059,11 @@ function launch_workloads()
   wait_for_dynamo_frontend
 
   if _is_genai_perf_workload; then
-    launch_workload genai_perf_config genai_perf_args
+    launch_workload genai_perf_config genai_perf_args || return $?
   fi
 
   if _is_aiperf_workload; then
-    launch_workload aiperf_config aiperf_args
+    launch_workload aiperf_config aiperf_args || return $?
   fi
 
   mark_done

From 632e8f52bd008cac2ad1a37394098e0791dc6292 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 26 May 2026 22:12:22 -0700
Subject: [PATCH 07/16] remove first token conf

---
 conf/experimental/ai_dynamo/test/sglang.toml | 2 +-
 conf/experimental/ai_dynamo/test/vllm.toml   | 2 +-
 doc/workloads/ai_dynamo.rst                  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index f73984d58..36f8d7cc9 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -101,7 +101,7 @@ workloads = "aiperf.sh"
     accuracy-n-shots = 5
     accuracy-tasks = "abstract_algebra"
     concurrency = 10
-    extra-inputs = '{"temperature":0,"stop":["\n"]}'
+    extra-inputs = '{"temperature":0}'
     num-requests = 100
 
 [extra_env_vars]
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index 196595529..634f127b8 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -95,7 +95,7 @@ workloads = "aiperf.sh"
     accuracy-n-shots = 5
     accuracy-tasks = "abstract_algebra"
     concurrency = 10
-    extra-inputs = '{"temperature":0,"stop":["\n"]}'
+    extra-inputs = '{"temperature":0}'
     num-requests = 100
 
 [extra_env_vars]
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 63e617b73..bdf2b8f70 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -129,7 +129,7 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it
    accuracy-n-shots = 5
    accuracy-tasks = "abstract_algebra"
    concurrency = 10
-   extra-inputs = '{"temperature":0,"stop":["\n"]}'
+   extra-inputs = '{"temperature":0}'
    num-requests = 100
 
 When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the

From b8217ec0304f2dd264de141edc882e16beb07bbb Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 26 May 2026 22:21:07 -0700
Subject: [PATCH 08/16] disable qwen thinking

---
 conf/experimental/ai_dynamo/test/sglang.toml                 | 2 +-
 conf/experimental/ai_dynamo/test/vllm.toml                   | 2 +-
 doc/workloads/ai_dynamo.rst                                  | 3 ++-
 tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py | 3 +++
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index 36f8d7cc9..28e049c8a 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -101,7 +101,7 @@ workloads = "aiperf.sh"
     accuracy-n-shots = 5
     accuracy-tasks = "abstract_algebra"
     concurrency = 10
-    extra-inputs = '{"temperature":0}'
+    extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
     num-requests = 100
 
 [extra_env_vars]
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index 634f127b8..fdae52847 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -95,7 +95,7 @@ workloads = "aiperf.sh"
     accuracy-n-shots = 5
     accuracy-tasks = "abstract_algebra"
     concurrency = 10
-    extra-inputs = '{"temperature":0}'
+    extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
     num-requests = 100
 
 [extra_env_vars]
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index bdf2b8f70..3de4a5523 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -129,7 +129,7 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it
    accuracy-n-shots = 5
    accuracy-tasks = "abstract_algebra"
    concurrency = 10
-   extra-inputs = '{"temperature":0}'
+   extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
    num-requests = 100
 
 When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the
@@ -140,6 +140,7 @@ The ``setup-cmd`` field is optional. It is useful for Dynamo images that include
 the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``.
 MMLU is loaded from ``lighteval/mmlu``, so either allow Hugging Face dataset access or pre-cache that dataset before
 running with ``HF_HUB_OFFLINE``/``HF_DATASETS_OFFLINE`` enabled.
+For Qwen3 models, the example disables thinking mode so short MMLU answers can be parsed as choices.
 
 Review Benchmark Results
 ------------------------
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index f9ca68008..8667cf0d2 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -156,6 +156,7 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo
     td = cast(AIDynamoTestDefinition, strategy.test_run.test)
     td.cmd_args.workloads = "aiperf.sh"
     setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'"
+    extra_inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
     td.cmd_args.aiperf = AIPerf.model_validate(
         {
             "setup-cmd": setup_cmd,
@@ -164,6 +165,7 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo
                 "accuracy-n-shots": 5,
                 "accuracy-tasks": "abstract_algebra",
                 "concurrency": 10,
+                "extra-inputs": extra_inputs,
                 "num-requests": 100,
             },
         }
@@ -176,6 +178,7 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo
     assert '--aiperf-args-accuracy-n-shots "5"' in result
     assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result
     assert '--aiperf-args-concurrency "10"' in result
+    assert f"--aiperf-args-extra-inputs '{extra_inputs}'" in result
     assert '--aiperf-args-num-requests "100"' in result
 
 

From 3c05fb515a8066ac10dd6c94b6f8a92d3ded79f8 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 27 May 2026 08:39:47 -0700
Subject: [PATCH 09/16] run both perf and accuracy tests

---
 conf/experimental/ai_dynamo/test/sglang.toml  |  12 +-
 conf/experimental/ai_dynamo/test/vllm.toml    |  12 +-
 doc/workloads/ai_dynamo.rst                   |  30 +++--
 src/cloudai/workloads/ai_dynamo/__init__.py   |   2 +
 src/cloudai/workloads/ai_dynamo/ai_dynamo.py  | 106 +++++++++++++-----
 src/cloudai/workloads/ai_dynamo/ai_dynamo.sh  |  16 +++
 src/cloudai/workloads/ai_dynamo/aiperf.sh     |  18 +--
 .../ai_dynamo/report_generation_strategy.py   |   4 +-
 .../ai_dynamo/slurm_command_gen_strategy.py   |   2 +
 .../test_command_gen_strategy_slurm.py        |  46 ++++++++
 .../ai_dynamo/test_report_gen_strategy.py     | 101 +++++++++++++++++
 11 files changed, 297 insertions(+), 52 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index 28e049c8a..7d9930ecd 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -94,9 +94,17 @@ workloads = "aiperf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
-  setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'"
-
     [cmd_args.aiperf.args]
+    concurrency = 2
+    extra-inputs = '{"min_tokens":10}'
+    output-tokens-mean = 500
+    request-count = 50
+    synthetic-input-tokens-mean = 300
+
+  [cmd_args.aiperf_accuracy]
+  setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+
+    [cmd_args.aiperf_accuracy.args]
     accuracy-benchmark = "mmlu"
     accuracy-n-shots = 5
     accuracy-tasks = "abstract_algebra"
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index fdae52847..e314fe743 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -88,9 +88,17 @@ workloads = "aiperf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
-  setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'"
-
     [cmd_args.aiperf.args]
+    concurrency = 2
+    extra-inputs = '{"min_tokens":10}'
+    output-tokens-mean = 500
+    request-count = 50
+    synthetic-input-tokens-mean = 300
+
+  [cmd_args.aiperf_accuracy]
+  setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+
+    [cmd_args.aiperf_accuracy.args]
     accuracy-benchmark = "mmlu"
     accuracy-n-shots = 5
     accuracy-tasks = "abstract_algebra"
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 3de4a5523..b0e077f9f 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -113,8 +113,9 @@ To use genai-perf, set:
 Semantic Degradation With AIPerf Accuracy
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it by adding AIPerf accuracy flags under
-``[cmd_args.aiperf.args]`` and running the ``aiperf.sh`` workload:
+AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it with
+``[cmd_args.aiperf_accuracy]``. This runs after the configured performance workload, so it can be used with either
+``aiperf.sh`` or ``genai_perf.sh``:
 
 .. code-block:: toml
 
@@ -122,9 +123,16 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it
    workloads = "aiperf.sh"
 
    [cmd_args.aiperf]
-   setup-cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'"
+     [cmd_args.aiperf.args]
+     request-count = 50
+     synthetic-input-tokens-mean = 300
+     output-tokens-mean = 500
+     concurrency = 2
+
+   [cmd_args.aiperf_accuracy]
+   setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
 
-   [cmd_args.aiperf.args]
+   [cmd_args.aiperf_accuracy.args]
    accuracy-benchmark = "mmlu"
    accuracy-n-shots = 5
    accuracy-tasks = "abstract_algebra"
@@ -132,9 +140,9 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it
    extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
    num-requests = 100
 
-When ``accuracy-benchmark`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes the
-``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt and
-token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark.
+When ``cmd_args.aiperf_accuracy`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes
+the ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt
+and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark.
 
 The ``setup-cmd`` field is optional. It is useful for Dynamo images that include an older system ``aiperf`` build without
 the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``.
@@ -147,11 +155,13 @@ Review Benchmark Results
 
 After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field:
 
-- ``aiperf.sh`` → ``aiperf_report.csv`` for performance mode, ``accuracy_results.csv`` for accuracy mode
+- ``aiperf.sh`` → ``aiperf_report.csv``
 - ``genai_perf.sh`` → ``genai_perf_report.csv``
+- ``cmd_args.aiperf_accuracy`` → ``accuracy_results.csv``
 
-If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_artifacts/accuracy_results.csv`` to ``accuracy_results.csv``
-in the run output directory and marks the run failed if that file is not produced.
+If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_accuracy_artifacts/accuracy_results.csv`` to
+``accuracy_results.csv`` in the run output directory and marks the run failed if that file is not produced. The older
+one-shot form that puts ``accuracy-benchmark`` under ``cmd_args.aiperf.args`` remains supported for compatibility.
 
 Navigate to ``./results/<scenario>/<test-id>/0/`` and open the CSV to examine performance metrics.
 
diff --git a/src/cloudai/workloads/ai_dynamo/__init__.py b/src/cloudai/workloads/ai_dynamo/__init__.py
index 1360ce10d..4aac3fd2c 100644
--- a/src/cloudai/workloads/ai_dynamo/__init__.py
+++ b/src/cloudai/workloads/ai_dynamo/__init__.py
@@ -19,6 +19,7 @@
     AIDynamoCmdArgs,
     AIDynamoTestDefinition,
     AIPerf,
+    AIPerfAccuracy,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
@@ -37,6 +38,7 @@
     "AIDynamoSlurmCommandGenStrategy",
     "AIDynamoTestDefinition",
     "AIPerf",
+    "AIPerfAccuracy",
     "GenAIPerf",
     "LMCache",
     "LMCacheArgs",
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index a28d84f4b..ffe34fa82 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -42,6 +42,7 @@
 from cloudai.systems.slurm import SlurmSystem
 
 AIPERF_ARTIFACTS_DIR = "aiperf_artifacts"
+AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts"
 AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv"
 
 
@@ -94,6 +95,7 @@ def parse_aiperf_accuracy(output_path: Path) -> float | None:
     """
     candidates = [
         output_path / AIPERF_ACCURACY_RESULTS_CSV,
+        output_path / AIPERF_ACCURACY_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV,
         output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV,
     ]
 
@@ -401,6 +403,22 @@ def has_accuracy_benchmark(self) -> bool:
         return "--accuracy-benchmark" in extra_args
 
 
+class AIPerfAccuracy(AIPerf):
+    """Optional AIPerf accuracy benchmark configuration (own name, report name and artifact directory defaults)."""
+
+    name: str = "aiperf_accuracy"
+    report_name: str = Field(
+        default="aiperf_accuracy_report.csv",
+        serialization_alias="report-name",
+        validation_alias=AliasChoices("report-name", "report_name"),
+    )
+    artifact_dir_name: str = Field(
+        default=AIPERF_ACCURACY_ARTIFACTS_DIR,
+        serialization_alias="artifact-dir-name",
+        validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"),
+    )
+
+
 class Constraints(BaseModel):
     """Constraints for validation of AI Dynamo configurations when using DSE."""
 
@@ -421,6 +439,7 @@ class AIDynamoCmdArgs(CmdArgs):
     lmcache: LMCache = Field(default_factory=LMCache)
     genai_perf: GenAIPerf = Field(default_factory=GenAIPerf)
     aiperf: AIPerf = Field(default_factory=AIPerf)
+    aiperf_accuracy: AIPerfAccuracy | None = None
     workloads: str = "genai_perf.sh"
 
     @field_validator("workloads", mode="before")
@@ -443,6 +462,7 @@ def installables(self) -> list[Installable]:
             *self.lmcache.installables,
             *self.genai_perf.installables,
             *self.aiperf.installables,
+            *(self.aiperf_accuracy.installables if self.aiperf_accuracy else []),
         ]
 
 
@@ -471,6 +491,9 @@ def workload_scripts(self) -> "AIDynamoTestDefinition":
             if workload not in workload_map:
                 raise ValueError(f"Invalid workload: {workload}. Available workloads: {list(workload_map.keys())}")
 
+        if self.cmd_args.aiperf_accuracy is not None and not self.cmd_args.aiperf_accuracy.has_accuracy_benchmark:
+            raise ValueError("cmd_args.aiperf_accuracy must configure an AIPerf --accuracy-benchmark argument")
+
         return self
 
     def get_workload_map(self) -> dict[str, Workload]:
@@ -504,10 +527,58 @@ def installables(self) -> list[Installable]:
             *self.cmd_args.installables,
         ]
 
+    def _has_aiperf_accuracy_results(self, output_path: Path) -> bool:
+        accuracy = parse_aiperf_accuracy(output_path)
+        if accuracy is None:
+            logging.info(f"AIPerf accuracy results not found in {output_path}.")
+            return False
+
+        logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}")
+        return True
+
+    def _is_legacy_aiperf_accuracy_workload(self, workload: str) -> bool:
+        return workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark
+
+    def _was_workload_report_produced(self, output_path: Path, workload: str, workload_config: Workload) -> bool:
+        report_name = workload_config.report_name
+        if report_name is None:
+            logging.warning(f"Workload {workload} has no report_name configured")
+            return False
+
+        workload_csv_file = output_path / report_name
+        if not workload_csv_file.exists():
+            logging.info(f"Result file ({workload_csv_file.absolute()}) not found for workload: {workload}")
+            return False
+
+        logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}")
+        return True
+
+    def _was_workload_successful(self, output_path: Path, workload: str, workload_map: dict[str, Workload]) -> bool:
+        workload_config = workload_map.get(workload)
+        if workload_config is None:
+            logging.info(f"Workload {workload} not found in workload map")
+            return False
+
+        if self._is_legacy_aiperf_accuracy_workload(workload):
+            return self._has_aiperf_accuracy_results(output_path)
+
+        return self._was_workload_report_produced(output_path, workload, workload_config)
+
+    def _were_workloads_successful(self, output_path: Path) -> bool:
+        workload_map = self.get_workload_map()
+        result = True
+        for workload in self.cmd_args.workloads_list:
+            result = self._was_workload_successful(output_path, workload, workload_map) and result
+        return result
+
+    def _was_aiperf_accuracy_successful(self, output_path: Path) -> bool:
+        if self.cmd_args.aiperf_accuracy is None:
+            return True
+
+        return self._has_aiperf_accuracy_results(output_path)
+
     def was_run_successful(self, tr: TestRun) -> JobStatusResult:
         output_path = tr.output_path
-        result = True
-        workload_map = self.get_workload_map()
         failure_marker = output_path / self.failure_marker
         success_marker = output_path / self.success_marker
 
@@ -518,34 +589,9 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
         if not success_marker.exists():
             return JobStatusResult(False, error_message=f"Success marker file not found: {success_marker.absolute()}")
 
-        for workload in self.cmd_args.workloads_list:
-            if workload not in workload_map:
-                logging.info(f"Workload {workload} not found in workload map")
-                result = False
-                continue
-
-            if workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark:
-                accuracy = parse_aiperf_accuracy(output_path)
-                if accuracy is None:
-                    logging.info(f"AIPerf accuracy results not found in {output_path}.")
-                    result = False
-                else:
-                    logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}")
-                continue
-
-            report_name = workload_map[workload].report_name
-            if report_name is None:
-                logging.warning(f"Workload {workload} has no report_name configured")
-                result = False
-                continue
-            workload_csv_file = output_path / report_name
-            if not workload_csv_file.exists():
-                logging.info(f"Result file ({workload_csv_file.absolute()}) not found for workload: {workload}")
-                result = False
-            else:
-                logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}")
-
-        return JobStatusResult(result)
+        workloads_successful = self._were_workloads_successful(output_path)
+        accuracy_successful = self._was_aiperf_accuracy_successful(output_path)
+        return JobStatusResult(workloads_successful and accuracy_successful)
 
     def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool:
         prefill_worker = tr.test.cmd_args.dynamo.prefill_worker
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
index 26a412647..5b65db41f 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
@@ -37,6 +37,8 @@ declare -A genai_perf_args
 declare -A genai_perf_config
 declare -A aiperf_args
 declare -A aiperf_config
+declare -A aiperf_accuracy_args
+declare -A aiperf_accuracy_config
 
 declare -A dynamo_args
 dynamo_args["backend"]="vllm"
@@ -173,6 +175,10 @@ _parse_cli_pairs() {
         aiperf_args["--${key#--aiperf-args-}"]="$2" ;;
       --aiperf-*)
         aiperf_config["--${key#--aiperf-}"]="$2" ;;
+      --aiperf_accuracy-args-*)
+        aiperf_accuracy_args["--${key#--aiperf_accuracy-args-}"]="$2" ;;
+      --aiperf_accuracy-*)
+        aiperf_accuracy_config["--${key#--aiperf_accuracy-}"]="$2" ;;
       --hf-home)
         HUGGINGFACE_HOME="$2" ;;
       --storage-cache-dir)
@@ -365,6 +371,8 @@ _dump_args() {
   log "GenAI-Perf args:\n$(arg_array_to_string genai_perf_args)"
   log "AIPerf config params:\n$(arg_array_to_string aiperf_config)"
   log "AIPerf args:\n$(arg_array_to_string aiperf_args)"
+  log "AIPerf accuracy config params:\n$(arg_array_to_string aiperf_accuracy_config)"
+  log "AIPerf accuracy args:\n$(arg_array_to_string aiperf_accuracy_args)"
   log "--------------------------------"
 }
 
@@ -525,6 +533,10 @@ _is_aiperf_workload() {
   [[ "${dynamo_args["workloads"]}" == *"aiperf.sh"* ]]
 }
 
+_is_aiperf_accuracy_enabled() {
+  [[ -n "${aiperf_accuracy_config["--script"]:-}" ]]
+}
+
 _init_runtime_env() {
   if _is_vllm || _is_sglang; then
     export HF_HOME="${HUGGINGFACE_HOME}"
@@ -1066,6 +1078,10 @@ function launch_workloads()
     launch_workload aiperf_config aiperf_args || return $?
   fi
 
+  if _is_aiperf_accuracy_enabled; then
+    launch_workload aiperf_accuracy_config aiperf_accuracy_args || return $?
+  fi
+
   mark_done
 }
 
diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh
index ac3131b50..bf08f869d 100644
--- a/src/cloudai/workloads/ai_dynamo/aiperf.sh
+++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh
@@ -19,7 +19,7 @@
 #
 # Called from ai_dynamo.sh's launch_workload() with:
 #   bash aiperf.sh --result-dir <dir> --model <model> --url <url> --port <port>
-#                  [--cmd <cmd>] [--report-name <name>] [--extra-args <args>]
+#                  [--cmd <cmd>] [--report-name <name>] [--artifact-dir-name <name>] [--extra-args <args>]
 #                  -- <aiperf-args>...
 #
 # Context flags (before --) that are recognised and used:
@@ -28,6 +28,7 @@
 #   --url           Base URL of the dynamo.frontend (e.g. http://node01).
 #   --port          HTTP port the dynamo.frontend is listening on.
 #   --report-name   Output CSV name (default: aiperf_report.csv).
+#   --artifact-dir-name  Artifact directory name under --result-dir (default: aiperf_artifacts).
 #   --cmd           Full launch command including subcommand (default: "aiperf profile").
 #   --setup-cmd     Optional shell command run before launching aiperf.
 #   --extra-args    Raw string appended verbatim after all other flags.
@@ -44,6 +45,7 @@ model=""
 url="http://localhost"
 port=8000
 report_name="aiperf_report.csv"
+artifact_dir_name="aiperf_artifacts"
 cmd="aiperf profile"
 setup_cmd=""
 declare -a extra_args=()
@@ -85,10 +87,11 @@ process_args() {
       --url)          url="$2";         shift 2 ;;
       --port)         port="$2";        shift 2 ;;
       --report-name)  report_name="$2"; shift 2 ;;
-      --cmd)          cmd="$2";         shift 2 ;;
-      --setup-cmd)    setup_cmd="$2";   shift 2 ;;
-      --extra-args)   read -ra extra_args <<< "$2"; shift 2 ;;
-      --)             shift; _parse_aiperf_args "$@"; break ;;
+      --artifact-dir-name) artifact_dir_name="$2"; shift 2 ;;
+      --cmd)               cmd="$2";               shift 2 ;;
+      --setup-cmd)         setup_cmd="$2";         shift 2 ;;
+      --extra-args)        read -ra extra_args <<< "$2"; shift 2 ;;
+      --)                  shift; _parse_aiperf_args "$@"; break ;;
       --*)            if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;;  # consume unknown flag; shift 2 only if next arg is a value
       *)              shift ;;
     esac
@@ -100,6 +103,7 @@ process_args() {
     url:          $url
     port:         $port
     report_name:  $report_name
+    artifact_dir: $artifact_dir_name
     cmd:          $cmd
     setup_cmd:    ${setup_cmd:-}
     extra_args:   ${extra_args[*]:-}
@@ -117,7 +121,7 @@ run_setup_cmd() {
 }
 
 process_results() {
-  local artifact_dir="$result_dir/aiperf_artifacts"
+  local artifact_dir="$result_dir/$artifact_dir_name"
   local csv_path=""
   local accuracy_path="$artifact_dir/accuracy_results.csv"
 
@@ -161,7 +165,7 @@ main() {
   run_setup_cmd
 
   local full_url="${url}:${port}"
-  local artifact_dir="$result_dir/aiperf_artifacts"
+  local artifact_dir="$result_dir/$artifact_dir_name"
   rm -rf "$artifact_dir"
 
   # Split cmd into an array (e.g. "aiperf profile" → ["aiperf", "profile"])
diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
index a665b0d6a..4b46cf5dc 100644
--- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
@@ -48,7 +48,9 @@ def get_metric(self, metric: str) -> MetricValue:
 
         if metric.lower() == "accuracy":
             tdef = self.test_run.test
-            if not isinstance(tdef, AIDynamoTestDefinition) or not tdef.cmd_args.aiperf.has_accuracy_benchmark:
+            if not isinstance(tdef, AIDynamoTestDefinition):
+                return METRIC_ERROR
+            if tdef.cmd_args.aiperf_accuracy is None and not tdef.cmd_args.aiperf.has_accuracy_benchmark:
                 return METRIC_ERROR
             accuracy = parse_aiperf_accuracy(self.test_run.output_path)
             return accuracy if accuracy is not None else METRIC_ERROR
diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
index 17079875c..51e704102 100644
--- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
@@ -118,6 +118,8 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]:
         args.extend(self._get_nested_toml_args(td.cmd_args.lmcache, "--lmcache-"))
         args.extend(self._get_nested_toml_args(td.cmd_args.genai_perf, "--genai_perf-"))
         args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-"))
+        if td.cmd_args.aiperf_accuracy is not None:
+            args.extend(self._get_nested_toml_args(td.cmd_args.aiperf_accuracy, "--aiperf_accuracy-"))
 
         return args
 
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index 8667cf0d2..6a9274b8c 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -28,6 +28,7 @@
     AIDynamoSlurmCommandGenStrategy,
     AIDynamoTestDefinition,
     AIPerf,
+    AIPerfAccuracy,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
@@ -182,6 +183,51 @@ def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCo
     assert '--aiperf-args-num-requests "100"' in result
 
 
+def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
+    td = cast(AIDynamoTestDefinition, strategy.test_run.test)
+    td.cmd_args.workloads = "aiperf.sh"
+    setup_cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+    extra_inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+    td.cmd_args.aiperf = AIPerf.model_validate(
+        {
+            "args": {
+                "concurrency": 2,
+                "request-count": 50,
+                "synthetic-input-tokens-mean": 300,
+                "output-tokens-mean": 500,
+            },
+        }
+    )
+    td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate(
+        {
+            "setup-cmd": setup_cmd,
+            "args": {
+                "accuracy-benchmark": "mmlu",
+                "accuracy-n-shots": 5,
+                "accuracy-tasks": "abstract_algebra",
+                "concurrency": 10,
+                "extra-inputs": extra_inputs,
+                "num-requests": 100,
+            },
+        }
+    )
+
+    result = strategy._gen_script_args(td)
+
+    assert '--aiperf-args-request-count "50"' in result
+    assert '--aiperf-args-synthetic-input-tokens-mean "300"' in result
+    assert '--aiperf-args-output-tokens-mean "500"' in result
+    assert f'--aiperf_accuracy-setup-cmd "{setup_cmd}"' in result
+    assert '--aiperf_accuracy-name "aiperf_accuracy"' in result
+    assert '--aiperf_accuracy-artifact-dir-name "aiperf_accuracy_artifacts"' in result
+    assert '--aiperf_accuracy-args-accuracy-benchmark "mmlu"' in result
+    assert '--aiperf_accuracy-args-accuracy-n-shots "5"' in result
+    assert '--aiperf_accuracy-args-accuracy-tasks "abstract_algebra"' in result
+    assert '--aiperf_accuracy-args-concurrency "10"' in result
+    assert f"--aiperf_accuracy-args-extra-inputs '{extra_inputs}'" in result
+    assert '--aiperf_accuracy-args-num-requests "100"' in result
+
+
 def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
     td = cast(AIDynamoTestDefinition, strategy.test_run.test)
     config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
index 1c9cbb013..1235ab3e6 100644
--- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py
+++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
@@ -26,6 +26,7 @@
     AIDynamoCmdArgs,
     AIDynamoTestDefinition,
     AIPerf,
+    AIPerfAccuracy,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
@@ -151,6 +152,64 @@ def ai_dynamo_aiperf_accuracy_tr(tmp_path: Path) -> TestRun:
     return tr
 
 
+@pytest.fixture
+def ai_dynamo_aiperf_with_split_accuracy_tr(tmp_path: Path) -> TestRun:
+    test = AIDynamoTestDefinition(
+        name="ai_dynamo_aiperf_with_split_accuracy",
+        description="desc",
+        test_template_name="t",
+        cmd_args=AIDynamoCmdArgs(
+            docker_image_url="http://url",
+            workloads="aiperf.sh",
+            dynamo=AIDynamoArgs(
+                prefill_worker=WorkerConfig(
+                    cmd="python3 -m dynamo.vllm --is-prefill-worker",
+                    worker_initialized_regex="VllmWorker.*has.been.initialized",
+                    args=WorkerBaseArgs(),
+                ),
+            ),
+            aiperf=AIPerf(),
+            aiperf_accuracy=AIPerfAccuracy.model_validate({"args": {"accuracy-benchmark": "mmlu"}}),
+            lmcache=LMCache(args=LMCacheArgs()),
+        ),
+    )
+    tr = TestRun(name="ai_dynamo_aiperf_with_split_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path)
+    (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content())
+    (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content())
+    (tr.output_path / test.success_marker).touch()
+    return tr
+
+
+@pytest.fixture
+def ai_dynamo_genai_perf_with_split_accuracy_tr(tmp_path: Path) -> TestRun:
+    test = AIDynamoTestDefinition(
+        name="ai_dynamo_genai_perf_with_split_accuracy",
+        description="desc",
+        test_template_name="t",
+        cmd_args=AIDynamoCmdArgs(
+            docker_image_url="http://url",
+            workloads="genai_perf.sh",
+            dynamo=AIDynamoArgs(
+                prefill_worker=WorkerConfig(
+                    cmd="python3 -m dynamo.vllm --is-prefill-worker",
+                    worker_initialized_regex="VllmWorker.*has.been.initialized",
+                    args=WorkerBaseArgs(),
+                ),
+            ),
+            genai_perf=GenAIPerf(),
+            aiperf_accuracy=AIPerfAccuracy.model_validate({"args": {"accuracy-benchmark": "mmlu"}}),
+            lmcache=LMCache(args=LMCacheArgs()),
+        ),
+    )
+    tr = TestRun(
+        name="ai_dynamo_genai_perf_with_split_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path
+    )
+    (tr.output_path / "genai_perf_report.csv").write_text(get_csv_content())
+    (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content())
+    (tr.output_path / test.success_marker).touch()
+    return tr
+
+
 @pytest.fixture
 def csv_content() -> str:
     return get_csv_content()
@@ -199,6 +258,15 @@ def test_ai_dynamo_get_metric_aiperf_accuracy(slurm_system: SlurmSystem, ai_dyna
     assert strategy.get_metric("accuracy") == 0.35
 
 
+def test_ai_dynamo_get_metric_split_aiperf_accuracy(
+    slurm_system: SlurmSystem, ai_dynamo_aiperf_with_split_accuracy_tr: TestRun
+) -> None:
+    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_with_split_accuracy_tr)
+
+    assert strategy.get_metric("accuracy") == 0.35
+    assert strategy.get_metric("Inter Token Latency (ms)") == 2.83
+
+
 def test_ai_dynamo_accuracy_metric_requires_aiperf_accuracy_config(
     slurm_system: SlurmSystem, ai_dynamo_aiperf_tr: TestRun
 ) -> None:
@@ -228,6 +296,22 @@ def test_was_run_successful_with_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: T
     assert result.is_successful is True
 
 
+def test_was_run_successful_with_split_aiperf_accuracy(
+    ai_dynamo_aiperf_with_split_accuracy_tr: TestRun,
+) -> None:
+    test_def = ai_dynamo_aiperf_with_split_accuracy_tr.test
+    result = test_def.was_run_successful(ai_dynamo_aiperf_with_split_accuracy_tr)
+    assert result.is_successful is True
+
+
+def test_was_run_successful_with_genai_perf_and_split_aiperf_accuracy(
+    ai_dynamo_genai_perf_with_split_accuracy_tr: TestRun,
+) -> None:
+    test_def = ai_dynamo_genai_perf_with_split_accuracy_tr.test
+    result = test_def.was_run_successful(ai_dynamo_genai_perf_with_split_accuracy_tr)
+    assert result.is_successful is True
+
+
 def test_was_run_successful_requires_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None:
     test_def = ai_dynamo_aiperf_accuracy_tr.test
     (ai_dynamo_aiperf_accuracy_tr.output_path / "accuracy_results.csv").unlink()
@@ -235,6 +319,15 @@ def test_was_run_successful_requires_aiperf_accuracy(ai_dynamo_aiperf_accuracy_t
     assert result.is_successful is False
 
 
+def test_was_run_successful_requires_split_aiperf_accuracy(
+    ai_dynamo_aiperf_with_split_accuracy_tr: TestRun,
+) -> None:
+    test_def = ai_dynamo_aiperf_with_split_accuracy_tr.test
+    (ai_dynamo_aiperf_with_split_accuracy_tr.output_path / "accuracy_results.csv").unlink()
+    result = test_def.was_run_successful(ai_dynamo_aiperf_with_split_accuracy_tr)
+    assert result.is_successful is False
+
+
 def test_was_run_successful_no_results(ai_dynamo_tr: TestRun, tmp_path: Path) -> None:
     test_def = ai_dynamo_tr.test
     ai_dynamo_tr.output_path = tmp_path / "empty_output"
@@ -251,6 +344,14 @@ def test_parse_aiperf_accuracy_from_artifact_dir(tmp_path: Path) -> None:
     assert parse_aiperf_accuracy(tmp_path) == 0.35
 
 
+def test_parse_aiperf_accuracy_from_split_accuracy_artifact_dir(tmp_path: Path) -> None:
+    artifact_dir = tmp_path / "aiperf_accuracy_artifacts"
+    artifact_dir.mkdir()
+    (artifact_dir / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content(), encoding="utf-8")
+
+    assert parse_aiperf_accuracy(tmp_path) == 0.35
+
+
 def test_parse_aiperf_accuracy_missing_or_invalid(tmp_path: Path) -> None:
     (tmp_path / "accuracy_results.csv").write_text("Task,Correct,Total,Accuracy\nOVERALL,n/a,100,n/a\n")
 

From 92d4c89392aaaedce263aae8cf26518d09787a24 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 27 May 2026 09:05:53 -0700
Subject: [PATCH 10/16] ai_dynamo: drop legacy one-shot AIPerf accuracy path

---
 doc/workloads/ai_dynamo.rst                   |   3 +-
 src/cloudai/workloads/ai_dynamo/ai_dynamo.py  | 168 +++++++++---------
 .../ai_dynamo/report_generation_strategy.py   |   2 +-
 .../test_command_gen_strategy_slurm.py        |  30 ----
 .../ai_dynamo/test_report_gen_strategy.py     |  45 -----
 5 files changed, 83 insertions(+), 165 deletions(-)

diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index b0e077f9f..54aa4c252 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -160,8 +160,7 @@ After job completion, CloudAI places output logs and result files in the designa
 - ``cmd_args.aiperf_accuracy`` → ``accuracy_results.csv``
 
 If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_accuracy_artifacts/accuracy_results.csv`` to
-``accuracy_results.csv`` in the run output directory and marks the run failed if that file is not produced. The older
-one-shot form that puts ``accuracy-benchmark`` under ``cmd_args.aiperf.args`` remains supported for compatibility.
+``accuracy_results.csv`` in the run output directory and marks the run failed if that file is not produced.
 
 Navigate to ``./results/<scenario>/<test-id>/0/`` and open the CSV to examine performance metrics.
 
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index ffe34fa82..8c5dcaad1 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -46,87 +46,6 @@
 AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv"
 
 
-def _parse_accuracy_value(value: str | int | float | None) -> float | None:
-    if value is None:
-        return None
-    if isinstance(value, (int, float)):
-        accuracy = float(value)
-        return accuracy / 100 if accuracy > 1 else accuracy
-
-    raw_value = value.strip()
-    if not raw_value:
-        return None
-
-    is_percentage = raw_value.endswith("%")
-    if is_percentage:
-        raw_value = raw_value[:-1].strip()
-
-    try:
-        accuracy = float(raw_value)
-    except ValueError:
-        return None
-
-    return accuracy / 100 if is_percentage or accuracy > 1 else accuracy
-
-
-def _parse_count_value(value: str | int | float | None) -> float | None:
-    if value is None:
-        return None
-    if isinstance(value, (int, float)):
-        return float(value)
-    try:
-        return float(value.strip())
-    except ValueError:
-        return None
-
-
-def parse_aiperf_accuracy(output_path: Path) -> float | None:
-    """
-    Parse AIPerf accuracy from accuracy_results.csv.
-
-    Expected CSV format:
-        Task,Correct,Total,Accuracy
-        abstract_algebra,35,100,35.00%
-        OVERALL,8368,14042,59.59%
-
-    AIPerf writes this file under aiperf_artifacts; CloudAI's wrapper also copies
-    it to the run output directory when present. The returned value is normalized
-    to a 0.0-1.0 fraction.
-    """
-    candidates = [
-        output_path / AIPERF_ACCURACY_RESULTS_CSV,
-        output_path / AIPERF_ACCURACY_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV,
-        output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV,
-    ]
-
-    for csv_file in candidates:
-        if not csv_file.exists() or csv_file.stat().st_size == 0:
-            continue
-
-        fallback_accuracy: float | None = None
-        with csv_file.open(newline="", encoding="utf-8") as f:
-            for row in csv.DictReader(f):
-                accuracy = _parse_accuracy_value(row.get("Accuracy") or row.get("accuracy") or row.get("Value"))
-                if accuracy is None:
-                    correct = _parse_count_value(row.get("Correct") or row.get("correct"))
-                    total = _parse_count_value(row.get("Total") or row.get("total"))
-                    if correct is not None and total:
-                        accuracy = correct / total
-                if accuracy is None:
-                    continue
-
-                task = (row.get("Task") or row.get("task") or row.get("Metric") or "").strip().upper()
-                if task == "OVERALL":
-                    return accuracy
-                if fallback_accuracy is None:
-                    fallback_accuracy = accuracy
-
-        if fallback_accuracy is not None:
-            return fallback_accuracy
-
-    return None
-
-
 class Args(BaseModel):
     """Arguments for custom workloads."""
 
@@ -536,9 +455,6 @@ def _has_aiperf_accuracy_results(self, output_path: Path) -> bool:
         logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}")
         return True
 
-    def _is_legacy_aiperf_accuracy_workload(self, workload: str) -> bool:
-        return workload == self.cmd_args.aiperf.script.src.name and self.cmd_args.aiperf.has_accuracy_benchmark
-
     def _was_workload_report_produced(self, output_path: Path, workload: str, workload_config: Workload) -> bool:
         report_name = workload_config.report_name
         if report_name is None:
@@ -559,9 +475,6 @@ def _was_workload_successful(self, output_path: Path, workload: str, workload_ma
             logging.info(f"Workload {workload} not found in workload map")
             return False
 
-        if self._is_legacy_aiperf_accuracy_workload(workload):
-            return self._has_aiperf_accuracy_results(output_path)
-
         return self._was_workload_report_produced(output_path, workload, workload_config)
 
     def _were_workloads_successful(self, output_path: Path) -> bool:
@@ -623,3 +536,84 @@ def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool:
         logging.info("constraint_check passed for: tp_times_pp_le_gpus_per_node")
 
         return True
+
+
+def _parse_accuracy_value(value: str | int | float | None) -> float | None:
+    if value is None:
+        return None
+    if isinstance(value, (int, float)):
+        accuracy = float(value)
+        return accuracy / 100 if accuracy > 1 else accuracy
+
+    raw_value = value.strip()
+    if not raw_value:
+        return None
+
+    is_percentage = raw_value.endswith("%")
+    if is_percentage:
+        raw_value = raw_value[:-1].strip()
+
+    try:
+        accuracy = float(raw_value)
+    except ValueError:
+        return None
+
+    return accuracy / 100 if is_percentage or accuracy > 1 else accuracy
+
+
+def _parse_count_value(value: str | int | float | None) -> float | None:
+    if value is None:
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    try:
+        return float(value.strip())
+    except ValueError:
+        return None
+
+
+def parse_aiperf_accuracy(output_path: Path) -> float | None:
+    """
+    Parse AIPerf accuracy from accuracy_results.csv.
+
+    Expected CSV format:
+        Task,Correct,Total,Accuracy
+        abstract_algebra,35,100,35.00%
+        OVERALL,8368,14042,59.59%
+
+    AIPerf writes this file under its artifact directory (aiperf_accuracy_artifacts
+    or aiperf_artifacts); CloudAI's wrapper also copies it to the run output
+    directory when present. The returned value is normalized to a 0.0-1.0 fraction.
+    """
+    candidates = [
+        output_path / AIPERF_ACCURACY_RESULTS_CSV,
+        output_path / AIPERF_ACCURACY_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV,
+        output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV,
+    ]
+
+    for csv_file in candidates:
+        if not csv_file.exists() or csv_file.stat().st_size == 0:
+            continue
+
+        fallback_accuracy: float | None = None
+        with csv_file.open(newline="", encoding="utf-8") as f:
+            for row in csv.DictReader(f):
+                accuracy = _parse_accuracy_value(row.get("Accuracy") or row.get("accuracy") or row.get("Value"))
+                if accuracy is None:
+                    correct = _parse_count_value(row.get("Correct") or row.get("correct"))
+                    total = _parse_count_value(row.get("Total") or row.get("total"))
+                    if correct is not None and total:
+                        accuracy = correct / total
+                if accuracy is None:
+                    continue
+
+                task = (row.get("Task") or row.get("task") or row.get("Metric") or "").strip().upper()
+                if task == "OVERALL":
+                    return accuracy
+                if fallback_accuracy is None:
+                    fallback_accuracy = accuracy
+
+        if fallback_accuracy is not None:
+            return fallback_accuracy
+
+    return None
diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
index 4b46cf5dc..a0ef92005 100644
--- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
@@ -50,7 +50,7 @@ def get_metric(self, metric: str) -> MetricValue:
             tdef = self.test_run.test
             if not isinstance(tdef, AIDynamoTestDefinition):
                 return METRIC_ERROR
-            if tdef.cmd_args.aiperf_accuracy is None and not tdef.cmd_args.aiperf.has_accuracy_benchmark:
+            if tdef.cmd_args.aiperf_accuracy is None:
                 return METRIC_ERROR
             accuracy = parse_aiperf_accuracy(self.test_run.output_path)
             return accuracy if accuracy is not None else METRIC_ERROR
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index 6a9274b8c..9b0f695d5 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -153,36 +153,6 @@ def test_dynamo_cmd(
     assert result.strip() == expected
 
 
-def test_gen_script_args_contains_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
-    td = cast(AIDynamoTestDefinition, strategy.test_run.test)
-    td.cmd_args.workloads = "aiperf.sh"
-    setup_cmd = "python -m pip install --break-system-packages --upgrade 'aiperf[accuracy]==0.8.0'"
-    extra_inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
-    td.cmd_args.aiperf = AIPerf.model_validate(
-        {
-            "setup-cmd": setup_cmd,
-            "args": {
-                "accuracy-benchmark": "mmlu",
-                "accuracy-n-shots": 5,
-                "accuracy-tasks": "abstract_algebra",
-                "concurrency": 10,
-                "extra-inputs": extra_inputs,
-                "num-requests": 100,
-            },
-        }
-    )
-
-    result = strategy._gen_script_args(td)
-
-    assert f'--aiperf-setup-cmd "{setup_cmd}"' in result
-    assert '--aiperf-args-accuracy-benchmark "mmlu"' in result
-    assert '--aiperf-args-accuracy-n-shots "5"' in result
-    assert '--aiperf-args-accuracy-tasks "abstract_algebra"' in result
-    assert '--aiperf-args-concurrency "10"' in result
-    assert f"--aiperf-args-extra-inputs '{extra_inputs}'" in result
-    assert '--aiperf-args-num-requests "100"' in result
-
-
 def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
     td = cast(AIDynamoTestDefinition, strategy.test_run.test)
     td.cmd_args.workloads = "aiperf.sh"
diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
index 1235ab3e6..9afdd6e72 100644
--- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py
+++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
@@ -126,32 +126,6 @@ def ai_dynamo_aiperf_tr(tmp_path: Path) -> TestRun:
     return tr
 
 
-@pytest.fixture
-def ai_dynamo_aiperf_accuracy_tr(tmp_path: Path) -> TestRun:
-    test = AIDynamoTestDefinition(
-        name="ai_dynamo_aiperf_accuracy",
-        description="desc",
-        test_template_name="t",
-        cmd_args=AIDynamoCmdArgs(
-            docker_image_url="http://url",
-            workloads="aiperf.sh",
-            dynamo=AIDynamoArgs(
-                prefill_worker=WorkerConfig(
-                    cmd="python3 -m dynamo.vllm --is-prefill-worker",
-                    worker_initialized_regex="VllmWorker.*has.been.initialized",
-                    args=WorkerBaseArgs(),
-                ),
-            ),
-            aiperf=AIPerf.model_validate({"args": {"accuracy-benchmark": "mmlu"}}),
-            lmcache=LMCache(args=LMCacheArgs()),
-        ),
-    )
-    tr = TestRun(name="ai_dynamo_aiperf_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path)
-    (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content())
-    (tr.output_path / test.success_marker).touch()
-    return tr
-
-
 @pytest.fixture
 def ai_dynamo_aiperf_with_split_accuracy_tr(tmp_path: Path) -> TestRun:
     test = AIDynamoTestDefinition(
@@ -252,12 +226,6 @@ def test_ai_dynamo_get_metric_aiperf(slurm_system: SlurmSystem, ai_dynamo_aiperf
     assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47
 
 
-def test_ai_dynamo_get_metric_aiperf_accuracy(slurm_system: SlurmSystem, ai_dynamo_aiperf_accuracy_tr: TestRun) -> None:
-    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_accuracy_tr)
-
-    assert strategy.get_metric("accuracy") == 0.35
-
-
 def test_ai_dynamo_get_metric_split_aiperf_accuracy(
     slurm_system: SlurmSystem, ai_dynamo_aiperf_with_split_accuracy_tr: TestRun
 ) -> None:
@@ -290,12 +258,6 @@ def test_was_run_successful(ai_dynamo_tr: TestRun) -> None:
     assert result.is_successful is True
 
 
-def test_was_run_successful_with_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None:
-    test_def = ai_dynamo_aiperf_accuracy_tr.test
-    result = test_def.was_run_successful(ai_dynamo_aiperf_accuracy_tr)
-    assert result.is_successful is True
-
-
 def test_was_run_successful_with_split_aiperf_accuracy(
     ai_dynamo_aiperf_with_split_accuracy_tr: TestRun,
 ) -> None:
@@ -312,13 +274,6 @@ def test_was_run_successful_with_genai_perf_and_split_aiperf_accuracy(
     assert result.is_successful is True
 
 
-def test_was_run_successful_requires_aiperf_accuracy(ai_dynamo_aiperf_accuracy_tr: TestRun) -> None:
-    test_def = ai_dynamo_aiperf_accuracy_tr.test
-    (ai_dynamo_aiperf_accuracy_tr.output_path / "accuracy_results.csv").unlink()
-    result = test_def.was_run_successful(ai_dynamo_aiperf_accuracy_tr)
-    assert result.is_successful is False
-
-
 def test_was_run_successful_requires_split_aiperf_accuracy(
     ai_dynamo_aiperf_with_split_accuracy_tr: TestRun,
 ) -> None:

From 6ecf52e0180131b23575525612dff71464d0dccd Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 27 May 2026 09:14:27 -0700
Subject: [PATCH 11/16] update sglang config

---
 conf/experimental/ai_dynamo/test/sglang.toml  |  4 +--
 .../ai_dynamo/test_scenario/sglang_slurm.toml | 27 +++++++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index 7d9930ecd..18e1681dd 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -32,7 +32,7 @@ workloads = "aiperf.sh"
     num-nodes = 1
     cmd = 'python3 -m dynamo.sglang'
     extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
-    worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
+    worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
 
       [cmd_args.dynamo.prefill_worker.args]
       page-size = 16
@@ -48,7 +48,7 @@ workloads = "aiperf.sh"
     num-nodes = 1
     cmd = 'python3 -m dynamo.sglang'
     extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
-    worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
+    worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
 
       [cmd_args.dynamo.decode_worker.args]
       page-size = 16
diff --git a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
index 26ed91285..4df1a6d64 100644
--- a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
+++ b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
@@ -15,11 +15,12 @@
 # limitations under the License.
 
 name = "dynamo_sglang"
+job_status_check = false
 
 [[Tests]]
-id = "sglang-Qwen3-0.6B"
+id = "test.disagg.single-node"
 test_name = "sglang"
-time_limit = "00:20:00"
+time_limit = "00:10:00"
 
   [Tests.cmd_args]
     [Tests.cmd_args.dynamo]
@@ -37,3 +38,25 @@ time_limit = "00:20:00"
 
         [Tests.cmd_args.dynamo.decode_worker.args]
         tensor-parallel-size = 1
+
+[[Tests]]
+id = "test.disagg.multinode"
+test_name = "sglang"
+time_limit = "00:10:00"
+
+  [Tests.cmd_args]
+    [Tests.cmd_args.dynamo]
+    model = "Qwen/Qwen3-0.6B"
+    node-setup-cmd = "hostname"
+
+      [Tests.cmd_args.dynamo.prefill_worker]
+      num-nodes = 2
+
+        [Tests.cmd_args.dynamo.prefill_worker.args]
+        tensor-parallel-size = 1
+
+      [Tests.cmd_args.dynamo.decode_worker]
+      num-nodes = 2
+
+        [Tests.cmd_args.dynamo.decode_worker.args]
+        tensor-parallel-size = 1

From e07a44f213ea6bcd2da0e6fab83f6b50756635fb Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 27 May 2026 09:35:42 -0700
Subject: [PATCH 12/16] trying to fix missing aiperf for sglang

---
 conf/experimental/ai_dynamo/test/sglang.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index 18e1681dd..b04deb92a 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -94,6 +94,8 @@ workloads = "aiperf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
+  setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+
     [cmd_args.aiperf.args]
     concurrency = 2
     extra-inputs = '{"min_tokens":10}'

From a3e70921e29b5dfb902a4f85ae7727cf5d87cf6d Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 27 May 2026 09:46:49 -0700
Subject: [PATCH 13/16] fixing sglang

---
 conf/experimental/ai_dynamo/test/sglang.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index b04deb92a..904c4a3ad 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -94,7 +94,7 @@ workloads = "aiperf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
-  setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+  setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
 
     [cmd_args.aiperf.args]
     concurrency = 2
@@ -104,7 +104,7 @@ workloads = "aiperf.sh"
     synthetic-input-tokens-mean = 300
 
   [cmd_args.aiperf_accuracy]
-  setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+  setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
 
     [cmd_args.aiperf_accuracy.args]
     accuracy-benchmark = "mmlu"

From 333b272ddeca51420fd3b720313c08782646ef01 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 27 May 2026 11:27:45 -0700
Subject: [PATCH 14/16] allowing custom scripts

---
 conf/experimental/ai_dynamo/test/sglang.toml  |  23 +--
 conf/experimental/ai_dynamo/test/vllm.toml    |  23 +--
 doc/workloads/ai_dynamo.rst                   |  47 +++++--
 src/cloudai/workloads/ai_dynamo/accuracy.sh   | 133 ++++++++++++++++++
 src/cloudai/workloads/ai_dynamo/ai_dynamo.py  |  33 ++---
 src/cloudai/workloads/ai_dynamo/aiperf.sh     |  77 +++++-----
 .../ai_dynamo/slurm_command_gen_strategy.py   |   3 +
 .../ai_dynamo/test_accuracy_script.py         | 123 ++++++++++++++++
 .../test_command_gen_strategy_slurm.py        |  49 +++++--
 .../ai_dynamo/test_report_gen_strategy.py     |   8 +-
 10 files changed, 414 insertions(+), 105 deletions(-)
 create mode 100644 src/cloudai/workloads/ai_dynamo/accuracy.sh
 create mode 100644 tests/workloads/ai_dynamo/test_accuracy_script.py

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index 904c4a3ad..37b2c392b 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -104,15 +104,22 @@ workloads = "aiperf.sh"
     synthetic-input-tokens-mean = 300
 
   [cmd_args.aiperf_accuracy]
+  entrypoint = "aiperf profile"
   setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
-
-    [cmd_args.aiperf_accuracy.args]
-    accuracy-benchmark = "mmlu"
-    accuracy-n-shots = 5
-    accuracy-tasks = "abstract_algebra"
-    concurrency = 10
-    extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
-    num-requests = 100
+  cli = '''
+--model {model}
+--url {url}
+--endpoint-type chat
+--streaming
+--artifact-dir {artifact_dir}
+--no-server-metrics
+--accuracy-benchmark mmlu
+--accuracy-n-shots 5
+--accuracy-tasks abstract_algebra
+--concurrency 10
+--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--num-requests 100
+'''
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index e314fe743..583d11a88 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -96,15 +96,22 @@ workloads = "aiperf.sh"
     synthetic-input-tokens-mean = 300
 
   [cmd_args.aiperf_accuracy]
+  entrypoint = "aiperf profile"
   setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
-
-    [cmd_args.aiperf_accuracy.args]
-    accuracy-benchmark = "mmlu"
-    accuracy-n-shots = 5
-    accuracy-tasks = "abstract_algebra"
-    concurrency = 10
-    extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
-    num-requests = 100
+  cli = '''
+--model {model}
+--url {url}
+--endpoint-type chat
+--streaming
+--artifact-dir {artifact_dir}
+--no-server-metrics
+--accuracy-benchmark mmlu
+--accuracy-n-shots 5
+--accuracy-tasks abstract_algebra
+--concurrency 10
+--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--num-requests 100
+'''
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 54aa4c252..c00449681 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -130,26 +130,53 @@ AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it
      concurrency = 2
 
    [cmd_args.aiperf_accuracy]
+   entrypoint = "aiperf profile"
    setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
-
-   [cmd_args.aiperf_accuracy.args]
-   accuracy-benchmark = "mmlu"
-   accuracy-n-shots = 5
-   accuracy-tasks = "abstract_algebra"
-   concurrency = 10
-   extra-inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
-   num-requests = 100
+   cli = '''
+   --model {model}
+   --url {url}
+   --endpoint-type chat
+   --streaming
+   --artifact-dir {artifact_dir}
+   --no-server-metrics
+   --accuracy-benchmark mmlu
+   --accuracy-n-shots 5
+   --accuracy-tasks abstract_algebra
+   --concurrency 10
+   --extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+   --num-requests 100
+   '''
 
 When ``cmd_args.aiperf_accuracy`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes
 the ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt
 and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark.
 
-The ``setup-cmd`` field is optional. It is useful for Dynamo images that include an older system ``aiperf`` build without
-the accuracy benchmark plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``.
+The ``entrypoint`` and ``cli`` fields form the accuracy command. CloudAI expands ``{model}``, ``{url}``,
+``{endpoint}``, ``{result_dir}``, and ``{artifact_dir}`` in ``cli`` before launching it. The ``setup-cmd`` field is
+optional. It is useful for Dynamo images that include an older system ``aiperf`` build without the accuracy benchmark
+plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``.
 MMLU is loaded from ``lighteval/mmlu``, so either allow Hugging Face dataset access or pre-cache that dataset before
 running with ``HF_HUB_OFFLINE``/``HF_DATASETS_OFFLINE`` enabled.
 For Qwen3 models, the example disables thinking mode so short MMLU answers can be parsed as choices.
 
+Custom Accuracy Scripts
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``cmd_args.aiperf_accuracy`` can also launch a custom mounted script instead of AIPerf. Mount the script or its parent
+directory with ``extra_container_mounts`` and set ``entrypoint`` to the in-container command:
+
+.. code-block:: toml
+
+   extra_container_mounts = ["/host/custom_accuracy:/custom_accuracy"]
+
+   [cmd_args.aiperf_accuracy]
+   entrypoint = "python /custom_accuracy/dummy_accuracy.py"
+   cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping"
+
+CloudAI expands placeholders in ``cli`` and runs ``entrypoint`` with that CLI string. The custom command must write
+``accuracy_results.csv`` inside ``{artifact_dir}`` with an ``OVERALL`` row. CloudAI copies that file to the run output
+directory and exposes the same ``accuracy`` metric as AIPerf accuracy mode.
+
 Review Benchmark Results
 ------------------------
 
diff --git a/src/cloudai/workloads/ai_dynamo/accuracy.sh b/src/cloudai/workloads/ai_dynamo/accuracy.sh
new file mode 100644
index 000000000..0e85ee109
--- /dev/null
+++ b/src/cloudai/workloads/ai_dynamo/accuracy.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -Eeuo pipefail
+
+result_dir=""  # --result-dir (required by main)
+model=""  # --model (required by main); substituted for {model}
+url="http://localhost"  # --url; main joins it with the port as "<url>:<port>"
+port=8000  # --port
+endpoint="v1/chat/completions"  # --endpoint; substituted for {endpoint}
+entrypoint=""  # --entrypoint (required by main); command placed in front of the expanded cli
+cli=""  # --cli; argument template containing {placeholder} fields
+setup_cmd=""  # --setup-cmd; optional, skipped when empty
+artifact_dir_name="aiperf_accuracy_artifacts"  # --artifact-dir-name; subdirectory of result_dir
+
+log() {  # Print "$*" on stdout behind a "[<date> <time> <hostname>]: " tag.
+  printf '[%s %s]: %s\n' "$(date '+%F %T')" "$(hostname)" "$*"
+}
+
+process_args() {  # Parse "--flag value" pairs from "$@" into the script-level settings, then log them.
+  while [[ $# -gt 0 ]]; do
+    case "$1" in  # an unrecognised --flag is dropped together with its value, if one follows
+      --result-dir)          result_dir="$2";        shift 2 ;;
+      --model)               model="$2";             shift 2 ;;
+      --url)                 url="$2";               shift 2 ;;
+      --port)                port="$2";              shift 2 ;;
+      --endpoint)            endpoint="$2";          shift 2 ;;
+      --entrypoint)          entrypoint="$2";        shift 2 ;;
+      --cli)                 cli="$2";               shift 2 ;;
+      --setup-cmd)           setup_cmd="$2";         shift 2 ;;
+      --artifact-dir-name)   artifact_dir_name="$2"; shift 2 ;;
+      --)                    shift; break ;;  # explicit end of options
+      --*)                   if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;;
+      *)                     shift ;;  # stray positional word: ignored
+    esac
+  done
+  # Log the resolved settings in one multi-line message.
+  log "Parsed args:
+    result_dir:    $result_dir
+    model:         $model
+    url:           $url
+    port:          $port
+    endpoint:      $endpoint
+    entrypoint:    $entrypoint
+    setup_cmd:     ${setup_cmd:-}
+    artifact_dir:  $artifact_dir_name
+    cli:           ${cli:-}"
+}
+
+run_setup_cmd() {
+  # Optional hook: run the configured setup command through `bash -lc`.
+  # An empty setup_cmd makes this function a no-op.
+  if [[ -n "$setup_cmd" ]]; then
+    log "Running accuracy setup command: $setup_cmd"
+    bash -lc "$setup_cmd"
+    log "Accuracy setup command complete"
+  fi
+}
+
+expand_cli() {  # usage: expand_cli <artifact_dir> <full_url>; prints the expanded cli on stdout
+  local artifact_dir="$1"
+  local full_url="$2"
+  local expanded="$cli"
+  # Replacement values are quoted so bash >= 5.2 (patsub_replacement) keeps any '&' in them literal.
+  expanded=${expanded//\{model\}/"$model"}
+  expanded=${expanded//\{url\}/"$full_url"}
+  expanded=${expanded//\{endpoint\}/"$endpoint"}
+  expanded=${expanded//\{result_dir\}/"$result_dir"}
+  expanded=${expanded//\{artifact_dir\}/"$artifact_dir"}
+  expanded="${expanded//$'\n'/ }"  # fold a multi-line template onto a single line
+  # printf rather than echo, so a result of exactly -n/-e/-E is not consumed as an echo option.
+  printf '%s\n' "$expanded"
+}
+
+copy_accuracy_results() {  # usage: copy_accuracy_results <artifact_dir>
+  local produced="$1/accuracy_results.csv"
+  local published="$result_dir/accuracy_results.csv"
+  # -s: the report has to exist and be non-empty, otherwise the script exits with status 1.
+  if [[ ! -s "$produced" ]]; then
+    log "ERROR: accuracy benchmark was requested but $produced was not produced"
+    exit 1
+  fi
+  # Copy the report to the top level of the result directory.
+  cp "$produced" "$published"
+  log "accuracy report saved to $published"
+}
+
+main() {  # Entry point: parse options, run setup, launch the accuracy command, copy its CSV.
+  process_args "$@"
+  # Fail fast when a mandatory option is missing.
+  if [[ -z "$result_dir" ]]; then
+    log "ERROR: --result-dir is required"; exit 1
+  fi
+  if [[ -z "$model" ]]; then
+    log "ERROR: --model is required"; exit 1
+  fi
+  if [[ -z "$entrypoint" ]]; then
+    log "ERROR: --entrypoint is required"; exit 1
+  fi
+
+  run_setup_cmd
+  # Start every run from an empty artifact directory.
+  local full_url="${url}:${port}"
+  local artifact_dir="$result_dir/$artifact_dir_name"
+  rm -rf "$artifact_dir"
+  mkdir -p "$artifact_dir"
+
+  local expanded_cli
+  expanded_cli="$(expand_cli "$artifact_dir" "$full_url")"
+  # The entrypoint and expanded cli are joined into one string and interpreted by `bash -lc`.
+  log "Launching accuracy command: $entrypoint $expanded_cli"
+  bash -lc "$entrypoint $expanded_cli"
+  log "accuracy command complete"
+  # Exits with status 1 when the command left no non-empty accuracy_results.csv behind.
+  copy_accuracy_results "$artifact_dir"
+}
+
+main "$@"
+exit 0
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index 8c5dcaad1..35da5b782 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -310,26 +310,20 @@ class AIPerf(Workload):
     def installables(self) -> list[Installable]:
         return [self.script]
 
-    @property
-    def has_accuracy_benchmark(self) -> bool:
-        args_extra = getattr(self.args, "model_extra", {}) or {}
-        if args_extra.get("accuracy-benchmark") or args_extra.get("accuracy_benchmark"):
-            return True
 
-        extra_args = self.extra_args or ""
-        if isinstance(extra_args, list):
-            return "--accuracy-benchmark" in extra_args
-        return "--accuracy-benchmark" in extra_args
+class AIPerfAccuracy(BaseModel):
+    """Optional accuracy benchmark configuration."""
 
-
-class AIPerfAccuracy(AIPerf):
-    """Optional AIPerf accuracy benchmark configuration."""
+    model_config = ConfigDict(extra="forbid", populate_by_name=True)
 
     name: str = "aiperf_accuracy"
-    report_name: str = Field(
-        default="aiperf_accuracy_report.csv",
-        serialization_alias="report-name",
-        validation_alias=AliasChoices("report-name", "report_name"),
+    entrypoint: str = "aiperf profile"
+    cli: str
+    script: File = File(Path(__file__).parent.parent / "ai_dynamo/accuracy.sh")
+    setup_cmd: str | None = Field(
+        default=None,
+        serialization_alias="setup-cmd",
+        validation_alias=AliasChoices("setup-cmd", "setup_cmd"),
     )
     artifact_dir_name: str = Field(
         default=AIPERF_ACCURACY_ARTIFACTS_DIR,
@@ -337,6 +331,10 @@ class AIPerfAccuracy(AIPerf):
         validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"),
     )
 
+    @property
+    def installables(self) -> list[Installable]:
+        return [self.script]
+
 
 class Constraints(BaseModel):
     """Constraints for validation of AI Dynamo configurations when using DSE."""
@@ -410,9 +408,6 @@ def workload_scripts(self) -> "AIDynamoTestDefinition":
             if workload not in workload_map:
                 raise ValueError(f"Invalid workload: {workload}. Available workloads: {list(workload_map.keys())}")
 
-        if self.cmd_args.aiperf_accuracy is not None and not self.cmd_args.aiperf_accuracy.has_accuracy_benchmark:
-            raise ValueError("cmd_args.aiperf_accuracy must configure an AIPerf --accuracy-benchmark argument")
-
         return self
 
     def get_workload_map(self) -> dict[str, Workload]:
diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh
index bf08f869d..15cee3a58 100644
--- a/src/cloudai/workloads/ai_dynamo/aiperf.sh
+++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh
@@ -49,7 +49,7 @@ artifact_dir_name="aiperf_artifacts"
 cmd="aiperf profile"
 setup_cmd=""
 declare -a extra_args=()
-declare -a aiperf_profile_args=()
+declare -a profile_args=()
 
 log() {
   echo "[$(date '+%F %T') $(hostname)]: $*"
@@ -58,27 +58,17 @@ log() {
 _parse_aiperf_args() {
   while [[ $# -ge 2 ]]; do
     case "$1" in
-      --*) aiperf_profile_args+=("$1" "$2"); shift 2 ;;
+      --*) profile_args+=("$1" "$2"); shift 2 ;;
       *)   shift ;;
     esac
   done
   # Capture a trailing lone boolean flag if present.
   # Use if/fi — not [[ ]] && — so set -e does not trigger on a false condition.
   if [[ $# -eq 1 && "$1" == --* ]]; then
-    aiperf_profile_args+=("$1")
+    profile_args+=("$1")
   fi
 }
 
-has_accuracy_benchmark() {
-  local arg
-  for arg in "${aiperf_profile_args[@]}" "${extra_args[@]}"; do
-    if [[ "$arg" == "--accuracy-benchmark" ]]; then
-      return 0
-    fi
-  done
-  return 1
-}
-
 process_args() {
   while [[ $# -gt 0 ]]; do
     case "$1" in
@@ -107,7 +97,7 @@ process_args() {
     cmd:          $cmd
     setup_cmd:    ${setup_cmd:-}
     extra_args:   ${extra_args[*]:-}
-    profile_args: ${aiperf_profile_args[*]:-}"
+    profile_args: ${profile_args[*]:-}"
 }
 
 run_setup_cmd() {
@@ -123,18 +113,6 @@ run_setup_cmd() {
 process_results() {
   local artifact_dir="$result_dir/$artifact_dir_name"
   local csv_path=""
-  local accuracy_path="$artifact_dir/accuracy_results.csv"
-
-  if has_accuracy_benchmark; then
-    if [[ ! -s "$accuracy_path" ]]; then
-      log "ERROR: AIPerf accuracy benchmark was requested but $accuracy_path was not produced"
-      exit 1
-    fi
-
-    cp "$accuracy_path" "$result_dir/accuracy_results.csv"
-    log "aiperf accuracy report saved to $result_dir/accuracy_results.csv"
-    return 0
-  fi
 
   if [[ -f "$artifact_dir/profile_export_aiperf.csv" ]]; then
     csv_path="$artifact_dir/profile_export_aiperf.csv"
@@ -152,6 +130,35 @@ process_results() {
 
 }
 
+run_aiperf() {  # usage: run_aiperf <full_url> <artifact_dir>
+  local full_url="$1"
+  local artifact_dir="$2"
+  local -a run_cmd=()
+  read -ra run_cmd <<< "$cmd"  # split cmd into words, e.g. "aiperf profile" -> (aiperf profile)
+  local -a launch_cmd=(
+    "${run_cmd[@]}"
+    --model "$model"
+    --url "$full_url"
+    --endpoint-type chat
+    --streaming
+    --artifact-dir "$artifact_dir"
+    --no-server-metrics
+  )
+
+  log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url"
+  # Append the optional argument groups only when they are non-empty.
+  if [[ "${#profile_args[@]}" -gt 0 ]]; then
+    launch_cmd+=("${profile_args[@]}")
+  fi
+  if [[ "${#extra_args[@]}" -gt 0 ]]; then
+    launch_cmd+=("${extra_args[@]}")
+  fi
+  # Run the assembled array directly, one array element per argument.
+  "${launch_cmd[@]}"
+
+  log "aiperf run complete"
+}
+
 main() {
   process_args "$@"
 
@@ -168,23 +175,7 @@ main() {
   local artifact_dir="$result_dir/$artifact_dir_name"
   rm -rf "$artifact_dir"
 
-  # Split cmd into an array (e.g. "aiperf profile" → ["aiperf", "profile"])
-  local -a run_cmd=()
-  read -ra run_cmd <<< "$cmd"
-
-  log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url"
-
-  "${run_cmd[@]}" \
-    --model         "$model" \
-    --url           "$full_url" \
-    --endpoint-type chat \
-    --streaming \
-    --artifact-dir  "$artifact_dir" \
-    --no-server-metrics \
-    "${aiperf_profile_args[@]}" \
-    "${extra_args[@]}"
-
-  log "aiperf run complete"
+  run_aiperf "$full_url" "$artifact_dir"
   process_results
 }
 
diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
index 51e704102..c1a817853 100644
--- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 
 import logging
+import shlex
 from pathlib import Path
 from typing import List, cast
 
@@ -71,6 +72,8 @@ def _get_toml_args(self, base_model: BaseModel, prefix: str, exclude: List[str]
             str_v = str(v)
             if str_v.startswith("{") and str_v.endswith("}"):
                 args.append(f"{prefix}{k} '{str_v}'")
+            elif any(char in str_v for char in ['"', "'", "\n"]):
+                args.append(f"{prefix}{k} {shlex.quote(str_v)}")
             else:
                 args.append(f'{prefix}{k} "{v}"')
 
diff --git a/tests/workloads/ai_dynamo/test_accuracy_script.py b/tests/workloads/ai_dynamo/test_accuracy_script.py
new file mode 100644
index 000000000..a6e3b8246
--- /dev/null
+++ b/tests/workloads/ai_dynamo/test_accuracy_script.py
@@ -0,0 +1,123 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+ACCURACY_SCRIPT = Path("src/cloudai/workloads/ai_dynamo/accuracy.sh")
+
+
+def test_accuracy_script_runs_custom_accuracy_command(tmp_path: Path) -> None:
+    custom_script = tmp_path / "custom_accuracy.py"
+    custom_script.write_text(
+        """
+import argparse
+import csv
+import json
+from pathlib import Path
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model", required=True)
+parser.add_argument("--url", required=True)
+parser.add_argument("--endpoint", required=True)
+parser.add_argument("--result-dir", required=True)
+parser.add_argument("--artifact-dir", required=True)
+parser.add_argument("--prompt", required=True)
+args = parser.parse_args()
+
+artifact_dir = Path(args.artifact_dir)
+artifact_dir.mkdir(parents=True, exist_ok=True)
+(artifact_dir / "args.json").write_text(json.dumps(vars(args)), encoding="utf-8")
+with (artifact_dir / "accuracy_results.csv").open("w", newline="", encoding="utf-8") as f:
+    writer = csv.writer(f)
+    writer.writerow(["task", "correct", "total", "accuracy"])
+    writer.writerow(["OVERALL", 1, 1, "100.00%"])
+""",
+        encoding="utf-8",
+    )
+    # Run accuracy.sh with the stub script above as the entrypoint and a cli using every placeholder.
+    result = subprocess.run(
+        [
+            "bash",
+            str(ACCURACY_SCRIPT),
+            "--result-dir",
+            str(tmp_path),
+            "--model",
+            "Qwen/Qwen3-0.6B",
+            "--url",
+            "http://frontend",
+            "--port",
+            "8000",
+            "--endpoint",
+            "v1/chat/completions",
+            "--entrypoint",
+            f"{sys.executable} {custom_script}",
+            "--cli",
+            (
+                "--model {model} --url {url} --endpoint {endpoint} "
+                "--result-dir {result_dir} --artifact-dir {artifact_dir} --prompt ping"
+            ),
+            "--artifact-dir-name",
+            "custom_accuracy_artifacts",
+        ],
+        check=False,
+        capture_output=True,
+        text=True,
+    )
+    # The stub's CSV must be copied into the result dir and every placeholder expanded.
+    assert result.returncode == 0, result.stderr + result.stdout
+    assert (tmp_path / "accuracy_results.csv").read_text(encoding="utf-8").splitlines()[-1] == "OVERALL,1,1,100.00%"
+    args = json.loads((tmp_path / "custom_accuracy_artifacts" / "args.json").read_text(encoding="utf-8"))
+    assert args == {
+        "model": "Qwen/Qwen3-0.6B",
+        "url": "http://frontend:8000",
+        "endpoint": "v1/chat/completions",
+        "result_dir": str(tmp_path),
+        "artifact_dir": str(tmp_path / "custom_accuracy_artifacts"),
+        "prompt": "ping",
+    }
+
+
+def test_accuracy_script_fails_custom_accuracy_without_accuracy_csv(tmp_path: Path) -> None:
+    custom_script = tmp_path / "custom_accuracy.py"
+    custom_script.write_text("from pathlib import Path\nPath(__file__).exists()\n", encoding="utf-8")
+    # The stub above never writes accuracy_results.csv.
+    result = subprocess.run(
+        [
+            "bash",
+            str(ACCURACY_SCRIPT),
+            "--result-dir",
+            str(tmp_path),
+            "--model",
+            "Qwen/Qwen3-0.6B",
+            "--url",
+            "http://frontend",
+            "--port",
+            "8000",
+            "--entrypoint",
+            f"{sys.executable} {custom_script}",
+            "--cli",
+            "--artifact-dir {artifact_dir}",
+        ],
+        check=False,
+        capture_output=True,
+        text=True,
+    )
+    # accuracy.sh must exit with status 1 and report the missing file on stdout.
+    assert result.returncode == 1
+    assert "accuracy benchmark was requested" in result.stdout
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index 9b0f695d5..7b036b5a8 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import shlex
 from pathlib import Path
 from typing import cast
 
@@ -157,7 +158,20 @@ def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoS
     td = cast(AIDynamoTestDefinition, strategy.test_run.test)
     td.cmd_args.workloads = "aiperf.sh"
     setup_cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
-    extra_inputs = '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+    cli = (
+        "--model {model} "
+        "--url {url} "
+        "--endpoint-type chat "
+        "--streaming "
+        "--artifact-dir {artifact_dir} "
+        "--no-server-metrics "
+        "--accuracy-benchmark mmlu "
+        "--accuracy-n-shots 5 "
+        "--accuracy-tasks abstract_algebra "
+        "--concurrency 10 "
+        '--extra-inputs \'{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}\' '
+        "--num-requests 100"
+    )
     td.cmd_args.aiperf = AIPerf.model_validate(
         {
             "args": {
@@ -171,14 +185,7 @@ def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoS
     td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate(
         {
             "setup-cmd": setup_cmd,
-            "args": {
-                "accuracy-benchmark": "mmlu",
-                "accuracy-n-shots": 5,
-                "accuracy-tasks": "abstract_algebra",
-                "concurrency": 10,
-                "extra-inputs": extra_inputs,
-                "num-requests": 100,
-            },
+            "cli": cli,
         }
     )
 
@@ -189,13 +196,25 @@ def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoS
     assert '--aiperf-args-output-tokens-mean "500"' in result
     assert f'--aiperf_accuracy-setup-cmd "{setup_cmd}"' in result
     assert '--aiperf_accuracy-name "aiperf_accuracy"' in result
+    assert '--aiperf_accuracy-entrypoint "aiperf profile"' in result
     assert '--aiperf_accuracy-artifact-dir-name "aiperf_accuracy_artifacts"' in result
-    assert '--aiperf_accuracy-args-accuracy-benchmark "mmlu"' in result
-    assert '--aiperf_accuracy-args-accuracy-n-shots "5"' in result
-    assert '--aiperf_accuracy-args-accuracy-tasks "abstract_algebra"' in result
-    assert '--aiperf_accuracy-args-concurrency "10"' in result
-    assert f"--aiperf_accuracy-args-extra-inputs '{extra_inputs}'" in result
-    assert '--aiperf_accuracy-args-num-requests "100"' in result
+    assert f"--aiperf_accuracy-cli {shlex.quote(cli)}" in result
+
+
+def test_gen_script_args_contains_custom_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
+    td = cast(AIDynamoTestDefinition, strategy.test_run.test)
+    cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping"
+    td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate(
+        {
+            "entrypoint": "python /custom_accuracy/dummy_accuracy.py",
+            "cli": cli,
+        }
+    )
+
+    result = strategy._gen_script_args(td)
+
+    assert '--aiperf_accuracy-entrypoint "python /custom_accuracy/dummy_accuracy.py"' in result
+    assert f'--aiperf_accuracy-cli "{cli}"' in result
 
 
 def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
index 9afdd6e72..47e214421 100644
--- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py
+++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
@@ -68,6 +68,10 @@ def get_aiperf_accuracy_csv_content() -> str:
     return "Task,Correct,Total,Accuracy\nabstract_algebra,35,100,35.00%\nOVERALL,35,100,35.00%\n"
 
 
+def get_aiperf_accuracy_cli() -> str:
+    return "--model {model} --url {url} --artifact-dir {artifact_dir} --accuracy-benchmark mmlu"
+
+
 @pytest.fixture
 def ai_dynamo_tr(tmp_path: Path) -> TestRun:
     test = AIDynamoTestDefinition(
@@ -143,7 +147,7 @@ def ai_dynamo_aiperf_with_split_accuracy_tr(tmp_path: Path) -> TestRun:
                 ),
             ),
             aiperf=AIPerf(),
-            aiperf_accuracy=AIPerfAccuracy.model_validate({"args": {"accuracy-benchmark": "mmlu"}}),
+            aiperf_accuracy=AIPerfAccuracy.model_validate({"cli": get_aiperf_accuracy_cli()}),
             lmcache=LMCache(args=LMCacheArgs()),
         ),
     )
@@ -171,7 +175,7 @@ def ai_dynamo_genai_perf_with_split_accuracy_tr(tmp_path: Path) -> TestRun:
                 ),
             ),
             genai_perf=GenAIPerf(),
-            aiperf_accuracy=AIPerfAccuracy.model_validate({"args": {"accuracy-benchmark": "mmlu"}}),
+            aiperf_accuracy=AIPerfAccuracy.model_validate({"cli": get_aiperf_accuracy_cli()}),
             lmcache=LMCache(args=LMCacheArgs()),
         ),
     )

From 1c99d609195007c5ada53b79276045f98bb278a3 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 27 May 2026 11:58:55 -0700
Subject: [PATCH 15/16] remove redundant test

---
 .../ai_dynamo/test_accuracy_script.py         | 123 ------------------
 1 file changed, 123 deletions(-)
 delete mode 100644 tests/workloads/ai_dynamo/test_accuracy_script.py

diff --git a/tests/workloads/ai_dynamo/test_accuracy_script.py b/tests/workloads/ai_dynamo/test_accuracy_script.py
deleted file mode 100644
index a6e3b8246..000000000
--- a/tests/workloads/ai_dynamo/test_accuracy_script.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import subprocess
-import sys
-from pathlib import Path
-
-ACCURACY_SCRIPT = Path("src/cloudai/workloads/ai_dynamo/accuracy.sh")
-
-
-def test_accuracy_script_runs_custom_accuracy_command(tmp_path: Path) -> None:
-    custom_script = tmp_path / "custom_accuracy.py"
-    custom_script.write_text(
-        """
-import argparse
-import csv
-import json
-from pathlib import Path
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--model", required=True)
-parser.add_argument("--url", required=True)
-parser.add_argument("--endpoint", required=True)
-parser.add_argument("--result-dir", required=True)
-parser.add_argument("--artifact-dir", required=True)
-parser.add_argument("--prompt", required=True)
-args = parser.parse_args()
-
-artifact_dir = Path(args.artifact_dir)
-artifact_dir.mkdir(parents=True, exist_ok=True)
-(artifact_dir / "args.json").write_text(json.dumps(vars(args)), encoding="utf-8")
-with (artifact_dir / "accuracy_results.csv").open("w", newline="", encoding="utf-8") as f:
-    writer = csv.writer(f)
-    writer.writerow(["task", "correct", "total", "accuracy"])
-    writer.writerow(["OVERALL", 1, 1, "100.00%"])
-""",
-        encoding="utf-8",
-    )
-
-    result = subprocess.run(
-        [
-            "bash",
-            str(ACCURACY_SCRIPT),
-            "--result-dir",
-            str(tmp_path),
-            "--model",
-            "Qwen/Qwen3-0.6B",
-            "--url",
-            "http://frontend",
-            "--port",
-            "8000",
-            "--endpoint",
-            "v1/chat/completions",
-            "--entrypoint",
-            f"{sys.executable} {custom_script}",
-            "--cli",
-            (
-                "--model {model} --url {url} --endpoint {endpoint} "
-                "--result-dir {result_dir} --artifact-dir {artifact_dir} --prompt ping"
-            ),
-            "--artifact-dir-name",
-            "custom_accuracy_artifacts",
-        ],
-        check=False,
-        capture_output=True,
-        text=True,
-    )
-
-    assert result.returncode == 0, result.stderr + result.stdout
-    assert (tmp_path / "accuracy_results.csv").read_text(encoding="utf-8").splitlines()[-1] == "OVERALL,1,1,100.00%"
-    args = json.loads((tmp_path / "custom_accuracy_artifacts" / "args.json").read_text(encoding="utf-8"))
-    assert args == {
-        "model": "Qwen/Qwen3-0.6B",
-        "url": "http://frontend:8000",
-        "endpoint": "v1/chat/completions",
-        "result_dir": str(tmp_path),
-        "artifact_dir": str(tmp_path / "custom_accuracy_artifacts"),
-        "prompt": "ping",
-    }
-
-
-def test_accuracy_script_fails_custom_accuracy_without_accuracy_csv(tmp_path: Path) -> None:
-    custom_script = tmp_path / "custom_accuracy.py"
-    custom_script.write_text("from pathlib import Path\nPath(__file__).exists()\n", encoding="utf-8")
-
-    result = subprocess.run(
-        [
-            "bash",
-            str(ACCURACY_SCRIPT),
-            "--result-dir",
-            str(tmp_path),
-            "--model",
-            "Qwen/Qwen3-0.6B",
-            "--url",
-            "http://frontend",
-            "--port",
-            "8000",
-            "--entrypoint",
-            f"{sys.executable} {custom_script}",
-            "--cli",
-            "--artifact-dir {artifact_dir}",
-        ],
-        check=False,
-        capture_output=True,
-        text=True,
-    )
-
-    assert result.returncode == 1
-    assert "accuracy benchmark was requested" in result.stdout

From 1d09c7d845b7036c28c6d80302af846eba845088 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 27 May 2026 12:06:07 -0700
Subject: [PATCH 16/16] support custom scripts for vllm and sglang

---
 conf/experimental/sglang/test/sglang.toml     |  4 ++--
 conf/experimental/vllm/test/vllm.toml         |  4 ++--
 doc/workloads/sglang.rst                      | 17 +++++++-------
 doc/workloads/vllm.rst                        | 13 ++++++-----
 src/cloudai/workloads/common/llm_serving.py   |  2 ++
 src/cloudai/workloads/sglang/sglang.py        |  6 +++--
 .../sglang/slurm_command_gen_strategy.py      | 13 ++---------
 .../vllm/slurm_command_gen_strategy.py        | 12 ++--------
 src/cloudai/workloads/vllm/vllm.py            |  6 +++--
 .../sglang/test_command_gen_strategy_slurm.py | 16 +++++--------
 .../vllm/test_command_gen_strategy_slurm.py   | 23 +++++++++++++++----
 11 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml
index e6d2c09b4..2866f656c 100644
--- a/conf/experimental/sglang/test/sglang.toml
+++ b/conf/experimental/sglang/test/sglang.toml
@@ -22,8 +22,8 @@ test_template_name = "sglang"
 docker_image_url = "lmsysorg/sglang:dev-cu13"
 
 [semantic_eval_cmd_args]
-module = "sglang.test.run_eval"
-args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+entrypoint = "python3 -m sglang.test.run_eval"
+cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
 [extra_env_vars]
 UCX_NET_DEVICES = "all"
diff --git a/conf/experimental/vllm/test/vllm.toml b/conf/experimental/vllm/test/vllm.toml
index 891023201..a8061099c 100644
--- a/conf/experimental/vllm/test/vllm.toml
+++ b/conf/experimental/vllm/test/vllm.toml
@@ -27,8 +27,8 @@ mount_as = "/vllm_repo"
 docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1"
 
 [semantic_eval_cmd_args]
-script = "/vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
-args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
+cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 [extra_env_vars]
 UCX_NET_DEVICES = "all"
diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst
index d0561c773..cdbd5cff1 100644
--- a/doc/workloads/sglang.rst
+++ b/doc/workloads/sglang.rst
@@ -29,8 +29,8 @@ Test + Scenario example
    num_prompts = 30
 
    [semantic_eval_cmd_args]
-   module = "sglang.test.run_eval"
-   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+   entrypoint = "python3 -m sglang.test.run_eval"
+   cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
 
 .. code-block:: toml
@@ -81,18 +81,19 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva
    :caption: test.toml (semantic validation)
 
    [semantic_eval_cmd_args]
-   module = "sglang.test.run_eval"
-   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+   entrypoint = "python3 -m sglang.test.run_eval"
+   cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
-For images that still use the legacy SGLang GSM8K runner, override the module and raw arguments:
+For images that still use the legacy SGLang GSM8K runner, override the entrypoint and raw CLI:
 
 .. code-block:: toml
 
    [semantic_eval_cmd_args]
-   module = "sglang.test.few_shot_gsm8k"
-   args = "--num-questions 200"
+   entrypoint = "python3 -m sglang.test.few_shot_gsm8k"
+   cli = "--host {host} --port {port} --num-questions 200"
 
-The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}`` (expands to ``{host}:{port}``),
+``{output_path}``, and ``{result_dir}`` (same value as ``{output_path}``) placeholders.
 
 
 Control number of GPUs
diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst
index 930bcf11b..57773992f 100644
--- a/doc/workloads/vllm.rst
+++ b/doc/workloads/vllm.rst
@@ -29,8 +29,8 @@ Test and Scenario Examples
    num_prompts = 30
 
    [semantic_eval_cmd_args]
-   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
-   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+   entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 
 .. code-block:: toml
@@ -81,13 +81,14 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva
    :caption: test.toml (semantic validation)
 
    [semantic_eval_cmd_args]
-   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
-   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+   entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 If the runtime image does not contain the eval script, mount a vLLM repository with existing ``git_repos`` support and
-point ``script`` at the mounted path.
+point the script path in ``entrypoint`` at the mounted location.
 
-The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}`` (expands to ``{host}:{port}``),
+``{output_path}``, and ``{result_dir}`` (same value as ``{output_path}``) placeholders.
 
 
 Controlling the Number of GPUs
diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py
index 87ad7b3a3..30a6943c1 100644
--- a/src/cloudai/workloads/common/llm_serving.py
+++ b/src/cloudai/workloads/common/llm_serving.py
@@ -624,7 +624,9 @@ def _expand_semantic_eval_args(self, args: str, *, host: str) -> str:
             "{model}": self.tdef.cmd_args.model,
             "{host}": host,
             "{port}": str(self.serve_port),
+            "{url}": f"{host}:{self.serve_port}",
             "{output_path}": str(self.test_run.output_path.absolute()),
+            "{result_dir}": str(self.test_run.output_path.absolute()),
         }
         for placeholder, value in replacements.items():
             args = args.replace(placeholder, value)
diff --git a/src/cloudai/workloads/sglang/sglang.py b/src/cloudai/workloads/sglang/sglang.py
index 338bbfecc..49a7af140 100644
--- a/src/cloudai/workloads/sglang/sglang.py
+++ b/src/cloudai/workloads/sglang/sglang.py
@@ -92,8 +92,10 @@ class SglangBenchCmdArgs(CmdArgs):
 class SglangSemanticEvalCmdArgs(CmdArgs):
     """SGLang semantic validation command arguments."""
 
-    module: str = "sglang.test.run_eval"
-    args: str = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+    model_config = ConfigDict(extra="forbid")
+
+    entrypoint: str = "python3 -m sglang.test.run_eval"
+    cli: str = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
 
 class SglangTestDefinition(LLMServingTestDefinition[SglangCmdArgs]):
diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
index f1c7c741c..7a7a97d5b 100644
--- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
@@ -130,17 +130,8 @@ def get_semantic_eval_command(self) -> list[str] | None:
             return None
 
         host = self.bench_host
-        command = [
-            "python3",
-            "-m",
-            eval_args.module,
-            f"--host {host}",
-            f"--port {self.serve_port}",
-        ]
-        args = self._expand_semantic_eval_args(eval_args.args, host=host)
-        if args:
-            command.append(args)
-        return command
+        cli = self._expand_semantic_eval_args(eval_args.cli, host=host)
+        return [eval_args.entrypoint, cli] if cli else [eval_args.entrypoint]
 
     def aggregated_serve_env(self) -> dict[str, str]:
         return {"CUDA_VISIBLE_DEVICES": ",".join(str(gpu_id) for gpu_id in self.gpu_ids)}
diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
index 13d87ad77..2f00e95f7 100644
--- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
@@ -130,13 +130,5 @@ def get_semantic_eval_command(self) -> list[str] | None:
 
         host = self.bench_host
         http_host = host if host.startswith("http://") or host.startswith("https://") else f"http://{host}"
-        command = [
-            "python3",
-            eval_args.script,
-            f"--host {http_host}",
-            f"--port {self.serve_port}",
-        ]
-        args = self._expand_semantic_eval_args(eval_args.args, host=http_host)
-        if args:
-            command.append(args)
-        return command
+        cli = self._expand_semantic_eval_args(eval_args.cli, host=http_host)
+        return [eval_args.entrypoint, cli] if cli else [eval_args.entrypoint]
diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py
index d2fda3ab5..f77039edc 100644
--- a/src/cloudai/workloads/vllm/vllm.py
+++ b/src/cloudai/workloads/vllm/vllm.py
@@ -92,8 +92,10 @@ class VllmBenchCmdArgs(CmdArgs):
 class VllmSemanticEvalCmdArgs(CmdArgs):
     """vLLM semantic validation command arguments."""
 
-    script: str = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
-    args: str = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+    model_config = ConfigDict(extra="forbid")
+
+    entrypoint: str = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+    cli: str = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 
 class VllmTestDefinition(LLMServingTestDefinition[VllmCmdArgs]):
diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py
index 7d2812580..c07d1771d 100644
--- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py
@@ -150,28 +150,24 @@ def test_get_sglang_semantic_eval_command_defaults(sglang_cmd_gen_strategy: Sgla
     command = sglang_cmd_gen_strategy.get_semantic_eval_command()
 
     assert command == [
-        "python3",
-        "-m",
-        "sglang.test.run_eval",
-        "--host ${NODE}",
-        "--port 8000",
-        "--eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B",
+        "python3 -m sglang.test.run_eval",
+        "--host ${NODE} --port 8000 --eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B",
     ]
 
 
-def test_get_sglang_semantic_eval_command_supports_custom_module_and_args(
+def test_get_sglang_semantic_eval_command_supports_custom_entrypoint_and_cli(
     sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy,
 ):
     sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test)
     sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs(
-        module="sglang.test.few_shot_gsm8k",
-        args="--num-questions 200 --data-path {output_path}/gsm8k.jsonl --seen {host}:{port}",
+        entrypoint="python3 /custom/semantic_eval.py",
+        cli="--num-questions 200 --data-path {result_dir}/gsm8k.jsonl --seen {url}",
     )
 
     command = sglang_cmd_gen_strategy.get_semantic_eval_command()
 
     assert command is not None
-    assert command[2] == "sglang.test.few_shot_gsm8k"
+    assert command[0] == "python3 /custom/semantic_eval.py"
     assert command[-1] == (
         f"--num-questions 200 --data-path {sglang_cmd_gen_strategy.test_run.output_path.absolute()}/gsm8k.jsonl "
         "--seen ${NODE}:8000"
diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py
index 6bd6ada36..6eb62483c 100644
--- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py
@@ -193,14 +193,29 @@ def test_get_vllm_semantic_eval_command_defaults(self, vllm_cmd_gen_strategy: Vl
         command = vllm_cmd_gen_strategy.get_semantic_eval_command()
 
         assert command == [
-            "python3",
-            "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py",
-            "--host http://${NODE}",
-            "--port 8000",
+            "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py",
+            "--host http://${NODE} --port 8000 "
             "--num-questions 200 --save-results "
             f"{vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json",
         ]
 
+    def test_get_vllm_semantic_eval_command_supports_custom_entrypoint_and_cli(
+        self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy
+    ) -> None:
+        vllm_test = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test)
+        vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs(
+            entrypoint="python3 /custom/eval.py",
+            cli="--model {model} --api {url} --out {result_dir}/vllm-gsm8k.json",
+        )
+
+        command = vllm_cmd_gen_strategy.get_semantic_eval_command()
+
+        assert command == [
+            "python3 /custom/eval.py",
+            f"--model Qwen/Qwen3-0.6B --api http://${{NODE}}:8000 "
+            f"--out {vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json",
+        ]
+
     def test_gen_srun_command_contains_vllm_semantic_eval(
         self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy
     ) -> None: