diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index 62f8f615d..37b2c392b 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -20,8 +20,8 @@ test_template_name = "AIDynamo"
 extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
 
 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
-workloads = "genai_perf.sh"
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.1.1"
+workloads = "aiperf.sh"
 
   [cmd_args.dynamo]
   backend = "sglang"
@@ -32,7 +32,7 @@ workloads = "genai_perf.sh"
     num-nodes = 1
     cmd = 'python3 -m dynamo.sglang'
     extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
-    worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
+    worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
 
       [cmd_args.dynamo.prefill_worker.args]
       page-size = 16
@@ -48,7 +48,7 @@ workloads = "genai_perf.sh"
     num-nodes = 1
     cmd = 'python3 -m dynamo.sglang'
     extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
-    worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
+    worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'
 
       [cmd_args.dynamo.decode_worker.args]
       page-size = 16
@@ -94,18 +94,38 @@ workloads = "genai_perf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
+  setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
 
     [cmd_args.aiperf.args]
     concurrency = 2
+    extra-inputs = '{"min_tokens":10}'
+    output-tokens-mean = 500
     request-count = 50
     synthetic-input-tokens-mean = 300
-    output-tokens-mean = 500
+
+  [cmd_args.aiperf_accuracy]
+  entrypoint = "aiperf profile"
+  setup-cmd = "python -m pip install --break-system-packages --ignore-installed blinker==1.9.0 && python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+  cli = '''
+--model {model}
+--url {url}
+--endpoint-type chat
+--streaming
+--artifact-dir {artifact_dir}
+--no-server-metrics
+--accuracy-benchmark mmlu
+--accuracy-n-shots 5
+--accuracy-tasks abstract_algebra
+--concurrency 10
+--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--num-requests 100
+'''
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
-HF_HUB_OFFLINE = "1"
-TRANSFORMERS_OFFLINE = "1"
-HF_DATASETS_OFFLINE = "1"
+HF_HUB_OFFLINE = "0"
+TRANSFORMERS_OFFLINE = "0"
+HF_DATASETS_OFFLINE = "0"
 DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
 UCX_TLS = "all"
 #DYN_LOGGING_JSONL="true"
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index 193510728..583d11a88 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -20,8 +20,8 @@ test_template_name = "AIDynamo"
 extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
 
 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1"
-workloads = "genai_perf.sh"
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1"
+workloads = "aiperf.sh"
 
   [cmd_args.dynamo]
   backend = "vllm"
@@ -38,6 +38,7 @@ workloads = "genai_perf.sh"
       tensor-parallel-size = 8
       pipeline-parallel-size = 1
       data-parallel-size = 1
+      kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
 
     [cmd_args.dynamo.decode_worker]
     num-nodes = 1
@@ -50,6 +51,7 @@ workloads = "genai_perf.sh"
       tensor-parallel-size = 8
       pipeline-parallel-size = 1
       data-parallel-size = 1
+      kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
 
   [cmd_args.lmcache]
   controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"
@@ -86,17 +88,35 @@ workloads = "genai_perf.sh"
     concurrency = 2
 
   [cmd_args.aiperf]
-
     [cmd_args.aiperf.args]
     concurrency = 2
+    extra-inputs = '{"min_tokens":10}'
+    output-tokens-mean = 500
     request-count = 50
     synthetic-input-tokens-mean = 300
-    output-tokens-mean = 500
+
+  [cmd_args.aiperf_accuracy]
+  entrypoint = "aiperf profile"
+  setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+  cli = '''
+--model {model}
+--url {url}
+--endpoint-type chat
+--streaming
+--artifact-dir {artifact_dir}
+--no-server-metrics
+--accuracy-benchmark mmlu
+--accuracy-n-shots 5
+--accuracy-tasks abstract_algebra
+--concurrency 10
+--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--num-requests 100
+'''
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
-HF_HUB_OFFLINE = "1"
-TRANSFORMERS_OFFLINE = "1"
-HF_DATASETS_OFFLINE = "1"
+HF_HUB_OFFLINE = "0"
+TRANSFORMERS_OFFLINE = "0"
+HF_DATASETS_OFFLINE = "0"
 DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
 UCX_TLS = "all"
diff --git a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
index 26ed91285..4df1a6d64 100644
--- a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
+++ b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
@@ -15,11 +15,12 @@
 # limitations under the License.
 
 name = "dynamo_sglang"
+job_status_check = false
 
 [[Tests]]
-id = "sglang-Qwen3-0.6B"
+id = "test.disagg.single-node"
 test_name = "sglang"
-time_limit = "00:20:00"
+time_limit = "00:10:00"
 
   [Tests.cmd_args]
     [Tests.cmd_args.dynamo]
@@ -37,3 +38,25 @@ time_limit = "00:20:00"
 
         [Tests.cmd_args.dynamo.decode_worker.args]
         tensor-parallel-size = 1
+
+[[Tests]]
+id = "test.disagg.multinode"
+test_name = "sglang"
+time_limit = "00:10:00"
+
+  [Tests.cmd_args]
+    [Tests.cmd_args.dynamo]
+    model = "Qwen/Qwen3-0.6B"
+    node-setup-cmd = "hostname"
+
+      [Tests.cmd_args.dynamo.prefill_worker]
+      num-nodes = 2
+
+        [Tests.cmd_args.dynamo.prefill_worker.args]
+        tensor-parallel-size = 1
+
+      [Tests.cmd_args.dynamo.decode_worker]
+      num-nodes = 2
+
+        [Tests.cmd_args.dynamo.decode_worker.args]
+        tensor-parallel-size = 1
diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml
index e6d2c09b4..2866f656c 100644
--- a/conf/experimental/sglang/test/sglang.toml
+++ b/conf/experimental/sglang/test/sglang.toml
@@ -22,8 +22,8 @@ test_template_name = "sglang"
 docker_image_url = "lmsysorg/sglang:dev-cu13"
 
 [semantic_eval_cmd_args]
-module = "sglang.test.run_eval"
-args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+entrypoint = "python3 -m sglang.test.run_eval"
+cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
 [extra_env_vars]
 UCX_NET_DEVICES = "all"
diff --git a/conf/experimental/vllm/test/vllm.toml b/conf/experimental/vllm/test/vllm.toml
index 891023201..a8061099c 100644
--- a/conf/experimental/vllm/test/vllm.toml
+++ b/conf/experimental/vllm/test/vllm.toml
@@ -27,8 +27,8 @@ mount_as = "/vllm_repo"
 docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1"
 
 [semantic_eval_cmd_args]
-script = "/vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
-args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+entrypoint = "python3 /vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
+cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 [extra_env_vars]
 UCX_NET_DEVICES = "all"
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 023d92bf2..c00449681 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -87,14 +87,14 @@ The frontend node will initially wait to allow weight loading on all nodes. Once
 Choosing a Benchmark Tool
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The benchmark tool is controlled by the ``workloads`` field in the test TOML. The default is ``aiperf.sh``:
+The benchmark tool is controlled by the ``workloads`` field in the test TOML. Set ``aiperf.sh`` to use AIPerf:
 
 .. code-block:: toml
 
    [cmd_args]
-   workloads = "aiperf.sh"   # default — uses aiperf, writes aiperf_report.csv
+   workloads = "aiperf.sh"   # uses aiperf, writes aiperf_report.csv
 
-To use genai-perf instead, set:
+To use genai-perf, set:
 
 .. code-block:: toml
 
@@ -110,17 +110,88 @@ To use genai-perf instead, set:
      output-tokens-mean = 500
      request-count = 50
 
+Semantic Degradation With AIPerf Accuracy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+AIDynamo uses AIPerf accuracy mode as its semantic degradation signal. Enable it with
+``[cmd_args.aiperf_accuracy]``. This runs after the configured performance workload, so it can be used with either
+``aiperf.sh`` or ``genai_perf.sh``:
+
+.. code-block:: toml
+
+   [cmd_args]
+   workloads = "aiperf.sh"
+
+   [cmd_args.aiperf]
+     [cmd_args.aiperf.args]
+     request-count = 50
+     synthetic-input-tokens-mean = 300
+     output-tokens-mean = 500
+     concurrency = 2
+
+   [cmd_args.aiperf_accuracy]
+   entrypoint = "aiperf profile"
+   setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+   cli = '''
+   --model {model}
+   --url {url}
+   --endpoint-type chat
+   --streaming
+   --artifact-dir {artifact_dir}
+   --no-server-metrics
+   --accuracy-benchmark mmlu
+   --accuracy-n-shots 5
+   --accuracy-tasks abstract_algebra
+   --concurrency 10
+   --extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+   --num-requests 100
+   '''
+
+When ``cmd_args.aiperf_accuracy`` is configured, CloudAI expects AIPerf to produce ``accuracy_results.csv`` and exposes
+the ``accuracy`` metric from its ``OVERALL`` row. The metric is reported as a 0.0-1.0 fraction. Keep synthetic prompt
+and token-length flags out of this mode; the benchmark dataset should come from AIPerf's accuracy benchmark.
+
+The ``entrypoint`` and ``cli`` fields form the accuracy command. CloudAI expands ``{model}``, ``{url}``,
+``{endpoint}``, ``{result_dir}``, and ``{artifact_dir}`` in ``cli`` before launching it. The ``setup-cmd`` field is
+optional. It is useful for Dynamo images that include an older system ``aiperf`` build without the accuracy benchmark
+plugins. The example upgrades the image-level ``aiperf`` before launching ``aiperf profile``.
+MMLU is loaded from ``lighteval/mmlu``, so either allow Hugging Face dataset access or pre-cache that dataset before
+running with ``HF_HUB_OFFLINE``/``HF_DATASETS_OFFLINE`` enabled.
+For Qwen3 models, the example disables thinking mode so short MMLU answers can be parsed as choices.
+
+Custom Accuracy Scripts
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``cmd_args.aiperf_accuracy`` can also launch a custom mounted script instead of AIPerf. Mount the script or its parent
+directory with ``extra_container_mounts`` and set ``entrypoint`` to the in-container command:
+
+.. code-block:: toml
+
+   extra_container_mounts = ["/host/custom_accuracy:/custom_accuracy"]
+
+   [cmd_args.aiperf_accuracy]
+   entrypoint = "python /custom_accuracy/dummy_accuracy.py"
+   cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping"
+
+CloudAI expands placeholders in ``cli`` and runs ``entrypoint`` with that CLI string. The custom command must write
+``accuracy_results.csv`` inside ``{artifact_dir}`` with an ``OVERALL`` row. CloudAI copies that file to the run output
+directory and exposes the same ``accuracy`` metric as AIPerf accuracy mode.
+
 Review Benchmark Results
 ------------------------
 
 After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field:
 
-- ``aiperf.sh`` (default) → ``aiperf_report.csv``
+- ``aiperf.sh`` → ``aiperf_report.csv``
 - ``genai_perf.sh`` → ``genai_perf_report.csv``
+- ``[cmd_args.aiperf_accuracy]`` (when configured, independent of ``workloads``) → ``accuracy_results.csv``
+
+If AIPerf accuracy mode is enabled, CloudAI copies ``aiperf_accuracy_artifacts/accuracy_results.csv`` to
+``accuracy_results.csv`` in the run output directory and marks the run failed if that file is not produced.
 
 Navigate to ``./results/<scenario>/<test-id>/0/`` and open the CSV to examine performance metrics.
 
-Example ``aiperf_report.csv`` (default):
+Example ``aiperf_report.csv``:
 
 ::
 
diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst
index d0561c773..cdbd5cff1 100644
--- a/doc/workloads/sglang.rst
+++ b/doc/workloads/sglang.rst
@@ -29,8 +29,8 @@ Test + Scenario example
    num_prompts = 30
 
    [semantic_eval_cmd_args]
-   module = "sglang.test.run_eval"
-   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+   entrypoint = "python3 -m sglang.test.run_eval"
+   cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
 
 .. code-block:: toml
@@ -81,18 +81,19 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva
    :caption: test.toml (semantic validation)
 
    [semantic_eval_cmd_args]
-   module = "sglang.test.run_eval"
-   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+   entrypoint = "python3 -m sglang.test.run_eval"
+   cli = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
-For images that still use the legacy SGLang GSM8K runner, override the module and raw arguments:
+For images that still use the legacy SGLang GSM8K runner, override the entrypoint and raw CLI:
 
 .. code-block:: toml
 
    [semantic_eval_cmd_args]
-   module = "sglang.test.few_shot_gsm8k"
-   args = "--num-questions 200"
+   entrypoint = "python3 -m sglang.test.few_shot_gsm8k"
+   cli = "--host {host} --port {port} --num-questions 200"
 
-The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}``
+placeholders.
 
 
 Control number of GPUs
diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst
index 930bcf11b..57773992f 100644
--- a/doc/workloads/vllm.rst
+++ b/doc/workloads/vllm.rst
@@ -29,8 +29,8 @@ Test and Scenario Examples
    num_prompts = 30
 
    [semantic_eval_cmd_args]
-   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
-   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+   entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 
 .. code-block:: toml
@@ -81,13 +81,14 @@ To run GSM8K semantic validation after the serving benchmark, add ``semantic_eva
    :caption: test.toml (semantic validation)
 
    [semantic_eval_cmd_args]
-   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
-   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+   entrypoint = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   cli = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 If the runtime image does not contain the eval script, mount a vLLM repository with existing ``git_repos`` support and
-point ``script`` at the mounted path.
+point the script path in ``entrypoint`` at the mounted location.
 
-The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+The ``cli`` string supports ``{model}``, ``{host}``, ``{port}``, ``{url}``, ``{output_path}``, and ``{result_dir}``
+placeholders.
 
 
 Controlling the Number of GPUs
diff --git a/src/cloudai/workloads/ai_dynamo/__init__.py b/src/cloudai/workloads/ai_dynamo/__init__.py
index 1360ce10d..4aac3fd2c 100644
--- a/src/cloudai/workloads/ai_dynamo/__init__.py
+++ b/src/cloudai/workloads/ai_dynamo/__init__.py
@@ -19,6 +19,7 @@
     AIDynamoCmdArgs,
     AIDynamoTestDefinition,
     AIPerf,
+    AIPerfAccuracy,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
@@ -37,6 +38,7 @@
     "AIDynamoSlurmCommandGenStrategy",
     "AIDynamoTestDefinition",
     "AIPerf",
+    "AIPerfAccuracy",
     "GenAIPerf",
     "LMCache",
     "LMCacheArgs",
diff --git a/src/cloudai/workloads/ai_dynamo/accuracy.sh b/src/cloudai/workloads/ai_dynamo/accuracy.sh
new file mode 100644
index 000000000..0e85ee109
--- /dev/null
+++ b/src/cloudai/workloads/ai_dynamo/accuracy.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -Eeuo pipefail
+
+result_dir=""
+model=""
+url="http://localhost"
+port=8000
+endpoint="v1/chat/completions"
+entrypoint=""
+cli=""
+setup_cmd=""
+artifact_dir_name="aiperf_accuracy_artifacts"
+
+log() {  # Print a message prefixed with the current timestamp and host name.
+  printf '%s\n' "[$(date '+%F %T') $(hostname)]: $*"
+}
+
+process_args() {  # Parse "--name value" pairs into the script-level variables declared above.
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --result-dir)          result_dir="$2";        shift 2 ;;
+      --model)               model="$2";             shift 2 ;;
+      --url)                 url="$2";               shift 2 ;;
+      --port)                port="$2";              shift 2 ;;
+      --endpoint)            endpoint="$2";          shift 2 ;;
+      --entrypoint)          entrypoint="$2";        shift 2 ;;
+      --cli)                 cli="$2";               shift 2 ;;
+      --setup-cmd)           setup_cmd="$2";         shift 2 ;;
+      --artifact-dir-name)   artifact_dir_name="$2"; shift 2 ;;
+      --)                    shift; break ;;  # explicit end of options
+      --*)                   if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;;  # unknown option: drop it, and its value if the next word is not an option
+      *)                     shift ;;  # stray positional word: ignored
+    esac
+  done
+  # NOTE(review): a known option passed last without a value trips "set -u" on $2 - confirm callers always pass pairs.
+  log "Parsed args:
+    result_dir:    $result_dir
+    model:         $model
+    url:           $url
+    port:          $port
+    endpoint:      $endpoint
+    entrypoint:    $entrypoint
+    setup_cmd:     ${setup_cmd:-}
+    artifact_dir:  $artifact_dir_name
+    cli:           ${cli:-}"
+}
+
+run_setup_cmd() {
+  # Optional step: without a --setup-cmd value there is nothing to run.
+  [[ -n "$setup_cmd" ]] || return 0
+
+  log "Running accuracy setup command: $setup_cmd"
+  # Executed via "bash -lc", so the string may chain commands with &&.
+  bash -lc "$setup_cmd"
+  log "Accuracy setup command complete"
+}
+
+expand_cli() {  # Print $cli with its {placeholders} substituted and newlines folded to spaces.
+  local artifact_dir="$1"
+  local full_url="$2"
+  local expanded="$cli"
+  # Quote each replacement: bash >= 5.2 otherwise expands an unquoted '&' in it to the matched text.
+  expanded=${expanded//\{model\}/"$model"}
+  expanded=${expanded//\{url\}/"$full_url"}
+  expanded=${expanded//\{endpoint\}/"$endpoint"}
+  expanded=${expanded//\{result_dir\}/"$result_dir"}
+  expanded=${expanded//\{artifact_dir\}/"$artifact_dir"}
+  expanded="${expanded//$'\n'/ }"
+  # printf, unlike echo, never mistakes a leading "-n"/"-e" in the CLI for an option.
+  printf '%s\n' "$expanded"
+}
+
+copy_accuracy_results() {  # Copy accuracy_results.csv from the artifact dir ($1) into $result_dir.
+  local artifact_dir="$1"
+  local accuracy_path="$artifact_dir/accuracy_results.csv"
+  # -s: the file must exist and be non-empty; otherwise the whole script exits with status 1.
+  if [[ ! -s "$accuracy_path" ]]; then
+    log "ERROR: accuracy benchmark was requested but $accuracy_path was not produced"
+    exit 1
+  fi
+
+  cp "$accuracy_path" "$result_dir/accuracy_results.csv"
+  log "accuracy report saved to $result_dir/accuracy_results.csv"
+}
+
+main() {  # Validate args, run the optional setup, launch the accuracy command, then collect its CSV.
+  process_args "$@"
+
+  if [[ -z "$result_dir" ]]; then
+    log "ERROR: --result-dir is required"; exit 1
+  fi
+  if [[ -z "$model" ]]; then
+    log "ERROR: --model is required"; exit 1
+  fi
+  if [[ -z "$entrypoint" ]]; then
+    log "ERROR: --entrypoint is required"; exit 1
+  fi
+  # An empty name would make artifact_dir point at result_dir itself, and the rm -rf below would wipe it.
+  if [[ -z "$artifact_dir_name" ]]; then log "ERROR: --artifact-dir-name must not be empty"; exit 1; fi
+  run_setup_cmd
+  local full_url="${url}:${port}"
+  local artifact_dir="$result_dir/$artifact_dir_name"
+  rm -rf "$artifact_dir"
+  mkdir -p "$artifact_dir"
+
+  local expanded_cli
+  expanded_cli="$(expand_cli "$artifact_dir" "$full_url")"
+
+  log "Launching accuracy command: $entrypoint $expanded_cli"
+  bash -lc "$entrypoint $expanded_cli"
+  log "accuracy command complete"
+
+  copy_accuracy_results "$artifact_dir"
+}
+
+main "$@"
+exit 0
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index 01912f0c1..35da5b782 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import csv
 import logging
 from pathlib import Path
 from typing import Literal, Optional, cast
@@ -40,6 +41,10 @@
 from cloudai.models.workload import CmdArgs, TestDefinition
 from cloudai.systems.slurm import SlurmSystem
 
+AIPERF_ARTIFACTS_DIR = "aiperf_artifacts"
+AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts"
+AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv"
+
 
 class Args(BaseModel):
     """Arguments for custom workloads."""
@@ -290,6 +295,11 @@ class AIPerf(Workload):
     name: str = "aiperf"
     cmd: str = "aiperf profile"
     script: File = File(Path(__file__).parent.parent / "ai_dynamo/aiperf.sh")
+    setup_cmd: str | None = Field(
+        default=None,
+        serialization_alias="setup-cmd",
+        validation_alias=AliasChoices("setup-cmd", "setup_cmd"),
+    )
     report_name: str = Field(
         default="aiperf_report.csv",
         serialization_alias="report-name",
@@ -301,6 +311,31 @@ def installables(self) -> list[Installable]:
         return [self.script]
 
 
+class AIPerfAccuracy(BaseModel):
+    """Optional accuracy benchmark configuration; ``cli`` is the only field without a default."""
+
+    model_config = ConfigDict(extra="forbid", populate_by_name=True)  # unknown keys are rejected
+
+    name: str = "aiperf_accuracy"
+    entrypoint: str = "aiperf profile"
+    cli: str  # required
+    script: File = File(Path(__file__).parent.parent / "ai_dynamo/accuracy.sh")  # bundled wrapper script
+    setup_cmd: str | None = Field(  # accepted on input as "setup-cmd" or "setup_cmd"
+        default=None,
+        serialization_alias="setup-cmd",
+        validation_alias=AliasChoices("setup-cmd", "setup_cmd"),
+    )
+    artifact_dir_name: str = Field(  # accepted on input as "artifact-dir-name" or "artifact_dir_name"
+        default=AIPERF_ACCURACY_ARTIFACTS_DIR,
+        serialization_alias="artifact-dir-name",
+        validation_alias=AliasChoices("artifact-dir-name", "artifact_dir_name"),
+    )
+
+    @property
+    def installables(self) -> list[Installable]:
+        return [self.script]  # the wrapper script is the only installable
+
+
 class Constraints(BaseModel):
     """Constraints for validation of AI Dynamo configurations when using DSE."""
 
@@ -321,6 +356,7 @@ class AIDynamoCmdArgs(CmdArgs):
     lmcache: LMCache = Field(default_factory=LMCache)
     genai_perf: GenAIPerf = Field(default_factory=GenAIPerf)
     aiperf: AIPerf = Field(default_factory=AIPerf)
+    aiperf_accuracy: AIPerfAccuracy | None = None
     workloads: str = "genai_perf.sh"
 
     @field_validator("workloads", mode="before")
@@ -343,6 +379,7 @@ def installables(self) -> list[Installable]:
             *self.lmcache.installables,
             *self.genai_perf.installables,
             *self.aiperf.installables,
+            *(self.aiperf_accuracy.installables if self.aiperf_accuracy else []),
         ]
 
 
@@ -404,10 +441,52 @@ def installables(self) -> list[Installable]:
             *self.cmd_args.installables,
         ]
 
+    def _has_aiperf_accuracy_results(self, output_path: Path) -> bool:
+        # True when parse_aiperf_accuracy() yields a value for output_path; the outcome is logged either way.
+        accuracy = parse_aiperf_accuracy(output_path)
+        if accuracy is not None:
+            logging.info(f"AIPerf accuracy results found in {output_path}: {accuracy}")
+            return True
+        logging.info(f"AIPerf accuracy results not found in {output_path}.")
+        return False
+
+    def _was_workload_report_produced(self, output_path: Path, workload: str, workload_config: Workload) -> bool:
+        # A workload counts as reported when its configured report file exists under output_path.
+        configured_name = workload_config.report_name
+        if configured_name is None:
+            logging.warning(f"Workload {workload} has no report_name configured")
+            return False
+        report_file = output_path / configured_name
+        found = report_file.exists()
+        if found:
+            logging.info(f"Result file ({report_file.absolute()}) exists for {workload}")
+        else:
+            logging.info(f"Result file ({report_file.absolute()}) not found for workload: {workload}")
+        return found
+
+    def _was_workload_successful(self, output_path: Path, workload: str, workload_map: dict[str, Workload]) -> bool:
+        # Unknown workloads fail; known ones succeed only when their report file was produced.
+        config = workload_map.get(workload)
+        if config is not None:
+            return self._was_workload_report_produced(output_path, workload, config)
+        logging.info(f"Workload {workload} not found in workload map")
+        return False
+
+    def _were_workloads_successful(self, output_path: Path) -> bool:
+        # Every workload is checked (no short-circuit) so each missing report gets logged.
+        known = self.get_workload_map()
+        names = self.cmd_args.workloads_list
+        outcomes = [self._was_workload_successful(output_path, name, known) for name in names]
+        return all(outcomes)
+
+    def _was_aiperf_accuracy_successful(self, output_path: Path) -> bool:
+        # Accuracy is optional: with no aiperf_accuracy section there is nothing to verify.
+        # Otherwise success means an accuracy value could be parsed from output_path.
+        accuracy_disabled = self.cmd_args.aiperf_accuracy is None
+        return accuracy_disabled or self._has_aiperf_accuracy_results(output_path)
+
     def was_run_successful(self, tr: TestRun) -> JobStatusResult:
         output_path = tr.output_path
-        result = True
-        workload_map = self.get_workload_map()
         failure_marker = output_path / self.failure_marker
         success_marker = output_path / self.success_marker
 
@@ -418,24 +497,9 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
         if not success_marker.exists():
             return JobStatusResult(False, error_message=f"Success marker file not found: {success_marker.absolute()}")
 
-        for workload in self.cmd_args.workloads_list:
-            if workload not in workload_map:
-                logging.info(f"Workload {workload} not found in workload map")
-                result = False
-                continue
-            report_name = workload_map[workload].report_name
-            if report_name is None:
-                logging.warning(f"Workload {workload} has no report_name configured")
-                result = False
-                continue
-            workload_csv_file = output_path / report_name
-            if not workload_csv_file.exists():
-                logging.info(f"Result file ({workload_csv_file.absolute()}) not found for workload: {workload}")
-                result = False
-            else:
-                logging.info(f"Result file ({workload_csv_file.absolute()}) exists for {workload}")
-
-        return JobStatusResult(result)
+        workloads_successful = self._were_workloads_successful(output_path)
+        accuracy_successful = self._was_aiperf_accuracy_successful(output_path)
+        return JobStatusResult(workloads_successful and accuracy_successful)
 
     def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool:
         prefill_worker = tr.test.cmd_args.dynamo.prefill_worker
@@ -467,3 +531,84 @@ def constraint_check(self, tr: TestRun, system: Optional[System]) -> bool:
         logging.info("constraint_check passed for: tp_times_pp_le_gpus_per_node")
 
         return True
+
+
+def _parse_accuracy_value(value: str | int | float | None) -> float | None:
+    """Normalize an accuracy cell to a fraction; values with '%' or above 1 are divided by 100."""
+    if value is None:
+        return None
+
+    # Numbers carry no '%' marker, so only the "> 1" rule can flag them as percentages.
+    has_percent_sign = False
+    if isinstance(value, (int, float)):
+        number = float(value)
+    else:
+        text = value.strip()
+        has_percent_sign = text.endswith("%")
+        if has_percent_sign:
+            text = text[:-1].strip()
+        try:
+            number = float(text)
+        except ValueError:
+            # Empty or non-numeric cells carry no accuracy.
+            return None
+
+    return number / 100 if has_percent_sign or number > 1 else number
+
+
+def _parse_count_value(value: str | int | float | None) -> float | None:
+    """Convert a Correct/Total cell to float; None when the cell is missing or not numeric."""
+    if value is None:
+        return None
+    text = value if isinstance(value, (int, float)) else value.strip()
+    try:
+        return float(text)
+    except ValueError:
+        return None
+
+
+def parse_aiperf_accuracy(output_path: Path) -> float | None:
+    """
+    Parse AIPerf accuracy from accuracy_results.csv.
+
+    Expected CSV format:
+        Task,Correct,Total,Accuracy
+        abstract_algebra,35,100,35.00%
+        OVERALL,8368,14042,59.59%
+
+    Candidates are checked in order: the copy in the run output directory, then
+    aiperf_accuracy_artifacts/ and aiperf_artifacts/. Returns the OVERALL row as a
+    fraction (else the first parsable row of that file), or None if nothing is found.
+    """
+    candidates = [
+        output_path / AIPERF_ACCURACY_RESULTS_CSV,
+        output_path / AIPERF_ACCURACY_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV,
+        output_path / AIPERF_ARTIFACTS_DIR / AIPERF_ACCURACY_RESULTS_CSV,
+    ]
+
+    for csv_file in candidates:
+        if not csv_file.exists() or csv_file.stat().st_size == 0:  # skip missing or empty files
+            continue
+
+        fallback_accuracy: float | None = None
+        with csv_file.open(newline="", encoding="utf-8") as f:
+            for row in csv.DictReader(f):
+                accuracy = _parse_accuracy_value(row.get("Accuracy") or row.get("accuracy") or row.get("Value"))
+                if accuracy is None:  # no usable accuracy cell: derive it from Correct/Total instead
+                    correct = _parse_count_value(row.get("Correct") or row.get("correct"))
+                    total = _parse_count_value(row.get("Total") or row.get("total"))
+                    if correct is not None and total:  # a total of None or 0 skips the division
+                        accuracy = correct / total
+                if accuracy is None:
+                    continue
+
+                task = (row.get("Task") or row.get("task") or row.get("Metric") or "").strip().upper()
+                if task == "OVERALL":
+                    return accuracy
+                if fallback_accuracy is None:  # remember only the first parsable row
+                    fallback_accuracy = accuracy
+
+        if fallback_accuracy is not None:
+            return fallback_accuracy
+
+    return None
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
index 46f5daa42..5b65db41f 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
@@ -37,6 +37,8 @@ declare -A genai_perf_args
 declare -A genai_perf_config
 declare -A aiperf_args
 declare -A aiperf_config
+declare -A aiperf_accuracy_args
+declare -A aiperf_accuracy_config
 
 declare -A dynamo_args
 dynamo_args["backend"]="vllm"
@@ -100,6 +102,10 @@ _resolve_host_ip() {
   echo "$ip"
 }
 
+_current_node_ip() {  # Print what _resolve_host_ip reports for the current node's name.
+  _resolve_host_ip "$(_current_node_name)"
+}
+
 _apply_sglang_dsr1_section_args() {
   local self="$(_current_node_name)"
   local gpn="$(_gpus_per_node)"
@@ -169,6 +175,10 @@ _parse_cli_pairs() {
         aiperf_args["--${key#--aiperf-args-}"]="$2" ;;
       --aiperf-*)
         aiperf_config["--${key#--aiperf-}"]="$2" ;;
+      --aiperf_accuracy-args-*)
+        aiperf_accuracy_args["--${key#--aiperf_accuracy-args-}"]="$2" ;;
+      --aiperf_accuracy-*)
+        aiperf_accuracy_config["--${key#--aiperf_accuracy-}"]="$2" ;;
       --hf-home)
         HUGGINGFACE_HOME="$2" ;;
       --storage-cache-dir)
@@ -361,6 +371,8 @@ _dump_args() {
   log "GenAI-Perf args:\n$(arg_array_to_string genai_perf_args)"
   log "AIPerf config params:\n$(arg_array_to_string aiperf_config)"
   log "AIPerf args:\n$(arg_array_to_string aiperf_args)"
+  log "AIPerf accuracy config params:\n$(arg_array_to_string aiperf_accuracy_config)"
+  log "AIPerf accuracy args:\n$(arg_array_to_string aiperf_accuracy_args)"
   log "--------------------------------"
 }
 
@@ -420,6 +432,10 @@ function perform_exit()
 
 exit_on_error() {
   local fatal=$(_detect_fatal_once)
+  if [ -f "${FATAL_ERROR_MARKER}" ]; then
+    log "FATAL_ERROR_MARKER found. Terminating."
+    perform_exit 1
+  fi
   if [ -f "${DONE_MARKER}" ]; then
     log "DONE_MARKER found. Skipping error check."
     return
@@ -517,6 +533,10 @@ _is_aiperf_workload() {
   [[ "${dynamo_args["workloads"]}" == *"aiperf.sh"* ]]
 }
 
+_is_aiperf_accuracy_enabled() {  # Succeeds only if aiperf_accuracy_config has a non-empty "--script" entry.
+  [[ -n "${aiperf_accuracy_config["--script"]:-}" ]]
+}
+
 _init_runtime_env() {
   if _is_vllm || _is_sglang; then
     export HF_HOME="${HUGGINGFACE_HOME}"
@@ -689,6 +709,13 @@ function mark_done()
   touch "$DONE_MARKER"
 }
 
+# Log "$1" as an error and write it to FATAL_ERROR_MARKER (the file exit_on_error checks).
+function mark_failed()
+{
+  log "ERROR: $1"
+  printf '%s\n' "$1" >"${FATAL_ERROR_MARKER}"
+}
+
 function launch_etcd()
 {
   log "Launching etcd with cmd: ${dynamo_args["etcd-cmd"]} --listen-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]} --advertise-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]}"
@@ -733,6 +760,8 @@ function launch_decode()
   local base_kvbm_pub_port=${DYN_KVBM_LEADER_ZMQ_PUB_PORT:-56001}
   local base_kvbm_ack_port=${DYN_KVBM_LEADER_ZMQ_ACK_PORT:-56002}
   local kvbm_port_stride=2
+  local side_channel_host
+  side_channel_host="$(_current_node_ip)"
   log "Launching $workers_per_node decode worker(s) with unique port ranges"
 
   for i in $(seq 0 $(( $workers_per_node - 1 ))); do
@@ -754,10 +783,10 @@ function launch_decode()
       args_arr+=($key "${decode_args[$key]}")
     done
 
-    log "Launching decode worker $i on GPUs $gpu_list (NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)"
+    log "Launching decode worker $i on GPUs $gpu_list (NIXL host: $side_channel_host, NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)"
     log "Decode cmd: ${decode_config["cmd"]} ${args_arr[*]} ${decode_config["extra-args"]}"
     CUDA_VISIBLE_DEVICES=$gpu_list \
-      VLLM_NIXL_SIDE_CHANNEL_HOST=$(hostname -I | awk '{print $1}') \
+      VLLM_NIXL_SIDE_CHANNEL_HOST="$side_channel_host" \
       VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port \
       DYN_VLLM_KV_EVENT_PORT=$kv_event_port \
       DYN_KVBM_LEADER_ZMQ_PUB_PORT=$kvbm_pub_port \
@@ -788,6 +817,8 @@ function launch_prefill()
   local base_kvbm_pub_port=${DYN_KVBM_LEADER_ZMQ_PUB_PORT:-56001}
   local base_kvbm_ack_port=${DYN_KVBM_LEADER_ZMQ_ACK_PORT:-56002}
   local kvbm_port_stride=2
+  local side_channel_host
+  side_channel_host="$(_current_node_ip)"
   log "Launching $workers_per_node prefill worker(s) with unique port ranges"
 
   for i in $(seq 0 $(( $workers_per_node - 1 ))); do
@@ -809,10 +840,10 @@ function launch_prefill()
       args_arr+=($key "${prefill_args[$key]}")
     done
 
-    log "Launching prefill worker $i on GPUs $gpu_list (NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)"
+    log "Launching prefill worker $i on GPUs $gpu_list (NIXL host: $side_channel_host, NIXL port: $nixl_port, KV event port: $kv_event_port, KVBM pub/ack: $kvbm_pub_port/$kvbm_ack_port)"
     log "Prefill cmd: ${prefill_config["cmd"]} ${args_arr[*]} ${prefill_config["extra-args"]}"
     CUDA_VISIBLE_DEVICES=$gpu_list \
-      VLLM_NIXL_SIDE_CHANNEL_HOST=$(hostname -I | awk '{print $1}') \
+      VLLM_NIXL_SIDE_CHANNEL_HOST="$side_channel_host" \
       VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port \
       DYN_VLLM_KV_EVENT_PORT=$kv_event_port \
       DYN_KVBM_LEADER_ZMQ_PUB_PORT=$kvbm_pub_port \
@@ -1026,6 +1057,11 @@ function launch_workload()
     --decode-nodes "${decode_config["node-list"]}" \
     "${config_arr[@]}" \
     -- "${args_arr[@]}" > "${RESULTS_DIR}/$workload_name.log" 2>&1
+  local workload_status=$?
+  if [[ "${workload_status}" -ne 0 ]]; then
+    mark_failed "Workload ${workload_name} failed with exit code ${workload_status}. See ${RESULTS_DIR}/${workload_name}.log"
+    return "${workload_status}"
+  fi
 
   log "Done with $workload_name run"
 }
@@ -1035,11 +1071,15 @@ function launch_workloads()
   wait_for_dynamo_frontend
 
   if _is_genai_perf_workload; then
-    launch_workload genai_perf_config genai_perf_args
+    launch_workload genai_perf_config genai_perf_args || return $?
   fi
 
   if _is_aiperf_workload; then
-    launch_workload aiperf_config aiperf_args
+    launch_workload aiperf_config aiperf_args || return $?
+  fi
+
+  if _is_aiperf_accuracy_enabled; then
+    launch_workload aiperf_accuracy_config aiperf_accuracy_args || return $?
   fi
 
   mark_done
diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh
index 9f5a78b33..15cee3a58 100644
--- a/src/cloudai/workloads/ai_dynamo/aiperf.sh
+++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh
@@ -19,7 +19,7 @@
 #
 # Called from ai_dynamo.sh's launch_workload() with:
 #   bash aiperf.sh --result-dir <dir> --model <model> --url <url> --port <port>
-#                  [--cmd <cmd>] [--report-name <name>] [--extra-args <args>]
+#                  [--cmd <cmd>] [--setup-cmd <cmd>] [--report-name <name>] [--artifact-dir-name <name>] [--extra-args <args>]
 #                  -- <aiperf-args>...
 #
 # Context flags (before --) that are recognised and used:
@@ -28,7 +28,9 @@
 #   --url           Base URL of the dynamo.frontend (e.g. http://node01).
 #   --port          HTTP port the dynamo.frontend is listening on.
 #   --report-name   Output CSV name (default: aiperf_report.csv).
+#   --artifact-dir-name  Artifact directory name under --result-dir (default: aiperf_artifacts).
 #   --cmd           Full launch command including subcommand (default: "aiperf profile").
+#   --setup-cmd     Optional shell command run before launching aiperf.
 #   --extra-args    Raw string appended verbatim after all other flags.
 #
 # All unrecognised flags (--install-dir, --gpus-per-node, etc.) are silently
@@ -43,9 +45,11 @@ model=""
 url="http://localhost"
 port=8000
 report_name="aiperf_report.csv"
+artifact_dir_name="aiperf_artifacts"
 cmd="aiperf profile"
+setup_cmd=""
 declare -a extra_args=()
-declare -a aiperf_profile_args=()
+declare -a profile_args=()
 
 log() {
   echo "[$(date '+%F %T') $(hostname)]: $*"
@@ -54,14 +58,14 @@ log() {
 _parse_aiperf_args() {
   while [[ $# -ge 2 ]]; do
     case "$1" in
-      --*) aiperf_profile_args+=("$1" "$2"); shift 2 ;;
+      --*) profile_args+=("$1" "$2"); shift 2 ;;
       *)   shift ;;
     esac
   done
   # Capture a trailing lone boolean flag if present.
   # Use if/fi — not [[ ]] && — so set -e does not trigger on a false condition.
   if [[ $# -eq 1 && "$1" == --* ]]; then
-    aiperf_profile_args+=("$1")
+    profile_args+=("$1")
   fi
 }
 
@@ -73,9 +77,11 @@ process_args() {
       --url)          url="$2";         shift 2 ;;
       --port)         port="$2";        shift 2 ;;
       --report-name)  report_name="$2"; shift 2 ;;
-      --cmd)          cmd="$2";         shift 2 ;;
-      --extra-args)   read -ra extra_args <<< "$2"; shift 2 ;;
-      --)             shift; _parse_aiperf_args "$@"; break ;;
+      --artifact-dir-name) artifact_dir_name="$2"; shift 2 ;;
+      --cmd)               cmd="$2";               shift 2 ;;
+      --setup-cmd)         setup_cmd="$2";         shift 2 ;;
+      --extra-args)        read -ra extra_args <<< "$2"; shift 2 ;;
+      --)                  shift; _parse_aiperf_args "$@"; break ;;
       --*)            if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;;  # consume unknown flag; shift 2 only if next arg is a value
       *)              shift ;;
     esac
@@ -87,15 +93,33 @@ process_args() {
     url:          $url
     port:         $port
     report_name:  $report_name
+    artifact_dir: $artifact_dir_name
     cmd:          $cmd
+    setup_cmd:    ${setup_cmd:-}
     extra_args:   ${extra_args[*]:-}
-    profile_args: ${aiperf_profile_args[*]:-}"
+    profile_args: ${profile_args[*]:-}"
+}
+
+# Run the optional --setup-cmd in a login shell; on failure, log it and exit with its status.
+run_setup_cmd() {
+  if [[ -z "$setup_cmd" ]]; then return 0; fi
+  log "Running AIPerf setup command: $setup_cmd"
+  local rc=0
+  bash -lc "$setup_cmd" || rc=$?  # '||' captures the status even when errexit is enabled
+  if [[ "$rc" -ne 0 ]]; then log "ERROR: AIPerf setup command failed (exit $rc)"; exit "$rc"; fi
+  log "AIPerf setup command complete"
+}
 
 process_results() {
-  local artifact_dir="$result_dir/aiperf_artifacts"
-  local csv_path
-  csv_path=$(find "$artifact_dir" -name "*.csv" -print -quit 2>/dev/null || true)
+  local artifact_dir="$result_dir/$artifact_dir_name"
+  local csv_path=""
+
+  if [[ -f "$artifact_dir/profile_export_aiperf.csv" ]]; then
+    csv_path="$artifact_dir/profile_export_aiperf.csv"
+  else
+    csv_path=$(find "$artifact_dir" -name "*aiperf*.csv" -print -quit 2>/dev/null || true)
+  fi
+
   if [[ -n "$csv_path" ]]; then
     cp "$csv_path" "$result_dir/$report_name"
     log "aiperf report saved to $result_dir/$report_name"
@@ -103,6 +127,36 @@ process_results() {
     log "ERROR: no CSV found in $artifact_dir — aiperf may not have completed"
     exit 1
   fi
+
+}
+
+# Build the aiperf command line from $cmd, the fixed flags, and any extra arguments, then run it.
+#   $1  value passed to --url
+#   $2  value passed to --artifact-dir
+run_aiperf() {
+  local endpoint="$1"
+  local out_dir="$2"
+  local -a base=()
+  # Word-split cmd (e.g. "aiperf profile") into separate argv entries.
+  read -ra base <<< "$cmd"
+
+  log "Launching aiperf: ${base[*]} --model $model --url $endpoint"
+
+  local -a argv=("${base[@]}")
+  argv+=(--model "$model" --url "$endpoint")
+  argv+=(--endpoint-type chat --streaming)
+  argv+=(--artifact-dir "$out_dir" --no-server-metrics)
+  # Optional argument groups are appended only when they are non-empty.
+  if (( ${#profile_args[@]} > 0 )); then
+    argv+=("${profile_args[@]}")
+  fi
+  if (( ${#extra_args[@]} > 0 )); then
+    argv+=("${extra_args[@]}")
+  fi
+
+  "${argv[@]}"
+
+  log "aiperf run complete"
+}
 
 main() {
@@ -115,27 +169,13 @@ main() {
     log "ERROR: --model is required"; exit 1
   fi
 
+  run_setup_cmd
+
   local full_url="${url}:${port}"
-  local artifact_dir="$result_dir/aiperf_artifacts"
+  local artifact_dir="$result_dir/$artifact_dir_name"
   rm -rf "$artifact_dir"
 
-  # Split cmd into an array (e.g. "aiperf profile" → ["aiperf", "profile"])
-  local -a run_cmd=()
-  read -ra run_cmd <<< "$cmd"
-
-  log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url"
-
-  "${run_cmd[@]}" \
-    --model         "$model" \
-    --url           "$full_url" \
-    --endpoint-type chat \
-    --streaming \
-    --artifact-dir  "$artifact_dir" \
-    --no-server-metrics \
-    "${aiperf_profile_args[@]}" \
-    "${extra_args[@]}"
-
-  log "aiperf run complete"
+  run_aiperf "$full_url" "$artifact_dir"
   process_results
 }
 
diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
index a8e4e91b8..a0ef92005 100644
--- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
@@ -21,6 +21,7 @@
 
 from cloudai.core import METRIC_ERROR, MetricValue, ReportGenerationStrategy
 from cloudai.util.lazy_imports import lazy
+from cloudai.workloads.ai_dynamo.ai_dynamo import AIDynamoTestDefinition, parse_aiperf_accuracy
 
 
 class AIDynamoReportGenerationStrategy(ReportGenerationStrategy):
@@ -44,6 +45,16 @@ def extract_metric_from_csv(self, csv_file: Path, metric_name: str, metric_type:
 
     def get_metric(self, metric: str) -> MetricValue:
         logging.info(f"Getting metric: {metric}")
+
+        if metric.lower() == "accuracy":
+            tdef = self.test_run.test
+            if not isinstance(tdef, AIDynamoTestDefinition):
+                return METRIC_ERROR
+            if tdef.cmd_args.aiperf_accuracy is None:
+                return METRIC_ERROR
+            accuracy = parse_aiperf_accuracy(self.test_run.output_path)
+            return accuracy if accuracy is not None else METRIC_ERROR
+
         metric_name = metric
         metric_type = "avg"
 
diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
index 17079875c..c1a817853 100644
--- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 
 import logging
+import shlex
 from pathlib import Path
 from typing import List, cast
 
@@ -71,6 +72,8 @@ def _get_toml_args(self, base_model: BaseModel, prefix: str, exclude: List[str]
             str_v = str(v)
             if str_v.startswith("{") and str_v.endswith("}"):
                 args.append(f"{prefix}{k} '{str_v}'")
+            elif any(char in str_v for char in ['"', "'", "\n"]):
+                args.append(f"{prefix}{k} {shlex.quote(str_v)}")
             else:
                 args.append(f'{prefix}{k} "{v}"')
 
@@ -118,6 +121,8 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]:
         args.extend(self._get_nested_toml_args(td.cmd_args.lmcache, "--lmcache-"))
         args.extend(self._get_nested_toml_args(td.cmd_args.genai_perf, "--genai_perf-"))
         args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-"))
+        if td.cmd_args.aiperf_accuracy is not None:
+            args.extend(self._get_nested_toml_args(td.cmd_args.aiperf_accuracy, "--aiperf_accuracy-"))
 
         return args
 
diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py
index 87ad7b3a3..30a6943c1 100644
--- a/src/cloudai/workloads/common/llm_serving.py
+++ b/src/cloudai/workloads/common/llm_serving.py
@@ -624,7 +624,9 @@ def _expand_semantic_eval_args(self, args: str, *, host: str) -> str:
             "{model}": self.tdef.cmd_args.model,
             "{host}": host,
             "{port}": str(self.serve_port),
+            "{url}": f"{host}:{self.serve_port}",
             "{output_path}": str(self.test_run.output_path.absolute()),
+            "{result_dir}": str(self.test_run.output_path.absolute()),
         }
         for placeholder, value in replacements.items():
             args = args.replace(placeholder, value)
diff --git a/src/cloudai/workloads/sglang/sglang.py b/src/cloudai/workloads/sglang/sglang.py
index 338bbfecc..49a7af140 100644
--- a/src/cloudai/workloads/sglang/sglang.py
+++ b/src/cloudai/workloads/sglang/sglang.py
@@ -92,8 +92,10 @@ class SglangBenchCmdArgs(CmdArgs):
 class SglangSemanticEvalCmdArgs(CmdArgs):
     """SGLang semantic validation command arguments."""
 
-    module: str = "sglang.test.run_eval"
-    args: str = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+    model_config = ConfigDict(extra="forbid")
+
+    entrypoint: str = "python3 -m sglang.test.run_eval"
+    cli: str = "--host {host} --port {port} --eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
 
 
 class SglangTestDefinition(LLMServingTestDefinition[SglangCmdArgs]):
diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
index f1c7c741c..7a7a97d5b 100644
--- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
@@ -130,17 +130,8 @@ def get_semantic_eval_command(self) -> list[str] | None:
             return None
 
         host = self.bench_host
-        command = [
-            "python3",
-            "-m",
-            eval_args.module,
-            f"--host {host}",
-            f"--port {self.serve_port}",
-        ]
-        args = self._expand_semantic_eval_args(eval_args.args, host=host)
-        if args:
-            command.append(args)
-        return command
+        cli = self._expand_semantic_eval_args(eval_args.cli, host=host)
+        return [eval_args.entrypoint, cli] if cli else [eval_args.entrypoint]
 
     def aggregated_serve_env(self) -> dict[str, str]:
         return {"CUDA_VISIBLE_DEVICES": ",".join(str(gpu_id) for gpu_id in self.gpu_ids)}
diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
index 13d87ad77..2f00e95f7 100644
--- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
@@ -130,13 +130,5 @@ def get_semantic_eval_command(self) -> list[str] | None:
 
         host = self.bench_host
         http_host = host if host.startswith("http://") or host.startswith("https://") else f"http://{host}"
-        command = [
-            "python3",
-            eval_args.script,
-            f"--host {http_host}",
-            f"--port {self.serve_port}",
-        ]
-        args = self._expand_semantic_eval_args(eval_args.args, host=http_host)
-        if args:
-            command.append(args)
-        return command
+        cli = self._expand_semantic_eval_args(eval_args.cli, host=http_host)
+        return [eval_args.entrypoint, cli] if cli else [eval_args.entrypoint]
diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py
index d2fda3ab5..f77039edc 100644
--- a/src/cloudai/workloads/vllm/vllm.py
+++ b/src/cloudai/workloads/vllm/vllm.py
@@ -92,8 +92,10 @@ class VllmBenchCmdArgs(CmdArgs):
 class VllmSemanticEvalCmdArgs(CmdArgs):
     """vLLM semantic validation command arguments."""
 
-    script: str = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
-    args: str = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+    model_config = ConfigDict(extra="forbid")
+
+    entrypoint: str = "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+    cli: str = "--host {host} --port {port} --num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
 
 
 class VllmTestDefinition(LLMServingTestDefinition[VllmCmdArgs]):
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index a0a028caa..7b036b5a8 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -14,7 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import shlex
 from pathlib import Path
+from typing import cast
 
 import pytest
 
@@ -26,6 +28,8 @@
     AIDynamoCmdArgs,
     AIDynamoSlurmCommandGenStrategy,
     AIDynamoTestDefinition,
+    AIPerf,
+    AIPerfAccuracy,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
@@ -148,3 +152,78 @@ def test_dynamo_cmd(
 ) -> None:
     result = strategy.gen_dynamo_cmd(module, Path(config))
     assert result.strip() == expected
+
+
+def test_gen_script_args_contains_split_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
+    td = cast(AIDynamoTestDefinition, strategy.test_run.test)
+    td.cmd_args.workloads = "aiperf.sh"
+    setup_cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
+    cli = (
+        "--model {model} "
+        "--url {url} "
+        "--endpoint-type chat "
+        "--streaming "
+        "--artifact-dir {artifact_dir} "
+        "--no-server-metrics "
+        "--accuracy-benchmark mmlu "
+        "--accuracy-n-shots 5 "
+        "--accuracy-tasks abstract_algebra "
+        "--concurrency 10 "
+        '--extra-inputs \'{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}\' '
+        "--num-requests 100"
+    )
+    td.cmd_args.aiperf = AIPerf.model_validate(
+        {
+            "args": {
+                "concurrency": 2,
+                "request-count": 50,
+                "synthetic-input-tokens-mean": 300,
+                "output-tokens-mean": 500,
+            },
+        }
+    )
+    td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate(
+        {
+            "setup-cmd": setup_cmd,
+            "cli": cli,
+        }
+    )
+
+    result = strategy._gen_script_args(td)
+
+    assert '--aiperf-args-request-count "50"' in result
+    assert '--aiperf-args-synthetic-input-tokens-mean "300"' in result
+    assert '--aiperf-args-output-tokens-mean "500"' in result
+    assert f'--aiperf_accuracy-setup-cmd "{setup_cmd}"' in result
+    assert '--aiperf_accuracy-name "aiperf_accuracy"' in result
+    assert '--aiperf_accuracy-entrypoint "aiperf profile"' in result
+    assert '--aiperf_accuracy-artifact-dir-name "aiperf_accuracy_artifacts"' in result
+    assert f"--aiperf_accuracy-cli {shlex.quote(cli)}" in result
+
+
+def test_gen_script_args_contains_custom_aiperf_accuracy_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
+    td = cast(AIDynamoTestDefinition, strategy.test_run.test)
+    cli = "--model {model} --url {url} --endpoint {endpoint} --artifact-dir {artifact_dir} --prompt ping"
+    td.cmd_args.aiperf_accuracy = AIPerfAccuracy.model_validate(
+        {
+            "entrypoint": "python /custom_accuracy/dummy_accuracy.py",
+            "cli": cli,
+        }
+    )
+
+    result = strategy._gen_script_args(td)
+
+    assert '--aiperf_accuracy-entrypoint "python /custom_accuracy/dummy_accuracy.py"' in result
+    assert f'--aiperf_accuracy-cli "{cli}"' in result
+
+
+def test_gen_script_args_quotes_worker_json_args(strategy: AIDynamoSlurmCommandGenStrategy) -> None:
+    td = cast(AIDynamoTestDefinition, strategy.test_run.test)
+    config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+    td.cmd_args.dynamo.prefill_worker.args = WorkerBaseArgs.model_validate({"kv-transfer-config": config})
+    td.cmd_args.dynamo.decode_worker.args = WorkerBaseArgs.model_validate({"kv-transfer-config": config})
+
+    result = strategy._gen_script_args(td)
+
+    assert f"--prefill-args-kv-transfer-config '{config}'" in result
+    assert f"--decode-args-kv-transfer-config '{config}'" in result
diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
index 0e51c414f..47e214421 100644
--- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py
+++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
@@ -26,12 +26,14 @@
     AIDynamoCmdArgs,
     AIDynamoTestDefinition,
     AIPerf,
+    AIPerfAccuracy,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
     WorkerBaseArgs,
     WorkerConfig,
 )
+from cloudai.workloads.ai_dynamo.ai_dynamo import parse_aiperf_accuracy
 from cloudai.workloads.ai_dynamo.report_generation_strategy import AIDynamoReportGenerationStrategy
 
 
@@ -62,6 +64,14 @@ def get_aiperf_csv_content() -> str:
     )
 
 
+def get_aiperf_accuracy_csv_content() -> str:
+    return "Task,Correct,Total,Accuracy\nabstract_algebra,35,100,35.00%\nOVERALL,35,100,35.00%\n"
+
+
+def get_aiperf_accuracy_cli() -> str:
+    return "--model {model} --url {url} --artifact-dir {artifact_dir} --accuracy-benchmark mmlu"
+
+
 @pytest.fixture
 def ai_dynamo_tr(tmp_path: Path) -> TestRun:
     test = AIDynamoTestDefinition(
@@ -70,6 +80,7 @@ def ai_dynamo_tr(tmp_path: Path) -> TestRun:
         test_template_name="t",
         cmd_args=AIDynamoCmdArgs(
             docker_image_url="http://url",
+            workloads="genai_perf.sh",
             dynamo=AIDynamoArgs(
                 prefill_worker=WorkerConfig(
                     cmd="python3 -m dynamo.vllm --is-prefill-worker",
@@ -119,6 +130,64 @@ def ai_dynamo_aiperf_tr(tmp_path: Path) -> TestRun:
     return tr
 
 
+@pytest.fixture
+def ai_dynamo_aiperf_with_split_accuracy_tr(tmp_path: Path) -> TestRun:
+    test = AIDynamoTestDefinition(
+        name="ai_dynamo_aiperf_with_split_accuracy",
+        description="desc",
+        test_template_name="t",
+        cmd_args=AIDynamoCmdArgs(
+            docker_image_url="http://url",
+            workloads="aiperf.sh",
+            dynamo=AIDynamoArgs(
+                prefill_worker=WorkerConfig(
+                    cmd="python3 -m dynamo.vllm --is-prefill-worker",
+                    worker_initialized_regex="VllmWorker.*has.been.initialized",
+                    args=WorkerBaseArgs(),
+                ),
+            ),
+            aiperf=AIPerf(),
+            aiperf_accuracy=AIPerfAccuracy.model_validate({"cli": get_aiperf_accuracy_cli()}),
+            lmcache=LMCache(args=LMCacheArgs()),
+        ),
+    )
+    tr = TestRun(name="ai_dynamo_aiperf_with_split_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path)
+    (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content())
+    (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content())
+    (tr.output_path / test.success_marker).touch()
+    return tr
+
+
+@pytest.fixture
+def ai_dynamo_genai_perf_with_split_accuracy_tr(tmp_path: Path) -> TestRun:
+    test = AIDynamoTestDefinition(
+        name="ai_dynamo_genai_perf_with_split_accuracy",
+        description="desc",
+        test_template_name="t",
+        cmd_args=AIDynamoCmdArgs(
+            docker_image_url="http://url",
+            workloads="genai_perf.sh",
+            dynamo=AIDynamoArgs(
+                prefill_worker=WorkerConfig(
+                    cmd="python3 -m dynamo.vllm --is-prefill-worker",
+                    worker_initialized_regex="VllmWorker.*has.been.initialized",
+                    args=WorkerBaseArgs(),
+                ),
+            ),
+            genai_perf=GenAIPerf(),
+            aiperf_accuracy=AIPerfAccuracy.model_validate({"cli": get_aiperf_accuracy_cli()}),
+            lmcache=LMCache(args=LMCacheArgs()),
+        ),
+    )
+    tr = TestRun(
+        name="ai_dynamo_genai_perf_with_split_accuracy", test=test, num_nodes=1, nodes=[], output_path=tmp_path
+    )
+    (tr.output_path / "genai_perf_report.csv").write_text(get_csv_content())
+    (tr.output_path / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content())
+    (tr.output_path / test.success_marker).touch()
+    return tr
+
+
 @pytest.fixture
 def csv_content() -> str:
     return get_csv_content()
@@ -161,6 +230,23 @@ def test_ai_dynamo_get_metric_aiperf(slurm_system: SlurmSystem, ai_dynamo_aiperf
     assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47
 
 
+def test_ai_dynamo_get_metric_split_aiperf_accuracy(
+    slurm_system: SlurmSystem, ai_dynamo_aiperf_with_split_accuracy_tr: TestRun
+) -> None:
+    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_with_split_accuracy_tr)
+
+    assert strategy.get_metric("accuracy") == 0.35
+    assert strategy.get_metric("Inter Token Latency (ms)") == 2.83
+
+
+def test_ai_dynamo_accuracy_metric_requires_aiperf_accuracy_config(
+    slurm_system: SlurmSystem, ai_dynamo_aiperf_tr: TestRun
+) -> None:
+    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_tr)
+
+    assert strategy.get_metric("accuracy") == METRIC_ERROR
+
+
 def test_ai_dynamo_get_metric_invalid(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
     strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
 
@@ -176,9 +262,56 @@ def test_was_run_successful(ai_dynamo_tr: TestRun) -> None:
     assert result.is_successful is True
 
 
+def test_was_run_successful_with_split_aiperf_accuracy(
+    ai_dynamo_aiperf_with_split_accuracy_tr: TestRun,
+) -> None:
+    test_def = ai_dynamo_aiperf_with_split_accuracy_tr.test
+    result = test_def.was_run_successful(ai_dynamo_aiperf_with_split_accuracy_tr)
+    assert result.is_successful is True
+
+
+def test_was_run_successful_with_genai_perf_and_split_aiperf_accuracy(
+    ai_dynamo_genai_perf_with_split_accuracy_tr: TestRun,
+) -> None:
+    test_def = ai_dynamo_genai_perf_with_split_accuracy_tr.test
+    result = test_def.was_run_successful(ai_dynamo_genai_perf_with_split_accuracy_tr)
+    assert result.is_successful is True
+
+
+def test_was_run_successful_requires_split_aiperf_accuracy(
+    ai_dynamo_aiperf_with_split_accuracy_tr: TestRun,
+) -> None:
+    test_def = ai_dynamo_aiperf_with_split_accuracy_tr.test
+    (ai_dynamo_aiperf_with_split_accuracy_tr.output_path / "accuracy_results.csv").unlink()
+    result = test_def.was_run_successful(ai_dynamo_aiperf_with_split_accuracy_tr)
+    assert result.is_successful is False
+
+
 def test_was_run_successful_no_results(ai_dynamo_tr: TestRun, tmp_path: Path) -> None:
     test_def = ai_dynamo_tr.test
     ai_dynamo_tr.output_path = tmp_path / "empty_output"
     ai_dynamo_tr.output_path.mkdir(parents=True, exist_ok=True)
     result = test_def.was_run_successful(ai_dynamo_tr)
     assert result.is_successful is False
+
+
+def test_parse_aiperf_accuracy_from_artifact_dir(tmp_path: Path) -> None:
+    artifact_dir = tmp_path / "aiperf_artifacts"
+    artifact_dir.mkdir()
+    (artifact_dir / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content(), encoding="utf-8")
+
+    assert parse_aiperf_accuracy(tmp_path) == 0.35
+
+
+def test_parse_aiperf_accuracy_from_split_accuracy_artifact_dir(tmp_path: Path) -> None:
+    artifact_dir = tmp_path / "aiperf_accuracy_artifacts"
+    artifact_dir.mkdir()
+    (artifact_dir / "accuracy_results.csv").write_text(get_aiperf_accuracy_csv_content(), encoding="utf-8")
+
+    assert parse_aiperf_accuracy(tmp_path) == 0.35
+
+
+def test_parse_aiperf_accuracy_missing_or_invalid(tmp_path: Path) -> None:
+    assert parse_aiperf_accuracy(tmp_path) is None  # missing: nothing has been written under tmp_path yet
+    (tmp_path / "accuracy_results.csv").write_text("Task,Correct,Total,Accuracy\nOVERALL,n/a,100,n/a\n")
+    assert parse_aiperf_accuracy(tmp_path) is None  # invalid: neither Accuracy nor Correct is parseable
diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py
index 7d2812580..c07d1771d 100644
--- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py
@@ -150,28 +150,24 @@ def test_get_sglang_semantic_eval_command_defaults(sglang_cmd_gen_strategy: Sgla
     command = sglang_cmd_gen_strategy.get_semantic_eval_command()
 
     assert command == [
-        "python3",
-        "-m",
-        "sglang.test.run_eval",
-        "--host ${NODE}",
-        "--port 8000",
-        "--eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B",
+        "python3 -m sglang.test.run_eval",
+        "--host ${NODE} --port 8000 --eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B",
     ]
 
 
-def test_get_sglang_semantic_eval_command_supports_custom_module_and_args(
+def test_get_sglang_semantic_eval_command_supports_custom_entrypoint_and_cli(
     sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy,
 ):
     sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test)
     sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs(
-        module="sglang.test.few_shot_gsm8k",
-        args="--num-questions 200 --data-path {output_path}/gsm8k.jsonl --seen {host}:{port}",
+        entrypoint="python3 /custom/semantic_eval.py",
+        cli="--num-questions 200 --data-path {result_dir}/gsm8k.jsonl --seen {url}",
     )
 
     command = sglang_cmd_gen_strategy.get_semantic_eval_command()
 
     assert command is not None
-    assert command[2] == "sglang.test.few_shot_gsm8k"
+    assert command[0] == "python3 /custom/semantic_eval.py"
     assert command[-1] == (
         f"--num-questions 200 --data-path {sglang_cmd_gen_strategy.test_run.output_path.absolute()}/gsm8k.jsonl "
         "--seen ${NODE}:8000"
diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py
index 6bd6ada36..6eb62483c 100644
--- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py
@@ -193,14 +193,29 @@ def test_get_vllm_semantic_eval_command_defaults(self, vllm_cmd_gen_strategy: Vl
         command = vllm_cmd_gen_strategy.get_semantic_eval_command()
 
         assert command == [
-            "python3",
-            "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py",
-            "--host http://${NODE}",
-            "--port 8000",
+            "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py",
+            "--host http://${NODE} --port 8000 "
             "--num-questions 200 --save-results "
             f"{vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json",
         ]
 
+    def test_get_vllm_semantic_eval_command_supports_custom_entrypoint_and_cli(
+        self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy
+    ) -> None:
+        vllm_test = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test)
+        vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs(
+            entrypoint="python3 /custom/eval.py",
+            cli="--model {model} --api {url} --out {result_dir}/vllm-gsm8k.json",
+        )
+
+        command = vllm_cmd_gen_strategy.get_semantic_eval_command()
+
+        assert command == [
+            "python3 /custom/eval.py",
+            f"--model Qwen/Qwen3-0.6B --api http://${{NODE}}:8000 "
+            f"--out {vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json",
+        ]
+
     def test_gen_srun_command_contains_vllm_semantic_eval(
         self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy
     ) -> None: