From 62ff662d233013e459d63f23b056c2eb40404501 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Mon, 1 Jun 2026 23:29:56 +0200
Subject: [PATCH 1/7] restart router between aiperf runs

---
 src/cloudai/workloads/ai_dynamo/ai_dynamo.py  |  11 ++
 src/cloudai/workloads/ai_dynamo/ai_dynamo.sh  | 131 +++++++++++++++++-
 .../ai_dynamo/slurm_command_gen_strategy.py   |  29 ++++
 tests/ref_data/ai-dynamo-aiperf.sh            |   2 +
 .../test_command_gen_strategy_slurm.py        |   4 +
 5 files changed, 174 insertions(+), 3 deletions(-)

diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index b85b35d9a..faf70a2de 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -44,6 +44,7 @@
 AIPERF_ARTIFACTS_DIR = "aiperf_artifacts"
 AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts"
 AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv"
+AIPERF_DEFAULT_BETWEEN_PHASE_CMD = "/cloudai_run_results/routerctl.sh restart --reset-states"
 LMCACHE_CONFIG_FILE_NAME = "lmcache-config.yaml"
 LMCACHE_CONFIG_BACKUP_FILE_NAME = "lmcache-config.original.yaml"
 
@@ -294,6 +295,11 @@ class AIPerf(Workload):
         serialization_alias="continue-on-phase-failure",
         validation_alias=AliasChoices("continue-on-phase-failure", "continue_on_phase_failure"),
     )
+    between_phase_cmd: str | None = Field(
+        default=AIPERF_DEFAULT_BETWEEN_PHASE_CMD,
+        serialization_alias="between-phase-cmd",
+        validation_alias=AliasChoices("between-phase-cmd", "between_phase_cmd"),
+    )
 
     @property
     def installables(self) -> list[Installable]:
@@ -334,6 +340,11 @@ class AIPerfPhase(BaseModel):
         serialization_alias="extra-args",
         validation_alias=AliasChoices("extra-args", "extra_args"),
     )
+    between_phase_cmd: str | None = Field(
+        default=None,
+        serialization_alias="between-phase-cmd",
+        validation_alias=AliasChoices("between-phase-cmd", "between_phase_cmd"),
+    )
 
 
 class AIPerfAccuracy(BaseModel):
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
index 5697a78ea..f9e84ef9a 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
@@ -427,6 +427,9 @@ function perform_exit()
     log "Sleeping for ${sleep_before_exit} seconds before exit"
     sleep "${sleep_before_exit}"
   fi
+  if _is_frontend_node && [[ -x "${RESULTS_DIR}/routerctl.sh" ]]; then
+    "${RESULTS_DIR}/routerctl.sh" stop || true
+  fi
   exit "${exit_code}"
 }
 
@@ -733,8 +736,130 @@ function launch_nats()
 
 function launch_ingress()
 {
-  log "Launching ingress with cmd: ${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]}"
-  ${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]} > ${RESULTS_DIR}/dynamo_ingress.log 2>&1
+  write_routerctl
+  start_router
+}
+
+function write_routerctl()
+{
+  export ROUTER_CMD="${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]}"
+  export ROUTER_URL="${dynamo_args["url"]}"
+  export ROUTER_PID_FILE="${RESULTS_DIR}/router.pid"
+  export ROUTER_LOG_FILE="${RESULTS_DIR}/dynamo_ingress.log"
+  export ROUTER_START_TIMEOUT="${ROUTER_START_TIMEOUT:-120}"
+  export ROUTER_STOP_TIMEOUT="${ROUTER_STOP_TIMEOUT:-30}"
+
+  cat > "${RESULTS_DIR}/routerctl.sh" <<'EOF'
+#!/usr/bin/env bash
+set -Eeuo pipefail
+
+log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; }
+
+: "${ROUTER_CMD:?ROUTER_CMD is not set}"
+: "${ROUTER_URL:?ROUTER_URL is not set}"
+: "${ROUTER_PID_FILE:?ROUTER_PID_FILE is not set}"
+: "${ROUTER_LOG_FILE:?ROUTER_LOG_FILE is not set}"
+: "${ROUTER_START_TIMEOUT:=120}"
+: "${ROUTER_STOP_TIMEOUT:=30}"
+
+router_pid() {
+  if [[ -s "${ROUTER_PID_FILE}" ]]; then
+    cat "${ROUTER_PID_FILE}"
+  fi
+}
+
+router_is_running() {
+  local pid
+  pid="$(router_pid)"
+  [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null
+}
+
+wait_for_router() {
+  local deadline=$((SECONDS + ROUTER_START_TIMEOUT))
+  until curl -sS -o /dev/null --connect-timeout 1 --max-time 2 "${ROUTER_URL}"; do
+    if ! router_is_running; then
+      log "ERROR: Router process exited before ${ROUTER_URL} became reachable"
+      return 1
+    fi
+    if (( SECONDS >= deadline )); then
+      log "ERROR: Router did not become reachable at ${ROUTER_URL} within ${ROUTER_START_TIMEOUT}s"
+      return 1
+    fi
+    sleep 1
+  done
+  log "Router is reachable at ${ROUTER_URL}"
+}
+
+start_router() {
+  local cmd="${ROUTER_CMD}"
+  if [[ "${1:-}" == "--reset-states" && "${cmd}" != *"--router-reset-states"* ]]; then
+    cmd="${cmd} --router-reset-states"
+  fi
+
+  if router_is_running; then
+    log "Router is already running with PID $(router_pid)"
+    return 0
+  fi
+
+  mkdir -p "$(dirname "${ROUTER_LOG_FILE}")"
+  log "Starting router with cmd: ${cmd}"
+  nohup bash -lc "${cmd}" >> "${ROUTER_LOG_FILE}" 2>&1 &
+  local pid=$!
+  echo "${pid}" > "${ROUTER_PID_FILE}"
+  log "Router PID: ${pid}"
+  wait_for_router
+}
+
+stop_router() {
+  if ! router_is_running; then
+    rm -f "${ROUTER_PID_FILE}"
+    log "Router is not running"
+    return 0
+  fi
+
+  local pid
+  pid="$(router_pid)"
+  log "Stopping router PID ${pid}"
+  kill -TERM "${pid}" 2>/dev/null || true
+
+  local deadline=$((SECONDS + ROUTER_STOP_TIMEOUT))
+  while kill -0 "${pid}" 2>/dev/null; do
+    if (( SECONDS >= deadline )); then
+      log "ERROR: Router PID ${pid} did not stop within ${ROUTER_STOP_TIMEOUT}s"
+      return 1
+    fi
+    sleep 1
+  done
+
+  rm -f "${ROUTER_PID_FILE}"
+  log "Router stopped"
+}
+
+case "${1:-}" in
+  start)
+    shift
+    start_router "$@"
+    ;;
+  stop)
+    stop_router
+    ;;
+  restart)
+    shift
+    stop_router
+    start_router "$@"
+    ;;
+  *)
+    echo "Usage: $0 {start|stop|restart} [--reset-states]" >&2
+    exit 2
+    ;;
+esac
+EOF
+  chmod +x "${RESULTS_DIR}/routerctl.sh"
+}
+
+function start_router()
+{
+  "${RESULTS_DIR}/routerctl.sh" start
 }
 
 launch_sgl_http_server() {
@@ -1184,7 +1309,7 @@ function main()
     launch_etcd &
     launch_nats &
     wait_for_etcd
-    launch_ingress &
+    launch_ingress
     if _is_sglang_dsr1; then
       launch_sgl_http_server
     fi
diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
index cc3b51273..439c0eda8 100644
--- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
@@ -215,6 +215,26 @@ def _render_aiperf_setup_blocks(self, log_message: str, setup_cmd: str | None) -
             ).rstrip()
         ]
 
+    def _render_between_aiperf_phases_block(
+        self,
+        phase_name: str,
+        cmd: str | None,
+    ) -> list[str]:
+        """Return the script lines that log and run ``cmd`` via ``bash -lc`` after ``phase_name``.
+
+        An unset or empty ``cmd`` yields no lines. The two entries are built directly rather
+        than dedenting and splitting a template, so a multi-line ``cmd`` stays inside a single
+        shell-quoted argument and is not re-indented line by line by the caller.
+        """
+        if not cmd:
+            return []
+
+        hook_cmd = shlex.join(["bash", "-lc", cmd])
+        return [
+            f"log {shlex.quote(f'Running AIPerf between-phase command after {phase_name}: {hook_cmd}')}",
+            hook_cmd,
+        ]
+
     def _render_aiperf_script(self) -> str:
         phases = self.td.cmd_args.aiperf_phases or [AIPerfPhase.model_validate({"name": "aiperf"})]
         single_phase = len(phases) == 1
@@ -298,6 +318,15 @@ def _render_aiperf_script(self) -> str:
                     phase_lines.append(f"  cp {shlex.quote(report_file)} {shlex.quote(final_report_file)}")
                 phase_lines.append(f"  log {shlex.quote(f'Final AIPerf report saved to {final_report_file}')}")
 
+            if not single_phase and idx < len(phases) - 1:
+                phase_lines.extend(
+                    "  " + line
+                    for line in self._render_between_aiperf_phases_block(
+                        phase_name=phase.name,
+                        cmd=resolved_phase.between_phase_cmd,
+                    )
+                )
+
             if not single_phase and idx < len(phases) - 1 and resolved_phase.health_check_between_phases:
                 health_probe_cmd = (
                     '  if ! curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" '
diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh
index 60798ef8b..495959d70 100644
--- a/tests/ref_data/ai-dynamo-aiperf.sh
+++ b/tests/ref_data/ai-dynamo-aiperf.sh
@@ -24,6 +24,8 @@ if [[ "$phase_status" -eq 0 ]]; then
   mkdir -p /cloudai_run_results
   cp /cloudai_run_results/aiperf_artifacts/round_1/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_1_report.csv
   log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv'
+  log 'Running AIPerf between-phase command after round_1: bash -lc '"'"'/cloudai_run_results/routerctl.sh restart --reset-states'"'"''
+  bash -lc '/cloudai_run_results/routerctl.sh restart --reset-states'
   if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then
     log 'FATAL: failure marker found between AIPerf phases'
     exit 1
diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
index 46a10906b..d5c832156 100644
--- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py
@@ -226,6 +226,7 @@ def test_gen_script_args_writes_resolved_aiperf_script(strategy: AIDynamoSlurmCo
     td.cmd_args.aiperf = AIPerf.model_validate(
         {
             "setup-cmd": "python -m pip install --upgrade aiperf",
+            "between-phase-cmd": "curl -fsS -X POST ${FRONTEND_URL}/reset_prefix_cache || true",
             "args": {
                 "concurrency": 2,
                 "request-count": 50,
@@ -254,6 +255,9 @@ def test_gen_script_args_writes_resolved_aiperf_script(strategy: AIDynamoSlurmCo
     assert "Running AIPerf phase setup for round_1" not in script
     assert "Running AIPerf phase setup for round_2" in script
     assert "bash -lc 'python -m pip install --upgrade another-aiperf-plugin'" in script
+    assert script.count("Running AIPerf between-phase command after") == 1
+    assert "Running AIPerf between-phase command after round_1" in script
+    assert "bash -lc 'curl -fsS -X POST ${FRONTEND_URL}/reset_prefix_cache || true'" in script
     assert ': "${FRONTEND_URL:?FRONTEND_URL is not set}"' in script
     assert '--url "$FRONTEND_URL"' in script
     assert f"--artifact-dir {strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts/round_1" in script

From 9045b5da4e683ec0b3a3fe9015dc8f35cf5428be Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Mon, 1 Jun 2026 23:40:09 +0200
Subject: [PATCH 2/7] better waiting for router to start

---
 .../ai_dynamo/slurm_command_gen_strategy.py   | 19 ++++++++++++++-----
 tests/ref_data/ai-dynamo-aiperf.sh            | 17 +++++++++++++----
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
index 439c0eda8..8f0f98e9e 100644
--- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
@@ -250,6 +250,7 @@ def _render_aiperf_script(self) -> str:
                 : "${{AIPERF_MODEL:={self.td.cmd_args.dynamo.model}}}"
                 : "${{AIPERF_ENDPOINT:={self.td.cmd_args.dynamo.endpoint}}}"
                 : "${{AIPERF_FAILURE_MARKER:={self.CONTAINER_MOUNT_OUTPUT}/{self.td.failure_marker}}}"
+                : "${{AIPERF_HEALTH_TIMEOUT:=120}}"
                 """
             ).rstrip()
         ]
@@ -329,22 +330,30 @@ def _render_aiperf_script(self) -> str:
 
             if not single_phase and idx < len(phases) - 1 and resolved_phase.health_check_between_phases:
                 health_probe_cmd = (
-                    '  if ! curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" '
+                    '  until curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" '
                     "-H 'Content-Type: application/json' "
                     '-d "{\\"model\\":\\"${AIPERF_MODEL}\\",\\"messages\\":[{\\"role\\":\\"user\\",'
                     '\\"content\\":\\"ping\\"}],\\"stream\\":false,\\"max_tokens\\":1}" '
-                    ">/dev/null; then"
+                    ">/dev/null; do"
                 )
                 phase_lines.extend(
                     [
+                        "  health_deadline=$((SECONDS + AIPERF_HEALTH_TIMEOUT))",
                         '  if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then',
                         "    log 'FATAL: failure marker found between AIPerf phases'",
                         "    exit 1",
                         "  fi",
                         health_probe_cmd,
-                        "    log 'FATAL: frontend health probe failed between AIPerf phases'",
-                        "    exit 1",
-                        "  fi",
+                        '    if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then',
+                        "      log 'FATAL: failure marker found while waiting for frontend between AIPerf phases'",
+                        "      exit 1",
+                        "    fi",
+                        "    if (( SECONDS >= health_deadline )); then",
+                        "      log 'FATAL: frontend health probe failed between AIPerf phases'",
+                        "      exit 1",
+                        "    fi",
+                        "    sleep 1",
+                        "  done",
                     ]
                 )
             phase_lines.append("fi")
diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh
index 495959d70..ac268122c 100644
--- a/tests/ref_data/ai-dynamo-aiperf.sh
+++ b/tests/ref_data/ai-dynamo-aiperf.sh
@@ -7,6 +7,7 @@ log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; }
 : "${AIPERF_MODEL:=model}"
 : "${AIPERF_ENDPOINT:=v1/chat/completions}"
 : "${AIPERF_FAILURE_MARKER:=/cloudai_run_results/failure-marker.txt}"
+: "${AIPERF_HEALTH_TIMEOUT:=120}"
 
 rm -rf /cloudai_run_results/aiperf_artifacts/round_1
 mkdir -p /cloudai_run_results/aiperf_artifacts/round_1
@@ -26,14 +27,22 @@ if [[ "$phase_status" -eq 0 ]]; then
   log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv'
   log 'Running AIPerf between-phase command after round_1: bash -lc '"'"'/cloudai_run_results/routerctl.sh restart --reset-states'"'"''
   bash -lc '/cloudai_run_results/routerctl.sh restart --reset-states'
+  health_deadline=$((SECONDS + AIPERF_HEALTH_TIMEOUT))
   if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then
     log 'FATAL: failure marker found between AIPerf phases'
     exit 1
   fi
-  if ! curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" -H 'Content-Type: application/json' -d "{\"model\":\"${AIPERF_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" >/dev/null; then
-    log 'FATAL: frontend health probe failed between AIPerf phases'
-    exit 1
-  fi
+  until curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" -H 'Content-Type: application/json' -d "{\"model\":\"${AIPERF_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" >/dev/null; do
+    if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then
+      log 'FATAL: failure marker found while waiting for frontend between AIPerf phases'
+      exit 1
+    fi
+    if (( SECONDS >= health_deadline )); then
+      log 'FATAL: frontend health probe failed between AIPerf phases'
+      exit 1
+    fi
+    sleep 1
+  done
 fi
 
 rm -rf /cloudai_run_results/aiperf_artifacts/round_2

From 1ab58d7c0880a70d5bd0237fe6fedc05f47a02bf Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Mon, 1 Jun 2026 23:43:29 +0200
Subject: [PATCH 3/7] deduplicate router readiness check

---
 src/cloudai/workloads/ai_dynamo/ai_dynamo.sh  | 15 +++++++++++----
 .../ai_dynamo/slurm_command_gen_strategy.py   | 19 +++++--------------
 tests/ref_data/ai-dynamo-aiperf.sh            | 17 ++++-------------
 3 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
index f9e84ef9a..8b1f3475a 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
@@ -744,6 +744,8 @@ function write_routerctl()
 {
   export ROUTER_CMD="${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]}"
   export ROUTER_URL="${dynamo_args["url"]}"
+  export ROUTER_HEALTH_ENDPOINT="${dynamo_args["endpoint"]}"
+  export ROUTER_HEALTH_MODEL="${dynamo_args["model"]}"
   export ROUTER_PID_FILE="${RESULTS_DIR}/router.pid"
   export ROUTER_LOG_FILE="${RESULTS_DIR}/dynamo_ingress.log"
   export ROUTER_START_TIMEOUT="${ROUTER_START_TIMEOUT:-120}"
@@ -757,6 +759,8 @@ log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; }
 
 : "${ROUTER_CMD:?ROUTER_CMD is not set}"
 : "${ROUTER_URL:?ROUTER_URL is not set}"
+: "${ROUTER_HEALTH_ENDPOINT:?ROUTER_HEALTH_ENDPOINT is not set}"
+: "${ROUTER_HEALTH_MODEL:?ROUTER_HEALTH_MODEL is not set}"
 : "${ROUTER_PID_FILE:?ROUTER_PID_FILE is not set}"
 : "${ROUTER_LOG_FILE:?ROUTER_LOG_FILE is not set}"
 : "${ROUTER_START_TIMEOUT:=120}"
@@ -776,18 +780,21 @@ router_is_running() {
 
 wait_for_router() {
   local deadline=$((SECONDS + ROUTER_START_TIMEOUT))
-  until curl -sS -o /dev/null --connect-timeout 1 --max-time 2 "${ROUTER_URL}"; do
+  until curl -fsS --connect-timeout 1 --max-time 10 -X POST "${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT}" \
+    -H 'Content-Type: application/json' \
+    -d "{\"model\":\"${ROUTER_HEALTH_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" \
+    >/dev/null; do
     if ! router_is_running; then
-      log "ERROR: Router process exited before ${ROUTER_URL} became reachable"
+      log "ERROR: Router process exited before ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT} became ready"
       return 1
     fi
     if (( SECONDS >= deadline )); then
-      log "ERROR: Router did not become reachable at ${ROUTER_URL} within ${ROUTER_START_TIMEOUT}s"
+      log "ERROR: Router did not become ready at ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT} within ${ROUTER_START_TIMEOUT}s"
       return 1
     fi
     sleep 1
   done
-  log "Router is reachable at ${ROUTER_URL}"
+  log "Router is ready at ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT}"
 }
 
 start_router() {
diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
index 8f0f98e9e..439c0eda8 100644
--- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
@@ -250,7 +250,6 @@ def _render_aiperf_script(self) -> str:
                 : "${{AIPERF_MODEL:={self.td.cmd_args.dynamo.model}}}"
                 : "${{AIPERF_ENDPOINT:={self.td.cmd_args.dynamo.endpoint}}}"
                 : "${{AIPERF_FAILURE_MARKER:={self.CONTAINER_MOUNT_OUTPUT}/{self.td.failure_marker}}}"
-                : "${{AIPERF_HEALTH_TIMEOUT:=120}}"
                 """
             ).rstrip()
         ]
@@ -330,30 +329,22 @@ def _render_aiperf_script(self) -> str:
 
             if not single_phase and idx < len(phases) - 1 and resolved_phase.health_check_between_phases:
                 health_probe_cmd = (
-                    '  until curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" '
+                    '  if ! curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" '
                     "-H 'Content-Type: application/json' "
                     '-d "{\\"model\\":\\"${AIPERF_MODEL}\\",\\"messages\\":[{\\"role\\":\\"user\\",'
                     '\\"content\\":\\"ping\\"}],\\"stream\\":false,\\"max_tokens\\":1}" '
-                    ">/dev/null; do"
+                    ">/dev/null; then"
                 )
                 phase_lines.extend(
                     [
-                        "  health_deadline=$((SECONDS + AIPERF_HEALTH_TIMEOUT))",
                         '  if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then',
                         "    log 'FATAL: failure marker found between AIPerf phases'",
                         "    exit 1",
                         "  fi",
                         health_probe_cmd,
-                        '    if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then',
-                        "      log 'FATAL: failure marker found while waiting for frontend between AIPerf phases'",
-                        "      exit 1",
-                        "    fi",
-                        "    if (( SECONDS >= health_deadline )); then",
-                        "      log 'FATAL: frontend health probe failed between AIPerf phases'",
-                        "      exit 1",
-                        "    fi",
-                        "    sleep 1",
-                        "  done",
+                        "    log 'FATAL: frontend health probe failed between AIPerf phases'",
+                        "    exit 1",
+                        "  fi",
                     ]
                 )
             phase_lines.append("fi")
diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh
index ac268122c..495959d70 100644
--- a/tests/ref_data/ai-dynamo-aiperf.sh
+++ b/tests/ref_data/ai-dynamo-aiperf.sh
@@ -7,7 +7,6 @@ log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; }
 : "${AIPERF_MODEL:=model}"
 : "${AIPERF_ENDPOINT:=v1/chat/completions}"
 : "${AIPERF_FAILURE_MARKER:=/cloudai_run_results/failure-marker.txt}"
-: "${AIPERF_HEALTH_TIMEOUT:=120}"
 
 rm -rf /cloudai_run_results/aiperf_artifacts/round_1
 mkdir -p /cloudai_run_results/aiperf_artifacts/round_1
@@ -27,22 +26,14 @@ if [[ "$phase_status" -eq 0 ]]; then
   log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv'
   log 'Running AIPerf between-phase command after round_1: bash -lc '"'"'/cloudai_run_results/routerctl.sh restart --reset-states'"'"''
   bash -lc '/cloudai_run_results/routerctl.sh restart --reset-states'
-  health_deadline=$((SECONDS + AIPERF_HEALTH_TIMEOUT))
   if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then
     log 'FATAL: failure marker found between AIPerf phases'
     exit 1
   fi
-  until curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" -H 'Content-Type: application/json' -d "{\"model\":\"${AIPERF_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" >/dev/null; do
-    if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then
-      log 'FATAL: failure marker found while waiting for frontend between AIPerf phases'
-      exit 1
-    fi
-    if (( SECONDS >= health_deadline )); then
-      log 'FATAL: frontend health probe failed between AIPerf phases'
-      exit 1
-    fi
-    sleep 1
-  done
+  if ! curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" -H 'Content-Type: application/json' -d "{\"model\":\"${AIPERF_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" >/dev/null; then
+    log 'FATAL: frontend health probe failed between AIPerf phases'
+    exit 1
+  fi
 fi
 
 rm -rf /cloudai_run_results/aiperf_artifacts/round_2

From 28a73551e690f6c70867fcf7f4c3c3e85477f605 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 2 Jun 2026 00:06:14 +0200
Subject: [PATCH 4/7] trying to improve accuracy

---
 conf/experimental/ai_dynamo/test/sglang.toml                | 2 +-
 conf/experimental/ai_dynamo/test/vllm.toml                  | 2 +-
 conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index 34bc9cbff..bc4b6068d 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -117,7 +117,7 @@ workloads = "aiperf.sh"
 --accuracy-n-shots 5
 --accuracy-tasks abstract_algebra
 --concurrency 10
---extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--extra-inputs '{"temperature":0,"stop":["\n"],"chat_template_kwargs":{"enable_thinking":false}}'
 --num-requests 100
 '''
 
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index ea2a4552c..b09a2d1cf 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -115,7 +115,7 @@ workloads = "aiperf.sh"
 --accuracy-n-shots 5
 --accuracy-tasks abstract_algebra
 --concurrency 10
---extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--extra-inputs '{"temperature":0,"stop":["\n"],"chat_template_kwargs":{"enable_thinking":false}}'
 --num-requests 100
 '''
 
diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml
index c63319b4e..09672c786 100644
--- a/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml
+++ b/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml
@@ -121,7 +121,7 @@ dse_excluded_args = [
 --accuracy-n-shots 5
 --accuracy-tasks abstract_algebra
 --concurrency 10
---extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--extra-inputs '{"temperature":0,"stop":["\n"],"chat_template_kwargs":{"enable_thinking":false}}'
 --num-requests 100
 '''
 

From a10d29e3655eec6c312ef8d4f5945cc6c6fdbb43 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 2 Jun 2026 17:01:26 +0200
Subject: [PATCH 5/7] update docs

---
 doc/workloads/ai_dynamo.rst                  | 49 ++++++++++++++++++++
 src/cloudai/workloads/ai_dynamo/ai_dynamo.py |  3 +-
 tests/ref_data/ai-dynamo-aiperf.sh           |  4 +-
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 7dbba92b6..4d62fc762 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -84,6 +84,14 @@ The job progress monitoring can be done using either of the following options:
 
 The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch the configured benchmark tool (``aiperf`` by default), which begins generating requests to the frontend server. All servers cooperate to complete inference, and the output will appear in ``stdout.txt``.
 
+Recent AIDynamo Slurm features:
+
+- Multi-phase AIPerf runs with base config plus per-phase overrides.
+- Optional between-phase bash hook for backend-specific cleanup; the default hook is a no-op.
+- ``server-metrics = "auto"`` support, including CloudAI-started DCGM exporters.
+- LMCache config propagation from structured TOML to worker-visible YAML, with optional LMCache controller launch.
+- ``dse_excluded_args`` for list-valued config that must not become a DSE sweep dimension.
+
 Choosing a Benchmark Tool
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -110,6 +118,47 @@ To use genai-perf, set:
      output-tokens-mean = 500
      request-count = 50
 
+AIPerf Multi-Phase Runs
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``cmd_args.aiperf`` is the base AIPerf config. ``cmd_args.aiperf_phases`` can run several AIPerf rounds against the
+same live Dynamo stack without restarting prefill, decode, or router processes:
+
+.. code-block:: toml
+
+   dse_excluded_args = ["cmd_args.aiperf_phases"]
+
+   [cmd_args.aiperf]
+   health-check-between-phases = true
+   between-phase-cmd = "true"  # default no-op
+
+     [cmd_args.aiperf.args]
+     request-count = 50
+     server-metrics = "auto"
+
+   [[cmd_args.aiperf_phases]]
+   name = "round_1"
+     [cmd_args.aiperf_phases.args]
+     concurrency = 2
+
+   [[cmd_args.aiperf_phases]]
+   name = "round_2"
+     [cmd_args.aiperf_phases.args]
+     concurrency = 4
+
+Single-phase runs keep the old artifact layout: ``aiperf_artifacts/``, ``aiperf.log``, and ``aiperf_report.csv``.
+Multi-phase runs write per-phase artifacts/logs/reports and copy the last phase report to ``aiperf_report.csv`` for
+existing report generation.
+
+``between-phase-cmd`` is a bash command run after each non-final phase. The default is a no-op. Set it explicitly for
+backend-specific cache cleanup, for example ``/cloudai_run_results/routerctl.sh restart --reset-states`` if a test needs
+to restart the Dynamo router between phases. ``health-check-between-phases`` probes the frontend after the command.
+
+AIPerf args are rendered as normal CLI flags. Multi-value AIPerf options should be passed with AIPerf CLI syntax, such
+as ``server-metrics-formats = "csv,json,jsonl"`` or ``gpu-telemetry = "node1:9401,node2:9401"``. ``server-metrics =
+"auto"`` expands to the frontend metrics endpoint, Dynamo worker metrics endpoints, and any CloudAI-started DCGM
+exporters.
+
 Propagating LMCache Configuration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index faf70a2de..298d9a79d 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -44,7 +44,6 @@
 AIPERF_ARTIFACTS_DIR = "aiperf_artifacts"
 AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts"
 AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv"
-AIPERF_DEFAULT_BETWEEN_PHASE_CMD = "/cloudai_run_results/routerctl.sh restart --reset-states"
 LMCACHE_CONFIG_FILE_NAME = "lmcache-config.yaml"
 LMCACHE_CONFIG_BACKUP_FILE_NAME = "lmcache-config.original.yaml"
 
@@ -296,7 +295,7 @@ class AIPerf(Workload):
         validation_alias=AliasChoices("continue-on-phase-failure", "continue_on_phase_failure"),
     )
     between_phase_cmd: str | None = Field(
-        default=AIPERF_DEFAULT_BETWEEN_PHASE_CMD,
+        default="true",
         serialization_alias="between-phase-cmd",
         validation_alias=AliasChoices("between-phase-cmd", "between_phase_cmd"),
     )
diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh
index 495959d70..29fa95b7c 100644
--- a/tests/ref_data/ai-dynamo-aiperf.sh
+++ b/tests/ref_data/ai-dynamo-aiperf.sh
@@ -24,8 +24,8 @@ if [[ "$phase_status" -eq 0 ]]; then
   mkdir -p /cloudai_run_results
   cp /cloudai_run_results/aiperf_artifacts/round_1/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_1_report.csv
   log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv'
-  log 'Running AIPerf between-phase command after round_1: bash -lc '"'"'/cloudai_run_results/routerctl.sh restart --reset-states'"'"''
-  bash -lc '/cloudai_run_results/routerctl.sh restart --reset-states'
+  log 'Running AIPerf between-phase command after round_1: bash -lc true'
+  bash -lc true
   if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then
     log 'FATAL: failure marker found between AIPerf phases'
     exit 1

From 1d2060861be88292e4ad0a97303fe1a3af706c3f Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 2 Jun 2026 17:12:03 +0200
Subject: [PATCH 6/7] cleaner docs

---
 doc/USER_GUIDE.rst          | 19 +++++++++++++------
 doc/workloads/ai_dynamo.rst |  8 --------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/doc/USER_GUIDE.rst b/doc/USER_GUIDE.rst
index 966fadb9f..ea42e516d 100644
--- a/doc/USER_GUIDE.rst
+++ b/doc/USER_GUIDE.rst
@@ -210,11 +210,13 @@ DSE parameter exclusions
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 CloudAI builds the DSE parameter space implicitly from list-valued fields under ``cmd_args``, list-valued
-``extra_env_vars``, and list-valued ``num_nodes``. If a list-valued ``cmd_args`` field is configuration data rather than
-a sweep dimension, exclude it with ``dse_excluded_args`` in the test or scenario definition.
+``extra_env_vars``, and list-valued ``num_nodes``. Most lists mean "try each value", but some workload settings are
+real list-valued configuration, such as worker port lists or ordered benchmark phases.
 
-Entries in ``dse_excluded_args`` must be dot-separated paths that start with ``cmd_args.``. Each entry excludes that
-field and any nested fields below it from DSE parameter discovery:
+Use ``dse_excluded_args`` when a list under ``cmd_args`` should stay intact instead of becoming a sweep dimension.
+Entries must be dot-separated paths that start with ``cmd_args.`` and may point to either a single field or a parent
+object. Matching is prefix-based, so excluding ``cmd_args.foo`` also excludes nested list-valued fields such as
+``cmd_args.foo.bar`` from DSE parameter discovery.
 
 .. code-block:: toml
 
@@ -228,8 +230,13 @@ field and any nested fields below it from DSE parameter discovery:
      lmcache_worker_ports = [8788, 8789, 8790, 8791]
 
 In this example, ``cmd_args.lmcache.chunk_size`` is still swept, while
-``cmd_args.lmcache.lmcache_worker_ports`` is treated as a single configuration value. The exclusion mechanism currently
-applies only to ``cmd_args`` paths; it does not exclude ``extra_env_vars`` or ``num_nodes`` from DSE.
+``cmd_args.lmcache.lmcache_worker_ports`` is passed through as one list value. The exclusion does not remove or mutate
+the field; it only prevents CloudAI from adding that path to the DSE parameter space.
+
+``dse_excluded_args`` currently applies only to ``cmd_args`` paths. It does not exclude list-valued ``extra_env_vars``
+or ``num_nodes``; those lists are still interpreted as sweep dimensions. To exclude many nested list fields at once,
+exclude their common parent path. Commonly excluded paths include ``cmd_args.aiperf_phases`` and
+``cmd_args.lmcache.lmcache_worker_ports``.
 
 Metric errors and report strategies
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 4d62fc762..46382038d 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -84,14 +84,6 @@ The job progress monitoring can be done using either of the following options:
 
 The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch the configured benchmark tool (``aiperf`` by default), which begins generating requests to the frontend server. All servers cooperate to complete inference, and the output will appear in ``stdout.txt``.
 
-Recent AIDynamo Slurm features:
-
-- Multi-phase AIPerf runs with base config plus per-phase overrides.
-- Optional between-phase bash hook for backend-specific cleanup; the default hook is a no-op.
-- ``server-metrics = "auto"`` support, including CloudAI-started DCGM exporters.
-- LMCache config propagation from structured TOML to worker-visible YAML, with optional LMCache controller launch.
-- ``dse_excluded_args`` for list-valued config that must not become a DSE sweep dimension.
-
 Choosing a Benchmark Tool
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 

From c385839c7f0ec855fdf02fdc6b6bfa3edf328cd2 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 2 Jun 2026 17:20:24 +0200
Subject: [PATCH 7/7] update configs, update docs

---
 conf/experimental/ai_dynamo/test/sglang.toml | 1 +
 conf/experimental/ai_dynamo/test/vllm.toml   | 1 +
 doc/workloads/ai_dynamo.rst                  | 6 +++---
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index bc4b6068d..8bdefb688 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -28,6 +28,7 @@ workloads = "aiperf.sh"
   backend = "sglang"
   model = "Qwen/Qwen3-0.6B"
   endpoint = "v1/chat/completions"
+  ingress-cmd = "python -m dynamo.frontend --router-mode kv --router-reset-states"
 
     [cmd_args.dynamo.prefill_worker]
     num-nodes = 1
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index b09a2d1cf..20890b65d 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -27,6 +27,7 @@ workloads = "aiperf.sh"
   [cmd_args.dynamo]
   backend = "vllm"
   model = "Qwen/Qwen3-0.6B"
+  ingress-cmd = "python -m dynamo.frontend --router-mode kv --router-reset-states"
 
     [cmd_args.dynamo.prefill_worker]
     num-nodes = 1
diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 46382038d..560107090 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -114,7 +114,7 @@ AIPerf Multi-Phase Runs
 ~~~~~~~~~~~~~~~~~~~~~~~
 
 ``cmd_args.aiperf`` is the base AIPerf config. ``cmd_args.aiperf_phases`` can run several AIPerf rounds against the
-same live Dynamo stack without restarting prefill, decode, or router processes:
+same live Dynamo stack. By default, CloudAI does not restart prefill, decode, or router processes between phases:
 
 .. code-block:: toml
 
@@ -143,8 +143,8 @@ Multi-phase runs write per-phase artifacts/logs/reports and copy the last phase
 existing report generation.
 
 ``between-phase-cmd`` is a bash command run after each non-final phase. The default is a no-op. Set it explicitly for
-backend-specific cache cleanup, for example ``/cloudai_run_results/routerctl.sh restart --reset-states`` if a test needs
-to restart the Dynamo router between phases. ``health-check-between-phases`` probes the frontend after the command.
+backend-specific cache cleanup, for example ``/cloudai_run_results/routerctl.sh restart`` if a test needs to restart the
+Dynamo router between phases. ``health-check-between-phases`` probes the frontend after the command.
 
 AIPerf args are rendered as normal CLI flags. Multi-value AIPerf options should be passed with AIPerf CLI syntax, such
 as ``server-metrics-formats = "csv,json,jsonl"`` or ``gpu-telemetry = "node1:9401,node2:9401"``. ``server-metrics =