From 62ff662d233013e459d63f23b056c2eb40404501 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 23:29:56 +0200 Subject: [PATCH 1/7] restart router between aiperf runs --- src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 11 ++ src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 131 +++++++++++++++++- .../ai_dynamo/slurm_command_gen_strategy.py | 29 ++++ tests/ref_data/ai-dynamo-aiperf.sh | 2 + .../test_command_gen_strategy_slurm.py | 4 + 5 files changed, 174 insertions(+), 3 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index b85b35d9a..faf70a2de 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -44,6 +44,7 @@ AIPERF_ARTIFACTS_DIR = "aiperf_artifacts" AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts" AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv" +AIPERF_DEFAULT_BETWEEN_PHASE_CMD = "/cloudai_run_results/routerctl.sh restart --reset-states" LMCACHE_CONFIG_FILE_NAME = "lmcache-config.yaml" LMCACHE_CONFIG_BACKUP_FILE_NAME = "lmcache-config.original.yaml" @@ -294,6 +295,11 @@ class AIPerf(Workload): serialization_alias="continue-on-phase-failure", validation_alias=AliasChoices("continue-on-phase-failure", "continue_on_phase_failure"), ) + between_phase_cmd: str | None = Field( + default=AIPERF_DEFAULT_BETWEEN_PHASE_CMD, + serialization_alias="between-phase-cmd", + validation_alias=AliasChoices("between-phase-cmd", "between_phase_cmd"), + ) @property def installables(self) -> list[Installable]: @@ -334,6 +340,11 @@ class AIPerfPhase(BaseModel): serialization_alias="extra-args", validation_alias=AliasChoices("extra-args", "extra_args"), ) + between_phase_cmd: str | None = Field( + default=None, + serialization_alias="between-phase-cmd", + validation_alias=AliasChoices("between-phase-cmd", "between_phase_cmd"), + ) class AIPerfAccuracy(BaseModel): diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh 
b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index 5697a78ea..f9e84ef9a 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -427,6 +427,9 @@ function perform_exit() log "Sleeping for ${sleep_before_exit} seconds before exit" sleep "${sleep_before_exit}" fi + if _is_frontend_node && [[ -x "${RESULTS_DIR}/routerctl.sh" ]]; then + "${RESULTS_DIR}/routerctl.sh" stop || true + fi exit "${exit_code}" } @@ -733,8 +736,130 @@ function launch_nats() function launch_ingress() { - log "Launching ingress with cmd: ${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]}" - ${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]} > ${RESULTS_DIR}/dynamo_ingress.log 2>&1 + write_routerctl + start_router +} + +function write_routerctl() +{ + export ROUTER_CMD="${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]}" + export ROUTER_URL="${dynamo_args["url"]}" + export ROUTER_PID_FILE="${RESULTS_DIR}/router.pid" + export ROUTER_LOG_FILE="${RESULTS_DIR}/dynamo_ingress.log" + export ROUTER_START_TIMEOUT="${ROUTER_START_TIMEOUT:-120}" + export ROUTER_STOP_TIMEOUT="${ROUTER_STOP_TIMEOUT:-30}" + + cat > "${RESULTS_DIR}/routerctl.sh" <<'EOF' +#!/usr/bin/env bash +set -Eeuo pipefail + +log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; } + +: "${ROUTER_CMD:?ROUTER_CMD is not set}" +: "${ROUTER_URL:?ROUTER_URL is not set}" +: "${ROUTER_PID_FILE:?ROUTER_PID_FILE is not set}" +: "${ROUTER_LOG_FILE:?ROUTER_LOG_FILE is not set}" +: "${ROUTER_START_TIMEOUT:=120}" +: "${ROUTER_STOP_TIMEOUT:=30}" + +router_pid() { + if [[ -s "${ROUTER_PID_FILE}" ]]; then + cat "${ROUTER_PID_FILE}" + fi +} + +router_is_running() { + local pid + pid="$(router_pid)" + [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null +} + +wait_for_router() { + local deadline=$((SECONDS + ROUTER_START_TIMEOUT)) + until curl -sS -o /dev/null --connect-timeout 1 --max-time 2 "${ROUTER_URL}"; do + if ! 
router_is_running; then + log "ERROR: Router process exited before ${ROUTER_URL} became reachable" + return 1 + fi + if (( SECONDS >= deadline )); then + log "ERROR: Router did not become reachable at ${ROUTER_URL} within ${ROUTER_START_TIMEOUT}s" + return 1 + fi + sleep 1 + done + log "Router is reachable at ${ROUTER_URL}" +} + +start_router() { + local cmd="${ROUTER_CMD}" + if [[ "${1:-}" == "--reset-states" && "${cmd}" != *"--router-reset-states"* ]]; then + cmd="${cmd} --router-reset-states" + fi + + if router_is_running; then + log "Router is already running with PID $(router_pid)" + return 0 + fi + + mkdir -p "$(dirname "${ROUTER_LOG_FILE}")" + log "Starting router with cmd: ${cmd}" + nohup bash -lc "${cmd}" >> "${ROUTER_LOG_FILE}" 2>&1 & + local pid=$! + echo "${pid}" > "${ROUTER_PID_FILE}" + log "Router PID: ${pid}" + wait_for_router +} + +stop_router() { + if ! router_is_running; then + rm -f "${ROUTER_PID_FILE}" + log "Router is not running" + return 0 + fi + + local pid + pid="$(router_pid)" + log "Stopping router PID ${pid}" + kill -TERM "${pid}" 2>/dev/null || true + + local deadline=$((SECONDS + ROUTER_STOP_TIMEOUT)) + while kill -0 "${pid}" 2>/dev/null; do + if (( SECONDS >= deadline )); then + log "ERROR: Router PID ${pid} did not stop within ${ROUTER_STOP_TIMEOUT}s" + return 1 + fi + sleep 1 + done + + rm -f "${ROUTER_PID_FILE}" + log "Router stopped" +} + +case "${1:-}" in + start) + shift + start_router "$@" + ;; + stop) + stop_router + ;; + restart) + shift + stop_router + start_router "$@" + ;; + *) + echo "Usage: $0 {start|stop|restart} [--reset-states]" >&2 + exit 2 + ;; +esac +EOF + chmod +x "${RESULTS_DIR}/routerctl.sh" +} + +function start_router() +{ + "${RESULTS_DIR}/routerctl.sh" start } launch_sgl_http_server() { @@ -1184,7 +1309,7 @@ function main() launch_etcd & launch_nats & wait_for_etcd - launch_ingress & + launch_ingress if _is_sglang_dsr1; then launch_sgl_http_server fi diff --git 
a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index cc3b51273..439c0eda8 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -215,6 +215,26 @@ def _render_aiperf_setup_blocks(self, log_message: str, setup_cmd: str | None) - ).rstrip() ] + def _render_between_aiperf_phases_block( + self, + phase_name: str, + cmd: str | None, + ) -> list[str]: + if not cmd: + return [] + + cleanup_argv = ["bash", "-lc", cmd] + return ( + textwrap.dedent( + f"""\ + log {shlex.quote(f"Running AIPerf between-phase command after {phase_name}: {shlex.join(cleanup_argv)}")} + {shlex.join(cleanup_argv)} + """ + ) + .rstrip() + .splitlines() + ) + def _render_aiperf_script(self) -> str: phases = self.td.cmd_args.aiperf_phases or [AIPerfPhase.model_validate({"name": "aiperf"})] single_phase = len(phases) == 1 @@ -298,6 +318,15 @@ def _render_aiperf_script(self) -> str: phase_lines.append(f" cp {shlex.quote(report_file)} {shlex.quote(final_report_file)}") phase_lines.append(f" log {shlex.quote(f'Final AIPerf report saved to {final_report_file}')}") + if not single_phase and idx < len(phases) - 1: + phase_lines.extend( + " " + line + for line in self._render_between_aiperf_phases_block( + phase_name=phase.name, + cmd=resolved_phase.between_phase_cmd, + ) + ) + if not single_phase and idx < len(phases) - 1 and resolved_phase.health_check_between_phases: health_probe_cmd = ( ' if ! 
curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" ' diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh index 60798ef8b..495959d70 100644 --- a/tests/ref_data/ai-dynamo-aiperf.sh +++ b/tests/ref_data/ai-dynamo-aiperf.sh @@ -24,6 +24,8 @@ if [[ "$phase_status" -eq 0 ]]; then mkdir -p /cloudai_run_results cp /cloudai_run_results/aiperf_artifacts/round_1/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_1_report.csv log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv' + log 'Running AIPerf between-phase command after round_1: bash -lc '"'"'/cloudai_run_results/routerctl.sh restart --reset-states'"'"'' + bash -lc '/cloudai_run_results/routerctl.sh restart --reset-states' if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then log 'FATAL: failure marker found between AIPerf phases' exit 1 diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index 46a10906b..d5c832156 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -226,6 +226,7 @@ def test_gen_script_args_writes_resolved_aiperf_script(strategy: AIDynamoSlurmCo td.cmd_args.aiperf = AIPerf.model_validate( { "setup-cmd": "python -m pip install --upgrade aiperf", + "between-phase-cmd": "curl -fsS -X POST ${FRONTEND_URL}/reset_prefix_cache || true", "args": { "concurrency": 2, "request-count": 50, @@ -254,6 +255,9 @@ def test_gen_script_args_writes_resolved_aiperf_script(strategy: AIDynamoSlurmCo assert "Running AIPerf phase setup for round_1" not in script assert "Running AIPerf phase setup for round_2" in script assert "bash -lc 'python -m pip install --upgrade another-aiperf-plugin'" in script + assert script.count("Running AIPerf between-phase command after") == 1 + assert "Running AIPerf between-phase command after round_1" in script + assert "bash -lc 'curl -fsS -X POST 
${FRONTEND_URL}/reset_prefix_cache || true'" in script assert ': "${FRONTEND_URL:?FRONTEND_URL is not set}"' in script assert '--url "$FRONTEND_URL"' in script assert f"--artifact-dir {strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts/round_1" in script From 9045b5da4e683ec0b3a3fe9015dc8f35cf5428be Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 23:40:09 +0200 Subject: [PATCH 2/7] better waiting for router to start --- .../ai_dynamo/slurm_command_gen_strategy.py | 19 ++++++++++++++----- tests/ref_data/ai-dynamo-aiperf.sh | 17 +++++++++++++---- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 439c0eda8..8f0f98e9e 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -250,6 +250,7 @@ def _render_aiperf_script(self) -> str: : "${{AIPERF_MODEL:={self.td.cmd_args.dynamo.model}}}" : "${{AIPERF_ENDPOINT:={self.td.cmd_args.dynamo.endpoint}}}" : "${{AIPERF_FAILURE_MARKER:={self.CONTAINER_MOUNT_OUTPUT}/{self.td.failure_marker}}}" + : "${{AIPERF_HEALTH_TIMEOUT:=120}}" """ ).rstrip() ] @@ -329,22 +330,30 @@ def _render_aiperf_script(self) -> str: if not single_phase and idx < len(phases) - 1 and resolved_phase.health_check_between_phases: health_probe_cmd = ( - ' if ! 
curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" ' + ' until curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" ' "-H 'Content-Type: application/json' " '-d "{\\"model\\":\\"${AIPERF_MODEL}\\",\\"messages\\":[{\\"role\\":\\"user\\",' '\\"content\\":\\"ping\\"}],\\"stream\\":false,\\"max_tokens\\":1}" ' - ">/dev/null; then" + ">/dev/null; do" ) phase_lines.extend( [ + " health_deadline=$((SECONDS + AIPERF_HEALTH_TIMEOUT))", ' if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then', " log 'FATAL: failure marker found between AIPerf phases'", " exit 1", " fi", health_probe_cmd, - " log 'FATAL: frontend health probe failed between AIPerf phases'", - " exit 1", - " fi", + ' if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then', + " log 'FATAL: failure marker found while waiting for frontend between AIPerf phases'", + " exit 1", + " fi", + " if (( SECONDS >= health_deadline )); then", + " log 'FATAL: frontend health probe failed between AIPerf phases'", + " exit 1", + " fi", + " sleep 1", + " done", ] ) phase_lines.append("fi") diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh index 495959d70..ac268122c 100644 --- a/tests/ref_data/ai-dynamo-aiperf.sh +++ b/tests/ref_data/ai-dynamo-aiperf.sh @@ -7,6 +7,7 @@ log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; } : "${AIPERF_MODEL:=model}" : "${AIPERF_ENDPOINT:=v1/chat/completions}" : "${AIPERF_FAILURE_MARKER:=/cloudai_run_results/failure-marker.txt}" +: "${AIPERF_HEALTH_TIMEOUT:=120}" rm -rf /cloudai_run_results/aiperf_artifacts/round_1 mkdir -p /cloudai_run_results/aiperf_artifacts/round_1 @@ -26,14 +27,22 @@ if [[ "$phase_status" -eq 0 ]]; then log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv' log 'Running AIPerf between-phase command after round_1: bash -lc '"'"'/cloudai_run_results/routerctl.sh restart --reset-states'"'"'' bash -lc '/cloudai_run_results/routerctl.sh restart --reset-states' + health_deadline=$((SECONDS + AIPERF_HEALTH_TIMEOUT)) if [[ -f 
"$AIPERF_FAILURE_MARKER" ]]; then log 'FATAL: failure marker found between AIPerf phases' exit 1 fi - if ! curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" -H 'Content-Type: application/json' -d "{\"model\":\"${AIPERF_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" >/dev/null; then - log 'FATAL: frontend health probe failed between AIPerf phases' - exit 1 - fi + until curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" -H 'Content-Type: application/json' -d "{\"model\":\"${AIPERF_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" >/dev/null; do + if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then + log 'FATAL: failure marker found while waiting for frontend between AIPerf phases' + exit 1 + fi + if (( SECONDS >= health_deadline )); then + log 'FATAL: frontend health probe failed between AIPerf phases' + exit 1 + fi + sleep 1 + done fi rm -rf /cloudai_run_results/aiperf_artifacts/round_2 From 1ab58d7c0880a70d5bd0237fe6fedc05f47a02bf Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 1 Jun 2026 23:43:29 +0200 Subject: [PATCH 3/7] deduplicate router readiness check --- src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 15 +++++++++++---- .../ai_dynamo/slurm_command_gen_strategy.py | 19 +++++-------------- tests/ref_data/ai-dynamo-aiperf.sh | 17 ++++------------- 3 files changed, 20 insertions(+), 31 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index f9e84ef9a..8b1f3475a 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -744,6 +744,8 @@ function write_routerctl() { export ROUTER_CMD="${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]}" export ROUTER_URL="${dynamo_args["url"]}" + export ROUTER_HEALTH_ENDPOINT="${dynamo_args["endpoint"]}" + export ROUTER_HEALTH_MODEL="${dynamo_args["model"]}" export 
ROUTER_PID_FILE="${RESULTS_DIR}/router.pid" export ROUTER_LOG_FILE="${RESULTS_DIR}/dynamo_ingress.log" export ROUTER_START_TIMEOUT="${ROUTER_START_TIMEOUT:-120}" @@ -757,6 +759,8 @@ log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; } : "${ROUTER_CMD:?ROUTER_CMD is not set}" : "${ROUTER_URL:?ROUTER_URL is not set}" +: "${ROUTER_HEALTH_ENDPOINT:?ROUTER_HEALTH_ENDPOINT is not set}" +: "${ROUTER_HEALTH_MODEL:?ROUTER_HEALTH_MODEL is not set}" : "${ROUTER_PID_FILE:?ROUTER_PID_FILE is not set}" : "${ROUTER_LOG_FILE:?ROUTER_LOG_FILE is not set}" : "${ROUTER_START_TIMEOUT:=120}" @@ -776,18 +780,21 @@ router_is_running() { wait_for_router() { local deadline=$((SECONDS + ROUTER_START_TIMEOUT)) - until curl -sS -o /dev/null --connect-timeout 1 --max-time 2 "${ROUTER_URL}"; do + until curl -fsS -X POST "${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT}" \ + -H 'Content-Type: application/json' \ + -d "{\"model\":\"${ROUTER_HEALTH_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" \ + >/dev/null; do if ! 
router_is_running; then - log "ERROR: Router process exited before ${ROUTER_URL} became reachable" + log "ERROR: Router process exited before ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT} became ready" return 1 fi if (( SECONDS >= deadline )); then - log "ERROR: Router did not become reachable at ${ROUTER_URL} within ${ROUTER_START_TIMEOUT}s" + log "ERROR: Router did not become ready at ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT} within ${ROUTER_START_TIMEOUT}s" return 1 fi sleep 1 done - log "Router is reachable at ${ROUTER_URL}" + log "Router is ready at ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT}" } start_router() { diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 8f0f98e9e..439c0eda8 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -250,7 +250,6 @@ def _render_aiperf_script(self) -> str: : "${{AIPERF_MODEL:={self.td.cmd_args.dynamo.model}}}" : "${{AIPERF_ENDPOINT:={self.td.cmd_args.dynamo.endpoint}}}" : "${{AIPERF_FAILURE_MARKER:={self.CONTAINER_MOUNT_OUTPUT}/{self.td.failure_marker}}}" - : "${{AIPERF_HEALTH_TIMEOUT:=120}}" """ ).rstrip() ] @@ -330,30 +329,22 @@ def _render_aiperf_script(self) -> str: if not single_phase and idx < len(phases) - 1 and resolved_phase.health_check_between_phases: health_probe_cmd = ( - ' until curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" ' + ' if ! 
curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" ' "-H 'Content-Type: application/json' " '-d "{\\"model\\":\\"${AIPERF_MODEL}\\",\\"messages\\":[{\\"role\\":\\"user\\",' '\\"content\\":\\"ping\\"}],\\"stream\\":false,\\"max_tokens\\":1}" ' - ">/dev/null; do" + ">/dev/null; then" ) phase_lines.extend( [ - " health_deadline=$((SECONDS + AIPERF_HEALTH_TIMEOUT))", ' if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then', " log 'FATAL: failure marker found between AIPerf phases'", " exit 1", " fi", health_probe_cmd, - ' if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then', - " log 'FATAL: failure marker found while waiting for frontend between AIPerf phases'", - " exit 1", - " fi", - " if (( SECONDS >= health_deadline )); then", - " log 'FATAL: frontend health probe failed between AIPerf phases'", - " exit 1", - " fi", - " sleep 1", - " done", + " log 'FATAL: frontend health probe failed between AIPerf phases'", + " exit 1", + " fi", ] ) phase_lines.append("fi") diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh index ac268122c..495959d70 100644 --- a/tests/ref_data/ai-dynamo-aiperf.sh +++ b/tests/ref_data/ai-dynamo-aiperf.sh @@ -7,7 +7,6 @@ log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; } : "${AIPERF_MODEL:=model}" : "${AIPERF_ENDPOINT:=v1/chat/completions}" : "${AIPERF_FAILURE_MARKER:=/cloudai_run_results/failure-marker.txt}" -: "${AIPERF_HEALTH_TIMEOUT:=120}" rm -rf /cloudai_run_results/aiperf_artifacts/round_1 mkdir -p /cloudai_run_results/aiperf_artifacts/round_1 @@ -27,22 +26,14 @@ if [[ "$phase_status" -eq 0 ]]; then log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv' log 'Running AIPerf between-phase command after round_1: bash -lc '"'"'/cloudai_run_results/routerctl.sh restart --reset-states'"'"'' bash -lc '/cloudai_run_results/routerctl.sh restart --reset-states' - health_deadline=$((SECONDS + AIPERF_HEALTH_TIMEOUT)) if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then log 'FATAL: failure marker found between AIPerf 
phases' exit 1 fi - until curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" -H 'Content-Type: application/json' -d "{\"model\":\"${AIPERF_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" >/dev/null; do - if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then - log 'FATAL: failure marker found while waiting for frontend between AIPerf phases' - exit 1 - fi - if (( SECONDS >= health_deadline )); then - log 'FATAL: frontend health probe failed between AIPerf phases' - exit 1 - fi - sleep 1 - done + if ! curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" -H 'Content-Type: application/json' -d "{\"model\":\"${AIPERF_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" >/dev/null; then + log 'FATAL: frontend health probe failed between AIPerf phases' + exit 1 + fi fi rm -rf /cloudai_run_results/aiperf_artifacts/round_2 From 28a73551e690f6c70867fcf7f4c3c3e85477f605 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 2 Jun 2026 00:06:14 +0200 Subject: [PATCH 4/7] trying to improve accuracy --- conf/experimental/ai_dynamo/test/sglang.toml | 2 +- conf/experimental/ai_dynamo/test/vllm.toml | 2 +- conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index 34bc9cbff..bc4b6068d 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -117,7 +117,7 @@ workloads = "aiperf.sh" --accuracy-n-shots 5 --accuracy-tasks abstract_algebra --concurrency 10 ---extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' +--extra-inputs '{"temperature":0,"stop":["\n"],"chat_template_kwargs":{"enable_thinking":false}}' --num-requests 100 ''' diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index 
ea2a4552c..b09a2d1cf 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -115,7 +115,7 @@ workloads = "aiperf.sh" --accuracy-n-shots 5 --accuracy-tasks abstract_algebra --concurrency 10 ---extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' +--extra-inputs '{"temperature":0,"stop":["\n"],"chat_template_kwargs":{"enable_thinking":false}}' --num-requests 100 ''' diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml index c63319b4e..09672c786 100644 --- a/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml @@ -121,7 +121,7 @@ dse_excluded_args = [ --accuracy-n-shots 5 --accuracy-tasks abstract_algebra --concurrency 10 ---extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' +--extra-inputs '{"temperature":0,"stop":["\n"],"chat_template_kwargs":{"enable_thinking":false}}' --num-requests 100 ''' From a10d29e3655eec6c312ef8d4f5945cc6c6fdbb43 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 2 Jun 2026 17:01:26 +0200 Subject: [PATCH 5/7] update docs --- doc/workloads/ai_dynamo.rst | 49 ++++++++++++++++++++ src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 3 +- tests/ref_data/ai-dynamo-aiperf.sh | 4 +- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 7dbba92b6..4d62fc762 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -84,6 +84,14 @@ The job progress monitoring can be done using either of the following options: The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch the configured benchmark tool (``aiperf`` by default), which begins generating requests to the frontend server. 
All servers cooperate to complete inference, and the output will appear in ``stdout.txt``. +Recent AIDynamo Slurm features: + +- Multi-phase AIPerf runs with base config plus per-phase overrides. +- Optional between-phase bash hook for backend-specific cleanup; the default hook is a no-op. +- ``server-metrics = "auto"`` support, including CloudAI-started DCGM exporters. +- LMCache config propagation from structured TOML to worker-visible YAML, with optional LMCache controller launch. +- ``dse_excluded_args`` for list-valued config that must not become a DSE sweep dimension. + Choosing a Benchmark Tool ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -110,6 +118,47 @@ To use genai-perf, set: output-tokens-mean = 500 request-count = 50 +AIPerf Multi-Phase Runs +~~~~~~~~~~~~~~~~~~~~~~~ + +``cmd_args.aiperf`` is the base AIPerf config. ``cmd_args.aiperf_phases`` can run several AIPerf rounds against the +same live Dynamo stack without restarting prefill, decode, or router processes: + +.. code-block:: toml + + dse_excluded_args = ["cmd_args.aiperf_phases"] + + [cmd_args.aiperf] + health-check-between-phases = true + between-phase-cmd = "true" # default no-op + + [cmd_args.aiperf.args] + request-count = 50 + server-metrics = "auto" + + [[cmd_args.aiperf_phases]] + name = "round_1" + [cmd_args.aiperf_phases.args] + concurrency = 2 + + [[cmd_args.aiperf_phases]] + name = "round_2" + [cmd_args.aiperf_phases.args] + concurrency = 4 + +Single-phase runs keep the old artifact layout: ``aiperf_artifacts/``, ``aiperf.log``, and ``aiperf_report.csv``. +Multi-phase runs write per-phase artifacts/logs/reports and copy the last phase report to ``aiperf_report.csv`` for +existing report generation. + +``between-phase-cmd`` is a bash command run after each non-final phase. The default is a no-op. Set it explicitly for +backend-specific cache cleanup, for example ``/cloudai_run_results/routerctl.sh restart --reset-states`` if a test needs +to restart the Dynamo router between phases. 
``health-check-between-phases`` probes the frontend after the command. + +AIPerf args are rendered as normal CLI flags. Multi-value AIPerf options should be passed with AIPerf CLI syntax, such +as ``server-metrics-formats = "csv,json,jsonl"`` or ``gpu-telemetry = "node1:9401,node2:9401"``. ``server-metrics = +"auto"`` expands to the frontend metrics endpoint, Dynamo worker metrics endpoints, and any CloudAI-started DCGM +exporters. + Propagating LMCache Configuration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index faf70a2de..298d9a79d 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -44,7 +44,6 @@ AIPERF_ARTIFACTS_DIR = "aiperf_artifacts" AIPERF_ACCURACY_ARTIFACTS_DIR = "aiperf_accuracy_artifacts" AIPERF_ACCURACY_RESULTS_CSV = "accuracy_results.csv" -AIPERF_DEFAULT_BETWEEN_PHASE_CMD = "/cloudai_run_results/routerctl.sh restart --reset-states" LMCACHE_CONFIG_FILE_NAME = "lmcache-config.yaml" LMCACHE_CONFIG_BACKUP_FILE_NAME = "lmcache-config.original.yaml" @@ -296,7 +295,7 @@ class AIPerf(Workload): validation_alias=AliasChoices("continue-on-phase-failure", "continue_on_phase_failure"), ) between_phase_cmd: str | None = Field( - default=AIPERF_DEFAULT_BETWEEN_PHASE_CMD, + default="true", serialization_alias="between-phase-cmd", validation_alias=AliasChoices("between-phase-cmd", "between_phase_cmd"), ) diff --git a/tests/ref_data/ai-dynamo-aiperf.sh b/tests/ref_data/ai-dynamo-aiperf.sh index 495959d70..29fa95b7c 100644 --- a/tests/ref_data/ai-dynamo-aiperf.sh +++ b/tests/ref_data/ai-dynamo-aiperf.sh @@ -24,8 +24,8 @@ if [[ "$phase_status" -eq 0 ]]; then mkdir -p /cloudai_run_results cp /cloudai_run_results/aiperf_artifacts/round_1/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_1_report.csv log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv' - log 'Running AIPerf 
between-phase command after round_1: bash -lc '"'"'/cloudai_run_results/routerctl.sh restart --reset-states'"'"'' - bash -lc '/cloudai_run_results/routerctl.sh restart --reset-states' + log 'Running AIPerf between-phase command after round_1: bash -lc true' + bash -lc true if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then log 'FATAL: failure marker found between AIPerf phases' exit 1 From 1d2060861be88292e4ad0a97303fe1a3af706c3f Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 2 Jun 2026 17:12:03 +0200 Subject: [PATCH 6/7] cleaner docs --- doc/USER_GUIDE.rst | 19 +++++++++++++------ doc/workloads/ai_dynamo.rst | 8 -------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/doc/USER_GUIDE.rst b/doc/USER_GUIDE.rst index 966fadb9f..ea42e516d 100644 --- a/doc/USER_GUIDE.rst +++ b/doc/USER_GUIDE.rst @@ -210,11 +210,13 @@ DSE parameter exclusions ~~~~~~~~~~~~~~~~~~~~~~~~ CloudAI builds the DSE parameter space implicitly from list-valued fields under ``cmd_args``, list-valued -``extra_env_vars``, and list-valued ``num_nodes``. If a list-valued ``cmd_args`` field is configuration data rather than -a sweep dimension, exclude it with ``dse_excluded_args`` in the test or scenario definition. +``extra_env_vars``, and list-valued ``num_nodes``. Most lists mean "try each value", but some workload settings are +real list-valued configuration, such as worker port lists or ordered benchmark phases. -Entries in ``dse_excluded_args`` must be dot-separated paths that start with ``cmd_args.``. Each entry excludes that -field and any nested fields below it from DSE parameter discovery: +Use ``dse_excluded_args`` when a list under ``cmd_args`` should stay intact instead of becoming a sweep dimension. +Entries must be dot-separated paths that start with ``cmd_args.`` and may point to either a single field or a parent +object. 
Matching is prefix-based, so excluding ``cmd_args.foo`` also excludes nested list-valued fields such as +``cmd_args.foo.bar`` from DSE parameter discovery. .. code-block:: toml @@ -228,8 +230,13 @@ field and any nested fields below it from DSE parameter discovery: lmcache_worker_ports = [8788, 8789, 8790, 8791] In this example, ``cmd_args.lmcache.chunk_size`` is still swept, while -``cmd_args.lmcache.lmcache_worker_ports`` is treated as a single configuration value. The exclusion mechanism currently -applies only to ``cmd_args`` paths; it does not exclude ``extra_env_vars`` or ``num_nodes`` from DSE. +``cmd_args.lmcache.lmcache_worker_ports`` is passed through as one list value. The exclusion does not remove or mutate +the field; it only prevents CloudAI from adding that path to the DSE parameter space. + +``dse_excluded_args`` currently applies only to ``cmd_args`` paths. It does not exclude list-valued ``extra_env_vars`` +or ``num_nodes``; those lists are still interpreted as sweep dimensions. To exclude many nested list fields at once, +exclude their common parent path. Common examples are ``cmd_args.aiperf_phases`` and +``cmd_args.lmcache.lmcache_worker_ports``. Metric errors and report strategies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 4d62fc762..46382038d 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -84,14 +84,6 @@ The job progress monitoring can be done using either of the following options: The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch the configured benchmark tool (``aiperf`` by default), which begins generating requests to the frontend server. All servers cooperate to complete inference, and the output will appear in ``stdout.txt``. -Recent AIDynamo Slurm features: - -- Multi-phase AIPerf runs with base config plus per-phase overrides. 
-- Optional between-phase bash hook for backend-specific cleanup; the default hook is a no-op. -- ``server-metrics = "auto"`` support, including CloudAI-started DCGM exporters. -- LMCache config propagation from structured TOML to worker-visible YAML, with optional LMCache controller launch. -- ``dse_excluded_args`` for list-valued config that must not become a DSE sweep dimension. - Choosing a Benchmark Tool ~~~~~~~~~~~~~~~~~~~~~~~~~ From c385839c7f0ec855fdf02fdc6b6bfa3edf328cd2 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 2 Jun 2026 17:20:24 +0200 Subject: [PATCH 7/7] update configs, update docs --- conf/experimental/ai_dynamo/test/sglang.toml | 1 + conf/experimental/ai_dynamo/test/vllm.toml | 1 + doc/workloads/ai_dynamo.rst | 6 +++--- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index bc4b6068d..8bdefb688 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -28,6 +28,7 @@ workloads = "aiperf.sh" backend = "sglang" model = "Qwen/Qwen3-0.6B" endpoint = "v1/chat/completions" + ingress-cmd = "python -m dynamo.frontend --router-mode kv --router-reset-states" [cmd_args.dynamo.prefill_worker] num-nodes = 1 diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index b09a2d1cf..20890b65d 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -27,6 +27,7 @@ workloads = "aiperf.sh" [cmd_args.dynamo] backend = "vllm" model = "Qwen/Qwen3-0.6B" + ingress-cmd = "python -m dynamo.frontend --router-mode kv --router-reset-states" [cmd_args.dynamo.prefill_worker] num-nodes = 1 diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 46382038d..560107090 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -114,7 +114,7 @@ AIPerf Multi-Phase Runs 
~~~~~~~~~~~~~~~~~~~~~~~ ``cmd_args.aiperf`` is the base AIPerf config. ``cmd_args.aiperf_phases`` can run several AIPerf rounds against the -same live Dynamo stack without restarting prefill, decode, or router processes: +same live Dynamo stack. By default, CloudAI does not restart prefill, decode, or router processes between phases: .. code-block:: toml @@ -143,8 +143,8 @@ Multi-phase runs write per-phase artifacts/logs/reports and copy the last phase existing report generation. ``between-phase-cmd`` is a bash command run after each non-final phase. The default is a no-op. Set it explicitly for -backend-specific cache cleanup, for example ``/cloudai_run_results/routerctl.sh restart --reset-states`` if a test needs -to restart the Dynamo router between phases. ``health-check-between-phases`` probes the frontend after the command. +backend-specific cache cleanup, for example ``/cloudai_run_results/routerctl.sh restart`` if a test needs to restart the +Dynamo router between phases. ``health-check-between-phases`` probes the frontend after the command. AIPerf args are rendered as normal CLI flags. Multi-value AIPerf options should be passed with AIPerf CLI syntax, such as ``server-metrics-formats = "csv,json,jsonl"`` or ``gpu-telemetry = "node1:9401,node2:9401"``. ``server-metrics =