From 3c06305636ce57502542abf0d5af5867247f4e5f Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 11 Jun 2026 12:55:00 +0200 Subject: [PATCH] Fix NIXL UCX worker node placement --- src/cloudai/workloads/common/nixl.py | 9 ++++++++- tests/ref_data/nixl-kvbench.sbatch | 4 ++-- tests/ref_data/nixl_bench.sbatch | 4 ++-- .../nixl_bench/test_command_gen_strategy_slurm.py | 3 ++- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index 430a63951..3e35e46e9 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -335,7 +335,14 @@ def gen_nixlbench_srun_commands(self, test_cmd: list[str], backend: str) -> list nnodes, _ = self.get_cached_nodes_spec() if nnodes > 1: cmds = [ - [*prefix_part, "--overlap", f"--relative={idx}", *tpn_part, *bash_part] for idx in range(nnodes) + [ + *prefix_part, + "--overlap", + f"--nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '{idx + 1}p')", + *tpn_part, + *bash_part, + ] + for idx in range(nnodes) ] else: cmds *= max(2, nnodes) diff --git a/tests/ref_data/nixl-kvbench.sbatch b/tests/ref_data/nixl-kvbench.sbatch index 817b0eacb..b5ae979ce 100644 --- a/tests/ref_data/nixl-kvbench.sbatch +++ b/tests/ref_data/nixl-kvbench.sbatch @@ -22,9 +22,9 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; echo "ETCD ($NIXL_ETCD_ENDPOINTS) was unreachable after 60 seconds"; exit 1 } -srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" & +srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '1p') --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" & sleep 15 -srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" +srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '2p') --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" kill -TERM $etcd_pid timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; diff --git a/tests/ref_data/nixl_bench.sbatch b/tests/ref_data/nixl_bench.sbatch index b3191cc0d..cd59cb3f0 100644 --- a/tests/ref_data/nixl_bench.sbatch +++ b/tests/ref_data/nixl_bench.sbatch @@ -22,9 +22,9 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; echo "ETCD ($NIXL_ETCD_ENDPOINTS) was unreachable after 60 seconds"; exit 1 } -srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints=http://$NIXL_ETCD_ENDPOINTS --backend=UCX" & +srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '1p') --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints=http://$NIXL_ETCD_ENDPOINTS --backend=UCX" & sleep 15 -srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints=http://$NIXL_ETCD_ENDPOINTS --backend=UCX" +srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '2p') --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints=http://$NIXL_ETCD_ENDPOINTS --backend=UCX" kill -TERM $etcd_pid timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py index 814b20e7b..90c03d2eb 100644 --- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py +++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py @@ -241,7 +241,8 @@ def test_gen_nixl_srun_command( assert "-N1" in cmd if backend == "UCX": if nnodes > 1: - assert f"--relative={idx}" in cmd + assert f"--nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '{idx + 1}p')" in cmd + assert "--relative" not in cmd else: assert "--relative" not in cmd assert "--nodelist=$SLURM_JOB_MASTER_NODE" in cmd