Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/cloudai/workloads/common/nixl.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,14 @@ def gen_nixlbench_srun_commands(self, test_cmd: list[str], backend: str) -> list
nnodes, _ = self.get_cached_nodes_spec()
if nnodes > 1:
cmds = [
[*prefix_part, "--overlap", f"--relative={idx}", *tpn_part, *bash_part] for idx in range(nnodes)
[
*prefix_part,
"--overlap",
f"--nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '{idx + 1}p')",
*tpn_part,
*bash_part,
]
for idx in range(nnodes)
]
else:
cmds *= max(2, nnodes)
Expand Down
4 changes: 2 additions & 2 deletions tests/ref_data/nixl-kvbench.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1;
echo "ETCD ($NIXL_ETCD_ENDPOINTS) was unreachable after 60 seconds";
exit 1
}
srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" &
srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '1p') --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" &
sleep 15
srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS"
srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '2p') --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS"
kill -TERM $etcd_pid
timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || {
echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds";
Expand Down
4 changes: 2 additions & 2 deletions tests/ref_data/nixl_bench.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1;
echo "ETCD ($NIXL_ETCD_ENDPOINTS) was unreachable after 60 seconds";
exit 1
}
srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints=http://$NIXL_ETCD_ENDPOINTS --backend=UCX" &
srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '1p') --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints=http://$NIXL_ETCD_ENDPOINTS --backend=UCX" &
sleep 15
srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints=http://$NIXL_ETCD_ENDPOINTS --backend=UCX"
srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '2p') --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints=http://$NIXL_ETCD_ENDPOINTS --backend=UCX"
kill -TERM $etcd_pid
timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || {
echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@ def test_gen_nixl_srun_command(
assert "-N1" in cmd
if backend == "UCX":
if nnodes > 1:
assert f"--relative={idx}" in cmd
assert f"--nodelist=$(scontrol show hostname $SLURM_JOB_NODELIST | sed -n '{idx + 1}p')" in cmd
assert "--relative" not in cmd
else:
assert "--relative" not in cmd
assert "--nodelist=$SLURM_JOB_MASTER_NODE" in cmd
Expand Down
Loading