forked from huggingface/nanoVLM
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare.sh
More file actions
executable file
·65 lines (56 loc) · 1.85 KB
/
prepare.sh
File metadata and controls
executable file
·65 lines (56 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/bin/bash
#SBATCH --job-name=train_nanoVLM_torchrun
#SBATCH --output=logs/train_nanoVLM/%A_%a.out
#SBATCH --error=logs/train_nanoVLM/%A_%a.err
#SBATCH --time=47:59:00
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=88
#SBATCH --partition=hopper-prod
#SBATCH --qos=high
#SBATCH --array=4
echo "--- Starting parallel data copy on all nodes... ---"
# This srun command launches the copy script on all 4 nodes simultaneously.
# The shell will not proceed to the next line until ALL nodes have finished.
srun --ntasks-per-node=1 bash -c '
mkdir -p /scratch/cache/asterix_rated && \
cd /fsx/luis_wiedmann/.cache/asterix_rated && \
find . -type f | parallel -j 16 rsync -R {} /scratch/cache/asterix_rated/
'
echo "--- All nodes have finished copying data. ---"
module load cuda/12.9
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_SOCKET_IFNAME=enp
export FI_EFA_ENABLE_SHM_TRANSFER=0
export NCCL_SHM_DISABLE=1
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=0
export NCCL_DEBUG=WARN
# Change to project directory
cd /fsx/luis_wiedmann/nanoVLM
source .venv/bin/activate
# Activate virtual environment
export TOKENIZERS_PARALLELISM=false
# -------------------------------------------------------------------------------
# Get the master node's address
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# From https://i.hsfzxjy.site/2021-03-10-obtain-a-random-unused-tcp-port-with-bash/
function unused_port() {
N=${1:-1}
comm -23 \
<(seq "1025" "65535" | sort) \
<(ss -Htan |
awk '{print $4}' |
cut -d':' -f2 |
sort -u) |
shuf |
head -n "$N"
}
export MASTER_PORT=$(unused_port)
# Run using torchrun on all allocated nodes
ulimit -n 99999