forked from KellerJordan/modded-nanogpt
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun_experiments.sh
More file actions
49 lines (43 loc) · 2.19 KB
/
run_experiments.sh
File metadata and controls
49 lines (43 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env bash
# chmod +x run_experiments.sh
# ./run_experiments.sh
set -euo pipefail
# ─── Config ────────────────────────────────────────────────────────────────────
EXPERIMENT_DIR="./experiments"
IMAGE="speedrun_plm"
HOST_MOUNT="${PWD}:/workspace"
CONTAINER_WORKDIR="/workspace"
SHM_SIZE="128g"
# ─── Sanity checks ─────────────────────────────────────────────────────────────
if [ ! -d "$EXPERIMENT_DIR" ]; then
echo "❌ Directory '$EXPERIMENT_DIR' not found!" >&2
exit 1
fi
# ─── Prompt for token ──────────────────────────────────────────────────────────
read -rp "🔑 Enter your HuggingFace token: " HF_TOKEN
read -rp "🔑 Enter your wandb token: " WANDB_TOKEN
# ─── Detect GPUs ──────────────────────────────────────────────────────────────
if command -v nvidia-smi &> /dev/null; then
NUM_GPUS=$(nvidia-smi -L | wc -l | tr -d '[:space:]')
else
echo "⚠️ 'nvidia-smi' not found—defaulting to 1 GPU"
NUM_GPUS=1
fi
echo "🖥️ Using $NUM_GPUS GPU(s)"
# ─── Loop and launch ───────────────────────────────────────────────────────────
for yaml_file in "$EXPERIMENT_DIR"/*.yaml; do
# if no matches, break
[ -e "$yaml_file" ] || { echo "ℹ️ No .yaml files in $EXPERIMENT_DIR"; break; }
echo
echo "🚀 Running experiment: $yaml_file"
sudo docker run --gpus all \
--shm-size="$SHM_SIZE" \
-v "$HOST_MOUNT" \
-w "$CONTAINER_WORKDIR" \
"$IMAGE" \
torchrun --standalone --nproc_per_node="$NUM_GPUS" \
train.py \
--hf_token "$HF_TOKEN" \
--wandb_token "$WANDB_TOKEN" \
--yaml_path "$yaml_file"
done