forked from huggingface/nanoVLM
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: eval.slurm
More file actions
99 lines (81 loc) · 2.81 KB
/
eval.slurm
File metadata and controls
99 lines (81 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/bin/bash
#SBATCH --job-name=lmms_eval
#SBATCH --output=logs/lmms_eval/%j.out
#SBATCH --error=logs/lmms_eval/%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=4 # request as many GPUs as you want to use in parallel
#SBATCH --cpus-per-task=44 # total CPUs for the whole allocation
#SBATCH --partition=hopper-prod
#SBATCH --qos=normal
#SBATCH --gres-flags=enforce-binding

# Runs one lmms-eval task per GPU in parallel (one srun step per task), then
# merges the per-task result files into a single file for the given step.
#
# Usage: sbatch eval.slurm <checkpoint_path> <global_step> <run_name> <limit> <tasks> <batch_size>
#   <limit> is either a number or the literal string "None" (no limit)
#   <tasks> is a comma-separated list, e.g. "mmstar,mmmu,ocrbench"

set -euo pipefail

# Clean distributed defaults that can confuse eval scripts
unset RANK LOCAL_RANK WORLD_SIZE MASTER_ADDR MASTER_PORT NCCL_SOCKET_IFNAME

cd /fsx/andi/nanoVLM
source .venv/bin/activate
export TOKENIZERS_PARALLELISM=false

if [ "$#" -ne 6 ]; then
    echo "Usage: sbatch eval.slurm <checkpoint_path> <global_step> <run_name> <limit> <tasks> <batch_size>" >&2
    exit 1
fi

CHECKPOINT_PATH=$1
GLOBAL_STEP=$2
RUN_NAME=$3
LIMIT=$4
EVAL_TASKS=$5 # comma-separated list, e.g. "mmstar,mmmu,ocrbench"
EVAL_BATCH_SIZE=$6

echo "Starting evaluation for checkpoint: $CHECKPOINT_PATH at step $GLOBAL_STEP"
echo "Tasks: $EVAL_TASKS"

# Discover available GPUs in this allocation; fall back to nvidia-smi when the
# SLURM variable is absent (e.g. when run outside sbatch for debugging).
NUM_GPUS=${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l | awk '{print $1}')}
if [ -z "$NUM_GPUS" ] || [ "$NUM_GPUS" -lt 1 ]; then
    echo "No GPUs detected in allocation" >&2
    exit 1
fi
echo "GPUs available: $NUM_GPUS"

# Compute CPU share per parallel worker (at least 1)
TOTAL_CPUS=${SLURM_CPUS_PER_TASK:-$(nproc)}
CPUS_PER_WORKER=$(( TOTAL_CPUS / NUM_GPUS ))
if [ "$CPUS_PER_WORKER" -lt 1 ]; then CPUS_PER_WORKER=1; fi
echo "CPUs per worker: $CPUS_PER_WORKER (total: $TOTAL_CPUS)"

IFS=',' read -r -a TASK_ARR <<< "$EVAL_TASKS"

# Build the base common args once
BASE_ARGS=( run_evaluation.py
    --checkpoint_path "$CHECKPOINT_PATH"
    --global_step "$GLOBAL_STEP"
    --run_name "$RUN_NAME"
    --batch_size "$EVAL_BATCH_SIZE"
)
if [ "$LIMIT" != "None" ]; then
    BASE_ARGS+=( --limit "$LIMIT" )
fi

# Simple concurrency gate equal to the number of GPUs.
#
# FIX: previously tasks reaped by `wait -n` were waited on AGAIN at the end
# via `wait "${pids[@]}"`. Bash forgets a child once it has been reaped, so
# that second wait failed with "not a child of this shell" (status 127) and,
# under `set -e`, killed the script before merging whenever more tasks than
# GPUs were launched. It also only propagated the status of the LAST pid,
# silently ignoring earlier failures. We now reap every job exactly once with
# `wait -n`, count failures, and skip the merge if any task failed.
inflight=0
failures=0
for task in "${TASK_ARR[@]}"; do
    # Trim surrounding whitespace with parameter expansion (was: echo | xargs,
    # which spawns two processes per task and mangles backslashes/quotes)
    task_trimmed=${task#"${task%%[![:space:]]*}"}
    task_trimmed=${task_trimmed%"${task_trimmed##*[![:space:]]}"}
    if [ -z "$task_trimmed" ]; then
        continue
    fi
    echo "Launching task: $task_trimmed"
    # One srun per task, each grabs 1 GPU exclusively and CPUS_PER_WORKER CPUs
    srun --gres=gpu:1 --cpu-bind=cores --gpu-bind=closest -c "$CPUS_PER_WORKER" \
        python "${BASE_ARGS[@]}" --tasks "$task_trimmed" &
    inflight=$(( inflight + 1 ))
    # If we already launched as many tasks as GPUs, wait for one to finish
    if [ "$inflight" -ge "$NUM_GPUS" ]; then
        wait -n || failures=$(( failures + 1 ))
        inflight=$(( inflight - 1 ))
    fi
done

# Reap the remaining in-flight tasks one at a time, collecting failures
while [ "$inflight" -gt 0 ]; do
    wait -n || failures=$(( failures + 1 ))
    inflight=$(( inflight - 1 ))
done

if [ "$failures" -gt 0 ]; then
    echo "$failures evaluation task(s) failed; skipping merge." >&2
    exit 1
fi

# Merge per-task results into a single file for the step
echo "Merging results..."
python merge_eval_results.py --run_name "$RUN_NAME" --global_step "$GLOBAL_STEP"
echo "All evaluations finished and merged."