-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdgx.j2
More file actions
95 lines (84 loc) · 2.56 KB
/
dgx.j2
File metadata and controls
95 lines (84 loc) · 2.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/bin/bash
{%- from "./utils.j2" import cliopts, export_env, export_job_config %}
{#-
  Slurm batch header of the job script rendered from this template.

  Template variables read here:
    name, partition, walltime - job name, partition and time limit; fall back
                                to "mist", "defq" and "4:0:0" when unset.
    nodes, gpus_per_node      - allocation shape; fall back to 1 and 8.
    job_queue (optional)      - when defined, a flat pool of
                                nodes * job_queue.tasks * gpus_per_node tasks
                                (one GPU each) is requested instead of a
                                per-node layout.
    requeue (optional)        - when truthy, adds the USR1 pre-timeout signal,
                                requeue permission and append-mode log output.
    job_array (optional)      - when defined, job_array.spec goes to --array.

  Every tag in the header starts with a "-" so that no blank line is rendered
  between directives.

  NOTE(review): --mem-per-cpu 200G combined with --cpus-per-task 12 asks for
  2400G per task; confirm this is intended for the target nodes.
#}
#SBATCH --job-name {{ name or "mist" }}
#SBATCH --partition {{ partition or "defq" }}
{%- if job_queue is defined %}
{#- Apply the same defaults as the per-node branch below, so that queue mode
    does not fail to render when nodes / gpus_per_node are left unset. #}
{%- set total_tasks = (nodes or 1) * job_queue.tasks * (gpus_per_node or 8) %}
#SBATCH --ntasks {{ total_tasks }}
#SBATCH --gpus {{ total_tasks }}
{%- else %}
#SBATCH --nodes {{ nodes or 1 }}
#SBATCH --ntasks-per-node {{ gpus_per_node or 8 }}
#SBATCH --gpus-per-node {{ gpus_per_node or 8 }}
{%- endif %}
#SBATCH --time {{ walltime or "4:0:0" }}
#SBATCH --cpus-per-task 12
#SBATCH --mem-per-cpu 200G
#SBATCH --export=NONE
{%- if requeue %}
#SBATCH --signal=USR1@90
#SBATCH --requeue
#SBATCH --open-mode=append
{%- endif %}
{%- if job_array is defined %}
#SBATCH --array={{ job_array.spec }}
{%- endif %}
# Abort on the first failing command and trace every command into the job log.
set -ex
module load slurm
# Move to git root.
# The lookup is a separate assignment so that a failing `git rev-parse` aborts
# the job under `set -e`. Nested directly inside the `cd` argument its exit
# status is discarded, and `cd ""` is not guaranteed to fail.
repo_root="$(git rev-parse --show-toplevel)"
cd "$repo_root"
# Set env variables: the rendered export_env output is written verbatim
# (quoted heredoc, so no shell expansion) to a per-job file.
# NOTE(review): nothing later in this script reads ENV_FILE; presumably the
# launcher inside the container picks it up -- confirm.
ENV_FILE="${TMPDIR:-/tmp}/env-${SLURM_JOB_ID}.sh"
cat > "$ENV_FILE"<<'EOF'
{{- export_env( env ) }}
EOF
# Local scratch for this job
TMPDIR="${TMPDIR:-/tmp}/mist-${SLURM_JOB_ID}/"
mkdir -p "$TMPDIR"
{%- if job_queue is defined %}
# Queue mode: run the sweep scheduler with the configured number of workers.
submit/sweep.py scheduler --num-workers {{ job_queue.tasks }} {{ job_queue.queue }}
{%- else -%}
{% if deepspeed is defined %}
# Write out deepspeed config
cat > "$TMPDIR/deepspeed.json"<<'EOF'
{{ deepspeed | tojson(indent=4) }}
EOF
export PL_DEEPSPEED_CONFIG_PATH="$TMPDIR/deepspeed.json"
# Assign first, export second: `export VAR="$(cmd)"` hides a failing cmd
# from `set -e`.
DEEPSPEED_CONFIG="$(cat "$PL_DEEPSPEED_CONFIG_PATH")"
export DEEPSPEED_CONFIG
# Copy the file to the same path on every allocated node.
sbcast -fp "$PL_DEEPSPEED_CONFIG_PATH" "$PL_DEEPSPEED_CONFIG_PATH"
{%- endif %}
{% if job_array is defined %}
# Array job: line N of the sweep file is the config for array task N.
export PL_CONFIG="$TMPDIR/lightning_${SLURM_JOB_ID}_${SLURM_ARRAY_TASK_ID}.json"
sed "${SLURM_ARRAY_TASK_ID}q;d" {{ job_array.file }} > "$PL_CONFIG"
# sed succeeds with no output when the file has fewer lines than the task
# index; fail here instead of launching every rank with an empty config.
if [[ ! -s "$PL_CONFIG" ]]; then
  printf 'No config found on line %s of the sweep file\n' "${SLURM_ARRAY_TASK_ID}" >&2
  exit 1
fi
JOB_CONFIG="$(cat "$PL_CONFIG")"
export JOB_CONFIG
{% else %}
{{- export_job_config( __config__ ) }}
# Write out lightning config
export PL_CONFIG="$TMPDIR/lightning-${SLURM_JOB_ID}.json"
cat > "$PL_CONFIG"<<'EOF'
{{ train | tojson }}
EOF
{% endif %}
# Copy the config to the same path on every allocated node, then echo it once
# into the job log.
sbcast -fp "$PL_CONFIG" "$PL_CONFIG"
cat "$PL_CONFIG"
{%- if requeue %}
# Look for a requeue checkpoint
ckpt="./mist/requeue/${SLURM_JOB_ID}"
if [[ -d "$ckpt" ]]; then
  export PL_FIT__CKPT_PATH="$ckpt"
  # The W&B run id is the name of the directory that contains the resolved
  # checkpoint path. Each step is its own assignment so a failure is not masked.
  ckpt_parent="$(dirname "$(realpath "$ckpt")")"
  WANDB_ID="$(basename "$ckpt_parent")"
  export WANDB_ID
  export WANDB_RESUME="must"
fi
{%- endif %}
# Resolve the apptainer binary before launching so that a missing binary
# aborts here under `set -e`; `command -v` is a builtin, unlike `which`.
apptainer_bin="$(command -v apptainer)"
srun --mpi=pmix \
    "$apptainer_bin" run \
    --bind /lustre/fs0,/tmp,"$TMPDIR" \
    --nv \
    {{ container }} \
    "${PWD}/submit/set_node_rank" \
{%- if nsys is defined %}
    nsys profile \
{{- cliopts( nsys ) | indent }} \
{%- endif %}
    /mist/.venv/bin/python {{ program }} --config "$PL_CONFIG"
{%- endif %}