forked from Y-debug-sys/Diffusion-TS
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_all.sub
More file actions
115 lines (98 loc) · 3.28 KB
/
train_all.sub
File metadata and controls
115 lines (98 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/bin/bash
#SBATCH -A aqu2_lab_gpu
#SBATCH -J diffts_all
#SBATCH -p gpu
#SBATCH --nodes=1
#SBATCH --mem=64GB
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --gres=gpu:1
#SBATCH --time=72:00:00
#SBATCH --error=logs/diffts-%J.err
#SBATCH --output=logs/diffts-%J.out
#SBATCH --mail-type=fail,end
#SBATCH --mail-user=shilligo@uci.edu
# ================================================================
# SLURM submission script for Diffusion-TS training (all datasets)
#
# Trains unconditional generation on: Stocks, ETTh, Energy, fMRI,
# Sines, MuJoCo (sequentially on a single GPU).
#
# Usage:
# sbatch train_all.sub # all 6 datasets
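# sbatch --export=ALL,DATASETS="stocks;etth" train_all.sub # subset
# (keep ALL: without it Slurm exports only SLURM_* and the named variables,
# so the submission environment is not passed to the job)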
#
# Environment variables:
# DATASETS - SEMICOLON-separated dataset names to train
# (default: stocks;etth;energy;fmri;sines;mujoco)
# ================================================================
# Load modules and activate the conda environment.
module purge
module load anaconda/2024.06
# conda.sh provides the `conda` shell function that `conda activate` relies on.
# If `conda info --base` fails the path is wrong and the source fails too, so
# one check covers both.
source "$(conda info --base)/etc/profile.d/conda.sh" \
  || { echo "ERROR: Could not source conda.sh"; exit 1; }
# Abort when the environment cannot be activated: otherwise every training run
# below would start under whatever python happens to be on PATH.
conda activate diffts \
  || { echo "ERROR: Could not activate conda environment 'diffts'"; exit 1; }
# Change to project directory
cd ~/Diffusion-TS || { echo "ERROR: Could not cd to ~/Diffusion-TS"; exit 1; }
# Create logs directory (inside the project directory, since we cd'd above).
# NOTE(review): the #SBATCH --output/--error paths are presumably resolved by
# Slurm relative to the submission directory before this line runs, so that
# logs/ directory has to exist before `sbatch` is called - confirm.
mkdir -p logs
# Print a banner describing this job: scheduler-provided job id, CPU and
# memory values, the host, GPU name, start time, working directory and the
# requested datasets. Emitted with a single printf, one argument per line.
# If nvidia-smi fails (its stderr is discarded) the GPU line reads 'unknown'.
printf '%s\n' \
  "==========================================" \
  "Diffusion-TS Training - All Datasets" \
  "==========================================" \
  "Job ID: $SLURM_JOB_ID" \
  "Node: $(hostname)" \
  "CPUs: $SLURM_CPUS_PER_TASK" \
  "Memory: $SLURM_MEM_PER_NODE" \
  "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo 'unknown')" \
  "Time: $(date)" \
  "Working Directory: $(pwd)" \
  "DATASETS: ${DATASETS:-stocks;etth;energy;fmri;sines;mujoco (default)}" \
  "=========================================="
# Lookup table: dataset name -> config file (relative path).
declare -A CONFIGS=(
  [stocks]="Config/stocks.yaml"
  [etth]="Config/etth.yaml"
  [energy]="Config/energy.yaml"
  [fmri]="Config/fmri.yaml"
  [sines]="Config/sines.yaml"
  [mujoco]="Config/mujoco.yaml"
)
# Datasets to train, in order. A non-empty DATASETS supplied by the caller
# wins; otherwise fall back to all six.
: "${DATASETS:=stocks;etth;energy;fmri;sines;mujoco}"
# Aggregate status for the whole job; starts out as success.
OVERALL_EXIT=0
#######################################
# Train every requested dataset in turn on GPU 0.
# A dataset that fails, or whose name has no entry in CONFIGS, does not stop
# the remaining ones, but it is recorded in OVERALL_EXIT so the job does not
# report success when part of the request was not trained.
# Globals:
#   CONFIGS      (read)    - dataset name -> config file
#   OVERALL_EXIT (written) - set to 1 on any failure or unknown name
# Arguments:
#   $1 - semicolon-separated dataset names
# Outputs:
#   progress banners and warnings on stdout
# Returns:
#   0 always; the aggregate result is carried by OVERALL_EXIT
#######################################
train_datasets() {
  local requested="$1"
  local dataset config exit_code
  local -a dataset_list
  # IFS is changed for this read only, so the split happens on ';' alone.
  IFS=';' read -ra dataset_list <<< "$requested"
  for dataset in "${dataset_list[@]}"; do
    # Strip surrounding whitespace so "stocks; etth" still matches.
    dataset="${dataset#"${dataset%%[![:space:]]*}"}"
    dataset="${dataset%"${dataset##*[![:space:]]}"}"
    # Ignore empty entries (e.g. from a trailing ';') instead of using an
    # empty string as an associative-array key.
    [[ -n "$dataset" ]] || continue
    config="${CONFIGS[$dataset]}"
    if [[ -z "$config" ]]; then
      echo "WARNING: Unknown dataset '$dataset', skipping."
      # A name that cannot be trained counts as a failure of the job.
      OVERALL_EXIT=1
      continue
    fi
    echo ""
    echo "=========================================="
    echo "Training: $dataset"
    echo "Config: $config"
    echo "Started: $(date)"
    echo "=========================================="
    python main.py \
      --name "${dataset}" \
      --config_file "${config}" \
      --gpu 0 \
      --train
    exit_code=$?
    if (( exit_code != 0 )); then
      echo "WARNING: $dataset training exited with code $exit_code"
      OVERALL_EXIT=1
    else
      echo "$dataset training completed successfully."
    fi
    echo "Finished: $(date)"
    echo "=========================================="
  done
  return 0
}

# Loop through datasets
train_datasets "$DATASETS"
# Final summary: print when the run ended and the aggregate status, then exit
# with that status (0 only if nothing above set OVERALL_EXIT to 1).
printf '%s\n' \
  "" \
  "==========================================" \
  "All training completed at: $(date)" \
  "Exit code: $OVERALL_EXIT" \
  "=========================================="
exit $OVERALL_EXIT