-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathwatch_training.sh
More file actions
executable file
·179 lines (164 loc) · 6.9 KB
/
watch_training.sh
File metadata and controls
executable file
·179 lines (164 loc) · 6.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/bin/bash
# QuantLLM training progress monitor.
#   Usage:      bash /opt/quant-llm/watch_training.sh
#   Guard mode: bash /opt/quant-llm/watch_training.sh --guard   (auto-restarts a dead run)

# Point LOG_FILE / OUTPUT_DIR at whichever training run is currently active.
if [ -f "/opt/quant-llm/output/train_v3.log" ] && pgrep -f "train.py" >/dev/null 2>&1; then
  LOG_FILE="/opt/quant-llm/output/train_v3.log"
  OUTPUT_DIR="/opt/quant-llm/output/quant-qwen2.5-14b-v3"
elif [ -f "/opt/quant-llm/output/train_r32_clean.log" ]; then
  LOG_FILE="/opt/quant-llm/output/train_r32_clean.log"
  OUTPUT_DIR="/opt/quant-llm/output/quant-qwen2.5-14b-lora-r32"
else
  LOG_FILE="/opt/quant-llm/output/training_log.txt"
  OUTPUT_DIR="/opt/quant-llm/output/quant-qwen2.5-14b-lora"
fi

PID_FILE="/opt/quant-llm/output/train.pid"

# Resolve the training PID: trust the recorded one if that process is still
# alive, otherwise fall back to scanning the process table.
train_pid=""
if [ -f "$PID_FILE" ]; then
  recorded_pid=$(cat "$PID_FILE")
  kill -0 "$recorded_pid" 2>/dev/null && train_pid="$recorded_pid"
fi
if [ -z "$train_pid" ]; then
  train_pid=$(pgrep -f "python.*/opt/quant-llm/scripts/train.py" | head -1)
fi
# Pull the live training progress out of the log's tqdm progress bars.
# Bar format: " 7%|▋  | 1307/19692 [2:06:59<27:20:42, 5.35s/it]"
# On success echoes: "current total elapsed remaining speed pct"; returns 1
# when the log is missing or contains no progress bars.
parse_log_progress() {
  if [ ! -f "$LOG_FILE" ]; then
    return 1
  fi
  # PCRE for one tqdm bar; used both for the filtered pass and the fallback.
  local bar_re='\d+%\|[^|]*\|\s*\d+/\d+\s*\[[\d:]+<[\d:]+,\s*[\d.]+s/it\]'
  local picked
  # First pass keeps only main-training bars: eval bars have small totals,
  # so require total > 5000, and speed < 30s/it (drops bogus post-eval
  # estimates); take the most recent survivor.
  picked=$(grep -oP "$bar_re" "$LOG_FILE" |
    while IFS= read -r candidate; do
      cand_total=$(grep -oP '\d+/\K\d+' <<<"$candidate")
      cand_speed=$(grep -oP '[\d.]+(?=s/it)' <<<"$candidate")
      if [ "$cand_total" -gt 5000 ] 2>/dev/null; then
        awk "BEGIN{exit ($cand_speed < 30) ? 0 : 1}" && echo "$candidate"
      fi
    done | tail -1)
  if [ -z "$picked" ]; then
    # Fallback: accept the last bar of any kind.
    picked=$(grep -oP "$bar_re" "$LOG_FILE" | tail -1)
  fi
  [ -z "$picked" ] && return 1
  # Split the chosen bar into its six fields.
  current=$(grep -oP '\|\s*\K\d+(?=/)' <<<"$picked")
  total=$(grep -oP '\d+/\K\d+' <<<"$picked")
  elapsed=$(grep -oP '\[\K[\d:]+(?=<)' <<<"$picked")
  remaining=$(grep -oP '<\K[\d:]+' <<<"$picked")
  speed=$(grep -oP '[\d.]+(?=s/it)' <<<"$picked")
  pct=$(grep -oP '^\d+(?=%)' <<<"$picked")
  echo "$current $total $elapsed $remaining $speed $pct"
}
# Extract "loss lr" from the newest checkpoint's trainer_state.json.
# Echoes "LOSS LR" (loss with 4 decimals, lr in scientific notation; 'N/A'
# when the state file has no such entries); returns 1 when no checkpoint or
# state file exists.
parse_checkpoint() {
  local latest_ckpt="" best=-1 ckpt_dir step
  # Pick the checkpoint with the highest numeric step suffix.
  # BUG FIX: the old `ls | sort -t- -k2 -n` split on every '-' in the path
  # (quant-llm, qwen2.5-14b, ...), so the numeric key was garbage and the
  # tie-break was lexical — checkpoint-999 outranked checkpoint-1000.
  for ckpt_dir in "${OUTPUT_DIR}"/checkpoint-*; do
    [ -d "$ckpt_dir" ] || continue
    step=${ckpt_dir##*-}
    if [ "$step" -gt "$best" ] 2>/dev/null; then
      best=$step
      latest_ckpt=$ckpt_dir
    fi
  done
  [ -z "$latest_ckpt" ] && return 1
  local state_file="${latest_ckpt}/trainer_state.json"
  [ ! -f "$state_file" ] && return 1
  # Pass the path via argv rather than interpolating it into the program
  # text (robust against quotes/special characters in the path).
  python3 -c "
import json, sys
d = json.load(open(sys.argv[1]))
hist = d.get('log_history', [])
logs = [l for l in hist if 'loss' in l]
loss = f\"{logs[-1]['loss']:.4f}\" if logs else 'N/A'
lr_logs = [l for l in hist if 'learning_rate' in l]
lr = f\"{lr_logs[-1]['learning_rate']:.2e}\" if lr_logs else 'N/A'
print(f'{loss} {lr}')
" "$state_file" 2>/dev/null
}
# ----- Progress report -----
echo "========== QuantLLM 训练进度 =========="
progress=$(parse_log_progress)
if [ -n "$progress" ]; then
  # Live tqdm data found in the log; unpack the six whitespace-separated
  # fields emitted by parse_log_progress. NOTE: current/total stay set and
  # are read again further down by the interrupted-run check.
  read -r current total elapsed remaining speed pct <<< "$progress"
  echo " 进度: ${current}/${total} (${pct}%)"
  echo " 已运行: ${elapsed}"
  echo " 预计剩余: ${remaining}"
  echo " 速度: ${speed}s/step"
  # Prefer loss/lr/grad_norm/epoch from the trainer's dict-style log lines
  # (e.g. {'loss': 1.23, ...}) — fresher than the last checkpoint.
  log_loss_line=$(grep -P "^\{'loss'" "$LOG_FILE" | tail -1)
  if [ -n "$log_loss_line" ]; then
    loss=$(echo "$log_loss_line" | grep -oP "'loss':\s*'?\K[\d.]+")
    lr=$(echo "$log_loss_line" | grep -oP "'learning_rate':\s*'?\K[\de.+-]+")
    grad_norm=$(echo "$log_loss_line" | grep -oP "'grad_norm':\s*'?\K[\d.]+")
    epoch=$(echo "$log_loss_line" | grep -oP "'epoch':\s*'?\K[\d.]+")
    [ -n "$loss" ] && echo " Loss: ${loss}"
    [ -n "$grad_norm" ] && echo " Grad Norm: ${grad_norm}"
    [ -n "$lr" ] && echo " LR: ${lr}"
    [ -n "$epoch" ] && echo " Epoch: ${epoch}"
  else
    # Fallback: read loss/lr from the newest checkpoint.
    ckpt_info=$(parse_checkpoint)
    if [ -n "$ckpt_info" ]; then
      read -r loss lr <<< "$ckpt_info"
      echo " Loss: ${loss}"
      echo " LR: ${lr}"
    fi
  fi
else
  # Fallback (legacy mode): no tqdm output in the log, so read progress from
  # the newest checkpoint's trainer_state.json instead.
  latest_ckpt=$(ls -d ${OUTPUT_DIR}/checkpoint-* 2>/dev/null | sort -t- -k2 -n | tail -1)
  if [ -n "$latest_ckpt" ]; then
    state_file="${latest_ckpt}/trainer_state.json"
    if [ -f "$state_file" ]; then
      info=$(python3 -c "
import json
d = json.load(open('$state_file'))
step = d['global_step']
mx = d['max_steps']
ep = d['epoch']
logs = [l for l in d['log_history'] if 'loss' in l]
loss = logs[-1]['loss'] if logs else 0
lr_logs = [l for l in d['log_history'] if 'learning_rate' in l]
lr = lr_logs[-1]['learning_rate'] if lr_logs else 0
print(f'{step} {mx} {ep:.2f} {loss:.4f} {lr:.2e}')
" 2>/dev/null)
      if [ -n "$info" ]; then
        read -r step mx ep loss lr <<< "$info"
        # NOTE(review): divides by max_steps — assumes mx > 0; verify that
        # trainer_state.json always carries a positive max_steps.
        pct=$(python3 -c "print(f'{${step}/${mx}*100:.1f}')")
        echo " 进度: ${step}/${mx} (${pct}%)"
        echo " Epoch: ${ep}"
        echo " Loss: ${loss}"
        echo " LR: ${lr}"
        echo " (来源: checkpoint, 日志无 tqdm 输出)"
      fi
    fi
  else
    echo " 未找到训练进度信息"
  fi
fi
# ----- Training process status (and optional --guard auto-restart) -----
if [ -n "$train_pid" ]; then
  # Process start timestamp from ps; previously computed but never used.
  start_time=$(ps -o lstart= -p "$train_pid" 2>/dev/null)
  echo " 训练进程: PID ${train_pid} (运行中)"
  [ -n "$start_time" ] && echo " 启动时间: ${start_time}"
  gpu_mem=$(nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1)
  [ -n "$gpu_mem" ] && echo " GPU 显存: ${gpu_mem} MB"
else
  if [ -f "$LOG_FILE" ] && grep -q "训练完成" "$LOG_FILE" 2>/dev/null; then
    echo " 训练进程: 未运行(训练已完成)"
  elif [ -n "$progress" ] && [ "$current" -lt "$total" ] 2>/dev/null; then
    # Log shows unfinished progress but no live process: the run died.
    echo " 训练进程: 未运行(异常中断!)"
    if [ "${1:-}" = "--guard" ]; then
      echo " >>> 自动重启训练..."
      # Only restart if we can actually enter the project directory;
      # otherwise run.sh would be launched from the wrong cwd.
      if cd /opt/quant-llm; then
        source /opt/quant-llm/finetune-env/bin/activate
        export http_proxy=http://192.168.0.10:6152 https_proxy=http://192.168.0.10:6152 no_proxy=localhost,127.0.0.1,::1,10.0.0.0/8,192.168.0.0/16
        nohup bash run.sh train >> "$LOG_FILE" 2>&1 &
        echo "$!" > "$PID_FILE"
        echo " >>> 已重启: PID $!"
      else
        echo " >>> 重启失败: 无法进入 /opt/quant-llm" >&2
      fi
    else
      echo " 提示: 用 --guard 可自动重启"
    fi
  else
    echo " 训练进程: 未运行"
  fi
fi
echo "======================================="