forked from msr-fiddle/dejavu
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_nf4_loading.py
More file actions
69 lines (59 loc) · 1.9 KB
/
test_nf4_loading.py
File metadata and controls
69 lines (59 loc) · 1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
"""Test NF4 weight loading in Python"""
import sys
import os
sys.path.insert(0, 'examples/pytorch/gpt')
from utils.gpt_dv import GPT
import torch
banner = "=" * 80
print(banner)
print("Testing NF4 Weight Loading")
print(banner)

# Smoke test: construct a one-layer GPT against the NF4 checkpoint, then
# report CUDA memory. Any exception is printed with its traceback and the
# process exits with status 1; otherwise the script falls through (status 0).
try:
    # Constructor arguments collected in one place. lib_path is relative to
    # the current working directory; ckpt_path is an absolute path.
    model_config = {
        "head_num": 32,
        "size_per_head": 64,
        "vocab_size": 151936,
        "start_id": 151643,
        "end_id": 151645,
        "layer_num": 1,  # Only 1 layer for memory test
        "max_seq_len": 2048,
        "tensor_para_size": 1,
        "pipeline_para_size": 1,
        "lib_path": 'build/lib/libth_transformer.so',
        "ckpt_path": '/root/Qwen3-30B-A3B-FT/1-gpu-nf4',
        "inference_data_type": 'fp16',
        "weights_data_type": torch.float16,
        "layernorm_eps": 1e-6,
        "layernorm_type": 'pre_layernorm',
        "activation_type": 'Gelu',
        "has_positional_encoding": False,
        "has_pre_decoder_layernorm": False,
        "has_post_decoder_layernorm": True,
        "has_adapters": False,
        "adapter_inter_size": 0,
        "use_attention_linear_bias": False,
        "int8_mode": 0,
        "expert_num": 128,
        "moe_k": 8,
        "moe_layer_index": [0],
    }
    gpt = GPT(**model_config)
    print("✅ GPT instance created")

    # No explicit load call is made here: the script relies on the
    # constructor having loaded the weights, as the message below states.
    # NOTE(review): nothing in this script verifies that independently.
    print("\nWeights already loaded during __init__")
    print("✅ NF4 weights loaded successfully!")

    # Check memory usage
    import torch.cuda

    if torch.cuda.is_available():
        # Raw values are divided by 1e9 and labelled "GB" in the output.
        allocated_gb = torch.cuda.memory_allocated() / 1e9
        reserved_gb = torch.cuda.memory_reserved() / 1e9
        print(f"\nGPU Memory:")
        print(f" Allocated: {allocated_gb:.2f} GB")
        print(f" Reserved: {reserved_gb:.2f} GB")

    print("\n" + banner)
    print("SUCCESS: NF4 loading works correctly!")
    print(banner)
except Exception as exc:
    # Top-level boundary of the script: show the message and full traceback,
    # then signal failure to the caller through the exit status.
    print(f"\n❌ Error: {exc}")
    import traceback

    traceback.print_exc()
    sys.exit(1)