forked from msr-fiddle/dejavu
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_simple.py
More file actions
123 lines (102 loc) · 3.25 KB
/
test_simple.py
File metadata and controls
123 lines (102 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
"""Simple test to identify where the code gets stuck"""
import os
import sys
import torch
# Every progress marker is flushed right away, so the last line visible on
# the console shows how far the script got if a later step hangs.
print("Step 1: Imports starting...", flush=True)

# Directory that contains this script; the GPT example directory is added
# to the module search path relative to it.
dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(dir_path, "examples/pytorch/gpt"))

print("Step 2: Loading modules...", flush=True)

# The `comm` module from examples.pytorch.gpt.utils is deliberately NOT
# imported here: it hangs without MPI.
from examples.pytorch.gpt.utils.parallel_gpt_dv import ParallelGPT

print("Step 3: ParallelGPT loaded", flush=True)
# Model hyper-parameters and file locations consumed by the model
# construction step further down.
# NOTE(review): the values presumably describe the Qwen3-30B-A3B checkpoint
# named in ckpt_path -- confirm against the converted checkpoint's config.
config = dict(
    layer_num=48,
    head_num=32,
    size_per_head=128,
    hidden_size=2048,
    inter_size=6144,
    vocab_size=151936,
    max_seq_len=2048,
    expert_num=128,
    moe_k=8,
    # One entry per layer index 0..47.
    moe_layer_index=[*range(48)],
    tensor_para_size=1,
    pipeline_para_size=1,
    # Machine-specific absolute path to the checkpoint directory.
    ckpt_path='/home/victoryang00/Qwen3-30B-A3B-FT/1-gpu',
    # Shared library resolved relative to this script's directory.
    lib_path=os.path.join(dir_path, 'build/lib/libth_transformer.so'),
)
print("Step 4: Creating model (skipping model parallel init)...", flush=True)

# Set up basic environment for single-GPU: make GPU 0 the current CUDA device
# before the model is constructed.
torch.cuda.set_device(0)

# Build the model from `config`. The parallel sizes are read from `config`
# (both are 1 there) instead of being hard-coded a second time, so that
# editing `config` actually affects the constructed model.
gpt = ParallelGPT(
    head_num=config['head_num'],
    size_per_head=config['size_per_head'],
    vocab_size=config['vocab_size'],
    # Start and end token share the same id.
    # NOTE(review): presumably the tokenizer's end-of-text id -- confirm.
    start_id=151643,
    end_id=151643,
    layer_num=config['layer_num'],
    ckpt_path=config['ckpt_path'],
    max_seq_len=config['max_seq_len'],
    tensor_para_size=config['tensor_para_size'],
    pipeline_para_size=config['pipeline_para_size'],
    lib_path=config['lib_path'],
    inference_data_type='fp16',
    int8_mode=0,
    weights_data_type='fp16',
    layernorm_eps=1e-6,
    layernorm_type='pre_layernorm',
    activation_type='silu',
    has_positional_encoding=False,
    has_pre_decoder_layernorm=False,
    has_post_decoder_layernorm=True,
    has_adapters=False,
    adapter_inter_size=0,
    use_attention_linear_bias=False,
    inter_size=config['inter_size'],
    # Mixture-of-experts settings, all taken from `config`.
    gpt_with_moe=True,
    expert_num=config['expert_num'],
    moe_k=config['moe_k'],
    moe_layer_index=config['moe_layer_index'],
    shared_contexts_ratio=1.0,
    # Single-process run: world sizes of 1 and rank 0.
    prompt_world_size=1,
    token_world_size=1,
    torch_rank=0,
    restart=False,
    hidden_size=config['hidden_size'],
    # NOTE(review): 4 KV heads vs. 32 attention heads -- presumably
    # grouped-query attention; confirm against the checkpoint.
    num_kv_heads=4,
)
print("Step 5: Model created, checking weights...", flush=True)

# Report how many weight tensors were loaded, then spot-check two of them.
# NOTE(review): why indices 0 and 96 were chosen is not visible here.
print(f" Weights count: {len(gpt.weights.w)}")
for idx in (0, 96):
    # Skip indices that do not exist rather than raising IndexError.
    if idx >= len(gpt.weights.w):
        continue
    tensor = gpt.weights.w[idx]
    print(f" w[{idx}]: numel={tensor.numel()}, device={tensor.device}")
sys.stdout.flush()
# Bracket the cuda() call with flushed markers so a hang inside it is
# distinguishable from a hang in the neighbouring steps.
print("Step 6: Calling cuda()...", flush=True)
gpt.cuda()
print("Step 7: cuda() completed!", flush=True)
print("Step 8: Running inference test...", flush=True)

# Simple inference test.
# The forward function expects lists of tensors for batched inference, so the
# token ids and their lengths are each wrapped in a one-element list.
input_ids = [torch.tensor([list(range(1, 7))], dtype=torch.int32).cuda()]
input_lengths = [torch.tensor([6], dtype=torch.int32).cuda()]
# Should be a tensor; unlike the inputs above it is not moved to the GPU.
output_len = torch.tensor([2], dtype=torch.int32)

print(f" input_ids[0] shape: {input_ids[0].shape}")
print(f" input_lengths[0]: {input_lengths[0]}", flush=True)

# Gradients are not needed for a forward-only smoke test.
with torch.no_grad():
    outputs = gpt(input_ids, input_lengths, output_len)

print("Step 9: Inference completed!")
print(f" Output: {outputs}", flush=True)
print("SUCCESS!")