-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_data.py
More file actions
58 lines (45 loc) · 1.55 KB
/
debug_data.py
File metadata and controls
58 lines (45 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import sys
import os
import numpy as np
from src.data.streaming_loader import HFStreamingDataLoader, MixedDataLoader
def test_loader():
    """Smoke-test ``MixedDataLoader`` by fetching one batch and printing diagnostics.

    Builds a loader over a single streaming dataset with a small debug
    configuration, pulls one batch via ``next()``, and prints:

    * the shapes of ``batch['input']`` and ``batch['label']``,
    * the first 50 values of the first sample of each,
    * how many entries of the input equal zero,
    * the first sample of the input interpreted as UTF-8 bytes.

    This is a manual debugging aid, not an assertion-based test: any failure
    while building the loader or inspecting the batch is reported on stdout
    together with a traceback, and the function returns ``None`` either way.
    """
    print("Testing DataLoader...")

    # Mock Config
    datasets_config = [
        {'name': 'ytu-ce-cosmos/Cosmos-Turkish-Corpus-v1.0', 'weight': 1.0},
        # Example of a second source that can be enabled for mixing:
        # {'name': 'bigcode/the-stack-smol', 'data_dir': 'data/python', 'weight': 0.3}
    ]

    try:
        loader = MixedDataLoader(
            datasets=datasets_config,
            # Derive the weights from the config so the two cannot drift apart
            # when a dataset entry is added or removed above.
            weights=[entry['weight'] for entry in datasets_config],
            batch_size=1,
            seq_len=64,  # Short sequence for debug
            patch_size=4
        )

        print("Loader initialized. Fetching batch...")
        batch = next(loader)

        x = batch['input']  # (B, L)
        y = batch['label']  # (B, L)

        print(f"Batch Shape: X={x.shape}, Y={y.shape}")

        # Check Sample Content
        raw_x = x[0].tolist()
        raw_y = y[0].tolist()

        print(f"First 50 bytes X: {raw_x[:50]}")
        print(f"First 50 bytes Y: {raw_y[:50]}")

        # Check for zeros
        zeros_x = int(np.sum(x == 0))
        total_x = x.size
        # Guard the ratio so an empty batch reports 0% instead of nan.
        zero_ratio = zeros_x / total_x if total_x else 0.0
        print(f"Zeros in X: {zeros_x}/{total_x} ({zero_ratio:.2%})")

        # Decode check
        try:
            # bytes() is the only step that can fail here: it raises ValueError
            # for values outside 0..255 and TypeError for non-integer items.
            # The decode itself cannot raise because of errors='replace'.
            txt = bytes(raw_x).decode('utf-8', errors='replace')
        except (ValueError, TypeError) as decode_err:
            print(f"Decode failed: {decode_err}")
        else:
            print(f"Decoded X: {txt[:100]}")

    except Exception as e:
        # Top-level boundary of a debug script: report everything, never crash.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
# Run the loader smoke check only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    test_loader()