-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgpu_collect.py
More file actions
153 lines (125 loc) · 4.85 KB
/
gpu_collect.py
File metadata and controls
153 lines (125 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
"""
GPU Data Collector - runs on the server, no third-party dependencies.
Saves nvidia-smi dmon output to a standardized CSV file.

Usage:
    python3 gpu_collect.py                  # default 1s sampling
    python3 gpu_collect.py -i 2             # 2s sampling interval
    python3 gpu_collect.py -o gpu_data.csv  # specify output file
    python3 gpu_collect.py -d 3600          # run for 3600 seconds

Press Ctrl+C to stop.
"""
import argparse
import csv
import os
import signal
import subprocess
import sys
import time
from datetime import datetime
def parse_args(argv=None):
    """Parse and validate the collector's command-line options.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``interval`` (int, >= 1), ``output`` (str)
        and ``duration`` (int, >= 0, where 0 means unlimited).

    Raises:
        SystemExit: Raised by argparse (status 2, message on stderr) for
            unknown options, non-integer values, an interval below 1 or a
            negative duration.
    """
    parser = argparse.ArgumentParser(
        description="Collect nvidia-smi dmon data to CSV")
    parser.add_argument("-i", "--interval", type=int, default=1,
                        help="Sampling interval in seconds (default: 1)")
    parser.add_argument("-o", "--output", type=str, default="gpu_data.csv",
                        help="Output CSV file path (default: gpu_data.csv)")
    parser.add_argument("-d", "--duration", type=int, default=0,
                        help="Duration in seconds, 0 for unlimited (default: 0)")
    args = parser.parse_args(argv)

    # A zero or negative value cannot be a sampling period; reject it here
    # with a clear message instead of handing it on to nvidia-smi.
    if args.interval < 1:
        parser.error("--interval must be a positive number of seconds")
    # 0 is the documented "unlimited" value; anything below it is a typo.
    if args.duration < 0:
        parser.error("--duration must be 0 (unlimited) or a positive "
                     "number of seconds")
    return args
def get_gpu_uuid_map():
    """Query nvidia-smi for the GPU index -> UUID mapping.

    The lookup is best-effort: any failure is reported as a warning on
    stderr and results in an empty mapping rather than an exception.

    Returns:
        dict mapping each GPU index (the string printed by nvidia-smi) to
        its UUID string. Empty when nvidia-smi cannot be run, times out
        (10 s), exits with a non-zero status, or prints no line of the
        form ``index, uuid``.
    """
    cmd = ["nvidia-smi", "--query-gpu=index,uuid",
           "--format=csv,noheader,nounits"]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True,
                                timeout=10)
    except (OSError, subprocess.SubprocessError, ValueError) as exc:
        # OSError: binary missing / not executable; SubprocessError: timeout;
        # ValueError: covers undecodable output (UnicodeDecodeError).
        print(f"Warning: could not query GPU UUIDs: {exc}", file=sys.stderr)
        return {}

    if result.returncode != 0:
        print("Warning: nvidia-smi UUID query exited with status "
              f"{result.returncode}", file=sys.stderr)
        return {}

    uuid_map = {}
    for line in result.stdout.splitlines():
        parts = [field.strip() for field in line.split(",")]
        # Keep only well-formed "index, uuid" rows with both fields present.
        if len(parts) == 2 and all(parts):
            uuid_map[parts[0]] = parts[1]
    return uuid_map
def main():
    """Stream ``nvidia-smi dmon`` samples into a CSV file until stopped.

    Reads the options with ``parse_args()``, prints the GPUs reported by
    ``get_gpu_uuid_map()``, starts ``nvidia-smi dmon -s puc -d <interval>``
    and writes one CSV row per data line, prefixed with a local timestamp,
    the GPU index and its UUID ("N/A" when the UUID is unknown). Metrics
    missing from the dmon output are written as "-".

    Collection ends when SIGINT/SIGTERM is received, when nvidia-smi closes
    its output, or once ``--duration`` seconds have elapsed (checked after
    each written row).

    Exits with status 1 if nvidia-smi is not found or an I/O error occurs
    while writing the output file.
    """
    args = parse_args()
    output_path = os.path.abspath(args.output)

    running = True

    def on_signal(signum, frame):
        # Only record the request; the read loop checks the flag when the
        # next line arrives and then shuts down in an orderly way.
        nonlocal running
        running = False

    signal.signal(signal.SIGINT, on_signal)
    signal.signal(signal.SIGTERM, on_signal)

    uuid_map = get_gpu_uuid_map()
    if uuid_map:
        print(f"Detected {len(uuid_map)} GPU(s):")

        def index_order(item):
            # Numeric indices first (in numeric order), anything else after
            # them; a tuple key never compares an int against a str.
            idx = item[0]
            return (0, int(idx), "") if idx.isdecimal() else (1, 0, idx)

        for idx, uuid in sorted(uuid_map.items(), key=index_order):
            print(f" GPU {idx}: {uuid}")
        print()

    cmd = ["nvidia-smi", "dmon", "-s", "puc", "-d", str(args.interval)]
    print(f"Starting: {' '.join(cmd)}")
    print(f"Output: {output_path}")
    print("Press Ctrl+C to stop\n")

    try:
        # stderr is inherited instead of piped: nothing in this script reads
        # it, so a pipe would hide nvidia-smi's error messages and could
        # block the child if the pipe ever filled up.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                text=True, bufsize=1)
    except FileNotFoundError:
        print("Error: nvidia-smi not found")
        sys.exit(1)

    metric_names = ("pwr", "gtemp", "sm", "mem", "enc", "dec", "mclk", "pclk")
    columns = []  # lower-cased dmon column names, filled from the header
    count = 0
    failed = False
    # Monotonic clock: the duration limit must not be affected by wall-clock
    # adjustments while collecting.
    start = time.monotonic()

    try:
        with open(args.output, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["timestamp", "gpu", "uuid", *metric_names])

            for line in proc.stdout:
                if not running:
                    break
                line = line.strip()
                if not line:
                    continue

                if line.startswith("#"):
                    # The first '#' line mentioning gpu/idx is taken as the
                    # column-name row; every later '#' line (presumably the
                    # units row and repeated headers) is skipped.
                    lowered = line.lower()
                    if not columns and ("gpu" in lowered or "idx" in lowered):
                        columns = lowered.lstrip("# ").split()
                    continue

                if not columns:
                    # Data before any header cannot be labelled.
                    continue

                parts = line.split()
                if len(parts) != len(columns):
                    # Not a data row matching the header layout.
                    continue

                row = dict(zip(columns, parts))
                now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                gpu_idx = row.get("gpu", row.get("idx", "0"))
                uuid = uuid_map.get(gpu_idx, "N/A")
                writer.writerow(
                    [now, gpu_idx, uuid]
                    + [row.get(name, "-") for name in metric_names]
                )
                # Flush per row so the file is usable while still running
                # and nothing is lost if the process is killed.
                f.flush()
                count += 1

                if count % 60 == 0:
                    print(f"[{now}] Collected {count} samples")

                if args.duration > 0 and time.monotonic() - start >= args.duration:
                    print(f"Reached {args.duration}s duration limit, stopping.")
                    break
    except OSError as exc:
        print(f"Error: I/O failure while writing {output_path}: {exc}",
              file=sys.stderr)
        failed = True
    finally:
        # Runs on every path (including a failed open) so the nvidia-smi
        # child is never left behind.
        proc.terminate()
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()  # reap the killed child
        proc.stdout.close()

    print(f"\nDone. {count} samples -> {output_path}")
    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()