-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstats.py
More file actions
76 lines (63 loc) · 2.05 KB
/
stats.py
File metadata and controls
76 lines (63 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Tracks experiment statistics: per-request timing, token throughput, and per-token latency.
import torch
import time
from threading import Lock
from collections import defaultdict
# Evaluated once at import: whether GPU work has to be synchronized before
# the clock is read.
cuda_time = torch.cuda.is_available()


def get_time():
    """Return a monotonic timestamp, in seconds, for measuring intervals.

    When CUDA is available, pending GPU work is synchronized first so the
    timestamp is taken after that work has completed. The value comes from
    ``time.perf_counter`` on both paths, so only the difference between two
    calls is meaningful; it is not a wall-clock (epoch) time.

    Returns:
        float: Seconds on the ``time.perf_counter`` clock.
    """
    if cuda_time:
        # Wait for outstanding CUDA work before sampling the clock.
        torch.cuda.synchronize()
    # perf_counter is monotonic, unlike time.time(), so an elapsed interval
    # cannot come out negative when the system clock is adjusted, and both
    # the CUDA and non-CUDA paths now share the same clock.
    return time.perf_counter()
class ServerStats:
    """Accumulate timing statistics for requests handled by the server.

    A request is opened with ``start_request`` (which hands back a request
    id) and closed with ``finish_request``. Aggregate counters are updated
    under ``stats_lock``; request ids are allocated under ``rid_lock``.

    Attributes:
        total_tokens: Sum of the token counts of all finished requests.
        total_elapsed: Sum of the elapsed seconds of all finished requests.
        total_requests: Number of finished requests.
        total_latency: Sum of ``elapsed / tokens`` over finished requests
            that had a positive token count.
        table: Maps an in-flight request id to ``(start_time, num_tokens)``.
        rid: Next request id to hand out.
        data_based_on_token: Maps a token count to the list of elapsed
            times recorded for requests with that token count.
    """

    def __init__(self):
        self.stats_lock = Lock()
        self.total_tokens = 0
        self.total_elapsed = 0
        self.total_requests = 0
        self.total_latency = 0
        # Number of finished requests that contributed to total_latency
        # (i.e. those with a positive token count).
        self._latency_samples = 0
        self.table = {}
        self.rid_lock = Lock()
        self.rid = 0
        self.data_based_on_token = defaultdict(list)

    def get_new_rid(self):
        """Allocate and return the next request id (0, 1, 2, ...)."""
        with self.rid_lock:
            new_rid = self.rid
            self.rid += 1
        return new_rid

    def start_request(self, num_tokens):
        """Record the start of a request and return its request id.

        Args:
            num_tokens: Token count associated with the request.

        Returns:
            The id to pass to ``finish_request`` later.
        """
        new_rid = self.get_new_rid()
        self.table[new_rid] = (get_time(), num_tokens)
        return new_rid

    def finish_request(self, rid):
        """Record the completion of the request identified by ``rid``.

        Args:
            rid: Id previously returned by ``start_request``.

        Raises:
            KeyError: If ``rid`` is unknown or was already finished.
        """
        # Sample the clock before anything else so bookkeeping is not timed.
        now = get_time()
        # pop() rather than index: finished entries must not pile up in the
        # table for the lifetime of the server.
        started, tokens = self.table.pop(rid)
        elapsed = now - started
        with self.stats_lock:
            self.total_tokens += tokens
            self.total_elapsed += elapsed
            self.total_requests += 1
            # A request without tokens has no per-token latency; skipping it
            # avoids a ZeroDivisionError that would otherwise leave the
            # counters above half-updated.
            if tokens > 0:
                self.total_latency += elapsed / tokens
                self._latency_samples += 1
            self.data_based_on_token[tokens].append(elapsed)

    def token_breakdown(self):
        """Return elapsed times grouped by token count, or None if empty.

        The result is a snapshot copy taken under the lock, so callers can
        iterate it while other threads keep finishing requests.
        """
        with self.stats_lock:
            if not self.data_based_on_token:
                return None
            snapshot = {
                tokens: list(times)
                for tokens, times in self.data_based_on_token.items()
            }
        return defaultdict(list, snapshot)

    def latency_per_token(self):
        """Return the average time a user waits for an individual token.

        Returns:
            The mean of ``elapsed / tokens`` over finished requests with a
            positive token count, or None if there are none yet.
        """
        # Guard and read under the same lock so they see the same state.
        with self.stats_lock:
            if self._latency_samples <= 0:
                return None
            return self.total_latency / self._latency_samples

    def throughput(self):
        """Return the number of output tokens generated per second.

        Returns:
            ``total_tokens / total_elapsed``, or None if no elapsed time has
            been recorded yet.
        """
        # Guard and read under the same lock so they see the same state.
        with self.stats_lock:
            if self.total_elapsed <= 0:
                return None
            return self.total_tokens / self.total_elapsed