forked from YiboZhao624/E-2GraphRAG
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
82 lines (72 loc) · 2.77 KB
/
utils.py
File metadata and controls
82 lines (72 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from typing import List
from transformers import AutoTokenizer
from rouge import Rouge
from dataloader import NovelQALoader, InfiniteChoiceLoader, InfiniteQALoader
import json,os
from extract_graph import build_graph
def load_dataset(dataset_name:str, dataset_path:str):
    """Instantiate the dataset loader selected by ``dataset_name``.

    Args:
        dataset_name: ``"NovelQA"``, ``"InfiniteChoice"`` or ``"InfiniteQA"``.
            The older spelling ``"InfiniteQALoader"`` is still accepted so
            existing configurations keep working.
        dataset_path: Passed unchanged to the loader's constructor.

    Returns:
        The loader object constructed from ``dataset_path``.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    if dataset_name == "NovelQA":
        return NovelQALoader(dataset_path)
    if dataset_name == "InfiniteChoice":
        return InfiniteChoiceLoader(dataset_path)
    # This branch used to match only the class name ("InfiniteQALoader"),
    # unlike the two dataset-name keys above; accept the consistent
    # "InfiniteQA" as well.
    if dataset_name in ("InfiniteQA", "InfiniteQALoader"):
        return InfiniteQALoader(dataset_path)
    raise ValueError(
        f"Invalid dataset {dataset_name!r}; expected one of "
        "'NovelQA', 'InfiniteChoice', 'InfiniteQA'"
    )
def load_tree_graph(cache_folder:str):
    """Load the cached tree, graph, index and appearance counts.

    Reads ``tree.json``, ``graph.json``, ``index.json`` and
    ``appearance_count.json`` from ``cache_folder`` and turns the stored
    edges back into a graph with ``build_graph``.

    Args:
        cache_folder: Directory that contains the four JSON cache files.

    Returns:
        A tuple ``(tree, G, index, appearance_count)`` where ``G`` is the
        value returned by ``build_graph`` and the other three are the parsed
        JSON documents.

    Raises:
        OSError: If one of the cache files cannot be opened.
        json.JSONDecodeError: If one of the cache files is not valid JSON.
    """
    def _read_json(file_name):
        # ``with`` closes the handle as soon as the document is parsed;
        # the bare ``open()`` calls used before left that to the GC.
        with open(os.path.join(cache_folder, file_name), "r") as handle:
            return json.load(handle)

    tree = _read_json("tree.json")
    edges = _read_json("graph.json")
    index = _read_json("index.json")
    appearance_count = _read_json("appearance_count.json")
    G = build_graph(edges)
    return tree, G, index, appearance_count
def sequential_split(text:str, tokenizer:"AutoTokenizer",
                     length:int, overlap:int)->List[str]:
    """Split ``text`` into token windows of ``length`` overlapping by ``overlap``.

    The text is tokenized once. A window of up to ``length`` token ids is
    then decoded at every offset ``0, step, 2 * step, ...`` with
    ``step = length - overlap``; windows near the end of the text may hold
    fewer than ``length`` tokens.

    Args:
        text: The text to split.
        tokenizer: Callable tokenizer; it is invoked as
            ``tokenizer(text, return_tensors="pt")`` and must also provide
            ``decode``.
        length: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < length``.

    Returns:
        The decoded chunks in document order (empty if there are no tokens).

    Raises:
        ValueError: If ``length`` is not positive or ``overlap`` is outside
            ``[0, length)``.
    """
    if length <= 0:
        raise ValueError(f"length must be positive, got {length}")
    if not 0 <= overlap < length:
        # overlap == length gives range() a zero step (opaque error) and
        # overlap > length gives a negative step (silently no chunks).
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < length, "
            f"got overlap={overlap}, length={length}"
        )

    # NOTE(review): the tokenizer is called with its default settings, so any
    # special tokens it adds would also show up in the decoded chunks --
    # confirm this is intended.
    text_ids = tokenizer(text, return_tensors="pt")["input_ids"][0]
    step = length - overlap
    return [
        tokenizer.decode(text_ids[start:start + length])
        for start in range(0, len(text_ids), step)
    ]
import time
import multiprocessing as mp
from contextlib import contextmanager
from functools import wraps
from typing import Dict, Optional
class Timer:
    """Timer that records how long named tasks take, in seconds.

    Durations are kept in a dict proxy created by a
    ``multiprocessing.Manager`` (presumably so that worker processes can
    record into the same timer -- confirm against the callers).
    """

    def __init__(self):
        self.manager = mp.Manager()
        self.times = self.manager.dict()

    @contextmanager
    def timer(self, name: str):
        """Context manager that stores the elapsed time of its body under ``name``.

        The duration is recorded even if the body raises, and a later
        measurement with the same ``name`` overwrites the earlier one.
        """
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed (or go negative) when the system clock is adjusted.
        # Taken before ``try`` so ``finally`` always has a start value.
        start_time = time.perf_counter()
        try:
            yield
        finally:
            self.times[name] = time.perf_counter() - start_time

    def __getitem__(self, key: str) -> float:
        """Return the duration recorded for ``key``, or ``0.0`` if there is none."""
        return self.times.get(key, 0.0)

    def summary(self) -> str:
        """Return the recorded timings, one ``task: duration`` line per task."""
        return "\n".join(f"{task}: {duration:.2f}秒"
                         for task, duration in self.times.items())
def timed(timer: Timer, name: Optional[str] = None):
    """Return a decorator that times every call of a function on ``timer``.

    Each call runs inside ``timer.timer(...)``. The entry is stored under
    ``name`` when one is given (and truthy), otherwise under the decorated
    function's ``__name__``.
    """
    def _decorate(func):
        def _timed_call(*args, **kwargs):
            # Resolved on every call, exactly like the undecorated lookup.
            label = name if name else func.__name__
            with timer.timer(label):
                outcome = func(*args, **kwargs)
            return outcome

        # Copy the wrapped function's metadata onto the timing wrapper.
        return wraps(func)(_timed_call)

    return _decorate
# Running this module as a script does nothing; it only provides helpers.
if __name__ == "__main__":
    pass