-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
109 lines (88 loc) · 4.54 KB
/
data_loader.py
File metadata and controls
109 lines (88 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from pathlib import Path
from typing import Dict
import json
import utils
class DataLoader:
    """Load paper analysis JSON files and paper card markdown files from disk.

    Both loaders are best-effort: a file that cannot be read or parsed is
    reported on stdout and skipped, so one bad file never aborts the batch.
    File names are expected to look like ``<tag>_<paper_id>_analysis.json``
    and ``<tag>_<paper_id>_card.md``; the ``paper_id`` part keys the result.
    """

    # Fallback values used when a card has no matching metadata entry, or the
    # matching entry lacks the field.
    _METADATA_DEFAULTS = {
        'title': "Unknown Title",
        'authors': "Unknown authors",
        'submission_date': "Unknown Date",
        'link': "Unknown Link",
    }

    @staticmethod
    def _load_metadata_index(papers_json_path: str) -> Dict[str, Dict]:
        """Read the paper metadata JSON and index its entries by ``'id'``.

        Args:
            papers_json_path: Path to a JSON file holding a list of paper
                entries (dicts with an ``'id'`` key).

        Returns:
            Mapping of paper id to its metadata entry. Empty when the file
            cannot be read or parsed, or does not contain a list.
        """
        # Deliberately broad: missing or corrupt metadata must not stop the
        # cards from loading, they just get the "Unknown ..." fallbacks.
        try:
            with open(papers_json_path, 'r', encoding='utf-8') as f:
                papers_metadata = json.load(f)
        except Exception as e:
            print(f"Error reading metadata JSON: {e}")
            return {}

        if not isinstance(papers_metadata, list):
            print("Error reading metadata JSON: expected a list of paper entries")
            return {}

        index: Dict[str, Dict] = {}
        for entry in papers_metadata:
            # Skip malformed entries instead of failing every lookup on them.
            if not isinstance(entry, dict):
                continue
            entry_id = entry.get('id')
            # Ids parsed from file names are always strings, so only string
            # ids can ever match.
            if isinstance(entry_id, str):
                # setdefault keeps the first entry when ids are duplicated.
                index.setdefault(entry_id, entry)
        return index

    @staticmethod
    def _extract_paper_id(name: str) -> str:
        """Return the ``paper_id`` part of a ``<tag>_<paper_id>`` name.

        Raises:
            ValueError: If ``name`` does not consist of exactly two
                underscore-separated parts.
        """
        parts = name.split("_")
        if len(parts) != 2:
            raise ValueError(f"expected a '<tag>_<paper_id>' name, got {name!r}")
        return parts[1]

    @staticmethod
    def load_analysis_json_file(directory: str = './paper_analysis/',
                                papers_json_path: str = './paper_metadata/metadata_2024_12_31_143026.json') -> Dict[str, Dict]:
        """
        Load the research questions from every ``*_analysis.json`` file.

        Args:
            directory: Directory containing json analysis files
            papers_json_path: Accepted for interface compatibility only; the
                metadata file is not read because nothing from it ends up in
                the result.

        Returns:
            Dictionary with paper_id as key and, as value, a dict with
            ``'questions'`` (the extracted questions joined with " - ") and
            ``'metadata'`` (currently only ``{'paper_id': ...}``).
        """
        files_content = {}
        for file_path in Path(directory).glob('*_analysis.json'):
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    analysis_data = json.load(file)

                # "<tag>_<paper_id>_analysis" -> "<tag>_<paper_id>" -> paper_id
                paper_id = DataLoader._extract_paper_id(
                    file_path.stem.replace('_analysis', ''))

                research_content = analysis_data.get("research", "")
                # The helper lives in the project's utils module; its second
                # return value is not needed here.
                questions, _ = utils.extract_questions_and_filename(
                    file_path.name, research_content)

                files_content[paper_id] = {
                    'questions': " - ".join(questions),
                    'metadata': {
                        'paper_id': paper_id,
                    },
                }
            except Exception as e:
                # Best-effort: report the bad file and move on to the next.
                print(f"Error processing {file_path}: {e}")
        return files_content

    @staticmethod
    def load_card_markdown(directory: str = './card_papers/', papers_json_path: str = './paper_metadata/metadata_2024_12_31_143026.json') -> Dict[str, Dict]:
        """
        Load markdown card files and attach the matching paper metadata.

        Args:
            directory: Directory containing markdown files
            papers_json_path: Path to JSON file containing paper metadata

        Returns:
            Dictionary with paper_id as key and, as value, a dict with
            ``'content'`` (the markdown text) and ``'metadata'`` (paper_id,
            title, authors, submission_date, link). Metadata fields that
            cannot be found fall back to an "Unknown ..." placeholder.
        """
        # Built once so each card costs one dict lookup rather than repeated
        # scans of the whole metadata list.
        metadata_index = DataLoader._load_metadata_index(papers_json_path)

        files_content = {}
        for file_path in Path(directory).glob('*.md'):
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()

                # "<tag>_<paper_id>_card.md" -> "<tag>_<paper_id>" -> paper_id
                paper_id = DataLoader._extract_paper_id(
                    file_path.name.replace('_card.md', ''))

                entry = metadata_index.get(paper_id, {})
                metadata = {'paper_id': paper_id}
                for field, fallback in DataLoader._METADATA_DEFAULTS.items():
                    metadata[field] = entry.get(field, fallback)

                files_content[paper_id] = {
                    'content': content,
                    'metadata': metadata,
                }
            except Exception as e:
                # Best-effort: report the bad file and move on to the next.
                print(f"Error reading {file_path}: {e}")
        return files_content