haut/data_loader.py at main · anvix9/haut · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from pathlib import Path
from typing import Dict
import json
import utils

class DataLoader:
    """Best-effort loaders for per-paper analysis JSON files and card markdown files.

    Both public loaders scan a directory, derive a paper id from each file
    name (expected shape ``<tag>_<paper_id>_<suffix>``), and return a dict
    keyed by that paper id. A file that cannot be read or parsed is reported
    with ``print`` and skipped; it never aborts the whole load.
    """

    @staticmethod
    def _read_papers_metadata(papers_json_path: str):
        """Return the parsed metadata JSON, or an empty list if it cannot be read."""
        try:
            with open(papers_json_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            # Missing/corrupt metadata is not fatal: callers fall back to defaults.
            print(f"Error reading metadata JSON: {e}")
            return []

    @staticmethod
    def _index_metadata_by_id(papers_metadata) -> Dict[str, Dict]:
        """Map each string ``id`` to its first metadata record, ignoring malformed entries."""
        index: Dict[str, Dict] = {}
        if not isinstance(papers_metadata, list):
            return index
        for paper in papers_metadata:
            if not isinstance(paper, dict):
                continue
            paper_id = paper.get('id')
            # Ids parsed from file names are strings, so only string ids can match.
            if isinstance(paper_id, str):
                # setdefault keeps the first record when an id is duplicated.
                index.setdefault(paper_id, paper)
        return index

    @staticmethod
    def _split_tag_and_id(name: str):
        """Split ``<tag>_<paper_id>`` into its two parts, raising ValueError otherwise."""
        parts = name.split("_")
        if len(parts) != 2:
            raise ValueError(f"expected '<tag>_<paper_id>' but got {name!r}")
        return parts[0], parts[1]

    @staticmethod
    def load_analysis_json_file(directory: str = './paper_analysis/',
                          papers_json_path: str = './paper_metadata/metadata_2024_12_31_143026.json') -> Dict[str, Dict]:
        """
        Load ``*_analysis.json`` files and extract the research questions of each paper.

        Args:
            directory: Directory containing ``<tag>_<paper_id>_analysis.json`` files
            papers_json_path: Accepted for backward compatibility; the metadata file
                is not consulted because no field of the result is derived from it

        Returns:
            Dictionary with paper_id as key. Each value has the shape
            ``{'questions': str, 'metadata': {'paper_id': str}}`` where
            ``questions`` is the extracted questions joined with ``" - "``.
        """
        files_content = {}

        # Sorted so that, if two files map to the same paper id, the winner is
        # deterministic rather than dependent on filesystem listing order.
        for file_path in sorted(Path(directory).glob('*_analysis.json')):
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    analysis_data = json.load(file)

                # "<tag>_<paper_id>_analysis" -> paper_id
                stem = file_path.stem.replace('_analysis', '')
                _tag, paper_id = DataLoader._split_tag_and_id(stem)

                # The helper is defined in the project's utils module; only the
                # first element of its return value (the questions) is used here.
                research_content = analysis_data.get("research", "")
                questions, _ = utils.extract_questions_and_filename(file_path.name, research_content)

                files_content[paper_id] = {
                    'questions': " - ".join(questions),
                    'metadata': {
                        'paper_id': paper_id,
                    }
                }
            except Exception as e:
                # Best-effort: report the bad file and keep going.
                print(f"Error processing {file_path}: {e}")

        return files_content

    @staticmethod
    def load_card_markdown(directory: str = './card_papers/', papers_json_path: str = './paper_metadata/metadata_2024_12_31_143026.json') -> Dict[str, Dict]:
        """
        Load markdown card files and attach the matching paper metadata from JSON

        Args:
            directory: Directory containing ``<tag>_<paper_id>_card.md`` files
            papers_json_path: Path to a JSON file holding a list of paper records,
                each with ``id``, ``title``, ``authors``, ``submission_date`` and ``link``

        Returns:
            Dictionary with paper_id as key. Each value has the shape
            ``{'content': str, 'metadata': {...}}``; metadata fields that are
            missing (or whose paper is not in the JSON) fall back to
            ``"Unknown Title"``, ``"Unknown authors"``, ``"Unknown Date"`` and
            ``"Unknown Link"``.
        """
        # Build the id lookup once instead of rescanning the list for every field
        # of every card.
        metadata_by_id = DataLoader._index_metadata_by_id(
            DataLoader._read_papers_metadata(papers_json_path)
        )

        files_content = {}

        for file_path in sorted(Path(directory).glob('*.md')):
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()

                # "<tag>_<paper_id>_card.md" -> paper_id
                stem = file_path.name.replace('_card.md', '')
                _tag, paper_id = DataLoader._split_tag_and_id(stem)

                paper = metadata_by_id.get(paper_id, {})
                files_content[paper_id] = {
                    'content': content,
                    'metadata': {
                        'paper_id': paper_id,
                        'title': paper.get('title', "Unknown Title"),
                        'authors': paper.get('authors', "Unknown authors"),
                        'submission_date': paper.get('submission_date', "Unknown Date"),
                        'link': paper.get('link', "Unknown Link"),
                    }
                }
            except Exception as e:
                # Best-effort: report the bad file and keep going.
                print(f"Error reading {file_path}: {e}")

        return files_content