casper/caspertools.py at main · IUCVLab/casper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import texttools

def parse_raw(text : str) -> list:
    '''
    Convert raw user input into a list of records.

    Every non-blank line of ``text`` yields one record of the form
    ``{'title': ..., 'authors': ...}``. The line is stripped of surrounding
    whitespace and split on tab characters: the first field is the title,
    the second field (when present) is the authors string. When a line has
    no tab, ``authors`` is ``None``. Fields after the second are ignored.
    '''
    records = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            # Blank (or whitespace-only) lines carry no record.
            continue
        fields = stripped.split('\t')
        authors = fields[1] if len(fields) > 1 else None
        records.append({'title': fields[0], 'authors': authors})
    return records


def _get_relevant_preprints(title : str, n=100) -> dict:
    '''
    Search arxiv.org (export API) for candidate papers matching a title.

    title - paper title used as the ``search_query`` value
    n     - value sent as ``max_results``; defaults to 100 because the
            service's search is not very precise

    Returns the object produced by ``feedparser.parse`` for the query URL.
    Every call sleeps for 3 seconds before issuing the request.
    '''
    import time
    from urllib.parse import quote_plus
    import feedparser

    api = f"http://export.arxiv.org/api/query?max_results={n}&search_query="
    # The 3-second delay is needed to comply with the site's terms of use.
    time.sleep(3)
    # quote_plus still turns spaces into '+', and additionally percent-encodes
    # characters such as '&', '#', '?' and '%' that would otherwise be parsed
    # as URL syntax and silently truncate or corrupt the query.
    return feedparser.parse(api + quote_plus(title))


def _feed_to_papers(feed : dict) -> list:
    '''
    Метод принимает результаты поиска сервиса и преобразует их в нужный нам формат записей.
    На выходе метода - список записей
    '''
    result = []
    for e in feed["entries"]:
        id = e['id'][21:].replace('/', '_')
        page = e['id']
        year = e['published'].split('-')[0]
        pdfurl = [l['href'] for l in e['links'] if l['type'] == 'application/pdf'][0]
        title = e['title'].replace('\n', '').replace('  ', ' ')
        authors = [a['name'] for a in e.authors]
        result.append({
            'id': id,
            'url': page,
            'year': year,
            'pdfurl': pdfurl,
            'title': title,
            'authors': authors
        })
    return result


def _filter_relevant_papers(feed, item, LD=10, IOU=.01):
    '''
    Keep only the found papers that clearly match the search record.

    feed - iterable of paper dicts with 'title' (str) and 'authors'
           (list of names), e.g. the output of _feed_to_papers
    item - query record with 'title' (str) and 'authors' (str or None),
           e.g. one element of parse_raw's output
    LD   - maximum allowed Levenshtein distance between the lowercased titles
    IOU  - minimum Jaccard index between the author-name token sets

    A candidate is rejected when its title distance exceeds LD. When both the
    query and the candidate provide author names, the candidate is also
    rejected if the Jaccard index of their name tokens is below IOU; when
    either side has no usable author tokens the author check is skipped.

    Returns the list of accepted papers. Accepted dicts are the same objects
    as in ``feed`` and get a 'source' key holding the query title.
    '''
    import Levenshtein

    def author_set(authors):
        # Normalise an authors value into a set of lowercase name tokens.
        # The previous check `authors is str` compared the value with the
        # type object itself and was never true, so every input produced an
        # empty result and the author filter was silently disabled.
        if not authors:
            return set()
        if isinstance(authors, str):
            authors = [authors]
        return {
            token.lower()
            for name in authors
            # Treat ',' and ';' as separators so "Smith, Doe" yields clean tokens.
            for token in name.replace(',', ' ').replace(';', ' ').split()
            # Tokens containing '.' (initials such as "J.") are ignored.
            if '.' not in token
        }

    title = item['title'].lower()
    query_authors = author_set(item['authors'])
    result = []
    for paper in feed:
        titles_dist = Levenshtein.distance(paper['title'].lower(), title)
        candidate_authors = author_set(paper['authors'])
        if candidate_authors and query_authors:
            iou = len(query_authors & candidate_authors) / len(query_authors | candidate_authors)
        else:
            # Nothing to compare on at least one side: do not filter by authors.
            iou = 1.
        if iou >= IOU and titles_dist <= LD:
            paper['source'] = item['title']
            result.append(paper)
    return result


def collect_paper_meta(sources):
    '''
    Search for every user-supplied record and return only the matching papers.

    For each record in ``sources`` the function queries the search service
    with the record's 'title', converts the response into paper records,
    filters them against the record, and appends the survivors to the result.
    Progress counters are printed after each stage.
    '''
    matched = []
    for query in sources:
        feed = _get_relevant_preprints(query['title'])
        # NOTE(review): this is len() of the whole response object, which may
        # not equal the number of entries - confirm what should be reported.
        print("Feed length:", len(feed))
        candidates = _feed_to_papers(feed)
        print("Candidate papers length:", len(candidates))
        kept = _filter_relevant_papers(candidates, query)
        print("Remaining candidates length:", len(kept))
        matched.extend(kept)
    return matched


def _download(url, filename):
    '''
    Download the file at ``url`` into the local path ``filename``.

    On a 4xx/5xx response an error line is printed and nothing is written;
    a 403 response additionally raises ``Exception``. Any other response has
    its body streamed to ``filename`` in 5 MiB chunks.
    '''
    import requests
    import shutil
    with requests.get(url, stream=True, allow_redirects=True) as r:
        if 400 <= r.status_code < 600:
            print(f"Error: {r.status_code}, {r.url}")
            if r.status_code == 403:
                raise Exception("We are banned by arxiv :(")
            return
        # Reading r.raw bypasses requests' own content decoding; without this
        # flag a gzip/deflate-encoded response would be saved still compressed.
        r.raw.decode_content = True
        with open(filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f, 1024 * 1024 * 5)


def _recognize(fromfile, tofile):
    '''
    Extract the text of document ``fromfile`` with textract and save it.

    The extracted text has its abbreviations detected and expanded and is
    then cleaned (all via ``texttools``) before being written to ``tofile``.

    Returns True on success. If textract fails, the error is printed and
    False is returned without writing ``tofile``.
    '''
    import textract
    try:
        raw = textract.process(fromfile, method='pdfminer')
    except Exception as e:
        # Best effort: a document that cannot be parsed is reported and
        # skipped. Exception (not BaseException) is caught so Ctrl-C and
        # interpreter exit are not swallowed.
        print(e)
        return False
    text = str(raw, encoding="utf8")

    abbr = texttools.detect_abbreviations(text)
    print("Abbreviations:", abbr)
    text = texttools.expand_abbreviations(text, abbr)
    text = texttools.clean_text(text)

    # Write with an explicit encoding: the text was decoded as UTF-8 and may
    # contain characters the platform's default encoding cannot represent.
    with open(tofile, 'w', encoding='utf-8') as txt:
        txt.write(text)
    return True


def download_and_parse_papers(index, folder="static/null", keepPDF=False, delay=3) -> str:
    '''
    Download the papers, extract their text and pack everything into an archive.

    index   - iterable of paper records with 'year', 'pdfurl' and 'id' keys
    folder  - working directory; created (with parents) if it does not exist
    keepPDF - when False, a PDF is deleted after its text was extracted
    delay   - seconds to sleep before each download

    Files are stored as ``<folder>/<year>/<id>.pdf`` and ``.txt``. A PDF that
    already exists and is larger than 16 KiB is considered downloaded and the
    record is skipped. Finally ``folder`` is zipped into ``<folder>/dump.zip``.

    Returns the archive file name ("dump.zip"), relative to ``folder``.
    '''
    import shutil
    import tempfile
    import time
    import os

    # makedirs handles nested paths such as the default "static/null" and
    # does not fail when the directory already exists.
    os.makedirs(folder, exist_ok=True)

    for item in index:
        yearfolder = os.path.join(folder, str(item['year']))
        os.makedirs(yearfolder, exist_ok=True)
        # '.pdf' is appended to the download URL
        url = item['pdfurl'].replace('http:', 'https:') + '.pdf'
        filename = os.path.join(yearfolder, item['id'] + '.pdf')
        # A small existing file is treated as a partial download and re-fetched.
        if os.path.exists(filename) and os.path.getsize(filename) > 16 * 1024:
            continue
        time.sleep(delay)

        _download(url, filename)
        # Swap only the extension; str.replace would also rewrite any other
        # '.pdf' occurring in the path.
        textfile = os.path.splitext(filename)[0] + '.txt'
        if not _recognize(filename, textfile):
            continue

        # remove the source file unless the caller asked to keep it
        if not keepPDF:
            os.remove(filename)

    # last step - archiving
    archfile = "dump"
    archive_path = os.path.join(folder, archfile + ".zip")
    # Drop a dump left by a previous run so it is not packed into the new one.
    if os.path.exists(archive_path):
        os.remove(archive_path)
    # Build the archive outside of `folder`: creating it inside the directory
    # being archived would make the zip include itself.
    with tempfile.TemporaryDirectory() as tmpdir:
        built = shutil.make_archive(os.path.join(tmpdir, archfile), 'zip', folder)
        shutil.move(built, archive_path)
    return archfile + ".zip"