-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessing_queries.py
More file actions
62 lines (54 loc) · 2.35 KB
/
processing_queries.py
File metadata and controls
62 lines (54 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""Main script for processing e-prints and running opinion analysis on them.

To retrieve and update the local copy of the arXiv files, use:
    gsutil -m rsync -r gs://arxiv-dataset/arxiv/ dest/
"""
import argparse
import os
from pathlib import PurePath, Path
import pandas as pd
from src.processing.opinion import OpinionProcessing
def url_to_arxiv_folder(path: str, arxiv_path: PurePath) -> PurePath:
    """Resolve a path read from a query file against the local arXiv folder.

    Args:
        path: One value of the ``arxiv_path`` column of a query CSV file.
        arxiv_path: Root folder of the local arXiv copy.

    Returns:
        ``arxiv_path`` joined with ``path``. Note that, per ``pathlib``
        semantics, an absolute ``path`` replaces ``arxiv_path`` entirely.
    """
    return arxiv_path.joinpath(path)


def _available_cpu_count() -> int:
    """Return the number of CPUs this process may use (at least 1)."""
    try:
        # Honors CPU affinity masks (taskset, cgroups, ...) where supported.
        return len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity only exists on some Unix platforms; it is
        # missing on Windows and macOS, so fall back to the machine total.
        return os.cpu_count() or 1


def _build_parser() -> argparse.ArgumentParser:
    """Build the command line parser; every path option yields a PurePath."""
    parser = argparse.ArgumentParser(
        description='Processes opinion on e-prints database.')
    # type=PurePath normalizes user-supplied strings for Windows
    # compatibility; the PurePath defaults are passed through unchanged.
    parser.add_argument('--arxiv',
                        dest='arxiv_path',
                        action='store',
                        type=PurePath,
                        help='path to the arxiv folder',
                        default=PurePath('/mnt/arxiv'))
    parser.add_argument('--eprints',
                        dest='eprints_path',
                        action='store',
                        type=PurePath,
                        help='path to the e-prints folder',
                        default=PurePath('data').joinpath(
                            'query_e-prints_2021_11_11'))
    parser.add_argument('--output',
                        dest='output',
                        action='store',
                        type=PurePath,
                        help='path to the output folder',
                        default=PurePath('result'))
    return parser


def main(argv=None) -> None:
    """Run the opinion processing on every query file of the e-prints folder.

    Args:
        argv: Optional list of command line arguments (without the program
            name). ``None`` makes argparse read ``sys.argv[1:]``.

    Side effects:
        Creates the output folder (and its parents) if needed, then reads
        each file of the e-prints folder as CSV and hands its ``arxiv_path``
        column to ``OpinionProcessing``.
    """
    args = _build_parser().parse_args(argv)

    # Make sure the output folder exists before any processing starts.
    Path(args.output).mkdir(parents=True, exist_ok=True)

    # Loop-invariant, so computed once.
    # NOTE(review): presumably used by OpinionProcessing.processing as the
    # number of parallel workers -- confirm against its implementation.
    cpu_count = _available_cpu_count()

    # Query files are handled in sorted (lexicographic) file name order.
    for query in sorted(os.listdir(args.eprints_path)):
        # Characters 3-4 of the file name.
        # NOTE(review): presumably a two-character code embedded in the
        # query file naming scheme -- confirm.
        crtc_name = query[3:3 + 2]

        # Read everything as strings, then point each entry at the local
        # arXiv folder.
        df = pd.read_csv(args.eprints_path.joinpath(query), dtype='str')
        df['arxiv_path'] = df['arxiv_path'].apply(url_to_arxiv_folder,
                                                  args=(args.arxiv_path,))

        # Init and run the opinion analysis for this query file.
        opinion_analysis = OpinionProcessing(df['arxiv_path'],
                                             args.output, crtc_name)
        opinion_analysis.processing(cpu_count)


if __name__ == '__main__':
    main()