# Source: PdbManager/importFIS.py at master · joseantem/PdbManager · GitHub
"""
Created on Feb25 2019
@author: Jerónimo Arenas García

Import FIS Database to MySQL DB

    * Creating database from crawling the FIS portal

"""

import argparse
import configparser

from dbmanager.FISmanager import FISmanager

import ipdb
import time
import os

import re

def _compile_non_bmp_pattern():
    """Build the pattern used by `clean_utf8`.

    The first spelling is a single character class covering U+10000 to
    U+10FFFF. If the `re` module rejects it, the surrogate-pair spelling of
    the same range is compiled instead.
    """
    try:
        # UCS-4
        return re.compile('[\U00010000-\U0010ffff]')
    except re.error:
        # UCS-2
        return re.compile('[\uD800-\uDBFF][\uDC00-\uDFFF]')


# Compiled once at import time and shared by every call to clean_utf8().
regex = _compile_non_bmp_pattern()


def clean_utf8(rawdata):
    """Return `rawdata` with every match of `regex` replaced by one space.

    Each matched character becomes exactly one blank, so N consecutive
    matches yield N blanks.
    """
    cleaned = re.sub(regex, ' ', rawdata)
    return cleaned


def _project_id_from_url(url):
    """Return the project id embedded in a FIS project URL, or None.

    The id is the text following 'idProyecto=' with every '%2f' replaced by
    '_' so that it can be used as a file name. None is returned when the URL
    does not contain 'idProyecto='.
    """
    parts = url.split('idProyecto=')
    if len(parts) < 2:
        return None
    return parts[1].replace('%2f', '_')


def _collect_project_urls(browser, ttsleep):
    """Run a search on the FIS portal and return the project links found.

    Args:
        browser: Selenium WebDriver instance used to drive the portal.
        ttsleep: Seconds to wait after each action that loads a page.

    Returns:
        List with the 'href' of every <a> element of class 'enlaceProyecto'
        found across all result pages.
    """
    # Imported lazily so that selenium is only required when downloading.
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    # All the controls of the search form share this generated id prefix.
    id_prefix = 'ctl00_ctl34_g_b8905950_4e9a_4a7e_9d2d_d728f1b64287_'

    browser.get('https://portalfis.isciii.es/es/Paginas/Busqueda.aspx')
    # Fill in the search field, tick the exact-match checkbox and submit.
    # find_element(By.ID, ...) replaces find_element_by_id(), which is no
    # longer available in recent Selenium releases.
    browser.find_element(By.ID, id_prefix + 'txtBusqueda').send_keys('de')
    browser.find_element(By.ID, id_prefix + 'chkCoincidenciaExacta').click()
    browser.find_element(By.ID, id_prefix + 'btnBuscar').click()
    time.sleep(ttsleep)

    urls = []
    while True:
        # Get the links to all project pages listed in the current page
        for link in browser.find_elements(By.CLASS_NAME, 'enlaceProyecto'):
            if link.tag_name == 'a':
                href = link.get_attribute('href')
                # Anchors without href would break the '\n'.join() done
                # when the list is saved, so they are dropped here.
                if href:
                    urls.append(href)

        # Move on with the "next page" button; no button means last page
        try:
            next_button = browser.find_element(
                By.ID, id_prefix + 'ctl00_lnbSiguiente')
        except NoSuchElementException:
            break
        next_button.click()
        time.sleep(ttsleep)

    return urls


def _download_project_pages(browser, allUrls, data_folder, ttsleep):
    """Save the HTML of every project in `allUrls` under `data_folder`.

    Projects whose '<id>.html' file already exists are not fetched again.
    URLs from which no project id can be extracted are reported and skipped.
    """
    for url in allUrls:
        idProyecto = _project_id_from_url(url)
        if idProyecto is None:
            print('Skipping URL without idProyecto parameter:', url)
            continue

        html_file = os.path.join(data_folder, idProyecto + '.html')
        if os.path.isfile(html_file):
            print('Ya se ha descargado el proyecto:', idProyecto)
        else:
            browser.get(url)
            time.sleep(ttsleep)
            with open(html_file, 'w') as fout:
                fout.write(browser.page_source)


def _lemmatize_papers(DB, cf, lemmas_query):
    """Lemmatize title + abstract of the stored papers, chunk by chunk.

    Args:
        DB: Database manager; its readDBtable() and setField() are used.
        cf: ConfigParser holding the 'Lemmatizer' section.
        lemmas_query: Optional SQL condition that is ANDed to the paperID
            filter. It is concatenated verbatim, so it must come from a
            trusted operator.
    """
    # To avoid collapsing the server, papers are read and processed in
    # chunks of `chunksize` rows
    chunksize = 25000
    lemmas_server = cf.get('Lemmatizer', 'server')
    stw_file = cf.get('Lemmatizer', 'stw_file')
    dict_eq_file = cf.get('Lemmatizer', 'dict_eq_file')
    POS = cf.get('Lemmatizer', 'POS')
    concurrent_posts = cf.getint('Lemmatizer', 'concurrent_posts')
    # Only the exact string 'True' enables these two options
    removenumbers = cf.get('Lemmatizer', 'removenumbers') == 'True'
    keepSentence = cf.get('Lemmatizer', 'keepSentence') == 'True'

    # Initialize lemmatizer
    # NOTE(review): ENLemmatizer is not imported in the visible part of this
    # file; unless it is provided elsewhere this line raises NameError.
    # Confirm the module it lives in and import it.
    ENLM = ENLemmatizer(lemmas_server=lemmas_server, stw_file=stw_file,
                        dict_eq_file=dict_eq_file, POS=POS,
                        removenumbers=removenumbers,
                        keepSentence=keepSentence)

    # NOTE(review): table and column names ('S2papers', 'paperID', ...) look
    # inherited from another importer -- confirm they exist in the FIS schema.
    selectOptions = 'paperID, title, paperAbstract'

    def read_chunk(last_id):
        """Read the next `chunksize` papers with paperID above `last_id`."""
        filterOptions = 'paperID>' + str(last_id)
        if lemmas_query:
            filterOptions = filterOptions + ' AND ' + lemmas_query
        return DB.readDBtable('S2papers', limit=chunksize,
                              selectOptions=selectOptions,
                              filterOptions=filterOptions,
                              orderOptions='paperID ASC')

    cont = 0
    init_time = time.time()
    df = read_chunk(0)
    # len() is used on purpose: the truth value of a DataFrame is ambiguous
    while len(df):
        cont = cont + len(df)

        # Next time, we will read from the largest paperID. This is the
        # last row of the dataframe, given that we requested an ordered df.
        # .iloc is positional, so it does not depend on the index labels.
        largest_id = df['paperID'].iloc[-1]
        print('Number of articles processed:', cont)
        print('Last Article Id read:', largest_id)

        df['alltext'] = df['title'] + '. ' + df['paperAbstract']
        df['alltext'] = df['alltext'].apply(clean_utf8)
        lemasBatch = ENLM.lemmatizeBatch(
            df[['paperID', 'alltext']].values.tolist(),
            processes=concurrent_posts)
        # Remove entries that were not lemmatized correctly (empty result)
        lemasBatch = [[el[0], clean_utf8(el[1])]
                      for el in lemasBatch if len(el[1])]
        print('Successful lemmatized documents:', len(lemasBatch))
        DB.setField('S2papers', 'paperID', ['LEMAS'], lemasBatch)

        df = read_chunk(largest_id)
        elapsed_time = time.time() - init_time
        print('Elapsed Time (seconds):',
              time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))


def main(download=False, resetDB=False, importData=False,
         lemmatize=False, lemmas_query=None):
    """Download the FIS portal data and/or load it into the database.

    All settings are read from 'config.cf' in the current working directory
    (sections 'DB' and 'FIS', plus 'Lemmatizer' when lemmatizing). The
    requested steps are run in this order: download, reset, import,
    lemmatize.

    Args:
        download: If True, crawl the FIS portal with Selenium/Firefox and
            save one HTML file per project in the configured folder.
        resetDB: If True, delete all database tables and create them again.
        importData: If True, import the downloaded data into the database.
        lemmatize: If True, lemmatize titles and abstracts of stored papers.
        lemmas_query: Optional SQL condition restricting which papers are
            lemmatized.

    Raises:
        configparser.Error: If a required section or option is missing.
    """

    cf = configparser.ConfigParser()
    cf.read('config.cf')

    #########################
    # Configuration variables
    #
    dbUSER = cf.get('DB', 'dbUSER')
    dbPASS = cf.get('DB', 'dbPASS')
    dbSERVER = cf.get('DB', 'dbSERVER')
    dbCONNECTOR = cf.get('DB', 'dbCONNECTOR')
    dbSOCKET = cf.get('DB', 'dbSOCKET')
    dbNAME = cf.get('FIS', 'dbNAME')

    #########################
    # Datafiles
    #
    data_folder = cf.get('FIS', 'download_folder')
    ttsleep = cf.getint('FIS', 'ttsleep')

    ####################################################
    # 1. Data download

    if download:
        # Import is only carried out if download option is activated;
        # this avoids error messages if you already have the files
        # but do not have selenium installed in your system
        from selenium import webdriver
        browser = webdriver.Firefox()

        # The browser is always shut down, even if the crawl fails midway.
        try:
            # Step 1: Retrieve all valid project URLs from the FIS portal
            urls_file = os.path.join(data_folder, 'allUrls.txt')
            if os.path.isfile(urls_file):
                print('Reading URLs from file', urls_file)
                print('Remove the file if you want to retrieve project URLs again')
                print('\n')
                with open(urls_file, 'r') as fin:
                    # Blank lines carry no URL and are ignored
                    allUrls = [line.strip() for line in fin if line.strip()]
            else:
                allUrls = _collect_project_urls(browser, ttsleep)
                # Save retrieved URLs in predefined file
                with open(urls_file, 'w') as fout:
                    fout.write('\n'.join(allUrls))

            # Step 2: Download project HTML page for all available projects
            _download_project_pages(browser, allUrls, data_folder, ttsleep)
        finally:
            # quit() ends the whole WebDriver session, not just the window
            browser.quit()

    ####################################################
    # 2. Database connection

    if resetDB or importData or lemmatize:

        db_settings = dict(db_name=dbNAME, db_connector=dbCONNECTOR,
                           path2db=None, db_server=dbSERVER,
                           db_user=dbUSER, db_password=dbPASS)
        # The socket is only passed on when one is configured
        if dbSOCKET:
            print('socket')
            db_settings['unix_socket'] = dbSOCKET
        else:
            print('tcp')
        DB = FISmanager(**db_settings)

    ####################################################
    # 3. If activated, remove and create again database tables
    if resetDB:
        print('Regenerating the database. Existing data will be removed.')
        # The following methods delete all existing tables, and create them
        # again without data
        DB.deleteDBtables()
        DB.createDBschema()

    ####################################################
    # 4. If activated project information will be inserted in the table
    if importData:
        print('Importing projects data ...')
        DB.importData(data_folder)

    ####################################################
    # 5. If activated, will carry out lemmas extraction for the
    # imported papers
    if lemmatize:
        print('Lemmatizing Titles and Abstracts ...')
        _lemmatize_papers(DB, cf, lemmas_query)

    return


if __name__ == "__main__":

    parser = argparse.ArgumentParser(prog='importFIS')
    parser.add_argument('--download', action='store_true', help='If activated, download data from FIS portal')
    parser.add_argument('--resetDB', action='store_true', help='If activated, the database will be reset and re-created')
    parser.add_argument('--importData', action='store_true', help='If activated, import downloaded data into database')
    parser.add_argument('--lemmatize', action='store_true', help='If activated, lemmatize database')
    parser.add_argument('--lemmas_query', type=str, dest='lemmas_query', help='Query for DB elements to lemmatize')
    parser.set_defaults(lemmas_query=None)
    args = parser.parse_args()

    main(download=args.download, resetDB=args.resetDB, importData=args.importData,
         lemmatize=args.lemmatize, lemmas_query=args.lemmas_query)