From 35d3810f494f8b12ddda852b60109954bfa82b76 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Tue, 3 Sep 2019 16:20:52 -0700 Subject: [PATCH 1/2] recover paper ids --- scripts/recover_paperids.py | 61 +++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 scripts/recover_paperids.py diff --git a/scripts/recover_paperids.py b/scripts/recover_paperids.py new file mode 100644 index 0000000..7858533 --- /dev/null +++ b/scripts/recover_paperids.py @@ -0,0 +1,61 @@ +from s2base import elastic +import os +import glob +import datetime + + +es = elastic.default_es_client(ES_URL=elastic.paths.ES_URL_DEV) +in_dir = 'raw_text_files/out_without_DONE' +out_dir = 'raw_text_files/paper_ids' +for in_filename in sorted(glob.glob(f'{in_dir}/*.out')): + + filename = in_filename.split('/')[-1] + out_filename = f'{out_dir}/{filename}' + + if os.path.exists(out_filename): + print(f'{str(datetime.datetime.now())} == SKIPPING: {in_filename}. Output: {out_filename}') + continue + else: + print(f'{str(datetime.datetime.now())} == Processing: {in_filename}. Output: {out_filename}') + + with open(in_filename) as f: + with open(out_filename, 'w') as fout: + line_index = -1 + paper_count = 0 + found_count = 0 + found = False + for line in f: + line = line.strip() + + if (line == ""): + fout.write(f'{paper_id}\n') + if found: + found_count += 1 + paper_count += 1 + + if paper_count % 500 == 0: + print(f'{str(datetime.datetime.now())} == {found_count}/{paper_count}') + + line_index = -1 + found = False + continue + + line_index += 1 + + if line_index >= 2: + continue + + if not found: + r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "paperAbstract": {"query": line, "operator" : "and"}}}}, _source_include='_id') + hits_count = r['hits']['total'] + paper_id = 'no paper found' + if hits_count > 0: + paper_id = r['hits']['hits'][0]['_id'] + found = True + + if not found: + r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "bodyText": {"query": line, "operator" : "and"}}}}, _source_include='_id') + hits_count = r['hits']['total'] + if hits_count > 0: + paper_id = r['hits']['hits'][0]['_id'] + found = True From 41510f5c80e51afb96471be651841700d22133e7 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 4 Sep 2019 07:16:48 -0700 Subject: [PATCH 2/2] handle ES exceptions --- scripts/recover_paperids.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/scripts/recover_paperids.py b/scripts/recover_paperids.py index 7858533..f0ae156 100644 --- a/scripts/recover_paperids.py +++ b/scripts/recover_paperids.py @@ -46,16 +46,22 @@ continue if not found: - r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "paperAbstract": {"query": line, "operator" : "and"}}}}, _source_include='_id') - hits_count = r['hits']['total'] - paper_id = 'no paper found' - if hits_count > 0: - paper_id = r['hits']['hits'][0]['_id'] - found = True + try: + r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "paperAbstract": {"query": line, "operator" : "and"}}}}, _source_include='_id') + hits_count = r['hits']['total'] + paper_id = 'no paper found' + if hits_count > 0: + paper_id = r['hits']['hits'][0]['_id'] + found = True + except Exception as e: + print(e) if not found: - r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "bodyText": {"query": line, "operator" : "and"}}}}, _source_include='_id') - hits_count = r['hits']['total'] - if hits_count > 0: - paper_id = r['hits']['hits'][0]['_id'] - found = True + try: + r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "bodyText": {"query": line, "operator" : "and"}}}}, _source_include='_id') + hits_count = r['hits']['total'] + if hits_count > 0: + paper_id = r['hits']['hits'][0]['_id'] + found = True + except Exception as e: + print(e)