From 35d3810f494f8b12ddda852b60109954bfa82b76 Mon Sep 17 00:00:00 2001
From: Iz Beltagy <beltagy@allenai.org>
Date: Tue, 3 Sep 2019 16:20:52 -0700
Subject: [PATCH 1/2] recover paper ids

---
 scripts/recover_paperids.py | 61 +++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 scripts/recover_paperids.py

diff --git a/scripts/recover_paperids.py b/scripts/recover_paperids.py
new file mode 100644
index 0000000..7858533
--- /dev/null
+++ b/scripts/recover_paperids.py
@@ -0,0 +1,61 @@
+from s2base import elastic
+import os
+import glob
+import datetime
+
+
+es = elastic.default_es_client(ES_URL=elastic.paths.ES_URL_DEV)  # client is configured with elastic.paths.ES_URL_DEV
+in_dir = 'raw_text_files/out_without_DONE'
+out_dir = 'raw_text_files/paper_ids'
+for in_filename in sorted(glob.glob(f'{in_dir}/*.out')):  # one pass per *.out input, in sorted name order
+
+    filename = in_filename.split('/')[-1]
+    out_filename = f'{out_dir}/{filename}'
+   
+    if os.path.exists(out_filename):  # an existing output file means this input is skipped
+        print(f'{str(datetime.datetime.now())} == SKIPPING: {in_filename}. Output: {out_filename}')
+        continue
+    else:
+        print(f'{str(datetime.datetime.now())} == Processing: {in_filename}. Output: {out_filename}')
+        
+    with open(in_filename) as f:
+        with open(out_filename, 'w') as fout:
+            line_index = -1  # index of the current line within the current blank-line-delimited group
+            paper_count = 0  # groups terminated by a blank line so far
+            found_count = 0  # groups for which a search returned at least one hit
+            found = False
+            for line in f:
+                line = line.strip()
+
+                if (line == ""):  # a blank line ends the group: emit one id line and reset the state
+                    fout.write(f'{paper_id}\n')  # NOTE(review): paper_id is only bound once a non-blank line has been searched; a blank line before that raises NameError
+                    if found:
+                        found_count  += 1
+                    paper_count += 1
+
+                    if paper_count % 500 == 0:  # progress report every 500 groups
+                        print(f'{str(datetime.datetime.now())} == {found_count}/{paper_count}')
+
+                    line_index = -1
+                    found = False
+                    continue
+
+                line_index += 1
+
+                if line_index >= 2:  # only the first two lines of a group are used as queries
+                    continue
+
+                if not found:
+                    r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "paperAbstract": {"query": line, "operator" : "and"}}}}, _source_include='_id')
+                    hits_count = r['hits']['total']
+                    paper_id = 'no paper found'
+                    if hits_count > 0:
+                        paper_id = r['hits']['hits'][0]['_id']
+                        found = True
+
+                if not found:
+                    r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "bodyText": {"query": line, "operator" : "and"}}}}, _source_include='_id')
+                    hits_count = r['hits']['total']
+                    if hits_count > 0:
+                        paper_id = r['hits']['hits'][0]['_id']
+                        found = True

From 41510f5c80e51afb96471be651841700d22133e7 Mon Sep 17 00:00:00 2001
From: Iz Beltagy <beltagy@allenai.org>
Date: Wed, 4 Sep 2019 07:16:48 -0700
Subject: [PATCH 2/2] handle ES exceptions

---
 scripts/recover_paperids.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/scripts/recover_paperids.py b/scripts/recover_paperids.py
index 7858533..f0ae156 100644
--- a/scripts/recover_paperids.py
+++ b/scripts/recover_paperids.py
@@ -46,16 +46,22 @@
                     continue
 
                 if not found:
-                    r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "paperAbstract": {"query": line, "operator" : "and"}}}}, _source_include='_id')
-                    hits_count = r['hits']['total']
-                    paper_id = 'no paper found'
-                    if hits_count > 0:
-                        paper_id = r['hits']['hits'][0]['_id']
-                        found = True
+                    paper_id = 'no paper found'  # reset before querying so a failed search cannot leave the previous paper's id in place
+                    try:  # first attempt: every term of the line must match the abstract
+                        r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "paperAbstract": {"query": line, "operator" : "and"}}}}, _source_include='_id')
+                        hits_count = r['hits']['total']
+                        if hits_count > 0:
+                            paper_id = r['hits']['hits'][0]['_id']
+                            found = True
+                    except Exception as e:  # best effort: report the failure and keep processing
+                        print(e)
 
                 if not found:
-                    r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "bodyText": {"query": line, "operator" : "and"}}}}, _source_include='_id')
-                    hits_count = r['hits']['total']
-                    if hits_count > 0:
-                        paper_id = r['hits']['hits'][0]['_id']
-                        found = True
+                    try:  # fallback: match the same line against the body text instead
+                        r = es.search(index='paper', doc_type='paper', body={"query": {"match": { "bodyText": {"query": line, "operator" : "and"}}}}, _source_include='_id')
+                        hits_count = r['hits']['total']
+                        if hits_count > 0:
+                            paper_id = r['hits']['hits'][0]['_id']
+                            found = True
+                    except Exception as e:  # best effort: report the failure and keep processing
+                        print(e)