From 71d9cebd4072fd0cc97d8d910e7874467e345e2b Mon Sep 17 00:00:00 2001
From: Jeffery <jeffery1236@gmail.com>
Date: Sat, 19 Aug 2023 10:43:42 -0400
Subject: [PATCH] added regex to extract first arxiv url from line instead of
 using hardcoded indices

---
 sotawhat/sotawhat.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sotawhat/sotawhat.py b/sotawhat/sotawhat.py
index 8c9d6f1..c37a742 100644
--- a/sotawhat/sotawhat.py
+++ b/sotawhat/sotawhat.py
@@ -48,10 +48,14 @@ def get_next_result(lines, start):
     + 'authors': []
     + 'abstract': str
     """
+    def extract_first_arxiv_url(html_string):
+        pattern = r'https://arxiv\.org/[^"]+' # Matches an https://arxiv.org/ URL up to, but not including, the next double quote
+        match = re.search(pattern, html_string)  # re.search yields the leftmost match in html_string
+        return match.group(0) if match else None  # None when html_string contains no such URL
 
     result = {}
     idx = lines[start + 3][10:].find('"')
-    result['main_page'] = lines[start + 3][9:10 + idx]
+    result['main_page'] = extract_first_arxiv_url(lines[start + 3])
     idx = lines[start + 4][23:].find('"')
     result['pdf'] = lines[start + 4][22: 23 + idx] + '.pdf'