From 71d9cebd4072fd0cc97d8d910e7874467e345e2b Mon Sep 17 00:00:00 2001 From: Jeffery Date: Sat, 19 Aug 2023 10:43:42 -0400 Subject: [PATCH] added regex to extract first arxiv url from line instead of using hardcoded indices --- sotawhat/sotawhat.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sotawhat/sotawhat.py b/sotawhat/sotawhat.py index 8c9d6f1..c37a742 100644 --- a/sotawhat/sotawhat.py +++ b/sotawhat/sotawhat.py @@ -48,10 +48,14 @@ def get_next_result(lines, start): + 'authors': [] + 'abstract': str """ + def extract_first_arxiv_url(html_string): + pattern = r'https://arxiv\.org/[^"]+' # Regular expression to match URLs from the arxiv.org domain + match = re.search(pattern, html_string) + return match.group(0) if match else None result = {} idx = lines[start + 3][10:].find('"') - result['main_page'] = lines[start + 3][9:10 + idx] + result['main_page'] = extract_first_arxiv_url(lines[start + 3]) idx = lines[start + 4][23:].find('"') result['pdf'] = lines[start + 4][22: 23 + idx] + '.pdf'