chiphuyen · mayanksingh09 · Aug 25, 2023 · Feb 2, 2024 · Feb 2, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+.cache
+*.egg-info
+build/
+dist/
+venv*
+/venv/
+__pycache__
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ Read more about SOTAWHAT [here](https://huyenchip.com/2018/10/04/sotawhat.html).
 
 You can use sotawhat through a web interface [here](https://sotawhat.herokuapp.com/#/). Thanks hmchuong!
 
-This script runs using Python 3. It requires ``nltk``, ``six``, and ``pyspellchecker``. To install it as a Python package, follow the following steps:
+This script runs using Python 3 (Python 3.8.12 works best). It requires ``nltk``, ``six``, ``pyspellchecker``, ``PyPDF2``, ``requests``, and ``openai`` (the last three are used by the summarize and keyfindings commands). To install it as a Python package, follow these steps:
 
 
 Step 1: clone this repo, and go inside that repo:
@@ -66,3 +66,20 @@ We've found that this script works well with keywords that are:
 + a task (e.g. language model, machine translation, fuzzing, ...)
 + a metric (e.g. BLEU, perplexity, ...)
 + random stuff
+
+## Summarization
+You can also use the script to summarize a paper with GPT-3.5 once you have its URL from the step above. For example:
+
+```bash
+$ sotawhat summarize https://arxiv.org/abs/1809.04281
+```
+
+It uses the gpt-3.5-turbo-0125 model and prompts for your OpenAI API key if the ``OPENAI_API_KEY`` environment variable is not set. You can get a key [here](https://platform.openai.com/signup). The simple prompt generates a 150-word summary of the paper to help you decide whether you want to read further.
+
+If you don't feel like leaving the command line, you can also use the script to list the key findings of the paper:
+
+```bash
+$ sotawhat keyfindings https://arxiv.org/abs/1809.04281
+```
+
+The script works best with papers shorter than 20 pages or so because the model's context window is 16k tokens; longer papers or documents are clipped to their first 45000 characters before being sent to the model.
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,5 @@
 nltk
 six
-pyspellchecker
+pyspellchecker 
+PyPDF2
+openai
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
                          'abstracts and extract summaries from them. '),
     url='https://huyenchip.com/2018/10/04/sotawhat.html',
     license="",
-    install_requires=['six', 'nltk', 'pyspellchecker'],
+    install_requires=['six', 'nltk', 'pyspellchecker', 'PyPDF2', 'openai'],
     entry_points={
         'console_scripts': ['sotawhat=sotawhat.sotawhat:main'],
     }

diff --git a/sotawhat/sotawhat.py b/sotawhat/sotawhat.py
@@ -1,9 +1,15 @@
 import os
 import re
 import sys
+import PyPDF2
 import urllib.error
 import urllib.request
 import warnings
+import requests
+from io import BytesIO
+import openai
+from getpass import getpass
+
 
 import nltk
 from nltk.tokenize import word_tokenize
@@ -36,6 +42,11 @@ def get_authors(lines, i):
         i += 1
     return authors, i
 
+# Source: https://github.com/chiphuyen/sotawhat/pull/33/files
+def extract_first_arxiv_url(html_string):
+    """Return the first https://...arxiv.org/... URL in html_string (up to the next double quote), or None."""
+    match = re.search(r'https://[^"]*arxiv\.org/[^"]+', html_string)  # `*`, not `+`: a bare https://arxiv.org/ link must match too
+    return match.group(0) if match else None
 
 def get_next_result(lines, start):
     """
@@ -51,7 +62,7 @@ def get_next_result(lines, start):
 
     result = {}
     idx = lines[start + 3][10:].find('"')
-    result['main_page'] = lines[start + 3][9:10 + idx]
+    result['main_page'] = extract_first_arxiv_url(lines[start + 3])
     idx = lines[start + 4][23:].find('"')
     result['pdf'] = lines[start + 4][22: 23 + idx] + '.pdf'
 
@@ -272,6 +283,66 @@ def get_papers(keyword, num_results=5):
             all_unshown.extend(unshown)
         page += 1
 
+def get_paper_content(pdf_url):
+    """
+    Download the PDF at pdf_url and return the text of all its pages.
+
+    arXiv abstract links (arxiv.org/abs/...) are rewritten to the matching
+    arxiv.org/pdf/... link first. Raises requests.HTTPError when the
+    download fails, rather than a NameError on an unbound reader.
+    """
+    # Abstract pages are HTML, which PyPDF2 cannot parse; fetch the PDF instead.
+    pdf_url = pdf_url.replace('arxiv.org/abs/', 'arxiv.org/pdf/', 1)
+    response = requests.get(pdf_url)
+    response.raise_for_status()
+    pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))
+
+    # Guard against an empty/None result for pages with no extractable text.
+    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
+
+def summarise_paper(paper_content, model="gpt-3.5-turbo-0125", role="user"):
+    """
+    Summarise paper_content in about 150 words with an OpenAI chat model.
+
+    Only the first 45000 characters are sent (a warning is issued when text
+    is dropped). Returns the content of the first completion choice.
+    """
+    if len(paper_content) > 45000:
+        warnings.warn(f'Paper content is too long - {len(paper_content)} characters. Using only first 45000 characters.')
+        paper_content = paper_content[:45000]
+    # Create the client locally (as key_findings does) so the name is always bound.
+    openai_client = openai.OpenAI()
+    system_prompt = "You are a text summarising system that provides easy to understand summaries of research papers."
+    response = openai_client.chat.completions.create(
+        model=model,
+        messages=[{"role": "system", "content": system_prompt},
+                  {"role": role, "content": "Summarise paper text in 150 words: " + paper_content}])
+
+    return response.choices[0].message.content
+
+def key_findings(paper_content, model="gpt-3.5-turbo-0125", role="user"):
+    """
+    Ask an OpenAI chat model to list the key findings of a paper.
+
+    Text beyond 45000 characters is dropped (with a warning) before the
+    request; the content of the first returned choice is handed back.
+    """
+    max_chars = 45000
+    if len(paper_content) > max_chars:
+        warnings.warn(f'Paper content is too long - {len(paper_content)} characters. Using only first 45000 characters.')
+        paper_content = paper_content[:max_chars]
+
+    system_message = {
+        "role": "system",
+        "content": "You are a academic paper parsing system that provides key findings/learnings of research papers.",
+    }
+    user_message = {"role": role, "content": "List down the key findings of the paper: " + paper_content}
+
+    client = openai.OpenAI()
+    completion = client.chat.completions.create(model=model, messages=[system_message, user_message])
+
+    return completion.choices[0].message.content
+
 
 def main():
     if 'nt' in os.name:
@@ -283,6 +354,38 @@ def main():
                           'If such errors occur, please install `win_unicode_consolde` via \n'
                           'the command `pip install win-unicode-console`.')
 
+    if len(sys.argv) > 1 and sys.argv[1] == 'summarize':
+        # Check the arguments first so a missing url fails before any prompt.
+        if len(sys.argv) < 3:
+            raise ValueError('You must specify a paper url')
+        paper_url = sys.argv[2]
+
+        # Prompt for the OpenAI API key only when the environment lacks one.
+        # The env var is set inside this branch: outside it there is no
+        # prompted value to store when the key was already exported.
+        if 'OPENAI_API_KEY' not in os.environ:
+            os.environ['OPENAI_API_KEY'] = getpass('Please enter your OpenAI API Key: ')
+
+        paper_content = get_paper_content(paper_url)
+        print("Paper Summary: \n", summarise_paper(paper_content))
+        return
+
+    if len(sys.argv) > 1 and sys.argv[1] == 'keyfindings':
+        # Same flow as 'summarize': validate the url argument, then make
+        # sure an API key is available, then query the model.
+        if len(sys.argv) < 3:
+            raise ValueError('You must specify a paper url')
+        paper_url = sys.argv[2]
+
+        # Only prompt (and only write the env var) when no key is set yet;
+        # an unconditional write would reference an unbound local name.
+        if 'OPENAI_API_KEY' not in os.environ:
+            os.environ['OPENAI_API_KEY'] = getpass('Please enter your OpenAI API Key: ')
+
+        paper_content = get_paper_content(paper_url)
+        print("Key Findings: \n", key_findings(paper_content))
+        return
+
     if len(sys.argv) < 2:
         raise ValueError('You must specify a keyword')