Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.cache
*.egg-info
build/
dist/
venv*
/venv/
__pycache__
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Read more about SOTAWHAT [here](https://huyenchip.com/2018/10/04/sotawhat.html).

You can use sotawhat through a web interface [here](https://sotawhat.herokuapp.com/#/). Thanks hmchuong!

This script runs using Python 3. It requires ``nltk``, ``six``, and ``pyspellchecker``. To install it as a Python package, follow the following steps:
This script runs using Python 3 (Python 3.8.12 works best). It requires ``nltk``, ``six``, ``pyspellchecker``, and ``openai`` (if using the ``summarize`` or ``keyfindings`` arguments). To install it as a Python package, follow these steps:


Step 1: clone this repo, and go inside that repo:
Expand Down Expand Up @@ -66,3 +66,20 @@ We've found that this script works well with keywords that are:
+ a task (e.g. language model, machine translation, fuzzing, ...)
+ a metric (e.g. BLEU, perplexity, ...)
+ random stuff

## Summarization
You can also use the script to summarize a paper using GPT-3.5 once you have its URL from the step above. For example:

```bash
$ sotawhat summarize https://arxiv.org/abs/1809.04281
```

It uses the gpt-3.5-turbo-0125 model by default and will prompt for your OpenAI API key if the `OPENAI_API_KEY` environment variable is not set. You can get a key [here](https://platform.openai.com/signup). The simple prompt generates a 150-word summary of the paper to help you decide whether you want to read further.

You can also use the script to list the key findings of the paper, if you don't feel like leaving the command line interface, using:

```bash
$ sotawhat keyfindings https://arxiv.org/abs/1809.04281
```

The script works well with papers shorter than 20 pages or so, as the max token length is 16k. Any paper or document bigger than that will be clipped to 45000 characters and then summarized.
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
nltk
six
pyspellchecker
pyspellchecker
PyPDF2
openai
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
'abstracts and extract summaries from them. '),
url='https://huyenchip.com/2018/10/04/sotawhat.html',
license="",
install_requires=['six', 'nltk', 'pyspellchecker'],
install_requires=['six', 'nltk', 'pyspellchecker', 'PyPDF2', 'openai'],
entry_points={
'console_scripts': ['sotawhat=sotawhat.sotawhat:main'],
}
Expand Down
105 changes: 104 additions & 1 deletion sotawhat/sotawhat.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
import os
import re
import sys
import PyPDF2
import urllib.error
import urllib.request
import warnings
import requests
from io import BytesIO
import openai
from getpass import getpass


import nltk
from nltk.tokenize import word_tokenize
Expand Down Expand Up @@ -36,6 +42,11 @@ def get_authors(lines, i):
i += 1
return authors, i

# Source: https://github.com/chiphuyen/sotawhat/pull/33/files
def extract_first_arxiv_url(html_string):
    """Return the first ``https://...arxiv.org/...`` URL found in ``html_string``.

    The match is purely textual: it starts at ``https://``, must contain
    ``arxiv.org/`` and stops before the next double quote, which is how the
    URL is delimited inside an ``href="..."`` attribute.

    Args:
        html_string: A piece of HTML (typically one line of a results page).

    Returns:
        The matched URL as a string, or ``None`` when no such URL is present.
    """
    # ``[^"]*`` (zero or more) rather than ``[^"]+``: with ``+`` at least one
    # character was required between ``https://`` and ``arxiv.org``, so the
    # plain ``https://arxiv.org/abs/...`` form was never matched. Sub-domains
    # such as ``export.arxiv.org`` keep matching as before.
    pattern = r'https://[^"]*arxiv\.org/[^"]+'
    match = re.search(pattern, html_string)
    return match.group(0) if match else None

def get_next_result(lines, start):
"""
Expand All @@ -51,7 +62,7 @@ def get_next_result(lines, start):

result = {}
idx = lines[start + 3][10:].find('"')
result['main_page'] = lines[start + 3][9:10 + idx]
result['main_page'] = extract_first_arxiv_url(lines[start + 3])
idx = lines[start + 4][23:].find('"')
result['pdf'] = lines[start + 4][22: 23 + idx] + '.pdf'

Expand Down Expand Up @@ -272,6 +283,66 @@ def get_papers(keyword, num_results=5):
all_unshown.extend(unshown)
page += 1

def get_paper_content(pdf_url):
    """
    Download a PDF and return the text extracted from all of its pages.

    Args:
        pdf_url: URL of the PDF to download. An arXiv abstract link
            (``...arxiv.org/abs/<id>``) is rewritten to the corresponding
            ``...arxiv.org/pdf/<id>`` link first, because the abstract page
            is HTML and cannot be parsed as a PDF.

    Returns:
        The concatenated text of every page, in page order.

    Raises:
        RuntimeError: if the server answers with a status other than 200.
            Previously this case fell through and returned ``None``, which
            only surfaced later as an unrelated ``TypeError`` in the caller.
    """
    # Only the first ``arxiv.org/abs/`` occurrence is touched; any other URL
    # is passed through unchanged.
    pdf_url = re.sub(r'(arxiv\.org)/abs/', r'\1/pdf/', pdf_url, count=1)

    # A timeout keeps the command line tool from hanging forever on a
    # stalled connection.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code != 200:
        raise RuntimeError(
            f'Could not download {pdf_url} (HTTP status {response.status_code})'
        )

    pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))
    # ``or ''`` guards against a page for which no text could be extracted.
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

def summarise_paper(paper_content, model="gpt-3.5-turbo-0125", role="user"):
    """
    Summarise the paper text in about 150 words using an OpenAI chat model.

    Args:
        paper_content: Full text of the paper. Anything beyond the first
            45000 characters is dropped (with a warning) before the request
            is sent.
        model: Name of the OpenAI chat model to query.
        role: Role attached to the message that carries the paper text.

    Returns:
        The text content of the first choice in the model's response.
    """
    if len(paper_content) > 45000:
        warnings.warn(f'Paper content is too long - {len(paper_content)} characters. Using only first 45000 characters.')
        paper_content = paper_content[:45000]

    # The client used to be referenced here without ever being created, which
    # raised NameError on every call. Build it locally, the same way
    # key_findings() does; main() places the key in the OPENAI_API_KEY
    # environment variable before calling this function.
    openai_client = openai.OpenAI()

    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a text summarising system that provides easy to understand summaries of research papers."
            },
            {
                "role": role,
                "content": "Summarise paper text in 150 words: " + paper_content
            },
        ])

    return response.choices[0].message.content

def key_findings(paper_content, model="gpt-3.5-turbo-0125", role="user"):
    """
    Ask an OpenAI chat model to list the key findings of a paper.

    Args:
        paper_content: Full text of the paper. Text longer than 45000
            characters is cut down to its first 45000 characters and a
            warning is issued.
        model: Name of the OpenAI chat model to query.
        role: Role attached to the message that carries the paper text.

    Returns:
        The text content of the first choice in the model's response.
    """
    too_long = len(paper_content) > 45000
    if too_long:
        warnings.warn(f'Paper content is too long - {len(paper_content)} characters. Using only first 45000 characters.')
        paper_content = paper_content[:45000]

    client = openai.OpenAI()

    # Fixed instruction first, then the request that carries the paper text.
    system_message = {
        "role": "system",
        "content": "You are a academic paper parsing system that provides key findings/learnings of research papers.",
    }
    request_message = {
        "role": role,
        "content": "List down the key findings of the paper: " + paper_content,
    }

    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, request_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def main():
if 'nt' in os.name:
Expand All @@ -283,6 +354,38 @@ def main():
'If such errors occur, please install `win_unicode_consolde` via \n'
'the command `pip install win-unicode-console`.')

if len(sys.argv) > 1 and sys.argv[1] == 'summarize':

# Check for OpenAI API key only when 'summarize' is the argument
if 'OPENAI_API_KEY' not in os.environ:
openai_api_key = getpass('Please enter your OpenAI API Key: ')

os.environ['OPENAI_API_KEY'] = openai_api_key

if len(sys.argv) < 3:
raise ValueError('You must specify a paper url')
paper_url = sys.argv[2]
paper_content = get_paper_content(paper_url)
paper_summary = summarise_paper(paper_content)
print("Paper Summary: \n", paper_summary)
return

if len(sys.argv) > 1 and sys.argv[1] == 'keyfindings':

# Check for OpenAI API key only when 'keyfindings' is the argument
if 'OPENAI_API_KEY' not in os.environ:
openai_api_key = getpass('Please enter your OpenAI API Key: ')

os.environ['OPENAI_API_KEY'] = openai_api_key

if len(sys.argv) < 3:
raise ValueError('You must specify a paper url')
paper_url = sys.argv[2]
paper_content = get_paper_content(paper_url)
paper_summary = key_findings(paper_content)
print("Key Findings: \n", paper_summary)
return

if len(sys.argv) < 2:
raise ValueError('You must specify a keyword')

Expand Down