diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5789772 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.cache +*.egg-info +build/ +dist/ +venv* +/venv/ +__pycache__ \ No newline at end of file diff --git a/README.md b/README.md index d641267..cde46e2 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Read more about SOTAWHAT [here](https://huyenchip.com/2018/10/04/sotawhat.html). You can use sotawhat through a web interface [here](https://sotawhat.herokuapp.com/#/). Thanks hmchuong! -This script runs using Python 3. It requires ``nltk``, ``six``, and ``pyspellchecker``. To install it as a Python package, follow the following steps: +This script runs using Python 3 (Python 3.8.12 works best). It requires ``nltk``, ``six``, ``pyspellchecker``, and ``openai`` (if using the summarize or keyfindings arguments). To install it as a Python package, follow the following steps: Step 1: clone this repo, and go inside that repo: @@ -66,3 +66,20 @@ We've found that this script works well with keywords that are: + a task (e.g. language model, machine translation, fuzzing, ...) + a metric (e.g. BLEU, perplexity, ...) + random stuff + +## Summarization +You can also use the script to summarize a paper using GPT3.5 after you get its URL from the step above. For example: + +```bash +$ sotawhat summarize https://arxiv.org/abs/1809.04281 +``` + +It uses the gpt-3.5-turbo-0125 model by default and will ask for your OpenAI API key if the OPENAI_API_KEY environment variable is not set. You can get one [here](https://platform.openai.com/signup). The simple prompt will generate a 150-word summary of the paper to help you decide if you want to read further. 
# Source: https://github.com/chiphuyen/sotawhat/pull/33/files
def extract_first_arxiv_url(html_string):
    """
    Returns the first https url on the arxiv.org domain found in html_string

    The host may be ``arxiv.org`` itself or any subdomain of it (for example
    ``export.arxiv.org``). The url ends at the next double quote, which is
    where an ``href="..."`` attribute value ends.

    Args:
        html_string: a piece of HTML (typically one line of a results page).

    Returns:
        The matched url as a string, or None when no arxiv.org url is present.
    """
    # The subdomain part is optional: the previous pattern demanded at least
    # one character between 'https://' and 'arxiv.org', so the canonical
    # 'https://arxiv.org/abs/...' form never matched.
    pattern = r'https://(?:[^"/]+\.)?arxiv\.org/[^"]+'
    match = re.search(pattern, html_string)
    return match.group(0) if match else None
# Upper bound on the number of characters of paper text sent in one request.
_MAX_PAPER_CHARS = 45000


def get_paper_content(pdf_url, timeout=30):
    """
    Gets the content of the paper from the pdf url

    An arXiv abstract-page url (``.../abs/<id>``) is rewritten to the matching
    pdf url (``.../pdf/<id>``) before downloading, so the url printed in the
    search results can be passed in directly.

    Args:
        pdf_url: url of the paper's pdf (or of its arXiv abstract page).
        timeout: seconds to wait for the server before giving up.

    Returns:
        The text extracted from every page of the pdf, concatenated in order.

    Raises:
        ValueError: if pdf_url is empty.
        requests.HTTPError: if the server answers with an error status.
    """
    if not pdf_url:
        raise ValueError('You must specify a paper url')

    # The abstract page is HTML, which the pdf reader cannot parse.
    pdf_url = re.sub(r'(arxiv\.org)/abs/', r'\1/pdf/', pdf_url, count=1)

    response = requests.get(pdf_url, timeout=timeout)
    # Fail here with a clear HTTP error instead of returning None and letting
    # the caller crash later on len(None).
    response.raise_for_status()

    pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))
    # `or ''` guards against pages for which no text could be extracted.
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)


def _prepare_paper_content(paper_content):
    """Rejects empty paper text and clips it to _MAX_PAPER_CHARS characters."""
    if not paper_content or not paper_content.strip():
        raise ValueError('Paper content is empty - nothing to send to the model.')
    if len(paper_content) > _MAX_PAPER_CHARS:
        warnings.warn(f'Paper content is too long - {len(paper_content)} characters. '
                      f'Using only first {_MAX_PAPER_CHARS} characters.')
        paper_content = paper_content[:_MAX_PAPER_CHARS]
    return paper_content


def _ask_model_about_paper(paper_content, model, role, system_prompt, instruction):
    """Sends `instruction` followed by the paper text to the chat model and returns its reply."""
    paper_content = _prepare_paper_content(paper_content)

    # The client is created per call; it picks up the API key from the
    # OPENAI_API_KEY environment variable.
    openai_client = openai.OpenAI()

    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": role, "content": instruction + paper_content},
        ])

    return response.choices[0].message.content


def summarise_paper(paper_content, model="gpt-3.5-turbo-0125", role="user"):
    """
    Summarises the paper content in about 150 words using an OpenAI chat model

    Args:
        paper_content: full text of the paper; text beyond 45000 characters
            is dropped (with a warning) before the request is made.
        model: name of the OpenAI chat model to use.
        role: role attached to the message that carries the paper text.

    Returns:
        The summary text produced by the model.

    Raises:
        ValueError: if paper_content is empty.
    """
    return _ask_model_about_paper(
        paper_content, model, role,
        system_prompt="You are a text summarising system that provides easy to understand summaries of research papers.",
        instruction="Summarise paper text in 150 words: ")


def key_findings(paper_content, model="gpt-3.5-turbo-0125", role="user"):
    """
    Retrieves Key Findings from a paper using an OpenAI chat model

    Args:
        paper_content: full text of the paper; text beyond 45000 characters
            is dropped (with a warning) before the request is made.
        model: name of the OpenAI chat model to use.
        role: role attached to the message that carries the paper text.

    Returns:
        The key findings text produced by the model.

    Raises:
        ValueError: if paper_content is empty.
    """
    return _ask_model_about_paper(
        paper_content, model, role,
        system_prompt="You are a academic paper parsing system that provides key findings/learnings of research papers.",
        instruction="List down the key findings of the paper: ")