From 54dc4d74c0d72d569f104bf66f2f0d8dd6d251b9 Mon Sep 17 00:00:00 2001 From: mayanksingh09 Date: Fri, 25 Aug 2023 19:36:16 -0400 Subject: [PATCH 1/3] Added OpenAI gpt3.5 summarise and keyfinding capability --- .gitignore | 7 +++ README.md | 19 +++++++- requirements.txt | 4 +- setup.py | 2 +- sotawhat/sotawhat.py | 105 ++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5789772 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.cache +*.egg-info +build/ +dist/ +venv* +/venv/ +__pycache__ \ No newline at end of file diff --git a/README.md b/README.md index d641267..cde46e2 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Read more about SOTAWHAT [here](https://huyenchip.com/2018/10/04/sotawhat.html). You can use sotawhat through a web interface [here](https://sotawhat.herokuapp.com/#/). Thanks hmchuong! -This script runs using Python 3. It requires ``nltk``, ``six``, and ``pyspellchecker``. To install it as a Python package, follow the following steps: +This script runs using Python 3 (Python 3.8.12 works best). It requires ``nltk``, ``six``, ``pyspellchecker``, and ``openai`` (if using the ``summarize`` or ``keyfindings`` commands). To install it as a Python package, follow the following steps: Step 1: clone this repo, and go inside that repo: @@ -66,3 +66,20 @@ We've found that this script works well with keywords that are: + a task (e.g. language model, machine translation, fuzzing, ...) + a metric (e.g. BLEU, perplexity, ...) + random stuff + +## Summarization +You can also use the script to summarize a paper using GPT3.5 after you get its URL from the step above. For example: + +```bash +$ sotawhat summarize https://arxiv.org/abs/1809.04281 +``` + +It uses the gpt-3.5-turbo-16k model and will prompt for your OpenAI API key. You can get one [here](https://platform.openai.com/signup). 
The simple prompt will generate a 150-word summary of the paper to help you decide if you want to read further. + +You can also use the script to list the key findings of a paper without leaving the command line: + +```bash +$ sotawhat keyfindings https://arxiv.org/abs/1809.04281 +``` + +The script works well with papers shorter than about 20 pages, because the model's maximum context length is 16k tokens; any longer paper or document is clipped to its first 45000 characters before being summarized. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f72ffd2..afc2944 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ nltk six -pyspellchecker +pyspellchecker +PyPDF2 +openai \ No newline at end of file diff --git a/setup.py b/setup.py index cbd072e..3807a87 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ 'abstracts and extract summaries from them. '), url='https://huyenchip.com/2018/10/04/sotawhat.html', license="", - install_requires=['six', 'nltk', 'pyspellchecker'], + install_requires=['six', 'nltk', 'pyspellchecker', 'PyPDF2', 'openai'], entry_points={ 'console_scripts': ['sotawhat=sotawhat.sotawhat:main'], } diff --git a/sotawhat/sotawhat.py b/sotawhat/sotawhat.py index 8c9d6f1..b93de60 100644 --- a/sotawhat/sotawhat.py +++ b/sotawhat/sotawhat.py @@ -1,9 +1,15 @@ import os import re import sys +import PyPDF2 import urllib.error import urllib.request import warnings +import requests +from io import BytesIO +import openai +from getpass import getpass + import nltk from nltk.tokenize import word_tokenize @@ -36,6 +42,11 @@ def get_authors(lines, i): i += 1 return authors, i +# Source: https://github.com/chiphuyen/sotawhat/pull/33/files +def extract_first_arxiv_url(html_string): + pattern = r'https://arxiv\.org/[^"]+' # Regular expression to match URLs from the arxiv.org domain + match = re.search(pattern, html_string) + return match.group(0) if match else None def get_next_result(lines, start): """ 
@@ -51,7 +62,7 @@ def get_next_result(lines, start): result = {} idx = lines[start + 3][10:].find('"') - result['main_page'] = lines[start + 3][9:10 + idx] + result['main_page'] = extract_first_arxiv_url(lines[start + 3]) idx = lines[start + 4][23:].find('"') result['pdf'] = lines[start + 4][22: 23 + idx] + '.pdf' @@ -272,6 +283,64 @@ def get_papers(keyword, num_results=5): all_unshown.extend(unshown) page += 1 +def get_paper_content(pdf_url): + """ + Gets the content of the paper from the pdf url + """ + response = requests.get(pdf_url) + if response.status_code == 200: + pdf_content = BytesIO(response.content) + pdf_reader = PyPDF2.PdfReader(pdf_content) + + all_text = '' + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + text = page.extract_text() + all_text += text + + return all_text + +def summarise_paper(paper_content, model="gpt-3.5-turbo-16k", role="user"): + """ + Summarises the paper content using the OpenAI GPT-3.5 16k model + """ + if len(paper_content) > 45000: + warnings.warn(f'Paper content is too long - {len(paper_content)} characters. Using only first 45000 characters.') + paper_content = paper_content[:45000] + response = openai.ChatCompletion.create( + model=model, + messages=[ + { + "role": "system", + "content": "You are a text summarising system that provides easy to understand summaries of research papers." + }, + {"role": role, + "content": "Summarise paper text in 150 words: " + paper_content + }]) + + return response["choices"][0]['message']['content'] + +def key_findings(paper_content, model="gpt-3.5-turbo-16k", role="user"): + """ + Retrieves Key Findings from a paper using OpenAI GPT-3.5 16k model + """ + if len(paper_content) > 45000: + warnings.warn(f'Paper content is too long - {len(paper_content)} characters. 
Using only first 45000 characters.') + paper_content = paper_content[:45000] + + response = openai.ChatCompletion.create( + model=model, + messages=[ + { + "role": "system", + "content": "You are a academic paper parsing system that provides key findings/learnings of research papers." + }, + {"role": role, + "content": "List down the key findings of the paper: " + paper_content + }]) + + return response["choices"][0]['message']['content'] + def main(): if 'nt' in os.name: @@ -283,6 +352,40 @@ def main(): 'If such errors occur, please install `win_unicode_consolde` via \n' 'the command `pip install win-unicode-console`.') + if len(sys.argv) > 1 and sys.argv[1] == 'summarize': + + # Check for OpenAI API key only when 'summarize' is the argument + if 'OPENAI_API_KEY' not in os.environ: + openai_api_key = getpass('Please enter your OpenAI API Key: ') + os.environ['OPENAI_API_KEY'] = openai_api_key + + openai.api_key = os.environ['OPENAI_API_KEY'] + + if len(sys.argv) < 3: + raise ValueError('You must specify a paper url') + paper_url = sys.argv[2] + paper_content = get_paper_content(paper_url) + paper_summary = summarise_paper(paper_content) + print("Paper Summary: \n", paper_summary) + return + + if len(sys.argv) > 1 and sys.argv[1] == 'keyfindings': + + # Check for OpenAI API key only when 'keyfindings' is the argument + if 'OPENAI_API_KEY' not in os.environ: + openai_api_key = getpass('Please enter your OpenAI API Key: ') + os.environ['OPENAI_API_KEY'] = openai_api_key + + openai.api_key = os.environ['OPENAI_API_KEY'] + + if len(sys.argv) < 3: + raise ValueError('You must specify a paper url') + paper_url = sys.argv[2] + paper_content = get_paper_content(paper_url) + paper_summary = key_findings(paper_content) + print("Key Findings: \n", paper_summary) + return + if len(sys.argv) < 2: raise ValueError('You must specify a keyword') From 96233009b95aec0371687c24ea1710c30956e505 Mon Sep 17 00:00:00 2001 From: mayanksingh09 Date: Fri, 2 Feb 2024 09:40:50 +0530 Subject: 
[PATCH 2/3] Added fix for arxiv url --- sotawhat/sotawhat.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/sotawhat/sotawhat.py b/sotawhat/sotawhat.py index b93de60..0fafbd6 100644 --- a/sotawhat/sotawhat.py +++ b/sotawhat/sotawhat.py @@ -44,7 +44,7 @@ def get_authors(lines, i): # Source: https://github.com/chiphuyen/sotawhat/pull/33/files def extract_first_arxiv_url(html_string): - pattern = r'https://arxiv\.org/[^"]+' # Regular expression to match URLs from the arxiv.org domain + pattern = r'https://[^"]+arxiv\.org/[^"]+' # Regular expression to match URLs from the arxiv.org domain match = re.search(pattern, html_string) return match.group(0) if match else None @@ -355,10 +355,8 @@ def main(): if len(sys.argv) > 1 and sys.argv[1] == 'summarize': # Check for OpenAI API key only when 'summarize' is the argument - if 'OPENAI_API_KEY' not in os.environ: - openai_api_key = getpass('Please enter your OpenAI API Key: ') - os.environ['OPENAI_API_KEY'] = openai_api_key - + os.system(f"(export OPENAI_API_KEY={getpass('Please enter your OpenAI API Key: ')}; bash)") + openai.api_key = os.environ['OPENAI_API_KEY'] if len(sys.argv) < 3: @@ -372,10 +370,8 @@ def main(): if len(sys.argv) > 1 and sys.argv[1] == 'keyfindings': # Check for OpenAI API key only when 'keyfindings' is the argument - if 'OPENAI_API_KEY' not in os.environ: - openai_api_key = getpass('Please enter your OpenAI API Key: ') - os.environ['OPENAI_API_KEY'] = openai_api_key - + os.system(f"(export OPENAI_API_KEY={getpass('Please enter your OpenAI API Key: ')}; bash)") + openai.api_key = os.environ['OPENAI_API_KEY'] if len(sys.argv) < 3: From 25a0d8b92acb67e29551842a50ea94f01ae02fb0 Mon Sep 17 00:00:00 2001 From: mayanksingh09 Date: Fri, 2 Feb 2024 09:59:56 +0530 Subject: [PATCH 3/3] Updated OpenAI model and response format --- sotawhat/sotawhat.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/sotawhat/sotawhat.py 
b/sotawhat/sotawhat.py index 0fafbd6..5e6c40d 100644 --- a/sotawhat/sotawhat.py +++ b/sotawhat/sotawhat.py @@ -300,14 +300,14 @@ def get_paper_content(pdf_url): return all_text -def summarise_paper(paper_content, model="gpt-3.5-turbo-16k", role="user"): +def summarise_paper(paper_content, model="gpt-3.5-turbo-0125", role="user"): """ Summarises the paper content using the OpenAI GPT-3.5 16k model """ if len(paper_content) > 45000: warnings.warn(f'Paper content is too long - {len(paper_content)} characters. Using only first 45000 characters.') - paper_content = paper_content[:45000] - response = openai.ChatCompletion.create( + paper_content = paper_content[:45000] + response = openai_client.chat.completions.create( model=model, messages=[ { @@ -318,17 +318,19 @@ def summarise_paper(paper_content, model="gpt-3.5-turbo-16k", role="user"): "content": "Summarise paper text in 150 words: " + paper_content }]) - return response["choices"][0]['message']['content'] + return response.choices[0].message.content -def key_findings(paper_content, model="gpt-3.5-turbo-16k", role="user"): +def key_findings(paper_content, model="gpt-3.5-turbo-0125", role="user"): """ Retrieves Key Findings from a paper using OpenAI GPT-3.5 16k model """ if len(paper_content) > 45000: warnings.warn(f'Paper content is too long - {len(paper_content)} characters. 
Using only first 45000 characters.') paper_content = paper_content[:45000] - - response = openai.ChatCompletion.create( + + openai_client = openai.OpenAI() + + response = openai_client.chat.completions.create( model=model, messages=[ { @@ -339,7 +341,7 @@ def key_findings(paper_content, model="gpt-3.5-turbo-16k", role="user"): "content": "List down the key findings of the paper: " + paper_content }]) - return response["choices"][0]['message']['content'] + return response.choices[0].message.content def main(): @@ -355,9 +357,10 @@ def main(): if len(sys.argv) > 1 and sys.argv[1] == 'summarize': # Check for OpenAI API key only when 'summarize' is the argument - os.system(f"(export OPENAI_API_KEY={getpass('Please enter your OpenAI API Key: ')}; bash)") - - openai.api_key = os.environ['OPENAI_API_KEY'] + if 'OPENAI_API_KEY' not in os.environ: + openai_api_key = getpass('Please enter your OpenAI API Key: ') + + os.environ['OPENAI_API_KEY'] = openai_api_key if len(sys.argv) < 3: raise ValueError('You must specify a paper url') @@ -370,9 +373,10 @@ def main(): if len(sys.argv) > 1 and sys.argv[1] == 'keyfindings': # Check for OpenAI API key only when 'keyfindings' is the argument - os.system(f"(export OPENAI_API_KEY={getpass('Please enter your OpenAI API Key: ')}; bash)") - - openai.api_key = os.environ['OPENAI_API_KEY'] + if 'OPENAI_API_KEY' not in os.environ: + openai_api_key = getpass('Please enter your OpenAI API Key: ') + + os.environ['OPENAI_API_KEY'] = openai_api_key if len(sys.argv) < 3: raise ValueError('You must specify a paper url')