Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 18 additions & 11 deletions analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class GPTAnalyzer:
"""

def __init__(
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
):
"""
Initializes the GPTAnalyzer with the given parameters.
Expand All @@ -24,6 +24,7 @@ def __init__(
self.email = email
self.output_fmt = output_fmt
self.additional_info = additional_info
self.gpt_model = gpt_model

def __str__(self):
"""
Expand Down Expand Up @@ -91,6 +92,12 @@ def resp_format_type(self):
Returns the response format type.
"""
return "json_object"

def get_gpt_model(self):
"""
Returns the GPT model name stored on this analyzer (self.gpt_model),
exactly as it was passed to the constructor. No default is applied here.
"""
return self.gpt_model


class DefaultAnalyzer(GPTAnalyzer):
Expand All @@ -99,13 +106,13 @@ class DefaultAnalyzer(GPTAnalyzer):
"""

def __init__(
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
):
"""
Initializes the DefaultAnalyzer with the given parameters.
"""
super().__init__(
pdfs, main_query, variable_specs, email, output_fmt, additional_info
pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
)

def output_fmt_prompt(self, var_name):
Expand All @@ -128,13 +135,13 @@ class CustomOutputAnalyzer(GPTAnalyzer):
"""

def __init__(
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
):
"""
Initializes the CustomOutputAnalyzer with the given parameters.
"""
super().__init__(
pdfs, main_query, variable_specs, email, output_fmt, additional_info
pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
)

def output_fmt_prompt(self, var_name):
Expand Down Expand Up @@ -168,13 +175,13 @@ class QuoteAnalyzer(GPTAnalyzer):
"""

def __init__(
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
):
"""
Initializes the QuoteAnalyzer with the given parameters.
"""
super().__init__(
pdfs, main_query, variable_specs, email, output_fmt, additional_info
pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
)

def output_fmt_prompt(self, var_name):
Expand Down Expand Up @@ -335,13 +342,13 @@ class SummaryAnalyzer(GPTAnalyzer):
"""

def __init__(
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
):
"""
Initializes the SummaryAnalyzer with the given parameters.
"""
super().__init__(
pdfs, main_query, variable_specs, email, output_fmt, additional_info
pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
)

def output_fmt_prompt(self, var_name):
Expand Down Expand Up @@ -403,12 +410,12 @@ def get_task_types():


def get_analyzer(
task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info
task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info, gpt_model="o4-mini"
):
"""
Returns an instance of the appropriate analyzer class based on the task type.
"""
task_analyzer_class = get_task_types()[task_type]
return task_analyzer_class(
pdfs, main_query, variable_specs, email, output_fmt, additional_info
pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
)
80 changes: 58 additions & 22 deletions interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@ def load_header():
st.markdown(html_temp, unsafe_allow_html=True)


def load_text():
instructions = """
def load_instructions():
with st.expander("ℹ️ Instructions", expanded=True):

instructions = """
## How to use
Reading through each uploaded policy document, this tool will ask ChatGPT the main query template for each data 'variable' specified below.
- **Step 0:** IF YOU ARE A NEW USER, FIRST TEST FUNCTIONALITY ON 1-3 DOCUMENTS.
Expand All @@ -50,9 +52,9 @@ def load_text():
- **Step 8:** Once results are satisfactory, contact aipolicyreader@sei.org for access to full batch-processing functionality.
- **Step 9:** Re-run once more on all policy documents."""

st.markdown(instructions)
# st.warning("Please first run on a subset of PDF's to fine-tune functionality. Repeatedly running on many PDF's causes avoidable AI-borne GHG emissions.", icon="⚠️")
st.markdown("""## Submit your processing request""")
st.markdown(instructions)
# st.warning("Please first run on a subset of PDF's to fine-tune functionality. Repeatedly running on many PDF's causes avoidable AI-borne GHG emissions.", icon="⚠️")


def upload_file(temp_dir):
st.subheader("I. Upload Policy Document(s)")
Expand Down Expand Up @@ -201,13 +203,28 @@ def populate_with_just_transition():
just_transition_df = var_json_to_df("just_trans_var_specs.json")
st.session_state["variables_df"] = just_transition_df


def clear_variables():
    """Reset the variable-specification table to a single blank row."""
    column_names = ("variable_name", "variable_description", "context")
    # dict.fromkeys leaves every field as None, giving one empty row
    blank_row = dict.fromkeys(column_names)
    st.session_state["variables_df"] = pd.DataFrame([blank_row])

def update_var_spec_df_from_csv():
    """
    Load variable specifications from the uploaded CSV into the session state.

    Reads the file stored under st.session_state["csv_upload"] and writes the
    resulting DataFrame to st.session_state["variables_df"]. The CSV may have
    either the header row ("variable_name", "variable_description"[, "context"])
    or no header at all with 2 or 3 columns. The stored DataFrame always has a
    "context" column; it is filled with None when the CSV does not supply one.

    Any failure (unreadable file, unsupported column count) is reported to the
    user via st.error and leaves the current table untouched.
    """
    expected_columns = ["variable_name", "variable_description", "context"]
    csv_file = st.session_state["csv_upload"]
    if csv_file is None:
        return  # Don't do anything if no file is uploaded
    try:
        df = pd.read_csv(csv_file)
        if list(df.columns) not in (expected_columns[:2], expected_columns):
            # No recognised header row, so the first line is data. The first
            # read consumed the stream; rewind before parsing it again,
            # otherwise pandas sees an empty file.
            csv_file.seek(0)
            df = pd.read_csv(csv_file, header=None)
            num_columns = df.shape[1]
            if num_columns not in (2, 3):
                raise ValueError(
                    f"expected 2 or 3 columns but found {num_columns}"
                )
            df.columns = expected_columns[:num_columns]
        if "context" not in df.columns:
            df["context"] = None  # Add a context column with None values
        st.session_state["variables_df"] = df
    except Exception as e:
        # UI boundary: surface every problem to the user instead of crashing
        st.error(f"Error reading CSV: {e}")

def input_data_specs():
st.markdown("")
Expand All @@ -220,7 +237,7 @@ def input_data_specs():
)
st.markdown(hdr)
st.markdown(
"**Type-in variable details or copy-and-paste from an excel spreadsheet (3 columns, no headers).**"
"**Type-in variable details, upload a csv, or copy-and-paste from an excel spreadsheet (3 columns, no headers).**"
)
if "variables_df" not in st.session_state:
st.session_state["variables_df"] = var_json_to_df("default_var_specs.json")
Expand All @@ -237,17 +254,21 @@ def input_data_specs():
hide_index=True,
column_order=variable_specification_parameters,
)
btn1, btn2, btn3 = st.columns([1, 1, 1])
btn1, btn2, _, btn4 = st.columns([5, 5, 2, 3])
with btn1:
st.button("Clear", on_click=clear_variables)
with btn2:
st.button("Populate with SDGs", on_click=populate_with_SDGs)
with btn3:
st.button(
"Use Just-Transition Themes",
on_click=populate_with_just_transition,
use_container_width=True,
)
with st.popover("Populate with..."):
st.button("SDGs", on_click=populate_with_SDGs)
st.button("Just-Transition Themes", on_click=populate_with_just_transition)
with btn4:
with st.popover("📤 Upload CSV"):
st.file_uploader(
"Choose a CSV file (headers optional, 2 or 3 columns):",
type=["csv"],
key="csv_upload",
on_change=update_var_spec_df_from_csv
)
with st.expander("Advanced settings"):
st.selectbox(
"Optional: specify the overall operation type",
Expand Down Expand Up @@ -342,13 +363,22 @@ def is_valid_email(email):
validated = re.match(email_regex, email) is not None
return validated

def input_email():
st.markdown(
"For variables with short descriptions, processing time will be about 1 minute per 100 PDF pages per variable."
def select_gpt_model():
    """
    Renders the model dropdown and stores the chosen model id in
    st.session_state["gpt_model"].
    """
    # Model id (stored in session state) -> label shown in the dropdown
    display_names = {
        "o4-mini": "o4-mini",
        "o3": "o3 (slower, smarter, more expensive)",
        "gpt-4.1": "4.1",
    }

    def _label(model_id):
        return display_names[model_id]

    if "gpt_model" not in st.session_state:
        st.session_state["gpt_model"] = "o4-mini"  # Default model
    chosen_model = st.selectbox(
        "Select the OpenAI model to use for processing:",
        options=list(display_names),
        format_func=_label,
    )
    st.session_state["gpt_model"] = chosen_model

email = st.text_input("Enter your email where you'd like to receive the results:")

def input_email():
email = st.text_input("Enter your email where you'd like to receive the results:")
if "email" not in st.session_state:
st.session_state["email"] = None # Set to None if email is empty, for warning to user
if not is_valid_email(email):
Expand All @@ -363,7 +393,8 @@ def build_interface(tmp_dir):
st.session_state["task_type"] = "Quote extraction"
if "is_test_run" not in st.session_state:
st.session_state["is_test_run"] = True
load_text()
load_instructions()
st.markdown("""## Submit your processing request""")
upload_file(tmp_dir)
input_main_query()
if "output_format_options" not in st.session_state:
Expand All @@ -387,6 +418,10 @@ def build_interface(tmp_dir):
st.session_state["output_detail_df"] = None
input_data_specs()
st.divider()
st.markdown(
"For variables with short descriptions, processing time will be about 1 minute per 100 PDF pages per variable (with default model selection)."
)
select_gpt_model()
input_email()


Expand Down Expand Up @@ -439,8 +474,9 @@ def get_user_inputs():
"custom_output_fmt": st.session_state["custom_output_fmt"],
"output_detail": st.session_state["output_detail_df"],
}
gpt_model = st.session_state["gpt_model"]
return get_analyzer(
task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info
task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info, gpt_model
)


Expand Down
15 changes: 7 additions & 8 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def extract_policy_doc_info(
var_embeddings,
num_excerpts,
openai_apikey,
gpt_model
):
"""
Extracts policy document information by querying GPT for each variable specified.
Expand All @@ -86,7 +87,8 @@ def extract_policy_doc_info(
"""
policy_doc_data = {}
text_chunks = input_text_chunks
client, gpt_model, max_num_chars = new_openai_session(openai_apikey)
client, max_num_chars = new_openai_session(openai_apikey)
gpt_model = gpt_analyzer.get_gpt_model()
# If the text is short, we don't need to generate embeddings to find "relevant texts"
# If the text is long, text_chunks (defined above) will be replaced with the top relevant texts
run_on_full_text = char_count < (max_num_chars - 1000)
Expand Down Expand Up @@ -197,6 +199,7 @@ def main(gpt_analyzer, openai_apikey):
total_num_pages = 0
total_start_time = time.time()
failed_pdfs = []
gpt_model = gpt_analyzer.get_gpt_model()
for pdf in gpt_analyzer.pdfs:
pdf_path = get_resource_path(f"{pdf.replace('.pdf','')}.pdf")
try:
Expand All @@ -211,9 +214,7 @@ def main(gpt_analyzer, openai_apikey):
num_pages_in_pdf = 0
num_sections = len(text_sections)
## Most PDFs will only have 1 text_section: this is used to break up long documents (>250 pages)
print(2)
for text_section in text_sections:
print(3)
text_chunks, num_pages, char_count, section = [
text_section[k]
for k in ["text_chunks", "num_pages", "num_chars", "section_num"]
Expand All @@ -224,19 +225,17 @@ def main(gpt_analyzer, openai_apikey):
output_pdf_path = f"{pdf_path}"
num_pages_in_pdf += num_pages
total_num_pages += num_pages
openai_client, _, _ = new_openai_session(openai_apikey)
openai_client, _ = new_openai_session(openai_apikey)
pdf_embeddings, pdf_text_chunks = generate_all_embeddings(
openai_client, output_pdf_path, text_chunks, get_resource_path
)
print(4)
# 2) Prepare embeddings to grab most relevant text excerpts for each variable
openai_client, _, _ = new_openai_session(openai_apikey)
openai_client, _ = new_openai_session(openai_apikey)
var_embeddings = embed_variable_specifications(
openai_client, gpt_analyzer.variable_specs
) # i.e. {"var_name": {"embedding": <...>", "variable_description": <...>, "context": <...>}, ...}

# 3) Iterate through each variable specification to grab relevant texts and query
print(5)
num_excerpts = gpt_analyzer.get_num_excerpts(num_pages)
policy_info = extract_policy_doc_info(
gpt_analyzer,
Expand All @@ -246,8 +245,8 @@ def main(gpt_analyzer, openai_apikey):
var_embeddings,
num_excerpts,
openai_apikey,
gpt_model
)
print(6)
# 4) Output Results
output_results(gpt_analyzer, output_doc, output_pdf_path, policy_info)
print_milestone(
Expand Down
25 changes: 16 additions & 9 deletions query_gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
def new_openai_session(openai_apikey):
os.environ["OPENAI_API_KEY"] = openai_apikey
client = OpenAI()
gpt_model = "gpt-4o" # "o1-preview"
max_num_chars = 25000
return client, gpt_model, max_num_chars
return client, max_num_chars


def create_gpt_messages(query, run_on_full_text):
Expand All @@ -26,12 +25,20 @@ def create_gpt_messages(query, run_on_full_text):


def chat_gpt_query(gpt_client, gpt_model, resp_fmt, msgs):
    """
    Sends one chat-completion request and returns the text of the first choice.

    gpt_client: client object exposing chat.completions.create(...)
    gpt_model:  model name passed through as `model`
    resp_fmt:   response format type, sent as {"type": resp_fmt}
    msgs:       list of chat messages passed through as `messages`
    """
    request = {
        "model": gpt_model,
        "response_format": {"type": resp_fmt},
        "messages": msgs,
    }
    # Only gpt-4.1 is sent an explicit temperature; every other model is called
    # with its default. NOTE(review): presumably because the o-series models
    # do not accept a custom temperature -- confirm against the OpenAI docs.
    if gpt_model == "gpt-4.1":
        request["temperature"] = 0
    response = gpt_client.chat.completions.create(**request)
    return response.choices[0].message.content


Expand All @@ -53,7 +60,7 @@ def query_gpt_for_variable_specification(
relevant_texts,
run_on_full_text,
gpt_client,
gpt_model,
gpt_model="o4-mini",
):
query_template = gpt_analyzer.main_query
excerpts = "\n".join(relevant_texts)
Expand Down
Binary file modified results.docx
Binary file not shown.