diff --git a/analysis.py b/analysis.py index c00ebec..4e305ca 100644 --- a/analysis.py +++ b/analysis.py @@ -13,7 +13,7 @@ class GPTAnalyzer: """ def __init__( - self, pdfs, main_query, variable_specs, email, output_fmt, additional_info + self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ): """ Initializes the GPTAnalyzer with the given parameters. @@ -24,6 +24,7 @@ def __init__( self.email = email self.output_fmt = output_fmt self.additional_info = additional_info + self.gpt_model = gpt_model def __str__(self): """ @@ -91,6 +92,12 @@ def resp_format_type(self): Returns the response format type. """ return "json_object" + + def get_gpt_model(self): + """ + Returns the GPT model selected by the user (the default is "o4-mini"). + """ + return self.gpt_model class DefaultAnalyzer(GPTAnalyzer): @@ -99,13 +106,13 @@ class DefaultAnalyzer(GPTAnalyzer): """ def __init__( - self, pdfs, main_query, variable_specs, email, output_fmt, additional_info + self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ): """ Initializes the DefaultAnalyzer with the given parameters. """ super().__init__( - pdfs, main_query, variable_specs, email, output_fmt, additional_info + pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ) def output_fmt_prompt(self, var_name): @@ -128,13 +135,13 @@ class CustomOutputAnalyzer(GPTAnalyzer): """ def __init__( - self, pdfs, main_query, variable_specs, email, output_fmt, additional_info + self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ): """ Initializes the CustomOutputAnalyzer with the given parameters. 
""" super().__init__( - pdfs, main_query, variable_specs, email, output_fmt, additional_info + pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ) def output_fmt_prompt(self, var_name): @@ -168,13 +175,13 @@ class QuoteAnalyzer(GPTAnalyzer): """ def __init__( - self, pdfs, main_query, variable_specs, email, output_fmt, additional_info + self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ): """ Initializes the QuoteAnalyzer with the given parameters. """ super().__init__( - pdfs, main_query, variable_specs, email, output_fmt, additional_info + pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ) def output_fmt_prompt(self, var_name): @@ -335,13 +342,13 @@ class SummaryAnalyzer(GPTAnalyzer): """ def __init__( - self, pdfs, main_query, variable_specs, email, output_fmt, additional_info + self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ): """ Initializes the SummaryAnalyzer with the given parameters. """ super().__init__( - pdfs, main_query, variable_specs, email, output_fmt, additional_info + pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ) def output_fmt_prompt(self, var_name): @@ -403,12 +410,12 @@ def get_task_types(): def get_analyzer( - task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info + task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info, gpt_model="o4-mini" ): """ Returns an instance of the appropriate analyzer class based on the task type. 
""" task_analyzer_class = get_task_types()[task_type] return task_analyzer_class( - pdfs, main_query, variable_specs, email, output_fmt, additional_info + pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model ) diff --git a/interface.py b/interface.py index 62b99ad..c9a8f27 100644 --- a/interface.py +++ b/interface.py @@ -35,8 +35,10 @@ def load_header(): st.markdown(html_temp, unsafe_allow_html=True) -def load_text(): - instructions = """ +def load_instructions(): + with st.expander("ℹ️ Instructions", expanded=True): + + instructions = """ ## How to use Reading through each uploaded policy document, this tool will ask ChatGPT the main query template for each data 'variable' specified below. - **Step 0:** IF YOU ARE A NEW USER, FIRST TEST FUNCTIONALITY ON 1-3 DOCUMENTS. @@ -50,9 +52,9 @@ def load_text(): - **Step 8:** Once results are satisfactory, contact aipolicyreader@sei.org for access to full batch-processing functionality. - **Step 9:** Re-run once more on all policy documents.""" - st.markdown(instructions) - # st.warning("Please first run on a subset of PDF's to fine-tune functionality. Repeatedly running on many PDF's causes avoidable AI-borne GHG emissions.", icon="⚠️") - st.markdown("""## Submit your processing request""") + st.markdown(instructions) + # st.warning("Please first run on a subset of PDF's to fine-tune functionality. Repeatedly running on many PDF's causes avoidable AI-borne GHG emissions.", icon="⚠️") + def upload_file(temp_dir): st.subheader("I. 
Upload Policy Document(s)") @@ -201,13 +203,28 @@ def populate_with_just_transition(): just_transition_df = var_json_to_df("just_trans_var_specs.json") st.session_state["variables_df"] = just_transition_df - def clear_variables(): empty_df = pd.DataFrame( [{"variable_name": None, "variable_description": None, "context": None}] ) st.session_state["variables_df"] = empty_df +def update_var_spec_df_from_csv(): + csv_file = st.session_state["csv_upload"] + if csv_file is None: + return # Don't do anything if no file is uploaded + try: + df = pd.read_csv(csv_file) + if list(df.columns) != ["variable_name", "variable_description"] and list(df.columns) != ["variable_name", "variable_description", "context"]: + df = pd.read_csv(csv_file, header=None) + if df.shape[1] == 2: + df.columns = ["variable_name", "variable_description"] + df["context"] = None # Add a context column with None values + elif df.shape[1] == 3: + df.columns = ["variable_name", "variable_description", "context"] + st.session_state["variables_df"] = df + except Exception as e: + st.error(f"Error reading CSV: {e}") def input_data_specs(): st.markdown("") @@ -220,7 +237,7 @@ def input_data_specs(): ) st.markdown(hdr) st.markdown( - "**Type-in variable details or copy-and-paste from an excel spreadsheet (3 columns, no headers).**" + "**Type-in variable details, upload a csv, or copy-and-paste from an excel spreadsheet (3 columns, no headers).**" ) if "variables_df" not in st.session_state: st.session_state["variables_df"] = var_json_to_df("default_var_specs.json") @@ -237,17 +254,21 @@ def input_data_specs(): hide_index=True, column_order=variable_specification_parameters, ) - btn1, btn2, btn3 = st.columns([1, 1, 1]) + btn1, btn2, _, btn4 = st.columns([5, 5, 2, 3]) with btn1: st.button("Clear", on_click=clear_variables) with btn2: - st.button("Populate with SDGs", on_click=populate_with_SDGs) - with btn3: - st.button( - "Use Just-Transition Themes", - on_click=populate_with_just_transition, - 
use_container_width=True, - ) + with st.popover("Populate with..."): + st.button("SDGs", on_click=populate_with_SDGs) + st.button("Just-Transition Themes", on_click=populate_with_just_transition) + with btn4: + with st.popover("📤 Upload CSV"): + st.file_uploader( + "Choose a CSV file (headers optional, 2 or 3 columns):", + type=["csv"], + key="csv_upload", + on_change=update_var_spec_df_from_csv + ) with st.expander("Advanced settings"): st.selectbox( "Optional: specify the overall operation type", @@ -342,13 +363,22 @@ def is_valid_email(email): validated = re.match(email_regex, email) is not None return validated -def input_email(): - st.markdown( - "For variables with short descriptions, processing time will be about 1 minute per 100 PDF pages per variable." +def select_gpt_model(): + if "gpt_model" not in st.session_state: + st.session_state["gpt_model"] = "o4-mini" # Default model + model_options = { + "o4-mini": "o4-mini", + "o3": "o3 (slower, smarter, more expensive)", + "gpt-4.1": "4.1", + } + st.session_state["gpt_model"] = st.selectbox( + "Select the OpenAI model to use for processing:", + options=list(model_options.keys()), + format_func=lambda x: model_options[x], ) - email = st.text_input("Enter your email where you'd like to receive the results:") - +def input_email(): + email = st.text_input("Enter your email where you'd like to receive the results:") if "email" not in st.session_state: st.session_state["email"] = None # Set to None if email is empty, for warning to user if not is_valid_email(email): @@ -363,7 +393,8 @@ def build_interface(tmp_dir): st.session_state["task_type"] = "Quote extraction" if "is_test_run" not in st.session_state: st.session_state["is_test_run"] = True - load_text() + load_instructions() + st.markdown("""## Submit your processing request""") upload_file(tmp_dir) input_main_query() if "output_format_options" not in st.session_state: @@ -387,6 +418,10 @@ def build_interface(tmp_dir): st.session_state["output_detail_df"] = None 
input_data_specs() st.divider() + st.markdown( + "For variables with short descriptions, processing time will be about 1 minute per 100 PDF pages per variable (with default model selection)." + ) + select_gpt_model() input_email() @@ -439,8 +474,9 @@ def get_user_inputs(): "custom_output_fmt": st.session_state["custom_output_fmt"], "output_detail": st.session_state["output_detail_df"], } + gpt_model = st.session_state["gpt_model"] return get_analyzer( - task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info + task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info, gpt_model ) diff --git a/main.py b/main.py index da87706..f3637d2 100644 --- a/main.py +++ b/main.py @@ -67,6 +67,7 @@ def extract_policy_doc_info( var_embeddings, num_excerpts, openai_apikey, + gpt_model ): """ Extracts policy document information by querying GPT for each variable specified. @@ -86,7 +87,8 @@ def extract_policy_doc_info( """ policy_doc_data = {} text_chunks = input_text_chunks - client, gpt_model, max_num_chars = new_openai_session(openai_apikey) + client, max_num_chars = new_openai_session(openai_apikey) + gpt_model = gpt_analyzer.get_gpt_model() # If the text is short, we don't need to generate embeddings to find "relevant texts" # If the text is long, text_chunks (defined above) will be replaced with the top relevant texts run_on_full_text = char_count < (max_num_chars - 1000) @@ -197,6 +199,7 @@ def main(gpt_analyzer, openai_apikey): total_num_pages = 0 total_start_time = time.time() failed_pdfs = [] + gpt_model = gpt_analyzer.get_gpt_model() for pdf in gpt_analyzer.pdfs: pdf_path = get_resource_path(f"{pdf.replace('.pdf','')}.pdf") try: @@ -211,9 +214,7 @@ def main(gpt_analyzer, openai_apikey): num_pages_in_pdf = 0 num_sections = len(text_sections) ## Most PDFs will only have 1 text_section: this is used to break up long documents (>250 pages) - print(2) for text_section in text_sections: - print(3) text_chunks, num_pages, 
char_count, section = [ text_section[k] for k in ["text_chunks", "num_pages", "num_chars", "section_num"] @@ -224,19 +225,17 @@ def main(gpt_analyzer, openai_apikey): output_pdf_path = f"{pdf_path}" num_pages_in_pdf += num_pages total_num_pages += num_pages - openai_client, _, _ = new_openai_session(openai_apikey) + openai_client, _ = new_openai_session(openai_apikey) pdf_embeddings, pdf_text_chunks = generate_all_embeddings( openai_client, output_pdf_path, text_chunks, get_resource_path ) - print(4) # 2) Prepare embeddings to grab most relevant text excerpts for each variable - openai_client, _, _ = new_openai_session(openai_apikey) + openai_client, _ = new_openai_session(openai_apikey) var_embeddings = embed_variable_specifications( openai_client, gpt_analyzer.variable_specs ) # i.e. {"var_name": {"embedding": <...>", "variable_description": <...>, "context": <...>}, ...} # 3) Iterate through each variable specification to grab relevant texts and query - print(5) num_excerpts = gpt_analyzer.get_num_excerpts(num_pages) policy_info = extract_policy_doc_info( gpt_analyzer, @@ -246,8 +245,8 @@ def main(gpt_analyzer, openai_apikey): var_embeddings, num_excerpts, openai_apikey, + gpt_model ) - print(6) # 4) Output Results output_results(gpt_analyzer, output_doc, output_pdf_path, policy_info) print_milestone( diff --git a/query_gpt.py b/query_gpt.py index faf5501..ae75084 100644 --- a/query_gpt.py +++ b/query_gpt.py @@ -5,9 +5,8 @@ def new_openai_session(openai_apikey): os.environ["OPENAI_API_KEY"] = openai_apikey client = OpenAI() - gpt_model = "gpt-4o" # "o1-preview" max_num_chars = 25000 - return client, gpt_model, max_num_chars + return client, max_num_chars def create_gpt_messages(query, run_on_full_text): @@ -26,12 +25,20 @@ def create_gpt_messages(query, run_on_full_text): def chat_gpt_query(gpt_client, gpt_model, resp_fmt, msgs): - response = gpt_client.chat.completions.create( - model=gpt_model, - temperature=0, - response_format={"type": resp_fmt}, - 
messages=msgs, - ) + print(gpt_model) + if gpt_model == "gpt-4.1": + response = gpt_client.chat.completions.create( + model=gpt_model, + temperature=0, + response_format={"type": resp_fmt}, + messages=msgs, + ) + else: + response = gpt_client.chat.completions.create( + model=gpt_model, + response_format={"type": resp_fmt}, + messages=msgs, + ) return response.choices[0].message.content @@ -53,7 +60,7 @@ def query_gpt_for_variable_specification( relevant_texts, run_on_full_text, gpt_client, - gpt_model, + gpt_model="o4-mini", ): query_template = gpt_analyzer.main_query excerpts = "\n".join(relevant_texts) diff --git a/results.docx b/results.docx index 3118a26..9df4a85 100644 Binary files a/results.docx and b/results.docx differ