sei-international · billybabis · Jun 26, 2025 · Jun 26, 2025
diff --git a/analysis.py b/analysis.py
@@ -13,7 +13,7 @@ class GPTAnalyzer:
     """
 
     def __init__(
-        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
+        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
     ):
         """
         Initializes the GPTAnalyzer with the given parameters.
@@ -24,6 +24,7 @@ def __init__(
         self.email = email
         self.output_fmt = output_fmt
         self.additional_info = additional_info
+        self.gpt_model = gpt_model
 
     def __str__(self):
         """
@@ -91,6 +92,12 @@ def resp_format_type(self):
         Returns the response format type.
         """
         return "json_object"
+
+    def get_gpt_model(self):
+        """
+        Returns the OpenAI model name passed to __init__ (get_analyzer defaults it to "o4-mini").
+        """
+        return self.gpt_model
 
 
 class DefaultAnalyzer(GPTAnalyzer):
@@ -99,13 +106,13 @@ class DefaultAnalyzer(GPTAnalyzer):
     """
 
     def __init__(
-        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
+        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
     ):
         """
         Initializes the DefaultAnalyzer with the given parameters.
         """
         super().__init__(
-            pdfs, main_query, variable_specs, email, output_fmt, additional_info
+            pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
         )
 
     def output_fmt_prompt(self, var_name):
@@ -128,13 +135,13 @@ class CustomOutputAnalyzer(GPTAnalyzer):
     """
 
     def __init__(
-        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
+        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
     ):
         """
         Initializes the CustomOutputAnalyzer with the given parameters.
         """
         super().__init__(
-            pdfs, main_query, variable_specs, email, output_fmt, additional_info
+            pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
         )
 
     def output_fmt_prompt(self, var_name):
@@ -168,13 +175,13 @@ class QuoteAnalyzer(GPTAnalyzer):
     """
 
     def __init__(
-        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
+        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
     ):
         """
         Initializes the QuoteAnalyzer with the given parameters.
         """
         super().__init__(
-            pdfs, main_query, variable_specs, email, output_fmt, additional_info
+            pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
         )
 
     def output_fmt_prompt(self, var_name):
@@ -335,13 +342,13 @@ class SummaryAnalyzer(GPTAnalyzer):
     """
 
     def __init__(
-        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info
+        self, pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
     ):
         """
         Initializes the SummaryAnalyzer with the given parameters.
         """
         super().__init__(
-            pdfs, main_query, variable_specs, email, output_fmt, additional_info
+            pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
         )
 
     def output_fmt_prompt(self, var_name):
@@ -403,12 +410,12 @@ def get_task_types():
 
 
 def get_analyzer(
-    task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info
+    task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info, gpt_model="o4-mini"
 ):
     """
     Returns an instance of the appropriate analyzer class based on the task type.
     """
     task_analyzer_class = get_task_types()[task_type]
     return task_analyzer_class(
-        pdfs, main_query, variable_specs, email, output_fmt, additional_info
+        pdfs, main_query, variable_specs, email, output_fmt, additional_info, gpt_model
     )
diff --git a/interface.py b/interface.py
@@ -35,8 +35,10 @@ def load_header():
     st.markdown(html_temp, unsafe_allow_html=True)
 
 
-def load_text():
-    instructions = """
+def load_instructions():
+    with st.expander("ℹ️ Instructions", expanded=True):
+
+        instructions = """
 ## How to use
 Reading through each uploaded policy document, this tool will ask ChatGPT the main query template for each data 'variable' specified below. 
 - **Step 0:** IF YOU ARE A NEW USER, FIRST TEST FUNCTIONALITY ON 1-3 DOCUMENTS.
@@ -50,9 +52,9 @@ def load_text():
 - **Step 8:** Once results are satisfactory, contact aipolicyreader@sei.org for access to full batch-processing functionality.
 - **Step 9:** Re-run once more on all policy documents."""
 
-    st.markdown(instructions)
-    # st.warning("Please first run on a subset of PDF's to fine-tune functionality. Repeatedly running on many PDF's causes avoidable AI-borne GHG emissions.", icon="⚠️")
-    st.markdown("""## Submit your processing request""")
+        st.markdown(instructions)
+        # st.warning("Please first run on a subset of PDF's to fine-tune functionality. Repeatedly running on many PDF's causes avoidable AI-borne GHG emissions.", icon="⚠️")
+
 
 def upload_file(temp_dir):
     st.subheader("I. Upload Policy Document(s)")
@@ -201,13 +203,28 @@ def populate_with_just_transition():
     just_transition_df = var_json_to_df("just_trans_var_specs.json")
     st.session_state["variables_df"] = just_transition_df
 
-
 def clear_variables():
     empty_df = pd.DataFrame(
         [{"variable_name": None, "variable_description": None, "context": None}]
     )
     st.session_state["variables_df"] = empty_df
 
+def update_var_spec_df_from_csv():  # on_change callback of the "csv_upload" file uploader
+    csv_file = st.session_state["csv_upload"]
+    if csv_file is None:
+        return  # Don't do anything if no file is uploaded
+    try:
+        df = pd.read_csv(csv_file)  # first attempt: assume the file starts with a header row
+        if list(df.columns) != ["variable_name", "variable_description"] and list(df.columns) != ["variable_name", "variable_description", "context"]:
+            csv_file.seek(0)  # the first read_csv consumed the upload buffer; rewind or the re-parse sees an empty file
+            df = pd.read_csv(csv_file, header=None)  # no recognised header: treat every row as data
+            if df.shape[1] == 2:
+                df["context"] = None  # Add a context column with None values
+            if df.shape[1] == 3:  # also true for a 2-column file once "context" was appended above
+                df.columns = ["variable_name", "variable_description", "context"]
+        st.session_state["variables_df"] = df
+    except Exception as e:
+        st.error(f"Error reading CSV: {e}")
 
 def input_data_specs():
     st.markdown("")
@@ -220,7 +237,7 @@ def input_data_specs():
     )
     st.markdown(hdr)
     st.markdown(
-        "**Type-in variable details or copy-and-paste from an excel spreadsheet (3 columns, no headers).**"
+        "**Type-in variable details, upload a csv, or copy-and-paste from an excel spreadsheet (3 columns, no headers).**"
     )
     if "variables_df" not in st.session_state:
         st.session_state["variables_df"] = var_json_to_df("default_var_specs.json")
@@ -237,17 +254,21 @@ def input_data_specs():
         hide_index=True,
         column_order=variable_specification_parameters,
     )
-    btn1, btn2, btn3 = st.columns([1, 1, 1])
+    btn1, btn2, _, btn4 = st.columns([5, 5, 2, 3])
     with btn1:
         st.button("Clear", on_click=clear_variables)
     with btn2:
-        st.button("Populate with SDGs", on_click=populate_with_SDGs)
-    with btn3:
-        st.button(
-            "Use Just-Transition Themes",
-            on_click=populate_with_just_transition,
-            use_container_width=True,
-        )
+        with st.popover("Populate with..."):
+            st.button("SDGs", on_click=populate_with_SDGs)
+            st.button("Just-Transition Themes", on_click=populate_with_just_transition)
+    with btn4:
+        with st.popover("📤 Upload CSV"):
+            st.file_uploader(
+                "Choose a CSV file (headers optional, 2 or 3 columns):",
+                type=["csv"],
+                key="csv_upload",
+                on_change=update_var_spec_df_from_csv
+            )
     with st.expander("Advanced settings"):
         st.selectbox(
             "Optional: specify the overall operation type",
@@ -342,13 +363,22 @@ def is_valid_email(email):
     validated = re.match(email_regex, email) is not None
     return validated
 
-def input_email():
-    st.markdown(
-        "For variables with short descriptions, processing time will be about 1 minute per 100 PDF pages per variable."
+def select_gpt_model():
+    """Renders the model picker and stores the chosen model id in st.session_state["gpt_model"]."""
+    labels_by_model = {
+        "o4-mini": "o4-mini",
+        "o3": "o3 (slower, smarter, more expensive)",
+        "gpt-4.1": "4.1",
+    }
+    st.session_state.setdefault("gpt_model", "o4-mini")  # Default model
+    st.session_state["gpt_model"] = st.selectbox(
+        "Select the OpenAI model to use for processing:",
+        options=list(labels_by_model),
+        format_func=lambda model_id: labels_by_model[model_id],
     )
 
-    email = st.text_input("Enter your email where you'd like to receive the results:")
-
+def input_email():
+    email = st.text_input("Enter your email where you'd like to receive the results:")
     if "email" not in st.session_state:
         st.session_state["email"] = None  # Set to None if email is empty, for warning to user
     if not is_valid_email(email):
@@ -363,7 +393,8 @@ def build_interface(tmp_dir):
         st.session_state["task_type"] = "Quote extraction"
     if "is_test_run" not in st.session_state:
         st.session_state["is_test_run"] = True
-    load_text()
+    load_instructions()
+    st.markdown("""## Submit your processing request""")
     upload_file(tmp_dir)
     input_main_query()
     if "output_format_options" not in st.session_state:
@@ -387,6 +418,10 @@ def build_interface(tmp_dir):
         st.session_state["output_detail_df"] = None
     input_data_specs()
     st.divider()
+    st.markdown(
+        "For variables with short descriptions, processing time will be about 1 minute per 100 PDF pages per variable (with default model selection)."
+    )
+    select_gpt_model()
     input_email()
 
 
@@ -439,8 +474,9 @@ def get_user_inputs():
             "custom_output_fmt": st.session_state["custom_output_fmt"],
             "output_detail": st.session_state["output_detail_df"],
         }
+    gpt_model = st.session_state["gpt_model"]
     return get_analyzer(
-        task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info
+        task_type, output_fmt, pdfs, main_query, variable_specs, email, additional_info, gpt_model
     )
 
 

diff --git a/main.py b/main.py
@@ -67,6 +67,7 @@ def extract_policy_doc_info(
     var_embeddings,
     num_excerpts,
     openai_apikey,
+    gpt_model
 ):
     """
     Extracts policy document information by querying GPT for each variable specified.
@@ -86,7 +87,8 @@ def extract_policy_doc_info(
     """
     policy_doc_data = {}
     text_chunks = input_text_chunks
-    client, gpt_model, max_num_chars = new_openai_session(openai_apikey)
+    client, max_num_chars = new_openai_session(openai_apikey)
+    gpt_model = gpt_analyzer.get_gpt_model()
     # If the text is short, we don't need to generate embeddings to find "relevant texts"
     # If the text is long, text_chunks (defined above) will be replaced with the top relevant texts
     run_on_full_text = char_count < (max_num_chars - 1000)
@@ -197,6 +199,7 @@ def main(gpt_analyzer, openai_apikey):
     total_num_pages = 0
     total_start_time = time.time()
     failed_pdfs = []
+    gpt_model = gpt_analyzer.get_gpt_model()
     for pdf in gpt_analyzer.pdfs:
         pdf_path = get_resource_path(f"{pdf.replace('.pdf','')}.pdf")
         try:
@@ -211,9 +214,7 @@ def main(gpt_analyzer, openai_apikey):
             num_pages_in_pdf = 0
             num_sections = len(text_sections)
             ## Most PDFs will only have 1 text_section: this is used to break up long documents (>250 pages)
-            print(2)
             for text_section in text_sections:
-                print(3)
                 text_chunks, num_pages, char_count, section = [
                     text_section[k]
                     for k in ["text_chunks", "num_pages", "num_chars", "section_num"]
@@ -224,19 +225,17 @@ def main(gpt_analyzer, openai_apikey):
                     output_pdf_path = f"{pdf_path}"
                 num_pages_in_pdf += num_pages
                 total_num_pages += num_pages
-                openai_client, _, _ = new_openai_session(openai_apikey)
+                openai_client, _ = new_openai_session(openai_apikey)
                 pdf_embeddings, pdf_text_chunks = generate_all_embeddings(
                     openai_client, output_pdf_path, text_chunks, get_resource_path
                 )
-                print(4)
                 # 2) Prepare embeddings to grab most relevant text excerpts for each variable
-                openai_client, _, _ = new_openai_session(openai_apikey)
+                openai_client, _ = new_openai_session(openai_apikey)
                 var_embeddings = embed_variable_specifications(
                     openai_client, gpt_analyzer.variable_specs
                 )  # i.e. {"var_name": {"embedding": <...>", "variable_description": <...>, "context": <...>},  ...}
 
                 # 3) Iterate through each variable specification to grab relevant texts and query
-                print(5)
                 num_excerpts = gpt_analyzer.get_num_excerpts(num_pages)
                 policy_info = extract_policy_doc_info(
                     gpt_analyzer,
@@ -246,8 +245,8 @@ def main(gpt_analyzer, openai_apikey):
                     var_embeddings,
                     num_excerpts,
                     openai_apikey,
+                    gpt_model
                 )
-                print(6)
                 # 4) Output Results
                 output_results(gpt_analyzer, output_doc, output_pdf_path, policy_info)
             print_milestone(

diff --git a/query_gpt.py b/query_gpt.py
@@ -5,9 +5,8 @@
 def new_openai_session(openai_apikey):
     os.environ["OPENAI_API_KEY"] = openai_apikey
     client = OpenAI()
-    gpt_model = "gpt-4o"  # "o1-preview"
     max_num_chars = 25000
-    return client, gpt_model, max_num_chars
+    return client, max_num_chars
 
 
 def create_gpt_messages(query, run_on_full_text):
@@ -26,12 +25,20 @@ def create_gpt_messages(query, run_on_full_text):
 
 
 def chat_gpt_query(gpt_client, gpt_model, resp_fmt, msgs):
-    response = gpt_client.chat.completions.create(
-        model=gpt_model,
-        temperature=0,
-        response_format={"type": resp_fmt},
-        messages=msgs,
-    )
+    """
+    Sends msgs to the chat-completions endpoint and returns the first choice's text.
+
+    temperature=0 is only sent for "gpt-4.1"; every other model runs at its default
+    (presumably because the o-series models reject a custom temperature - confirm).
+    """
+    request_kwargs = {
+        "model": gpt_model,
+        "response_format": {"type": resp_fmt},
+        "messages": msgs,
+    }
+    if gpt_model == "gpt-4.1":
+        request_kwargs["temperature"] = 0
+    response = gpt_client.chat.completions.create(**request_kwargs)
     return response.choices[0].message.content
 
 
@@ -53,7 +60,7 @@ def query_gpt_for_variable_specification(
     relevant_texts,
     run_on_full_text,
     gpt_client,
-    gpt_model,
+    gpt_model="o4-mini",
 ):
     query_template = gpt_analyzer.main_query
     excerpts = "\n".join(relevant_texts)

diff --git a/results.docx b/results.docx