From 99e11895f6d87009a6629a174bd0d67a4e44febe Mon Sep 17 00:00:00 2001 From: Sean Pedrick-Case Date: Wed, 6 May 2026 12:38:06 +0100 Subject: [PATCH 1/7] Downgrade gradio version to 6.10.0 to avoid tab switch freezing --- pyproject.toml | 2 +- requirements.txt | 4 ++-- requirements_cpu.txt | 2 +- requirements_gpu.txt | 2 +- requirements_lightweight.txt | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0d543b9..37fe2f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ classifiers = [ ] dependencies = [ - "gradio<=6.12.0", + "gradio<=6.10.0", "transformers<=5.3.0", "spaces<=0.48.1", "boto3<=1.42.80", diff --git a/requirements.txt b/requirements.txt index fd6715a..ec822cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Note that this requirements file is optimised for Hugging Face spaces / Python 3.10. Please use requirements_no_local.txt for installation without local model inference (simplest approach to get going). Please use requirements_cpu.txt for CPU instances and requirements_gpu.txt for GPU instances using Python 3.11 -gradio<=6.12.0 +gradio<=6.10.0 transformers<=5.30.0 spaces==0.48.1 boto3<=1.42.80 @@ -13,7 +13,7 @@ google-genai<=1.73.0 openai<=2.31.0 html5lib<=1.1 beautifulsoup4<=4.14.3 -rapidfuzz<=3.14.3 +rapidfuzz<=3.14.5 python-dotenv<=1.2.2 # GPU (for huggingface instance) # Torch/Unsloth and llama-cpp-python diff --git a/requirements_cpu.txt b/requirements_cpu.txt index cb83692..ca30c2b 100644 --- a/requirements_cpu.txt +++ b/requirements_cpu.txt @@ -1,4 +1,4 @@ -gradio<=6.12.0 +gradio<=6.10.0 transformers<=5.3.0 spaces<=0.48.1 pandas<=2.3.3 diff --git a/requirements_gpu.txt b/requirements_gpu.txt index c362d19..d391146 100644 --- a/requirements_gpu.txt +++ b/requirements_gpu.txt @@ -1,4 +1,4 @@ -gradio<=6.12.0 +gradio<=6.10.0 transformers<=5.3.0 spaces<=0.48.1 pandas<=2.3.3 diff --git a/requirements_lightweight.txt b/requirements_lightweight.txt index 471891c..1ac2eb9 100644 --- a/requirements_lightweight.txt +++ b/requirements_lightweight.txt @@ -1,5 +1,5 @@ # This requirements file is optimised for AWS ECS using Python 3.11 alongside the Dockerfile, without local torch and llama-cpp-python. For AWS ECS, torch and llama-cpp-python are optionally installed in the main Dockerfile -gradio<=6.12.0 +gradio<=6.10.0 transformers<=5.3.0 spaces<=0.48.1 boto3<=1.42.80 From cab5feb345e2c433a6ff67dfa1de9dfba557dbf8 Mon Sep 17 00:00:00 2001 From: Sean Pedrick-Case Date: Thu, 7 May 2026 13:10:49 +0100 Subject: [PATCH 2/7] Minor formatting and component/tab visibility adjustments --- app.py | 170 +++++++++++++++++++------------------- tools/helper_functions.py | 2 +- 2 files changed, 86 insertions(+), 86 deletions(-) diff --git a/app.py b/app.py index 61302c6..dff1808 100644 --- a/app.py +++ b/app.py @@ -201,7 +201,7 @@ else: context_textbox = gr.Textbox( label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')", - visible="hidden", + visible=False, ) topic_extraction_output_files_xlsx = gr.File( label="Overall summary xlsx file. CSV outputs are available on the 'Advanced' tab.", @@ -257,7 +257,7 @@ # Create the gradio interface app = gr.Blocks( - fill_width=True, + fill_width=False, analytics_enabled=False, title="LLM topic modelling", delete_cache=(43200, 43200), @@ -270,42 +270,42 @@ ### # Workaround for Gradio 6 issue where 'hidden' element are still sometimes visible as a thing line in the UI - with gr.Accordion(visible="hidden", elem_classes="hidden_component", open=False): + with gr.Accordion(visible=False, elem_classes="hidden_component", open=False): text_output_file_list_state = gr.Dropdown( list(), allow_custom_value=True, - visible="hidden", + visible=False, label="text_output_file_list_state", elem_classes="hidden_component", ) text_output_modify_file_list_state = gr.Dropdown( list(), allow_custom_value=True, - visible="hidden", + visible=False, label="text_output_modify_file_list_state", elem_classes="hidden_component", ) log_files_output_list_state = gr.Dropdown( list(), allow_custom_value=True, - visible="hidden", + visible=False, label="log_files_output_list_state", elem_classes="hidden_component", ) first_loop_state = gr.Checkbox( - True, visible="hidden", elem_classes="hidden_component" + True, visible=False, elem_classes="hidden_component" ) second_loop_state = gr.Checkbox( - False, visible="hidden", elem_classes="hidden_component" + False, visible=False, elem_classes="hidden_component" ) modified_unique_table_change_bool = gr.Checkbox( - True, visible="hidden", elem_classes="hidden_component" + True, visible=False, elem_classes="hidden_component" ) # This boolean is used to flag whether a file upload should change just the modified unique table object on the second tab file_data_state = gr.Dataframe( value=pd.DataFrame(), label="file_data_state", - visible="hidden", + visible=False, type="pandas", interactive=True, elem_classes="hidden_component", @@ -313,14 +313,14 @@ master_topic_df_state = gr.Dataframe( value=pd.DataFrame(), label="master_topic_df_state", - visible="hidden", + visible=False, type="pandas", interactive=True, ) master_unique_topics_df_state = gr.Dataframe( value=pd.DataFrame(), label="master_unique_topics_df_state", - visible="hidden", + visible=False, type="pandas", interactive=True, elem_classes="hidden_component", @@ -328,7 +328,7 @@ master_reference_df_state = gr.Dataframe( value=pd.DataFrame(), label="master_reference_df_state", - visible="hidden", + visible=False, type="pandas", interactive=True, elem_classes="hidden_component", @@ -336,7 +336,7 @@ missing_df_state = gr.Dataframe( value=pd.DataFrame(), label="missing_df_state", - visible="hidden", + visible=False, type="pandas", interactive=True, elem_classes="hidden_component", @@ -345,7 +345,7 @@ master_modify_unique_topics_df_state = gr.Dataframe( value=pd.DataFrame(), label="master_modify_unique_topics_df_state", - visible="hidden", + visible=False, type="pandas", interactive=True, elem_classes="hidden_component", @@ -353,7 +353,7 @@ master_modify_reference_df_state = gr.Dataframe( value=pd.DataFrame(), label="master_modify_reference_df_state", - visible="hidden", + visible=False, type="pandas", interactive=True, elem_classes="hidden_component", @@ -364,40 +364,40 @@ value="", label="Query metadata - usage counts and other parameters", lines=8, - visible="hidden", + visible=False, elem_classes="hidden_component", ) - session_hash_state = gr.Textbox(visible="hidden", value=HOST_NAME) + session_hash_state = gr.Textbox(visible=False, value=HOST_NAME) output_folder_state = gr.Textbox( - visible="hidden", value=OUTPUT_FOLDER, elem_classes="hidden_component" + visible=False, value=OUTPUT_FOLDER, elem_classes="hidden_component" ) input_folder_state = gr.Textbox( - visible="hidden", value=INPUT_FOLDER, elem_classes="hidden_component" + visible=False, value=INPUT_FOLDER, elem_classes="hidden_component" ) # s3 bucket name s3_default_bucket = gr.Textbox( label="Default S3 bucket", value=S3_LOG_BUCKET, - visible="hidden", + visible=False, elem_classes="hidden_component", ) s3_log_bucket_name = gr.Textbox( - visible="hidden", value=S3_LOG_BUCKET, elem_classes="hidden_component" + visible=False, value=S3_LOG_BUCKET, elem_classes="hidden_component" ) # S3 output settings s3_output_folder_state = gr.Textbox( label="s3_output_folder_state", value=S3_OUTPUTS_FOLDER, - visible="hidden", + visible=False, elem_classes="hidden_component", ) save_outputs_to_s3_checkbox = gr.Checkbox( label="save_outputs_to_s3_checkbox", value=convert_string_to_boolean(SAVE_OUTPUTS_TO_S3), - visible="hidden", + visible=False, elem_classes="hidden_component", ) @@ -405,37 +405,37 @@ access_logs_state = gr.Textbox( label="access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, - visible="hidden", + visible=False, elem_classes="hidden_component", ) access_s3_logs_loc_state = gr.Textbox( label="access_s3_logs_loc_state", value=S3_ACCESS_LOGS_FOLDER, - visible="hidden", + visible=False, elem_classes="hidden_component", ) feedback_logs_state = gr.Textbox( label="feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME, - visible="hidden", + visible=False, elem_classes="hidden_component", ) feedback_s3_logs_loc_state = gr.Textbox( label="feedback_s3_logs_loc_state", value=S3_FEEDBACK_LOGS_FOLDER, - visible="hidden", + visible=False, elem_classes="hidden_component", ) usage_logs_state = gr.Textbox( label="usage_logs_state", value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME, - visible="hidden", + visible=False, elem_classes="hidden_component", ) usage_s3_logs_loc_state = gr.Textbox( label="usage_s3_logs_loc_state", value=S3_USAGE_LOGS_FOLDER, - visible="hidden", + visible=False, elem_classes="hidden_component", ) @@ -443,7 +443,7 @@ logged_content_df = gr.Dataframe( label="logged_content_df", value=pd.DataFrame(), - visible="hidden", + visible=False, type="pandas", elem_classes="hidden_component", ) @@ -451,19 +451,19 @@ # Logging for input / output tokens input_tokens_num = gr.Textbox( "0", - visible="hidden", + visible=False, label="Total input tokens", elem_classes="hidden_component", ) output_tokens_num = gr.Textbox( "0", - visible="hidden", + visible=False, label="Total output tokens", elem_classes="hidden_component", ) number_of_calls_num = gr.Textbox( "0", - visible="hidden", + visible=False, label="Total LLM calls", elem_classes="hidden_component", ) @@ -471,25 +471,25 @@ # Additional UI components for validation max_tokens_num = gr.Number( value=8192, - visible="hidden", + visible=False, label="Max tokens", elem_classes="hidden_component", ) reasoning_suffix_textbox = gr.Textbox( value="", - visible="hidden", + visible=False, label="Reasoning suffix", elem_classes="hidden_component", ) output_debug_files_radio = gr.Radio( value="False", choices=["True", "False"], - visible="hidden", + visible=False, label="Output debug files", ) max_time_for_loop_num = gr.Number( value=99999, - visible="hidden", + visible=False, label="Max time for loop", elem_classes="hidden_component", ) @@ -500,7 +500,7 @@ headers=None, column_count=None, label="summary_reference_table_sample_state", - visible="hidden", + visible=False, type="pandas", elem_classes="hidden_component", ) @@ -509,7 +509,7 @@ headers=None, column_count=None, label="master_reference_df_revised_summaries_state", - visible="hidden", + visible=False, type="pandas", elem_classes="hidden_component", ) @@ -518,7 +518,7 @@ headers=None, column_count=None, label="master_unique_topics_df_revised_summaries_state", - visible="hidden", + visible=False, type="pandas", elem_classes="hidden_component", ) @@ -527,29 +527,29 @@ headers=None, column_count=None, label="summarised_output_df", - visible="hidden", + visible=False, type="pandas", elem_classes="hidden_component", ) summarised_references_markdown = gr.Markdown( - "", visible="hidden", elem_classes="hidden_component" + "", visible=False, elem_classes="hidden_component" ) summarised_outputs_list = gr.Dropdown( value=list(), choices=list(), - visible="hidden", + visible=False, label="List of summarised outputs", allow_custom_value=True, elem_classes="hidden_component", ) latest_summary_completed_num = gr.Number( - 0, visible="hidden", elem_classes="hidden_component" + 0, visible=False, elem_classes="hidden_component" ) summary_xlsx_output_files_list = gr.Dropdown( value=list(), choices=list(), - visible="hidden", + visible=False, label="List of xlsx summary output files", allow_custom_value=True, elem_classes="hidden_component", @@ -558,37 +558,37 @@ original_data_file_name_textbox = gr.Textbox( label="Reference data file name", value="", - visible="hidden", + visible=False, elem_classes="hidden_component", ) working_data_file_name_textbox = gr.Textbox( label="Working data file name", value="", - visible="hidden", + visible=False, elem_classes="hidden_component", ) unique_topics_table_file_name_textbox = gr.Textbox( label="Unique topics data file name textbox", - visible="hidden", + visible=False, elem_classes="hidden_component", ) dummy_consultation_table_textbox = gr.Textbox( value=dummy_consultation_table, - visible="hidden", + visible=False, label="Dummy consultation table", elem_classes="hidden_component", ) case_notes_table_textbox = gr.Textbox( value=case_notes_table, - visible="hidden", + visible=False, label="Case notes table", elem_classes="hidden_component", ) model_name_map_state = gr.JSON( model_name_map, - visible="hidden", + visible=False, label="model_name_map_state", elem_classes="hidden_component", ) @@ -597,25 +597,25 @@ s3_default_cost_codes_file = gr.Textbox( label="Default cost centre file", value=S3_COST_CODES_PATH, - visible="hidden", + visible=False, elem_classes="hidden_component", ) default_cost_codes_output_folder_location = gr.Textbox( label="Output default cost centre location", value=OUTPUT_COST_CODES_PATH, - visible="hidden", + visible=False, elem_classes="hidden_component", ) enforce_cost_code_textbox = gr.Textbox( label="Enforce cost code textbox", value=ENFORCE_COST_CODES, - visible="hidden", + visible=False, elem_classes="hidden_component", ) default_cost_code_textbox = gr.Textbox( label="Default cost code textbox", value=DEFAULT_COST_CODE, - visible="hidden", + visible=False, elem_classes="hidden_component", ) @@ -628,13 +628,13 @@ show_search="filter", wrap=True, max_height=200, - visible="hidden", + visible=False, interactive=True, ) cost_code_dataframe = gr.Dataframe( value=pd.DataFrame(columns=["Cost code", "Description"]), type="pandas", - visible="hidden", + visible=False, wrap=True, interactive=True, elem_classes="hidden_component", @@ -644,7 +644,7 @@ label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, - visible="hidden", + visible=False, elem_classes="hidden_component", ) @@ -652,7 +652,7 @@ value=0, label="Number of files prepared", interactive=False, - visible="hidden", + visible=False, elem_classes="hidden_component", ) # Duplicate version of the above variable for when you don't want to initiate the summarisation loop @@ -660,7 +660,7 @@ value=0, label="Number of files prepared", interactive=False, - visible="hidden", + visible=False, elem_classes="hidden_component", ) @@ -668,7 +668,7 @@ session_hash_textbox = gr.Textbox( label="Session hash", value="", - visible="hidden", + visible=False, elem_classes="hidden_component", ) @@ -676,20 +676,20 @@ label="Estimated time taken (seconds)", value=0.0, precision=1, - visible="hidden", + visible=False, elem_classes="hidden_component", ) # This keeps track of the time taken to redact files for logging purposes. total_number_of_batches = gr.Number( label="Current batch number", value=1, precision=0, - visible="hidden", + visible=False, elem_classes="hidden_component", ) text_output_logs = gr.Textbox( label="Output summary logs", - visible="hidden", + visible=False, elem_classes="hidden_component", ) @@ -906,7 +906,7 @@ def show_info_box_on_click( in_excel_sheets = gr.Dropdown( multiselect=False, label="Select the Excel sheet of interest.", - visible="hidden", + visible=False, allow_custom_value=True, ) in_colnames.render() @@ -929,7 +929,7 @@ def show_info_box_on_click( ) produce_structured_summary_radio.render() - with gr.Accordion("Response sentiment analysis", open=False): + with gr.Accordion("Response sentiment analysis (default is Negative or Positive)", open=False): sentiment_checkbox = gr.Radio( label="Should the model assess the sentiment of responses?", value="Negative or Positive", @@ -983,22 +983,22 @@ def show_info_box_on_click( display_topic_table_markdown.render() data_feedback_title = gr.Markdown( - value="## Please give feedback", visible="hidden" + value="## Please give feedback", visible=False ) data_feedback_radio = gr.Radio( label="Please give some feedback about the results of the topic extraction.", choices=["The results were good", "The results were not good"], - visible="hidden", + visible=False, ) data_further_details_text = gr.Textbox( label="Please give more detailed feedback about the results:", - visible="hidden", + visible=False, ) - data_submit_feedback_btn = gr.Button(value="Submit feedback", visible="hidden") + data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False) with gr.Row(): s3_logs_output_textbox = gr.Textbox( - label="Feedback submission logs", visible="hidden" + label="Feedback submission logs", visible=False ) with gr.Tab(label="Advanced - Step by step topic extraction and summarisation"): @@ -1014,7 +1014,7 @@ def show_info_box_on_click( ) else: additional_summary_instructions_textbox = gr.Textbox( - value="", visible="hidden", label="Additional summary instructions" + value="", visible=False, label="Additional summary instructions" ) extract_topics_btn = gr.Button("1. Extract topics", variant="secondary") @@ -1050,12 +1050,12 @@ def show_info_box_on_click( label="Provide response data to validation process", value="Yes", choices=["Yes", "No"], - visible="hidden", + visible=False, scale=1, ) additional_validation_issues_textbox = gr.Textbox( value="", - visible="hidden", + visible=False, label="Additional validation issues for the model to consider (bullet-point list)", scale=3, ) @@ -1101,7 +1101,7 @@ def show_info_box_on_click( file_types=[".xlsx", ".xls", ".csv", ".parquet"], ) deduplication_input_files_status = gr.Textbox( - value="", label="Previous file input", visible="hidden" + value="", label="Previous file input", visible=False ) with gr.Row(): @@ -1206,7 +1206,7 @@ def show_info_box_on_click( two_para_summary_format_prompt, single_para_summary_format_prompt, ], - visible="hidden", + visible=False, ) # This is currently an invisible placeholder in case in future I want to add in overall summarisation customisation overall_summarise_previous_data_btn = gr.Button( @@ -1231,7 +1231,7 @@ def show_info_box_on_click( value="### Overall summary will appear here" ) - with gr.Tab(label="Topic table viewer", visible="hidden"): + with gr.Tab(label="Review outputs", visible=True): with gr.Accordion( "View LLM log files containing prompts and responses", open=True ): @@ -1307,7 +1307,7 @@ def show_info_box_on_click( value="", label="View table (legacy)", buttons=["copy"], visible=False ) - with gr.Tab(label="Continue unfinished topic extraction", visible="hidden"): + with gr.Tab(label="Continue unfinished topic extraction", visible=False): gr.Markdown( """### Load in output files from a previous topic extraction process and continue topic extraction with new data.""" ) @@ -1350,7 +1350,7 @@ def show_info_box_on_click( maximum=1000, ) random_seed = gr.Number( - value=LLM_SEED, label="Random seed for LLM generation", visible="hidden" + value=LLM_SEED, label="Random seed for LLM generation", visible=False ) with gr.Accordion("AWS API keys", open=False): @@ -1427,13 +1427,13 @@ def show_info_box_on_click( lines=8, ) - with gr.Accordion("Prompt settings", open=False, visible="hidden"): + with gr.Accordion("Prompt settings", open=False, visible=False): number_of_prompts = gr.Number( value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3, - visible="hidden", + visible=False, ) system_prompt_textbox = gr.Textbox( label="Initial system prompt", lines=4, value=system_prompt @@ -1476,7 +1476,7 @@ def show_info_box_on_click( ) with gr.Accordion( - "Export output files to xlsx format", open=False, visible="hidden" + "Export output files to xlsx format", open=False, visible=False ): export_xlsx_btn = gr.Button( "Export output files to xlsx format", variant="primary" @@ -2556,9 +2556,9 @@ def update_on_filter_change( ### reference_df_data_file_name_textbox = gr.Textbox( - label="reference_df_data_file_name_textbox", visible="hidden" + label="reference_df_data_file_name_textbox", visible=False ) - master_reference_df_state_joined = gr.Dataframe(visible="hidden") + master_reference_df_state_joined = gr.Dataframe(visible=False) join_cols_btn.click( fn=load_in_previous_reference_file, diff --git a/tools/helper_functions.py b/tools/helper_functions.py index a7dbca0..6b06c0a 100644 --- a/tools/helper_functions.py +++ b/tools/helper_functions.py @@ -730,7 +730,7 @@ def count_responses(ref_str): # Wrap text in each column to the specified max width, including whole words -def wrap_text(text: str, max_width=80, max_text_length=None): +def wrap_text(text: str, max_width=60, max_text_length=None): if not isinstance(text, str): return text From 9032469d8d0a2024d2303d4aac74892e287a8f78 Mon Sep 17 00:00:00 2001 From: Sean Pedrick-Case Date: Thu, 7 May 2026 13:17:16 +0100 Subject: [PATCH 3/7] Version bump, lint check, gradio version correction in readme --- README.md | 6 +++--- app.py | 4 +++- pyproject.toml | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a9ef51b..c3dc6b5 100644 --- a/README.md +++ b/README.md @@ -4,16 +4,16 @@ emoji: 📚 colorFrom: purple colorTo: yellow sdk: gradio -sdk_version: 6.12.0 +sdk_version: 6.10.0 app_file: app.py pinned: true license: agpl-3.0 -short_description: Create thematic summaries for open text data with LLMs +short_description: Extract topics and create thematic summaries for open text data with LLMs --- # Large language model topic modelling -Version: 0.10.0 +Version: 0.10.1 Extract topics and summarise outputs using Large Language Models (LLMs), either local, Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets on the main app page, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API). diff --git a/app.py b/app.py index dff1808..c29032f 100644 --- a/app.py +++ b/app.py @@ -929,7 +929,9 @@ def show_info_box_on_click( ) produce_structured_summary_radio.render() - with gr.Accordion("Response sentiment analysis (default is Negative or Positive)", open=False): + with gr.Accordion( + "Response sentiment analysis (default is Negative or Positive)", open=False + ): sentiment_checkbox = gr.Radio( label="Should the model assess the sentiment of responses?", value="Negative or Positive", diff --git a/pyproject.toml b/pyproject.toml index 37fe2f0..8ad20fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llm_topic_modelling" -version = "0.10.0" +version = "0.10.1" description = "Generate thematic summaries from open text in tabular data files with a large language model." requires-python = ">=3.10" readme = "README.md" From 6fede69a640fae13f99ca47f61550a67ca2c4937 Mon Sep 17 00:00:00 2001 From: Sean Pedrick-Case Date: Thu, 7 May 2026 13:29:24 +0100 Subject: [PATCH 4/7] Aligned pyproject.toml gradio mcp version with others (6.10.0) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8ad20fc..09f69dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,7 +94,7 @@ llamacpp = [ # Run Gradio as an mcp server mcp = [ - "gradio[mcp]<=6.12.0" + "gradio[mcp]<=6.10.0" ] [project.urls] From fe6b374ee2897477cf8b673d10d289cfac056cfa Mon Sep 17 00:00:00 2001 From: Sean Pedrick-Case Date: Thu, 7 May 2026 13:36:39 +0100 Subject: [PATCH 5/7] Shortened HF space description --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c3dc6b5..2f9847c 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ sdk_version: 6.10.0 app_file: app.py pinned: true license: agpl-3.0 -short_description: Extract topics and create thematic summaries for open text data with LLMs +short_description: Extract topics from open text data with LLMs --- # Large language model topic modelling From 262aee0309158f0ff4135a14aa9a5781e4ef63d9 Mon Sep 17 00:00:00 2001 From: Sean Pedrick-Case Date: Thu, 7 May 2026 13:41:21 +0100 Subject: [PATCH 6/7] Aligned spaces package requirement across all relevant files --- pyproject.toml | 2 +- requirements.txt | 2 +- requirements_cpu.txt | 2 +- requirements_gpu.txt | 2 +- requirements_lightweight.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 09f69dd..d2cf1e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ classifiers = [ dependencies = [ "gradio<=6.10.0", "transformers<=5.3.0", - "spaces<=0.48.1", + "spaces<=0.49.0", "boto3<=1.42.80", "pandas<=2.3.3", "pyarrow<=23.0.1", diff --git a/requirements.txt b/requirements.txt index ec822cd..f0079ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Note that this requirements file is optimised for Hugging Face spaces / Python 3.10. Please use requirements_no_local.txt for installation without local model inference (simplest approach to get going). Please use requirements_cpu.txt for CPU instances and requirements_gpu.txt for GPU instances using Python 3.11 gradio<=6.10.0 transformers<=5.30.0 -spaces==0.48.1 +spaces==0.49.0 boto3<=1.42.80 pandas<=2.3.3 pyarrow<=23.0.1 diff --git a/requirements_cpu.txt b/requirements_cpu.txt index ca30c2b..3cf1560 100644 --- a/requirements_cpu.txt +++ b/requirements_cpu.txt @@ -1,6 +1,6 @@ gradio<=6.10.0 transformers<=5.3.0 -spaces<=0.48.1 +spaces<=0.49.0 pandas<=2.3.3 boto3<=1.42.80 pyarrow<=23.0.1 diff --git a/requirements_gpu.txt b/requirements_gpu.txt index d391146..804a6ae 100644 --- a/requirements_gpu.txt +++ b/requirements_gpu.txt @@ -1,6 +1,6 @@ gradio<=6.10.0 transformers<=5.3.0 -spaces<=0.48.1 +spaces<=0.49.0 pandas<=2.3.3 boto3<=1.42.80 pyarrow<=23.0.1 diff --git a/requirements_lightweight.txt b/requirements_lightweight.txt index 1ac2eb9..5ff34c5 100644 --- a/requirements_lightweight.txt +++ b/requirements_lightweight.txt @@ -1,7 +1,7 @@ # This requirements file is optimised for AWS ECS using Python 3.11 alongside the Dockerfile, without local torch and llama-cpp-python. For AWS ECS, torch and llama-cpp-python are optionally installed in the main Dockerfile gradio<=6.10.0 transformers<=5.3.0 -spaces<=0.48.1 +spaces<=0.49.0 boto3<=1.42.80 pandas<=2.3.3 pyarrow<=23.0.1 From 3bf8b9fb0863e53ed9db86a54c24a650a03e9d26 Mon Sep 17 00:00:00 2001 From: Sean Pedrick-Case Date: Fri, 29 May 2026 16:11:43 +0100 Subject: [PATCH 7/7] Fix on mismatched tables where batch size was 1 --- tools/helper_functions.py | 2 +- tools/llm_api_call.py | 82 ++++++++++++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/tools/helper_functions.py b/tools/helper_functions.py index 6b06c0a..6395587 100644 --- a/tools/helper_functions.py +++ b/tools/helper_functions.py @@ -730,7 +730,7 @@ def count_responses(ref_str): # Wrap text in each column to the specified max width, including whole words -def wrap_text(text: str, max_width=60, max_text_length=None): +def wrap_text(text: str, max_width=50, max_text_length=None): if not isinstance(text, str): return text diff --git a/tools/llm_api_call.py b/tools/llm_api_call.py index d152d6c..7fa8580 100644 --- a/tools/llm_api_call.py +++ b/tools/llm_api_call.py @@ -2005,6 +2005,33 @@ def convert_to_html_table(input_string: str, table_type: str = "Main table"): return html_table +def _normalize_parsed_table_column_name(name: object) -> str: + return str(name).lower().strip().replace("_", " ") + + +def _find_parsed_table_column(df: pd.DataFrame, standard_name: str) -> str | None: + target = _normalize_parsed_table_column_name(standard_name) + for col in df.columns: + if _normalize_parsed_table_column_name(col) == target: + return col + return None + + +def _four_column_table_has_sentiment(df: pd.DataFrame) -> bool: + """Distinguish 4-column tables with Sentiment (batch_size==1) from those without.""" + if _find_parsed_table_column(df, "Sentiment") is not None: + return True + if _find_parsed_table_column(df, "Response References") is not None: + return False + if df.shape[1] < 3: + return False + col2 = df.iloc[:, 2].astype(str).str.strip().str.lower() + sentiment_values = {"negative", "neutral", "positive", "not assessed"} + if col2.empty: + return False + return col2.isin(sentiment_values).mean() >= 0.5 + + def convert_response_text_to_dataframe( response_text: str, table_type: str = "Main table" ): @@ -2324,20 +2351,47 @@ def write_llm_output_and_logs( topic_with_response_df = topic_with_response_df.rename(columns=new_column_names) elif topic_with_response_df.shape[1] == 4: - # Handle 4-column case (missing Sentiment column) - # Rename all 4 columns first - new_column_names = { - topic_with_response_df.columns[0]: "General topic", - topic_with_response_df.columns[1]: "Subtopic", - topic_with_response_df.columns[2]: "Response References", - topic_with_response_df.columns[3]: "Summary", - } - topic_with_response_df = topic_with_response_df.rename(columns=new_column_names) - # Add missing Sentiment column - topic_with_response_df["Sentiment"] = pd.Series( - ["Not assessed"] * len(topic_with_response_df), dtype=str - ) - # Reorder columns to match expected format + if _four_column_table_has_sentiment(topic_with_response_df): + # batch_size==1: General topic, Subtopic, Sentiment, Summary (no Response References) + rename_map = {} + for standard_name in [ + "General topic", + "Subtopic", + "Sentiment", + "Summary", + ]: + found_col = _find_parsed_table_column( + topic_with_response_df, standard_name + ) + if found_col is not None: + rename_map[found_col] = standard_name + if len(rename_map) < 4: + rename_map.update( + { + topic_with_response_df.columns[0]: "General topic", + topic_with_response_df.columns[1]: "Subtopic", + topic_with_response_df.columns[2]: "Sentiment", + topic_with_response_df.columns[3]: "Summary", + } + ) + topic_with_response_df = topic_with_response_df.rename(columns=rename_map) + topic_with_response_df["Response References"] = pd.Series( + ["1"] * len(topic_with_response_df), dtype=str + ) + else: + # 4-column case without Sentiment (Response References present instead) + new_column_names = { + topic_with_response_df.columns[0]: "General topic", + topic_with_response_df.columns[1]: "Subtopic", + topic_with_response_df.columns[2]: "Response References", + topic_with_response_df.columns[3]: "Summary", + } + topic_with_response_df = topic_with_response_df.rename( + columns=new_column_names + ) + topic_with_response_df["Sentiment"] = pd.Series( + ["Not assessed"] * len(topic_with_response_df), dtype=str + ) topic_with_response_df = topic_with_response_df[ ["General topic", "Subtopic", "Sentiment", "Response References", "Summary"] ]