From 92e05f53c63b77c0ccaf6569b8aa9a37363f94d2 Mon Sep 17 00:00:00 2001 From: Matheus Buniotto Date: Mon, 2 Feb 2026 21:58:47 -0300 Subject: [PATCH] fix image input --- llm_council_tool/llm_council.py | 53 +++++++++++++-- llm_council_tool/llm_council_pt.py | 52 +++++++++++++-- llm_council_tool/test_council_mock.py | 93 ++++++++++++++++++++++++--- 3 files changed, 178 insertions(+), 20 deletions(-) diff --git a/llm_council_tool/llm_council.py b/llm_council_tool/llm_council.py index 31e424d..c050b83 100644 --- a/llm_council_tool/llm_council.py +++ b/llm_council_tool/llm_council.py @@ -9,7 +9,7 @@ import os import asyncio import re -from typing import List, Dict, Any, Tuple, Optional +from typing import List, Dict, Any, Tuple, Optional, Union from pydantic import BaseModel, Field import requests @@ -192,6 +192,42 @@ async def _query_model_async( ) return model, result + def _normalize_topic_to_content( + self, topic: Union[str, List[Dict[str, Any]]] + ) -> Union[str, List[Dict[str, Any]]]: + """ + Normalize topic (string or OpenWebUI multimodal content) for API message content. + Returns content as-is for API: str or list of parts (text + image_url). + """ + if isinstance(topic, str): + return topic + if isinstance(topic, list): + return topic if topic else "" + # Fallback: coerce to string (e.g. unexpected type) + return str(topic) if topic is not None else "" + + def _topic_to_text(self, topic: Union[str, List[Dict[str, Any]]]) -> str: + """ + Extract a plain-text representation of topic for use in prompts (ranking, chairman). + Handles string input and multimodal content (text + image parts). + """ + if topic is None: + return "" + if isinstance(topic, str): + return topic + if not isinstance(topic, list): + return str(topic) + parts = [] + for item in topic: + if not isinstance(item, dict): + continue + kind = item.get("type") + if kind == "text": + parts.append(item.get("text", "")) + elif kind == "image_url": + parts.append("[Image attached]") + return " ".join(parts).strip() or "[No text content]" + def _parse_ranking_from_text(self, ranking_text: str) -> List[str]: """ Extracts the ranking list from the model's text response. @@ -241,7 +277,7 @@ def _get_available_models(self, api_key: str, base_url: str) -> List[str]: async def consult_council( self, - topic: str, + topic: Union[str, List[Dict[str, Any]]], __user__: Optional[dict] = None, __event_emitter__: Any = None, ) -> str: @@ -250,7 +286,14 @@ async def consult_council( 1. Council provides individual responses. 2. Council ranks peer responses. 3. Chairperson synthesizes the final answer. + + topic: User input as plain text (str) or OpenWebUI multimodal content (list of + parts with "type": "text" and/or "type": "image_url"). Supports images. """ + # Normalize topic for API (preserve images) and for text-only prompts + user_content = self._normalize_topic_to_content(topic) + topic_text = self._topic_to_text(topic) + # Resolve API key and base URL (try OpenWebUI first, then fallback) api_key = self._resolve_api_key(__user__) base_url = self._resolve_base_url() @@ -358,7 +401,7 @@ async def consult_council( False, ) - stage1_messages = [{"role": "user", "content": topic}] + stage1_messages = [{"role": "user", "content": user_content}] tasks = [ self._query_model_async(model, stage1_messages, api_key, base_url) for model in council_models_list @@ -406,7 +449,7 @@ async def consult_council( ranking_prompt = f"""You are evaluating different responses to the following question: -Question: {topic} +Question: {topic_text} Here are the responses from different models (anonymized): @@ -466,7 +509,7 @@ async def consult_council( chairman_prompt = f"""You are the Chairperson of an LLM Council. -Original Question: {topic} +Original Question: {topic_text} STAGE 1 - Individual Responses: {stage1_summary} diff --git a/llm_council_tool/llm_council_pt.py b/llm_council_tool/llm_council_pt.py index e879d72..2f4d678 100644 --- a/llm_council_tool/llm_council_pt.py +++ b/llm_council_tool/llm_council_pt.py @@ -9,7 +9,7 @@ import os import asyncio import re -from typing import List, Dict, Any, Tuple, Optional +from typing import List, Dict, Any, Tuple, Optional, Union from pydantic import BaseModel, Field import requests @@ -190,6 +190,41 @@ async def _query_model_async( ) return model, result + def _normalize_topic_to_content( + self, topic: Union[str, List[Dict[str, Any]]] + ) -> Union[str, List[Dict[str, Any]]]: + """ + Normaliza o topico (string ou conteudo multimodal do OpenWebUI) para o content da mensagem da API. + Retorna o content como esta para a API: str ou lista de partes (texto + image_url). + """ + if isinstance(topic, str): + return topic + if isinstance(topic, list): + return topic if topic else "" + return str(topic) if topic is not None else "" + + def _topic_to_text(self, topic: Union[str, List[Dict[str, Any]]]) -> str: + """ + Extrai representacao em texto puro do topico para uso em prompts (ranking, presidente). + Aceita string e conteudo multimodal (texto + imagens). + """ + if topic is None: + return "" + if isinstance(topic, str): + return topic + if not isinstance(topic, list): + return str(topic) + parts = [] + for item in topic: + if not isinstance(item, dict): + continue + kind = item.get("type") + if kind == "text": + parts.append(item.get("text", "")) + elif kind == "image_url": + parts.append("[Imagem anexada]") + return " ".join(parts).strip() or "[Sem conteudo de texto]" + def _parse_ranking_from_text(self, ranking_text: str) -> List[str]: """ Extrai a lista de ranking da resposta de texto do modelo. @@ -235,7 +270,7 @@ def _get_available_models(self, api_key: str, base_url: str) -> List[str]: async def consultar_conselho( self, - topico: str, + topico: Union[str, List[Dict[str, Any]]], __user__: Optional[dict] = None, __event_emitter__: Any = None, ) -> str: @@ -244,7 +279,14 @@ async def consultar_conselho( 1. Conselho fornece respostas individuais. 2. Conselho classifica as respostas dos pares. 3. Presidente sintetiza a resposta final. + + topico: Entrada do usuario como texto (str) ou conteudo multimodal do OpenWebUI + (lista de partes com "type": "text" e/ou "type": "image_url"). Suporta imagens. """ + # Normaliza topico para API (preserva imagens) e para prompts so texto + user_content = self._normalize_topic_to_content(topico) + topic_text = self._topic_to_text(topico) + # Resolve chave de API e URL base (tenta OpenWebUI primeiro, depois fallback) api_key = self._resolve_api_key(__user__) base_url = self._resolve_base_url() @@ -346,7 +388,7 @@ async def consultar_conselho( False, ) - stage1_messages = [{"role": "user", "content": topico}] + stage1_messages = [{"role": "user", "content": user_content}] tasks = [ self._query_model_async(model, stage1_messages, api_key, base_url) for model in council_models_list @@ -391,7 +433,7 @@ async def consultar_conselho( ranking_prompt = f"""Voce esta avaliando diferentes respostas para a seguinte pergunta: -Pergunta: {topico} +Pergunta: {topic_text} Aqui estao as respostas de diferentes modelos (anonimizadas): @@ -452,7 +494,7 @@ async def consultar_conselho( chairman_prompt = f"""Voce e o Presidente de um Conselho de LLMs. -Pergunta Original: {topico} +Pergunta Original: {topic_text} ETAPA 1 - Respostas Individuais: {stage1_summary} diff --git a/llm_council_tool/test_council_mock.py b/llm_council_tool/test_council_mock.py index 3a9ee25..23b2ad7 100644 --- a/llm_council_tool/test_council_mock.py +++ b/llm_council_tool/test_council_mock.py @@ -25,7 +25,7 @@ ] } -def mock_requests_get(url, headers, timeout): +def mock_requests_get(url, *args, **kwargs): mock_resp = MagicMock() if "/models" in url: mock_resp.status_code = 200 @@ -33,16 +33,32 @@ def mock_requests_get(url, headers, timeout): return mock_resp return mock_resp +def _content_to_str(content): + """Extract a single string from message content (str or multimodal list).""" + if content is None: + return "" + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + parts.append(item.get("text", "")) + return " ".join(parts) + return str(content) + + def mock_requests_post(url, headers, json, timeout): mock_resp = MagicMock() mock_resp.status_code = 200 - + model = json.get("model") messages = json.get("messages", []) - last_msg = messages[-1]["content"] if messages else "" - + raw_content = messages[-1]["content"] if messages else "" + last_msg = _content_to_str(raw_content) + content = "" - + # Simple heuristic to determine stage if "FINAL RANKING:" in last_msg: content = MOCK_STAGE_2_RANKINGS.get(model, "FINAL RANKING:\n1. Response A") @@ -75,19 +91,76 @@ async def async_emitter(x): print(f"\nResult: {result}") - assert result == MOCK_CHAIRMAN_RESPONSE + assert MOCK_CHAIRMAN_RESPONSE in result + assert "Stage 1" in result and "Stage 3" in result # Verification: - # 1. GET /models called once - mock_get.assert_called_once() + # 1. GET was used (base URL probe + /models, or just /models) + assert mock_get.called # 2. Check that 'invalid-model' was NOT queried in Stage 1 # Extract all models called in POST requests called_models = [call.kwargs['json']['model'] for call in mock_post.mock_calls] assert "invalid-model" not in called_models assert "llama3:latest" in called_models - assert "gpt-4o" in called_models - + assert "gpt-4o" in called_models + + +@pytest.mark.asyncio +async def test_consult_council_with_image_input(): + """Council tool accepts multimodal input (text + image) without breaking.""" + tools = Tools() + tools.valves.council_models = "llama3:latest,gpt-4o" + tools.valves.chairperson_model = "gpt-4o" + tools.valves.openwebui_api_key = "test-key" + + mock_emitter = MagicMock() + async def async_emitter(x): + mock_emitter(x) + + # Simulate OpenWebUI passing content with an image (list of parts) + multimodal_topic = [ + {"type": "text", "text": "What is in this image?"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc123"}}, + ] + + with patch("requests.post", side_effect=mock_requests_post) as mock_post, \ + patch("requests.get", side_effect=mock_requests_get) as mock_get: + result = await tools.consult_council(multimodal_topic, __event_emitter__=async_emitter) + assert result + # Stage 1 requests must send the list content (with image) to the API + stage1_calls = [c for c in mock_post.mock_calls if c.kwargs.get("json") and "FINAL RANKING:" not in _content_to_str((c.kwargs["json"].get("messages") or [{}])[-1].get("content", "")) and "Chairperson" not in _content_to_str((c.kwargs["json"].get("messages") or [{}])[-1].get("content", ""))] + assert stage1_calls, "Expected at least one Stage 1 request" + first_msg_content = stage1_calls[0].kwargs["json"]["messages"][0]["content"] + assert first_msg_content == multimodal_topic + + +@pytest.mark.asyncio +async def test_consult_council_empty_list_topic(): + """Empty list topic is normalized to empty string and does not send '[]' to API.""" + tools = Tools() + tools.valves.council_models = "llama3:latest,gpt-4o" + tools.valves.chairperson_model = "gpt-4o" + tools.valves.openwebui_api_key = "test-key" + mock_emitter = MagicMock() + async def async_emitter(x): + mock_emitter(x) + + with patch("requests.post", side_effect=mock_requests_post) as mock_post, \ + patch("requests.get", side_effect=mock_requests_get) as mock_get: + result = await tools.consult_council([], __event_emitter__=async_emitter) + assert result + # Stage 1 must not send the literal "[]" as content + for call in mock_post.mock_calls: + if not call.kwargs.get("json"): + continue + messages = call.kwargs["json"].get("messages", []) + if not messages: + continue + content = messages[0].get("content", "") + assert content != "[]", "Empty list should normalize to '' not '[]'" + + if __name__ == "__main__": # Manually running the async test if executed as script loop = asyncio.new_event_loop()