From 89dde54a0669b2ff173f8105cb2337dcec7ee6df Mon Sep 17 00:00:00 2001 From: camilleAND Date: Thu, 23 Apr 2026 16:23:10 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(conversation)=20summarize=20messages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a summarization of last messages every n turns to optimize context --- CHANGELOG.md | 1 + docs/attachments.md | 29 +- docs/env.md | 4 + src/backend/chat/agents/history_processors.py | 338 +++++++++++++++++ src/backend/chat/clients/pydantic_ai.py | 111 +++++- src/backend/chat/document_context_builder.py | 3 +- src/backend/chat/llm_configuration.py | 16 +- .../0008_chatconversation_history_summary.py | 19 + ...conversation_history_summary_checkpoint.py | 18 + src/backend/chat/models.py | 9 + .../tests/agents/test_history_processors.py | 350 ++++++++++++++++++ .../test_document_context_window.py | 20 +- .../test_thinking_history_stripping.py | 6 +- .../tests/test_document_context_builder.py | 3 +- src/backend/conversations/settings.py | 18 +- .../features/chat/components/MessageItem.tsx | 23 +- 16 files changed, 920 insertions(+), 48 deletions(-) create mode 100644 src/backend/chat/agents/history_processors.py create mode 100644 src/backend/chat/migrations/0008_chatconversation_history_summary.py create mode 100644 src/backend/chat/migrations/0009_chatconversation_history_summary_checkpoint.py create mode 100644 src/backend/chat/tests/agents/test_history_processors.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 247e9e76..d3c9b507 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ and this project adheres to - 🐛(fix) add prevent_url_hallucination instruction to ConversationAgent - ✨(projects) handle project files for RAG search - ✨(banner) configurable banner with level, title, content and start/end +- ✨(conversation) summarize messages ### Changed diff --git a/docs/attachments.md b/docs/attachments.md index 1dd15eb0..a449a29b 100644 --- a/docs/attachments.md +++ 
b/docs/attachments.md @@ -425,11 +425,14 @@ Notes: The decision of which documents are inlined as `full-context` vs left as `tool_call_only` is made by `chat/document_context_builder.py:build_document_context_instruction` on each turn: -1. Compute the `document_budget` in tokens: +1. Compute budgets in tokens (`chat/clients/pydantic_ai.py` subtracts the security buffer once, then splits the remainder): ```text - document_budget = max(int(model.max_token_context * DOCUMENT_CONTEXT_BUDGET_RATIO) - - DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS, 0) + usable_context = max(model.max_token_context - DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS, 0) + document_budget = max(int(usable_context * DOCUMENT_CONTEXT_BUDGET_RATIO), 0) ``` + The conversation history budget (summarization trigger) uses the other share: + `message_token_budget = max(int(usable_context * (1 - DOCUMENT_CONTEXT_BUDGET_RATIO)), 0)`. + `build_document_context_instruction` receives `usable_context` as its `max_token_context` argument (buffer already applied). 2. Iterate documents oldest-first. For each document: - If its token count exceeds the whole budget alone → keep `tool_call_only`. - Otherwise, while adding it would overflow the budget, **evict the oldest currently-inlined document** (FIFO): demote it to `tool_call_only`, free its tokens. @@ -444,6 +447,17 @@ Token estimation uses `tiktoken` with the `cl100k_base` encoding (GPT-4 tokenize The assembled instruction is **cached** per turn keyed on: `conversation_id`, `user_id`, `model_hrid`, `model.max_token_context`, `DOCUMENT_CONTEXT_BUDGET_RATIO`, `DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS`, and a fingerprint of `(attachment.id, attachment.updated_at)` for every text attachment - **conversation and project text attachments both contribute to the fingerprint**. Any attachment add / remove / edit (including project files), or any settings change, invalidates the cache. TTL is 30 minutes (`CACHE_TIMEOUT`). 
+#### Conversation history summarization + +When `message_token_budget` is exceeded, `chat/agents/history_processors.py` calls a separate summarization model (`LLM_SUMMARIZATION_MODEL_HRID`) and stores the result on the conversation (`history_summary`, `history_summary_checkpoint`). This runs at the **start of a new user turn**, before `agent.iter`, against **stored** `pydantic_messages` from previous turns only (the current user prompt is extracted separately and is not in that list yet). That stored history usually ends on an assistant `ModelResponse`. + +1. **Trigger**: estimated tokens in the active history slice exceed `message_token_budget` (see formulas above) and there are new messages after the last checkpoint. +2. **After a summary**: the model receives the stored summary text (dynamic instruction) plus the last `CONVERSATION_SUMMARY_CONTEXT_MESSAGES` `ModelMessage` entries before the checkpoint. Use an **even** value so the retained window starts on a user `ModelRequest` in a plain user/assistant alternation (tool messages can break parity). +3. **Summary length**: capped by `CONVERSATION_SUMMARY_MAX_TOKENS` on the summarization LLM call. +4. **Disable summarization**: summarization is skipped when `message_token_budget` is zero. Removing `max_token_context` from the chat model in `LLM_CONFIGURATIONS` zeroes it, but also keeps every document `tool_call_only`; `DOCUMENT_CONTEXT_BUDGET_RATIO=1` also zeroes it, but reallocates all `usable_context` to documents. + +The security buffer is **not** a dedicated reserve for system prompts, tool schemas, or completion tokens; those are added on top of the planned document/history split. Size the buffer (and/or plan `max_token_context` below the model nominal window) to leave headroom for that overhead. + #### Targeted document operations (`document_id`) Three tools accept an optional `document_id` argument, each with its own IDOR boundary: @@ -522,8 +536,11 @@ A `READY` attachment whose `rag_document_id` is null (e.g. 
parse succeeded but t | `RAG_DOCUMENT_SEARCH_BACKEND` | `AlbertRagBackend` | Import path of the vector-search backend used for indexing and search (Albert or Find) | | `PROJECT_FILES_MAX_COUNT` | `10` | Max non-image attachments per project (excludes hidden markdown companions). Enforced at upload-time in `ChatProjectAttachmentViewSet`. Bounds per-turn system-prompt token cost (every entry contributes to `project_documents` on every conversation turn). | | `PROJECT_IMAGES_MAX_COUNT` | `3` | Max image attachments per project. Enforced at upload-time. Bounds per-turn vision token cost - every project image is pinned to every turn alongside conversation-message images, and provider request-level image caps (Anthropic ~20/request) clip the trailing entries first. | -| `DOCUMENT_CONTEXT_BUDGET_RATIO` | `0.5` | Fraction of `model.max_token_context` reserved for inlined documents (0 disables full-context inlining; everything stays `tool_call_only`) | -| `DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS` | `1000` | Tokens subtracted from the inlining budget to absorb tokenizer drift on non-OpenAI models | +| `DOCUMENT_CONTEXT_BUDGET_RATIO` | `0.5` | Fraction of `usable_context` (after the security buffer) reserved for inlined documents. `(1 - ratio)` goes to the conversation history budget (summarization trigger). `0` disables full-context inlining (everything stays `tool_call_only`). | +| `DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS` | `1000` | Tokens subtracted once from `model.max_token_context` before the document/history split, to absorb tokenizer drift on non-OpenAI models and leave headroom beyond the planned split | +| `CONVERSATION_SUMMARY_CONTEXT_MESSAGES` | `10` | Number of pydantic-ai `ModelMessage` entries kept in runtime history after a conversation summary (in addition to the stored summary text). 
Use an even value (see [Conversation history summarization](#conversation-history-summarization)) | +| `CONVERSATION_SUMMARY_MAX_TOKENS` | `2048` | `max_tokens` for the summarization LLM call when generating or updating `history_summary` | +| `LLM_SUMMARIZATION_MODEL_HRID` | `default-summarization-model` | HRID in `LLM_CONFIGURATIONS` for the conversation summarization agent (see [LLM Configuration](llm-configuration.md)) | #### RAG_FILES_ACCEPTED_FORMATS @@ -571,7 +588,7 @@ RAG_FILES_ACCEPTED_FORMATS = [ ### Per-model setting: `max_token_context` -Each entry in `LLM_CONFIGURATIONS` accepts a `max_token_context` integer field declaring the model's context window size. When set, it drives the inlining budget for the documents listing (`document_budget = max_token_context * DOCUMENT_CONTEXT_BUDGET_RATIO - DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS`). +Each entry in `LLM_CONFIGURATIONS` accepts a `max_token_context` integer field declaring the model's context window size. When set, it drives document inlining and conversation summarization budgets (`usable_context = max_token_context - DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS`; `document_budget = usable_context * DOCUMENT_CONTEXT_BUDGET_RATIO`). If a model has no `max_token_context`, all of its documents are kept `tool_call_only` regardless of size and a warning is logged on every chat turn. Setting the field accurately matters: too low and small documents get pushed to RAG-only when they could be inlined; too high and the LLM may exceed its real window. diff --git a/docs/env.md b/docs/env.md index 3d408c5c..7dafd234 100644 --- a/docs/env.md +++ b/docs/env.md @@ -80,6 +80,10 @@ These are the environment variables you can set for the `conversations-backend` | LLM_CONFIGURATION_FILE_PATH | Path to the LLM configuration JSON file. 
See [LLM Configuration](llm-configuration.md) for details | /conversations/configuration/llm/default.json | | LLM_DEFAULT_MODEL_HRID | HRID of the model used for conversations | default-model | | LLM_SUMMARIZATION_MODEL_HRID | HRID of the model used for summarization | default-summarization-model | +| DOCUMENT_CONTEXT_BUDGET_RATIO | Fraction of `usable_context` (after security buffer) for inlined conversation documents; `(1 - ratio)` is the conversation history token budget. See [attachments.md](attachments.md) | `0.5` | +| DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS | Tokens subtracted once from `max_token_context` before the document/history split | `1000` | +| CONVERSATION_SUMMARY_CONTEXT_MESSAGES | `ModelMessage` count kept after a conversation summary (use an even value). See [attachments.md](attachments.md) | `10` | +| CONVERSATION_SUMMARY_MAX_TOKENS | Max tokens for the conversation summarization LLM output | `2048` | | AI_API_KEY | AI API key to be used for the default provider (used in default LLM configuration, not for production use) | | | AI_BASE_URL | OpenAI compatible AI base URL (used in default LLM configuration, not for production use) | | | AI_MODEL | AI Model name to use (used in default LLM configuration, not for production use) | | diff --git a/src/backend/chat/agents/history_processors.py b/src/backend/chat/agents/history_processors.py new file mode 100644 index 00000000..8e0f5256 --- /dev/null +++ b/src/backend/chat/agents/history_processors.py @@ -0,0 +1,338 @@ +"""History processors for model message cleanup.""" + +import dataclasses +import logging + +from pydantic_ai.messages import ( + ModelMessage, + ModelRequest, + ModelResponse, + TextPart, + ToolCallPart, + ToolReturnPart, + UserPromptPart, +) + +from chat.document_context_builder import count_approx_tokens + +from .summarize import SummarizationAgent + +logger = logging.getLogger(__name__) + +SUMMARY_SYSTEM_PREFIX = ( + "[Conversation summary from previous turns] (context only, not a 
user request):\n" +) + + +@dataclasses.dataclass(frozen=True) +class HistoryCleanupResult: + """Result of history cleanup, with optional generated summary.""" + + history: list[ModelMessage] + summary: str | None = None + summary_checkpoint: int | None = None + + +def _stringify_message_content(content: object) -> str: + """Convert part content to a plain text representation.""" + if isinstance(content, str): + return content + if isinstance(content, list): + return " ".join(str(item) for item in content if item is not None) + return str(content) + + +def _format_exchanges_for_summary(messages: list[ModelMessage]) -> str: + lines = [] + + for msg in messages: + if isinstance(msg, ModelRequest): + role = "User" + parts = (p for p in msg.parts if isinstance(p, UserPromptPart)) + elif isinstance(msg, ModelResponse): + role = "Assistant" + parts = (p for p in msg.parts if isinstance(p, TextPart)) + else: + continue + + for part in parts: + content = part.content + text = _stringify_message_content(content).strip() + if text: + lines.append(f"{role}: {text}") + return "\n".join(lines) + + +async def conversation_summarization( + messages: list[ModelMessage], *, max_tokens: int = 300, previous_summary: str | None = None +) -> str | None: + """ + Summarize the conversation. + """ + + summarization_agent = SummarizationAgent() + latest_summary = previous_summary + exchanges = _format_exchanges_for_summary(messages) + prompt = ( + "You are a conversation summarization assistant. 
Your role is to maintain\n" + "a concise and accurate running summary of a conversation, " + "omitting small talk and unrelated topics.\n\n" + "Given the previous summary (if any) and the new exchanges provided,\n" + "generate an updated summary that:\n\n" + "- **Preserves** every key information, decisions, and important facts\n" + "- **Integrates** the new exchanges in a coherent way\n" + "- **Removes** redundant or non-essential details\n" + "- **Maintains** the context needed for the conversation to continue\n" + "- Is written in a neutral, factual, third-person style\n" + "- Stays **concise** (5-10 lines maximum)\n\n" + "## Previous Summary:\n" + f"{latest_summary if latest_summary else ''}\n\n" + "## New Exchanges:\n" + f"{exchanges}\n\n" + "Only answer with the updated summary, including the new exchanges " + "information and the previous summary.\n\n" + "## Updated Summary:\n" + ) + logger.debug("Prompt for summarization: %s", prompt) + logger.debug("Latest summary: %s", latest_summary) + try: + resp = await summarization_agent.run( + prompt, + model_settings={"max_tokens": max_tokens}, + ) + except Exception as exc: # pylint: disable=broad-except # noqa: BLE001 + logger.warning("Conversation summarization failed: %s", exc, exc_info=True) + return None + updated_summary = (resp.output or "").strip() + logger.debug("Updated summary: %s", updated_summary) + return updated_summary or None + + +def _latest_tool_call_ids(messages: list[ModelMessage]) -> set[str]: + for message in reversed(messages): + if not isinstance(message, ModelResponse): + continue + response_tool_calls = [ + part.tool_call_id + for part in message.parts + if isinstance(part, ToolCallPart) and isinstance(part.tool_call_id, str) + ] + if response_tool_calls: + return set(response_tool_calls) + return set() + + +def _clean_request_parts(parts: list, latest_tool_call_ids: set[str]) -> list: + kept_parts = [] + for part in parts: + if not isinstance(part, ToolReturnPart): + 
kept_parts.append(part) + continue + if part.tool_call_id in latest_tool_call_ids: + kept_parts.append(part) + continue + tool_name = getattr(part, "tool_name", None) or "unknown_tool" + kept_parts.append( + ToolReturnPart( + tool_call_id=part.tool_call_id, + tool_name=tool_name, + content=f"<{tool_name} response compacted>", + ) + ) + return kept_parts + + +def clean_tool_history(messages: list[ModelMessage]) -> list[ModelMessage]: + """Compact old tool returns while preserving the latest tool cycle.""" + latest_tool_call_ids = _latest_tool_call_ids(messages) + cleaned_history: list[ModelMessage] = [] + + for message in messages: + if isinstance(message, ModelRequest): + kept_parts = _clean_request_parts(message.parts, latest_tool_call_ids) + if kept_parts: + cleaned_history.append(dataclasses.replace(message, parts=kept_parts)) + continue + + cleaned_history.append(message) + + return cleaned_history + + +def safe_clean_tool_history(messages: list[ModelMessage]) -> list[ModelMessage]: + """Compact tool history, falling back to the input on unexpected errors.""" + try: + return clean_tool_history(messages) + except Exception as exc: # pylint: disable=broad-except # noqa: BLE001 + logger.warning( + "Tool history cleanup failed, using raw history: %s", + exc, + exc_info=True, + ) + return messages + + +def _history_cleanup_fallback( + messages: list[ModelMessage], summary_checkpoint: int, context_messages: int +) -> HistoryCleanupResult: + """Return the active history slice when summarization logic fails unexpectedly.""" + checkpoint = _safe_checkpoint(messages, summary_checkpoint) + return HistoryCleanupResult(history=_active_history(messages, checkpoint, context_messages)) + + +def _estimate_message_tokens(message: ModelMessage) -> int: + """Estimate the token weight of one model message.""" + parts = getattr(message, "parts", []) or [] + if not parts: + return 0 + + part_tokens = 0 + for part in parts: + content = getattr(part, "content", "") + text = 
_stringify_message_content(content).strip() + if text: + part_tokens += count_approx_tokens(text) + + args = getattr(part, "args", "") + if isinstance(args, str) and args.strip(): + part_tokens += count_approx_tokens(args) + + return part_tokens + (4 * len(parts)) + 4 + + +def _estimate_history_tokens(messages: list[ModelMessage]) -> int: + """Estimate token weight for a message list.""" + return sum(_estimate_message_tokens(message) for message in messages) + + +def _safe_checkpoint(messages: list[ModelMessage], summary_checkpoint: int) -> int: + """Clamp the stored summary checkpoint to the current history size.""" + return max(0, min(summary_checkpoint, len(messages))) + + +def _active_start_index( + messages: list[ModelMessage], summary_checkpoint: int, context_messages: int +) -> int: + """Return where active history starts for a summary checkpoint.""" + checkpoint = _safe_checkpoint(messages, summary_checkpoint) + return max(0, checkpoint - max(context_messages, 1)) + + +def _active_history( + messages: list[ModelMessage], summary_checkpoint: int, context_messages: int +) -> list[ModelMessage]: + """Keep the last `context_messages` ModelMessage entries before the checkpoint.""" + active_start = _active_start_index(messages, summary_checkpoint, context_messages) + active = messages[active_start:] + if active: + return active + return messages[-1:] if messages else [] + + +def build_active_history( + messages: list[ModelMessage], summary_checkpoint: int, context_messages: int +) -> list[ModelMessage]: + """Return the runtime history window for a summary checkpoint.""" + return _active_history(messages, summary_checkpoint, context_messages) + + +async def maybe_summarize_history( # noqa: PLR0913 # pylint: disable=too-many-arguments + messages: list[ModelMessage], + *, + previous_summary: str | None = None, + summary_checkpoint: int = 0, + message_token_budget: int = 0, + context_messages: int = 10, + summary_max_tokens: int = 2048, + allow_summary_generation: 
bool = True, +) -> HistoryCleanupResult: + """ + Summarize when active history exceeds token budget. + + Called at the start of a new user turn against stored history from previous + turns (the current user prompt is not in `messages` yet). That history + normally ends on an assistant ModelResponse; use an even `context_messages` + so the post-summary window starts on a user message. + + `summary_checkpoint` is the message index up to which `previous_summary` + is valid. Runtime history keeps the last `context_messages` ModelMessage + entries before the checkpoint so the model still has recent detailed context + in addition to the summary. + """ + try: + checkpoint = _safe_checkpoint(messages, summary_checkpoint) + active_history = _active_history(messages, checkpoint, context_messages) + active_tokens = _estimate_history_tokens(active_history) + logger.debug( + ( + "maybe_summarize_history state: total_messages=%s checkpoint=%s " + "active_messages=%s active_tokens=%s token_budget=%s" + ), + len(messages), + checkpoint, + len(active_history), + active_tokens, + message_token_budget, + ) + + if message_token_budget <= 0 or active_tokens <= message_token_budget: + return HistoryCleanupResult(history=active_history) + + next_checkpoint = len(messages) + if not allow_summary_generation or next_checkpoint <= checkpoint: + return HistoryCleanupResult(history=active_history) + + summary_input = messages[checkpoint:next_checkpoint] + logger.debug( + "maybe_summarize_history summarizing messages %s:%s (%s message(s)).", + checkpoint, + next_checkpoint, + len(summary_input), + ) + summary = await conversation_summarization( + summary_input, + max_tokens=summary_max_tokens, + previous_summary=previous_summary, + ) + if not summary: + logger.warning( + "No updated summary generated, keeping previous summary and active context." 
+ ) + return HistoryCleanupResult(history=active_history) + + return HistoryCleanupResult( + history=_active_history(messages, next_checkpoint, context_messages), + summary=summary, + summary_checkpoint=next_checkpoint, + ) + except Exception as exc: # pylint: disable=broad-except # noqa: BLE001 + logger.warning( + "History summarization failed, keeping active context: %s", + exc, + exc_info=True, + ) + return _history_cleanup_fallback(messages, summary_checkpoint, context_messages) + + +# check for summary generation need to know if the current +# turn should trigger a frontend summary event +def should_generate_conversation_summary( + messages: list[ModelMessage], + *, + previous_summary: str | None = None, + summary_checkpoint: int = 0, + message_token_budget: int = 0, + context_messages: int = 10, +) -> bool: + """Return True when active history exceeds budget and new messages can be summarized.""" + _ = previous_summary + if message_token_budget <= 0: + return False + + cleaned_history = safe_clean_tool_history(messages) + checkpoint = _safe_checkpoint(cleaned_history, summary_checkpoint) + active_history = _active_history(cleaned_history, checkpoint, context_messages) + return ( + _estimate_history_tokens(active_history) > message_token_budget + and len(cleaned_history) > checkpoint + ) diff --git a/src/backend/chat/clients/pydantic_ai.py b/src/backend/chat/clients/pydantic_ai.py index 43299bd3..520a7324 100644 --- a/src/backend/chat/clients/pydantic_ai.py +++ b/src/backend/chat/clients/pydantic_ai.py @@ -127,6 +127,12 @@ from chat import models from chat.agents.conversation import ConversationAgent, TitleGenerationAgent +from chat.agents.history_processors import ( + SUMMARY_SYSTEM_PREFIX, + maybe_summarize_history, + safe_clean_tool_history, + should_generate_conversation_summary, +) from chat.agents.local_media_url_processors import ( build_project_image_urls, update_history_local_urls, @@ -251,6 +257,7 @@ def __init__( # pylint: 
disable=too-many-arguments,too-many-positional-argument self.conversation = conversation self.user = user # authenticated user only self.model_hrid = model_hrid or settings.LLM_DEFAULT_MODEL_HRID # HRID of the model to use + self.model_configuration = get_model_configuration(self.model_hrid) self.language = language # might be None self._last_stop_check = 0 @@ -259,7 +266,7 @@ def __init__( # pylint: disable=too-many-arguments,too-many-positional-argument self.event_encoder = EventEncoder(CURRENT_EVENT_ENCODER_VERSION) # We use v4 for now self._support_streaming = True - if (streaming := get_model_configuration(self.model_hrid).supports_streaming) is not None: + if (streaming := self.model_configuration.supports_streaming) is not None: self._support_streaming = streaming # Feature flags @@ -276,6 +283,8 @@ def __init__( # pylint: disable=too-many-arguments,too-many-positional-argument ) self._web_search_tool_registered = False self._self_documentation_tool_registered = False + self._history_summary = conversation.history_summary.strip() or None + self._history_summary_checkpoint = max(conversation.history_summary_checkpoint, 0) self.conversation_agent = ConversationAgent( model_hrid=self.model_hrid, @@ -287,8 +296,17 @@ def __init__( # pylint: disable=too-many-arguments,too-many-positional-argument if self._langfuse_available else False, deps_type=ContextDeps, + history_processors=[self._history_processor], ) add_document_rag_search_tool_from_setting(self.conversation_agent, self.user) + max_token_context = self.model_configuration.max_token_context or 0 + usable_token_context = max( + max_token_context - settings.DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS, + 0, + ) + self._conversation_message_token_budget = max( + int((1 - settings.DOCUMENT_CONTEXT_BUDGET_RATIO) * usable_token_context), 0 + ) # Inject project-level custom instructions if the conversation belongs to a project llm_user_instructions = ( @@ -302,6 +320,12 @@ def __init__( # pylint: 
disable=too-many-arguments,too-many-positional-argument def project_instructions() -> str: return llm_user_instructions + @self.conversation_agent.instructions + def history_summary_instructions() -> str: + if not self._history_summary: + return "" + return f"{SUMMARY_SYSTEM_PREFIX}{self._history_summary.strip()}" + @property def _stop_cache_key(self): return f"streaming:stop:{self.conversation.pk}" @@ -392,8 +416,16 @@ async def _clean(self): # --------------------------------------------------------------------- # async def _prepare_agent_run( - self, messages: List[UIMessage] - ) -> Tuple[str, List, List, Dict[str, str], Dict[str, Union[int, float]], List, bool]: + self, messages: List[UIMessage], history: List[ModelMessage] + ) -> Tuple[ + str, + List, + List, + Dict[str, str], + Dict[str, Union[int, float]], + List, + bool, + ]: """ Prepare all inputs needed before running the agent. @@ -416,7 +448,6 @@ async def _prepare_agent_run( - history: Validated message history for the agent - conversation_has_documents: Whether RAG should be enabled """ - history = ModelMessagesTypeAdapter.validate_python(self.conversation.pydantic_messages) history = update_history_local_urls( self.conversation, history ) # presign URLs for local images @@ -479,6 +510,32 @@ async def _prepare_agent_run( conversation_has_documents, ) + async def _apply_history_cleanup( + self, history: list[ModelMessage], *, allow_summary_generation: bool + ) -> list[ModelMessage]: + """Compact history and persist any generated summary metadata.""" + cleaned_history = safe_clean_tool_history(history) + cleanup_result = await maybe_summarize_history( + cleaned_history, + previous_summary=self._history_summary, + summary_checkpoint=self._history_summary_checkpoint, + message_token_budget=self._conversation_message_token_budget, + context_messages=settings.CONVERSATION_SUMMARY_CONTEXT_MESSAGES, + summary_max_tokens=settings.CONVERSATION_SUMMARY_MAX_TOKENS, + 
allow_summary_generation=allow_summary_generation, + ) + if cleanup_result.summary: + self._history_summary = cleanup_result.summary + self.conversation.history_summary = cleanup_result.summary + if cleanup_result.summary_checkpoint is not None: + self._history_summary_checkpoint = cleanup_result.summary_checkpoint + self.conversation.history_summary_checkpoint = cleanup_result.summary_checkpoint + return cleanup_result.history + + async def _history_processor(self, history: list[ModelMessage]) -> list[ModelMessage]: + """Native pydantic-ai history processor for internal tool-cycle cleanup.""" + return safe_clean_tool_history(history) + def _setup_web_search(self, force_web_search: bool) -> bool: """Configure web search if forced. Returns whether web search is actually forced.""" @@ -769,7 +826,8 @@ async def _build_document_context_instruction(self) -> str: upload_state=models.AttachmentStatus.READY, ).order_by("created_at", "id") ) - max_token_context = self.conversation_agent.configuration.max_token_context + max_token_context = self.model_configuration.max_token_context or 0 + usable_token_context = max(max_token_context - security_buffer_tokens, 0) # Cache the assembled instruction: building it requires reading every text # attachment from object storage and tokenizing it. 
The fingerprint includes @@ -779,7 +837,7 @@ async def _build_document_context_instruction(self) -> str: f"conv={self.conversation.id}", f"user={getattr(self.user, 'id', None)}", f"model={self.model_hrid}", - f"max_ctx={max_token_context}", + f"max_ctx={usable_token_context}", f"ratio={budget_ratio}", f"buffer={security_buffer_tokens}", *(f"{att.id}:{att.updated_at.isoformat()}" for att in text_attachments), @@ -799,9 +857,8 @@ async def _build_document_context_instruction(self) -> str: text_attachments=text_attachments, project_text_attachments=project_text_attachments, model_hrid=self.model_hrid, - max_token_context=max_token_context, + max_token_context=usable_token_context, budget_ratio=budget_ratio, - security_buffer_tokens=security_buffer_tokens, ) await sync_to_async(cache.set)(cache_key, instruction, CACHE_TIMEOUT) return instruction @@ -1179,7 +1236,8 @@ async def _finalize_conversation( # pylint: disable=too-many-arguments,too-many ), ) - async def _run_agent( # pylint: disable=too-many-locals + # pylint: disable=too-many-locals,too-many-branches,too-many-statements + async def _run_agent( # noqa: PLR0912 self, messages: List[UIMessage], force_web_search: bool = False, @@ -1200,7 +1258,7 @@ async def _run_agent( # pylint: disable=too-many-locals ) else: langfuse = None - + raw_history = ModelMessagesTypeAdapter.validate_python(self.conversation.pydantic_messages) ( user_prompt, input_images, @@ -1209,7 +1267,24 @@ async def _run_agent( # pylint: disable=too-many-locals usage, history, conversation_has_documents, - ) = await self._prepare_agent_run(messages) + ) = await self._prepare_agent_run(messages, raw_history) + + # Conversation summary process. This runs before agent.iter so dynamic + # instructions can read the updated summary in the same model request. 
+ should_emit_summary_event = should_generate_conversation_summary( + history, + previous_summary=self._history_summary, + summary_checkpoint=self._history_summary_checkpoint, + message_token_budget=self._conversation_message_token_budget, + context_messages=settings.CONVERSATION_SUMMARY_CONTEXT_MESSAGES, + ) + if should_emit_summary_event: + tool_call_id = str(uuid.uuid4()) + yield events_v4.ToolCallPart( + tool_call_id=tool_call_id, + tool_name="summarize", + args={"state": "running", "summary_scope": "conversation"}, + ) doc_result = None async for item in self._handle_input_documents( @@ -1225,6 +1300,16 @@ async def _run_agent( # pylint: disable=too-many-locals conversation_has_documents = doc_result.has_documents + history = await self._apply_history_cleanup( + history, + allow_summary_generation=should_emit_summary_event, + ) + if should_emit_summary_event: + yield events_v4.ToolResultPart( + tool_call_id=tool_call_id, + result={"state": "done"}, + ) + await self._agent_stop_streaming(force_cache_check=True) self._setup_self_documentation_tool() self._setup_web_search_tool() @@ -1242,10 +1327,12 @@ async def _run_agent( # pylint: disable=too-many-locals if history and history[-1].kind == "request": if history[-1].parts and history[-1].parts[-1].part_kind == "tool-return": history.append(ModelResponse(parts=[TextPart(content="ok")], kind="response")) + message_history = history if history else None async with self.conversation_agent.iter( [user_prompt] + input_images, - message_history=history, # history will pass through agent's history_processors + # History passes through history_processors when set on the agent. 
+ message_history=message_history, deps=self._context_deps, toolsets=mcp_servers, ) as run: diff --git a/src/backend/chat/document_context_builder.py b/src/backend/chat/document_context_builder.py index 98dc6c47..55c7a124 100644 --- a/src/backend/chat/document_context_builder.py +++ b/src/backend/chat/document_context_builder.py @@ -267,7 +267,6 @@ async def build_document_context_instruction( # noqa: PLR0913 # pylint: disable model_hrid: str, max_token_context: int | None, budget_ratio: float, - security_buffer_tokens: int, project_text_attachments: Sequence[models.ChatConversationAttachment] = (), ) -> str: """ @@ -328,7 +327,7 @@ async def build_document_context_instruction( # noqa: PLR0913 # pylint: disable ) ) - document_budget = max(int(max_token_context * budget_ratio) - security_buffer_tokens, 0) + document_budget = max(int(max_token_context * budget_ratio), 0) async def _load_document( index: int, attachment: models.ChatConversationAttachment diff --git a/src/backend/chat/llm_configuration.py b/src/backend/chat/llm_configuration.py index a534fb2f..64820d62 100644 --- a/src/backend/chat/llm_configuration.py +++ b/src/backend/chat/llm_configuration.py @@ -141,11 +141,11 @@ class LLModel(BaseModel): is_active: bool icon: LongStringAsListValue | None = None supports_streaming: bool | None = None + max_token_context: int | None = None system_prompt: SettingEnvValue tools: list[str] web_search: SettingEnvValue | None = None concatenate_instruction_messages: bool | None = None - max_token_context: int | None = None @field_validator("tools", mode="before") @classmethod @@ -176,20 +176,20 @@ def validate_web_search(cls, value: str | None) -> str | None: @field_validator("max_token_context", mode="before") @classmethod - def validate_max_token_context(cls, value: int | str | None) -> int | None: - """Accept integer-like values from JSON for model context size.""" + def validate_max_token_context(cls, value: Any) -> int | None: + """Parse max_token_context from 
literal, setting, or env value.""" if value is None or value == "": return None if isinstance(value, bool): # bool is an int subclass in Python, reject explicitly so True/False # don't become 1/0. raise ValueError("max_token_context must be an integer value.") - if isinstance(value, int): - parsed_value = value - elif isinstance(value, str): + if isinstance(value, str): + value = _get_setting_or_env_or_value(value) + try: parsed_value = int(value) - else: - raise ValueError("max_token_context must be an integer value.") + except (TypeError, ValueError) as exc: + raise ValueError("max_token_context must be an integer value.") from exc if parsed_value <= 0: raise ValueError("max_token_context must be a positive integer") diff --git a/src/backend/chat/migrations/0008_chatconversation_history_summary.py b/src/backend/chat/migrations/0008_chatconversation_history_summary.py new file mode 100644 index 00000000..25bf5b9f --- /dev/null +++ b/src/backend/chat/migrations/0008_chatconversation_history_summary.py @@ -0,0 +1,19 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("chat", "0007_chatconversationattachment_project_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="chatconversation", + name="history_summary", + field=models.TextField( + blank=True, + default="", + help_text="Latest generated conversation summary used as system context", + ), + ), + ] diff --git a/src/backend/chat/migrations/0009_chatconversation_history_summary_checkpoint.py b/src/backend/chat/migrations/0009_chatconversation_history_summary_checkpoint.py new file mode 100644 index 00000000..1b670314 --- /dev/null +++ b/src/backend/chat/migrations/0009_chatconversation_history_summary_checkpoint.py @@ -0,0 +1,18 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("chat", "0008_chatconversation_history_summary"), + ] + + operations = [ + migrations.AddField( + 
model_name="chatconversation", + name="history_summary_checkpoint", + field=models.PositiveIntegerField( + default=0, + help_text="Number of pydantic history messages already compacted into history_summary", + ), + ), + ] diff --git a/src/backend/chat/models.py b/src/backend/chat/models.py index 8a40e5c2..0b87007e 100644 --- a/src/backend/chat/models.py +++ b/src/backend/chat/models.py @@ -136,6 +136,15 @@ class ChatConversation(BaseModel): blank=True, help_text="Pydantic messages for the chat conversation, used for history", ) + history_summary = models.TextField( + blank=True, + default="", + help_text="Latest generated conversation summary used as system context", + ) + history_summary_checkpoint = models.PositiveIntegerField( + default=0, + help_text="Number of pydantic history messages already compacted into history_summary", + ) messages: Sequence[UIMessage] = SchemaField( schema=list[UIMessage], default=list, diff --git a/src/backend/chat/tests/agents/test_history_processors.py b/src/backend/chat/tests/agents/test_history_processors.py new file mode 100644 index 00000000..66c8d166 --- /dev/null +++ b/src/backend/chat/tests/agents/test_history_processors.py @@ -0,0 +1,350 @@ +"""Tests for history processors.""" + +import pytest +from pydantic_ai import ( + Agent, + ModelMessage, + ModelRequest, + ModelResponse, + TextPart, + UserPromptPart, +) +from pydantic_ai.messages import ( + ToolCallPart, + ToolReturnPart, +) +from pydantic_ai.models.function import AgentInfo, FunctionModel + +from chat.agents import history_processors + + +@pytest.fixture +def _received_messages_fixture() -> list[ModelMessage]: + """Fixture to capture messages received by the function model.""" + return [] + + +@pytest.fixture(name="received_messages") +def received_messages_fixture_alias( + _received_messages_fixture: list[ModelMessage], +) -> list[ModelMessage]: + """Expose received messages fixture with a stable pytest name.""" + return _received_messages_fixture + + 
+@pytest.fixture(name="function_model") +def function_model_fixture(received_messages: list[ModelMessage]) -> FunctionModel: + """Fixture to capture model function.""" + + def capture_model_function(messages: list[ModelMessage], _info: AgentInfo) -> ModelResponse: + # Capture exactly what reaches the provider after history processors. + received_messages.clear() + received_messages.extend(messages) + return ModelResponse(parts=[TextPart(content="Provider response")]) + + return FunctionModel(capture_model_function) + + +def _build_turns(turn_count: int) -> list: + """Build a list of turns for testing.""" + messages = [] + for turn in range(1, turn_count + 1): + messages.append(ModelRequest(parts=[UserPromptPart(content=[f"user-{turn}"])])) + messages.append(ModelResponse(parts=[TextPart(content=f"assistant-{turn}")])) + return messages + + +def test_history_processors_are_applied_before_provider_call( + function_model: FunctionModel, received_messages: list[ModelMessage] +): + """History processors should run before provider invocation.""" + + def keep_only_requests(messages: list[ModelMessage]) -> list[ModelMessage]: + return [msg for msg in messages if isinstance(msg, ModelRequest)] + + agent = Agent(function_model, history_processors=[keep_only_requests]) + message_history = [ + ModelRequest(parts=[UserPromptPart(content="Question 1")]), + ModelResponse(parts=[TextPart(content="Answer 1")]), + ] + + agent.run_sync("Question 2", message_history=message_history) + assert len(received_messages) == 1 + assert isinstance(received_messages[0], ModelRequest) + user_prompt_contents = [ + part.content for part in received_messages[0].parts if isinstance(part, UserPromptPart) + ] + assert user_prompt_contents == ["Question 1", "Question 2"] + + +@pytest.mark.asyncio +async def test_history_cleanup_keeps_full_history_when_under_budget(): + """No summary should be produced when the active slice fits budget.""" + messages = _build_turns(2) + + result = await 
history_processors.maybe_summarize_history(messages, message_token_budget=10_000) + + assert result.summary is None + assert result.history == messages + + +def test_clean_tool_history_redacts_old_tool_returns_but_keeps_latest_tool_result(): + """Old tool results are compacted, latest tool result stays intact.""" + messages = [ + ModelResponse(parts=[ToolCallPart(tool_call_id="old", tool_name="search", args="{}")]), + ModelRequest( + parts=[ + ToolReturnPart( + tool_call_id="old", + tool_name="search", + content="large old result", + ) + ] + ), + ModelResponse(parts=[ToolCallPart(tool_call_id="latest", tool_name="search", args="{}")]), + ModelRequest( + parts=[ + ToolReturnPart( + tool_call_id="latest", + tool_name="search", + content="fresh result", + ) + ] + ), + ] + + result = history_processors.clean_tool_history(messages) + + old_return = result[1].parts[0] + latest_return = result[3].parts[0] + assert isinstance(old_return, ToolReturnPart) + assert isinstance(latest_return, ToolReturnPart) + assert old_return.content == "" + assert latest_return.content == "fresh result" + + +@pytest.mark.asyncio +async def test_history_cleanup_over_budget_generates_summary_and_advances_checkpoint(monkeypatch): + """Summary is generated when unsummarized runtime history exceeds budget.""" + summarized_messages = [] + + async def fake_summary(messages, *, max_tokens=300, previous_summary=None): + summarized_messages.extend(messages) + _ = max_tokens + _ = previous_summary + return "summary-v1" + + monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary) + messages = _build_turns(3) + + result = await history_processors.maybe_summarize_history( + messages, message_token_budget=1, context_messages=1 + ) + + assert result.summary == "summary-v1" + assert result.summary_checkpoint == len(messages) + assert result.history == messages[5:] + assert summarized_messages == messages[: result.summary_checkpoint] + + +@pytest.mark.asyncio +async def 
test_history_cleanup_existing_summary_uses_checkpoint_slice(monkeypatch): + """When already summarized, runtime history starts at checkpoint minus context.""" + + async def fake_summary(_messages, *, max_tokens=300, previous_summary=None): + _ = max_tokens + _ = previous_summary + raise AssertionError("Summary should not be regenerated at turn 5") + + monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary) + messages = _build_turns(5) + + result = await history_processors.maybe_summarize_history( + messages, + previous_summary="summary-v1", + summary_checkpoint=6, + message_token_budget=10_000, + context_messages=1, + ) + + assert result.summary is None + assert result.history == messages[5:] + + +@pytest.mark.asyncio +async def test_history_cleanup_budget_only_counts_active_window_after_checkpoint(monkeypatch): + """Old summarized messages should not retrigger summaries after checkpoint.""" + + async def fake_summary(_messages, *, max_tokens=300, previous_summary=None): + _ = max_tokens + _ = previous_summary + raise AssertionError("Messages before the active window should not count") + + monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary) + messages = [ + ModelRequest(parts=[UserPromptPart(content=["old user " + ("x " * 500)])]), + ModelResponse(parts=[TextPart(content="old assistant " + ("x " * 500))]), + ModelRequest(parts=[UserPromptPart(content=["context user"])]), + ModelResponse(parts=[TextPart(content="context assistant")]), + ModelRequest(parts=[UserPromptPart(content=["new user"])]), + ModelResponse(parts=[TextPart(content="new assistant")]), + ] + + result = await history_processors.maybe_summarize_history( + messages, + previous_summary="summary-v1", + summary_checkpoint=4, + message_token_budget=100, + context_messages=1, + ) + + assert result.summary is None + assert result.history == messages[3:] + + +@pytest.mark.asyncio +async def 
test_history_cleanup_does_not_resummarize_when_checkpoint_is_current(monkeypatch): + """After a summary, context overlap alone should not be summarized again.""" + + async def fake_summary(_messages, *, max_tokens=300, previous_summary=None): + _ = max_tokens + _ = previous_summary + raise AssertionError("Latest active turn should not be summarized") + + monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary) + messages = _build_turns(3) + + result = await history_processors.maybe_summarize_history( + messages, + previous_summary="summary-v1", + summary_checkpoint=len(messages), + message_token_budget=1, + context_messages=1, + ) + + assert result.summary is None + assert result.summary_checkpoint is None + assert result.history == messages[5:] + + +@pytest.mark.asyncio +async def test_history_cleanup_never_returns_empty_history_when_checkpoint_is_at_end(): + """pydantic-ai rejects empty processed history when it passed history in.""" + messages = _build_turns(2) + + result = await history_processors.maybe_summarize_history( + messages, + previous_summary="summary-v1", + summary_checkpoint=len(messages), + message_token_budget=10_000, + context_messages=1, + ) + + assert result.summary is None + assert result.history == messages[3:] + + +def test_clean_tool_history_has_no_summary_checkpoint_behavior(): + """The pydantic-ai history processor path only cleans tools.""" + messages = _build_turns(2) + + result = history_processors.clean_tool_history(messages) + + assert result == messages + + +@pytest.mark.asyncio +async def test_history_cleanup_summary_failure_keeps_runtime_slice(monkeypatch): + """If summary generation fails, keep runtime slice unchanged.""" + + async def fake_summary(_messages, *, max_tokens=300, previous_summary=None): + _ = max_tokens + _ = previous_summary + + monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary) + messages = _build_turns(4) + + result = await 
history_processors.maybe_summarize_history( + messages, + previous_summary="summary-v1", + summary_checkpoint=2, + message_token_budget=1, + context_messages=1, + ) + + assert result.summary is None + assert result.history == messages[1:] + + +def test_should_generate_conversation_summary_when_budget_exceeded(): + """Frontend summary event should trigger only when over budget.""" + messages = _build_turns(3) + assert history_processors.should_generate_conversation_summary(messages, message_token_budget=1) + assert not history_processors.should_generate_conversation_summary( + messages, message_token_budget=10_000 + ) + assert not history_processors.should_generate_conversation_summary( + messages, + previous_summary="summary-v1", + summary_checkpoint=len(messages), + message_token_budget=1, + context_messages=1, + ) + assert not history_processors.should_generate_conversation_summary( + messages, + previous_summary="summary-v1", + summary_checkpoint=28, + message_token_budget=1, + context_messages=1, + ) + messages_with_large_summarized_prefix = [ + ModelRequest(parts=[UserPromptPart(content=["old user " + ("x " * 500)])]), + ModelResponse(parts=[TextPart(content="old assistant " + ("x " * 500))]), + ModelRequest(parts=[UserPromptPart(content=["context user"])]), + ModelResponse(parts=[TextPart(content="context assistant")]), + ModelRequest(parts=[UserPromptPart(content=["new user"])]), + ModelResponse(parts=[TextPart(content="new assistant")]), + ] + assert not history_processors.should_generate_conversation_summary( + messages_with_large_summarized_prefix, + previous_summary="summary-v1", + summary_checkpoint=4, + message_token_budget=100, + context_messages=1, + ) + + +def test_safe_clean_tool_history_falls_back_to_raw_history(monkeypatch): + """Unexpected cleanup errors should not break the conversation flow.""" + messages = _build_turns(2) + + def raise_cleanup(_messages): + raise RuntimeError("cleanup failed") + + monkeypatch.setattr(history_processors, 
"clean_tool_history", raise_cleanup) + + result = history_processors.safe_clean_tool_history(messages) + + assert result == messages + + +@pytest.mark.asyncio +async def test_maybe_summarize_history_falls_back_on_unexpected_error(monkeypatch): + """Unexpected summarization errors should keep the active history slice.""" + messages = _build_turns(4) + + def raise_estimate(_message): + raise RuntimeError("token estimate failed") + + monkeypatch.setattr(history_processors, "_estimate_message_tokens", raise_estimate) + + result = await history_processors.maybe_summarize_history( + messages, + previous_summary="summary-v1", + summary_checkpoint=2, + message_token_budget=1, + context_messages=1, + ) + + assert result.summary is None + assert result.history == messages[1:] diff --git a/src/backend/chat/tests/clients/pydantic_ai/test_document_context_window.py b/src/backend/chat/tests/clients/pydantic_ai/test_document_context_window.py index 9736c418..0933dabf 100644 --- a/src/backend/chat/tests/clients/pydantic_ai/test_document_context_window.py +++ b/src/backend/chat/tests/clients/pydantic_ai/test_document_context_window.py @@ -39,6 +39,9 @@ def _parse_listing(instruction: str) -> dict: def _llm_config_with_context(settings): """Configure a model with max_token_context for context window tests.""" settings.DOCUMENT_CONTEXT_BUDGET_RATIO = 0.5 + # Branch default is 10_000; with max_token_context=4000 that zeroes usable_context. + # Tests target buffer=1_000 so budget math stays: usable=3000, document_budget=1500. + settings.DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS = 1000 settings.LLM_CONFIGURATIONS = { "default-model": LLModel( hrid="default-model", @@ -48,8 +51,6 @@ def _llm_config_with_context(settings): icon=None, system_prompt="You are an amazing assistant.", tools=[], - # Keep context large enough so tests can exercise rolling-window behavior - # despite the fixed security buffer applied by the service. 
max_token_context=4000, provider=LLMProvider( hrid="unused", @@ -82,9 +83,10 @@ async def fake_read_attachment_content(_attachment): # NOSONAR "chat.document_context_builder.read_attachment_content", fake_read_attachment_content, ) + # max_token_context=4000, buffer=1000, ratio=0.5 => document_budget=1500. monkeypatch.setattr( "chat.document_context_builder.count_approx_tokens", - lambda _text: 1201, + lambda _text: 1501, ) instruction = async_to_sync(service._build_document_context_instruction)() # pylint: disable=protected-access @@ -143,14 +145,14 @@ async def fake_read_attachment_content(attachment): # NOSONAR monkeypatch.setattr( "chat.document_context_builder.count_approx_tokens", - lambda _text: 400, + lambda _text: 600, ) instruction = async_to_sync(service._build_document_context_instruction)() # pylint: disable=protected-access listing = _parse_listing(instruction) - # max_token_context=4000, ratio=0.5 => budget=1000 after buffer. - # With 3 docs at 400 tokens each, rolling outcome should inline doc-2 + doc-3. + # max_token_context=4000, buffer=1000, ratio=0.5 => budget=1500. + # With 3 docs at 600 tokens each (1800 total), FIFO evicts doc-1; doc-2 + doc-3 stay inlined. 
assert listing["documents_order"] == "newest_to_oldest" by_title = {d["title"]: d for d in listing["documents"]} assert set(by_title) == {"doc-1", "doc-2", "doc-3"} @@ -315,7 +317,7 @@ async def fake_read_attachment_content(attachment): # NOSONAR def test_document_context_uses_configurable_ratio(_llm_config_with_context, monkeypatch, settings): """Budget ratio comes from Django settings and changes inlining behavior.""" - settings.DOCUMENT_CONTEXT_BUDGET_RATIO = 0.3 # max_token_context=4000 => budget=200 + settings.DOCUMENT_CONTEXT_BUDGET_RATIO = 0.3 # usable_context=3000 => budget=900 user = UserFactory() conversation = ChatConversationFactory(owner=user) @@ -360,8 +362,8 @@ async def fake_read_attachment_content(attachment): # NOSONAR by_title = {d["title"]: d for d in listing["documents"]} assert set(by_title) == {"doc-1", "doc-2"} - # ratio=0.3, max_context=4000, buffer=1000 => budget=200; only newest fits. - assert by_title["doc-1"]["access"] == ACCESS_TOOL_CALL_ONLY + # ratio=0.3, max_context=4000, buffer=1000 => budget=900; both fit. 
+ assert by_title["doc-1"]["access"] == ACCESS_FULL_CONTEXT assert by_title["doc-2"]["access"] == ACCESS_FULL_CONTEXT diff --git a/src/backend/chat/tests/clients/pydantic_ai/test_thinking_history_stripping.py b/src/backend/chat/tests/clients/pydantic_ai/test_thinking_history_stripping.py index 2a2a2dca..420a33dd 100644 --- a/src/backend/chat/tests/clients/pydantic_ai/test_thinking_history_stripping.py +++ b/src/backend/chat/tests/clients/pydantic_ai/test_thinking_history_stripping.py @@ -116,7 +116,8 @@ async def test_thinking_parts_stripped_when_model_does_not_support_thinking(): ), patch("chat.clients.pydantic_ai.update_history_local_urls", side_effect=lambda _conv, h: h), ): - _, _, _, _, _, history, _ = await service._prepare_agent_run([ui_message]) + raw_history = ModelMessagesTypeAdapter.validate_python(conversation.pydantic_messages) + _, _, _, _, _, history, _ = await service._prepare_agent_run([ui_message], raw_history) assert len(history) == 1 assert isinstance(history[0], ModelResponse) @@ -145,7 +146,8 @@ async def test_thinking_parts_kept_when_model_supports_thinking(): ), patch("chat.clients.pydantic_ai.update_history_local_urls", side_effect=lambda _conv, h: h), ): - _, _, _, _, _, history, _ = await service._prepare_agent_run([ui_message]) + raw_history = ModelMessagesTypeAdapter.validate_python(conversation.pydantic_messages) + _, _, _, _, _, history, _ = await service._prepare_agent_run([ui_message], raw_history) assert len(history) == 1 assert isinstance(history[0], ModelResponse) diff --git a/src/backend/chat/tests/test_document_context_builder.py b/src/backend/chat/tests/test_document_context_builder.py index 1ef2194c..7bd536af 100644 --- a/src/backend/chat/tests/test_document_context_builder.py +++ b/src/backend/chat/tests/test_document_context_builder.py @@ -83,7 +83,7 @@ def _parse_listing(instruction: str) -> dict: return json.loads(instruction.split(prefix, 1)[1]) -async def _build(conversation, *, max_token_context=100, budget_ratio=0.5, 
security_buffer=0): +async def _build(conversation, *, max_token_context=100, budget_ratio=0.5): """Run build_document_context_instruction with real components.""" text_attachments = await sync_to_async(list)( conversation.attachments.filter(content_type__startswith="text/").order_by( @@ -96,7 +96,6 @@ async def _build(conversation, *, max_token_context=100, budget_ratio=0.5, secur model_hrid="test-model", max_token_context=max_token_context, budget_ratio=budget_ratio, - security_buffer_tokens=security_buffer, ) diff --git a/src/backend/conversations/settings.py b/src/backend/conversations/settings.py index 7750df61..4cbb4f56 100755 --- a/src/backend/conversations/settings.py +++ b/src/backend/conversations/settings.py @@ -683,6 +683,22 @@ class Base(BraveSettings, Configuration): environ_name="DEFAULT_ALLOW_SMART_WEB_SEARCH", environ_prefix=None, ) + # Conversation summary: generated at the start of a new user turn (before agent.iter) + # when the active slice of stored pydantic_messages (previous turns only; the current + # user prompt is not in that list yet) exceeds the message token budget + # (usable_context * (1 - DOCUMENT_CONTEXT_BUDGET_RATIO)). That history usually ends + # on an assistant ModelResponse. After a summary, keep the last N ModelMessage + # entries before the checkpoint. Use an even N so the window starts on a user message. 
+ CONVERSATION_SUMMARY_CONTEXT_MESSAGES = values.PositiveIntegerValue( + default=10, + environ_name="CONVERSATION_SUMMARY_CONTEXT_MESSAGES", + environ_prefix=None, + ) + CONVERSATION_SUMMARY_MAX_TOKENS = values.PositiveIntegerValue( + default=2048, + environ_name="CONVERSATION_SUMMARY_MAX_TOKENS", + environ_prefix=None, + ) # These settings are default values used in the default LLM_CONFIGURATIONS # They allow a deployment with only one model without a specific configuration file @@ -919,7 +935,7 @@ class Base(BraveSettings, Configuration): environ_prefix=None, ) DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS = values.PositiveIntegerValue( - default=1000, + default=10000, environ_name="DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS", environ_prefix=None, ) diff --git a/src/frontend/apps/conversations/src/features/chat/components/MessageItem.tsx b/src/frontend/apps/conversations/src/features/chat/components/MessageItem.tsx index 0bd1f5ad..8f48167e 100644 --- a/src/frontend/apps/conversations/src/features/chat/components/MessageItem.tsx +++ b/src/frontend/apps/conversations/src/features/chat/components/MessageItem.tsx @@ -243,9 +243,13 @@ const MessageItemComponent: React.FC = ({ }, [toolInvocationParts]); const activeToolInvocation = React.useMemo(() => { - const tool = toolInvocationParts.find( - (part) => part.toolInvocation.toolName !== 'document_parsing', - ); + const tool = [...toolInvocationParts] + .reverse() + .find( + (part) => + part.toolInvocation.toolName !== 'document_parsing' && + part.toolInvocation.state !== 'result', + ); return tool?.toolInvocation; }, [toolInvocationParts]); @@ -367,7 +371,8 @@ const MessageItemComponent: React.FC = ({ {isCurrentlyStreaming && isLastAssistantMessage && status === 'streaming' && - hasNonDocumentParsingTool && ( + hasNonDocumentParsingTool && + activeToolInvocation && ( = ({ > - {activeToolInvocation?.toolName === 'summarize' - ? t('Summarizing...') + {activeToolInvocation.toolName === 'summarize' + ? 
( + activeToolInvocation.args as { + summary_scope?: string; + } + )?.summary_scope === 'conversation' + ? t('Summarizing conversation...') + : t('Summarizing...') : t('Search...')}