From 89dde54a0669b2ff173f8105cb2337dcec7ee6df Mon Sep 17 00:00:00 2001
From: camilleAND <camille.andre@modernisation.gouv.fr>
Date: Thu, 23 Apr 2026 16:23:10 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(conversation)=20summarize=20messages?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summarize older conversation messages once the history token budget is exceeded, to keep the model context within bounds
---
 CHANGELOG.md                                  |   1 +
 docs/attachments.md                           |  29 +-
 docs/env.md                                   |   4 +
 src/backend/chat/agents/history_processors.py | 338 +++++++++++++++++
 src/backend/chat/clients/pydantic_ai.py       | 111 +++++-
 src/backend/chat/document_context_builder.py  |   3 +-
 src/backend/chat/llm_configuration.py         |  16 +-
 .../0008_chatconversation_history_summary.py  |  19 +
 ...conversation_history_summary_checkpoint.py |  18 +
 src/backend/chat/models.py                    |   9 +
 .../tests/agents/test_history_processors.py   | 350 ++++++++++++++++++
 .../test_document_context_window.py           |  20 +-
 .../test_thinking_history_stripping.py        |   6 +-
 .../tests/test_document_context_builder.py    |   3 +-
 src/backend/conversations/settings.py         |  18 +-
 .../features/chat/components/MessageItem.tsx  |  23 +-
 16 files changed, 920 insertions(+), 48 deletions(-)
 create mode 100644 src/backend/chat/agents/history_processors.py
 create mode 100644 src/backend/chat/migrations/0008_chatconversation_history_summary.py
 create mode 100644 src/backend/chat/migrations/0009_chatconversation_history_summary_checkpoint.py
 create mode 100644 src/backend/chat/tests/agents/test_history_processors.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 247e9e76..d3c9b507 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to
 - 🐛(fix) add prevent_url_hallucination instruction to ConversationAgent
 - ✨(projects) handle project files for RAG search
 - ✨(banner) configurable banner with level, title, content and start/end
+- ✨(conversation) summarize messages
 
 ### Changed
 
diff --git a/docs/attachments.md b/docs/attachments.md
index 1dd15eb0..a449a29b 100644
--- a/docs/attachments.md
+++ b/docs/attachments.md
@@ -425,11 +425,14 @@ Notes:
 
 The decision of which documents are inlined as `full-context` vs left as `tool_call_only` is made by `chat/document_context_builder.py:build_document_context_instruction` on each turn:
 
-1. Compute the `document_budget` in tokens:
+1. Compute budgets in tokens (`chat/clients/pydantic_ai.py` subtracts the security buffer once, then splits the remainder):
    ```text
-   document_budget = max(int(model.max_token_context * DOCUMENT_CONTEXT_BUDGET_RATIO)
-                         - DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS, 0)
+   usable_context = max(model.max_token_context - DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS, 0)
+   document_budget = max(int(usable_context * DOCUMENT_CONTEXT_BUDGET_RATIO), 0)
    ```
+   The conversation history budget (summarization trigger) uses the other share:
+   `message_token_budget = max(int(usable_context * (1 - DOCUMENT_CONTEXT_BUDGET_RATIO)), 0)`.
+   `build_document_context_instruction` receives `usable_context` as its `max_token_context` argument (buffer already applied).
 2. Iterate documents oldest-first. For each document:
    - If its token count exceeds the whole budget alone → keep `tool_call_only`.
    - Otherwise, while adding it would overflow the budget, **evict the oldest currently-inlined document** (FIFO): demote it to `tool_call_only`, free its tokens.
@@ -444,6 +447,17 @@ Token estimation uses `tiktoken` with the `cl100k_base` encoding (GPT-4 tokenize
 The assembled instruction is **cached** per turn keyed on:
 `conversation_id`, `user_id`, `model_hrid`, `model.max_token_context`, `DOCUMENT_CONTEXT_BUDGET_RATIO`, `DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS`, and a fingerprint of `(attachment.id, attachment.updated_at)` for every text attachment - **conversation and project text attachments both contribute to the fingerprint**. Any attachment add / remove / edit (including project files), or any settings change, invalidates the cache. TTL is 30 minutes (`CACHE_TIMEOUT`).
 
+#### Conversation history summarization
+
+When `message_token_budget` is exceeded, `chat/agents/history_processors.py` calls a separate summarization model (`LLM_SUMMARIZATION_MODEL_HRID`) and stores the result on the conversation (`history_summary`, `history_summary_checkpoint`). This runs at the **start of a new user turn**, before `agent.iter`, against **stored** `pydantic_messages` from previous turns only (the current user prompt is extracted separately and is not in that list yet). That stored history usually ends on an assistant `ModelResponse`.
+
+1. **Trigger**: estimated tokens in the active history slice exceed `message_token_budget` (see formulas above) and there are new messages after the last checkpoint.
+2. **After a summary**: the model receives the stored summary text (dynamic instruction) plus the last `CONVERSATION_SUMMARY_CONTEXT_MESSAGES` `ModelMessage` entries before the checkpoint. Use an **even** value so the retained window starts on a user `ModelRequest` in a plain user/assistant alternation (tool messages can break parity).
+3. **Summary length**: capped by `CONVERSATION_SUMMARY_MAX_TOKENS` on the summarization LLM call.
+4. **Disable summarization**: there is no dedicated switch; summarization is skipped whenever the derived `message_token_budget` is zero. Either remove `max_token_context` from the chat model in `LLM_CONFIGURATIONS` (this also keeps every document `tool_call_only`), or set `DOCUMENT_CONTEXT_BUDGET_RATIO=1` (this reallocates all `usable_context` to documents).
+
+The security buffer is **not** a dedicated reserve for system prompts, tool schemas, or completion tokens; those are added on top of the planned document/history split. Size the buffer (and/or plan `max_token_context` below the model nominal window) to leave headroom for that overhead.
+
 #### Targeted document operations (`document_id`)
 
 Three tools accept an optional `document_id` argument, each with its own IDOR boundary:
@@ -522,8 +536,11 @@ A `READY` attachment whose `rag_document_id` is null (e.g. parse succeeded but t
 | `RAG_DOCUMENT_SEARCH_BACKEND`                | `AlbertRagBackend` | Import path of the vector-search backend used for indexing and search (Albert or Find) |
 | `PROJECT_FILES_MAX_COUNT`                    | `10`           | Max non-image attachments per project (excludes hidden markdown companions). Enforced at upload-time in `ChatProjectAttachmentViewSet`. Bounds per-turn system-prompt token cost (every entry contributes to `project_documents` on every conversation turn). |
 | `PROJECT_IMAGES_MAX_COUNT`                   | `3`            | Max image attachments per project. Enforced at upload-time. Bounds per-turn vision token cost - every project image is pinned to every turn alongside conversation-message images, and provider request-level image caps (Anthropic ~20/request) clip the trailing entries first. |
-| `DOCUMENT_CONTEXT_BUDGET_RATIO`              | `0.5`          | Fraction of `model.max_token_context` reserved for inlined documents (0 disables full-context inlining; everything stays `tool_call_only`) |
-| `DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS`    | `1000`         | Tokens subtracted from the inlining budget to absorb tokenizer drift on non-OpenAI models |
+| `DOCUMENT_CONTEXT_BUDGET_RATIO`              | `0.5`          | Fraction of `usable_context` (after the security buffer) reserved for inlined documents. `(1 - ratio)` goes to the conversation history budget (summarization trigger). `0` disables full-context inlining (everything stays `tool_call_only`). |
+| `DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS`    | `1000`         | Tokens subtracted once from `model.max_token_context` before the document/history split, to absorb tokenizer drift on non-OpenAI models and leave headroom beyond the planned split |
+| `CONVERSATION_SUMMARY_CONTEXT_MESSAGES`      | `10`           | Number of pydantic-ai `ModelMessage` entries kept in runtime history after a conversation summary (in addition to the stored summary text). Use an even value (see [Conversation history summarization](#conversation-history-summarization)) |
+| `CONVERSATION_SUMMARY_MAX_TOKENS`            | `2048`         | `max_tokens` for the summarization LLM call when generating or updating `history_summary` |
+| `LLM_SUMMARIZATION_MODEL_HRID`               | `default-summarization-model` | HRID in `LLM_CONFIGURATIONS` for the conversation summarization agent (see [LLM Configuration](llm-configuration.md)) |
 
 #### RAG_FILES_ACCEPTED_FORMATS
 
@@ -571,7 +588,7 @@ RAG_FILES_ACCEPTED_FORMATS = [
 
 ### Per-model setting: `max_token_context`
 
-Each entry in `LLM_CONFIGURATIONS` accepts a `max_token_context` integer field declaring the model's context window size. When set, it drives the inlining budget for the documents listing (`document_budget = max_token_context * DOCUMENT_CONTEXT_BUDGET_RATIO - DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS`).
+Each entry in `LLM_CONFIGURATIONS` accepts a `max_token_context` integer field declaring the model's context window size. When set, it drives document inlining and conversation summarization budgets (`usable_context = max_token_context - DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS`; `document_budget = usable_context * DOCUMENT_CONTEXT_BUDGET_RATIO`).
 
 If a model has no `max_token_context`, all of its documents are kept `tool_call_only` regardless of size and a warning is logged on every chat turn. Setting the field accurately matters: too low and small documents get pushed to RAG-only when they could be inlined; too high and the LLM may exceed its real window.
 
diff --git a/docs/env.md b/docs/env.md
index 3d408c5c..7dafd234 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -80,6 +80,10 @@ These are the environment variables you can set for the `conversations-backend`
 | LLM_CONFIGURATION_FILE_PATH                     | Path to the LLM configuration JSON file. See [LLM Configuration](llm-configuration.md) for details                                | <BASE_DIR>/conversations/configuration/llm/default.json |
 | LLM_DEFAULT_MODEL_HRID                          | HRID of the model used for conversations                                                                                          | default-model                                           |
 | LLM_SUMMARIZATION_MODEL_HRID                    | HRID of the model used for summarization                                                                                          | default-summarization-model                             |
+| DOCUMENT_CONTEXT_BUDGET_RATIO                   | Fraction of `usable_context` (after security buffer) for inlined conversation documents; `(1 - ratio)` is the conversation history token budget. See [attachments.md](attachments.md) | `0.5`                                                   |
+| DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS         | Tokens subtracted once from `max_token_context` before the document/history split                                                              | `1000`                                                  |
+| CONVERSATION_SUMMARY_CONTEXT_MESSAGES           | `ModelMessage` count kept after a conversation summary (use an even value). See [attachments.md](attachments.md)                               | `10`                                                    |
+| CONVERSATION_SUMMARY_MAX_TOKENS                 | Max tokens for the conversation summarization LLM output                                                                                        | `2048`                                                  |
 | AI_API_KEY                                      | AI API key to be used for the default provider (used in default LLM configuration, not for production use)                        |                                                         |
 | AI_BASE_URL                                     | OpenAI compatible AI base URL (used in default LLM configuration, not for production use)                                         |                                                         |
 | AI_MODEL                                        | AI Model name to use (used in default LLM configuration, not for production use)                                                  |                                                         |
diff --git a/src/backend/chat/agents/history_processors.py b/src/backend/chat/agents/history_processors.py
new file mode 100644
index 00000000..8e0f5256
--- /dev/null
+++ b/src/backend/chat/agents/history_processors.py
@@ -0,0 +1,338 @@
+"""History processors for model message cleanup."""
+
+import dataclasses
+import logging
+
+from pydantic_ai.messages import (
+    ModelMessage,
+    ModelRequest,
+    ModelResponse,
+    TextPart,
+    ToolCallPart,
+    ToolReturnPart,
+    UserPromptPart,
+)
+
+from chat.document_context_builder import count_approx_tokens
+
+from .summarize import SummarizationAgent
+
+logger = logging.getLogger(__name__)
+
+SUMMARY_SYSTEM_PREFIX = (  # header placed in front of the stored summary text
+    "[Conversation summary from previous turns] (context only, not a user request):\n"
+)
+
+
+@dataclasses.dataclass(frozen=True)
+class HistoryCleanupResult:
+    """Outcome of a history cleanup pass: the runtime window plus any new summary."""
+
+    history: list[ModelMessage]  # active history window
+    summary: str | None = None  # set only when a new summary was generated
+    summary_checkpoint: int | None = None  # message count covered by `summary`
+
+
+def _stringify_message_content(content: object) -> str:
+    """Convert part content to plain text (``None``, at any level, becomes empty)."""
+    if content is None:
+        return ""
+    if isinstance(content, list):
+        return " ".join(str(item) for item in content if item is not None)
+    return str(content)
+
+
+def _format_exchanges_for_summary(messages: list[ModelMessage]) -> str:
+    """Render user prompts and assistant text as ``Role: text`` lines.
+
+    Other part kinds and parts that are blank once stripped are skipped.
+    """
+    rendered: list[str] = []
+    for message in messages:
+        if isinstance(message, ModelRequest):
+            speaker, wanted = "User", UserPromptPart
+        elif isinstance(message, ModelResponse):
+            speaker, wanted = "Assistant", TextPart
+        else:
+            continue
+        for part in message.parts:
+            if not isinstance(part, wanted):
+                continue
+            if body := _stringify_message_content(part.content).strip():
+                rendered.append(f"{speaker}: {body}")
+    return "\n".join(rendered)
+
+
+async def conversation_summarization(
+    messages: list[ModelMessage], *, max_tokens: int = 300, previous_summary: str | None = None
+) -> str | None:
+    """
+    Fold `messages` into `previous_summary`; return the new summary, or None on failure.
+    """
+
+    exchanges = _format_exchanges_for_summary(messages)
+    prompt = (
+        "You are a conversation summarization assistant. Your role is to maintain\n"
+        "a concise and accurate running summary of a conversation, "
+        "omitting small talk and unrelated topics.\n\n"
+        "Given the previous summary (if any) and the new exchanges provided,\n"
+        "generate an updated summary that:\n\n"
+        "- **Preserves** every key information, decisions, and important facts\n"
+        "- **Integrates** the new exchanges in a coherent way\n"
+        "- **Removes** redundant or non-essential details\n"
+        "- **Maintains** the context needed for the conversation to continue\n"
+        "- Is written in a neutral, factual, third-person style\n"
+        "- Stays **concise** (5-10 lines maximum)\n\n"
+        "## Previous Summary:\n"
+        f"{previous_summary if previous_summary else ''}\n\n"
+        "## New Exchanges:\n"
+        f"{exchanges}\n\n"
+        "Only answer with the updated summary, including the new exchanges "
+        "information and the previous summary.\n\n"
+        "## Updated Summary:\n"
+    )
+    logger.debug("Prompt for summarization: %s", prompt)
+    logger.debug("Latest summary: %s", previous_summary)
+    try:
+        # Build the agent inside the guard: a misconfigured summarization model must
+        # degrade to "no summary" exactly like a failed LLM call.
+        resp = await SummarizationAgent().run(
+            prompt,
+            model_settings={"max_tokens": max_tokens},
+        )
+    except Exception as exc:  # pylint: disable=broad-except  # noqa: BLE001
+        logger.warning("Conversation summarization failed: %s", exc, exc_info=True)
+        return None
+    updated_summary = (resp.output or "").strip()
+    logger.debug("Updated summary: %s", updated_summary)
+    return updated_summary or None
+
+
+def _latest_tool_call_ids(messages: list[ModelMessage]) -> set[str]:
+    """Return the tool call ids of the most recent response that issued tool calls."""
+    for candidate in reversed(messages):
+        if isinstance(candidate, ModelResponse):
+            call_ids = {
+                part.tool_call_id
+                for part in candidate.parts
+                if isinstance(part, ToolCallPart) and isinstance(part.tool_call_id, str)
+            }
+            if call_ids:
+                return call_ids
+    return set()
+
+
+def _clean_request_parts(parts: list, latest_tool_call_ids: set[str]) -> list:
+    """Swap tool returns from outside the latest tool cycle for a short placeholder."""
+    result = []
+    for item in parts:
+        # Stale = a tool return that does not answer one of the latest tool calls.
+        stale = isinstance(item, ToolReturnPart) and item.tool_call_id not in latest_tool_call_ids
+        if not stale:
+            result.append(item)
+            continue
+        label = getattr(item, "tool_name", None) or "unknown_tool"
+        result.append(
+            ToolReturnPart(
+                tool_call_id=item.tool_call_id,
+                tool_name=label,
+                content=f"<{label} response compacted>",
+            )
+        )
+    return result
+
+
+def clean_tool_history(messages: list[ModelMessage]) -> list[ModelMessage]:
+    """Return a new history in which stale tool returns are replaced by placeholders.
+
+    Non-request messages pass through untouched; requests are rebuilt from their
+    cleaned parts and dropped when no part is left.
+    """
+    protected_ids = _latest_tool_call_ids(messages)
+    result: list[ModelMessage] = []
+    for entry in messages:
+        if not isinstance(entry, ModelRequest):
+            result.append(entry)
+        elif parts := _clean_request_parts(entry.parts, protected_ids):
+            result.append(dataclasses.replace(entry, parts=parts))
+
+    return result
+
+
+def safe_clean_tool_history(messages: list[ModelMessage]) -> list[ModelMessage]:
+    """Best-effort wrapper around `clean_tool_history`.
+
+    On any error the failure is logged and `messages` is returned unchanged.
+    """
+    try:
+        compacted = clean_tool_history(messages)
+    except Exception as exc:  # pylint: disable=broad-except  # noqa: BLE001
+        logger.warning("Tool history cleanup failed, using raw history: %s", exc, exc_info=True)
+        return messages
+    return compacted
+
+
+def _history_cleanup_fallback(
+    messages: list[ModelMessage], summary_checkpoint: int, context_messages: int
+) -> HistoryCleanupResult:
+    """Build a summary-less result holding only the active history window."""
+    clamped = _safe_checkpoint(messages, summary_checkpoint)
+    return HistoryCleanupResult(_active_history(messages, clamped, context_messages))
+
+
+def _estimate_message_tokens(message: ModelMessage) -> int:
+    """Estimate the token weight of one model message.
+
+    Counts every part's `content` and tool-call `args` (JSON string or dict), plus a
+    fixed overhead of 4 tokens per part and 4 per message. Part-less messages weigh 0.
+    """
+    parts = getattr(message, "parts", []) or []
+    part_tokens = 0
+    for part in parts:
+        text = _stringify_message_content(getattr(part, "content", "")).strip()
+        if text:
+            part_tokens += count_approx_tokens(text)
+        args = getattr(part, "args", None)
+        # pydantic-ai tool-call args may be a JSON string or an already-parsed dict.
+        args_text = (str(args) if isinstance(args, dict) and args else args) or ""
+        if isinstance(args_text, str) and args_text.strip():
+            part_tokens += count_approx_tokens(args_text)
+    return (part_tokens + 4 * len(parts) + 4) if parts else 0
+
+
+def _estimate_history_tokens(messages: list[ModelMessage]) -> int:
+    """Sum the per-message token estimates over `messages`."""
+    return sum(map(_estimate_message_tokens, messages))
+
+
+def _safe_checkpoint(messages: list[ModelMessage], summary_checkpoint: int) -> int:
+    """Bound the stored checkpoint to the range ``0..len(messages)``."""
+    return min(max(summary_checkpoint, 0), len(messages))
+
+
+def _active_start_index(
+    messages: list[ModelMessage], summary_checkpoint: int, context_messages: int
+) -> int:
+    """Index `context_messages` (at least 1) entries before the clamped checkpoint."""
+    clamped = _safe_checkpoint(messages, summary_checkpoint)
+    return max(clamped - max(1, context_messages), 0)
+
+
+def _active_history(
+    messages: list[ModelMessage], summary_checkpoint: int, context_messages: int
+) -> list[ModelMessage]:
+    """Return retained context before the checkpoint plus all later messages."""
+    start = _active_start_index(messages, summary_checkpoint, context_messages)
+    # A tool return must follow its tool call: never open the window on an orphaned one.
+    while start > 0 and any(isinstance(p, ToolReturnPart) for p in messages[start].parts):
+        start -= 1
+    return messages[start:]
+
+
+def build_active_history(
+    messages: list[ModelMessage], summary_checkpoint: int, context_messages: int
+) -> list[ModelMessage]:
+    """Public wrapper returning the history window kept for `summary_checkpoint`."""
+    return _active_history(messages, summary_checkpoint, context_messages)
+
+
+async def maybe_summarize_history(  # noqa: PLR0913  # pylint: disable=too-many-arguments
+    messages: list[ModelMessage],
+    *,
+    previous_summary: str | None = None,
+    summary_checkpoint: int = 0,
+    message_token_budget: int = 0,
+    context_messages: int = 10,
+    summary_max_tokens: int = 2048,
+    allow_summary_generation: bool = True,
+) -> HistoryCleanupResult:
+    """
+    Summarize when active history exceeds token budget.
+
+    Called at the start of a new user turn against stored history from previous
+    turns (the current user prompt is not in `messages` yet). That history
+    normally ends on an assistant ModelResponse; use an even `context_messages`
+    so the post-summary window starts on a user message.
+
+    `summary_checkpoint` is the message index up to which `previous_summary` is
+    valid; the window keeps `context_messages` entries before it plus later ones.
+    The result carries `summary`/`summary_checkpoint` only when a new summary was
+    generated; empty output or any error returns the active window alone.
+    """
+    try:
+        checkpoint = _safe_checkpoint(messages, summary_checkpoint)
+        active_history = _active_history(messages, checkpoint, context_messages)
+        active_tokens = _estimate_history_tokens(active_history)
+        logger.debug(
+            (
+                "maybe_summarize_history state: total_messages=%s checkpoint=%s "
+                "active_messages=%s active_tokens=%s token_budget=%s"
+            ),
+            len(messages),
+            checkpoint,
+            len(active_history),
+            active_tokens,
+            message_token_budget,
+        )
+        # Summarization is off (budget <= 0) or the window still fits: return it unchanged.
+        if message_token_budget <= 0 or active_tokens <= message_token_budget:
+            return HistoryCleanupResult(history=active_history)
+        # A new summary would cover the whole stored history up to its current end.
+        next_checkpoint = len(messages)
+        if not allow_summary_generation or next_checkpoint <= checkpoint:
+            return HistoryCleanupResult(history=active_history)
+        # Only the messages past the previous checkpoint are sent to the summarizer.
+        summary_input = messages[checkpoint:next_checkpoint]
+        logger.debug(
+            "maybe_summarize_history summarizing messages %s:%s (%s message(s)).",
+            checkpoint,
+            next_checkpoint,
+            len(summary_input),
+        )
+        summary = await conversation_summarization(
+            summary_input,
+            max_tokens=summary_max_tokens,
+            previous_summary=previous_summary,
+        )
+        if not summary:
+            logger.warning(
+                "No updated summary generated, keeping previous summary and active context."
+            )
+            return HistoryCleanupResult(history=active_history)
+        # Success: rebuild the window around the new checkpoint and report the summary.
+        return HistoryCleanupResult(
+            history=_active_history(messages, next_checkpoint, context_messages),
+            summary=summary,
+            summary_checkpoint=next_checkpoint,
+        )
+    except Exception as exc:  # pylint: disable=broad-except  # noqa: BLE001
+        logger.warning(
+            "History summarization failed, keeping active context: %s",
+            exc,
+            exc_info=True,
+        )
+        return _history_cleanup_fallback(messages, summary_checkpoint, context_messages)
+
+
+# Tells the caller ahead of the run whether the current turn will produce a
+# summary, so that it can trigger a frontend summary event.
+def should_generate_conversation_summary(
+    messages: list[ModelMessage],
+    *,
+    previous_summary: str | None = None,
+    summary_checkpoint: int = 0,
+    message_token_budget: int = 0,
+    context_messages: int = 10,
+) -> bool:
+    """Tell whether the active history is over budget with unsummarized messages left."""
+    del previous_summary  # accepted for signature parity only
+    # A non-positive budget means summarization is disabled.
+    if message_token_budget <= 0:
+        return False
+
+    compacted = safe_clean_tool_history(messages)
+    clamped = _safe_checkpoint(compacted, summary_checkpoint)
+    window = _active_history(compacted, clamped, context_messages)
+    over_budget = _estimate_history_tokens(window) > message_token_budget
+
+    return over_budget and len(compacted) > clamped
diff --git a/src/backend/chat/clients/pydantic_ai.py b/src/backend/chat/clients/pydantic_ai.py
index 43299bd3..520a7324 100644
--- a/src/backend/chat/clients/pydantic_ai.py
+++ b/src/backend/chat/clients/pydantic_ai.py
@@ -127,6 +127,12 @@
 
 from chat import models
 from chat.agents.conversation import ConversationAgent, TitleGenerationAgent
+from chat.agents.history_processors import (
+    SUMMARY_SYSTEM_PREFIX,
+    maybe_summarize_history,
+    safe_clean_tool_history,
+    should_generate_conversation_summary,
+)
 from chat.agents.local_media_url_processors import (
     build_project_image_urls,
     update_history_local_urls,
@@ -251,6 +257,7 @@ def __init__(  # pylint: disable=too-many-arguments,too-many-positional-argument
         self.conversation = conversation
         self.user = user  # authenticated user only
         self.model_hrid = model_hrid or settings.LLM_DEFAULT_MODEL_HRID  # HRID of the model to use
+        self.model_configuration = get_model_configuration(self.model_hrid)
         self.language = language  # might be None
         self._last_stop_check = 0
 
@@ -259,7 +266,7 @@ def __init__(  # pylint: disable=too-many-arguments,too-many-positional-argument
         self.event_encoder = EventEncoder(CURRENT_EVENT_ENCODER_VERSION)  # We use v4 for now
 
         self._support_streaming = True
-        if (streaming := get_model_configuration(self.model_hrid).supports_streaming) is not None:
+        if (streaming := self.model_configuration.supports_streaming) is not None:
             self._support_streaming = streaming
 
         # Feature flags
@@ -276,6 +283,8 @@ def __init__(  # pylint: disable=too-many-arguments,too-many-positional-argument
         )
         self._web_search_tool_registered = False
         self._self_documentation_tool_registered = False
+        self._history_summary = conversation.history_summary.strip() or None
+        self._history_summary_checkpoint = max(conversation.history_summary_checkpoint, 0)
 
         self.conversation_agent = ConversationAgent(
             model_hrid=self.model_hrid,
@@ -287,8 +296,17 @@ def __init__(  # pylint: disable=too-many-arguments,too-many-positional-argument
             if self._langfuse_available
             else False,
             deps_type=ContextDeps,
+            history_processors=[self._history_processor],
         )
         add_document_rag_search_tool_from_setting(self.conversation_agent, self.user)
+        max_token_context = self.model_configuration.max_token_context or 0
+        usable_token_context = max(
+            max_token_context - settings.DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS,
+            0,
+        )
+        self._conversation_message_token_budget = max(
+            int((1 - settings.DOCUMENT_CONTEXT_BUDGET_RATIO) * usable_token_context), 0
+        )
 
         # Inject project-level custom instructions if the conversation belongs to a project
         llm_user_instructions = (
@@ -302,6 +320,12 @@ def __init__(  # pylint: disable=too-many-arguments,too-many-positional-argument
             def project_instructions() -> str:
                 return llm_user_instructions
 
+        @self.conversation_agent.instructions
+        def history_summary_instructions() -> str:
+            if not self._history_summary:
+                return ""
+            return f"{SUMMARY_SYSTEM_PREFIX}{self._history_summary.strip()}"
+
     @property
     def _stop_cache_key(self):
         return f"streaming:stop:{self.conversation.pk}"
@@ -392,8 +416,16 @@ async def _clean(self):
     # --------------------------------------------------------------------- #
 
     async def _prepare_agent_run(
-        self, messages: List[UIMessage]
-    ) -> Tuple[str, List, List, Dict[str, str], Dict[str, Union[int, float]], List, bool]:
+        self, messages: List[UIMessage], history: List[ModelMessage]
+    ) -> Tuple[
+        str,
+        List,
+        List,
+        Dict[str, str],
+        Dict[str, Union[int, float]],
+        List,
+        bool,
+    ]:
         """
         Prepare all inputs needed before running the agent.
 
@@ -416,7 +448,6 @@ async def _prepare_agent_run(
             - history: Validated message history for the agent
             - conversation_has_documents: Whether RAG should be enabled
         """
-        history = ModelMessagesTypeAdapter.validate_python(self.conversation.pydantic_messages)
         history = update_history_local_urls(
             self.conversation, history
         )  # presign URLs for local images
@@ -479,6 +510,32 @@ async def _prepare_agent_run(
             conversation_has_documents,
         )
 
+    async def _apply_history_cleanup(
+        self, history: list[ModelMessage], *, allow_summary_generation: bool
+    ) -> list[ModelMessage]:
+        """Compact history and copy any new summary state onto the conversation object."""
+        compacted = safe_clean_tool_history(history)
+        outcome = await maybe_summarize_history(
+            compacted,
+            previous_summary=self._history_summary,
+            summary_checkpoint=self._history_summary_checkpoint,
+            message_token_budget=self._conversation_message_token_budget,
+            context_messages=settings.CONVERSATION_SUMMARY_CONTEXT_MESSAGES,
+            summary_max_tokens=settings.CONVERSATION_SUMMARY_MAX_TOKENS,
+            allow_summary_generation=allow_summary_generation,
+        )
+        if outcome.summary:
+            self._history_summary = outcome.summary
+            self.conversation.history_summary = outcome.summary
+        if (new_checkpoint := outcome.summary_checkpoint) is not None:
+            self._history_summary_checkpoint = new_checkpoint
+            self.conversation.history_summary_checkpoint = new_checkpoint
+        return outcome.history
+
+    async def _history_processor(self, history: list[ModelMessage]) -> list[ModelMessage]:
+        """History processor registered on the conversation agent: compacts old tool returns."""
+        return safe_clean_tool_history(history)
+
     def _setup_web_search(self, force_web_search: bool) -> bool:
         """Configure web search if forced. Returns whether web search is actually
         forced."""
@@ -769,7 +826,8 @@ async def _build_document_context_instruction(self) -> str:
                     upload_state=models.AttachmentStatus.READY,
                 ).order_by("created_at", "id")
             )
-        max_token_context = self.conversation_agent.configuration.max_token_context
+        max_token_context = self.model_configuration.max_token_context or 0
+        usable_token_context = max(max_token_context - security_buffer_tokens, 0)
 
         # Cache the assembled instruction: building it requires reading every text
         # attachment from object storage and tokenizing it. The fingerprint includes
@@ -779,7 +837,7 @@ async def _build_document_context_instruction(self) -> str:
             f"conv={self.conversation.id}",
             f"user={getattr(self.user, 'id', None)}",
             f"model={self.model_hrid}",
-            f"max_ctx={max_token_context}",
+            f"max_ctx={usable_token_context}",
             f"ratio={budget_ratio}",
             f"buffer={security_buffer_tokens}",
             *(f"{att.id}:{att.updated_at.isoformat()}" for att in text_attachments),
@@ -799,9 +857,8 @@ async def _build_document_context_instruction(self) -> str:
             text_attachments=text_attachments,
             project_text_attachments=project_text_attachments,
             model_hrid=self.model_hrid,
-            max_token_context=max_token_context,
+            max_token_context=usable_token_context,
             budget_ratio=budget_ratio,
-            security_buffer_tokens=security_buffer_tokens,
         )
         await sync_to_async(cache.set)(cache_key, instruction, CACHE_TIMEOUT)
         return instruction
@@ -1179,7 +1236,8 @@ async def _finalize_conversation(  # pylint: disable=too-many-arguments,too-many
             ),
         )
 
-    async def _run_agent(  # pylint: disable=too-many-locals
+    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
+    async def _run_agent(  # noqa: PLR0912
         self,
         messages: List[UIMessage],
         force_web_search: bool = False,
@@ -1200,7 +1258,7 @@ async def _run_agent(  # pylint: disable=too-many-locals
             )
         else:
             langfuse = None
-
+        raw_history = ModelMessagesTypeAdapter.validate_python(self.conversation.pydantic_messages)
         (
             user_prompt,
             input_images,
@@ -1209,7 +1267,24 @@ async def _run_agent(  # pylint: disable=too-many-locals
             usage,
             history,
             conversation_has_documents,
-        ) = await self._prepare_agent_run(messages)
+        ) = await self._prepare_agent_run(messages, raw_history)
+
+        # Conversation summary process. This runs before agent.iter so dynamic
+        # instructions can read the updated summary in the same model request.
+        should_emit_summary_event = should_generate_conversation_summary(
+            history,
+            previous_summary=self._history_summary,
+            summary_checkpoint=self._history_summary_checkpoint,
+            message_token_budget=self._conversation_message_token_budget,
+            context_messages=settings.CONVERSATION_SUMMARY_CONTEXT_MESSAGES,
+        )
+        if should_emit_summary_event:
+            tool_call_id = str(uuid.uuid4())
+            yield events_v4.ToolCallPart(
+                tool_call_id=tool_call_id,
+                tool_name="summarize",
+                args={"state": "running", "summary_scope": "conversation"},
+            )
 
         doc_result = None
         async for item in self._handle_input_documents(
@@ -1225,6 +1300,16 @@ async def _run_agent(  # pylint: disable=too-many-locals
 
         conversation_has_documents = doc_result.has_documents
 
+        history = await self._apply_history_cleanup(
+            history,
+            allow_summary_generation=should_emit_summary_event,
+        )
+        if should_emit_summary_event:
+            yield events_v4.ToolResultPart(
+                tool_call_id=tool_call_id,
+                result={"state": "done"},
+            )
+
         await self._agent_stop_streaming(force_cache_check=True)
         self._setup_self_documentation_tool()
         self._setup_web_search_tool()
@@ -1242,10 +1327,12 @@ async def _run_agent(  # pylint: disable=too-many-locals
             if history and history[-1].kind == "request":
                 if history[-1].parts and history[-1].parts[-1].part_kind == "tool-return":
                     history.append(ModelResponse(parts=[TextPart(content="ok")], kind="response"))
+            message_history = history if history else None
 
             async with self.conversation_agent.iter(
                 [user_prompt] + input_images,
-                message_history=history,  # history will pass through agent's history_processors
+                # History passes through history_processors when set on the agent.
+                message_history=message_history,
                 deps=self._context_deps,
                 toolsets=mcp_servers,
             ) as run:
diff --git a/src/backend/chat/document_context_builder.py b/src/backend/chat/document_context_builder.py
index 98dc6c47..55c7a124 100644
--- a/src/backend/chat/document_context_builder.py
+++ b/src/backend/chat/document_context_builder.py
@@ -267,7 +267,6 @@ async def build_document_context_instruction(  # noqa: PLR0913 # pylint: disable
     model_hrid: str,
     max_token_context: int | None,
     budget_ratio: float,
-    security_buffer_tokens: int,
     project_text_attachments: Sequence[models.ChatConversationAttachment] = (),
 ) -> str:
     """
@@ -328,7 +327,7 @@ async def build_document_context_instruction(  # noqa: PLR0913 # pylint: disable
             )
         )
 
-    document_budget = max(int(max_token_context * budget_ratio) - security_buffer_tokens, 0)
+    document_budget = max(int(max_token_context * budget_ratio), 0)
 
     async def _load_document(
         index: int, attachment: models.ChatConversationAttachment
diff --git a/src/backend/chat/llm_configuration.py b/src/backend/chat/llm_configuration.py
index a534fb2f..64820d62 100644
--- a/src/backend/chat/llm_configuration.py
+++ b/src/backend/chat/llm_configuration.py
@@ -141,11 +141,11 @@ class LLModel(BaseModel):
     is_active: bool
     icon: LongStringAsListValue | None = None
     supports_streaming: bool | None = None
+    max_token_context: int | None = None
     system_prompt: SettingEnvValue
     tools: list[str]
     web_search: SettingEnvValue | None = None
     concatenate_instruction_messages: bool | None = None
-    max_token_context: int | None = None
 
     @field_validator("tools", mode="before")
     @classmethod
@@ -176,20 +176,20 @@ def validate_web_search(cls, value: str | None) -> str | None:
 
     @field_validator("max_token_context", mode="before")
     @classmethod
-    def validate_max_token_context(cls, value: int | str | None) -> int | None:
-        """Accept integer-like values from JSON for model context size."""
+    def validate_max_token_context(cls, value: Any) -> int | None:
+        """Parse max_token_context from literal, setting, or env value."""
         if value is None or value == "":
             return None
         if isinstance(value, bool):
             # bool is an int subclass in Python, reject explicitly so True/False
             # don't become 1/0.
             raise ValueError("max_token_context must be an integer value.")
-        if isinstance(value, int):
-            parsed_value = value
-        elif isinstance(value, str):
+        if isinstance(value, str):
+            value = _get_setting_or_env_or_value(value)
+        try:
             parsed_value = int(value)
-        else:
-            raise ValueError("max_token_context must be an integer value.")
+        except (TypeError, ValueError) as exc:
+            raise ValueError("max_token_context must be an integer value.") from exc
 
         if parsed_value <= 0:
             raise ValueError("max_token_context must be a positive integer")
diff --git a/src/backend/chat/migrations/0008_chatconversation_history_summary.py b/src/backend/chat/migrations/0008_chatconversation_history_summary.py
new file mode 100644
index 00000000..25bf5b9f
--- /dev/null
+++ b/src/backend/chat/migrations/0008_chatconversation_history_summary.py
@@ -0,0 +1,19 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("chat", "0007_chatconversationattachment_project_and_more"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="chatconversation",
+            name="history_summary",
+            field=models.TextField(
+                blank=True,
+                default="",
+                help_text="Latest generated conversation summary used as system context",
+            ),
+        ),
+    ]
diff --git a/src/backend/chat/migrations/0009_chatconversation_history_summary_checkpoint.py b/src/backend/chat/migrations/0009_chatconversation_history_summary_checkpoint.py
new file mode 100644
index 00000000..1b670314
--- /dev/null
+++ b/src/backend/chat/migrations/0009_chatconversation_history_summary_checkpoint.py
@@ -0,0 +1,18 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("chat", "0008_chatconversation_history_summary"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="chatconversation",
+            name="history_summary_checkpoint",
+            field=models.PositiveIntegerField(
+                default=0,
+                help_text="Number of pydantic history messages already compacted into history_summary",
+            ),
+        ),
+    ]
diff --git a/src/backend/chat/models.py b/src/backend/chat/models.py
index 8a40e5c2..0b87007e 100644
--- a/src/backend/chat/models.py
+++ b/src/backend/chat/models.py
@@ -136,6 +136,15 @@ class ChatConversation(BaseModel):
         blank=True,
         help_text="Pydantic messages for the chat conversation, used for history",
     )
+    history_summary = models.TextField(
+        blank=True,
+        default="",
+        help_text="Latest generated conversation summary used as system context",
+    )
+    history_summary_checkpoint = models.PositiveIntegerField(
+        default=0,
+        help_text="Number of pydantic history messages already compacted into history_summary",
+    )
     messages: Sequence[UIMessage] = SchemaField(
         schema=list[UIMessage],
         default=list,
diff --git a/src/backend/chat/tests/agents/test_history_processors.py b/src/backend/chat/tests/agents/test_history_processors.py
new file mode 100644
index 00000000..66c8d166
--- /dev/null
+++ b/src/backend/chat/tests/agents/test_history_processors.py
@@ -0,0 +1,350 @@
+"""Tests for history processors."""
+
+import pytest
+from pydantic_ai import (
+    Agent,
+    ModelMessage,
+    ModelRequest,
+    ModelResponse,
+    TextPart,
+    UserPromptPart,
+)
+from pydantic_ai.messages import (
+    ToolCallPart,
+    ToolReturnPart,
+)
+from pydantic_ai.models.function import AgentInfo, FunctionModel
+
+from chat.agents import history_processors
+
+
+@pytest.fixture
+def _received_messages_fixture() -> list[ModelMessage]:
+    """Fixture to capture messages received by the function model."""
+    return []
+
+
+@pytest.fixture(name="received_messages")
+def received_messages_fixture_alias(
+    _received_messages_fixture: list[ModelMessage],
+) -> list[ModelMessage]:
+    """Expose received messages fixture with a stable pytest name."""
+    return _received_messages_fixture
+
+
+@pytest.fixture(name="function_model")
+def function_model_fixture(received_messages: list[ModelMessage]) -> FunctionModel:
+    """Fixture to capture model function."""
+
+    def capture_model_function(messages: list[ModelMessage], _info: AgentInfo) -> ModelResponse:
+        # Capture exactly what reaches the provider after history processors.
+        received_messages.clear()
+        received_messages.extend(messages)
+        return ModelResponse(parts=[TextPart(content="Provider response")])
+
+    return FunctionModel(capture_model_function)
+
+
+def _build_turns(turn_count: int) -> list:
+    """Build a list of turns for testing."""
+    messages = []
+    for turn in range(1, turn_count + 1):
+        messages.append(ModelRequest(parts=[UserPromptPart(content=[f"user-{turn}"])]))
+        messages.append(ModelResponse(parts=[TextPart(content=f"assistant-{turn}")]))
+    return messages
+
+
+def test_history_processors_are_applied_before_provider_call(
+    function_model: FunctionModel, received_messages: list[ModelMessage]
+):
+    """History processors should run before provider invocation."""
+
+    def keep_only_requests(messages: list[ModelMessage]) -> list[ModelMessage]:
+        return [msg for msg in messages if isinstance(msg, ModelRequest)]
+
+    agent = Agent(function_model, history_processors=[keep_only_requests])
+    message_history = [
+        ModelRequest(parts=[UserPromptPart(content="Question 1")]),
+        ModelResponse(parts=[TextPart(content="Answer 1")]),
+    ]
+
+    agent.run_sync("Question 2", message_history=message_history)
+    assert len(received_messages) == 1
+    assert isinstance(received_messages[0], ModelRequest)
+    user_prompt_contents = [
+        part.content for part in received_messages[0].parts if isinstance(part, UserPromptPart)
+    ]
+    assert user_prompt_contents == ["Question 1", "Question 2"]
+
+
+@pytest.mark.asyncio
+async def test_history_cleanup_keeps_full_history_when_under_budget():
+    """No summary should be produced when the active slice fits budget."""
+    messages = _build_turns(2)
+
+    result = await history_processors.maybe_summarize_history(messages, message_token_budget=10_000)
+
+    assert result.summary is None
+    assert result.history == messages
+
+
+def test_clean_tool_history_redacts_old_tool_returns_but_keeps_latest_tool_result():
+    """Old tool results are compacted, latest tool result stays intact."""
+    messages = [
+        ModelResponse(parts=[ToolCallPart(tool_call_id="old", tool_name="search", args="{}")]),
+        ModelRequest(
+            parts=[
+                ToolReturnPart(
+                    tool_call_id="old",
+                    tool_name="search",
+                    content="large old result",
+                )
+            ]
+        ),
+        ModelResponse(parts=[ToolCallPart(tool_call_id="latest", tool_name="search", args="{}")]),
+        ModelRequest(
+            parts=[
+                ToolReturnPart(
+                    tool_call_id="latest",
+                    tool_name="search",
+                    content="fresh result",
+                )
+            ]
+        ),
+    ]
+
+    result = history_processors.clean_tool_history(messages)
+
+    old_return = result[1].parts[0]
+    latest_return = result[3].parts[0]
+    assert isinstance(old_return, ToolReturnPart)
+    assert isinstance(latest_return, ToolReturnPart)
+    assert old_return.content == "<search response compacted>"
+    assert latest_return.content == "fresh result"
+
+
+@pytest.mark.asyncio
+async def test_history_cleanup_over_budget_generates_summary_and_advances_checkpoint(monkeypatch):
+    """Summary is generated when unsummarized runtime history exceeds budget."""
+    summarized_messages = []
+
+    async def fake_summary(messages, *, max_tokens=300, previous_summary=None):
+        summarized_messages.extend(messages)
+        _ = max_tokens
+        _ = previous_summary
+        return "summary-v1"
+
+    monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary)
+    messages = _build_turns(3)
+
+    result = await history_processors.maybe_summarize_history(
+        messages, message_token_budget=1, context_messages=1
+    )
+
+    assert result.summary == "summary-v1"
+    assert result.summary_checkpoint == len(messages)
+    assert result.history == messages[5:]
+    assert summarized_messages == messages[: result.summary_checkpoint]
+
+
+@pytest.mark.asyncio
+async def test_history_cleanup_existing_summary_uses_checkpoint_slice(monkeypatch):
+    """When already summarized, runtime history starts at checkpoint minus context."""
+
+    async def fake_summary(_messages, *, max_tokens=300, previous_summary=None):
+        _ = max_tokens
+        _ = previous_summary
+        raise AssertionError("Summary should not be regenerated at turn 5")
+
+    monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary)
+    messages = _build_turns(5)
+
+    result = await history_processors.maybe_summarize_history(
+        messages,
+        previous_summary="summary-v1",
+        summary_checkpoint=6,
+        message_token_budget=10_000,
+        context_messages=1,
+    )
+
+    assert result.summary is None
+    assert result.history == messages[5:]
+
+
+@pytest.mark.asyncio
+async def test_history_cleanup_budget_only_counts_active_window_after_checkpoint(monkeypatch):
+    """Old summarized messages should not retrigger summaries after checkpoint."""
+
+    async def fake_summary(_messages, *, max_tokens=300, previous_summary=None):
+        _ = max_tokens
+        _ = previous_summary
+        raise AssertionError("Messages before the active window should not count")
+
+    monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary)
+    messages = [
+        ModelRequest(parts=[UserPromptPart(content=["old user " + ("x " * 500)])]),
+        ModelResponse(parts=[TextPart(content="old assistant " + ("x " * 500))]),
+        ModelRequest(parts=[UserPromptPart(content=["context user"])]),
+        ModelResponse(parts=[TextPart(content="context assistant")]),
+        ModelRequest(parts=[UserPromptPart(content=["new user"])]),
+        ModelResponse(parts=[TextPart(content="new assistant")]),
+    ]
+
+    result = await history_processors.maybe_summarize_history(
+        messages,
+        previous_summary="summary-v1",
+        summary_checkpoint=4,
+        message_token_budget=100,
+        context_messages=1,
+    )
+
+    assert result.summary is None
+    assert result.history == messages[3:]
+
+
+@pytest.mark.asyncio
+async def test_history_cleanup_does_not_resummarize_when_checkpoint_is_current(monkeypatch):
+    """After a summary, context overlap alone should not be summarized again."""
+
+    async def fake_summary(_messages, *, max_tokens=300, previous_summary=None):
+        _ = max_tokens
+        _ = previous_summary
+        raise AssertionError("Latest active turn should not be summarized")
+
+    monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary)
+    messages = _build_turns(3)
+
+    result = await history_processors.maybe_summarize_history(
+        messages,
+        previous_summary="summary-v1",
+        summary_checkpoint=len(messages),
+        message_token_budget=1,
+        context_messages=1,
+    )
+
+    assert result.summary is None
+    assert result.summary_checkpoint is None
+    assert result.history == messages[5:]
+
+
+@pytest.mark.asyncio
+async def test_history_cleanup_never_returns_empty_history_when_checkpoint_is_at_end():
+    """pydantic-ai rejects empty processed history when it passed history in."""
+    messages = _build_turns(2)
+
+    result = await history_processors.maybe_summarize_history(
+        messages,
+        previous_summary="summary-v1",
+        summary_checkpoint=len(messages),
+        message_token_budget=10_000,
+        context_messages=1,
+    )
+
+    assert result.summary is None
+    assert result.history == messages[3:]
+
+
+def test_clean_tool_history_has_no_summary_checkpoint_behavior():
+    """The pydantic-ai history processor path only cleans tools."""
+    messages = _build_turns(2)
+
+    result = history_processors.clean_tool_history(messages)
+
+    assert result == messages
+
+
+@pytest.mark.asyncio
+async def test_history_cleanup_summary_failure_keeps_runtime_slice(monkeypatch):
+    """If summary generation fails, keep runtime slice unchanged."""
+
+    async def fake_summary(_messages, *, max_tokens=300, previous_summary=None):
+        _ = max_tokens
+        _ = previous_summary
+
+    monkeypatch.setattr(history_processors, "conversation_summarization", fake_summary)
+    messages = _build_turns(4)
+
+    result = await history_processors.maybe_summarize_history(
+        messages,
+        previous_summary="summary-v1",
+        summary_checkpoint=2,
+        message_token_budget=1,
+        context_messages=1,
+    )
+
+    assert result.summary is None
+    assert result.history == messages[1:]
+
+
+def test_should_generate_conversation_summary_when_budget_exceeded():
+    """Frontend summary event should trigger only when over budget."""
+    messages = _build_turns(3)
+    assert history_processors.should_generate_conversation_summary(messages, message_token_budget=1)
+    assert not history_processors.should_generate_conversation_summary(
+        messages, message_token_budget=10_000
+    )
+    assert not history_processors.should_generate_conversation_summary(
+        messages,
+        previous_summary="summary-v1",
+        summary_checkpoint=len(messages),
+        message_token_budget=1,
+        context_messages=1,
+    )
+    assert not history_processors.should_generate_conversation_summary(
+        messages,
+        previous_summary="summary-v1",
+        summary_checkpoint=28,
+        message_token_budget=1,
+        context_messages=1,
+    )
+    messages_with_large_summarized_prefix = [
+        ModelRequest(parts=[UserPromptPart(content=["old user " + ("x " * 500)])]),
+        ModelResponse(parts=[TextPart(content="old assistant " + ("x " * 500))]),
+        ModelRequest(parts=[UserPromptPart(content=["context user"])]),
+        ModelResponse(parts=[TextPart(content="context assistant")]),
+        ModelRequest(parts=[UserPromptPart(content=["new user"])]),
+        ModelResponse(parts=[TextPart(content="new assistant")]),
+    ]
+    assert not history_processors.should_generate_conversation_summary(
+        messages_with_large_summarized_prefix,
+        previous_summary="summary-v1",
+        summary_checkpoint=4,
+        message_token_budget=100,
+        context_messages=1,
+    )
+
+
+def test_safe_clean_tool_history_falls_back_to_raw_history(monkeypatch):
+    """Unexpected cleanup errors should not break the conversation flow."""
+    messages = _build_turns(2)
+
+    def raise_cleanup(_messages):
+        raise RuntimeError("cleanup failed")
+
+    monkeypatch.setattr(history_processors, "clean_tool_history", raise_cleanup)
+
+    result = history_processors.safe_clean_tool_history(messages)
+
+    assert result == messages
+
+
+@pytest.mark.asyncio
+async def test_maybe_summarize_history_falls_back_on_unexpected_error(monkeypatch):
+    """Unexpected summarization errors should keep the active history slice."""
+    messages = _build_turns(4)
+
+    def raise_estimate(_message):
+        raise RuntimeError("token estimate failed")
+
+    monkeypatch.setattr(history_processors, "_estimate_message_tokens", raise_estimate)
+
+    result = await history_processors.maybe_summarize_history(
+        messages,
+        previous_summary="summary-v1",
+        summary_checkpoint=2,
+        message_token_budget=1,
+        context_messages=1,
+    )
+
+    assert result.summary is None
+    assert result.history == messages[1:]
diff --git a/src/backend/chat/tests/clients/pydantic_ai/test_document_context_window.py b/src/backend/chat/tests/clients/pydantic_ai/test_document_context_window.py
index 9736c418..0933dabf 100644
--- a/src/backend/chat/tests/clients/pydantic_ai/test_document_context_window.py
+++ b/src/backend/chat/tests/clients/pydantic_ai/test_document_context_window.py
@@ -39,6 +39,9 @@ def _parse_listing(instruction: str) -> dict:
 def _llm_config_with_context(settings):
     """Configure a model with max_token_context for context window tests."""
     settings.DOCUMENT_CONTEXT_BUDGET_RATIO = 0.5
+    # Default buffer is 10_000 tokens; with max_token_context=4000 that zeroes usable_context.
+    # Tests target buffer=1_000 so budget math stays: usable=3000, document_budget=1500.
+    settings.DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS = 1000
     settings.LLM_CONFIGURATIONS = {
         "default-model": LLModel(
             hrid="default-model",
@@ -48,8 +51,6 @@ def _llm_config_with_context(settings):
             icon=None,
             system_prompt="You are an amazing assistant.",
             tools=[],
-            # Keep context large enough so tests can exercise rolling-window behavior
-            # despite the fixed security buffer applied by the service.
             max_token_context=4000,
             provider=LLMProvider(
                 hrid="unused",
@@ -82,9 +83,10 @@ async def fake_read_attachment_content(_attachment):  # NOSONAR
         "chat.document_context_builder.read_attachment_content",
         fake_read_attachment_content,
     )
+    # max_token_context=4000, buffer=1000, ratio=0.5 => document_budget=1500.
     monkeypatch.setattr(
         "chat.document_context_builder.count_approx_tokens",
-        lambda _text: 1201,
+        lambda _text: 1501,
     )
 
     instruction = async_to_sync(service._build_document_context_instruction)()  # pylint: disable=protected-access
@@ -143,14 +145,14 @@ async def fake_read_attachment_content(attachment):  # NOSONAR
 
     monkeypatch.setattr(
         "chat.document_context_builder.count_approx_tokens",
-        lambda _text: 400,
+        lambda _text: 600,
     )
 
     instruction = async_to_sync(service._build_document_context_instruction)()  # pylint: disable=protected-access
     listing = _parse_listing(instruction)
 
-    # max_token_context=4000, ratio=0.5 => budget=1000 after buffer.
-    # With 3 docs at 400 tokens each, rolling outcome should inline doc-2 + doc-3.
+    # max_token_context=4000, buffer=1000, ratio=0.5 => budget=1500.
+    # With 3 docs at 600 tokens each (1800 total), FIFO evicts doc-1; doc-2 + doc-3 stay inlined.
     assert listing["documents_order"] == "newest_to_oldest"
     by_title = {d["title"]: d for d in listing["documents"]}
     assert set(by_title) == {"doc-1", "doc-2", "doc-3"}
@@ -315,7 +317,7 @@ async def fake_read_attachment_content(attachment):  # NOSONAR
 
 def test_document_context_uses_configurable_ratio(_llm_config_with_context, monkeypatch, settings):
     """Budget ratio comes from Django settings and changes inlining behavior."""
-    settings.DOCUMENT_CONTEXT_BUDGET_RATIO = 0.3  # max_token_context=4000 => budget=200
+    settings.DOCUMENT_CONTEXT_BUDGET_RATIO = 0.3  # usable_context=3000 => budget=900
 
     user = UserFactory()
     conversation = ChatConversationFactory(owner=user)
@@ -360,8 +362,8 @@ async def fake_read_attachment_content(attachment):  # NOSONAR
 
     by_title = {d["title"]: d for d in listing["documents"]}
     assert set(by_title) == {"doc-1", "doc-2"}
-    # ratio=0.3, max_context=4000, buffer=1000 => budget=200; only newest fits.
-    assert by_title["doc-1"]["access"] == ACCESS_TOOL_CALL_ONLY
+    # ratio=0.3, max_context=4000, buffer=1000 => budget=900; both fit.
+    assert by_title["doc-1"]["access"] == ACCESS_FULL_CONTEXT
     assert by_title["doc-2"]["access"] == ACCESS_FULL_CONTEXT
 
 
diff --git a/src/backend/chat/tests/clients/pydantic_ai/test_thinking_history_stripping.py b/src/backend/chat/tests/clients/pydantic_ai/test_thinking_history_stripping.py
index 2a2a2dca..420a33dd 100644
--- a/src/backend/chat/tests/clients/pydantic_ai/test_thinking_history_stripping.py
+++ b/src/backend/chat/tests/clients/pydantic_ai/test_thinking_history_stripping.py
@@ -116,7 +116,8 @@ async def test_thinking_parts_stripped_when_model_does_not_support_thinking():
         ),
         patch("chat.clients.pydantic_ai.update_history_local_urls", side_effect=lambda _conv, h: h),
     ):
-        _, _, _, _, _, history, _ = await service._prepare_agent_run([ui_message])
+        raw_history = ModelMessagesTypeAdapter.validate_python(conversation.pydantic_messages)
+        _, _, _, _, _, history, _ = await service._prepare_agent_run([ui_message], raw_history)
 
     assert len(history) == 1
     assert isinstance(history[0], ModelResponse)
@@ -145,7 +146,8 @@ async def test_thinking_parts_kept_when_model_supports_thinking():
         ),
         patch("chat.clients.pydantic_ai.update_history_local_urls", side_effect=lambda _conv, h: h),
     ):
-        _, _, _, _, _, history, _ = await service._prepare_agent_run([ui_message])
+        raw_history = ModelMessagesTypeAdapter.validate_python(conversation.pydantic_messages)
+        _, _, _, _, _, history, _ = await service._prepare_agent_run([ui_message], raw_history)
 
     assert len(history) == 1
     assert isinstance(history[0], ModelResponse)
diff --git a/src/backend/chat/tests/test_document_context_builder.py b/src/backend/chat/tests/test_document_context_builder.py
index 1ef2194c..7bd536af 100644
--- a/src/backend/chat/tests/test_document_context_builder.py
+++ b/src/backend/chat/tests/test_document_context_builder.py
@@ -83,7 +83,7 @@ def _parse_listing(instruction: str) -> dict:
     return json.loads(instruction.split(prefix, 1)[1])
 
 
-async def _build(conversation, *, max_token_context=100, budget_ratio=0.5, security_buffer=0):
+async def _build(conversation, *, max_token_context=100, budget_ratio=0.5):
     """Run build_document_context_instruction with real components."""
     text_attachments = await sync_to_async(list)(
         conversation.attachments.filter(content_type__startswith="text/").order_by(
@@ -96,7 +96,6 @@ async def _build(conversation, *, max_token_context=100, budget_ratio=0.5, secur
         model_hrid="test-model",
         max_token_context=max_token_context,
         budget_ratio=budget_ratio,
-        security_buffer_tokens=security_buffer,
     )
 
 
diff --git a/src/backend/conversations/settings.py b/src/backend/conversations/settings.py
index 7750df61..4cbb4f56 100755
--- a/src/backend/conversations/settings.py
+++ b/src/backend/conversations/settings.py
@@ -683,6 +683,22 @@ class Base(BraveSettings, Configuration):
         environ_name="DEFAULT_ALLOW_SMART_WEB_SEARCH",
         environ_prefix=None,
     )
+    # Conversation summary: generated at the start of a new user turn (before agent.iter)
+    # when the active slice of stored pydantic_messages (previous turns only; the current
+    # user prompt is not in that list yet) exceeds the message token budget
+    # (usable_context * (1 - DOCUMENT_CONTEXT_BUDGET_RATIO)). That history usually ends
+    # on an assistant ModelResponse. After a summary, keep the last N ModelMessage
+    # entries before the checkpoint. Use an even N so the window starts on a user message.
+    CONVERSATION_SUMMARY_CONTEXT_MESSAGES = values.PositiveIntegerValue(
+        default=10,
+        environ_name="CONVERSATION_SUMMARY_CONTEXT_MESSAGES",
+        environ_prefix=None,
+    )
+    CONVERSATION_SUMMARY_MAX_TOKENS = values.PositiveIntegerValue(
+        default=2048,
+        environ_name="CONVERSATION_SUMMARY_MAX_TOKENS",
+        environ_prefix=None,
+    )
 
     # These settings are default values used in the default LLM_CONFIGURATIONS
     # They allow a deployment with only one model without a specific configuration file
@@ -919,7 +935,7 @@ class Base(BraveSettings, Configuration):
         environ_prefix=None,
     )
     DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS = values.PositiveIntegerValue(
-        default=1000,
+        default=10000,
         environ_name="DOCUMENT_CONTEXT_SECURITY_BUFFER_TOKENS",
         environ_prefix=None,
     )
diff --git a/src/frontend/apps/conversations/src/features/chat/components/MessageItem.tsx b/src/frontend/apps/conversations/src/features/chat/components/MessageItem.tsx
index 0bd1f5ad..8f48167e 100644
--- a/src/frontend/apps/conversations/src/features/chat/components/MessageItem.tsx
+++ b/src/frontend/apps/conversations/src/features/chat/components/MessageItem.tsx
@@ -243,9 +243,13 @@ const MessageItemComponent: React.FC<MessageItemProps> = ({
   }, [toolInvocationParts]);
 
   const activeToolInvocation = React.useMemo(() => {
-    const tool = toolInvocationParts.find(
-      (part) => part.toolInvocation.toolName !== 'document_parsing',
-    );
+    const tool = [...toolInvocationParts]
+      .reverse()
+      .find(
+        (part) =>
+          part.toolInvocation.toolName !== 'document_parsing' &&
+          part.toolInvocation.state !== 'result',
+      );
     return tool?.toolInvocation;
   }, [toolInvocationParts]);
 
@@ -367,7 +371,8 @@ const MessageItemComponent: React.FC<MessageItemProps> = ({
             {isCurrentlyStreaming &&
               isLastAssistantMessage &&
               status === 'streaming' &&
-              hasNonDocumentParsingTool && (
+              hasNonDocumentParsingTool &&
+              activeToolInvocation && (
                 <Box
                   $direction="row"
                   $align="center"
@@ -382,8 +387,14 @@ const MessageItemComponent: React.FC<MessageItemProps> = ({
                 >
                   <Loader />
                   <Text $variation="600" $size="md">
-                    {activeToolInvocation?.toolName === 'summarize'
-                      ? t('Summarizing...')
+                    {activeToolInvocation.toolName === 'summarize'
+                      ? (
+                          activeToolInvocation.args as {
+                            summary_scope?: string;
+                          }
+                        )?.summary_scope === 'conversation'
+                        ? t('Summarizing conversation...')
+                        : t('Summarizing...')
                       : t('Search...')}
                   </Text>
                 </Box>