luceinaltis · luceinaltis · Apr 8, 2026 · Apr 8, 2026
diff --git a/docs/memory-system.md b/docs/memory-system.md
@@ -17,22 +17,22 @@ Complete audit trail — reconstructs exactly what the agent did and why.
 
 ## Tier 2 — Compressed Summary (Markdown)
 
-> **구현 예정** — 현재 SessionCompactor가 존재하지만 요약이 파일로 저장되지 않습니다.
-
-When a session exceeds 8,000 tokens (measured via `tiktoken`), SessionManager compacts the conversation into a Markdown summary using the reporter role (Haiku). The summary replaces raw turns in the active context window; the JSONL is preserved.
+After each turn, the `ConversationEngine` runs `_maybe_compact()` to check the session size. Once the session exceeds 8,000 tokens (a rough `len // 4` estimate of the JSONL log), it calls `SessionCompactor.compact_and_save()`: the reporter role (Haiku) condenses the turns into a concise Markdown summary, which is written to `~/.qracer/summaries/<session_id>.md`. The raw JSONL log is preserved untouched.
 
 ## Tier 3 — Search Index (DuckDB)
 
-> **구현 예정** — 현재 MemorySearcher가 DuckDB FTS로 키워드 검색을 지원하지만, 벡터 임베딩(HNSW) 검색은 미구현입니다.
-
-DuckDB indexes all Tier 2 Markdown summaries for hybrid retrieval: keyword (FTS) + vector similarity (VSS/HNSW). Writes occur only at compaction time.
+`MemorySearcher` indexes Tier 2 Markdown summaries in DuckDB for hybrid retrieval: keyword (BM25 via FTS) and, when an embedding function is supplied, vector similarity via DuckDB's `list_cosine_similarity`. The two branches are fused with reciprocal rank fusion so scores from different scales can be combined without normalisation.
 
-- Embedding model: `text-embedding-3-small` (OpenAI API) or `all-MiniLM-L6-v2` (local fallback).
-- Tables: `session_index` (FTS), `session_embeddings` (HNSW).
+- Embedding is pluggable via the `embedding_fn: Callable[[str], list[float]]` parameter — callers can back it with the Claude API, `text-embedding-3-small`, `sentence-transformers`, or any other model. When `embedding_fn` is `None` the searcher falls back to keyword-only search.
+- Tables: `session_index` (FTS) and `session_embeddings` (cosine similarity).
 - Source of truth is the Markdown files; DuckDB is the index only.
 
 The agent calls `memory_search` autonomously when past context may be relevant.
 
+## Cross-Session Loading
+
+On `qracer repl` startup, the CLI instantiates a file-backed `MemorySearcher` at `~/.qracer/memory_index.duckdb` and re-indexes every Markdown file in `~/.qracer/summaries/`. When at least one summary is loaded, the count is printed so returning users immediately know how much prior memory is in scope.
+
 ## MEMORY.md vs. Tier 2
 
 > **구현 예정** — MEMORY.md, BOOTSTRAP.md 기반 크로스 세션 메모리는 아직 구현되지 않았습니다.

diff --git a/qracer/cli.py b/qracer/cli.py
@@ -867,6 +867,7 @@ def repl() -> None:
     from qracer.alert_monitor import AlertMonitor
     from qracer.alerts import AlertStore
     from qracer.conversation.engine import ConversationEngine
+    from qracer.memory.memory_searcher import MemorySearcher
     from qracer.memory.session_logger import SessionLogger
     from qracer.watchlist import Watchlist
 
@@ -894,6 +895,14 @@ def repl() -> None:
     session_id = uuid.uuid4().hex[:12]
     session_logger = SessionLogger(sessions_dir / f"{session_id}.jsonl")
 
+    # Cross-session memory (Tier 2 summaries + Tier 3 search index).
+    summaries_dir = _user_dir() / "summaries"
+    summaries_dir.mkdir(parents=True, exist_ok=True)
+    memory_searcher = MemorySearcher(_user_dir() / "memory_index.duckdb")
+    loaded_contexts = memory_searcher.index_directory(summaries_dir)
+    if loaded_contexts:
+        click.echo(f"  ✓ Loaded {loaded_contexts} past session summaries from {summaries_dir}")
+
     reports_dir = _user_dir() / "reports"
     watchlist = Watchlist(_user_dir() / "watchlist.json")
 
@@ -915,6 +924,8 @@ def repl() -> None:
         session_logger=session_logger,
         report_dir=reports_dir,
         language=app_cfg.language,
+        memory_searcher=memory_searcher,
+        summaries_dir=summaries_dir,
     )
 
     task_executor = TaskExecutor(task_store, data_registry, llm_registry, engine=engine)

diff --git a/qracer/conversation/engine.py b/qracer/conversation/engine.py
@@ -70,13 +70,15 @@ def __init__(
         report_dir: Path | None = None,
         memory_searcher: MemorySearcher | None = None,
         language: str = "en",
+        summaries_dir: Path | None = None,
     ) -> None:
         self._llm = llm_registry
         self._data = data_registry
         self._intent_parser = IntentParser(llm_registry)
         self._portfolio_config = portfolio_config or PortfolioConfig()
         self._memory_searcher = memory_searcher
         self._language = language
+        self._summaries_dir = summaries_dir
 
         analysis_loop = AnalysisLoop(
             llm_registry,
@@ -190,19 +192,34 @@ def _log_turn(self, role: str, content: str, **kwargs: object) -> None:
         )
 
     async def _maybe_compact(self) -> None:
-        """Trigger compaction if the session log exceeds the token threshold."""
+        """Trigger compaction if the session log exceeds the token threshold.
+
+        When a ``summaries_dir`` is configured the compacted summary is also
+        persisted to disk (Tier 2) and, if a ``memory_searcher`` is present,
+        indexed into the search index (Tier 3) so future sessions can find
+        it.
+        """
         if self._compactor is None or self._session_logger is None:
             return
-        if self._compactor.needs_compaction(self._session_logger):
-            try:
-                result = await self._compactor.compact(self._session_logger)
-                logger.info(
-                    "Session compacted: %d turns → %d tokens summary",
-                    result.turn_count,
-                    result.output_tokens,
+        if not self._compactor.needs_compaction(self._session_logger):
+            return
+        try:
+            if self._summaries_dir is not None:
+                result = await self._compactor.compact_and_save(
+                    self._session_logger, self._summaries_dir
                 )
-            except Exception:
-                logger.warning("Session compaction failed", exc_info=True)
+                if self._memory_searcher is not None:
+                    session_id = self._session_logger.path.stem
+                    self._memory_searcher.index_summary(session_id, result.summary)
+            else:
+                result = await self._compactor.compact(self._session_logger)
+            logger.info(
+                "Session compacted: %d turns → %d tokens summary",
+                result.turn_count,
+                result.output_tokens,
+            )
+        except Exception:
+            logger.warning("Session compaction failed", exc_info=True)
 
     async def query(self, user_input: str) -> EngineResponse:
         """Process a user query through the full pipeline."""