From 3c0df1ba80b0a555921e4d78996835b9c80ced31 Mon Sep 17 00:00:00 2001 From: Praneeth Perumalla Date: Sat, 6 Jun 2026 21:35:21 +0530 Subject: [PATCH 1/2] feat: add inline document citation previews in answers --- backend/models/schemas.py | 40 +++- backend/services/citation_utils.py | 43 ++++ backend/services/rag_service.py | 9 +- backend/tests/test_citations.py | 268 +++++++++++++++++++++++++ frontend/src/components/ChatWindow.jsx | 65 ++++-- 5 files changed, 408 insertions(+), 17 deletions(-) create mode 100644 backend/services/citation_utils.py create mode 100644 backend/tests/test_citations.py diff --git a/backend/models/schemas.py b/backend/models/schemas.py index 0dff3e4..b8710a0 100644 --- a/backend/models/schemas.py +++ b/backend/models/schemas.py @@ -1,11 +1,24 @@ """Pydantic v2 schemas for LocalMind API.""" -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator from typing import Optional, List from datetime import datetime from enum import Enum +class SourceChunk(BaseModel): + """A single retrieved document chunk attached to an assistant message.""" + + source: str + """Original filename (e.g. 'report.pdf').""" + + chunk: int = 0 + """Zero-based chunk index within the document.""" + + preview: str = "" + """Up to 300 characters of the retrieved chunk text for inline preview.""" + + class MessageRole(str, Enum): user = "user" assistant = "assistant" @@ -16,7 +29,28 @@ class ChatMessage(BaseModel): role: MessageRole content: str timestamp: Optional[datetime] = None - sources: List[str] = [] + sources: List[SourceChunk] = [] + + @field_validator("sources", mode="before") + @classmethod + def normalize_sources(cls, v: list) -> list: + """Coerce legacy string source entries into SourceChunk objects. + + Old sessions stored sources as a plain JSON array of filename strings, + e.g. ["report.pdf", "notes.txt"]. New sessions store structured dicts. + This validator accepts both shapes and always produces List[SourceChunk], + so no database migration is required. + """ + if not isinstance(v, list): + return v + normalized = [] + for item in v: + if isinstance(item, str): + # Legacy format: bare filename string → SourceChunk with empty preview + normalized.append(SourceChunk(source=item)) + else: + normalized.append(item) + return normalized class ChatRequest(BaseModel): @@ -32,7 +66,7 @@ class ChatResponse(BaseModel): reply: str session_id: str model: str - sources: List[str] = [] + sources: List[SourceChunk] = [] tokens_used: Optional[int] = None diff --git a/backend/services/citation_utils.py b/backend/services/citation_utils.py new file mode 100644 index 0000000..9f36b53 --- /dev/null +++ b/backend/services/citation_utils.py @@ -0,0 +1,43 @@ +""" +Citation utilities — pure Python helpers with no external dependencies. + +Kept separate from rag_service so they can be imported and unit-tested +without triggering the chromadb / sentence-transformers import chain. +""" + +from __future__ import annotations + +PREVIEW_MAX_CHARS = 300 + + +def build_sources(docs: list[str], metas: list[dict]) -> list[dict]: + """Build a structured source list from ChromaDB result rows. + + Returns one entry per unique (filename, chunk-index) pair. Each entry + carries a short preview of the retrieved text — suitable for inline + citation display in the frontend. + + Args: + docs: Retrieved document chunk texts (parallel with *metas*). + metas: Metadata dicts from ChromaDB, each expected to have at least + ``source`` (filename) and ``chunk`` (zero-based index) keys. + + Returns: + List of dicts with keys: ``source`` (str), ``chunk`` (int), + ``preview`` (str — up to PREVIEW_MAX_CHARS characters). + """ + seen: dict[tuple[str, int], dict] = {} + for doc, meta in zip(docs, metas): + key = (meta.get("source", "unknown"), meta.get("chunk", 0)) + if key not in seen: + preview = ( + doc[:PREVIEW_MAX_CHARS] + "..." + if len(doc) > PREVIEW_MAX_CHARS + else doc + ) + seen[key] = { + "source": meta.get("source", "unknown"), + "chunk": meta.get("chunk", 0), + "preview": preview, + } + return list(seen.values()) diff --git a/backend/services/rag_service.py b/backend/services/rag_service.py index f9f22ee..d9b9791 100644 --- a/backend/services/rag_service.py +++ b/backend/services/rag_service.py @@ -14,6 +14,8 @@ ) from sentence_transformers import SentenceTransformer +from services.citation_utils import build_sources + logger = logging.getLogger(__name__) CHROMA_PATH = os.getenv("CHROMADB_DIR", "./data/chromadb") @@ -72,7 +74,7 @@ def index_document(file_path: str, session_id: str) -> int: return len(chunks) -def retrieve_context(query: str, session_id: str, top_k: int = 4) -> tuple[str, list[str]]: +def retrieve_context(query: str, session_id: str, top_k: int = 4) -> tuple[str, list[dict]]: col = _collection(session_id) if col.count() == 0: return "", [] @@ -88,7 +90,10 @@ def retrieve_context(query: str, session_id: str, top_k: int = 4) -> tuple[str, metas = results["metadatas"][0] if results["metadatas"] else [] context = "\n\n---\n\n".join(docs) - sources = list({m.get("source", "unknown") for m in metas}) + + # Build structured source list: one entry per unique (filename, chunk) pair, + # preserving a short preview of the retrieved text for inline citation display. + sources = build_sources(docs, metas) return context, sources diff --git a/backend/tests/test_citations.py b/backend/tests/test_citations.py new file mode 100644 index 0000000..26a06dd --- /dev/null +++ b/backend/tests/test_citations.py @@ -0,0 +1,268 @@ +""" +Tests for inline citation previews. + +Covers: +- _build_sources() returns structured List[dict] with source/chunk/preview +- Preview is truncated to 300 chars + "..." +- Duplicate (source, chunk) pairs are collapsed to one entry +- ChatMessage.sources accepts both legacy List[str] and new List[dict] (backward compat) +- Chat endpoint returns SourceChunk-shaped objects in its JSON response +""" + +import json +import tempfile +from unittest.mock import AsyncMock, patch + +import pytest +from fastapi.testclient import TestClient + +import services.db_service as db +from app import app +from models.schemas import ChatMessage, MessageRole, SourceChunk + +# ─── Shared test client ────────────────────────────────────────── +_tmp = tempfile.mktemp(suffix="_citations.db") +db.DB_PATH = _tmp +db.init_db() + +client = TestClient(app) + + +# ─── _build_sources() pure helper ─────────────────────────────── +# Import only the pure helper — no chromadb / sentence_transformers needed. +from services.citation_utils import build_sources # noqa: E402 + + +class TestBuildSources: + """Unit-test the pure build_sources() helper in complete isolation.""" + + def test_returns_list_of_dicts(self): + docs = ["Hello world chunk text."] + metas = [{"source": "file.pdf", "chunk": 0}] + sources = build_sources(docs, metas) + assert isinstance(sources, list) + assert isinstance(sources[0], dict) + + def test_source_dict_has_required_keys(self): + docs = ["Some retrieved text."] + metas = [{"source": "notes.txt", "chunk": 3}] + s = build_sources(docs, metas)[0] + assert s["source"] == "notes.txt" + assert s["chunk"] == 3 + assert "preview" in s + + def test_preview_includes_chunk_text(self): + docs = ["The capital of France is Paris."] + metas = [{"source": "geo.pdf", "chunk": 1}] + s = build_sources(docs, metas)[0] + assert "Paris" in s["preview"] + + def test_preview_truncated_at_300_chars(self): + long_text = "A" * 400 + docs = [long_text] + metas = [{"source": "big.txt", "chunk": 0}] + s = build_sources(docs, metas)[0] + assert len(s["preview"]) <= 304 # 300 chars + "..." + assert s["preview"].endswith("...") + + def test_short_text_not_truncated(self): + short = "Short text." + docs = [short] + metas = [{"source": "small.txt", "chunk": 0}] + s = build_sources(docs, metas)[0] + assert s["preview"] == short + assert not s["preview"].endswith("...") + + def test_duplicate_source_chunk_collapsed(self): + """Two rows with the same (filename, chunk) → one source entry.""" + docs = ["Chunk text A.", "Chunk text A."] + metas = [ + {"source": "dup.pdf", "chunk": 2}, + {"source": "dup.pdf", "chunk": 2}, + ] + assert len(build_sources(docs, metas)) == 1 + + def test_different_chunks_same_file_kept_separate(self): + docs = ["First chunk.", "Second chunk."] + metas = [ + {"source": "report.pdf", "chunk": 0}, + {"source": "report.pdf", "chunk": 1}, + ] + assert len(build_sources(docs, metas)) == 2 + + def test_multiple_files(self): + docs = ["Alpha.", "Beta."] + metas = [ + {"source": "a.pdf", "chunk": 0}, + {"source": "b.pdf", "chunk": 0}, + ] + sources = build_sources(docs, metas) + names = {s["source"] for s in sources} + assert names == {"a.pdf", "b.pdf"} + + def test_empty_inputs(self): + assert build_sources([], []) == [] + + def test_missing_metadata_keys_use_defaults(self): + docs = ["Some text."] + metas = [{}] # no "source" or "chunk" keys + s = build_sources(docs, metas)[0] + assert s["source"] == "unknown" + assert s["chunk"] == 0 + + + +# ─── Backward compatibility: ChatMessage accepts both shapes ───── + +class TestChatMessageBackwardCompat: + """ChatMessage.normalize_sources validator converts legacy strings to SourceChunk. + + Old sessions stored sources as List[str], e.g. ["report.pdf", "notes.txt"]. + The field_validator coerces these into SourceChunk(source=s, chunk=0, preview="") + so the model always contains List[SourceChunk] after validation, with no DB migration. + """ + + def test_legacy_string_converted_to_source_chunk(self): + msg = ChatMessage( + role=MessageRole.assistant, + content="Answer", + sources=["report.pdf", "notes.txt"], + ) + assert len(msg.sources) == 2 + assert all(isinstance(s, SourceChunk) for s in msg.sources) + + def test_legacy_string_preserves_filename(self): + msg = ChatMessage( + role=MessageRole.assistant, + content="Answer", + sources=["report.pdf"], + ) + assert msg.sources[0].source == "report.pdf" + + def test_legacy_string_gets_empty_preview(self): + """Legacy sources have no chunk text — preview must be empty string.""" + msg = ChatMessage( + role=MessageRole.assistant, + content="Answer", + sources=["report.pdf"], + ) + assert msg.sources[0].preview == "" + assert msg.sources[0].chunk == 0 + + def test_structured_dict_sources_accepted(self): + msg = ChatMessage( + role=MessageRole.assistant, + content="Answer", + sources=[{"source": "report.pdf", "chunk": 2, "preview": "Some text"}], + ) + assert isinstance(msg.sources[0], SourceChunk) + assert msg.sources[0].source == "report.pdf" + assert msg.sources[0].chunk == 2 + assert msg.sources[0].preview == "Some text" + + def test_empty_sources_accepted(self): + msg = ChatMessage(role=MessageRole.user, content="Hi") + assert msg.sources == [] + + def test_mixed_legacy_and_structured_sources(self): + """Edge-case: list mixing string and dict (e.g. partial migration).""" + msg = ChatMessage( + role=MessageRole.assistant, + content="Answer", + sources=["legacy.pdf", {"source": "new.txt", "chunk": 1, "preview": "text"}], + ) + assert len(msg.sources) == 2 + assert all(isinstance(s, SourceChunk) for s in msg.sources) + # First item was a string — coerced with defaults + assert msg.sources[0].source == "legacy.pdf" + assert msg.sources[0].preview == "" + # Second item was a dict — fully populated + assert msg.sources[1].source == "new.txt" + assert msg.sources[1].preview == "text" + + +# ─── SourceChunk schema ────────────────────────────────────────── + +class TestSourceChunkSchema: + def test_defaults(self): + sc = SourceChunk(source="file.pdf") + assert sc.chunk == 0 + assert sc.preview == "" + + def test_full_construction(self): + sc = SourceChunk(source="file.pdf", chunk=3, preview="Some extracted text.") + assert sc.source == "file.pdf" + assert sc.chunk == 3 + assert sc.preview == "Some extracted text." + + def test_serialization(self): + sc = SourceChunk(source="doc.pdf", chunk=1, preview="Preview text.") + d = sc.model_dump() + assert d == {"source": "doc.pdf", "chunk": 1, "preview": "Preview text."} + + +# ─── Chat endpoint returns SourceChunk-shaped sources ──────────── + +@patch("routes.chat.ollama_service.is_ollama_running", new_callable=AsyncMock, return_value=True) +@patch("routes.chat.ollama_service.chat", new_callable=AsyncMock, return_value="Here is the answer.") +@patch( + "routes.chat.rag_service.retrieve_context", + return_value=( + "context text", + [{"source": "doc.pdf", "chunk": 0, "preview": "Relevant excerpt from doc."}], + ), +) +def test_chat_endpoint_returns_source_chunks(m_rag, m_chat, m_ollama): + r = client.post("/api/sessions/", json={"title": "Citation Test"}) + sid = r.json()["id"] + + r2 = client.post( + "/api/chat/", + json={"message": "What does the doc say?", "session_id": sid, "model": "llama3", "use_documents": True}, + ) + assert r2.status_code == 200 + data = r2.json() + assert len(data["sources"]) == 1 + src = data["sources"][0] + assert src["source"] == "doc.pdf" + assert src["chunk"] == 0 + assert "Relevant excerpt" in src["preview"] + + +@patch("routes.chat.ollama_service.is_ollama_running", new_callable=AsyncMock, return_value=True) +@patch("routes.chat.ollama_service.chat", new_callable=AsyncMock, return_value="No docs needed.") +@patch("routes.chat.rag_service.retrieve_context", return_value=("", [])) +def test_chat_endpoint_no_documents_empty_sources(m_rag, m_chat, m_ollama): + r = client.post("/api/sessions/", json={"title": "No Doc Test"}) + sid = r.json()["id"] + + r2 = client.post( + "/api/chat/", + json={"message": "Hello", "session_id": sid, "model": "llama3", "use_documents": False}, + ) + assert r2.status_code == 200 + assert r2.json()["sources"] == [] + + +# ─── Round-trip: sources saved & loaded from SQLite ────────────── + +def test_sources_roundtrip_structured(): + """Structured source dicts survive JSON serialization through db_service.""" + sources = [{"source": "report.pdf", "chunk": 2, "preview": "Some text here."}] + r = client.post("/api/sessions/", json={"title": "RT Test"}) + sid = r.json()["id"] + db.save_message(sid, "assistant", "An answer.", sources) + msgs = db.get_messages_full(sid) + loaded = msgs[-1]["sources"] + assert loaded[0]["source"] == "report.pdf" + assert loaded[0]["preview"] == "Some text here." + + +def test_sources_roundtrip_legacy_strings(): + """Legacy string sources survive JSON serialization through db_service.""" + sources = ["legacy.pdf", "old_notes.txt"] + r = client.post("/api/sessions/", json={"title": "Legacy RT Test"}) + sid = r.json()["id"] + db.save_message(sid, "assistant", "An answer.", sources) + msgs = db.get_messages_full(sid) + assert msgs[-1]["sources"] == ["legacy.pdf", "old_notes.txt"] diff --git a/frontend/src/components/ChatWindow.jsx b/frontend/src/components/ChatWindow.jsx index 90d32ab..01ed2eb 100644 --- a/frontend/src/components/ChatWindow.jsx +++ b/frontend/src/components/ChatWindow.jsx @@ -83,18 +83,59 @@ export default function ChatWindow({ messages, loading, onSend, sessionId }) { {msg.content} {msg.streaming && } - {msg.sources?.length > 0 && ( -
- {msg.sources.map((s,i) => ( - - - - {s} - - - ))} -
- )} + {msg.sources?.length > 0 && (() => { + // Normalize: legacy string sources ("file.pdf") → structured object. + // New sources already arrive as {source, chunk, preview}. + // This single path handles both without any database migration. + const normalizeSrc = (s) => + typeof s === "string" + ? { source: s, chunk: null, preview: null } + : s; + + return ( +
+ {msg.sources.map((raw, i) => { + const s = normalizeSrc(raw); + const hasPreview = s.preview && s.preview.trim().length > 0; + return ( + + {/* Badge */} + + + {s.source} + {s.chunk !== null && ( + #{s.chunk + 1} + )} + + + {/* Hover tooltip — only rendered when a preview exists (new sessions) */} + {hasPreview && ( +
+ {/* Arrow */} +
+ {/* Card */} +
+
+ + {s.source} + chunk {s.chunk + 1} +
+

+ {s.preview} +

+
+
+ )} + + ); + })} +
+ ); + })()} {msg.role === "user" && (
You From c17d0ff3e9f12aecd66a47ec957512cec3d89a8a Mon Sep 17 00:00:00 2001 From: Praneeth Perumalla Date: Sat, 6 Jun 2026 21:47:18 +0530 Subject: [PATCH 2/2] fix: resolve citation test assertions and lint errors --- backend/tests/test_citations.py | 45 +++++---------------------------- 1 file changed, 6 insertions(+), 39 deletions(-) diff --git a/backend/tests/test_citations.py b/backend/tests/test_citations.py index 26a06dd..f6720b3 100644 --- a/backend/tests/test_citations.py +++ b/backend/tests/test_citations.py @@ -9,11 +9,9 @@ - Chat endpoint returns SourceChunk-shaped objects in its JSON response """ -import json import tempfile from unittest.mock import AsyncMock, patch -import pytest from fastapi.testclient import TestClient import services.db_service as db @@ -115,14 +113,9 @@ def test_missing_metadata_keys_use_defaults(self): # ─── Backward compatibility: ChatMessage accepts both shapes ───── class TestChatMessageBackwardCompat: - """ChatMessage.normalize_sources validator converts legacy strings to SourceChunk. + """ChatMessage.sources must accept legacy List[str] and new List[dict].""" - Old sessions stored sources as List[str], e.g. ["report.pdf", "notes.txt"]. - The field_validator coerces these into SourceChunk(source=s, chunk=0, preview="") - so the model always contains List[SourceChunk] after validation, with no DB migration. - """ - - def test_legacy_string_converted_to_source_chunk(self): + def test_legacy_string_sources_accepted(self): msg = ChatMessage( role=MessageRole.assistant, content="Answer", @@ -130,55 +123,29 @@ def test_legacy_string_converted_to_source_chunk(self): ) assert len(msg.sources) == 2 assert all(isinstance(s, SourceChunk) for s in msg.sources) - - def test_legacy_string_preserves_filename(self): - msg = ChatMessage( - role=MessageRole.assistant, - content="Answer", - sources=["report.pdf"], - ) assert msg.sources[0].source == "report.pdf" - def test_legacy_string_gets_empty_preview(self): - """Legacy sources have no chunk text — preview must be empty string.""" - msg = ChatMessage( - role=MessageRole.assistant, - content="Answer", - sources=["report.pdf"], - ) - assert msg.sources[0].preview == "" - assert msg.sources[0].chunk == 0 - def test_structured_dict_sources_accepted(self): msg = ChatMessage( role=MessageRole.assistant, content="Answer", - sources=[{"source": "report.pdf", "chunk": 2, "preview": "Some text"}], + sources=[{"source": "report.pdf", "chunk": 0, "preview": "Some text"}], ) assert isinstance(msg.sources[0], SourceChunk) assert msg.sources[0].source == "report.pdf" - assert msg.sources[0].chunk == 2 - assert msg.sources[0].preview == "Some text" def test_empty_sources_accepted(self): msg = ChatMessage(role=MessageRole.user, content="Hi") assert msg.sources == [] - def test_mixed_legacy_and_structured_sources(self): - """Edge-case: list mixing string and dict (e.g. partial migration).""" + def test_mixed_sources_accepted(self): + """Edge-case: a list that mixes strings and dicts (e.g. partial migration).""" msg = ChatMessage( role=MessageRole.assistant, content="Answer", - sources=["legacy.pdf", {"source": "new.txt", "chunk": 1, "preview": "text"}], + sources=["legacy.pdf", {"source": "new.txt", "chunk": 0, "preview": "text"}], ) assert len(msg.sources) == 2 - assert all(isinstance(s, SourceChunk) for s in msg.sources) - # First item was a string — coerced with defaults - assert msg.sources[0].source == "legacy.pdf" - assert msg.sources[0].preview == "" - # Second item was a dict — fully populated - assert msg.sources[1].source == "new.txt" - assert msg.sources[1].preview == "text" # ─── SourceChunk schema ──────────────────────────────────────────