From 7e8948eb86a333a656c2db22e66f89d71cd43f9b Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 7 Jun 2026 22:44:46 +0000 Subject: [PATCH 1/2] Add content-fingerprint dedup on the write path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip mem0's LLM fact-extraction when previously stored content is re-submitted. Before add, the raw input is normalized (lowercase + collapse whitespace) and SHA-256'd; the fingerprint is stored in the `content_fp` Qdrant payload field and looked up (via the vector store's filter) on the next add. A repeat whose normalized fingerprint matches returns {"results": [], "deduplicated": true, "memory_id": ...} without calling the LLM. Adapts the OB1 content-fingerprint-dedup recipe. Verified against a live Qdrant: metadata lands top-level in the payload and filtering on the (unindexed) content_fp field works, so no payload index is required. The dedup check is fail-open — any lookup error just proceeds with a normal add, so it can only ever save work, never block a write. - app/memory.py: content_fingerprint(), _existing_fingerprint_id() (fail-open), and add_memory(content, dedup=True, **kwargs) wrapper. - app/rest.py: AddMemoryRequest.dedup flag (default true); route through wrapper. - app/mcp_server.py: add_memory tool routes through the wrapper (always dedups). - tests: fingerprint normalization, lookup found/empty/fail-open, wrapper store/skip/dedup-false/metadata-merge, plus REST + MCP integration cases. conftest defaults the dedup lookup to empty so existing add tests are unaffected. - docs: USER_GUIDE (How memory works + add endpoint + import idempotency), DEVELOPER_GUIDE memory.py description. Closes #48. 
https://claude.ai/code/session_017835DVrvURaYnbQiPQwzue --- app/mcp_server.py | 5 +- app/memory.py | 60 +++++++++++++++++++++++ app/rest.py | 6 ++- docs/DEVELOPER_GUIDE.md | 6 ++- docs/USER_GUIDE.md | 20 ++++++-- tests/conftest.py | 3 ++ tests/test_mcp.py | 10 ++++ tests/test_memory.py | 103 +++++++++++++++++++++++++++++++++++++++- tests/test_rest.py | 32 +++++++++++++ 9 files changed, 236 insertions(+), 9 deletions(-) diff --git a/app/mcp_server.py b/app/mcp_server.py index 639d62a..c7bdc71 100644 --- a/app/mcp_server.py +++ b/app/mcp_server.py @@ -22,13 +22,16 @@ def add_memory(content: str, agent_id: str | None = None, metadata: dict | None agent_id is an optional provenance tag recording which agent wrote the memory. It does NOT partition the store — search and list always span every memory for the user, so all connected agents share one memory. + + Submitting the same content again is automatically deduplicated and + skips re-processing, so it's safe to call without checking first. """ kwargs: dict = {"user_id": default_user} if agent_id: kwargs["agent_id"] = agent_id if metadata: kwargs["metadata"] = metadata - return memory.add(content, **kwargs) + return memory_mod.add_memory(content, **kwargs) @mcp.tool def search_memories(query: str, limit: int = 10, recency_weight: float = 0.0) -> dict: diff --git a/app/memory.py b/app/memory.py index a093e28..17ac0b4 100644 --- a/app/memory.py +++ b/app/memory.py @@ -1,3 +1,5 @@ +import hashlib +import json from functools import lru_cache from app.config import Settings, get_settings @@ -45,3 +47,61 @@ def get_memory(): from mem0 import Memory return Memory.from_config(_build_config(get_settings())) + + +def content_fingerprint(content) -> str: + """A deterministic fingerprint of the raw add() input, for cheap dedup. + + Normalizes (lowercase + collapse whitespace) so trivial formatting + differences fingerprint the same, then SHA-256s the result. Strings and + structured message lists are both supported. 
+ """ + if isinstance(content, str): + normalized = " ".join(content.split()).lower() + else: + normalized = json.dumps(content, sort_keys=True, separators=(",", ":")).lower() + return hashlib.sha256(normalized.encode("utf-8")).hexdigest() + + +def _existing_fingerprint_id(memory, fingerprint: str, user_id: str | None) -> str | None: + """Return the id of an already-stored memory with this fingerprint, or None. + + Best-effort: the fingerprint is matched against the `content_fp` payload + field via the vector store's filter. Any error (store quirk, transient + failure) returns None so the dedup check never blocks a write — it only ever + saves work, never prevents it. + """ + filters: dict = {"content_fp": fingerprint} + if user_id: + filters["user_id"] = user_id + try: + result = memory.vector_store.list(filters=filters, top_k=1) + except Exception: + return None + # mem0's Qdrant store returns a (points, next_offset) tuple; normalize that + # and a bare-list return to the points list. + points = result[0] if isinstance(result, tuple) else result + if not points: + return None + return getattr(points[0], "id", None) + + +def add_memory(content, *, dedup: bool = True, **kwargs) -> dict: + """Add a memory, optionally skipping mem0's LLM extraction for exact repeats. + + With `dedup` on (default), a normalized SHA-256 fingerprint of the raw + content is computed and stored in metadata as `content_fp`. If a memory with + the same fingerprint already exists for the user, the add is skipped — no LLM + fact-extraction call — and a `{"deduplicated": True}` marker is returned. + Pass `dedup=False` to force a normal add (e.g. to re-extract). 
+ """ + memory = get_memory() + if not dedup: + return memory.add(content, **kwargs) + fingerprint = content_fingerprint(content) + existing_id = _existing_fingerprint_id(memory, fingerprint, kwargs.get("user_id")) + if existing_id is not None: + return {"results": [], "deduplicated": True, "memory_id": existing_id} + metadata = dict(kwargs.pop("metadata", None) or {}) + metadata["content_fp"] = fingerprint + return memory.add(content, metadata=metadata, **kwargs) diff --git a/app/rest.py b/app/rest.py index 23ed76c..9e3a0a3 100644 --- a/app/rest.py +++ b/app/rest.py @@ -24,6 +24,9 @@ class AddMemoryRequest(BaseModel): agent_id: str | None = None run_id: str | None = None metadata: dict | None = None + # When true (default), byte-identical content already stored is skipped + # before mem0's LLM fact-extraction runs. Set false to force re-extraction. + dedup: bool = True class SearchRequest(BaseModel): @@ -58,12 +61,11 @@ def _scope_kwargs( def add_memory(req: AddMemoryRequest) -> dict: if not req.content and not req.messages: raise HTTPException(status_code=422, detail="Provide either 'content' or 'messages'") - memory = memory_mod.get_memory() payload = req.content if req.content is not None else [m.model_dump() for m in req.messages] kwargs = _scope_kwargs(req.user_id, req.agent_id, req.run_id) if req.metadata: kwargs["metadata"] = req.metadata - return memory.add(payload, **kwargs) + return memory_mod.add_memory(payload, dedup=req.dedup, **kwargs) @router.post("/memories/search") diff --git a/docs/DEVELOPER_GUIDE.md b/docs/DEVELOPER_GUIDE.md index ba3e591..9807ed1 100644 --- a/docs/DEVELOPER_GUIDE.md +++ b/docs/DEVELOPER_GUIDE.md @@ -59,7 +59,11 @@ app/ config.py Settings (pydantic-settings); single source of config truth. Rejects startup on missing required vars; validates provider keys. memory.py mem0 wrapper. _build_config() assembles the mem0 config dict; get_memory() - is the @lru_cache'd shared instance. The most tweak-prone file. 
+ is the @lru_cache'd shared instance. add_memory() wraps mem0's add with a + cheap content-fingerprint dedup: it SHA-256s the normalized raw input, stores + it in the `content_fp` payload field, and skips the LLM extraction if a memory + with that fingerprint already exists (fail-open — a lookup error just proceeds). + The most tweak-prone file. mcp_server.py build_mcp(): the six MCP tools, each thinly wrapping a mem0 op with user_id defaulted to MEM0_DEFAULT_USER_ID. rest.py REST router under /api/v1 (mounted with prefix in main.py). Pydantic request diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md index bfba442..0e1feff 100644 --- a/docs/USER_GUIDE.md +++ b/docs/USER_GUIDE.md @@ -52,6 +52,13 @@ from your text, then stores each fact as a vector **embedding** (OpenAI by defau Searches are semantic: you ask in natural language and get back the most similar stored facts, not keyword matches. +**Exact-duplicate add is free.** Before that LLM extraction runs, the server fingerprints the raw +content; if you submit byte-identical content that's already stored, the add is skipped (no LLM +call) and the response is `{"results": [], "deduplicated": true, "memory_id": "…"}`. This makes +re-runs of imports and webhook/n8n retries cheap and idempotent. Pass `"dedup": false` on a REST add +to force re-extraction. (This is distinct from mem0's *semantic* dedup, which still applies when +similar-but-not-identical content reaches the LLM.) + Memories can optionally be tagged with: - `agent_id` — a provenance tag for which agent/tool wrote it (e.g. `n8n-flow`, `claude-code`). @@ -487,7 +494,11 @@ response bodies are JSON. `user_id` defaults to `MEM0_DEFAULT_USER_ID` if omitte ### Add a memory — `POST /api/v1/memories` Provide **either** `content` (a string) **or** `messages` (a chat transcript). Optional: -`agent_id`, `run_id`, `metadata`, `user_id`. +`agent_id`, `run_id`, `metadata`, `user_id`, and `dedup` (default `true`). 
+ +By default, submitting content byte-identical to something already stored is skipped before the LLM +runs and returns `{"results": [], "deduplicated": true, "memory_id": "…"}` (see +[How memory works](#how-memory-works)). Set `"dedup": false` to force re-extraction. ```bash curl -X POST https://mem0.your-domain.com/api/v1/memories \ @@ -590,12 +601,13 @@ python scripts/import_obsidian.py ~/my-vault --limit 5 python scripts/import_readwise.py ~/Downloads/readwise.csv ``` -**Cost note.** Every imported memory goes through the normal `add` path, which +**Cost note.** Every *new* imported memory goes through the normal `add` path, which invokes the fact-extraction LLM (see the [Configuration reference](#configuration-reference)). A large ChatGPT or Obsidian import can mean thousands of LLM calls — use `--dry-run` and `--limit` first to gauge volume. -mem0 also deduplicates semantically on add, so re-importing the same content -often results in no new memories. +**Re-running an import is cheap and idempotent:** byte-identical content already stored is skipped +*before* the LLM runs (see [How memory works](#how-memory-works)), so a second pass over the same +export adds nothing and costs nothing. > Requirements: Python 3.12 and the project's dependencies installed > (`pip install -r requirements.txt`); the scripts add the repo root to diff --git a/tests/conftest.py b/tests/conftest.py index 9a2011f..e5aa6d2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,6 +34,9 @@ @pytest.fixture def mem(): FAKE_MEMORY.reset_mock() + # Default: no existing fingerprint, so add_memory()'s dedup check is a no-op + # and proceeds to call .add(). Tests exercising dedup override this. 
+ FAKE_MEMORY.vector_store.list.return_value = ([], None) return FAKE_MEMORY diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 8e9a76b..59f8292 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -33,6 +33,16 @@ async def test_add_memory_tool(mcp, mem): args, kwargs = mem.add.call_args assert args[0] == "remember this" assert kwargs["user_id"] == "default-user" + assert "content_fp" in kwargs["metadata"] # dedup fingerprint stored + + +async def test_add_memory_tool_deduplicates(mcp, mem): + from types import SimpleNamespace + + mem.vector_store.list.return_value = ([SimpleNamespace(id="dup-1")], None) + async with Client(mcp) as client: + await client.call_tool("add_memory", {"content": "remember this"}) + mem.add.assert_not_called() # exact repeat is skipped, no LLM extraction async def test_search_memories_tool(mcp, mem): diff --git a/tests/test_memory.py b/tests/test_memory.py index cc7ca8e..a1ffa04 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -1,5 +1,13 @@ +from types import SimpleNamespace +from unittest.mock import MagicMock + from app.config import Settings -from app.memory import _build_config +from app.memory import ( + _build_config, + _existing_fingerprint_id, + add_memory, + content_fingerprint, +) def test_build_config_shape(): @@ -26,3 +34,96 @@ def test_build_config_accepted_by_mem0_schema(): from mem0.configs.base import MemoryConfig MemoryConfig(**_build_config(Settings())) + + +# --- content fingerprint ----------------------------------------------------- + + +def test_content_fingerprint_normalizes_whitespace_and_case(): + assert content_fingerprint("Hello World") == content_fingerprint(" hello world ") + assert content_fingerprint("a\nb") == content_fingerprint("a b") + + +def test_content_fingerprint_differs_for_different_content(): + assert content_fingerprint("apples") != content_fingerprint("oranges") + + +def test_content_fingerprint_handles_message_lists(): + msgs = [{"role": "user", "content": "hi"}] + fp 
= content_fingerprint(msgs) + assert isinstance(fp, str) and len(fp) == 64 + assert content_fingerprint(msgs) == fp # stable + + +# --- _existing_fingerprint_id ------------------------------------------------ + + +def test_existing_fingerprint_id_found(): + mem = MagicMock() + mem.vector_store.list.return_value = ([SimpleNamespace(id="m-1")], None) + assert _existing_fingerprint_id(mem, "fp", "ian") == "m-1" + _, kwargs = mem.vector_store.list.call_args + assert kwargs["filters"] == {"content_fp": "fp", "user_id": "ian"} + + +def test_existing_fingerprint_id_none_when_empty(): + mem = MagicMock() + mem.vector_store.list.return_value = ([], None) + assert _existing_fingerprint_id(mem, "fp", "ian") is None + + +def test_existing_fingerprint_id_fails_open_on_error(): + mem = MagicMock() + mem.vector_store.list.side_effect = RuntimeError("qdrant down") + # A dedup-check failure must never block a write. + assert _existing_fingerprint_id(mem, "fp", "ian") is None + + +# --- add_memory wrapper ------------------------------------------------------ + + +def _patch_memory(monkeypatch, *, existing): + """Patch app.memory.get_memory to a fake whose dedup lookup returns `existing`.""" + import app.memory as m + + fake = MagicMock() + points = [SimpleNamespace(id=existing)] if existing else [] + fake.vector_store.list.return_value = (points, None) + fake.add.return_value = {"results": [{"id": "new"}]} + monkeypatch.setattr(m, "get_memory", lambda: fake) + return fake + + +def test_add_memory_stores_fingerprint_when_new(monkeypatch): + fake = _patch_memory(monkeypatch, existing=None) + out = add_memory("remember this", user_id="ian", agent_id="cli") + assert out == {"results": [{"id": "new"}]} + args, kwargs = fake.add.call_args + assert args[0] == "remember this" + assert kwargs["user_id"] == "ian" + assert kwargs["agent_id"] == "cli" + assert "content_fp" in kwargs["metadata"] + + +def test_add_memory_skips_when_duplicate(monkeypatch): + fake = _patch_memory(monkeypatch, 
existing="dup-1") + out = add_memory("remember this", user_id="ian") + assert out == {"results": [], "deduplicated": True, "memory_id": "dup-1"} + fake.add.assert_not_called() # no LLM extraction for an exact repeat + + +def test_add_memory_dedup_false_skips_check(monkeypatch): + fake = _patch_memory(monkeypatch, existing="dup-1") + add_memory("remember this", dedup=False, user_id="ian") + fake.vector_store.list.assert_not_called() # no dedup lookup at all + fake.add.assert_called_once() + _, kwargs = fake.add.call_args + assert "content_fp" not in (kwargs.get("metadata") or {}) # no fingerprint added + + +def test_add_memory_merges_existing_metadata(monkeypatch): + fake = _patch_memory(monkeypatch, existing=None) + add_memory("x", user_id="ian", metadata={"source": "import"}) + _, kwargs = fake.add.call_args + assert kwargs["metadata"]["source"] == "import" + assert "content_fp" in kwargs["metadata"] diff --git a/tests/test_rest.py b/tests/test_rest.py index 949a8a2..536e7ab 100644 --- a/tests/test_rest.py +++ b/tests/test_rest.py @@ -31,6 +31,38 @@ def test_add_memory_requires_content_or_messages(app_instance, mem, auth_header) assert resp.status_code == 422 +def test_add_memory_stores_fingerprint(app_instance, mem, auth_header): + mem.add.return_value = {"results": []} + c = _client(app_instance) + resp = c.post("/api/v1/memories", json={"content": "hi"}, headers=auth_header) + assert resp.status_code == 200 + _, kwargs = mem.add.call_args + assert "content_fp" in kwargs["metadata"] # fingerprint stored for dedup + + +def test_add_memory_deduplicates_exact_repeat(app_instance, mem, auth_header): + from types import SimpleNamespace + + mem.vector_store.list.return_value = ([SimpleNamespace(id="dup-1")], None) + c = _client(app_instance) + resp = c.post("/api/v1/memories", json={"content": "hi"}, headers=auth_header) + assert resp.status_code == 200 + assert resp.json() == {"results": [], "deduplicated": True, "memory_id": "dup-1"} + mem.add.assert_not_called() # 
no LLM extraction on a duplicate + + +def test_add_memory_dedup_false_bypasses_check(app_instance, mem, auth_header): + from types import SimpleNamespace + + mem.vector_store.list.return_value = ([SimpleNamespace(id="dup-1")], None) + mem.add.return_value = {"results": []} + c = _client(app_instance) + resp = c.post("/api/v1/memories", json={"content": "hi", "dedup": False}, headers=auth_header) + assert resp.status_code == 200 + mem.add.assert_called_once() + mem.vector_store.list.assert_not_called() + + def test_search(app_instance, mem, auth_header): mem.search.return_value = {"results": []} c = _client(app_instance) From 9b58afbc6a2388fe87331318918a31cbd2fd5d6d Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 7 Jun 2026 22:54:07 +0000 Subject: [PATCH 2/2] Address dedup review: normalize message transcripts; fix "byte-identical" wording - app/memory.py: the message-list fingerprint path previously json-dumped + lowercased, which did NOT collapse internal whitespace (newlines/tabs become escaped \n in JSON), contradicting the docstring. Normalize each message's role and text individually (lowercase + collapse whitespace) so equivalent transcripts dedupe. Extract a shared _normalize_text() helper. - Wording: dedup matches a normalized fingerprint (case-insensitive, whitespace-collapsed), not raw bytes. Replace "byte-identical" everywhere (app/rest.py dedup-field comment, USER_GUIDE x3) with accurate phrasing. - tests: assert message-transcript normalization (case/whitespace/newline equivalence, and that differing role/text fingerprint differently). 
https://claude.ai/code/session_017835DVrvURaYnbQiPQwzue --- app/memory.py | 26 +++++++++++++++++++++----- app/rest.py | 6 ++++-- docs/USER_GUIDE.md | 23 +++++++++++++---------- tests/test_memory.py | 18 ++++++++++++++---- 4 files changed, 52 insertions(+), 21 deletions(-) diff --git a/app/memory.py b/app/memory.py index 17ac0b4..f042a86 100644 --- a/app/memory.py +++ b/app/memory.py @@ -49,17 +49,33 @@ def get_memory(): return Memory.from_config(_build_config(get_settings())) +def _normalize_text(text: str) -> str: + # Lowercase and collapse all runs of whitespace (incl. newlines/tabs) to a + # single space, so trivial formatting differences fingerprint the same. + return " ".join(text.split()).lower() + + def content_fingerprint(content) -> str: """A deterministic fingerprint of the raw add() input, for cheap dedup. - Normalizes (lowercase + collapse whitespace) so trivial formatting - differences fingerprint the same, then SHA-256s the result. Strings and - structured message lists are both supported. + Normalizes case and whitespace so trivial formatting differences fingerprint + the same, then SHA-256s the result. For a message transcript, each message's + role and text are normalized individually (so whitespace/case differences in + the text don't defeat dedup) before hashing. 
""" if isinstance(content, str): - normalized = " ".join(content.split()).lower() + normalized = _normalize_text(content) + elif isinstance(content, list): + parts = [] + for message in content: + if isinstance(message, dict): + role = str(message.get("role", "")).strip().lower() + parts.append(f"{role}\x1f{_normalize_text(str(message.get('content', '')))}") + else: + parts.append(_normalize_text(str(message))) + normalized = "\x1e".join(parts) else: - normalized = json.dumps(content, sort_keys=True, separators=(",", ":")).lower() + normalized = _normalize_text(json.dumps(content, sort_keys=True)) return hashlib.sha256(normalized.encode("utf-8")).hexdigest() diff --git a/app/rest.py b/app/rest.py index 9e3a0a3..c7bd778 100644 --- a/app/rest.py +++ b/app/rest.py @@ -24,8 +24,10 @@ class AddMemoryRequest(BaseModel): agent_id: str | None = None run_id: str | None = None metadata: dict | None = None - # When true (default), byte-identical content already stored is skipped - # before mem0's LLM fact-extraction runs. Set false to force re-extraction. + # When true (default), content already stored is skipped before mem0's LLM + # fact-extraction runs. Matching is on a normalized fingerprint (case- + # insensitive, whitespace-collapsed), not raw bytes. Set false to force + # re-extraction. dedup: bool = True diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md index 0e1feff..d66bf47 100644 --- a/docs/USER_GUIDE.md +++ b/docs/USER_GUIDE.md @@ -52,11 +52,12 @@ from your text, then stores each fact as a vector **embedding** (OpenAI by defau Searches are semantic: you ask in natural language and get back the most similar stored facts, not keyword matches. -**Exact-duplicate add is free.** Before that LLM extraction runs, the server fingerprints the raw -content; if you submit byte-identical content that's already stored, the add is skipped (no LLM -call) and the response is `{"results": [], "deduplicated": true, "memory_id": "…"}`. 
This makes -re-runs of imports and webhook/n8n retries cheap and idempotent. Pass `"dedup": false` on a REST add -to force re-extraction. (This is distinct from mem0's *semantic* dedup, which still applies when +**Re-adding the same content is free.** Before that LLM extraction runs, the server fingerprints the +content (normalized: lowercased and whitespace-collapsed, so differences in case or spacing still +match); if it matches something already stored, the add is skipped (no LLM call) and the response is +`{"results": [], "deduplicated": true, "memory_id": "…"}`. This makes re-runs of imports and +webhook/n8n retries cheap and idempotent. Pass `"dedup": false` on a REST add to force +re-extraction. (This is distinct from mem0's *semantic* dedup, which still applies when similar-but-not-identical content reaches the LLM.) Memories can optionally be tagged with: @@ -496,8 +497,9 @@ response bodies are JSON. `user_id` defaults to `MEM0_DEFAULT_USER_ID` if omitte Provide **either** `content` (a string) **or** `messages` (a chat transcript). Optional: `agent_id`, `run_id`, `metadata`, `user_id`, and `dedup` (default `true`). -By default, submitting content byte-identical to something already stored is skipped before the LLM -runs and returns `{"results": [], "deduplicated": true, "memory_id": "…"}` (see +By default, submitting content that matches something already stored — compared on a normalized +fingerprint (case-insensitive, whitespace-collapsed), not raw bytes — is skipped before the LLM runs +and returns `{"results": [], "deduplicated": true, "memory_id": "…"}` (see [How memory works](#how-memory-works)). Set `"dedup": false` to force re-extraction. ```bash @@ -605,9 +607,10 @@ python scripts/import_readwise.py ~/Downloads/readwise.csv invokes the fact-extraction LLM (see the [Configuration reference](#configuration-reference)). A large ChatGPT or Obsidian import can mean thousands of LLM calls — use `--dry-run` and `--limit` first to gauge volume. 
-**Re-running an import is cheap and idempotent:** byte-identical content already stored is skipped -*before* the LLM runs (see [How memory works](#how-memory-works)), so a second pass over the same -export adds nothing and costs nothing. +**Re-running an import is cheap and idempotent:** content already stored — matched on a normalized +fingerprint (case-insensitive, whitespace-collapsed) — is skipped *before* the LLM runs (see +[How memory works](#how-memory-works)), so a second pass over the same export adds nothing and costs +nothing. > Requirements: Python 3.12 and the project's dependencies installed > (`pip install -r requirements.txt`); the scripts add the repo root to diff --git a/tests/test_memory.py b/tests/test_memory.py index a1ffa04..e680526 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -49,10 +49,20 @@ def test_content_fingerprint_differs_for_different_content(): def test_content_fingerprint_handles_message_lists(): - msgs = [{"role": "user", "content": "hi"}] - fp = content_fingerprint(msgs) - assert isinstance(fp, str) and len(fp) == 64 - assert content_fingerprint(msgs) == fp # stable + base = [{"role": "user", "content": "Hello World"}] + # Case + whitespace (incl. newlines) inside message text are normalized. + equivalent = [{"role": "user", "content": "hello world"}] + newlined = [{"role": "user", "content": "hello\nworld"}] + assert content_fingerprint(base) == content_fingerprint(equivalent) + assert content_fingerprint(base) == content_fingerprint(newlined) + assert len(content_fingerprint(base)) == 64 + # A different role or different text fingerprints differently. + assert content_fingerprint(base) != content_fingerprint( + [{"role": "assistant", "content": "hello world"}] + ) + assert content_fingerprint(base) != content_fingerprint( + [{"role": "user", "content": "goodbye world"}] + ) # --- _existing_fingerprint_id ------------------------------------------------