From 7e8948eb86a333a656c2db22e66f89d71cd43f9b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 7 Jun 2026 22:44:46 +0000
Subject: [PATCH 1/2] Add content-fingerprint dedup on the write path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Skip mem0's LLM fact-extraction when already-stored content is re-submitted.
Before add, the raw input is normalized (lowercase + collapse whitespace) and
SHA-256'd; the fingerprint is stored in the `content_fp` Qdrant payload field
and looked up (via the vector store's filter) on the next add. A repeat whose
normalized fingerprint matches returns
{"results": [], "deduplicated": true, "memory_id": ...} without calling the
LLM, so matching is case- and whitespace-insensitive rather than byte-exact.
Adapts the OB1 content-fingerprint-dedup recipe.

Verified against a live Qdrant: metadata lands top-level in the payload and
filtering on the (unindexed) content_fp field works, so no payload index is
required. The dedup check is fail-open — any lookup error just proceeds with a
normal add, so it can only ever save work, never block a write.

- app/memory.py: content_fingerprint(), _existing_fingerprint_id() (fail-open),
  and add_memory(content, dedup=True, **kwargs) wrapper.
- app/rest.py: AddMemoryRequest.dedup flag (default true); route through wrapper.
- app/mcp_server.py: add_memory tool routes through the wrapper (always dedups).
- tests: fingerprint normalization, lookup found/empty/fail-open, wrapper
  store/skip/dedup-false/metadata-merge, plus REST + MCP integration cases.
  conftest defaults the dedup lookup to empty so existing add tests are unaffected.
- docs: USER_GUIDE (How memory works + add endpoint + import idempotency),
  DEVELOPER_GUIDE memory.py description.

Closes #48.

https://claude.ai/code/session_017835DVrvURaYnbQiPQwzue
---
 app/mcp_server.py       |   5 +-
 app/memory.py           |  60 +++++++++++++++++++++++
 app/rest.py             |   6 ++-
 docs/DEVELOPER_GUIDE.md |   6 ++-
 docs/USER_GUIDE.md      |  20 ++++++--
 tests/conftest.py       |   3 ++
 tests/test_mcp.py       |  10 ++++
 tests/test_memory.py    | 103 +++++++++++++++++++++++++++++++++++++++-
 tests/test_rest.py      |  32 +++++++++++++
 9 files changed, 236 insertions(+), 9 deletions(-)

diff --git a/app/mcp_server.py b/app/mcp_server.py
index 639d62a..c7bdc71 100644
--- a/app/mcp_server.py
+++ b/app/mcp_server.py
@@ -22,13 +22,16 @@ def add_memory(content: str, agent_id: str | None = None, metadata: dict | None
         agent_id is an optional provenance tag recording which agent wrote the
         memory. It does NOT partition the store — search and list always span
         every memory for the user, so all connected agents share one memory.
+
+        Submitting the same content again is automatically deduplicated and
+        skips re-processing, so it's safe to call without checking first.
         """
         kwargs: dict = {"user_id": default_user}
         if agent_id:
             kwargs["agent_id"] = agent_id
         if metadata:
             kwargs["metadata"] = metadata
-        return memory.add(content, **kwargs)
+        return memory_mod.add_memory(content, **kwargs)
 
     @mcp.tool
     def search_memories(query: str, limit: int = 10, recency_weight: float = 0.0) -> dict:
diff --git a/app/memory.py b/app/memory.py
index a093e28..17ac0b4 100644
--- a/app/memory.py
+++ b/app/memory.py
@@ -1,3 +1,5 @@
+import hashlib
+import json
 from functools import lru_cache
 
 from app.config import Settings, get_settings
@@ -45,3 +47,61 @@ def get_memory():
     from mem0 import Memory
 
     return Memory.from_config(_build_config(get_settings()))
+
+
+def content_fingerprint(content) -> str:
+    """A deterministic fingerprint of the raw add() input, for cheap dedup.
+
+    Normalizes (lowercase + collapse whitespace) so trivial formatting
+    differences fingerprint the same, then SHA-256s the result. Strings and
+    structured message lists are both supported.
+    """
+    if isinstance(content, str):
+        normalized = " ".join(content.split()).lower()
+    else:
+        normalized = json.dumps(content, sort_keys=True, separators=(",", ":")).lower()
+    return hashlib.sha256(normalized.encode("utf-8")).hexdigest()
+
+
+def _existing_fingerprint_id(memory, fingerprint: str, user_id: str | None) -> str | None:
+    """Return the id of an already-stored memory with this fingerprint, or None.
+
+    Best-effort: the fingerprint is matched against the `content_fp` payload
+    field via the vector store's filter. Any error (store quirk, transient
+    failure) returns None so the dedup check never blocks a write — it only ever
+    saves work, never prevents it.
+    """
+    filters: dict = {"content_fp": fingerprint}
+    if user_id:
+        filters["user_id"] = user_id
+    try:
+        result = memory.vector_store.list(filters=filters, top_k=1)
+    except Exception:
+        return None
+    # mem0's Qdrant store returns a (points, next_offset) tuple; normalize that
+    # and a bare-list return to the points list.
+    points = result[0] if isinstance(result, tuple) else result
+    if not points:
+        return None
+    return getattr(points[0], "id", None)
+
+
+def add_memory(content, *, dedup: bool = True, **kwargs) -> dict:
+    """Add a memory, optionally skipping mem0's LLM extraction for exact repeats.
+
+    With `dedup` on (default), a normalized SHA-256 fingerprint of the raw
+    content is computed and stored in metadata as `content_fp`. If a memory with
+    the same fingerprint already exists for the user, the add is skipped — no LLM
+    fact-extraction call — and a `{"deduplicated": True}` marker is returned.
+    Pass `dedup=False` to force a normal add (e.g. to re-extract).
+    """
+    memory = get_memory()
+    if not dedup:
+        return memory.add(content, **kwargs)
+    fingerprint = content_fingerprint(content)
+    existing_id = _existing_fingerprint_id(memory, fingerprint, kwargs.get("user_id"))
+    if existing_id is not None:
+        return {"results": [], "deduplicated": True, "memory_id": existing_id}
+    metadata = dict(kwargs.pop("metadata", None) or {})
+    metadata["content_fp"] = fingerprint
+    return memory.add(content, metadata=metadata, **kwargs)
diff --git a/app/rest.py b/app/rest.py
index 23ed76c..9e3a0a3 100644
--- a/app/rest.py
+++ b/app/rest.py
@@ -24,6 +24,9 @@ class AddMemoryRequest(BaseModel):
     agent_id: str | None = None
     run_id: str | None = None
     metadata: dict | None = None
+    # When true (default), byte-identical content already stored is skipped
+    # before mem0's LLM fact-extraction runs. Set false to force re-extraction.
+    dedup: bool = True
 
 
 class SearchRequest(BaseModel):
@@ -58,12 +61,11 @@ def _scope_kwargs(
 def add_memory(req: AddMemoryRequest) -> dict:
     if not req.content and not req.messages:
         raise HTTPException(status_code=422, detail="Provide either 'content' or 'messages'")
-    memory = memory_mod.get_memory()
     payload = req.content if req.content is not None else [m.model_dump() for m in req.messages]
     kwargs = _scope_kwargs(req.user_id, req.agent_id, req.run_id)
     if req.metadata:
         kwargs["metadata"] = req.metadata
-    return memory.add(payload, **kwargs)
+    return memory_mod.add_memory(payload, dedup=req.dedup, **kwargs)
 
 
 @router.post("/memories/search")
diff --git a/docs/DEVELOPER_GUIDE.md b/docs/DEVELOPER_GUIDE.md
index ba3e591..9807ed1 100644
--- a/docs/DEVELOPER_GUIDE.md
+++ b/docs/DEVELOPER_GUIDE.md
@@ -59,7 +59,11 @@ app/
   config.py         Settings (pydantic-settings); single source of config truth. Rejects
                     startup on missing required vars; validates provider keys.
   memory.py         mem0 wrapper. _build_config() assembles the mem0 config dict; get_memory()
-                    is the @lru_cache'd shared instance. The most tweak-prone file.
+                    is the @lru_cache'd shared instance. add_memory() wraps mem0's add with a
+                    cheap content-fingerprint dedup: it SHA-256s the normalized raw input, stores
+                    it in the `content_fp` payload field, and skips the LLM extraction if a memory
+                    with that fingerprint already exists (fail-open — a lookup error just proceeds).
+                    The most tweak-prone file.
   mcp_server.py     build_mcp(): the six MCP tools, each thinly wrapping a mem0 op with
                     user_id defaulted to MEM0_DEFAULT_USER_ID.
   rest.py           REST router under /api/v1 (mounted with prefix in main.py). Pydantic request
diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md
index bfba442..0e1feff 100644
--- a/docs/USER_GUIDE.md
+++ b/docs/USER_GUIDE.md
@@ -52,6 +52,13 @@ from your text, then stores each fact as a vector **embedding** (OpenAI by defau
 Searches are semantic: you ask in natural language and get back the most similar stored facts, not
 keyword matches.
 
+**Exact-duplicate add is free.** Before that LLM extraction runs, the server fingerprints the raw
+content; if you submit byte-identical content that's already stored, the add is skipped (no LLM
+call) and the response is `{"results": [], "deduplicated": true, "memory_id": "…"}`. This makes
+re-runs of imports and webhook/n8n retries cheap and idempotent. Pass `"dedup": false` on a REST add
+to force re-extraction. (This is distinct from mem0's *semantic* dedup, which still applies when
+similar-but-not-identical content reaches the LLM.)
+
 Memories can optionally be tagged with:
 
 - `agent_id` — a provenance tag for which agent/tool wrote it (e.g. `n8n-flow`, `claude-code`).
@@ -487,7 +494,11 @@ response bodies are JSON. `user_id` defaults to `MEM0_DEFAULT_USER_ID` if omitte
 ### Add a memory — `POST /api/v1/memories`
 
 Provide **either** `content` (a string) **or** `messages` (a chat transcript). Optional:
-`agent_id`, `run_id`, `metadata`, `user_id`.
+`agent_id`, `run_id`, `metadata`, `user_id`, and `dedup` (default `true`).
+
+By default, submitting content byte-identical to something already stored is skipped before the LLM
+runs and returns `{"results": [], "deduplicated": true, "memory_id": "…"}` (see
+[How memory works](#how-memory-works)). Set `"dedup": false` to force re-extraction.
 
 ```bash
 curl -X POST https://mem0.your-domain.com/api/v1/memories \
@@ -590,12 +601,13 @@ python scripts/import_obsidian.py ~/my-vault --limit 5
 python scripts/import_readwise.py ~/Downloads/readwise.csv
 ```
 
-**Cost note.** Every imported memory goes through the normal `add` path, which
+**Cost note.** Every *new* imported memory goes through the normal `add` path, which
 invokes the fact-extraction LLM (see the
 [Configuration reference](#configuration-reference)). A large ChatGPT or Obsidian import can mean
 thousands of LLM calls — use `--dry-run` and `--limit` first to gauge volume.
-mem0 also deduplicates semantically on add, so re-importing the same content
-often results in no new memories.
+**Re-running an import is cheap and idempotent:** byte-identical content already stored is skipped
+*before* the LLM runs (see [How memory works](#how-memory-works)), so a second pass over the same
+export adds nothing and costs nothing.
 
 > Requirements: Python 3.12 and the project's dependencies installed
 > (`pip install -r requirements.txt`); the scripts add the repo root to
diff --git a/tests/conftest.py b/tests/conftest.py
index 9a2011f..e5aa6d2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -34,6 +34,9 @@
 @pytest.fixture
 def mem():
     FAKE_MEMORY.reset_mock()
+    # Default: no existing fingerprint, so add_memory()'s dedup check is a no-op
+    # and proceeds to call .add(). Tests exercising dedup override this.
+    FAKE_MEMORY.vector_store.list.return_value = ([], None)
     return FAKE_MEMORY
 
 
diff --git a/tests/test_mcp.py b/tests/test_mcp.py
index 8e9a76b..59f8292 100644
--- a/tests/test_mcp.py
+++ b/tests/test_mcp.py
@@ -33,6 +33,16 @@ async def test_add_memory_tool(mcp, mem):
     args, kwargs = mem.add.call_args
     assert args[0] == "remember this"
     assert kwargs["user_id"] == "default-user"
+    assert "content_fp" in kwargs["metadata"]  # dedup fingerprint stored
+
+
+async def test_add_memory_tool_deduplicates(mcp, mem):
+    from types import SimpleNamespace
+
+    mem.vector_store.list.return_value = ([SimpleNamespace(id="dup-1")], None)
+    async with Client(mcp) as client:
+        await client.call_tool("add_memory", {"content": "remember this"})
+    mem.add.assert_not_called()  # exact repeat is skipped, no LLM extraction
 
 
 async def test_search_memories_tool(mcp, mem):
diff --git a/tests/test_memory.py b/tests/test_memory.py
index cc7ca8e..a1ffa04 100644
--- a/tests/test_memory.py
+++ b/tests/test_memory.py
@@ -1,5 +1,13 @@
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
 from app.config import Settings
-from app.memory import _build_config
+from app.memory import (
+    _build_config,
+    _existing_fingerprint_id,
+    add_memory,
+    content_fingerprint,
+)
 
 
 def test_build_config_shape():
@@ -26,3 +34,96 @@ def test_build_config_accepted_by_mem0_schema():
     from mem0.configs.base import MemoryConfig
 
     MemoryConfig(**_build_config(Settings()))
+
+
+# --- content fingerprint -----------------------------------------------------
+
+
+def test_content_fingerprint_normalizes_whitespace_and_case():
+    assert content_fingerprint("Hello  World") == content_fingerprint("  hello world  ")
+    assert content_fingerprint("a\nb") == content_fingerprint("a b")
+
+
+def test_content_fingerprint_differs_for_different_content():
+    assert content_fingerprint("apples") != content_fingerprint("oranges")
+
+
+def test_content_fingerprint_handles_message_lists():
+    msgs = [{"role": "user", "content": "hi"}]
+    fp = content_fingerprint(msgs)
+    assert isinstance(fp, str) and len(fp) == 64
+    assert content_fingerprint(msgs) == fp  # stable
+
+
+# --- _existing_fingerprint_id ------------------------------------------------
+
+
+def test_existing_fingerprint_id_found():
+    mem = MagicMock()
+    mem.vector_store.list.return_value = ([SimpleNamespace(id="m-1")], None)
+    assert _existing_fingerprint_id(mem, "fp", "ian") == "m-1"
+    _, kwargs = mem.vector_store.list.call_args
+    assert kwargs["filters"] == {"content_fp": "fp", "user_id": "ian"}
+
+
+def test_existing_fingerprint_id_none_when_empty():
+    mem = MagicMock()
+    mem.vector_store.list.return_value = ([], None)
+    assert _existing_fingerprint_id(mem, "fp", "ian") is None
+
+
+def test_existing_fingerprint_id_fails_open_on_error():
+    mem = MagicMock()
+    mem.vector_store.list.side_effect = RuntimeError("qdrant down")
+    # A dedup-check failure must never block a write.
+    assert _existing_fingerprint_id(mem, "fp", "ian") is None
+
+
+# --- add_memory wrapper ------------------------------------------------------
+
+
+def _patch_memory(monkeypatch, *, existing):
+    """Patch app.memory.get_memory to a fake whose dedup lookup returns `existing`."""
+    import app.memory as m
+
+    fake = MagicMock()
+    points = [SimpleNamespace(id=existing)] if existing else []
+    fake.vector_store.list.return_value = (points, None)
+    fake.add.return_value = {"results": [{"id": "new"}]}
+    monkeypatch.setattr(m, "get_memory", lambda: fake)
+    return fake
+
+
+def test_add_memory_stores_fingerprint_when_new(monkeypatch):
+    fake = _patch_memory(monkeypatch, existing=None)
+    out = add_memory("remember this", user_id="ian", agent_id="cli")
+    assert out == {"results": [{"id": "new"}]}
+    args, kwargs = fake.add.call_args
+    assert args[0] == "remember this"
+    assert kwargs["user_id"] == "ian"
+    assert kwargs["agent_id"] == "cli"
+    assert "content_fp" in kwargs["metadata"]
+
+
+def test_add_memory_skips_when_duplicate(monkeypatch):
+    fake = _patch_memory(monkeypatch, existing="dup-1")
+    out = add_memory("remember this", user_id="ian")
+    assert out == {"results": [], "deduplicated": True, "memory_id": "dup-1"}
+    fake.add.assert_not_called()  # no LLM extraction for an exact repeat
+
+
+def test_add_memory_dedup_false_skips_check(monkeypatch):
+    fake = _patch_memory(monkeypatch, existing="dup-1")
+    add_memory("remember this", dedup=False, user_id="ian")
+    fake.vector_store.list.assert_not_called()  # no dedup lookup at all
+    fake.add.assert_called_once()
+    _, kwargs = fake.add.call_args
+    assert "content_fp" not in (kwargs.get("metadata") or {})  # no fingerprint added
+
+
+def test_add_memory_merges_existing_metadata(monkeypatch):
+    fake = _patch_memory(monkeypatch, existing=None)
+    add_memory("x", user_id="ian", metadata={"source": "import"})
+    _, kwargs = fake.add.call_args
+    assert kwargs["metadata"]["source"] == "import"
+    assert "content_fp" in kwargs["metadata"]
diff --git a/tests/test_rest.py b/tests/test_rest.py
index 949a8a2..536e7ab 100644
--- a/tests/test_rest.py
+++ b/tests/test_rest.py
@@ -31,6 +31,38 @@ def test_add_memory_requires_content_or_messages(app_instance, mem, auth_header)
     assert resp.status_code == 422
 
 
+def test_add_memory_stores_fingerprint(app_instance, mem, auth_header):
+    mem.add.return_value = {"results": []}
+    c = _client(app_instance)
+    resp = c.post("/api/v1/memories", json={"content": "hi"}, headers=auth_header)
+    assert resp.status_code == 200
+    _, kwargs = mem.add.call_args
+    assert "content_fp" in kwargs["metadata"]  # fingerprint stored for dedup
+
+
+def test_add_memory_deduplicates_exact_repeat(app_instance, mem, auth_header):
+    from types import SimpleNamespace
+
+    mem.vector_store.list.return_value = ([SimpleNamespace(id="dup-1")], None)
+    c = _client(app_instance)
+    resp = c.post("/api/v1/memories", json={"content": "hi"}, headers=auth_header)
+    assert resp.status_code == 200
+    assert resp.json() == {"results": [], "deduplicated": True, "memory_id": "dup-1"}
+    mem.add.assert_not_called()  # no LLM extraction on a duplicate
+
+
+def test_add_memory_dedup_false_bypasses_check(app_instance, mem, auth_header):
+    from types import SimpleNamespace
+
+    mem.vector_store.list.return_value = ([SimpleNamespace(id="dup-1")], None)
+    mem.add.return_value = {"results": []}
+    c = _client(app_instance)
+    resp = c.post("/api/v1/memories", json={"content": "hi", "dedup": False}, headers=auth_header)
+    assert resp.status_code == 200
+    mem.add.assert_called_once()
+    mem.vector_store.list.assert_not_called()
+
+
 def test_search(app_instance, mem, auth_header):
     mem.search.return_value = {"results": []}
     c = _client(app_instance)

From 9b58afbc6a2388fe87331318918a31cbd2fd5d6d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 7 Jun 2026 22:54:07 +0000
Subject: [PATCH 2/2] Address dedup review: normalize message transcripts; fix
 "byte-identical" wording

- app/memory.py: the message-list fingerprint path previously json-dumped +
  lowercased, which did NOT collapse internal whitespace (newlines/tabs become
  escaped \n in JSON), contradicting the docstring. Normalize each message
  individually — role stripped + lowercased, text lowercased with whitespace
  collapsed — so equivalent transcripts dedupe. Extract a shared
  _normalize_text() helper.
- Wording: dedup matches a normalized fingerprint (case-insensitive,
  whitespace-collapsed), not raw bytes. Replace "byte-identical" everywhere
  (app/rest.py dedup-field comment, USER_GUIDE x3) with accurate phrasing.
- tests: assert message-transcript normalization (case/whitespace/newline
  equivalence, and that differing role/text fingerprint differently).

https://claude.ai/code/session_017835DVrvURaYnbQiPQwzue
---
 app/memory.py        | 26 +++++++++++++++++++++-----
 app/rest.py          |  6 ++++--
 docs/USER_GUIDE.md   | 23 +++++++++++++----------
 tests/test_memory.py | 18 ++++++++++++++----
 4 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/app/memory.py b/app/memory.py
index 17ac0b4..f042a86 100644
--- a/app/memory.py
+++ b/app/memory.py
@@ -49,17 +49,33 @@ def get_memory():
     return Memory.from_config(_build_config(get_settings()))
 
 
+def _normalize_text(text: str) -> str:
+    # Lowercase and collapse all runs of whitespace (incl. newlines/tabs) to a
+    # single space, so trivial formatting differences fingerprint the same.
+    return " ".join(text.split()).lower()
+
+
 def content_fingerprint(content) -> str:
     """A deterministic fingerprint of the raw add() input, for cheap dedup.
 
-    Normalizes (lowercase + collapse whitespace) so trivial formatting
-    differences fingerprint the same, then SHA-256s the result. Strings and
-    structured message lists are both supported.
+    Normalizes case and whitespace so trivial formatting differences fingerprint
+    the same, then SHA-256s the result. For a message transcript, each message's
+    role and text are normalized individually (so whitespace/case differences in
+    the text don't defeat dedup) before hashing.
     """
     if isinstance(content, str):
-        normalized = " ".join(content.split()).lower()
+        normalized = _normalize_text(content)
+    elif isinstance(content, list):
+        parts = []
+        for message in content:
+            if isinstance(message, dict):
+                role = str(message.get("role", "")).strip().lower()
+                parts.append(f"{role}\x1f{_normalize_text(str(message.get('content', '')))}")
+            else:
+                parts.append(_normalize_text(str(message)))
+        normalized = "\x1e".join(parts)
     else:
-        normalized = json.dumps(content, sort_keys=True, separators=(",", ":")).lower()
+        normalized = _normalize_text(json.dumps(content, sort_keys=True))
     return hashlib.sha256(normalized.encode("utf-8")).hexdigest()
 
 
diff --git a/app/rest.py b/app/rest.py
index 9e3a0a3..c7bd778 100644
--- a/app/rest.py
+++ b/app/rest.py
@@ -24,8 +24,10 @@ class AddMemoryRequest(BaseModel):
     agent_id: str | None = None
     run_id: str | None = None
     metadata: dict | None = None
-    # When true (default), byte-identical content already stored is skipped
-    # before mem0's LLM fact-extraction runs. Set false to force re-extraction.
+    # When true (default), content already stored is skipped before mem0's LLM
+    # fact-extraction runs. Matching is on a normalized fingerprint (case-
+    # insensitive, whitespace-collapsed), not raw bytes. Set false to force
+    # re-extraction.
     dedup: bool = True
 
 
diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md
index 0e1feff..d66bf47 100644
--- a/docs/USER_GUIDE.md
+++ b/docs/USER_GUIDE.md
@@ -52,11 +52,12 @@ from your text, then stores each fact as a vector **embedding** (OpenAI by defau
 Searches are semantic: you ask in natural language and get back the most similar stored facts, not
 keyword matches.
 
-**Exact-duplicate add is free.** Before that LLM extraction runs, the server fingerprints the raw
-content; if you submit byte-identical content that's already stored, the add is skipped (no LLM
-call) and the response is `{"results": [], "deduplicated": true, "memory_id": "…"}`. This makes
-re-runs of imports and webhook/n8n retries cheap and idempotent. Pass `"dedup": false` on a REST add
-to force re-extraction. (This is distinct from mem0's *semantic* dedup, which still applies when
+**Re-adding the same content is free.** Before that LLM extraction runs, the server fingerprints the
+content (normalized: lowercased and whitespace-collapsed, so differences in case or spacing still
+match); if it matches something already stored, the add is skipped (no LLM call) and the response is
+`{"results": [], "deduplicated": true, "memory_id": "…"}`. This makes re-runs of imports and
+webhook/n8n retries cheap and idempotent. Pass `"dedup": false` on a REST add to force
+re-extraction. (This is distinct from mem0's *semantic* dedup, which still applies when
 similar-but-not-identical content reaches the LLM.)
 
 Memories can optionally be tagged with:
@@ -496,8 +497,9 @@ response bodies are JSON. `user_id` defaults to `MEM0_DEFAULT_USER_ID` if omitte
 Provide **either** `content` (a string) **or** `messages` (a chat transcript). Optional:
 `agent_id`, `run_id`, `metadata`, `user_id`, and `dedup` (default `true`).
 
-By default, submitting content byte-identical to something already stored is skipped before the LLM
-runs and returns `{"results": [], "deduplicated": true, "memory_id": "…"}` (see
+By default, submitting content that matches something already stored — compared on a normalized
+fingerprint (case-insensitive, whitespace-collapsed), not raw bytes — is skipped before the LLM runs
+and returns `{"results": [], "deduplicated": true, "memory_id": "…"}` (see
 [How memory works](#how-memory-works)). Set `"dedup": false` to force re-extraction.
 
 ```bash
@@ -605,9 +607,10 @@ python scripts/import_readwise.py ~/Downloads/readwise.csv
 invokes the fact-extraction LLM (see the
 [Configuration reference](#configuration-reference)). A large ChatGPT or Obsidian import can mean
 thousands of LLM calls — use `--dry-run` and `--limit` first to gauge volume.
-**Re-running an import is cheap and idempotent:** byte-identical content already stored is skipped
-*before* the LLM runs (see [How memory works](#how-memory-works)), so a second pass over the same
-export adds nothing and costs nothing.
+**Re-running an import is cheap and idempotent:** content already stored — matched on a normalized
+fingerprint (case-insensitive, whitespace-collapsed) — is skipped *before* the LLM runs (see
+[How memory works](#how-memory-works)), so a second pass over the same export adds nothing and costs
+nothing.
 
 > Requirements: Python 3.12 and the project's dependencies installed
 > (`pip install -r requirements.txt`); the scripts add the repo root to
diff --git a/tests/test_memory.py b/tests/test_memory.py
index a1ffa04..e680526 100644
--- a/tests/test_memory.py
+++ b/tests/test_memory.py
@@ -49,10 +49,20 @@ def test_content_fingerprint_differs_for_different_content():
 
 
 def test_content_fingerprint_handles_message_lists():
-    msgs = [{"role": "user", "content": "hi"}]
-    fp = content_fingerprint(msgs)
-    assert isinstance(fp, str) and len(fp) == 64
-    assert content_fingerprint(msgs) == fp  # stable
+    base = [{"role": "user", "content": "Hello   World"}]
+    # Case + whitespace (incl. newlines) inside message text are normalized.
+    equivalent = [{"role": "user", "content": "hello world"}]
+    newlined = [{"role": "user", "content": "hello\nworld"}]
+    assert content_fingerprint(base) == content_fingerprint(equivalent)
+    assert content_fingerprint(base) == content_fingerprint(newlined)
+    assert len(content_fingerprint(base)) == 64
+    # A different role or different text fingerprints differently.
+    assert content_fingerprint(base) != content_fingerprint(
+        [{"role": "assistant", "content": "hello world"}]
+    )
+    assert content_fingerprint(base) != content_fingerprint(
+        [{"role": "user", "content": "goodbye world"}]
+    )
 
 
 # --- _existing_fingerprint_id ------------------------------------------------