Skip to content

Commit 309af4e

Browse files
penny-team[bot], jaredlockhart, claude
authored
Use AND strategy with lowercase embeddings for thought dedup (jaredlockhart#884)
The OR strategy produced false positives: common short words ("2026", "AI", "agent") matched via TCR on short titles. Switched to AND (both TCR >= 0.6 AND embedding >= 0.6 are required), which eliminates all false positives while still catching real duplicates.

Also lowercase titles before embedding so that casing doesn't affect similarity (e.g., "THE GHOST IN THE SHELL" vs "Ghost in the Shell" scored 0.381 before lowercasing and 0.652 after).

Lowered the THOUGHT_DEDUP_EMBEDDING_THRESHOLD default from 0.80 to 0.60, since title embeddings score lower than full-content embeddings.

Co-authored-by: Jared Lockhart <119884+jaredlockhart@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0e9c98f commit 309af4e

2 files changed

Lines changed: 5 additions & 5 deletions

File tree

penny/penny/agents/thinking.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ async def after_run(self, user: str) -> bool:
186186
if report and not await self._is_duplicate_thought(user, report):
187187
title, content = self._parse_title(report)
188188
content_embedding = await self._embed_and_serialize(content)
189-
title_embedding = await self._embed_and_serialize(title) if title else None
189+
title_embedding = await self._embed_and_serialize(title.lower()) if title else None
190190
self.db.thoughts.add(
191191
user,
192192
content,
@@ -273,7 +273,7 @@ async def _is_duplicate_thought(self, user: str, report: str) -> bool:
273273
if not title:
274274
return False
275275
title_vec = (
276-
await embed_text(self._embedding_model_client, title)
276+
await embed_text(self._embedding_model_client, title.lower())
277277
if self._embedding_model_client
278278
else None
279279
)
@@ -284,7 +284,7 @@ async def _is_duplicate_thought(self, user: str, report: str) -> bool:
284284
title,
285285
title_vec,
286286
existing_items,
287-
DedupStrategy.TCR_OR_EMBEDDING,
287+
DedupStrategy.TCR_AND_EMBEDDING,
288288
embedding_threshold=self.config.runtime.THOUGHT_DEDUP_EMBEDDING_THRESHOLD,
289289
tcr_threshold=self.config.runtime.THOUGHT_DEDUP_TCR_THRESHOLD,
290290
)

penny/penny/config_params.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,9 +232,9 @@ def _validate_unit_float(value: str) -> float:
232232

233233
ConfigParam(
234234
key="THOUGHT_DEDUP_EMBEDDING_THRESHOLD",
235-
description="Embedding similarity threshold for thought deduplication (0-1)",
235+
description="Embedding similarity threshold for thought title deduplication (0-1)",
236236
type=float,
237-
default=0.80,
237+
default=0.60,
238238
validator=_validate_unit_float,
239239
group=GROUP_INNER_MONOLOGUE,
240240
)

0 commit comments

Comments
 (0)