From c23f802d29d489533740a4ef95a58b0b11a1d54b Mon Sep 17 00:00:00 2001 From: ARIA Date: Sat, 9 May 2026 10:08:36 +0000 Subject: [PATCH] =?UTF-8?q?ARIA=20self-improvement:=20suppress=20'[image?= =?UTF-8?q?=20=E2=80=94=20caption=20failed]'=20stub=20from=20active=20thre?= =?UTF-8?q?ads?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stop polluting working-memory active-thread surface with literal "[image — caption failed]" text on uncaptioned WhatsApp images. Per consolidated insight n_cba1a7f0, the stub: 1. Falsely suggested unread conversation content on every reflect/think tick, 2. Resembled the prompt-injection refusal captions (n_imginject1, n_imginj02), making real injection events harder to spot in the haystack, 3. Survived the Apr 27 saveConversationDigest fix and reappeared Apr 30 / May 9. Changes: - backend/integrations/whatsapp.ts: emit a clean "[image]" marker (no error preamble) when describeImage returns null/refusal. - backend/observer.ts: sanitizeImageCaption() now strips legacy stubs and emits "[image]" instead. Added assertNoCaptionFailedStub() runtime assertion in recordObservation() — strips and logs any observation whose text matches /caption failed/i so future regressions are visible. - backend/memory/working-memory.ts: updateConversationThreads() skips bare media markers ("[image]" / "[voice]" / "[document]") for new threads, never overwrites an informative topic with a bare marker, and rewrites any persisted "caption failed" topic from prior runs to "(image)". Verified with: npx tsc --noEmit (passes). Intent-summary: Image-only WhatsApp messages whose vision-LLM caption fails were emitting a literal "[image — caption failed]" stub that polluted active-thread topics and resembled prompt-injection text. Intent-tokens: image, caption, stub, pollution, threads, vision, marker Co-Authored-By: Claude Opus 4.7 --- backend/integrations/whatsapp.ts | 9 ++++---- backend/memory/working-memory.ts | 24 +++++++++++++++++++++ backend/observer.ts | 36 +++++++++++++++++++++++++++++--- 3 files changed, 62 insertions(+), 7 deletions(-) diff --git a/backend/integrations/whatsapp.ts b/backend/integrations/whatsapp.ts index 875beb82..fa2e85dc 100644 --- a/backend/integrations/whatsapp.ts +++ b/backend/integrations/whatsapp.ts @@ -340,10 +340,11 @@ export async function startWhatsApp( resolvedText = `[image] ${description}`; log.info(`Described image: ${description.slice(0, 80)}`); } else { - // Caption failed (vision unavailable, refusal, error). Keep a neutral - // marker so downstream reasoning sees an image arrived without - // ingesting fabricated/refusal content as if the sender wrote it. - resolvedText = "[image — caption failed]"; + // Caption failed (vision unavailable, refusal, error). Use a clean + // image marker — no error preamble — so downstream sees that an + // image arrived without polluting active-thread topics with text + // that resembles prompt-injection ("caption failed" / refusal text). + resolvedText = "[image]"; } } diff --git a/backend/memory/working-memory.ts b/backend/memory/working-memory.ts index 119e58fd..b557c4df 100644 --- a/backend/memory/working-memory.ts +++ b/backend/memory/working-memory.ts @@ -181,10 +181,28 @@ export function populateTemporalContext(wm: WorkingMemory): void { // ── Conversation Thread Tracking ── +/** + * True if `text` is a bare media marker (e.g. "[image]") with no caption — + * not useful as a thread topic. Captioned images are "[image] " + * and remain valid topics. + */ +function isBareMediaMarker(text: string): boolean { + const t = text.trim(); + return t === "[image]" || t === "[voice]" || t === "[document]"; +} + export function updateConversationThreads(wm: WorkingMemory, observations: Observation[]): void { const now = Date.now(); const STALE_THRESHOLD = 48 * 60 * 60 * 1000; // 48 hours + // Clean any persisted "caption failed" stub topics from prior runs so the + // active-thread surface stops surfacing them on every reflect/think tick. + for (const thread of wm.conversationThreads) { + if (thread.topic && /caption\s*failed/i.test(thread.topic)) { + thread.topic = "(image)"; + } + } + for (const obs of observations) { if (!obs.sender) continue; @@ -192,8 +210,13 @@ export function updateConversationThreads(wm: WorkingMemory, observations: Obser // incoming and outgoing messages map to the same thread. const key = obs.isGroup ? `group:${obs.groupName || obs.senderJid}` : `dm:${obs.chatJid || obs.senderJid}`; let thread = wm.conversationThreads.find(t => t.id === key); + const bareMedia = isBareMediaMarker(obs.text); if (!thread) { + // Skip thread creation for bare-media-only observations: an uncaptioned + // image from a known contact shouldn't populate the active-thread surface + // since the topic would just be "[image]" with no real conversation content. + if (bareMedia) continue; thread = { id: key, participants: [obs.sender], @@ -208,6 +231,7 @@ export function updateConversationThreads(wm: WorkingMemory, observations: Obser thread.lastMessageAt = obs.timestamp; thread.messageCount++; thread.status = "active"; + // Don't overwrite an informative topic with a bare media marker. if (!thread.participants.includes(obs.sender)) { thread.participants.push(obs.sender); diff --git a/backend/observer.ts b/backend/observer.ts index 67266367..90ff78cb 100644 --- a/backend/observer.ts +++ b/backend/observer.ts @@ -21,18 +21,47 @@ const log = createLogger("observer"); /** * If an image-prefixed observation carries a vision-LLM refusal as its - * "caption", replace it with a neutral marker. Refusal text masquerading as - * caption pollutes the observation stream and resembles prompt-injection. + * "caption", replace it with a clean image marker. Refusal text masquerading + * as caption pollutes the observation stream and resembles prompt-injection. + * + * The bare "[image]" marker is structured: downstream prompts (e.g. + * conversation-thread serialization) recognise it and skip it instead of + * surfacing a misleading topic. */ function sanitizeImageCaption(text: string): string { + // Strip any legacy "[image — caption failed]" stubs that may still live in + // observations.jsonl or be replayed from older code paths. + if (/^\[image\s*[—\-]\s*caption\s*failed\]\s*$/i.test(text.trim())) { + return "[image]"; + } if (!text.startsWith("[image]")) return text; const caption = text.slice("[image]".length).trim(); if (caption.length > 0 && isVisionRefusal(caption)) { - return "[image — caption failed]"; + return "[image]"; } return text; } +/** + * Defensive runtime assertion: no observation text should match /caption failed/i. + * Such text was the legacy stub for vision-LLM failures and pollutes the + * active-thread surface (n_cba1a7f0). If we ever see one slip through, strip + * it back to a clean image marker and log so we can trace the writer. + * + * The one legitimate exception is intentional injection-detection logging, + * which writes elsewhere and never lands in obs.text. + */ +function assertNoCaptionFailedStub(obs: Observation): void { + if (!obs.text || !/caption\s*failed/i.test(obs.text)) return; + log(`assertion: stripping legacy 'caption failed' stub from observation (sender=${obs.sender}, source=${obs.source || "whatsapp"}, text="${obs.text.slice(0, 80)}")`); + obs.text = obs.text.replace(/\[image\s*[—\-]\s*caption\s*failed\]/gi, "[image]").trim(); + if (/caption\s*failed/i.test(obs.text)) { + // Stub didn't match the bracketed form — replace any remaining occurrence wholesale. + obs.text = obs.text.replace(/caption\s*failed/gi, "").trim(); + } + if (!obs.text) obs.text = "[image]"; +} + const OBS_FILE = `${BRAIN_DIR}/observations.jsonl`; const RETENTION_DAYS = Number(process.env.BRAIN_OBSERVATION_DAYS ?? 7); @@ -192,6 +221,7 @@ export function recordObservation(obs: Observation): void { // before it lands in the observations file or downstream pipelines. if (obs.text) { obs.text = sanitizeImageCaption(obs.text); + assertNoCaptionFailedStub(obs); } const key = getObservationKey(obs);