diff --git a/src/__tests__/CitationDrawer.test.tsx b/src/__tests__/CitationDrawer.test.tsx index ae41f937..538fdfbf 100644 --- a/src/__tests__/CitationDrawer.test.tsx +++ b/src/__tests__/CitationDrawer.test.tsx @@ -857,7 +857,6 @@ describe("useCitationDrawer", () => { describe("getStatusPriority", () => { it("returns 1 for verified statuses", () => { expect(getStatusPriority({ status: "found" })).toBe(1); - expect(getStatusPriority({ status: "found_context_missed_source_match" })).toBe(1); }); it("returns 2 for pending/null statuses", () => { @@ -872,6 +871,8 @@ describe("getStatusPriority", () => { expect(getStatusPriority({ status: "found_on_other_line" })).toBe(3); expect(getStatusPriority({ status: "first_word_found" })).toBe(3); expect(getStatusPriority({ status: "found_source_match_only" })).toBe(3); + // issue-228: found_context_missed_source_match is a partial match (sourceMatch ⊄ sourceContext) + expect(getStatusPriority({ status: "found_context_missed_source_match" })).toBe(3); }); it("returns 4 for not_found status", () => { diff --git a/src/__tests__/citationParser.test.ts b/src/__tests__/citationParser.test.ts index 72ee0cc7..0d688e7c 100644 --- a/src/__tests__/citationParser.test.ts +++ b/src/__tests__/citationParser.test.ts @@ -294,7 +294,7 @@ ${CITATION_DATA_END_DELIMITER}`; expect(Object.keys(citations).length).toBe(0); }); - it("skips citations without sourceContext", () => { + it("admits citations with sourceMatch but no sourceContext (issue-235)", () => { const response = `Test [1]. ${CITATION_DATA_START_DELIMITER} @@ -302,7 +302,8 @@ ${CITATION_DATA_START_DELIMITER} ${CITATION_DATA_END_DELIMITER}`; const citations = getAllCitationsFromNumericResponse(response); - expect(Object.keys(citations).length).toBe(0); + // sourceMatch-only entries are admitted instead of dropped + expect(Object.keys(citations).length).toBe(1); }); }); diff --git a/src/__tests__/citationParsingEdgeCases.test.ts b/src/__tests__/citationParsingEdgeCases.test.ts index bdbf687b..ddd5fb3b 100644 --- a/src/__tests__/citationParsingEdgeCases.test.ts +++ b/src/__tests__/citationParsingEdgeCases.test.ts @@ -75,15 +75,16 @@ describe("Citation Parsing Edge Cases", () => { }); describe("Edge cases with incomplete data", () => { - it("skips citations without source_context", () => { + it("admits citations with sourceMatch but no source_context (issue-235)", () => { const input = makeNumericResponse("Test [1] [2]", [ { id: 1, attachment_id: "test123", source_match: "no phrase" }, { id: 2, attachment_id: "test123", source_context: "has phrase", source_match: "phrase", page_id: "1_0" }, ]); const result = getAllCitationsFromLlmOutput(input); - // Only citations with sourceContext are included - expect(Object.keys(result).length).toBe(1); - expect(Object.values(result)[0].sourceContext).toBe("has phrase"); + // Both citations are admitted — sourceMatch-only entries are no longer dropped + expect(Object.keys(result).length).toBe(2); + const withContext = Object.values(result).find(c => c.sourceContext === "has phrase"); + expect(withContext).toBeDefined(); }); it("handles empty input", () => { diff --git a/src/__tests__/parseCitation.test.ts b/src/__tests__/parseCitation.test.ts index 70da0889..b44f3551 100644 --- a/src/__tests__/parseCitation.test.ts +++ b/src/__tests__/parseCitation.test.ts @@ -269,11 +269,13 @@ describe("getCitationStatus", () => { expect(status.isVerified).toBe(true); // Partial matches ARE verified (amber checkmark) }); - it("treats found_context_missed_source_match as verified but not partial", () => { + // issue-228: found_context_missed_source_match must downgrade to partial — sourceMatch ⊄ sourceContext + // violates §1 and must never read as fully Verified. + it("treats found_context_missed_source_match as partial match (amber), not fully verified", () => { const verification: Verification = { citation: { - sourceMatch: "term", - sourceContext: "term", + sourceMatch: "F43.10", + sourceContext: "Most responsible DSM-5 diagnosis / borderline personality disorder.", attachmentId: "file", }, document: { @@ -283,8 +285,8 @@ describe("getCitationStatus", () => { sourceSnippet: "snippet", }; const status = getCitationStatus(verification); - expect(status.isVerified).toBe(true); - expect(status.isPartialMatch).toBe(false); + expect(status.isVerified).toBe(true); // Partial matches ARE verified (amber checkmark) + expect(status.isPartialMatch).toBe(true); }); it("treats found_source_match_only as partial match", () => { diff --git a/src/__tests__/parseCitationResponse.test.ts b/src/__tests__/parseCitationResponse.test.ts index b1989d80..4fd263a6 100644 --- a/src/__tests__/parseCitationResponse.test.ts +++ b/src/__tests__/parseCitationResponse.test.ts @@ -430,3 +430,81 @@ describe("parseCitationResponse — cite link format", () => { expect(segments[0]).not.toContain("(cite:"); }); }); + +// ─── Issue-235: orphan marker admission (sourceMatch without sourceContext) ── + +describe("parseCitationResponse — sourceMatch-only admission (issue-235)", () => { + it("admits a citation that has sourceMatch but no sourceContext", () => { + // Simulates an LLM output where the citation block has sourceMatch + pageNumber + // but omits sourceContext entirely. Before the fix, this entry was silently + // dropped, leaving [3] in prose with no markerMap entry → permanent pulsing chip. + const response = makeNumericResponse( + "The patient has moderate impairment [3] and follows medication schedule [4].", + [ + { + id: 3, + attachment_id: "att_aish_form_123456789", + reasoning: "Selected option for mental health impairment", + // No source_context — only source_match + source_match: "Medium or moderate impairment", + page_id: "page_number_1_index_0", + line_ids: [14], + }, + { + id: 4, + attachment_id: "att_aish_form_123456789", + reasoning: "Medication adherence", + source_context: "Patient follows prescribed medication schedule", + source_match: "medication schedule", + page_id: "page_number_2_index_0", + line_ids: [7], + }, + ], + ); + + const result = parseCitationResponse(response); + + // Both markers must have entries — no orphan + expect(result.markerMap[3]).toBeDefined(); + expect(result.markerMap[4]).toBeDefined(); + + // The sourceMatch-only entry must be in citations + const citationKey3 = result.markerMap[3]; + const citation3 = result.citations[citationKey3]; + expect(citation3).toBeDefined(); + expect(citation3.sourceMatch).toBe("Medium or moderate impairment"); + // sourceContext should be absent/empty (not fabricated) + expect(citation3.sourceContext ?? "").toBe(""); + + // The normal entry is unaffected + const citationKey4 = result.markerMap[4]; + expect(result.citations[citationKey4].sourceContext).toBe("Patient follows prescribed medication schedule"); + }); + + it("does not admit an entry with neither sourceContext nor sourceMatch", () => { + // An entry with no searchable text should still be dropped — it provides + // no way to locate or display the citation. + const response = makeNumericResponse("Claim one [1] and claim two [2].", [ + { + id: 1, + attachment_id: "att_aish_form_123456789", + reasoning: "First claim", + source_context: "Valid source context", + source_match: "Valid match", + page_id: "page_number_1_index_0", + }, + { + id: 2, + attachment_id: "att_aish_form_123456789", + reasoning: "Second claim has no searchable text", + // Both source_context and source_match are absent + page_id: "page_number_1_index_0", + }, + ]); + + const result = parseCitationResponse(response); + expect(result.markerMap[1]).toBeDefined(); + // Entry 2 has no searchable text — still silently dropped + expect(result.markerMap[2]).toBeUndefined(); + }); +}); diff --git a/src/__tests__/searchNarrative.test.ts b/src/__tests__/searchNarrative.test.ts index dfb322cb..9d44a974 100644 --- a/src/__tests__/searchNarrative.test.ts +++ b/src/__tests__/searchNarrative.test.ts @@ -22,6 +22,16 @@ describe("buildSearchNarrative", () => { expect(narrative.colorScheme).toBe("amber"); }); + it("returns partial_match for 'found_context_missed_source_match'", () => { + const attempts: SearchAttempt[] = [ + { method: "source_match_fallback", success: true, searchPhrase: "F43.10", pageSearched: 1 }, + ]; + const narrative = buildSearchNarrative(attempts, "found_context_missed_source_match"); + expect(narrative.outcome).toBe("partial_match"); + expect(narrative.colorScheme).toBe("amber"); + expect(narrative.outcomeSummary).toBe("Partial match"); + }); + it("returns not_found for 'not_found' status", () => { const attempts: SearchAttempt[] = [ { method: "exact_line_match", success: false, searchPhrase: "missing text", pageSearched: 1 }, @@ -64,6 +74,23 @@ describe("buildSearchNarrative", () => { expect(narrative.showAllRows).toBe(true); }); + it("is true for 'found_context_missed_source_match' so the partial search trail remains visible", () => { + const attempts: SearchAttempt[] = [ + { method: "exact_line_match", success: false, searchPhrase: "diagnosis", pageSearched: 1 }, + { + method: "source_match_fallback", + success: true, + searchPhrase: "F43.10", + pageSearched: 1, + foundLocation: { page: 1 }, + }, + ]; + const narrative = buildSearchNarrative(attempts, "found_context_missed_source_match"); + expect(narrative.showAllRows).toBe(true); + expect(narrative.groupedAttemptCount).toBe(2); + expect(narrative.rows.map(row => row.kind)).toEqual(["failure", "success"]); + }); + it("is true for null status", () => { const narrative = buildSearchNarrative([], null); expect(narrative.showAllRows).toBe(true); diff --git a/src/__tests__/searchSummaryUtils.test.ts b/src/__tests__/searchSummaryUtils.test.ts index 4cde0330..42d800df 100644 --- a/src/__tests__/searchSummaryUtils.test.ts +++ b/src/__tests__/searchSummaryUtils.test.ts @@ -445,6 +445,32 @@ describe("buildIntentSummary", () => { expect(result.snippets[0].isProximate).toBe(false); // adjacent_pages = distal }); + it("returns related_found for found_context_missed_source_match", () => { + const verification: Verification = { + status: "found_context_missed_source_match", + citation: { + type: "document", + sourceContext: "Most responsible DSM-5 diagnosis / borderline personality disorder.", + sourceMatch: "F43.10", + pageNumber: 1, + }, + }; + const result = buildIntentSummary(verification, [ + attempt({ + searchPhrase: "F43.10", + success: true, + matchedText: "F43.10", + method: "source_match_fallback", + foundLocation: { page: 1 }, + }), + ]); + expect(result).not.toBeNull(); + if (result == null) return; + expect(result.outcome).toBe("related_found"); + expect(result.snippets).toHaveLength(1); + expect(result.snippets[0].matchedText).toBe("F43.10"); + }); + it("classifies proximate methods correctly", () => { const verification: Verification = { status: "found_on_other_line", diff --git a/src/__tests__/urlAccessExplanation.test.tsx b/src/__tests__/urlAccessExplanation.test.tsx index 4598ab58..fdc19000 100644 --- a/src/__tests__/urlAccessExplanation.test.tsx +++ b/src/__tests__/urlAccessExplanation.test.tsx @@ -8,6 +8,7 @@ mock.module("react-dom", () => { import { act, cleanup, fireEvent, render, screen, waitFor } from "@testing-library/react"; import { CitationComponent } from "../react/Citation"; +import { mapSearchStatusToFetchStatus } from "../react/urlAccessExplanation"; import type { Citation } from "../types/citation"; import type { UrlAccessStatus, Verification } from "../types/verification"; @@ -363,3 +364,9 @@ describe("URL Access Explanation in CitationComponent", () => { }); }); }); + +describe("mapSearchStatusToFetchStatus", () => { + it("maps found_context_missed_source_match to partial", () => { + expect(mapSearchStatusToFetchStatus("found_context_missed_source_match")).toBe("partial"); + }); +}); diff --git a/src/analysis/intent.ts b/src/analysis/intent.ts index 1ba255c7..e0413f0e 100644 --- a/src/analysis/intent.ts +++ b/src/analysis/intent.ts @@ -225,8 +225,9 @@ export function buildIntentSummary( }; } - // For found status without displacement → exact_match - if (status === "found" || status === "found_context_missed_source_match") { + // For found status without displacement → exact_match. Other found variants + // can still be related/partial matches even when a source phrase was located. + if (status === "found") { return { outcome: "exact_match", sourceContext, diff --git a/src/analysis/narrative.ts b/src/analysis/narrative.ts index 2213e301..9a7a494a 100644 --- a/src/analysis/narrative.ts +++ b/src/analysis/narrative.ts @@ -203,8 +203,9 @@ function getOutcomeSummary( switch (status) { case "found": - case "found_context_missed_source_match": return t("outcome.exactMatch"); + case "found_context_missed_source_match": + return t("outcome.partialMatch"); case "found_source_match_only": return t("outcome.sourceMatchOnly"); case "found_on_other_page": diff --git a/src/analysis/statusRegistry.ts b/src/analysis/statusRegistry.ts index 9959323b..499717c5 100644 --- a/src/analysis/statusRegistry.ts +++ b/src/analysis/statusRegistry.ts @@ -39,10 +39,10 @@ export const STATUS_MAP = { showOnlyHit: false, }, found_context_missed_source_match: { - outcome: "exact_match", - colorScheme: "green", - headerKey: "status.verified", - showOnlyHit: true, + outcome: "partial_match", + colorScheme: "amber", + headerKey: "status.partialMatch", + showOnlyHit: false, }, found_on_other_page: { outcome: "partial_match", diff --git a/src/parsing/__tests__/parseWorkAround.test.ts b/src/parsing/__tests__/parseWorkAround.test.ts new file mode 100644 index 00000000..06038321 --- /dev/null +++ b/src/parsing/__tests__/parseWorkAround.test.ts @@ -0,0 +1,87 @@ +import { describe, expect, it } from "bun:test"; +import { cleanRepeatingLastSentence, isGeminiGarbage } from "../parseWorkAround"; + +describe("isGeminiGarbage", () => { + describe("single-character repetition", () => { + it("detects all-same-character strings", () => { + expect(isGeminiGarbage("a".repeat(100))).toBe(true); + }); + + it("returns false for short all-same-character strings", () => { + expect(isGeminiGarbage("a".repeat(10))).toBe(false); + }); + + it("returns false for normal text", () => { + expect(isGeminiGarbage("The quick brown fox jumps over the lazy dog.")).toBe(false); + }); + }); + + describe("multi-character repeating unit", () => { + it("detects repeated lines", () => { + const garbage = "\n".repeat(20); + expect(isGeminiGarbage(garbage)).toBe(true); + }); + + it("detects repeated HTML tags without trailing newline", () => { + const lines = Array(20).fill("").join("\n"); + expect(isGeminiGarbage(lines)).toBe(true); + }); + + it("detects other repeated multi-char tokens", () => { + const garbage = Array(15).fill("
").join("\n"); + expect(isGeminiGarbage(garbage)).toBe(true); + }); + + it("detects repeated markup separated by blank lines", () => { + const garbage = Array(15).fill("\n").join("\n"); + expect(isGeminiGarbage(garbage)).toBe(true); + }); + + it("returns false when lines differ", () => { + const normal = ["First sentence.", "Second sentence.", "Third sentence."].join("\n"); + expect(isGeminiGarbage(normal)).toBe(false); + }); + + it("does not classify legitimate repeated text rows as garbage", () => { + const repeatedRows = Array(30).fill("N/A").join("\n"); + expect(isGeminiGarbage(repeatedRows)).toBe(false); + }); + + it("returns false when fewer than MIN_REPETITIONS lines", () => { + expect(isGeminiGarbage("")).toBe(false); + }); + }); + + describe("edge cases", () => { + it("returns false for empty string", () => { + expect(isGeminiGarbage("")).toBe(false); + }); + + it("returns false for whitespace-only string", () => { + expect(isGeminiGarbage(" ")).toBe(false); + }); + }); +}); + +describe("cleanRepeatingLastSentence", () => { + it("removes trailing repeated sentence", () => { + const repeated = "The cat sat on the mat. The dog ran fast. The dog ran fast."; + expect(cleanRepeatingLastSentence(repeated)).toBe("The cat sat on the mat. The dog ran fast."); + }); + + it("removes many repetitions keeping one copy", () => { + const base = "Something happened here."; + const repeated = base + " The fog rolled in. The fog rolled in. The fog rolled in."; + expect(cleanRepeatingLastSentence(repeated)).toBe(base + " The fog rolled in."); + }); + + it("returns text unchanged when no repetition", () => { + const text = "First sentence. Second sentence. Third sentence."; + expect(cleanRepeatingLastSentence(text)).toBe(text); + }); + + it("returns text unchanged when too short to repeat", () => { + const text = "Hello world."; + expect(cleanRepeatingLastSentence(text)).toBe(text); + }); +}); diff --git a/src/parsing/citationParser.ts b/src/parsing/citationParser.ts index 1e13e445..ac73d209 100644 --- a/src/parsing/citationParser.ts +++ b/src/parsing/citationParser.ts @@ -849,7 +849,11 @@ export function getAllCitationsFromNumericResponse(llmResponse: string): { for (const data of parsed.citations) { const citation = citationDataToCitation(data); - if (citation.sourceContext) { + // Admit citations that have either sourceContext OR sourceMatch. + // Dropping entries that only have sourceMatch (no sourceContext) caused + // orphan [N] markers in prose to render as permanently-pulsing chips + // because no map entry existed for the marker number. (issue-235) + if (citation.sourceContext || citation.sourceMatch) { const baseCitationKey = getCitationKey(citation); const citationKey = citations[baseCitationKey] && citations[baseCitationKey].citationNumber !== citation.citationNumber diff --git a/src/parsing/parseCitationResponse.ts b/src/parsing/parseCitationResponse.ts index 9e6fced7..7f30770c 100644 --- a/src/parsing/parseCitationResponse.ts +++ b/src/parsing/parseCitationResponse.ts @@ -56,7 +56,11 @@ function parseNumericFormat(llmOutput: string): ParsedCitationResult { const candidatesByMarker = new Map(); for (const data of parsed.citations) { const citation: Citation = citationDataToCitation(data); - if (citation.sourceContext) { + // Admit citations that have either sourceContext OR sourceMatch. + // Dropping entries that only have sourceMatch (no sourceContext) caused + // orphan [N] markers in prose to render as permanently-pulsing chips + // because no map entry existed for the marker number. (issue-235) + if (citation.sourceContext || citation.sourceMatch) { const key = allocateCitationKey( citations, getCitationKey(citation), diff --git a/src/parsing/parseWorkAround.ts b/src/parsing/parseWorkAround.ts index 3144df27..4814e96c 100644 --- a/src/parsing/parseWorkAround.ts +++ b/src/parsing/parseWorkAround.ts @@ -2,18 +2,58 @@ const MIN_CONTENT_LENGTH_FOR_GEMINI_GARBAGE = 64; const MIN_REPETITIONS = 2; const MIN_SENTENCE_CONTENT_LENGTH = 10; +const MIN_REPEATING_UNIT_LENGTH = 2; +const MAX_REPEATING_UNIT_LENGTH = 80; + +function isMarkupLikeRepeatingUnit(value: string): boolean { + if (value.length < MIN_REPEATING_UNIT_LENGTH || value.length > MAX_REPEATING_UNIT_LENGTH) return false; + return /^<\/?[a-z][a-z0-9:-]*(?:\s[^<>]*)?>$/i.test(value); +} + +function hasRepeatedMarkupLines(text: string): boolean { + let firstLine: string | undefined; + let repetitions = 0; + let lineStart = 0; + + for (let index = 0; index <= text.length; index++) { + if (index < text.length && text[index] !== "\n") continue; + + const line = text.slice(lineStart, index).trim(); + lineStart = index + 1; + if (!line) continue; + + if (firstLine === undefined) { + if (!isMarkupLikeRepeatingUnit(line)) return false; + firstLine = line; + repetitions = 1; + continue; + } + + if (line !== firstLine) return false; + repetitions++; + } + + return repetitions >= MIN_REPETITIONS; +} export const isGeminiGarbage = (content: string) => { if (!content) return false; const trimmedContent = content.trim(); if (trimmedContent.length < MIN_CONTENT_LENGTH_FOR_GEMINI_GARBAGE) return false; + // Single-character repetition (e.g. "aaaaaaa...") const firstCharacter = trimmedContent[0]; - + let allSameChar = true; for (let i = 1; i < trimmedContent.length; i++) { - if (trimmedContent[i] !== firstCharacter) return false; + if (trimmedContent[i] !== firstCharacter) { + allSameChar = false; + break; + } } - return true; + if (allSameChar) return true; + + // Multi-character markup repetition (e.g. "\n\n..."). + return hasRepeatedMarkupLines(trimmedContent); }; // Single linear scan — no regex, so the 100KB validateRegexInput cap does not diff --git a/src/react/urlAccessExplanation.ts b/src/react/urlAccessExplanation.ts index acd0777c..8f45a9e5 100644 --- a/src/react/urlAccessExplanation.ts +++ b/src/react/urlAccessExplanation.ts @@ -99,8 +99,8 @@ export function mapSearchStatusToFetchStatus(status: SearchStatus | null | undef switch (status) { case "found": case "found_source_match_only": - case "found_context_missed_source_match": return "verified"; + case "found_context_missed_source_match": case "found_on_other_page": case "found_on_other_line": case "partial_text_found": diff --git a/src/utils/citationStatus.ts b/src/utils/citationStatus.ts index c6ba011a..38fa7ca9 100644 --- a/src/utils/citationStatus.ts +++ b/src/utils/citationStatus.ts @@ -15,6 +15,7 @@ import { isExactOrDashVariantMatch, isExactOrDashVariantPrefixMatch } from "./te */ export const PARTIAL_STATUSES: ReadonlySet = new Set([ "found_source_match_only", + "found_context_missed_source_match", "partial_text_found", "found_on_other_page", "found_on_other_line", @@ -121,7 +122,8 @@ export function getCitationStatus( // verifier itself flagged the located occurrence as not confidently the // intended one, so it is never fully Verified (issue 58). const isPartialMatch = PARTIAL_STATUSES.has(status) || hasLowTrustMatch || approximate || ambiguous; - const isVerified = status === "found" || status === "found_context_missed_source_match" || isPartialMatch; + // issue-228: found_context_missed_source_match is now in PARTIAL_STATUSES — covered by isPartialMatch. + const isVerified = status === "found" || isPartialMatch; return { isVerified, isMiss, isPartialMatch, isPending }; } diff --git a/tests/playwright/specs/__snapshots__/visualShowcase.spec.tsx-snapshots/popover-showcase-chromium-linux.avif b/tests/playwright/specs/__snapshots__/visualShowcase.spec.tsx-snapshots/popover-showcase-chromium-linux.avif index 80bfe998..da87b9e6 100644 Binary files a/tests/playwright/specs/__snapshots__/visualShowcase.spec.tsx-snapshots/popover-showcase-chromium-linux.avif and b/tests/playwright/specs/__snapshots__/visualShowcase.spec.tsx-snapshots/popover-showcase-chromium-linux.avif differ diff --git a/tests/playwright/specs/__snapshots__/visualShowcase.spec.tsx-snapshots/popover-showcase-dark-chromium-linux.avif b/tests/playwright/specs/__snapshots__/visualShowcase.spec.tsx-snapshots/popover-showcase-dark-chromium-linux.avif index 2dd31832..085c7dd1 100644 Binary files a/tests/playwright/specs/__snapshots__/visualShowcase.spec.tsx-snapshots/popover-showcase-dark-chromium-linux.avif and b/tests/playwright/specs/__snapshots__/visualShowcase.spec.tsx-snapshots/popover-showcase-dark-chromium-linux.avif differ