diff --git a/ui/__tests__/research-rag.test.ts b/ui/__tests__/research-rag.test.ts new file mode 100644 index 0000000..c2b3ac7 --- /dev/null +++ b/ui/__tests__/research-rag.test.ts @@ -0,0 +1,137 @@ +import { + buildEvidencePayload, + buildResearchMetadata, + buildResearchQueries, + detectScientificSection, + fuseChromaResults, + parseBoundedInteger, +} from '@/utils/server/research-rag'; + +import { describe, expect, it } from 'vitest'; + +describe('research RAG helpers', () => { + it('builds deterministic research query variants', () => { + expect( + buildResearchQueries( + 'What does "retrieval augmented generation" improve in scientific workflows?', + ), + ).toEqual([ + 'What does "retrieval augmented generation" improve in scientific workflows', + 'retrieval augmented generation', + 'retrieval augmented generation improve scientific workflows', + ]); + }); + + it('detects scientific sections near chunk starts', () => { + expect(detectScientificSection('Abstract\nThis paper studies RAG.')).toBe( + 'abstract', + ); + expect( + detectScientificSection('Materials and Methods\nWe used a benchmark.'), + ).toBe('materials and methods'); + expect(detectScientificSection('A general paragraph.')).toBe('body'); + }); + + it('builds citation metadata without leaking temporary upload paths', () => { + const metadata = buildResearchMetadata( + { + pageContent: 'Results\nThe method improves grounded answers.', + metadata: { + loc: { pageNumber: 7 }, + pdf: { info: { Title: 'Grounded Scientific RAG' } }, + source: '/tmp/uploads/private/source-paper.pdf', + }, + }, + 'source-paper.pdf', + 3, + 1, + ); + + expect(metadata).toMatchObject({ + citationKey: 'grounded-scientific-rag:p7:c2', + page: 7, + section: 'results', + source: 'source-paper.pdf', + title: 'Grounded Scientific RAG', + }); + expect(metadata.source).not.toContain('/tmp/uploads'); + }); + + it('bounds integer request parameters', () => { + expect(parseBoundedInteger('20', 8, 16)).toBe(16); + 
expect(parseBoundedInteger(0, 8, 16)).toBe(8); + expect(parseBoundedInteger('bad', 8, 16)).toBe(8); + }); + + it('fuses duplicate chunks across query variants', () => { + const fused = fuseChromaResults( + { + documents: [ + ['The answer is grounded in chunk one.', 'A second chunk.'], + ['The answer is grounded in chunk one.'], + ], + metadatas: [ + [ + { + citationKey: 'paper:p1:c1', + page: 1, + source: 'paper.pdf', + title: 'Paper', + }, + { + citationKey: 'paper:p2:c1', + page: 2, + source: 'paper.pdf', + title: 'Paper', + }, + ], + [ + { + citationKey: 'paper:p1:c1', + page: 1, + source: 'paper.pdf', + title: 'Paper', + }, + ], + ], + distances: [[0.05, 0.4], [0.06]], + ids: [['a', 'b'], ['a']], + }, + 4, + ); + + expect(fused).toHaveLength(2); + expect(fused[0].citationKey).toBe('paper:p1:c1'); + expect(fused[0].rankScore).toBeGreaterThan(fused[1].rankScore); + }); + + it('formats bounded evidence and source manifests', () => { + const payload = buildEvidencePayload( + { + documents: [['Chunk about scientific retrieval.'.repeat(20)]], + metadatas: [ + [ + { + citationKey: 'paper:p1:c1', + page: 1, + section: 'abstract', + source: 'paper.pdf', + title: 'Paper', + }, + ], + ], + distances: [[0.1]], + }, + { maxChunkChars: 40, maxEvidenceChars: 200, maxResults: 2 }, + ); + + expect(payload.citations).toHaveLength(1); + expect(payload.citations[0].content.length).toBeLessThanOrEqual(40); + expect(payload.evidenceContext).toContain('[paper:p1:c1]'); + expect(payload.sourceManifest[0]).toMatchObject({ + citationKeys: ['paper:p1:c1'], + source: 'paper.pdf', + title: 'Paper', + }); + }); +}); diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..133fab3 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -1,25 +1,66 @@ -import type { NextApiRequest, NextApiResponse } from "next"; -import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; +import type { NextApiRequest, 
NextApiResponse } from 'next'; -export default async function handler(req: NextApiRequest, res: NextApiResponse) { +import { + buildEvidencePayload, + buildResearchQueries, + parseBoundedInteger, +} from '@/utils/server/research-rag'; + +import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse, +) { try { + if (req.method !== 'POST') { + res.setHeader('Allow', 'POST'); + return res.status(405).json({ error: 'Method not allowed' }); + } + const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const query = req.body.input; + const query = + typeof req.body.input === 'string' ? req.body.input.trim() : ''; + + if (!query) { + return res.status(400).json({ error: 'Missing retrieval query' }); + } + + const nResults = parseBoundedInteger(req.body.nResults, 8, 16); + const maxEvidenceChars = parseBoundedInteger( + req.body.maxEvidenceChars, + 12000, + 30000, + ); + const queryTexts = buildResearchQueries(query); const embedder = new TransformersEmbeddingFunction(); - const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder }); + const collection = await client.getOrCreateCollection({ + name: 'default-collection', + embeddingFunction: embedder, + }); + + // Query deterministic research-focused variants, then fuse the result sets. 
+ const results = await collection.query({ + nResults, + queryTexts, + include: ['documents', 'metadatas', 'distances'] as any, + }); - // query the collection - const results = await collection.query({ - nResults: 4, - queryTexts: [query] - }) + const evidence = buildEvidencePayload(results, { + maxEvidenceChars, + maxResults: nResults, + }); - res.status(200).json(results); + res.status(200).json({ + ...results, + queryTexts, + ...evidence, + }); } catch (error) { if (error instanceof Error) { console.error('Error message:', error.message); @@ -29,4 +70,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) } res.status(500).json({ error: 'An unexpected error occurred :(' }); } -} \ No newline at end of file +} diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..df25c59 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -1,10 +1,16 @@ import type { NextApiRequest, NextApiResponse } from 'next'; +import { + type LoadedDocument, + type PrimitiveMetadata, + RESEARCH_TEXT_SEPARATORS, + buildResearchMetadata, +} from '@/utils/server/research-rag'; + import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; - +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import path from 'path'; import { v4 as uuidv4 } from 'uuid'; @@ -33,22 +39,31 @@ export default async function handler( path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const loader = new PDFLoader(files.pdf[0].filepath); + const pdfFile = Array.isArray(files.pdf) ? 
files.pdf[0] : files.pdf; - const originalDocs = await loader.load(); + if (!pdfFile?.filepath) { + return res.status(400).json({ error: 'Missing PDF upload' }); + } - console.log(JSON.stringify(originalDocs)); + const fallbackSource = + pdfFile.originalFilename ?? path.basename(pdfFile.filepath); + const loader = new PDFLoader(pdfFile.filepath); + const originalDocs = await loader.load(); const splitter = new RecursiveCharacterTextSplitter({ - chunkSize: 500, - chunkOverlap: 100, - }); + chunkSize: 900, + chunkOverlap: 180, + separators: RESEARCH_TEXT_SEPARATORS, + }); const docs = await splitter.splitDocuments(originalDocs); - + // Process the documents and perform other logic - const { ids, metadatas, documentContents } = processDocuments(docs); + const { ids, metadatas, documentContents } = processDocuments( + docs, + fallbackSource, + ); const embedder = new TransformersEmbeddingFunction(); const collection = await client.getOrCreateCollection({ @@ -75,28 +90,41 @@ export default async function handler( } } -function processDocuments(docs: any) { - const ids = []; - const metadatas = []; - const documentContents = []; +function processDocuments(docs: LoadedDocument[], fallbackSource: string) { + const ids: string[] = []; + const metadatas: PrimitiveMetadata[] = []; + const documentContents: string[] = []; + const pageChunkCounts = new Map(); - for (const document of docs) { + for (let index = 0; index < docs.length; index += 1) { + const document = docs[index]; // Generate an ID for each document, or use some existing unique identifier const id = uuidv4(); ids.push(id); - const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; - - const title = titleFromMetadata && titleFromMetadata.length > 0 ? 
titleFromMetadata : fallbackTitle; - - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info - }; - metadatas.push(metadata); + const source = + typeof document.metadata === 'object' && + document.metadata !== null && + 'source' in document.metadata && + typeof document.metadata.source === 'string' + ? document.metadata.source + : fallbackSource; + const page = + typeof document.metadata === 'object' && + document.metadata !== null && + 'loc' in document.metadata && + typeof document.metadata.loc === 'object' && + document.metadata.loc !== null && + 'pageNumber' in document.metadata.loc + ? document.metadata.loc.pageNumber + : 'unknown'; + const pageKey = `${source}:${page}`; + const pageChunkIndex = pageChunkCounts.get(pageKey) ?? 0; + pageChunkCounts.set(pageKey, pageChunkIndex + 1); + + metadatas.push( + buildResearchMetadata(document, fallbackSource, index, pageChunkIndex), + ); // Add the page content to the documents array documentContents.push(document.pageContent); diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..184248a 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -1,6 +1,9 @@ import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const'; import { OpenAIError, OpenAIStream } from '@/utils/server'; -import { codeBlock, oneLine } from 'common-tags' +import type { + ResearchCitation, + SourceManifestEntry, +} from '@/utils/server/research-rag'; import { ChatBody, Message } from '@/types/chat'; @@ -9,46 +12,66 @@ import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module import tiktokenModel from '@dqbd/tiktoken/encoders/cl100k_base.json'; import { Tiktoken, init } from '@dqbd/tiktoken/lite/init'; +import { codeBlock, oneLine } from 'common-tags'; export const config = { runtime: 'edge', }; -// Function to fetch 
/**
 * Renders the source manifest as a numbered, newline-separated list for the
 * prompt.
 *
 * Each entry becomes `N. <title> (<source>) -> <key>, <key>, ...`, numbered
 * from 1 in manifest order. An empty manifest yields a fixed placeholder
 * sentence instead of an empty string.
 *
 * @param sourceManifest Manifest entries to render.
 * @returns The formatted manifest text.
 */
function formatSourceManifest(sourceManifest: SourceManifestEntry[]) {
  const lines: string[] = [];

  for (let position = 0; position < sourceManifest.length; position += 1) {
    const entry = sourceManifest[position];
    const keys = entry.citationKeys.join(', ');
    lines.push(`${position + 1}. ${entry.title} (${entry.source}) -> ${keys}`);
  }

  // Nothing to list: keep the explicit placeholder the prompt relies on.
  return lines.length > 0
    ? lines.join('\n')
    : 'No source manifest was produced.';
}
data.sourceManifest + : [], + }; } catch (error) { - console.error('Error fetching and formatting documents:', error); - throw error; // You may want to throw a more specific error object here + console.error('Error fetching research evidence:', error); + throw error; } } - - - - const handler = async (req: Request): Promise => { - try { const { model, messages, key, prompt, temperature } = (await req.json()) as ChatBody; @@ -62,10 +85,8 @@ const handler = async (req: Request): Promise => { let promptToSend = codeBlock` ${oneLine` - You are a very enthusiastic AI assistant who loves - to help people! Given the following information from - relevant documentation, answer the user's question using - only that information, outputted in markdown format. + You are a scientific research assistant. Given retrieved evidence from + uploaded documents, answer the user's question using only that evidence. `} ${oneLine` @@ -75,7 +96,7 @@ const handler = async (req: Request): Promise => { `} ${oneLine` - Always include citations from the documentation. + Every factual claim must include the exact citation key from the evidence. 
`} `; @@ -85,8 +106,9 @@ const handler = async (req: Request): Promise => { const lastMessage = messages[messages.length - 1]; - const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content); - + const { citations, evidenceContext, sourceManifest } = + await fetchResearchEvidence(req, lastMessage.content); + let temperatureToUse = temperature; if (temperatureToUse == null) { temperatureToUse = DEFAULT_TEMPERATURE; @@ -97,30 +119,43 @@ const handler = async (req: Request): Promise => { let tokenCount = prompt_tokens.length; let messagesToSend: Message[] = []; - encoding.free(); - console.log(model, promptToSend, temperatureToUse, key, messagesToSend); - - - messagesToSend = [ + messagesToSend = [ + { + role: 'user', + content: codeBlock` + Here is the retrieved evidence: + ${evidenceContext || 'No matching evidence was retrieved.'} + `, + }, { - role: "user", + role: 'user', content: codeBlock` - Here is the relevant documentation: - ${relevantDocuments} + Here is the source manifest: + ${formatSourceManifest(sourceManifest)} `, }, { - role: "user", + role: 'user', content: codeBlock` ${oneLine` - Answer my next question using only the above documentation. + Answer my next question using only the above evidence. You must also follow the below rules when answering: `} ${oneLine` - Do not make up answers that are not provided in the documentation. `} + ${oneLine` + - Cite claims inline with the exact keys shown in square brackets, + for example [paper-title:p3:c2]. + `} + ${oneLine` + - Only cite keys that appear in the source manifest or evidence. + `} + ${oneLine` + - Prefer the highest-ranked evidence when sources disagree. 
+ `} ${oneLine` - If you are unsure and the answer is not explicitly written in the documentation context, say @@ -135,19 +170,26 @@ const handler = async (req: Request): Promise => { `, }, { - role: "user", + role: 'user', content: codeBlock` Here is my question: ${oneLine`${lastMessage.content}`} `, }, - ] - + ]; + + if (citations.length === 0) { + messagesToSend.splice(2, 0, { + role: 'user', + content: + 'No citations were found. If the evidence is empty, say "Sorry, I don\'t know how to help with that."', + }); + } const stream = await OpenAIStream( model, promptToSend, - 0, + temperatureToUse, key, messagesToSend, ); diff --git a/ui/utils/server/research-rag.ts b/ui/utils/server/research-rag.ts new file mode 100644 index 0000000..3a64f1f --- /dev/null +++ b/ui/utils/server/research-rag.ts @@ -0,0 +1,590 @@ +type Primitive = string | number | boolean; + +type UnknownRecord = Record; + +export type PrimitiveMetadata = Record; + +export type LoadedDocument = { + pageContent: string; + metadata?: unknown; +}; + +export type ResearchChunkMetadata = PrimitiveMetadata & { + citationKey: string; + chunkIndex: number; + page: number | string; + section: string; + source: string; + sourceId: string; + title: string; +}; + +export type ChromaQueryResults = { + documents?: unknown; + distances?: unknown; + ids?: unknown; + metadatas?: unknown; +}; + +export type FusedResearchResult = { + citationKey: string; + content: string; + distance?: number; + id?: string; + metadata: Partial & PrimitiveMetadata; + rank: number; + rankScore: number; + sourceQueryIndex: number; +}; + +export type ResearchCitation = { + key: string; + title: string; + source: string; + page: number | string; + section: string; + distance?: number; + rankScore: number; + content: string; +}; + +export type SourceManifestEntry = { + sourceId: string; + title: string; + source: string; + citationKeys: string[]; +}; + +export type EvidencePayload = { + citations: ResearchCitation[]; + evidenceContext: 
string; + results: FusedResearchResult[]; + sourceManifest: SourceManifestEntry[]; +}; + +const SCIENTIFIC_SECTIONS = [ + 'abstract', + 'introduction', + 'background', + 'related work', + 'materials and methods', + 'methodology', + 'methods', + 'experiment', + 'experiments', + 'evaluation', + 'results', + 'discussion', + 'limitations', + 'conclusion', + 'references', +]; + +const STOP_WORDS = new Set([ + 'about', + 'after', + 'again', + 'also', + 'answer', + 'based', + 'before', + 'between', + 'could', + 'describe', + 'does', + 'explain', + 'from', + 'have', + 'how', + 'into', + 'paper', + 'papers', + 'please', + 'research', + 'should', + 'show', + 'that', + 'their', + 'there', + 'these', + 'this', + 'using', + 'what', + 'when', + 'where', + 'which', + 'with', +]); + +export const RESEARCH_TEXT_SEPARATORS = [ + '\nAbstract', + '\nABSTRACT', + '\nIntroduction', + '\nINTRODUCTION', + '\nBackground', + '\nBACKGROUND', + '\nRelated Work', + '\nRELATED WORK', + '\nMethods', + '\nMETHODS', + '\nMaterials and Methods', + '\nMATERIALS AND METHODS', + '\nMethodology', + '\nMETHODOLOGY', + '\nExperiments', + '\nEXPERIMENTS', + '\nResults', + '\nRESULTS', + '\nEvaluation', + '\nEVALUATION', + '\nDiscussion', + '\nDISCUSSION', + '\nLimitations', + '\nLIMITATIONS', + '\nConclusion', + '\nCONCLUSION', + '\nReferences', + '\nREFERENCES', + '\n\n', + '\n', + '. 
/**
 * Coerces a loosely typed value to a finite number.
 *
 * Finite numbers are returned unchanged. Strings are trimmed and parsed with
 * `Number()`. Every other input, including NaN, infinities and booleans,
 * yields `undefined`.
 *
 * @param value Value of unknown type, e.g. a request field or stored metadata.
 * @returns The finite number, or `undefined` when none can be derived.
 */
function asFiniteNumber(value: unknown): number | undefined {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : undefined;
  }

  if (typeof value === 'string') {
    const trimmed = value.trim();

    // Number('') and Number('   ') are both 0. A blank field means "missing",
    // not zero, so reject it before parsing.
    if (trimmed.length === 0) {
      return undefined;
    }

    const parsed = Number(trimmed);
    return Number.isFinite(parsed) ? parsed : undefined;
  }

  return undefined;
}

/**
 * Parses a request parameter as a positive integer capped at `max`.
 *
 * @param value Raw parameter (number or numeric string; anything else is
 *   treated as missing).
 * @param fallback Returned when `value` is missing, non-numeric, or floors to
 *   less than 1.
 * @param max Upper bound applied to accepted values.
 * @returns `Math.floor(value)` capped at `max`, or `fallback`.
 */
export function parseBoundedInteger(
  value: unknown,
  fallback: number,
  max: number,
): number {
  const parsed = asFiniteNumber(value);

  if (parsed === undefined) {
    return fallback;
  }

  // Floor before the range check: a fraction such as 0.5 is greater than 0 but
  // floors to 0, which would otherwise be returned as a "positive" count.
  const whole = Math.floor(parsed);

  if (whole < 1) {
    return fallback;
  }

  return Math.min(whole, max);
}
/**
 * Expands a user question into a small, deterministic list of retrieval
 * queries.
 *
 * Variants are produced in this order and de-duplicated:
 *   1. the question with whitespace collapsed and trailing `?`, `!`, `.`
 *      removed;
 *   2. every double-quoted phrase of 8 to 120 characters found in the input;
 *   3. a keyword query built from up to ten unique terms that are at least
 *      four characters long and not in `STOP_WORDS` (needs two or more terms);
 *   4. a shorter keyword query of the first six terms (needs four or more).
 *
 * @param input Raw user question.
 * @param maxQueries Maximum number of variants to return.
 * @returns Query variants, most literal first. Empty only when `input` is
 *   blank.
 */
export function buildResearchQueries(input: string, maxQueries = 3): string[] {
  const collapsed = collapseWhitespace(input);

  // Trim first, then strip sentence punctuation, so "Why? " loses its "?" too.
  const stripped = collapsed.replace(/[?!.]+$/g, '').trimEnd();

  // Punctuation-only input ("???") strips to nothing. Keep the raw text so a
  // non-blank question always produces at least one query.
  const normalized = stripped || collapsed;

  const queries = [normalized];
  const quotedPhrases = Array.from(input.matchAll(/"([^"]{8,120})"/g))
    .map((match) => collapseWhitespace(match[1]))
    .filter(Boolean);

  queries.push(...quotedPhrases);

  const terms = normalized
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, ' ')
    .split(/\s+/)
    .map((term) => term.trim())
    .filter((term) => term.length >= 4 && !STOP_WORDS.has(term));
  const uniqueTerms = Array.from(new Set(terms)).slice(0, 10);

  if (uniqueTerms.length >= 2) {
    queries.push(uniqueTerms.join(' '));
  }

  if (uniqueTerms.length >= 4) {
    // Identical to the previous variant when there are six terms or fewer; the
    // Set below removes the duplicate.
    queries.push(uniqueTerms.slice(0, 6).join(' '));
  }

  return Array.from(new Set(queries.filter(Boolean))).slice(0, maxQueries);
}
/**
 * Normalizes one raw Chroma metadata entry into the fields used for citations.
 *
 * Each field is read from the first matching key that holds a primitive value
 * and falls back to a default when absent: title 'Untitled source', source
 * 'unknown-source', page 'unknown', section 'body', chunk index
 * `fallbackIndex`. A missing citation key is built from title, page and chunk
 * index; a missing source id is a hash of the lowercased title and source.
 *
 * @param rawMetadata Metadata as returned by the vector store; non-objects are
 *   treated as empty.
 * @param fallbackIndex Chunk index to use when the metadata carries none.
 * @returns The normalized metadata record.
 */
function normalizeResultMetadata(
  rawMetadata: unknown,
  fallbackIndex: number,
): Partial<ResearchChunkMetadata> & PrimitiveMetadata {
  const record = isRecord(rawMetadata) ? rawMetadata : {};
  const readText = (keys: string[]) =>
    asCleanString(firstPrimitive(record, keys));
  const readNumber = (keys: string[]) =>
    asFiniteNumber(firstPrimitive(record, keys));

  const title = readText(['title', 'documentTitle']) ?? 'Untitled source';

  // Only the last path segment of the stored source is kept.
  const storedSource = readText([
    'source',
    'filename',
    'fileName',
    'sourcePath',
  ]);
  const source = basename(storedSource ?? 'unknown-source');

  // Prefer a numeric page; keep a non-numeric label (e.g. "iv") as text.
  const pageKeys = ['page', 'pageNumber'];
  const page = readNumber(pageKeys) ?? readText(pageKeys) ?? 'unknown';

  const chunkIndex = readNumber(['chunkIndex', 'chunk_index']) ?? fallbackIndex;
  const section = readText(['section']) ?? 'body';

  let citationKey = readText(['citationKey', 'citation_key']);
  if (citationKey === undefined) {
    citationKey = buildCitationKey({ chunkIndex, page, title });
  }

  let sourceId = readText(['sourceId', 'source_id']);
  if (sourceId === undefined) {
    sourceId = `DOC-${hashString(
      `${title.toLowerCase()}|${source.toLowerCase()}`,
    )}`;
  }

  return {
    citationKey,
    chunkIndex,
    page,
    section,
    source,
    sourceId,
    title,
  };
}
results.documents + : []; + const fused = new Map(); + + for ( + let queryIndex = 0; + queryIndex < documentsByQuery.length; + queryIndex += 1 + ) { + const documents = matrixRow(results.documents, queryIndex); + const metadatas = matrixRow(results.metadatas, queryIndex); + const distances = matrixRow(results.distances, queryIndex); + const ids = matrixRow(results.ids, queryIndex); + + for (let rank = 0; rank < documents.length; rank += 1) { + const content = asCleanString(documents[rank]); + if (!content) { + continue; + } + + const normalizedContent = collapseWhitespace(content); + const metadata = normalizeResultMetadata(metadatas[rank], rank); + const distance = asFiniteNumber(distances[rank]); + const citationKey = + metadata.citationKey ?? + `SRC-${hashString( + `${metadata.source}|${metadata.page}|${normalizedContent}`, + )}`; + const dedupeKey = `${citationKey}|${hashString( + normalizedContent.slice(0, 400), + )}`; + const queryWeight = queryIndex === 0 ? 1 : 0.88; + const rankContribution = queryWeight / (rank + 1); + const distanceContribution = + distance === undefined ? 
/**
 * Turns raw Chroma query results into the prompt-ready evidence payload.
 *
 * Results are fused with `fuseChromaResults`. Each fused result becomes a
 * citation whose content is truncated to `maxChunkChars`, and citations are
 * grouped per source into a manifest. The evidence text is one block per
 * citation (a header line with the bracketed key, then the content), joined by
 * blank lines.
 *
 * `evidenceContext` never exceeds `maxEvidenceChars`, separators included.
 * The first block that does not fit is truncated to the remaining room and
 * ends the evidence text. `citations` and `sourceManifest` still list every
 * fused result, including those left out of the evidence text.
 *
 * @param results Raw query results (documents, metadatas, distances, ids).
 * @param options.maxChunkChars Per-citation content limit (default 1400).
 * @param options.maxEvidenceChars Total evidence text limit (default 12000).
 * @param options.maxResults Maximum number of fused results (default 8).
 * @returns Citations, evidence text, fused results and source manifest.
 */
export function buildEvidencePayload(
  results: ChromaQueryResults,
  options: {
    maxChunkChars?: number;
    maxEvidenceChars?: number;
    maxResults?: number;
  } = {},
): EvidencePayload {
  const maxResults = options.maxResults ?? 8;
  const maxChunkChars = options.maxChunkChars ?? 1400;
  const maxEvidenceChars = options.maxEvidenceChars ?? 12000;
  const fusedResults = fuseChromaResults(results, maxResults);
  const citations: ResearchCitation[] = fusedResults.map((result) => ({
    key: result.citationKey,
    title: asCleanString(result.metadata.title) ?? 'Untitled source',
    source: asCleanString(result.metadata.source) ?? 'unknown-source',
    page: result.metadata.page ?? 'unknown',
    section: asCleanString(result.metadata.section) ?? 'body',
    distance: result.distance,
    rankScore: result.rankScore,
    content: safeTruncate(result.content, maxChunkChars),
  }));

  // Group citation keys by source; the id is derived from title and source so
  // chunks of the same document land in one manifest entry.
  const manifest = new Map<string, SourceManifestEntry>();

  for (const citation of citations) {
    const sourceId = `DOC-${hashString(
      `${citation.title.toLowerCase()}|${citation.source.toLowerCase()}`,
    )}`;
    const existing = manifest.get(sourceId);

    if (existing) {
      existing.citationKeys.push(citation.key);
      continue;
    }

    manifest.set(sourceId, {
      sourceId,
      title: citation.title,
      source: citation.source,
      citationKeys: [citation.key],
    });
  }

  const blockSeparator = '\n\n';
  let usedChars = 0;
  const evidenceBlocks: string[] = [];

  for (const citation of citations) {
    const distance =
      citation.distance === undefined
        ? ''
        : ` | Distance: ${citation.distance.toFixed(4)}`;
    const block = [
      `[${citation.key}] Title: ${citation.title} | Source: ${citation.source} | Page: ${citation.page} | Section: ${citation.section}${distance}`,
      citation.content,
    ].join('\n');

    // Every block after the first is preceded by the separator in the joined
    // text, so the separator counts against the budget as well.
    const separatorCost = evidenceBlocks.length === 0 ? 0 : blockSeparator.length;
    const remaining = maxEvidenceChars - usedChars - separatorCost;

    if (block.length > remaining) {
      if (remaining > 0) {
        evidenceBlocks.push(safeTruncate(block, remaining));
      }
      break;
    }

    evidenceBlocks.push(block);
    usedChars += separatorCost + block.length;
  }

  return {
    citations,
    evidenceContext: evidenceBlocks.join(blockSeparator),
    results: fusedResults,
    sourceManifest: Array.from(manifest.values()),
  };
}