From 7bcf6a44be67115cf009988a50986754c16b7428 Mon Sep 17 00:00:00 2001 From: codedogQBY <1369175442@qq.com> Date: Sat, 13 Jun 2026 05:23:51 +0800 Subject: [PATCH] fix(rag): vectorize chapters from epub toc structure --- packages/app/src/lib/rag/book-extractor.ts | 75 +++----- packages/core/src/ai/__tests__/tools.test.ts | 6 +- packages/core/src/ai/tools/rag-tools.ts | 5 +- .../core/src/rag/chapter-structure.test.ts | 72 +++++++ packages/core/src/rag/chapter-structure.ts | 181 ++++++++++++++++++ packages/core/src/rag/index.ts | 7 + 6 files changed, 294 insertions(+), 52 deletions(-) create mode 100644 packages/core/src/rag/chapter-structure.test.ts create mode 100644 packages/core/src/rag/chapter-structure.ts diff --git a/packages/app/src/lib/rag/book-extractor.ts b/packages/app/src/lib/rag/book-extractor.ts index 4ba12396..1a3038a6 100644 --- a/packages/app/src/lib/rag/book-extractor.ts +++ b/packages/app/src/lib/rag/book-extractor.ts @@ -4,7 +4,7 @@ * text segments with EPUB CFI references for precise navigation. */ import { DocumentLoader } from "@/lib/reader/document-loader"; -import type { TOCItem } from "@/lib/reader/document-loader"; +import { buildChapterSectionGroups } from "@readany/core/rag"; import * as CFI from "foliate-js/epubcfi.js"; export interface TextSegment { @@ -35,32 +35,34 @@ export async function extractBookChapters(filePath: string): Promise s.text).join("\n\n"); + const content = chapterSegments.map((s) => s.text).join("\n\n"); - chapters.push({ index: i, title, content, segments }); - } catch (err) { - console.warn(`[extractBookChapters] Failed to extract section ${i}:`, err); - } + chapters.push({ index: group.index, title: group.title, content, segments: chapterSegments }); } return chapters; @@ -127,9 +129,10 @@ function getTextNodes(element: Element): Text[] { const walker = element.ownerDocument.createTreeWalker(element, NodeFilter.SHOW_TEXT, null); const nodes: Text[] = []; - let node: Text | null; - while ((node = walker.nextNode() as Text | null)) { - if (node.textContent && node.textContent.trim()) { + while (true) { + const node = walker.nextNode() as Text | null; + if (!node) break; + if (node.textContent?.trim()) { nodes.push(node); } } @@ -141,8 +144,9 @@ function extractBlockText(block: Element): string { const walker = block.ownerDocument.createTreeWalker(block, NodeFilter.SHOW_TEXT, null); const texts: string[] = []; - let node: Text | null; - while ((node = walker.nextNode() as Text | null)) { + while (true) { + const node = walker.nextNode() as Text | null; + if (!node) break; const text = node.textContent?.trim(); if (text) { texts.push(text); @@ -152,29 +156,6 @@ function extractBlockText(block: Element): string { return texts.join(" "); } -function buildTocMap(toc: TOCItem[]): Map { - const map = new Map(); - - function walk(items: TOCItem[]) { - for (const item of items) { - if (item.label) { - map.set(item.index, item.label); - if (item.href) { - const base = item.href.split("#")[0]; - map.set(base, item.label); - map.set(item.href, item.label); - } - } - if (item.subitems?.length) { - walk(item.subitems); - } - } - } - - walk(toc); - return map; -} - async function extractPdfChapters(fileBytes: Uint8Array): Promise { const pdfjsLib = await import("pdfjs-dist"); pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdn.jsdelivr.net/npm/pdfjs-dist@${pdfjsLib.version}/build/pdf.worker.min.mjs`; diff --git a/packages/core/src/ai/__tests__/tools.test.ts b/packages/core/src/ai/__tests__/tools.test.ts index bce3b395..6e6764e1 100644 --- a/packages/core/src/ai/__tests__/tools.test.ts +++ b/packages/core/src/ai/__tests__/tools.test.ts @@ -285,9 +285,9 @@ describe("ragToc tool", () => { expect(result.totalChapters).toBe(3); expect(result.chapters).toEqual([ - { index: 0, title: "Intro" }, - { index: 1, title: "Chapter 1" }, - { index: 2, title: "Chapter 2" }, + { index: 0, number: 1, title: "Intro" }, + { index: 1, number: 2, title: "Chapter 1" }, + { index: 2, number: 3, title: "Chapter 2" }, ]); }); }); diff --git a/packages/core/src/ai/tools/rag-tools.ts b/packages/core/src/ai/tools/rag-tools.ts index 8d45a1be..96ea5b90 100644 --- a/packages/core/src/ai/tools/rag-tools.ts +++ b/packages/core/src/ai/tools/rag-tools.ts @@ -97,7 +97,7 @@ export function createRagTocTool(bookId: string): ToolDefinition { return { name: "ragToc", description: - "Get the table of contents of the current book. Use this when the user wants to see the book structure or navigate to a specific chapter.", + "Get the table of contents of the current book. Use this when the user wants to see the book structure or navigate to a specific chapter. Use the returned 'index' when calling chapter tools; 'number' is the human-readable chapter order.", parameters: {}, execute: async () => { // Get unique chapter titles from chunks @@ -110,8 +110,9 @@ export function createRagTocTool(bookId: string): ToolDefinition { } return { - chapters: Array.from(chapters.entries()).map(([index, title]) => ({ + chapters: Array.from(chapters.entries()).map(([index, title], ordinal) => ({ index, + number: ordinal + 1, title, })), totalChapters: chapters.size, diff --git a/packages/core/src/rag/chapter-structure.test.ts b/packages/core/src/rag/chapter-structure.test.ts new file mode 100644 index 00000000..a52a81db --- /dev/null +++ b/packages/core/src/rag/chapter-structure.test.ts @@ -0,0 +1,72 @@ +import { describe, expect, it } from "vitest"; +import { buildChapterSectionGroups } from "./chapter-structure"; + +describe("buildChapterSectionGroups", () => { + it("uses leaf TOC entries as logical chapters for multi-volume books", () => { + const groups = buildChapterSectionGroups( + [ + { href: "cover.xhtml" }, + { href: "volume-1.xhtml" }, + { href: "chapter-1.xhtml" }, + { href: "chapter-1-extra.xhtml" }, + { href: "chapter-2.xhtml" }, + { href: "volume-2.xhtml" }, + { href: "chapter-3.xhtml" }, + ], + [ + { + label: "第一卷", + href: "volume-1.xhtml", + subitems: [ + { label: "第一章", href: "chapter-1.xhtml" }, + { label: "第二章", href: "chapter-2.xhtml" }, + ], + }, + { + label: "第二卷", + href: "volume-2.xhtml", + subitems: [{ label: "第三章", href: "chapter-3.xhtml" }], + }, + ], + ); + + expect(groups).toEqual([ + { index: 0, title: "第一章", sectionIndices: [2, 3] }, + { index: 1, title: "第二章", sectionIndices: [4] }, + { index: 2, title: "第三章", sectionIndices: [6] }, + ]); + }); + + it("falls back to top-level TOC entries when no leaf hrefs exist", () => { + const groups = buildChapterSectionGroups( + [{ href: "intro.xhtml" }, { href: "body.xhtml" }], + [{ label: "正文", href: "body.xhtml", subitems: [] }], + ); + + expect(groups).toEqual([{ index: 0, title: "正文", sectionIndices: [1] }]); + }); + + it("normalizes encoded and relative hrefs before matching sections", () => { + const groups = buildChapterSectionGroups( + [{ href: "Text/第1章.xhtml" }, { href: "Text/%E7%AC%AC2%E7%AB%A0.xhtml" }], + [ + { label: "第一章", href: "./Text/%E7%AC%AC1%E7%AB%A0.xhtml#start" }, + { label: "第二章", href: "第2章.xhtml" }, + ], + ); + + expect(groups).toEqual([ + { index: 0, title: "第一章", sectionIndices: [0] }, + { index: 1, title: "第二章", sectionIndices: [1] }, + ]); + }); + + it("falls back to one group per section when TOC has no usable anchors", () => { + const groups = buildChapterSectionGroups([{ href: "a.xhtml" }, { href: "b.xhtml" }], []); + + expect(groups).toEqual([ + { index: 0, title: "Section 1", sectionIndices: [0] }, + { index: 1, title: "Section 2", sectionIndices: [1] }, + ]); + }); +}); diff --git a/packages/core/src/rag/chapter-structure.ts b/packages/core/src/rag/chapter-structure.ts new file mode 100644 index 00000000..23ac1cbd --- /dev/null +++ b/packages/core/src/rag/chapter-structure.ts @@ -0,0 +1,181 @@ +export interface TocTreeItemLike { + label?: string; + href?: string; + index?: number; + subitems?: TocTreeItemLike[]; +} + +export interface SectionRefLike { + href?: string; +} + +export interface ChapterSectionGroup { + index: number; + title: string; + sectionIndices: number[]; +} + +interface TocAnchor { + title: string; + sectionIndex: number; +} + +export function buildChapterSectionGroups( + sections: SectionRefLike[], + toc: TocTreeItemLike[] = [], +): ChapterSectionGroup[] { + if (sections.length === 0) return []; + + const tocAnchors = getTocAnchors(sections, toc); + if (tocAnchors.length === 0) { + return sections.map((_, index) => ({ + index, + title: `Section ${index + 1}`, + sectionIndices: [index], + })); + } + + const boundaryIndices = getTocBoundaryIndices(sections, toc); + return tocAnchors.map((anchor, index) => { + const endExclusive = + boundaryIndices.find((boundaryIndex) => boundaryIndex > anchor.sectionIndex) ?? + sections.length; + return { + index, + title: anchor.title, + sectionIndices: range(anchor.sectionIndex, Math.max(anchor.sectionIndex + 1, endExclusive)), + }; + }); +} + +function getTocAnchors(sections: SectionRefLike[], toc: TocTreeItemLike[]): TocAnchor[] { + const hrefToSectionIndex = buildSectionHrefIndex(sections); + const leaves = flattenTocLeaves(toc).filter((item) => item.label?.trim() && item.href); + const candidates = leaves.length > 0 ? leaves : flattenToc(toc); + const anchors: TocAnchor[] = []; + const seenSectionIndices = new Set(); + + for (const item of candidates) { + const title = item.label?.trim(); + if (!title) continue; + + const sectionIndex = getSectionIndexForTocItem(item, hrefToSectionIndex, sections.length); + if (sectionIndex === null || seenSectionIndices.has(sectionIndex)) continue; + + seenSectionIndices.add(sectionIndex); + anchors.push({ title, sectionIndex }); + } + + return anchors.sort((a, b) => a.sectionIndex - b.sectionIndex); +} + +function getTocBoundaryIndices(sections: SectionRefLike[], toc: TocTreeItemLike[]): number[] { + const hrefToSectionIndex = buildSectionHrefIndex(sections); + const seen = new Set(); + + for (const item of flattenToc(toc)) { + const sectionIndex = getSectionIndexForTocItem(item, hrefToSectionIndex, sections.length); + if (sectionIndex !== null) { + seen.add(sectionIndex); + } + } + + return Array.from(seen).sort((a, b) => a - b); +} + +function buildSectionHrefIndex(sections: SectionRefLike[]): Map { + const hrefToSectionIndex = new Map(); + + for (let index = 0; index < sections.length; index++) { + const href = sections[index]?.href; + if (!href) continue; + + for (const key of getHrefLookupKeys(href)) { + if (!hrefToSectionIndex.has(key)) { + hrefToSectionIndex.set(key, index); + } + } + } + + return hrefToSectionIndex; +} + +function getSectionIndexForTocItem( + item: TocTreeItemLike, + hrefToSectionIndex: Map, + sectionCount: number, +): number | null { + if (item.href) { + for (const key of getHrefLookupKeys(item.href)) { + const sectionIndex = hrefToSectionIndex.get(key); + if (sectionIndex !== undefined) return sectionIndex; + } + } + + if ( + typeof item.index === "number" && + Number.isInteger(item.index) && + item.index >= 0 && + item.index < sectionCount + ) { + return item.index; + } + + return null; +} + +function flattenTocLeaves(toc: TocTreeItemLike[]): TocTreeItemLike[] { + const leaves: TocTreeItemLike[] = []; + + for (const item of toc) { + if (item.subitems?.length) { + leaves.push(...flattenTocLeaves(item.subitems)); + } else { + leaves.push(item); + } + } + + return leaves; +} + +function flattenToc(toc: TocTreeItemLike[]): TocTreeItemLike[] { + const items: TocTreeItemLike[] = []; + + for (const item of toc) { + items.push(item); + if (item.subitems?.length) { + items.push(...flattenToc(item.subitems)); + } + } + + return items; +} + +function getHrefLookupKeys(href: string): string[] { + const decoded = safeDecodeUri(href); + const withoutFragment = decoded.split("#")[0] || decoded; + const normalized = normalizeHrefPath(withoutFragment); + const fileName = normalized.split("/").pop() || normalized; + + return Array.from(new Set([decoded, withoutFragment, normalized, fileName].filter(Boolean))); +} + +function normalizeHrefPath(href: string): string { + return href.replace(/^\/+/, "").replace(/^\.\//, ""); +} + +function safeDecodeUri(value: string): string { + try { + return decodeURIComponent(value); + } catch { + return value; + } +} + +function range(start: number, endExclusive: number): number[] { + const values: number[] = []; + for (let index = start; index < endExclusive; index++) { + values.push(index); + } + return values; +} diff --git a/packages/core/src/rag/index.ts b/packages/core/src/rag/index.ts index 722a952e..50c86bfe 100644 --- a/packages/core/src/rag/index.ts +++ b/packages/core/src/rag/index.ts @@ -1,5 +1,12 @@ export type { TextSegment, ChapterData } from "./rag-types"; +export { buildChapterSectionGroups } from "./chapter-structure"; +export type { + ChapterSectionGroup, + SectionRefLike, + TocTreeItemLike, +} from "./chapter-structure"; + export { chunkContent, estimateTokens } from "./chunker"; export type { ChunkerConfig } from "./chunker";