Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 28 additions & 47 deletions packages/app/src/lib/rag/book-extractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* text segments with EPUB CFI references for precise navigation.
*/
import { DocumentLoader } from "@/lib/reader/document-loader";
import type { TOCItem } from "@/lib/reader/document-loader";
import { buildChapterSectionGroups } from "@readany/core/rag";
import * as CFI from "foliate-js/epubcfi.js";

export interface TextSegment {
Expand Down Expand Up @@ -35,32 +35,34 @@ export async function extractBookChapters(filePath: string): Promise<ChapterData

const sections = book.sections ?? [];
const toc = book.toc ?? [];
const tocMap = buildTocMap(toc);
const chapterGroups = buildChapterSectionGroups(sections, toc);

const chapters: ChapterData[] = [];

for (let i = 0; i < sections.length; i++) {
const section = sections[i];
if (!section.createDocument) continue;
for (const group of chapterGroups) {
const chapterSegments: TextSegment[] = [];

try {
const doc = await section.createDocument();
const body = doc.body;
if (!body) continue;

const title = tocMap.get(i) ?? tocMap.get(section.href ?? "") ?? `Section ${i + 1}`;
const baseCfi = section.cfi || CFI.fake.fromIndex(i);
for (const sectionIndex of group.sectionIndices) {
const section = sections[sectionIndex];
if (!section?.createDocument) continue;

const segments = extractSegmentsWithCfi(doc, baseCfi);
try {
const doc = await section.createDocument();
const body = doc.body;
if (!body) continue;

const baseCfi = section.cfi || CFI.fake.fromIndex(sectionIndex);
chapterSegments.push(...extractSegmentsWithCfi(doc, baseCfi));
} catch (err) {
console.warn(`[extractBookChapters] Failed to extract section ${sectionIndex}:`, err);
}
}

if (segments.length === 0) continue;
if (chapterSegments.length === 0) continue;

const content = segments.map((s) => s.text).join("\n\n");
const content = chapterSegments.map((s) => s.text).join("\n\n");

chapters.push({ index: i, title, content, segments });
} catch (err) {
console.warn(`[extractBookChapters] Failed to extract section ${i}:`, err);
}
chapters.push({ index: group.index, title: group.title, content, segments: chapterSegments });
}

return chapters;
Expand Down Expand Up @@ -127,9 +129,10 @@ function getTextNodes(element: Element): Text[] {
const walker = element.ownerDocument.createTreeWalker(element, NodeFilter.SHOW_TEXT, null);

const nodes: Text[] = [];
let node: Text | null;
while ((node = walker.nextNode() as Text | null)) {
if (node.textContent && node.textContent.trim()) {
while (true) {
const node = walker.nextNode() as Text | null;
if (!node) break;
if (node.textContent?.trim()) {
nodes.push(node);
}
}
Expand All @@ -141,8 +144,9 @@ function extractBlockText(block: Element): string {
const walker = block.ownerDocument.createTreeWalker(block, NodeFilter.SHOW_TEXT, null);

const texts: string[] = [];
let node: Text | null;
while ((node = walker.nextNode() as Text | null)) {
while (true) {
const node = walker.nextNode() as Text | null;
if (!node) break;
const text = node.textContent?.trim();
if (text) {
texts.push(text);
Expand All @@ -152,29 +156,6 @@ function extractBlockText(block: Element): string {
return texts.join(" ");
}

/**
 * Flattens a (possibly nested) table of contents into a label lookup table.
 *
 * Each entry with a non-empty label is registered under:
 *  - its `index`,
 *  - its full `href` (including any `#fragment`), and
 *  - the `href` with the fragment stripped (the file-level key).
 *
 * The file-level key is what callers use to title a whole section, so it must
 * not be clobbered by later sub-anchors of the same file. An entry whose href
 * carries a fragment therefore only fills the file-level key when nothing has
 * claimed it yet, while an entry that points at the file itself (no fragment)
 * always takes it.
 *
 * NOTE(review): `index` keys are still last-writer-wins; if nested entries can
 * share an index with their parent, the same concern applies there — confirm
 * against the TOCItem producer.
 *
 * @param toc Root-level TOC items; `subitems` are walked depth-first.
 * @returns Map from index / href / fragment-less href to the entry label.
 */
function buildTocMap(toc: TOCItem[]): Map<string | number, string> {
  const map = new Map<string | number, string>();

  function walk(items: TOCItem[]) {
    for (const item of items) {
      if (item.label) {
        map.set(item.index, item.label);
        if (item.href) {
          const hashAt = item.href.indexOf("#");
          if (hashAt === -1) {
            // The entry targets the file itself: it is the authoritative title.
            map.set(item.href, item.label);
          } else {
            // A sub-anchor may only name the file if no earlier entry did.
            const base = item.href.slice(0, hashAt);
            if (!map.has(base)) {
              map.set(base, item.label);
            }
            map.set(item.href, item.label);
          }
        }
      }
      if (item.subitems?.length) {
        walk(item.subitems);
      }
    }
  }

  walk(toc);
  return map;
}

async function extractPdfChapters(fileBytes: Uint8Array): Promise<ChapterData[]> {
const pdfjsLib = await import("pdfjs-dist");
pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdn.jsdelivr.net/npm/pdfjs-dist@${pdfjsLib.version}/build/pdf.worker.min.mjs`;
Expand Down
6 changes: 3 additions & 3 deletions packages/core/src/ai/__tests__/tools.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -285,9 +285,9 @@ describe("ragToc tool", () => {

expect(result.totalChapters).toBe(3);
expect(result.chapters).toEqual([
{ index: 0, title: "Intro" },
{ index: 1, title: "Chapter 1" },
{ index: 2, title: "Chapter 2" },
{ index: 0, number: 1, title: "Intro" },
{ index: 1, number: 2, title: "Chapter 1" },
{ index: 2, number: 3, title: "Chapter 2" },
]);
});
});
Expand Down
5 changes: 3 additions & 2 deletions packages/core/src/ai/tools/rag-tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ export function createRagTocTool(bookId: string): ToolDefinition {
return {
name: "ragToc",
description:
"Get the table of contents of the current book. Use this when the user wants to see the book structure or navigate to a specific chapter.",
"Get the table of contents of the current book. Use this when the user wants to see the book structure or navigate to a specific chapter. Use the returned 'index' when calling chapter tools; 'number' is the human-readable chapter order.",
parameters: {},
execute: async () => {
// Get unique chapter titles from chunks
Expand All @@ -110,8 +110,9 @@ export function createRagTocTool(bookId: string): ToolDefinition {
}

return {
chapters: Array.from(chapters.entries()).map(([index, title]) => ({
chapters: Array.from(chapters.entries()).map(([index, title], ordinal) => ({
index,
number: ordinal + 1,
title,
})),
totalChapters: chapters.size,
Expand Down
72 changes: 72 additions & 0 deletions packages/core/src/rag/chapter-structure.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import { describe, expect, it } from "vitest";
import { buildChapterSectionGroups } from "./chapter-structure";

// Behavioural spec for buildChapterSectionGroups(sections, toc): each case
// feeds a spine plus a TOC and pins the exact chapter groups expected back.
describe("buildChapterSectionGroups", () => {
  it("uses leaf TOC entries as logical chapters for multi-volume books", () => {
    // Spine order matters: the expected section indices below refer to it.
    const spine = [
      "cover.xhtml",
      "volume-1.xhtml",
      "chapter-1.xhtml",
      "chapter-1-extra.xhtml",
      "chapter-2.xhtml",
      "volume-2.xhtml",
      "chapter-3.xhtml",
    ].map((href) => ({ href }));

    // Two volumes, each wrapping its chapters as subitems.
    const volumeOne = {
      label: "第一卷",
      href: "volume-1.xhtml",
      subitems: [
        { label: "第一章", href: "chapter-1.xhtml" },
        { label: "第二章", href: "chapter-2.xhtml" },
      ],
    };
    const volumeTwo = {
      label: "第二卷",
      href: "volume-2.xhtml",
      subitems: [{ label: "第三章", href: "chapter-3.xhtml" }],
    };

    const actual = buildChapterSectionGroups(spine, [volumeOne, volumeTwo]);

    // Only the leaf chapters appear; the section without a TOC entry
    // (chapter-1-extra) is expected alongside the chapter before it.
    const expected = [
      { index: 0, title: "第一章", sectionIndices: [2, 3] },
      { index: 1, title: "第二章", sectionIndices: [4] },
      { index: 2, title: "第三章", sectionIndices: [6] },
    ];
    expect(actual).toEqual(expected);
  });

  it("falls back to top-level TOC entries when no leaf hrefs exist", () => {
    const spine = [{ href: "intro.xhtml" }, { href: "body.xhtml" }];
    // The only TOC entry has an explicitly empty subitems list.
    const toc = [{ label: "正文", href: "body.xhtml", subitems: [] }];

    const actual = buildChapterSectionGroups(spine, toc);

    expect(actual).toEqual([{ index: 0, title: "正文", sectionIndices: [1] }]);
  });

  it("normalizes encoded and relative hrefs before matching sections", () => {
    // One spine href is raw, the other percent-encoded; the TOC uses the
    // opposite encoding plus a "./" prefix, a fragment, and a bare filename.
    const spine = [{ href: "Text/第1章.xhtml" }, { href: "Text/%E7%AC%AC2%E7%AB%A0.xhtml" }];
    const toc = [
      { label: "第一章", href: "./Text/%E7%AC%AC1%E7%AB%A0.xhtml#start" },
      { label: "第二章", href: "第2章.xhtml" },
    ];

    const actual = buildChapterSectionGroups(spine, toc);

    const expected = [
      { index: 0, title: "第一章", sectionIndices: [0] },
      { index: 1, title: "第二章", sectionIndices: [1] },
    ];
    expect(actual).toEqual(expected);
  });

  it("falls back to one group per section when TOC has no usable anchors", () => {
    const spine = [{ href: "a.xhtml" }, { href: "b.xhtml" }];

    const actual = buildChapterSectionGroups(spine, []);

    // With an empty TOC every section becomes its own numbered group.
    const expected = [
      { index: 0, title: "Section 1", sectionIndices: [0] },
      { index: 1, title: "Section 2", sectionIndices: [1] },
    ];
    expect(actual).toEqual(expected);
  });
});
Loading