From c2124c78cc4a4787e495e535178f9a0e869138cf Mon Sep 17 00:00:00 2001 From: pauliyobo Date: Tue, 11 Feb 2025 09:01:23 +0100 Subject: [PATCH 1/7] Initial commit --- bookworm/document/formats/daisy.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 bookworm/document/formats/daisy.py diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py new file mode 100644 index 00000000..9559c46c --- /dev/null +++ b/bookworm/document/formats/daisy.py @@ -0,0 +1,4 @@ +"""Daisy 3.0 document format """ +from lxml import etree + +from . import FileSystemHtmlDocument \ No newline at end of file From b3aed43330222b2417f9504745be6b79ba42a52c Mon Sep 17 00:00:00 2001 From: pauliyobo Date: Tue, 1 Apr 2025 20:56:13 +0200 Subject: [PATCH 2/7] Implemented minimal daisy parser --- bookworm/document/formats/daisy.py | 94 +++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py index 9559c46c..48ad07cd 100644 --- a/bookworm/document/formats/daisy.py +++ b/bookworm/document/formats/daisy.py @@ -1,4 +1,96 @@ """Daisy 3.0 document format """ +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List +from zipfile import ZipFile + from lxml import etree -from . import FileSystemHtmlDocument \ No newline at end of file +@dataclass +class DaisyMetadata: + """metadata of a daisy book""" + title: str + author: str + publisher: str + language: str + path: str + +@dataclass +class DaisyNavPoint: + """Representation of a navigation point""" + id: str + content: str + label: str + + +def _parse_opf(path: Path) -> DaisyMetadata: + entries = list(path.glob("*.opf")) + if not entries: + raise FileNotFoundError("Could not find daisy OPF file") + opf = entries[0] + with open(opf, 'rb') as f: + tree = etree.fromstring(f.read()) + dc_metadata = tree.find('metadata/dc-metadata', tree.nsmap) + nsmap = dc_metadata.nsmap + # We can now obtain the book's information + metadata = DaisyMetadata( + title=dc_metadata.find('dc:Title', nsmap).text, + language=dc_metadata.find('dc:Language', nsmap).text, + author=dc_metadata.find('dc:Creator', nsmap).text, + publisher=dc_metadata.find('dc:Publisher', nsmap).text, + path=path + ) + return metadata + +@dataclass +class DaisyBook: + metadata: DaisyMetadata + toc: List[DaisyNavPoint] + nav_ref: Dict[str, str] + +def _parse_ncx(path: Path) -> List[DaisyNavPoint]: + entries = list(path.glob("*.ncx")) + if not entries: + return [] + with open(entries[0], 'rb') as f: + tree = etree.fromstring(f.read()) + # navPoints are all nested inside the navMap + # We are not interested in the navInfo element, which means that findall() will likely suffice + nav_points = tree.findall('navMap/navPoint', tree.nsmap) + def parse_point(element) -> DaisyNavPoint: + _id = element.attrib.get('id') + label = element.find('navLabel/text', element.nsmap).text + content = element.find('content', element.nsmap).attrib.get('src') + return DaisyNavPoint( + id=_id, + label=label, + content=content + ) + + return [parse_point(x) for x in nav_points] + + +def read_daisy(path: Path) -> DaisyBook: + metadata = _parse_opf(path) + toc = _parse_ncx(path) + tree_cache = {} + nav_ref = {} + def get_smil(file: str): + entry = tree_cache.get(file) + if not entry: + with open(path / file, 'rb') as f: + entry = etree.parse(f) + tree_cache[file] = entry + return entry + for point in toc: + file, ref = point.content.split("#") + tree = get_smil(file) + child = tree.xpath(f"//*[@id='{ref}']")[0] + text = child.find('text', child.nsmap).attrib.get('src') + nav_ref[point.content] = text + + return DaisyBook( + metadata=metadata, + toc=toc, + nav_ref=nav_ref + ) From 7d8ba6fab6130b511aadc686fd1a755fa6b612a9 Mon Sep 17 00:00:00 2001 From: pauliyobo Date: Wed, 25 Jun 2025 01:47:29 +0200 Subject: [PATCH 3/7] Initial daisy implementation --- bookworm/document/formats/__init__.py | 3 +- bookworm/document/formats/daisy.py | 114 +++++++++++++++++++++++--- 2 files changed, 104 insertions(+), 13 deletions(-) diff --git a/bookworm/document/formats/__init__.py b/bookworm/document/formats/__init__.py index 3bc6f842..18d74985 100644 --- a/bookworm/document/formats/__init__.py +++ b/bookworm/document/formats/__init__.py @@ -1,6 +1,7 @@ # coding: utf-8 -from .archive import ArchivedDocument +#from .archive import ArchivedDocument +from .daisy import DaisyDocument from .epub import EpubDocument from .fb2 import FB2Document, FitzFB2Document from .html import FileSystemHtmlDocument, WebHtmlDocument diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py index 48ad07cd..3980b9f0 100644 --- a/bookworm/document/formats/daisy.py +++ b/bookworm/document/formats/daisy.py @@ -1,11 +1,22 @@ """Daisy 3.0 document format """ +from collections import OrderedDict from dataclasses import dataclass +import glob from pathlib import Path from typing import Dict, List +import zipfile from zipfile import ZipFile from lxml import etree +from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER, TreeStackBuilder +from bookworm.document import BookMetadata, DocumentCapability as DC, Section +from bookworm.logger import logger +from bookworm.structured_text import TextRange +from bookworm.structured_text.structured_html_parser import StructuredHtmlParser + +log = logger.getChild(__name__) + @dataclass class DaisyMetadata: """metadata of a daisy book""" @@ -13,7 +24,7 @@ class DaisyMetadata: author: str publisher: str language: str - path: str + path: Path | zipfile.Path @dataclass class DaisyNavPoint: @@ -22,14 +33,15 @@ class DaisyNavPoint: content: str label: str - -def _parse_opf(path: Path) -> DaisyMetadata: - entries = list(path.glob("*.opf")) +def _parse_opf(path: Path | zipfile.Path) -> DaisyMetadata: + """Parses the OPF file of a daisy3 book in order to obtain its book metadata""" + # we have to use path.iterdir() instead of path.glob() because we want to be generic over the type of path this is + # ZipFile.Path() does not support glob + entries = [x for x in list(path.iterdir()) if x.name.endswith('.opf')] if not entries: raise FileNotFoundError("Could not find daisy OPF file") opf = entries[0] - with open(opf, 'rb') as f: - tree = etree.fromstring(f.read()) + tree = etree.fromstring(opf.read_bytes()) dc_metadata = tree.find('metadata/dc-metadata', tree.nsmap) nsmap = dc_metadata.nsmap # We can now obtain the book's information @@ -44,16 +56,20 @@ def _parse_opf(path: Path) -> DaisyMetadata: @dataclass class DaisyBook: + """A daisy3 book representation""" metadata: DaisyMetadata toc: List[DaisyNavPoint] nav_ref: Dict[str, str] -def _parse_ncx(path: Path) -> List[DaisyNavPoint]: - entries = list(path.glob("*.ncx")) +def _parse_ncx(path: Path | zipfile.Path) -> List[DaisyNavPoint]: + """ + Parses a daisy NCX file in order to extract the book's table of content + """ + entries = [x for x in list(path.iterdir()) if x.name.endswith('.ncx')] if not entries: + # We return an empty list if no NCX file is found return [] - with open(entries[0], 'rb') as f: - tree = etree.fromstring(f.read()) + tree = etree.fromstring(entries[0].read_bytes()) # navPoints are all nested inside the navMap # We are not interested in the navInfo element, which means that findall() will likely suffice nav_points = tree.findall('navMap/navPoint', tree.nsmap) @@ -71,6 +87,13 @@ def parse_point(element) -> DaisyNavPoint: def read_daisy(path: Path) -> DaisyBook: + """ + Reads a daisy book either from an extracted directory, or from a zipfile + """ + # TODO: Is it ok to just read from the zipfile rather than extracting it and be done with it? + if path.is_file() and zipfile.is_zipfile(path): + zip = ZipFile(path) + path = zipfile.Path(zip) metadata = _parse_opf(path) toc = _parse_ncx(path) tree_cache = {} @@ -78,8 +101,7 @@ def read_daisy(path: Path) -> DaisyBook: def get_smil(file: str): entry = tree_cache.get(file) if not entry: - with open(path / file, 'rb') as f: - entry = etree.parse(f) + entry = etree.fromstring((path / file).read_bytes()) tree_cache[file] = entry return entry for point in toc: @@ -94,3 +116,71 @@ def get_smil(file: str): toc=toc, nav_ref=nav_ref ) + +class DaisyDocument(SinglePageDocument): + """Daisy document""" + format = "daisy" + name = _("Daisy") + extensions = ("*.zip",) + capabilities = ( + DC.TOC_TREE + | DC.METADATA + | DC.SINGLE_PAGE + ) + + def read(self) -> None: + super().read() + self._book: DaisyBook = read_daisy(self.get_file_system_path()) + self.structure = StructuredHtmlParser.from_string(self._get_xml()) + self._toc = self._build_toc() + + def get_content(self) -> str: + return self.structure.get_text() + + @property + def toc_tree(self) -> Section: + return self._toc + + @property + def metadata(self) -> BookMetadata: + return BookMetadata( + title=self._book.metadata.title, + author=self._book.metadata.author, + publisher=self._book.metadata.publisher, + ) + + def _get_xml(self) -> str: + fragments: set[str] = {self._book.nav_ref[x.content].split('#')[0] for x in self._book.toc} + content: list[str] = [] + for text_file in fragments: + try: + text_path = self._book.metadata.path / text_file + if text_path.exists(): + log.debug(f"Reading from {text_file}") + html_content = text_path.read_text(encoding='utf-8') + content.append(html_content) + except (KeyError, FileNotFoundError): + continue + return '\n'.join(content) + + def _build_toc(self) -> Section: + root = Section( + title=self._book.metadata.title, + pager = SINGLE_PAGE_DOCUMENT_PAGER, + level=1, + text_range=TextRange(0, len(self.structure.get_text())), + ) + stack = TreeStackBuilder(root) + for entry in self._book.toc: + item_ref = self._book.nav_ref[entry.content].split('#')[1] + item_range = self.structure.html_id_ranges.get(item_ref) + if item_range: + s = Section( + title=entry.label, + pager = SINGLE_PAGE_DOCUMENT_PAGER, + level = 2, + text_range=TextRange(*item_range) + ) + stack.push(s) + return root + \ No newline at end of file From 1d41b5d8ad9eab76a542eb67522ba56f80e654eb Mon Sep 17 00:00:00 2001 From: pauliyobo Date: Wed, 25 Jun 2025 09:11:17 +0200 Subject: [PATCH 4/7] Handle table of contents with subchapters --- bookworm/document/formats/daisy.py | 35 ++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py index 3980b9f0..c99b6387 100644 --- a/bookworm/document/formats/daisy.py +++ b/bookworm/document/formats/daisy.py @@ -1,7 +1,5 @@ """Daisy 3.0 document format """ -from collections import OrderedDict from dataclasses import dataclass -import glob from pathlib import Path from typing import Dict, List import zipfile @@ -9,7 +7,7 @@ from lxml import etree -from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER, TreeStackBuilder +from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER from bookworm.document import BookMetadata, DocumentCapability as DC, Section from bookworm.logger import logger from bookworm.structured_text import TextRange @@ -32,6 +30,7 @@ class DaisyNavPoint: id: str content: str label: str + children: list['DaisyNavPoint'] def _parse_opf(path: Path | zipfile.Path) -> DaisyMetadata: """Parses the OPF file of a daisy3 book in order to obtain its book metadata""" @@ -74,13 +73,19 @@ def _parse_ncx(path: Path | zipfile.Path) -> List[DaisyNavPoint]: # We are not interested in the navInfo element, which means that findall() will likely suffice nav_points = tree.findall('navMap/navPoint', tree.nsmap) def parse_point(element) -> DaisyNavPoint: + # Only get direct child navPoint elements, not nested ones + # children_nav_points = [child for child in element if child.tag.endswith('navPoint')] + children_nav_points = element.findall('navPoint', element.nsmap) + children = [parse_point(x) for x in children_nav_points] + _id = element.attrib.get('id') label = element.find('navLabel/text', element.nsmap).text content = element.find('content', element.nsmap).attrib.get('src') return DaisyNavPoint( id=_id, label=label, - content=content + content=content, + children=children, ) return [parse_point(x) for x in nav_points] @@ -104,12 +109,16 @@ def get_smil(file: str): entry = etree.fromstring((path / file).read_bytes()) tree_cache[file] = entry return entry - for point in toc: + def build_nav_ref(point: DaisyNavPoint) -> None: + for child in point.children: + build_nav_ref(child) file, ref = point.content.split("#") tree = get_smil(file) child = tree.xpath(f"//*[@id='{ref}']")[0] text = child.find('text', child.nsmap).attrib.get('src') nav_ref[point.content] = text + for point in toc: + build_nav_ref(point) return DaisyBook( metadata=metadata, @@ -164,23 +173,27 @@ def _get_xml(self) -> str: return '\n'.join(content) def _build_toc(self) -> Section: + level = 1 root = Section( title=self._book.metadata.title, pager = SINGLE_PAGE_DOCUMENT_PAGER, - level=1, + level=level, text_range=TextRange(0, len(self.structure.get_text())), ) - stack = TreeStackBuilder(root) - for entry in self._book.toc: + def add_children(stack: Section, entry: DaisyNavPoint, level: int) -> Section: item_ref = self._book.nav_ref[entry.content].split('#')[1] item_range = self.structure.html_id_ranges.get(item_ref) if item_range: s = Section( title=entry.label, pager = SINGLE_PAGE_DOCUMENT_PAGER, - level = 2, - text_range=TextRange(*item_range) + level = level, + text_range=TextRange(*item_range), ) - stack.push(s) + s.children=[add_children(s, x, level+1) for x in entry.children] + stack.append(s) + return s + for entry in self._book.toc: + add_children(root, entry, level+1 ) return root \ No newline at end of file From c7f0fc43f1945e656cf015b97d63126bc52540ef Mon Sep 17 00:00:00 2001 From: pauliyobo Date: Thu, 24 Jul 2025 17:55:07 +0200 Subject: [PATCH 5/7] Add document structure navigation --- bookworm/document/formats/daisy.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py index c99b6387..18b207b3 100644 --- a/bookworm/document/formats/daisy.py +++ b/bookworm/document/formats/daisy.py @@ -134,6 +134,7 @@ class DaisyDocument(SinglePageDocument): capabilities = ( DC.TOC_TREE | DC.METADATA + | DC.STRUCTURED_NAVIGATION | DC.SINGLE_PAGE ) @@ -146,6 +147,10 @@ def read(self) -> None: def get_content(self) -> str: return self.structure.get_text() + def get_document_semantic_structure(self): + return self.structure.semantic_elements + + @property def toc_tree(self) -> Section: return self._toc From 7fcbb0b92498ad2f7dbdcb2d59ebc42bf02d5204 Mon Sep 17 00:00:00 2001 From: pauliyobo Date: Tue, 7 Oct 2025 06:45:30 +0200 Subject: [PATCH 6/7] Add links support --- bookworm/document/formats/daisy.py | 17 ++++++++++++++++- bookworm/reader.py | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py index 18b207b3..aed4d3f8 100644 --- a/bookworm/document/formats/daisy.py +++ b/bookworm/document/formats/daisy.py @@ -2,16 +2,18 @@ from dataclasses import dataclass from pathlib import Path from typing import Dict, List +import urllib.parse as urllib_parse import zipfile from zipfile import ZipFile from lxml import etree -from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER +from bookworm.document.base import LinkTarget, SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER from bookworm.document import BookMetadata, DocumentCapability as DC, Section from bookworm.logger import logger from bookworm.structured_text import TextRange from bookworm.structured_text.structured_html_parser import StructuredHtmlParser +from bookworm.utils import is_external_url log = logger.getChild(__name__) @@ -136,6 +138,8 @@ class DaisyDocument(SinglePageDocument): | DC.METADATA | DC.STRUCTURED_NAVIGATION | DC.SINGLE_PAGE + | DC.LINKS + | DC.INTERNAL_ANCHORS ) def read(self) -> None: @@ -151,6 +155,17 @@ def get_document_semantic_structure(self): return self.structure.semantic_elements + def resolve_link(self, link_range) -> LinkTarget: + href = urllib_parse.unquote(self.structure.link_targets[link_range]) + if is_external_url(href): + return LinkTarget(url=href, is_external=True) + else: + for html_id, text_range in self.structure.html_id_ranges.items(): + if html_id.endswith(href): + return LinkTarget( + url=href, is_external=False, page=None, position=text_range + ) + @property def toc_tree(self) -> Section: return self._toc diff --git a/bookworm/reader.py b/bookworm/reader.py index 747c342e..e07a99ef 100644 --- a/bookworm/reader.py +++ b/bookworm/reader.py @@ -371,6 +371,7 @@ def pop_navigation_stack(self): ) def handle_special_action_for_position(self, position: int) -> bool: + log.debug(f"Executing special action in position: {position}") for link_range in self.iter_semantic_ranges_for_elements_of_type( SemanticElementType.LINK ): From f8d8738e74ada25ed2010434951d7c1d30f60afc Mon Sep 17 00:00:00 2001 From: pauliyobo Date: Sat, 25 Oct 2025 15:49:42 +0200 Subject: [PATCH 7/7] Fix internal anchor resolution --- bookworm/document/formats/daisy.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py index aed4d3f8..28480dfb 100644 --- a/bookworm/document/formats/daisy.py +++ b/bookworm/document/formats/daisy.py @@ -160,11 +160,13 @@ def resolve_link(self, link_range) -> LinkTarget: if is_external_url(href): return LinkTarget(url=href, is_external=True) else: - for html_id, text_range in self.structure.html_id_ranges.items(): - if html_id.endswith(href): - return LinkTarget( - url=href, is_external=False, page=None, position=text_range - ) + # we can obtain the target by looking up the anchors map populated in the Structured HTML parser + if href.startswith('#'): + href = href[1:] + text_range = self.structure.anchors[href] + return LinkTarget( + url=href, is_external=False, page=None, position=text_range + ) @property def toc_tree(self) -> Section: