From c2124c78cc4a4787e495e535178f9a0e869138cf Mon Sep 17 00:00:00 2001
From: pauliyobo <pauliyobo@gmail.com>
Date: Tue, 11 Feb 2025 09:01:23 +0100
Subject: [PATCH 1/7] Initial commit

---
 bookworm/document/formats/daisy.py | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 bookworm/document/formats/daisy.py

diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py
new file mode 100644
index 00000000..9559c46c
--- /dev/null
+++ b/bookworm/document/formats/daisy.py
@@ -0,0 +1,4 @@
+"""Daisy 3.0  document format """
+from lxml import etree
+
+from . import FileSystemHtmlDocument            
\ No newline at end of file

From b3aed43330222b2417f9504745be6b79ba42a52c Mon Sep 17 00:00:00 2001
From: pauliyobo <pauliyobo@gmail.com>
Date: Tue, 1 Apr 2025 20:56:13 +0200
Subject: [PATCH 2/7] Implemented minimal daisy parser

---
 bookworm/document/formats/daisy.py | 94 +++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py
index 9559c46c..48ad07cd 100644
--- a/bookworm/document/formats/daisy.py
+++ b/bookworm/document/formats/daisy.py
@@ -1,4 +1,96 @@
 """Daisy 3.0  document format """
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List
+from zipfile import ZipFile
+
 from lxml import etree
 
-from . import FileSystemHtmlDocument            
\ No newline at end of file
+@dataclass
+class DaisyMetadata:
+    """metadata of a daisy book"""
+    title: str
+    author: str
+    publisher: str
+    language: str
+    path: str
+
+@dataclass
+class DaisyNavPoint:
+    """Representation of a navigation point"""
+    id: str
+    content: str
+    label: str
+
+
+def _parse_opf(path: Path) -> DaisyMetadata:
+    entries = list(path.glob("*.opf"))
+    if not entries:
+        raise FileNotFoundError("Could not find daisy OPF file")
+    opf = entries[0]
+    with open(opf, 'rb') as f:
+        tree = etree.fromstring(f.read())
+    dc_metadata = tree.find('metadata/dc-metadata', tree.nsmap)
+    nsmap = dc_metadata.nsmap
+    # We can now obtain the book's information
+    metadata = DaisyMetadata(
+        title=dc_metadata.find('dc:Title', nsmap).text,
+        language=dc_metadata.find('dc:Language', nsmap).text,
+        author=dc_metadata.find('dc:Creator', nsmap).text,
+        publisher=dc_metadata.find('dc:Publisher', nsmap).text,
+        path=path
+    )
+    return metadata
+
+@dataclass
+class DaisyBook:
+    metadata: DaisyMetadata
+    toc: List[DaisyNavPoint]
+    nav_ref: Dict[str, str]
+
+def _parse_ncx(path: Path) -> List[DaisyNavPoint]:
+    entries = list(path.glob("*.ncx"))
+    if not entries:
+        return []
+    with open(entries[0], 'rb') as f:
+        tree = etree.fromstring(f.read())
+    # navPoints are all nested inside the navMap
+    # We are not interested in the navInfo element, which means that findall() will likely suffice
+    nav_points = tree.findall('navMap/navPoint', tree.nsmap)
+    def parse_point(element) -> DaisyNavPoint:
+        _id = element.attrib.get('id')
+        label = element.find('navLabel/text', element.nsmap).text
+        content = element.find('content', element.nsmap).attrib.get('src')
+        return DaisyNavPoint(
+            id=_id,
+            label=label,
+            content=content
+        )
+    
+    return [parse_point(x) for x in nav_points]
+
+
+def read_daisy(path: Path) -> DaisyBook:
+    metadata = _parse_opf(path)
+    toc = _parse_ncx(path)
+    tree_cache = {}
+    nav_ref = {}
+    def get_smil(file: str):
+        entry = tree_cache.get(file)
+        if not entry:
+            with open(path / file, 'rb') as f:
+                entry = etree.parse(f)
+            tree_cache[file] = entry
+        return entry
+    for point in toc:
+        file, ref = point.content.split("#")
+        tree = get_smil(file)
+        child = tree.xpath(f"//*[@id='{ref}']")[0]
+        text = child.find('text', child.nsmap).attrib.get('src')
+        nav_ref[point.content] = text
+    
+    return DaisyBook(
+        metadata=metadata,
+        toc=toc,
+        nav_ref=nav_ref
+    )

From 7d8ba6fab6130b511aadc686fd1a755fa6b612a9 Mon Sep 17 00:00:00 2001
From: pauliyobo <pauliyobo@gmail.com>
Date: Wed, 25 Jun 2025 01:47:29 +0200
Subject: [PATCH 3/7] Initial daisy implementation

---
 bookworm/document/formats/__init__.py |   3 +-
 bookworm/document/formats/daisy.py    | 114 +++++++++++++++++++++++---
 2 files changed, 104 insertions(+), 13 deletions(-)

diff --git a/bookworm/document/formats/__init__.py b/bookworm/document/formats/__init__.py
index 3bc6f842..18d74985 100644
--- a/bookworm/document/formats/__init__.py
+++ b/bookworm/document/formats/__init__.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 
 from .archive import ArchivedDocument
+from .daisy import DaisyDocument
 from .epub import EpubDocument
 from .fb2 import FB2Document, FitzFB2Document
 from .html import FileSystemHtmlDocument, WebHtmlDocument
diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py
index 48ad07cd..3980b9f0 100644
--- a/bookworm/document/formats/daisy.py
+++ b/bookworm/document/formats/daisy.py
@@ -1,11 +1,22 @@
 """Daisy 3.0  document format """
+from collections import OrderedDict
 from dataclasses import dataclass
+import glob
 from pathlib import Path
 from typing import Dict, List
+import zipfile
 from zipfile import ZipFile
 
 from lxml import etree
 
+from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER, TreeStackBuilder
+from bookworm.document import BookMetadata, DocumentCapability as DC, Section
+from bookworm.logger import logger
+from bookworm.structured_text import TextRange
+from bookworm.structured_text.structured_html_parser import StructuredHtmlParser
+
+log = logger.getChild(__name__)
+
 @dataclass
 class DaisyMetadata:
     """metadata of a daisy book"""
@@ -13,7 +24,7 @@ class DaisyMetadata:
     author: str
     publisher: str
     language: str
-    path: str
+    path: Path | zipfile.Path
 
 @dataclass
 class DaisyNavPoint:
@@ -22,14 +33,15 @@ class DaisyNavPoint:
     content: str
     label: str
 
-
-def _parse_opf(path: Path) -> DaisyMetadata:
-    entries = list(path.glob("*.opf"))
+def _parse_opf(path: Path | zipfile.Path) -> DaisyMetadata:
+    """Parses the OPF file of a daisy3 book in order to obtain its book metadata"""
+    # We use path.iterdir() rather than path.glob() so the same code works for both kinds of path:
+    # zipfile.Path only gained glob() in Python 3.12
+    entries = [x for x in path.iterdir() if x.name.endswith('.opf')]
     if not entries:
         raise FileNotFoundError("Could not find daisy OPF file")
     opf = entries[0]
-    with open(opf, 'rb') as f:
-        tree = etree.fromstring(f.read())
+    tree = etree.fromstring(opf.read_bytes())
     dc_metadata = tree.find('metadata/dc-metadata', tree.nsmap)
     nsmap = dc_metadata.nsmap
     # We can now obtain the book's information
@@ -44,16 +56,20 @@ def _parse_opf(path: Path) -> DaisyMetadata:
 
 @dataclass
 class DaisyBook:
+    """A daisy3 book representation"""
     metadata: DaisyMetadata
     toc: List[DaisyNavPoint]
     nav_ref: Dict[str, str]
 
-def _parse_ncx(path: Path) -> List[DaisyNavPoint]:
-    entries = list(path.glob("*.ncx"))
+def _parse_ncx(path: Path | zipfile.Path) -> List[DaisyNavPoint]:
+    """
+    Parses a daisy NCX file in order to extract the book's table of content
+    """
+    entries = [x for x in path.iterdir() if x.name.endswith('.ncx')]
     if not entries:
+        # We return an empty list if no NCX file is found
         return []
-    with open(entries[0], 'rb') as f:
-        tree = etree.fromstring(f.read())
+    tree = etree.fromstring(entries[0].read_bytes())
     # navPoints are all nested inside the navMap
     # We are not interested in the navInfo element, which means that findall() will likely suffice
     nav_points = tree.findall('navMap/navPoint', tree.nsmap)
@@ -71,6 +87,13 @@ def parse_point(element) -> DaisyNavPoint:
 
 
 def read_daisy(path: Path) -> DaisyBook:
+    """
+    Reads a daisy book either from an extracted directory, or from a zipfile
+    """
+    # TODO: Is it ok to read straight from the zipfile rather than extracting it? The ZipFile is left open because metadata.path keeps reading from it
+    if path.is_file() and zipfile.is_zipfile(path):
+        archive = ZipFile(path)
+        path = zipfile.Path(archive)
     metadata = _parse_opf(path)
     toc = _parse_ncx(path)
     tree_cache = {}
@@ -78,8 +101,7 @@ def read_daisy(path: Path) -> DaisyBook:
     def get_smil(file: str):
         entry = tree_cache.get(file)
-        if not entry:
-            with open(path / file, 'rb') as f:
-                entry = etree.parse(f)
+        if entry is None:
+            entry = etree.fromstring((path / file).read_bytes())
             tree_cache[file] = entry
         return entry
     for point in toc:
@@ -94,3 +116,71 @@ def get_smil(file: str):
         toc=toc,
         nav_ref=nav_ref
     )
+
+class DaisyDocument(SinglePageDocument):
+    """Daisy document"""
+    format = "daisy"
+    name = _("Daisy")
+    extensions = ("*.zip",)
+    capabilities = (
+        DC.TOC_TREE
+        | DC.METADATA
+        | DC.SINGLE_PAGE
+    )
+
+    def read(self) -> None:
+        super().read()
+        self._book: DaisyBook = read_daisy(self.get_file_system_path())
+        self.structure = StructuredHtmlParser.from_string(self._get_xml())
+        self._toc = self._build_toc()
+
+    def get_content(self) -> str:
+        return self.structure.get_text()
+
+    @property
+    def toc_tree(self) -> Section:
+        return self._toc
+
+    @property
+    def metadata(self) -> BookMetadata:
+        return BookMetadata(
+            title=self._book.metadata.title,
+            author=self._book.metadata.author,
+            publisher=self._book.metadata.publisher,
+        )
+
+    def _get_xml(self) -> str:
+        # dict.fromkeys() drops duplicate file names but, unlike a set, keeps them in TOC order
+        fragments = dict.fromkeys(self._book.nav_ref[x.content].split('#')[0] for x in self._book.toc)
+        content: list[str] = []
+        for text_file in fragments:
+            try:
+                text_path = self._book.metadata.path / text_file
+                if text_path.exists():
+                    log.debug(f"Reading from {text_file}")
+                    content.append(text_path.read_text(encoding='utf-8'))
+            except (KeyError, FileNotFoundError):
+                continue
+        return '\n'.join(content)
+
+    def _build_toc(self) -> Section:
+        root = Section(
+            title=self._book.metadata.title,
+            pager = SINGLE_PAGE_DOCUMENT_PAGER,
+            level=1,
+            text_range=TextRange(0, len(self.structure.get_text())),
+        )
+        stack = TreeStackBuilder(root)
+        for entry in self._book.toc:
+            item_ref = self._book.nav_ref[entry.content].split('#')[1]
+            item_range = self.structure.html_id_ranges.get(item_ref)
+            if item_range:
+                s = Section(
+                    title=entry.label,
+                    pager = SINGLE_PAGE_DOCUMENT_PAGER,
+                    level = 2,
+                    text_range=TextRange(*item_range)
+                )
+                stack.push(s)
+        return root
+        
\ No newline at end of file

From 1d41b5d8ad9eab76a542eb67522ba56f80e654eb Mon Sep 17 00:00:00 2001
From: pauliyobo <pauliyobo@gmail.com>
Date: Wed, 25 Jun 2025 09:11:17 +0200
Subject: [PATCH 4/7] Handle table of contents with subchapters

---
 bookworm/document/formats/daisy.py | 35 ++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py
index 3980b9f0..c99b6387 100644
--- a/bookworm/document/formats/daisy.py
+++ b/bookworm/document/formats/daisy.py
@@ -1,7 +1,5 @@
 """Daisy 3.0  document format """
-from collections import OrderedDict
 from dataclasses import dataclass
-import glob
 from pathlib import Path
 from typing import Dict, List
 import zipfile
@@ -9,7 +7,7 @@
 
 from lxml import etree
 
-from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER, TreeStackBuilder
+from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER
 from bookworm.document import BookMetadata, DocumentCapability as DC, Section
 from bookworm.logger import logger
 from bookworm.structured_text import TextRange
@@ -32,6 +30,7 @@ class DaisyNavPoint:
     id: str
     content: str
     label: str
+    children: list['DaisyNavPoint']
 
 def _parse_opf(path: Path | zipfile.Path) -> DaisyMetadata:
     """Parses the OPF file of a daisy3 book in order to obtain its book metadata"""
@@ -74,13 +73,19 @@ def _parse_ncx(path: Path | zipfile.Path) -> List[DaisyNavPoint]:
     # We are not interested in the navInfo element, which means that findall() will likely suffice
     nav_points = tree.findall('navMap/navPoint', tree.nsmap)
     def parse_point(element) -> DaisyNavPoint:
+        # Only get direct child navPoint elements, not nested ones
+        # children_nav_points = [child for child in element if child.tag.endswith('navPoint')]
+        children_nav_points = element.findall('navPoint', element.nsmap)
+        children = [parse_point(x) for x in children_nav_points]
+
         _id = element.attrib.get('id')
         label = element.find('navLabel/text', element.nsmap).text
         content = element.find('content', element.nsmap).attrib.get('src')
         return DaisyNavPoint(
             id=_id,
             label=label,
-            content=content
+            content=content,
+            children=children,
         )
     
     return [parse_point(x) for x in nav_points]
@@ -104,12 +109,16 @@ def get_smil(file: str):
             entry = etree.fromstring((path / file).read_bytes())
             tree_cache[file] = entry
         return entry
-    for point in toc:
+    def build_nav_ref(point: DaisyNavPoint) -> None:
+        for child in point.children:
+            build_nav_ref(child)
         file, ref = point.content.split("#")
         tree = get_smil(file)
         child = tree.xpath(f"//*[@id='{ref}']")[0]
         text = child.find('text', child.nsmap).attrib.get('src')
         nav_ref[point.content] = text
+    for point in toc:
+        build_nav_ref(point)
     
     return DaisyBook(
         metadata=metadata,
@@ -164,23 +173,27 @@ def _get_xml(self) -> str:
         return '\n'.join(content)
 
     def _build_toc(self) -> Section:
+        level = 1
         root = Section(
             title=self._book.metadata.title,
             pager = SINGLE_PAGE_DOCUMENT_PAGER,
-            level=1,
+            level=level,
             text_range=TextRange(0, len(self.structure.get_text())),
         )
-        stack = TreeStackBuilder(root)
-        for entry in self._book.toc:
+        def add_children(stack: Section, entry: DaisyNavPoint, level: int) -> Section | None:
             item_ref = self._book.nav_ref[entry.content].split('#')[1]
             item_range = self.structure.html_id_ranges.get(item_ref)
             if item_range:
                 s = Section(
                     title=entry.label,
                     pager = SINGLE_PAGE_DOCUMENT_PAGER,
-                    level = 2,
-                    text_range=TextRange(*item_range)
+                    level = level,
+                    text_range=TextRange(*item_range),
                 )
-                stack.push(s)
+                s.children = [c for x in entry.children if (c := add_children(s, x, level + 1)) is not None]
+                stack.append(s)
+            return s if item_range else None  # `s` only exists when the entry's anchor was found in the text
+        for entry in self._book.toc:
+            add_children(root, entry, level+1 )
         return root
         
\ No newline at end of file

From c7f0fc43f1945e656cf015b97d63126bc52540ef Mon Sep 17 00:00:00 2001
From: pauliyobo <pauliyobo@gmail.com>
Date: Thu, 24 Jul 2025 17:55:07 +0200
Subject: [PATCH 5/7] Add document structure navigation

---
 bookworm/document/formats/daisy.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py
index c99b6387..18b207b3 100644
--- a/bookworm/document/formats/daisy.py
+++ b/bookworm/document/formats/daisy.py
@@ -134,6 +134,7 @@ class DaisyDocument(SinglePageDocument):
     capabilities = (
         DC.TOC_TREE
         | DC.METADATA
+        | DC.STRUCTURED_NAVIGATION
         | DC.SINGLE_PAGE
     )
 
@@ -146,6 +147,10 @@ def read(self) -> None:
     def get_content(self) -> str:
         return self.structure.get_text()
 
+    def get_document_semantic_structure(self):
+        return self.structure.semantic_elements
+
+    
     @property
     def toc_tree(self) -> Section:
         return self._toc

From 7fcbb0b92498ad2f7dbdcb2d59ebc42bf02d5204 Mon Sep 17 00:00:00 2001
From: pauliyobo <pauliyobo@gmail.com>
Date: Tue, 7 Oct 2025 06:45:30 +0200
Subject: [PATCH 6/7] Add links support

---
 bookworm/document/formats/daisy.py | 17 ++++++++++++++++-
 bookworm/reader.py                 |  1 +
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py
index 18b207b3..aed4d3f8 100644
--- a/bookworm/document/formats/daisy.py
+++ b/bookworm/document/formats/daisy.py
@@ -2,16 +2,18 @@
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, List
+import urllib.parse as urllib_parse
 import zipfile
 from zipfile import ZipFile
 
 from lxml import etree
 
-from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER
+from bookworm.document.base import LinkTarget, SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER
 from bookworm.document import BookMetadata, DocumentCapability as DC, Section
 from bookworm.logger import logger
 from bookworm.structured_text import TextRange
 from bookworm.structured_text.structured_html_parser import StructuredHtmlParser
+from bookworm.utils import is_external_url
 
 log = logger.getChild(__name__)
 
@@ -136,6 +138,8 @@ class DaisyDocument(SinglePageDocument):
         | DC.METADATA
         | DC.STRUCTURED_NAVIGATION
         | DC.SINGLE_PAGE
+        | DC.LINKS
+        | DC.INTERNAL_ANCHORS
     )
 
     def read(self) -> None:
@@ -151,6 +155,17 @@ def get_document_semantic_structure(self):
         return self.structure.semantic_elements
 
     
+    def resolve_link(self, link_range) -> LinkTarget:
+        href = urllib_parse.unquote(self.structure.link_targets[link_range])
+        if is_external_url(href):
+            return LinkTarget(url=href, is_external=True)
+        else:
+            for html_id, text_range in self.structure.html_id_ranges.items():
+                if html_id.endswith(href):
+                    return LinkTarget(
+                        url=href, is_external=False, page=None, position=text_range
+                    )
+
     @property
     def toc_tree(self) -> Section:
         return self._toc
diff --git a/bookworm/reader.py b/bookworm/reader.py
index 747c342e..e07a99ef 100644
--- a/bookworm/reader.py
+++ b/bookworm/reader.py
@@ -371,6 +371,7 @@ def pop_navigation_stack(self):
             )
 
     def handle_special_action_for_position(self, position: int) -> bool:
+        log.debug(f"Executing special action in position: {position}")
         for link_range in self.iter_semantic_ranges_for_elements_of_type(
             SemanticElementType.LINK
         ):

From f8d8738e74ada25ed2010434951d7c1d30f60afc Mon Sep 17 00:00:00 2001
From: pauliyobo <pauliyobo@gmail.com>
Date: Sat, 25 Oct 2025 15:49:42 +0200
Subject: [PATCH 7/7] Fix internal anchor resolution

---
 bookworm/document/formats/daisy.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py
index aed4d3f8..28480dfb 100644
--- a/bookworm/document/formats/daisy.py
+++ b/bookworm/document/formats/daisy.py
@@ -160,11 +160,13 @@ def resolve_link(self, link_range) -> LinkTarget:
         if is_external_url(href):
             return LinkTarget(url=href, is_external=True)
         else:
-            for html_id, text_range in self.structure.html_id_ranges.items():
-                if html_id.endswith(href):
-                    return LinkTarget(
-                        url=href, is_external=False, page=None, position=text_range
-                    )
+            # Look the target up in the anchors map populated by the structured HTML parser
+            href = href.removeprefix('#')
+            try:
+                text_range = self.structure.anchors[href]
+            except KeyError:
+                return None  # dangling anchor: report "no target", as the previous loop-based lookup did
+            return LinkTarget(url=href, is_external=False, page=None, position=text_range)
 
     @property
     def toc_tree(self) -> Section: