From fc0634c357c50f284397ac960402d64cd21c0246 Mon Sep 17 00:00:00 2001
From: Alessio Vertemati <alessio@oneofftech.xyz>
Date: Wed, 18 Feb 2026 10:59:46 +0100
Subject: [PATCH 1/8] wip: add Document.contentmd() to render content-md output

---
 src/parxy_core/models/models.py | 100 ++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py
index 258b847..d6bd3f0 100644
--- a/src/parxy_core/models/models.py
+++ b/src/parxy_core/models/models.py
@@ -155,6 +155,106 @@ def text(self, page_separator: str = '---') -> str:
 
         return '\n'.join(texts)
 
+    def contentmd(
+        self,
+        title: Optional[str] = None,
+        description: Optional[str] = None,
+        date: Optional[str] = None,
+        license: Optional[str] = None,
+        author: Optional[str] = None,
+    ) -> str:
+        """Get the document content formatted as content-md.
+
+        Generates a content-md string: YAML frontmatter followed by Markdown.
+        Per the spec, all heading levels are shifted up by one so the document
+        title occupies the sole h1, and images use ``<figure>`` blocks.
+
+        Parameters
+        ----------
+        title : str, optional
+            Document title. Falls back to metadata.title, then filename.
+        description : str, optional
+            Short summary (~200 characters). Required by the spec; omitted from
+            frontmatter when not provided.
+        date : str, optional
+            Creation/publication date in ISO 8601. Falls back to metadata dates.
+        license : str, optional
+            License name or SPDX identifier.
+        author : str, optional
+            Author name. Falls back to metadata.author.
+
+        Returns
+        -------
+        str
+            The document content formatted as content-md.
+        """
+        resolved_title = (
+            title
+            or (self.metadata.title if self.metadata else None)
+            or self.filename
+            or 'Untitled'
+        )
+        resolved_date = date or (
+            (self.metadata.created_at or self.metadata.updated_at)
+            if self.metadata
+            else None
+        )
+        resolved_author = author or (self.metadata.author if self.metadata else None)
+
+        def _yaml_str(v: str) -> str:
+            return '"' + v.replace('\\', '\\\\').replace('"', '\\"') + '"'
+
+        fm = ['---', f'title: {_yaml_str(resolved_title)}']
+        if description:
+            fm.append(f'description: {_yaml_str(description)}')
+        if resolved_date:
+            fm.append(f'date: {_yaml_str(resolved_date)}')
+        if license:
+            fm.append(f'license: {_yaml_str(license)}')
+        if resolved_author:
+            fm.append(f'author: {_yaml_str(resolved_author)}')
+        fm.append('---')
+        frontmatter = '\n'.join(fm)
+
+        if not self.pages:
+            return f'{frontmatter}\n\n# {resolved_title}\n'
+
+        parts = [f'# {resolved_title}']
+
+        for page in self.pages:
+            if not page.blocks:
+                if page.text.strip():
+                    parts.append(page.text.strip())
+                continue
+
+            for block in page.blocks:
+                if isinstance(block, TextBlock):
+                    if block.category and block.category.lower() in [
+                        'heading',
+                        'title',
+                        'header',
+                    ]:
+                        # Shift all heading levels by +1 so h1 content becomes h2
+                        shifted = min((block.level or 1) + 1, 6)
+                        parts.append(f'{"#" * shifted} {block.text.strip()}')
+                    elif block.category and block.category.lower() == 'list':
+                        for line in block.text.splitlines():
+                            if line.strip():
+                                parts.append(f'- {line.strip()}')
+                    else:
+                        if block.text.strip():
+                            parts.append(block.text.strip())
+
+                elif isinstance(block, ImageBlock):
+                    alt = block.alt_text or ''
+                    parts.append(f'<figure>\n{alt}\n</figure>')
+
+                elif isinstance(block, TableBlock):
+                    if block.text.strip():
+                        parts.append(block.text.strip())
+
+        return f'{frontmatter}\n\n' + '\n\n'.join(parts) + '\n'
+
     def markdown(self) -> str:
         """Get the document content formatted as Markdown.
 

From 731bf6536fdc03eaf4440e3efb2cac8cefcc483e Mon Sep 17 00:00:00 2001
From: Alessio Vertemati <alessio@oneofftech.xyz>
Date: Wed, 18 Feb 2026 11:03:01 +0100
Subject: [PATCH 2/8] wip: infer contentmd title from first-page headings

---
 src/parxy_core/models/models.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py
index d6bd3f0..ce3a48e 100644
--- a/src/parxy_core/models/models.py
+++ b/src/parxy_core/models/models.py
@@ -188,9 +188,30 @@ def contentmd(
         str
             The document content formatted as content-md.
         """
+        def _guess_title_from_first_page() -> Optional[str]:
+            if not self.pages:
+                return None
+            first_page = self.pages[0]
+            if not first_page.blocks:
+                return None
+            heading_categories = {'heading', 'title', 'header'}
+            # Pick the highest-ranking heading (lowest level number) on the first page
+            candidates = [
+                b
+                for b in first_page.blocks
+                if isinstance(b, TextBlock)
+                and b.category
+                and b.category.lower() in heading_categories
+                and b.text.strip()
+            ]
+            if not candidates:
+                return None
+            return min(candidates, key=lambda b: b.level or 1).text.strip()
+
         resolved_title = (
             title
             or (self.metadata.title if self.metadata else None)
+            or _guess_title_from_first_page()
             or self.filename
             or 'Untitled'
         )

From 62d81af06b6267194e9dc2906db6078be94e8de3 Mon Sep 17 00:00:00 2001
From: Alessio Vertemati <alessio@oneofftech.xyz>
Date: Wed, 18 Feb 2026 11:07:48 +0100
Subject: [PATCH 3/8] wip: use block roles in contentmd and infer description

---
 src/parxy_core/models/models.py | 65 +++++++++++++++++++++++++--------
 1 file changed, 50 insertions(+), 15 deletions(-)

diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py
index ce3a48e..8fd1f65 100644
--- a/src/parxy_core/models/models.py
+++ b/src/parxy_core/models/models.py
@@ -194,19 +194,29 @@ def _guess_title_from_first_page() -> Optional[str]:
             first_page = self.pages[0]
             if not first_page.blocks:
                 return None
-            heading_categories = {'heading', 'title', 'header'}
-            # Pick the highest-ranking heading (lowest level number) on the first page
-            candidates = [
+            # Prefer an explicit doc-title block, then the highest-ranking heading
+            doc_title = next(
+                (
+                    b
+                    for b in first_page.blocks
+                    if isinstance(b, TextBlock)
+                    and b.role == 'doc-title'
+                    and b.text.strip()
+                ),
+                None,
+            )
+            if doc_title:
+                return doc_title.text.strip()
+            headings = [
                 b
                 for b in first_page.blocks
                 if isinstance(b, TextBlock)
-                and b.category
-                and b.category.lower() in heading_categories
+                and b.role == 'heading'
                 and b.text.strip()
             ]
-            if not candidates:
+            if not headings:
                 return None
-            return min(candidates, key=lambda b: b.level or 1).text.strip()
+            return min(headings, key=lambda b: b.level or 1).text.strip()
 
         resolved_title = (
             title
@@ -222,12 +232,31 @@ def _guess_title_from_first_page() -> Optional[str]:
         )
         resolved_author = author or (self.metadata.author if self.metadata else None)
 
+        def _infer_description() -> Optional[str]:
+            first_two_pages = self.pages[:2]
+            blocks = [
+                b
+                for page in first_two_pages
+                if page.blocks
+                for b in page.blocks
+                if isinstance(b, TextBlock) and b.text.strip()
+            ]
+            abstract = next((b for b in blocks if b.role == 'doc-abstract'), None)
+            if abstract:
+                return abstract.text.strip()
+            text_blocks = [b for b in blocks if b.role != 'doc-title']
+            if not text_blocks:
+                return None
+            return max(text_blocks, key=lambda b: len(b.text)).text.strip()
+
+        resolved_description = description or _infer_description()
+
         def _yaml_str(v: str) -> str:
             return '"' + v.replace('\\', '\\\\').replace('"', '\\"') + '"'
 
         fm = ['---', f'title: {_yaml_str(resolved_title)}']
-        if description:
-            fm.append(f'description: {_yaml_str(description)}')
+        if resolved_description:
+            fm.append(f'description: {_yaml_str(resolved_description)}')
         if resolved_date:
             fm.append(f'date: {_yaml_str(resolved_date)}')
         if license:
@@ -249,19 +278,25 @@ def _yaml_str(v: str) -> str:
                 continue
 
             for block in page.blocks:
+                role = (block.role or 'generic').lower()
+
                 if isinstance(block, TextBlock):
-                    if block.category and block.category.lower() in [
-                        'heading',
-                        'title',
-                        'header',
-                    ]:
+                    if role == 'doc-title':
+                        # Already rendered as the top-level # heading — skip
+                        pass
+                    elif role == 'heading':
                         # Shift all heading levels by +1 so h1 content becomes h2
                         shifted = min((block.level or 1) + 1, 6)
                         parts.append(f'{"#" * shifted} {block.text.strip()}')
-                    elif block.category and block.category.lower() == 'list':
+                    elif role in ('list', 'listitem'):
                         for line in block.text.splitlines():
                             if line.strip():
                                 parts.append(f'- {line.strip()}')
+                    elif role == 'doc-abstract':
+                        lang_attr = f' lang="{self.language}"' if self.language else ''
+                        parts.append(
+                            f'<abstract{lang_attr}>\n{block.text.strip()}\n</abstract>'
+                        )
                     else:
                         if block.text.strip():
                             parts.append(block.text.strip())

From 89494f0fcd8e440402928377098184f19d76a3e7 Mon Sep 17 00:00:00 2001
From: Alessio Vertemati <alessio@oneofftech.xyz>
Date: Wed, 18 Feb 2026 11:12:22 +0100
Subject: [PATCH 4/8] wip: normalize whitespace in contentmd output

---
 src/parxy_core/models/models.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py
index 8fd1f65..3e58aa3 100644
--- a/src/parxy_core/models/models.py
+++ b/src/parxy_core/models/models.py
@@ -206,7 +206,7 @@ def _guess_title_from_first_page() -> Optional[str]:
                 None,
             )
             if doc_title:
-                return doc_title.text.strip()
+                return _normalize(doc_title.text)
             headings = [
                 b
                 for b in first_page.blocks
@@ -216,7 +216,7 @@ def _guess_title_from_first_page() -> Optional[str]:
             ]
             if not headings:
                 return None
-            return min(headings, key=lambda b: b.level or 1).text.strip()
+            return _normalize(min(headings, key=lambda b: b.level or 1).text)
 
         resolved_title = (
             title
@@ -243,14 +243,18 @@ def _infer_description() -> Optional[str]:
             ]
             abstract = next((b for b in blocks if b.role == 'doc-abstract'), None)
             if abstract:
-                return abstract.text.strip()
+                return _normalize(abstract.text)
             text_blocks = [b for b in blocks if b.role != 'doc-title']
             if not text_blocks:
                 return None
-            return max(text_blocks, key=lambda b: len(b.text)).text.strip()
+            return _normalize(max(text_blocks, key=lambda b: len(b.text)).text)
 
         resolved_description = description or _infer_description()
 
+        def _normalize(text: str) -> str:
+            """Collapse runs of whitespace to a single space and strip."""
+            return ' '.join(text.split())
+
         def _yaml_str(v: str) -> str:
             return '"' + v.replace('\\', '\\\\').replace('"', '\\"') + '"'
 
@@ -274,7 +278,7 @@ def _yaml_str(v: str) -> str:
         for page in self.pages:
             if not page.blocks:
                 if page.text.strip():
-                    parts.append(page.text.strip())
+                    parts.append(_normalize(page.text))
                 continue
 
             for block in page.blocks:
@@ -287,19 +291,20 @@ def _yaml_str(v: str) -> str:
                     elif role == 'heading':
                         # Shift all heading levels by +1 so h1 content becomes h2
                         shifted = min((block.level or 1) + 1, 6)
-                        parts.append(f'{"#" * shifted} {block.text.strip()}')
+                        parts.append(f'{"#" * shifted} {_normalize(block.text)}')
                     elif role in ('list', 'listitem'):
                         for line in block.text.splitlines():
                             if line.strip():
-                                parts.append(f'- {line.strip()}')
+                                parts.append(f'- {_normalize(line)}')
                     elif role == 'doc-abstract':
                         lang_attr = f' lang="{self.language}"' if self.language else ''
                         parts.append(
-                            f'<abstract{lang_attr}>\n{block.text.strip()}\n</abstract>'
+                            f'<abstract{lang_attr}>\n{_normalize(block.text)}\n</abstract>'
                         )
                     else:
-                        if block.text.strip():
-                            parts.append(block.text.strip())
+                        normalized = _normalize(block.text)
+                        if normalized:
+                            parts.append(normalized)
 
                 elif isinstance(block, ImageBlock):
                     alt = block.alt_text or ''

From b7c374287bc9ba44aebe5de6dc1333fb59f24cc9 Mon Sep 17 00:00:00 2001
From: Alessio Vertemati <alessio@oneofftech.xyz>
Date: Wed, 18 Feb 2026 11:21:28 +0100
Subject: [PATCH 5/8] wip: extract contentmd rendering into ContentMdService

---
 src/parxy_core/models/models.py              | 146 ++---------
 src/parxy_core/services/__init__.py          |   3 +-
 src/parxy_core/services/contentmd_service.py | 240 +++++++++++++++++++
 3 files changed, 256 insertions(+), 133 deletions(-)
 create mode 100644 src/parxy_core/services/contentmd_service.py

diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py
index 3e58aa3..0bc6f1d 100644
--- a/src/parxy_core/models/models.py
+++ b/src/parxy_core/models/models.py
@@ -165,17 +165,16 @@ def contentmd(
     ) -> str:
         """Get the document content formatted as content-md.
 
-        Generates a content-md string: YAML frontmatter followed by Markdown.
-        Per the spec, all heading levels are shifted up by one so the document
-        title occupies the sole h1, and images use ``<figure>`` blocks.
+        Delegates to :class:`~parxy_core.services.ContentMdService`.
 
         Parameters
         ----------
         title : str, optional
-            Document title. Falls back to metadata.title, then filename.
+            Document title. Falls back to metadata.title, a heading inferred
+            from the first page, filename, then 'Untitled'.
         description : str, optional
-            Short summary (~200 characters). Required by the spec; omitted from
-            frontmatter when not provided.
+            Short summary (~200 characters). Falls back to a doc-abstract block,
+            then the longest TextBlock across the first two pages.
         date : str, optional
             Creation/publication date in ISO 8601. Falls back to metadata dates.
         license : str, optional
@@ -188,133 +187,16 @@ def contentmd(
         str
             The document content formatted as content-md.
         """
-        def _guess_title_from_first_page() -> Optional[str]:
-            if not self.pages:
-                return None
-            first_page = self.pages[0]
-            if not first_page.blocks:
-                return None
-            # Prefer an explicit doc-title block, then the highest-ranking heading
-            doc_title = next(
-                (
-                    b
-                    for b in first_page.blocks
-                    if isinstance(b, TextBlock)
-                    and b.role == 'doc-title'
-                    and b.text.strip()
-                ),
-                None,
-            )
-            if doc_title:
-                return _normalize(doc_title.text)
-            headings = [
-                b
-                for b in first_page.blocks
-                if isinstance(b, TextBlock)
-                and b.role == 'heading'
-                and b.text.strip()
-            ]
-            if not headings:
-                return None
-            return _normalize(min(headings, key=lambda b: b.level or 1).text)
-
-        resolved_title = (
-            title
-            or (self.metadata.title if self.metadata else None)
-            or _guess_title_from_first_page()
-            or self.filename
-            or 'Untitled'
+        from parxy_core.services.contentmd_service import ContentMdService
+
+        return ContentMdService.render(
+            self,
+            title=title,
+            description=description,
+            date=date,
+            license=license,
+            author=author,
         )
-        resolved_date = date or (
-            (self.metadata.created_at or self.metadata.updated_at)
-            if self.metadata
-            else None
-        )
-        resolved_author = author or (self.metadata.author if self.metadata else None)
-
-        def _infer_description() -> Optional[str]:
-            first_two_pages = self.pages[:2]
-            blocks = [
-                b
-                for page in first_two_pages
-                if page.blocks
-                for b in page.blocks
-                if isinstance(b, TextBlock) and b.text.strip()
-            ]
-            abstract = next((b for b in blocks if b.role == 'doc-abstract'), None)
-            if abstract:
-                return _normalize(abstract.text)
-            text_blocks = [b for b in blocks if b.role != 'doc-title']
-            if not text_blocks:
-                return None
-            return _normalize(max(text_blocks, key=lambda b: len(b.text)).text)
-
-        resolved_description = description or _infer_description()
-
-        def _normalize(text: str) -> str:
-            """Collapse runs of whitespace to a single space and strip."""
-            return ' '.join(text.split())
-
-        def _yaml_str(v: str) -> str:
-            return '"' + v.replace('\\', '\\\\').replace('"', '\\"') + '"'
-
-        fm = ['---', f'title: {_yaml_str(resolved_title)}']
-        if resolved_description:
-            fm.append(f'description: {_yaml_str(resolved_description)}')
-        if resolved_date:
-            fm.append(f'date: {_yaml_str(resolved_date)}')
-        if license:
-            fm.append(f'license: {_yaml_str(license)}')
-        if resolved_author:
-            fm.append(f'author: {_yaml_str(resolved_author)}')
-        fm.append('---')
-        frontmatter = '\n'.join(fm)
-
-        if not self.pages:
-            return f'{frontmatter}\n\n# {resolved_title}\n'
-
-        parts = [f'# {resolved_title}']
-
-        for page in self.pages:
-            if not page.blocks:
-                if page.text.strip():
-                    parts.append(_normalize(page.text))
-                continue
-
-            for block in page.blocks:
-                role = (block.role or 'generic').lower()
-
-                if isinstance(block, TextBlock):
-                    if role == 'doc-title':
-                        # Already rendered as the top-level # heading — skip
-                        pass
-                    elif role == 'heading':
-                        # Shift all heading levels by +1 so h1 content becomes h2
-                        shifted = min((block.level or 1) + 1, 6)
-                        parts.append(f'{"#" * shifted} {_normalize(block.text)}')
-                    elif role in ('list', 'listitem'):
-                        for line in block.text.splitlines():
-                            if line.strip():
-                                parts.append(f'- {_normalize(line)}')
-                    elif role == 'doc-abstract':
-                        lang_attr = f' lang="{self.language}"' if self.language else ''
-                        parts.append(
-                            f'<abstract{lang_attr}>\n{_normalize(block.text)}\n</abstract>'
-                        )
-                    else:
-                        normalized = _normalize(block.text)
-                        if normalized:
-                            parts.append(normalized)
-
-                elif isinstance(block, ImageBlock):
-                    alt = block.alt_text or ''
-                    parts.append(f'<figure>\n{alt}\n</figure>')
-
-                elif isinstance(block, TableBlock):
-                    if block.text.strip():
-                        parts.append(block.text.strip())
-
-        return f'{frontmatter}\n\n' + '\n\n'.join(parts) + '\n'
 
     def markdown(self) -> str:
         """Get the document content formatted as Markdown.
diff --git a/src/parxy_core/services/__init__.py b/src/parxy_core/services/__init__.py
index 5071d08..5342a63 100644
--- a/src/parxy_core/services/__init__.py
+++ b/src/parxy_core/services/__init__.py
@@ -1,5 +1,6 @@
 """Services module for parxy_core."""
 
+from parxy_core.services.contentmd_service import ContentMdService
 from parxy_core.services.pdf_service import PdfService
 
-__all__ = ['PdfService']
+__all__ = ['ContentMdService', 'PdfService']
diff --git a/src/parxy_core/services/contentmd_service.py b/src/parxy_core/services/contentmd_service.py
new file mode 100644
index 0000000..59a03db
--- /dev/null
+++ b/src/parxy_core/services/contentmd_service.py
@@ -0,0 +1,240 @@
+"""Service for rendering documents as content-md."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    from parxy_core.models.models import Document
+
+
+class ContentMdService:
+    """Render a :class:`Document` as a content-md string.
+
+    content-md is an open specification for optimised content exchange: a YAML
+    frontmatter section followed by CommonMark / GitHub-flavoured Markdown.
+    All methods are static; the class acts as a namespace.
+    """
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _normalize(text: str) -> str:
+        """Collapse any run of whitespace to a single space and strip."""
+        return ' '.join(text.split())
+
+    @staticmethod
+    def _yaml_str(value: str) -> str:
+        """Wrap *value* in double quotes and escape internal quotes/backslashes."""
+        return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'
+
+    @staticmethod
+    def _guess_title(document: Document) -> Optional[str]:
+        """Infer a title from the first page blocks.
+
+        Prefers an explicit ``doc-title`` role; falls back to the
+        highest-ranking (lowest level number) ``heading`` block.
+        """
+        from parxy_core.models.models import TextBlock
+
+        if not document.pages:
+            return None
+        first_page = document.pages[0]
+        if not first_page.blocks:
+            return None
+
+        doc_title = next(
+            (
+                b
+                for b in first_page.blocks
+                if isinstance(b, TextBlock)
+                and b.role == 'doc-title'
+                and b.text.strip()
+            ),
+            None,
+        )
+        if doc_title:
+            return ContentMdService._normalize(doc_title.text)
+
+        headings = [
+            b
+            for b in first_page.blocks
+            if isinstance(b, TextBlock) and b.role == 'heading' and b.text.strip()
+        ]
+        if not headings:
+            return None
+        return ContentMdService._normalize(
+            min(headings, key=lambda b: b.level or 1).text
+        )
+
+    @staticmethod
+    def _infer_description(document: Document) -> Optional[str]:
+        """Infer a description from document content.
+
+        Uses the ``doc-abstract`` block when present, otherwise the longest
+        :class:`TextBlock` across the first two pages.
+        """
+        from parxy_core.models.models import TextBlock
+
+        blocks = [
+            b
+            for page in document.pages[:2]
+            if page.blocks
+            for b in page.blocks
+            if isinstance(b, TextBlock) and b.text.strip()
+        ]
+
+        abstract = next((b for b in blocks if b.role == 'doc-abstract'), None)
+        if abstract:
+            return ContentMdService._normalize(abstract.text)
+
+        text_blocks = [b for b in blocks if b.role != 'doc-title']
+        if not text_blocks:
+            return None
+        return ContentMdService._normalize(
+            max(text_blocks, key=lambda b: len(b.text)).text
+        )
+
+    @staticmethod
+    def _build_frontmatter(
+        title: str,
+        description: Optional[str],
+        date: Optional[str],
+        license: Optional[str],
+        author: Optional[str],
+    ) -> str:
+        ys = ContentMdService._yaml_str
+        lines = ['---', f'title: {ys(title)}']
+        if description:
+            lines.append(f'description: {ys(description)}')
+        if date:
+            lines.append(f'date: {ys(date)}')
+        if license:
+            lines.append(f'license: {ys(license)}')
+        if author:
+            lines.append(f'author: {ys(author)}')
+        lines.append('---')
+        return '\n'.join(lines)
+
+    @staticmethod
+    def _build_body(document: Document, title: str) -> str:
+        from parxy_core.models.models import ImageBlock, TableBlock, TextBlock
+
+        normalize = ContentMdService._normalize
+        parts = [f'# {title}']
+
+        for page in document.pages:
+            if not page.blocks:
+                if page.text.strip():
+                    parts.append(normalize(page.text))
+                continue
+
+            for block in page.blocks:
+                role = (block.role or 'generic').lower()
+
+                if isinstance(block, TextBlock):
+                    if role == 'doc-title':
+                        # Already the top-level h1 — skip to avoid duplication
+                        pass
+                    elif role == 'heading':
+                        # Shift levels +1: h1 content → h2, per content-md spec
+                        shifted = min((block.level or 1) + 1, 6)
+                        parts.append(f'{"#" * shifted} {normalize(block.text)}')
+                    elif role in ('list', 'listitem'):
+                        for line in block.text.splitlines():
+                            if line.strip():
+                                parts.append(f'- {normalize(line)}')
+                    elif role == 'doc-abstract':
+                        lang_attr = (
+                            f' lang="{document.language}"' if document.language else ''
+                        )
+                        parts.append(
+                            f'<abstract{lang_attr}>\n{normalize(block.text)}\n</abstract>'
+                        )
+                    else:
+                        normalized = normalize(block.text)
+                        if normalized:
+                            parts.append(normalized)
+
+                elif isinstance(block, ImageBlock):
+                    parts.append(f'<figure>\n{block.alt_text or ""}\n</figure>')
+
+                elif isinstance(block, TableBlock):
+                    # Preserve table whitespace (column alignment, padding)
+                    if block.text.strip():
+                        parts.append(block.text.strip())
+
+        return '\n\n'.join(parts)
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def render(
+        document: Document,
+        title: Optional[str] = None,
+        description: Optional[str] = None,
+        date: Optional[str] = None,
+        license: Optional[str] = None,
+        author: Optional[str] = None,
+    ) -> str:
+        """Render *document* as a content-md string.
+
+        Parameters
+        ----------
+        document:
+            The document to render.
+        title:
+            Document title. Falls back to ``metadata.title``, a heading
+            inferred from the first page, ``filename``, then ``'Untitled'``.
+        description:
+            Short summary (~200 characters). Falls back to a ``doc-abstract``
+            block, then the longest :class:`TextBlock` in the first two pages.
+        date:
+            Creation/publication date in ISO 8601. Falls back to
+            ``metadata.created_at`` / ``metadata.updated_at``.
+        license:
+            License name or SPDX identifier.
+        author:
+            Author name. Falls back to ``metadata.author``.
+
+        Returns
+        -------
+        str
+            The document formatted as content-md.
+        """
+        resolved_title = (
+            title
+            or (document.metadata.title if document.metadata else None)
+            or ContentMdService._guess_title(document)
+            or document.filename
+            or 'Untitled'
+        )
+        resolved_description = description or ContentMdService._infer_description(
+            document
+        )
+        resolved_date = date or (
+            (document.metadata.created_at or document.metadata.updated_at)
+            if document.metadata
+            else None
+        )
+        resolved_author = (
+            author or (document.metadata.author if document.metadata else None)
+        )
+
+        frontmatter = ContentMdService._build_frontmatter(
+            title=resolved_title,
+            description=resolved_description,
+            date=resolved_date,
+            license=license,
+            author=resolved_author,
+        )
+
+        if not document.pages:
+            return f'{frontmatter}\n\n# {resolved_title}\n'
+
+        body = ContentMdService._build_body(document, resolved_title)
+        return f'{frontmatter}\n\n{body}\n'

From be15068cff36076cb41ce4d89371ba407dc47e53 Mon Sep 17 00:00:00 2001
From: Alessio Vertemati <alessio@oneofftech.xyz>
Date: Wed, 18 Feb 2026 11:27:07 +0100
Subject: [PATCH 6/8] wip: reformat ContentMdService and add test suite

---
 src/parxy_core/services/contentmd_service.py |   8 +-
 tests/services/test_contentmd_service.py     | 456 +++++++++++++++++++
 2 files changed, 459 insertions(+), 5 deletions(-)
 create mode 100644 tests/services/test_contentmd_service.py

diff --git a/src/parxy_core/services/contentmd_service.py b/src/parxy_core/services/contentmd_service.py
index 59a03db..9383151 100644
--- a/src/parxy_core/services/contentmd_service.py
+++ b/src/parxy_core/services/contentmd_service.py
@@ -49,9 +49,7 @@ def _guess_title(document: Document) -> Optional[str]:
             (
                 b
                 for b in first_page.blocks
-                if isinstance(b, TextBlock)
-                and b.role == 'doc-title'
-                and b.text.strip()
+                if isinstance(b, TextBlock) and b.role == 'doc-title' and b.text.strip()
             ),
             None,
         )
@@ -221,8 +219,8 @@ def render(
             if document.metadata
             else None
         )
-        resolved_author = (
-            author or (document.metadata.author if document.metadata else None)
+        resolved_author = author or (
+            document.metadata.author if document.metadata else None
         )
 
         frontmatter = ContentMdService._build_frontmatter(
diff --git a/tests/services/test_contentmd_service.py b/tests/services/test_contentmd_service.py
new file mode 100644
index 0000000..c6e5782
--- /dev/null
+++ b/tests/services/test_contentmd_service.py
@@ -0,0 +1,456 @@
+"""Test suite for ContentMdService."""
+
+import pytest
+
+from parxy_core.models.models import (
+    Document,
+    ImageBlock,
+    Metadata,
+    Page,
+    TableBlock,
+    TextBlock,
+)
+from parxy_core.services.contentmd_service import ContentMdService
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def make_page(
+    number: int = 1,
+    text: str = '',
+    blocks: list | None = None,
+) -> Page:
+    return Page(number=number, text=text, blocks=blocks)
+
+
+def make_text_block(
+    text: str,
+    role: str = 'generic',
+    level: int | None = None,
+) -> TextBlock:
+    return TextBlock(type='text', text=text, role=role, level=level)
+
+
+def make_image_block(
+    alt_text: str | None = None, name: str | None = None
+) -> ImageBlock:
+    return ImageBlock(type='image', alt_text=alt_text, name=name)
+
+
+def make_table_block(text: str) -> TableBlock:
+    return TableBlock(type='table', text=text)
+
+
+def make_doc(
+    pages: list[Page],
+    metadata: Metadata | None = None,
+    filename: str | None = None,
+    language: str | None = None,
+) -> Document:
+    return Document(
+        pages=pages,
+        metadata=metadata,
+        filename=filename,
+        language=language,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def minimal_doc():
+    """Document with a single page, no blocks, no metadata."""
+    return make_doc(pages=[make_page(text='Hello world')])
+
+
+@pytest.fixture
+def metadata_doc():
+    """Document with full metadata and one plain paragraph block."""
+    meta = Metadata(
+        title='Metadata Title',
+        author='Jane Doe',
+        created_at='2025-01-15',
+    )
+    page = make_page(
+        text='Paragraph text.',
+        blocks=[make_text_block('Paragraph text.')],
+    )
+    return make_doc(pages=[page], metadata=meta, filename='report.pdf')
+
+
+@pytest.fixture
+def all_blocks_doc():
+    """Document whose first page contains every supported block type."""
+    blocks = [
+        make_text_block('My Document', role='doc-title'),
+        make_text_block('Introduction', role='heading', level=1),
+        make_text_block('Background', role='heading', level=2),
+        make_text_block('First item\nSecond item', role='list'),
+        make_text_block('A plain paragraph.', role='paragraph'),
+        make_text_block('A brief overview.', role='doc-abstract'),
+        make_image_block(alt_text='A sunset over mountains', name='sunset.jpg'),
+        make_table_block('| Col A | Col B |\n| ----- | ----- |\n| 1     | 2     |'),
+    ]
+    page = make_page(text='My Document', blocks=blocks)
+    return make_doc(pages=[page], language='en')
+
+
+# ---------------------------------------------------------------------------
+# Frontmatter
+# ---------------------------------------------------------------------------
+
+
+class TestFrontmatter:
+    def test_frontmatter_delimiters_present(self, minimal_doc):
+        result = ContentMdService.render(minimal_doc, title='T', description='D')
+        lines = result.splitlines()
+        assert lines[0] == '---'
+        closing = lines.index('---', 1)
+        assert closing > 0
+
+    def test_explicit_title_in_frontmatter(self, minimal_doc):
+        result = ContentMdService.render(minimal_doc, title='Explicit Title')
+        assert 'title: "Explicit Title"' in result
+
+    def test_title_from_metadata(self, metadata_doc):
+        result = ContentMdService.render(metadata_doc)
+        assert 'title: "Metadata Title"' in result
+
+    def test_title_from_doc_title_role_preferred_over_heading(self):
+        blocks = [
+            make_text_block('Real Title', role='doc-title'),
+            make_text_block('Section One', role='heading', level=1),
+        ]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc)
+        assert 'title: "Real Title"' in result
+
+    def test_title_from_heading_when_no_doc_title(self):
+        blocks = [
+            make_text_block('Section One', role='heading', level=2),
+            make_text_block('Section Two', role='heading', level=1),
+        ]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc)
+        # Level 1 heading wins (lowest level = highest rank)
+        assert 'title: "Section Two"' in result
+
+    def test_title_from_filename_when_no_headings(self):
+        doc = make_doc(
+            pages=[make_page(text='body text')],
+            filename='my-report.pdf',
+        )
+        result = ContentMdService.render(doc)
+        assert 'title: "my-report.pdf"' in result
+
+    def test_title_fallback_to_untitled(self):
+        doc = make_doc(pages=[make_page(text='body text')])
+        result = ContentMdService.render(doc)
+        assert 'title: "Untitled"' in result
+
+    def test_description_from_explicit_param(self, minimal_doc):
+        result = ContentMdService.render(
+            minimal_doc, title='T', description='My summary.'
+        )
+        assert 'description: "My summary."' in result
+
+    def test_description_from_doc_abstract_block(self):
+        blocks = [
+            make_text_block('Abstract content here.', role='doc-abstract'),
+            make_text_block(
+                'A much longer paragraph that should not be picked.', role='paragraph'
+            ),
+        ]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc)
+        assert 'description: "Abstract content here."' in result
+
+    def test_description_from_longest_textblock_when_no_abstract(self):
+        blocks = [
+            make_text_block('Short.', role='paragraph'),
+            make_text_block(
+                'This is a considerably longer paragraph block.', role='paragraph'
+            ),
+        ]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc)
+        assert 'description: "This is a considerably longer paragraph block."' in result
+
+    def test_description_excludes_doc_title_from_longest_candidate(self):
+        blocks = [
+            make_text_block(
+                'This is a very long doc-title block text.', role='doc-title'
+            ),
+            make_text_block('Shorter paragraph.', role='paragraph'),
+        ]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc)
+        assert 'description: "Shorter paragraph."' in result
+
+    def test_description_searches_first_two_pages(self):
+        page1 = make_page(number=1, text='', blocks=[make_text_block('Page 1 text.')])
+        page2 = make_page(
+            number=2,
+            text='',
+            blocks=[make_text_block('Page 2 has a longer text block.')],
+        )
+        page3 = make_page(
+            number=3,
+            text='',
+            blocks=[make_text_block('Page 3 has the longest block of all by far.')],
+        )
+        doc = make_doc(pages=[page1, page2, page3])
+        result = ContentMdService.render(doc)
+        # Page 3 is out of the two-page window
+        assert 'Page 3' not in result.split('---')[1]  # not in frontmatter
+
+    def test_date_from_metadata_created_at(self, metadata_doc):
+        result = ContentMdService.render(metadata_doc)
+        assert 'date: "2025-01-15"' in result
+
+    def test_date_from_metadata_updated_at_when_no_created_at(self):
+        meta = Metadata(updated_at='2025-06-01')
+        doc = make_doc(pages=[make_page(text='')], metadata=meta)
+        result = ContentMdService.render(doc)
+        assert 'date: "2025-06-01"' in result
+
+    def test_explicit_date_overrides_metadata(self, metadata_doc):
+        result = ContentMdService.render(metadata_doc, date='2026-01-01')
+        assert 'date: "2026-01-01"' in result
+        assert '2025-01-15' not in result
+
+    def test_author_from_metadata(self, metadata_doc):
+        result = ContentMdService.render(metadata_doc)
+        assert 'author: "Jane Doe"' in result
+
+    def test_optional_fields_omitted_when_absent(self, minimal_doc):
+        result = ContentMdService.render(minimal_doc, title='T')
+        assert 'description:' not in result
+        assert 'date:' not in result
+        assert 'license:' not in result
+        assert 'author:' not in result
+
+    def test_license_included_when_provided(self, minimal_doc):
+        result = ContentMdService.render(minimal_doc, title='T', license='CC-BY-4.0')
+        assert 'license: "CC-BY-4.0"' in result
+
+    def test_yaml_values_escaped(self, minimal_doc):
+        result = ContentMdService.render(
+            minimal_doc,
+            title='Title with "quotes"',
+            description='Back\\slash',
+        )
+        assert r'title: "Title with \"quotes\""' in result
+        assert r'description: "Back\\slash"' in result
+
+
+# ---------------------------------------------------------------------------
+# Body – block rendering
+# ---------------------------------------------------------------------------
+
+
+class TestBodyBlocks:
+    def test_body_starts_with_h1_title(self, metadata_doc):
+        result = ContentMdService.render(metadata_doc)
+        body = result.split('---\n', 2)[-1]
+        assert body.lstrip().startswith('# Metadata Title')
+
+    def test_doc_title_block_skipped_in_body(self, all_blocks_doc):
+        result = ContentMdService.render(all_blocks_doc)
+        body = result.split('---\n', 2)[-1]
+        # Should appear exactly once (as the h1), not twice
+        assert body.count('My Document') == 1
+
+    def test_heading_level_shifted_by_one(self):
+        blocks = [make_text_block('Section', role='heading', level=1)]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '## Section' in result
+
+    def test_heading_level_2_becomes_3(self):
+        blocks = [make_text_block('Subsection', role='heading', level=2)]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '### Subsection' in result
+
+    def test_heading_without_level_defaults_to_h2(self):
+        blocks = [make_text_block('Heading', role='heading')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '## Heading' in result
+
+    def test_heading_level_capped_at_6(self):
+        blocks = [make_text_block('Deep', role='heading', level=6)]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '###### Deep' in result
+        assert '####### Deep' not in result
+
+    def test_list_role_rendered_as_bullets(self):
+        blocks = [make_text_block('Alpha\nBeta\nGamma', role='list')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '- Alpha' in result
+        assert '- Beta' in result
+        assert '- Gamma' in result
+
+    def test_listitem_role_rendered_as_bullet(self):
+        blocks = [make_text_block('Single item', role='listitem')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '- Single item' in result
+
+    def test_doc_abstract_rendered_as_abstract_tag(self, all_blocks_doc):
+        result = ContentMdService.render(all_blocks_doc)
+        assert '<abstract lang="en">' in result
+        assert 'A brief overview.' in result
+        assert '</abstract>' in result
+
+    def test_doc_abstract_without_language_omits_lang_attr(self):
+        blocks = [make_text_block('Summary.', role='doc-abstract')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '<abstract>\nSummary.\n</abstract>' in result
+
+    def test_generic_textblock_rendered_as_paragraph(self):
+        blocks = [make_text_block('Plain paragraph text.', role='generic')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert 'Plain paragraph text.' in result
+
+    def test_empty_textblock_not_rendered(self):
+        blocks = [make_text_block('   ', role='paragraph')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        # Body should only contain the h1 line
+        body = result.split('---\n', 2)[-1].strip()
+        assert body == '# T'
+
+    def test_image_block_rendered_as_figure(self):
+        blocks = [make_image_block(alt_text='A sunset over mountains')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '<figure>\nA sunset over mountains\n</figure>' in result
+
+    def test_image_block_without_alt_text(self):
+        blocks = [make_image_block()]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '<figure>\n\n</figure>' in result
+
+    def test_table_block_rendered_as_is(self):
+        table_text = '| Col A | Col B |\n| ----- | ----- |\n| 1     | 2     |'
+        blocks = [make_table_block(table_text)]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert table_text in result
+
+    def test_page_without_blocks_uses_page_text(self):
+        page = make_page(text='Fallback page text', blocks=None)
+        doc = make_doc(pages=[page])
+        result = ContentMdService.render(doc, title='T')
+        assert 'Fallback page text' in result
+
+    def test_empty_page_text_not_rendered(self):
+        page = make_page(text='   ', blocks=None)
+        doc = make_doc(pages=[page])
+        result = ContentMdService.render(doc, title='T')
+        body = result.split('---\n', 2)[-1].strip()
+        assert body == '# T'
+
+
+# ---------------------------------------------------------------------------
+# Whitespace normalisation
+# ---------------------------------------------------------------------------
+
+
+class TestWhitespaceNormalisation:
+    def test_multiple_spaces_in_paragraph_collapsed(self):
+        blocks = [make_text_block('Word1   Word2     Word3')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert 'Word1 Word2 Word3' in result
+
+    def test_tabs_in_paragraph_collapsed(self):
+        blocks = [make_text_block('Word1\t\tWord2')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert 'Word1 Word2' in result
+
+    def test_whitespace_in_heading_collapsed(self):
+        blocks = [make_text_block('My   Section', role='heading', level=1)]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '## My Section' in result
+
+    def test_whitespace_in_title_collapsed(self):
+        blocks = [make_text_block('  My   Title  ', role='doc-title')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc)
+        assert 'title: "My Title"' in result
+
+    def test_whitespace_in_description_collapsed(self):
+        blocks = [make_text_block('Summary   with   gaps.', role='doc-abstract')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert 'description: "Summary with gaps."' in result
+
+    def test_table_whitespace_preserved(self):
+        table_text = '| Col A | Col B |\n| ----- | ----- |'
+        blocks = [make_table_block(table_text)]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert '| Col A | Col B |' in result
+
+
+# ---------------------------------------------------------------------------
+# Output structure
+# ---------------------------------------------------------------------------
+
+
+class TestOutputStructure:
+    def test_result_ends_with_newline(self, minimal_doc):
+        result = ContentMdService.render(minimal_doc, title='T')
+        assert result.endswith('\n')
+
+    def test_empty_pages_list_returns_frontmatter_and_title(self):
+        doc = Document(pages=[])
+        result = ContentMdService.render(doc, title='Empty')
+        assert 'title: "Empty"' in result
+        assert '# Empty' in result
+
+    def test_blocks_separated_by_blank_line(self):
+        blocks = [
+            make_text_block('First paragraph.'),
+            make_text_block('Second paragraph.'),
+        ]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc, title='T')
+        assert 'First paragraph.\n\nSecond paragraph.' in result
+
+    def test_multipage_document_renders_all_pages(self):
+        page1 = make_page(
+            number=1,
+            text='',
+            blocks=[make_text_block('Page one content.')],
+        )
+        page2 = make_page(
+            number=2,
+            text='',
+            blocks=[make_text_block('Page two content.')],
+        )
+        doc = make_doc(pages=[page1, page2])
+        result = ContentMdService.render(doc, title='T')
+        assert 'Page one content.' in result
+        assert 'Page two content.' in result
+
+    def test_render_delegates_from_document_method(self, metadata_doc):
+        via_service = ContentMdService.render(metadata_doc)
+        via_method = metadata_doc.contentmd()
+        assert via_service == via_method

From 5d7fd9c9f4bc758bce335cecd06cc9123174fdab Mon Sep 17 00:00:00 2001
From: Alessio Vertemati <alessio@oneofftech.xyz>
Date: Wed, 18 Feb 2026 11:35:21 +0100
Subject: [PATCH 7/8] Infer description from first five body blocks, capped at 200 chars

---
 src/parxy_core/services/contentmd_service.py | 34 ++++++++++++---
 tests/services/test_contentmd_service.py     | 46 ++++++++++++++------
 2 files changed, 59 insertions(+), 21 deletions(-)

diff --git a/src/parxy_core/services/contentmd_service.py b/src/parxy_core/services/contentmd_service.py
index 9383151..4531054 100644
--- a/src/parxy_core/services/contentmd_service.py
+++ b/src/parxy_core/services/contentmd_service.py
@@ -20,6 +20,20 @@ class ContentMdService:
     # Private helpers
     # ------------------------------------------------------------------
 
+    # Roles that provide structure or navigation rather than readable body text
+    _STRUCTURAL_ROLES: frozenset[str] = frozenset(
+        {
+            'heading',
+            'doc-title',
+            'doc-subtitle',
+            'doc-abstract',
+            'doc-toc',
+            'doc-pageheader',
+            'doc-pagefooter',
+            'caption',
+        }
+    )
+
     @staticmethod
     def _normalize(text: str) -> str:
         """Collapse any run of whitespace to a single space and strip."""
@@ -71,8 +85,10 @@ def _guess_title(document: Document) -> Optional[str]:
     def _infer_description(document: Document) -> Optional[str]:
         """Infer a description from document content.
 
-        Uses the ``doc-abstract`` block when present, otherwise the longest
-        :class:`TextBlock` across the first two pages.
+        Uses the ``doc-abstract`` block when present. Otherwise concatenates
+        the first five body :class:`TextBlock` objects (non-structural, across
+        the first two pages), normalises whitespace, and returns at most 200
+        characters.
         """
         from parxy_core.models.models import TextBlock
 
@@ -88,12 +104,16 @@ def _infer_description(document: Document) -> Optional[str]:
         if abstract:
             return ContentMdService._normalize(abstract.text)
 
-        text_blocks = [b for b in blocks if b.role != 'doc-title']
-        if not text_blocks:
+        body_blocks = [
+            b
+            for b in blocks
+            if (b.role or 'generic') not in ContentMdService._STRUCTURAL_ROLES
+        ]
+        if not body_blocks:
             return None
-        return ContentMdService._normalize(
-            max(text_blocks, key=lambda b: len(b.text)).text
-        )
+
+        combined = ' '.join(b.text for b in body_blocks[:5])
+        return ContentMdService._normalize(combined)[:200]
 
     @staticmethod
     def _build_frontmatter(
diff --git a/tests/services/test_contentmd_service.py b/tests/services/test_contentmd_service.py
index c6e5782..71ba804 100644
--- a/tests/services/test_contentmd_service.py
+++ b/tests/services/test_contentmd_service.py
@@ -171,27 +171,45 @@ def test_description_from_doc_abstract_block(self):
         result = ContentMdService.render(doc)
         assert 'description: "Abstract content here."' in result
 
-    def test_description_from_longest_textblock_when_no_abstract(self):
-        blocks = [
-            make_text_block('Short.', role='paragraph'),
-            make_text_block(
-                'This is a considerably longer paragraph block.', role='paragraph'
-            ),
-        ]
+    def test_description_from_first_five_body_blocks(self):
+        blocks = [make_text_block(f'Sentence {i}.', role='paragraph') for i in range(7)]
         doc = make_doc(pages=[make_page(text='', blocks=blocks)])
         result = ContentMdService.render(doc)
-        assert 'description: "This is a considerably longer paragraph block."' in result
+        # Only the first five contribute; the sixth and seventh are ignored
+        assert 'Sentence 5' not in result.split('---\n')[1].split('\n')[0]
+        assert 'Sentence 0' in result
 
-    def test_description_excludes_doc_title_from_longest_candidate(self):
+    def test_description_excludes_structural_roles(self):
         blocks = [
-            make_text_block(
-                'This is a very long doc-title block text.', role='doc-title'
-            ),
-            make_text_block('Shorter paragraph.', role='paragraph'),
+            make_text_block('Table of contents text.', role='doc-toc'),
+            make_text_block('Page header text.', role='doc-pageheader'),
+            make_text_block('A heading block.', role='heading'),
+            make_text_block('Body content.', role='paragraph'),
         ]
         doc = make_doc(pages=[make_page(text='', blocks=blocks)])
         result = ContentMdService.render(doc)
-        assert 'description: "Shorter paragraph."' in result
+        assert 'description: "Body content."' in result
+
+    def test_description_truncated_to_200_chars(self):
+        long_text = 'word ' * 60  # well over 200 chars
+        blocks = [make_text_block(long_text, role='paragraph')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc)
+        fm_end = result.index('---\n', 4)
+        frontmatter = result[:fm_end]
+        desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:'))
+        # Strip the YAML quoting to measure the actual value length
+        value = desc_line[len('description: "'):-1]
+        assert len(value) <= 200
+
+    def test_description_contains_no_newlines(self):
+        blocks = [make_text_block('Line one.\nLine two.\nLine three.', role='paragraph')]
+        doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+        result = ContentMdService.render(doc)
+        fm_end = result.index('---\n', 4)
+        frontmatter = result[:fm_end]
+        desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:'))
+        assert '\n' not in desc_line
 
     def test_description_searches_first_two_pages(self):
         page1 = make_page(number=1, text='', blocks=[make_text_block('Page 1 text.')])

From 57476f3df634410a8c1e3296490da0ae79f3bfea Mon Sep 17 00:00:00 2001
From: Alessio Vertemati <alessio@oneofftech.xyz>
Date: Wed, 18 Feb 2026 11:37:40 +0100
Subject: [PATCH 8/8] Raise ValueError when content-md title cannot be resolved

---
 src/parxy_core/services/contentmd_service.py | 12 +++--
 tests/services/test_contentmd_service.py     | 57 +++++++++++++++-----
 2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/src/parxy_core/services/contentmd_service.py b/src/parxy_core/services/contentmd_service.py
index 4531054..3836c06 100644
--- a/src/parxy_core/services/contentmd_service.py
+++ b/src/parxy_core/services/contentmd_service.py
@@ -207,10 +207,11 @@ def render(
             The document to render.
         title:
             Document title. Falls back to ``metadata.title``, a heading
-            inferred from the first page, ``filename``, then ``'Untitled'``.
+            inferred from the first page, then ``filename``. Raises
+            ``ValueError`` if no title can be resolved.
         description:
             Short summary (~200 characters). Falls back to a ``doc-abstract``
-            block, then the longest :class:`TextBlock` in the first two pages.
+            block, then the first five body blocks in the first two pages.
         date:
             Creation/publication date in ISO 8601. Falls back to
             ``metadata.created_at`` / ``metadata.updated_at``.
@@ -229,8 +230,13 @@ def render(
             or (document.metadata.title if document.metadata else None)
             or ContentMdService._guess_title(document)
             or document.filename
-            or 'Untitled'
         )
+        if not resolved_title:
+            raise ValueError(
+                'Cannot render content-md: no title could be resolved. '
+                'Provide a title via metadata, a doc-title/heading block, '
+                'a filename, or pass title= explicitly.'
+            )
         resolved_description = description or ContentMdService._infer_description(
             document
         )
diff --git a/tests/services/test_contentmd_service.py b/tests/services/test_contentmd_service.py
index 71ba804..b817fe0 100644
--- a/tests/services/test_contentmd_service.py
+++ b/tests/services/test_contentmd_service.py
@@ -149,10 +149,10 @@ def test_title_from_filename_when_no_headings(self):
         result = ContentMdService.render(doc)
         assert 'title: "my-report.pdf"' in result
 
-    def test_title_fallback_to_untitled(self):
+    def test_title_raises_when_unresolvable(self):
         doc = make_doc(pages=[make_page(text='body text')])
-        result = ContentMdService.render(doc)
-        assert 'title: "Untitled"' in result
+        with pytest.raises(ValueError, match='no title could be resolved'):
+            ContentMdService.render(doc)
 
     def test_description_from_explicit_param(self, minimal_doc):
         result = ContentMdService.render(
@@ -168,13 +168,13 @@ def test_description_from_doc_abstract_block(self):
             ),
         ]
         doc = make_doc(pages=[make_page(text='', blocks=blocks)])
-        result = ContentMdService.render(doc)
+        result = ContentMdService.render(doc, title='T')
         assert 'description: "Abstract content here."' in result
 
     def test_description_from_first_five_body_blocks(self):
         blocks = [make_text_block(f'Sentence {i}.', role='paragraph') for i in range(7)]
         doc = make_doc(pages=[make_page(text='', blocks=blocks)])
-        result = ContentMdService.render(doc)
+        result = ContentMdService.render(doc, title='T')
         # Only the first five contribute; the sixth and seventh are ignored
         assert 'Sentence 5' not in result.split('---\n')[1].split('\n')[0]
         assert 'Sentence 0' in result
@@ -194,21 +194,27 @@ def test_description_truncated_to_200_chars(self):
         long_text = 'word ' * 60  # well over 200 chars
         blocks = [make_text_block(long_text, role='paragraph')]
         doc = make_doc(pages=[make_page(text='', blocks=blocks)])
-        result = ContentMdService.render(doc)
+        result = ContentMdService.render(doc, title='T')
         fm_end = result.index('---\n', 4)
         frontmatter = result[:fm_end]
-        desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:'))
+        desc_line = next(
+            l for l in frontmatter.splitlines() if l.startswith('description:')
+        )
         # Strip the YAML quoting to measure the actual value length
-        value = desc_line[len('description: "'):-1]
+        value = desc_line[len('description: "') : -1]
         assert len(value) <= 200
 
     def test_description_contains_no_newlines(self):
-        blocks = [make_text_block('Line one.\nLine two.\nLine three.', role='paragraph')]
+        blocks = [
+            make_text_block('Line one.\nLine two.\nLine three.', role='paragraph')
+        ]
         doc = make_doc(pages=[make_page(text='', blocks=blocks)])
-        result = ContentMdService.render(doc)
+        result = ContentMdService.render(doc, title='T')
         fm_end = result.index('---\n', 4)
         frontmatter = result[:fm_end]
-        desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:'))
+        desc_line = next(
+            l for l in frontmatter.splitlines() if l.startswith('description:')
+        )
         assert '\n' not in desc_line
 
     def test_description_searches_first_two_pages(self):
@@ -224,7 +230,7 @@ def test_description_searches_first_two_pages(self):
             blocks=[make_text_block('Page 3 has the longest block of all by far.')],
         )
         doc = make_doc(pages=[page1, page2, page3])
-        result = ContentMdService.render(doc)
+        result = ContentMdService.render(doc, title='T')
         # Page 3 is out of the two-page window
         assert 'Page 3' not in result.split('---')[1]  # not in frontmatter
 
@@ -235,7 +241,7 @@ def test_date_from_metadata_created_at(self, metadata_doc):
     def test_date_from_metadata_updated_at_when_no_created_at(self):
         meta = Metadata(updated_at='2025-06-01')
         doc = make_doc(pages=[make_page(text='')], metadata=meta)
-        result = ContentMdService.render(doc)
+        result = ContentMdService.render(doc, title='T')
         assert 'date: "2025-06-01"' in result
 
     def test_explicit_date_overrides_metadata(self, metadata_doc):
@@ -472,3 +478,28 @@ def test_render_delegates_from_document_method(self, metadata_doc):
         via_service = ContentMdService.render(metadata_doc)
         via_method = metadata_doc.contentmd()
         assert via_service == via_method
+
+    def test_empty_document_without_args_raises(self):
+        """A document with no metadata, no blocks, no filename, and no user
+        arguments cannot satisfy the required title constraint."""
+        doc = Document(pages=[])
+        with pytest.raises(ValueError, match='no title could be resolved'):
+            ContentMdService.render(doc)
+
+    def test_empty_document_with_title_arg_returns_contentmd(self):
+        """Passing title= explicitly must succeed even when the document is
+        completely empty."""
+        doc = Document(pages=[])
+        result = ContentMdService.render(doc, title='Provided Title')
+        assert 'title: "Provided Title"' in result
+        assert '# Provided Title' in result
+
+    def test_empty_document_with_title_and_description_returns_contentmd(self):
+        """Both title= and description= passed explicitly on an empty document."""
+        doc = Document(pages=[])
+        result = ContentMdService.render(
+            doc, title='My Title', description='My description.'
+        )
+        assert 'title: "My Title"' in result
+        assert 'description: "My description."' in result
+        assert result.endswith('\n')