From fc0634c357c50f284397ac960402d64cd21c0246 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 18 Feb 2026 10:59:46 +0100 Subject: [PATCH 1/8] wip --- src/parxy_core/models/models.py | 100 ++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py index 258b847..d6bd3f0 100644 --- a/src/parxy_core/models/models.py +++ b/src/parxy_core/models/models.py @@ -155,6 +155,106 @@ def text(self, page_separator: str = '---') -> str: return '\n'.join(texts) + def contentmd( + self, + title: Optional[str] = None, + description: Optional[str] = None, + date: Optional[str] = None, + license: Optional[str] = None, + author: Optional[str] = None, + ) -> str: + """Get the document content formatted as content-md. + + Generates a content-md string: YAML frontmatter followed by Markdown. + Per the spec, all heading levels are shifted up by one so the document + title occupies the sole h1, and images use ``
`` blocks. + + Parameters + ---------- + title : str, optional + Document title. Falls back to metadata.title, then filename. + description : str, optional + Short summary (~200 characters). Required by the spec; omitted from + frontmatter when not provided. + date : str, optional + Creation/publication date in ISO 8601. Falls back to metadata dates. + license : str, optional + License name or SPDX identifier. + author : str, optional + Author name. Falls back to metadata.author. + + Returns + ------- + str + The document content formatted as content-md. + """ + resolved_title = ( + title + or (self.metadata.title if self.metadata else None) + or self.filename + or 'Untitled' + ) + resolved_date = date or ( + (self.metadata.created_at or self.metadata.updated_at) + if self.metadata + else None + ) + resolved_author = author or (self.metadata.author if self.metadata else None) + + def _yaml_str(v: str) -> str: + return '"' + v.replace('\\', '\\\\').replace('"', '\\"') + '"' + + fm = ['---', f'title: {_yaml_str(resolved_title)}'] + if description: + fm.append(f'description: {_yaml_str(description)}') + if resolved_date: + fm.append(f'date: {_yaml_str(resolved_date)}') + if license: + fm.append(f'license: {_yaml_str(license)}') + if resolved_author: + fm.append(f'author: {_yaml_str(resolved_author)}') + fm.append('---') + frontmatter = '\n'.join(fm) + + if not self.pages: + return f'{frontmatter}\n\n# {resolved_title}\n' + + parts = [f'# {resolved_title}'] + + for page in self.pages: + if not page.blocks: + if page.text.strip(): + parts.append(page.text.strip()) + continue + + for block in page.blocks: + if isinstance(block, TextBlock): + if block.category and block.category.lower() in [ + 'heading', + 'title', + 'header', + ]: + # Shift all heading levels by +1 so h1 content becomes h2 + shifted = min((block.level or 1) + 1, 6) + parts.append(f'{"#" * shifted} {block.text.strip()}') + elif block.category and block.category.lower() == 'list': + for line in block.text.splitlines(): + if line.strip(): + parts.append(f'- {line.strip()}') + else: + if block.text.strip(): + parts.append(block.text.strip()) + + elif isinstance(block, ImageBlock): + alt = block.alt_text or '' + parts.append(f'
\n{alt}\n
') + + elif isinstance(block, TableBlock): + if block.text.strip(): + parts.append(block.text.strip()) + + return f'{frontmatter}\n\n' + '\n\n'.join(parts) + '\n' + def markdown(self) -> str: """Get the document content formatted as Markdown. From 731bf6536fdc03eaf4440e3efb2cac8cefcc483e Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 18 Feb 2026 11:03:01 +0100 Subject: [PATCH 2/8] wip --- src/parxy_core/models/models.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py index d6bd3f0..ce3a48e 100644 --- a/src/parxy_core/models/models.py +++ b/src/parxy_core/models/models.py @@ -188,9 +188,30 @@ def contentmd( str The document content formatted as content-md. """ + def _guess_title_from_first_page() -> Optional[str]: + if not self.pages: + return None + first_page = self.pages[0] + if not first_page.blocks: + return None + heading_categories = {'heading', 'title', 'header'} + # Pick the highest-ranking heading (lowest level number) on the first page + candidates = [ + b + for b in first_page.blocks + if isinstance(b, TextBlock) + and b.category + and b.category.lower() in heading_categories + and b.text.strip() + ] + if not candidates: + return None + return min(candidates, key=lambda b: b.level or 1).text.strip() + resolved_title = ( title or (self.metadata.title if self.metadata else None) + or _guess_title_from_first_page() or self.filename or 'Untitled' ) From 62d81af06b6267194e9dc2906db6078be94e8de3 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 18 Feb 2026 11:07:48 +0100 Subject: [PATCH 3/8] wip --- src/parxy_core/models/models.py | 65 +++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py index ce3a48e..8fd1f65 100644 --- a/src/parxy_core/models/models.py +++ b/src/parxy_core/models/models.py @@ -194,19 +194,29 @@ def _guess_title_from_first_page() -> Optional[str]: first_page = self.pages[0] if not first_page.blocks: return None - heading_categories = {'heading', 'title', 'header'} - # Pick the highest-ranking heading (lowest level number) on the first page - candidates = [ + # Prefer an explicit doc-title block, then the highest-ranking heading + doc_title = next( + ( + b + for b in first_page.blocks + if isinstance(b, TextBlock) + and b.role == 'doc-title' + and b.text.strip() + ), + None, + ) + if doc_title: + return doc_title.text.strip() + headings = [ b for b in first_page.blocks if isinstance(b, TextBlock) - and b.category - and b.category.lower() in heading_categories + and b.role == 'heading' and b.text.strip() ] - if not candidates: + if not headings: return None - return min(candidates, key=lambda b: b.level or 1).text.strip() + return min(headings, key=lambda b: b.level or 1).text.strip() resolved_title = ( title @@ -222,12 +232,31 @@ def _guess_title_from_first_page() -> Optional[str]: ) resolved_author = author or (self.metadata.author if self.metadata else None) + def _infer_description() -> Optional[str]: + first_two_pages = self.pages[:2] + blocks = [ + b + for page in first_two_pages + if page.blocks + for b in page.blocks + if isinstance(b, TextBlock) and b.text.strip() + ] + abstract = next((b for b in blocks if b.role == 'doc-abstract'), None) + if abstract: + return abstract.text.strip() + text_blocks = [b for b in blocks if b.role != 'doc-title'] + if not text_blocks: + return None + return max(text_blocks, key=lambda b: len(b.text)).text.strip() + + resolved_description = description or _infer_description() + def _yaml_str(v: str) -> str: return '"' + v.replace('\\', '\\\\').replace('"', '\\"') + '"' fm = ['---', f'title: {_yaml_str(resolved_title)}'] - if description: - fm.append(f'description: {_yaml_str(description)}') + if resolved_description: + fm.append(f'description: {_yaml_str(resolved_description)}') if resolved_date: fm.append(f'date: {_yaml_str(resolved_date)}') if license: @@ -249,19 +278,25 @@ def _yaml_str(v: str) -> str: continue for block in page.blocks: + role = (block.role or 'generic').lower() + if isinstance(block, TextBlock): - if block.category and block.category.lower() in [ - 'heading', - 'title', - 'header', - ]: + if role == 'doc-title': + # Already rendered as the top-level # heading — skip + pass + elif role == 'heading': # Shift all heading levels by +1 so h1 content becomes h2 shifted = min((block.level or 1) + 1, 6) parts.append(f'{"#" * shifted} {block.text.strip()}') - elif block.category and block.category.lower() == 'list': + elif role in ('list', 'listitem'): for line in block.text.splitlines(): if line.strip(): parts.append(f'- {line.strip()}') + elif role == 'doc-abstract': + lang_attr = f' lang="{self.language}"' if self.language else '' + parts.append( + f'\n{block.text.strip()}\n' + ) else: if block.text.strip(): parts.append(block.text.strip()) From 89494f0fcd8e440402928377098184f19d76a3e7 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 18 Feb 2026 11:12:22 +0100 Subject: [PATCH 4/8] wip --- src/parxy_core/models/models.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py index 8fd1f65..3e58aa3 100644 --- a/src/parxy_core/models/models.py +++ b/src/parxy_core/models/models.py @@ -206,7 +206,7 @@ def _guess_title_from_first_page() -> Optional[str]: None, ) if doc_title: - return doc_title.text.strip() + return _normalize(doc_title.text) headings = [ b for b in first_page.blocks @@ -216,7 +216,7 @@ def _guess_title_from_first_page() -> Optional[str]: ] if not headings: return None - return min(headings, key=lambda b: b.level or 1).text.strip() + return _normalize(min(headings, key=lambda b: b.level or 1).text) resolved_title = ( title @@ -243,14 +243,18 @@ def _infer_description() -> Optional[str]: ] abstract = next((b for b in blocks if b.role == 'doc-abstract'), None) if abstract: - return abstract.text.strip() + return _normalize(abstract.text) text_blocks = [b for b in blocks if b.role != 'doc-title'] if not text_blocks: return None - return max(text_blocks, key=lambda b: len(b.text)).text.strip() + return _normalize(max(text_blocks, key=lambda b: len(b.text)).text) resolved_description = description or _infer_description() + def _normalize(text: str) -> str: + """Collapse runs of whitespace to a single space and strip.""" + return ' '.join(text.split()) + def _yaml_str(v: str) -> str: return '"' + v.replace('\\', '\\\\').replace('"', '\\"') + '"' @@ -274,7 +278,7 @@ def _yaml_str(v: str) -> str: for page in self.pages: if not page.blocks: if page.text.strip(): - parts.append(page.text.strip()) + parts.append(_normalize(page.text)) continue for block in page.blocks: @@ -287,19 +291,20 @@ def _yaml_str(v: str) -> str: elif role == 'heading': # Shift all heading levels by +1 so h1 content becomes h2 shifted = min((block.level or 1) + 1, 6) - parts.append(f'{"#" * shifted} {block.text.strip()}') + parts.append(f'{"#" * shifted} {_normalize(block.text)}') elif role in ('list', 'listitem'): for line in block.text.splitlines(): if line.strip(): - parts.append(f'- {line.strip()}') + parts.append(f'- {_normalize(line)}') elif role == 'doc-abstract': lang_attr = f' lang="{self.language}"' if self.language else '' parts.append( - f'\n{block.text.strip()}\n' + f'\n{_normalize(block.text)}\n' ) else: - if block.text.strip(): - parts.append(block.text.strip()) + normalized = _normalize(block.text) + if normalized: + parts.append(normalized) elif isinstance(block, ImageBlock): alt = block.alt_text or '' From b7c374287bc9ba44aebe5de6dc1333fb59f24cc9 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 18 Feb 2026 11:21:28 +0100 Subject: [PATCH 5/8] wip --- src/parxy_core/models/models.py | 146 ++--------- src/parxy_core/services/__init__.py | 3 +- src/parxy_core/services/contentmd_service.py | 240 +++++++++++++++++++ 3 files changed, 256 insertions(+), 133 deletions(-) create mode 100644 src/parxy_core/services/contentmd_service.py diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py index 3e58aa3..0bc6f1d 100644 --- a/src/parxy_core/models/models.py +++ b/src/parxy_core/models/models.py @@ -165,17 +165,16 @@ def contentmd( ) -> str: """Get the document content formatted as content-md. - Generates a content-md string: YAML frontmatter followed by Markdown. - Per the spec, all heading levels are shifted up by one so the document - title occupies the sole h1, and images use ``
`` blocks. + Delegates to :class:`~parxy_core.services.ContentMdService`. Parameters ---------- title : str, optional - Document title. Falls back to metadata.title, then filename. + Document title. Falls back to metadata.title, a heading inferred + from the first page, filename, then 'Untitled'. description : str, optional - Short summary (~200 characters). Required by the spec; omitted from - frontmatter when not provided. + Short summary (~200 characters). Falls back to a doc-abstract block, + then the longest TextBlock across the first two pages. date : str, optional Creation/publication date in ISO 8601. Falls back to metadata dates. license : str, optional @@ -188,133 +187,16 @@ def contentmd( str The document content formatted as content-md. """ - def _guess_title_from_first_page() -> Optional[str]: - if not self.pages: - return None - first_page = self.pages[0] - if not first_page.blocks: - return None - # Prefer an explicit doc-title block, then the highest-ranking heading - doc_title = next( - ( - b - for b in first_page.blocks - if isinstance(b, TextBlock) - and b.role == 'doc-title' - and b.text.strip() - ), - None, - ) - if doc_title: - return _normalize(doc_title.text) - headings = [ - b - for b in first_page.blocks - if isinstance(b, TextBlock) - and b.role == 'heading' - and b.text.strip() - ] - if not headings: - return None - return _normalize(min(headings, key=lambda b: b.level or 1).text) - - resolved_title = ( - title - or (self.metadata.title if self.metadata else None) - or _guess_title_from_first_page() - or self.filename - or 'Untitled' + from parxy_core.services.contentmd_service import ContentMdService + + return ContentMdService.render( + self, + title=title, + description=description, + date=date, + license=license, + author=author, ) - resolved_date = date or ( - (self.metadata.created_at or self.metadata.updated_at) - if self.metadata - else None - ) - resolved_author = author or (self.metadata.author if self.metadata else None) - - def _infer_description() -> Optional[str]: - first_two_pages = self.pages[:2] - blocks = [ - b - for page in first_two_pages - if page.blocks - for b in page.blocks - if isinstance(b, TextBlock) and b.text.strip() - ] - abstract = next((b for b in blocks if b.role == 'doc-abstract'), None) - if abstract: - return _normalize(abstract.text) - text_blocks = [b for b in blocks if b.role != 'doc-title'] - if not text_blocks: - return None - return _normalize(max(text_blocks, key=lambda b: len(b.text)).text) - - resolved_description = description or _infer_description() - - def _normalize(text: str) -> str: - """Collapse runs of whitespace to a single space and strip.""" - return ' '.join(text.split()) - - def _yaml_str(v: str) -> str: - return '"' + v.replace('\\', '\\\\').replace('"', '\\"') + '"' - - fm = ['---', f'title: {_yaml_str(resolved_title)}'] - if resolved_description: - fm.append(f'description: {_yaml_str(resolved_description)}') - if resolved_date: - fm.append(f'date: {_yaml_str(resolved_date)}') - if license: - fm.append(f'license: {_yaml_str(license)}') - if resolved_author: - fm.append(f'author: {_yaml_str(resolved_author)}') - fm.append('---') - frontmatter = '\n'.join(fm) - - if not self.pages: - return f'{frontmatter}\n\n# {resolved_title}\n' - - parts = [f'# {resolved_title}'] - - for page in self.pages: - if not page.blocks: - if page.text.strip(): - parts.append(_normalize(page.text)) - continue - - for block in page.blocks: - role = (block.role or 'generic').lower() - - if isinstance(block, TextBlock): - if role == 'doc-title': - # Already rendered as the top-level # heading — skip - pass - elif role == 'heading': - # Shift all heading levels by +1 so h1 content becomes h2 - shifted = min((block.level or 1) + 1, 6) - parts.append(f'{"#" * shifted} {_normalize(block.text)}') - elif role in ('list', 'listitem'): - for line in block.text.splitlines(): - if line.strip(): - parts.append(f'- {_normalize(line)}') - elif role == 'doc-abstract': - lang_attr = f' lang="{self.language}"' if self.language else '' - parts.append( - f'\n{_normalize(block.text)}\n' - ) - else: - normalized = _normalize(block.text) - if normalized: - parts.append(normalized) - - elif isinstance(block, ImageBlock): - alt = block.alt_text or '' - parts.append(f'
\n{alt}\n
') - - elif isinstance(block, TableBlock): - if block.text.strip(): - parts.append(block.text.strip()) - - return f'{frontmatter}\n\n' + '\n\n'.join(parts) + '\n' def markdown(self) -> str: """Get the document content formatted as Markdown. diff --git a/src/parxy_core/services/__init__.py b/src/parxy_core/services/__init__.py index 5071d08..5342a63 100644 --- a/src/parxy_core/services/__init__.py +++ b/src/parxy_core/services/__init__.py @@ -1,5 +1,6 @@ """Services module for parxy_core.""" +from parxy_core.services.contentmd_service import ContentMdService from parxy_core.services.pdf_service import PdfService -__all__ = ['PdfService'] +__all__ = ['ContentMdService', 'PdfService'] diff --git a/src/parxy_core/services/contentmd_service.py b/src/parxy_core/services/contentmd_service.py new file mode 100644 index 0000000..59a03db --- /dev/null +++ b/src/parxy_core/services/contentmd_service.py @@ -0,0 +1,240 @@ +"""Service for rendering documents as content-md.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from parxy_core.models.models import Document + + +class ContentMdService: + """Render a :class:`Document` as a content-md string. + + content-md is an open specification for optimised content exchange: a YAML + frontmatter section followed by CommonMark / GitHub-flavoured Markdown. + All methods are static; the class acts as a namespace. + """ + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + @staticmethod + def _normalize(text: str) -> str: + """Collapse any run of whitespace to a single space and strip.""" + return ' '.join(text.split()) + + @staticmethod + def _yaml_str(value: str) -> str: + """Wrap *value* in double quotes and escape internal quotes/backslashes.""" + return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"' + + @staticmethod + def _guess_title(document: Document) -> Optional[str]: + """Infer a title from the first page blocks. + + Prefers an explicit ``doc-title`` role; falls back to the + highest-ranking (lowest level number) ``heading`` block. + """ + from parxy_core.models.models import TextBlock + + if not document.pages: + return None + first_page = document.pages[0] + if not first_page.blocks: + return None + + doc_title = next( + ( + b + for b in first_page.blocks + if isinstance(b, TextBlock) + and b.role == 'doc-title' + and b.text.strip() + ), + None, + ) + if doc_title: + return ContentMdService._normalize(doc_title.text) + + headings = [ + b + for b in first_page.blocks + if isinstance(b, TextBlock) and b.role == 'heading' and b.text.strip() + ] + if not headings: + return None + return ContentMdService._normalize( + min(headings, key=lambda b: b.level or 1).text + ) + + @staticmethod + def _infer_description(document: Document) -> Optional[str]: + """Infer a description from document content. + + Uses the ``doc-abstract`` block when present, otherwise the longest + :class:`TextBlock` across the first two pages. + """ + from parxy_core.models.models import TextBlock + + blocks = [ + b + for page in document.pages[:2] + if page.blocks + for b in page.blocks + if isinstance(b, TextBlock) and b.text.strip() + ] + + abstract = next((b for b in blocks if b.role == 'doc-abstract'), None) + if abstract: + return ContentMdService._normalize(abstract.text) + + text_blocks = [b for b in blocks if b.role != 'doc-title'] + if not text_blocks: + return None + return ContentMdService._normalize( + max(text_blocks, key=lambda b: len(b.text)).text + ) + + @staticmethod + def _build_frontmatter( + title: str, + description: Optional[str], + date: Optional[str], + license: Optional[str], + author: Optional[str], + ) -> str: + ys = ContentMdService._yaml_str + lines = ['---', f'title: {ys(title)}'] + if description: + lines.append(f'description: {ys(description)}') + if date: + lines.append(f'date: {ys(date)}') + if license: + lines.append(f'license: {ys(license)}') + if author: + lines.append(f'author: {ys(author)}') + lines.append('---') + return '\n'.join(lines) + + @staticmethod + def _build_body(document: Document, title: str) -> str: + from parxy_core.models.models import ImageBlock, TableBlock, TextBlock + + normalize = ContentMdService._normalize + parts = [f'# {title}'] + + for page in document.pages: + if not page.blocks: + if page.text.strip(): + parts.append(normalize(page.text)) + continue + + for block in page.blocks: + role = (block.role or 'generic').lower() + + if isinstance(block, TextBlock): + if role == 'doc-title': + # Already the top-level h1 — skip to avoid duplication + pass + elif role == 'heading': + # Shift levels +1: h1 content → h2, per content-md spec + shifted = min((block.level or 1) + 1, 6) + parts.append(f'{"#" * shifted} {normalize(block.text)}') + elif role in ('list', 'listitem'): + for line in block.text.splitlines(): + if line.strip(): + parts.append(f'- {normalize(line)}') + elif role == 'doc-abstract': + lang_attr = ( + f' lang="{document.language}"' if document.language else '' + ) + parts.append( + f'\n{normalize(block.text)}\n' + ) + else: + normalized = normalize(block.text) + if normalized: + parts.append(normalized) + + elif isinstance(block, ImageBlock): + parts.append(f'
\n{block.alt_text or ""}\n
') + + elif isinstance(block, TableBlock): + # Preserve table whitespace (column alignment, padding) + if block.text.strip(): + parts.append(block.text.strip()) + + return '\n\n'.join(parts) + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + @staticmethod + def render( + document: Document, + title: Optional[str] = None, + description: Optional[str] = None, + date: Optional[str] = None, + license: Optional[str] = None, + author: Optional[str] = None, + ) -> str: + """Render *document* as a content-md string. + + Parameters + ---------- + document: + The document to render. + title: + Document title. Falls back to ``metadata.title``, a heading + inferred from the first page, ``filename``, then ``'Untitled'``. + description: + Short summary (~200 characters). Falls back to a ``doc-abstract`` + block, then the longest :class:`TextBlock` in the first two pages. + date: + Creation/publication date in ISO 8601. Falls back to + ``metadata.created_at`` / ``metadata.updated_at``. + license: + License name or SPDX identifier. + author: + Author name. Falls back to ``metadata.author``. + + Returns + ------- + str + The document formatted as content-md. + """ + resolved_title = ( + title + or (document.metadata.title if document.metadata else None) + or ContentMdService._guess_title(document) + or document.filename + or 'Untitled' + ) + resolved_description = description or ContentMdService._infer_description( + document + ) + resolved_date = date or ( + (document.metadata.created_at or document.metadata.updated_at) + if document.metadata + else None + ) + resolved_author = ( + author or (document.metadata.author if document.metadata else None) + ) + + frontmatter = ContentMdService._build_frontmatter( + title=resolved_title, + description=resolved_description, + date=resolved_date, + license=license, + author=resolved_author, + ) + + if not document.pages: + return f'{frontmatter}\n\n# {resolved_title}\n' + + body = ContentMdService._build_body(document, resolved_title) + return f'{frontmatter}\n\n{body}\n' From be15068cff36076cb41ce4d89371ba407dc47e53 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 18 Feb 2026 11:27:07 +0100 Subject: [PATCH 6/8] wip --- src/parxy_core/services/contentmd_service.py | 8 +- tests/services/test_contentmd_service.py | 456 +++++++++++++++++++ 2 files changed, 459 insertions(+), 5 deletions(-) create mode 100644 tests/services/test_contentmd_service.py diff --git a/src/parxy_core/services/contentmd_service.py b/src/parxy_core/services/contentmd_service.py index 59a03db..9383151 100644 --- a/src/parxy_core/services/contentmd_service.py +++ b/src/parxy_core/services/contentmd_service.py @@ -49,9 +49,7 @@ def _guess_title(document: Document) -> Optional[str]: ( b for b in first_page.blocks - if isinstance(b, TextBlock) - and b.role == 'doc-title' - and b.text.strip() + if isinstance(b, TextBlock) and b.role == 'doc-title' and b.text.strip() ), None, ) @@ -221,8 +219,8 @@ def render( if document.metadata else None ) - resolved_author = ( - author or (document.metadata.author if document.metadata else None) + resolved_author = author or ( + document.metadata.author if document.metadata else None ) frontmatter = ContentMdService._build_frontmatter( diff --git a/tests/services/test_contentmd_service.py b/tests/services/test_contentmd_service.py new file mode 100644 index 0000000..c6e5782 --- /dev/null +++ b/tests/services/test_contentmd_service.py @@ -0,0 +1,456 @@ +"""Test suite for ContentMdService.""" + +import pytest + +from parxy_core.models.models import ( + Document, + ImageBlock, + Metadata, + Page, + TableBlock, + TextBlock, +) +from parxy_core.services.contentmd_service import ContentMdService + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def make_page( + number: int = 1, + text: str = '', + blocks: list | None = None, +) -> Page: + return Page(number=number, text=text, blocks=blocks) + + +def make_text_block( + text: str, + role: str = 'generic', + level: int | None = None, +) -> TextBlock: + return TextBlock(type='text', text=text, role=role, level=level) + + +def make_image_block( + alt_text: str | None = None, name: str | None = None +) -> ImageBlock: + return ImageBlock(type='image', alt_text=alt_text, name=name) + + +def make_table_block(text: str) -> TableBlock: + return TableBlock(type='table', text=text) + + +def make_doc( + pages: list[Page], + metadata: Metadata | None = None, + filename: str | None = None, + language: str | None = None, +) -> Document: + return Document( + pages=pages, + metadata=metadata, + filename=filename, + language=language, + ) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def minimal_doc(): + """Document with a single page, no blocks, no metadata.""" + return make_doc(pages=[make_page(text='Hello world')]) + + +@pytest.fixture +def metadata_doc(): + """Document with full metadata and one plain paragraph block.""" + meta = Metadata( + title='Metadata Title', + author='Jane Doe', + created_at='2025-01-15', + ) + page = make_page( + text='Paragraph text.', + blocks=[make_text_block('Paragraph text.')], + ) + return make_doc(pages=[page], metadata=meta, filename='report.pdf') + + +@pytest.fixture +def all_blocks_doc(): + """Document whose first page contains every supported block type.""" + blocks = [ + make_text_block('My Document', role='doc-title'), + make_text_block('Introduction', role='heading', level=1), + make_text_block('Background', role='heading', level=2), + make_text_block('First item\nSecond item', role='list'), + make_text_block('A plain paragraph.', role='paragraph'), + make_text_block('A brief overview.', role='doc-abstract'), + make_image_block(alt_text='A sunset over mountains', name='sunset.jpg'), + make_table_block('| Col A | Col B |\n| ----- | ----- |\n| 1 | 2 |'), + ] + page = make_page(text='My Document', blocks=blocks) + return make_doc(pages=[page], language='en') + + +# --------------------------------------------------------------------------- +# Frontmatter +# --------------------------------------------------------------------------- + + +class TestFrontmatter: + def test_frontmatter_delimiters_present(self, minimal_doc): + result = ContentMdService.render(minimal_doc, title='T', description='D') + lines = result.splitlines() + assert lines[0] == '---' + closing = lines.index('---', 1) + assert closing > 0 + + def test_explicit_title_in_frontmatter(self, minimal_doc): + result = ContentMdService.render(minimal_doc, title='Explicit Title') + assert 'title: "Explicit Title"' in result + + def test_title_from_metadata(self, metadata_doc): + result = ContentMdService.render(metadata_doc) + assert 'title: "Metadata Title"' in result + + def test_title_from_doc_title_role_preferred_over_heading(self): + blocks = [ + make_text_block('Real Title', role='doc-title'), + make_text_block('Section One', role='heading', level=1), + ] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc) + assert 'title: "Real Title"' in result + + def test_title_from_heading_when_no_doc_title(self): + blocks = [ + make_text_block('Section One', role='heading', level=2), + make_text_block('Section Two', role='heading', level=1), + ] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc) + # Level 1 heading wins (lowest level = highest rank) + assert 'title: "Section Two"' in result + + def test_title_from_filename_when_no_headings(self): + doc = make_doc( + pages=[make_page(text='body text')], + filename='my-report.pdf', + ) + result = ContentMdService.render(doc) + assert 'title: "my-report.pdf"' in result + + def test_title_fallback_to_untitled(self): + doc = make_doc(pages=[make_page(text='body text')]) + result = ContentMdService.render(doc) + assert 'title: "Untitled"' in result + + def test_description_from_explicit_param(self, minimal_doc): + result = ContentMdService.render( + minimal_doc, title='T', description='My summary.' + ) + assert 'description: "My summary."' in result + + def test_description_from_doc_abstract_block(self): + blocks = [ + make_text_block('Abstract content here.', role='doc-abstract'), + make_text_block( + 'A much longer paragraph that should not be picked.', role='paragraph' + ), + ] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc) + assert 'description: "Abstract content here."' in result + + def test_description_from_longest_textblock_when_no_abstract(self): + blocks = [ + make_text_block('Short.', role='paragraph'), + make_text_block( + 'This is a considerably longer paragraph block.', role='paragraph' + ), + ] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc) + assert 'description: "This is a considerably longer paragraph block."' in result + + def test_description_excludes_doc_title_from_longest_candidate(self): + blocks = [ + make_text_block( + 'This is a very long doc-title block text.', role='doc-title' + ), + make_text_block('Shorter paragraph.', role='paragraph'), + ] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc) + assert 'description: "Shorter paragraph."' in result + + def test_description_searches_first_two_pages(self): + page1 = make_page(number=1, text='', blocks=[make_text_block('Page 1 text.')]) + page2 = make_page( + number=2, + text='', + blocks=[make_text_block('Page 2 has a longer text block.')], + ) + page3 = make_page( + number=3, + text='', + blocks=[make_text_block('Page 3 has the longest block of all by far.')], + ) + doc = make_doc(pages=[page1, page2, page3]) + result = ContentMdService.render(doc) + # Page 3 is out of the two-page window + assert 'Page 3' not in result.split('---')[1] # not in frontmatter + + def test_date_from_metadata_created_at(self, metadata_doc): + result = ContentMdService.render(metadata_doc) + assert 'date: "2025-01-15"' in result + + def test_date_from_metadata_updated_at_when_no_created_at(self): + meta = Metadata(updated_at='2025-06-01') + doc = make_doc(pages=[make_page(text='')], metadata=meta) + result = ContentMdService.render(doc) + assert 'date: "2025-06-01"' in result + + def test_explicit_date_overrides_metadata(self, metadata_doc): + result = ContentMdService.render(metadata_doc, date='2026-01-01') + assert 'date: "2026-01-01"' in result + assert '2025-01-15' not in result + + def test_author_from_metadata(self, metadata_doc): + result = ContentMdService.render(metadata_doc) + assert 'author: "Jane Doe"' in result + + def test_optional_fields_omitted_when_absent(self, minimal_doc): + result = ContentMdService.render(minimal_doc, title='T') + assert 'description:' not in result + assert 'date:' not in result + assert 'license:' not in result + assert 'author:' not in result + + def test_license_included_when_provided(self, minimal_doc): + result = ContentMdService.render(minimal_doc, title='T', license='CC-BY-4.0') + assert 'license: "CC-BY-4.0"' in result + + def test_yaml_values_escaped(self, minimal_doc): + result = ContentMdService.render( + minimal_doc, + title='Title with "quotes"', + description='Back\\slash', + ) + assert r'title: "Title with \"quotes\""' in result + assert r'description: "Back\\slash"' in result + + +# --------------------------------------------------------------------------- +# Body – block rendering +# --------------------------------------------------------------------------- + + +class TestBodyBlocks: + def test_body_starts_with_h1_title(self, metadata_doc): + result = ContentMdService.render(metadata_doc) + body = result.split('---\n', 2)[-1] + assert body.lstrip().startswith('# Metadata Title') + + def test_doc_title_block_skipped_in_body(self, all_blocks_doc): + result = ContentMdService.render(all_blocks_doc) + body = result.split('---\n', 2)[-1] + # Should appear exactly once (as the h1), not twice + assert body.count('My Document') == 1 + + def test_heading_level_shifted_by_one(self): + blocks = [make_text_block('Section', role='heading', level=1)] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '## Section' in result + + def test_heading_level_2_becomes_3(self): + blocks = [make_text_block('Subsection', role='heading', level=2)] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '### Subsection' in result + + def test_heading_without_level_defaults_to_h2(self): + blocks = [make_text_block('Heading', role='heading')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '## Heading' in result + + def test_heading_level_capped_at_6(self): + blocks = [make_text_block('Deep', role='heading', level=6)] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '###### Deep' in result + assert '####### Deep' not in result + + def test_list_role_rendered_as_bullets(self): + blocks = [make_text_block('Alpha\nBeta\nGamma', role='list')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '- Alpha' in result + assert '- Beta' in result + assert '- Gamma' in result + + def test_listitem_role_rendered_as_bullet(self): + blocks = [make_text_block('Single item', role='listitem')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '- Single item' in result + + def test_doc_abstract_rendered_as_abstract_tag(self, all_blocks_doc): + result = ContentMdService.render(all_blocks_doc) + assert '' in result + assert 'A brief overview.' in result + assert '' in result + + def test_doc_abstract_without_language_omits_lang_attr(self): + blocks = [make_text_block('Summary.', role='doc-abstract')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '\nSummary.\n' in result + + def test_generic_textblock_rendered_as_paragraph(self): + blocks = [make_text_block('Plain paragraph text.', role='generic')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert 'Plain paragraph text.' in result + + def test_empty_textblock_not_rendered(self): + blocks = [make_text_block(' ', role='paragraph')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + # Body should only contain the h1 line + body = result.split('---\n', 2)[-1].strip() + assert body == '# T' + + def test_image_block_rendered_as_figure(self): + blocks = [make_image_block(alt_text='A sunset over mountains')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '
\nA sunset over mountains\n
' in result + + def test_image_block_without_alt_text(self): + blocks = [make_image_block()] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '
\n\n
' in result + + def test_table_block_rendered_as_is(self): + table_text = '| Col A | Col B |\n| ----- | ----- |\n| 1 | 2 |' + blocks = [make_table_block(table_text)] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert table_text in result + + def test_page_without_blocks_uses_page_text(self): + page = make_page(text='Fallback page text', blocks=None) + doc = make_doc(pages=[page]) + result = ContentMdService.render(doc, title='T') + assert 'Fallback page text' in result + + def test_empty_page_text_not_rendered(self): + page = make_page(text=' ', blocks=None) + doc = make_doc(pages=[page]) + result = ContentMdService.render(doc, title='T') + body = result.split('---\n', 2)[-1].strip() + assert body == '# T' + + +# --------------------------------------------------------------------------- +# Whitespace normalisation +# --------------------------------------------------------------------------- + + +class TestWhitespaceNormalisation: + def test_multiple_spaces_in_paragraph_collapsed(self): + blocks = [make_text_block('Word1 Word2 Word3')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert 'Word1 Word2 Word3' in result + + def test_tabs_in_paragraph_collapsed(self): + blocks = [make_text_block('Word1\t\tWord2')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert 'Word1 Word2' in result + + def test_whitespace_in_heading_collapsed(self): + blocks = [make_text_block('My Section', role='heading', level=1)] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '## My Section' in result + + def test_whitespace_in_title_collapsed(self): + blocks = [make_text_block(' My Title ', role='doc-title')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc) + assert 'title: "My Title"' in result + + def test_whitespace_in_description_collapsed(self): + blocks = [make_text_block('Summary with gaps.', role='doc-abstract')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert 'description: "Summary with gaps."' in result + + def test_table_whitespace_preserved(self): + table_text = '| Col A | Col B |\n| ----- | ----- |' + blocks = [make_table_block(table_text)] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert '| Col A | Col B |' in result + + +# --------------------------------------------------------------------------- +# Output structure +# --------------------------------------------------------------------------- + + +class TestOutputStructure: + def test_result_ends_with_newline(self, minimal_doc): + result = ContentMdService.render(minimal_doc, title='T') + assert result.endswith('\n') + + def test_empty_pages_list_returns_frontmatter_and_title(self): + doc = Document(pages=[]) + result = ContentMdService.render(doc, title='Empty') + assert 'title: "Empty"' in result + assert '# Empty' in result + + def test_blocks_separated_by_blank_line(self): + blocks = [ + make_text_block('First paragraph.'), + make_text_block('Second paragraph.'), + ] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc, title='T') + assert 'First paragraph.\n\nSecond paragraph.' in result + + def test_multipage_document_renders_all_pages(self): + page1 = make_page( + number=1, + text='', + blocks=[make_text_block('Page one content.')], + ) + page2 = make_page( + number=2, + text='', + blocks=[make_text_block('Page two content.')], + ) + doc = make_doc(pages=[page1, page2]) + result = ContentMdService.render(doc, title='T') + assert 'Page one content.' in result + assert 'Page two content.' in result + + def test_render_delegates_from_document_method(self, metadata_doc): + via_service = ContentMdService.render(metadata_doc) + via_method = metadata_doc.contentmd() + assert via_service == via_method From 5d7fd9c9f4bc758bce335cecd06cc9123174fdab Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 18 Feb 2026 11:35:21 +0100 Subject: [PATCH 7/8] wip --- src/parxy_core/services/contentmd_service.py | 34 ++++++++++++--- tests/services/test_contentmd_service.py | 46 ++++++++++++++------ 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/src/parxy_core/services/contentmd_service.py b/src/parxy_core/services/contentmd_service.py index 9383151..4531054 100644 --- a/src/parxy_core/services/contentmd_service.py +++ b/src/parxy_core/services/contentmd_service.py @@ -20,6 +20,20 @@ class ContentMdService: # Private helpers # ------------------------------------------------------------------ + # Roles that provide structure or navigation rather than readable body text + _STRUCTURAL_ROLES: frozenset[str] = frozenset( + { + 'heading', + 'doc-title', + 'doc-subtitle', + 'doc-abstract', + 'doc-toc', + 'doc-pageheader', + 'doc-pagefooter', + 'caption', + } + ) + @staticmethod def _normalize(text: str) -> str: """Collapse any run of whitespace to a single space and strip.""" @@ -71,8 +85,10 @@ def _guess_title(document: Document) -> Optional[str]: def _infer_description(document: Document) -> Optional[str]: """Infer a description from document content. - Uses the ``doc-abstract`` block when present, otherwise the longest - :class:`TextBlock` across the first two pages. + Uses the ``doc-abstract`` block when present. Otherwise concatenates + the first five body :class:`TextBlock` objects (non-structural, across + the first two pages), normalises whitespace, and returns at most 200 + characters. """ from parxy_core.models.models import TextBlock @@ -88,12 +104,16 @@ def _infer_description(document: Document) -> Optional[str]: if abstract: return ContentMdService._normalize(abstract.text) - text_blocks = [b for b in blocks if b.role != 'doc-title'] - if not text_blocks: + body_blocks = [ + b + for b in blocks + if (b.role or 'generic') not in ContentMdService._STRUCTURAL_ROLES + ] + if not body_blocks: return None - return ContentMdService._normalize( - max(text_blocks, key=lambda b: len(b.text)).text - ) + + combined = ' '.join(b.text for b in body_blocks[:5]) + return ContentMdService._normalize(combined)[:200] @staticmethod def _build_frontmatter( diff --git a/tests/services/test_contentmd_service.py b/tests/services/test_contentmd_service.py index c6e5782..71ba804 100644 --- a/tests/services/test_contentmd_service.py +++ b/tests/services/test_contentmd_service.py @@ -171,27 +171,45 @@ def test_description_from_doc_abstract_block(self): result = ContentMdService.render(doc) assert 'description: "Abstract content here."' in result - def test_description_from_longest_textblock_when_no_abstract(self): - blocks = [ - make_text_block('Short.', role='paragraph'), - make_text_block( - 'This is a considerably longer paragraph block.', role='paragraph' - ), - ] + def test_description_from_first_five_body_blocks(self): + blocks = [make_text_block(f'Sentence {i}.', role='paragraph') for i in range(7)] doc = make_doc(pages=[make_page(text='', blocks=blocks)]) result = ContentMdService.render(doc) - assert 'description: "This is a considerably longer paragraph block."' in result + # Only the first five contribute; the sixth and seventh are ignored + assert 'Sentence 5' not in result.split('---\n')[1].split('\n')[0] + assert 'Sentence 0' in result - def test_description_excludes_doc_title_from_longest_candidate(self): + def test_description_excludes_structural_roles(self): blocks = [ - make_text_block( - 'This is a very long doc-title block text.', role='doc-title' - ), - make_text_block('Shorter paragraph.', role='paragraph'), + make_text_block('Table of contents text.', role='doc-toc'), + make_text_block('Page header text.', role='doc-pageheader'), + make_text_block('A heading block.', role='heading'), + make_text_block('Body content.', role='paragraph'), ] doc = make_doc(pages=[make_page(text='', blocks=blocks)]) result = ContentMdService.render(doc) - assert 'description: "Shorter paragraph."' in result + assert 'description: "Body content."' in result + + def test_description_truncated_to_200_chars(self): + long_text = 'word ' * 60 # well over 200 chars + blocks = [make_text_block(long_text, role='paragraph')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc) + fm_end = result.index('---\n', 4) + frontmatter = result[:fm_end] + desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:')) + # Strip the YAML quoting to measure the actual value length + value = desc_line[len('description: "'):-1] + assert len(value) <= 200 + + def test_description_contains_no_newlines(self): + blocks = [make_text_block('Line one.\nLine two.\nLine three.', role='paragraph')] + doc = make_doc(pages=[make_page(text='', blocks=blocks)]) + result = ContentMdService.render(doc) + fm_end = result.index('---\n', 4) + frontmatter = result[:fm_end] + desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:')) + assert '\n' not in desc_line def test_description_searches_first_two_pages(self): page1 = make_page(number=1, text='', blocks=[make_text_block('Page 1 text.')]) From 57476f3df634410a8c1e3296490da0ae79f3bfea Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Wed, 18 Feb 2026 11:37:40 +0100 Subject: [PATCH 8/8] wip --- src/parxy_core/services/contentmd_service.py | 12 +++-- tests/services/test_contentmd_service.py | 57 +++++++++++++++----- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/src/parxy_core/services/contentmd_service.py b/src/parxy_core/services/contentmd_service.py index 4531054..3836c06 100644 --- a/src/parxy_core/services/contentmd_service.py +++ b/src/parxy_core/services/contentmd_service.py @@ -207,10 +207,11 @@ def render( The document to render. title: Document title. Falls back to ``metadata.title``, a heading - inferred from the first page, ``filename``, then ``'Untitled'``. + inferred from the first page, then ``filename``. Raises + ``ValueError`` if no title can be resolved. description: Short summary (~200 characters). Falls back to a ``doc-abstract`` - block, then the longest :class:`TextBlock` in the first two pages. + block, then the first five body blocks in the first two pages. date: Creation/publication date in ISO 8601. Falls back to ``metadata.created_at`` / ``metadata.updated_at``. @@ -229,8 +230,13 @@ def render( or (document.metadata.title if document.metadata else None) or ContentMdService._guess_title(document) or document.filename - or 'Untitled' ) + if not resolved_title: + raise ValueError( + 'Cannot render content-md: no title could be resolved. ' + 'Provide a title via metadata, a doc-title/heading block, ' + 'a filename, or pass title= explicitly.' + ) resolved_description = description or ContentMdService._infer_description( document ) diff --git a/tests/services/test_contentmd_service.py b/tests/services/test_contentmd_service.py index 71ba804..b817fe0 100644 --- a/tests/services/test_contentmd_service.py +++ b/tests/services/test_contentmd_service.py @@ -149,10 +149,10 @@ def test_title_from_filename_when_no_headings(self): result = ContentMdService.render(doc) assert 'title: "my-report.pdf"' in result - def test_title_fallback_to_untitled(self): + def test_title_raises_when_unresolvable(self): doc = make_doc(pages=[make_page(text='body text')]) - result = ContentMdService.render(doc) - assert 'title: "Untitled"' in result + with pytest.raises(ValueError, match='no title could be resolved'): + ContentMdService.render(doc) def test_description_from_explicit_param(self, minimal_doc): result = ContentMdService.render( @@ -168,13 +168,13 @@ def test_description_from_doc_abstract_block(self): ), ] doc = make_doc(pages=[make_page(text='', blocks=blocks)]) - result = ContentMdService.render(doc) + result = ContentMdService.render(doc, title='T') assert 'description: "Abstract content here."' in result def test_description_from_first_five_body_blocks(self): blocks = [make_text_block(f'Sentence {i}.', role='paragraph') for i in range(7)] doc = make_doc(pages=[make_page(text='', blocks=blocks)]) - result = ContentMdService.render(doc) + result = ContentMdService.render(doc, title='T') # Only the first five contribute; the sixth and seventh are ignored assert 'Sentence 5' not in result.split('---\n')[1].split('\n')[0] assert 'Sentence 0' in result @@ -194,21 +194,27 @@ def test_description_truncated_to_200_chars(self): long_text = 'word ' * 60 # well over 200 chars blocks = [make_text_block(long_text, role='paragraph')] doc = make_doc(pages=[make_page(text='', blocks=blocks)]) - result = ContentMdService.render(doc) + result = ContentMdService.render(doc, title='T') fm_end = result.index('---\n', 4) frontmatter = result[:fm_end] - desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:')) + desc_line = next( + l for l in frontmatter.splitlines() if l.startswith('description:') + ) # Strip the YAML quoting to measure the actual value length - value = desc_line[len('description: "'):-1] + value = desc_line[len('description: "') : -1] assert len(value) <= 200 def test_description_contains_no_newlines(self): - blocks = [make_text_block('Line one.\nLine two.\nLine three.', role='paragraph')] + blocks = [ + make_text_block('Line one.\nLine two.\nLine three.', role='paragraph') + ] doc = make_doc(pages=[make_page(text='', blocks=blocks)]) - result = ContentMdService.render(doc) + result = ContentMdService.render(doc, title='T') fm_end = result.index('---\n', 4) frontmatter = result[:fm_end] - desc_line = next(l for l in frontmatter.splitlines() if l.startswith('description:')) + desc_line = next( + l for l in frontmatter.splitlines() if l.startswith('description:') + ) assert '\n' not in desc_line def test_description_searches_first_two_pages(self): @@ -224,7 +230,7 @@ def test_description_searches_first_two_pages(self): blocks=[make_text_block('Page 3 has the longest block of all by far.')], ) doc = make_doc(pages=[page1, page2, page3]) - result = ContentMdService.render(doc) + result = ContentMdService.render(doc, title='T') # Page 3 is out of the two-page window assert 'Page 3' not in result.split('---')[1] # not in frontmatter @@ -235,7 +241,7 @@ def test_date_from_metadata_created_at(self, metadata_doc): def test_date_from_metadata_updated_at_when_no_created_at(self): meta = Metadata(updated_at='2025-06-01') doc = make_doc(pages=[make_page(text='')], metadata=meta) - result = ContentMdService.render(doc) + result = ContentMdService.render(doc, title='T') assert 'date: "2025-06-01"' in result def test_explicit_date_overrides_metadata(self, metadata_doc): @@ -472,3 +478,28 @@ def test_render_delegates_from_document_method(self, metadata_doc): via_service = ContentMdService.render(metadata_doc) via_method = metadata_doc.contentmd() assert via_service == via_method + + def test_empty_document_without_args_raises(self): + """A document with no metadata, no blocks, no filename, and no user + arguments cannot satisfy the required title constraint.""" + doc = Document(pages=[]) + with pytest.raises(ValueError, match='no title could be resolved'): + ContentMdService.render(doc) + + def test_empty_document_with_title_arg_returns_contentmd(self): + """Passing title= explicitly must succeed even when the document is + completely empty.""" + doc = Document(pages=[]) + result = ContentMdService.render(doc, title='Provided Title') + assert 'title: "Provided Title"' in result + assert '# Provided Title' in result + + def test_empty_document_with_title_and_description_returns_contentmd(self): + """Both title= and description= passed explicitly on an empty document.""" + doc = Document(pages=[]) + result = ContentMdService.render( + doc, title='My Title', description='My description.' + ) + assert 'title: "My Title"' in result + assert 'description: "My description."' in result + assert result.endswith('\n')