diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py
index 258b847..0bc6f1d 100644
--- a/src/parxy_core/models/models.py
+++ b/src/parxy_core/models/models.py
@@ -155,6 +155,49 @@ def text(self, page_separator: str = '---') -> str:
return '\n'.join(texts)
+ def contentmd(
+ self,
+ title: Optional[str] = None,
+ description: Optional[str] = None,
+ date: Optional[str] = None,
+ license: Optional[str] = None,
+ author: Optional[str] = None,
+ ) -> str:
+ """Get the document content formatted as content-md.
+
+ Delegates to :class:`~parxy_core.services.ContentMdService`.
+
+ Parameters
+ ----------
+ title : str, optional
+ Document title. Falls back to metadata.title, a heading inferred
+ from the first page, filename, then 'Untitled'.
+ description : str, optional
+ Short summary (~200 characters). Falls back to a doc-abstract block,
+ then the longest TextBlock across the first two pages.
+ date : str, optional
+ Creation/publication date in ISO 8601. Falls back to metadata dates.
+ license : str, optional
+ License name or SPDX identifier.
+ author : str, optional
+ Author name. Falls back to metadata.author.
+
+ Returns
+ -------
+ str
+ The document content formatted as content-md.
+ """
+ from parxy_core.services.contentmd_service import ContentMdService
+
+ return ContentMdService.render(
+ self,
+ title=title,
+ description=description,
+ date=date,
+ license=license,
+ author=author,
+ )
+
def markdown(self) -> str:
"""Get the document content formatted as Markdown.
diff --git a/src/parxy_core/services/__init__.py b/src/parxy_core/services/__init__.py
index 5071d08..5342a63 100644
--- a/src/parxy_core/services/__init__.py
+++ b/src/parxy_core/services/__init__.py
@@ -1,5 +1,6 @@
"""Services module for parxy_core."""
+from parxy_core.services.contentmd_service import ContentMdService
from parxy_core.services.pdf_service import PdfService
-__all__ = ['PdfService']
+__all__ = ['ContentMdService', 'PdfService']
diff --git a/src/parxy_core/services/contentmd_service.py b/src/parxy_core/services/contentmd_service.py
new file mode 100644
index 0000000..3836c06
--- /dev/null
+++ b/src/parxy_core/services/contentmd_service.py
@@ -0,0 +1,264 @@
+"""Service for rendering documents as content-md."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+ from parxy_core.models.models import Document
+
+
+class ContentMdService:
+ """Render a :class:`Document` as a content-md string.
+
+ content-md is an open specification for optimised content exchange: a YAML
+ frontmatter section followed by CommonMark / GitHub-flavoured Markdown.
+ All methods are static; the class acts as a namespace.
+ """
+
+ # ------------------------------------------------------------------
+ # Private helpers
+ # ------------------------------------------------------------------
+
+ # Roles that provide structure or navigation rather than readable body text
+ _STRUCTURAL_ROLES: frozenset[str] = frozenset(
+ {
+ 'heading',
+ 'doc-title',
+ 'doc-subtitle',
+ 'doc-abstract',
+ 'doc-toc',
+ 'doc-pageheader',
+ 'doc-pagefooter',
+ 'caption',
+ }
+ )
+
+ @staticmethod
+ def _normalize(text: str) -> str:
+ """Collapse any run of whitespace to a single space and strip."""
+ return ' '.join(text.split())
+
+ @staticmethod
+ def _yaml_str(value: str) -> str:
+ """Wrap *value* in double quotes and escape internal quotes/backslashes."""
+ return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'
+
+ @staticmethod
+ def _guess_title(document: Document) -> Optional[str]:
+ """Infer a title from the first page blocks.
+
+ Prefers an explicit ``doc-title`` role; falls back to the
+ highest-ranking (lowest level number) ``heading`` block.
+ """
+ from parxy_core.models.models import TextBlock
+
+ if not document.pages:
+ return None
+ first_page = document.pages[0]
+ if not first_page.blocks:
+ return None
+
+ doc_title = next(
+ (
+ b
+ for b in first_page.blocks
+ if isinstance(b, TextBlock) and b.role == 'doc-title' and b.text.strip()
+ ),
+ None,
+ )
+ if doc_title:
+ return ContentMdService._normalize(doc_title.text)
+
+ headings = [
+ b
+ for b in first_page.blocks
+ if isinstance(b, TextBlock) and b.role == 'heading' and b.text.strip()
+ ]
+ if not headings:
+ return None
+ return ContentMdService._normalize(
+ min(headings, key=lambda b: b.level or 1).text
+ )
+
+ @staticmethod
+ def _infer_description(document: Document) -> Optional[str]:
+ """Infer a description from document content.
+
+ Uses the ``doc-abstract`` block when present. Otherwise concatenates
+ the first five body :class:`TextBlock` objects (non-structural, across
+ the first two pages), normalises whitespace, and returns at most 200
+ characters.
+ """
+ from parxy_core.models.models import TextBlock
+
+ blocks = [
+ b
+ for page in document.pages[:2]
+ if page.blocks
+ for b in page.blocks
+ if isinstance(b, TextBlock) and b.text.strip()
+ ]
+
+ abstract = next((b for b in blocks if b.role == 'doc-abstract'), None)
+ if abstract:
+ return ContentMdService._normalize(abstract.text)
+
+ body_blocks = [
+ b
+ for b in blocks
+ if (b.role or 'generic') not in ContentMdService._STRUCTURAL_ROLES
+ ]
+ if not body_blocks:
+ return None
+
+ combined = ' '.join(b.text for b in body_blocks[:5])
+ return ContentMdService._normalize(combined)[:200]
+
+ @staticmethod
+ def _build_frontmatter(
+ title: str,
+ description: Optional[str],
+ date: Optional[str],
+ license: Optional[str],
+ author: Optional[str],
+ ) -> str:
+ ys = ContentMdService._yaml_str
+ lines = ['---', f'title: {ys(title)}']
+ if description:
+ lines.append(f'description: {ys(description)}')
+ if date:
+ lines.append(f'date: {ys(date)}')
+ if license:
+ lines.append(f'license: {ys(license)}')
+ if author:
+ lines.append(f'author: {ys(author)}')
+ lines.append('---')
+ return '\n'.join(lines)
+
+ @staticmethod
+ def _build_body(document: Document, title: str) -> str:
+ from parxy_core.models.models import ImageBlock, TableBlock, TextBlock
+
+ normalize = ContentMdService._normalize
+ parts = [f'# {title}']
+
+ for page in document.pages:
+ if not page.blocks:
+ if page.text.strip():
+ parts.append(normalize(page.text))
+ continue
+
+ for block in page.blocks:
+ role = (block.role or 'generic').lower()
+
+ if isinstance(block, TextBlock):
+ if role == 'doc-title':
+ # Already the top-level h1 — skip to avoid duplication
+ pass
+ elif role == 'heading':
+ # Shift levels +1: h1 content → h2, per content-md spec
+ shifted = min((block.level or 1) + 1, 6)
+ parts.append(f'{"#" * shifted} {normalize(block.text)}')
+ elif role in ('list', 'listitem'):
+ for line in block.text.splitlines():
+ if line.strip():
+ parts.append(f'- {normalize(line)}')
+ elif role == 'doc-abstract':
+ lang_attr = (
+ f' lang="{document.language}"' if document.language else ''
+ )
+ parts.append(
+ f'\n{normalize(block.text)}\n'
+ )
+ else:
+ normalized = normalize(block.text)
+ if normalized:
+ parts.append(normalized)
+
+ elif isinstance(block, ImageBlock):
+ parts.append(f'\n{block.alt_text or ""}\n')
+
+ elif isinstance(block, TableBlock):
+ # Preserve table whitespace (column alignment, padding)
+ if block.text.strip():
+ parts.append(block.text.strip())
+
+ return '\n\n'.join(parts)
+
+ # ------------------------------------------------------------------
+ # Public API
+ # ------------------------------------------------------------------
+
+ @staticmethod
+ def render(
+ document: Document,
+ title: Optional[str] = None,
+ description: Optional[str] = None,
+ date: Optional[str] = None,
+ license: Optional[str] = None,
+ author: Optional[str] = None,
+ ) -> str:
+ """Render *document* as a content-md string.
+
+ Parameters
+ ----------
+ document:
+ The document to render.
+ title:
+ Document title. Falls back to ``metadata.title``, a heading
+ inferred from the first page, then ``filename``. Raises
+ ``ValueError`` if no title can be resolved.
+ description:
+ Short summary (~200 characters). Falls back to a ``doc-abstract``
+ block, then the first five body blocks in the first two pages.
+ date:
+ Creation/publication date in ISO 8601. Falls back to
+ ``metadata.created_at`` / ``metadata.updated_at``.
+ license:
+ License name or SPDX identifier.
+ author:
+ Author name. Falls back to ``metadata.author``.
+
+ Returns
+ -------
+ str
+ The document formatted as content-md.
+ """
+ resolved_title = (
+ title
+ or (document.metadata.title if document.metadata else None)
+ or ContentMdService._guess_title(document)
+ or document.filename
+ )
+ if not resolved_title:
+ raise ValueError(
+ 'Cannot render content-md: no title could be resolved. '
+ 'Provide a title via metadata, a doc-title/heading block, '
+ 'a filename, or pass title= explicitly.'
+ )
+ resolved_description = description or ContentMdService._infer_description(
+ document
+ )
+ resolved_date = date or (
+ (document.metadata.created_at or document.metadata.updated_at)
+ if document.metadata
+ else None
+ )
+ resolved_author = author or (
+ document.metadata.author if document.metadata else None
+ )
+
+ frontmatter = ContentMdService._build_frontmatter(
+ title=resolved_title,
+ description=resolved_description,
+ date=resolved_date,
+ license=license,
+ author=resolved_author,
+ )
+
+ if not document.pages:
+ return f'{frontmatter}\n\n# {resolved_title}\n'
+
+ body = ContentMdService._build_body(document, resolved_title)
+ return f'{frontmatter}\n\n{body}\n'
diff --git a/tests/services/test_contentmd_service.py b/tests/services/test_contentmd_service.py
new file mode 100644
index 0000000..b817fe0
--- /dev/null
+++ b/tests/services/test_contentmd_service.py
@@ -0,0 +1,505 @@
+"""Test suite for ContentMdService."""
+
+import pytest
+
+from parxy_core.models.models import (
+ Document,
+ ImageBlock,
+ Metadata,
+ Page,
+ TableBlock,
+ TextBlock,
+)
+from parxy_core.services.contentmd_service import ContentMdService
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def make_page(
+ number: int = 1,
+ text: str = '',
+ blocks: list | None = None,
+) -> Page:
+ return Page(number=number, text=text, blocks=blocks)
+
+
+def make_text_block(
+ text: str,
+ role: str = 'generic',
+ level: int | None = None,
+) -> TextBlock:
+ return TextBlock(type='text', text=text, role=role, level=level)
+
+
+def make_image_block(
+ alt_text: str | None = None, name: str | None = None
+) -> ImageBlock:
+ return ImageBlock(type='image', alt_text=alt_text, name=name)
+
+
+def make_table_block(text: str) -> TableBlock:
+ return TableBlock(type='table', text=text)
+
+
+def make_doc(
+ pages: list[Page],
+ metadata: Metadata | None = None,
+ filename: str | None = None,
+ language: str | None = None,
+) -> Document:
+ return Document(
+ pages=pages,
+ metadata=metadata,
+ filename=filename,
+ language=language,
+ )
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def minimal_doc():
+ """Document with a single page, no blocks, no metadata."""
+ return make_doc(pages=[make_page(text='Hello world')])
+
+
+@pytest.fixture
+def metadata_doc():
+ """Document with full metadata and one plain paragraph block."""
+ meta = Metadata(
+ title='Metadata Title',
+ author='Jane Doe',
+ created_at='2025-01-15',
+ )
+ page = make_page(
+ text='Paragraph text.',
+ blocks=[make_text_block('Paragraph text.')],
+ )
+ return make_doc(pages=[page], metadata=meta, filename='report.pdf')
+
+
+@pytest.fixture
+def all_blocks_doc():
+ """Document whose first page contains every supported block type."""
+ blocks = [
+ make_text_block('My Document', role='doc-title'),
+ make_text_block('Introduction', role='heading', level=1),
+ make_text_block('Background', role='heading', level=2),
+ make_text_block('First item\nSecond item', role='list'),
+ make_text_block('A plain paragraph.', role='paragraph'),
+ make_text_block('A brief overview.', role='doc-abstract'),
+ make_image_block(alt_text='A sunset over mountains', name='sunset.jpg'),
+ make_table_block('| Col A | Col B |\n| ----- | ----- |\n| 1 | 2 |'),
+ ]
+ page = make_page(text='My Document', blocks=blocks)
+ return make_doc(pages=[page], language='en')
+
+
+# ---------------------------------------------------------------------------
+# Frontmatter
+# ---------------------------------------------------------------------------
+
+
+class TestFrontmatter:
+ def test_frontmatter_delimiters_present(self, minimal_doc):
+ result = ContentMdService.render(minimal_doc, title='T', description='D')
+ lines = result.splitlines()
+ assert lines[0] == '---'
+ closing = lines.index('---', 1)
+ assert closing > 0
+
+ def test_explicit_title_in_frontmatter(self, minimal_doc):
+ result = ContentMdService.render(minimal_doc, title='Explicit Title')
+ assert 'title: "Explicit Title"' in result
+
+ def test_title_from_metadata(self, metadata_doc):
+ result = ContentMdService.render(metadata_doc)
+ assert 'title: "Metadata Title"' in result
+
+ def test_title_from_doc_title_role_preferred_over_heading(self):
+ blocks = [
+ make_text_block('Real Title', role='doc-title'),
+ make_text_block('Section One', role='heading', level=1),
+ ]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc)
+ assert 'title: "Real Title"' in result
+
+ def test_title_from_heading_when_no_doc_title(self):
+ blocks = [
+ make_text_block('Section One', role='heading', level=2),
+ make_text_block('Section Two', role='heading', level=1),
+ ]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc)
+ # Level 1 heading wins (lowest level = highest rank)
+ assert 'title: "Section Two"' in result
+
+ def test_title_from_filename_when_no_headings(self):
+ doc = make_doc(
+ pages=[make_page(text='body text')],
+ filename='my-report.pdf',
+ )
+ result = ContentMdService.render(doc)
+ assert 'title: "my-report.pdf"' in result
+
+ def test_title_raises_when_unresolvable(self):
+ doc = make_doc(pages=[make_page(text='body text')])
+ with pytest.raises(ValueError, match='no title could be resolved'):
+ ContentMdService.render(doc)
+
+ def test_description_from_explicit_param(self, minimal_doc):
+ result = ContentMdService.render(
+ minimal_doc, title='T', description='My summary.'
+ )
+ assert 'description: "My summary."' in result
+
+ def test_description_from_doc_abstract_block(self):
+ blocks = [
+ make_text_block('Abstract content here.', role='doc-abstract'),
+ make_text_block(
+ 'A much longer paragraph that should not be picked.', role='paragraph'
+ ),
+ ]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert 'description: "Abstract content here."' in result
+
+ def test_description_from_first_five_body_blocks(self):
+ blocks = [make_text_block(f'Sentence {i}.', role='paragraph') for i in range(7)]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ # Only the first five contribute; the sixth and seventh are ignored
+ assert 'Sentence 5' not in result.split('---\n')[1].split('\n')[0]
+ assert 'Sentence 0' in result
+
+ def test_description_excludes_structural_roles(self):
+ blocks = [
+ make_text_block('Table of contents text.', role='doc-toc'),
+ make_text_block('Page header text.', role='doc-pageheader'),
+ make_text_block('A heading block.', role='heading'),
+ make_text_block('Body content.', role='paragraph'),
+ ]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc)
+ assert 'description: "Body content."' in result
+
+ def test_description_truncated_to_200_chars(self):
+ long_text = 'word ' * 60 # well over 200 chars
+ blocks = [make_text_block(long_text, role='paragraph')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ fm_end = result.index('---\n', 4)
+ frontmatter = result[:fm_end]
+ desc_line = next(
+ l for l in frontmatter.splitlines() if l.startswith('description:')
+ )
+ # Strip the YAML quoting to measure the actual value length
+ value = desc_line[len('description: "') : -1]
+ assert len(value) <= 200
+
+ def test_description_contains_no_newlines(self):
+ blocks = [
+ make_text_block('Line one.\nLine two.\nLine three.', role='paragraph')
+ ]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ fm_end = result.index('---\n', 4)
+ frontmatter = result[:fm_end]
+ desc_line = next(
+ l for l in frontmatter.splitlines() if l.startswith('description:')
+ )
+ assert '\n' not in desc_line
+
+ def test_description_searches_first_two_pages(self):
+ page1 = make_page(number=1, text='', blocks=[make_text_block('Page 1 text.')])
+ page2 = make_page(
+ number=2,
+ text='',
+ blocks=[make_text_block('Page 2 has a longer text block.')],
+ )
+ page3 = make_page(
+ number=3,
+ text='',
+ blocks=[make_text_block('Page 3 has the longest block of all by far.')],
+ )
+ doc = make_doc(pages=[page1, page2, page3])
+ result = ContentMdService.render(doc, title='T')
+ # Page 3 is out of the two-page window
+ assert 'Page 3' not in result.split('---')[1] # not in frontmatter
+
+ def test_date_from_metadata_created_at(self, metadata_doc):
+ result = ContentMdService.render(metadata_doc)
+ assert 'date: "2025-01-15"' in result
+
+ def test_date_from_metadata_updated_at_when_no_created_at(self):
+ meta = Metadata(updated_at='2025-06-01')
+ doc = make_doc(pages=[make_page(text='')], metadata=meta)
+ result = ContentMdService.render(doc, title='T')
+ assert 'date: "2025-06-01"' in result
+
+ def test_explicit_date_overrides_metadata(self, metadata_doc):
+ result = ContentMdService.render(metadata_doc, date='2026-01-01')
+ assert 'date: "2026-01-01"' in result
+ assert '2025-01-15' not in result
+
+ def test_author_from_metadata(self, metadata_doc):
+ result = ContentMdService.render(metadata_doc)
+ assert 'author: "Jane Doe"' in result
+
+ def test_optional_fields_omitted_when_absent(self, minimal_doc):
+ result = ContentMdService.render(minimal_doc, title='T')
+ assert 'description:' not in result
+ assert 'date:' not in result
+ assert 'license:' not in result
+ assert 'author:' not in result
+
+ def test_license_included_when_provided(self, minimal_doc):
+ result = ContentMdService.render(minimal_doc, title='T', license='CC-BY-4.0')
+ assert 'license: "CC-BY-4.0"' in result
+
+ def test_yaml_values_escaped(self, minimal_doc):
+ result = ContentMdService.render(
+ minimal_doc,
+ title='Title with "quotes"',
+ description='Back\\slash',
+ )
+ assert r'title: "Title with \"quotes\""' in result
+ assert r'description: "Back\\slash"' in result
+
+
+# ---------------------------------------------------------------------------
+# Body – block rendering
+# ---------------------------------------------------------------------------
+
+
+class TestBodyBlocks:
+ def test_body_starts_with_h1_title(self, metadata_doc):
+ result = ContentMdService.render(metadata_doc)
+ body = result.split('---\n', 2)[-1]
+ assert body.lstrip().startswith('# Metadata Title')
+
+ def test_doc_title_block_skipped_in_body(self, all_blocks_doc):
+ result = ContentMdService.render(all_blocks_doc)
+ body = result.split('---\n', 2)[-1]
+ # Should appear exactly once (as the h1), not twice
+ assert body.count('My Document') == 1
+
+ def test_heading_level_shifted_by_one(self):
+ blocks = [make_text_block('Section', role='heading', level=1)]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '## Section' in result
+
+ def test_heading_level_2_becomes_3(self):
+ blocks = [make_text_block('Subsection', role='heading', level=2)]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '### Subsection' in result
+
+ def test_heading_without_level_defaults_to_h2(self):
+ blocks = [make_text_block('Heading', role='heading')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '## Heading' in result
+
+ def test_heading_level_capped_at_6(self):
+ blocks = [make_text_block('Deep', role='heading', level=6)]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '###### Deep' in result
+ assert '####### Deep' not in result
+
+ def test_list_role_rendered_as_bullets(self):
+ blocks = [make_text_block('Alpha\nBeta\nGamma', role='list')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '- Alpha' in result
+ assert '- Beta' in result
+ assert '- Gamma' in result
+
+ def test_listitem_role_rendered_as_bullet(self):
+ blocks = [make_text_block('Single item', role='listitem')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '- Single item' in result
+
+ def test_doc_abstract_rendered_as_abstract_tag(self, all_blocks_doc):
+ result = ContentMdService.render(all_blocks_doc)
+ assert '' in result
+ assert 'A brief overview.' in result
+ assert '' in result
+
+ def test_doc_abstract_without_language_omits_lang_attr(self):
+ blocks = [make_text_block('Summary.', role='doc-abstract')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '\nSummary.\n' in result
+
+ def test_generic_textblock_rendered_as_paragraph(self):
+ blocks = [make_text_block('Plain paragraph text.', role='generic')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert 'Plain paragraph text.' in result
+
+ def test_empty_textblock_not_rendered(self):
+ blocks = [make_text_block(' ', role='paragraph')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ # Body should only contain the h1 line
+ body = result.split('---\n', 2)[-1].strip()
+ assert body == '# T'
+
+ def test_image_block_rendered_as_figure(self):
+ blocks = [make_image_block(alt_text='A sunset over mountains')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '\nA sunset over mountains\n' in result
+
+ def test_image_block_without_alt_text(self):
+ blocks = [make_image_block()]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '\n\n' in result
+
+ def test_table_block_rendered_as_is(self):
+ table_text = '| Col A | Col B |\n| ----- | ----- |\n| 1 | 2 |'
+ blocks = [make_table_block(table_text)]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert table_text in result
+
+ def test_page_without_blocks_uses_page_text(self):
+ page = make_page(text='Fallback page text', blocks=None)
+ doc = make_doc(pages=[page])
+ result = ContentMdService.render(doc, title='T')
+ assert 'Fallback page text' in result
+
+ def test_empty_page_text_not_rendered(self):
+ page = make_page(text=' ', blocks=None)
+ doc = make_doc(pages=[page])
+ result = ContentMdService.render(doc, title='T')
+ body = result.split('---\n', 2)[-1].strip()
+ assert body == '# T'
+
+
+# ---------------------------------------------------------------------------
+# Whitespace normalisation
+# ---------------------------------------------------------------------------
+
+
+class TestWhitespaceNormalisation:
+ def test_multiple_spaces_in_paragraph_collapsed(self):
+ blocks = [make_text_block('Word1 Word2 Word3')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert 'Word1 Word2 Word3' in result
+
+ def test_tabs_in_paragraph_collapsed(self):
+ blocks = [make_text_block('Word1\t\tWord2')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert 'Word1 Word2' in result
+
+ def test_whitespace_in_heading_collapsed(self):
+ blocks = [make_text_block('My Section', role='heading', level=1)]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '## My Section' in result
+
+ def test_whitespace_in_title_collapsed(self):
+ blocks = [make_text_block(' My Title ', role='doc-title')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc)
+ assert 'title: "My Title"' in result
+
+ def test_whitespace_in_description_collapsed(self):
+ blocks = [make_text_block('Summary with gaps.', role='doc-abstract')]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert 'description: "Summary with gaps."' in result
+
+ def test_table_whitespace_preserved(self):
+ table_text = '| Col A | Col B |\n| ----- | ----- |'
+ blocks = [make_table_block(table_text)]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert '| Col A | Col B |' in result
+
+
+# ---------------------------------------------------------------------------
+# Output structure
+# ---------------------------------------------------------------------------
+
+
+class TestOutputStructure:
+ def test_result_ends_with_newline(self, minimal_doc):
+ result = ContentMdService.render(minimal_doc, title='T')
+ assert result.endswith('\n')
+
+ def test_empty_pages_list_returns_frontmatter_and_title(self):
+ doc = Document(pages=[])
+ result = ContentMdService.render(doc, title='Empty')
+ assert 'title: "Empty"' in result
+ assert '# Empty' in result
+
+ def test_blocks_separated_by_blank_line(self):
+ blocks = [
+ make_text_block('First paragraph.'),
+ make_text_block('Second paragraph.'),
+ ]
+ doc = make_doc(pages=[make_page(text='', blocks=blocks)])
+ result = ContentMdService.render(doc, title='T')
+ assert 'First paragraph.\n\nSecond paragraph.' in result
+
+ def test_multipage_document_renders_all_pages(self):
+ page1 = make_page(
+ number=1,
+ text='',
+ blocks=[make_text_block('Page one content.')],
+ )
+ page2 = make_page(
+ number=2,
+ text='',
+ blocks=[make_text_block('Page two content.')],
+ )
+ doc = make_doc(pages=[page1, page2])
+ result = ContentMdService.render(doc, title='T')
+ assert 'Page one content.' in result
+ assert 'Page two content.' in result
+
+ def test_render_delegates_from_document_method(self, metadata_doc):
+ via_service = ContentMdService.render(metadata_doc)
+ via_method = metadata_doc.contentmd()
+ assert via_service == via_method
+
+ def test_empty_document_without_args_raises(self):
+ """A document with no metadata, no blocks, no filename, and no user
+ arguments cannot satisfy the required title constraint."""
+ doc = Document(pages=[])
+ with pytest.raises(ValueError, match='no title could be resolved'):
+ ContentMdService.render(doc)
+
+ def test_empty_document_with_title_arg_returns_contentmd(self):
+ """Passing title= explicitly must succeed even when the document is
+ completely empty."""
+ doc = Document(pages=[])
+ result = ContentMdService.render(doc, title='Provided Title')
+ assert 'title: "Provided Title"' in result
+ assert '# Provided Title' in result
+
+ def test_empty_document_with_title_and_description_returns_contentmd(self):
+ """Both title= and description= passed explicitly on an empty document."""
+ doc = Document(pages=[])
+ result = ContentMdService.render(
+ doc, title='My Title', description='My description.'
+ )
+ assert 'title: "My Title"' in result
+ assert 'description: "My description."' in result
+ assert result.endswith('\n')