Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions src/parxy_core/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,49 @@ def text(self, page_separator: str = '---') -> str:

return '\n'.join(texts)

def contentmd(
self,
title: Optional[str] = None,
description: Optional[str] = None,
date: Optional[str] = None,
license: Optional[str] = None,
author: Optional[str] = None,
) -> str:
"""Get the document content formatted as content-md.

Delegates to :class:`~parxy_core.services.ContentMdService`.

Parameters
----------
title : str, optional
Document title. Falls back to metadata.title, a heading inferred
from the first page, filename, then 'Untitled'.
description : str, optional
Short summary (~200 characters). Falls back to a doc-abstract block,
then the longest TextBlock across the first two pages.
date : str, optional
Creation/publication date in ISO 8601. Falls back to metadata dates.
license : str, optional
License name or SPDX identifier.
author : str, optional
Author name. Falls back to metadata.author.

Returns
-------
str
The document content formatted as content-md.
"""
from parxy_core.services.contentmd_service import ContentMdService

return ContentMdService.render(
self,
title=title,
description=description,
date=date,
license=license,
author=author,
)

def markdown(self) -> str:
"""Get the document content formatted as Markdown.

Expand Down
3 changes: 2 additions & 1 deletion src/parxy_core/services/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Services module for parxy_core."""

from parxy_core.services.contentmd_service import ContentMdService
from parxy_core.services.pdf_service import PdfService

__all__ = ['PdfService']
__all__ = ['ContentMdService', 'PdfService']
264 changes: 264 additions & 0 deletions src/parxy_core/services/contentmd_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
"""Service for rendering documents as content-md."""

from __future__ import annotations

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
from parxy_core.models.models import Document


class ContentMdService:
"""Render a :class:`Document` as a content-md string.

content-md is an open specification for optimised content exchange: a YAML
frontmatter section followed by CommonMark / GitHub-flavoured Markdown.
All methods are static; the class acts as a namespace.
"""

# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------

# Roles that provide structure or navigation rather than readable body text
_STRUCTURAL_ROLES: frozenset[str] = frozenset(
{
'heading',
'doc-title',
'doc-subtitle',
'doc-abstract',
'doc-toc',
'doc-pageheader',
'doc-pagefooter',
'caption',
}
)

@staticmethod
def _normalize(text: str) -> str:
"""Collapse any run of whitespace to a single space and strip."""
return ' '.join(text.split())

@staticmethod
def _yaml_str(value: str) -> str:
"""Wrap *value* in double quotes and escape internal quotes/backslashes."""
return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'

@staticmethod
def _guess_title(document: Document) -> Optional[str]:
"""Infer a title from the first page blocks.

Prefers an explicit ``doc-title`` role; falls back to the
highest-ranking (lowest level number) ``heading`` block.
"""
from parxy_core.models.models import TextBlock

if not document.pages:
return None
first_page = document.pages[0]
if not first_page.blocks:
return None

doc_title = next(
(
b
for b in first_page.blocks
if isinstance(b, TextBlock) and b.role == 'doc-title' and b.text.strip()
),
None,
)
if doc_title:
return ContentMdService._normalize(doc_title.text)

headings = [
b
for b in first_page.blocks
if isinstance(b, TextBlock) and b.role == 'heading' and b.text.strip()
]
if not headings:
return None
return ContentMdService._normalize(
min(headings, key=lambda b: b.level or 1).text
)

@staticmethod
def _infer_description(document: Document) -> Optional[str]:
"""Infer a description from document content.

Uses the ``doc-abstract`` block when present. Otherwise concatenates
the first five body :class:`TextBlock` objects (non-structural, across
the first two pages), normalises whitespace, and returns at most 200
characters.
"""
from parxy_core.models.models import TextBlock

blocks = [
b
for page in document.pages[:2]
if page.blocks
for b in page.blocks
if isinstance(b, TextBlock) and b.text.strip()
]

abstract = next((b for b in blocks if b.role == 'doc-abstract'), None)
if abstract:
return ContentMdService._normalize(abstract.text)

body_blocks = [
b
for b in blocks
if (b.role or 'generic') not in ContentMdService._STRUCTURAL_ROLES
]
if not body_blocks:
return None

combined = ' '.join(b.text for b in body_blocks[:5])
return ContentMdService._normalize(combined)[:200]

@staticmethod
def _build_frontmatter(
title: str,
description: Optional[str],
date: Optional[str],
license: Optional[str],
author: Optional[str],
) -> str:
ys = ContentMdService._yaml_str
lines = ['---', f'title: {ys(title)}']
if description:
lines.append(f'description: {ys(description)}')
if date:
lines.append(f'date: {ys(date)}')
if license:
lines.append(f'license: {ys(license)}')
if author:
lines.append(f'author: {ys(author)}')
lines.append('---')
return '\n'.join(lines)

@staticmethod
def _build_body(document: Document, title: str) -> str:
from parxy_core.models.models import ImageBlock, TableBlock, TextBlock

normalize = ContentMdService._normalize
parts = [f'# {title}']

for page in document.pages:
if not page.blocks:
if page.text.strip():
parts.append(normalize(page.text))
continue

for block in page.blocks:
role = (block.role or 'generic').lower()

if isinstance(block, TextBlock):
if role == 'doc-title':
# Already the top-level h1 — skip to avoid duplication
pass
elif role == 'heading':
# Shift levels +1: h1 content → h2, per content-md spec
shifted = min((block.level or 1) + 1, 6)
parts.append(f'{"#" * shifted} {normalize(block.text)}')
elif role in ('list', 'listitem'):
for line in block.text.splitlines():
if line.strip():
parts.append(f'- {normalize(line)}')
elif role == 'doc-abstract':
lang_attr = (
f' lang="{document.language}"' if document.language else ''
)
parts.append(
f'<abstract{lang_attr}>\n{normalize(block.text)}\n</abstract>'
)
else:
normalized = normalize(block.text)
if normalized:
parts.append(normalized)

elif isinstance(block, ImageBlock):
parts.append(f'<figure>\n{block.alt_text or ""}\n</figure>')

elif isinstance(block, TableBlock):
# Preserve table whitespace (column alignment, padding)
if block.text.strip():
parts.append(block.text.strip())

return '\n\n'.join(parts)

# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------

@staticmethod
def render(
document: Document,
title: Optional[str] = None,
description: Optional[str] = None,
date: Optional[str] = None,
license: Optional[str] = None,
author: Optional[str] = None,
) -> str:
"""Render *document* as a content-md string.

Parameters
----------
document:
The document to render.
title:
Document title. Falls back to ``metadata.title``, a heading
inferred from the first page, then ``filename``. Raises
``ValueError`` if no title can be resolved.
description:
Short summary (~200 characters). Falls back to a ``doc-abstract``
block, then the first five body blocks in the first two pages.
date:
Creation/publication date in ISO 8601. Falls back to
``metadata.created_at`` / ``metadata.updated_at``.
license:
License name or SPDX identifier.
author:
Author name. Falls back to ``metadata.author``.

Returns
-------
str
The document formatted as content-md.
"""
resolved_title = (
title
or (document.metadata.title if document.metadata else None)
or ContentMdService._guess_title(document)
or document.filename
)
if not resolved_title:
raise ValueError(
'Cannot render content-md: no title could be resolved. '
'Provide a title via metadata, a doc-title/heading block, '
'a filename, or pass title= explicitly.'
)
resolved_description = description or ContentMdService._infer_description(
document
)
resolved_date = date or (
(document.metadata.created_at or document.metadata.updated_at)
if document.metadata
else None
)
resolved_author = author or (
document.metadata.author if document.metadata else None
)

frontmatter = ContentMdService._build_frontmatter(
title=resolved_title,
description=resolved_description,
date=resolved_date,
license=license,
author=resolved_author,
)

if not document.pages:
return f'{frontmatter}\n\n# {resolved_title}\n'

body = ContentMdService._build_body(document, resolved_title)
return f'{frontmatter}\n\n{body}\n'
Loading