Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 173 additions & 29 deletions src/bot/utils/html_format.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,99 @@
"""HTML formatting utilities for Telegram messages.

Telegram's HTML mode only requires escaping 3 characters (<, >, &) vs the many
ambiguous Markdown v1 metacharacters, making it far more robust for rendering
Claude's output which contains underscores, asterisks, brackets, etc.
Telegram's HTML mode supports: <b>, <i>, <u>, <s>, <code>, <pre>,
<pre><code class="language-X">, <a href>, <blockquote>,
<blockquote expandable>, <tg-spoiler>.

This module converts Claude's markdown output into that subset.
"""

import re
from typing import List, Tuple


def escape_html(text: str) -> str:
"""Escape the 3 HTML-special characters for Telegram.
_INLINE_TAGS = {"b", "i", "s", "u", "code"}
_TAG_RE = re.compile(r"<(/?)(\w+)(?:\s[^>]*)?>")


def _repair_html_nesting(html: str) -> str:
"""Fix misnested inline HTML tags that Telegram would reject.

This replaces all 3 _escape_markdown functions previously scattered
across the codebase.
Telegram requires strict nesting: <b><i>...</i></b> is OK,
but <i><b>...</i></b> is rejected. This walks the tag stack
and closes/reopens tags when it detects a mismatch.
"""
result = []
stack: List[str] = []
last_end = 0

for m in _TAG_RE.finditer(html):
# Append text before this tag
result.append(html[last_end:m.start()])
last_end = m.end()

is_close = m.group(1) == "/"
tag = m.group(2).lower()

# Only repair inline tags; skip <pre>, <blockquote>, <a>, etc.
if tag not in _INLINE_TAGS:
result.append(m.group(0))
continue

if not is_close:
stack.append(tag)
result.append(m.group(0))
else:
if tag in stack:
# Close tags in reverse order up to the matching opener
idx = len(stack) - 1 - stack[::-1].index(tag)
tags_to_reopen = stack[idx + 1:]
# Close everything from top to idx
for t in reversed(stack[idx:]):
result.append(f"</{t}>")
stack = stack[:idx]
# Reopen tags that were above the matched one
for t in tags_to_reopen:
result.append(f"<{t}>")
stack.append(t)
else:
# Orphan close tag β€” skip it
pass

# Append remaining text
result.append(html[last_end:])

# Close any unclosed tags
for t in reversed(stack):
result.append(f"</{t}>")

return "".join(result)


def escape_html(text: str) -> str:
    """Return *text* with ``&``, ``<`` and ``>`` replaced by HTML entities.

    The ampersand is handled first so the ``&`` inside the entities
    produced for ``<`` and ``>`` is not escaped a second time.
    """
    escaped = text
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
        escaped = escaped.replace(raw, entity)
    return escaped


def markdown_to_telegram_html(text: str) -> str:
"""Convert Claude's markdown output to Telegram-compatible HTML.

Telegram supports a narrow HTML subset: <b>, <i>, <code>, <pre>,
<a href>, <s>, <u>. This function converts common markdown patterns
to that subset while preserving code blocks verbatim.

Order of operations:
1. Extract fenced code blocks -> placeholders
2. Extract inline code -> placeholders
3. HTML-escape remaining text
4. Convert bold (**text** / __text__)
5. Convert italic (*text*, _text_ with word boundaries)
6. Convert links [text](url)
7. Convert headers (# Header -> <b>Header</b>)
8. Convert strikethrough (~~text~~)
9. Restore placeholders
Order of operations (early steps extract content into placeholders
to protect it from later regex passes):

0. Markdown tables → aligned <pre> blocks
1. Fenced code blocks → <pre><code>
2. Inline code → <code>
3. Blockquotes (> text) → <blockquote>
4. HTML-escape remaining text
5. Horizontal rules (--- / ***) → ── separator
6. Bold (**text** / __text__)
7. Italic (*text* / _text_)
8. Links [text](url)
9. Headers (# Header → <b>Header</b>)
10. Strikethrough (~~text~~)
11. Unordered lists (- item / * item)
12. Ordered lists (1. item)
13. Restore placeholders
"""
placeholders: List[Tuple[str, str]] = []
placeholder_counter = 0
Expand All @@ -46,6 +105,52 @@ def _make_placeholder(html_content: str) -> str:
placeholders.append((key, html_content))
return key

# --- 0. Extract markdown tables → monospace <pre> blocks ---
def _replace_table(m: re.Match) -> str:  # type: ignore[type-arg]
    """Render one matched markdown table as an aligned ``<pre>`` block.

    Each table line is split on ``|`` into stripped cells; the markdown
    delimiter row (``|---|:--:|``) is discarded. Every column is padded to
    the width of its longest cell, columns are joined with `` │ ``, and a
    ``─┼─`` rule is drawn under the first row.

    Returns:
        A placeholder key for the escaped ``<pre>`` block, or the matched
        text unchanged when no data rows are found.
    """
    table_text = m.group(0)
    rows = []
    for line in table_text.strip().split("\n"):
        stripped = line.strip()
        if not stripped.startswith("|"):
            continue
        # Skip the markdown delimiter row; a rule is re-drawn below.
        if re.match(r"^\|[\s\-:|]+\|$", stripped):
            continue
        # Drop the empty fragments before the first and after the last pipe.
        cells = [c.strip() for c in stripped.split("|")[1:-1]]
        if cells:
            rows.append(cells)

    if not rows:
        return table_text

    num_cols = max(len(r) for r in rows)
    col_widths = [0] * num_cols
    for row in rows:
        for i, cell in enumerate(row):
            col_widths[i] = max(col_widths[i], len(cell))

    formatted_lines = []
    for row in rows:
        # Short rows are padded with empty cells so every line has num_cols.
        padded = [
            (row[i] if i < len(row) else "").ljust(col_widths[i])
            for i in range(num_cols)
        ]
        formatted_lines.append(" │ ".join(padded))
        if len(formatted_lines) == 1:
            # Rule under the first row; "─┼─" is as wide as " │ ".
            formatted_lines.append("─┼─".join("─" * w for w in col_widths))

    pre_content = "\n".join(formatted_lines)
    return _make_placeholder(f"<pre>{escape_html(pre_content)}</pre>")

text = re.sub(
r"(?:^\|.+\|$\n?){2,}",
_replace_table,
text,
flags=re.MULTILINE,
)

# --- 1. Extract fenced code blocks ---
def _replace_fenced(m: re.Match) -> str: # type: ignore[type-arg]
lang = m.group(1) or ""
Expand All @@ -72,33 +177,72 @@ def _replace_inline_code(m: re.Match) -> str: # type: ignore[type-arg]

text = re.sub(r"`([^`\n]+)`", _replace_inline_code, text)

# --- 3. HTML-escape remaining text ---
# --- 3. Blockquotes: > text → <blockquote> ---
def _replace_blockquote(m: re.Match) -> str:  # type: ignore[type-arg]
    """Wrap one matched run of ``>``-prefixed lines in ``<blockquote>``.

    The leading ``>`` and at most one following whitespace character are
    removed from every line. The remaining text is HTML-escaped only;
    markdown inside the quote is not converted. The result is stored
    behind a placeholder.
    """
    unquoted = [re.sub(r"^>\s?", "", raw) for raw in m.group(0).split("\n")]
    body = escape_html("\n".join(unquoted))
    return _make_placeholder(f"<blockquote>{body}</blockquote>")

text = re.sub(
r"(?:^>.*$\n?)+",
_replace_blockquote,
text,
flags=re.MULTILINE,
)

# --- 4. HTML-escape remaining text ---
text = escape_html(text)

# --- 4. Bold: **text** or __text__ ---
# --- 5. Horizontal rules: --- or *** or ___ β†’ visual separator ---
text = re.sub(
r"^(?:---+|\*\*\*+|___+)\s*$",
"──────────",
text,
flags=re.MULTILINE,
)

# --- 6. Bold: **text** or __text__ ---
text = re.sub(r"\*\*(.+?)\*\*", r"<b>\1</b>", text)
text = re.sub(r"__(.+?)__", r"<b>\1</b>", text)

# --- 5. Italic: *text* (require non-space after/before) ---
# --- 7. Italic: *text* (require non-space after/before) ---
text = re.sub(r"\*(\S.*?\S|\S)\*", r"<i>\1</i>", text)
# _text_ only at word boundaries (avoid my_var_name)
text = re.sub(r"(?<!\w)_(\S.*?\S|\S)_(?!\w)", r"<i>\1</i>", text)

# --- 6. Links: [text](url) ---
# --- 8. Links: [text](url) ---
text = re.sub(
r"\[([^\]]+)\]\(([^)]+)\)",
r'<a href="\2">\1</a>',
text,
)

# --- 7. Headers: # Header -> <b>Header</b> ---
# --- 9. Headers: # Header → <b>Header</b> ---
text = re.sub(r"^#{1,6}\s+(.+)$", r"<b>\1</b>", text, flags=re.MULTILINE)

# --- 8. Strikethrough: ~~text~~ ---
# --- 10. Strikethrough: ~~text~~ ---
text = re.sub(r"~~(.+?)~~", r"<s>\1</s>", text)

# --- 9. Restore placeholders ---
# --- 11. Unordered lists: - item / * item → bullet ---
text = re.sub(r"^[\-\*]\s+", "β€’ ", text, flags=re.MULTILINE)

# --- 12. Ordered lists: 1. item → keep number with period ---
# (Telegram has no <ol>, so just clean up the formatting)
text = re.sub(r"^(\d+)\.\s+", r"\1. ", text, flags=re.MULTILINE)

# --- 13. Restore placeholders ---
for key, html_content in placeholders:
text = text.replace(key, html_content)

# --- 14. Repair HTML tag nesting ---
# Telegram is strict about nesting: <b><i>...</i></b> is OK,
# but <i><b>...</i></b> is rejected. Fix any mismatches.
text = _repair_html_nesting(text)

return text