From 60eb8d4f87eab1c2a779bebe1cd6dd88d2257443 Mon Sep 17 00:00:00 2001 From: Franz Chandi Date: Sun, 15 Mar 2026 00:04:32 +0000 Subject: [PATCH] feat: rich HTML formatting for Telegram messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Convert markdown tables to aligned monospace
 <pre> blocks
- Add blockquote support (> text → <blockquote>
) - Add bullet list conversion (- item → • item) - Add numbered list formatting - Add horizontal rule conversion (--- → ──────────) - Fix HTML tag nesting issues that cause Telegram rejection - Add _repair_html_nesting() to fix overlapping bold/italic tags --- src/bot/utils/html_format.py | 202 ++++++++++++++++++++++++++++++----- 1 file changed, 173 insertions(+), 29 deletions(-) diff --git a/src/bot/utils/html_format.py b/src/bot/utils/html_format.py index 2799a4ee..b84bbd58 100644 --- a/src/bot/utils/html_format.py +++ b/src/bot/utils/html_format.py @@ -1,40 +1,99 @@ """HTML formatting utilities for Telegram messages. -Telegram's HTML mode only requires escaping 3 characters (<, >, &) vs the many -ambiguous Markdown v1 metacharacters, making it far more robust for rendering -Claude's output which contains underscores, asterisks, brackets, etc. +Telegram's HTML mode supports: , , , , ,
,
+
, , 
, +
, . + +This module converts Claude's markdown output into that subset. """ import re from typing import List, Tuple -def escape_html(text: str) -> str: - """Escape the 3 HTML-special characters for Telegram. +_INLINE_TAGS = {"b", "i", "s", "u", "code"} +_TAG_RE = re.compile(r"<(/?)(\w+)(?:\s[^>]*)?>") + + +def _repair_html_nesting(html: str) -> str: + """Fix misnested inline HTML tags that Telegram would reject. - This replaces all 3 _escape_markdown functions previously scattered - across the codebase. + Telegram requires strict nesting: ... is OK, + but ... is rejected. This walks the tag stack + and closes/reopens tags when it detects a mismatch. """ + result = [] + stack: List[str] = [] + last_end = 0 + + for m in _TAG_RE.finditer(html): + # Append text before this tag + result.append(html[last_end:m.start()]) + last_end = m.end() + + is_close = m.group(1) == "/" + tag = m.group(2).lower() + + # Only repair inline tags; skip
, 
, , etc. + if tag not in _INLINE_TAGS: + result.append(m.group(0)) + continue + + if not is_close: + stack.append(tag) + result.append(m.group(0)) + else: + if tag in stack: + # Close tags in reverse order up to the matching opener + idx = len(stack) - 1 - stack[::-1].index(tag) + tags_to_reopen = stack[idx + 1:] + # Close everything from top to idx + for t in reversed(stack[idx:]): + result.append(f"") + stack = stack[:idx] + # Reopen tags that were above the matched one + for t in tags_to_reopen: + result.append(f"<{t}>") + stack.append(t) + else: + # Orphan close tag — skip it + pass + + # Append remaining text + result.append(html[last_end:]) + + # Close any unclosed tags + for t in reversed(stack): + result.append(f"") + + return "".join(result) + + +def escape_html(text: str) -> str: + """Escape the 3 HTML-special characters for Telegram.""" return text.replace("&", "&").replace("<", "<").replace(">", ">") def markdown_to_telegram_html(text: str) -> str: """Convert Claude's markdown output to Telegram-compatible HTML. - Telegram supports a narrow HTML subset: , , ,
,
-    , , . This function converts common markdown patterns
-    to that subset while preserving code blocks verbatim.
-
-    Order of operations:
-    1. Extract fenced code blocks -> placeholders
-    2. Extract inline code -> placeholders
-    3. HTML-escape remaining text
-    4. Convert bold (**text** / __text__)
-    5. Convert italic (*text*, _text_ with word boundaries)
-    6. Convert links [text](url)
-    7. Convert headers (# Header -> Header)
-    8. Convert strikethrough (~~text~~)
-    9. Restore placeholders
+    Order of operations (early steps extract content into placeholders
+    to protect it from later regex passes):
+
+    0.  Markdown tables → aligned <pre> blocks
+    1.  Fenced code blocks → <pre><code>
+    2.  Inline code → <code>
+    3.  Blockquotes (> text) → <blockquote>
+ 4. HTML-escape remaining text + 5. Horizontal rules (--- / ***) → ── separator + 6. Bold (**text** / __text__) + 7. Italic (*text* / _text_) + 8. Links [text](url) + 9. Headers (# Header → Header) + 10. Strikethrough (~~text~~) + 11. Unordered lists (- item / * item) + 12. Ordered lists (1. item) + 13. Restore placeholders """ placeholders: List[Tuple[str, str]] = [] placeholder_counter = 0 @@ -46,6 +105,52 @@ def _make_placeholder(html_content: str) -> str: placeholders.append((key, html_content)) return key + # --- 0. Extract markdown tables → monospace
 <pre> blocks ---
+    def _replace_table(m: re.Match) -> str:  # type: ignore[type-arg]
+        table_text = m.group(0)
+        lines = table_text.strip().split("\n")
+        rows = []
+        for line in lines:
+            stripped = line.strip()
+            if not stripped.startswith("|"):
+                continue
+            if re.match(r"^\|[\s\-:|]+\|$", stripped):
+                continue
+            cells = [c.strip() for c in stripped.split("|")[1:-1]]
+            if cells:
+                rows.append(cells)
+
+        if not rows:
+            return table_text
+
+        num_cols = max(len(r) for r in rows)
+        col_widths = [0] * num_cols
+        for row in rows:
+            for i, cell in enumerate(row):
+                if i < num_cols:
+                    col_widths[i] = max(col_widths[i], len(cell))
+
+        formatted_lines = []
+        for row in rows:
+            parts = []
+            for i in range(num_cols):
+                cell = row[i] if i < len(row) else ""
+                parts.append(cell.ljust(col_widths[i]))
+            formatted_lines.append(" │ ".join(parts))
+            if len(formatted_lines) == 1:
+                sep_parts = ["─" * w for w in col_widths]
+                formatted_lines.append("─┼─".join(sep_parts))
+
+        pre_content = "\n".join(formatted_lines)
+        return _make_placeholder(f"
{escape_html(pre_content)}
") + + text = re.sub( + r"(?:^\|.+\|$\n?){2,}", + _replace_table, + text, + flags=re.MULTILINE, + ) + # --- 1. Extract fenced code blocks --- def _replace_fenced(m: re.Match) -> str: # type: ignore[type-arg] lang = m.group(1) or "" @@ -72,33 +177,72 @@ def _replace_inline_code(m: re.Match) -> str: # type: ignore[type-arg] text = re.sub(r"`([^`\n]+)`", _replace_inline_code, text) - # --- 3. HTML-escape remaining text --- + # --- 3. Blockquotes: > text →
--- + def _replace_blockquote(m: re.Match) -> str: # type: ignore[type-arg] + block = m.group(0) + # Strip the leading > (and optional space) from each line + lines = [] + for line in block.split("\n"): + stripped = re.sub(r"^>\s?", "", line) + lines.append(stripped) + inner = "\n".join(lines) + # Recursively format the blockquote content + inner_html = escape_html(inner) + return _make_placeholder(f"
{inner_html}
") + + text = re.sub( + r"(?:^>.*$\n?)+", + _replace_blockquote, + text, + flags=re.MULTILINE, + ) + + # --- 4. HTML-escape remaining text --- text = escape_html(text) - # --- 4. Bold: **text** or __text__ --- + # --- 5. Horizontal rules: --- or *** or ___ → visual separator --- + text = re.sub( + r"^(?:---+|\*\*\*+|___+)\s*$", + "──────────", + text, + flags=re.MULTILINE, + ) + + # --- 6. Bold: **text** or __text__ --- text = re.sub(r"\*\*(.+?)\*\*", r"\1", text) text = re.sub(r"__(.+?)__", r"\1", text) - # --- 5. Italic: *text* (require non-space after/before) --- + # --- 7. Italic: *text* (require non-space after/before) --- text = re.sub(r"\*(\S.*?\S|\S)\*", r"\1", text) - # _text_ only at word boundaries (avoid my_var_name) text = re.sub(r"(?\1
", text) - # --- 6. Links: [text](url) --- + # --- 8. Links: [text](url) --- text = re.sub( r"\[([^\]]+)\]\(([^)]+)\)", r'
\1', text, ) - # --- 7. Headers: # Header -> Header --- + # --- 9. Headers: # Header → Header --- text = re.sub(r"^#{1,6}\s+(.+)$", r"\1", text, flags=re.MULTILINE) - # --- 8. Strikethrough: ~~text~~ --- + # --- 10. Strikethrough: ~~text~~ --- text = re.sub(r"~~(.+?)~~", r"\1", text) - # --- 9. Restore placeholders --- + # --- 11. Unordered lists: - item / * item → bullet --- + text = re.sub(r"^[\-\*]\s+", "• ", text, flags=re.MULTILINE) + + # --- 12. Ordered lists: 1. item → keep number with period --- + # (Telegram has no
    , so just clean up the formatting) + text = re.sub(r"^(\d+)\.\s+", r"\1. ", text, flags=re.MULTILINE) + + # --- 13. Restore placeholders --- for key, html_content in placeholders: text = text.replace(key, html_content) + # --- 14. Repair HTML tag nesting --- + # Telegram is strict about nesting: ... is OK, + # but ... is rejected. Fix any mismatches. + text = _repair_html_nesting(text) + return text