From 60eb8d4f87eab1c2a779bebe1cd6dd88d2257443 Mon Sep 17 00:00:00 2001
From: Franz Chandi <franz.chandi@gmail.com>
Date: Sun, 15 Mar 2026 00:04:32 +0000
Subject: [PATCH] feat: rich HTML formatting for Telegram messages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Convert markdown tables to aligned monospace <pre> blocks
- Add blockquote support (> text → <blockquote>)
- Add bullet list conversion (- item → • item)
- Add numbered list formatting
- Add horizontal rule conversion (--- → ──────────)
- Fix HTML tag nesting issues that cause Telegram rejection
- Add _repair_html_nesting() to fix overlapping bold/italic tags
---
 src/bot/utils/html_format.py | 202 ++++++++++++++++++++++++++++++-----
 1 file changed, 173 insertions(+), 29 deletions(-)
diff --git a/src/bot/utils/html_format.py b/src/bot/utils/html_format.py
index 2799a4ee..b84bbd58 100644
--- a/src/bot/utils/html_format.py
+++ b/src/bot/utils/html_format.py
@@ -1,40 +1,99 @@
 """HTML formatting utilities for Telegram messages.
 
-Telegram's HTML mode only requires escaping 3 characters (<, >, &) vs the many
-ambiguous Markdown v1 metacharacters, making it far more robust for rendering
-Claude's output which contains underscores, asterisks, brackets, etc.
+Telegram's HTML mode supports: <b>, <i>, <u>, <s>, <code>, <pre>,
+<pre><code class="language-X">, <a href>, <blockquote>,
+<blockquote expandable>, <tg-spoiler>.
+
+This module converts Claude's markdown output into that subset.
 """
 
 import re
 from typing import List, Tuple
 
 
-def escape_html(text: str) -> str:
-    """Escape the 3 HTML-special characters for Telegram.
+_INLINE_TAGS = {"b", "i", "s", "u", "code"}
+_TAG_RE = re.compile(r"<(/?)(\w+)(?:\s[^>]*)?>")
+
+
+def _repair_html_nesting(html: str) -> str:
+    """Fix misnested inline HTML tags that Telegram would reject.
 
-    This replaces all 3 _escape_markdown functions previously scattered
-    across the codebase.
+    Telegram requires strict nesting: <b><i>...</i></b> is OK,
+    but <i><b>...</i></b> is rejected. This walks the tag stack
+    and closes/reopens tags when it detects a mismatch.
     """
+    result = []
+    stack: List[str] = []
+    last_end = 0
+
+    for m in _TAG_RE.finditer(html):
+        # Append text before this tag
+        result.append(html[last_end:m.start()])
+        last_end = m.end()
+
+        is_close = m.group(1) == "/"
+        tag = m.group(2).lower()
+
+        # Only repair inline tags; skip <pre>, <blockquote>, <a>, etc.
+        if tag not in _INLINE_TAGS:
+            result.append(m.group(0))
+            continue
+
+        if not is_close:
+            stack.append(tag)
+            result.append(m.group(0))
+        else:
+            if tag in stack:
+                # Close tags in reverse order up to the matching opener
+                idx = len(stack) - 1 - stack[::-1].index(tag)
+                tags_to_reopen = stack[idx + 1:]
+                # Close everything from top to idx
+                for t in reversed(stack[idx:]):
+                    result.append(f"</{t}>")
+                stack = stack[:idx]
+                # Reopen tags that were above the matched one
+                for t in tags_to_reopen:
+                    result.append(f"<{t}>")
+                    stack.append(t)
+            else:
+                # Orphan close tag — skip it
+                pass
+
+    # Append remaining text
+    result.append(html[last_end:])
+
+    # Close any unclosed tags
+    for t in reversed(stack):
+        result.append(f"</{t}>")
+
+    return "".join(result)
+
+
+def escape_html(text: str) -> str:
+    """Escape the 3 HTML-special characters for Telegram."""
     return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
 
 
 def markdown_to_telegram_html(text: str) -> str:
     """Convert Claude's markdown output to Telegram-compatible HTML.
 
-    Telegram supports a narrow HTML subset: <b>, <i>, <code>, <pre>,
-    <a href>, <s>, <u>. This function converts common markdown patterns
-    to that subset while preserving code blocks verbatim.
-
-    Order of operations:
-    1. Extract fenced code blocks -> placeholders
-    2. Extract inline code -> placeholders
-    3. HTML-escape remaining text
-    4. Convert bold (**text** / __text__)
-    5. Convert italic (*text*, _text_ with word boundaries)
-    6. Convert links [text](url)
-    7. Convert headers (# Header -> <b>Header</b>)
-    8. Convert strikethrough (~~text~~)
-    9. Restore placeholders
+    Order of operations (early steps extract content into placeholders
+    to protect it from later regex passes):
+
+    0.  Markdown tables → aligned <pre> blocks
+    1.  Fenced code blocks → <pre><code>
+    2.  Inline code → <code>
+    3.  Blockquotes (> text) → <blockquote>
+    4.  HTML-escape remaining text
+    5.  Horizontal rules (--- / *** / ___) → ── separator
+    6.  Bold (**text** / __text__)
+    7.  Italic (*text* / _text_)
+    8.  Links [text](url)
+    9.  Headers (# Header → <b>Header</b>)
+    10. Strikethrough (~~text~~)
+    11. Unordered lists (- item / * item)
+    12. Ordered lists (1. item)
+    13. Restore placeholders, then repair inline tag nesting
     """
     placeholders: List[Tuple[str, str]] = []
     placeholder_counter = 0
@@ -46,6 +105,52 @@ def _make_placeholder(html_content: str) -> str:
         placeholders.append((key, html_content))
         return key
 
+    # --- 0. Extract markdown tables → monospace <pre> blocks ---
+    def _replace_table(m: re.Match) -> str:  # type: ignore[type-arg]
+        """Render one markdown table as an aligned, HTML-escaped <pre> block.
+
+        Delimiter rows (|---|:-:|) are dropped, each column is padded to its
+        widest cell, and a rule is drawn under the first row. A match that
+        yields no data rows is returned unchanged.
+        """
+        raw_table = m.group(0)
+        delimiter_row = re.compile(r"^\|[\s\-:|]+\|$")
+
+        # Keep only pipe-delimited data rows; split each into trimmed cells.
+        grid = []
+        for raw_line in raw_table.strip().split("\n"):
+            row_text = raw_line.strip()
+            if not row_text.startswith("|") or delimiter_row.match(row_text):
+                continue
+            fields = [field.strip() for field in row_text.split("|")[1:-1]]
+            if fields:
+                grid.append(fields)
+
+        if not grid:
+            return raw_table
+
+        # Ragged rows are tolerated: missing trailing cells count as empty.
+        width = max(len(fields) for fields in grid)
+        padded = [fields + [""] * (width - len(fields)) for fields in grid]
+        col_widths = [max(len(cell) for cell in col) for col in zip(*padded)]
+
+        rendered = [
+            " │ ".join(cell.ljust(w) for cell, w in zip(fields, col_widths))
+            for fields in padded
+        ]
+        # The header rule sits directly beneath the first rendered row.
+        rendered.insert(1, "─┼─".join("─" * w for w in col_widths))
+
+        pre_content = "\n".join(rendered)
+        return _make_placeholder(f"<pre>{escape_html(pre_content)}</pre>")
+
+    text = re.sub(
+        r"(?:^\|.+\|$\n?){2,}",
+        _replace_table,
+        text,
+        flags=re.MULTILINE,
+    )
+
     # --- 1. Extract fenced code blocks ---
     def _replace_fenced(m: re.Match) -> str:  # type: ignore[type-arg]
         lang = m.group(1) or ""
@@ -72,33 +177,72 @@ def _replace_inline_code(m: re.Match) -> str:  # type: ignore[type-arg]
 
     text = re.sub(r"`([^`\n]+)`", _replace_inline_code, text)
 
-    # --- 3. HTML-escape remaining text ---
+    # --- 3. Blockquotes: > text → <blockquote> ---
+    def _replace_blockquote(m: re.Match) -> str:  # type: ignore[type-arg]
+        """Turn a run of > lines into a single <blockquote> placeholder."""
+        # Strip the leading > (and optional space) from each line.
+        quoted = [re.sub(r"^>\s?", "", ln) for ln in m.group(0).split("\n")]
+        inner_html = escape_html("\n".join(quoted))
+        # Placeholders from steps 0-2 (tables, code) that sit inside the quote
+        # must be expanded here: the final restore pass is a single in-order
+        # sweep and never rescans the HTML stored in a later placeholder.
+        for key, html_content in placeholders:
+            inner_html = inner_html.replace(key, html_content)
+        return _make_placeholder(f"<blockquote>{inner_html}</blockquote>")
+
+    text = re.sub(
+        r"(?:^>.*$\n?)+",
+        _replace_blockquote,
+        text,
+        flags=re.MULTILINE,
+    )
+
+    # --- 4. HTML-escape remaining text ---
     text = escape_html(text)
 
-    # --- 4. Bold: **text** or __text__ ---
+    # --- 5. Horizontal rules: --- or *** or ___ → visual separator ---
+    text = re.sub(
+        r"^(?:---+|\*\*\*+|___+)\s*$",
+        "──────────",
+        text,
+        flags=re.MULTILINE,
+    )
+
+    # --- 6. Bold: **text** or __text__ ---
     text = re.sub(r"\*\*(.+?)\*\*", r"<b>\1</b>", text)
     text = re.sub(r"__(.+?)__", r"<b>\1</b>", text)
 
-    # --- 5. Italic: *text* (require non-space after/before) ---
+    # --- 7. Italic: *text* / _text_ (non-space at both ends; _ not inside words) ---
     text = re.sub(r"\*(\S.*?\S|\S)\*", r"<i>\1</i>", text)
-    # _text_ only at word boundaries (avoid my_var_name)
     text = re.sub(r"(?<!\w)_(\S.*?\S|\S)_(?!\w)", r"<i>\1</i>", text)
 
-    # --- 6. Links: [text](url) ---
+    # --- 8. Links: [text](url) ---
     text = re.sub(
         r"\[([^\]]+)\]\(([^)]+)\)",
         r'<a href="\2">\1</a>',
         text,
     )
 
-    # --- 7. Headers: # Header -> <b>Header</b> ---
+    # --- 9. Headers: # Header → <b>Header</b> ---
     text = re.sub(r"^#{1,6}\s+(.+)$", r"<b>\1</b>", text, flags=re.MULTILINE)
 
-    # --- 8. Strikethrough: ~~text~~ ---
+    # --- 10. Strikethrough: ~~text~~ ---
     text = re.sub(r"~~(.+?)~~", r"<s>\1</s>", text)
 
-    # --- 9. Restore placeholders ---
+    # --- 11. Unordered lists: - item / * item → bullet ---
+    text = re.sub(r"^[\-\*]\s+", "• ", text, flags=re.MULTILINE)
+
+    # --- 12. Ordered lists: 1. item → keep number with period ---
+    # (Telegram has no <ol>, so just clean up the formatting)
+    text = re.sub(r"^(\d+)\.\s+", r"\1. ", text, flags=re.MULTILINE)
+
+    # --- 13. Restore placeholders ---
     for key, html_content in placeholders:
         text = text.replace(key, html_content)
 
+    # --- 14. Repair HTML tag nesting ---
+    # Telegram is strict about nesting: <b><i>...</i></b> is OK,
+    # but <i><b>...</i></b> is rejected. Fix any mismatches.
+    text = _repair_html_nesting(text)
+
     return text