Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ WEBFETCH_OUTPUT={"*": "raw", "example.com": "trafilatura", "news.com": "markdown
| `"raw"` | Return raw HTML as-is (default) |
| `"markdown"` | Convert full HTML to Markdown via `markdownify` |
| `"trafilatura"` | Extract main content and return as Markdown via `trafilatura` (falls back to raw if extraction fails) |
| `"lighthtml"` | Strip `<style>`, `<script>` (except JSON-LD), comments, and all tag attributes; returns minimal HTML structure |

Merge order (later wins): `*` → domain-specific → per-call `output_format` parameter.

Expand Down
49 changes: 48 additions & 1 deletion server.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
# Constants
# ---------------------------------------------------------------------------

_VALID_OUTPUT_FORMATS = frozenset({"raw", "markdown", "trafilatura", "json"})
_VALID_OUTPUT_FORMATS = frozenset({"raw", "markdown", "trafilatura", "json", "lighthtml"})

# Default response-body cap (10 MB). Prevents OOM when a remote server returns
# an unexpectedly large payload. Pass max_bytes=-1 to disable explicitly.
Expand Down Expand Up @@ -763,10 +763,57 @@ def _apply_output_format(content: str, fmt: str) -> str:
except json.JSONDecodeError:
_log.warning("output_format=json but response is not valid JSON; returning raw")
return content
if fmt == "lighthtml":
return _apply_lighthtml(content)
_log.error("Unknown output format %r; returning raw content", fmt)
return content


def _apply_lighthtml(html: str) -> str:
"""
Return a lightweight version of *html* with all noise stripped.

Removes:
- <style> tags (and their content)
- Inline style attributes
- All <script> tags EXCEPT type="application/ld+json"
- HTML comments
- All tag attributes (class, id, data-*, aria-*, etc.)

JSON-LD <script> blocks are kept with only their ``type`` attribute.
Falls back to raw HTML if BeautifulSoup is unavailable or raises.
"""
try:
from bs4 import BeautifulSoup, Comment # lazy import

soup = BeautifulSoup(html, "html.parser")

# Remove <style> blocks
for tag in soup.find_all("style"):
tag.decompose()

# Remove <script> blocks except JSON-LD
for tag in soup.find_all("script"):
if tag.get("type", "").lower() != "application/ld+json":
tag.decompose()

# Remove HTML comments
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()

# Strip all attributes; restore type="application/ld+json" on kept scripts
for tag in soup.find_all(True):
if tag.name and tag.name.lower() == "script":
tag.attrs = {"type": "application/ld+json"}
else:
tag.attrs = {}

return str(soup)
except Exception as exc:
_log.warning("lighthtml apply failed (%s); returning raw HTML", exc)
return html


def _extract_text(html: str) -> str:
"""Strip HTML tags and collapse whitespace."""
text = re.sub(r"<script[^>]*>.*?</script>", " ", html, flags=re.DOTALL | re.I)
Expand Down
Loading