11"""URL ingestion engine — fetch URLs, extract content, generate ideas."""
22
3+ import ipaddress
34import re
5+ import socket
46from dataclasses import dataclass
57from urllib .parse import parse_qs , urlencode , urlparse
68
@@ -31,16 +33,70 @@ class UrlContent:
3133 text : str
3234
3335
36+ def _check_ssrf (hostname : str ) -> None :
37+ """Resolve hostname and raise ValueError if it resolves to a private/reserved address.
38+
39+ Protects against Server-Side Request Forgery (SSRF) by blocking requests
40+ to loopback, private, link-local, and other reserved IP ranges.
41+
42+ Raises:
43+ ValueError: If the hostname resolves to a non-public IP address.
44+ socket.gaierror: If the hostname cannot be resolved (propagated to caller).
45+ """
46+ try :
47+ addrinfos = socket .getaddrinfo (hostname , None )
48+ except socket .gaierror :
49+ # DNS resolution failure — not a private IP issue; let caller handle
50+ raise
51+
52+ for addrinfo in addrinfos :
53+ raw_ip = addrinfo [4 ][0 ]
54+ try :
55+ addr = ipaddress .ip_address (raw_ip )
56+ except ValueError :
57+ continue
58+ if addr .is_loopback or addr .is_private or addr .is_link_local or addr .is_reserved :
59+ raise ValueError (f"Requests to private/reserved addresses are not allowed: { raw_ip } " )
60+
61+
def validate_url(url: str) -> bool:
    """Check that *url* is valid http(s) and does not target a private/reserved address.

    Structural problems (empty, wrong scheme, missing host) return False;
    SSRF violations raise so callers cannot accidentally ignore them.

    Returns:
        True if the URL is structurally valid and resolves to a public address.

    Raises:
        ValueError: If the URL points at a private, loopback, link-local, or
            otherwise reserved IP (SSRF guard).
        socket.gaierror: If the hostname cannot be resolved (propagated from
            ``_check_ssrf``).
    """
    if not url:
        return False
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            return False
    except Exception:
        return False

    # `hostname` is the netloc with port (and IPv6 brackets) stripped.
    hostname = parsed.hostname
    if not hostname:
        return False

    # For bare IP literals, validate directly without a DNS lookup. The
    # "is it an IP at all?" parse is kept in its own try/except so the SSRF
    # rejection below can never be confused with the parse failure — the
    # previous version told them apart by matching on the exception message,
    # which would silently disable the guard if the wording ever changed.
    try:
        addr = ipaddress.ip_address(hostname)
    except ValueError:
        # Hostname is not a bare IP — resolve via DNS and vet every address.
        _check_ssrf(hostname)
        return True

    if addr.is_loopback or addr.is_private or addr.is_link_local or addr.is_reserved:
        raise ValueError(f"Requests to private/reserved addresses are not allowed: {hostname}")
    return True
99+
44100
45101def extract_domain (url : str ) -> str :
46102 """Extract clean domain from URL (strip www.)."""
@@ -63,8 +119,19 @@ def clean_url(url: str) -> str:
63119
64120
65121async def fetch_url_content (url : str ) -> UrlContent :
66- """Fetch URL and extract content."""
67- async with httpx .AsyncClient (follow_redirects = True , timeout = 30.0 ) as client :
122+ """Fetch URL and extract content.
123+
124+ Validates the URL for SSRF safety before making any network request.
125+ Redirects are disabled to prevent redirect-based SSRF bypasses.
126+
127+ Raises:
128+ ValueError: If the URL resolves to a private/reserved address.
129+ UrlFetchError: If the HTTP response indicates an error (status >= 400).
130+ """
131+ # SSRF guard — must run before any network I/O
132+ validate_url (url )
133+
134+ async with httpx .AsyncClient (follow_redirects = False , timeout = 30.0 ) as client :
68135 response = await client .get (url )
69136
70137 if response .status_code >= 400 :
0 commit comments