-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathURL extractor.py
More file actions
111 lines (94 loc) · 3.75 KB
/
URL extractor.py
File metadata and controls
111 lines (94 loc) · 3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import requests
from bs4 import BeautifulSoup
import urllib3
import re
from urllib.parse import urljoin, urlparse
from typing import List, Optional
import argparse
# ANSI escape sequences used to colour the script's terminal messages.
# COLOR_RESET is appended after each coloured message to restore the
# terminal's default attributes.
COLOR_RED = "\033[91m"
COLOR_GREEN = "\033[92m"
COLOR_YELLOW = "\033[93m"
COLOR_RESET = "\033[0m"
# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted
# whenever fetch_url_content() issues a request with certificate verification
# turned off (its default). NOTE: this is a process-wide, import-time side
# effect.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Compiled once at import time rather than on every call.
# Accepts http/https/ftp/ftps URLs whose host is a dotted domain name,
# "localhost", or a dotted quad (octets are not range-checked), followed by an
# optional port and an optional path/query containing no whitespace.
# The pattern ends in \Z rather than $ because $ also matches just before a
# trailing newline, which would let "http://example.com\n" through.
_URL_PATTERN = re.compile(
    r'^(?:http|ftp)s?://'  # Protocol
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # Domain
    r'localhost|'  # Localhost
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # IPv4
    r'(?::\d+)?'  # Port
    r'(?:/?|[/?]\S+)\Z', re.IGNORECASE)


def is_valid_url(url: str) -> bool:
    """Return True if *url* is a syntactically acceptable URL.

    The check is purely lexical (a regular-expression match); no network
    access is performed.

    Args:
        url: Candidate URL string. The whole string must match, so leading
            or trailing whitespace (including a trailing newline) makes the
            URL invalid.

    Returns:
        True when the entire string matches the URL pattern, else False.
    """
    return _URL_PATTERN.match(url) is not None
def get_user_input(prompt: str, validation_func) -> str:
    """Prompt repeatedly until the user enters a value that validates.

    Args:
        prompt: Text shown to the user on every attempt.
        validation_func: Callable taking the stripped input string and
            returning a truthy value when the input is acceptable.

    Returns:
        The first whitespace-stripped input accepted by *validation_func*.
    """
    # Each pass reads one line; the loop body runs only for rejected input.
    while not validation_func(answer := input(prompt).strip()):
        print(f"{COLOR_RED}Invalid input. Please try again.{COLOR_RESET}")
    return answer
def fetch_url_content(url: str, verify_ssl: bool = False) -> Optional[str]:
    """Download *url* with a GET request and return the response body.

    Args:
        url: Address to fetch.
        verify_ssl: Passed to requests as ``verify``. Note that the default
            (False) disables TLS certificate verification.

    Returns:
        The decoded response text on success, or None if the request failed
        or the server answered with an error status (the error is printed).
    """
    request_options = {
        # Browser-like User-Agent sent with the request.
        'headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        },
        'verify': verify_ssl,
        'timeout': 10,
        'allow_redirects': True,
    }
    try:
        reply = requests.get(url, **request_options)
        # Turn 4xx/5xx answers into an exception handled below.
        reply.raise_for_status()
        return reply.text
    except requests.exceptions.RequestException as err:
        print(f"{COLOR_RED}Request failed: {err}{COLOR_RESET}")
    return None
def extract_urls(html: str, base_url: str) -> List[str]:
    """Extract and normalize the URLs referenced by an HTML document.

    Looks at ``<a>``, ``<link>``, ``<script>`` and ``<img>`` tags. ``<a>`` and
    ``<link>`` carry their target in ``href`` while ``<script>`` and ``<img>``
    use ``src``, so both attributes are inspected on every matched tag.
    (Filtering on ``href`` alone silently skips every script and image.)

    Args:
        html: HTML markup to scan.
        base_url: URL the document was loaded from; relative references are
            resolved against it with ``urljoin``.

    Returns:
        A sorted list of unique absolute URLs. Empty if none were found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    found = set()
    for tag in soup.find_all(['a', 'link', 'script', 'img']):
        for attr in ('href', 'src'):
            value = tag.get(attr)
            if value is None:
                continue
            # An empty attribute resolves to base_url itself, matching what
            # the href-only version did for empty hrefs.
            found.add(urljoin(base_url, value.strip()))
    return sorted(found)
def save_results(urls: List[str], filename: str) -> bool:
    """Write the given URLs to *filename*, one per line.

    The file is overwritten, encoded as UTF-8, and has no trailing newline
    after the last URL.

    Args:
        urls: URLs to write.
        filename: Path of the output file.

    Returns:
        True if the file was written, False if an I/O error occurred (the
        error is printed).
    """
    payload = "\n".join(urls)
    try:
        with open(filename, mode='w', encoding='utf-8') as out_file:
            out_file.write(payload)
        print(f"{COLOR_GREEN}Successfully saved {len(urls)} URLs to {filename}{COLOR_RESET}")
    except IOError as err:
        print(f"{COLOR_RED}File save failed: {err}{COLOR_RESET}")
        return False
    return True
def main():
    """Run the interactive URL-extraction flow.

    Prompts for a target URL, an output filename and whether to verify TLS
    certificates, then fetches the page, extracts its URLs and saves them.
    Progress and errors are reported on stdout; nothing is returned.
    """
    print(f"{COLOR_GREEN}\n=== URL Extractor 2.0 ==={COLOR_RESET}")
    # Get validated user input
    target_url = get_user_input(
        "Enter target URL (http/https): ",
        is_valid_url
    )
    # An empty answer is accepted and replaced by the default filename.
    output_file = get_user_input(
        "Enter output filename (default: urls.txt): ",
        lambda x: x.endswith('.txt') or x == ''
    ) or 'urls.txt'
    # Security confirmation. Strip the answer and accept "yes" as well, so
    # that stray whitespace or the long form does not silently turn
    # certificate verification off.
    answer = input("Verify SSL certificates? (y/N): ").strip().lower()
    verify_ssl = answer in ('y', 'yes')
    # Fetch and process content
    html_content = fetch_url_content(target_url, verify_ssl)
    # Only None signals a failed fetch; an empty body is a successful
    # response that simply contains no URLs.
    if html_content is None:
        print(f"{COLOR_RED}Failed to extract content from {target_url}{COLOR_RESET}")
        return
    print(f"{COLOR_YELLOW}Processing {target_url}...{COLOR_RESET}")
    urls = extract_urls(html_content, target_url)
    if urls:
        save_results(urls, output_file)
    else:
        print(f"{COLOR_YELLOW}No URLs found on the page{COLOR_RESET}")
# Start the interactive flow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()