-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrip-wget-slow.py
More file actions
68 lines (52 loc) · 2.22 KB
/
rip-wget-slow.py
File metadata and controls
68 lines (52 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import subprocess
import sys
from bs4 import BeautifulSoup
def rip_website(url, output_dir):
    """Recursively download *url* into *output_dir* using wget.

    Args:
        url: Start URL handed to wget.
        output_dir: Directory the download is written into; it is created
            if it does not exist and is used as wget's working directory.

    The exit status of wget is not inspected, so a partially failed
    download does not stop the caller.
    """
    print(f"Ripping website: {url}")

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Options first, then the start URL exactly once. Listing the URL twice
    # (as an earlier revision did) only makes wget queue the same start
    # page a second time.
    command = ['wget.exe', '--recursive', '--no-parent', url]
    subprocess.run(command, cwd=output_dir)
def extract_text_from_html(
    html_file,
    *,
    exclude_selectors=('script', 'style', 'nav', '.sidebar', '#footer'),
):
    """Return the text content of an HTML file with boilerplate removed.

    Args:
        html_file: Path of the HTML file to read.
        exclude_selectors: CSS selectors; every element matching one of
            them is removed from the parsed tree before the text is
            collected. Defaults to scripts, styles, navigation, elements
            with class ``sidebar`` and the element with id ``footer``.

    Returns:
        The text of the remaining document as produced by
        ``BeautifulSoup.get_text()``.
    """
    # Downloaded pages are not guaranteed to be valid UTF-8. Replacing
    # undecodable bytes (instead of raising UnicodeDecodeError) keeps one
    # badly encoded page from aborting the whole extraction run; valid
    # UTF-8 input is decoded exactly as before.
    with open(html_file, 'r', encoding='utf-8', errors='replace') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Remove elements matching the specified selectors
    for selector in exclude_selectors:
        for tag in soup.select(selector):
            tag.extract()

    # Extract text from what is left of the document
    return soup.get_text()
def remove_excessive_blank_lines(text):
    """Drop every empty or whitespace-only line from *text*.

    The text is split on ``'\\n'``; lines that still contain something
    after stripping are kept unchanged (their own leading and trailing
    whitespace is preserved) and re-joined with ``'\\n'``.
    """
    # str.strip returns '' (falsy) for blank lines, so filter() discards them.
    non_blank = filter(str.strip, text.split('\n'))
    return '\n'.join(non_blank)
def main():
    """Download a website with wget and dump its text into one file.

    Expects exactly one command-line argument, the website URL. The site
    is downloaded into ``website_dl``; the text of every HTML file found
    there is cleaned of blank lines and written to ``outputz.txt``.
    If the argument count is wrong, a usage line is printed and the
    function returns without doing anything else.
    """
    if len(sys.argv) != 2:
        print("Usage: python rip-wget-slow.py <website_url>")
        return

    website_url = sys.argv[1]  # URL of the website passed as command-line argument
    output_dir = "website_dl"  # Output directory for downloaded files
    output_file = 'outputz.txt'  # Output file for extracted text

    print(f"Processing website: {website_url}")

    # Rip the website
    rip_website(website_url, output_dir)
    print("Website ripping complete")

    # Collect the downloaded HTML files. The match is case-insensitive and
    # includes '.htm' so pages saved as e.g. 'INDEX.HTM' are not skipped;
    # sorting makes the order of the output independent of the order in
    # which os.walk happens to list the directory entries.
    html_files = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(output_dir)
        for name in names
        if name.lower().endswith(('.html', '.htm'))
    )

    # Extract text from each HTML file and write it to the output file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for html_file in html_files:
            text = extract_text_from_html(html_file)
            cleaned_text = remove_excessive_blank_lines(text)
            outfile.write(cleaned_text + '\n')
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()