forked from RichardSwierk/WebCopy
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
109 lines (98 loc) · 4.02 KB
/
main.py
File metadata and controls
109 lines (98 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/python
import requests, os, shutil
from pathlib import Path
# Crawl state shared by the functions in this script.
sources = []           # resource paths registered by add() for the current pass
oldSources = []        # resource paths registered during an earlier pass
filePaths = []         # local paths of saved .htm/.html pages still to be scanned
printFileNames = True  # when True, add() prints each newly registered path


def _normalise_website(raw):
    """Return *raw* as a URL with a scheme, a ``www.`` host and a path.

    A missing scheme defaults to ``http`` (requests refuses URLs without
    one), ``www.`` is inserted into the host rather than in front of the
    scheme, and a ``/`` always follows the host so that resource paths can
    be resolved against the result.
    """
    text = raw.strip()
    scheme, separator, remainder = text.partition('://')
    if not separator:
        scheme, remainder = 'http', text
    host, _, page = remainder.partition('/')
    if 'www.' not in host:
        host = 'www.' + host
    return f'{scheme}://{host}/{page}'


def _folder_name(url):
    """Return the output folder name derived from the host part of *url*.

    The name is the host text after ``www.``, cut at ``.com`` when that
    suffix is present; hosts under other top-level domains keep their full
    name instead of losing their last character.
    """
    host = url.partition('://')[2].partition('/')[0]
    name = host[host.find('www.') + 4:]
    dot_com = name.find('.com')
    if dot_com != -1:
        name = name[:dot_com]
    return name


# Example input: http://www.suzannecollinsbooks.com/
website = _normalise_website(str(input('Enter website url: ')))
filename = _folder_name(website)

# The folder is deleted below, so refuse names that are empty, consist only
# of dots ("." or "..") or contain a path separator: those would make
# rmtree operate on the current or a parent directory.
if not filename.strip('.') or '\\' in filename:
    raise SystemExit(f'Cannot derive an output folder name from {website!r}')

# Start from a clean output folder on every run.
my_dir = Path(filename)
if my_dir.is_dir():
    shutil.rmtree(filename)
os.makedirs(filename, exist_ok=True)
def getWebsite(url, path):
    """Download *url* and save the raw response body to *path*.

    Missing parent directories of *path* are created first. The body is
    written regardless of the HTTP status code; no status check is made.

    Args:
        url: Address passed to ``requests.get``.
        path: Destination file; overwritten if it already exists.

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If the directory or the file cannot be written.
    """
    # A timeout keeps one unresponsive server from stalling the whole copy.
    response = requests.get(url, timeout=30)
    parent = os.path.dirname(path)
    # os.makedirs('') raises, so skip it when path is a bare file name.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # The context manager closes the file even if the write fails.
    with open(path, 'wb') as out:
        out.write(response.content)
def getContent(pathToFile):
    """Return the lines of a text file with surrounding whitespace removed.

    The file is decoded as UTF-8 and undecodable bytes are replaced with
    U+FFFD instead of raising, because saved pages may be in any encoding
    and only their ASCII markup matters to the link scanner.

    Args:
        pathToFile: Path of the file to read.

    Returns:
        A list with one stripped string per line; blank lines are kept as
        empty strings.
    """
    with open(pathToFile, encoding='utf-8', errors='replace') as page:
        return [line.strip() for line in page]
def add(found, lang):
    """Extract the resource path ending in *lang* from *found* and record it.

    The text is cut right after the first occurrence of *lang*; what follows
    the last space, and then the last double quote, of that prefix is taken
    as the path. A path already present in ``sources`` or ``oldSources`` is
    ignored; a new one is appended to ``sources`` and, when
    ``printFileNames`` equals True, printed.

    Args:
        found: A line of markup containing *lang*.
        lang: File extension to look for, including the dot (e.g. '.css').
    """
    cut = found.find(lang) + len(lang)
    candidate = found[:cut]
    # rpartition returns the whole string as the tail when the separator is
    # absent, which matches slicing from rfind(...) + 1.
    candidate = candidate.rpartition(' ')[2]
    candidate = candidate.rpartition('"')[2]

    if candidate in sources or candidate in oldSources:
        return
    if printFileNames == True:
        print(candidate)
    sources.append(candidate)
def _segments_ending_with(text, ext):
    """Yield the slices of *text* that end in a genuine occurrence of *ext*.

    An occurrence is genuine when the extension is not continued by a letter
    or digit, so '.htm' is not matched inside '.html' nor '.js' inside
    '.json'. Each slice starts where the previous occurrence ended, which
    makes the yielded extension the first one inside its slice.
    """
    segment_start = 0
    match = text.find(ext)
    while match != -1:
        end = match + len(ext)
        if end == len(text) or not text[end].isalnum():
            yield text[segment_start:end]
        segment_start = end
        match = text.find(ext, end)


def findSources(content):
    """Scan lines of markup for local resource links and register them.

    Only lines mentioning ``href`` or ``src`` are examined. For every known
    file extension, each occurrence on the line is handed to ``add`` as the
    text leading up to and including that extension, unless the link it
    belongs to is an absolute or protocol-relative URL.

    Args:
        content: Iterable of text lines.
    """
    extensions = ('.js', '.php', '.css', '.jpg', '.gif', '.png', '.htm', '.html')
    for line in content:
        # add() finds the start of a path by its opening double quote, so
        # every single quote is normalised, not only the first one.
        found = str(line).replace("'", '"')
        if 'href' not in found and 'src' not in found:
            continue
        for ext in extensions:
            for segment in _segments_ending_with(found, ext):
                # Same trimming add() applies; used here only to judge each
                # link on its own, so an external link does not hide the
                # local links that share its line.
                link = segment.rpartition(' ')[2].rpartition('"')[2]
                if '://' in link or link.startswith('//'):
                    continue
                add(segment, ext)
def getSources():
    """Download every path listed in ``sources`` into the output folder.

    Each path is resolved against ``website`` to build its URL and is
    mirrored beneath ``filename`` on disk. The local paths of downloaded
    ``.htm``/``.html`` pages are appended to ``filePaths``.
    """
    import posixpath
    from urllib.parse import urljoin

    for source in sources:
        # Anchor the path at "/" before normalising: "./" and "../" parts
        # are collapsed and can never point outside the output folder, and
        # paths with no slash or no leading slash keep their full name.
        relative = posixpath.normpath('/' + source.lstrip('/'))
        if relative == '/':
            continue
        local_path = filename + relative
        # getWebsite creates missing parent directories and writes the body
        # as bytes, so images need no separate branch.
        getWebsite(urljoin(website, source), local_path)
        # '.htm' is also a substring of '.html'.
        if '.htm' in source:
            filePaths.append(local_path)
# First pass: save the landing page, scan it and download what it links to.
index_page = filename + '/index.html'
getWebsite(website, index_page)
findSources(getContent(index_page))
getSources()

# Keep the first-pass paths in oldSources so add() skips them, and start an
# empty list for the second pass.
oldSources = sources
sources = []

# Second pass: scan every HTML page saved so far, then download the newly
# registered resources.
for page in filePaths:
    findSources(getContent(page))
getSources()

print("\nWebsite copied successful")
print(f"File Location: {os.path.abspath(filename)}")