WebCopy/main.py at master · Wuamp-dev/WebCopy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/python
import requests, os, shutil
from pathlib import Path

# Resource paths discovered in the page(s) currently being scanned.
sources=[]
# Resource paths handled in an earlier pass; add() skips anything listed here.
oldSources=[]
# Local paths of downloaded .htm/.html pages that still have to be scanned.
filePaths=[]
# When True, add() prints every newly discovered resource path.
printFileNames=True
# Example input: http://www.suzannecollinsbooks.com/
website=str(input('Enter website url: ')).strip()

# requests needs an explicit scheme; fall back to http:// when none was typed.
_scheme,_sep,_rest=website.partition('://')
if not _sep:
        _scheme,_rest='http',website
_host,_slash,_path=_rest.partition('/')
# Put 'www.' after the scheme, never in front of it.
if 'www.' not in _host:
        _host='www.'+_host
# Always keep a '/' after the host so resource paths can be appended to it.
website=_scheme+'://'+_host+'/'+_path

# Output directory: the host text after 'www.', cut at '.com' when present.
# Hosts without '.com' keep their full name instead of being sliced at -1.
_name=_host[_host.find('www.')+4:]
if '.com' in _name:
        _name=_name[:_name.find('.com')]
filename=_name
# The directory is wiped below, so never let it resolve to the current or
# parent directory (Path('') and Path('.') are both the working directory).
if filename in ('', '.', '..') or '/' in filename or '\\' in filename:
        raise SystemExit('Cannot derive an output directory from: '+website)

# Start from an empty output directory on every run.
my_dir=Path(filename)
if my_dir.is_dir():
        shutil.rmtree(filename)
os.makedirs(filename, exist_ok=True)

def getWebsite(url,path):
        """Download *url* and store the raw response body at *path*.

        url  -- address passed unchanged to requests.get().
        path -- local file to write; missing parent directories are created.

        The body is written in binary mode, exactly as received.
        """
        html=requests.get(url)
        parent=os.path.dirname(path)
        # os.makedirs('') raises FileNotFoundError, so skip bare file names.
        if parent:
                os.makedirs(parent, exist_ok=True)
        # The context manager closes the file even when the write fails.
        with open(path,'wb') as f:
                f.write(html.content)

def getContent(pathToFile):
        """Return the lines of the text file at *pathToFile*, stripped.

        The file is decoded as UTF-8.  Bytes that are not valid UTF-8 are
        replaced with U+FFFD instead of raising, because the file holds
        whatever bytes the server sent and may use another encoding.

        Returns a list with one string per line, with leading and trailing
        whitespace (including the newline) removed.
        """
        with open(pathToFile, encoding='utf-8', errors='replace') as f:
                return [line.strip() for line in f]

def add(found,lang):
        """Extract the path ending in *lang* from *found* and record it.

        found -- a line of text containing the extension *lang*.
        lang  -- file extension to cut at, e.g. '.css'.

        The line is cut right after the first occurrence of *lang*; what
        follows the last space, and then the last double quote, of that
        prefix is taken as the path.  A path already present in the global
        ``sources`` or ``oldSources`` is ignored; a new one is printed (when
        ``printFileNames`` is set) and appended to ``sources``.
        """
        cut=found.find(lang)+len(lang)
        candidate=found[:cut]
        # Keep only the text after the last space, then after the last quote.
        candidate=candidate.rsplit(' ',1)[-1]
        candidate=candidate.rsplit('"',1)[-1]
        if candidate in sources or candidate in oldSources:
                return
        if printFileNames==True:
                print (candidate)
        sources.append(candidate)

def findSources(content):
        """Scan lines of HTML for local resource references and record them.

        content -- iterable of text lines, e.g. the list from getContent().

        A line is examined when it contains 'href' or 'src' and does not
        contain 'http' (an absolute http:// or https:// URL).  For every
        known file extension present on such a line, add() is called to
        extract the path and append it to the global ``sources`` list.
        """
        # Checked in this order; it only affects the order paths are recorded.
        extensions=('.js','.php','.css','.jpg','.gif','.png')
        for line in content:
                # add() cuts the path at the last double quote, so turn every
                # single-quoted attribute into a double-quoted one first.
                found=str(line).replace("'",'"')
                if 'href' not in found and 'src' not in found:
                        continue
                # 'https' contains 'http', so this one test rejects both
                # schemes; absolute URLs cannot be mapped onto the site root.
                if 'http' in found:
                        continue
                for ext in extensions:
                        if ext in found:
                                add(found,ext)
                # '.htm' is a prefix of '.html': test the longer one first so
                # 'page.html' is not also recorded as a bogus 'page.htm'.
                if '.html' in found:
                        add(found,'.html')
                elif '.htm' in found:
                        add(found,'.htm')

def getSources():
        """Download every path in the global ``sources`` list into the copy.

        Each path is fetched relative to ``website`` and stored under the
        ``filename`` directory at the same relative location.  Pages whose
        path contains '.htm' (and is not a .jpg/.gif) are appended to
        ``filePaths`` so the caller can scan them for further resources.
        Empty paths and paths with a '..' segment are skipped.
        """
        for source in sources:
                # Handle '/css/a.css', 'css/a.css' and 'a.css' the same way.
                # Slicing at source.find('/') is wrong without a leading
                # slash: it drops the first directory, and with no slash at
                # all find() returns -1 and the slice is just the last char.
                rel=source.lstrip('/')
                # Paths come from remote HTML; never write outside the copy.
                if not rel or '..' in rel.split('/'):
                        continue
                url=website.rstrip('/')+'/'+rel
                local=filename+'/'+rel
                os.makedirs(os.path.dirname(local), exist_ok=True)
                # Images and text files are saved identically (raw bytes).
                getWebsite(url,local)
                is_image='.jpg' in source or '.gif' in source
                if not is_image and '.htm' in source:
                        filePaths.append(local)

# Pass 1: save the landing page, collect the resources it references and
# download them.
getWebsite(website,filename+'/index.html')
findSources(getContent(filename+'/index.html'))
getSources()

# Pass 2: remember what has been fetched so far, then scan every downloaded
# .htm/.html page for resources that have not been seen yet and fetch those.
oldSources,sources=sources,[]
for source in filePaths:
        findSources(getContent(source))
getSources()


print("\nWebsite copied successful")
print(f"File Location: {os.path.abspath(filename)}")