-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathretriever.py
More file actions
93 lines (73 loc) · 2.71 KB
/
retriever.py
File metadata and controls
93 lines (73 loc) · 2.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/python
"""
This module contains classes for retrieving the links from the
files downloaded.
filename method returns an absolute path name corresponding to a url.
getLinks returns the links found on the page
"""
from os import makedirs
from os.path import isdir, exists, dirname, splitext
from urlparse import urlparse
import BeautifulSoup
class Retriever(object):
    """
    Resolves URLs to local file paths and extracts the links found in
    pages that have already been downloaded to disk.

    Attributes:
        docs_list -- document (non-HTML) links collected by getLinks.
    """

    # Extensions treated as downloadable documents rather than HTML pages.
    # Every entry carries a leading dot and is lower-case so it compares
    # directly against os.path.splitext() output.  (Bug fix: 'jpg',
    # 'jpeg' and 'docx' previously lacked the dot and could never match;
    # the old '.PDF' entry is subsumed by lower-casing the extension.)
    _invalidExt = [
        '.pdf', '.jpg', '.jpeg', '.doc',
        '.docx', '.gif', '.zip', '.rar',
    ]

    def __init__(self):
        # Accumulates document links across getLinks() calls.
        self.docs_list = []

    def filename(self, url, default_file = "index.html"):
        """
        Map *url* to a local file path of the form 'netloc/path' and
        create the containing directory when necessary.

        url          -- the URL to map.
        default_file -- file name used when the URL path names a
                        directory (empty path or path ending in '/').
        Returns the local (relative) file name corresponding to the url.
        """
        purl = urlparse(url)
        file_name = purl[1] + purl[2]  # netloc + path
        # Directory-style URLs are stored as <dir>/<default_file>.
        # (Bug fix: a trailing-slash path used to yield a double slash,
        # e.g. 'host/a//index.html'.)
        if purl[2] == '' or purl[2][-1] == '/':
            if not file_name.endswith('/'):
                file_name += '/'
            file_name += default_file
        folder_path = dirname(file_name)
        if not isdir(folder_path):  # create archive dir if nec.
            if not exists(folder_path):
                makedirs(folder_path)
        return file_name

    def getLinks(self, url, tag = "a", attr = "href"):
        """
        Parse the locally saved copy of *url* and return its distinct
        links as absolute URLs.

        url  -- URL whose cached page should be scanned.
        tag  -- HTML tag to search for (default: anchor tags).
        attr -- attribute holding the link target.
        Returns a list of distinct HTML links.  Links whose extension is
        listed in _invalidExt are recorded in docs_list and are NOT
        returned (bug fix: they used to be appended to both lists,
        contradicting the original "append only the html link" comment).
        Raises IOError when the cached file cannot be read; the original
        exception now propagates with its details intact instead of
        being replaced by a bare IOError/Exception.
        """
        # 'with' guarantees the file handle is closed (the original
        # leaked it).
        with open(self.filename(url)) as page:
            response = page.read()
        parsed_url = urlparse(url)
        domain = parsed_url[0] + '://' + parsed_url[1]
        soup = BeautifulSoup.BeautifulSoup(response)
        # NOTE(review): findAll is hard-wired to href=True, so a
        # non-default 'attr' only changes which attribute is read below.
        anchors = soup.findAll(tag, href = True)
        links = []
        for anchor in anchors:  # renamed: used to shadow the 'tag' param
            link = str(anchor[attr])  # convert the link to a string
            if urlparse(link)[1] == '':  # relative link -> make absolute
                link = domain + link
            # Case-insensitive check whether the extension marks a document.
            if splitext(link)[1].lower() in self._invalidExt:
                self.docs_list.append(link)
            else:
                links.append(link)  # append only the html links
        return list(set(links))  # returns only distinct links

    def getDocsList(self):
        """
        Return the list of document links collected so far.
        """
        return self.docs_list