# scraper.py
url = "http://www.interpol.int/Public/Wanted/Search/Form.asp"
baseurl = "http://www.interpol.int"
record={}
import mechanize
import lxml.etree, lxml.html
from lxml.html.clean import Cleaner
import scraperwiki
#SFD:additions to get suspect links
################################################################################
#
# Retrieve function - given a url, attempt to open it and return the response;
# returns False if the page could not be fetched
#
################################################################################
def retrieve(url):
    try:
        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.44 Safari/534.7')]
        return browser.open(url)
    except Exception:
        print "unable to retrieve page"
        return False
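# Example use (retrieve returns False on failure, so callers must check
# the result before parsing):
#   page = retrieve(baseurl)
#   if page:
#       root = lxml.html.parse(page).getroot()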
def getsusdata(url):
    # fetch one suspect's detail page and save its field/value pairs
    data = {}
    returl = retrieve(url)
    if returl:
        root = lxml.html.parse(returl).getroot()
        # the page has no classes or ids, so pick out the interesting
        # tables by index and strip the markup wrapping their text
        sustable = root.cssselect('table')
        instable = [sustable[3], sustable[5], sustable[7]]
        cleantables = [cleanup(lxml.etree.tostring(t), ['td', 'thead', 'font', 'strong'])
                       for t in instable]
        for cleantable in cleantables:
            for tr in cleantable.cssselect('tr'):
                if tr is not None and tr.text is not None:
                    # rows read "Field name: value"; split on the first colon only
                    ele = tr.text.replace("\n", "").split(':', 1)
                    if len(ele) == 2:
                        data[ele[0].strip()] = ele[1].strip()
        # "Date of birth" is the unique key used to de-duplicate records
        scraperwiki.sqlite.save(["Date of birth"], data)
    else:
        print "unable to get the links"
def cleanup(data, tags):
    # remove the given tags (keeping their text content) and re-parse the result
    cleaner = Cleaner(remove_tags=tags)
    clean = cleaner.clean_html(data)
    return lxml.html.fromstring(clean)

#SFD: additions to get suspect links
br = mechanize.Browser()
br.set_handle_robots(False)  # match retrieve(): don't let robots.txt block the form
br.open(url)
br.select_form(name='form1')
br['cboNbHitsPerPage'] = ['200']
br['cboNbPages'] = ['50']
countrycontrol = br.form.find_control('ArrestWarrantIssuedBy')
#print [ item.name for item in countrycontrol.items ]
# should be a loop over each country; see the sketch below
# (a good way to thin the results down, as there is always an issuing country)
br['ArrestWarrantIssuedBy'] = [countrycontrol.items[160].name]  # happens to be NIGERIA
response = br.submit()  # behaves like a file handle
# lxml is not the friendliest library, but cssselect helps, and it handles
# all the string character escaping issues
root = lxml.html.parse(response).getroot()
tables = root.cssselect('table')
# the elements are not given classes or ids, so we have to find them by index
# ("the 4th table in the page")
# extract result counts (to verify that the rest of the data is consistent)
#print [ b.text for b in tables[3].cssselect('tr td font b') ]
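# A sketch of the country loop flagged above, left commented out: it assumes
# that re-opening the search form and resubmitting once per country is enough,
# and that the result-parsing code below would move inside the loop.
#countrynames = [ item.name for item in countrycontrol.items ]
#for countryname in countrynames:
#    br.open(url)
#    br.select_form(name='form1')
#    br['cboNbHitsPerPage'] = ['200']
#    br['cboNbPages'] = ['50']
#    br['ArrestWarrantIssuedBy'] = [countryname]
#    response = br.submit()
#    # ...parse each response as below...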
#SFD: additions to get suspect links
susurls = []
for susimage in tables[4].cssselect('table table img'):
    #print "--", lxml.etree.tostring(susimage)
    susdata = susimage
    # the suspect's photo sits alongside the link to their detail page,
    # so collect the hrefs hanging off the image's parent element
    suslinks = susdata.getparent()
    susurls.append([a.get('href') for a in suslinks.cssselect('a')])
    # climb seven levels to reach the element enclosing the whole result
    for i in range(7):
        susdata = susdata.getparent()
    # print out the unstructured data from the fields
    #print "\n\n".join([ lxml.etree.tostring(td) for td in susdata.cssselect('td') ])

#SFD: additions to get suspect links
# visit each suspect's page and save the structured data it holds
for susurl in susurls:
    getsusdata(baseurl + susurl[0])
#nexturl = [a.get('href') for a in root.cssselect('a') if re.search(r"Next", lxml.etree.tostring(a))]
#if nexturl is not None:
#SFD: additions to get suspect links
# To do:
# (1) get the structured fields out of this table and structure them
# (2) step into each page per suspect and parse the data out of that
# (3) save the data to the datastore
# (4) work out how to improve the datastore so as to monitor changes to what appears on the interpol webpage
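# A minimal sketch of the pagination step hinted at by the commented-out
# nexturl lines above. follownext is a hypothetical helper, not called yet;
# it assumes the results page marks its "next page" link with the text "Next"
# and that the href is relative to baseurl.
import re

def follownext(root):
    # find anchors whose markup mentions "Next" and follow the first one
    nexturls = [a.get('href') for a in root.cssselect('a')
                if re.search(r"Next", lxml.etree.tostring(a))]
    if nexturls:
        nextpage = retrieve(baseurl + nexturls[0])
        if nextpage:
            return lxml.html.parse(nextpage).getroot()
    return None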