-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
130 lines (102 loc) · 3.04 KB
/
scrape.py
File metadata and controls
130 lines (102 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from mongoengine import *
import requests
import urllib
# Import Location Data
from california_locations import state, counties, blacklist
# Connect to Mongo
# Opens the default mongoengine connection to the database named 'github';
# no host or port is given, so the library's own defaults apply.
connect(db='github')
# City Model
class City(Document):
    """A city whose GitHub users are scraped, stored as a mongoengine document.

    A city is identified by its ``name``, ``county`` and ``state``; the
    scrape loop looks cities up by that triple and creates them on demand.
    """

    # Identifying triple used for lookup and creation.
    name = StringField()
    county = StringField()
    state = StringField()
    # Set to True once the scrape loop has finished every result page for
    # this city; completed cities are skipped on later runs.
    completed = BooleanField()
# User Model
class User(Document):
    """A GitHub user found by the search, stored as a mongoengine document.

    One document is saved per username returned by the search; the scrape
    loop skips usernames that already have a document.
    """

    # Account details copied from the search result.
    username = StringField()
    name = StringField()
    email = StringField()
    # The City being processed when the user was found, plus that city's
    # county and state copied onto the user document.
    city = ReferenceField(City)
    county = StringField()
    state = StringField()
    # Location string exactly as returned in the search result.
    location = StringField()
    # Remaining profile values copied from the search result.
    language = StringField()
    repos = IntField()
    followers = IntField()
# Optional debugging hook.  The breakpoint used to fire unconditionally,
# which stopped every run at a debugger prompt and made ipdb a hard
# dependency.  It is now opt-in: set the SCRAPE_DEBUG environment variable
# to any non-empty value to drop into ipdb before scraping starts.
import os

if os.environ.get('SCRAPE_DEBUG'):
    import ipdb
    ipdb.set_trace()
# Scrape Github URL
# ``url_for_page`` is the bound ``str.format`` of the template below: call it
# as url_for_page(search_terms, page_number).  The literal %TOKEN% marker is
# left in the result for the caller to substitute.
url_for_page = (
    "https://api.github.com/legacy/user/search/{}"
    "?sort=followers&order=desc&per_page=100&start_page={}"
    "&access_token=%TOKEN%"
).format

# Query to Format
# Base search qualifiers; the scrape loop adds a 'location' entry per city.
query = dict(followers='>3', repos='>0')
# Format Query
def fmt_query(q):
    """Build the search-term URL segment from a dict of qualifiers.

    Each ``key: value`` pair is rendered as ``key:value`` and the pairs are
    joined with ``%20``.  Values are percent-encoded with ``quote_plus``
    (spaces become ``+``), so characters such as ``<``, ``&``, ``#``, ``/``
    or ``?`` can no longer corrupt the URL.  The previous hand-rolled
    encoding escaped only ``>`` and the space character.  Keys are inserted
    as-is.

    Args:
        q: Mapping of qualifier name to its (string) value.

    Returns:
        The encoded search string; an empty string for an empty mapping.
    """
    # Function-scope import keeps the block self-contained and works on both
    # Python 2 and Python 3.
    try:
        from urllib.parse import quote_plus  # Python 3
    except ImportError:
        from urllib import quote_plus  # Python 2

    return '%20'.join(
        "{}:{}".format(key, quote_plus(value)) for key, value in q.items()
    )
# Read Access Token
# The token is substituted verbatim into the request URL, so surrounding
# whitespace (typically the trailing newline most editors add to .token)
# must be stripped.  The handle is not named ``file`` to avoid shadowing the
# Python 2 builtin.
with open('.token') as token_file:
    token = token_file.read().strip()
def _is_blacklisted(location, words):
    """Return True if any entry of ``words`` is a substring of ``location``.

    A missing location (``None``) is treated as an empty string instead of
    raising ``TypeError`` on the ``in`` test.
    """
    location = location or ''
    return any(word in location for word in words)


# Main scrape: for every city of every county, page through the user search
# (100 results per page), store each user that is neither already saved nor
# blacklisted by location, then mark the city as completed so a re-run skips
# it.  Written with print(...) and .items() so it runs on Python 2 and 3.
for county, cities in counties.items():
    for name in cities:
        # Create city if not exists
        try:
            city = City.objects.get(name=name, county=county, state=state)
        except DoesNotExist:
            print("Creating city...")
            city = City(name=name, county=county, state=state).save()

        if city.completed:
            print("\n[x] Already completed: {} in {} County, {}".format(
                city.name, city.county, city.state))
            continue

        print("\n[ ] Processing: {} in {} County, {}".format(
            city.name, city.county, city.state))
        query['location'] = city.name
        page = 0
        count = 0
        while True:
            page += 1
            # Log the URL with the %TOKEN% placeholder still in place so the
            # access token never reaches stdout or captured logs; the real
            # token is substituted only for the request itself.
            masked_url = url_for_page(fmt_query(query), page)
            print(" {} {}".format(page, masked_url))
            data = requests.get(masked_url.replace('%TOKEN%', token)).json()

            if 'users' not in data:
                # Fall back to the whole payload when there is no 'message'
                # key, so the real error is not hidden behind a KeyError.
                raise Exception("Message: {}".format(data.get('message', data)))

            users = data['users']
            if not users:
                break

            for udata in users:
                # Skip existing user documents
                try:
                    User.objects.get(username=udata['username'])
                except DoesNotExist:
                    pass
                else:
                    continue

                # Check user location against the blacklist
                if _is_blacklisted(udata.get('location'), blacklist):
                    continue

                user = User(
                    username=udata['username'],
                    name=udata.get('fullname'),
                    email=udata.get('email'),
                    city=city,
                    county=city.county,
                    state=city.state,
                    location=udata.get('location'),
                    language=udata.get('language'),
                    repos=udata.get('repos'),
                    followers=udata.get('followers'),
                )
                user.save()
                count += 1

            # A short page means this was the last page of results.
            if len(users) < 100:
                break

        print(' = {} users'.format(count))
        city.completed = True
        city.save()