-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrawl_delicious.py
More file actions
32 lines (25 loc) · 897 Bytes
/
crawl_delicious.py
File metadata and controls
32 lines (25 loc) · 897 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import cPickle
import sys
import urllib

import simplejson

import featurize
# Yahoo! BOSS web-search URL template. The two placeholders are the
# (URL-quoted) query and the zero-based result offset; every request asks for
# 50 results with long abstracts and the delicious_toptags view.
# NOTE(review): the appid credential is hard-coded in this URL -- consider
# loading it from configuration or the environment instead.
BOSS = "http://boss.yahooapis.com/ysearch/web/v1/%s?appid=DfNrQ3bV34H_Ll3bncrHNjWL6z_1K_1xX8UIfpugGKdPGTZ_CkCJjNq8FxTLlvI-&abstract=long&view=delicious_toptags&start=%d&count=50"

# Results per request; must stay in sync with the count=50 parameter in BOSS.
PAGE_SIZE = 50
# Number of consecutive result pages requested for each tag.
PAGES_PER_TAG = 10

# Accumulates (vector, lowercased tag name) pairs, where vector is whatever
# featurize.vectorize returns for a result's title and abstract.
vector_data = []

# Each line of tags.txt is used as one search query.
with open("tags.txt", "r") as tag_file:
    for line in tag_file:
        query = line.strip()
        if not query:
            # A blank line would otherwise issue requests with an empty query.
            continue
        # Quote every reserved character (including "/") so that tags such as
        # "c#" or "a&b" cannot alter the path or query string of the request.
        quoted_query = urllib.quote(query, safe="")
        for page in xrange(PAGES_PER_TAG):
            offset = page * PAGE_SIZE
            req = BOSS % (quoted_query, offset)
            try:
                conn = urllib.urlopen(req)
                try:
                    resp = conn.read()
                finally:
                    conn.close()
                data = simplejson.loads(resp)
                for result in data["ysearchresponse"]["resultset_web"]:
                    text = result["title"] + " " + result["abstract"]
                    vector = featurize.vectorize(text)
                    top_tags = result["delicious_toptags"]
                    if not top_tags:
                        # No tag information attached to this result.
                        continue
                    # One training pair per tag attached to the result.
                    for tag in top_tags["tags"]:
                        vector_data.append((vector, tag["name"].lower()))
            except Exception as err:
                # The crawl is best-effort: a failed or malformed page is
                # reported and skipped (pairs already collected from it are
                # kept). Catching Exception rather than using a bare except
                # lets KeyboardInterrupt/SystemExit stop the crawl.
                sys.stderr.write(
                    "skipping query %r at offset %d: %r\n" % (query, offset, err)
                )

# Binary mode keeps the pickle bytes identical across platforms; the with
# block guarantees the file is flushed and closed.
with open("vector_data.cpickle", "wb") as out_file:
    cPickle.dump(vector_data, out_file)