-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathword2vec.py
More file actions
116 lines (96 loc) · 2.96 KB
/
word2vec.py
File metadata and controls
116 lines (96 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from tumblrUtil import TumblrAgent as TA
from vocabUtil import VocabAgent as VA
import sys
import numpy as np
import math
def loadModel(number, path='small-GN-300-neg.txt', dim=300):
    """Load up to `number` word vectors from a whitespace-separated text file.

    Every line is read as ``word v1 v2 ... v<dim>``; reading starts at the
    very first line (no header line is skipped). If a word occurs more than
    once, the last occurrence wins.

    Args:
        number: maximum number of lines (words) to read from the top of the file.
        path: location of the embedding file.
        dim: number of float components to read after the word on each line.

    Returns:
        dict mapping each word to a numpy array holding its `dim` components.
        Fewer than `number` entries are returned if the file is shorter.

    Raises:
        IndexError: a line is blank or has fewer than ``dim + 1`` fields.
        ValueError: a component cannot be parsed as a float.
    """
    model = {}
    # The context manager guarantees the handle is released even on a parse error.
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            if i >= number:
                break
            fields = line.split()
            word = fields[0]
            if i % 10000 == 0:
                # Progress marker on stderr so stdout stays clean for results.
                sys.stderr.write('%s %s\n' % (i, word))
            model[word] = np.array([float(fields[j]) for j in range(1, dim + 1)])
    return model
def cosineDistance(v1, v2):
    """Return the cosine of the angle between `v1` and `v2`.

    Despite the name, this is a similarity: larger values mean the vectors
    point in more similar directions, and callers rank results in descending
    order of this value.

    Args:
        v1, v2: equal-length numeric sequences or numpy arrays.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``float('-inf')`` when either
        vector has zero length so that it ranks below every real score.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    # Vectorized norms instead of a Python-level loop over every component.
    normA = math.sqrt(np.dot(a, a))
    normB = math.sqrt(np.dot(b, b))
    if normA == 0 or normB == 0:
        return float('-inf')
    return np.dot(a, b) / (normA * normB)
def loadVecs(names, path='w2v_for_blogs.txt'):
    """Read saved vectors from `path` and pair them with `names` by position.

    Line ``i`` of the file is parsed as whitespace-separated floats and paired
    with ``names[i]``.

    Args:
        names: sequence of names, one per line of the file, in file order.
        path: file holding one vector per line.

    Returns:
        list of ``(name, vector)`` tuples where each vector is a list of floats.

    Raises:
        IndexError: the file has more lines than `names` has entries.
        ValueError: a token cannot be parsed as a float.
    """
    vecs = []
    # Close the handle deterministically instead of relying on garbage collection.
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            vecs.append((names[i], [float(x) for x in line.split()]))
    return vecs
def loadVecFromBlog(ta, blog):
    """Build a blog's vector by summing the word vectors found in its posts.

    For every post of `blog`, the vectors of its tags and of the terms
    returned by ``VA.extractTermsFromPost`` are added together; words absent
    from the model are ignored. The sum is not normalized.

    NOTE: reads the module-level ``model`` dict (word -> vector), which the
    script entry point binds before calling this function. The accumulator
    is fixed at 300 components, so the model's vectors must have that length.

    Args:
        ta: agent used to fetch each post via ``getPostById(blogName, postId)``.
        blog: blog object providing ``getName()`` and ``getAllPosts()``.

    Returns:
        numpy array of 300 floats (all zeros if no word was found).
    """
    # Look the name up once; it does not change while iterating the posts.
    blogName = blog.getName()
    # Parenthesized single argument prints the same under Python 2 and 3.
    print(blogName)
    v = np.zeros(300)
    for postId in blog.getAllPosts():
        post = ta.getPostById(blogName, postId)
        ### tags ###
        for tag in post.getTags():
            if tag in model:
                v += model[tag]
        ### other terms ###
        for term in VA.extractTermsFromPost(post):
            if term in model:
                v += model[term]
    return v
if __name__ == '__main__':
    # Script flow: build one vector per known blog, save them to disk, then
    # answer interactive "most similar blogs" queries read from stdin.
    ta = TA()
    sys.stderr.write('Done loading TumblrAgent\n')
    # Must be bound at module level: loadVecFromBlog reads `model` as a global.
    model = loadModel(200000)
    sys.stderr.write('Done loading word2vec model\n')

    blognames = ta.getAllBlogs()
    vecs = []
    # The context manager flushes and closes the output file before querying starts.
    with open('w2v_for_blogs.txt', 'w') as w:
        for name in blognames:
            blog = ta.getBlogByName(name)
            v = loadVecFromBlog(ta, blog)
            vecs.append((name, v))
            # One line per blog: components separated (and followed) by a space.
            w.write(''.join(str(element) + " " for element in v))
            w.write("\n")

    topK = 10
    # Names that already have an entry in `vecs`, so a repeated query for a
    # new blog does not add a duplicate candidate.
    knownNames = set(name for name, _ in vecs)
    while True:
        try:
            queryName = raw_input()
        except EOFError:
            # Treat end of input the same as an explicit EXIT.
            break
        if queryName == "EXIT":
            break
        try:
            blog = ta.getBlogByName(queryName)
            ### calculate the vector for query blog (once) ###
            v = loadVecFromBlog(ta, blog)
        except Exception:
            # Best effort: any lookup failure is reported and the loop goes on.
            # Unlike a bare except, Ctrl-C and SystemExit still propagate.
            print('No such blog name')
            continue
        if queryName not in knownNames:
            knownNames.add(queryName)
            vecs.append((queryName, v))

        # Rank every stored blog by similarity to the query, best first.
        dists = [(name, cosineDistance(vec, v)) for name, vec in vecs]
        dists.sort(key=lambda tup: tup[1], reverse=True)
        # Slicing copes with fewer than topK stored blogs.
        for name, score in dists[:topK]:
            print('%s %s' % (name, score))