test.py
# import tensorflow as tf
import numpy as np
import nltk
import pickle
import pandas as pd
import csv
# nltk.download()  # uncomment on first run to fetch the stopwords/wordnet corpora
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# NLP helpers (the lemmatizer and stemmer are instantiated here but not used below)
wordLemmatizer = WordNetLemmatizer()
porterStemmer = PorterStemmer()
stopDict = set(stopwords.words('english'))
# Read the raw SQL dump; each line holds one INSERT row.
target = open('science.sql', 'r')
# target = open('source.txt', 'r')

# Translation table used to clean each line.
remap = {
    ord('\''): ' ',
    ord('\n'): None,
    ord('\r'): None,
    ord('\\'): ' ',
    # ord(','): ' '
}
textPast = []
for x in target:
    x = x.replace('\\n', '')
    # inTarget.append(x.translate(remap))
    # Keep only the text columns, dropping the leading ids and trailing fields.
    textPast.append("".join(x.translate(remap).split(',')[5:-2]))
target.close()

# De-duplicate while preserving original order, then keep the first 100 documents.
text = list(set(textPast))
text.sort(key=textPast.index)
text = text[:100]
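# An equivalent order-preserving de-dup (a sketch, not used here): dict keys keep
# insertion order in Python 3.7+, avoiding the O(n^2) list.index lookups above:
#   text = list(dict.fromkeys(textPast))[:100]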
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer already applies tf-idf weighting, so no separate
# TfidfTransformer is needed; chaining one on top would re-apply IDF.
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
tfidf = vectorizer.fit_transform(text)
word = vectorizer.get_feature_names_out()  # vocabulary (get_feature_names() was removed in sklearn 1.2)
weight = tfidf.toarray()  # dense document-term matrix; each row is one document's tf-idf vector
def vectorDistance(v1, v2):
    # Cosine similarity; guard against zero vectors to avoid division by zero.
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / denom if denom else 0.0
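# Note: an equivalent vectorized alternative (a sketch, not used below) is
# sklearn's pairwise helper, which computes the whole similarity matrix at once
# and works directly on the sparse tfidf matrix:
#   from sklearn.metrics.pairwise import cosine_similarity
#   sim = cosine_similarity(tfidf)  # sim[i, j] == vectorDistance(weight[i], weight[j])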
final = []
# sess = tf.Session()
for i in range(len(weight)):
    final.append([])
    for j in range(len(weight)):
        # final[i].append((j, sess.run(tf.reduce_sum(tf.multiply(weight[i], weight[j])))))
        final[i].append((j, vectorDistance(weight[i], weight[j])))
    # Most similar first; entry 0 is the document paired with itself (similarity 1.0).
    final[i].sort(key=lambda x: x[1], reverse=True)
# myData = pd.DataFrame(final)
# myData.to_csv("pandasData")

# Write one document per CSV row; each string is wrapped in a list so the
# writer emits it as a single field instead of one character per column.
with open('saveText.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    for row in text:
        writer.writerow([row])

# saveFile = open('saveFinal', 'wb')
# pickle.dump(final, saveFile)
# saveFile.close()
# loadFile = open('saveFinal', 'rb')
# saveTest = pickle.load(loadFile)
# For each document, print its five nearest neighbours, skipping index 0
# (every document's top match is itself).
for i in range(len(final)):
    print("Document", i)
    for j in range(5):
        print(final[i][j + 1])
# print(final)