-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathInvertedIndex.py
More file actions
83 lines (74 loc) · 2.37 KB
/
InvertedIndex.py
File metadata and controls
83 lines (74 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
import utils
import math
def create_index():
    """Build a positional inverted index over the files in ``utils.rpath``.

    (Original note: the Reuters corpus is located in the parent directory by
    default.)

    Returns:
        A ``(index, doc_size)`` tuple.

        ``index`` maps each word to ``{docID: [positions]}``, where the
        positions are the 0-based offsets of the word among the items
        yielded by ``utils.process_doc_content`` for that document.

        ``doc_size`` is a list indexed directly by docID that holds the
        number of items read from each document; slots whose ID has no
        file stay 0.
    """
    index = {}
    # One slot per possible doc ID, 0 .. 2*D inclusive (original note: 21576).
    doc_size = [0] * (utils.D * 2 + 1)
    print(len(doc_size))
    # os.listdir returns names in arbitrary order; sorting keeps the index's
    # insertion order (and any word list derived from it) reproducible.
    for filename in sorted(os.listdir(utils.rpath)):
        # os.path.join also works when utils.rpath lacks a trailing separator.
        content = utils.process_doc_content(os.path.join(utils.rpath, filename))
        doc_id = utils.get_doc_ID(filename)
        word_count = 0  # stays 0 when the document yields no words
        for word_count, word in enumerate(content, start=1):
            # Position of the word inside the document is word_count - 1.
            postings = index.setdefault(word, {})
            postings.setdefault(doc_id, []).append(word_count - 1)
        doc_size[doc_id] = word_count
    return index, doc_size
def get_wordlist(index):
    """Build the vocabulary list from an inverted index.

    Args:
        index: Mapping whose keys are the indexed words.

    Returns:
        A new list containing the keys of ``index`` in the mapping's
        iteration order.
    """
    return list(index)
def create_VSM(index, doc_size, wordlist, total_docs=None):
    """Build a run-length-compressed TF-IDF vector space model.

    For document ``d`` and each word present in it:

        TF     = len(index[word][str(d)]) / doc_size[d]
        IDF    = log2(total_docs / len(index[word]))
        TF-IDF = TF * IDF

    (Original note: D = 10788.)

    Each document's vector follows ``wordlist`` order. A run of words that
    are absent from the document is stored as one decimal integer string
    (the run length) placed before the next present word; a run at the end
    of the vector is not stored. Weights are stored as strings with three
    decimals.

    Args:
        index: Mapping ``word -> {doc key: [positions]}``; documents are
            looked up by ``str(d)``.
        doc_size: Sequence indexed by integer doc ID giving each document's
            length; IDs whose entry is 0 are skipped.
        wordlist: Words defining the vector dimensions, in order. Every
            word must be a key of ``index``.
        total_docs: Corpus size used for IDF; doc IDs ``1 .. 2*total_docs``
            are scanned. Defaults to ``utils.D``.

    Returns:
        Dict mapping integer doc ID to its compressed vector (list of str).
    """
    if total_docs is None:
        total_docs = utils.D
    # A word's IDF does not depend on the document, so compute it only once.
    idf_by_word = {}
    VSM = {}
    for d in range(1, total_docs * 2 + 1):  # original note: 21576
        if d % 1000 == 0:
            print('Processing:' + str(d))
        # Ignore doc IDs that have no file in the corpus folder.
        size = doc_size[d]
        if size == 0:
            continue
        doc_key = str(d)
        tf_idf_list = []
        gap = 0  # number of consecutive absent words seen so far
        for word in wordlist:
            postings = index[word]
            # Simple index compression: count absent words instead of
            # storing a zero for each of them.
            if doc_key not in postings:
                gap += 1
                continue
            if gap > 0:
                tf_idf_list.append(str(gap))
                gap = 0
            idf = idf_by_word.get(word)
            if idf is None:
                idf = math.log2(total_docs / len(postings))
                idf_by_word[word] = idf
            tf = len(postings[doc_key]) / size
            tf_idf_list.append('%.3f' % (tf * idf))  # keep three decimals
        VSM[d] = tf_idf_list
    return VSM
def VSM_sum(VSM, total_docs=None):
    """Sum each document's TF-IDF weights (preparation for brute-force Top K).

    A compressed vector mixes two kinds of tokens: weights formatted with
    three decimals (e.g. ``'0.123'``) and run-length gap counts written as
    plain integers (e.g. ``'7'``). Only the weights are summed. Tokens are
    classified by the presence of a decimal point rather than by magnitude,
    so weights greater than or equal to 1 are included in the sum.

    Args:
        VSM: Mapping from doc ID *string* to its compressed vector.
        total_docs: Corpus size; doc IDs ``1 .. 2*total_docs`` are scanned.
            Defaults to ``utils.D``.

    Returns:
        Dict mapping integer doc ID to the weight sum formatted as a string
        with three decimals. Doc IDs missing from ``VSM`` are omitted.
    """
    if total_docs is None:
        total_docs = utils.D
    sum_VSM = {}
    for d in range(1, total_docs * 2 + 1):  # original note: 21576
        if d % 1000 == 0:
            print('Processing' + str(d))
        doc_key = str(d)
        if doc_key not in VSM:
            continue
        total = 0.0
        for token in VSM[doc_key]:
            text = str(token)
            # Gap counts are integers without a decimal point; skip them.
            if '.' in text:
                total += float(text)
        sum_VSM[d] = '%.3f' % total
    return sum_VSM