-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathparseCollection.py
More file actions
197 lines (167 loc) · 7.14 KB
/
parseCollection.py
File metadata and controls
197 lines (167 loc) · 7.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
##############################################################
# Name: ParseCollection
# Purpose: This module is designed to parse the CACM collection,
# and create tools to build frequency and inverse frequency indexes
# Author: Damien Peltier & Corentin Seitre
# Created: 12/15 - 01/16
##############################################################
import itertools
import json
import os.path
from nltk.tokenize import RegexpTokenizer
from processTfidf import build_CACMNormtfidf, build_CACMtfidf
# Shared module-level tokenizer: r'\w+' keeps runs of word characters,
# which effectively strips punctuation during tokenization.
tokenizer = RegexpTokenizer(r'\w+')
def replacePunct(s):
    """
    Tokenize a string into lowercase words, discarding punctuation.

    :param s: (str) string to parse
    :return: (list) lowercased word tokens extracted from s
    """
    lowered = []
    for token in tokenizer.tokenize(s):
        lowered.append(token.lower())
    return lowered
def count_words(common_words, string):
    """
    Count the meaningful (non-stopword) words in a tokenized text.

    :param common_words: (iterable) stopwords to ignore
    :param string: (iterable of str) tokenized words to count
    :return: (dict) word -> number of occurrences
    """
    # Build a set once: membership tests drop from O(len(common_words))
    # per word to O(1).
    stopwords = set(common_words)
    # Renamed from `dict`, which shadowed the builtin.
    counts = {}
    for word in string:
        if word not in stopwords:
            counts[word] = counts.get(word, 0) + 1
    return counts
def buildFrequencies(files, common_words):
    """
    Count the occurrences of the words in the collection's documents.

    :param files: CACM collection, as a list of [doc_id, section, ...] items
    :param common_words: stop words (not to be taken into account)
    :return frequencies: (dict) doc_id: {word: frequency}
    """
    # Keep only title (T), abstract (W) and keyword (K) sections of each doc.
    parts = {item[0].rstrip(): list(itertools.chain(*([replacePunct(line[1:])
                                                       for line in item[1:]
                                                       if line[0] in ["T", "W", "K"]])))
             for item in files}
    frequencies = {key: count_words(common_words, parts[key])
                   for key in parts}
    # BUG FIX: this runs before loadCACMst (previously the only place that
    # created the index directory), so make sure the directory exists here
    # too before writing the cache file.
    if not os.path.exists("../CACMindexes/"):
        os.mkdir("../CACMindexes/")
    with open("../CACMindexes/freq.json", "w") as export:
        export.write(json.dumps(frequencies, indent=4))
    return frequencies
def buildCACMIndex():
    """
    Build the term frequency index for the CACM collection.

    Reads the raw collection and the stopword list from disk, splits the
    collection into documents on the ".I " markers, then delegates counting
    to buildFrequencies.

    :return: (dict) doc_id: {word: frequency}
    """
    with open("../CACM/cacm.all", "r") as corpus_file:
        raw_collection = corpus_file.read()
    with open("../CACM/common_words", "r") as stop_file:
        stopwords = replacePunct(stop_file.read())
    documents = []
    for chunk in raw_collection.split(".I "):
        documents.append(chunk.split("\n."))
    return buildFrequencies(documents, stopwords)
def buildCACMReversedIndex(frequencies):
    """
    Build the reversed (inverted) frequencies index for the CACM collection.

    :param frequencies: (dict) {doc_id: {word: frequency}}
    :return invertFreq: (dict) {word: [(doc_id, frequency), ...]}
    """
    invertFreq = {}
    for doc_id in frequencies:
        for word in frequencies[doc_id]:
            invertFreq.setdefault(word, []).append((doc_id, frequencies[doc_id][word]))
    # Make sure the index directory exists before writing the cache.
    if not os.path.exists("../CACMindexes/"):
        os.mkdir("../CACMindexes/")
    # BUG FIX: this used to write "revertFreq.json" while loadCACMst looks
    # for "revert_freq.json", so the cached index was never found and the
    # inverted index was rebuilt on every run. Write the name the loader
    # actually reads.
    with open("../CACMindexes/revert_freq.json", "w") as export:
        export.write(json.dumps(invertFreq, indent=4))
    return invertFreq
def loadCACMfreq():
    """
    Load the cached CACM frequency index, building it when absent.

    :return: (dict) doc_id: {word: frequency}
    """
    cache_path = "../CACMindexes/freq.json"
    if os.path.isfile(cache_path):
        with open(cache_path, "r") as cached:
            return json.loads(cached.read())
    return buildCACMIndex()
def loadCACMst(frequencies):
    """
    Load the cached standard reversed index, building it when absent.

    Also ensures the index directory exists.

    :param frequencies: (dict) {doc_id: {word: frequency}}
    :return: (dict) word: list of per-document occurrences
    """
    index_dir = "../CACMindexes/"
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    cache_path = index_dir + "revert_freq.json"
    if os.path.isfile(cache_path):
        with open(cache_path, "r") as cached:
            return json.loads(cached.read())
    return buildCACMReversedIndex(frequencies)
def loadCACMtfidf(freq, ifreq):
    """
    Load the cached tf-idf reversed index, building it when absent.

    :param freq: (dict) frequency index {doc_id: {word: frequency}}
    :param ifreq: (dict) standard reversed index
    :return: (dict) tf-idf weighted reversed index
    """
    cache_path = "../CACMindexes/revertFreqTfidf.json"
    if os.path.isfile(cache_path):
        with open(cache_path, "r") as cached:
            return json.loads(cached.read())
    return build_CACMtfidf(freq, ifreq)
def loadCACMtfidfnorm(freq, tfidf):
    """
    Load the cached normalized tf-idf reversed index, building it when absent.

    :param freq: (dict) frequency index {doc_id: {word: frequency}}
    :param tfidf: (dict) tf-idf reversed index
    :return: (dict) normalized tf-idf reversed index
    """
    cache_path = "../CACMindexes/revertFreqNormTfidf.json"
    if os.path.isfile(cache_path):
        with open(cache_path, "r") as cached:
            return json.loads(cached.read())
    return build_CACMNormtfidf(freq, tfidf)
def loadCACMJsons(reverseType):
    """
    Load (building if necessary) the CACM indexes.

    :param reverseType: (str) "standard" for raw occurrences,
                        "tfidf" for occurrences*idf,
                        "tfidfnorm" for occurrences*idf/nbwords
    :return frequencies: (dict) doc_id: {word: occurrences}
    :return revert_freq: (dict) inverted index of the requested flavour
                         ({} for an unknown reverseType)
    """
    frequencies = loadCACMfreq()
    if reverseType == "standard":
        return frequencies, loadCACMst(frequencies)
    if reverseType == "tfidf":
        standard = loadCACMst(frequencies)
        return frequencies, loadCACMtfidf(frequencies, standard)
    if reverseType == "tfidfnorm":
        standard = loadCACMst(frequencies)
        tfidf = loadCACMtfidf(frequencies, standard)
        return frequencies, loadCACMtfidfnorm(frequencies, tfidf)
    return frequencies, {}
def loadWIKIJsons(words, reverseType):
    """
    Load the pre-built WIKI indexes restricted to the requested words.

    We consider here that wiki indexes are already built; they are never
    (re)built on the fly.

    :param words: (iterable of str) requested words; only the index shards
                  named after their 2-letter prefixes are loaded
    :param reverseType: (str) "standard", "tfidf" or "tfidfnorm"
    :return doc_lengths: (dict) document length index ({} when missing)
    :return revertFreq: (dict) inverted index covering the requested words
    """
    if not os.path.exists("../WIKIindexes/"):
        os.mkdir("../WIKIindexes/")
    try:
        with open("../WIKIindexes/finalWiki/countWords.json", "r") as counts:
            doc_lengths = json.loads(counts.read())
    except IOError:
        doc_lengths = {}
        # BUG FIX: was a Python 2 print statement (syntax error on Python 3)
        # and inconsistent with the print() calls used below.
        print("missing document lengths index")
    print('Loading indexes')
    revertFreq = {}
    indexes = {"standard": "finalWiki/",
               "tfidf": "finalWikiTfidf/",
               "tfidfnorm": "finalWikiTfidfNorm/"}
    for word in words:
        try:
            with open("../WIKIindexes/" + indexes[reverseType] + word[0:2] + ".json", "r") as revF:
                part = json.loads(revF.read())
                revertFreq.update(part)
        except IOError:
            # BUG FIX: same Python 2 print statement conversion as above.
            print("missing indexes", "../WIKIindexes/" + indexes[reverseType] + word[0:2] + ".json")
    print('Indexes loaded')
    return doc_lengths, revertFreq
def loadJsons(collection, reverseType, words):
    """
    Load the necessary indexes for the CACM or WIKI collection.

    :param collection: (str) the used collection = "CACM" or "WIKI"
    :param reverseType: (str) "standard" for occurrences,
                        "tfidf" for occurrences*idf,
                        "tfidfnorm" for occurrences*idf/nbwords
    :param words: requested words (the WIKI loader only opens the index
                  shards covering these words)
    :return: (dict, dict) frequencies and inverse frequencies requested,
             or None for an unknown collection
    """
    loaders = {"CACM": lambda: loadCACMJsons(reverseType),
               "WIKI": lambda: loadWIKIJsons(words, reverseType)}
    loader = loaders.get(collection)
    if loader is not None:
        return loader()
if __name__ == "__main__":
    # Pre-build every CACM index variant so later queries hit the cache.
    for mode in ("standard", "tfidf", "tfidfnorm"):
        freq, ifreq = loadJsons("CACM", mode, [])