-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimilarity.py
More file actions
477 lines (364 loc) · 18.2 KB
/
similarity.py
File metadata and controls
477 lines (364 loc) · 18.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
from __future__ import division
from collections import Counter
from gensim.corpora.dictionary import Dictionary
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.models import KeyedVectors, FastText
from preprocessing import Preprocessing
from gensim import matutils, corpora
from scipy.spatial import distance
import parameters as params
import utils as utils
from pyemd import emd
import pandas as pd
from wmd import WMD
import numpy as np
import spacy
import wmd
import os
class Similarity:
    """
    Embedding-based similarity metrics between source and target predicates.

    Supports cosine, soft cosine, Word Mover's Distance (WMD), Relaxed WMD
    and Euclidean distance. Every public method returns a pandas DataFrame
    indexed by 'candidates' (a "source(args),target(args)" key) with a single
    'similarity' column, sorted so the best candidates come first.
    """

    def __init__(self, preprocessing, similarity_matrix, dictionary):
        # preprocessing: Preprocessing instance that segments compound
        #   predicate names (e.g. "advisedby") into word lists.
        # similarity_matrix: gensim SparseTermSimilarityMatrix used by
        #   soft_cosine_similarities.
        # dictionary: gensim Dictionary paired with similarity_matrix.
        self.preprocessing = preprocessing
        self.similarity_matrix = similarity_matrix
        self.dictionary = dictionary

    def __nbow(self, document, dictionary, vocab_len):
        """
        nBoW (normalized bag-of-words) representation of a document.

        Args:
            document(list): tokens of one predicate
            dictionary(Dictionary): gensim dictionary built from the
                (source, target) pair
            vocab_len(int): size of the vocabulary
        Returns:
            a numpy array of word frequencies divided by document length
        """
        d = np.zeros(vocab_len, dtype=np.double)
        nbow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    def __bow(self, source, source_vectors, target, target_vectors, dimension):
        """
        Builds a Bag-of-Words so source and target predicates have the same size.

        Args:
            source(list): tokens of the source predicate
            source_vectors: embedding vectors for the source predicate
            target(list): tokens of the target predicate
            target_vectors: embedding vectors for the target predicate
            dimension(int): size of the word vectors
        Returns:
            source and target embeddings padded to the same length, with a
            zero vector wherever a word is missing from one side
        """
        words = source + target
        # Build each row independently: the previous [[0]*dimension]*len(words)
        # made every row alias one shared inner list.
        new_source = [[0] * dimension for _ in range(len(words))]
        new_target = [[0] * dimension for _ in range(len(words))]
        for i, word in enumerate(words):
            if word in source and word in target:
                source_index = source.index(word)
                target_index = target.index(word)
                new_source[i] = source_vectors[source_index][:]
                new_target[i] = target_vectors[target_index][:]
            elif word in source:
                index = source.index(word)
                new_source[i] = source_vectors[index][:]
            elif word in target:
                index = target.index(word)
                new_target[i] = target_vectors[index][:]
        return new_source, new_target

    def __get_distance_matrix(self, source, target, model, dictionary, vocab_len):
        """
        Compute the pairwise Euclidean distance matrix between the vocabularies
        of the two predicates.

        Args:
            source(list): tokens of the source predicate
            target(list): tokens of the target predicate
            model(KeyedVectors): embedding pre-trained model
            dictionary(Dictionary): gensim dictionary over both documents
            vocab_len(int): size of the vocabulary
        Returns:
            a (vocab_len, vocab_len) numpy array of distances, or
            float('inf') when every entry is zero (emd cannot handle that)
        """
        # Sets for faster look-up.
        docset1 = set(source)
        docset2 = set(target)
        # Compute distance matrix.
        distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
        for i, t1 in dictionary.items():
            for j, t2 in dictionary.items():
                if t1 not in docset1 or t2 not in docset2:
                    continue
                if params.METHOD:
                    # Segment compound tokens and concatenate their word
                    # vectors into one vector per token, padding to the same
                    # size — the convention relaxed_wmd_similarities uses.
                    # NOTE(review): the original computed the segmented
                    # vectors but never used them, leaving _t1/_t2 undefined
                    # (NameError) whenever params.METHOD was enabled.
                    source_segmented = self.preprocessing.pre_process_text(t1)
                    target_segmented = self.preprocessing.pre_process_text(t2)
                    _t1 = np.concatenate([model[w] for w in source_segmented])
                    _t2 = np.concatenate([model[w] for w in target_segmented])
                    if len(_t1) != len(_t2):
                        _t1, _t2 = utils.set_to_same_size(_t1, _t2, params.EMBEDDING_DIMENSION)
                else:
                    _t1, _t2 = model[t1], model[t2]
                # Compute Euclidean distance between word vectors.
                distance_matrix[i, j] = np.sqrt(np.sum((_t1 - _t2) ** 2))
        if np.sum(distance_matrix) == 0.0:
            # `emd` gets stuck if the distance matrix contains only zeros.
            print('The distance matrix is all zeros. Aborting (returning inf).')
            return float('inf')
        return distance_matrix

    def __wmdistance(self, source, target, model):
        """
        Calculate Word Mover's Distance between two predicates.

        Args:
            source(str): source predicate
            target(str): target predicate
            model(KeyedVectors): embedding pre-trained model
        Returns:
            the WMD between the two predicates (float)
        """
        source, target = self.preprocessing.pre_process_text(source), self.preprocessing.pre_process_text(target)
        dictionary = Dictionary(documents=[source, target])
        vocab_len = len(dictionary)
        # Source and target can be of the same type; distance_matrix must
        # have at least two unique tokens.
        if vocab_len == 1:
            return 1.0
        distance_matrix = self.__get_distance_matrix(source, target, model, dictionary, vocab_len)
        if not isinstance(distance_matrix, np.ndarray):
            # All-zero matrix sentinel (inf): propagate instead of feeding
            # a float into emd(), which expects an array.
            return distance_matrix
        # Compute nBOW representation of documents.
        d1 = self.__nbow(source, dictionary, vocab_len)
        d2 = self.__nbow(target, dictionary, vocab_len)
        # Compute WMD.
        return emd(d1, d2, distance_matrix)

    def __spacy_nbow(self, texts, nlp):
        """
        Calculates the SpaCy nBoW model for a list of texts.

        Args:
            texts(list): raw text of each predicate
            nlp(spaCy): the SpaCy embedding model
        Returns:
            a dict mapping each parsed document to a
            (doc, sorted orth ids, frequency array) triple
        """
        documents = {}
        for text in texts:
            text = nlp(text)
            # Keep alphabetic, non-stopword tokens only.
            tokens = [t for t in text if t.is_alpha and not t.is_stop]
            words = Counter(t.text for t in tokens)
            orths = {t.text: t.orth for t in tokens}
            sorted_words = sorted(words)
            documents[text] = (text, [orths[t] for t in sorted_words],
                               np.array([words[t] for t in sorted_words],
                                        dtype=np.float32))
        return documents

    def __create_key(self, source, target):
        """
        Create the key used in the mapping dataframe.

        Args:
            source(list): source predicate as [name, [argument types]]
            target(list): target predicate as [name, [argument types]]
        Returns:
            a string of the form "source(args),target(args)"
        """
        return source[0] + '(' + ','.join(source[1]) + ')' + ',' + target[0] + '(' + ','.join(target[1]) + ')'

    def compute_similarities(self, source, targets, similarity_metric, model='', model_name=''):
        """
        Dispatch to the requested similarity metric.

        Args:
            source: source predicates
            targets: target predicates
            similarity_metric(str): one of 'cosine', 'euclidean',
                'softcosine', 'wmd', 'relax-wmd'
            model: embedding model (required by some metrics)
            model_name(str): selects the SpaCy model for 'relax-wmd'
        Returns:
            a dataframe containing each pair similarity
        Raises:
            ValueError: if the metric (or metric/model combination) is unknown
        """
        if similarity_metric == 'cosine':
            return self.cosine_similarities(source, targets, model)
        if similarity_metric == 'euclidean':
            return self.euclidean_distance(source, targets)
        if similarity_metric == 'softcosine':
            return self.soft_cosine_similarities(source, targets)
        if similarity_metric == 'wmd':
            return self.wmd_similarities(source, targets, model)
        if similarity_metric == 'relax-wmd' and model_name == params.FASTTEXT:
            return self.relaxed_wmd_similarities(source, targets, params.WIKIPEDIA_FASTTEXT_SPACY)
        if similarity_metric == 'relax-wmd' and model_name == params.WORD2VEC:
            return self.relaxed_wmd_similarities(source, targets, params.GOOGLE_WORD2VEC_SPACY)
        # Raising a plain string is a TypeError in Python 3; raise a real
        # exception so callers can catch it.
        raise ValueError("Similarity metric not implemented.")

    def cosine_similarities(self, sources, targets, model):
        """
        Calculate cosine similarity of embedded arrays
        for every possible pair (source, target).

        Args:
            sources(list): all predicates from the source dataset
            targets(list): all predicates from the target dataset
            model(KeyedVectors): embedding pre-trained model
        Returns:
            a pandas dataframe containing every pair (source, target) similarity
        """
        similarity = {}
        for source in sources:
            for target in targets:
                key = self.__create_key(source, target)
                # Predicates have the form [predicate_name, [argument_1, argument_2]].
                # Arities must match.
                if len(source[1]) != len(target[1]):
                    continue
                if '()' in key:
                    key = key.replace('(', '').replace(')', '')
                source_segmented = self.preprocessing.pre_process_text(source[0])
                target_segmented = self.preprocessing.pre_process_text(target[0])
                n_source = [model[word] for word in source_segmented if word in model]
                n_target = [model[word] for word in target_segmented if word in model]
                # Cosine of the mean vectors; corresponds to 1 - distance as
                # defined by scipy.spatial.distance.cosine.
                similarity[key] = np.dot(matutils.unitvec(np.array(n_source).mean(axis=0)), matutils.unitvec(np.array(n_target).mean(axis=0)))
        df = pd.DataFrame.from_dict(similarity, orient="index", columns=['similarity'])
        return df.rename_axis('candidates').sort_values(by=['similarity', 'candidates'], ascending=[False, True])

    def soft_cosine_similarities(self, sources, targets):
        """
        Calculate soft cosine similarity of embedded arrays
        for every possible pair (source, target).

        Args:
            sources(array): all predicates from the source dataset
            targets(array): all predicates from the target dataset
        Returns:
            a pandas dataframe containing every pair (source, target) similarity
        """
        similarity = {}
        for source in sources:
            for target in targets:
                # Predicates have the form [predicate_name, [argument_1, argument_2]].
                # Arities must match.
                if len(source[1]) != len(target[1]):
                    continue
                key = self.__create_key(source, target)
                sent_1 = self.preprocessing.pre_process_text(source[0])
                sent_2 = self.preprocessing.pre_process_text(target[0])
                # Convert the sentences into bag-of-words vectors.
                sent_1 = self.dictionary.doc2bow(sent_1)
                sent_2 = self.dictionary.doc2bow(sent_2)
                # Compute soft cosine similarity.
                similarity[key] = self.similarity_matrix.inner_product(sent_1, sent_2, normalized=(True, True))
        df = pd.DataFrame.from_dict(similarity, orient="index", columns=['similarity'])
        return df.rename_axis('candidates').sort_values(by=['similarity', 'candidates'], ascending=[False, True])

    def wmd_similarities(self, sources, targets, model):
        """
        Calculate similarity of embedded arrays using Word Mover's Distance
        for all possible pairs (source, target). Lower is more similar, so
        results are sorted ascending.

        Args:
            sources(array): all predicates from the source dataset
            targets(array): all predicates from the target dataset
            model(KeyedVectors): embedding pre-trained model
        Returns:
            a pandas dataframe containing every pair (source, target) similarity
        """
        similarity = {}
        for source in sources:
            for target in targets:
                # Predicates have the form [predicate_name, [argument_1, argument_2]].
                # Arities must match.
                if len(source[1]) != len(target[1]):
                    continue
                key = self.__create_key(source, target)
                similarity[key] = self.__wmdistance(source[0], target[0], model)
        df = pd.DataFrame.from_dict(similarity, orient="index", columns=['similarity'])
        return df.rename_axis('candidates').sort_values(by=['similarity', 'candidates'])

    def relaxed_wmd_similarities(self, sources, targets, modelname):
        """
        Calculate similarity of embedded arrays using Relaxed Word Mover's
        Distance for all possible pairs (source, target).

        Args:
            sources(array): all predicates from the source dataset
            targets(array): all predicates from the target dataset
            modelname(str): path of the SpaCy model to be loaded
        Returns:
            a pandas dataframe containing every pair (source, target) similarity
        """
        # Loads model.
        nlp = spacy.blank("en").from_disk(modelname)
        wmd_instance = WMD.SpacySimilarityHook(nlp)
        similarity = {}
        for source in sources:
            for target in targets:
                # Predicates have the form [predicate_name, [argument_1, argument_2]].
                # Arities must match.
                if len(source[1]) != len(target[1]):
                    continue
                key = self.__create_key(source, target)
                if params.METHOD:
                    # NOTE(review): set order is not guaranteed, so the
                    # source/target embeddings may be swapped here — confirm
                    # the metric is symmetric before relying on ordering.
                    words = set([source[0]]).union([target[0]])
                    embeddings = [np.concatenate([nlp.vocab[w].vector for w in self.preprocessing.pre_process_text(word)]) for word in words]
                    if len(embeddings) > 1 and len(embeddings[0]) != len(embeddings[1]):
                        embeddings[0], embeddings[1] = utils.set_to_same_size(embeddings[0], embeddings[1], params.EMBEDDING_DIMENSION)
                    similarity[key] = wmd_instance.compute_similarity(nlp(source[0]), nlp(target[0]), evec=np.array(embeddings, dtype=np.float32), single_vector=True)
                else:
                    # Convert the sentences into SpaCy format.
                    sent_1 = nlp(' '.join(self.preprocessing.pre_process_text(source[0])))
                    sent_2 = nlp(' '.join(self.preprocessing.pre_process_text(target[0])))
                    similarity[key] = wmd_instance.compute_similarity(sent_1, sent_2)
        df = pd.DataFrame.from_dict(similarity, orient="index", columns=['similarity'])
        return df.rename_axis('candidates').sort_values(by=['similarity', 'candidates'])

    def euclidean_distance(self, sources, targets):
        """
        Calculate similarity of embedded arrays using Euclidean Distance
        for all possible pairs (source, target). Lower is more similar.

        Args:
            sources(dict): predicate name -> (embeddings, argument types)
            targets(dict): predicate name -> (embeddings, argument types)
        Returns:
            a pandas dataframe containing every pair (source, target) similarity
        """
        similarity = {}
        for s in sources:
            for t in targets:
                # Predicates have the form [predicate_name, [argument_1, argument_2]].
                # Arities must match.
                if len(sources[s][1]) != len(targets[t][1]):
                    continue
                key = self.__create_key([s, sources[s][1]], [t, targets[t][1]])
                if '()' in key:
                    key = key.replace('(', '').replace(')', '')
                source_segmented = self.preprocessing.pre_process_text(s)
                target_segmented = self.preprocessing.pre_process_text(t)
                n_source, n_target = sources[s][0], targets[t][0]
                if len(source_segmented) != len(target_segmented):
                    # Pad to a common vocabulary when segment counts differ.
                    n_source, n_target = self.__bow(source_segmented, sources[s][0], target_segmented, targets[t][0], params.EMBEDDING_DIMENSION)
                n_source, n_target = np.concatenate(n_source), np.concatenate(n_target)
                similarity[key] = distance.euclidean(n_source, n_target)
        df = pd.DataFrame.from_dict(similarity, orient="index", columns=['similarity'])
        return df.rename_axis('candidates').sort_values(by=['similarity', 'candidates'])
# from ekphrasis.classes.segmenter import Segmenter
# from pyemd import emd
# import gensim.downloader as api
# # Segmenter using the word statistics from Wikipedia
# seg = Segmenter(corpus="english")
# fraseA = 'Obama speaks to the media in Illinois'
# fraseB = 'The president greets the press in Chicago'
# fraseC = 'Having a tough time finding an orange juice press machine?'
# # sent_1 = [['Dravid is a cricket player and a opening batsman', ['A']]]
# # sent_2 = [['Leo is a cricket player too He is a batsman,baller and keeper', ['B']]]
# #model = KeyedVectors.load_word2vec_format('resources/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
# model = api.load("glove-wiki-gigaword-50")
# preprocessing = Preprocessing(seg)
# similarity_matrix, dictionary = '', ''
# similarity_matrix, dictionary = utils.get_softcosine_matrix([fraseA + '(A,B)'], [fraseB + '(A,B)',fraseC + '(A,B)'], model, preprocessing)
# similarity = Similarity(preprocessing, similarity_matrix, dictionary)
# #sim = Similarity(preprocessing, similarity_matrix)
# a = similarity.soft_cosine_similarities([[fraseA, ['', '']]], [[fraseB, ['', '']], [fraseC, ['', '']]])
# print(a)
# from nltk.corpus import stopwords
# fraseA = 'Obama speaks to the media in Illinois'
# fraseB = 'The president greets the press in Chicago'
# fraseC = 'Having a tough time finding an orange juice press machine?'
# # Remove stopwords.
# stop_words = stopwords.words('english')
# fraseA = [w for w in fraseA if w not in stop_words]
# fraseB = [w for w in fraseB if w not in stop_words]
# fraseC = [w for w in fraseC if w not in stop_words]
# # Prepare a dictionary and a corpus.
# from gensim import corpora
# documents = [fraseA, fraseB, fraseC]
# dictionary = corpora.Dictionary(documents)
# # Convert the sentences into bag-of-words vectors.
# fraseA = dictionary.doc2bow(fraseA)
# fraseB = dictionary.doc2bow(fraseB)
# fraseC = dictionary.doc2bow(fraseC)
# similarity_index = WordEmbeddingSimilarityIndex(model)
# similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)
# similarity = similarity_matrix.inner_product(fraseA, fraseB, normalized=(True,True))
# print('similarity = %.4f' % similarity)
# similarity = similarity_matrix.inner_product(fraseA, fraseC, normalized=(True,True))
# print('similarity = %.4f' % similarity)
#print(sim.wmd_similarities([['Obama speaks to the media in Illinois', 'person', 'person']], [['The president greets the press in Chicago', 'person', 'person']], model))
#print(sim.wmd_similarities([[''.join(fraseA), 'person', 'person']], [[''.join(fraseB), 'person', 'person']], model))
#print(model.wmdistance(''.join(fraseA), ''.join(fraseB)))