-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhungarian.py
More file actions
145 lines (110 loc) · 4.59 KB
/
hungarian.py
File metadata and controls
145 lines (110 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from scipy.optimize import linear_sum_assignment
import parameters as params
import utils as utils
import pandas as pd
import re
class Hungarian:
def __init__(self, similarities):
self.similarities = similarities
def __get_target_indexes(self,arity=2):
"""
Maps the index of targets predicates accordingly to the matrix columns
Returns:
a dictionary containing the index of a given key
a dicitonary containing the target of a given index
"""
columns = []
column_indexes = {}
index_to_target = {}
indexes = self.similarities.index.tolist()
for index in indexes:
index = re.split(r',\s*(?![^()]*\))', index)
source, target = index[0].rstrip(), index[1].rstrip()
# Check if predicates are of the same arity
if(not len(utils.get_all_literals([source])) == arity):
continue
if target not in columns:
columns.append(target)
for column in columns:
column_indexes[column] = len(column_indexes)
for key in column_indexes:
index_to_target[column_indexes[key]] = key
return column_indexes, index_to_target
def __build_cost_matrix(self, arity=2):
"""
Returns cost matrix of the bipartite graph given the dataframe of similarities.
Vertex i of the first partite set (a source predicate) and vertex j of the second set (a target predicate).
Args:
similarities(DataFrame): similarities between (source, target) pairs of predicates
Returns:
cost matrix of the bipartite graph
"""
cost_matrix = []
indexes = self.similarities.index.tolist()
for index in indexes:
index = re.split(r',\s*(?![^()]*\))', index)
source, target = index[0].rstrip(), index[1].rstrip()
# Check if predicates are of the same arity
if(not len(utils.get_all_literals([source])) == arity):
continue
if source not in self.candidates:
self.candidates[source] = [target]
else:
self.candidates[source].append(target)
similarities_dictionary = self.similarities.to_dict()['similarity']
for i in self.candidates:
row = []
for j in self.candidates[i]:
# Check if predicates are of the same arity
if(not (len(utils.get_all_literals([i])) == arity)):
continue
row = [0]*len(self.target_ind)
row[self.target_ind[j]] = similarities_dictionary[i + ',' + j]
self.source_ind[len(cost_matrix)] = i
cost_matrix.append(row)
return cost_matrix
def __get_targets(self, col_ind):
"""
Returns the corresponding mapping to a source predicate given the index assigment
Args:
col_ind(list): list of columns
Returns:
dictionary of mappings provided by the Hungarian algorithm
"""
current_mappings = {}
for i in range(len(col_ind)):
source_predicate = self.source_ind[i]
current_mappings[source_predicate] = [self.ind_to_target[col_ind[i]]]
return current_mappings
def assigment(self):
"""
Returns columns indexes after solving the linear sum assignment problem using the Hungarian algorithm.
Each row is assignment to at most one column, and each column to at most one row.
Returns:
An array of row indices and one of corresponding column indices giving the optimal assignment.
"""
mappings = {}
# Assuming we only have predicates of arity one and two
for i in range(1,3):
self.candidates = {}
self.source_ind = {}
self.target_ind, self.ind_to_target = self.__get_target_indexes(arity=i)
cost_matrix = self.__build_cost_matrix(arity=i)
if(not cost_matrix):
continue
row_ind, col_ind = linear_sum_assignment(cost_matrix)
mappings.update(self.__get_targets(col_ind))
return mappings
# similarities = pd.DataFrame()
# for predicate in ['athleteledsportsteam', 'athleteplaysforteam','athleteplaysinleague', 'athleteplayssport', 'teamalsoknownas', 'teamplaysagainstteam','teamplaysinleague']:
# current = pd.read_csv(params.ROOT_PATH + 'resources/9_nell_sports_nell_finances/rwmd-similarities/{}_similarities.csv'.format(predicate)).set_index('candidates')
# similarities = pd.concat([similarities, current])
# import time
# start = time.time()
# hug = Hungarian(similarities)
# #print(similarities)
# print(hug.assigment())
# print(time.time()-start)
# cost = [[4.9586896896362305, 5.472095489501953],[4.785916328430176,5.348875999450684],[4.654685974121094,5.441676139831543],[4.956645488739014,5.603146553039551],[4.749782085418701,5.210647106170654],[4.361474990844727,5.199010848999023],[4.411990165710449,5.244400978088379],[4.829348564147949,5.565415382385254],[4.5297322273254395,5.28743839263916]]
# row_ind, col_ind = linear_sum_assignment(cost_matrix)
# print(col_ind)