gml/gml.py at master · gml-explore/gml · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
import heapq
import pickle
import sys
import time
import numpy as np
from numbskull_extend import numbskull
import logging
from sklearn import metrics
import gml_utils
from evidential_support import EvidentialSupport
from evidence_select import EvidenceSelect
from approximate_probability_estimation import ApproximateProbabilityEstimation
from construct_subgraph import ConstructSubgraph
from configparser import ConfigParser
from concurrent.futures import ProcessPoolExecutor
from gml_utils import Logger

class GML:
    '''
     GML main process: evidentail support->select_topm->ApproximateProbabilityEstimation->select->topk->inference->label->score
    '''
    def __init__(self,variables, features, learning_method,top_m, top_k, top_n,update_proportion, balance,optimization_threshold,learning_epoches,inference_epoches ,nprocess,out):
        #check data
        variables_keys= ['var_id','is_easy','is_evidence','true_label','label','feature_set']
        features_keys = ['feature_id','feature_type','parameterize','feature_name','weight']
        learning_methods = ['sgd', 'bgd']
        if learning_method not in learning_methods:
            raise ValueError('learning_methods has no this method: '+learning_method)
        #check variables
        for variable in variables:
            for attribute in variables_keys:
                if attribute not in variable:
                    raise ValueError('variables has no key: '+attribute)
        #check features
        for feature in features:
            for attribute in features_keys:
                if attribute not in feature:
                    raise ValueError('features has no key: '+attribute)
        self.variables = variables
        self.features = features
        self.learning_method = learning_method # now support sgd and bgd
        self.labeled_variables_set = set()
        self.top_m = top_m
        self.top_k = top_k
        self.top_n = top_n
        self.optimization_threshold = optimization_threshold #entropy optimization threshold: less than 0 means no need to optimize
        self.update_proportion = update_proportion #Evidence support update ratio: it is necessary to recalculate the evidence support after inferring a certain percentage of hidden variables
        self.support = EvidentialSupport(variables, features)
        self.select = EvidenceSelect(variables, features)
        self.approximate = ApproximateProbabilityEstimation(variables,features)
        self.subgraph = ConstructSubgraph(variables, features, balance)
        self.learing_epoches = learning_epoches   #Factor graph parameter learning rounds
        self.inference_epoches = inference_epoches #Factor graph inference rounds
        self.nprocess = nprocess #Number of multiple processes
        self.out = out  #Do you need to record the results
        self.evidence_interval_count = 10  #Number of evidence intervals divided by featureValue
        self.all_feature_set = set([x for x in range(0,len(features))])   #Feature ID collection
        self.observed_variables_set, self.poential_variables_set = gml_utils.separate_variables(variables)   #Dividing evidence variables and latent variables
        #Initialize the necessary properties
        self.evidence_interval = gml_utils.init_evidence_interval(self.evidence_interval_count)    #Divide the evidence interval
        gml_utils.init_bound(variables,features)   #Initial value of initialization parameter
        gml_utils.init_evidence(features,self.evidence_interval,self.observed_variables_set)
        #save results
        self.now = str(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))
        self.result = self.now+'-result.txt'
        if self.out:
            with open(self.result, 'w') as f:
                f.write('var_id'+' '+'inferenced_probability'+' '+'inferenced_label'+' '+'ture_label'+'\n')
        #logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - [%(levelname)s]: %(message)s'
        )
        logging.info("GML inference begin")

    @staticmethod
    def initial(configFile,variables,features):
        '''
        load config from file
        @param configFile:
        @param variables:
        @param features:
        @return:
        '''
        config = ConfigParser()
        #Set default parameters
        config.read_dict({'para':{'learning_method':'sgd',
                                  'learning_epoches':'1000',
                                  'inference_epoches':'1000',
                                  'top_m':'2000',
                                  'top_k':'10',
                                  'top_n':'1',
                                  'n_process':'1',
                                  'update_proportion':'0.01',
                                  'balance':'False',
                                  'optimization_threshold':'-1',
                                  'out':'True'}
                          })
        config.read(configFile, encoding='UTF-8')
        learning_method = config['para']['learning_method']
        learning_epoches = int(config['para']['learning_epoches'])
        inference_epoches = int(config['para']['inference_epoches'])
        top_m = int(config['para']['top_m'])
        top_k = int(config['para']['top_k'])
        top_n = int(config['para']['top_n'])
        n_process = int(config['para']['n_process'])
        update_proportion = float(config['para']['update_proportion'])
        balance = config['para'].getboolean('balance')
        optimization_threshold = float(config['para']['optimization_threshold'])
        out = config['para'].getboolean('out')
        return GML(variables, features, learning_method, top_m, top_k, top_n,update_proportion,balance,optimization_threshold,learning_epoches,inference_epoches,n_process,out)

    def evidential_support(self, variable_set,update_feature_set):
        '''
        calculate evidential_support
        @param variable_set:
        @param update_feature_set:
        @return:
        '''
        self.support.evidential_support(variable_set,update_feature_set)

    def approximate_probability_estimation(self, variable_set):
        '''
        estimation approximate_probability
        @param variable_set:
        @return:
        '''
        self.approximate.approximate_probability_estimation(variable_set)


    def select_top_m_by_es(self, m):
        '''
        select tom m largest ES poential variables
        @param m:
        @return: m_id_list
        '''
        # If the current number of hidden variables is less than m, directly return to the hidden variable list
        if m > len(self.poential_variables_set):
            return list(self.poential_variables_set)
        poential_var_list = list()
        m_id_list = list()
        for var_id in self.poential_variables_set:
            poential_var_list.append([var_id, self.variables[var_id]['evidential_support']])
        # Select the evidence to support the top m largest
        topm_var = heapq.nlargest(m, poential_var_list, key=lambda s: s[1])

        # edit at 2021-4-22
        entropy_list = []

        for elem in topm_var:
            m_id_list.append(elem[0])
            entropy_list.append(self.variables[elem[0]]['entropy'])

        m_id_list = np.array(m_id_list)[np.argsort(np.array(entropy_list))]
        logging.info('select m finished')
        return list(m_id_list)

    def select_top_k_by_entropy(self, var_id_list, k):
        '''
        # 按照熵已经排过序
        select top k  smallest entropy poential variables
        @param var_id_list:
        @param k:
        @return:
        '''
        # If the number of hidden variables is less than k, return the hidden variable list directly
        if len(var_id_list) < k:
            return var_id_list
        k_id_list = list(var_id_list[0:k])
        #logging.info('select k finished')

        return k_id_list
    def evidence_select(self, var_id):
        '''
        Determine the subgraph structure
        @param var_id:
        @return:
        '''
        connected_var_set, connected_edge_set, connected_feature_set = self.select.evidence_select(var_id)
        return connected_var_set, connected_edge_set, connected_feature_set

    def construct_subgraph(self, var_id):
        '''
        Construct subgraphs according to numbskull requirements
        @param var_id:
        @return:
        '''
        evidences = self.evidence_select(var_id)
        weight, variable, factor, fmap, domain_mask, edges_num, var_map,alpha_bound,tau_bound,weight_map_feature,sample_list,wmap,wfactor = self.subgraph.construct_subgraph(evidences,var_id)
        return weight, variable, factor, fmap, domain_mask, edges_num, var_map,alpha_bound,tau_bound,weight_map_feature,sample_list,wmap,wfactor


    def inference_subgraph(self, var_id):
        '''
        Subgraph parameter learning and reasoning
        @param var_id:
        @return:
        '''
        if not type(var_id) == int:
            raise ValueError('var_id should be int' )
        ns_learing = numbskull.NumbSkull(
            n_inference_epoch=self.learing_epoches,
            n_learning_epoch=self.inference_epoches,
            stepsize=0.01,
            decay=0.95,
            reg_param=1e-6,
            regularization=2,
            truncation=10,
            quiet=(not False),
            verbose=False,
            learn_non_evidence=True,
            sample_evidence=False,
            burn_in=10,
            nthreads=3,
            learning_method = self.learning_method
        )
        weight, variable, factor, fmap, domain_mask, edges_num, var_map,alpha_bound,tau_bound,weight_map_feature,sample_list,wmap,wfactor = self.construct_subgraph(var_id)
        subgraph = weight, variable, factor, fmap, domain_mask, edges_num,alpha_bound,tau_bound,sample_list,wmap,wfactor
        ns_learing.loadFactorGraph(*subgraph)
        # parameter learning
        ns_learing.learning()
        logging.info("subgraph learning finished")
        # Control the range of the learned parameters and set the isfixed attribute of weight to true
        for index, w in enumerate(weight):
            feature_id = weight_map_feature[index]
            w["isFixed"] = True
            if self.features[feature_id]['parameterize'] == 1:
                theta = self.variables[var_id]['feature_set'][feature_id][0]
                x = self.variables[var_id]['feature_set'][feature_id][1]
                a = ns_learing.factorGraphs[0].weight[index]['a']
                b = ns_learing.factorGraphs[0].weight[index]['b']
                w['initialValue'] = theta * a* (x - b)
            else:
                w['initialValue'] = ns_learing.factorGraphs[0].poential_weight[index]
        # reasoning
        ns_inference = numbskull.NumbSkull(
            n_inference_epoch=self.learing_epoches,
            n_learning_epoch= self.inference_epoches,
            stepsize=0.001,
            decay=0.95,
            reg_param=1e-6,
            regularization=2,
            truncation=10,
            quiet=(not False),
            verbose=False,
            learn_non_evidence=False,
            sample_evidence=False,
            burn_in=10,
            nthreads=2,
            learning_method=self.learning_method
        )
        ns_inference.loadFactorGraph(*subgraph)
        ns_inference.inference()
        logging.info("subgraph inference finished")
        # Write back the probability to self.variables
        if type(var_id) == set or type(var_id) == list:
            for id in var_id:
                self.variables[id]['inferenced_probability'] = ns_inference.factorGraphs[0].marginals[var_map[id]]
        elif type(var_id) == int:
            self.variables[var_id]['inferenced_probability'] = ns_inference.factorGraphs[0].marginals[var_map[var_id]]
        logging.info("inferenced probability recored")
        return var_id,ns_inference.factorGraphs[0].marginals[var_map[var_id]]

    def label(self, var_id_list, isapprox, mlist=None):
        '''
        Select n from k inferred hidden variables for labeling
        @param var_id_list:
        @return:
        '''
        entropy_list = list()
        label_list = list()
        probability_indicator = None
        if isapprox == True:
            probability_indicator = 'approximate_probability'
        else:
            probability_indicator = 'inferenced_probability'
        # Calculate the entropy of k hidden variables
        for var_id in var_id_list:
            var_index = var_id
            self.variables[var_index]['entropy'] = gml_utils.entropy(self.variables[var_index][probability_indicator])
            entropy_list.append([var_id, self.variables[var_index]['entropy']])
        # If labelnum is less than the number of variables passed in, mark top_n
        if len(var_id_list) > self.top_n:
            min_var_list = heapq.nsmallest(self.top_n, entropy_list, key=lambda x: x[1])  # 选出熵最小的变量
            for mv in min_var_list:
                label_list.append(mv[0])
        # Otherwise mark all the variables passed in
        else:
            label_list = var_id_list
        for var_index in label_list:
            if mlist is not None:
                mlist.remove(var_index)
            self.variables[var_index]['probability'] = self.variables[var_index][probability_indicator]
            self.variables[var_index]['label'] = 1 if self.variables[var_index]['probability'] >= 0.5 else 0
            self.variables[var_index]['is_evidence'] = True
            self.poential_variables_set.remove(var_index)
            self.observed_variables_set.add(var_index)
            self.labeled_variables_set.add(var_index)
            probability = self.variables[var_index]['probability']
            label = self.variables[var_index]['label']
            true_label = self.variables[var_index]['true_label']
            if self.out:
                with open(self.result, 'a') as f:
                    f.write(f'{var_index:7} {probability:10} {label:4} {true_label:4}')
                    f.write('\n')
        gml_utils.update_evidence(self.variables, self.features, label_list, self.evidence_interval)
        return len(label_list)

    def inference(self):
        '''
        Through the main process
        @return:
        '''
        labeled_var = 0
        labeled_count = 0
        update_feature_set = set()  # Stores features that have changed during a round of updates
        inferenced_variables_id = set()  #Hidden variables that have been established and inferred during a round of update
        pool = ProcessPoolExecutor(self.nprocess)
        if self.update_proportion > 0:
            update_cache = int(self.update_proportion * len(self.poential_variables_set))  # Evidential support needs to be recalculated every time update_cache variables are inferred
        self.evidential_support(self.poential_variables_set, self.all_feature_set)
        self.approximate_probability_estimation(self.poential_variables_set)
        m_list = self.select_top_m_by_es(self.top_m)
        while len(self.poential_variables_set) > 0:
            if len(m_list) == 0:
                m_list = self.select_top_m_by_es(self.top_m)
            # update_proportion is less than or equal to 0, which means that every time the variable is marked, the emergency_support needs to be updated.
            if self.update_proportion > 0 and labeled_var >= update_cache:
                # When the number of marked variables reaches update_cache, re-regression and calculate the emergency support
                for var_id in self.labeled_variables_set:
                    for feature_id in self.variables[var_id]['feature_set'].keys():
                        update_feature_set.add(feature_id)
                self.evidential_support(self.poential_variables_set, update_feature_set)
                self.approximate_probability_estimation(self.poential_variables_set)
                labeled_var = 0
                update_feature_set.clear()
                self.labeled_variables_set.clear()
                inferenced_variables_id.clear()
                m_list = self.select_top_m_by_es(self.top_m)
            k_list = self.select_top_k_by_entropy(m_list, self.top_k)
            approx_list = []
            add_list = [x for x in k_list if
                        x not in inferenced_variables_id]  # Added variables in each round of reasoning
            if len(k_list) > 0:
                if (self.nprocess == 1):
                    for var_id in k_list:
                        # If the entropy is less than a certain threshold, mark it directly without reasoning
                        if self.optimization_threshold == -2 or self.optimization_threshold >= 0 and \
                                self.variables[var_id]['entropy'] <= self.optimization_threshold:
                            approx_list.append(var_id)
                    #  把近似标的变量放入approx_list 然后交给label()函数
                    if len(approx_list) > 0:
                        len_label_list = self.label(approx_list, isapprox=True, mlist=m_list)  # edit at 2021-4-22
                        k_list.clear()
                        labeled_var += len_label_list
                        labeled_count += len_label_list
                    else:
                        for var_id in k_list:
                            if var_id in add_list:
                                self.inference_subgraph(int(var_id))
                                # For the variables that have been inferred during each round of update, because the parameters are not updated, there is no need for inference.
                                inferenced_variables_id.add(var_id)
                else:
                    futures = []
                    for var_id in add_list:
                        future = pool.submit(self.inference_subgraph, var_id)
                        futures.append(future)
                        inferenced_variables_id.add(var_id)
                    for ft in futures:
                        self.variables[ft.result()[0]]['inferenced_probability'] = ft.result()[1]
            len_label_list = self.label(k_list, isapprox=False, mlist=m_list)

            # gml_utils.update_bound(self.variables,self.features,label_list)  #Update the upper and lower bounds after each variable is marked
            labeled_var += len_label_list
            labeled_count += len_label_list
            logging.info("label_count=" + str(labeled_count))
            # output results
        self.save_results()
        self.score()

    def save_results(self):
        '''
        save results
        @return:
        '''
        with open(self.now+"_variables.pkl",'wb') as v:
            pickle.dump(self.variables,v)
        with open(self.now+"_features.pkl",'wb') as v:
            pickle.dump(self.features,v)

    def score(self):
        '''
        Get results, including: accuracy, precision, recall, F1 score
        :return:
        '''
        easys_pred_label = list()
        easys_true_label = list()
        hards_pred_label = list()
        hards_true_label = list()
        for var in self.variables:
            if var['is_easy'] == True:
                easys_true_label.append(var['true_label'])
                easys_pred_label.append(var['label'])
            else:
                hards_true_label.append(var['true_label'])
                hards_pred_label.append(var['label'])
        all_true_label = easys_true_label + hards_true_label
        all_pred_label = easys_pred_label + hards_pred_label
        sys.stdout = Logger(self.result)
        print("--------------------------------------------")
        print("total:")
        print("--------------------------------------------")
        print("total accuracy_score: " + str(metrics.accuracy_score(all_true_label, all_pred_label)))
        print("total precision_score: " + str(metrics.precision_score(all_true_label, all_pred_label)))
        print("total recall_score: " + str(metrics.recall_score(all_true_label, all_pred_label)))
        print("total f1_score: " + str(metrics.f1_score(all_true_label, all_pred_label)))
        print("--------------------------------------------")
        print("easys:")
        print("--------------------------------------------")
        print("easys accuracy_score:" + str(metrics.accuracy_score(easys_true_label, easys_pred_label)))
        print("easys precision_score:" + str(metrics.precision_score(easys_true_label, easys_pred_label)))
        print("easys recall_score:" + str(metrics.recall_score(easys_true_label, easys_pred_label)))
        print("easys f1_score: " + str(metrics.f1_score(easys_true_label, easys_pred_label)))
        print("--------------------------------------------")
        print("hards:")
        print("--------------------------------------------")
        print("hards accuracy_score:" + str(metrics.accuracy_score(hards_true_label, hards_pred_label)))
        print("hards precision_score: " + str(metrics.precision_score(hards_true_label, hards_pred_label)))
        print("hards recall_score: " + str(metrics.recall_score(hards_true_label, hards_pred_label)))
        print("hards f1_score: " + str(metrics.f1_score(hards_true_label, hards_pred_label)))