|
| 1 | + |
| 2 | +#import logging |
| 3 | +#logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', handlers=[logging.FileHandler('app.log','w'),logging.StreamHandler()]) |
| 4 | + |
from experiments import experiments, bk, setups
from gensim.test.utils import datapath
from datasets.get_datasets import *
from boostsrl import boostsrl
import parameters as params
import utils as utils
import numpy as np
import json
import random
import time
import sys
import os
| 16 | + |
| 17 | +#verbose=True |
| 18 | +source_balanced = False |
| 19 | +balanced = False |
| 20 | + |
| 21 | +experiment_title = '' |
| 22 | + |
| 23 | +def save_experiment(data, experiment_title): |
| 24 | + if not os.path.exists('experiments/' + experiment_title): |
| 25 | + os.makedirs('experiments/' + experiment_title) |
| 26 | + results = [] |
| 27 | + if os.path.isfile('experiments/rdnb.json'): |
| 28 | + with open('experiments/{}/rdnb.json'.format(experiment_title), 'r') as fp: |
| 29 | + results = json.load(fp) |
| 30 | + results.append(data) |
| 31 | + with open('experiments/{}/rdnb.json'.format(experiment_title), 'w') as fp: |
| 32 | + json.dump(results, fp) |
| 33 | + |
| 34 | +def train_and_test(background, train_pos, train_neg, train_facts, test_pos, test_neg, test_facts): |
| 35 | + ''' |
| 36 | + Train RDN-B |
| 37 | + ''' |
| 38 | + |
| 39 | + start = time.time() |
| 40 | + model = boostsrl.train(background, train_pos, train_neg, train_facts, refine=None, transfer=None, trees=params.TREES) |
| 41 | + |
| 42 | + end = time.time() |
| 43 | + learning_time = end-start |
| 44 | + |
| 45 | + utils.print_function('Model training time {}'.format(learning_time), experiment_title) |
| 46 | + |
| 47 | + will = ['WILL Produced-Tree #'+str(i+1)+'\n'+('\n'.join(model.get_will_produced_tree(treenumber=i+1))) for i in range(10)] |
| 48 | + for w in will: |
| 49 | + utils.print_function(w, experiment_title) |
| 50 | + |
| 51 | + start = time.time() |
| 52 | + |
| 53 | + # Test model |
| 54 | + results = boostsrl.test(model, test_pos, test_neg, test_facts, trees=params.TREES) |
| 55 | + |
| 56 | + end = time.time() |
| 57 | + inference_time = end-start |
| 58 | + |
| 59 | + utils.print_function('Inference time {}'.format(inference_time), experiment_title) |
| 60 | + |
| 61 | + return model, results.summarize_results(), learning_time, inference_time |
| 62 | + |
| 63 | +def get_confusion_matrix(to_predicate): |
| 64 | + # Get confusion matrix by reading results from db files created by the Java application |
| 65 | + utils.print_function('Converting results file to txt', experiment_title) |
| 66 | + |
| 67 | + utils.convert_db_to_txt(to_predicate, params.TEST_OUTPUT) |
| 68 | + y_true, y_pred = utils.read_results(params.TEST_OUTPUT.format(to_predicate).replace('.db', '.txt')) |
| 69 | + |
| 70 | + |
| 71 | + utils.print_function('Building confusion matrix', experiment_title) |
| 72 | + |
| 73 | + # True Negatives, False Positives, False Negatives, True Positives |
| 74 | + TN, FP, FN, TP = utils.get_confusion_matrix(y_true, y_pred) |
| 75 | + |
| 76 | + utils.print_function('Confusion matrix \n', experiment_title) |
| 77 | + matrix = ['TP: {}'.format(TP), 'FP: {}'.format(FP), 'TN: {}'.format(TN), 'FN: {}'.format(FN)] |
| 78 | + for m in matrix: |
| 79 | + utils.print_function(m, experiment_title) |
| 80 | + |
| 81 | + # Converts to int to fix JSON np.int64 problem |
| 82 | + return {'TP': int(TP), 'FP': int(FP), 'TN': int(TN), 'FN': int(FN)} |
| 83 | + |
| 84 | +def main(): |
| 85 | + |
| 86 | + if not os.path.exists('experiments'): |
| 87 | + os.makedirs('experiments') |
| 88 | + |
| 89 | + results, confusion_matrix = {}, {} |
| 90 | + |
| 91 | + # Dictionaries to keep all experiments results |
| 92 | + #transboostler_experiments = {} |
| 93 | + rdnb_confusion_matrix = {} |
| 94 | + |
| 95 | + for experiment in experiments: |
| 96 | + |
| 97 | + confusion_matrix_save_all = [] |
| 98 | + |
| 99 | + experiment_title = experiment['id'] + '_' + experiment['source'] + '_' + experiment['target'] |
| 100 | + |
| 101 | + target = experiment['target'] |
| 102 | + |
| 103 | + # Load total target dataset |
| 104 | + tar_total_data = datasets.load(target, bk[target], seed=params.SEED) |
| 105 | + |
| 106 | + if target in ['nell_sports', 'nell_finances', 'yago2s']: |
| 107 | + n_runs = params.N_FOLDS |
| 108 | + else: |
| 109 | + n_runs = len(tar_total_data[0]) |
| 110 | + |
| 111 | + results = { 'save': { }} |
| 112 | + |
| 113 | + utils.print_function('Starting experiment {} \n'.format(experiment_title), experiment_title) |
| 114 | + |
| 115 | + _id = experiment['id'] |
| 116 | + source = experiment['source'] |
| 117 | + target = experiment['target'] |
| 118 | + predicate = experiment['predicate'] |
| 119 | + to_predicate = experiment['to_predicate'] |
| 120 | + arity = experiment['arity'] |
| 121 | + |
| 122 | + if target in ['twitter', 'yeast']: |
| 123 | + recursion = True |
| 124 | + else: |
| 125 | + recursion = False |
| 126 | + |
| 127 | + # Get sources and targets |
| 128 | + sources = [s.replace('.', '').replace('+', '').replace('-', '') for s in set(bk[source]) if s.split('(')[0] != to_predicate and 'recursion_' not in s] |
| 129 | + targets = [t.replace('.', '').replace('+', '').replace('-', '') for t in set(bk[target]) if t.split('(')[0] != to_predicate and 'recursion_' not in t] |
| 130 | + |
| 131 | + path = os.getcwd() + '/experiments/' + experiment_title |
| 132 | + if not os.path.exists(path): |
| 133 | + os.mkdir(path) |
| 134 | + |
| 135 | + results['save'] = { |
| 136 | + 'experiment': 0, |
| 137 | + 'n_runs': 0, |
| 138 | + 'seed': 441773, |
| 139 | + 'source_balanced' : False, |
| 140 | + 'balanced' : False, |
| 141 | + 'folds' : n_runs, |
| 142 | + 'nodeSize' : params.NODESIZE, |
| 143 | + 'numOfClauses' : params.NUMOFCLAUSES, |
| 144 | + 'maxTreeDepth' : params.MAXTREEDEPTH |
| 145 | + } |
| 146 | + |
| 147 | + # APAGAR ISSO AQUI |
| 148 | + n_runs = 1 |
| 149 | + while results['save']['n_runs'] < n_runs: |
| 150 | + |
| 151 | + utils.print_function('Run: ' + str(results['save']['n_runs'] + 1), experiment_title) |
| 152 | + |
| 153 | + if('rdn-b' not in rdnb_confusion_matrix): |
| 154 | + #transboostler_experiments[embeddingModel] = {} |
| 155 | + rdnb_confusion_matrix['rdn-b'] = {} |
| 156 | + |
| 157 | + #transboostler_experiments[embeddingModel][similarityMetric] = [] |
| 158 | + #experiment_metrics = {key: {'CLL': [], 'AUC ROC': [], 'AUC PR': [], 'Learning Time': [], 'Inference Time': []} for key in params.AMOUNTS} |
| 159 | + rdnb_confusion_matrix['rdn-b'] = [] |
| 160 | + confusion_matrix = {'TP': [], 'FP': [], 'TN': [], 'FN': []} |
| 161 | + |
| 162 | + utils.print_function('Starting experiments for RDN-B \n', experiment_title) |
| 163 | + |
| 164 | + if target in ['nell_sports', 'nell_finances', 'yago2s']: |
| 165 | + n_folds = params.N_FOLDS |
| 166 | + else: |
| 167 | + n_folds = len(tar_total_data[0]) |
| 168 | + |
| 169 | + results_save, confusion_matrix_save = [], [] |
| 170 | + for i in range(n_folds): |
| 171 | + utils.print_function('\n Starting fold {} of {} folds \n'.format(i+1, n_folds), experiment_title) |
| 172 | + |
| 173 | + ob_save, cm_save = {}, {} |
| 174 | + |
| 175 | + if target not in ['nell_sports', 'nell_finances', 'yago2s']: |
| 176 | + [tar_train_pos, tar_test_pos] = datasets.get_kfold_small(i, tar_total_data[0]) |
| 177 | + else: |
| 178 | + t_total_data = datasets.load(target, bk[target], target=to_predicate, balanced=balanced, seed=params.SEED) |
| 179 | + tar_train_pos = datasets.split_into_folds(t_total_data[1][0], n_folds=n_folds, seed=params.SEED)[i] + t_total_data[0][0] |
| 180 | + |
| 181 | + # Load new predicate target dataset |
| 182 | + tar_data = datasets.load(target, bk[target], target=to_predicate, balanced=balanced, seed=params.SEED) |
| 183 | + |
| 184 | + # Group and shuffle |
| 185 | + if target not in ['nell_sports', 'nell_finances', 'yago2s']: |
| 186 | + [tar_train_facts, tar_test_facts] = datasets.get_kfold_small(i, tar_data[0]) |
| 187 | + [tar_train_pos, tar_test_pos] = datasets.get_kfold_small(i, tar_data[1]) |
| 188 | + [tar_train_neg, tar_test_neg] = datasets.get_kfold_small(i, tar_data[2]) |
| 189 | + else: |
| 190 | + [tar_train_facts, tar_test_facts] = [tar_data[0][0], tar_data[0][0]] |
| 191 | + to_folds_pos = datasets.split_into_folds(tar_data[1][0], n_folds=n_folds, seed=params.SEED) |
| 192 | + to_folds_neg = datasets.split_into_folds(tar_data[2][0], n_folds=n_folds, seed=params.SEED) |
| 193 | + [tar_train_pos, tar_test_pos] = datasets.get_kfold_small(i, to_folds_pos) |
| 194 | + [tar_train_neg, tar_test_neg] = datasets.get_kfold_small(i, to_folds_neg) |
| 195 | + |
| 196 | + random.shuffle(tar_train_pos) |
| 197 | + random.shuffle(tar_train_neg) |
| 198 | + |
| 199 | + utils.print_function('Start training from scratch\n', experiment_title) |
| 200 | + |
| 201 | + utils.print_function('Target train facts examples: %s' % len(tar_train_facts), experiment_title) |
| 202 | + utils.print_function('Target train pos examples: %s' % len(tar_train_pos), experiment_title) |
| 203 | + utils.print_function('Target train neg examples: %s\n' % len(tar_train_neg), experiment_title) |
| 204 | + |
| 205 | + utils.print_function('Target test facts examples: %s' % len(tar_test_facts), experiment_title) |
| 206 | + utils.print_function('Target test pos examples: %s' % len(tar_test_pos), experiment_title) |
| 207 | + utils.print_function('Target test neg examples: %s\n' % len(tar_test_neg), experiment_title) |
| 208 | + |
| 209 | + # Creating background |
| 210 | + background = boostsrl.modes(bk[target], [to_predicate], useStdLogicVariables=False, maxTreeDepth=params.MAXTREEDEPTH, nodeSize=params.NODESIZE, numOfClauses=params.NUMOFCLAUSES) |
| 211 | + |
| 212 | + # Train and test |
| 213 | + utils.print_function('Training from scratch \n', experiment_title) |
| 214 | + |
| 215 | + # Learn and test model not revising theory |
| 216 | + model, t_results, learning_time, inference_time = train_and_test(background, tar_train_pos, tar_train_neg, tar_train_facts, tar_test_pos, tar_test_neg, tar_test_facts) |
| 217 | + del model |
| 218 | + |
| 219 | + t_results['Learning time'] = learning_time |
| 220 | + ob_save['rdn-b'] = t_results |
| 221 | + |
| 222 | + utils.show_results(utils.get_results_dict(t_results, learning_time, inference_time), experiment_title) |
| 223 | + |
| 224 | + cm = get_confusion_matrix(to_predicate) |
| 225 | + cm_save['rdn-b'] = cm |
| 226 | + |
| 227 | + confusion_matrix['TP'].append(cm['TP']) |
| 228 | + confusion_matrix['FP'].append(cm['FP']) |
| 229 | + confusion_matrix['TN'].append(cm['TN']) |
| 230 | + confusion_matrix['FN'].append(cm['FN']) |
| 231 | + |
| 232 | + rdnb_confusion_matrix['rdn-b'].append(confusion_matrix) |
| 233 | + del cm, t_results, learning_time, inference_time |
| 234 | + |
| 235 | + results_save.append(ob_save) |
| 236 | + save_experiment(results_save, experiment_title) |
| 237 | + results['save']['n_runs'] += 1 |
| 238 | + |
| 239 | + matrix_filename = os.getcwd() + '/experiments/{}_{}_{}/rdnb_confusion_matrix.json'.format(_id, source, target) |
| 240 | + |
| 241 | + # Save all results |
| 242 | + utils.save_json_file(matrix_filename, rdnb_confusion_matrix) |
| 243 | + |
| 244 | +if __name__ == '__main__': |
| 245 | + sys.exit(main()) |
0 commit comments