# flareml_train.py — 146 lines (138 loc) · 6.28 KB
'''
(c) Copyright 2021
All rights reserved
Programs written by Yasser Abduallah
Department of Computer Science
New Jersey Institute of Technology
University Heights, Newark, NJ 07102, USA
Permission to use, copy, modify, and distribute this
software and its documentation for any purpose and without
fee is hereby granted, provided that this copyright
notice appears in all copies. Programmer(s) makes no
representations about the suitability of this
software for any purpose. It is provided "as is" without
express or implied warranty.
@author: Yasser Abduallah
'''
import argparse
import csv
import datetime
import os
import sys
import time
from time import sleep

import numpy as np

from flareml_utils import *
# Default data files; --train_data_file / -t overrides TRAIN_INPUT at the CLI.
TRAIN_INPUT = 'data/train_data/flaringar_training_sample.csv'
TEST_INPUT = 'data/test_data/flaringar_simple_random_40.csv'
# Default for the --normalize_data flag: scale feature columns before training.
normalize_data = False
def train_model(args):
    """Train flare-prediction model(s) described by the ``args`` mapping.

    Expected keys in ``args`` (a ``dict``, typically ``vars(parser_args)``):
        algorithm        -- one of the entries in ``algorithms`` (from
                            flareml_utils): 'ENS', 'RF', 'MLP' or 'ELM';
                            defaults to 'ENS' when the key is missing.
        train_data_file  -- path to a CSV training-data file.
        modelid          -- identifier used to name the saved model files.
        normalize_data   -- optional truthy flag; scale feature columns.
        verbose          -- optional truthy flag; echo log output to terminal.

    Exits the process via ``sys.exit()`` on any invalid input.  Returns
    ``None``; trained models are persisted by the ``*_train_model`` helpers
    (``rf_train_model``, ``mlp_train_model``, ``elm_train_model`` from
    flareml_utils).
    """
    # Fall back to the ensemble algorithm when none was supplied.
    if 'algorithm' not in args:
        args['algorithm'] = 'ENS'
    algorithm = args['algorithm']
    if algorithm.strip().upper() not in algorithms:
        print('Invalid algorithm:', algorithm, '\nAlgorithm must be one of: ', algorithms)
        sys.exit()

    # Validate the training-data path (local; shadows the module default).
    train_input = args['train_data_file']
    if train_input.strip() == '':
        print('Training data file can not be empty')
        sys.exit()
    if not os.path.exists(train_input):
        print('Training data file does not exist:', train_input)
        sys.exit()
    if not os.path.isfile(train_input):
        print('Training data is not a file:', train_input)
        sys.exit()

    modelid = args['modelid']
    if modelid.strip() == '':
        print('Model id can not be empty')
        sys.exit()
    if modelid.strip().lower() == 'default_model':
        # Guard against clobbering the shipped default models.
        ans = input('Using default_model as an id will overwrite the default '
                    'models. Are you sure you want to continue? [n] ')
        if not boolean(ans):
            print('Exiting..')
            sys.exit()

    normalize_data = boolean(args['normalize_data']) if 'normalize_data' in args else False
    log('normalize_data:', normalize_data)

    # Explicitly enable/disable terminal logging.  The original code left the
    # "off" branch as a dead local assignment (verbose = False) that had no
    # effect; passing False here makes the intent actually happen.
    set_log_to_terminal(boolean(args.get('verbose', False)))

    log('Your provided arguments as: ', args)
    log("=============================== Logging Started using algorithm: " + algorithm + " ==============================")
    log("Execution time started: " + timestr)
    log("Log files used in this run: " + logFile)
    log("train data set: " + train_input)
    log("Creating a model with id: " + modelid)
    print("Starting training with a model with id:", modelid, 'training data file:', train_input)
    print('Loading data set...')
    dataset = load_dataset_csv(train_input)
    log("orig cols: ", dataset.columns)
    # Keep only the required feature columns.
    for c in dataset.columns:
        if c not in req_columns:
            dataset = removeDataColumn(c, dataset)
    log("after removal cols: ", dataset.columns)
    cols = list(dataset.columns)
    if flares_col_name not in cols:
        print('The required flares class column:', flares_col_name, ' is not included in the data file')
        sys.exit()
    # Map the textual flare class to its numeric label in a new column.
    dataset['flarecn'] = [convert_class_to_num(c) for c in dataset[flares_col_name]]
    log('all columns: ', dataset.columns)
    log('\n', dataset.head())
    dataset = removeDataColumn(flares_col_name, dataset)
    if normalize_data:
        log('Normalizing and scaling the data...')
        for c in cols:
            if c != 'flarecn' and c != flares_col_name:
                dataset[c] = normalize_scale_data(dataset[c].values)
    train_y = dataset['flarecn']
    train_x = removeDataColumn('flarecn', dataset)
    test_x = None
    test_y = None
    # NOTE(review): models_dir is computed but never read in this function —
    # presumably the *_train_model helpers decide the directory themselves;
    # kept for parity with the original. TODO confirm and remove if truly dead.
    models_dir = custom_models_dir
    if modelid == 'default_model':
        models_dir = default_models_dir

    alg = algorithm.strip().upper()
    print('Training is in progress, please wait until it is done...')
    start_time = datetime.datetime.now()
    print('Training started at:', start_time.strftime("%Y-%m-%d %H:%M:%S"))
    # 'ENS' trains all three learners; any other value trains just one.
    if alg in ['RF', 'ENS']:
        rf_train_model(train_x, test_x, train_y, test_y, model_id=modelid)
        if alg == 'ENS':
            print('Finished 1/3 training..')
    if alg in ['MLP', 'ENS']:
        mlp_train_model(train_x, test_x, train_y, test_y, model_id=modelid)
        if alg == 'ENS':
            print('Finished 2/3 training..')
    if alg in ['ELM', 'ENS']:
        elm_train_model(train_x, test_x, train_y, test_y, model_id=modelid)
        if alg == 'ENS':
            print('Finished 3/3 training..')
    end_time = datetime.datetime.now()
    print('Training finished at:', end_time.strftime("%Y-%m-%d %H:%M:%S"))
    total_time = end_time - start_time
    total_minutes = round(total_time.total_seconds() / 60, 2)
    print('Training total time:', total_minutes, 'Minute(s)')
    # Pluralize "model" for the ensemble, which trains three models
    # (restores the commented-out intent of the original code).
    ens = '(s)' if alg == 'ENS' else ''
    print('\nFinished training the', alg, 'model' + ens + ', you may use the flareml_test.py program to make prediction.')
# ---------------------------------------------------------------------------
# Command-line interface: declare the options as a data table, register them
# on the parser, and collapse the parsed namespace into a plain dict that
# train_model() consumes.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
_cli_options = (
    (('-t', '--train_data_file'),
     {'default': TRAIN_INPUT,
      'help': 'full path to a file includes training data to create a model, must be in csv with comma separator'}),
    (('-l', '--logfile'),
     {'default': logFile,
      'help': 'full path to a file to write logging information about current execution.'}),
    (('-v', '--verbose'),
     {'default': False,
      'help': 'True/False value to include logging information in result json object, note that result will contain a lot of information'}),
    (('-a', '--algorithm'),
     {'default': 'ENS',
      'help': 'Algorithm to use for training. Available algorithms: ENS, RF,MLP, and ELM. \nENS \tthe Ensemble algorithm is the default, RF\t Random Forest algorithm, \nMLP\tMultilayer Perceptron algorithm, \nELM\tExtreme Learning Machine'}),
    (('-m', '--modelid'),
     {'default': 'default_model',
      'help': 'model id to save or load it as a file name. This is to identity each trained model.'}),
    (('-n', '--normalize_data'),
     {'default': normalize_data,
      'help': 'Normalize and scale data.'}),
)
for _flags, _kwargs in _cli_options:
    parser.add_argument(*_flags, **_kwargs)

# Tolerate unrecognized flags instead of erroring out.
args, unknown = parser.parse_known_args()
args = vars(args)

if __name__ == "__main__":
    train_model(args)