modis_analysis/create_rfc_model.py at master · wrightni/modis_analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import h5py
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pickle

from lib import load_data
from lib import model_analysis as aa


def main():
    """Train a random forest regressor on the 8-category training set and save it.

    Steps performed:
      1. Load features and labels from ``modis_artificial_tds_8cat.hdf`` in the
         current working directory via ``load_data.load_tds_file``.
      2. Optionally (controlled by the local ``tts`` flag) hold out 30% of the
         samples as a test set.
      3. Fit a 100-tree ``RandomForestRegressor`` with out-of-bag scoring.
      4. Pickle the fitted model to ``rfc_8cat.p`` in the current working
         directory.
      5. Print the out-of-bag score and the features ranked by importance.
      6. If a test set was split off, print statistics and plot the results
         using the ``model_analysis`` helpers.
    """
    root_path = os.getcwd()
    tds_file = os.path.join(root_path, "modis_artificial_tds_8cat.hdf")
    features, labels = load_data.load_tds_file(tds_file)

    # Whether or not to do a test/train split
    tts = False
    features_test = None
    labels_test = None
    if tts:
        # Create a test set as a split from the main set.
        features_train, features_test, labels_train, labels_test = train_test_split(
            features, labels, test_size=0.3, random_state=51)
        features_test = np.array(features_test, dtype=np.float32)
        labels_test = np.array(labels_test, dtype=np.float32)
    else:
        features_train, labels_train = features, labels
    features_train = np.array(features_train, dtype=np.float32)
    labels_train = np.array(labels_train, dtype=np.float32)

    # Display model and test input information
    print("Size of training set: {}".format(len(features_train)))
    if tts:
        print("Size of test set:     {}".format(len(features_test)))

    print(np.shape(features_train))
    print(np.shape(labels_train))

    # sklearn Random Forest
    rfc = RandomForestRegressor(n_estimators=100, oob_score=True)
    rfc.fit(features_train, labels_train)

    # Save the model
    model_filename = os.path.join(root_path, "rfc_8cat.p")
    with open(model_filename, 'wb') as mf:
        pickle.dump(rfc, mf)

    # To evaluate a previously saved model instead of training a new one,
    # replace the fit/save steps above with pickle.load() on the model file.
    # NOTE: only unpickle model files from a trusted source.

    print(rfc.oob_score_)

    # Rank features from most to least important.  Iterate over however many
    # features the model actually saw rather than assuming a fixed count, so
    # the report neither truncates nor raises IndexError for other datasets.
    importances = rfc.feature_importances_
    ranking = np.argsort(importances)[::-1]
    print("Feature ranking:")
    for rank, idx in enumerate(ranking, start=1):
        # Features are reported with 1-based numbering.
        print("%d. feature %s (%f)" % (rank, int(idx) + 1, importances[idx]))

    # If we split a test set, use it for some tests!
    if tts:
        labels_predict = rfc.predict(features_test)
        aa.print_stats(labels_test, labels_predict)
        aa.plot_results(labels_test, labels_predict)


def write_hdf(fname, features, labels, weights):
    """Write a training dataset to an HDF5 file.

    The file at ``fname`` is opened in ``'w'`` mode (created, or truncated if
    it already exists) and three datasets are stored in it: ``"features"``,
    ``"labels"`` and ``"weights"``.

    Args:
        fname: Path of the HDF5 file to write.
        features: Data stored under the ``"features"`` dataset.
        labels: Data stored under the ``"labels"`` dataset.
        weights: Data stored under the ``"weights"`` dataset.
    """
    # The context manager guarantees the file handle is closed even if one of
    # the create_dataset calls raises.
    with h5py.File(fname, 'w') as tds_file:
        tds_file.create_dataset("features", data=features)
        tds_file.create_dataset("labels", data=labels)
        tds_file.create_dataset("weights", data=weights)


# Run the training workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()