python_data_science_tutorial/kfold_cross_validation.py at master · codeastar/python_data_science_tutorial · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

#load the iris data set; the column names are supplied explicitly via `names`
iris_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data"
iris_columns = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width", "Class"]
df = pd.read_csv(iris_url, names=iris_columns)

#candidate classifiers, each paired with the short label used in the
#printed report and on the chart; all are built with default parameters
models = [
    ("LoR", LogisticRegression()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("SVC", SVC()),
    ("LSVC", LinearSVC()),
    ("SGD", SGDClassifier()),
    ("KNN", KNeighborsClassifier()),
    ("GNB", GaussianNB()),
    ("DT", DecisionTreeClassifier()),
    ("RF", RandomForestClassifier()),
]

#per-model results, filled in the same order as `models`
model_names, means, stds = [], [], []

#shuffle the rows in place, then keep the first 121 rows as learning data:
#columns 0-3 are the features, column 4 is the class label
data_array = df.values
np.random.shuffle(data_array)
learning_rows = data_array[:121]
X_learning = learning_rows[:, 0:4]
Y_learning = learning_rows[:, 4]

#splitter that divides the learning data into 10 folds
kfold = model_selection.KFold(n_splits=10)

def showSplitting(X_learning, cv=None):
    """Print the train and test indices of every cross-validation fold.

    Args:
        X_learning: samples to split; passed unchanged to ``cv.split``.
        cv: splitter object whose ``split(X)`` yields
            ``(train_index, test_index)`` pairs. When omitted, the
            module-level ``kfold`` splitter is used, so existing
            one-argument calls behave as before.
    """
    #resolve the default at call time so the module-level splitter is only
    #required when no explicit splitter is given
    splitter = kfold if cv is None else cv
    for train_index, test_index in splitter.split(X_learning):
        print("Train Index:")
        print(train_index)
        print("Test Index:")
        print(test_index)

#uncomment the following line to see how the samples are split
#showSplitting(X_learning)

for name, model in models:
    #cross validate this model over the folds, scoring each fold on accuracy
    cv_results = model_selection.cross_val_score(
        model, X_learning, Y_learning, scoring='accuracy', cv=kfold)
    fold_mean = cv_results.mean()
    fold_std = cv_results.std()

    #report the per-fold scores and their summary statistics
    print("\n" + name)
    print("Results: " + str(cv_results))
    print("Mean: " + str(fold_mean))
    print("Standard Deviation: " + str(fold_std))

    #keep the summary for the bar chart
    model_names.append(name)
    means.append(fold_mean)
    stds.append(fold_std)

#figure of 9 x 5 inches (width, height) at a resolution of 100 DPI
plt.figure(figsize=(9, 5), dpi=100)

#one bar per model: the bar height is the mean score and the error bar
#is the standard deviation
width = 0.5   # bar width
x_loc = np.arange(len(models))   # x position of each model's bar
models_graph = plt.bar(x_loc, means, width=width, yerr=stds)

#title, y-axis label and the model names as x-axis tick labels
plt.title('Scores by models')
plt.ylabel('Accuracy')
plt.xticks(x_loc, model_names)

#add the value on top of every bar
def addLabel(rects):
    """Write each bar's height as a text label above the bar.

    Args:
        rects: iterable of bar objects providing ``get_x()``,
            ``get_width()`` and ``get_height()``.
    """
    for bar in rects:
        bar_height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2.
        #text is anchored at 1.05 times the bar height, centred on the bar
        plt.text(x_center, 1.05*bar_height, '%f' % bar_height,
                 ha='center', va='bottom')

#label every bar in the chart with its height (the model's mean score)
addLabel(models_graph)

#display the finished figure
plt.show()