-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathPythonCode
More file actions
103 lines (70 loc) · 3.3 KB
/
PythonCode
File metadata and controls
103 lines (70 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Load the mushroom dataset and one-hot encode every feature.
import pandas as pd
import numpy as np

data = pd.read_csv("mushrooms.csv")
data.head()

target = 'class'                       # column we want to predict
labels = data[target]
features = data.drop(target, axis=1)   # predictors only

# Every column in this dataset is categorical, so encode them all:
# append the dummy columns, then drop the original categorical ones.
categorical = features.columns
features = pd.concat([features, pd.get_dummies(features[categorical])], axis=1)
features.drop(categorical, axis=1, inplace=True)

# Binary target: 1 = poisonous ('p'), 0 = edible.
labels = pd.get_dummies(labels)['p']
# Split the dataset into training and testing sets: 80% of the records
# go to the training set (fixed random_state for reproducibility).
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in
# scikit-learn 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)
# Train/predict pipeline
from sklearn.metrics import fbeta_score, accuracy_score
from time import time  # FIX: original relied on an import that appeared later in the file


def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    Fit `learner` on a slice of the training data and evaluate it.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set

    returns:
       dict with wall-clock 'train_time' and 'pred_time', plus accuracy
       and F0.5 scores on the test set and on the first 300 training rows
       (the training slice gives a cheap overfitting check).
    '''
    results = {}

    # Time the fit on the first `sample_size` training rows.
    start = time()
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()
    results['train_time'] = end - start

    # Time predictions on the full test set and a 300-row training slice.
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time()
    results['pred_time'] = end - start

    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    # beta=0.5 weights precision more heavily than recall.
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5)
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    # FIX: Python 2 `print` statement is a SyntaxError in Python 3.
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    return results
# Choosing the best model: compare three classifiers trained on 1%, 10%,
# and 100% of the training set.
from time import time
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

clf_A = GaussianNB()
clf_B = RandomForestClassifier()
clf_C = KNeighborsClassifier()

training_length = len(X_train)
samples_1 = int(training_length * 0.01)
samples_10 = int(training_length * 0.1)
samples_100 = int(training_length * 1)

# results[classifier name][size index] -> metrics dict from train_predict.
results = {}
for clf in (clf_A, clf_B, clf_C):
    clf_name = clf.__class__.__name__
    results[clf_name] = {
        i: train_predict(clf, samples, X_train, y_train, X_test, y_test)
        for i, samples in enumerate((samples_1, samples_10, samples_100))
    }
# Checking our best model: refit a random forest on the full training set.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Bonus: the features that contribute most to the edible/poisonous decision.
# FIX: in Python 3 zip() returns an iterator, which has no .sort() method —
# the original `z = zip(...); z.sort(reverse=True)` raised AttributeError.
# sorted() on the (importance, column) pairs gives the same Python 2 ordering.
z = sorted(zip(clf.feature_importances_, X_train.columns), reverse=True)
z[:10]