-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrf.py
More file actions
199 lines (177 loc) · 5.83 KB
/
rf.py
File metadata and controls
199 lines (177 loc) · 5.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# -*- coding: utf-8 -*-
import numpy as np
import math
import sys
from datasets import Sample
## Hyperparameters
sys.setrecursionlimit(10000)
soF = 1 # number of features randomly chosen
MAXDEPTH = 10#math.inf
## Pseudo Code
'''
Precondition: A training set S := (x1, y1), . . . ,(xn, yn), features F, and number
of trees in forest B.
1 function RandomForest(S , F)
2 H ← ∅
3 for i ∈ 1, . . . , B do
4 S(i) ← A bootstrap sample from S
5 hi ← RandomizedTreeLearn(S(i), F)
6 H ← H ∪ {hi}
7 end for
8 return H
9 end function
10 function RandomizedTreeLearn(S , F)
11 At each node:
12 f ← very small subset of F
13 Split on best feature in f
14 return The learned tree
15 end function
'''
def RandomForest(S, nbrTrees = 100, F=None, SF=True):
    """Train a random forest on dataset S.

    Parameters:
        S: list of training samples; each must expose getNbrAttributes()
           and be hashable (they are placed in a set for out-of-bag tracking).
        nbrTrees: number of trees B to grow.
        F: iterable of attribute indices to consider; defaults to all
           attributes reported by S[0].
        SF: if True, each split draws a single random feature (see
            FeatureSubsample).

    Returns:
        [H, Sb] where H[i] is the i-th learned tree and Sb[i] is the set of
        out-of-bag samples (not drawn in that tree's bootstrap sample).
    """
    # Fix: compare against None with `is`, not `==` (identity, not equality).
    if F is None:
        F = range(S[0].getNbrAttributes())
    H = []   # learned trees
    Sb = []  # out-of-bag sample sets, parallel to H
    Sset = set(S)  # built once; used to compute out-of-bag sets quickly
    for _ in range(nbrTrees):
        [S_i, Stst] = BootstrapSample(S, Sset)
        H.append(RandomizedTreeLearn(S_i, F, SF))
        Sb.append(Stst)
    return [H, Sb]
def RandomizedTreeLearn(S, F, singleFeature, maxdepth = MAXDEPTH):
    """Recursively grow one randomized decision tree from subsample S.

    At each node a small random subset of the features F is drawn and the
    best (feature, threshold) split among them is chosen by information gain.

    Parameters:
        S: non-empty list of samples for this node.
        F: iterable of candidate attribute indices (passed down unchanged —
           features may be reused at deeper nodes).
        singleFeature: forwarded to FeatureSubsample (single feature vs
            log2-sized subset).
        maxdepth: remaining depth budget; a leaf is emitted when exhausted.

    Returns:
        A TreeNode or TreeLeaf.
    """
    def buildBranch(dataset, default, attributes):
        # Empty side of a split -> leaf predicting the parent majority class.
        if not dataset:
            return TreeLeaf(default)
        allClass, theClass = allAnyClass(dataset)
        if allClass:
            return TreeLeaf(theClass)
        return RandomizedTreeLearn(dataset, attributes, singleFeature, maxdepth - 1)

    default = mostCommon(S)
    if maxdepth < 1:
        return TreeLeaf(default)
    f = FeatureSubsample(F, singleFeature)  # random feature subset for this node
    best = bestFeature(S, f)
    # Bug fix: bestFeature returns None when len(S) <= 1; the original code
    # unpacked it unconditionally and crashed with a TypeError. Emit a leaf.
    if best is None:
        return TreeLeaf(default)
    [a, v, dataR, dataL] = best  # dataR: attribute >= v, dataL: attribute < v
    branches = [buildBranch(dataL, default, F),
                buildBranch(dataR, default, F)]
    return TreeNode(a, branches, default, v)
def BootstrapSample(S, Sset):
    """Draw a bootstrap sample of 2/3 of |S| (with replacement).

    Returns [train, outOfBag] where outOfBag is the subset of Sset that was
    never drawn into train.
    """
    sampleSize = int(len(S) * 2/3)
    train = np.random.choice(S, sampleSize, replace = True).tolist()
    outOfBag = Sset - set(train)
    return [train, outOfBag]
def FeatureSubsample(F, singleFeature):
    """Draw a small random subset of the features F.

    Parameters:
        F: sequence of candidate feature indices.
        singleFeature: if True the subset has soF (module constant) elements;
            otherwise it has floor(log2(len(F)+1)) elements.

    Returns:
        A list of distinct feature indices.
    """
    if singleFeature:
        soS = soF
    else:
        soS = int(math.log(len(F)+1, 2))
    # Bug fix: np.random.choice samples WITH replacement by default, so the
    # "subset" could contain the same feature several times; replace=False
    # guarantees distinct features (soS <= len(F) holds for both branches).
    return np.random.choice(F, soS, replace=False).tolist()
class TreeNode:
    """Internal decision-tree node.

    Stores the split attribute, its threshold value, the child branches, and
    a default (majority) class used as a fallback prediction.
    """
    def __init__(self, attribute, branches, default, value):
        self.attribute = attribute
        self.branches = branches
        self.default = default
        self.value = value

    def __repr__(self):
        inner = "".join(str(branch) for branch in self.branches)
        return str(self.attribute) + "(" + inner + ")"
class TreeLeaf:
    """Decision-tree leaf; holds the predicted class value."""
    def __init__(self, cvalue):
        self.cvalue = cvalue

    def __repr__(self):
        return str(self.cvalue)
def getAttributeValues(S, attribute):
    """Return the sorted distinct values `attribute` takes across samples in S.

    Fix: the original did an O(n) `in`-list membership test per sample
    (O(n^2) overall); a set comprehension yields the same sorted output in
    O(n log n). Assumes attribute values are hashable (they are compared
    numerically elsewhere, e.g. with >= in select).
    """
    return sorted({x.getAttributeValue(attribute) for x in S})
def bestFeature(S, features):
    """Find the (feature, threshold) split with the highest information gain.

    Returns [feature, value, over_subset, under_subset] for the best split
    found among `features`, or None when S holds fewer than two samples
    (no split is possible). Ties keep the first candidate encountered.
    """
    if len(S) <= 1:
        return None
    best = None  # (gain, feature, value, over, under)
    for feat in features:
        for threshold in getAttributeValues(S, feat):
            gain, over, under = averageGain(S, feat, threshold)
            if best is None or gain > best[0]:
                best = (gain, feat, threshold, over, under)
    return [best[1], best[2], best[3], best[4]]
def mostCommon(S):
    """Return the most frequent class label in S (ties broken arbitrarily)."""
    labels = [x.getClass() for x in S]
    candidates = list(set(labels))
    return max(candidates, key=labels.count)
def averageGain(dataset, attribute, value):
    """Information gain of splitting dataset on `attribute >= value`.

    Returns [gain, subsetOver, subsetUnder] where the subsets are the two
    sides of the split and gain is the dataset entropy minus the
    size-weighted entropy of the subsets.
    """
    over, under = select(dataset, attribute, value)
    weighted = entropy(over) * len(over) + entropy(under) * len(under)
    gain = entropy(dataset) - weighted / len(dataset)
    return [gain, over, under]
def entropy(dataset):
    """Shannon entropy (base 2) of the class-label distribution in dataset.

    Returns 0 for an empty dataset.
    """
    n = len(dataset)
    tally = {}
    for sample in dataset:
        label = sample.getClass()
        tally[label] = tally.get(label, 0) + 1
    result = 0
    for count in tally.values():
        p = float(count) / n
        result -= p * math.log(p, 2)
    return result
"""
def entropy(dataset):
"Calculate the entropy of a dataset"
classes = list(set([x.getClass() for x in dataset]))
n = len(dataset)
entropy = 0
for c in classes:
nclass = len([x for x in dataset if x.getClass() == c])
if nclass == 0:
continue
else:
entropy -= float(nclass)/n * math.log(float(nclass)/n,2)
return entropy
"""
def select(dataset, attribute, value):
    """Partition dataset by `attribute` against threshold `value`.

    Returns [over, under]: samples whose attribute value is >= `value`,
    and those below it, each in original order.
    """
    over, under = [], []
    for sample in dataset:
        target = over if sample.getAttributeValue(attribute) >= value else under
        target.append(sample)
    return [over, under]
def allFromClass(dataset, c):
    """True iff every sample in dataset has class c (vacuously true if empty)."""
    for sample in dataset:
        if sample.getClass() != c:
            return False
    return True
def allAnyClass(dataset):
    """Check whether every sample in dataset belongs to one class.

    Returns (True, c) when all samples share class c; (False, 0) otherwise.
    An empty dataset yields (False, 0).
    """
    if not dataset:
        return (False, 0)
    first = dataset[0].getClass()
    if all(x.getClass() == first for x in dataset):
        return (True, first)
    return (False, 0)