-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkmeans.rb
More file actions
141 lines (112 loc) · 4.34 KB
/
kmeans.rb
File metadata and controls
141 lines (112 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# this class is a simple implementation of the K means algorithm
# It is a way to cluster data points, an unsupervised learning technique
class KMeans
#a data point has a name, data, and a centroid_info
#(which contains the id of the centroid it belongs to as well as the distance to it)
class DataPoint
attr_accessor :centroid_info, :data, :name
def initialize(name, data)
@data = data
@name = name
end
end
attr_accessor :centroids, :data
LARGE_VALUE = 1000000
EPSILON_VALUE = 1
#each entry of the data array is a list of numbers which represent a certain attribute of the stock
#for our purposes, the way we store data will be as an array of type DataPoint
def initialize(num_clusters, data)
@data = []
data.each { |data_value| @data << DataPoint.new(data_value[0], data_value.slice(1, data_value.length - 1)) }
#here we would get the data, and randomize the centroids roughly based on the distribution of the data
randomize_centroids(num_clusters)
end
# this method randomizes the centroids in a roughly uniform manner
def randomize_centroids(num_clusters)
if (num_clusters > @data.length)
puts "There are more clusters than data points!"
puts "num_clusters will be set to the # of data points"
num_clusters = @data.length
end
#we get the centroids randomly from the data itself
@centroids = @data.sort_by{rand}[0..num_clusters-1].map{ |x| x.data }
end
#this actually runs the k-means clustering algorithm
#we know that once the centroids stop moving, we can stop
def runKMeans()
#this variable represents how much the centroids have moved after an iteration
#let's initialize it to a large value
centroid_shift = LARGE_VALUE
while (centroid_shift > EPSILON_VALUE)
set_closest_centroids()
centroid_shift = shift_centroids()
end
end
#this method does two things:
#1), it shifts the centroids based on the avg of data points that
#are closest to it
#2), it returns the total number of shift that occurs as a result
#of step 1
def shift_centroids
old_centroid = []
total_shift = 0
@centroids = @centroids.each_with_index.map do |centroid, index|
old_centroid = centroid
centroid = shift_centroid(old_centroid, index)
total_shift += calculate_distance(centroid, old_centroid)
centroid
end
return total_shift
end
#this method will go through all the data points that belong to the
#given centroid and calculates their average and sets the centroid's
#value to that point
def shift_centroid(centroid, index)
new_location = Array.new(centroid.length, 0)
num_points = 0
@data.each do |data_point|
if data_point.centroid_info[1] == index
new_location = new_location.each_with_index.map { |datum, dat_index| datum += data_point.data[dat_index] }
num_points += 1
end
end
return centroid if num_points == 0
new_location.map { |datum| datum = datum / num_points}
end
def set_closest_centroids
@data.map {|x| x.centroid_info = set_closest_centroid(x.data) }
end
#this function takes in a data point and returns an array that contains
#the distance of the closest centroid to the data point and the index of
#that closest centroid
def set_closest_centroid(data_point)
min = [LARGE_VALUE, 2]
@centroids.each_with_index do |centroid_point, centroids_index|
new_min = find_distance_between(data_point, centroid_point, centroids_index)
if (new_min[0] < min[0])
min = new_min
end
end
return min
end
#this method finds the Euclidean distance between a data point and a centroid point
def find_distance_between(data_point, centroid_point, centroids_index)
return [calculate_distance(data_point, centroid_point), centroids_index]
end
#this method finds the Euclidean distance between two points
def calculate_distance(point1, point2)
sum_of_squares = 0
point1.each_with_index do |point1_coord, index|
sum_of_squares += (point1_coord - point2[index])**2
end
return Math.sqrt(sum_of_squares)
end
end
### let's write some sample code, then possibly write some tests for this code
data = []
for i in 1..5
data << ["SAMPLE#{i}", rand(100).to_f, rand(100.to_f)]
end
puts data
kmeans = KMeans.new(2, data)
kmeans.runKMeans()