-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrypto_PCA_kmeans.py
More file actions
149 lines (119 loc) · 4.31 KB
/
crypto_PCA_kmeans.py
File metadata and controls
149 lines (119 loc) · 4.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# %%
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px
# %% [markdown]
# # Data Preprocessing
# %%
# Read the raw cryptocurrency table from the Resources folder.
crypto_df = pd.read_csv('./Resources/crypto_data.csv')
crypto_df.head()
# %%
crypto_df.dtypes
# %%
# Keep only the rows whose IsTrading flag is True.
trading_mask = crypto_df['IsTrading'].eq(True)
active_crypto_df = crypto_df.loc[trading_mask]
active_crypto_df.head(3)
# %%
# Count the rows with no Algorithm defined (inspection only; rows containing
# nulls are removed by the dropna() call further down).
active_crypto_df['Algorithm'].isna().sum()
# %%
# The IsTrading column is no longer needed once the filter has been applied.
active_crypto_df = active_crypto_df.drop('IsTrading', axis='columns')
# %%
# Per-column null counts, shown before the null rows are dropped.
active_crypto_df.isna().sum()
# %%
# Drop every row that has at least one null value.
notnull_crypto_df = active_crypto_df.dropna(axis=0, how='any')
notnull_crypto_df.head()
# %%
# Discard cryptocurrencies whose TotalCoinsMined equals 0.
mined_mask = notnull_crypto_df['TotalCoinsMined'].ne(0)
cleaned_crypto_df = notnull_crypto_df.loc[mined_mask]
cleaned_crypto_df.head()
# %%
# Build a lookup table of coin names, indexed by the 'Unnamed: 0' column of
# the cleaned DataFrame.
coins_name = cleaned_crypto_df[['Unnamed: 0', 'CoinName']].set_index(
    'Unnamed: 0', drop=True
)
coins_name.head()
# %%
# CoinName now lives in coins_name, so drop it from the working DataFrame.
cleaned_crypto_df = cleaned_crypto_df.drop('CoinName', axis='columns')
cleaned_crypto_df.head()
# %%
cleaned_crypto_df.dtypes
# %%
# Cast TotalCoinSupply to float.
cleaned_crypto_df = cleaned_crypto_df.astype({'TotalCoinSupply': 'float'})
# %%
# One-hot encode the two text features (Algorithm and ProofType) into the
# feature matrix X.
# NOTE(review): only these two columns feed X; the numeric TotalCoinsMined /
# TotalCoinSupply columns are not part of it -- confirm this is intended.
text_features = ['Algorithm', 'ProofType']
X = pd.get_dummies(cleaned_crypto_df[text_features])
# original author's note: the dummies expand to 98 features
# %%
# Standardize every column of X with a StandardScaler.
scale_model = StandardScaler()
scale_model.fit(X)
scaled_X = scale_model.transform(X)  # ndarray
# %% [markdown]
# # PCA
# %%
# Reduce the standardized feature matrix to 3 principal components.
# random_state is pinned (same seed as the KMeans models in this script)
# because PCA's 'auto' solver may select the randomized SVD, which would
# otherwise let the components -- and every cluster derived from them --
# vary between runs.
pca = PCA(n_components=3, random_state=1)
X_pca = pca.fit_transform(scaled_X)
print(f'The pca ratio is {pca.explained_variance_ratio_}')
# %%
pca.explained_variance_
# %%
# Wrap the components in a DataFrame indexed by the 'Unnamed: 0' column of
# cleaned_crypto_df so each row can be matched back to its coin later.
pcs_df = pd.DataFrame(
    X_pca,
    index=cleaned_crypto_df['Unnamed: 0'],
    columns=['PC 1', 'PC 2', 'PC 3'],
)
pcs_df.head(10)
# %% [markdown]
# # Clustering by KMeans
# %%
# Elbow-curve data: fit one KMeans model per candidate K (1 through 10) on the
# principal components and record its inertia. X-axis is K, y-axis is inertia.
inertia_list = []
k_value = list(range(1, 11))
for k in k_value:
    # Same random_state for every K so the inertias are comparable.
    k_model = KMeans(n_clusters=k, random_state=1)
    k_model.fit(pcs_df)
    inertia_list.append(k_model.inertia_)
# Collect the (K, inertia) pairs in a DataFrame for plotting.
elbow_df = pd.DataFrame({'K': k_value, 'Inertia': inertia_list})
# %%
# Draw the elbow curve with one tick per candidate K.
obj = elbow_df.hvplot.line(
    x='K',
    y='Inertia',
    xticks=k_value,
    title='Elbow Curve',
)
hvplot.show(obj)
# %% [markdown]
# Based on the elbow curve, the line flattens into a nearly horizontal line from K=4 onward.
# As a result, I chose K=4 as the best estimate for the number of clusters in the KMeans model.
# %%
# Fit the final 4-cluster KMeans model on the principal components and keep
# the cluster label assigned to each row.
model = KMeans(n_clusters=4, random_state=1)
predictions = model.fit(pcs_df).labels_
# %%
# Assemble one DataFrame holding the cleaned features, the principal
# components, the coin names and the predicted cluster ('Class'), indexed by
# 'Unnamed: 0'.
clustered_df = (
    cleaned_crypto_df
    .merge(pcs_df, on='Unnamed: 0')
    .merge(coins_name, on='Unnamed: 0')
    .assign(Class=model.labels_)
    .set_index('Unnamed: 0', drop=True)
)
clustered_df.head(10)
# %% [markdown]
# # Visualizing Results
# %%
# 3D scatter of the three principal components; color and marker symbol both
# encode the cluster, and hovering shows the coin name and its algorithm.
fig = px.scatter_3d(
    clustered_df,
    x='PC 1',
    y='PC 2',
    z='PC 3',
    color='Class',
    symbol='Class',
    hover_name='CoinName',
    hover_data=['Algorithm'],
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()
# %%
# hvplot table listing the clustered (currently trading) cryptocurrencies.
table_columns = [
    'CoinName',
    'Algorithm',
    'ProofType',
    'TotalCoinSupply',
    'TotalCoinsMined',
    'Class',
]
obj_table = clustered_df.hvplot.table(columns=table_columns, width=500)
hvplot.show(obj_table)
# %%
# Scatter plot of coins mined versus total supply, grouped by cluster.
obj = clustered_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by='Class',
    hover_cols=['CoinName'],
)
hvplot.show(obj)
# %%