Use the elbow point to pick the number of clusters for both hierarchical and k-means clustering.
kmeans:
hierarchical:
Need silhouette width (https://en.wikipedia.org/wiki/Silhouette_(clustering)); a sketch is included after the elbow plots below.
...
...
Links:
http://cs.joensuu.fi/sipu/datasets/
https://towardsdatascience.com/k-means-clustering-implementation-2018-ac5cd1e51d0a
https://github.com/deric/clustering-benchmark/blob/master/README.md
http://neupy.com/2017/12/09/sofm_applications.html
https://wonikjang.github.io/deeplearning_unsupervised_som/2017/06/30/som.html
https://www.kaggle.com/raghavrastogi75/fraud-detection-using-self-organising-maps
https://medium.com/@navdeepsingh_2336/self-organizing-maps-for-machine-learning-algorithms-ad256a395fc5
https://heartbeat.fritz.ai/introduction-to-self-organizing-maps-soms-98e88b568f5d
In [1]:
from datetime import datetime as dt
import numpy as np
import pandas as pd
# viz libs
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
random_state=42
nb_start = dt.now()
In [2]:
features = pd.read_csv('train_values.csv')
labels = pd.read_csv('train_labels.csv')
xlab = 'serum_cholesterol_mg_per_dl'
ylab = 'resting_blood_pressure'
print(labels.head())
features.head()
Out[2]:
In [3]:
# two-column array (cholesterol, blood pressure) used for all clustering below
cluster_arr = np.array(features[[xlab, ylab]]).reshape(-1, 2)
cluster_arr[:5]
Out[3]:
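Both k-means and DBSCAN below are distance based, so the very different ranges of cholesterol and blood pressure can dominate the distances. A minimal sketch of standardizing the two columns with scikit-learn's StandardScaler; this is only illustrative and is not applied in the cells that follow, which keep using the raw values.
In [ ]:
from sklearn.preprocessing import StandardScaler

# Scale each column to zero mean and unit variance (sketch only; the cells
# below keep using the unscaled cluster_arr).
scaler = StandardScaler()
cluster_arr_scaled = scaler.fit_transform(cluster_arr)
cluster_arr_scaled[:5]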
In [4]:
x = features['serum_cholesterol_mg_per_dl']
y = features['resting_blood_pressure']
trace = [go.Scatter(
x = x,
y = y,
name = 'data',
mode = 'markers',
hoverinfo = 'text',
text = ['x: %s<br>y: %s' % (x_i, y_i) for x_i, y_i in zip(x, y)]
)]
layout = go.Layout(
xaxis = dict(title = xlab),
yaxis = dict(title = ylab)
)
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
In [5]:
from scipy.cluster.hierarchy import dendrogram, linkage
In [6]:
plt.figure(figsize=(15, 7))
linked = linkage(cluster_arr, 'single')
# labelList = range(1, 11)
dendrogram(linked,
orientation='top',
# labels=labelList,
distance_sort='descending',
show_leaf_counts=True)
plt.show()
In [8]:
plt.figure(figsize=(15, 7))
linked = linkage(cluster_arr, 'complete')
# labelList = range(1, 11)
dendrogram(linked,
orientation='top',
# labels=labelList,
distance_sort='descending',
show_leaf_counts=True)
plt.show()
In [9]:
plt.figure(figsize=(15, 7))
linked = linkage(cluster_arr, 'average')
# labelList = range(1, 11)
dendrogram(linked,
orientation='top',
# labels=labelList,
distance_sort='descending',
show_leaf_counts=True)
plt.show()
In [10]:
plt.figure(figsize=(15, 7))
linked = linkage(cluster_arr, 'ward')
# labelList = range(1, 11)
dendrogram(linked,
orientation='top',
# labels=labelList,
distance_sort='descending',
show_leaf_counts=True)
plt.show()
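The dendrograms only show the merge structure; to get flat cluster labels out of a linkage, scipy's fcluster can cut the tree at a chosen number of clusters. A minimal sketch on the ward linkage; the choice of 2 clusters is an assumption to be checked against the elbow/silhouette analysis further down.
In [ ]:
from scipy.cluster.hierarchy import fcluster

# Cut the ward-linkage tree into flat clusters (2 is an assumed cluster count).
linked_ward = linkage(cluster_arr, 'ward')
hier_labels = fcluster(linked_ward, t=2, criterion='maxclust')
np.unique(hier_labels, return_counts=True)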
In [11]:
from sklearn.cluster import DBSCAN
In [12]:
# eps and min_samples are initial guesses; see the k-distance sketch below for picking eps
clustering = DBSCAN(eps=3, min_samples=2).fit(cluster_arr)
clustering
Out[12]:
In [69]:
y_pred = clustering.labels_
y_pred
Out[69]:
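eps=3 above is an arbitrary starting point. A common way to choose eps is a k-distance plot: sort every point's distance to its nearest other neighbour and look for the knee. A minimal sketch with scikit-learn's NearestNeighbors, matching the min_samples=2 used above.
In [ ]:
from sklearn.neighbors import NearestNeighbors

# kneighbors includes the point itself, so the last column is the distance
# to the nearest other point (k = min_samples = 2, as in the DBSCAN call above).
nn = NearestNeighbors(n_neighbors=2).fit(cluster_arr)
distances, _ = nn.kneighbors(cluster_arr)
k_dist = np.sort(distances[:, -1])

plt.figure(figsize=(15, 4))
plt.plot(k_dist)
plt.xlabel('points sorted by k-distance')
plt.ylabel('distance to nearest other point')
plt.show()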
In [72]:
x = cluster_arr[:, 0]
y = cluster_arr[:, 1]
# col = ['#F33' if i == 1 else '#33F' for i in y_pred]
trace = [go.Scatter(
x = x,
y = y,
marker = dict(
# color = col,
color = y_pred,
colorscale='MAGMA',
colorbar=dict(
title='Labels'
),
),
name = 'data',
mode = 'markers',
hoverinfo = 'text',
text = ['x: %s<br>y: %s' % (x_i, y_i) for x_i, y_i in zip(x, y)]
)]
layout = go.Layout(
xaxis = dict(title = xlab),
yaxis = dict(title = ylab)
)
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
In [14]:
from sklearn.cluster import KMeans
In [30]:
y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(cluster_arr)
y_pred
Out[30]:
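fit_predict only returns labels. Fitting the model and keeping it around also exposes the cluster centres, which could be overlaid on the scatter plot below; a minimal sketch with the same parameters as above.
In [ ]:
# Keep the fitted model so both labels and centres are available (sketch only).
km2 = KMeans(n_clusters=2, random_state=random_state).fit(cluster_arr)
km2.cluster_centers_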
In [73]:
x = cluster_arr[:, 0]
y = cluster_arr[:, 1]
# col = ['#F33' if i == 1 else '#33F' for i in y_pred]
trace = [go.Scatter(
x = x,
y = y,
marker = dict(
# color = col,
color = y_pred,
colorscale='YlOrRd',
colorbar=dict(
title='Labels'
),
),
name = 'data',
mode = 'markers',
hoverinfo = 'text',
text = ['x: %s<br>y: %s' % (x_i, y_i) for x_i, y_i in zip(x, y)]
)]
layout = go.Layout(
xaxis = dict(title = xlab),
yaxis = dict(title = ylab)
)
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
In [66]:
Ks = range(2, 11)
km = [KMeans(n_clusters=i, random_state=random_state) for i in Ks]
fitted = [km[i].fit(cluster_arr) for i in range(len(km))]
score = [fitted[i].score(cluster_arr) for i in range(len(km))]  # sklearn score is the negative inertia
inertia = [fitted[i].inertia_ for i in range(len(km))]  # within-cluster sum of squares
# drop in inertia between consecutive k; the first entry is just inertia[0] as a baseline
relative_diff = [inertia[0]]
relative_diff.extend([inertia[i-1] - inertia[i] for i in range(1, len(inertia))])
print(fitted[:1])
print(score[:1])
print(inertia[:1])
print(relative_diff)
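One rough way to turn the inertia drops above into an automatic elbow pick is to find where the drop shrinks most sharply relative to the previous drop. This is only a heuristic sketch built on the relative_diff list computed above (it assumes inertia decreases monotonically with k), not a formal knee-detection method.
In [ ]:
# Ratio of the previous inertia drop to the current one; a large ratio means
# the previous k already captured most of the improvement (heuristic only).
drop_ratios = [relative_diff[i-1] / relative_diff[i] for i in range(2, len(relative_diff))]
elbow_k = list(Ks)[int(np.argmax(drop_ratios)) + 1]
print(drop_ratios)
print('heuristic elbow at k = %s' % elbow_k)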
In [37]:
fitted[0]
Out[37]:
In [44]:
dir(fitted[0])[:5]
Out[44]:
In [65]:
data = [
# go.Bar(
# x = list(Ks),
# y = score
# ),
go.Bar(
x = list(Ks),
y = inertia,
text = ['Diff is: %s' % diff for diff in relative_diff]
),
go.Scatter(
x = list(Ks),
y = inertia
),
]
layout = go.Layout(
xaxis = dict(
title = 'Number of clusters [%s-%s]' % (min(Ks), max(Ks))
),
yaxis = dict(
title = 'Inertia (within-cluster sum of squares)'
),
# barmode='stack'
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [68]:
data = [
go.Bar(
x = list(Ks),
y = relative_diff
),
go.Scatter(
x = list(Ks),
y = relative_diff
),
]
layout = go.Layout(
xaxis = dict(
title = 'Number of clusters [%s-%s]' % (min(Ks), max(Ks))
),
yaxis = dict(
title = 'Drop in inertia from previous k (first bar = inertia at k=2)'
),
# barmode='stack'
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
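The notes at the top call for silhouette width as a second criterion. A minimal sketch computing the mean silhouette score for each k, reusing the labels from the models fitted in the elbow cell above (sklearn.metrics.silhouette_score; higher is better, roughly -1 to 1).
In [ ]:
from sklearn.metrics import silhouette_score

# Mean silhouette width per k, using the labels of the already-fitted models.
sil = [silhouette_score(cluster_arr, fitted[i].labels_) for i in range(len(fitted))]

iplot(go.Figure(
    data=[go.Scatter(x=list(Ks), y=sil, mode='lines+markers')],
    layout=go.Layout(
        xaxis=dict(title='Number of clusters [%s-%s]' % (min(Ks), max(Ks))),
        yaxis=dict(title='Mean silhouette width')
    )
))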
In [16]:
nb_end = dt.now()
'Time elapsed: %s' % (nb_end - nb_start)
Out[16]: