In [1]:
from datetime import datetime as dt
import numpy as np
import pandas as pd
# viz libs
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
from matplotlib.colors import colorConverter
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Notebook-wide seed value (presumably for the stochastic estimators below -- TODO confirm where it is consumed).
random_state = 42

# Timestamp taken when the notebook starts; the final cell subtracts it from a
# second timestamp to report the total run time.
nb_start = dt.now()
In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
In [3]:
%%bash
# List directory entries whose long-listing line contains "2d".
ls -l | grep 2d
# Show the first two lines and the last line of the data file.
head -n 2 2d-10c.dat
tail -n 1 2d-10c.dat
In [4]:
# Parse the data file straight into a NumPy array, skipping its first three
# lines, then show the array shape as the cell output.
data2d = np.loadtxt(fname='2d-10c.dat', skiprows=3)
data2d.shape
Out[4]:
In [5]:
# Wrap the first three array columns in a DataFrame named x, y and c
# (column i of data2d becomes the i-th name below), then print a summary.
column_names = ('x', 'y', 'c')
df = pd.DataFrame(data={
    name: data2d[:, position]
    for position, name in enumerate(column_names)
})
df.info()
In [6]:
# Preview the first rows of the frame (displayed as this cell's output).
df.head()
Out[6]:
In [7]:
# Scatter plot of all points, coloured by the label column `c`.
x = df.x
y = df.y
c = df.c

# One hover label per point: both coordinates plus the label formatted as an integer.
hover_text = ['x: %s<br>y: %s<br>cluster %i' % (x_i, y_i, c_i) for x_i, y_i, c_i in zip(x, y, c)]

trace = [go.Scatter(
    x=x,
    y=y,
    marker=dict(
        color=c,
        colorscale='Viridis',
        colorbar=dict(title='Labels'),
    ),
    name='data',
    mode='markers',
    hoverinfo='text',
    text=hover_text,
)]
layout = go.Layout(
    xaxis=dict(title='x'),
    yaxis=dict(title='y'),
    hovermode='closest',
)
fig = go.Figure(data=trace, layout=layout)
# The layout is already attached to `fig`. iplot's second positional parameter
# is `show_link`, so passing the layout there only toggled the export link.
iplot(fig)
In [8]:
# Elbow analysis: fit one KMeans model per candidate k and record its inertia.
Ks = range(2, 20)

# Cluster on the two coordinate columns only. Column 2 of data2d is the label
# column (exposed as df['c']); feeding it to KMeans would leak the labels into
# the clustering.
features = data2d[:, :2]

# Pass the notebook-level seed so repeated runs give the same models.
km = [KMeans(n_clusters=k, random_state=random_state) for k in Ks]

# `fit` returns the estimator itself, so `fitted` holds the same objects as `km`.
fitted = [model.fit(features) for model in km]
score = [model.score(features) for model in fitted]
inertia = [model.inertia_ for model in fitted]

# The first entry is the inertia of the smallest k (there is no predecessor to
# subtract); every later entry is the drop in inertia from k-1 to k.
relative_diff = [inertia[0]]
relative_diff.extend(previous - current for previous, current in zip(inertia, inertia[1:]))

print(fitted[:1])
print(score[:1])
print(inertia[:1])
print(relative_diff)
In [9]:
# Inertia per cluster count, drawn as bars with a line overlay. Each bar's
# text shows the matching entry of `relative_diff`.
k_values = list(Ks)
diff_labels = ['Diff is: %s' % diff for diff in relative_diff]

inertia_bars = go.Bar(x=k_values, y=inertia, text=diff_labels)
inertia_line = go.Scatter(x=k_values, y=inertia)
data = [inertia_bars, inertia_line]

layout = go.Layout(
    xaxis={'title': 'No of Clusters [%s-%s]' % (min(Ks), max(Ks))},
    yaxis={'title': 'Sklearn score / inertia'},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [10]:
# Entries of `relative_diff` per cluster count, drawn as bars with a line overlay.
k_values = list(Ks)

diff_bars = go.Bar(x=k_values, y=relative_diff)
diff_line = go.Scatter(x=k_values, y=relative_diff)
data = [diff_bars, diff_line]

layout = go.Layout(
    xaxis={'title': 'No of Clusters [%s-%s]' % (min(Ks), max(Ks))},
    yaxis={'title': 'Pairwise difference'},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [11]:
# silhouette_avg = silhouette_score(X, cluster_labels)
In [12]:
# Silhouette analysis: for every candidate k, cluster the points and compute
# the average silhouette score plus the per-sample silhouette values.
Ks = range(2, 20)

# Cluster on the two coordinate columns only. Column 2 of data2d is the label
# column (exposed as df['c']) and must not be used as a feature.
features = data2d[:, :2]

# Pass the notebook-level seed so repeated runs give the same labels.
km = [KMeans(n_clusters=k, random_state=random_state) for k in Ks]

cluster_lab = [model.fit_predict(features) for model in km]
score_avg = [silhouette_score(features, labels) for labels in cluster_lab]
sample_values = [silhouette_samples(features, labels) for labels in cluster_lab]

print(cluster_lab)
# Print the silhouette averages computed in this cell, not the stale `score`
# list left over from the elbow cell.
print(score_avg)
print(sample_values)
In [13]:
# Average silhouette score per cluster count, drawn as bars with a line overlay.
k_values = list(Ks)

data = [
    go.Bar(x=k_values, y=score_avg),
    go.Scatter(x=k_values, y=score_avg),
]
layout = go.Layout(
    xaxis=dict(
        title='No of Clusters [%s-%s]' % (min(Ks), max(Ks))
    ),
    yaxis=dict(
        # Axis title spelling corrected (was 'Sihouette').
        title='Silhouette score (avg)'
    ),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [ ]:
In [14]:
# Build one two-panel plotly figure per candidate cluster count:
#   left  - silhouette plot (one filled band per cluster, red line = average)
#   right - the points coloured by assigned cluster, plus the cluster centres.
# The figures are collected in `figures`, in the order of `range_n_clusters`.

# Cluster on the two coordinate columns only. Column 2 of data2d is the label
# column (exposed as df['c']) and must not be used as a feature.
X = data2d[:, :2]
figures = []
range_n_clusters = range(2, 20)

# `cm.spectral` no longer exists in matplotlib; the colormap is registered
# under the name 'nipy_spectral'.
spectral_cmap = plt.get_cmap('nipy_spectral')


def _plotly_rgb(rgba):
    """Turn a matplotlib colour tuple (channels in 0-1) into a plotly 'rgb(r, g, b)' string (0-255)."""
    return 'rgb(%d, %d, %d)' % tuple(int(round(255 * channel)) for channel in rgba[:3])


def _silhouette_figure(X, n_clusters, cmap, gap=10):
    """Cluster X into n_clusters groups and return the silhouette/scatter figure.

    X          -- 2-D array; columns 0 and 1 are plotted as the two features.
    n_clusters -- number of clusters passed to KMeans.
    cmap       -- matplotlib colormap; cluster i gets the colour cmap(i / n_clusters).
    gap        -- blank rows inserted between silhouette bands.

    Prints the average silhouette score as a side effect.
    """
    # Total height of the silhouette panel: one row per sample plus the gaps.
    y_end = len(X) + (n_clusters + 1) * gap

    fig = tools.make_subplots(rows=1, cols=2,
                              print_grid=False,
                              subplot_titles=('The silhouette plot for the various clusters.',
                                              'The visualization of the clustered data.'))

    # Silhouette coefficients lie in [-1, 1]; the axis is clipped to [-0.1, 1].
    fig['layout']['xaxis1'].update(title='The silhouette coefficient values',
                                   range=[-0.1, 1])
    fig['layout']['yaxis1'].update(title='Cluster label',
                                   showticklabels=False,
                                   range=[0, y_end])
    fig['layout'].update(hovermode='closest')

    # Fixed seed so the clustering is reproducible.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # Average silhouette over all samples, and the per-sample values.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    # One colour per cluster, shared by both panels.
    cluster_colors = [_plotly_rgb(cmap(float(i) / n_clusters)) for i in range(n_clusters)]

    y_lower = gap
    for i in range(n_clusters):
        # Sorted silhouette values of the samples assigned to cluster i.
        ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])
        y_upper = y_lower + ith_cluster_silhouette_values.shape[0]

        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=ith_cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5,
                                           color=cluster_colors[i]),
                                 fill='tozerox')
        fig.append_trace(filled_area, 1, 1)

        # Leave `gap` empty rows before the next cluster's band.
        y_lower = y_upper + gap

    # Vertical dashed line at the average silhouette score.
    axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg, silhouette_avg],
                           y=[0, y_end / 2, y_end],
                           showlegend=False,
                           mode='lines+markers',
                           name='silhouette avg',
                           line=dict(color="red", dash='dash',
                                     width=1))
    fig.append_trace(axis_line, 1, 1)

    # Second panel: every point coloured by the cluster it was assigned to.
    point_colors = [cluster_colors[label] for label in cluster_labels]
    clusters = go.Scatter(x=X[:, 0],
                          y=X[:, 1],
                          showlegend=False,
                          mode='markers',
                          marker=dict(color=point_colors,
                                      size=4))
    fig.append_trace(clusters, 1, 2)

    # Cluster centres as larger green markers with a black outline.
    centers_ = clusterer.cluster_centers_
    centers = go.Scatter(x=centers_[:, 0],
                         y=centers_[:, 1],
                         showlegend=False,
                         mode='markers',
                         marker=dict(color='green', size=10,
                                     line=dict(color='black',
                                               width=1)))
    fig.append_trace(centers, 1, 2)

    fig['layout']['xaxis2'].update(title='Feature space for the 1st feature',
                                   zeroline=False)
    fig['layout']['yaxis2'].update(title='Feature space for the 2nd feature',
                                   zeroline=False)
    fig['layout'].update(title="Silhouette analysis for KMeans clustering on sample data "
                               "with n_clusters = %d" % n_clusters)
    return fig


for n_clusters in range_n_clusters:
    figures.append(_silhouette_figure(X, n_clusters, spectral_cmap))
In [15]:
# Render the first stored silhouette figure (built for n_clusters = 2).
iplot(figures[0])
In [ ]:
In [16]:
# Render the silhouette figure built for n_clusters = 3.
iplot(figures[1])
In [17]:
# Render the silhouette figure built for n_clusters = 4.
iplot(figures[2])
In [18]:
# Render the silhouette figure built for n_clusters = 5.
iplot(figures[3])
In [19]:
# Render the silhouette figure built for n_clusters = 6.
iplot(figures[4])
In [20]:
# Render the silhouette figure built for n_clusters = 7.
iplot(figures[5])
In [21]:
# Render the silhouette figure built for n_clusters = 8.
iplot(figures[6])
In [22]:
# Render the silhouette figure built for n_clusters = 9.
iplot(figures[7])
In [23]:
# Render the silhouette figure built for n_clusters = 10.
iplot(figures[8])
In [ ]:
In [24]:
# Take the closing timestamp and report the notebook's total run time; the
# formatted string is the cell's displayed output.
nb_end = dt.now()
elapsed = nb_end - nb_start
'Time elapsed: %s' % (elapsed,)
Out[24]: