2d-10c dataset

Imports

Import deps


In [1]:
from datetime import datetime as dt

import numpy as np
import pandas as pd

# viz libs
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
from matplotlib.colors import colorConverter

import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

random_state=42
nb_start = dt.now()



In [2]:
from sklearn.cluster import KMeans


from sklearn.metrics import silhouette_samples, silhouette_score

Import data


In [3]:
%%bash

ls -l | grep 2d
cat 2d-10c.dat | head -2
cat 2d-10c.dat | tail -1


-rw-rw-r-- 1 1000 1000    47078 May 13 15:09 2d-10c.dat
-rw-r--r-- 1 root root 28936890 May 28 03:50 2d-10c.ipynb
2525 2 10
-22.2778 -10.8739
-5.40705 9.34509 9

In [4]:
# df = pd.read_csv('2d-10c.dat')
# df.info()

data2d = np.loadtxt('2d-10c.dat', skiprows=3)

data2d.shape


Out[4]:
(2525, 3)

In [5]:
df = pd.DataFrame(data={
    'x': data2d[:, 0],
    'y': data2d[:, 1],
    'c': data2d[:, 2],
})

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2525 entries, 0 to 2524
Data columns (total 3 columns):
c    2525 non-null float64
x    2525 non-null float64
y    2525 non-null float64
dtypes: float64(3)
memory usage: 59.3 KB

In [6]:
df.head()


Out[6]:
c x y
0 0.0 12.757900 -4.81962
1 0.0 -0.298175 -5.03868
2 0.0 9.155580 -3.81778
3 0.0 4.298230 -6.25105
4 0.0 8.833200 -3.48504

Viz


In [7]:
x = df.x
y = df.y
c = df.c

trace = [go.Scatter(
    x = x,
    y = y,
    marker = dict(
        # color = col,
        color = c,
        colorscale='Viridis',
        colorbar=dict(
            title='Labels'
        ),
    ),
    name = 'data',
    mode = 'markers',
    hoverinfo = 'text',
    text = ['x: %s<br>y: %s<br>cluster %i' % (x_i, y_i, c_i) for x_i, y_i, c_i in zip(x, y, c)]
)]

layout = go.Layout(
    xaxis = dict({'title': 'x'}),
    yaxis = dict({'title': 'y'}),
    hovermode='closest',
)

fig = go.Figure(data=trace, layout=layout)
iplot(fig, layout)


KMeans

Elbow


In [8]:
Ks = range(2, 20)
km = [KMeans(n_clusters=i) for i in Ks] # , verbose=True
# score   = [km[i].fit(cluster_arr).score(cluster_arr) for i in range(len(km))]

fitted = [km[i].fit(data2d) for i in range(len(km))]
score  = [fitted[i].score(data2d) for i in range(len(km))]
inertia  = [fitted[i].inertia_ for i in range(len(km))]

relative_diff = [inertia[0]]
relative_diff.extend([inertia[i-1] - inertia[i] for i in range(1, len(inertia))])

print(fitted[:1])
print(score[:1])
print(inertia[:1])
print(relative_diff)


[KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)]
[-114153.17469896222]
[114153.17469896223]
[114153.17469896223, 46549.745935441955, 12726.576645303125, 11686.00725909329, 10033.509778030966, 6108.058954693712, 4315.724603012466, 3952.187440054604, 3400.986425621464, 3068.3005319362674, 1821.7760272006253, 1392.5646851940655, 1199.7708778306205, 971.0235110894282, 553.9750647364253, 528.9137710492832, 508.8645874046997, 358.38948216681]

In [9]:
data = [
    go.Bar(
        x = list(Ks),
        y = inertia,
        text = ['Diff is: %s' % diff for diff in relative_diff]
    ),
    go.Scatter(
        x = list(Ks),
        y = inertia
    ),
]


layout = go.Layout(
    xaxis = dict(
        title = 'No of Clusters [%s-%s]' % (min(Ks), max(Ks))
    ),
    yaxis = dict(
        title = 'Sklearn score / inertia'
    ),
    # barmode='stack'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)



In [10]:
data = [
    go.Bar(
        x = list(Ks),
        y = relative_diff
    ),
    go.Scatter(
        x = list(Ks),
        y = relative_diff
    ),
]


layout = go.Layout(
    xaxis = dict(
        title = 'No of Clusters [%s-%s]' % (min(Ks), max(Ks))
    ),
    yaxis = dict(
        title = 'Pairwise difference'
    ),
    # barmode='stack'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)


Silhouette


In [11]:
# silhouette_avg = silhouette_score(X, cluster_labels)

In [12]:
Ks = range(2, 20)
km = [KMeans(n_clusters=i) for i in Ks] # , verbose=True
# score   = [km[i].fit(cluster_arr).score(cluster_arr) for i in range(len(km))]

cluster_lab = [km[i].fit_predict(data2d) for i in range(len(km))]
score_avg  = [silhouette_score(data2d, cluster_lab[i])   for i in range(len(km))]
sample_values = [silhouette_samples(data2d, cluster_lab[i])   for i in range(len(km))]

print(cluster_lab)
print(score)
print(sample_values)


[array([1, 1, 1, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 2, 2, 2], dtype=int32), array([2, 0, 2, ..., 1, 1, 1], dtype=int32), array([1, 3, 1, ..., 0, 0, 0], dtype=int32), array([4, 3, 4, ..., 1, 1, 1], dtype=int32), array([5, 3, 5, ..., 1, 1, 1], dtype=int32), array([4, 6, 4, ..., 7, 7, 2], dtype=int32), array([7, 0, 7, ..., 2, 2, 4], dtype=int32), array([9, 7, 9, ..., 6, 6, 3], dtype=int32), array([7, 0, 7, ..., 1, 1, 6], dtype=int32), array([ 4,  8,  4, ..., 11, 11,  3], dtype=int32), array([ 9,  3,  9, ...,  0,  0, 10], dtype=int32), array([ 4, 13,  5, ..., 10, 10, 10], dtype=int32), array([ 6,  0, 14, ..., 12, 12, 12], dtype=int32), array([13, 14,  4, ..., 12, 12, 12], dtype=int32), array([ 9, 15, 16, ...,  1,  1,  1], dtype=int32), array([ 8, 15, 16, ..., 14, 14, 14], dtype=int32), array([13, 11,  0, ..., 12, 12, 12], dtype=int32)]
[-114153.17469896222, -67603.42876352029, -54876.85211821715, -43190.844859123856, -33157.33508109288, -27049.27612639919, -22733.55152338672, -18781.364083332115, -15380.377657710647, -12312.07712577438, -10490.301098573755, -9097.7364133797, -7897.965535549071, -6926.942024459651, -6372.9669597232205, -5844.053188673932, -5335.188601269232, -4976.799119102425]
[array([0.5092887 , 0.14985351, 0.53707377, ..., 0.55415664, 0.56197729,
       0.46478879]), array([0.36215001, 0.38803437, 0.46616089, ..., 0.53160952, 0.53923211,
       0.3948391 ]), array([0.38279062, 0.11490126, 0.48501588, ..., 0.56470308, 0.58165806,
       0.53409848]), array([0.38712784, 0.07180306, 0.43287696, ..., 0.56441734, 0.58139257,
       0.53016265]), array([0.39974927, 0.07744901, 0.44357972, ..., 0.54657266, 0.55722172,
       0.34748283]), array([0.43109358, 0.47866486, 0.47291437, ..., 0.54590434, 0.55650023,
       0.34624763]), array([0.43085276, 0.47866486, 0.47267329, ..., 0.3528004 , 0.32728916,
       0.26754374]), array([0.35584865, 0.47619305, 0.4170091 , ..., 0.35381273, 0.32841137,
       0.2667531 ]), array([0.42378822, 0.01824316, 0.01535369, ..., 0.35332182, 0.32635037,
       0.27388791]), array([0.63811184, 0.12513956, 0.50259468, ..., 0.3580297 , 0.33356985,
       0.26223444]), array([0.63811184, 0.12513956, 0.50259468, ..., 0.3510626 , 0.32383737,
       0.27588492]), array([0.66608417, 0.62024054, 0.00119273, ..., 0.34987754, 0.32315156,
       0.27551181]), array([0.66945929, 0.62077271, 0.03131682, ..., 0.3692014 , 0.4439851 ,
       0.4737557 ]), array([0.66856075, 0.59503311, 0.07512136, ..., 0.37425019, 0.4672245 ,
       0.4790752 ]), array([0.62131881, 0.63998592, 0.56286228, ..., 0.36234506, 0.44352697,
       0.47085182]), array([0.55409343, 0.65023434, 0.61889422, ..., 0.36234506, 0.44352697,
       0.47085182]), array([0.62131881, 0.6213993 , 0.56286228, ..., 0.3613828 , 0.42142583,
       0.45546532]), array([0.59839175, 0.6350811 , 0.61154311, ..., 0.35667262, 0.41951117,
       0.45718956])]

In [13]:
data = [
    go.Bar(
        x = list(Ks),
        y = score_avg
    ),
    go.Scatter(
        x = list(Ks),
        y = score_avg
    ),
]


layout = go.Layout(
    xaxis = dict(
        title = 'No of Clusters [%s-%s]' % (min(Ks), max(Ks))
    ),
    yaxis = dict(
        title = 'Sihouette score (avg)'
    ),
    # barmode='stack'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)



In [ ]:


In [14]:
X = data2d
figures = []
range_n_clusters = range(2,20)

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig = tools.make_subplots(rows=1, cols=2,
                              print_grid=False,
                              subplot_titles=('The silhouette plot for the various clusters.',
                                              'The visualization of the clustered data.'))

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    fig['layout']['xaxis1'].update(title='The silhouette coefficient values',
                                   range=[-0.1, 1])
   
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    fig['layout']['yaxis1'].update(title='Cluster label',
                                   showticklabels=False,
                                   range=[0, len(X) + (n_clusters + 1) * 10])
    
    fig['layout'].update(hovermode='closest')

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    y_lower = 10
    
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
        
        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=ith_cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5,
                                          color=colors),
                                 fill='tozerox')
        fig.append_trace(filled_area, 1, 1)
        
        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
        

    # The vertical line for average silhouette score of all the values
    y_end = len(X) + (n_clusters + 1) * 10
    axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg, silhouette_avg],
                           y=[0, y_end/2, y_end],
                           showlegend=False,
                           mode='lines+markers',
                           name='silhouette avg',
                           line=dict(color="red", dash='dash',
                                     width =1) )

    fig.append_trace(axis_line, 1, 1)

#     fig['layout']['shapes'].append({
#         'type': 'line',
#         'x0': silhouette_avg,
#         'y0': 0,
#         'x1': silhouette_avg,
#         'y1': len(X) + (n_clusters + 1) * 10,
#         'line': {
#             'color': 'rgb(55, 128, 191)',
#             'width': 3,
#         },
#     })
    
    # 2nd Plot showing the actual clusters formed
    colors = colorConverter.to_rgb(cm.spectral(float(i) / n_clusters))
    colors = 'rgb'+str(colors)
    clusters = go.Scatter(x=X[:, 0], 
                          y=X[:, 1], 
                          showlegend=False,
                          mode='markers',
                          marker=dict(color=colors,
                                     size=4)
                         )
    fig.append_trace(clusters, 1, 2)
    
    # Labeling the clusters
    centers_ = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    centers = go.Scatter(x=centers_[:, 0], 
                         y=centers_[:, 1],
                         showlegend=False,
                         mode='markers',
                         marker=dict(color='green', size=10,
                                     line=dict(color='black',
                                                             width=1))
                        )

    fig.append_trace(centers, 1, 2)
    
    fig['layout']['xaxis2'].update(title='Feature space for the 1st feature',
                                   zeroline=False)
    fig['layout']['yaxis2'].update(title='Feature space for the 2nd feature',
                                  zeroline=False)
    

    fig['layout'].update(title="Silhouette analysis for KMeans clustering on sample data "
                         "with n_clusters = %d" % n_clusters)
    
    figures.append(fig)


For n_clusters = 2 The average silhouette_score is : 0.4597898317237656
For n_clusters = 3 The average silhouette_score is : 0.48532565590573695
For n_clusters = 4 The average silhouette_score is : 0.4859528835140874
For n_clusters = 5 The average silhouette_score is : 0.4964370409257924
For n_clusters = 6 The average silhouette_score is : 0.5203118154456644
For n_clusters = 7 The average silhouette_score is : 0.5494491094400401
For n_clusters = 8 The average silhouette_score is : 0.5315175536955833
For n_clusters = 9 The average silhouette_score is : 0.5703337932686048
For n_clusters = 10 The average silhouette_score is : 0.5791762594898638
For n_clusters = 11 The average silhouette_score is : 0.6036712121016099
For n_clusters = 12 The average silhouette_score is : 0.6032277311230565
For n_clusters = 13 The average silhouette_score is : 0.617337434368881
For n_clusters = 14 The average silhouette_score is : 0.6167144226297993
For n_clusters = 15 The average silhouette_score is : 0.607829003976631
For n_clusters = 16 The average silhouette_score is : 0.6005841074412955
For n_clusters = 17 The average silhouette_score is : 0.5986115151383333
For n_clusters = 18 The average silhouette_score is : 0.5788550283244093
For n_clusters = 19 The average silhouette_score is : 0.5777471660750043

Silhouette analysis for n_clusters = 2


In [15]:
iplot(figures[0])



In [ ]:

Silhouette analysis for n_clusters = 3


In [16]:
iplot(figures[1])


Silhouette analysis for n_clusters = 4


In [17]:
iplot(figures[2])


Silhouette analysis for n_clusters = 5


In [18]:
iplot(figures[3])


Silhouette analysis for n_clusters = 6


In [19]:
iplot(figures[4])


Silhouette analysis for n_clusters = 7


In [20]:
iplot(figures[5])


Silhouette analysis for n_clusters = 8


In [21]:
iplot(figures[6])


Silhouette analysis for n_clusters = 9


In [22]:
iplot(figures[7])


Silhouette analysis for n_clusters = 10


In [23]:
iplot(figures[8])



In [ ]:


In [24]:
nb_end = dt.now()

'Time elapsed: %s' % (nb_end - nb_start)


Out[24]:
'Time elapsed: 0:00:19.155646'

Bibliography