In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import scipy.io as sio

import sys
sys.path.append('..')

from helper import kmeans as km

In [2]:
mat = sio.loadmat('./data/ex7data2.mat')
data2 = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
print(data2.head())

sns.set(context="notebook", style="white")
sns.lmplot(x='X1', y='X2', data=data2, fit_reg=False)


         X1        X2
0  1.842080  4.607572
1  5.658583  4.799964
2  6.352579  3.290854
3  2.904017  4.612204
4  3.231979  4.939894
Out[2]:
<seaborn.axisgrid.FacetGrid at 0x7fe4fc0cd160>

0. random init for initial centroids


In [3]:
km.random_init(data2, 3)


Out[3]:
array([[ 2.11496411,  5.37373587],
       [ 0.99253246,  5.01567424],
       [ 1.40260822,  1.08726536]])
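
The helper's source isn't shown in this notebook; a minimal sketch of what a random_init like this could look like (an assumption, not the actual helper) simply samples k rows of the data without replacement:

    def random_init(data, k):
        """Pick k random rows of the DataFrame as the initial centroids."""
        return data.sample(k).values  # (k, n) ndarray; sample() draws without replacement by default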

find closest cluster experiment


In [4]:
init_centroids = km.random_init(data2, 3)
init_centroids


Out[4]:
array([[ 5.05274526,  2.75692163],
       [ 2.12857843,  5.01149793],
       [ 5.29239452,  0.36873298]])

In [5]:
x = np.array([1, 1])

In [6]:
fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(x=init_centroids[:, 0], y=init_centroids[:, 1])

for i, node in enumerate(init_centroids):
    ax.annotate('{}: ({:.2f},{:.2f})'.format(i, node[0], node[1]), node)
    
ax.scatter(x[0], x[1], marker='x', s=200)


Out[6]:
<matplotlib.collections.PathCollection at 0x7fe4f478a400>

In [7]:
km._find_your_cluster(x, init_centroids)


Out[7]:
1
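
Under the hood this is presumably a nearest-centroid lookup; a sketch assuming Euclidean distance (which matches the result above: [1, 1] is closest to centroid 1):

    def _find_your_cluster(x, centroids):
        """Index of the centroid nearest to point x."""
        distances = np.linalg.norm(centroids - x, axis=1)  # Euclidean distance to each centroid
        return np.argmin(distances)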

1 epoch of cluster assignment


In [8]:
C = km.assign_cluster(data2, init_centroids)
data_with_c = km.combine_data_C(data2, C)
data_with_c.head()


Out[8]:
X1 X2 C
0 1.842080 4.607572 1
1 5.658583 4.799964 0
2 6.352579 3.290854 0
3 2.904017 4.612204 1
4 3.231979 4.939894 1
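
A plausible sketch of these two helpers (an assumption; the real module may vectorize the loop): assign_cluster maps each row through _find_your_cluster, and combine_data_C tacks the assignment on as a column:

    def assign_cluster(data, centroids):
        """Assign every point to its nearest centroid (one epoch)."""
        return np.array([_find_your_cluster(x, centroids) for x in data.values])

    def combine_data_C(data, C):
        """Copy of the data with the cluster assignment as column 'C'."""
        data_with_c = data.copy()
        data_with_c['C'] = C
        return data_with_c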

See the first-round clustering result


In [9]:
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)


Out[9]:
<seaborn.axisgrid.FacetGrid at 0x7fe4f4718518>

2. calculate new centroids


In [10]:
km.new_centroids(data2, C)


Out[10]:
array([[ 5.67762686,  2.83305931],
       [ 1.94033883,  4.95015645],
       [ 3.07010849,  0.90867247]])
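
Consistent with the output above, the new centroid of each cluster is the mean of its members; a one-line sketch using the helpers above (again, an assumption about the actual implementation):

    def new_centroids(data, C):
        """Recompute each centroid as the mean of the points assigned to it."""
        return combine_data_C(data, C).groupby('C').mean().values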

putting it all together, take 1

This is just one-shot k-means: if the random init picks bad starting centroids, the final clustering can be very sub-optimal.


In [11]:
final_C, final_centroid, _ = km._k_means_iter(data2, 3)
data_with_c = km.combine_data_C(data2, final_C)

In [12]:
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)


Out[12]:
<seaborn.axisgrid.FacetGrid at 0x7fe4f47424e0>
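
A single run presumably alternates the two steps above until the centroids stop moving; a sketch of such a loop (the epoch cap and tolerance are assumptions, cost is sketched in the next cell, and the empty-cluster edge case is ignored):

    def _k_means_iter(data, k, epoch=100, tol=0.0001):
        """One k-means run: assign, recompute, repeat until centroids stabilize."""
        centroids = random_init(data, k)
        for _ in range(epoch):
            C = assign_cluster(data, centroids)
            updated = new_centroids(data, C)
            if np.allclose(updated, centroids, atol=tol):  # converged
                break
            centroids = updated
        return C, centroids, cost(data, centroids, C)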

calculate the cost


In [13]:
km.cost(data2, final_centroid, final_C)


Out[13]:
0.79417636337158704
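
Judging by its magnitude, the cost here looks like the mean squared distance from each point to its assigned centroid (Ng's distortion J); a sketch under that assumption:

    def cost(data, centroids, C):
        """Mean squared distance from each point to its assigned centroid."""
        distances = np.linalg.norm(data.values - centroids[C], axis=1)
        return (distances ** 2).mean()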

k-means with multiple tries of random init; pick the best run with the least cost


In [14]:
best_C, best_centroids, least_cost = km.k_means(data2, 3)

In [15]:
least_cost


Out[15]:
0.79417636337158704
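
Multiple restarts amount to running the single-shot version several times and keeping the run with the smallest cost; a sketch (n_init=10 is an assumption, borrowed from sklearn's default):

    def k_means(data, k, epoch=100, n_init=10):
        """Best of n_init random restarts, judged by final cost."""
        tries = [_k_means_iter(data, k, epoch) for _ in range(n_init)]
        return min(tries, key=lambda t: t[2])  # each t is (C, centroids, cost)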

In [16]:
data_with_c = km.combine_data_C(data2, best_C)
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)


Out[16]:
<seaborn.axisgrid.FacetGrid at 0x7fe4f46c2128>

try sklearn's KMeans


In [17]:
from sklearn.cluster import KMeans

In [18]:
sk_kmeans = KMeans(n_clusters=3)

In [19]:
sk_kmeans.fit(data2)


Out[19]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [20]:
sk_C = sk_kmeans.predict(data2)

In [21]:
data_with_c = km.combine_data_C(data2, sk_C)
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)


Out[21]:
<seaborn.axisgrid.FacetGrid at 0x7fe4f03a9128>
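
To compare sklearn's result with the cost above: KMeans exposes the total within-cluster sum of squares as inertia_, so dividing by the sample count puts it on the same mean-squared scale:

    sk_kmeans.inertia_ / len(data2)  # total SSE / m, comparable to km.cost above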
