In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import scipy.io as sio

import sys
sys.path.append('..')

from helper import kmeans as km

In [2]:
mat = sio.loadmat('./data/ex7data2.mat')
data2 = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
print(data2.head())

sns.set(context="notebook", style="white")
sns.lmplot(x='X1', y='X2', data=data2, fit_reg=False)


         X1        X2
0  1.842080  4.607572
1  5.658583  4.799964
2  6.352579  3.290854
3  2.904017  4.612204
4  3.231979  4.939894
Out[2]:
<seaborn.axisgrid.FacetGrid at 0x7fe4fc0cd160>

0. random init for initial centroids


In [3]:
km.random_init(data2, 3)


Out[3]:
array([[ 2.11496411,  5.37373587],
       [ 0.99253246,  5.01567424],
       [ 1.40260822,  1.08726536]])
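
The helper's source isn't shown in this notebook; a minimal sketch of what a random_init like this could look like (an assumption, not the actual helper) simply samples k rows of the data without replacement:

    def random_init(data, k):
        """Pick k random rows of the DataFrame as the initial centroids."""
        return data.sample(k).values  # (k, n) ndarray; sample() draws without replacement by default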

find closest cluster experiment


In [4]:
init_centroids = km.random_init(data2, 3)
init_centroids


Out[4]:
array([[ 5.05274526,  2.75692163],
       [ 2.12857843,  5.01149793],
       [ 5.29239452,  0.36873298]])

In [5]:
x = np.array([1, 1])

In [6]:
fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(x=init_centroids[:, 0], y=init_centroids[:, 1])

for i, node in enumerate(init_centroids):
    ax.annotate('{}: ({:.2f},{:.2f})'.format(i, node[0], node[1]), node)
    
ax.scatter(x[0], x[1], marker='x', s=200)


Out[6]:
<matplotlib.collections.PathCollection at 0x7fe4f478a400>

In [7]:
km._find_your_cluster(x, init_centroids)


Out[7]:
1
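
Under the hood this is presumably a nearest-centroid lookup; a sketch assuming Euclidean distance (which matches the result above: [1, 1] is closest to centroid 1):

    def _find_your_cluster(x, centroids):
        """Index of the centroid nearest to point x."""
        distances = np.linalg.norm(centroids - x, axis=1)  # Euclidean distance to each centroid
        return np.argmin(distances)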

1 epoch of cluster assignment


In [8]:
C = km.assign_cluster(data2, init_centroids)
data_with_c = km.combine_data_C(data2, C)
data_with_c.head()


Out[8]:
X1 X2 C
0 1.842080 4.607572 1
1 5.658583 4.799964 0
2 6.352579 3.290854 0
3 2.904017 4.612204 1
4 3.231979 4.939894 1
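
A plausible sketch of these two helpers (an assumption; the real module may vectorize the loop): assign_cluster maps each row through _find_your_cluster, and combine_data_C tacks the assignment on as a column:

    def assign_cluster(data, centroids):
        """Assign every point to its nearest centroid (one epoch)."""
        return np.array([_find_your_cluster(x, centroids) for x in data.values])

    def combine_data_C(data, C):
        """Copy of the data with the cluster assignment as column 'C'."""
        data_with_c = data.copy()
        data_with_c['C'] = C
        return data_with_c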

See the first-round clustering result


In [9]:
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)


Out[9]:
<seaborn.axisgrid.FacetGrid at 0x7fe4f4718518>

2. calculate new centroids


In [10]:
km.new_centroids(data2, C)


Out[10]:
array([[ 5.67762686,  2.83305931],
       [ 1.94033883,  4.95015645],
       [ 3.07010849,  0.90867247]])
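
Consistent with the output above, the new centroid of each cluster is the mean of its members; a one-line sketch using the helpers above (again, an assumption about the actual implementation):

    def new_centroids(data, C):
        """Recompute each centroid as the mean of the points assigned to it."""
        return combine_data_C(data, C).groupby('C').mean().values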

putting it all together, take 1

This is just one-shot k-means: if the random init picks bad starting centroids, the final clustering can be very sub-optimal.


In [11]:
final_C, final_centroid, _ = km._k_means_iter(data2, 3)
data_with_c = km.combine_data_C(data2, final_C)

In [12]:
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)


Out[12]:
<seaborn.axisgrid.FacetGrid at 0x7fe4f47424e0>
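
A single run presumably alternates the two steps above until the centroids stop moving; a sketch of such a loop (the epoch cap and tolerance are assumptions, cost is sketched in the next cell, and the empty-cluster edge case is ignored):

    def _k_means_iter(data, k, epoch=100, tol=0.0001):
        """One k-means run: assign, recompute, repeat until centroids stabilize."""
        centroids = random_init(data, k)
        for _ in range(epoch):
            C = assign_cluster(data, centroids)
            updated = new_centroids(data, C)
            if np.allclose(updated, centroids, atol=tol):  # converged
                break
            centroids = updated
        return C, centroids, cost(data, centroids, C)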

calculate the cost


In [13]:
km.cost(data2, final_centroid, final_C)


Out[13]:
0.79417636337158704
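
Judging by its magnitude, the cost here looks like the mean squared distance from each point to its assigned centroid (Ng's distortion J); a sketch under that assumption:

    def cost(data, centroids, C):
        """Mean squared distance from each point to its assigned centroid."""
        distances = np.linalg.norm(data.values - centroids[C], axis=1)
        return (distances ** 2).mean()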

k-means with multiple tries of random init; pick the best run with the least cost


In [14]:
best_C, best_centroids, least_cost = km.k_means(data2, 3)

In [15]:
least_cost


Out[15]:
0.79417636337158704
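
Multiple restarts amount to running the single-shot version several times and keeping the run with the smallest cost; a sketch (n_init=10 is an assumption, borrowed from sklearn's default):

    def k_means(data, k, epoch=100, n_init=10):
        """Best of n_init random restarts, judged by final cost."""
        tries = [_k_means_iter(data, k, epoch) for _ in range(n_init)]
        return min(tries, key=lambda t: t[2])  # each t is (C, centroids, cost)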

In [16]:
data_with_c = km.combine_data_C(data2, best_C)
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)


Out[16]:
<seaborn.axisgrid.FacetGrid at 0x7fe4f46c2128>

try sklearn's KMeans


In [17]:
from sklearn.cluster import KMeans

In [18]:
sk_kmeans = KMeans(n_clusters=3)

In [19]:
sk_kmeans.fit(data2)


Out[19]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [20]:
sk_C = sk_kmeans.predict(data2)

In [21]:
data_with_c = km.combine_data_C(data2, sk_C)
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)


Out[21]:
<seaborn.axisgrid.FacetGrid at 0x7fe4f03a9128>
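
To compare sklearn's result with the cost above: KMeans exposes the total within-cluster sum of squares as inertia_, so dividing by the sample count puts it on the same mean-squared scale:

    sk_kmeans.inertia_ / len(data2)  # total SSE / m, comparable to km.cost above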
