In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.io as sio
import sys
sys.path.append('..')
from helper import kmeans as km
In [2]:
# Load the exercise data set and wrap its feature matrix in a two-column frame.
mat = sio.loadmat('./data/ex7data2.mat')
# Index with ['X'] rather than .get('X'): a missing key now raises KeyError
# instead of passing None to DataFrame and silently yielding an empty frame.
data2 = pd.DataFrame(mat['X'], columns=['X1', 'X2'])
print(data2.head())

sns.set(context="notebook", style="white")
# x and y are passed by keyword: current seaborn accepts only `data`
# positionally, so the old positional form raises a TypeError.
# fit_reg=False drops the regression line, leaving a plain scatter plot.
sns.lmplot(x='X1', y='X2', data=data2, fit_reg=False)
Out[2]:
In [3]:
# Preview only: the return value is displayed but not stored.
# Presumably draws 3 starting centroids from data2 — confirm in helper.kmeans.
km.random_init(data2, 3)
Out[3]:
In [4]:
# Keep this draw: init_centroids is reused by the plotting and assignment
# cells further down.
# NOTE(review): no random seed is set in the visible cells, so this value
# presumably changes on every run — confirm which RNG km.random_init uses
# and seed it if reproducible output is wanted.
init_centroids = km.random_init(data2, 3)
init_centroids
Out[4]:
In [5]:
# Probe point (1, 1) used by the following cells to illustrate which
# centroid a single sample gets assigned to.
x = np.asarray([1, 1])
In [6]:
# Show the initial centroids, label each with its index and coordinates,
# and overlay the probe point x as a large cross.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(x=init_centroids[:, 0], y=init_centroids[:, 1])
# Label the figure so it can be read on its own when skimming the notebook.
ax.set(xlabel='X1', ylabel='X2', title='Initial centroids and probe point x')

for i, node in enumerate(init_centroids):
    # The text is anchored at the centroid itself (node doubles as the xy position).
    ax.annotate('{}: ({},{})'.format(i, node[0], node[1]), node)

# Drawn once, after the loop; kept as the last expression so the cell
# output is unchanged.
ax.scatter(x[0], x[1], marker='x', s=200)
Out[6]:
In [7]:
# Ask the helper which centroid the probe point x is assigned to
# (presumably the index of the nearest centroid — confirm in helper.kmeans).
# NOTE(review): the leading underscore marks this as a private helper; it is
# called directly here for illustration only.
km._find_your_cluster(x, init_centroids)
Out[7]:
In [8]:
# First assignment step: compute cluster labels C for data2 against the
# initial centroids (presumably one label per row — confirm in helper.kmeans).
C = km.assign_cluster(data2, init_centroids)
# Merge the labels into the data; the plotting cell below reads them from
# a column named 'C'.
data_with_c = km.combine_data_C(data2, C)
data_with_c.head()
Out[8]:
Plot the clustering result after the first round of assignment:
In [9]:
# Scatter plot of the first-round assignment, coloured by the 'C' column.
# x and y are passed by keyword: current seaborn accepts only `data`
# positionally, so the old positional form raises a TypeError.
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
Out[9]:
In [10]:
# Update step: compute new centroids from data2 and the current labels C
# (presumably the per-cluster means — confirm in helper.kmeans).
# The result is only displayed, not stored.
km.new_centroids(data2, C)
Out[10]:
In [11]:
# Run one complete k-means pass with 3 clusters from a fresh initialisation.
# The third return value is discarded here (presumably the final cost — confirm
# in helper.kmeans). It is bound to `_unused` rather than `_`, because
# assigning `_` in an IPython session stops IPython from updating its
# last-output variables `_`, `__` and `___`.
final_C, final_centroid, _unused = km._k_means_iter(data2, 3)
# Attach the final labels to the data for plotting in the next cell.
data_with_c = km.combine_data_C(data2, final_C)
In [12]:
# Scatter plot of the single-run clustering, coloured by the 'C' column.
# x and y are passed by keyword: current seaborn accepts only `data`
# positionally, so the old positional form raises a TypeError.
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
Out[12]:
In [13]:
# Evaluate the helper's cost function for the single-run result
# (final centroids and labels from the run above); the value is displayed only.
km.cost(data2, final_centroid, final_C)
Out[13]:
In [14]:
# Full k-means driver with 3 clusters; returns labels, centroids and a cost.
# Presumably repeats the single run several times and keeps the cheapest
# result, as the variable names suggest — confirm in helper.kmeans.
best_C, best_centroids, least_cost = km.k_means(data2, 3)
In [15]:
# Display the cost returned by km.k_means, for comparison with the
# single-run cost shown earlier.
least_cost
Out[15]:
In [16]:
# Attach the labels returned by km.k_means to the data and plot them.
data_with_c = km.combine_data_C(data2, best_C)
# x and y are passed by keyword: current seaborn accepts only `data`
# positionally, so the old positional form raises a TypeError.
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
Out[16]:
In [17]:
from sklearn.cluster import KMeans
In [18]:
# scikit-learn reference model to compare against the hand-written k-means.
# n_init is pinned because its default differs between scikit-learn releases
# (10 in older versions, 'auto' in newer ones). random_state makes the
# fitted clustering reproducible across kernel restarts.
sk_kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
In [19]:
# Fit the scikit-learn estimator on the same data; fit() returns the
# estimator itself, which is what this cell displays.
sk_kmeans.fit(data2)
Out[19]:
In [20]:
# Cluster index of the closest fitted centre for every row of data2.
sk_C = sk_kmeans.predict(data2)
In [21]:
# Attach the scikit-learn labels to the data and plot them like the
# hand-written results, so the two can be compared visually.
data_with_c = km.combine_data_C(data2, sk_C)
# x and y are passed by keyword: current seaborn accepts only `data`
# positionally, so the old positional form raises a TypeError.
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
Out[21]:
In [ ]: