In [1]:
%load_ext watermark
%watermark -a 'Vahid Mirjalili' -d -p scikit-learn,numpy,numexpr,pandas,matplotlib,plotly -v
In [2]:
## Import the necessary modules
import datetime
import numpy as np
import pandas as pd
import pandas.io.data
import sklearn
from matplotlib import pyplot as plt
import plotly.plotly as py
import matplotlib.mlab as mlab
import prettytable as PT
%matplotlib inline
In [3]:
from matplotlib.colors import LogNorm
smat_1 = np.array([[0.45, 0.02], [0.02, 0.25]])
smat_2 = np.array([[0.65, 0.02], [0.02, 0.85]])
smat_3 = np.array([[0.85, -0.06], [-0.06, 0.35]])
dnorm_1 = np.random.multivariate_normal(mean=[-1.5, 2.5], cov=smat_1, size=5000)
dnorm_2 = np.random.multivariate_normal(mean=[2.0, 1.0], cov=smat_2, size=5000)
dnorm_3 = np.random.multivariate_normal(mean=[-0.5, -2.5], cov=smat_3, size=5000)
dnorm_mixture = np.concatenate((dnorm_1, dnorm_2, dnorm_3))
fig4 = plt.figure(4, figsize=(16,7))
ax4 = fig4.add_subplot(1, 2, 1)
plt.scatter(dnorm_mixture[:,0], dnorm_mixture[:,1], color='b', marker='o', s=20, alpha=0.25)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('Gaussian Mixture Model', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.setp(ax4.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax4.get_yticklabels(), rotation='horizontal', fontsize=16)
ax4 = fig4.add_subplot(1, 2, 2)
plt.hist2d(dnorm_mixture[:,0], dnorm_mixture[:,1], bins=50, norm=LogNorm())
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('2D Probability Density', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.setp(ax4.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax4.get_yticklabels(), rotation='horizontal', fontsize=16)
plt.colorbar()
plt.show()
In [4]:
np.random.seed(123)
init_centroids = np.random.randint(low=0, high=dnorm_mixture.shape[0], size=3)
init_centroids
Out[4]:
In RKM, each initial centroid is placed near the global mean of the data and shifted along every dimension by a random fraction of that dimension's variance:
$C_{lj} = \mu_j + r_{lj}\,\sigma_{jj}/d, \quad r_{lj} \sim \mathrm{U}(-1, 1)$
where $\mu_j$ and $\sigma_{jj}$ are the global mean and variance of dimension $j$, and $d$ is the number of dimensions.
In [5]:
global_mean = np.mean(dnorm_mixture, axis=0)
print(global_mean)
global_sigma = np.cov(dnorm_mixture.T)
print(global_sigma)
print(global_sigma.diagonal(0))
k = 3
ndim = dnorm_mixture.shape[1]
r = np.random.uniform(low=-1, high=1, size=(k,ndim))
init_centroids_RKM = global_mean + r * global_sigma.diagonal(0).T / ndim
init_centroids_RKM
Out[5]:
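The same RKM initialization can be wrapped into a small helper for reuse; a minimal sketch (the helper name `rkm_init` and its arguments are just illustrative, not part of the original cells):

def rkm_init(X, k, rng=np.random):
    # RKM-style initialization: perturb the global mean of X along each
    # dimension by a random fraction of that dimension's variance.
    mu = np.mean(X, axis=0)                       # global mean, shape (d,)
    var = np.cov(X.T).diagonal()                  # per-dimension variance
    d = X.shape[1]
    r = rng.uniform(low=-1, high=1, size=(k, d))  # r_lj ~ U(-1, 1)
    return mu + r * var / d

# e.g. rkm_init(dnorm_mixture, 3) reproduces the computation above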
In [6]:
fig2 = plt.figure(2, figsize=(16,7))
ax21 = fig2.add_subplot(1, 2, 1)
plt.scatter(dnorm_mixture[:,0], dnorm_mixture[:,1], color='gray', marker='o', s=20, alpha=0.25)
plt.scatter(dnorm_mixture[init_centroids[0],0], dnorm_mixture[init_centroids[0],1], color='r', marker='*', s=600, alpha=0.8)
plt.scatter(dnorm_mixture[init_centroids[1],0], dnorm_mixture[init_centroids[1],1], color='g', marker='*', s=600, alpha=0.8)
plt.scatter(dnorm_mixture[init_centroids[2],0], dnorm_mixture[init_centroids[2],1], color='b', marker='*', s=600, alpha=0.8)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('Original KMeans', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.setp(ax21.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax21.get_yticklabels(), rotation='horizontal', fontsize=16)
ax22 = fig2.add_subplot(1, 2, 2)
plt.scatter(dnorm_mixture[:,0], dnorm_mixture[:,1], color='gray', marker='o', s=20, alpha=0.25)
plt.scatter(global_mean[0], global_mean[1], color='r', marker='+', s=400, alpha=0.8, linewidth=4.0)
plt.scatter(init_centroids_RKM[0,0], init_centroids_RKM[0,1], color='r', marker='*', s=600, alpha=0.8)
plt.scatter(init_centroids_RKM[1,0], init_centroids_RKM[1,1], color='g', marker='*', s=600, alpha=0.8)
plt.scatter(init_centroids_RKM[2,0], init_centroids_RKM[2,1], color='b', marker='*', s=600, alpha=0.8)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('RKM', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.setp(ax22.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax22.get_yticklabels(), rotation='horizontal', fontsize=16)
plt.show()
#py.iplot_mpl(fig2, filename='DataScience/clustering/kmeans.init_centroids')
In [7]:
### Shuffle the data first, so the incremental RKM updates below see the points in random order:
np.random.shuffle(dnorm_mixture)
In [8]:
### Distance helpers:
def cal_dist(p1, p2):
    # squared Euclidean distance between two points
    vecdiff = p1 - p2
    return np.sum(vecdiff * vecdiff)

#test:
#cal_dist(dnorm_mixture[0,:], dnorm_mixture[1,:])

def find_closest_centroid(p, centlist):
    # index of the centroid in centlist closest to point p
    dist = np.repeat(-1.0, centlist.shape[0])
    for i in range(centlist.shape[0]):
        dist[i] = cal_dist(p, centlist[i,:])
    return np.argmin(dist)

#test:
find_closest_centroid(dnorm_mixture[0,:], dnorm_mixture[init_centroids])
Out[8]:
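For larger data sets the per-point Python loop becomes slow; the same nearest-centroid assignment can be done in one shot with NumPy broadcasting. A minimal sketch (the name `assign_clusters` is just illustrative):

def assign_clusters(points, centroids):
    # (N, 1, d) - (1, k, d) -> (N, k, d) differences, then (N, k) squared distances
    diff = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    sq_dist = np.sum(diff * diff, axis=2)
    # index of the closest centroid for every row of points
    return np.argmin(sq_dist, axis=1)

# e.g. assign_clusters(dnorm_mixture, dnorm_mixture[init_centroids])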
In [9]:
### Original KMeans: one assignment pass with the initial centroids, then one centroid update.
cluster_origkm = np.repeat(-1, dnorm_mixture.shape[0])
for i in range(dnorm_mixture.shape[0]):
    cluster_origkm[i] = find_closest_centroid(dnorm_mixture[i,:], dnorm_mixture[init_centroids])

upd_centroids = np.empty([3, 2], dtype=float)

## print the size of each cluster:
tab1 = PT.PrettyTable(["Cluster", "Size", "Center", "Sum of Distances to Center"])
for j in range(init_centroids.shape[0]):
    csize = np.sum(cluster_origkm == j)
    ## update the centroid of cluster j:
    upd_centroids[j] = np.mean(dnorm_mixture[cluster_origkm == j], axis=0)
    sumdist = 0.0
    for p in dnorm_mixture[cluster_origkm == j]:
        sumdist += np.sqrt(cal_dist(p, upd_centroids[j]))
    tab1.add_row([j, csize, upd_centroids[j,:], sumdist])
print(tab1)
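The cell above runs a single assignment pass followed by one centroid update. Repeating those two steps until the centroids stop moving gives the full (Lloyd) k-means loop; a minimal sketch, assuming no cluster ever becomes empty and reusing the illustrative `assign_clusters` helper from the earlier sketch:

centroids = dnorm_mixture[init_centroids].copy()         # start from the random points
for it in range(100):                                     # iteration cap
    labels = assign_clusters(dnorm_mixture, centroids)    # assignment step
    new_centroids = np.array([dnorm_mixture[labels == j].mean(axis=0)
                              for j in range(3)])         # update step
    if np.allclose(new_centroids, centroids):             # stop when centroids settle
        break
    centroids = new_centroids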
In [10]:
### RKM: process the shuffled data in chunks of L = sqrt(N) points,
### re-computing the centroids after each chunk.
cluster_RKM = np.repeat(-1, dnorm_mixture.shape[0])
N = dnorm_mixture.shape[0]
L = int(np.sqrt(N))
upd_centroids_RKM = init_centroids_RKM.copy()  # copy, so the initial centroids are not overwritten
i = -1
while i < N:
    ## assign the next chunk of L points to their closest current centroid:
    for l in range(L):
        i += 1
        if i < N:
            cluster_RKM[i] = find_closest_centroid(dnorm_mixture[i,:], upd_centroids_RKM)
    ## re-compute the centroids from all points assigned so far:
    for j in range(k):
        upd_centroids_RKM[j] = np.mean(dnorm_mixture[cluster_RKM == j], axis=0)

## print the size of each cluster (the centroids were already updated in the loop above):
tab2 = PT.PrettyTable(["Cluster", "Size", "Center", "Sum of Distances to Center"])
for j in range(upd_centroids_RKM.shape[0]):
    csize = np.sum(cluster_RKM == j)
    sumdist = 0.0
    for p in dnorm_mixture[cluster_RKM == j]:
        sumdist += np.sqrt(cal_dist(p, upd_centroids_RKM[j]))
    tab2.add_row([j, csize, upd_centroids_RKM[j,:], sumdist])
print(tab2)
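The chunk-wise updates above loosely resemble a mini-batch scheme, so one rough sanity check is to run scikit-learn's MiniBatchKMeans from the same RKM starting centroids. This is only a sketch; the parameter choices here (batch size, random_state) are illustrative:

from sklearn.cluster import MiniBatchKMeans

mbk = MiniBatchKMeans(n_clusters=3, init=init_centroids_RKM, n_init=1,
                      batch_size=int(np.sqrt(dnorm_mixture.shape[0])),
                      random_state=123)
mbk.fit(dnorm_mixture)
print(mbk.cluster_centers_)   # compare against the RKM centers in the table above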
In [11]:
fig3 = plt.figure(3, figsize=(16,7))
ax31 = fig3.add_subplot(1, 2, 1)
color_list = ['r', 'g', 'b']
for j in range(k):
    plt.scatter(dnorm_mixture[cluster_origkm == j,0], dnorm_mixture[cluster_origkm == j,1],
                color=color_list[j], marker='o', s=20, alpha=0.15)
plt.scatter(upd_centroids[0,0], upd_centroids[0,1], color='black', marker='*', s=600, alpha=0.8)
plt.scatter(upd_centroids[1,0], upd_centroids[1,1], color='black', marker='*', s=600, alpha=0.8)
plt.scatter(upd_centroids[2,0], upd_centroids[2,1], color='black', marker='*', s=600, alpha=0.8)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('Original KMeans', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.setp(ax31.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax31.get_yticklabels(), rotation='horizontal', fontsize=16)
ax32 = fig3.add_subplot(1, 2, 2)
for j in range(k):
    plt.scatter(dnorm_mixture[cluster_RKM == j,0], dnorm_mixture[cluster_RKM == j,1],
                color=color_list[j], marker='o', s=20, alpha=0.15)
#plt.scatter(global_mean[0], global_mean[1], color='r', marker='+', s=400, alpha=0.8, linewidth=4.0)
plt.scatter(upd_centroids_RKM[0,0], upd_centroids_RKM[0,1], color='black', marker='*', s=600, alpha=0.8)
plt.scatter(upd_centroids_RKM[1,0], upd_centroids_RKM[1,1], color='black', marker='*', s=600, alpha=0.8)
plt.scatter(upd_centroids_RKM[2,0], upd_centroids_RKM[2,1], color='black', marker='*', s=600, alpha=0.8)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('RKM', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.setp(ax32.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax32.get_yticklabels(), rotation='horizontal', fontsize=16)
plt.show()
#py.iplot_mpl(fig2, filename='DataScience/clustering/kmeans.init_centroids')
In [ ]: