In [1]:
%load_ext watermark

%watermark -a 'Vahid Mirjalili' -d -p scikit-learn,numpy,numexpr,pandas,matplotlib,plotly -v


Vahid Mirjalili 07/12/2014 

CPython 2.7.3
IPython 2.3.1

scikit-learn 0.15.2
numpy 1.9.1
numexpr 2.2.2
pandas 0.15.1
matplotlib 1.4.2
plotly 1.4.7

In [2]:
## Import the necessary modules

import datetime
import numpy as np
import pandas as pd
import pandas.io.data

import sklearn

from matplotlib import pyplot as plt
import plotly.plotly as py

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

import prettytable as PT

%matplotlib inline

In [3]:
from matplotlib.colors import LogNorm

smat_1 = np.array([[0.45, 0.02], [0.02, 0.25]])
smat_2 = np.array([[0.65, 0.02], [0.02, 0.85]])
smat_3 = np.array([[0.85, -0.06], [-0.06, 0.35]])

dnorm_1 = np.random.multivariate_normal(mean=[-1.5, 2.5], cov=smat_1, size=5000)
dnorm_2 = np.random.multivariate_normal(mean=[2.0, 1.0], cov=smat_2, size=5000)
dnorm_3 = np.random.multivariate_normal(mean=[-0.5, -2.5], cov=smat_3, size=5000)

dnorm_mixture = np.concatenate((dnorm_1, dnorm_2, dnorm_3))

fig4 = plt.figure(4, figsize=(16,7))
ax4 = fig4.add_subplot(1, 2, 1)

plt.scatter(dnorm_mixture[:,0], dnorm_mixture[:,1], color='b', marker='o', s=20, alpha=0.25)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('Guassian Mixture Model', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])

plt.setp(ax4.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax4.get_yticklabels(), rotation='horizontal', fontsize=16)

ax4 = fig4.add_subplot(1, 2, 2)

plt.hist2d(dnorm_mixture[:,0], dnorm_mixture[:,1], bins=50, norm=LogNorm())
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('2D Probability Density', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])

plt.setp(ax4.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax4.get_yticklabels(), rotation='horizontal', fontsize=16)

plt.colorbar()

plt.show()


Choosing Initial Centroids


In [4]:
np.random.seed(123)

init_centroids = np.random.randint(low=0, high=dnorm_mixture.shape[0], size=3)

init_centroids


Out[4]:
array([ 3582, 11646,  1346])

In RKM:

$C_{lj} = \mu_j \pm r\sigma_{jj}/d$


In [5]:
global_mean = np.mean(dnorm_mixture, axis=0)
print(global_mean)

global_sigma = np.cov(dnorm_mixture.T)
print(global_sigma)
print(global_sigma.diagonal(0))

k = 3
ndim = dnorm_mixture.shape[1]

r = np.random.uniform(low=-1, high=1, size=(k,ndim))

init_centroids_RKM = global_mean + r * global_sigma.diagonal(0).T / ndim

init_centroids_RKM


[-0.00426087  0.34017501]
[[ 2.82446184 -0.15858484]
 [-0.15858484  4.84990477]]
[ 2.82446184  4.84990477]
Out[5]:
array([[-0.77575852,  0.58904675],
       [ 0.61562086, -0.03275134],
       [ 1.35363926,  1.23658164]])

In [6]:
fig2 = plt.figure(2, figsize=(16,7))
ax21 = fig2.add_subplot(1, 2, 1)

plt.scatter(dnorm_mixture[:,0], dnorm_mixture[:,1], color='gray', marker='o', s=20, alpha=0.25)
plt.scatter(dnorm_mixture[init_centroids[0],0], dnorm_mixture[init_centroids[0],1], color='r', marker='*', s=600, alpha=0.8)
plt.scatter(dnorm_mixture[init_centroids[1],0], dnorm_mixture[init_centroids[1],1], color='g', marker='*', s=600, alpha=0.8)
plt.scatter(dnorm_mixture[init_centroids[2],0], dnorm_mixture[init_centroids[2],1], color='b', marker='*', s=600, alpha=0.8)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('Originial KMeams', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])

plt.setp(ax21.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax21.get_yticklabels(), rotation='horizontal', fontsize=16)

ax22 = fig2.add_subplot(1, 2, 2)

plt.scatter(dnorm_mixture[:,0], dnorm_mixture[:,1], color='gray', marker='o', s=20, alpha=0.25)
plt.scatter(global_mean[0], global_mean[1], color='r', marker='+', s=400, alpha=0.8, linewidth=4.0)
plt.scatter(init_centroids_RKM[0,0], init_centroids_RKM[0,1], color='r', marker='*', s=600, alpha=0.8)
plt.scatter(init_centroids_RKM[1,0], init_centroids_RKM[1,1], color='g', marker='*', s=600, alpha=0.8)
plt.scatter(init_centroids_RKM[2,0], init_centroids_RKM[2,1], color='b', marker='*', s=600, alpha=0.8)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('RKM', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])

plt.setp(ax22.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax22.get_yticklabels(), rotation='horizontal', fontsize=16)

plt.show()
#py.iplot_mpl(fig2, filename='DataScience/clustering/kmeans.init_centroids')


Assiging points to closest cluster centroid

  • Original KMeans: assign all data points to their closest cluster centroid, then update cluster centroids
  • RKM: Assign every $L=\sqrt{n}$ data points, then update cluster centroids and repeat for the next $L$ points

In [7]:
### First shuffle the data:

np.random.shuffle(dnorm_mixture)

In [8]:
### Cal_distance:

def cal_dist(p1, p2):
    vecdiff = p1 - p2
    return (np.sum(vecdiff*vecdiff))

#test: 
#cal_dist(dnorm_mixture[0,:], dnorm_mixture[1,:])

def find_closest_centroid(p, centlist):
    dist = np.repeat(-1.0, centlist.shape[0])
    for i in range(centlist.shape[0]):
        dist[i] = cal_dist(p, centlist[i,:])
        
    return(np.argmin(dist))

#test: 
find_closest_centroid(dnorm_mixture[0,:], dnorm_mixture[init_centroids])


Out[8]:
1

In [9]:
### Original KMeans:
cluster_origkm = np.repeat(-1, dnorm_mixture.shape[0])

for i in range(dnorm_mixture.shape[0]):
    cluster_origkm[i] = find_closest_centroid(dnorm_mixture[i,:], dnorm_mixture[init_centroids])


upd_centroids = np.empty([3, 2], dtype=float)
    
## print the size of each cluster:
tab1 = PT.PrettyTable(["Cluster", "Size", "Center", "Sum of Distance to Center"])
for j in range(init_centroids.shape[0]):
    csize = np.sum(cluster_origkm == j)
    ## Update the centroids:    
    upd_centroids[j] = np.mean(dnorm_mixture[cluster_origkm == j], axis=0)
    sumdist = 0.0
    for p in dnorm_mixture[cluster_origkm == j]:
        sumdist += np.sqrt(cal_dist(p, upd_centroids[j]))
    
    tab1.add_row([j, csize, upd_centroids[j,:], sumdist])
    

print(tab1)


+---------+------+---------------------------+---------------------------+
| Cluster | Size |           Center          | Sum of Distance to Center |
+---------+------+---------------------------+---------------------------+
|    0    | 5442 | [-0.06159471 -1.60747132] |       9196.03200105       |
|    1    | 1679 | [ 1.38200963 -1.96934027] |       2886.72331663       |
|    2    | 7879 | [-0.26007219  2.17756141] |       13807.6696925       |
+---------+------+---------------------------+---------------------------+

In [10]:
### RKM:
cluster_RKM = np.repeat(-1, dnorm_mixture.shape[0])

N = dnorm_mixture.shape[0]
L = int(np.sqrt(N))


upd_centroids_RKM = init_centroids_RKM
i = -1
while i < N:
    for l in range(L):
        i += 1
        if i<N:
            cluster_RKM[i] = find_closest_centroid(dnorm_mixture[i,:], upd_centroids_RKM)
        
    for j in range(k):
        upd_centroids_RKM[j] = np.mean(dnorm_mixture[cluster_RKM == j], axis=0)
    
## print the size of each cluster:
tab2 = PT.PrettyTable(["Cluster", "Size", "Center", "Sum of Distances to Center"])
for j in range(upd_centroids_RKM.shape[0]):
    csize = np.sum(cluster_RKM == j)
    ## Update the centroids:    
    #upd_centroids_RKM[j] = np.mean(dnorm_mixture[cluster_RKM == j], axis=0)
    sumdist = 0.0
    for p in dnorm_mixture[cluster_RKM == j]:
        sumdist += np.sqrt(cal_dist(p, upd_centroids_RKM[j]))
    
    tab2.add_row([j, csize, upd_centroids_RKM[j,:], sumdist])
    

print(tab2)


+---------+------+---------------------------+----------------------------+
| Cluster | Size |           Center          | Sum of Distances to Center |
+---------+------+---------------------------+----------------------------+
|    0    | 5041 | [-1.49495357  2.48353507] |       3794.21617631        |
|    1    | 5036 | [-0.4924119  -2.47643093] |       4904.17685008        |
|    2    | 4923 | [ 2.02151823  1.02669734] |        5229.1042159        |
+---------+------+---------------------------+----------------------------+

In [11]:
fig3 = plt.figure(3, figsize=(16,7))
ax31 = fig3.add_subplot(1, 2, 1)

color_list = ['r', 'g', 'b']

for j in range(k):
    plt.scatter(dnorm_mixture[cluster_origkm == j,0], dnorm_mixture[cluster_origkm == j,1], 
                color=color_list[j], marker='o', s=20, alpha=0.15)
    
plt.scatter(upd_centroids[0,0], upd_centroids[0,1], color='black', marker='*', s=600, alpha=0.8)
plt.scatter(upd_centroids[1,0], upd_centroids[1,1], color='black', marker='*', s=600, alpha=0.8)
plt.scatter(upd_centroids[2,0], upd_centroids[2,1], color='black', marker='*', s=600, alpha=0.8)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('Originial KMeams', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])

plt.setp(ax31.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax31.get_yticklabels(), rotation='horizontal', fontsize=16)

ax32 = fig3.add_subplot(1, 2, 2)

for j in range(k):
    plt.scatter(dnorm_mixture[cluster_RKM == j,0], dnorm_mixture[cluster_RKM == j,1], 
                color=color_list[j], marker='o', s=20, alpha=0.15)
    
#plt.scatter(global_mean[0], global_mean[1], color='r', marker='+', s=400, alpha=0.8, linewidth=4.0)
plt.scatter(upd_centroids_RKM[0,0], upd_centroids_RKM[0,1], color='black', marker='*', s=600, alpha=0.8)
plt.scatter(upd_centroids_RKM[1,0], upd_centroids_RKM[1,1], color='black', marker='*', s=600, alpha=0.8)
plt.scatter(upd_centroids_RKM[2,0], upd_centroids_RKM[2,1], color='black', marker='*', s=600, alpha=0.8)
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20)
plt.title('RKM', size=20)
plt.xlim([-5, 5])
plt.ylim([-5, 5])

plt.setp(ax32.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax32.get_yticklabels(), rotation='horizontal', fontsize=16)

plt.show()
#py.iplot_mpl(fig2, filename='DataScience/clustering/kmeans.init_centroids')



In [ ]: