Importing Libraries


In [332]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.io
import scipy.misc
import matplotlib.pyplot as plt
from math import sqrt, floor, ceil

In [4]:
%matplotlib inline
sns.set_style(style="whitegrid")

K-Means Clustering
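
K-means alternates between two steps until the centroids stop moving: each example is assigned to its nearest centroid, and each centroid is then moved to the mean of the examples assigned to it. In symbols:

$$c^{(i)} := \arg\min_k \,\lVert x^{(i)} - \mu_k \rVert^2, \qquad \mu_k := \frac{1}{|C_k|} \sum_{i \in C_k} x^{(i)}$$

where $C_k$ is the set of examples currently assigned to centroid $k$. These two updates are implemented below as findClosestCentroids and computeCentroids.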

Functions


In [5]:
def findClosestCentroids(X, centroids):
    c = np.zeros((len(X), 1), dtype=np.uint8)
    for i in range(len(X)):
        
        # Squared Euclidean distance between training example i and each centroid
        dist = centroids - X[i]
        dist = dist ** 2
        dist = np.sum(dist, axis=1)
        
        # Finding the closest centroid
        c[i] = np.argmin(dist)
    
    # Returning the index of the closest centroid for every example
    return c

def computeCentroids(X, idx, K):
    
    # Creating new centroids matrix
    centroids = np.zeros((K, X.shape[1]))
    
    for i in range(K):
        
        # Finding training examples assigned to class i
        same_idx = (idx == i)
       
        # Computing the mean of those points
        centroids[i] = np.mean(X[same_idx], axis=0)
    
    # Returning the new, moved centroids
    return centroids

def runKMeans(X, initial_centroids, max_iters, plot_progress=False):
    
    # Initialize values
    m, n = X.shape
    K = len(initial_centroids)
    centroids = initial_centroids
    previous_centroids = centroids
    idx = np.zeros((m, 1), dtype=np.uint8)
    
    # Run K-Means
    for i in range(max_iters):
        
        # Finding closest centroids
        idx = findClosestCentroids(X, centroids).T[0]
        
        if plot_progress:
            plotProgressKMeans(X, centroids, previous_centroids, idx, K, i)
            previous_centroids = centroids
            
        # Given the memberships, compute new centroids
        centroids = computeCentroids(X, idx, K)
    
    return centroids, idx

def plotDataPoints(X, idx, K):
    # Hard-coded RGB palette: distinct colors for the first three clusters only,
    # so this helper assumes K = 3 (as in the demo below)
    colors = np.zeros((len(idx), K))
    colors[idx == 0] = [1, 0, 0]
    colors[idx == 1] = [0, 1, 0]
    colors[idx == 2] = [0, 0, 1]
    plt.scatter(X[:,0], X[:,1], s=60, c=colors)
        
def plotProgressKMeans(X, centroids, previous_centroids, idx, K, i):
    
    # Scattering Data Points
    plotDataPoints(X, idx, K)
    
    # Plotting new Centroids as "X"
    plt.plot(centroids[:,0], centroids[:,1], lw=0, marker='X', c=[0,0,0], ms=10)
    
    # Plotting Progress of Algorithm
    for k in range(K):
        
        # Draw line from each centroid to its previous position
        drawLine(centroids[k, :], previous_centroids[k, :])
        
def drawLine(p1, p2):
    plt.plot([p1[0], p2[0]], [p1[1], p2[1]], color=[0,0,0], lw=3)
    
def kMeansInitCentroids(X, K):
    # Shuffle the indices of all examples and take the first K as centroids
    randIdx = np.random.permutation(len(X))
    return X[randIdx[0:K]]
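
As an aside, the assignment step can be written without the Python loop by broadcasting over all examples and centroids at once. This is a minimal sketch (the name findClosestCentroidsVectorized is just for illustration) that returns the same indices as findClosestCentroids:

def findClosestCentroidsVectorized(X, centroids):
    # (m, K) matrix of squared distances: X[:, None, :] is (m, 1, n),
    # centroids[None, :, :] is (1, K, n), and broadcasting pairs them up
    dists = np.sum((X[:, None, :] - centroids[None, :, :]) ** 2, axis=2)
    
    # Index of the nearest centroid for each example, shape (m,)
    return np.argmin(dists, axis=1)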

Load The Data


In [6]:
# Load the Data
mat = scipy.io.loadmat("ex7data2.mat")
X = mat["X"]

Test "findClosestCentroids" , "computeMean" Functions


In [7]:
# Select an initial set of centroids
initial_centroids = np.array([[3, 3],
                              [6, 2],
                              [8, 5]])
K = 3

In [8]:
# Find the closest centroids for the examples using the initial_centroids
idx = findClosestCentroids(X, initial_centroids).T[0]

# +1 so the indices match the exercise's 1-based (MATLAB) numbering
idx[0:3] + 1


Out[8]:
array([1, 3, 2], dtype=uint8)

In [9]:
# Centroids computed after initial finding of closest centroids
computeCentroids(X, idx, K)


Out[9]:
array([[ 2.42830111,  3.15792418],
       [ 5.81350331,  2.63365645],
       [ 7.11938687,  3.6166844 ]])

Test K-Means Algorithm


In [10]:
# Settings for running K-Means
K = 3
max_iters = 10

In [11]:
initial_centroids[0,:]


Out[11]:
array([3, 3])

In [12]:
# Running K-Means function with initial_centroids
plt.figure(figsize=(12,8))
runKMeans(X, initial_centroids, max_iters=max_iters, plot_progress=True)


Out[12]:
(array([[ 1.95399466,  5.02557006],
        [ 3.04367119,  1.01541041],
        [ 6.03366736,  3.00052511]]),
 array([0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0], dtype=uint8))

K-Means on Pixels

Load the Image


In [13]:
# Read the image into a 3-D RGB array
# (scipy.misc.imread was removed in newer SciPy releases;
#  imageio.v2.imread is a drop-in replacement there)
A = scipy.misc.imread("bird_small.png")

# Reshaping the (height, width, 3) array into a 2-D matrix with one row per pixel
A_2d = A.reshape((A.shape[0] * A.shape[1], A.shape[2]))

In [14]:
# Memory usage of the image in bits (nbytes is in bytes)
A.nbytes * 8


Out[14]:
393216
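
For intuition, the point of quantizing to 16 colors is that each pixel then needs only a 4-bit index into a small palette. A rough calculation (plain arithmetic for this 128 x 128 image, not something the notebook computes):

# 24 bits (3 x 8-bit channels) per pixel
original_bits = 128 * 128 * 24             # 393,216 bits

# 4-bit index per pixel (2**4 = 16 colors) plus the 16-color palette itself
compressed_bits = 128 * 128 * 4 + 16 * 24  # 65,920 bits, about 6x smaller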

Running K-Means Algorithm


In [15]:
# Divide by 255 so that all values are in the range 0 - 1
A = A / 255
A_2d = A_2d / 255

In [16]:
# Initial Values of K and max_iters
K = 16
max_iters = 100

In [17]:
A_2d


Out[17]:
array([[ 0.85882353,  0.70588235,  0.40392157],
       [ 0.90196078,  0.7254902 ,  0.45490196],
       [ 0.88627451,  0.72941176,  0.43137255],
       ..., 
       [ 0.25490196,  0.16862745,  0.15294118],
       [ 0.22745098,  0.14509804,  0.14901961],
       [ 0.20392157,  0.15294118,  0.13333333]])

In [18]:
# initialize the centroids
init_centroids = kMeansInitCentroids(A_2d, K)

In [19]:
A_2d.shape[1]


Out[19]:
3

In [20]:
# Running the algorithm on the pixel data
centroids, idx = runKMeans(A_2d, init_centroids, max_iters)

In [21]:
print("New 16 Different Colors\n")
print(centroids)
print("\nIndices of each pixel\n")
print(idx)


New 16 Different Colors

[[ 0.73184329  0.5776567   0.31854035]
 [ 0.53817969  0.44357819  0.38935128]
 [ 0.81442319  0.73466258  0.72813665]
 [ 0.35137993  0.4044133   0.54620444]
 [ 0.0911979   0.09848923  0.08841637]
 [ 0.2001637   0.19427292  0.18855306]
 [ 0.38681812  0.30688788  0.24777841]
 [ 0.60893969  0.43881145  0.18726419]
 [ 0.97682748  0.89736743  0.71040853]
 [ 0.90000596  0.54178437  0.20853448]
 [ 0.94918348  0.81282138  0.55267559]
 [ 0.81762228  0.7049097   0.481371  ]
 [ 0.96970439  0.95789617  0.85502162]
 [ 0.64636497  0.579549    0.53681137]
 [ 0.54072645  0.67696561  0.84444873]
 [ 0.89266188  0.67452403  0.37811129]]

Indices of each pixel

[15 11 15 ...,  5  5  5]

In [22]:
# Recover the image: each pixel takes the color of its assigned centroid
X_recovered = centroids[idx]
X_recovered = X_recovered.reshape(A.shape)

In [23]:
fig = plt.figure()

ax1 = fig.add_axes([0,0,1,1])
ax2 = fig.add_axes([1,0,1,1])

ax1.imshow(A)
ax1.set_title("Before Using K-Means Algorithm")

ax2.imshow(X_recovered)
ax2.set_title("After Using K-Means Algorithm")


Out[23]:
<matplotlib.text.Text at 0x7773fa6668>

Principal Component Analysis
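
PCA looks for the directions of maximum variance in the (normalized) data. It forms the covariance matrix

$$\Sigma = \frac{1}{m} X^T X$$

and diagonalizes it; since $\Sigma$ is symmetric, its eigenvectors can be read off as the columns of $U$ from an SVD, with the eigenvalues in $S$. Projecting the data onto the first $K$ columns of $U$ gives the reduced representation.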

Functions


In [88]:
def featureNormalize(X):
    mu = np.mean(X, axis=0)
    sigma = X.std(axis=0)
    X_norm = (X - mu) / sigma
    
    return (X_norm, mu, sigma)

def pca(X):
    m, n = X.shape
    
    # Covariance matrix of the (already normalized) data
    sigma = (1 / m) * np.dot(X.T, X)
    
    # SVD of a symmetric matrix: columns of U are its eigenvectors
    return np.linalg.svd(sigma)

def projectData(X, U, K):
    # Project each example (row) onto the top-K eigenvectors
    return np.dot(X, U[:,0:K])

def recoverData(Z, U, K):
    # Map the projections back into the original space
    return np.dot(Z, U[:,0:K].T)
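
With examples stored as rows, projection onto the top $K$ eigenvectors $U_K$ and recovery back into the original space are plain matrix products:

$$Z = X\,U_K, \qquad X_{rec} = Z\,U_K^T$$

which is exactly what projectData and recoverData compute above.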

Load the Data


In [94]:
mat = scipy.io.loadmat('ex7data1.mat')
X = mat['X']
m,n = X.shape

Plot the Data


In [52]:
plt.figure(figsize=(12,8))
plt.plot(X[:,0], X[:,1], lw=0, marker="o", ms=10)


Out[52]:
[<matplotlib.lines.Line2D at 0x7773d73f60>]

Use PCA Algorithm


In [64]:
# We should feature-normalize the data before running PCA
X_norm, mu, sigma = featureNormalize(X)

# Finding eigenvectors using PCA algorithm
U, S, V = pca(X_norm)

In [54]:
pca(X_norm)


Out[54]:
(array([[-0.70710678, -0.70710678],
        [-0.70710678,  0.70710678]]),
 array([ 1.73553038,  0.26446962]),
 array([[-0.70710678, -0.70710678],
        [-0.70710678,  0.70710678]]))
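
A useful side quantity (not part of the exercise code) is the fraction of variance retained by the top K components, read off directly from the eigenvalues in S. A minimal sketch:

def varianceRetained(S, K):
    # S holds one eigenvalue of the covariance matrix per component
    return np.sum(S[0:K]) / np.sum(S)

With the S above, varianceRetained(S, 1) = 1.7355 / 2.0, so a single component keeps about 87% of the variance of this 2-D dataset.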

In [55]:
# Draw the eigenvectors centered at the mean of the data
plt.figure(figsize=(12,8))
drawLine(mu, mu + 1.5 * S[0] * U[:,0])
drawLine(mu, mu + 1.5 * S[1] * U[:,1])
plt.plot(X[:,0],X[:,1], lw=0, marker="o",ms = 10)


Out[55]:
[<matplotlib.lines.Line2D at 0x7773ff65f8>]

In [56]:
# Plot the normalized dataset (returned from featureNormalize)
plt.figure(figsize=(12,8))
plt.plot(X_norm[:,0],X_norm[:,1], lw=0, marker="o",ms = 10)


Out[56]:
[<matplotlib.lines.Line2D at 0x77768d0278>]

In [76]:
# Project data into K dimension
Z = projectData(X_norm, U, 1)

# Printing the first 5 projected values
print(Z[0:5].T[0])


[ 1.49631261 -0.92218067  1.22439232  1.64386173  1.2732206 ]

In [89]:
# Recovering Data
X_recovered = recoverData(Z, U, 1)

# Printing the first 5 recovered values
print(X_recovered[0:5,:])


[[-1.05805279 -1.05805279]
 [ 0.65208021  0.65208021]
 [-0.86577611 -0.86577611]
 [-1.16238578 -1.16238578]
 [-0.90030292 -0.90030292]]

In [115]:
# Draw lines connecting the projected points to the original points
plt.figure(figsize=(12,8))

plt.plot(X_recovered[:,0], X_recovered[:,1],
         lw=0, markerfacecolor= "white", markeredgewidth=2 ,markeredgecolor="red" ,marker="o",ms = 10)
plt.plot(X_norm[:,0], X_norm[:,1],
         lw=0,markerfacecolor= "white", markeredgewidth=2 , markeredgecolor="blue", marker="o",ms = 10)

for i in range(m):
    drawLine(X_norm[i,:], X_recovered[i,:])


Face Dataset

Functions


In [639]:
def magic_display(matrix=None, cmap='gray'):
    if matrix is None:
        # Selecting 100 random rows of X
        rand_indices = np.random.permutation(m)[0:100]
        X_dis = X[rand_indices]
    else:
        X_dis = matrix
      
    if len(X_dis.shape) > 1:
        m_test, n_test = X_dis.shape
        axis_bound = 1
    else:
        m_test = 1
        n_test = X_dis.shape[0]
        axis_bound = 0
    
    # Width and height of each example in the plot
    example_width = int(round(sqrt(n_test)))
    example_height = int(round(n_test / example_width))

    # Grid dimensions: how many examples to show per row and column
    display_rows = floor(sqrt(m_test))
    display_cols = ceil(m_test / display_rows)

    # Padding between examples
    pad = 2

    # Display array that will hold all the examples, separated by padding
    display_array = np.ones((
                             pad + display_rows * (example_height + pad),
                             pad + display_cols * (example_width + pad)
                            ))
    count = 0
    
    for i in range(display_cols):
        for j in range(display_rows):
            if count >= m_test:
                break

            # Top-left corner of this example inside the display array
            ex_x_range = pad + j * (example_height + pad)
            ex_y_range = pad + i * (example_width + pad)
            
            if m_test > 1:
                # Max value of the current row, used to normalize its contrast
                max_val = np.max(X_dis[count : count + 1], axis=axis_bound)
                ex_arr = X_dis[count : count + 1, 0:].reshape(example_height, example_width)
            else:
                max_val = np.max(X_dis)
                ex_arr = X_dis.reshape(example_height, example_width)
                
            # Setting values
            display_array[ex_y_range : ex_y_range + example_width,
                          ex_x_range : ex_x_range + example_height] = np.divide(ex_arr, max_val).T
            count += 1

    # Get rid of the grid
    plt.grid(False)
    plt.imshow(display_array, cmap=cmap)

Load the Data


In [640]:
mat = scipy.io.loadmat("ex7faces.mat")
X = mat['X']

In [641]:
# One example of X
print(X[0])
print("\n")
print(X.shape)


[ -37.86631387  -45.86631387  -53.86631387 ..., -110.86631387 -111.86631387
  -99.86631387]


(5000, 1024)
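
Each row of X is a 32 x 32 grayscale face flattened into 1024 values; magic_display relies on this square layout when it reshapes each row back into an image patch.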

Display Face


In [642]:
plt.figure(figsize=(12,8))
magic_display(X[0:100,:])


PCA Algorithm


In [626]:
X_norm, mu, sigma = featureNormalize(X)

In [627]:
# Finding eigenvectors using PCA algorithm
U, S, V = pca(X_norm)

In [628]:
# Visualize the top 36 eigenvectors found
plt.figure(figsize=(12,8))
magic_display(U[:,0:36].T, cmap= 'gist_ncar')



In [629]:
# Dimension Reduction for Faces
K = 150
Z = projectData(X_norm, U, K)

In [630]:
Z.shape


Out[630]:
(5000, 150)
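
Each face is now described by 150 numbers instead of 1024, i.e. less than 15% of the original dimension, at the cost of the fine detail carried by the discarded components.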

In [631]:
# Visualization of Faces after PCA Dimension Reduction
K = 150
X_rec = recoverData(Z, U, K)

In [632]:
X_rec.shape


Out[632]:
(5000, 1024)

In [634]:
plt.figure(figsize=(12,8))

ax1 = plt.subplot(1, 2, 1)
ax1.set_title("Original faces")
magic_display(X[0:100,:])

ax2 = plt.subplot(1, 2, 2)
ax2.set_title("Recovered faces")
magic_display(X_rec[0:100,:])


