In [1]:
    
import numpy as np
import scipy
from scipy.io import loadmat
from scipy import optimize
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.image import NonUniformImage
from matplotlib import cm
matplotlib.style.use('ggplot')
%matplotlib inline
%load_ext autoreload
%autoreload 2
    
In [2]:
    
kMeanfile_path = '../course_materials/ex7data2.mat'
kMeanData = loadmat(kMeanfile_path)
kMeanData.keys()
kMeanData['X'].shape
    
    Out[2]:
In [3]:
    
X = kMeanData['X']
plt.plot(X[:,:1],X[:,1:], 'o')
plt.show
    
    Out[3]:
    
In [4]:
    
def initiate_k_centroids(X, k):
    kIndices = np.random.choice(X.shape[0], k, replace=False)
    return X[kIndices]
def get_closest_centroid(X, k, centroid):
    centroidDistance = np.zeros((X.shape[0], k))
    for i in range(k):
        centroidDistance[:,i] = np.sum((X-centroid[i])**2, axis=1)
    return np.argmin(centroidDistance, axis=1)
        
def update_k_centroids(X, k, closestCentroid):
    kCentroids = np.zeros((k,X.shape[1]))
    for i in range(k):
        iCluster = X[np.where(closestCentroid==i)]
        iClusterCentre = np.mean(iCluster, axis=0)
        kCentroids[i] = iClusterCentre
    return kCentroids
    
In [5]:
    
def k_mean_clustering(X, k, iterations):
    centroidHistory = np.zeros((iterations+1, k, X.shape[1]))
#     Initialize centroids
    centroid = initiate_k_centroids(X, k)
    centroidHistory[0,:,:] = centroid
#     Perform iterations
    for i in range(iterations):
#         Find closes centroids
        closestCentroid = get_closest_centroid(X, k, centroid)
#         Update centroids
        centroid = update_k_centroids(X, k, closestCentroid)
        centroidHistory[i+1,:,:] = centroid
    return centroid, centroidHistory
    
In [6]:
    
X = kMeanData['X']
k = 3
iterations = 5
K = k_mean_clustering(X, k, iterations)[1]
centroid = k_mean_clustering(X, k, iterations)[0]
clusterAssignment = get_closest_centroid(X, k, centroid)
plt.scatter(X[:,:1],X[:,1:], c=clusterAssignment)
plt.plot(K[:,0,:1],K[:,0,1:], 'x-', ms=9, mew=3)
plt.plot(K[:,1,:1],K[:,1,1:], 'x-', ms=9, mew=3)
plt.plot(K[:,2,:1],K[:,2,1:], 'x-', ms=9, mew=3)
plt.show
    
    
    
In [ ]:
    
birdPNG = 'bird_small.png'
# This creates a three-dimensional matrix A whose first two indices 
# identify a pixel position and whose last index represents red, green, or blue.
bird = scipy.misc.imread(birdPNG)
print (bird.shape)
plt.imshow(bird)
birdReshaped = bird.reshape(bird.shape[0]*bird.shape[1], bird.shape[2])/255
birdReshaped.shape
    
In [ ]:
    
k = 16
iterations = 10
birdCentroid = k_mean_clustering(birdReshaped, k, iterations)[0]
birdClustered = get_closest_centroid(birdReshaped, k, birdCentroid)
print(birdClustered.shape)
birdRecoloured = np.zeros(birdReshaped.shape)
for i in range(birdReshaped.shape[0]):
    birdRecoloured[i,:] = birdCentroid[birdClustered[i],:]
newBird = birdRecoloured.reshape(bird.shape)
plt.imshow(newBird)
    
In [ ]:
    
PCA_file_path = 'ex7data1.mat'
kMeanData = loadmat(PCA_file_path)
kMeanData.keys()
kMeanData['X'].shape
    
In [ ]:
    
X = kMeanData['X']
plt.plot(X[:,:1],X[:,1:], 'o')
plt.show
    
In [ ]:
    
def normalise_data(X):
    '''Returns normalised feature for a matrix or a vector'''
    return (X-np.mean(X, 0))/np.std(X, 0)
def covMatrix_SVD(X):
    '''Returns U, Sigma, and V'''
#     For matrix X(n,k), returns covariance matrix (k,k)
    Sigma = np.dot(X.T, X)/X.shape[0]
#     Some matrix-magic need to look up  
    return np.linalg.svd(Sigma, full_matrices=1, compute_uv=1)
    
In [ ]:
    
normX = normalise_data(X)
plt.plot(normX[:,:1],normX[:,1:], 'o')
plt.show
U, Sigma, V = covMatrix_SVD(normX)
print(U)
print(Sigma)
print(np.dot(normX.T, normX)/normX.shape[0])
    
In [ ]:
    
normX = normalise_data(X)
U, Sigma, V = covMatrix_SVD(normX)
print("Top Principal Component (Eigen Vector) is", U[:,:1])
    
In [ ]:
    
# "...output the top principal component (eigen- vector) found, 
# and you should expect to see an output of about [-0.707 -0.707]"
X = kMeanData['X']
plt.plot(X[:,:1],X[:,1:], 'o')
meanX = np.mean(X, 0)
#at the mean of the data
plt.plot([meanX[0], meanX[0] + Sigma[0]*U[0,0]], 
         [meanX[1], meanX[1] + Sigma[0]*U[0,1]],
         linewidth=3,
        label='First Principal Component')
plt.plot([meanX[0], meanX[0] + Sigma[1]*U[1,0]], 
         [meanX[1], meanX[1] + Sigma[1]*U[1,1]],
         linewidth=3,
         label='Second Principal Component')
plt.axis('equal')
plt.legend()
plt.show
    
In [ ]:
    
def get_projection(X, U, K):
    topK_U = U[:,:K]
    return np.dot(X, topK_U)
    
In [ ]:
    
K = 1
projectionX = get_projection(normX, U, K)
print(projectionX[0])
    
In [ ]:
    
def recoverData(projectionX, U, K):
    topK_U = U[:,:K]
    return np.dot(projectionX, topK_U.T)
    
In [ ]:
    
recoveredX = recoverData(projectionX,U,1)
print(recoveredX[0])
    
In [ ]:
    
normX = normalise_data(X)
plt.plot(normX[:,:1],normX[:,1:], 'o')
plt.plot(recoveredX[:,:1],recoveredX[:,1:], 'o')
for i in range(X.shape[0]):
    plt.plot([normX[i,0],recoveredX[i,0]], [normX[i,1], recoveredX[i,1]], 'k--')
plt.axis('equal')
plt.show
    
In [ ]:
    
facefile_path = 'ex7faces.mat'
faceData = loadmat(facefile_path)
print(faceData.keys())
faces = faceData['X']
print(faces.shape)
    
In [ ]:
    
np.random.seed(100)
def faceDatum(x, squareSide):
    '''converts vector x (1,1024) into a matrix (32,32)'''
#     by default order='C', but in this case the lines are assembled in wrong order
    return np.reshape(x, (squareSide, squareSide), order='F')
def showRandomFace(dataSet, squareSide):
    fig = plt.figure(figsize=(2, 2))
    face = faceDatum(dataSet[np.random.randint(dataSet.shape[0])], squareSide)
    plt.imshow(face, cmap = 'gist_gray')
    plt.axis('off')
    return plt.show()
def showHundredFaces(visualisationSet, squareSide):
    fig = plt.figure(figsize=(10, 10))
    for row in range(10):
        for column in range(10):
            digit = faceDatum(visualisationSet[10*row+column], squareSide)
            sub = plt.subplot(10, 10, 10*row+column+1)
            sub.axis('off')
            sub.imshow(digit, cmap = 'gist_gray')
    return plt.show()
    
In [ ]:
    
n_samples, n_variables = faces.shape
squareSide = 32
# showRandomFace(facesData['X'], squareSide)
random_index = np.random.randint(faces.shape[0], size = 100)
visualisationSet = faces[random_index,:]
showHundredFaces(visualisationSet, squareSide)
    
In [ ]:
    
normFaces = normalise_data(faces)
facesPCA = covMatrix_SVD(normFaces)
facesU = facesPCA[0]
print (facesPCA[0].shape)
    
In [ ]:
    
K = 100
projectionFaces = get_projection(normFaces, facesU, K)
recoveredFaces = recoverData(projectionFaces, facesU, K)
print(recoveredFaces.shape)
n_samples, n_variables = recoveredFaces.shape
squareSide = 32
random_index = np.random.randint(recoveredFaces.shape[0], size = 100)
visualisationSet = recoveredFaces[random_index,:]
showHundredFaces(visualisationSet, squareSide)
    
In [ ]:
    
normBird = normalise_data(birdReshaped)
print(normBird.shape)
birdU, birdSigma, birdV = covMatrix_SVD(normBird)
print(birdU.shape)
    
In [ ]:
    
K = 2
projectBird = get_projection(normBird, birdU, K)
    
In [ ]:
    
# Make the 2D plot
plt.scatter(projectBird[:,:1], projectBird[:,1:], c=birdClustered, s=5)
plt.grid(True)
    
In [ ]: