A Network Tour of Data Science

      Xavier Bresson, Winter 2016/17

Exercise 4 - Code 2 : Unsupervised Learning

Unsupervised Clustering with Kernel K-Means


In [2]:
# Load libraries

# Math
import numpy as np

# Visualization 
%matplotlib notebook 
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import ndimage

# Print output of LFR code
import subprocess

# Sparse matrix
import scipy.sparse
import scipy.sparse.linalg

# 3D visualization
import pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot

# Import data
import scipy.io

# Import functions in lib folder
import sys
sys.path.insert(1, 'lib')

# Import helper functions
%load_ext autoreload
%autoreload 2
from lib.utils import construct_kernel
from lib.utils import compute_kernel_kmeans_EM
from lib.utils import compute_kernel_kmeans_spectral
from lib.utils import compute_purity

# Import distance function
import sklearn.metrics.pairwise

# Remove warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load MNIST raw data images
mat = scipy.io.loadmat('datasets/mnist_raw_data.mat')
X = mat['Xraw']
n = X.shape[0]
d = X.shape[1]
Cgt = mat['Cgt'] - 1; Cgt = Cgt.squeeze()
nc = len(np.unique(Cgt))
print('Number of data =',n)
print('Data dimensionality =',d);
print('Number of classes =',nc);


Number of data = 2000
Data dimensionality = 784
Number of classes = 10

Question 1a: What is the clustering accuracy of standard/linear K-Means?
Hint: You may use functions Ker=construct_kernel(X,'linear') to compute the linear kernel and [C_kmeans, En_kmeans]=compute_kernel_kmeans_EM(n_classes,Ker,Theta,10) with Theta= np.ones(n) to run the standard K-Means algorithm, and accuracy = compute_purity(C_computed,C_solution,n_clusters) that returns the accuracy.


In [5]:
Ker=construct_kernel(X,'linear')
Theta= np.ones(n)
n_clusters = 8
[C_solution, En_solution]=compute_kernel_kmeans_EM(nc,Ker,Theta,10)
[C_kmeans, En_kmeans]=compute_kernel_kmeans_EM(n_clusters,Ker,Theta,10)
C_computed = C_kmeans
print('C_kmeans',C_kmeans);
print('En_kmeans=',En_kmeans);
accuracy = compute_purity(C_computed,Cgt,nc)
print('accuracy = ',accuracy);


Construct Linear Kernel
C_kmeans [4 4 7 ..., 6 5 2]
En_kmeans= 1.82192602392
accuracy =  12.8

In [ ]:

Question 1b: What is the clustering accuracy for the kernel K-Means algorithm with
(1) Gaussian Kernel for the EM approach and the Spectral approach?
(2) Polynomial Kernel for the EM approach and the Spectral approach?
Hint: You may use functions Ker=construct_kernel(X,'gaussian') and Ker=construct_kernel(X,'polynomial',[1,0,2]) to compute the non-linear kernels
Hint: You may use functions C_kmeans,__ = compute_kernel_kmeans_EM(K,Ker,Theta,10) for the EM kernel KMeans algorithm and C_kmeans,__ = compute_kernel_kmeans_spectral(K,Ker,Theta,10) for the Spectral kernel K-Means algorithm.


In [6]:
Ker=construct_kernel(X,'gaussian')
Theta= np.ones(n)
n_clusters = 8
[C_solution, En_solution]=compute_kernel_kmeans_EM(nc,Ker,Theta,10)
[C_kmeans, En_kmeans]=compute_kernel_kmeans_EM(n_clusters,Ker,Theta,10)
C_computed = C_kmeans
print('C_kmeans',C_kmeans);
print('En_kmeans=',En_kmeans);
accuracy = compute_purity(C_computed,Cgt,nc)
print('accuracy = ',accuracy);


Construct Gaussian Kernel
C_kmeans [0 2 5 ..., 2 3 7]
En_kmeans= 0.772257521499
accuracy =  50.05

In [7]:
Ker=construct_kernel(X,'polynomial',[1,0,2])
Theta= np.ones(n)
n_clusters = 8
[C_solution, En_solution]=compute_kernel_kmeans_EM(nc,Ker,Theta,10)
[C_kmeans, En_kmeans]=compute_kernel_kmeans_EM(n_clusters,Ker,Theta,10)
C_computed = C_kmeans
print('C_kmeans',C_kmeans);
print('En_kmeans=',En_kmeans);
accuracy = compute_purity(C_computed,C_solution,n_clusters)
print('accuracy = ',accuracy);


Construct Polynomial Kernel
C_kmeans [5 1 2 ..., 6 0 1]
En_kmeans= 0.89094735792
accuracy =  68.45

Question 1c: What is the clustering accuracy for the kernel K-Means algorithm with
(1) KNN_Gaussian Kernel for the EM approach and the Spectral approach?
(2) KNN_Cosine_Binary Kernel for the EM approach and the Spectral approach?
You can test for the value KNN_kernel=50.
Hint: You may use functions Ker = construct_kernel(X,'kNN_gaussian',KNN_kernel) and Ker = construct_kernel(X,'kNN_cosine_binary',KNN_kernel) to compute the non-linear kernels.


In [13]:
KNN_kernel = 50;
Ker = construct_kernel(X,'kNN_gaussian',50)
Theta= np.ones(n)
n_clusters = 8
[C_solution, En_solution]=compute_kernel_kmeans_EM(nc,Ker,Theta,10)
[C_kmeans, En_kmeans]=compute_kernel_kmeans_EM(nc,Ker,Theta,10)
C_computed = C_kmeans
print('C_kmeans',C_kmeans);
print('En_kmeans=',En_kmeans);
accuracy = compute_purity(C_computed,Cgt,nc)
print('accuracy = ',accuracy);


Construct kNN Gaussian Kernel
C_kmeans [4 7 6 ..., 3 8 6]
En_kmeans= 0.878689197027
accuracy =  55.50000000000001

In [15]:
KNN_kernel = 50;
Ker = construct_kernel(X,'kNN_cosine_binary',KNN_kernel) 
[C_kmeans, En_kmeans]=compute_kernel_kmeans_EM(nc,Ker,Theta,10)
C_computed = C_kmeans
accuracy = compute_purity(C_computed,Cgt,nc)
print('accuracy = ',accuracy);


Construct kNN Cosine Binary Kernel
accuracy =  62.64999999999999

In [ ]: