In [1]:
# Extension to have package information reported
# %install_ext http://raw.github.com/jrjohansson/version_information/master/version_information.py
%load_ext version_information
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib.patches import FancyArrowPatch
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
# Define the 3D arrow object used in the eigenvector plot
class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0, 0), (0, 0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0], ys[0]), (xs[1], ys[1]))
        FancyArrowPatch.draw(self, renderer)
import numpy as np
In [2]:
%version_information numpy, matplotlib, pandas
Out[2]:
This notebook follows this tutorial for PCA.
Both Multiple Discriminant Analysis (MDA) and Principal Component Analysis (PCA) are linear transformation methods and are closely related to each other. In PCA, we are interested in finding the directions (components) that maximize the variance in our dataset. In MDA, we are additionally interested in finding the directions that maximize the separation (or discrimination) between different classes, for example in pattern classification problems where the dataset consists of multiple classes; PCA, in contrast, ignores the class labels. In other words, PCA projects the entire dataset (without class labels) onto a different subspace, while MDA tries to determine a subspace that is suitable for distinguishing between patterns that belong to different classes.
Goal: Reduce the dimensions of a d-dimensional dataset by mapping it onto a subspace of k < d dimensions.
This goal requires us to find a suitable value for k, the number of dimensions in the subspace. We determine the value of k by inspecting the eigenvalues of the components: components with small eigenvalues capture little of the dataset's variance and can be dropped.
The steps of PCA are listed below:
- Take the whole dataset consisting of d-dimensional samples, ignoring the class labels
- Compute the d-dimensional mean vector (i.e., the means for every dimension of the whole dataset)
- Compute the scatter matrix (alternatively, the covariance matrix) of the whole data set
- Compute eigenvectors and corresponding eigenvalues
- Sort the eigenvectors by decreasing eigenvalues and choose the k eigenvectors with the largest eigenvalues to form a d x k matrix W (where every column represents an eigenvector)
- Use this d x k eigenvector matrix to transform the samples onto the new subspace. This can be summarized by the equation y = W^T x, where x is a d x 1 vector representing one sample and y is the transformed k x 1 sample in the new subspace (see the sketch right after this list).
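To make this last step concrete, here is a minimal sketch of the projection y = W^T x in NumPy. The matrix W and the sample x below are made-up placeholders (d = 3, k = 2), not values computed from this notebook's data; the real W is built from the eigenvectors computed further below.

# Hypothetical d x k projection matrix (d = 3, k = 2); in practice its columns
# would be the k eigenvectors with the largest eigenvalues
W = np.array([[1.0, 0.0],
              [0.0, 1.0],
              [0.0, 0.0]])
# Hypothetical d x 1 sample
x = np.array([[2.0], [3.0], [5.0]])
# Project the sample onto the k-dimensional subspace: y = W^T x
y = W.T.dot(x)
print(y.shape)   # (2, 1) -- the sample now lives in k = 2 dimensions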
We generate 40 samples of 3-dimensional data from multivariate Gaussian distributions. Half of the samples are from Class 1, and half are from Class 2.
In [3]:
np.random.seed(1)
# Generate Class 1
mu_vec1 = np.array([0,0,0])
cov_mat1 = np.array([[1,0,0], [0,1,0], [0,0,1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20)
class1_sample = class1_sample.transpose()
# 3 x 20 matrix
class1_sample.view()
Out[3]:
In [4]:
# Generate Class 2 in the same way, but with a different set of means
mu_vec2 = np.array([1,1,1])
cov_mat2 = np.array([[1,0,0], [0,1,0], [0,0,1]])
# self.T is the same as self.transpose() when self.ndim >= 2
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20).T
# Sanity checks
assert class1_sample.shape == (3,20), "The matrix does not have dimensions 3x20"
assert class2_sample.shape == (3,20), "The matrix does not have dimensions 3x20"
In [5]:
# Since the data are 3-dimensional, plot the points in 3D
fig = plt.figure(figsize = (8, 8))
ax = fig.add_subplot(111, projection = '3d')
plt.rcParams['legend.fontsize'] = 10
ax.plot(class1_sample[0, :], class1_sample[1, :],\
class1_sample[2, :], 'o', markersize = 8, color = 'blue', alpha = 0.5, label = 'class1')
ax.plot(class2_sample[0, :], class2_sample[1, :],\
class2_sample[2, :], '^', markersize = 8, alpha = 0.5, color = 'red', label = 'class2')
plt.title('Samples for class 1 and class 2')
ax.legend(loc = 'upper right')
plt.show()
In [6]:
# PCA works with unlabeled data, so combine the two classes
all_samples = np.concatenate((class1_sample, class2_sample), axis = 1)
assert all_samples.shape == (3,40), "The matrix does not have dimensions 3x40"
In [7]:
# Take means of each row (dimension). Combine into a vector.
mean_x = np.mean(all_samples[0,:])
mean_y = np.mean(all_samples[1,:])
mean_z = np.mean(all_samples[2,:])
mean_vector = np.array([[mean_x], [mean_y], [mean_z]])
mean_vector
Out[7]:
A scatter matrix is basically an unscaled covariance matrix: it sums the outer products of the mean-centered samples without dividing by n - 1, so it equals (n - 1) times the sample covariance matrix.
$$ S = \sum_{j=1}^n (\mathbf{x}_j-\overline{\mathbf{x}})(\mathbf{x}_j-\overline{\mathbf{x}})^T $$
In [8]:
# Manually compute the scatter matrix:
scatter_matrix = np.zeros((3,3))
# For each column (sample)
for i in range(all_samples.shape[1]):
    # Mean-center the column
    x_left = all_samples[:, i].reshape(3, 1) - mean_vector
    x_right = x_left.T
    # Add its dot-product to the scatter matrix
    scatter_matrix += x_left.dot(x_right)
print('Scatter Matrix:\n', scatter_matrix)
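As a cross-check (not part of the original cell), the same scatter matrix can be computed without a loop by mean-centering all columns at once and taking a single matrix product:

centered = all_samples - mean_vector                      # broadcast the 3x1 mean over all 40 columns
scatter_vectorized = centered.dot(centered.T)             # 3x3 scatter matrix
print(np.allclose(scatter_vectorized, scatter_matrix))    # True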
In [9]:
# Alternatively, unscale the covariance matrix
n_obvs = all_samples.shape[1]
np.cov(all_samples) * (n_obvs - 1)
Out[9]:
In [10]:
# Equivalently, choose ddof so that np.cov divides by n - ddof = 1, yielding the scatter matrix directly
np.cov(all_samples, ddof = n_obvs - 1)
Out[10]:
In [11]:
cov_mat = np.cov(all_samples)
cov_mat
Out[11]:
In [12]:
# Compute eigenvalues and eigenvectors from both matrices.
eig_val_sc, eig_vec_sc = np.linalg.eig(scatter_matrix)
eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
# Confirm that both matrices yield the same eigenvectors (scaling a matrix does not change its eigenvectors)
assert np.allclose(eig_vec_sc, eig_vec_cov), 'Eigenvectors are not identical'
In [13]:
# Double check the eigenvalues and the scaling
for i in range(len(eig_val_sc)):
    eigvec_sc = eig_vec_sc[:, i].reshape(1, 3).T
    eigvec_cov = eig_vec_cov[:, i].reshape(1, 3).T
    print('Eigenvector {}: \n{}'.format(i + 1, eigvec_sc))
    print('Eigenvalue {} from scatter matrix: {}'.format(i + 1, eig_val_sc[i]))
    print('Eigenvalue {} from covariance matrix: {}'.format(i + 1, eig_val_cov[i]))
    print('Scaling factor: ', eig_val_sc[i] / eig_val_cov[i], "\n")
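These eigenvalues are what we inspect to choose k (the goal stated at the top): the larger an eigenvalue, the more of the dataset's variance its eigenvector captures. A minimal sketch of that inspection, with illustrative variable names that are not part of the original notebook:

# Pair each eigenvalue with its eigenvector and sort by decreasing eigenvalue
eig_pairs = [(np.abs(eig_val_sc[i]), eig_vec_sc[:, i]) for i in range(len(eig_val_sc))]
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
# Fraction of the total variance explained by each component;
# keep the first k components that together explain enough of the variance
explained_variance = [val / eig_val_sc.sum() for val, vec in eig_pairs]
print(explained_variance)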
In [14]:
# Confirm that CovMat*Eigenvector = Eigenvalue*Eigenvector
for i in range(len(eig_val_sc)):
    eigv = eig_vec_sc[:, i].reshape(1, 3).T
    np.testing.assert_array_almost_equal(scatter_matrix.dot(eigv),
                                         eig_val_sc[i] * eigv, decimal=6,
                                         err_msg='', verbose=True)
In [15]:
# Draw the eigenvectors
fig = plt.figure(figsize=(7,7))
ax = fig.add_subplot(111, projection='3d')
# Draw the points
ax.plot(all_samples[0,:], all_samples[1,:], all_samples[2,:], 'o',
markersize=8, color='green', alpha=0.2)
# Draw the mean/centroid
ax.plot([mean_x], [mean_y], [mean_z], 'o',
markersize=10, color='red', alpha=0.5)
# Draw the eigenvectors originating from the centroid
for v in eig_vec_sc.T:
    # End each arrow at the centroid plus the eigenvector, so the arrow shows the eigenvector's direction
    a = Arrow3D([mean_x, mean_x + v[0]], [mean_y, mean_y + v[1]], [mean_z, mean_z + v[2]],
                mutation_scale=20, lw=3, arrowstyle="-|>", color="r")
    ax.add_artist(a)
ax.set_xlabel('x_values')
ax.set_ylabel('y_values')
ax.set_zlabel('z_values')
plt.title('Eigenvectors')
plt.show()
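The notebook stops before the final PCA steps: choosing the k = 2 eigenvectors with the largest eigenvalues, stacking them into W, and transforming the samples. Below is a minimal sketch of how those steps could be completed with the arrays defined above; the names matrix_w and transformed are illustrative, not from the original notebook.

# Indices of the eigenvalues, largest first
order = np.argsort(eig_val_sc)[::-1]
# Stack the two leading eigenvectors column-wise into the 3 x 2 projection matrix W
matrix_w = eig_vec_sc[:, order[:2]]
print('Matrix W:\n', matrix_w)
# Transform all samples onto the 2-dimensional subspace: y = W^T x
transformed = matrix_w.T.dot(all_samples)
assert transformed.shape == (2, 40), "The matrix does not have dimensions 2x40"
# The first 20 columns are class 1, the last 20 are class 2
plt.plot(transformed[0, :20], transformed[1, :20], 'o', markersize=7,
         color='blue', alpha=0.5, label='class1')
plt.plot(transformed[0, 20:], transformed[1, 20:], '^', markersize=7,
         color='red', alpha=0.5, label='class2')
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.legend()
plt.title('Transformed samples with class labels')
plt.show()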
In [ ]: