Question 1.2 PCA



In [ ]:

    
import sys
# Record the interpreter version this notebook was run with.
print(sys.version)



In [ ]:

    
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import time

import pandas as pd
import seaborn as sns



In [ ]:

    
# Add the sibling ../code/ directory (relative to the working directory) to the
# module search path so the project-local imports in the following cells resolve.
# `sys` is already imported in the first cell; the repeat is harmless.
import sys
sys.path.append('../code/')



In [ ]:

    
from pca import Pca, plot_fractional_reconstruction_error



In [ ]:

    
from classification_base import MNIST_PATH
from mnist_helpers import mnist_training, mnist_testing



In [ ]:

    
# Load the MNIST training split without shuffling.
train_X, train_y = mnist_training(shuffled=False)
# The test split is loaded further down, in the "Transform the test data" section.

Calculate Sigma (the covariance matrix) on a small toy example



In [ ]:

    
# Small 4-sample x 3-feature float matrix used to check the PCA code by hand.
toy = np.array(
    [
        [1, 2, 3],
        [0, 1, 2],
        [2, 1, 0],
        [0, 2, 1],
    ],
    dtype=float,
)



In [ ]:

    
# Per-feature (column-wise) mean of the toy data.
mean = toy.mean(axis=0)
mean



In [ ]:

    
# Display the raw toy matrix for comparison with the centered version in the next cell.
toy



In [ ]:

    
# Center the toy data by subtracting the per-feature mean from every row (broadcast).
toy - mean



In [ ]:

    
# Build the project's Pca object on the toy data.
# The positional 2 is presumably `dimensions` (the MNIST call below passes
# dimensions=50 by keyword) — confirm against pca.py.
t = Pca(toy, 2, verbose=True)



In [ ]:

    
# Presumably computes the sigma / eigenvals / eigenvects attributes that are
# inspected in the cells below — confirm in pca.py.
t.calc_eigen_stuff()

Check against scikit-learn's PCA



In [ ]:

    
# Fit scikit-learn's PCA with its default settings on the same toy data, to use
# as a reference for the hand-written implementation.
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(toy)

Compare Sigma, the covariance matrix:



In [ ]:

    
# Covariance matrix as stored on our Pca object.
t.sigma



In [ ]:

    
# Covariance matrix as estimated by scikit-learn.
# NOTE(review): a covariance matrix is symmetric, so the .T should be a no-op.
pca.get_covariance().T

Compare the eigenvectors:

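Note that scikit-learn stores its eigenvectors as the rows of `components_` (shape `[n_components, n_features]`), i.e. transposed relative to ours.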



In [ ]:

    
# From the scikit-learn documentation for PCA.components_:
#   components_ : array, [n_components, n_features]
#   Principal axes in feature space, representing the directions of
#   maximum variance in the data. The components are sorted by
#   explained_variance_.
pca.components_



In [ ]:

    
# Print floats in fixed-point notation instead of scientific notation, so
# near-zero entries are easier to compare by eye. Affects all later array output.
np.set_printoptions(suppress=True)



In [ ]:

    
# Show the scikit-learn components again, now with scientific notation suppressed.
pca.components_



In [ ]:

    
# Eigenvectors from our implementation, to compare with pca.components_ above.
# Eigenvectors are only defined up to sign, so a sign flip relative to
# scikit-learn is not an error.
t.eigenvects

Compare the eigenvalues:



In [ ]:

    
# Eigenvalues from our implementation.
t.eigenvals



In [ ]:

    
# Eigenvalues according to scikit-learn.
# NOTE(review): scikit-learn normalizes the covariance by (n_samples - 1); if Pca
# divides by n_samples instead, the two sets of values will differ by a factor
# of n/(n-1) — confirm which convention pca.py uses.
pca.explained_variance_

Run on the real MNIST data



In [ ]:

    
# Number of training samples. NOTE(review): X_num is not used anywhere else in
# the visible notebook.
X_num = train_X.shape[0]
# Build the Pca object for the full MNIST training set, with 50 dimensions
# and the labels attached.
train = Pca(train_X, dimensions=50, y=train_y)



In [ ]:

    
# Presumably computes train.sigma, train.eigenvals and train.eigenvects, which
# the cells below read — confirm in pca.py. Likely the expensive step of the notebook.
train.calc_eigen_stuff()



In [ ]:

    
# First ten eigenvalues (presumably sorted largest first — confirm in pca.py).
train.eigenvals[0:10]



In [ ]:

    
# Covariance matrix of the training data (NumPy abbreviates large arrays when printing).
train.sigma



In [ ]:

    
# Presumably the sum of the eigenvalues of the retained components
# (dimensions=50) — confirm in pca.py.
train.sum_of_top_eigenvalues()



In [ ]:

    
# List the contents of the figures directory (shell command via IPython's `!`).
! ls '../figures'



In [ ]:

    
# Presumably populates train.fractional_reconstruction_df, which is inspected in
# the next cell — confirm in pca.py.
train.fractional_reconstruction_error()



In [ ]:

    
# Peek at the first two rows of the fractional reconstruction error table.
train.fractional_reconstruction_df.head(2)



In [ ]:

    
# Plot the fractional reconstruction error over the range start=0 .. stop=50.
plot_fractional_reconstruction_error(train, start=0, stop=50, 
                                     title ='Fractional Reconstruction Error')



In [ ]:

    
# Same plot starting at start=2, presumably to drop the first points so the rest
# of the curve is easier to read — confirm how `start` is interpreted.
plot_fractional_reconstruction_error(train, start=2, stop=50, 
                                     title ='Fractional Reconstruction Error')



In [ ]:

    
# Sanity check that the slice really contains 50 eigenvalues.
len(train.eigenvals[0:50])



In [ ]:

    
# Sum of the first 50 eigenvalues, for comparison with
# train.sum_of_top_eigenvalues() above.
np.sum(train.eigenvals[0:50])



In [ ]:

    
# First 50 eigenvalues rounded to one decimal place (the same values plotted below).
np.round(train.eigenvals.copy(),1)[0:50]



In [ ]:

    
# Plot the first 50 entries of train.eigenvals against their 1-based position
# and save the figure as a PDF.
fig, ax = plt.subplots(1, 1, figsize=(3.5, 3))
# Render y tick labels as integers with thousands separators (e.g. 12,345).
ax.get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plot_x = range(1, 51)
plot_y = np.round(train.eigenvals.copy(), 1)[0:50]
ax.plot(plot_x, plot_y, linestyle='--', marker='o', color='b')
ax.set_title('first 50 eigenvalues')
ax.set_xlabel('eigenvalue')
# Fix: the original called plt.xlabel twice, so the x label was overwritten with
# 'value' and the y axis was never labelled.
ax.set_ylabel('value')
fig.tight_layout()
fig.savefig('../figures/Q-1-2-1_first_50_eigenvalues.pdf')



In [ ]:

    
# Save the covariance matrix to disk.
# NOTE(review): np.save does not create missing directories, so ./data must
# already exist relative to the notebook's working directory.
np.save("./data/Q-1-2_sigma.npy", train.sigma)



In [ ]:

    
# Save the eigenvalues to disk.
np.save("./data/Q-1-2_eigenvalues.npy", train.eigenvals)



In [ ]:

    
# Save the eigenvectors to disk.
np.save("./data/Q-1-2_eigenvectors.npy", train.eigenvects)



In [ ]:

    
# Check the shape of the eigenvector matrix.
train.eigenvects.shape



In [ ]:

    
# Display the Pca object's repr.
train



In [ ]:

    
import pickle

# Persist the whole Pca object so later notebooks can reload it instead of
# recomputing. A context manager guarantees the file is flushed and closed;
# the original passed an anonymous open(...) handle that was never closed.
with open('./data/PCA_training_data.pickle', "wb") as pickle_file:
    pickle.dump(obj=train, file=pickle_file)



In [ ]:

    
# Confirm the pickle was written and check its size (shell command via IPython's `!`).
! ls -l ./data/PCA_training_data.pickle



In [ ]:

    
# Presumably returns the position of the first sample labelled 0 — confirm in pca.py.
train.find_first(0)



In [ ]:

    
# Presumably returns indices of one example of each digit 0 through 4 — confirm
# in pca.py. The result is used to index train.y and train.X in the next cells.
train.find_0_through_4()



In [ ]:

    
# Labels at the selected indices, to check that the selection is what we expect.
train.y[train.find_0_through_4()]



In [ ]:

    
# Shape of the selected sample rows.
train.X[train.find_0_through_4()].shape



In [ ]:

    
# For several component counts, transform the sample digits with that many
# components and write each result to its own .npy file under ./data.
filename_template = "./data/digits_0_through_4_transformed--{}_components.npy"
for n_kept in (2, 5, 10, 20, 50):
    transformed_digits = train.transform_sample_digits(n_components=n_kept)
    np.save(filename_template.format(n_kept), transformed_digits)



In [ ]:

    
# Transform the data currently held by `train` (the training set at this point)
# using 50 components.
X_transformed_50 = train.transform_all_digits_down(n_components = 50)



In [ ]:

    
# Save the 50-component representation of the training data.
np.save('./data/X_transformed_by_50_components.npy', X_transformed_50)

Transform the test data for use with SGD later



In [ ]:

    
# Load the MNIST test split without shuffling.
X_test, y_test = mnist_testing(shuffled=False)



In [ ]:

    
# Every test sample must have exactly one label.
assert X_test.shape[0] == y_test.shape[0]



In [ ]:

    
# Shape of the test feature matrix.
X_test.shape



In [ ]:

    
# Swap the test features into the existing Pca object so its transform method
# can be reused on them.
# NOTE(review): this overwrites the training data held by `train`. Any cell above
# that reads train.X will give different results if re-run after this point, and
# train.y still holds the training labels.
train.X = X_test



In [ ]:

    
# Confirm that train.X now has the test set's shape.
train.X.shape



In [ ]:

    
# Transform the test data (now stored in train.X) using 50 components.
# NOTE(review): for a consistent projection the test data should be centered with
# the training mean, not its own — confirm how transform_all_digits_down centers X.
X_test_transformed_50 = train.transform_all_digits_down(n_components = 50)



In [ ]:

    
# Check the shape of the transformed test data.
X_test_transformed_50.shape



In [ ]:

    
# Save the 50-component representation of the test data.
np.save('./data/X_test_transformed_by_50_components.npy', X_test_transformed_50)



In [ ]: