Question 1.2 PCA


In [ ]:
# Record the interpreter version this notebook was run with.
import sys
print(sys.version)

In [ ]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import time

import pandas as pd
import seaborn as sns

In [ ]:
# Make the sibling ../code/ directory importable.
# Presumably the project-local modules imported in the next cells
# (pca, classification_base, mnist_helpers) live there -- verify against the repo layout.
import sys
sys.path.append('../code/')

In [ ]:
from pca import Pca, plot_fractional_reconstruction_error

In [ ]:
from classification_base import MNIST_PATH
from mnist_helpers import mnist_training, mnist_testing

In [ ]:
# Load the MNIST training split without shuffling.
# The test split is loaded further down, right before it is transformed.
train_X, train_y = mnist_training(shuffled=False)

Calculate Sigma (the covariance matrix) on a small toy example


In [ ]:
# Small 4x3 toy matrix, small enough to check the PCA results by hand.
toy = np.array([[1., 2, 3], [0, 1., 2], [2, 1, 0], [0, 2, 1]])

In [ ]:
# Column-wise mean: sum over the rows divided by the number of rows.
mean = toy.sum(axis=0)/toy.shape[0]
mean

In [ ]:
toy

In [ ]:
# Centered data: the column means are broadcast and subtracted from every row.
toy - mean

In [ ]:
# Project-local PCA on the toy data.
# The second positional argument is presumably `dimensions` (the keyword used
# for the MNIST run further down) -- confirm against Pca's signature.
t = Pca(toy, 2, verbose=True)

In [ ]:
# Presumably populates t.sigma, t.eigenvals and t.eigenvects, which the
# comparison cells below read -- confirm in pca.py.
t.calc_eigen_stuff()

Check with sklearn


In [ ]:
# Fit scikit-learn's PCA on the same toy matrix to serve as a reference.
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(toy)

Compare Sigma, the covariance matrix:


In [ ]:
# Covariance matrix from the project-local Pca.
t.sigma

In [ ]:
# scikit-learn's covariance estimate. A covariance matrix is symmetric, so the
# trailing .T should not change the displayed values.
# NOTE(review): scikit-learn normalises by (n_samples - 1); confirm Pca uses the
# same divisor, otherwise the two matrices differ by a constant factor.
pca.get_covariance().T

Compare the eigenvectors:

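Note that scikit-learn returns its eigenvectors transposed: each row of `components_` is one principal axis (shape `[n_components, n_features]`).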


In [ ]:
# From the scikit-learn documentation:
# components_ : array, [n_components, n_features]
# Principal axes in feature space, representing the directions of
# maximum variance in the data. The components are sorted by
# explained_variance_.
pca.components_

In [ ]:
# Print floats in fixed-point notation instead of scientific notation so the
# arrays below are easier to compare by eye.
np.set_printoptions(suppress=True)

In [ ]:
# Same array as above, re-displayed with the new print options.
pca.components_

In [ ]:
# Eigenvectors from the project-local Pca, for comparison.
# NOTE(review): eigenvectors are only defined up to sign, so a matching vector
# may appear multiplied by -1 relative to scikit-learn's.
t.eigenvects

Compare the eigenvalues:


In [ ]:
# Eigenvalues from the project-local Pca.
t.eigenvals

In [ ]:
# scikit-learn's variance explained by each component, which its documentation
# describes as the eigenvalues of the covariance matrix of the data.
pca.explained_variance_

Run on the real MNIST data


In [ ]:
# Size of the first axis of train_X (presumably the number of training images).
# NOTE(review): X_num is not read by any cell visible in this notebook.
X_num = train_X.shape[0]
# Project-local PCA on the MNIST training data, keeping the labels alongside.
train = Pca(train_X, dimensions=50, y=train_y)

In [ ]:
# Presumably computes train.sigma, train.eigenvals and train.eigenvects, which
# later cells read -- confirm in pca.py.
train.calc_eigen_stuff()

In [ ]:
# Peek at the first ten eigenvalues.
train.eigenvals[0:10]

In [ ]:
train.sigma

In [ ]:
train.sum_of_top_eigenvalues()

In [ ]:
# Shell escape: list the directory the figures are written to.
! ls '../figures'

In [ ]:
# Presumably fills train.fractional_reconstruction_df, shown in the next cell --
# confirm in pca.py.
train.fractional_reconstruction_error()

In [ ]:
train.fractional_reconstruction_df.head(2)

In [ ]:
# Fractional reconstruction error over the range start=0 .. stop=50.
plot_fractional_reconstruction_error(train, start=0, stop=50, 
                                     title ='Fractional Reconstruction Error')

In [ ]:
# Same plot starting at 2 instead of 0.
plot_fractional_reconstruction_error(train, start=2, stop=50, 
                                     title ='Fractional Reconstruction Error')

In [ ]:
# Sanity check: the slice really contains 50 eigenvalues.
len(train.eigenvals[0:50])

In [ ]:
# Sum of the first 50 eigenvalues.
np.sum(train.eigenvals[0:50])

In [ ]:
# First 50 eigenvalues rounded to one decimal place.
np.round(train.eigenvals.copy(),1)[0:50]

In [ ]:
# Plot the first 50 eigenvalues held by `train` and save the figure as a PDF.
fig, ax = plt.subplots(1, 1, figsize=(3.5,3))
# Show y tick labels as integers with thousands separators.
ax.get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
# np.round returns a new array, so no explicit copy is needed.
plot_y = np.round(train.eigenvals[0:50], 1)
# Number the eigenvalues from 1; sized from plot_y so x and y always match.
plot_x = range(1, len(plot_y) + 1)
ax.plot(plot_x, plot_y, linestyle='--', marker='o', color='b')
ax.set_title('first 50 eigenvalues')
ax.set_xlabel('eigenvalue')
# This was a second plt.xlabel call, which overwrote the x label above and
# left the y axis without a label.
ax.set_ylabel('value')
fig.tight_layout()
fig.savefig('../figures/Q-1-2-1_first_50_eigenvalues.pdf')

In [ ]:
# Persist the covariance matrix, eigenvalues and eigenvectors as .npy files.
# np.save does not create missing directories, so ./data must already exist.
np.save("./data/Q-1-2_sigma.npy", train.sigma)

In [ ]:
np.save("./data/Q-1-2_eigenvalues.npy", train.eigenvals)

In [ ]:
np.save("./data/Q-1-2_eigenvectors.npy", train.eigenvects)

In [ ]:
# Check the dimensions of the eigenvector array that was just saved.
train.eigenvects.shape

In [ ]:
# Display the object's repr.
train

In [ ]:
import pickle

# Serialize the whole Pca object to disk.
# The context manager closes and flushes the file even if pickling raises;
# the original passed a bare open() and never closed the handle.
with open('./data/PCA_training_data.pickle', "wb") as pickle_file:
    pickle.dump(train, pickle_file)

In [ ]:
# Shell escape: confirm the pickle was written and check its size.
! ls -l ./data/PCA_training_data.pickle

In [ ]:
# Presumably locates the first training sample labelled 0 -- confirm in pca.py.
train.find_first(0)

In [ ]:
# The result is used below to index train.y and train.X; presumably one
# sample index for each of the digits 0 through 4.
train.find_0_through_4()

In [ ]:
# Labels at the selected indices.
train.y[train.find_0_through_4()]

In [ ]:
# Shape of the selected rows of the data matrix.
train.X[train.find_0_through_4()].shape

In [ ]:
# For each component count k, transform the sample digits (presumably the
# digits 0 through 4, per the file name) and save the result to its own .npy
# file. The {} placeholder in the file name is filled with k.
for k in [2, 5, 10, 20, 50]:
    trans_X = train.transform_sample_digits(n_components = k)
    filename = "./data/digits_0_through_4_transformed--{}_components.npy".format(k)
    np.save(filename, trans_X)

In [ ]:
# Transform the data currently held by `train` using 50 components
# (presumably a projection onto the first 50 principal components -- confirm in pca.py).
X_transformed_50 = train.transform_all_digits_down(n_components = 50)

In [ ]:
# Save the transformed training data; ./data must already exist.
np.save('./data/X_transformed_by_50_components.npy', X_transformed_50)

Transform the test data for SGD later


In [ ]:
# Load the MNIST test split without shuffling.
X_test, y_test = mnist_testing(shuffled=False)

In [ ]:
# Every test sample must have exactly one label.
assert X_test.shape[0] == y_test.shape[0]

In [ ]:
X_test.shape

In [ ]:
train.X = X_test

In [ ]:
train.X.shape

In [ ]:
X_test_transformed_50 = train.transform_all_digits_down(n_components = 50)

In [ ]:
# Check the dimensions of the transformed test data.
X_test_transformed_50.shape

In [ ]:
# Save the transformed test data; ./data must already exist.
np.save('./data/X_test_transformed_by_50_components.npy', X_test_transformed_50)

In [ ]: