Question 1.2 PCA


In [1]:
import sys
print(sys.version)


3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]

In [2]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import time

import pandas as pd
import seaborn as sns

In [3]:
! pwd


/Users/janet/Machine_Learning_CSE_546/HW4/notebooks

In [4]:
! ls ../../HW3/code/


__pycache__                          least_squares_sgd.py
classification_base.py               logistic_regression.py
hyperparameter_explorer.py           mnist_helpers.py
hyperparameter_explorer_semi-orig.py not_updated
k_means.py                           pca.py
kernel.py

In [5]:
import sys
sys.path.append('../../HW3/code/')

In [6]:
from pca import Pca, plot_fractional_reconstruction_error

In [7]:
from classification_base import MNIST_PATH
from mnist_helpers import mnist_training, mnist_testing

In [8]:
train_X, train_y = mnist_training(shuffled=False)
#test_X, test_y = mnist_testing(shuffled=True)

PCA on the MNIST data


In [9]:
X_num = train_X.shape[0]
train = Pca(train_X, dimensions=50, y=train_y, center=False)

In [10]:
train.calc_eigen_stuff()


........................................ (progress output from calc_eigen_stuff, truncated)

In [11]:
train.eigenvals[0:10]


Out[11]:
array([ 2476871.87792173,   285468.25323556,   243256.98911909,
         209751.69652609,   181172.2224198 ,   148084.56338301,
         124745.98943206,    99889.15588382,    98709.32708764,
          81022.15396414])
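
Aside: pca.py is local HW3 code, so calc_eigen_stuff()'s internals are not shown in this notebook. As a rough sketch only, and assuming it eigendecomposes the uncentered scatter matrix X^T X (consistent with center=False above and the magnitude of the leading eigenvalue), the values in Out[11] could be reproduced with plain numpy:

# Sketch, assuming calc_eigen_stuff() eigendecomposes the uncentered scatter matrix:
scatter = train_X.T.dot(train_X)          # (784, 784)
vals, vects = np.linalg.eigh(scatter)     # eigh returns eigenvalues in ascending order
order = np.argsort(vals)[::-1]            # reorder to descending
vals, vects = vals[order], vects[:, order]
print(vals[:10])                          # should resemble Out[11]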

In [12]:
train.sigma


Out[12]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
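
The zeros above are just the corners of numpy's truncated repr: the border pixels of MNIST images are constantly zero, so the outermost rows and columns of train.sigma are zero while the interior is not. A quick sanity check (hedged, assuming sigma is the matrix whose eigenvalues are reported above):

# The matrix cannot actually be all zeros, since its leading eigenvalue is ~2.5e6.
print(np.abs(train.sigma).max())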

In [13]:
train.sum_of_top_eigenvalues()


Out[13]:
5108096.3418362727
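
sum_of_top_eigenvalues() is also local code; assuming "top" refers to the dimensions=50 passed to the constructor, it should agree with the explicit sum computed in In [20] below (both print 5108096.34...):

# Hedged check: "top" is assumed to mean the 50 components requested above.
assert np.isclose(train.sum_of_top_eigenvalues(), np.sum(train.eigenvals[0:50]))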

In [14]:
! ls '../figures'


Q-1-2-2_0_to_50.pdf        Q-2-1_weight_visualization
Q-1-2-2_2_to_50.pdf

In [15]:
train.fractional_reconstruction_error()

In [16]:
train.fractional_reconstruction_df.head(2)


Out[16]:
   fractional reconstruction  k
0                    0.566210  1
0                    0.516215  2
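
These values are consistent with defining the fractional reconstruction error at rank k as one minus the fraction of total eigenvalue mass captured by the top k eigenvalues. A sketch (the exact definition lives in pca.py):

# Sketch, assuming fractional reconstruction error = 1 - sum(top-k eigvals) / sum(all eigvals)
total = np.sum(train.eigenvals)
frac_err = [1.0 - np.sum(train.eigenvals[:k]) / total for k in range(1, 51)]
print(frac_err[:2])   # ~0.5662, ~0.5162, matching the table above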

In [17]:
plot_fractional_reconstruction_error(train, start=0, stop=50, 
                                     title ='Fractional Reconstruction Error')


Out[17]:
[figure: Fractional Reconstruction Error, components 0 to 50]

In [18]:
plot_fractional_reconstruction_error(train, start=2, stop=50, 
                                     title ='Fractional Reconstruction Error')


Out[18]:
[figure: Fractional Reconstruction Error, components 2 to 50]

In [19]:
len(train.eigenvals[0:50])


Out[19]:
50

In [20]:
np.sum(train.eigenvals[0:50])


Out[20]:
5108096.3418362727

In [21]:
np.round(train.eigenvals.copy(),1)[0:50]


Out[21]:
array([ 2476871.9,   285468.3,   243257. ,   209751.7,   181172.2,
         148084.6,   124746. ,    99889.2,    98709.3,    81022.2,
          72880.3,    70324.1,    58836.2,    58486.5,    54547.7,
          51070.4,    46834.4,    44423.4,    40782. ,    39522.8,
          36556.4,    34523.6,    32726.1,    31359.9,    30316.6,
          28799.5,    27920.1,    26962.9,    25534.3,    23690.4,
          22575. ,    22228.6,    20672.4,    20249.7,    19546.9,
          18661.9,    17341.1,    16726.5,    16507. ,    16205.2,
          15678.8,    15254.2,    14407.1,    13655.3,    13216.7,
          12864. ,    12414.6,    12060.1,    11684.5,    11076.8])

In [22]:
fig, ax = plt.subplots(1, 1, figsize=(3.5,3))
ax.get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plot_x = range(1, 51)
plot_y = np.round(train.eigenvals.copy(),1)[0:50]
#plot_data = pd.DataFrame({'eigenvalue':np.round(train.eigenvals.copy(),1)[0:50], 
#              'number': range(1,50+1)}).plot.scatter(x='number', y='eigenvalue')
plt.plot(plot_x, plot_y, linestyle='--', marker='o', color='b')
plt.title('first 50 eigenvalues')
plt.xlabel('eigenvalue number')
plt.ylabel('value')
plt.tight_layout()
#fig.savefig('../figures/Q-1-2-1_first_50_eigenvalues.pdf')


np.save("./data/Q-0_PCA_sigma.npy", train.sigma)

In [23]:
np.save("../data/Q-0_PCA_eigenvalues_uncentered.npy", train.eigenvals)

In [24]:
np.save("../data/Q-0_PCA_eigenvectors_uncentered.npy", train.eigenvects)

In [25]:
train.eigenvects.shape


Out[25]:
(784, 784)

In [26]:
train


Out[26]:
<pca.Pca at 0x10e28c550>

In [27]:
import pickle
with open('../data/PCA_training_data_uncentered.pickle', 'wb') as f:
    pickle.dump(obj=train, file=f)
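
For later reuse, the fitted object can be reloaded without recomputing the eigendecomposition (usage sketch):

with open('../data/PCA_training_data_uncentered.pickle', 'rb') as f:
    train = pickle.load(f)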

In [28]:
! ls -l ../data/*.pickle


-rw-r--r--  1 janet  staff  386247310 Nov 21 07:51 ../data/PCA_training_data.pickle
-rw-r--r--  1 janet  staff  386247310 Dec  5 07:22 ../data/PCA_training_data_uncentered.pickle

Save transformed versions of the training and test data

Train


In [29]:
X_transformed_50 = train.transform_all_digits_down(n_components = 50)

In [30]:
X_transformed_50.shape


Out[30]:
(60000, 50)
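
transform_all_digits_down() is part of the local pca.py; a plausible equivalent, assuming the eigenvectors are stored as columns ordered by decreasing eigenvalue:

# Sketch only: project onto the top 50 principal directions.
V_50 = train.eigenvects[:, 0:50]     # (784, 50)
X_proj = train_X.dot(V_50)           # (60000, 50), should match X_transformed_50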

In [31]:
np.save('../data/X_transformed_by_50_components_uncentered.npy', X_transformed_50)

Test


In [32]:
X_test, y_test = mnist_testing(shuffled=False)

In [33]:
assert X_test.shape[0] == y_test.shape[0]

In [34]:
X_test.shape


Out[34]:
(10000, 784)

In [35]:
train.X = X_test

In [36]:
train.X.shape


Out[36]:
(10000, 784)

In [37]:
X_test_transformed_50 = train.transform_all_digits_down(n_components = 50)

In [38]:
X_test_transformed_50.shape


Out[38]:
(10000, 50)
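
Reassigning train.X reuses the eigenvectors fit on the training set, which is the right treatment of test data (PCA is never refit on the test set). An equivalent one-liner under the same column-eigenvector assumption as above:

# Sketch: project the test digits with the training eigenvectors.
X_test_proj = X_test.dot(train.eigenvects[:, 0:50])   # (10000, 50)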

In [39]:
np.save('../data/X_test_transformed_by_50_components_uncentered.npy', X_test_transformed_50)
