In [ ]:
import sys
print(sys.version)
In [ ]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import time
import pandas as pd
import seaborn as sns
In [ ]:
# Add the sibling ../code/ directory to the module search path.
# Presumably the project modules imported in the following cells
# (pca, classification_base, mnist_helpers) live there -- TODO confirm.
import sys
sys.path.append('../code/')
In [ ]:
from pca import Pca, plot_fractional_reconstruction_error
In [ ]:
from classification_base import MNIST_PATH
from mnist_helpers import mnist_training, mnist_testing
In [ ]:
# Load the MNIST training set with shuffling disabled.
# train_X is later handed to Pca as the data matrix and train_y as `y`.
train_X, train_y = mnist_training(shuffled=False)
# The test set is not loaded here; a later cell loads it with shuffled=False.
#test_X, test_y = mnist_testing(shuffled=True)
In [ ]:
# Tiny 4x3 float matrix used to sanity-check the PCA code by hand.
toy = np.array(
    [
        [1, 2, 3],
        [0, 1, 2],
        [2, 1, 0],
        [0, 2, 1],
    ],
    dtype=float,
)
In [ ]:
# Per-column mean of the toy matrix: column sums divided by the row count.
mean = np.sum(toy, axis=0) / len(toy)
mean
In [ ]:
toy
In [ ]:
toy - mean
In [ ]:
# Build the project's Pca helper on the toy matrix with verbose output on.
# The positional 2 is presumably the `dimensions` argument (it is passed by
# keyword for the training set further down) -- TODO confirm against pca.py.
t = Pca(toy, 2, verbose=True)
In [ ]:
t.calc_eigen_stuff()
Check the results against scikit-learn:
In [ ]:
from sklearn.decomposition import PCA
# Fit scikit-learn's PCA (default arguments) on the same toy matrix; its
# covariance, components and explained variance are compared with `t` below.
pca = PCA()
pca.fit(toy)
Compare Sigma, the covariance matrix:
In [ ]:
t.sigma
In [ ]:
pca.get_covariance().T
Compare the eigenvectors:
Note that scikit-learn stores its eigenvectors transposed relative to ours: each row of `components_` is one principal axis.
In [ ]:
# components_ : array, [n_components, n_features]
# Principal axes in feature space, representing the directions of
# maximum variance in the data. The components are sorted by
# explained_variance_.
pca.components_
In [ ]:
np.set_printoptions(suppress=True)
In [ ]:
pca.components_
In [ ]:
t.eigenvects
Compare the eigenvalues:
In [ ]:
t.eigenvals
In [ ]:
pca.explained_variance_
In [ ]:
# Length of the first axis of train_X (presumably the number of training
# samples). NOTE(review): X_num is not used by any cell visible here.
X_num = train_X.shape[0]
# Build the Pca helper on the full training data, keeping the labels as `y`.
# dimensions=50 presumably sets how many components are retained -- TODO confirm.
train = Pca(train_X, dimensions=50, y=train_y)
In [ ]:
train.calc_eigen_stuff()
In [ ]:
train.eigenvals[0:10]
In [ ]:
train.sigma
In [ ]:
train.sum_of_top_eigenvalues()
In [ ]:
! ls '../figures'
In [ ]:
train.fractional_reconstruction_error()
In [ ]:
train.fractional_reconstruction_df.head(2)
In [ ]:
plot_fractional_reconstruction_error(train, start=0, stop=50,
title ='Fractional Reconstruction Error')
In [ ]:
plot_fractional_reconstruction_error(train, start=2, stop=50,
title ='Fractional Reconstruction Error')
In [ ]:
len(train.eigenvals[0:50])
In [ ]:
np.sum(train.eigenvals[0:50])
In [ ]:
np.round(train.eigenvals.copy(),1)[0:50]
In [ ]:
# Plot the first 50 eigenvalues of the training-set PCA and save the figure
# as a PDF under ../figures/.
fig, ax = plt.subplots(1, 1, figsize=(3.5, 3))
# Render y tick labels as integers with thousands separators.
ax.yaxis.set_major_formatter(
    mpl.ticker.FuncFormatter(lambda value, _pos: format(int(value), ',')))
# np.round already returns a new array, so no explicit copy is needed.
plot_y = np.round(train.eigenvals[:50], 1)
# 1-based eigenvalue index, sized from plot_y so x and y always match.
plot_x = range(1, len(plot_y) + 1)
ax.plot(plot_x, plot_y, linestyle='--', marker='o', color='b')
ax.set_title('first 50 eigenvalues')
ax.set_xlabel('eigenvalue')
# 'value' belongs on the y axis; setting it as a second x label would
# overwrite 'eigenvalue' and leave the y axis unlabeled.
ax.set_ylabel('value')
fig.tight_layout()
fig.savefig('../figures/Q-1-2-1_first_50_eigenvalues.pdf')
In [ ]:
np.save("./data/Q-1-2_sigma.npy", train.sigma)
In [ ]:
np.save("./data/Q-1-2_eigenvalues.npy", train.eigenvals)
In [ ]:
np.save("./data/Q-1-2_eigenvectors.npy", train.eigenvects)
In [ ]:
train.eigenvects.shape
In [ ]:
train
In [ ]:
import pickle

# Persist the `train` Pca object (eigen-decomposition already computed in an
# earlier cell) so it can be reloaded without recomputing.
# The context manager guarantees the file is flushed and closed even if
# pickling raises part-way through.
with open('./data/PCA_training_data.pickle', "wb") as pickle_file:
    pickle.dump(obj=train, file=pickle_file)
In [ ]:
! ls -l ./data/PCA_training_data.pickle
In [ ]:
train.find_first(0)
In [ ]:
train.find_0_through_4()
In [ ]:
train.y[train.find_0_through_4()]
In [ ]:
train.X[train.find_0_through_4()].shape
In [ ]:
# For each component count, transform the sample digits with that many
# components and save the result to its own .npy file under ./data/.
# All three statements belong to the loop body: the filename depends on k and
# the save depends on the filename.
for k in [2, 5, 10, 20, 50]:
    trans_X = train.transform_sample_digits(n_components=k)
    filename = "./data/digits_0_through_4_transformed--{}_components.npy".format(k)
    np.save(filename, trans_X)
In [ ]:
X_transformed_50 = train.transform_all_digits_down(n_components = 50)
In [ ]:
np.save('./data/X_transformed_by_50_components.npy', X_transformed_50)
In [ ]:
X_test, y_test = mnist_testing(shuffled=False)
In [ ]:
assert X_test.shape[0] == y_test.shape[0]
In [ ]:
X_test.shape
In [ ]:
# Replace the data held by the already-computed `train` Pca object with the
# test set, so the transform call in a later cell operates on X_test.
# NOTE(review): this overwrites the training data on `train` in place, and
# presumably the eigenvectors from the training fit are reused -- confirm that
# transform_all_digits_down does not re-center with statistics of the new X.
train.X = X_test
In [ ]:
train.X.shape
In [ ]:
X_test_transformed_50 = train.transform_all_digits_down(n_components = 50)
In [ ]:
X_test_transformed_50.shape
In [ ]:
np.save('./data/X_test_transformed_by_50_components.npy', X_test_transformed_50)
In [ ]: