In [1]:
#import libs, load data

%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Download the lab dataset as a DataFrame (requires network access).
df_load = pd.read_csv('https://nthu-datalab.github.io/ml/labs/02_EDA_PCA/gen_dataset.csv')

# Split into the feature matrix and the target column.
# `columns=` is used instead of the positional axis form `drop(label, 1)`:
# passing `axis` positionally was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
X_load = df_load.drop(columns='Class label')
Y_load = df_load['Class label']

# Show the first rows as the cell's rich output.
df_load.head()


Out[1]:
Class label a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
0 2.0 -0.016488 -1.310538 -1.552489 -0.785475 1.548429 0.476687 1.090010 -0.351870 -0.000855 -1.932941 0.499177 0.149137 -0.640413 -0.782951 -0.903561
1 0.0 -0.844201 -1.235142 -0.624408 1.502470 -0.079536 1.482053 1.178544 -1.150090 -1.040124 -1.041435 0.281037 -0.283710 -1.176802 0.718408 -0.392095
2 0.0 -0.181053 0.039422 -0.307827 0.162256 -1.283705 0.541288 0.019113 -0.470718 -1.045754 0.983150 -0.121205 -0.189225 -0.539178 0.825261 0.612889
3 2.0 -0.423555 -1.598754 1.597206 -0.239330 1.443564 2.657538 1.824393 -1.809287 1.058634 -4.058539 0.255908 -0.952422 -0.315551 1.854246 -2.369018
4 2.0 -0.499408 -0.814229 -0.178777 -1.757823 0.678134 3.552825 1.483069 -2.341943 2.155062 -4.380612 -0.239352 -1.730919 0.586125 3.902178 -2.891653

In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Z-normalize the features
sc = StandardScaler()
Z = sc.fit_transform(X_load)

# Estimate the correlation matrix from the standardized data
R = np.dot(Z.T, Z) / df_load.shape[0]

# Calculate the eigenvalues and eigenvectors (eigh: R is symmetric)
eigen_vals, eigen_vecs = np.linalg.eigh(R)

# Make a list of (eigenvalue, eigenvector) tuples; column i of eigen_vecs
# is the eigenvector paired with eigen_vals[i]
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low.
# The key restricts the comparison to the eigenvalue: without it, two equal
# eigenvalues make Python fall back to comparing the ndarray members of the
# tuples, which raises "ValueError: The truth value of an array ... is ambiguous".
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Projection matrix from the raw-data dimension to 2 dimensions:
# the two leading eigenvectors stacked as columns
W_2D = np.column_stack([vec for _, vec in eigen_pairs[:2]])

# Projection matrix from the raw-data dimension to 3 dimensions:
# the three leading eigenvectors stacked as columns
W_3D = np.column_stack([vec for _, vec in eigen_pairs[:3]])

In [8]:
import os
import seaborn as sns
sns.set(style='whitegrid', context='notebook')

# Import Axes3D for plotting the 3D scatter
from mpl_toolkits.mplot3d import Axes3D

# Calculate the projected data (2D and 3D)
Z_pca2 = Z.dot(W_2D)
Z_pca3 = Z.dot(W_3D)

# Plot settings: colors/markers are paired with the sorted unique class labels
colors = ['r', 'b', 'g']
markers = ['s', 'x', 'o']
fig = plt.figure(figsize=(12, 6))

# Plot 2D (left panel). Titles, labels and legend are set on the Axes object
# itself rather than through pyplot's implicit "current axes".
plt2 = fig.add_subplot(1, 2, 1)
for label, color, marker in zip(np.unique(Y_load), colors, markers):
    # Boolean row mask for this class, computed once per class
    mask = np.asarray(Y_load == label)
    plt2.scatter(Z_pca2[mask, 0],
                 Z_pca2[mask, 1],
                 c=color, label=label, marker=marker)

plt2.set_title('Z_pca 2D')
plt2.set_xlabel('PC 1')
plt2.set_ylabel('PC 2')
plt2.legend(loc='lower left')

# Plot 3D (right panel)
plt3 = fig.add_subplot(1, 2, 2, projection='3d')
for label, color, marker in zip(np.unique(Y_load), colors, markers):
    mask = np.asarray(Y_load == label)
    plt3.scatter(Z_pca3[mask, 0],
                 Z_pca3[mask, 1],
                 Z_pca3[mask, 2],
                 c=color, label=label, marker=marker)

plt3.set_title('Z_pca 3D')
plt3.set_xlabel('PC 1')
plt3.set_ylabel('PC 2')
plt3.set_zlabel('PC 3')
plt3.legend(loc='lower right')

# Lay out both panels once, after everything has been drawn
fig.tight_layout()

# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the existence check and the makedirs call
os.makedirs('./output', exist_ok=True)
fig.savefig('./output/fig-pca-2-3-z.png', dpi=300)
plt.show()



In [ ]:


In [ ]: