In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import sklearn.datasets
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [8, 4]
In [2]:
iris = sklearn.datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
X_zscaled = (X - X.mean()) / X.std(ddof=1)
Y = pd.DataFrame(iris.target, columns=['target'])
Y['species'] = Y.target.apply(lambda t: iris.target_names[t])
Using $corr(X)$, check that we have some level of multicollinearity in the data, enough to warrant using PCA for visualization in a reduced-rank principal component space. As a rule of thumb, the off-diagonal correlation values (in either the upper or lower triangle) should have absolute values of at least roughly 0.30.
In [3]:
corr = X_zscaled.corr()
tmp = np.triu(corr) - np.eye(corr.shape[0])
tmp = tmp.flatten()
tmp = tmp[np.nonzero(tmp)]
tmp = pd.Series(np.abs(tmp))
print('Correlation matrix:\n\n{}\n\n'.format(corr.values))
print('Multicollinearity check using off-diagonal values:\n\n{}'.format(tmp.describe()))
From the factorization of a symmetric matrix $S$ into an orthogonal matrix $Q$ of eigenvectors and a diagonal matrix $\Lambda$ of eigenvalues, we can likewise decompose $cov(X)$ (or, in our case, $corr(X)$, since we standardized our data).
\begin{align} S &= Q \Lambda Q^\intercal \\ \Rightarrow X^\intercal X &= V \Lambda V^\intercal \\ \end{align}where $V$ contains the orthonormal eigenvectors of $X^\intercal X$.
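As a quick sanity check on $S = Q \Lambda Q^\intercal$, we can eigen-decompose the correlation matrix computed above and confirm that the factors reproduce it and that $Q$ is orthogonal. A minimal sketch (the names lam_chk and Q_chk are illustrative only):
In [ ]:
# eigh is NumPy's symmetric-matrix eigensolver; corr comes from the cell above
lam_chk, Q_chk = np.linalg.eigh(corr.values)
S_rebuilt = Q_chk @ np.diag(lam_chk) @ Q_chk.T
print(np.allclose(S_rebuilt, corr.values))      # reconstruction: expect True
print(np.allclose(Q_chk.T @ Q_chk, np.eye(4)))  # orthogonality: expect True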
We can normalize the eigenvalues to see how much variance is captured per each respective principal component. We will also calculate the cumulative variance explained. This will help inform our decision of how many principal components to keep when reducing dimensions in visualizing the data.
In [4]:
eigenvalues, eigenvectors = np.linalg.eig(X_zscaled.cov())
# np.linalg.eig guarantees no ordering, so sort eigenpairs by descending eigenvalue
order = np.argsort(eigenvalues)[::-1]
eigenvalues, eigenvectors = eigenvalues[order], eigenvectors[:, order]
eigenvalues_normalized = eigenvalues / eigenvalues.sum()
cumvar_explained = np.cumsum(eigenvalues_normalized)
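For reference, the per-component and cumulative variance can be printed directly; a minimal sketch using the arrays just computed:
In [ ]:
for i, (var_i, cum_i) in enumerate(zip(eigenvalues_normalized, cumvar_explained), 1):
    print('PC{}: {:0.3f} of variance, {:0.3f} cumulative'.format(i, var_i, cum_i))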
In [5]:
T = pd.DataFrame(X_zscaled.dot(eigenvectors))
# set column names
T.columns = ['pc1', 'pc2', 'pc3', 'pc4']
# also add the species label as a column
T = pd.concat([T, Y.species], axis=1)
We can visualize the original 4D iris data $X$ by projecting it onto the first $k$ eigenvectors, giving a reduced-rank, $k$-dimensional principal component representation.
\begin{align} T_{rank=k} &= X V_{rank=k} \end{align}
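The projection is just a matrix product, so we can check it explicitly. A minimal sketch, assuming the eigenvectors and score matrix $T$ computed above (the name T_k is illustrative only):
In [ ]:
k = 2
# project the standardized data onto the first k eigenvectors
T_k = X_zscaled.values @ eigenvectors[:, :k]
print(T_k.shape)                                   # (150, 2)
print(np.allclose(T_k, T[['pc1', 'pc2']].values))  # matches the full score matrix: expect True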
In [6]:
# let's try using the first 2 principal components
k = 2
# divide T by label
irises = [T[T.species=='setosa'],
          T[T.species=='versicolor'],
          T[T.species=='virginica']]
# define a color-blind friendly set of quantative colors
# for each species
colors = ['#1b9e77', '#d95f02', '#7570b3']
_, (ax1, ax2) = plt.subplots(1, 2, sharey=False)
# plot principal component vis-a-vis total variance in data
ax1.plot([1,2,3,4],
         eigenvalues_normalized,
         '-o',
         color='#8da0cb',
         label='eigenvalue (normalized)',
         alpha=0.8,
         zorder=1000)
ax1.plot([1,2,3,4],
         cumvar_explained,
         '-o',
         color='#fc8d62',
         label='cum. variance explained',
         alpha=0.8,
         zorder=1000)
ax1.set_xlim(0.8, 4.2)
ax1.set_xticks([1,2,3,4])
ax1.set_xlabel('Principal component')
ax1.set_ylabel('Variance')
ax1.legend(loc='center right', fontsize=7)
ax1.grid(color='#fdfefe')
ax1.set_facecolor('#f4f6f7')
# plot the reduced-rank score matrix representation
for group, color in zip(irises, colors):
    ax2.scatter(group.pc1,
                group.pc2,
                marker='^',
                color=color,
                label=group.species.iloc[0],
                alpha=0.5,
                zorder=1000)
ax2.set_xlabel(r'PC 1')
ax2.set_ylabel(r'PC 2')
ax2.grid(color='#fdfefe')
ax2.set_facecolor('#f4f6f7')
ax2.legend(labels=iris.target_names, fontsize=7)
plt.suptitle(r'Fig. 1: PCA via eigen-decomposition, 2D visualization')
plt.tight_layout(pad=3.0)
plt.show()
In [7]:
# norm of each original feature's loadings across the first k eigenvectors,
# normalized so the weights sum to 1
feature_norms = np.linalg.norm(eigenvectors[:, 0:k], axis=1)
feature_weights = feature_norms / feature_norms.sum()
msg = ('Using {} principal components, '
'the original features are represented with the following weights:')
print(msg.format(k))
for feature, weight in zip(iris.feature_names, feature_weights):
    print('- {}: {:0.3f}'.format(feature, weight))
Singular value decomposition factors any matrix $A$ into the left-singular vector matrix $U$, the diagonal matrix of singular values $\Sigma$, and the right-singular vector matrix $V$.
\begin{align} A &= U \Sigma V^\intercal \\ \end{align}If we start with
\begin{align} X &= U \Sigma V^\intercal \\ \\ X^\intercal X &= (U \Sigma V^\intercal)^\intercal U \Sigma V^\intercal \\ &= V \Sigma^\intercal U^\intercal U \Sigma V^\intercal \\ &= V \Sigma^\intercal \Sigma V^\intercal \\ \\ \Rightarrow \Sigma^\intercal \Sigma &= \Lambda \end{align}
In [8]:
U, S, Vt = np.linalg.svd(X_zscaled)
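Numerically, the squared singular values should recover the eigenvalues. Since In [4] decomposed the covariance matrix, i.e. $X^\intercal X / (n-1)$, we rescale by $n-1$ before comparing; a minimal sketch using the arrays computed above:
In [ ]:
n = X_zscaled.shape[0]
# squared singular values of X, divided by n-1, equal the covariance eigenvalues
print(np.allclose(np.sort(S**2 / (n - 1))[::-1],
                  np.sort(eigenvalues)[::-1]))  # expect True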
In [9]:
print(eigenvectors)
In [10]:
print(Vt.T)
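The right-singular vectors should agree with the eigenvectors from In [4], but only up to a sign flip and possibly a different column ordering, since neither is unique in those respects. A minimal sketch of a sign- and order-agnostic comparison (the name overlap is illustrative only):
In [ ]:
# each right-singular vector should align, up to sign, with exactly one eigenvector,
# so every row of |eigenvectors^T V| should contain an entry close to 1
overlap = np.abs(eigenvectors.T @ Vt.T)
print(np.allclose(overlap.max(axis=1), 1.0))  # expect True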
In [11]:
Vt.T.shape
Out[11]:
(4, 4)