The sheer size of data in the modern age is not only a challenge for computer hardware but also a major bottleneck for the performance of many machine learning algorithms. The main goal of a PCA analysis is to identify patterns in data: PCA aims to detect the correlation between variables. Attempting to reduce the dimensionality only makes sense if a strong correlation between variables exists. In a nutshell, this is what PCA is all about: finding the directions of maximum variance in high-dimensional data and projecting the data onto a lower-dimensional subspace while retaining most of the information.
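As a toy illustration of that idea (separate from the iris analysis below), the following sketch builds two strongly correlated features, finds the eigenvector of their covariance matrix with the largest eigenvalue (the direction of maximum variance), and projects the standardized data onto that single direction. All names here are illustrative only.
In [ ]:
import numpy as np

np.random.seed(0)

# two strongly correlated features
x = np.random.randn(200)
X = np.column_stack([x, 2 * x + 0.5 * np.random.randn(200)])

# standardize, then find the direction of maximum variance
X_std = (X - X.mean(axis=0)) / X.std(axis=0)
eig_val, eig_vec = np.linalg.eigh(np.cov(X_std, rowvar=False))
first_pc = eig_vec[:, np.argmax(eig_val)]

# project the 2-D data onto a 1-D subspace
projected = X_std @ first_pc
print(projected.shape)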
In [ ]:
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import datasets
from sklearn import preprocessing  # makes sklearn.preprocessing available below
from sklearn.decomposition import PCA
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
In [ ]:
iris = sns.load_dataset('iris')
iris.loc[:, 'species'] = (iris.loc[:, 'species']
.astype('category'))
iris.info()
iris.head()
In [ ]:
ax_formatter = {
'billions': FuncFormatter(lambda x, position: f'{x * 1e-9:.0f}'),
'millions': FuncFormatter(lambda x, position: f'{x * 1e-6:.0f}'),
'percent_convert': FuncFormatter(lambda x, position: f'{x * 100:.0f}%'),
'percent': FuncFormatter(lambda x, position: f'{x * 100:.0f}%'),
'thousands': FuncFormatter(lambda x, position: f'{x * 1e-3:.0f}'),
}
names = (
'Sepal Length',
'Sepal Width',
'Petal Length',
'Petal Width',
)
column_names = [x.replace(' ', '_').lower()
for x in names]
size = {
'label': 14,
'legend': 12,
'title': 20,
'super_title': 24,
}
Often, the desired goal is to reduce the dimensions of a d-dimensional dataset by projecting it onto a k-dimensional subspace (where k < d) in order to increase computational efficiency while retaining most of the information. An important question is: what size of k represents the data “well”?
Later, we will compute the eigenvectors (the principal components) of the dataset and collect them in a projection matrix. Each of those eigenvectors is associated with an eigenvalue, which can be interpreted as the “length” or “magnitude” of the corresponding eigenvector. If some eigenvalues have a significantly larger magnitude than others, then reducing the dataset via PCA onto a smaller-dimensional subspace by dropping the “less informative” eigenpairs is reasonable.
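To make the idea of dropping less informative eigenpairs concrete, here is a minimal sketch, assuming the standardized iris features: it sorts the eigenvalues of the covariance matrix and picks the smallest k whose cumulative explained variance reaches 95%. The 95% threshold and names such as `cum_var` are illustrative, not prescribed by the analysis below.
In [ ]:
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler

X_std = StandardScaler().fit_transform(
    sns.load_dataset('iris').drop('species', axis=1))

# eigendecomposition of the covariance matrix of the standardized data
eig_val, eig_vec = np.linalg.eig(np.cov(X_std, rowvar=False))

# sort the eigenvalues (the "magnitudes") in decreasing order
eig_val = np.sort(eig_val)[::-1]

# smallest k whose eigenpairs retain at least 95% of the total variance
cum_var = np.cumsum(eig_val) / eig_val.sum()
k = int(np.searchsorted(cum_var, 0.95)) + 1
print(cum_var)
print(k)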
In [ ]:
fig = plt.figure('Iris Violin Plot',
figsize=(12, 5), facecolor='white',
edgecolor='black')
rows, cols = (1, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1), sharey=ax0)
sns.boxplot(data=iris, width=0.4, ax=ax0)
sns.violinplot(data=iris, inner='quartile', ax=ax1)
for ax in (ax0, ax1):
ax.set_xlabel('Characteristics', fontsize=size['label'])
ax.set_xticklabels(names)
ax.set_ylabel('Centimeters $(cm)$', fontsize=size['label'])
plt.suptitle('Iris Dataset', fontsize=size['title']);
In [ ]:
fig = plt.figure('Iris Data Distribution Plots', figsize=(10, 15),
facecolor='white', edgecolor='black')
rows, cols = (4, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1))
ax2 = plt.subplot2grid((rows, cols), (1, 0))
ax3 = plt.subplot2grid((rows, cols), (1, 1))
ax4 = plt.subplot2grid((rows, cols), (2, 0))
ax5 = plt.subplot2grid((rows, cols), (2, 1))
ax6 = plt.subplot2grid((rows, cols), (3, 0))
ax7 = plt.subplot2grid((rows, cols), (3, 1))
n_bins = 40
for n, ax, data in zip(range(4), (ax0, ax2, ax4, ax6), column_names):
iris[data].plot(kind='hist', alpha=0.5, bins=n_bins, color=f'C{n}',
edgecolor='black', label='_nolegend_', ax=ax)
ax.axvline(iris[data].mean(), color='crimson', label='Mean',
linestyle='--')
ax.axvline(iris[data].median(), color='black', label='Median',
linestyle='-.')
ax.set_title(names[n])
ax.set_ylabel('Count', fontsize=size['label'])
for n, ax, data in zip(range(4), (ax1, ax3, ax5, ax7), column_names):
sns.distplot(iris[data], axlabel=False, bins=n_bins,
hist_kws={'alpha': 0.5, 'color': f'C{n}',
'edgecolor': 'black'},
kde_kws={'color': 'darkblue', 'label': 'KDE'},
ax=ax)
ax.set_title(names[n])
ax.set_ylabel('Density', fontsize=size['label'])
for ax in (ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7):
ax.legend(fontsize=size['legend'])
ax.set_xlabel('Centimeters ($cm$)', fontsize=size['label'])
plt.tight_layout()
plt.suptitle('Iris Data Distribution Plots',
fontsize=size['super_title'], y=1.03);
In [ ]:
grid = sns.pairplot(iris, diag_kind='hist',
                    diag_kws={'alpha': 0.5, 'bins': 30, 'edgecolor': 'black'},
                    hue='species', markers=['o', 's', 'D'],
                    plot_kws={'alpha': 0.7})
grid.fig.suptitle('Iris Dataset Correlation',
fontsize=size['super_title'], y=1.03)
handles = grid._legend_data.values()
labels = grid._legend_data.keys()
grid._legend.remove()
grid.fig.legend(bbox_to_anchor=(1.02, 0.5), fontsize=size['legend'],
handles=handles,
labels=[x.capitalize() for x in labels],
loc='center right')
for n in range(4):
grid.axes[3, n].set_xlabel(names[n], fontsize=size['label'])
grid.axes[n, 0].set_ylabel(names[n], fontsize=size['label'])
plt.show();
General Steps for PCA (walkthrough in R if you get stuck):
1. Standardize the d-dimensional dataset.
2. Compute the covariance matrix of the standardized data.
3. Compute the eigenvalues and eigenvectors of the covariance matrix and sort the eigenpairs in decreasing order of eigenvalue.
4. Select the k eigenvectors with the largest eigenvalues to form the projection matrix.
5. Project the standardized data onto the k-dimensional subspace.

The class should be able to:
- standardize the original data
- calculate the covariance matrix and its sorted eigenvalues and eigenvectors
- report the percentage of variance explained by each principal component
- filter the eigenpairs down to the requested number of components
- transform the data into the two-dimensional component space and plot the result
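Before building the class, here is a minimal sketch of the last two steps: forming the projection matrix from the top two eigenvectors and projecting the standardized iris data onto it. The names `W` and `Y` are illustrative only and are not part of the class below.
In [ ]:
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler

X_std = StandardScaler().fit_transform(
    sns.load_dataset('iris').drop('species', axis=1))
eig_val, eig_vec = np.linalg.eig(np.cov(X_std, rowvar=False))
order = eig_val.argsort()[::-1]

# projection matrix W (d x k): the eigenvectors with the two largest eigenvalues
W = eig_vec[:, order[:2]]

# project the standardized data onto the 2-D subspace
Y = X_std @ W
print(Y.shape)   # (150, 2)
Note that eigenvector signs are arbitrary, so the projected coordinates may be mirrored relative to scikit-learn's output; the class below flips the sign of its second component for exactly that reason.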
In [ ]:
class K2PCA:
"""
Class to perform a Principal Component Analysis.
:Attributes:
- **categories**: *pd.Series* categories of data
- **covariance**: *np.array* covariance matrix of normalized data
- **data**: *pd.DataFrame* original data
- **eigen_val**: *np.array* covariance matrix eigenvalues
- **eigen_vec**: *np.array* covariance matrix eigenvectors
    - **n_components**: *int* number of principal components to return
- **normalize**: *np.array* normalized data
- **trans_data**: *pd.DataFrame* original data transformed into the \
two dimensional component space
- **variance**: *np.array* percentage of variance by feature
"""
def __init__(self, data, categories, n_components=None):
self.categories = categories
self.data = data
self.eigen_val = None
self.eigen_vec = None
self.n_components = n_components
        self.trans_data = None
self._covariance = None
self._normalize = None
self._variance = None
@property
def covariance(self):
self.calc_normalize()
self.calc_covariance()
return self._covariance
@property
def normalize(self):
self.calc_normalize()
return self._normalize
@property
def variance(self):
self.calc_variance()
return self._variance
def __repr__(self):
        return f'K2PCA(data={self.data}, n_components={self.n_components})'
def fit(self):
"""
Standardize the data and determine the sorted eigenvalues and eigenvectors.
"""
self.calc_normalize()
self.calc_covariance()
self.calc_eigen()
def calc_covariance(self):
"""
Calculate the covariance matrix.
"""
self._covariance = np.cov(self._normalize, rowvar=False)
def calc_eigen(self):
"""
Calculate the covariance eigenvalues and eigenvectors.
.. note:: NumPy eig returns the eigen vector as a column
"""
if self._covariance is None:
self.calc_covariance()
self.eigen_val, self.eigen_vec = (np.linalg
.eig(self._covariance))
idx = self.eigen_val.argsort()[::-1]
self.eigen_val = self.eigen_val[idx]
self.eigen_vec = self.eigen_vec[:,idx].T
def calc_normalize(self, data=None):
"""
Standardize the data.
:param pd.DataFrame data: data to be normalized
"""
if data is None:
data = self.data
self._normalize = (sklearn.preprocessing
.StandardScaler()
.fit_transform(data))
def calc_variance(self):
"""
Calculate the percentage of variance by feature.
"""
if self.eigen_val is None:
self.calc_eigen()
        self._variance = self.eigen_val / self.eigen_val.sum()
def filter_components(self):
"""
Return the first n components specified by n_components attribute.
"""
if self.eigen_val is None:
self.calc_eigen()
        if self.n_components is not None:
            # eigenpairs are already sorted by decreasing eigenvalue
            self.eigen_val = self.eigen_val[:self.n_components]
            self.eigen_vec = self.eigen_vec[:self.n_components]
def plot_variance(self, save=False):
"""
Plot the feature percentage of variance per component.
:param bool save: if True the figure will be saved
"""
if self._variance is None:
self.calc_variance()
var_pct = pd.Series(self._variance)
cum_var_pct = var_pct.cumsum()
ax = (pd.concat([var_pct, cum_var_pct], axis=1)
.rename(index={x: x + 1 for x in range(var_pct.size)})
.plot(kind='bar', alpha=0.5, edgecolor='black',
figsize=(10, 5)))
ax.set_title('Dataset Components', fontsize=size['title'])
ax.legend(['Individual Variance', 'Cumulative Variance'],
fontsize=size['legend'])
ax.set_xlabel('Principal Components',
fontsize=size['label'])
ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=0)
ax.set_ylabel('Percent (%)', fontsize=size['label'])
ax.yaxis.set_major_formatter(ax_formatter['percent'])
for patch in ax.patches:
height = patch.get_height()
ax.text(x=patch.get_x() + patch.get_width() / 2,
y=height + 0.01,
s=f'{height * 100:1.1f}%',
ha='center')
if save:
            plt.savefig('variance_pct.png', bbox_inches='tight')
else:
plt.show()
def plot_transform_2d(self):
"""
Plot the original data in the 2D PCA space.
"""
if self.trans_data is None:
self.transform_2d()
        grid = sns.lmplot(x='comp_1', y='comp_2', data=self.trans_data,
hue='categories', fit_reg=False,
markers=['o', 's', 'd'], size=6)
grid.fig.suptitle('Principal Components',
fontsize=size['title'], y=1.05)
handles = grid._legend_data.values()
labels = grid._legend_data.keys()
grid._legend.remove()
grid.fig.legend(bbox_to_anchor=(1.02, 0.94),
fontsize=size['legend'], handles=handles,
labels=[x.capitalize() for x in labels],
loc='center right')
grid.axes[0, 0].set_xlabel('PCA $1^{st}$ Component',
fontsize=size['label'])
grid.axes[0, 0].set_ylabel('PCA $2^{nd}$ Component',
fontsize=size['label'])
def transform_2d(self):
"""
Transform the original data into the 2D PCA space.
"""
if self.eigen_vec is None:
self.calc_eigen()
trans = self.eigen_vec[:2].T
self.trans_data = (pd.DataFrame(self._normalize.dot(trans),
columns=['comp_1', 'comp_2'])
.assign(categories=self.categories))
        # eigenvector signs are arbitrary; flip the second component so the
        # plot matches scikit-learn's orientation
        self.trans_data.loc[:, 'comp_2'] = self.trans_data.comp_2 * -1
In [ ]:
pca = K2PCA(iris.drop('species', axis=1), iris.species)
pca.fit()
pca.plot_variance()
In [ ]:
pca.plot_transform_2d()
In [ ]:
pca = PCA(n_components=4)
pca.fit(sklearn.preprocessing
.StandardScaler()
.fit_transform(iris.drop('species', axis=1)))
var_pct = pd.Series(pca.explained_variance_ratio_)
cum_var_pct = var_pct.cumsum()
ax = (pd.concat([var_pct, cum_var_pct], axis=1)
.plot(kind='bar', alpha=0.5, edgecolor='black',
figsize=(10, 5)))
ax.set_title('Iris Data Components', fontsize=size['title'])
ax.legend(['Individual Variance', 'Cumulative Variance'],
fontsize=size['legend'])
ax.set_xlabel('Components', fontsize=size['label'])
ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=0)
ax.set_ylabel('Percent (%)', fontsize=size['label'])
ax.yaxis.set_major_formatter(ax_formatter['percent'])
for patch in ax.patches:
height = patch.get_height()
ax.text(x=patch.get_x() + patch.get_width() / 2,
y=height + 0.01,
s=f'{height * 100:1.1f}%',
ha='center')
plt.show();
In [ ]:
components = 2
pca = PCA(n_components=components)
X = iris.drop('species', axis=1)
X_std = (sklearn.preprocessing
.StandardScaler()
.fit_transform(X))
y = pca.fit_transform(X_std)
y = (pd.DataFrame(y, columns=['first', 'second'])
.assign(species=iris.species))
grid = sns.lmplot(x='first', y='second', data=y,
hue='species', fit_reg=False,
markers=['o', 's', 'd'], size=6)
grid.fig.suptitle('Principal Components',
fontsize=size['title'], y=1.05)
handles = grid._legend_data.values()
labels = grid._legend_data.keys()
grid._legend.remove()
grid.fig.legend(bbox_to_anchor=(1.02, 0.94), fontsize=size['legend'],
handles=handles,
labels=[x.capitalize() for x in labels],
loc='center right')
grid.axes[0, 0].set_xlabel('PCA $1^{st}$ Component',
fontsize=size['label'])
grid.axes[0, 0].set_ylabel('PCA $2^{nd}$ Component',
fontsize=size['label'])
plt.show();