In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.decomposition import PCA
In [ ]:
xy = np.random.multivariate_normal([0,0], [[10,7],[7,10]],1000)
plt.plot(xy[:,0],xy[:,1],"o")
plt.show()
In [ ]:
pca = PCA(n_components=2)
n_components is the number of axes onto which the data will be spread. You can have at most as many components as the data has dimensions (2 in this case).
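As an aside (this cell is illustrative, not part of the original analysis): a sketch of what a smaller n_components does. Asking for a single component projects the 2-D data onto one axis, which is the usual dimensionality-reduction use of PCA.
In [ ]:
# Illustrative sketch: keep only one component, reducing the 2-D data to 1-D.
pca_1d = PCA(n_components=1)
xy_1d = pca_1d.fit_transform(xy)
print(xy_1d.shape)                       # (1000, 1): one coordinate per point
print(pca_1d.explained_variance_ratio_)  # fraction of the variance that survives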
In [ ]:
xy_pca = pca.fit(xy)
Fits the PCA, finding the principal axes of variation in the data.
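Before plotting those axes, it helps to peek at what the fit stored. A minimal sketch using the fitted sklearn attributes (components_, explained_variance_, and mean_):
In [ ]:
# Sketch: inspect the fitted PCA object.
print(xy_pca.components_)          # rows are the principal axes (unit vectors)
print(xy_pca.explained_variance_)  # variance of the data along each axis
print(xy_pca.mean_)                # per-feature mean removed before projecting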
In [ ]:
plt.plot(xy[:,0],xy[:,1],"o")
# Draw the first principal axis (red); each half-segment is scaled by half the explained variance
scalar = xy_pca.explained_variance_[0]
plt.plot([0,xy_pca.components_[0,0]*scalar/2],[0,xy_pca.components_[0,1]*scalar/2],color="red")
plt.plot([0,-xy_pca.components_[0,0]*scalar/2],[0,-xy_pca.components_[0,1]*scalar/2],color="red")
# Draw the second principal axis (yellow), scaled the same way
scalar = xy_pca.explained_variance_[1]
plt.plot([0,xy_pca.components_[1,0]*scalar/2],[0,xy_pca.components_[1,1]*scalar/2],color="yellow")
plt.plot([0,-xy_pca.components_[1,0]*scalar/2],[0,-xy_pca.components_[1,1]*scalar/2],color="yellow")
plt.show()
In [ ]:
xy_trans = xy_pca.transform(xy)
Transforms the x,y data onto the PCA axes, re-expressing each point in terms of PCA1 and PCA2.
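A quick sanity check (a sketch, not part of the original notebook): with the default settings, transform() is nothing more than subtracting the fitted mean and projecting onto the component vectors.
In [ ]:
# Sketch: reproduce transform() by hand.
manual = (xy - xy_pca.mean_) @ xy_pca.components_.T
print(np.allclose(manual, xy_trans))  # expect True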
In [ ]:
fig, ax = plt.subplots(1,2,figsize=(10,5))
ax[0].plot(xy[:,0],xy[:,1],"o")
ax[0].set_xlabel("x")
ax[0].set_ylabel("y")
ax[0].set_xlim((-15,15)); ax[0].set_ylim((-15,15))
ax[1].plot(xy_trans[:,0],xy_trans[:,1],"o")
ax[1].set_xlabel("PCA1")
ax[1].set_ylabel("PCA2")
ax[1].set_xlim((-15,15)); ax[1].set_ylim((-15,15))
plt.show()
In [ ]:
print("Variation explained:")
print("First component: {:.3f}".format(xy_pca.explained_variance_ratio_[0]))
print("Second component: {:.3f}".format(xy_pca.explained_variance_ratio_[1]))
Describes how much of the variation each PCA axis captures.
Informally: if you only included the first component in a predictive model, the $R^{2}$ between your prediction and reality would be about 0.85.
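As a quick check on where those numbers come from, a sketch: each ratio is just that component's variance divided by the total variance (here both components are kept, so their variances sum to the total).
In [ ]:
# Sketch: explained_variance_ratio_ is explained_variance_ normalized to sum to 1.
var = xy_pca.explained_variance_
print(var / var.sum())
print(xy_pca.explained_variance_ratio_)  # same numbers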
In [ ]:
def pca_wrapper(xy_pairs):
    """
    Take an array of x/y data, perform a principal component analysis,
    and plot the raw and transformed data side by side.
    """
    # Plot the raw data
    fig, ax = plt.subplots(1,2,figsize=(10,5))
    ax[0].plot(xy_pairs[:,0],xy_pairs[:,1],"o")
    ax[0].set_xlim((-18,18))
    ax[0].set_ylim((-18,18))
    ax[0].set_title("raw x,y data")
    ax[0].set_xlabel("x")
    ax[0].set_ylabel("y")

    # Perform the PCA fit
    pca = PCA(n_components=2)
    z = pca.fit(xy_pairs)

    # Transform the data onto the new PCA axes
    new_xy_pairs = z.transform(xy_pairs)

    # Plot the PCA-transformed data
    ax[1].plot(new_xy_pairs[:,0],new_xy_pairs[:,1],"o")
    ax[1].set_title("PCA transformed data")
    ax[1].set_xlim((-18,18))
    ax[1].set_ylim((-18,18))
    ax[1].set_xlabel("PCA1")
    ax[1].set_ylabel("PCA2")
    plt.show()

    print("Variation explained:")
    print("First component: {:.3f}".format(pca.explained_variance_ratio_[0]))
    print("Second component: {:.3f}".format(pca.explained_variance_ratio_[1]))
In [ ]:
d1 = np.random.multivariate_normal([0,0], [[10,1],[1,10]],1000)
pca_wrapper(d1)
In [ ]:
d2 = np.random.multivariate_normal([0,0], [[10,5],[5,10]],1000)
pca_wrapper(d2)
In [ ]:
d3 = np.random.multivariate_normal([0,0], [[10,9],[9,10]],1000)
pca_wrapper(d3)
The stronger the covariation between the parameters, the more of the total variance lands on the first component, and the more readily PCA can reduce the dimensionality of the data.
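To make that concrete, a sketch (not in the original notebook): compress each dataset down to a single component, reconstruct it with inverse_transform, and compare the reconstruction error. The strongly covarying d3 should lose much less information than the weakly covarying d1.
In [ ]:
# Sketch: 1-D compression hurts less when the variables covary strongly.
for name, data in [("d1", d1), ("d3", d3)]:
    p = PCA(n_components=1)
    recon = p.inverse_transform(p.fit_transform(data))
    err = np.mean((data - recon)**2)
    print("{}: mean squared reconstruction error = {:.3f}".format(name, err))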
In [ ]:
iris = datasets.load_iris()
obs = iris.data
species = iris.target
mean = obs.mean(axis=0)
std = obs.std(axis=0)
obs = (obs - mean)/std
Subtracting the mean and dividing by the standard deviation (z-scoring) normalizes the data so all four measurements are on the same scale and no single feature dominates the PCA.
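For reference, a sketch showing the same normalization done with sklearn's StandardScaler (its defaults z-score each column, matching the by-hand version above):
In [ ]:
# Sketch: StandardScaler reproduces the by-hand z-scoring.
from sklearn.preprocessing import StandardScaler
obs_scaled = StandardScaler().fit_transform(iris.data)
print(np.allclose(obs_scaled, obs))  # expect True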
In [ ]:
def plot_slice(obs_r,axis_i,axis_j):
    """
    Helper function: plot column axis_i of obs_r against column axis_j,
    coloring the points by iris species.
    """
    plt.plot(obs_r[species == 0,axis_i],obs_r[species == 0,axis_j],"o",color='navy')
    plt.plot(obs_r[species == 1,axis_i],obs_r[species == 1,axis_j],"o",color='turquoise')
    plt.plot(obs_r[species == 2,axis_i],obs_r[species == 2,axis_j],"o",color='darkorange')
    plt.xlabel(axis_i)
    plt.ylabel(axis_j)
    plt.show()
In [ ]:
plot_slice(obs,axis_i=0,axis_j=1)
In [ ]:
pca = PCA(n_components=4)
obs_pca = pca.fit(obs)
obs_trans = obs_pca.transform(obs)
In [ ]:
plot_slice(obs_trans,axis_i=0,axis_j=1)
Almost all of that separating power is jammed into the first PCA axis.
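One way to see that numerically, as a sketch: the per-species means should be spread far apart along PCA1 and sit much closer together along PCA2.
In [ ]:
# Sketch: per-species means on the first two PCA axes.
for s, name in enumerate(iris.target_names):
    print(name, obs_trans[species == s, :2].mean(axis=0))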
In [ ]:
for r in obs_pca.explained_variance_ratio_:
    print("{:.3f}".format(r))