In [ ]:
    
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
    
In [ ]:
    
# Build 200 correlated 2-D samples: draw standard-normal points and push them
# through a random 2x2 mixing matrix, then transpose to (samples, features).
rng = np.random.RandomState(1)
mixing = rng.rand(2, 2)
latent = rng.randn(2, 200)
X = np.dot(mixing, latent).T
plt.scatter(X[:, 0], X[:, 1])
plt.axis('equal');
    
In [ ]:
    
from sklearn.decomposition import PCA
# Fit a two-component PCA to X. The bare name on the last line displays the
# fitted estimator as the cell output.
pca = PCA(n_components=2).fit(X)
pca
    
PCA learns what the components are and how much of the variance is explained by each of them.
In [ ]:
    
# Principal axes found by the fit, printed one row per component.
print(pca.components_)
    
In [ ]:
    
# Amount of variance explained by each fitted component.
print(pca.explained_variance_)
    
In [ ]:
    
def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    Parameters
    ----------
    v0 : start point of the arrow (passed to ``annotate`` as the text position).
    v1 : end point of the arrow (passed to ``annotate`` as the annotated point).
    ax : axes-like object to draw on; when omitted (or falsy) the current
        pyplot axes are used.
    """
    if not ax:
        ax = plt.gca()
    # No shrink at either end, so the arrow spans exactly v0 -> v1.
    arrow_style = {
        'arrowstyle': '->',
        'linewidth': 2,
        'shrinkA': 0,
        'shrinkB': 0,
    }
    # An empty label leaves only the arrow visible.
    ax.annotate('', v1, v0, arrowprops=arrow_style)
# Scatter the data, then overlay each principal axis as an arrow that starts at
# the data mean and extends three standard deviations along that axis.
plt.scatter(X[:, 0], X[:, 1], alpha=0.2)
origin = pca.mean_
for variance, axis in zip(pca.explained_variance_, pca.components_):
    tip = origin + axis * 3 * np.sqrt(variance)
    draw_vector(origin, tip)
plt.axis('equal');
    
What do the principal components of the following dataset look like?
In [ ]:
    
# 250 samples from an isotropic 2-D standard normal (no preferred direction).
X = rng.randn(250, 2)
plt.scatter(X[:, 0], X[:, 1])
# Use the same scale on both axes, as in the other scatter plots, so the
# cloud is not visually stretched by the figure's aspect ratio.
plt.axis('equal');
    
In [ ]:
    
# fit estimator
pca = PCA(n_components=2)
pca.fit(X)
# plot data
plt.scatter(X[:, 0], X[:, 1], alpha=1)
# Draw each principal axis from the data mean, scaled to three standard
# deviations (sqrt of the explained variance) along that axis.
for length, vector in zip(pca.explained_variance_, pca.components_):
    v = vector * 3 * np.sqrt(length)
    draw_vector(pca.mean_, pca.mean_ + v)
# Equal axis scaling keeps the two orthogonal component arrows visually
# perpendicular and their lengths comparable, matching the earlier overlay plot.
plt.axis('equal');
    
In [ ]:
    
# Regenerate the correlated 2-D data from a fresh seed-1 generator, then
# reduce it to a single principal component.
rng = np.random.RandomState(1)
mixing = rng.rand(2, 2)
latent = rng.randn(2, 200)
X = np.dot(mixing, latent).T
pca = PCA(n_components=1).fit(X)
X_pca = pca.transform(X)
# Compare shapes before and after the projection.
print("original shape:   ", X.shape)
print("transformed shape:", X_pca.shape)
    
In [ ]:
    
# Map the 1-D projection back into the original 2-D space and draw it over
# the original points (first scatter: original, second: reconstruction).
X_new = pca.inverse_transform(X_pca)
for points in (X, X_new):
    plt.scatter(points[:, 0], points[:, 1], alpha=0.5)
plt.axis('equal');
    
Recall: The iris dataset is four-dimensional.
In [ ]:
    
import seaborn as sns
# Load the iris table and split it into the species label and the remaining
# feature columns; the last line previews the first rows.
iris = sns.load_dataset('iris')
y_iris = iris['species']
X_iris = iris.drop(columns='species')
iris.head()
    
In [ ]:
    
from sklearn.decomposition import PCA
# Fit a two-component PCA on the iris features and project them to 2-D.
model = PCA(n_components=2).fit(X_iris)
X_2D = model.transform(X_iris)
    
In [ ]:
    
# Translate each species label into a matplotlib colour code. Mapping through
# a dict builds a new Series and leaves y_iris untouched.
species_colors = {'setosa': 'b', 'virginica': 'r', 'versicolor': 'g'}
colormap = y_iris.map(species_colors)
# Scatter the 2-D projection, coloured by species.
plt.scatter(X_2D[:, 0], X_2D[:, 1], c=colormap)
plt.xlabel('PCA1')
# The second component is on the vertical axis, so it is labelled with ylabel
# (a second xlabel call would overwrite 'PCA1' and leave the y-axis unlabelled).
plt.ylabel('PCA2');
    
What do we see from this plot?
Let's see how this looks with the digits data. First we will plot several of the noise-free input digits:
In [ ]:
    
from sklearn.datasets import load_digits
# Load the digits dataset; rows of `digits.data` are reshaped to 8x8 images
# by plot_digits.
digits = load_digits()
def plot_digits(data):
    """Show the first 40 rows of ``data`` as 8x8 images on a 4x10 grid.

    Each ``data[i]`` is reshaped to (8, 8) and drawn with the 'binary'
    colormap, nearest-neighbour interpolation and colour limits fixed to
    (0, 16). Axis ticks are hidden on every panel. Returns nothing.
    """
    no_ticks = dict(xticks=[], yticks=[])
    tight_grid = {'hspace': 0.1, 'wspace': 0.1}
    fig, axes = plt.subplots(4, 10, figsize=(10, 4),
                             subplot_kw=no_ticks,
                             gridspec_kw=tight_grid)
    for idx, panel in enumerate(axes.flat):
        image = data[idx].reshape(8, 8)
        panel.imshow(image,
                     cmap='binary', interpolation='nearest',
                     clim=(0, 16))
# Show the first 40 clean digits (the 4x10 grid drawn by plot_digits).
plot_digits(digits.data)
    
Now, let's add some noise:
In [ ]:
    
# Seed NumPy's global generator so the noise is reproducible, then draw every
# pixel from a normal distribution centred on its true value with std 4.
np.random.seed(42)
noisy = np.random.normal(loc=digits.data, scale=4)
plot_digits(noisy)
    
In [ ]:
    
# A fractional n_components asks PCA to keep just enough components to explain
# that share of the variance (here 50%). The last line shows how many it kept.
pca = PCA(n_components=0.50)
pca.fit(noisy)
pca.n_components_
    
In [ ]:
    
# Project the noisy digits onto the retained components, then map them back to
# pixel space; the reconstruction keeps only what those components capture.
components = pca.transform(noisy)
filtered = pca.inverse_transform(components)
# Display the reconstructed digits on the same 4x10 grid.
plot_digits(filtered)