In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
rng = np.random.RandomState(1)
X = np.dot(rng.rand(2, 2), rng.randn(2, 200)).T
plt.scatter(X[:, 0], X[:, 1])
plt.axis('equal')


Out[3]:
(-3.0, 3.0, -1.0, 1.5)

In [4]:
from sklearn.decomposition import PCA
model = PCA(2)
model.fit(X)


Out[4]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [5]:
model.components_


Out[5]:
array([[-0.94446029, -0.32862557],
       [-0.32862557,  0.94446029]])
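
As a quick check (a sketch added here, not part of the original run), the rows of components_ should form an orthonormal basis:

In [ ]:
# the principal axes are unit vectors and mutually orthogonal,
# so this product should be the identity matrix
np.allclose(np.dot(model.components_, model.components_.T), np.eye(2))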

In [6]:
model.explained_variance_


Out[6]:
array([ 0.75871884,  0.01838551])
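
Both quantities can be recovered by hand from the SVD of the mean-centered data; the following sketch (reusing X from above) should reproduce components_ up to sign and explained_variance_ exactly:

In [ ]:
# PCA via a direct SVD of the centered data
Xc = X - X.mean(axis=0)
U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
print(Vt)                     # same directions as components_, signs may flip
print(S ** 2 / (len(X) - 1))  # matches explained_variance_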

In [8]:
def draw_vector(v0, v1, ax=None):
    # draw an arrow from point v0 to point v1
    ax = ax or plt.gca()
    arrowprops = dict(arrowstyle='->', linewidth=2,
                      shrinkA=0, shrinkB=0)
    ax.annotate('', v1, v0, arrowprops=arrowprops)


plt.scatter(X[:, 0], X[:, 1], alpha=0.2)
for length, vector in zip(model.explained_variance_, model.components_):
    v = vector * 3 * np.sqrt(length)
    draw_vector(model.mean_, model.mean_ + v)
plt.axis('equal');

[figure: data scatter with the two principal axes drawn as scaled vectors from the mean]

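Transforming with the full two-component model just rotates the data into this principal-axis frame; a quick sketch (reusing model from above):

In [ ]:
# in the rotated frame the components are uncorrelated and axis-aligned
X_rot = model.transform(X)
plt.scatter(X_rot[:, 0], X_rot[:, 1], alpha=0.2)
plt.axis('equal');
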
In [11]:
pca = PCA(1)
pca.fit(X)
Xp = pca.transform(X)
print(X.shape, Xp.shape)


(200, 2) (200, 1)
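
Under the hood, transform just centers the data and projects it onto the retained axes; a sketch of that equivalence:

In [ ]:
# transform == subtract the fitted mean, then dot with the principal axes
np.allclose(Xp, np.dot(X - pca.mean_, pca.components_.T))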

In [15]:
X_new = pca.inverse_transform(Xp)
plt.scatter(X[:, 0], X[:, 1], alpha=0.2)
plt.scatter(X_new[:, 0], X_new[:, 1], alpha=0.8)
plt.axis('equal');

[figure: original points (light) and their one-dimensional PCA reconstructions (dark)]

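The information lost in this round trip is the variance along the discarded axis; as a check (a sketch reusing X and X_new), the summed squared residuals, normalized like explained_variance_, should match the discarded second value from Out[6] (≈ 0.018):

In [ ]:
# reconstruction residuals all lie along the dropped second axis
resid = X - X_new
print((resid ** 2).sum() / (len(X) - 1))
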
In [16]:
from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape


Out[16]:
(1797, 64)

In [18]:
pca = PCA(2) # project from 64 to 2 dimensions 
projected = pca.fit_transform(digits.data) 
print(digits.data.shape) 
print(projected.shape)


(1797, 64)
(1797, 2)
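
Each projected point is just a pair of coefficients in the expansion image ≈ mean + x1·(component 1) + x2·(component 2); a sketch rebuilding the first digit's approximation by hand:

In [ ]:
# reconstruct one image from its two coefficients manually
approx = pca.mean_ + np.dot(projected[0], pca.components_)
np.allclose(approx, pca.inverse_transform(projected[:1]))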

In [19]:
plt.scatter(projected[:, 0], projected[:, 1],
            c=digits.target, edgecolor='none', alpha=0.5,
            cmap=plt.cm.get_cmap('Spectral', 10))
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar();

[figure: the digits projected onto the first two principal components, colored by digit class]

In [20]:
pca = PCA().fit(digits.data)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

[figure: cumulative explained variance versus number of components]

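To choose a dimensionality from this curve programmatically, pass a variance fraction directly to PCA; a sketch (pca90 is a hypothetical name, and the resulting count depends on the data):

In [ ]:
# keep just enough components to explain 90% of the variance
pca90 = PCA(0.90).fit(digits.data)
pca90.n_components_
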
In [21]:
pca.explained_variance_ratio_


Out[21]:
array([  1.48905936e-01,   1.36187712e-01,   1.17945938e-01,
         8.40997942e-02,   5.78241466e-02,   4.91691032e-02,
         4.31598701e-02,   3.66137258e-02,   3.35324810e-02,
         3.07880621e-02,   2.37234084e-02,   2.27269657e-02,
         1.82186331e-02,   1.77385494e-02,   1.46710109e-02,
         1.40971560e-02,   1.31858920e-02,   1.24813782e-02,
         1.01771796e-02,   9.05617439e-03,   8.89538461e-03,
         7.97123157e-03,   7.67493255e-03,   7.22903569e-03,
         6.95888851e-03,   5.96081458e-03,   5.75614688e-03,
         5.15157582e-03,   4.89539777e-03,   4.28887968e-03,
         3.73606048e-03,   3.53274223e-03,   3.36683986e-03,
         3.28029851e-03,   3.08320884e-03,   2.93778629e-03,
         2.56588609e-03,   2.27742397e-03,   2.22277922e-03,
         2.11430393e-03,   1.89909062e-03,   1.58652907e-03,
         1.51159934e-03,   1.40578764e-03,   1.16622290e-03,
         1.07492521e-03,   9.64053065e-04,   7.74630271e-04,
         5.57211553e-04,   4.04330693e-04,   2.09916327e-04,
         8.24797098e-05,   5.25149980e-05,   5.05243719e-05,
         3.29961363e-05,   1.24365445e-05,   7.04827911e-06,
         3.01432139e-06,   1.06230800e-06,   5.50074587e-07,
         3.42905702e-07,   9.50687638e-34,   9.50687638e-34,
         9.36179501e-34])

In [22]:
def plot_digits(data):
    # show the first forty 8x8 images on a 4x10 grid
    fig, axes = plt.subplots(4, 10, figsize=(10, 4),
                             subplot_kw={'xticks':[], 'yticks':[]},
                             gridspec_kw=dict(hspace=0.1, wspace=0.1))
    for i, ax in enumerate(axes.flat):
        ax.imshow(data[i].reshape(8, 8),
                  cmap='binary', interpolation='nearest',
                  clim=(0, 16))

plot_digits(digits.data)

[figure: the first forty digits from the dataset]

In [23]:
np.random.seed(42)
noisy = np.random.normal(digits.data, 4)
plot_digits(noisy)

[figure: the same digits with Gaussian noise added]

In [24]:
pca = PCA(0.50).fit(noisy)
pca.n_components_


Out[24]:
12
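
A quick check (sketch) that the twelve retained components do cross the requested 50% threshold:

In [ ]:
# total variance fraction captured by the retained components
pca.explained_variance_ratio_.sum()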

In [25]:
components = pca.transform(noisy)
filtered = pca.inverse_transform(components)
plot_digits(filtered)

[figure: the digits denoised via the 12-component PCA reconstruction]

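One way to quantify the effect (a sketch, not from the original notebook) is to compare mean squared error against the clean digits before and after filtering; the filtered images should land closer to the originals:

In [ ]:
# per-pixel MSE relative to the clean data
print(np.mean((noisy - digits.data) ** 2))     # error of the noisy input
print(np.mean((filtered - digits.data) ** 2))  # error after PCA filtering
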
In [26]:
from sklearn.datasets import fetch_lfw_people 
faces = fetch_lfw_people(min_faces_per_person=60) 
print(faces.target_names) 
print(faces.images.shape)


['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
 'Gerhard Schroeder' 'Hugo Chavez' 'Junichiro Koizumi' 'Tony Blair']
(1348, 62, 47)

In [27]:
pca = PCA(150)
pca.fit(faces.data)


Out[27]:
PCA(copy=True, iterated_power='auto', n_components=150, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
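
With data this large, svd_solver='auto' already falls back to a randomized approximation of the top components; you can request it explicitly (a sketch; rpca is a hypothetical name):

In [ ]:
# randomized SVD: much faster than a full decomposition when only
# the leading components are needed
rpca = PCA(150, svd_solver='randomized', random_state=42)
rpca.fit(faces.data)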

In [28]:
fig, axes = plt.subplots(3, 8, figsize=(9, 4),
                         subplot_kw={'xticks':[], 'yticks':[]},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1)) 
for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone')

[figure: the first 24 principal components ("eigenfaces")]

In [29]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

[figure: cumulative explained variance for the faces data]

In [30]:
components = pca.transform(faces.data)
projected = pca.inverse_transform(components)
fig, ax = plt.subplots(2, 10, figsize=(10, 2.5),
                       subplot_kw={'xticks':[], 'yticks':[]},
                       gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i in range(10):
    ax[0, i].imshow(faces.data[i].reshape(62, 47), cmap='binary_r')
    ax[1, i].imshow(projected[i].reshape(62, 47), cmap='binary_r')

# label each row once, on its leftmost panel
ax[0, 0].set_ylabel('full-dim\ninput')
ax[1, 0].set_ylabel('150-dim\nreconstruction');

[figure: original faces (top row) and their 150-dimensional reconstructions (bottom row)]

In [31]:
62*47


Out[31]:
2914
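
So the 150-dimensional representation keeps only 150 of the 2914 numbers per image, about 5% of the raw data; a sketch of the arithmetic:

In [ ]:
# fraction of the original pixel count retained per image
print(150 / 2914.0)  # ~0.05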
