In [27]:

    
%matplotlib nbagg
import matplotlib.pyplot as plt
import numpy as np









    



//anaconda/lib/python2.7/site-packages/IPython/kernel/__init__.py:13: ShimWarning: The `IPython.kernel` package has been deprecated. You should import from ipykernel or jupyter_client instead.
  "You should import from ipykernel or jupyter_client instead.", ShimWarning)



In [2]:

    
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
import numpy as np
np.set_printoptions(suppress=True)

digits = load_digits()
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

Removing mean and scaling variance



In [3]:

    
from sklearn.preprocessing import StandardScaler

1) Instantiate the model



In [4]:

    
scaler = StandardScaler()

2) Fit using only the data.



In [5]:

    
scaler.fit(X_train)









    Out[5]:





StandardScaler(copy=True, with_mean=True, with_std=True)

3) transform the data (not predict).



In [6]:

    
X_train_scaled = scaler.transform(X_train)



In [7]:

    
X_train.shape









    Out[7]:





(1347, 64)



In [8]:

    
X_train_scaled.shape









    Out[8]:





(1347, 64)

The transformed version of the data has the mean removed:



In [9]:

    
X_train_scaled.mean(axis=0)









    Out[9]:





array([ 0., -0.,  0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,  0.,
       -0.,  0.,  0., -0.,  0., -0., -0., -0.,  0.,  0., -0., -0., -0.,
        0.,  0.,  0., -0., -0., -0.,  0., -0., -0.,  0., -0.,  0.,  0.,
        0., -0., -0.,  0., -0., -0., -0., -0., -0., -0., -0.,  0.,  0.,
       -0., -0.,  0.,  0., -0.,  0., -0.,  0., -0., -0., -0., -0.])



In [10]:

    
X_train_scaled.std(axis=0)









    Out[10]:





array([ 0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])



In [11]:

    
X_test_transformed = scaler.transform(X_test)

Principal Component Analysis

0) Import the model



In [12]:

    
from sklearn.decomposition import PCA

1) Instantiate the model



In [13]:

    
pca = PCA(n_components=2)

2) Fit to training data



In [14]:

    
pca.fit(X)









    Out[14]:





PCA(copy=True, n_components=2, whiten=False)

3) Transform to lower-dimensional representation



In [15]:

    
print(X.shape)
X_pca = pca.transform(X)
X_pca.shape









    



(1797, 64)






    Out[15]:





(1797, 2)

Visualize



In [31]:

    
plt.figure()
i1 = y == 1
i2 = y == 2
plt.scatter(X_pca[:, 0][i1], X_pca[:, 1][i1], c='b')#, c=y)
plt.scatter(X_pca[:, 0][i2], X_pca[:, 1][i2], c='r')









    














    











    Out[31]:





<matplotlib.collections.PathCollection at 0x10e5d3990>

Manifold Learning



In [17]:

    
from sklearn.manifold import Isomap
isomap = Isomap()



In [18]:

    
X_isomap = isomap.fit_transform(X)



In [30]:

    
plt.figure()
plt.scatter(X_isomap[:, 0], X_isomap[:, 1], c=y)









    














    











    Out[30]:





<matplotlib.collections.PathCollection at 0x108b81950>

Exercises

Visualize the digits dataset using the TSNE algorithm from the sklearn.manifold module (it runs for a couple of seconds).



In [20]:

    
# %load solutions/digits_tsne.py



In [21]:

    
from sklearn.manifold import TSNE



In [22]:

    
tsne = TSNE()



In [23]:

    
tsne.fit(X_train)



In [24]:

    
X_tsne = tsne.fit_transform(X)



In [25]:

    
X_tsne.shape









    Out[25]:





(1797, 2)



In [32]:

    
plt.figure()
plt.scatter(X_tsne[:,0], X_tsne[:,1], c=y)









    














    











    Out[32]:





<matplotlib.collections.PathCollection at 0x108345a10>



In [ ]: