In [1]:
import sklearn

In [2]:
from sklearn import datasets

In [3]:
from sklearn.cross_validation import cross_val_score

In [4]:
from sklearn import svm, metrics

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier
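
A version note: everything below targets scikit-learn 0.14.1 (printed two cells down). On a modern release (0.18+) the modules used here were reorganized; a hedged mapping of the same imports, assuming a current install:

# equivalent imports for scikit-learn >= 0.18 (assumption: a modern install;
# note the newer ShuffleSplit takes n_splits and yields splits via .split(X))
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit, GridSearchCV
from sklearn.decomposition import PCA  # PCA(svd_solver='randomized') replaces RandomizedPCA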

In [6]:
import pylab as pl

In [7]:
print sklearn.__version__


0.14.1

In [8]:
print pl.__version__  # pylab was imported as pl above


1.7.1

In [9]:
import pylab as pl
import numpy as np

# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True
pl.gray()


<matplotlib.figure.Figure at 0x36dd5d0>

In [10]:
from sklearn.datasets import load_digits

In [11]:
digits = load_digits()

In [12]:
digits


Out[12]:
{'DESCR': " Optical Recognition of Handwritten Digits Data Set\n\nNotes\n-----\nData Set Characteristics:\n    :Number of Instances: 5620\n    :Number of Attributes: 64\n    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n    :Missing Attribute Values: None\n    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n    :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\nReferences\n----------\n  - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n    Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n    Graduate Studies in Science and Engineering, Bogazici University.\n  - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n  - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n    Linear dimensionalityreduction using relevance weighted LDA. School of\n    Electrical and Electronic Engineering Nanyang Technological University.\n    2005.\n  - Claudio Gentile. A New Approximate Maximal Margin Classification\n    Algorithm. NIPS. 2000.\n",
 'data': array([[  0.,   0.,   5., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,  10.,   0.,   0.],
       [  0.,   0.,   0., ...,  16.,   9.,   0.],
       ..., 
       [  0.,   0.,   1., ...,   6.,   0.,   0.],
       [  0.,   0.,   2., ...,  12.,   0.,   0.],
       [  0.,   0.,  10., ...,  12.,   1.,   0.]]),
 'images': array([[[  0.,   0.,   5., ...,   1.,   0.,   0.],
        [  0.,   0.,  13., ...,  15.,   5.,   0.],
        [  0.,   3.,  15., ...,  11.,   8.,   0.],
        ..., 
        [  0.,   4.,  11., ...,  12.,   7.,   0.],
        [  0.,   2.,  14., ...,  12.,   0.,   0.],
        [  0.,   0.,   6., ...,   0.,   0.,   0.]],

       [[  0.,   0.,   0., ...,   5.,   0.,   0.],
        [  0.,   0.,   0., ...,   9.,   0.,   0.],
        [  0.,   0.,   3., ...,   6.,   0.,   0.],
        ..., 
        [  0.,   0.,   1., ...,   6.,   0.,   0.],
        [  0.,   0.,   1., ...,   6.,   0.,   0.],
        [  0.,   0.,   0., ...,  10.,   0.,   0.]],

       [[  0.,   0.,   0., ...,  12.,   0.,   0.],
        [  0.,   0.,   3., ...,  14.,   0.,   0.],
        [  0.,   0.,   8., ...,  16.,   0.,   0.],
        ..., 
        [  0.,   9.,  16., ...,   0.,   0.,   0.],
        [  0.,   3.,  13., ...,  11.,   5.,   0.],
        [  0.,   0.,   0., ...,  16.,   9.,   0.]],

       ..., 
       [[  0.,   0.,   1., ...,   1.,   0.,   0.],
        [  0.,   0.,  13., ...,   2.,   1.,   0.],
        [  0.,   0.,  16., ...,  16.,   5.,   0.],
        ..., 
        [  0.,   0.,  16., ...,  15.,   0.,   0.],
        [  0.,   0.,  15., ...,  16.,   0.,   0.],
        [  0.,   0.,   2., ...,   6.,   0.,   0.]],

       [[  0.,   0.,   2., ...,   0.,   0.,   0.],
        [  0.,   0.,  14., ...,  15.,   1.,   0.],
        [  0.,   4.,  16., ...,  16.,   7.,   0.],
        ..., 
        [  0.,   0.,   0., ...,  16.,   2.,   0.],
        [  0.,   0.,   4., ...,  16.,   2.,   0.],
        [  0.,   0.,   5., ...,  12.,   0.,   0.]],

       [[  0.,   0.,  10., ...,   1.,   0.,   0.],
        [  0.,   2.,  16., ...,   1.,   0.,   0.],
        [  0.,   0.,  15., ...,  15.,   0.,   0.],
        ..., 
        [  0.,   4.,  16., ...,  16.,   6.,   0.],
        [  0.,   8.,  16., ...,  16.,   8.,   0.],
        [  0.,   1.,   8., ...,  12.,   1.,   0.]]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])}

In [13]:
X = digits.data
y = digits.target

In [14]:
X.shape


Out[14]:
(1797, 64)

In [15]:
y.shape


Out[15]:
(1797,)
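
A quick consistency check: digits.images holds the same values as digits.data, just kept in 8x8 form, so flattening the images row-wise recovers the flat feature matrix:

# images (1797 x 8 x 8) flattened row-wise should equal data (1797 x 64)
assert np.allclose(digits.images.reshape((len(digits.images), -1)), digits.data)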

In [45]:
n_samples, n_features = X.shape

In [46]:
n_samples, n_features


Out[46]:
(1797, 64)

In [48]:
# show 6 random digits with their true labels
for i, j in enumerate(np.random.permutation(X.shape[0])[:6]):
    pl.subplot(1, 6, i + 1)
    pl.imshow(X[j].reshape((8, 8)), interpolation='nearest')
    pl.title("true class: %d" % y[j])
    pl.xticks(())
    pl.yticks(())



In [19]:
for i, j in enumerate(np.random.permutation(range(12))):
    print i, j


0 3
1 2
2 6
3 8
4 0
5 11
6 7
7 5
8 10
9 4
10 9
11 1

In [20]:
print np.random.permutation([np.exp(-x) for x in np.arange(1, 5, .5)])


[ 0.04978707  0.011109    0.082085    0.22313016  0.36787944  0.01831564
  0.03019738  0.13533528]

In [21]:
for i, j in enumerate(np.random.permutation([np.exp(-x) for x in np.arange(1, 5, .5)])):
    print i, j


0 0.223130160148
1 0.0111089965382
2 0.135335283237
3 0.367879441171
4 0.0183156388887
5 0.0820849986239
6 0.0497870683679
7 0.0301973834223

In [24]:
for i, j in enumerate(np.random.permutation([x * x for x in np.arange(1, 5, .5)])):
    print i, j


0 4.0
1 2.25
2 20.25
3 16.0
4 1.0
5 9.0
6 12.25
7 6.25

In [25]:
np.random.permutation(10).reshape((5,2))


Out[25]:
array([[0, 4],
       [2, 8],
       [5, 7],
       [3, 1],
       [9, 6]])
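
Worth noting: given an array rather than an int, np.random.permutation shuffles only along the first axis, so on a 2-D input whole rows are reordered but never mixed:

# rows are shuffled as units; values within a row stay together
a = np.arange(12).reshape((4, 3))
print np.random.permutation(a)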

In [26]:
X.shape, y.shape


Out[26]:
((1797, 64), (1797,))

In [27]:
%time x_pca = RandomizedPCA(n_components=2).fit_transform(X)


CPU times: user 28 ms, sys: 4 ms, total: 32 ms
Wall time: 31.8 ms

In [28]:
x_pca.shape


Out[28]:
(1797, 2)
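
Keeping the fitted estimator around (instead of the inline fit_transform above) also shows how much variance the two axes capture. A minimal sketch, assuming the 0.14 RandomizedPCA exposes explained_variance_ratio_ after fit, as later versions do:

# same projection via an explicit fit so the model can be inspected;
# x_pca2 is a scratch name so x_pca above is not clobbered
pca = RandomizedPCA(n_components=2).fit(X)
x_pca2 = pca.transform(X)
print pca.explained_variance_ratio_  # fraction of variance per component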

In [29]:
from itertools import cycle

In [30]:
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

In [42]:
for i, c in zip(np.unique(y), cycle(colors)):
    pl.scatter(x_pca[y == i, 0], x_pca[y == i, 1], c=c, label=str(i), alpha=.8)
pl.legend(loc='upper right')



In [65]:
for i, c in zip(np.unique(y), cycle(colors)):
    print i, c
    print y==i
    #print x_pca[y==i]


0 b
[ True False False ..., False False False]
1 g
[False  True False ..., False False False]
2 r
[False False  True ..., False False False]
3 c
[False False False ..., False False False]
4 m
[False False False ..., False False False]
5 y
[False False False ..., False False False]
6 k
[False False False ..., False False False]
7 b
[False False False ..., False False False]
8 g
[False False False ...,  True False  True]
9 r
[False False False ..., False  True False]

In [60]:
y


Out[60]:
array([0, 1, 2, ..., 8, 9, 8])

In [61]:
y[:20]


Out[61]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [40]:
# plot only the 0s
pl.scatter(x_pca[y == 0, 0], x_pca[y == 0, 1], label='0', c='b', alpha=.5)
pl.legend(loc='upper right')


Out[40]:
<matplotlib.legend.Legend at 0x3b30d10>

In [77]:
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

# the same scatter written out by hand, one class per call; with only
# 7 colors available, classes 7-9 reuse b, g and r
pl.scatter(x_pca[y == 0, 0], x_pca[y == 0, 1], label='0', c='b', alpha=0.5)
pl.scatter(x_pca[y == 1, 0], x_pca[y == 1, 1], label='1', c='g', alpha=0.5)
pl.scatter(x_pca[y == 2, 0], x_pca[y == 2, 1], label='2', c='r', alpha=0.5)
pl.scatter(x_pca[y == 3, 0], x_pca[y == 3, 1], label='3', c='c', alpha=0.5)
pl.scatter(x_pca[y == 4, 0], x_pca[y == 4, 1], label='4', c='m', alpha=0.5)
pl.scatter(x_pca[y == 5, 0], x_pca[y == 5, 1], label='5', c='y', alpha=0.5)
pl.scatter(x_pca[y == 6, 0], x_pca[y == 6, 1], label='6', c='k', alpha=0.5)
pl.scatter(x_pca[y == 7, 0], x_pca[y == 7, 1], label='7', c='b', alpha=0.5)
pl.scatter(x_pca[y == 8, 0], x_pca[y == 8, 1], label='8', c='g', alpha=0.5)
pl.scatter(x_pca[y == 9, 0], x_pca[y == 9, 1], label='9', c='r', alpha=0.5)


Out[77]:
<matplotlib.collections.PathCollection at 0xb3c8b8c>

In [80]:
x_pca.shape


Out[80]:
(1797, 2)

In [81]:
x_pca


Out[81]:
array([[ -1.26061418, -21.27774313],
       [  7.9583523 ,  20.7674123 ],
       [  6.99188533,   9.95577913],
       ..., 
       [ 10.80016393,   6.95988699],
       [ -4.86672552, -12.42676615],
       [ -0.34518114,  -6.36189919]])

In [83]:
from sklearn import svm

In [85]:
# a perfect score on the training data says nothing about generalization;
# the held-out split below makes that point
%time svm.SVC().fit(X, y).score(X, y)


CPU times: user 2.95 s, sys: 0.05 s, total: 3.00 s
Wall time: 3.33 s
Out[85]:
1.0

In [86]:
from sklearn.cross_validation import train_test_split

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=0)

In [88]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


Out[88]:
((1203, 64), (594, 64), (1203,), (594,))

In [99]:
# with the default gamma (0.0 means 1/n_features in sklearn 0.14), the rbf
# kernel is poorly scaled for these unscaled 0..16 pixel features
svc = svm.SVC(kernel='rbf').fit(X_train, y_train)
svc_score = svc.score(X_test, y_test)
print svc_score


0.393939393939

In [95]:
print svc


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, shrinking=True, tol=0.001,
  verbose=False)

In [96]:
svc2 = svm.SVC(kernel='rbf', C=100, gamma=.001).fit(X_train, y_train)

In [97]:
svc2


Out[97]:
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
  shrinking=True, tol=0.001, verbose=False)

In [100]:
svc2_score = svc2.score(X_test, y_test)

In [101]:
print svc2_score


0.989898989899
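
The metrics module imported at the top has not been used yet; a short sketch of a per-class breakdown for svc2 on the held-out split:

# confusion matrix and per-class precision/recall for the tuned SVC
y_pred = svc2.predict(X_test)
print metrics.confusion_matrix(y_test, y_pred)
print metrics.classification_report(y_test, y_pred)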

In [125]:
# note: `train` and `test` come from the ShuffleSplit loop in the next cell;
# this inspection cell was executed after it (see the In [] numbers)
X[train].shape, X[test].shape


Out[125]:
((1617, 64), (180, 64))

In [103]:
from sklearn.cross_validation import ShuffleSplit

cv = ShuffleSplit(n_samples, n_iter=3, test_size=0.1,
    random_state=0)

for cv_index, (train, test) in enumerate(cv):
    print("# Cross Validation Iteration #%d" % cv_index)
    print("train indices: {0}...".format(train[:10]))
    print("test indices: {0}...".format(test[:10]))
    
    svc = svm.SVC(kernel="rbf", C=1, gamma=0.001).fit(X[train], y[train])
    print("train score: {0:.3f}, test score: {1:.3f}\n".format(
        svc.score(X[train], y[train]), svc.score(X[test], y[test])))


# Cross Validation Iteration #0
train indices: [ 353    5   58 1349 1025  575 1074 1110 1745  689]...
test indices: [1081 1707  927  713  262  182  303  895  933 1266]...
train score: 0.999, test score: 0.989

# Cross Validation Iteration #1
train indices: [1336  608  977   22  526 1587 1130  569 1481  962]...
test indices: [1014  755 1633  117  181  501  948 1076   45  659]...
train score: 0.998, test score: 0.994

# Cross Validation Iteration #2
train indices: [ 451  409  911 1551  133  691 1306  111  852  825]...
test indices: [ 795  697  655  573  412  743  635  851 1466 1383]...
train score: 0.999, test score: 0.994


In [121]:
svc = svm.SVC(kernel="poly", degree=3, C=100, gamma=0.001)
cv = ShuffleSplit(n_samples, n_iter=10, test_size=0.1,
    random_state=0)

test_scores = cross_val_score(svc, X, y, cv=cv, n_jobs=2)
print test_scores
print "meanscore=" + str(test_scores.mean())


[ 0.98333333  0.99444444  0.99444444  0.98888889  0.98888889  0.99444444
  0.98333333  0.99444444  0.97222222  1.        ]
mean score = 0.989444444444

In [105]:
n_samples


Out[105]:
1797

In [127]:
np.logspace(-7, -1, 10)


Out[127]:
array([  1.00000000e-07,   4.64158883e-07,   2.15443469e-06,
         1.00000000e-05,   4.64158883e-05,   2.15443469e-04,
         1.00000000e-03,   4.64158883e-03,   2.15443469e-02,
         1.00000000e-01])

In [129]:
n_gammas = 10
n_iter = 5
cv = ShuffleSplit(n_samples, n_iter=n_iter, train_size=500, test_size=500,
    random_state=0)

train_scores = np.zeros((n_gammas, n_iter))
test_scores = np.zeros((n_gammas, n_iter))
gammas = np.logspace(-7, -1, n_gammas)

for i, gamma in enumerate(gammas):
    for j, (train, test) in enumerate(cv):
        clf = svm.SVC(C=10, gamma=gamma).fit(X[train], y[train])
        train_scores[i, j] = clf.score(X[train], y[train])
        test_scores[i, j] = clf.score(X[test], y[test])

In [130]:
cv


Out[130]:
ShuffleSplit(1797, n_iter=5, test_size=500, indices=True, random_state=0)

In [131]:
type(cv)


Out[131]:
sklearn.cross_validation.ShuffleSplit

In [134]:
cv.train_size


Out[134]:
500

In [138]:
for i in range(n_iter):
    pl.semilogx(gammas, train_scores[:, i], alpha=0.4, lw=5, c='b')
    pl.semilogx(gammas, test_scores[:, i], alpha=0.4, lw=1, c='g')
pl.ylabel("score for SVC(C=10, gamma=gamma)")
pl.xlabel("gamma")


Out[138]:
<matplotlib.text.Text at 0xc0bf0cc>
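
Averaging the five iterations per gamma gives one clean curve per set and makes the sweet spot around gamma=1e-3 easier to read; a minimal sketch over the arrays computed above:

pl.semilogx(gammas, train_scores.mean(axis=1), lw=2, c='b', label='train (mean)')
pl.semilogx(gammas, test_scores.mean(axis=1), lw=2, c='g', label='test (mean)')
pl.ylabel("score for SVC(C=10, gamma=gamma)")
pl.xlabel("gamma")
pl.legend(loc='lower left')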

In [139]:
from sklearn.grid_search import GridSearchCV

In [144]:
from pprint import pprint
svc_params = {
    'C': np.logspace(-1, 2, 4),
    'gamma': np.logspace(-4, 0, 5),
}
pprint(svc_params)


{'C': array([   0.1,    1. ,   10. ,  100. ]),
 'gamma': array([  1.00000000e-04,   1.00000000e-03,   1.00000000e-02,
         1.00000000e-01,   1.00000000e+00])}

In [172]:
n_subsamples = 1000
X_small_train, y_small_train = X_train[:n_subsamples], y_train[:n_subsamples]

In [173]:
X_small_train.shape, y_small_train.shape


Out[173]:
((1000, 64), (1000,))

In [174]:
gs_svc = GridSearchCV(svm.SVC(), svc_params, cv=14, n_jobs=-1)

%time _ = gs_svc.fit(X_small_train, y_small_train)


CPU times: user 129.42 s, sys: 1.99 s, total: 131.42 s
Wall time: 161.22 s

In [175]:
gs_svc.best_estimator_


Out[175]:
SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
  shrinking=True, tol=0.001, verbose=False)

In [176]:
gs_svc.best_params_, gs_svc.best_score_


Out[176]:
({'C': 10.0, 'gamma': 0.001}, 0.9849653476414042)
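
GridSearchCV refits the best parameter combination on all of X_small_train by default (refit=True), so the fitted search object can score the held-out test split directly:

# generalization check of the refit best model on data the search never saw
print gs_svc.score(X_test, y_test)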

In [177]:
def display_scores(params, scores, append_star=False):
    """Format the mean score +/- std error for params"""
    params = ", ".join("{0}={1}".format(k, v)
                      for k, v in params.items())
    line = "{0}:\t{1:.3f} (+/-{2:.3f})".format(
        params, np.mean(scores), sem(scores))
    if append_star:
        line += " *"
    return line

def display_grid_scores(grid_scores, top=None):
    """Helper function to format a report on a grid of scores"""
    
    grid_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)
    if top is not None:
        grid_scores = grid_scores[:top]
        
    # Compute a threshold for starring models whose stderr
    # overlaps that of the best model:
    _, best_mean, best_scores = grid_scores[0]
    threshold = best_mean - 2 * sem(best_scores)
    
    for params, mean_score, scores in grid_scores:
        append_star = mean_score + 2 * sem(scores) > threshold
        print(display_scores(params, scores, append_star=append_star))

In [178]:
from scipy.stats import sem  # standard error of the mean

def mean_score(scores):
    """Print the empirical mean score and standard error of the mean."""
    return ("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores))

display_grid_scores(gs_svc.grid_scores_, top=20)


C=10.0, gamma=0.001:	0.985 (+/-0.004) *
C=100.0, gamma=0.001:	0.985 (+/-0.004) *
C=1.0, gamma=0.001:	0.984 (+/-0.004) *
C=100.0, gamma=0.0001:	0.984 (+/-0.004) *
C=10.0, gamma=0.0001:	0.982 (+/-0.005) *
C=1.0, gamma=0.0001:	0.967 (+/-0.006) *
C=0.1, gamma=0.001:	0.954 (+/-0.007)
C=0.1, gamma=0.0001:	0.775 (+/-0.009)
C=10.0, gamma=0.01:	0.732 (+/-0.012)
C=100.0, gamma=0.01:	0.732 (+/-0.012)
C=1.0, gamma=0.01:	0.713 (+/-0.014)
C=0.1, gamma=1.0:	0.125 (+/-0.010)
C=0.1, gamma=0.01:	0.112 (+/-0.001)
C=10.0, gamma=0.1:	0.111 (+/-0.002)
C=100.0, gamma=0.1:	0.111 (+/-0.002)
C=0.1, gamma=0.1:	0.109 (+/-0.002)
C=1.0, gamma=0.1:	0.109 (+/-0.002)
C=1.0, gamma=1.0:	0.109 (+/-0.002)
C=10.0, gamma=1.0:	0.109 (+/-0.002)
C=100.0, gamma=1.0:	0.109 (+/-0.002)

In [1]:
import pandas as pd
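
The pandas import suggests tabulating the search results; a hedged sketch, assuming the 0.14-era grid_scores_ list of (params, mean, scores) triples unpacked by display_grid_scores above, and the old DataFrame.sort API (newer pandas spells it sort_values):

# one row per parameter combination, sorted best-first
rows = [dict(params, mean_score=mean, std_err=sem(scores))
        for params, mean, scores in gs_svc.grid_scores_]
pd.DataFrame(rows).sort('mean_score', ascending=False).head(10)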

In [ ]: