In [1]:
import sklearn

In [2]:
from sklearn import datasets

In [3]:
from sklearn.cross_validation import cross_val_score

In [4]:
from sklearn import svm, metrics

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier
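
A version note: everything below targets scikit-learn 0.14.1 (printed two cells down). On a modern release (0.18+) the modules used here were reorganized; a hedged mapping of the same imports, assuming a current install:

# equivalent imports for scikit-learn >= 0.18 (assumption: a modern install;
# note the newer ShuffleSplit takes n_splits and yields splits via .split(X))
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit, GridSearchCV
from sklearn.decomposition import PCA  # PCA(svd_solver='randomized') replaces RandomizedPCA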

In [6]:
import pylab as pl

In [7]:
print sklearn.__version__


0.14.1

In [8]:
print pl.__version__  # pylab was imported as pl above


1.7.1

In [9]:
import pylab as pl
import numpy as np

# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True
pl.gray()


<matplotlib.figure.Figure at 0x36dd5d0>

In [10]:
from sklearn.datasets import load_digits

In [11]:
digits = load_digits()

In [12]:
digits


Out[12]:
{'DESCR': " Optical Recognition of Handwritten Digits Data Set\n\nNotes\n-----\nData Set Characteristics:\n    :Number of Instances: 5620\n    :Number of Attributes: 64\n    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n    :Missing Attribute Values: None\n    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n    :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\nReferences\n----------\n  - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n    Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n    Graduate Studies in Science and Engineering, Bogazici University.\n  - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n  - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n    Linear dimensionalityreduction using relevance weighted LDA. School of\n    Electrical and Electronic Engineering Nanyang Technological University.\n    2005.\n  - Claudio Gentile. A New Approximate Maximal Margin Classification\n    Algorithm. NIPS. 2000.\n",
 'data': array([[  0.,   0.,   5., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,  10.,   0.,   0.],
       [  0.,   0.,   0., ...,  16.,   9.,   0.],
       ..., 
       [  0.,   0.,   1., ...,   6.,   0.,   0.],
       [  0.,   0.,   2., ...,  12.,   0.,   0.],
       [  0.,   0.,  10., ...,  12.,   1.,   0.]]),
 'images': array([[[  0.,   0.,   5., ...,   1.,   0.,   0.],
        [  0.,   0.,  13., ...,  15.,   5.,   0.],
        [  0.,   3.,  15., ...,  11.,   8.,   0.],
        ..., 
        [  0.,   4.,  11., ...,  12.,   7.,   0.],
        [  0.,   2.,  14., ...,  12.,   0.,   0.],
        [  0.,   0.,   6., ...,   0.,   0.,   0.]],

       [[  0.,   0.,   0., ...,   5.,   0.,   0.],
        [  0.,   0.,   0., ...,   9.,   0.,   0.],
        [  0.,   0.,   3., ...,   6.,   0.,   0.],
        ..., 
        [  0.,   0.,   1., ...,   6.,   0.,   0.],
        [  0.,   0.,   1., ...,   6.,   0.,   0.],
        [  0.,   0.,   0., ...,  10.,   0.,   0.]],

       [[  0.,   0.,   0., ...,  12.,   0.,   0.],
        [  0.,   0.,   3., ...,  14.,   0.,   0.],
        [  0.,   0.,   8., ...,  16.,   0.,   0.],
        ..., 
        [  0.,   9.,  16., ...,   0.,   0.,   0.],
        [  0.,   3.,  13., ...,  11.,   5.,   0.],
        [  0.,   0.,   0., ...,  16.,   9.,   0.]],

       ..., 
       [[  0.,   0.,   1., ...,   1.,   0.,   0.],
        [  0.,   0.,  13., ...,   2.,   1.,   0.],
        [  0.,   0.,  16., ...,  16.,   5.,   0.],
        ..., 
        [  0.,   0.,  16., ...,  15.,   0.,   0.],
        [  0.,   0.,  15., ...,  16.,   0.,   0.],
        [  0.,   0.,   2., ...,   6.,   0.,   0.]],

       [[  0.,   0.,   2., ...,   0.,   0.,   0.],
        [  0.,   0.,  14., ...,  15.,   1.,   0.],
        [  0.,   4.,  16., ...,  16.,   7.,   0.],
        ..., 
        [  0.,   0.,   0., ...,  16.,   2.,   0.],
        [  0.,   0.,   4., ...,  16.,   2.,   0.],
        [  0.,   0.,   5., ...,  12.,   0.,   0.]],

       [[  0.,   0.,  10., ...,   1.,   0.,   0.],
        [  0.,   2.,  16., ...,   1.,   0.,   0.],
        [  0.,   0.,  15., ...,  15.,   0.,   0.],
        ..., 
        [  0.,   4.,  16., ...,  16.,   6.,   0.],
        [  0.,   8.,  16., ...,  16.,   8.,   0.],
        [  0.,   1.,   8., ...,  12.,   1.,   0.]]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])}

In [13]:
X = digits.data
y = digits.target

In [14]:
X.shape


Out[14]:
(1797, 64)

In [15]:
y.shape


Out[15]:
(1797,)
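
A quick consistency check: digits.images holds the same values as digits.data, just kept in 8x8 form, so flattening the images row-wise recovers the flat feature matrix:

# images (1797 x 8 x 8) flattened row-wise should equal data (1797 x 64)
assert np.allclose(digits.images.reshape((len(digits.images), -1)), digits.data)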

In [45]:
n_samples, n_features = X.shape

In [46]:
n_samples, n_features


Out[46]:
(1797, 64)

In [48]:
# show 6 random digits with their true labels
for i, j in enumerate(np.random.permutation(X.shape[0])[:6]):
    pl.subplot(1, 6, i + 1)
    pl.imshow(X[j].reshape((8, 8)), interpolation='nearest')
    pl.title("true class: %d" % y[j])
    pl.xticks(())
    pl.yticks(())



In [19]:
for i, j in enumerate(np.random.permutation(range(12))):
    print i, j


0 3
1 2
2 6
3 8
4 0
5 11
6 7
7 5
8 10
9 4
10 9
11 1

In [20]:
print np.random.permutation([np.exp(-x) for x in np.arange(1, 5, .5)])


[ 0.04978707  0.011109    0.082085    0.22313016  0.36787944  0.01831564
  0.03019738  0.13533528]

In [21]:
for i, j in enumerate(np.random.permutation([np.exp(-x) for x in np.arange(1, 5, .5)])):
    print i, j


0 0.223130160148
1 0.0111089965382
2 0.135335283237
3 0.367879441171
4 0.0183156388887
5 0.0820849986239
6 0.0497870683679
7 0.0301973834223

In [24]:
for i, j in enumerate(np.random.permutation([x * x for x in np.arange(1, 5, .5)])):
    print i, j


0 4.0
1 2.25
2 20.25
3 16.0
4 1.0
5 9.0
6 12.25
7 6.25

In [25]:
np.random.permutation(10).reshape((5,2))


Out[25]:
array([[0, 4],
       [2, 8],
       [5, 7],
       [3, 1],
       [9, 6]])
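
Worth noting: given an array rather than an int, np.random.permutation shuffles only along the first axis, so on a 2-D input whole rows are reordered but never mixed:

# rows are shuffled as units; values within a row stay together
a = np.arange(12).reshape((4, 3))
print np.random.permutation(a)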

In [26]:
X.shape, y.shape


Out[26]:
((1797, 64), (1797,))

In [27]:
%time x_pca = RandomizedPCA(n_components=2).fit_transform(X)


CPU times: user 28 ms, sys: 4 ms, total: 32 ms
Wall time: 31.8 ms

In [28]:
x_pca.shape


Out[28]:
(1797, 2)
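
Keeping the fitted estimator around (instead of the inline fit_transform above) also shows how much variance the two axes capture. A minimal sketch, assuming the 0.14 RandomizedPCA exposes explained_variance_ratio_ after fit, as later versions do:

# same projection via an explicit fit so the model can be inspected;
# x_pca2 is a scratch name so x_pca above is not clobbered
pca = RandomizedPCA(n_components=2).fit(X)
x_pca2 = pca.transform(X)
print pca.explained_variance_ratio_  # fraction of variance per component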

In [29]:
from itertools import cycle

In [30]:
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

In [42]:
for i, c in zip(np.unique(y), cycle(colors)):
    pl.scatter(x_pca[y == i, 0], x_pca[y == i, 1], c=c, label=str(i), alpha=.8)
pl.legend(loc='upper right')



In [65]:
for i, c in zip(np.unique(y), cycle(colors)):
    print i, c
    print y==i
    #print x_pca[y==i]


0 b
[ True False False ..., False False False]
1 g
[False  True False ..., False False False]
2 r
[False False  True ..., False False False]
3 c
[False False False ..., False False False]
4 m
[False False False ..., False False False]
5 y
[False False False ..., False False False]
6 k
[False False False ..., False False False]
7 b
[False False False ..., False False False]
8 g
[False False False ...,  True False  True]
9 r
[False False False ..., False  True False]

In [60]:
y


Out[60]:
array([0, 1, 2, ..., 8, 9, 8])

In [61]:
y[:20]


Out[61]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [40]:
# plot only the 0s
pl.scatter(x_pca[y == 0, 0], x_pca[y == 0, 1], label='0', c='b', alpha=.5)
pl.legend(loc='upper right')


Out[40]:
<matplotlib.legend.Legend at 0x3b30d10>

In [77]:
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

# the same scatter written out by hand, one class per call; with only
# 7 colors available, classes 7-9 reuse b, g and r
pl.scatter(x_pca[y == 0, 0], x_pca[y == 0, 1], label='0', c='b', alpha=0.5)
pl.scatter(x_pca[y == 1, 0], x_pca[y == 1, 1], label='1', c='g', alpha=0.5)
pl.scatter(x_pca[y == 2, 0], x_pca[y == 2, 1], label='2', c='r', alpha=0.5)
pl.scatter(x_pca[y == 3, 0], x_pca[y == 3, 1], label='3', c='c', alpha=0.5)
pl.scatter(x_pca[y == 4, 0], x_pca[y == 4, 1], label='4', c='m', alpha=0.5)
pl.scatter(x_pca[y == 5, 0], x_pca[y == 5, 1], label='5', c='y', alpha=0.5)
pl.scatter(x_pca[y == 6, 0], x_pca[y == 6, 1], label='6', c='k', alpha=0.5)
pl.scatter(x_pca[y == 7, 0], x_pca[y == 7, 1], label='7', c='b', alpha=0.5)
pl.scatter(x_pca[y == 8, 0], x_pca[y == 8, 1], label='8', c='g', alpha=0.5)
pl.scatter(x_pca[y == 9, 0], x_pca[y == 9, 1], label='9', c='r', alpha=0.5)


Out[77]:
<matplotlib.collections.PathCollection at 0xb3c8b8c>

In [80]:
x_pca.shape


Out[80]:
(1797, 2)

In [81]:
x_pca


Out[81]:
array([[ -1.26061418, -21.27774313],
       [  7.9583523 ,  20.7674123 ],
       [  6.99188533,   9.95577913],
       ..., 
       [ 10.80016393,   6.95988699],
       [ -4.86672552, -12.42676615],
       [ -0.34518114,  -6.36189919]])

In [83]:
from sklearn import svm

In [85]:
# a perfect score on the training data says nothing about generalization;
# the held-out split below makes that point
%time svm.SVC().fit(X, y).score(X, y)


CPU times: user 2.95 s, sys: 0.05 s, total: 3.00 s
Wall time: 3.33 s
Out[85]:
1.0

In [86]:
from sklearn.cross_validation import train_test_split

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=0)

In [88]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


Out[88]:
((1203, 64), (594, 64), (1203,), (594,))

In [99]:
# with the default gamma (0.0 means 1/n_features in sklearn 0.14), the rbf
# kernel is poorly scaled for these unscaled 0..16 pixel features
svc = svm.SVC(kernel='rbf').fit(X_train, y_train)
svc_score = svc.score(X_test, y_test)
print svc_score


0.393939393939

In [95]:
print svc


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, shrinking=True, tol=0.001,
  verbose=False)

In [96]:
svc2 = svm.SVC(kernel='rbf', C=100, gamma=.001).fit(X_train, y_train)

In [97]:
svc2


Out[97]:
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
  shrinking=True, tol=0.001, verbose=False)

In [100]:
svc2_score = svc2.score(X_test, y_test)

In [101]:
print svc2_score


0.989898989899
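
The metrics module imported at the top has not been used yet; a short sketch of a per-class breakdown for svc2 on the held-out split:

# confusion matrix and per-class precision/recall for the tuned SVC
y_pred = svc2.predict(X_test)
print metrics.confusion_matrix(y_test, y_pred)
print metrics.classification_report(y_test, y_pred)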

In [125]:
# note: `train` and `test` come from the ShuffleSplit loop in the next cell;
# this inspection cell was executed after it (see the In [] numbers)
X[train].shape, X[test].shape


Out[125]:
((1617, 64), (180, 64))

In [103]:
from sklearn.cross_validation import ShuffleSplit

cv = ShuffleSplit(n_samples, n_iter=3, test_size=0.1,
    random_state=0)

for cv_index, (train, test) in enumerate(cv):
    print("# Cross Validation Iteration #%d" % cv_index)
    print("train indices: {0}...".format(train[:10]))
    print("test indices: {0}...".format(test[:10]))
    
    svc = svm.SVC(kernel="rbf", C=1, gamma=0.001).fit(X[train], y[train])
    print("train score: {0:.3f}, test score: {1:.3f}\n".format(
        svc.score(X[train], y[train]), svc.score(X[test], y[test])))


# Cross Validation Iteration #0
train indices: [ 353    5   58 1349 1025  575 1074 1110 1745  689]...
test indices: [1081 1707  927  713  262  182  303  895  933 1266]...
train score: 0.999, test score: 0.989

# Cross Validation Iteration #1
train indices: [1336  608  977   22  526 1587 1130  569 1481  962]...
test indices: [1014  755 1633  117  181  501  948 1076   45  659]...
train score: 0.998, test score: 0.994

# Cross Validation Iteration #2
train indices: [ 451  409  911 1551  133  691 1306  111  852  825]...
test indices: [ 795  697  655  573  412  743  635  851 1466 1383]...
train score: 0.999, test score: 0.994


In [121]:
svc = svm.SVC(kernel="poly", degree=3, C=100, gamma=0.001)
cv = ShuffleSplit(n_samples, n_iter=10, test_size=0.1,
    random_state=0)

test_scores = cross_val_score(svc, X, y, cv=cv, n_jobs=2)
print test_scores
print "meanscore=" + str(test_scores.mean())


[ 0.98333333  0.99444444  0.99444444  0.98888889  0.98888889  0.99444444
  0.98333333  0.99444444  0.97222222  1.        ]
mean score = 0.989444444444

In [105]:
n_samples


Out[105]:
1797

In [127]:
np.logspace(-7, -1, 10)


Out[127]:
array([  1.00000000e-07,   4.64158883e-07,   2.15443469e-06,
         1.00000000e-05,   4.64158883e-05,   2.15443469e-04,
         1.00000000e-03,   4.64158883e-03,   2.15443469e-02,
         1.00000000e-01])

In [129]:
n_gammas = 10
n_iter = 5
cv = ShuffleSplit(n_samples, n_iter=n_iter, train_size=500, test_size=500,
    random_state=0)

train_scores = np.zeros((n_gammas, n_iter))
test_scores = np.zeros((n_gammas, n_iter))
gammas = np.logspace(-7, -1, n_gammas)

for i, gamma in enumerate(gammas):
    for j, (train, test) in enumerate(cv):
        clf = svm.SVC(C=10, gamma=gamma).fit(X[train], y[train])
        train_scores[i, j] = clf.score(X[train], y[train])
        test_scores[i, j] = clf.score(X[test], y[test])

In [130]:
cv


Out[130]:
ShuffleSplit(1797, n_iter=5, test_size=500, indices=True, random_state=0)

In [131]:
type(cv)


Out[131]:
sklearn.cross_validation.ShuffleSplit

In [134]:
cv.train_size


Out[134]:
500

In [138]:
for i in range(n_iter):
    pl.semilogx(gammas, train_scores[:, i], alpha=0.4, lw=5, c='b')
    pl.semilogx(gammas, test_scores[:, i], alpha=0.4, lw=1, c='g')
pl.ylabel("score for SVC(C=10, gamma=gamma)")
pl.xlabel("gamma")


Out[138]:
<matplotlib.text.Text at 0xc0bf0cc>
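
Averaging the five iterations per gamma gives one clean curve per set and makes the sweet spot around gamma=1e-3 easier to read; a minimal sketch over the arrays computed above:

pl.semilogx(gammas, train_scores.mean(axis=1), lw=2, c='b', label='train (mean)')
pl.semilogx(gammas, test_scores.mean(axis=1), lw=2, c='g', label='test (mean)')
pl.ylabel("score for SVC(C=10, gamma=gamma)")
pl.xlabel("gamma")
pl.legend(loc='lower left')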

In [139]:
from sklearn.grid_search import GridSearchCV

In [144]:
from pprint import pprint
svc_params = {
    'C': np.logspace(-1, 2, 4),
    'gamma': np.logspace(-4, 0, 5),
}
pprint(svc_params)


{'C': array([   0.1,    1. ,   10. ,  100. ]),
 'gamma': array([  1.00000000e-04,   1.00000000e-03,   1.00000000e-02,
         1.00000000e-01,   1.00000000e+00])}

In [172]:
n_subsamples = 1000
X_small_train, y_small_train = X_train[:n_subsamples], y_train[:n_subsamples]

In [173]:
X_small_train.shape, y_small_train.shape


Out[173]:
((1000, 64), (1000,))

In [174]:
gs_svc = GridSearchCV(svm.SVC(), svc_params, cv=14, n_jobs=-1)

%time _ = gs_svc.fit(X_small_train, y_small_train)


CPU times: user 129.42 s, sys: 1.99 s, total: 131.42 s
Wall time: 161.22 s

In [175]:
gs_svc.best_estimator_


Out[175]:
SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
  shrinking=True, tol=0.001, verbose=False)

In [176]:
gs_svc.best_params_, gs_svc.best_score_


Out[176]:
({'C': 10.0, 'gamma': 0.001}, 0.9849653476414042)
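
GridSearchCV refits the best parameter combination on all of X_small_train by default (refit=True), so the fitted search object can score the held-out test split directly:

# generalization check of the refit best model on data the search never saw
print gs_svc.score(X_test, y_test)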

In [177]:
def display_scores(params, scores, append_star=False):
    """Format the mean score +/- std error for params"""
    params = ", ".join("{0}={1}".format(k, v)
                      for k, v in params.items())
    line = "{0}:\t{1:.3f} (+/-{2:.3f})".format(
        params, np.mean(scores), sem(scores))
    if append_star:
        line += " *"
    return line

def display_grid_scores(grid_scores, top=None):
    """Helper function to format a report on a grid of scores"""
    
    grid_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)
    if top is not None:
        grid_scores = grid_scores[:top]
        
    # Compute a threshold for starring models whose stderr
    # overlaps that of the best model:
    _, best_mean, best_scores = grid_scores[0]
    threshold = best_mean - 2 * sem(best_scores)
    
    for params, mean_score, scores in grid_scores:
        append_star = mean_score + 2 * sem(scores) > threshold
        print(display_scores(params, scores, append_star=append_star))

In [178]:
from scipy.stats import sem  # standard error of the mean

def mean_score(scores):
    """Print the empirical mean score and standard error of the mean."""
    return ("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores))

display_grid_scores(gs_svc.grid_scores_, top=20)


C=10.0, gamma=0.001:	0.985 (+/-0.004) *
C=100.0, gamma=0.001:	0.985 (+/-0.004) *
C=1.0, gamma=0.001:	0.984 (+/-0.004) *
C=100.0, gamma=0.0001:	0.984 (+/-0.004) *
C=10.0, gamma=0.0001:	0.982 (+/-0.005) *
C=1.0, gamma=0.0001:	0.967 (+/-0.006) *
C=0.1, gamma=0.001:	0.954 (+/-0.007)
C=0.1, gamma=0.0001:	0.775 (+/-0.009)
C=10.0, gamma=0.01:	0.732 (+/-0.012)
C=100.0, gamma=0.01:	0.732 (+/-0.012)
C=1.0, gamma=0.01:	0.713 (+/-0.014)
C=0.1, gamma=1.0:	0.125 (+/-0.010)
C=0.1, gamma=0.01:	0.112 (+/-0.001)
C=10.0, gamma=0.1:	0.111 (+/-0.002)
C=100.0, gamma=0.1:	0.111 (+/-0.002)
C=0.1, gamma=0.1:	0.109 (+/-0.002)
C=1.0, gamma=0.1:	0.109 (+/-0.002)
C=1.0, gamma=1.0:	0.109 (+/-0.002)
C=10.0, gamma=1.0:	0.109 (+/-0.002)
C=100.0, gamma=1.0:	0.109 (+/-0.002)

In [1]:
import pandas as pd
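
The pandas import suggests tabulating the search results; a hedged sketch, assuming the 0.14-era grid_scores_ list of (params, mean, scores) triples unpacked by display_grid_scores above, and the old DataFrame.sort API (newer pandas spells it sort_values):

# one row per parameter combination, sorted best-first
rows = [dict(params, mean_score=mean, std_err=sem(scores))
        for params, mean, scores in gs_svc.grid_scores_]
pd.DataFrame(rows).sort('mean_score', ascending=False).head(10)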

In [ ]: