Homework: https://work.caltech.edu/homework/hw8.pdf

✔ Answers:

  1. e, but not so sure --> d per the answer key ✗
  2. a ✔
  3. a ✔
  4. c ✔
  5. d ✔
  6. b ✔
  7. b ✔
  8. c ✔
  9. e ✔
  10. c ✔

Answer key: https://work.caltech.edu/homework/hw8_sol.pdf
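
Setup note: the problems below use the zip-code digit features (digit, intensity, symmetry) with scikit-learn's soft-margin svm.SVC. With kernel='poly', gamma=1.0 and coef0=1.0, the kernel sklearn computes is K(x, x') = (1 + x^T x')^Q with Q the degree parameter; kernel='rbf' computes K(x, x') = exp(-gamma * ||x - x'||^2).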


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
from sklearn import svm, cross_validation  # pre-0.18 sklearn; cross_validation was later split into model_selection

In [71]:
train = pd.read_fwf('http://www.amlbook.com/data/zip/features.train', header=None, 
                    names=['digit', 'intensity', 'symmetry'])
test = pd.read_fwf('http://www.amlbook.com/data/zip/features.test', header=None, 
                   names=['digit', 'intensity', 'symmetry'])

In [72]:
display(train.describe())
display(train.head())
display(test.describe())
display(test.head())


train.describe():
             digit    intensity     symmetry
count  7291.000000  7291.000000  7291.000000
mean      3.903443     0.254481    -3.403779
std       2.996386     0.092944     1.492641
min       0.000000     0.046820    -7.326688
25%       1.000000     0.185008    -4.466406
50%       4.000000     0.245305    -3.581875
75%       7.000000     0.313678    -2.560031
max       9.000000     0.655941    -0.119500

train.head():
   digit  intensity  symmetry
0      6   0.341092 -4.528937
1      5   0.444131 -5.496812
2      4   0.231002 -2.886750
3      7   0.200275 -3.534375
4      3   0.291936 -4.352062

test.describe():
             digit    intensity     symmetry
count  2007.000000  2007.000000  2007.000000
mean      3.850523     0.267609    -3.450593
std       3.018484     0.099506     1.479704
min       0.000000     0.057043    -7.700000
25%       1.000000     0.191987    -4.482000
50%       4.000000     0.256168    -3.605625
75%       6.000000     0.332122    -2.597500
max       9.000000     0.642658    -0.189062

test.head():
   digit  intensity  symmetry
0      9   0.272178 -4.847937
1      6   0.265133 -5.102000
2      3   0.335926 -2.921562
3      6   0.264850 -4.156625
4      6   0.345338 -6.718438

In [75]:
X_in = train.iloc[:, 1:]
y_in = train.iloc[:, 0]

X_out = test.iloc[:, 1:]
y_out = test.iloc[:, 0]

In [83]:
def get_E_in(target, X_in, y_in):
    """Train a one-vs-all polynomial-kernel SVM for `target`; return (classifier, E_in in %)."""
    classifier = svm.SVC(C=0.01,          # soft-margin penalty parameter
                         kernel='poly',   # polynomial kernel: (gamma*<x, x'> + coef0)^degree
                         degree=2,        # polynomial degree Q
                         gamma=1.0,
                         coef0=1.0,       # independent term in the kernel
                         shrinking=False, # disable the shrinking heuristic
                         decision_function_shape='ovr')
    # One-vs-all labels: every non-target digit becomes -1; the target digit
    # keeps its own label, so the problem is binary either way.
    y = y_in.copy()
    y[y != target] = -1
    classifier.fit(X_in, y)
    y_pred = classifier.predict(X_in)
    misclassified = y_pred != y
    return classifier, sum(misclassified)*100./len(misclassified)

In [84]:
for target in [0, 2, 4, 6, 8]:
    clf, E_in = get_E_in(target, X_in, y_in)
    print "E_in({}) = {:.2f}%, N_SVs = {}".format(target, E_in, clf.n_support_)


E_in(0) = 10.59%, N_SVs = [1090 1089]
E_in(2) = 10.03%, N_SVs = [1258  731]
E_in(4) = 8.94%, N_SVs = [1223  652]
E_in(6) = 9.11%, N_SVs = [1228  664]
E_in(8) = 7.43%, N_SVs = [1239  542]

In [85]:
for target in [1, 3, 5, 7, 9]:
    clf, E_in = get_E_in(target, X_in, y_in)
    print "E_in({}) = {:.2f}%, N_SVs = {}".format(target, E_in, clf.n_support_)


E_in(1) = 1.44%, N_SVs = [193 193]
E_in(3) = 9.02%, N_SVs = [1288  658]
E_in(5) = 7.63%, N_SVs = [1046  556]
E_in(7) = 8.85%, N_SVs = [1070  645]
E_in(9) = 8.83%, N_SVs = [1341  644]
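
So 0-vs-all has the highest E_in among the even digits and 1-vs-all the lowest among the odd ones. To read questions 2 and 3 off programmatically, a quick sketch reusing get_E_in and the data defined above (it recomputes all ten classifiers, so it takes a moment):

In [ ]:
E_ins = {t: get_E_in(t, X_in, y_in)[1] for t in range(10)}
print "Highest E_in among even digits:", max([0, 2, 4, 6, 8], key=E_ins.get)  # 0 -> question 2
print "Lowest E_in among odd digits:", min([1, 3, 5, 7, 9], key=E_ins.get)    # 1 -> question 3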

In [86]:
print "Difference in support vectors is: {}".format(1090 + 1089 - 193 - 193)


Difference in number of SVs between 0-vs-all and 1-vs-all: 1793

1793 is closest to 1800, which is what question 4 asks for.

In [102]:
def computeError(clf, X, y):
    # Misclassification rate of clf on (X, y), in percent
    y_pred = clf.predict(X)
    misclassified = y_pred != y
    return sum(misclassified)*100./len(misclassified)
        
def run_1_vs_5(train, test):
    train_subset = train.loc[train.digit.isin([1, 5])]
    X_in = train_subset.iloc[:, 1:]
    y_in = train_subset.iloc[:, 0]

    test_subset = test.loc[test.digit.isin([1, 5])]
    X_out = test_subset.iloc[:, 1:]
    y_out = test_subset.iloc[:, 0]
    
    for Q_val in [2, 5]:
        for C_val in [0.0001, 0.001, 0.01, 0.1, 1]:
            classifier = svm.SVC(C=C_val,        # soft-margin penalty parameter
                                 kernel='poly',  # polynomial kernel: (gamma*<x, x'> + coef0)^degree
                                 degree=Q_val,   # polynomial degree Q
                                 gamma=1.0,
                                 coef0=1.0,      # independent term in the kernel
                                 shrinking=False,
                                 decision_function_shape='ovr')

            classifier.fit(X_in, y_in)
            E_in = computeError(classifier, X_in, y_in)
            E_out = computeError(classifier, X_out, y_out)
            print "Q: {}, C: {:.4f}, SV_s: {}, E_in: {:.2f}, E_out: {:.2f}".format(Q_val, C_val, classifier.n_support_, E_in, E_out)

run_1_vs_5(train, test)


Q: 2, C: 0.0001, SV_s: [118 118], E_in: 0.90, E_out: 1.65
Q: 2, C: 0.0010, SV_s: [38 38], E_in: 0.45, E_out: 1.65
Q: 2, C: 0.0100, SV_s: [17 17], E_in: 0.45, E_out: 1.89
Q: 2, C: 0.1000, SV_s: [12 12], E_in: 0.45, E_out: 1.89
Q: 2, C: 1.0000, SV_s: [12 12], E_in: 0.32, E_out: 1.89
Q: 5, C: 0.0001, SV_s: [13 13], E_in: 0.45, E_out: 1.89
Q: 5, C: 0.0010, SV_s: [12 13], E_in: 0.45, E_out: 2.12
Q: 5, C: 0.0100, SV_s: [11 12], E_in: 0.38, E_out: 2.12
Q: 5, C: 0.1000, SV_s: [11 14], E_in: 0.32, E_out: 1.89
Q: 5, C: 1.0000, SV_s: [10 11], E_in: 0.32, E_out: 2.12
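
Reading off this table: for Q = 2, the largest C (C = 1) attains the lowest E_in (question 5), and at C = 0.001 the support-vector count drops from 76 at Q = 2 to 25 at Q = 5 (question 6).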

In [132]:
# Sanity check of the 10-fold CV mechanics on the *full* training set: the
# labels here are all ten digits (not binarized), hence the large errors below.
classifier = svm.SVC(C=.01,          # soft-margin penalty parameter
                     kernel='poly',  # polynomial kernel: (gamma*<x, x'> + coef0)^degree
                     degree=2,
                     gamma=1.0,
                     coef0=1.0,
                     shrinking=False,
                     decision_function_shape='ovr')
E_CV_s = []
skf = cross_validation.KFold(len(y_in), n_folds=10)
print len(X_in)
for train_ind, test_ind in skf:
    print len(train_ind), len(X_in.iloc[train_ind, :])
    print len(test_ind), len(X_in.iloc[test_ind, :])
    print
    classifier.fit(X_in.iloc[train_ind, :], y_in.iloc[train_ind])
    E_CV_s.append(computeError(classifier, X_in.iloc[test_ind, :], y_in.iloc[test_ind]))
print E_CV_s
E_cv = float(sum(E_CV_s))/len(E_CV_s)
print E_cv


7291
6561 6561
730 730

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

[60.0, 57.201646090534979, 54.320987654320987, 65.843621399176953, 68.724279835390945, 61.454046639231827, 65.02057613168725, 60.493827160493829, 63.923182441700959, 58.436213991769549]
61.5418381344

In [147]:
def run_1_vs_5_with_CV(train, test):
    train_subset = train.loc[train.digit.isin([1, 5])]
    X_in = train_subset.iloc[:, 1:]
    y_in = train_subset.iloc[:, 0]

    print "Len of train", (len(train))
    print "Len of train subset", len(train_subset)
    print "Len of X_in and y_in", len(X_in), len(y_in)
    
    test_subset = test.loc[test.digit.isin([1, 5])]
    X_out = test_subset.iloc[:, 1:]
    y_out = test_subset.iloc[:, 0]
    
    Q_val = 2
    chosen = {}
    for i in range(100):
        chosen_C = None
        chosen_E_cv = None
        for C_val in [0.0001, 0.001, 0.01, 0.1, 1]:
            classifier = svm.SVC(C=C_val,        # soft-margin penalty parameter
                                 kernel='poly',  # polynomial kernel: (gamma*<x, x'> + coef0)^degree
                                 degree=Q_val,   # Q = 2 here
                                 gamma=1.0,
                                 coef0=1.0,      # independent term in the kernel
                                 shrinking=False,
                                 decision_function_shape='ovr')
            E_CV_s = []
            skf = cross_validation.KFold(len(y_in), n_folds=10)
            for train_ind, test_ind in skf:
                classifier.fit(X_in.iloc[train_ind, :], y_in.iloc[train_ind])
                E_CV_s.append(computeError(classifier, X_in.iloc[test_ind, :], y_in.iloc[test_ind]))
            E_cv = float(sum(E_CV_s))/len(E_CV_s)
            if chosen_C is None or chosen_E_cv > E_cv:  # strict '>' keeps the smaller C on ties
                chosen_C = C_val
                chosen_E_cv = E_cv
        if chosen_C not in chosen:
            chosen[chosen_C] = []
        chosen[chosen_C].append(chosen_E_cv)
    for k, v in chosen.iteritems():
        print "C = {}: selected {} times, avg E_cv = {:.4f}%".format(k, len(v), sum(v)*1./len(v))

run_1_vs_5_with_CV(train, test)


Len of full training set: 7291
Len of 1-vs-5 subset: 1561
Len of X_in and y_in: 1561 1561
C = 0.001: selected 100 times, avg E_cv = 0.4483%
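
Caveat: the pre-0.18 cross_validation.KFold does not shuffle by default, so all 100 runs above see the identical partition and pick the same C = 0.001 with the same average E_cv of about 0.448% (≈ 0.005; questions 7 and 8). A minimal sketch of varying the partition on every run, assuming the old API's shuffle and random_state arguments:

In [ ]:
for i in range(100):
    # hypothetical variation: a fresh shuffled 10-fold partition per run
    skf = cross_validation.KFold(len(y_in), n_folds=10, shuffle=True, random_state=i)
    # ... then the same inner loop over C values as in run_1_vs_5_with_CV above ...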

In [115]:
def run_1_vs_5_RBF(train, test):
    train_subset = train.loc[train.digit.isin([1, 5])]
    X_in = train_subset.iloc[:, 1:]
    y_in = train_subset.iloc[:, 0]

    test_subset = test.loc[test.digit.isin([1, 5])]
    X_out = test_subset.iloc[:, 1:]
    y_out = test_subset.iloc[:, 0]
    
    for C_val in [0.01, 1, 100, 10 ** 4, 10 ** 6]:
        classifier = svm.SVC(C=C_val,       # soft-margin penalty parameter
                             kernel='rbf',  # RBF kernel: exp(-gamma * ||x - x'||^2)
                             gamma=1.0,
                             shrinking=False,
                             decision_function_shape='ovr')
        classifier.fit(X_in, y_in)
        E_in = computeError(classifier, X_in, y_in)
        E_out = computeError(classifier, X_out, y_out)
        print "C: {:.4f}, SV_s: {}, E_in: {:.2f}, E_out: {:.2f}".format(C_val, classifier.n_support_, E_in, E_out)

run_1_vs_5_RBF(train, test)


C: 0.0100, SV_s: [200 203], E_in: 0.38, E_out: 2.36
C: 1.0000, SV_s: [14 17], E_in: 0.45, E_out: 2.12
C: 100.0000, SV_s: [ 8 14], E_in: 0.32, E_out: 1.89
C: 10000.0000, SV_s: [ 7 12], E_in: 0.26, E_out: 2.36
C: 1000000.0000, SV_s: [8 9], E_in: 0.06, E_out: 2.36
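
Reading off the RBF table: C = 10^6 gives the lowest E_in at 0.06% (question 9), and C = 100 gives the lowest E_out at 1.89% (question 10).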
