Homework: https://work.caltech.edu/homework/hw8.pdf

✔ Answers:

  1. e, but not so sure --> d per the answer key ✗
  2. a ✔
  3. a ✔
  4. c ✔
  5. d ✔
  6. b ✔
  7. b ✔
  8. c ✔
  9. e ✔
  10. c ✔

Answer key: https://work.caltech.edu/homework/hw8_sol.pdf
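
Setup note: the problems below use the zip-code digit features (digit, intensity, symmetry) with scikit-learn's soft-margin svm.SVC. With kernel='poly', gamma=1.0 and coef0=1.0, the kernel sklearn computes is K(x, x') = (1 + x^T x')^Q with Q the degree parameter; kernel='rbf' computes K(x, x') = exp(-gamma * ||x - x'||^2).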


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
from sklearn import svm, cross_validation  # pre-0.18 sklearn; cross_validation was later split into model_selection

In [71]:
train = pd.read_fwf('http://www.amlbook.com/data/zip/features.train', header=None, 
                    names=['digit', 'intensity', 'symmetry'])
test = pd.read_fwf('http://www.amlbook.com/data/zip/features.test', header=None, 
                   names=['digit', 'intensity', 'symmetry'])

In [72]:
display(train.describe())
display(train.head())
display(test.describe())
display(test.head())


train.describe():
             digit    intensity     symmetry
count  7291.000000  7291.000000  7291.000000
mean      3.903443     0.254481    -3.403779
std       2.996386     0.092944     1.492641
min       0.000000     0.046820    -7.326688
25%       1.000000     0.185008    -4.466406
50%       4.000000     0.245305    -3.581875
75%       7.000000     0.313678    -2.560031
max       9.000000     0.655941    -0.119500

train.head():
   digit  intensity  symmetry
0      6   0.341092 -4.528937
1      5   0.444131 -5.496812
2      4   0.231002 -2.886750
3      7   0.200275 -3.534375
4      3   0.291936 -4.352062

test.describe():
             digit    intensity     symmetry
count  2007.000000  2007.000000  2007.000000
mean      3.850523     0.267609    -3.450593
std       3.018484     0.099506     1.479704
min       0.000000     0.057043    -7.700000
25%       1.000000     0.191987    -4.482000
50%       4.000000     0.256168    -3.605625
75%       6.000000     0.332122    -2.597500
max       9.000000     0.642658    -0.189062

test.head():
   digit  intensity  symmetry
0      9   0.272178 -4.847937
1      6   0.265133 -5.102000
2      3   0.335926 -2.921562
3      6   0.264850 -4.156625
4      6   0.345338 -6.718438

In [75]:
X_in = train.iloc[:, 1:]
y_in = train.iloc[:, 0]

X_out = test.iloc[:, 1:]
y_out = test.iloc[:, 0]

In [83]:
def get_E_in(target, X_in, y_in):
    """Train a one-vs-all polynomial-kernel SVM for `target`; return (classifier, E_in in %)."""
    classifier = svm.SVC(C=0.01,          # soft-margin penalty parameter
                         kernel='poly',   # polynomial kernel: (gamma*<x, x'> + coef0)^degree
                         degree=2,        # polynomial degree Q
                         gamma=1.0,
                         coef0=1.0,       # independent term in the kernel
                         shrinking=False, # disable the shrinking heuristic
                         decision_function_shape='ovr')
    # One-vs-all labels: every non-target digit becomes -1; the target digit
    # keeps its own label, so the problem is binary either way.
    y = y_in.copy()
    y[y != target] = -1
    classifier.fit(X_in, y)
    y_pred = classifier.predict(X_in)
    misclassified = y_pred != y
    return classifier, sum(misclassified)*100./len(misclassified)

In [84]:
for target in [0, 2, 4, 6, 8]:
    clf, E_in = get_E_in(target, X_in, y_in)
    print "E_in({}) = {:.2f}%, N_SVs = {}".format(target, E_in, clf.n_support_)


E_in(0) = 10.59%, N_SVs = [1090 1089]
E_in(2) = 10.03%, N_SVs = [1258  731]
E_in(4) = 8.94%, N_SVs = [1223  652]
E_in(6) = 9.11%, N_SVs = [1228  664]
E_in(8) = 7.43%, N_SVs = [1239  542]

In [85]:
for target in [1, 3, 5, 7, 9]:
    clf, E_in = get_E_in(target, X_in, y_in)
    print "E_in({}) = {:.2f}%, N_SVs = {}".format(target, E_in, clf.n_support_)


E_in(1) = 1.44%, N_SVs = [193 193]
E_in(3) = 9.02%, N_SVs = [1288  658]
E_in(5) = 7.63%, N_SVs = [1046  556]
E_in(7) = 8.85%, N_SVs = [1070  645]
E_in(9) = 8.83%, N_SVs = [1341  644]
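
So 0-vs-all has the highest E_in among the even digits and 1-vs-all the lowest among the odd ones. To read questions 2 and 3 off programmatically, a quick sketch reusing get_E_in and the data defined above (it recomputes all ten classifiers, so it takes a moment):

In [ ]:
E_ins = {t: get_E_in(t, X_in, y_in)[1] for t in range(10)}
print "Highest E_in among even digits:", max([0, 2, 4, 6, 8], key=E_ins.get)  # 0 -> question 2
print "Lowest E_in among odd digits:", min([1, 3, 5, 7, 9], key=E_ins.get)    # 1 -> question 3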

In [86]:
print "Difference in support vectors is: {}".format(1090 + 1089 - 193 - 193)


Difference in number of SVs between 0-vs-all and 1-vs-all: 1793

1793 is closest to 1800, which is what question 4 asks for.

In [102]:
def computeError(clf, X, y):
    # Misclassification rate of clf on (X, y), in percent
    y_pred = clf.predict(X)
    misclassified = y_pred != y
    return sum(misclassified)*100./len(misclassified)
        
def run_1_vs_5(train, test):
    train_subset = train.loc[train.digit.isin([1, 5])]
    X_in = train_subset.iloc[:, 1:]
    y_in = train_subset.iloc[:, 0]

    test_subset = test.loc[test.digit.isin([1, 5])]
    X_out = test_subset.iloc[:, 1:]
    y_out = test_subset.iloc[:, 0]
    
    for Q_val in [2, 5]:
        for C_val in [0.0001, 0.001, 0.01, 0.1, 1]:
            classifier = svm.SVC(C=C_val,        # soft-margin penalty parameter
                                 kernel='poly',  # polynomial kernel: (gamma*<x, x'> + coef0)^degree
                                 degree=Q_val,   # polynomial degree Q
                                 gamma=1.0,
                                 coef0=1.0,      # independent term in the kernel
                                 shrinking=False,
                                 decision_function_shape='ovr')

            classifier.fit(X_in, y_in)
            E_in = computeError(classifier, X_in, y_in)
            E_out = computeError(classifier, X_out, y_out)
            print "Q: {}, C: {:.4f}, SV_s: {}, E_in: {:.2f}, E_out: {:.2f}".format(Q_val, C_val, classifier.n_support_, E_in, E_out)

run_1_vs_5(train, test)


Q: 2, C: 0.0001, SV_s: [118 118], E_in: 0.90, E_out: 1.65
Q: 2, C: 0.0010, SV_s: [38 38], E_in: 0.45, E_out: 1.65
Q: 2, C: 0.0100, SV_s: [17 17], E_in: 0.45, E_out: 1.89
Q: 2, C: 0.1000, SV_s: [12 12], E_in: 0.45, E_out: 1.89
Q: 2, C: 1.0000, SV_s: [12 12], E_in: 0.32, E_out: 1.89
Q: 5, C: 0.0001, SV_s: [13 13], E_in: 0.45, E_out: 1.89
Q: 5, C: 0.0010, SV_s: [12 13], E_in: 0.45, E_out: 2.12
Q: 5, C: 0.0100, SV_s: [11 12], E_in: 0.38, E_out: 2.12
Q: 5, C: 0.1000, SV_s: [11 14], E_in: 0.32, E_out: 1.89
Q: 5, C: 1.0000, SV_s: [10 11], E_in: 0.32, E_out: 2.12
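
Reading off this table: for Q = 2, the largest C (C = 1) attains the lowest E_in (question 5), and at C = 0.001 the support-vector count drops from 76 at Q = 2 to 25 at Q = 5 (question 6).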

In [132]:
# Sanity check of the 10-fold CV mechanics on the *full* training set: the
# labels here are all ten digits (not binarized), hence the large errors below.
classifier = svm.SVC(C=.01,          # soft-margin penalty parameter
                     kernel='poly',  # polynomial kernel: (gamma*<x, x'> + coef0)^degree
                     degree=2,
                     gamma=1.0,
                     coef0=1.0,
                     shrinking=False,
                     decision_function_shape='ovr')
E_CV_s = []
skf = cross_validation.KFold(len(y_in), n_folds=10)
print len(X_in)
for train_ind, test_ind in skf:
    print len(train_ind), len(X_in.iloc[train_ind, :])
    print len(test_ind), len(X_in.iloc[test_ind, :])
    print
    classifier.fit(X_in.iloc[train_ind, :], y_in.iloc[train_ind])
    E_CV_s.append(computeError(classifier, X_in.iloc[test_ind, :], y_in.iloc[test_ind]))
print E_CV_s
E_cv = float(sum(E_CV_s))/len(E_CV_s)
print E_cv


7291
6561 6561
730 730

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

6562 6562
729 729

[60.0, 57.201646090534979, 54.320987654320987, 65.843621399176953, 68.724279835390945, 61.454046639231827, 65.02057613168725, 60.493827160493829, 63.923182441700959, 58.436213991769549]
61.5418381344

In [147]:
def run_1_vs_5_with_CV(train, test):
    train_subset = train.loc[train.digit.isin([1, 5])]
    X_in = train_subset.iloc[:, 1:]
    y_in = train_subset.iloc[:, 0]

    print "Len of train", (len(train))
    print "Len of train subset", len(train_subset)
    print "Len of X_in and y_in", len(X_in), len(y_in)
    
    test_subset = test.loc[test.digit.isin([1, 5])]
    X_out = test_subset.iloc[:, 1:]
    y_out = test_subset.iloc[:, 0]
    
    Q_val = 2
    chosen = {}
    for i in range(100):
        chosen_C = None
        chosen_E_cv = None
        for C_val in [0.0001, 0.001, 0.01, 0.1, 1]:
            classifier = svm.SVC(C=C_val,        # soft-margin penalty parameter
                                 kernel='poly',  # polynomial kernel: (gamma*<x, x'> + coef0)^degree
                                 degree=Q_val,   # Q = 2 here
                                 gamma=1.0,
                                 coef0=1.0,      # independent term in the kernel
                                 shrinking=False,
                                 decision_function_shape='ovr')
            E_CV_s = []
            skf = cross_validation.KFold(len(y_in), n_folds=10)
            for train_ind, test_ind in skf:
                classifier.fit(X_in.iloc[train_ind, :], y_in.iloc[train_ind])
                E_CV_s.append(computeError(classifier, X_in.iloc[test_ind, :], y_in.iloc[test_ind]))
            E_cv = float(sum(E_CV_s))/len(E_CV_s)
            if chosen_C is None or chosen_E_cv > E_cv:  # strict '>' keeps the smaller C on ties
                chosen_C = C_val
                chosen_E_cv = E_cv
        if chosen_C not in chosen:
            chosen[chosen_C] = []
        chosen[chosen_C].append(chosen_E_cv)
    for k, v in chosen.iteritems():
        print "C = {}: selected {} times, avg E_cv = {:.4f}%".format(k, len(v), sum(v)*1./len(v))

run_1_vs_5_with_CV(train, test)


Len of full training set: 7291
Len of 1-vs-5 subset: 1561
Len of X_in and y_in: 1561 1561
C = 0.001: selected 100 times, avg E_cv = 0.4483%
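
Caveat: the pre-0.18 cross_validation.KFold does not shuffle by default, so all 100 runs above see the identical partition and pick the same C = 0.001 with the same average E_cv of about 0.448% (≈ 0.005; questions 7 and 8). A minimal sketch of varying the partition on every run, assuming the old API's shuffle and random_state arguments:

In [ ]:
for i in range(100):
    # hypothetical variation: a fresh shuffled 10-fold partition per run
    skf = cross_validation.KFold(len(y_in), n_folds=10, shuffle=True, random_state=i)
    # ... then the same inner loop over C values as in run_1_vs_5_with_CV above ...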

In [115]:
def run_1_vs_5_RBF(train, test):
    train_subset = train.loc[train.digit.isin([1, 5])]
    X_in = train_subset.iloc[:, 1:]
    y_in = train_subset.iloc[:, 0]

    test_subset = test.loc[test.digit.isin([1, 5])]
    X_out = test_subset.iloc[:, 1:]
    y_out = test_subset.iloc[:, 0]
    
    for C_val in [0.01, 1, 100, 10 ** 4, 10 ** 6]:
        classifier = svm.SVC(C=C_val,       # soft-margin penalty parameter
                             kernel='rbf',  # RBF kernel: exp(-gamma * ||x - x'||^2)
                             gamma=1.0,
                             shrinking=False,
                             decision_function_shape='ovr')
        classifier.fit(X_in, y_in)
        E_in = computeError(classifier, X_in, y_in)
        E_out = computeError(classifier, X_out, y_out)
        print "C: {:.4f}, SV_s: {}, E_in: {:.2f}, E_out: {:.2f}".format(C_val, classifier.n_support_, E_in, E_out)

run_1_vs_5_RBF(train, test)


C: 0.0100, SV_s: [200 203], E_in: 0.38, E_out: 2.36
C: 1.0000, SV_s: [14 17], E_in: 0.45, E_out: 2.12
C: 100.0000, SV_s: [ 8 14], E_in: 0.32, E_out: 1.89
C: 10000.0000, SV_s: [ 7 12], E_in: 0.26, E_out: 2.36
C: 1000000.0000, SV_s: [8 9], E_in: 0.06, E_out: 2.36
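
Reading off the RBF table: C = 10^6 gives the lowest E_in at 0.06% (question 9), and C = 100 gives the lowest E_out at 1.89% (question 10).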
