In [1]:
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn import metrics

import numpy as np
import pandas as pd
import scipy.io as sio

load data


In [2]:
mat = sio.loadmat('./data/ex6data3.mat')
print(mat.keys())


dict_keys(['__version__', '__globals__', 'y', 'X', '__header__', 'yval', 'Xval'])

In [3]:
training = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
training['y'] = mat.get('y')

cv = pd.DataFrame(mat.get('Xval'), columns=['X1', 'X2'])
cv['y'] = mat.get('yval')

In [4]:
print(training.shape)
training.head()


(211, 3)
Out[4]:
X1 X2 y
0 -0.158986 0.423977 1
1 -0.347926 0.470760 1
2 -0.504608 0.353801 1
3 -0.596774 0.114035 1
4 -0.518433 -0.172515 1

In [5]:
print(cv.shape)
cv.head()


(200, 3)
Out[5]:
X1 X2 y
0 -0.353062 -0.673902 0
1 -0.227126 0.447320 1
2 0.092898 -0.753524 0
3 0.148243 -0.718473 0
4 -0.001512 0.162928 0

In [6]:
candidate = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]

In [7]:
# gamma to comply with sklearn parameter name
combination = [(C, gamma) for C in candidate for gamma in candidate]
len(combination)


Out[7]:
81

In [8]:
search = []

for C, gamma in combination:
    svc = svm.SVC(C=C, gamma=gamma)
    svc.fit(training[['X1', 'X2']], training['y'])
    search.append(svc.score(cv[['X1', 'X2']], cv['y']))

In [9]:
best_score = search[np.argmax(search)]
best_param = combination[np.argmax(search)]

print(best_score, best_param)


0.965 (0.3, 100)

In [10]:
best_svc = svm.SVC(C=100, gamma=0.3)
best_svc.fit(training[['X1', 'X2']], training['y'])
ypred = best_svc.predict(cv[['X1', 'X2']])

print(metrics.classification_report(cv['y'], ypred))


             precision    recall  f1-score   support

          0       0.92      0.96      0.94       113
          1       0.94      0.89      0.91        87

avg / total       0.93      0.93      0.92       200


In [11]:
parameters = {'C': candidate, 'gamma': candidate}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, n_jobs=-1)
clf.fit(training[['X1', 'X2']], training['y'])


Out[11]:
GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100], 'gamma': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [12]:
clf.best_params_


Out[12]:
{'C': 10, 'gamma': 30}

In [13]:
clf.best_score_


Out[13]:
0.90047393364928907

In [14]:
ypred = clf.predict(cv[['X1', 'X2']])
print(metrics.classification_report(cv['y'], ypred))


             precision    recall  f1-score   support

          0       0.95      0.96      0.96       113
          1       0.95      0.93      0.94        87

avg / total       0.95      0.95      0.95       200

curiouly... they are not the same result. What?

So the built in sklearn grid search is trying to find the best candidate from training set
However, when we were doing manual grid search, we train using training set, but we pick the best from cross validation set. This is the reason of difference.

I was wrong. That is not the reason

It turns out that GridSearch will appropriate part of data as CV and use it to find the best candidate.
So the reason for different result is just that GridSearch here is just using part of training data to train because it need part of data as cv set


In [ ]: