Classifier scoring and cross validation


In [ ]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits, load_breast_cancer
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split, KFold, StratifiedKFold
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [ ]:
# Load the digits dataset and keep features / labels under the names the
# following cells rely on.
digits = load_digits()
X, y = digits.data, digits.target

# Linear-kernel SVM reused by the cells below.
clf = SVC(C=1.0, kernel="linear")

# Refit on five independent random train/test splits; the printed accuracy
# changes from split to split, which motivates cross validation.
for iteration in range(1, 6):
    xtrain, xtest, ytrain, ytest = train_test_split(X, y)
    clf.fit(xtrain, ytrain)
    score = clf.score(xtest, ytest)
    print("Iteration %d" % iteration, "Accuracy: %f" % score)

Q: What is the true accuracy?

Introducing cross validation

CV Iteration 1: Samples 1-1200 in training, 1201 onwards in testing


In [ ]:
# Manual fold 1: the first 1200 rows train the model, the remainder tests it.
cut = 1200
xtrain, ytrain = X[:cut], y[:cut]
xtest, ytest = X[cut:], y[cut:]
clf.fit(xtrain, ytrain)
print(clf.score(xtest, ytest))

CV Iteration 2: Samples 601 onwards in training, 1-600 in testing


In [ ]:
# Manual fold 2: hold out the first 600 rows for testing, train on the rest.
xtest, ytest = X[:600], y[:600]
xtrain, ytrain = X[600:], y[600:]
clf.fit(xtrain, ytrain)
print(clf.score(xtest, ytest))

CV Iteration 3: Samples 1-600 and 1201 onwards in training, 601-1200 in testing


In [ ]:
# Manual fold 3: rows 600-1199 are held out; the rows on either side of that
# window are stitched together to form the training set.
xtest, ytest = X[600:1200], y[600:1200]
xtrain = np.concatenate([X[:600], X[1200:]], axis=0)
ytrain = np.concatenate([y[:600], y[1200:]])
clf.fit(xtrain, ytrain)
print(clf.score(xtest, ytest))

KFold cross validation


In [ ]:
from sklearn.cross_validation import KFold

# Six shuffled folds over every row of X (old-style API: the iterator is
# built from the sample count rather than from the data itself).
n_samples = X.shape[0]
kfold = KFold(n=n_samples, n_folds=6, shuffle=True)

In [ ]:
# Fit on each fold's training indices and report accuracy on its held-out part.
for train_index, test_index in kfold:
    xtrain, ytrain = X[train_index], y[train_index]
    xtest, ytest = X[test_index], y[test_index]
    clf.fit(xtrain, ytrain)
    print(clf.score(xtest, ytest))

Exercise: Try KFold cross validation on the following dataset:


In [ ]:
# Build an imbalanced two-class toy dataset: 900 points of class 0 around the
# origin and 100 points of class 1 around (0.75, 0.75).
#
# The covariance matrices must be symmetric positive semi-definite. The
# previous matrices had zeros on the diagonal and the variance on the
# off-diagonal, which is not a valid covariance (NumPy warns and falls back to
# an SVD-based draw). Putting the variances on the diagonal gives a valid,
# isotropic covariance with the same per-axis spread.
np.random.seed(42)  # fixed seed so the exercise data is reproducible
_x1 = np.random.multivariate_normal(mean=[0, 0], cov=0.5 * np.eye(2), size=(900,))
_x2 = np.random.multivariate_normal(mean=[0.75, 0.75], cov=0.125 * np.eye(2), size=(100,))
X = np.r_[_x1, _x2]

# Labels follow the stacking order: first 900 rows are class 0, last 100 class 1.
y = np.zeros((X.shape[0],))
y[900:] = 1

# Shuffle rows and labels with the same permutation so classes are interleaved.
rand_ix = np.arange(1000)
np.random.shuffle(rand_ix)
X = X[rand_ix, :]
y = y[rand_ix]

In [ ]:
# enter code here

Q: What can we do to reduce variation in scores?


In [ ]:
# Stratified folds are built from the labels, so each fold keeps roughly the
# same class proportions as y.
kfold = StratifiedKFold(y, n_folds=6, shuffle=True)
for train_index, test_index in kfold:
    xtrain, xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    clf.fit(xtrain, ytrain)
    print(clf.score(xtest, ytest))

Putting it all together


In [ ]:
from sklearn.cross_validation import cross_val_score

# cross_val_score runs the fit/score loop for us; the cell displays the
# per-fold scores for six stratified folds.
stratified_folds = StratifiedKFold(y, n_folds=6)
cross_val_score(clf, X, y, cv=stratified_folds)

Using Cross Validation to select hyperparameters


In [ ]:
# Switch X / y back from the toy dataset to the digits data.
X, y = digits.data, digits.target

In [ ]:
# Baseline: linear SVM with its default C, scored on six stratified folds.
clf = SVC(kernel="linear")
baseline_folds = StratifiedKFold(y, n_folds=6)
cross_val_score(clf, X, y, cv=baseline_folds)

In [ ]:
# Sweep the SVM penalty parameter C over ten log-spaced values in [1e-10, 1]
# and record the mean cross-validated score for each value.
Cs = np.logspace(-10, 0, 10)
accuracies = []
for C in Cs:
    clf = SVC(C=C, kernel="linear")
    # No cv argument is passed, so cross_val_score uses its default splitter.
    acc = cross_val_score(clf, X, y)
    accuracies.append(acc.mean())

# Plot on a log x-axis because the candidate values span ten decades.
plt.semilogx(Cs, accuracies)
# The swept quantity is C itself (regularization gets weaker as C grows), so
# the axis is labelled C rather than lambda.
plt.xlabel("C (inverse regularization strength)")
plt.ylabel("Mean score");

Exercise: Find the optimal regularization parameter for LogisticRegression on the breast cancer dataset


In [ ]:
# Replace X / y with the breast cancer dataset for the exercise below.
bc = load_breast_cancer()
X, y = bc.data, bc.target

In [ ]:
# enter code here

Automating Hyperparameter Selection


In [ ]:
from sklearn.grid_search import GridSearchCV

In [ ]:
# Search space: a single hyperparameter, C, over the values swept earlier.
grid = dict(C=Cs)

In [ ]:
# Wrap the current estimator in a grid search; n_jobs=-1 asks for all
# available cores.
gcv = GridSearchCV(
    clf,
    param_grid=grid,
    n_jobs=-1,
)

In [ ]:
# Run the grid search on the current X / y (the most recently assigned dataset).
gcv.fit(X, y)

In [ ]:
# Display the estimator configured with the best-scoring parameters.
gcv.best_estimator_

In [ ]:
# Display the winning parameter combination from the grid.
gcv.best_params_

In [ ]:
# Display the cross-validated score achieved by the winning combination.
gcv.best_score_

Exercise: Find best parameters for SVC for digits dataset using the following grid:


In [ ]:
# Exercise setup: back to the digits data, with a grid over both C and the
# kernel type.
X = digits.data
y = digits.target
# The kernel list previously contained 'linear' twice, which made the search
# evaluate the same configuration two times; the duplicate is replaced with
# 'rbf' so three distinct kernels are compared.
grid = {"C": Cs, "kernel": ['linear', 'poly', 'rbf']}

In [ ]:
# enter code here

In [ ]:
# Display the per-parameter-setting scores recorded by the grid search.
# NOTE(review): grid_scores_ belongs to the old sklearn.grid_search API used in
# this notebook; newer scikit-learn releases expose cv_results_ instead.
gcv.grid_scores_