In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
In [3]:
from sklearn.datasets import load_iris
In [4]:
# Load the iris dataset once and unpack its data / target arrays as X and y.
iris = load_iris()
X, y = iris.data, iris.target
In [5]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
In [6]:
# Cross-validate a default LinearSVC on (X, y) with cv=5; no `scoring` is
# passed, so the library's default scoring is used.
cross_val_score(LinearSVC(), X, y, cv=5)
Out[6]:
In [7]:
# Same cv=5 cross-validation, but scored with the "f1_macro" scorer.
cross_val_score(LinearSVC(), X, y, cv=5, scoring="f1_macro")
Out[7]:
Let's go to a binary task for a moment
In [8]:
# Reduce the labels modulo 2: even class ids become 0 and odd ones become 1,
# which yields a two-valued target. y itself is not modified.
y % 2
Out[8]:
In [9]:
# Cross-validate on the two-valued target y % 2; neither `cv` nor `scoring`
# is given, so the library defaults apply.
cross_val_score(LinearSVC(), X, y % 2)
Out[9]:
In [10]:
# Same two-valued target, scored with the "average_precision" scorer.
cross_val_score(LinearSVC(), X, y % 2, scoring="average_precision")
Out[10]:
In [11]:
# Same two-valued target, scored with the "roc_auc" scorer.
cross_val_score(LinearSVC(), X, y % 2, scoring="roc_auc")
Out[11]:
In [12]:
# List the string names that can be passed as `scoring` (the keys of SCORERS).
# NOTE(review): `sklearn.metrics.scorer.SCORERS` is a legacy location; recent
# scikit-learn releases appear to expose `sklearn.metrics.get_scorer_names()`
# instead -- confirm against the installed version before upgrading.
from sklearn.metrics.scorer import SCORERS
print(SCORERS.keys())
Implementing your own scoring metric:
In [13]:
def my_accuracy_scoring(est, X, y):
    """Return the fraction of samples in ``X`` that ``est`` labels correctly.

    The ``(estimator, X, y)`` signature lets this callable be passed directly
    as a ``scoring`` argument.

    Parameters
    ----------
    est : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Samples to predict on.
    y : array-like
        True labels, one per sample in ``X``.

    Returns
    -------
    float
        Mean of the element-wise matches between ``est.predict(X)`` and ``y``.
    """
    # Coerce both sides to arrays so ``==`` is always element-wise; comparing
    # two plain Python lists would collapse to a single bool instead.
    predictions = np.asarray(est.predict(X))
    return np.mean(predictions == np.asarray(y))
# Pass the custom callable (signature: estimator, X, y) as `scoring` instead
# of a scorer name.
cross_val_score(LinearSVC(), X, y, scoring=my_accuracy_scoring)
Out[13]:
In [14]:
def my_super_scoring(est, X, y):
    """Score ``est`` as its accuracy minus its fraction of non-zero coefficients.

    Of two equally accurate models, the one with more exactly-zero entries in
    ``coef_`` gets the higher score.

    Parameters
    ----------
    est : object
        Fitted estimator exposing ``predict(X)`` and a ``coef_`` attribute.
    X : array-like
        Samples to predict on.
    y : array-like
        True labels, one per sample in ``X``.

    Returns
    -------
    float
        ``mean(predictions == y) - mean(coef_ != 0)``.
    """
    # Arrays (not lists) on both sides keep the comparisons element-wise.
    accuracy = np.mean(np.asarray(est.predict(X)) == np.asarray(y))
    nonzero_fraction = np.mean(np.asarray(est.coef_) != 0)
    return accuracy - nonzero_fraction
In [15]:
from sklearn.grid_search import GridSearchCV

# Grid-search the LinearSVC penalty ('l1' vs 'l2') at a fixed C=.01, ranking
# the candidates with the custom my_super_scoring callable, then report the
# winning parameter setting.
y = iris.target
grid = GridSearchCV(
    estimator=LinearSVC(C=.01, dual=False),
    param_grid={'penalty': ['l1', 'l2']},
    scoring=my_super_scoring,
)
grid.fit(X, y)
print(grid.best_params_)
There are other ways to do cross-validation
In [16]:
from sklearn.cross_validation import ShuffleSplit

# 10 random train/test splits, each holding out 40% of the samples.
# random_state is fixed so the same splits are drawn on every run, and n_iter
# is passed by keyword to match how ShuffleSplit is called elsewhere in this
# notebook.
shuffle_split = ShuffleSplit(len(X), n_iter=10, test_size=.4, random_state=0)
cross_val_score(LinearSVC(), X, y, cv=shuffle_split)
Out[16]:
In [17]:
from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit
def plot_cv(cv, n_samples, y=None):
    """Draw a cross-validation scheme as a boolean (n_splits x n_samples) image.

    Each row corresponds to one split; an entry is True where that sample
    belongs to the split's test set.

    Parameters
    ----------
    cv : iterable or splitter
        Either an iterable of ``(train_indices, test_indices)`` pairs, or an
        object exposing ``split(X, y)`` that yields such pairs.
    n_samples : int
        Total number of samples; sets the width of every row.
    y : array-like, optional
        Labels forwarded to ``cv.split`` when ``cv`` is a splitter object.
        Ignored when ``cv`` is a plain iterable.
    """
    if hasattr(cv, "split"):
        # Splitter objects need data to split; a zero placeholder of the right
        # length stands in for X (presumably only its length is used -- confirm
        # for splitters that inspect feature values).
        splits = cv.split(np.zeros(n_samples), y)
    else:
        splits = cv

    masks = []
    for _, test in splits:
        mask = np.zeros(n_samples, dtype=bool)
        mask[test] = True
        masks.append(mask)
    plt.matshow(masks)
In [18]:
# Plot the test-set masks of a 5-fold StratifiedKFold built from the labels y.
plot_cv(StratifiedKFold(y, n_folds=5), len(y))
In [19]:
# Plot the test-set masks of a plain 5-fold KFold over len(iris.target) samples.
plot_cv(KFold(len(iris.target), n_folds=5), len(iris.target))
In [20]:
# 20 random splits with 20% of the samples held out in each; random_state is
# fixed so the plotted pattern is the same on every run.
plot_cv(ShuffleSplit(len(iris.target), n_iter=20, test_size=.2, random_state=0),
        len(iris.target))
In [ ]:
# %load solutions/cross_validation_iris.py
In [55]:
# Print LinearSVC cross-validation scores for a 5-fold KFold, then for a
# 3-fold KFold, over the same data.
kf = KFold(len(X), n_folds=5)
for folds in (kf, KFold(len(X), n_folds=3)):
    print(cross_val_score(LinearSVC(), X, y, cv=folds))
In [46]:
# Build a 5-fold stratified splitter from the labels and use it as the cv
# scheme for a LinearSVC cross-validation.
skf = StratifiedKFold(y=y, n_folds=5)
cross_val_score(estimator=LinearSVC(), X=X, y=y, cv=skf)
Out[46]:
In [51]:
# Plot the test-set masks of a 3-fold KFold over len(X) samples.
plot_cv(KFold(len(X), n_folds=3), len(y))
In [53]:
# Display the full label vector (presumably to inspect the label ordering that
# the KFold splits above operate on -- confirm intent).
y
Out[53]:
In [ ]: