Model Selection via Validation


In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import model_selection
from sklearn import metrics

from sklearn.datasets import load_digits
digits = load_digits()

X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=0)

# fit each classifier with its default parameters and compare
# macro-averaged F1 scores on the held-out test set
for Model in [GaussianNB, KNeighborsClassifier, LinearSVC]:
    clf = Model().fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('%s: %s' %
          (Model.__name__, metrics.f1_score(y_test, y_pred, average="macro")))


GaussianNB: 0.833274168101
KNeighborsClassifier: 0.980456280495
LinearSVC: 0.932654051787
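
To see which digits the strongest model still confuses, we can inspect its confusion matrix. A minimal sketch, assuming the variables from the cell above are still in scope (not part of the original run, so no output is shown):

In [ ]:
# refit the best-scoring model and inspect its per-class errors
clf = KNeighborsClassifier().fit(X_train, y_train)
print(metrics.confusion_matrix(y_test, clf.predict(X_test)))
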

Cross-validation


In [8]:
clf = KNeighborsClassifier()
from sklearn.model_selection import cross_val_score
cross_val_score(clf, X, y, cv=5)


Out[8]:
array([ 0.9478022 ,  0.9558011 ,  0.96657382,  0.98039216,  0.96338028])
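
A common way to summarize these fold scores is their mean and standard deviation; a minimal sketch, reusing clf, X and y from above:

In [ ]:
scores = cross_val_score(clf, X, y, cv=5)
# report mean +/- standard deviation across the five folds
print("%.3f +/- %.3f" % (scores.mean(), scores.std()))
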

We can use different splitting strategies, such as random splitting. (There exist many different cross-validation strategies in scikit-learn; they are often useful for taking non-i.i.d. datasets into account.)


In [10]:
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=5)
cross_val_score(clf, X, y, cv=cv)


Out[10]:
array([ 0.98888889,  0.97777778,  0.98888889,  0.97222222,  0.97222222])
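
Other strategies follow the same pattern; for instance, a stratified K-fold keeps the class proportions of y roughly constant in each fold. A minimal sketch, reusing clf, X and y from above:

In [ ]:
from sklearn.model_selection import StratifiedKFold
# each fold preserves the relative frequency of the ten digit classes
cv = StratifiedKFold(n_splits=5)
cross_val_score(clf, X, y, cv=cv)
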

Hyperparameter optimization with cross-validation


In [11]:
from sklearn.datasets import load_diabetes
data = load_diabetes()
X, y = data.data, data.target
print(X.shape)


(442, 10)


In [12]:
from sklearn.linear_model import Ridge, Lasso

for Model in [Ridge, Lasso]:
    model = Model()
    print('%s: %s' % (Model.__name__, cross_val_score(model, X, y).mean()))


Ridge: 0.409427438303
Lasso: 0.353800083299
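
Both models were fit with their default regularization strength alpha=1.0, which is unlikely to be optimal. Below is a hedged sketch of tuning alpha with GridSearchCV; the grid of candidate values is an arbitrary illustration, not a recommendation:

In [ ]:
import numpy as np
from sklearn.model_selection import GridSearchCV

for Model in [Ridge, Lasso]:
    # exhaustively try each alpha on the grid, scored by 5-fold CV
    gscv = GridSearchCV(Model(), {'alpha': np.logspace(-3, 1, 10)}, cv=5)
    gscv.fit(X, y)
    print('%s: best alpha=%s, score=%s' %
          (Model.__name__, gscv.best_params_['alpha'], gscv.best_score_))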