From the video series: Introduction to machine learning with scikit-learn
Steps for cross-validation:
Benefits of cross-validation:
Drawbacks of cross-validation:
Goal: Select the best tuning parameters (aka "hyperparameters") for KNN on the iris dataset
In [2]:
    
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline
    
In [3]:
    
# load the bundled iris dataset
iris = load_iris()
# split the Bunch into the feature matrix X and the response vector y
X, y = iris.data, iris.target
    
In [4]:
    
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)  # one accuracy score per fold (10 values)
    
    
In [5]:
    
# use average accuracy across the 10 folds as an estimate of out-of-sample accuracy
print(scores.mean())
    
    
In [6]:
    
# search for an optimal value of K for KNN
k_range = list(range(1, 31))  # candidate values K = 1..30
k_scores = []                 # mean cross-validated accuracy for each K
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
print(k_scores)
    
    
In [7]:
    
# visualize how cross-validated accuracy (y-axis) varies with K (x-axis)
plt.plot(k_range, k_scores)
plt.ylabel('Cross-Validated Accuracy')
plt.xlabel('Value of K for KNN')
    
    Out[7]:
    
Allows you to define a grid of parameters that will be searched using K-fold cross-validation
In [8]:
    
# sklearn.grid_search was removed in sklearn 0.20; GridSearchCV now lives in model_selection
from sklearn.model_selection import GridSearchCV
    
In [9]:
    
# define the parameter values that should be searched
k_range = list(range(1, 31))  # explicit list so the values print legibly in Python 3
print(k_range)
    
    
In [10]:
    
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range)
print(param_grid)
    
    
In [11]:
    
# instantiate the grid (10-fold CV, scored by classification accuracy)
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=10)
    
n_jobs = -1 to run computations in parallel (if supported by your computer and OS)
In [12]:
    
# fit the grid with data (runs 10-fold CV for every candidate parameter value,
# so this step can be slow)
grid.fit(X, y)
    
    Out[12]:
In [13]:
    
# view the complete results
# (grid_scores_ was removed in sklearn 0.20; cv_results_ is a dict of arrays,
# with one entry per scored quantity and one array element per parameter setting)
grid.cv_results_
    
    Out[13]:
In [14]:
    
# examine the first tuple
print grid.grid_scores_[0].parameters
print grid.grid_scores_[0].cv_validation_scores
print grid.grid_scores_[0].mean_validation_score
    
    
In [15]:
    
# create a list of the mean scores only
# (mean_test_score in cv_results_ replaces the removed grid_scores_ named tuples)
grid_mean_scores = list(grid.cv_results_['mean_test_score'])
print(grid_mean_scores)
    
    
In [16]:
    
# plot mean cross-validated accuracy (y-axis) for each value of K (x-axis)
plt.plot(k_range, grid_mean_scores)
plt.ylabel('Cross-Validated Accuracy')
plt.xlabel('Value of K for KNN')
    
    Out[16]:
    
In [17]:
    
# examine the best model
print(grid.best_score_)      # highest mean cross-validated accuracy
print(grid.best_params_)     # parameter setting that achieved it
print(grid.best_estimator_)  # the fitted estimator with those parameters
    
    
Example: tuning max_depth and min_samples_leaf for a DecisionTreeClassifier. Tuning the parameters independently — varying max_depth while leaving min_samples_leaf at its default value, and vice versa — may miss the best combination, which is why the parameters should be searched simultaneously.
In [18]:
    
# define the parameter values that should be searched
k_range = list(range(1, 31))  # explicit list for legible printing in Python 3
weight_options = ['uniform', 'distance']
    
In [19]:
    
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, weights=weight_options)
print(param_grid)
    
    
In [20]:
    
# instantiate and fit the grid (searches all 30 x 2 = 60 parameter combinations)
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=10)
grid.fit(X, y)
    
    Out[20]:
In [21]:
    
# view the complete results
# (grid_scores_ was removed in sklearn 0.20; cv_results_ is the replacement)
grid.cv_results_
    
    Out[21]:
In [22]:
    
# examine the best model
print(grid.best_score_)   # highest mean cross-validated accuracy
print(grid.best_params_)  # best combination of n_neighbors and weights
    
    
In [23]:
    
# train your model using all data and the best known parameters
knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
knn.fit(X, y)
# make a prediction on out-of-sample data
# (predict requires a 2-D array: one row per observation, one column per feature)
knn.predict([[3, 5, 4, 2]])
    
    Out[23]:
In [24]:
    
# shortcut: GridSearchCV automatically refits the best model using all of the data,
# so you can predict with the grid object directly
# (predict requires a 2-D array: one row per observation)
grid.predict([[3, 5, 4, 2]])
    
    Out[24]:
RandomizedSearchCV searches a subset of the parameters, and you control the computational "budget"
In [25]:
    
# sklearn.grid_search was removed in sklearn 0.20; RandomizedSearchCV now lives in model_selection
from sklearn.model_selection import RandomizedSearchCV
    
In [26]:
    
# specify "parameter distributions" rather than a "parameter grid"
# (when given a discrete list of values, RandomizedSearchCV samples uniformly from it)
param_dist = {'n_neighbors': k_range, 'weights': weight_options}
    
In [27]:
    
# n_iter controls the number of random parameter combinations tried;
# random_state makes the sampled combinations reproducible
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)
# grid_scores_ was removed in sklearn 0.20; cv_results_ is the replacement
rand.cv_results_
    
    Out[27]:
In [28]:
    
# examine the best model found by the randomized search
print(rand.best_score_)
print(rand.best_params_)
    
    
In [29]:
    
# run RandomizedSearchCV 20 times (with n_iter=10) and record the best score,
# to see how much the result varies across random parameter samples
best_scores = []
for _ in range(20):
    rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10)
    rand.fit(X, y)
    best_scores.append(round(rand.best_score_, 3))
print(best_scores)
    
    
In [1]:
    
from IPython.core.display import HTML

def css_styling():
    """Load the notebook's custom stylesheet and return it as renderable HTML."""
    # use a context manager so the file handle is closed promptly
    # (the original open(...).read() left the file unclosed)
    with open("styles/custom.css", "r") as f:
        styles = f.read()
    return HTML(styles)

css_styling()
    
    Out[1]: