From the video series: Introduction to machine learning with scikit-learn
Steps for cross-validation:
Benefits of cross-validation:
Drawbacks of cross-validation:
Goal: Select the best tuning parameters (aka "hyperparameters") for KNN on the iris dataset
In [2]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# load the iris dataset
iris = load_iris()
# pull out X (features) and y (response) in a single assignment
X, y = iris.data, iris.target
In [4]:
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
# score the classifier on accuracy using 10 cross-validation folds
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
# parenthesized single-argument print is valid under both Python 2 and Python 3
print(scores)
In [5]:
# use average accuracy as an estimate of out-of-sample accuracy
# (parenthesized print works under both Python 2 and Python 3)
print(scores.mean())
In [6]:
# search for an optimal value of K for KNN
k_range = range(1, 31)
k_scores = []
for k in k_range:
    # evaluate each candidate K with 10-fold cross-validation on accuracy
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    # keep only the mean score for this K
    k_scores.append(scores.mean())
# parenthesized single-argument print is valid under both Python 2 and Python 3
print(k_scores)
In [7]:
# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
# k_range supplies the x values and k_scores the mean accuracy recorded for each K
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
Out[7]:
GridSearchCV allows you to define a grid of parameters that will be searched using K-fold cross-validation
In [8]:
from sklearn.grid_search import GridSearchCV
In [9]:
# define the parameter values that should be searched
# materialize the range as a list so the values print in full on Python 3 as well
k_range = list(range(1, 31))
print(k_range)
In [10]:
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range)
# parenthesized single-argument print is valid under both Python 2 and Python 3
print(param_grid)
In [11]:
# build the grid search: candidates come from param_grid and are scored
# on accuracy with 10-fold cross-validation
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10,
                    scoring='accuracy')
You can set n_jobs = -1 to run computations in parallel (if supported by your computer and OS)
In [12]:
# fit the grid with data
# (runs the search configured on `grid` against the features X and response y)
grid.fit(X, y)
Out[12]:
In [13]:
# view the complete results (list of named tuples)
# left as a bare expression so the notebook displays it as the cell output
grid.grid_scores_
Out[13]:
In [14]:
# examine the first tuple: its parameter setting, its cross-validation scores,
# and its mean score
first_result = grid.grid_scores_[0]
# parenthesized single-argument print is valid under both Python 2 and Python 3
print(first_result.parameters)
print(first_result.cv_validation_scores)
print(first_result.mean_validation_score)
In [15]:
# create a list of the mean scores only
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
# parenthesized single-argument print is valid under both Python 2 and Python 3
print(grid_mean_scores)
In [16]:
# plot the results
# x-axis: the searched K values; y-axis: the mean score taken from each grid result
plt.plot(k_range, grid_mean_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
Out[16]:
In [17]:
# examine the best model: its score, its parameter setting, and the estimator itself
# (parenthesized single-argument print is valid under both Python 2 and Python 3)
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
Example: tuning max_depth and min_samples_leaf for a DecisionTreeClassifier
You could tune the parameters independently: change max_depth while leaving min_samples_leaf at its default value, and vice versa
In [18]:
# define the parameter values that should be searched
k_range = range(1, 31)  # candidate values 1 through 30, used for n_neighbors
weight_options = ['uniform', 'distance']  # candidate values, used for weights
In [19]:
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, weights=weight_options)
# parenthesized single-argument print is valid under both Python 2 and Python 3
print(param_grid)
In [20]:
# build the grid search from param_grid (accuracy, 10-fold cross-validation),
# then run it on the data
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10,
                    scoring='accuracy')
grid.fit(X, y)
Out[20]:
In [21]:
# view the complete results
# left as a bare expression so the notebook displays it as the cell output
grid.grid_scores_
Out[21]:
In [22]:
# examine the best model: its score and its parameter setting
# (parenthesized single-argument print is valid under both Python 2 and Python 3)
print(grid.best_score_)
print(grid.best_params_)
In [23]:
# train your model using all data and the best known parameters
knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
knn.fit(X, y)
# make a prediction on out-of-sample data
# predict expects a 2D array-like (one row per observation), so the single
# observation is wrapped in an outer list; newer scikit-learn rejects 1D input
knn.predict([[3, 5, 4, 2]])
Out[23]:
In [24]:
# shortcut: GridSearchCV automatically refits the best model using all of the data
# the single observation is passed as a 2D array-like (one row), because
# newer scikit-learn rejects 1D input to predict
grid.predict([[3, 5, 4, 2]])
Out[24]:
RandomizedSearchCV searches a subset of the parameters, and you control the computational "budget"
In [25]:
from sklearn.grid_search import RandomizedSearchCV
In [26]:
# the randomized search takes "parameter distributions" instead of a "parameter grid"
param_dist = {'n_neighbors': k_range, 'weights': weight_options}
In [27]:
# n_iter sets how many searches are run; random_state is fixed at 5
rand = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_dist,
    cv=10,
    scoring='accuracy',
    n_iter=10,
    random_state=5,
)
rand.fit(X, y)
# bare expression so the notebook displays the results as the cell output
rand.grid_scores_
Out[27]:
In [28]:
# examine the best model: its score and its parameter setting
# (parenthesized single-argument print is valid under both Python 2 and Python 3)
print(rand.best_score_)
print(rand.best_params_)
In [29]:
# run RandomizedSearchCV 20 times (with n_iter=10) and record the best score
best_scores = []
for _ in range(20):
    # no random_state is passed here, unlike the single run with random_state=5
    rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10)
    rand.fit(X, y)
    # keep the best score of this run, rounded to 3 decimal places
    best_scores.append(round(rand.best_score_, 3))
# parenthesized single-argument print is valid under both Python 2 and Python 3
print(best_scores)
In [1]:
from IPython.core.display import HTML
def css_styling():
    """Read styles/custom.css and return its contents wrapped in an HTML object."""
    # the with-block closes the file handle as soon as the contents are read
    with open("styles/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


# bare call so the notebook renders the returned HTML as the cell output
css_styling()
Out[1]: