In [ ]:
## http://www.ritchieng.com/machine-learning-efficiently-search-tuning-param/
In [1]:
# imports
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
import numpy as np
# read in the iris data
iris = load_iris()
# create X (features) and y (response)
X = iris.data
y = iris.target
print('X matrix dimensionality:', X.shape)
print('y vector dimensionality:', y.shape)
In [3]:
# 10-fold (cv=10) cross-validation with K=5 (the n_neighbors parameter) for KNN
# instantiate model
knn = KNeighborsClassifier(n_neighbors=5)
# store scores in scores object
# scoring metric used here is 'accuracy' because it's a classification problem
# cross_val_score takes care of splitting X and y into the 10 folds, which is why we pass the full X and y instead of X_train and y_train
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)
In [4]:
# use average accuracy as an estimate of out-of-sample accuracy
# scores is a numpy array so we can use the mean method
print(scores.mean())
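What cross_val_score does internally: the cell below is a rough sketch of the equivalent manual loop, assuming stratified folds (scikit-learn's default when cv is an integer and the estimator is a classifier).
In [ ]:
# a minimal sketch of what cross_val_score does under the hood
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)
fold_scores = []
for train_idx, test_idx in skf.split(X, y):
    # fit on 9 folds, score accuracy on the held-out fold
    knn.fit(X[train_idx], y[train_idx])
    fold_scores.append(knn.score(X[test_idx], y[test_idx]))
print(np.mean(fold_scores))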
In [5]:
# search for an optimal value of K for KNN
# list of integers 1 to 30: the values of K we want to try
k_range = range(1, 31)
# list of scores from k_range
k_scores = []
# 1. we will loop through reasonable values of k
for k in k_range:
    # 2. run KNeighborsClassifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    # 3. obtain cross_val_score for KNeighborsClassifier with k neighbors
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    # 4. append the mean of the scores for k neighbors to the k_scores list
    k_scores.append(scores.mean())
print(k_scores)
In [6]:
# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
Out[6]:
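To read the best K off the curve programmatically instead of by eye, np.argmax gives the index of the highest mean score (ties resolve to the smallest such K):
In [ ]:
# K value with the highest cross-validated accuracy
best_k = k_range[int(np.argmax(k_scores))]
print(best_k, max(k_scores))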
GridSearchCV allows you to define a grid of parameters that will be searched using K-fold cross-validation.
This is like an automated version of the "for loop" above.
In [7]:
from sklearn.model_selection import GridSearchCV
# define the parameter values that should be searched
# in python 2, range() already returns a list, so k_range = range(1, 31) would suffice
k_range = list(range(1, 31))
print(k_range)
In [11]:
# create a parameter grid: map the parameter names to the values that should be searched
# simply a python dictionary
# key: parameter name
# value: list of values that should be searched for that parameter
# single key-value pair for param_grid
# equivalent: param_grid = dict(n_neighbors=k_range)
param_grid = {'n_neighbors': k_range}
print(param_grid)
In [12]:
# instantiate the grid
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
In [13]:
# fit the grid with data
grid.fit(X, y)
Out[13]:
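A side note on cost: this search fits 30 candidates x 10 folds = 300 models. If that ever becomes slow, GridSearchCV accepts an n_jobs parameter to parallelize the fits across CPU cores:
In [ ]:
# optional: parallelize the 300 fits across all available cores
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid.fit(X, y)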
In [14]:
# view the complete results (a dictionary of arrays, one entry per candidate)
grid.cv_results_
Out[14]:
In [15]:
# examine the first candidate's results
# cv_results_ is a dictionary of arrays; index [0] selects the first candidate
print('Parameters')
print(grid.cv_results_['params'][0])
# array of 10 accuracy scores during 10-fold cv using these parameters
print('')
print('CV Validation Scores')
print([grid.cv_results_['split%d_test_score' % i][0] for i in range(10)])
# mean of the 10 scores
print('')
print('Mean Validation Score')
print(grid.cv_results_['mean_test_score'][0])
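A convenient way to browse all of cv_results_ at once is to load it into a pandas DataFrame (assuming pandas is available in your environment):
In [ ]:
import pandas as pd
# one row per candidate value of K
results = pd.DataFrame(grid.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score']].head())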
In [16]:
# create a list of the mean scores only
# cv_results_ already stores the mean score for each candidate as an array
grid_mean_scores = grid.cv_results_['mean_test_score']
print(grid_mean_scores)
In [17]:
# plot the results
# this is identical to the one we generated above
plt.plot(k_range, grid_mean_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
Out[17]:
In [18]:
# examine the best model
# Single best score achieved across all params (k)
print(grid.best_score_)
# Dictionary containing the parameters (k) used to generate that score
print(grid.best_params_)
# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid.best_estimator_)
In [19]:
# define the parameter values that should be searched
k_range = list(range(1, 31))
# another parameter besides k that we might vary is the weights parameter
# default option --> uniform (all points in the neighborhood are weighted equally)
# another option --> distance (closer neighbors are weighted more heavily than farther ones)
# we create a list
weight_options = ['uniform', 'distance']
In [20]:
# create a parameter grid: map the parameter names to the values that should be searched
# dictionary = dict(key=values, key=values)
param_grid = dict(n_neighbors=k_range, weights=weight_options)
print(param_grid)
In [21]:
# instantiate and fit the grid
# exhaustive grid-search because it's trying every combination
# 10-fold cross-validation is being performed 30 x 2 = 60 times
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)
Out[21]:
In [22]:
# view the complete results
grid.cv_results_
Out[22]:
In [23]:
# examine the best model
print(grid.best_score_)
print(grid.best_params_)
# Best score did not improve for this model
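Rather than copying the winning parameters by hand (as the next cell does with n_neighbors=13), you can unpack grid.best_params_ directly into a fresh estimator:
In [ ]:
# build a new classifier from the best parameter combination found by the grid search
knn = KNeighborsClassifier(**grid.best_params_)
print(knn)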
In [24]:
# train your model using all data and the best known parameters
# instantiate model with best parameters
knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
# fit with X and y, not X_train and y_train
# even if we use train/test split, we should retrain on all of X and y before making predictions on new data
# otherwise we throw away potentially valuable data the model could learn from
knn.fit(X, y)
# make a prediction on out-of-sample data
# predict expects a 2D array: one row per observation
knn.predict([[3, 5, 4, 2]])
Out[24]:
In [27]:
# shortcut:
# GridSearchCV automatically refits the best model using all of the data (refit=True by default)
# that best fitted model is stored in the grid object
# we can then make predictions using the best fitted model
# this gives the same result as the cell above
grid.predict([[3, 5, 4, 2]])
Out[27]:
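Because refit=True by default, grid.predict simply delegates to the refitted best model, so calling best_estimator_ directly gives the same answer:
In [ ]:
# grid.best_estimator_ is the same refitted model grid.predict uses internally
grid.best_estimator_.predict([[3, 5, 4, 2]])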