In [1]:
# imports 
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import numpy as np
# read in the iris data
iris = load_iris()

# create X (features) and y (response)
# iris.data holds the feature matrix, iris.target the class label of each row
X = iris.data
y = iris.target

print('X matrix dimensionality:', X.shape)
print('Y vector dimensionality:', y.shape)

X matrix dimensionality: (150, 4)
Y vector dimensionality: (150,)

In [3]:
# 10-fold (cv=10) cross-validation with K=5 (n_neighbors=5) for KNN (the n_neighbors parameter)

# instantiate model
knn = KNeighborsClassifier(n_neighbors=5)

# store scores in scores object
# scoring metric used here is 'accuracy' because it's a classification problem
# cross_val_score takes care of splitting X and y into the 10 folds that's why we pass X and y entirely instead of X_train and y_train
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')

# show the accuracy obtained on each of the 10 folds
print(scores)

[ 1.          0.93333333  1.          1.          0.86666667  0.93333333
  0.93333333  1.          1.          1.        ]

In [4]:
# use average accuracy as an estimate of out-of-sample accuracy

# scores is a numpy array so we can use the mean method
print(scores.mean())


In [5]:
# search for an optimal value of K for KNN

# integers 1 to 30 (inclusive): the values of K we want to try
k_range = range(1, 31)

# mean cross-validated accuracy for each K, in the same order as k_range
k_scores = []

# 1. we will loop through reasonable values of k
for k in k_range:
    # 2. run KNeighborsClassifier with k neighbours
    knn = KNeighborsClassifier(n_neighbors=k)
    # 3. obtain cross_val_score for KNeighborsClassifier with k neighbours
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    # 4. append mean of scores for k neighbors to k_scores list
    k_scores.append(scores.mean())

print(k_scores)

[0.95999999999999996, 0.95333333333333337, 0.96666666666666656, 0.96666666666666656, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.98000000000000009, 0.96666666666666656, 0.96666666666666656, 0.97333333333333338, 0.95999999999999996, 0.96666666666666656, 0.95999999999999996, 0.96666666666666656, 0.95333333333333337, 0.95333333333333337, 0.95333333333333337]

In [6]:
# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
# explicit figure/axes objects are used instead of the implicit pyplot state
fig, ax = plt.subplots()
ax.plot(k_range, k_scores)
ax.set_title('KNN: Cross-Validated Accuracy by K')
ax.set_xlabel('Value of K for KNN')
# the trailing semicolon hides the Text(...) repr of the last call
ax.set_ylabel('Cross-Validated Accuracy');

<matplotlib.text.Text at 0x20f12d44f60>
  1. More efficient parameter tuning using GridSearchCV

Allows you to define a grid of parameters that will be searched using K-fold cross-validation

This is like an automated version of the "for loop" above

In [7]:
from sklearn.model_selection import GridSearchCV
# define the parameter values that should be searched
# for python 2, k_range = range(1, 31)
# GridSearchCV needs a concrete sequence of candidates, hence list(...)
k_range = list(range(1, 31))
print(k_range)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]

In [11]:
## create a parameter grid: map the parameter names to the values that should be searched
## simply a python dictionary
## key: parameter name
## value: list of values that should be searched for that parameter
## single key-value pair for param_grid
## (equivalent spelling: dict(n_neighbors=k_range))
param_grid = {'n_neighbors': k_range}
print(param_grid)

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]}

In [12]:
# build the grid-search object: 10-fold CV, candidates ranked by accuracy
# (nothing is fitted yet; knn only serves as the estimator template)
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)

In [13]:
# fit the grid with data
# runs 10-fold cross-validation once for every candidate in param_grid
grid.fit(X, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=30, p=2,
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [14]:
# view the complete results: one line per candidate parameter setting
# cv_results_ is a dict of parallel arrays (it replaces the grid_scores_
# attribute, which was removed in scikit-learn 0.20)
for mean, std, params in zip(grid.cv_results_['mean_test_score'],
                             grid.cv_results_['std_test_score'],
                             grid.cv_results_['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean, std, params))

[mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1},
 mean: 0.95333, std: 0.05207, params: {'n_neighbors': 2},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 3},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 4},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 6},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 7},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 8},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 9},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 10},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 11},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 12},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 13},
 mean: 0.97333, std: 0.04422, params: {'n_neighbors': 14},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 15},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 16},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 17},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 18},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 19},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 20},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 21},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 22},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 23},
 mean: 0.96000, std: 0.04422, params: {'n_neighbors': 24},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 25},
 mean: 0.96000, std: 0.04422, params: {'n_neighbors': 26},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 27},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 28},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 29},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 30}]

In [15]:
# examine the first candidate
# cv_results_ stores parallel arrays, so index 0 of each array describes
# the first parameter setting that was tried
first = 0
print(grid.cv_results_['params'][first])

# Array of 10 accuracy scores during 10-fold cv using the parameters
# (one 'split<i>_test_score' array exists per fold)
print('CV Validation Score')
fold_scores = [grid.cv_results_['split%d_test_score' % fold][first]
               for fold in range(grid.n_splits_)]
print(np.array(fold_scores))

# Mean of the 10 scores
print('Mean Validation Score')
print(grid.cv_results_['mean_test_score'][first])

{'n_neighbors': 1}

CV Validation Score
[ 1.          0.93333333  1.          0.93333333  0.86666667  1.
  0.86666667  1.          1.          1.        ]

Mean Validation Score
In [16]:
# create a list of the mean scores only
# cv_results_['mean_test_score'] holds one mean CV accuracy per candidate,
# in the same order as the values in param_grid
# (grid_scores_ was removed in scikit-learn 0.20, so it is not used here)
grid_mean_scores = grid.cv_results_['mean_test_score'].tolist()
print(grid_mean_scores)

[0.95999999999999996, 0.95333333333333337, 0.96666666666666667, 0.96666666666666667, 0.96666666666666667, 0.96666666666666667, 0.96666666666666667, 0.96666666666666667, 0.97333333333333338, 0.96666666666666667, 0.96666666666666667, 0.97333333333333338, 0.97999999999999998, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.97999999999999998, 0.97333333333333338, 0.97999999999999998, 0.96666666666666667, 0.96666666666666667, 0.97333333333333338, 0.95999999999999996, 0.96666666666666667, 0.95999999999999996, 0.96666666666666667, 0.95333333333333337, 0.95333333333333337, 0.95333333333333337]
In [17]:
# plot the results
# this shows the same curve as the manual for-loop search above
# explicit figure/axes objects are used instead of the implicit pyplot state
fig, ax = plt.subplots()
ax.plot(k_range, grid_mean_scores)
ax.set_title('GridSearchCV: Cross-Validated Accuracy by K')
ax.set_xlabel('Value of K for KNN')
# the trailing semicolon hides the Text(...) repr of the last call
ax.set_ylabel('Cross-Validated Accuracy');

<matplotlib.text.Text at 0x20f1326d0f0>

In [18]:
# examine the best model

# Single best score achieved across all params (k)
print(grid.best_score_)

# Dictionary containing the parameters (k) used to generate that score
print(grid.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid.best_estimator_)

{'n_neighbors': 13}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,

In [19]:
# candidate values of K to search: every integer from 1 through 30
k_range = [*range(1, 31)]

# weights is a second KNN parameter worth varying besides K:
#   'uniform'  --> the default; every neighbor in the neighborhood counts equally
#   'distance' --> closer neighbors are weighted more heavily than further ones

# both options collected in a list
weight_options = list(('uniform', 'distance'))

In [20]:
# create a parameter grid: map the parameter names to the values that should be searched
# dictionary = dict(key=values, key=values)
param_grid = dict(n_neighbors=k_range, weights=weight_options)
print(param_grid)

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']}

In [21]:
# instantiate and fit the grid
# exhaustive grid-search because it's trying every combination
# 10-fold cross-validation is being performed 30 x 2 = 60 times

grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=30, p=2,
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [22]:
# view the complete results: one line per (n_neighbors, weights) combination
# cv_results_ is a dict of parallel arrays (it replaces the grid_scores_
# attribute, which was removed in scikit-learn 0.20)
for mean, std, params in zip(grid.cv_results_['mean_test_score'],
                             grid.cv_results_['std_test_score'],
                             grid.cv_results_['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean, std, params))

[mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1, 'weights': 'uniform'},
 mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1, 'weights': 'distance'},
 mean: 0.95333, std: 0.05207, params: {'n_neighbors': 2, 'weights': 'uniform'},
 mean: 0.96000, std: 0.05333, params: {'n_neighbors': 2, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 3, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 3, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 4, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 4, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 6, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 6, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 7, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 7, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 8, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 8, 'weights': 'distance'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 9, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 9, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 10, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 10, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 11, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 11, 'weights': 'distance'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 12, 'weights': 'uniform'},
 mean: 0.97333, std: 0.04422, params: {'n_neighbors': 12, 'weights': 'distance'},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 13, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 13, 'weights': 'distance'},
 mean: 0.97333, std: 0.04422, params: {'n_neighbors': 14, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 14, 'weights': 'distance'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 15, 'weights': 'uniform'},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 15, 'weights': 'distance'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 16, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 16, 'weights': 'distance'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 17, 'weights': 'uniform'},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 17, 'weights': 'distance'},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 18, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 18, 'weights': 'distance'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 19, 'weights': 'uniform'},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 19, 'weights': 'distance'},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 20, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 20, 'weights': 'distance'},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 21, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 21, 'weights': 'distance'},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 22, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 22, 'weights': 'distance'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 23, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 23, 'weights': 'distance'},
 mean: 0.96000, std: 0.04422, params: {'n_neighbors': 24, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 24, 'weights': 'distance'},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 25, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 25, 'weights': 'distance'},
 mean: 0.96000, std: 0.04422, params: {'n_neighbors': 26, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 26, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 27, 'weights': 'uniform'},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 27, 'weights': 'distance'},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 28, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 28, 'weights': 'distance'},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 29, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 29, 'weights': 'distance'},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 30, 'weights': 'uniform'},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 30, 'weights': 'distance'}]

In [23]:
# examine the best model

# Best score did not improve for this model
print(grid.best_score_)
print(grid.best_params_)

{'n_neighbors': 13, 'weights': 'uniform'}
  1. Using the best parameters to make predictions

In [24]:
# train your model using all data and the best known parameters

# instantiate model with best parameters
knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')

# fit with X and y, not X_train and y_train
# even if we use train/test split, we should train on X and y before making predictions on new data
# otherwise we throw away potential valuable data we can learn from
knn.fit(X, y)

# make a prediction on out-of-sample data
# predict expects a 2D array of shape (n_samples, n_features), so the single
# observation is wrapped in an outer list
knn.predict([[3, 5, 4, 2]])

In [27]:
# shortcut:
# GridSearchCV automatically refits the best model using all of the data
# that best fitted model is stored in grid object
# we can then use prediction using the best fitted model
# code in this cell is the same as the top

# predict expects a 2D array of shape (n_samples, n_features), so the single
# observation is wrapped in an outer list
grid.predict([[3, 5, 4, 2]])

In [ ]: