In [1]:
from IPython.display import HTML
HTML('<iframe src=http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data width=300 height=200></iframe>')


Out[1]:

Machine learning loading iris data base


In [2]:
# import load_iris function from datasets module
from sklearn.datasets import load_iris

# save "bunch" object containing iris dataset and its attributes
iris = load_iris()
type(iris)


Out[2]:
sklearn.datasets.base.Bunch

In [3]:
# print the iris data
print (iris.data[:7,])


[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]]

In [4]:
# print the names of the four features
print "names:"
print iris.feature_names


print "Y catgorical data names:"
# print the encoding scheme for species: 0 = setosa, 1 = versicolor, 2 = virginica
print iris.target_names

print "y:"
# print integers representing the species of each observation
print iris.target


names:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Y catgorical data names:
['setosa' 'versicolor' 'virginica']
y:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

In [5]:
# check the types of the features and response
print type(iris.data)
print type(iris.target)


<type 'numpy.ndarray'>
<type 'numpy.ndarray'>

In [6]:
# check the shape of the features (first dimension = number of observations, second dimensions = number of features)
print iris.data.shape

# check the shape of the response (single dimension matching the number of observations)
print iris.target.shape


(150, 4)
(150,)

In [7]:
# store feature matrix in "X"
X = iris.data

# store response vector in "y"
y = iris.target

# print the shapes of X and y
print X.shape
print y.shape


(150, 4)
(150,)

Knn

  • pick up a value of K

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Init classifier
knn = KNeighborsClassifier(n_neighbors=1)
print knn


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=1, p=2, weights='uniform')

In [9]:
knn.fit(X,y)


Out[9]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=1, p=2, weights='uniform')

Predict the outcome of new data


In [10]:
X_new = [[3, 5, 4, 2], [5, 4, 3, 2]]
knn.predict(X_new)


Out[10]:
array([2, 1])

In [11]:
# instantiate the model (using the value K=5)
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model with data
knn.fit(X, y)

# predict the response for new observations
knn.predict(X_new)


Out[11]:
array([1, 1])

In [12]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X, y)

# predict the response for new observations
logreg.predict(X_new)


Out[12]:
array([2, 0])

In [13]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X, y)

# predict the response values for the observations in X
print("Printing prediction...")
print(logreg.predict(X))

# store the predicted response values
y_pred = logreg.predict(X)

# check how many predictions were generated
print("size train: " +  str(len(y_pred)))

# compute classification accuracy for the logistic regression model
from sklearn import metrics

## Number of match/total
print metrics.accuracy_score(y, y_pred)


Printing prediction...
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1
 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
size train: 150
0.96

In [14]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
y_pred = knn.predict(X)
print metrics.accuracy_score(y, y_pred)


0.966666666667

In [15]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
y_pred = knn.predict(X)
print metrics.accuracy_score(y, y_pred)

print(y)
print(y_pred)


1.0
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
  • KNN com k = 1 sempre acerta tudo na base de treino

In [16]:
# print the shapes of X and y
print X.shape
print y.shape

# STEP 1: split X and y into training and testing sets
from sklearn.cross_validation import train_test_split

## random_state is the seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

print("train size: " + str(y_train.shape))
print("test size: " + str(y_test.shape))


(150, 4)
(150,)
train size: (90,)
test size: (60,)

In [17]:
# STEP 2: train the model on the training set
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# STEP 3: make predictions on the testing set
y_pred = logreg.predict(X_test)

# compare actual response values (y_test) with predicted response values (y_pred)
print metrics.accuracy_score(y_test, y_pred)


0.95

In [18]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)


0.966666666667

In [19]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)


0.95

In [20]:
# try K=1 through K=25 and record testing accuracy
k_range = range(1, 26)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))
    
# import Matplotlib (scientific plotting library)
import matplotlib.pyplot as plt

# allow plots to appear within the notebook
%matplotlib inline

# plot the relationship between K and testing accuracy
plt.plot(k_range, scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')


Out[20]:
<matplotlib.text.Text at 0x7f7cc0047a90>

Making prediction with model tuned

  • Is important to re-train the model with all points. Otherwise you are trhowing away valuable points (informations)

In [21]:
# instantiate the model with the best known parameters
knn = KNeighborsClassifier(n_neighbors=11)

# train the model with X and y (not X_train and y_train)
knn.fit(X, y)

# make a prediction for an out-of-sample observation
knn.predict([3, 5, 4, 2])


Out[21]:
array([1])

Video 5 working with pandas


In [22]:
!cd /home/leandroohf/Documents/kaggle/python_tutorial
!pwd


/home/leandroohf

In [23]:
# conventional way to import pandas
import pandas as pd

# read CSV file directly from a URL and save the results
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)

# display the first 5 rows
data.head()


Out[23]:
TV Radio Newspaper Sales
1 230.1 37.8 69.2 22.1
2 44.5 39.3 45.1 10.4
3 17.2 45.9 69.3 9.3
4 151.5 41.3 58.5 18.5
5 180.8 10.8 58.4 12.9

In [24]:
# display the last 5 rows
data.tail()


Out[24]:
TV Radio Newspaper Sales
196 38.2 3.7 13.8 7.6
197 94.2 4.9 8.1 9.7
198 177.0 9.3 6.4 12.8
199 283.6 42.0 66.2 25.5
200 232.1 8.6 8.7 13.4

In [25]:
# check the shape of the DataFrame (rows, columns)
data.shape


Out[25]:
(200, 4)

In [26]:
# conventional way to import seaborn
import seaborn as sns

# allow plots to appear within the notebook
%matplotlib inline

# visualize the relationship between the features and the response using scatterplots
sns.pairplot(data, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', size=7, aspect=0.7, kind='reg')


Out[26]:
<seaborn.axisgrid.PairGrid at 0x7f7cc46e0050>

In [27]:
# create a Python list of feature names
feature_cols = ['TV', 'Radio', 'Newspaper']

# use the list to select a subset of the original DataFrame
X = data[feature_cols]

# equivalent command to do this in one line
X = data[['TV', 'Radio', 'Newspaper']]

# print the first 5 rows
X.head()


Out[27]:
TV Radio Newspaper
1 230.1 37.8 69.2
2 44.5 39.3 45.1
3 17.2 45.9 69.3
4 151.5 41.3 58.5
5 180.8 10.8 58.4

In [28]:
# check the type and shape of X
print type(X)
print X.shape

# select a Series from the DataFrame
y = data['Sales']

# equivalent command that works if there are no spaces in the column name
y = data.Sales

# print the first 5 values
y.head()


<class 'pandas.core.frame.DataFrame'>
(200, 3)
Out[28]:
1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: Sales, dtype: float64

Video 6: Linear Regression


In [29]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [30]:
# import model
from sklearn.linear_model import LinearRegression

# instantiate
linreg = LinearRegression()

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)


Out[30]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [31]:
# print the intercept and coefficients
print linreg.intercept_
print linreg.coef_

# pair the feature names with the coefficients
zip(feature_cols, linreg.coef_)


2.87696662232
[ 0.04656457  0.17915812  0.00345046]
Out[31]:
[('TV', 0.046564567874150226),
 ('Radio', 0.17915812245088839),
 ('Newspaper', 0.0034504647111804204)]

In [32]:
# make predictions on the testing se
y_pred = linreg.predict(X_test)

print(y_pred)


[ 21.70910292  16.41055243   7.60955058  17.80769552  18.6146359
  23.83573998  16.32488681  13.43225536   9.17173403  17.333853
  14.44479482   9.83511973  17.18797614  16.73086831  15.05529391
  15.61434433  12.42541574  17.17716376  11.08827566  18.00537501
   9.28438889  12.98458458   8.79950614  10.42382499  11.3846456
  14.98082512   9.78853268  19.39643187  18.18099936  17.12807566
  21.54670213  14.69809481  16.24641438  12.32114579  19.92422501
  15.32498602  13.88726522  10.03162255  20.93105915   7.44936831
   3.64695761   7.22020178   5.9962782   18.43381853   8.39408045
  14.08371047  15.02195699  20.35836418  20.57036347  19.60636679]

In [33]:
## Just understandings metrics in python

# define true and predicted response values
true = [100, 50, 30, 20]
pred = [90, 50, 50, 30]

# MAE: Mea Absolute Error
print("MAE: Mean abs error:")

# calculate MAE by hand. 
print (10 + 0 + 20 + 10)/4.

# calculate MAE using scikit-learn
from sklearn import metrics
print metrics.mean_absolute_error(true, pred)

print("MSE:")

# calculate MSE by hand
print (10**2 + 0**2 + 20**2 + 10**2)/4.

# calculate MSE using scikit-learn
print metrics.mean_squared_error(true, pred)

print("RMSE:")

# calculate RMSE by hand
import numpy as np
print np.sqrt((10**2 + 0**2 + 20**2 + 10**2)/4.)

# calculate RMSE using scikit-learn
print np.sqrt(metrics.mean_squared_error(true, pred))


MAE: Mean abs error:
10.0
10.0
MSE:
150.0
150.0
RMSE:
12.2474487139
12.2474487139

In [38]:
print np.sqrt(metrics.mean_squared_error(y_test, y_pred))


1.40465142303

In [34]:
from ggplot import *
diamonds.head()


Out[34]:
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
2 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
3 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
4 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75

Testing ggplot in python


In [35]:
p = ggplot(aes(x='date', y='beef'), data=meat)
p = p + geom_point() + stat_smooth(color='blue')
p


Out[35]:
<ggplot: (8760854574325)>

Video 7: Cross-validation for parameter tuning, model selection, and feature selection


In [46]:
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# read in the iris data
iris = load_iris()

# create X (features) and y (response)
X = iris.data
y = iris.target

# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)


0.973684210526

In [53]:
# simulate splitting a dataset of 25 observations into 5 folds
from sklearn.cross_validation import KFold
kf = KFold(25, n_folds=5, shuffle=False)

# print the contents of each training and testing set
print '{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations')
for iteration, data in enumerate(kf, start=1):
    print '{:^9} {} {:^25}'.format(iteration, data[0], data[1])


Iteration                   Training set observations                   Testing set observations
    1     [ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]        [0 1 2 3 4]       
    2     [ 0  1  2  3  4 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]        [5 6 7 8 9]       
    3     [ 0  1  2  3  4  5  6  7  8  9 15 16 17 18 19 20 21 22 23 24]     [10 11 12 13 14]     
    4     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 20 21 22 23 24]     [15 16 17 18 19]     
    5     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]     [20 21 22 23 24]     

Cross-validation recommendations

1. K can be any number, but K=10 is generally recommended
2. For classification problems, stratified sampling is recommended for creating the folds
   * Each response class should be represented with equal proportions in each of the K folds
   * scikit-learn's cross_val_score function does this by default

In [52]:
from sklearn.cross_validation import cross_val_score

# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')

print "Scores:"
print scores

# use average accuracy as an estimate of out-of-sample accuracy
print "Mean scores:"
print scores.mean()


Scores:
[ 1.          0.93333333  1.          1.          0.86666667  0.93333333
  0.93333333  1.          1.          1.        ]
Mean scores:
0.966666666667

In [60]:
# search for an optimal value of K for KNN
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

import matplotlib.pyplot as plt
%matplotlib inline

# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')


Out[60]:
<matplotlib.text.Text at 0x7fd0111acb90>

comparing best KNN with logisc regression


In [62]:
# 10-fold cross-validation with the best KNN model
knn = KNeighborsClassifier(n_neighbors=20)

print "knn mean score:"
print cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean()

# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

print "logisc regression mean score:"
print cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean()


knn mean score:
0.98
logisc regression mean score:
0.953333333333

Cross-validation example: feature selection


In [93]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# read in the advertising dataset
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)

# create a Python list of three feature names
feature_cols = ['TV', 'Radio', 'Newspaper']

# use the list to select a subset of the DataFrame (X)
X = data[feature_cols]

# select the Sales column as the response (y)
y = data.Sales

# 10-fold cross-validation with all three features
lm = LinearRegression()

scores = cross_val_score(lm, X, y, cv=10, scoring='mean_squared_error')
print scores

# fix the sign of MSE scores
mse_scores = -scores

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)

# 10-fold cross-validation with two features (excluding Newspaper)
feature_cols = ['TV', 'Radio']
X = data[feature_cols]

print "Features TV , Radio mean scores: "
print np.sqrt(-cross_val_score(lm, X, y, cv=10, scoring='mean_squared_error')).mean()

print "All features  mean scores: "
# calculate the average RMSE
print rmse_scores.mean()


[-3.56038438 -3.29767522 -2.08943356 -2.82474283 -1.3027754  -1.74163618
 -8.17338214 -2.11409746 -3.04273109 -2.45281793]
Features TV , Radio mean scores: 
1.67967484191
All features  mean scores: 
1.69135317081
Out[93]:
TV Radio Newspaper Sales
1 230.1 37.8 69.2 22.1
2 44.5 39.3 45.1 10.4
3 17.2 45.9 69.3 9.3
4 151.5 41.3 58.5 18.5
5 180.8 10.8 58.4 12.9

Video 8 Grid Search

More efficient parameter tuning using GridSearchCV

Allows you to define a grid of parameters that will be searched using K-fold cross-validation


In [73]:
# read in the iris data
iris = load_iris()

# create X (features) and y (response)
X = iris.data
y = iris.target

# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')

# use average accuracy as an estimate of out-of-sample accuracy
print scores.mean()

from sklearn.grid_search import GridSearchCV

# define the parameter values that should be searched
k_range = range(1, 31)
print k_range

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range)
print param_grid

# instantiate the grid
# n_jobs = -1 to run computations in parallel (if supported by your computer and OS)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy',n_jobs=-1)

# fit the grid with data
grid.fit(X, y)

# view the complete results (list of named tuples)
grid.grid_scores_


0.966666666667
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]}
Out[73]:
[mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1},
 mean: 0.95333, std: 0.05207, params: {'n_neighbors': 2},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 3},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 4},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 6},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 7},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 8},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 9},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 10},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 11},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 12},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 13},
 mean: 0.97333, std: 0.04422, params: {'n_neighbors': 14},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 15},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 16},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 17},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 18},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 19},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 20},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 21},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 22},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 23},
 mean: 0.96000, std: 0.04422, params: {'n_neighbors': 24},
 mean: 0.96667, std: 0.03333, params: {'n_neighbors': 25},
 mean: 0.96000, std: 0.04422, params: {'n_neighbors': 26},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 27},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 28},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 29},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 30}]

In [74]:
# examine the first tuple
print grid.grid_scores_[0].parameters
print grid.grid_scores_[0].cv_validation_scores
print grid.grid_scores_[0].mean_validation_score


{'n_neighbors': 1}
[ 1.          0.93333333  1.          0.93333333  0.86666667  1.
  0.86666667  1.          1.          1.        ]
0.96

In [76]:
# create a list of the mean scores only
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]

# plot the results
plt.plot(k_range, grid_mean_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')


Out[76]:
<matplotlib.text.Text at 0x7fd010fb6750>

In [78]:
# examine the best model
print grid.best_score_
print grid.best_params_
print grid.best_estimator_


0.98
{'n_neighbors': 13}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=13, p=2, weights='uniform')

Searching multiple parameters simultaneously

Example: tuning max_depth and min_samples_leaf for a DecisionTreeClassifier

Could tune parameters independently: change max_depth while leaving min_samples_leaf at its default value, and vice versa
But, best performance might be achieved when neither parameter is at its default value

In [82]:
# define the parameter values that should be searched
k_range = range(1, 31)
weight_options = ['uniform', 'distance']

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, weights=weight_options)
print param_grid

# instantiate and fit the grid
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)

# view the complete results
grid.grid_scores_[:7]

# examine the best model
print "Best model:"
print grid.best_score_
print grid.best_params_


{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']}
Best model:
0.98
{'n_neighbors': 13, 'weights': 'uniform'}

In [87]:
# Using the best model

# train your model using all data and the best known parameters
knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
knn.fit(X, y)

# make a prediction on out-of-sample data
print knn.predict([3, 5, 4, 2])

# shortcut: GridSearchCV automatically refits the best model using all of the data
print grid.predict([3, 5, 4, 2])


[1]
[1]

Reducing computational expense using RandomizedSearchCV


In [89]:
from sklearn.grid_search import RandomizedSearchCV

## Important: Specify a continuous distribution (rather than a list of values) for any continous parameters
# specify "parameter distributions" rather than a "parameter grid"
param_dist = dict(n_neighbors=k_range, weights=weight_options)

# n_iter controls the number of searches
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)
rand.grid_scores_[:7]


Out[89]:
[mean: 0.97333, std: 0.03266, params: {'n_neighbors': 18, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 8, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 24, 'weights': 'distance'},
 mean: 0.98000, std: 0.03055, params: {'n_neighbors': 20, 'weights': 'uniform'},
 mean: 0.95333, std: 0.04269, params: {'n_neighbors': 28, 'weights': 'uniform'},
 mean: 0.97333, std: 0.03266, params: {'n_neighbors': 9, 'weights': 'uniform'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5, 'weights': 'distance'}]

In [91]:
# examine the best model
print rand.best_score_
print rand.best_params_


0.98
{'n_neighbors': 20, 'weights': 'uniform'}