Selecting the best model in scikit-learn using cross-validation



In [5]:

    
# import
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split



In [6]:

    
iris = load_iris()

X = iris.data
y = iris.target



In [7]:

    
# splitting the data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# using classifier
scl = KNeighborsClassifier(n_neighbors=6)
scl.fit(X_train, y_train)
y_pred = scl.predict(X_test)

# checking accuracy
accuracy_score(y_test, y_pred)









    Out[7]:





0.97368421052631582



In [10]:

    
# simulate splitting a dataset of 25 observations into 5 folds
from sklearn.cross_validation import KFold
kf = KFold(25, n_folds=5, shuffle=False)

# print the contents of each training and testing set
print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
for iteration, data in enumerate(kf, start=1):
    print('{:^9} {} {:^25}'.format(iteration, data[0], data[1]))









    



Iteration                   Training set observations                   Testing set observations






    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-10-90c4f56e5082> in <module>()
      6 print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
      7 for iteration, data in enumerate(kf, start=1):
----> 8     print('{:^9} {} {:^25}'.format(iteration, data[0], data[1]))

TypeError: non-empty format string passed to object.__format__

Cross-validation example: parameter tuning



In [11]:

    
from sklearn.cross_validation import cross_val_score



In [12]:

    
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)
# scores are accuracy









    



[ 1.          0.93333333  1.          1.          0.86666667  0.93333333
  0.93333333  1.          1.          1.        ]



In [13]:

    
# mean
scores.mean()









    Out[13]:





0.96666666666666679



In [16]:

    
# finding out the best value of k and their accuracy

K = range(1,31)
accuracy = []

for i in K:
    knn = KNeighborsClassifier(n_neighbors=i)
    acc = cross_val_score(knn, X, y, cv=10, scoring="accuracy")
    accuracy.append(acc.mean())
    
print(accuracy)
#print(list(zip(K,accuracy)))









    



[0.95999999999999996, 0.95333333333333337, 0.96666666666666656, 0.96666666666666656, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.98000000000000009, 0.96666666666666656, 0.96666666666666656, 0.97333333333333338, 0.95999999999999996, 0.96666666666666656, 0.95999999999999996, 0.96666666666666656, 0.95333333333333337, 0.95333333333333337, 0.95333333333333337]



In [17]:

    
import matplotlib.pyplot as plt
%matplotlib inline



In [22]:

    
plt.plot(K, accuracy)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')









    Out[22]:





<matplotlib.text.Text at 0x270f1627278>



In [23]:

    
# K = 20, as higher 

knn = KNeighborsClassifier(n_neighbors=20)
acc = cross_val_score(knn, X, y, cv=10, scoring="accuracy")
acc.mean()









    Out[23]:





0.98000000000000009

Comparing it with Logistic model



In [24]:

    
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
print(cross_val_score(lr, X, y, cv=10, scoring="accuracy").mean())









    



0.953333333333

Let's pick the Advertisement data and find out the best model



In [30]:

    
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression



In [27]:

    
data = pd.read_csv("data/Advertising.csv", index_col=0)
print(data.head())









    



      TV  Radio  Newspaper  Sales
1  230.1   37.8       69.2   22.1
2   44.5   39.3       45.1   10.4
3   17.2   45.9       69.3    9.3
4  151.5   41.3       58.5   18.5
5  180.8   10.8       58.4   12.9



In [28]:

    
features = ['TV', 'Radio', 'Newspaper']
response = ['Sales']

X = data[features]
y = data.Sales



In [32]:

    
lreg = LinearRegression()
acc = cross_val_score(lreg, X, y, cv=10, scoring="mean_squared_error")
print(acc)









    



[-3.56038438 -3.29767522 -2.08943356 -2.82474283 -1.3027754  -1.74163618
 -8.17338214 -2.11409746 -3.04273109 -2.45281793]



In [34]:

    
print(np.sqrt((-acc)).mean())









    



1.69135317081



In [38]:

    
# Testing the same when Newspaper is not included in model
X = data[['TV',  'Radio']]

lreg = LinearRegression()
print((np.sqrt(-cross_val_score(lreg, X, y, cv=10, scoring="mean_squared_error"))).mean())









    



1.67967484191



In [39]:

    
# model performs better when Newspaper is not included



In [ ]: