In [2]:
from sklearn.datasets import load_iris
iris = load_iris()
In [3]:
iris.keys()
Out[3]:
In [4]:
iris['target_names']
Out[4]:
In [5]:
iris['feature_names']
Out[5]:
In [6]:
iris['target']
Out[6]:
data from the first 5 rows
In [7]:
iris['data'][:5]
Out[7]:
In [8]:
print(iris['DESCR'] + "\n")
In [9]:
iris['data'].shape
Out[9]:
In [10]:
iris['target'].shape
Out[10]:
In [11]:
X = iris['data']
y = iris['target']
instantiate KNN classifier
In [12]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
see default settings
In [13]:
print(knn)
In [14]:
knn.fit(X, y)
Out[14]:
In [15]:
knn.predict([[3, 4, 5, 2]])  # input must be 2-D: one row per sample
Out[15]:
In [16]:
X_new = [[3, 4, 5, 2], [4, 3, 2, 0.1]]  # two new samples, shape (2, 4)
In [17]:
prediction_1 = knn.predict(X_new)
prediction_1
Out[17]:
Get labels for prediction
In [18]:
iris['target_names'][prediction_1]
Out[18]:
using a different value of K
In [19]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
Out[19]:
In [20]:
knn.predict(X_new)
Out[20]:
applying logistic regression
In [21]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X, y)
logreg.predict(X_new)
Out[21]:
Okay, so far we've been fitting and evaluating on the whole dataset. Now we'll split it into training and testing sets to estimate the model's accuracy on unseen data
In [22]:
y_pred = logreg.predict(X)
len(y_pred)
Out[22]:
In [23]:
from sklearn import metrics
print(metrics.accuracy_score(y, y_pred))  # conventional order is accuracy_score(y_true, y_pred)
The above is the training accuracy (computed on the same data we fit on); let's try a train/test split instead
In [24]:
from sklearn.model_selection import train_test_split  # formerly sklearn.cross_validation, removed in newer sklearn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
In [25]:
print(X_train.shape)
print(X_test.shape)
print("")
print(y_train.shape)
print(y_test.shape)
In [26]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Out[26]:
In [27]:
y_pred = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
using knn
In [28]:
k_range = range(1, 30)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))
In [29]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(k_range, scores)
plt.xlabel('K value in KNN')
plt.ylabel('Testing accuracy')
Out[29]:
the best testing accuracy occurs for roughly 5 < K < 17, so a sensible choice is the midpoint: K = (6 + 16) / 2 = 11
CROSS-VALIDATION TO GET A BETTER ESTIMATE OF OUT-OF-SAMPLE ACCURACY
In [33]:
from sklearn.model_selection import cross_val_score  # formerly sklearn.cross_validation
from sklearn.neighbors import KNeighborsClassifier
In [38]:
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)
print("")
print("The final cv score is {}".format(scores.mean()))
In [39]:
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
print(k_scores)
In [40]:
plt.plot(k_range, k_scores)
plt.xlabel('Value of K in KNN')
plt.ylabel('Cross-validated accuracy')
Out[40]:
among the K values tied for the best score, we take the largest, since a higher K gives a less complex (smoother) model
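A minimal sketch of that selection rule (assuming k_range and k_scores from the cells above): among all K values tied for the best cross-validated accuracy, take the largest.
In [ ]:
best_score = max(k_scores)
# all K tied for the best score; the largest gives the least complex model
best_k = max(k for k, s in zip(k_range, k_scores) if s == best_score)
print(best_k)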
COMPARING MODELS
In [41]:
knn = KNeighborsClassifier(n_neighbors=20)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())
In [42]:
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())
USING GRID SEARCH TO FIND THE BEST PARAMETERS
In [44]:
from sklearn.model_selection import GridSearchCV  # formerly sklearn.grid_search
k_range = range(1, 31)
print(list(k_range))
In [45]:
param_grid = dict(n_neighbors=list(k_range))
print(param_grid)
In [50]:
grid = GridSearchCV(knn, param_grid=param_grid, cv=10, scoring='accuracy')
In [51]:
grid.fit(X, y)
Out[51]:
In [52]:
grid.cv_results_  # grid_scores_ was removed in newer sklearn; cv_results_ holds the per-candidate scores
Out[52]:
In [54]:
grid_mean_scores = grid.cv_results_['mean_test_score']
print(grid_mean_scores)
In [55]:
plt.plot(k_range, grid_mean_scores)
plt.xlabel("value of k in knn")
plt.ylabel("grid mean score")
Out[55]:
In [57]:
print(grid.best_score_)
print("==========================================================")
print(grid.best_params_)
print("==========================================================")
print(grid.best_estimator_)
print("==========================================================")
searching over multiple parameters (see the cost check after the next cell)
In [65]:
k_range = range(1, 31)
weight_options = ['uniform', 'distance']
param_grid = dict(n_neighbors=list(k_range), weights=weight_options)
print(param_grid)
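Note the cost of this grid: 30 x 2 = 60 parameter combinations, and with cv=10 each combination is fit 10 times, i.e. 600 model fits. A quick sanity check:
In [ ]:
n_candidates = len(k_range) * len(weight_options)
print(n_candidates)       # 60 parameter combinations
print(n_candidates * 10)  # 600 fits with 10-fold CV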
In [66]:
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)
Out[66]:
In [67]:
grid.cv_results_
Out[67]:
In [68]:
print(grid.best_score_)
print("==========================================================")
print(grid.best_params_)
print("==========================================================")
print(grid.best_estimator_)
print("==========================================================")
In [73]:
pred_new = grid.predict([[3, 4, 5, 2]])  # 2-D input again: one row per sample
print(pred_new)
In [74]:
iris['target_names'][pred_new]
Out[74]:
USING RANDOMIZED SEARCH: USEFUL WHEN GRID SEARCH IS TOO COMPUTATIONALLY COSTLY
In [76]:
from sklearn.model_selection import RandomizedSearchCV  # formerly sklearn.grid_search
In [77]:
param_dist = dict(n_neighbors=list(k_range), weights=weight_options)  # distributions to sample from (here, plain lists)
In [81]:
rand = RandomizedSearchCV(knn, param_dist, cv=10, n_iter=10, random_state=5, scoring='accuracy')
rand.fit(X, y)
Out[81]:
In [82]:
rand.cv_results_
Out[82]:
In [83]:
print(rand.best_score_)
print("==========================================================")
print(rand.best_params_)
print("==========================================================")
print(rand.best_estimator_)
print("==========================================================")
In [ ]: