In [1]:
import pandas as pd

In [2]:
# define column names
names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

In [3]:
# loading training data
df = pd.read_csv('dataset/iris.data', header=None, names=names)
df.head()


Out[3]:
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
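
As a quick sanity check (a sketch, not part of the original run), the class balance can be inspected; the Iris data set is expected to contain 50 rows per class.


In [ ]:
# count rows per class; each of the three Iris species should appear 50 times
df['class'].value_counts()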

Visualize Data Set


In [4]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# can choose different styles
# print(plt.style.available)
plt.style.use('fivethirtyeight')
# list available fonts: [f.name for f in matplotlib.font_manager.fontManager.ttflist]
matplotlib.rc('font', family='DejaVu Sans')

In [5]:
sns.lmplot(x='sepal_length', y='sepal_width', data=df, hue='class', fit_reg=False)
plt.show()



In [6]:
sns.lmplot(x='petal_length', y='petal_width', data=df, hue='class', fit_reg=False)
plt.show()


Train

Split the data into train and test sets


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split

In [8]:
# create design matrix X and target vector y
X = np.array(df.iloc[:, 0:4])  # positional indexing with iloc; end index is exclusive
y = np.array(df['class'])      # label-based column indexing

In [9]:
print('{}, {}'.format(len(X), len(y)))


150, 150

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
print('X_train {}, X_test {}, y_train {}, y_test {}'.format(len(X_train), len(X_test), len(y_train), len(y_test)))


X_train 100, X_test 50, y_train 100, y_test 50
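
As a side note (a sketch, not part of the original run), train_test_split can also stratify on the class labels so that both splits keep the same class proportions as the full data set.


In [ ]:
# hypothetical variant: a stratified split preserves the class proportions
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)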

Define classifier


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [13]:
# instantiate learning model (k = 11)
knn = KNeighborsClassifier(n_neighbors=11)

In [14]:
# fitting the model
knn.fit(X_train, y_train)


Out[14]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [15]:
# predict the response
pred = knn.predict(X_test)

In [16]:
print(accuracy_score(y_test, pred))


1.0
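
Accuracy alone hides which classes get confused with which; a per-class breakdown can be obtained with a confusion matrix and classification report (a sketch using the same test-set predictions).


In [ ]:
# per-class view of the test-set predictions
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))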

k-fold cross validation

[!] Using the test set for hyperparameter tuning can lead to overfitting.
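
One common way to avoid this, besides the cross-validation used below, is to hold out a separate validation set from the training data and keep the test set untouched until the very end (a sketch, not the notebook's approach; the variable names are illustrative).


In [ ]:
# hypothetical validation split carved out of the training data for tuning
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42)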


In [17]:
from sklearn.model_selection import cross_val_score

# candidate values of K for KNN (the odd ones are selected below)
myList = list(range(1, 64))

In [18]:
# subsetting just the odd ones
neighbors = list(filter(lambda x: x % 2 != 0, myList))

In [19]:
# empty list that will hold cv scores
cv_scores = []
cv_scores_std = []

# perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
    cv_scores_std.append(scores.std())

plot the misclassification error versus K


In [20]:
# convert mean cross-validation accuracy to misclassification error (1 - accuracy)
MSE = [1 - x for x in cv_scores]

In [21]:
# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]

In [22]:
print('the optimal number of neighbors is {}'.format(optimal_k))


the optimal number of neighbors is 7
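
The same search over K can also be expressed with GridSearchCV, which wraps the 10-fold cross-validation loop above (a minimal sketch, not the notebook's approach; tie-breaking between equally good values of K may differ from the manual loop).


In [ ]:
# grid search over the same odd values of K, 10-fold CV on the training set
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': neighbors},
                    cv=10, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_)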

Score the misclassification error on the held-out test set for a plain KNeighborsClassifier


In [23]:
cv_scores_for_test = []
# fit on the training set and score on the held-out test set for each k
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    cv_scores_for_test.append(accuracy_score(y_test, knn.predict(X_test)))
    
MSE_test = [1 - x for x in cv_scores_for_test]

In [24]:
# repeat using only the first two features (sepal length and width)
# fit on the training set and score on the test set for each k
cv_scores_for_test_0_2 = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train[:,:2], y_train)
    cv_scores_for_test_0_2.append(accuracy_score(y_test, knn.predict(X_test[:,:2])))
    
MSE_test_0_2 = [1 - x for x in cv_scores_for_test_0_2]

In [25]:
# repeat using only the last two features (petal length and width)
# fit on the training set and score on the test set for each k
cv_scores_for_test_2_4 = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train[:,2:4], y_train)
    cv_scores_for_test_2_4.append(accuracy_score(y_test, knn.predict(X_test[:,2:4])))
    
MSE_test_2_4 = [1 - x for x in cv_scores_for_test_2_4]

In [26]:
# plot misclassification error vs k
plt.clf()
plt.plot(neighbors, MSE, label='10-fold CV error')
cv_low = [x - x_std for x, x_std in zip(MSE, cv_scores_std)]
cv_hi = [x + x_std for x, x_std in zip(MSE, cv_scores_std)]
plt.fill_between(neighbors, cv_low, cv_hi, label='10-fold CV error (±1 std)', alpha=0.3)
plt.plot(neighbors, MSE_test, label='test error (all features)')
plt.plot(neighbors, MSE_test_0_2, label='test error (features [:2])')
plt.plot(neighbors, MSE_test_2_4, label='test error (features [2:4])')
plt.xlabel('number of neighbors K')
plt.ylabel('misclassification error')
plt.legend()
plt.show()


K-nn from scratch


In [27]:
import collections

In [28]:
def train(X_train, y_train):
    # k-NN is a lazy learner: there is no training step, the data is just kept as-is
    return

In [29]:
def predict(X_train, y_train, x_test, k):
    # first we compute the euclidean distance
    distances = [
        [np.sqrt(np.sum(np.square(x_test - x_train))), i] 
        for i, x_train in enumerate(X_train)
    ]
        
    # sort the list
    distances = sorted(distances)
    
    # make a list of the k neighbors' targets
    targets = [y_train[distance[1]] for distance in distances[:k]]
        
    # return most common target
    return collections.Counter(targets).most_common(1)[0][0]
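
The per-sample loop above can also be written with vectorized NumPy operations; a minimal sketch of an equivalent variant (predict_vectorized is an illustrative name, not part of the notebook):


In [ ]:
# vectorized variant: compute all distances to x_test in one call
def predict_vectorized(X_train, y_train, x_test, k):
    # Euclidean distance from x_test to every training point
    dists = np.linalg.norm(X_train - x_test, axis=1)
    # indices of the k closest training points
    nearest = np.argsort(dists)[:k]
    # majority vote among their labels
    return collections.Counter(y_train[nearest]).most_common(1)[0][0]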

In [30]:
def k_nearest_neighbour(X_train, y_train, X_test, k):
    # train on the input data
    train(X_train, y_train)
    
    # loop over all observations
    return [predict(X_train, y_train, x_test, k) for x_test in X_test]

In [31]:
# making our predictions 
pred = k_nearest_neighbour(X_train, y_train, X_test, 1)

# transform the list into an array
pred = np.asarray(pred)

# evaluating accuracy
accuracy = accuracy_score(y_test, pred)
print('\nThe accuracy of our classifier is {}'.format(accuracy))


The accuracy of our classifier is 0.98
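
As a sanity check (a sketch, not part of the original run), the from-scratch classifier can be compared against scikit-learn's implementation with the same k; the fraction of matching predictions should be at or near 1.0.


In [ ]:
# compare the from-scratch predictions with sklearn's KNeighborsClassifier (k=1)
sk_pred = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train).predict(X_test)
print(np.mean(sk_pred == pred))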

Get predictions using only features [:2]


In [32]:
# making our predictions 
pred = k_nearest_neighbour(X_train[:,:2], y_train, X_test[:,:2], 1)

# transform the list into an array
pred = np.asarray(pred)

# evaluating accuracy
accuracy = accuracy_score(y_test, pred)
print('\nThe accuracy of our classifier is {}'.format(accuracy))


The accuracy of our classifier is 0.78

Get predictions using only features [2:4]


In [33]:
# making our predictions 
pred = k_nearest_neighbour(X_train[:,2:4], y_train, X_test[:,2:4], 1)

# transform the list into an array
pred = np.asarray(pred)

# evaluating accuracy
accuracy = accuracy_score(y_test, pred)
print('\nThe accuracy of our classifier is {}'.format(accuracy))


The accuracy of our classifier is 0.98

Decision Regions


In [34]:
def label_to_int(labels):
    # map class names to integers using a fixed (sorted) order,
    # so the contour and scatter colors stay consistent
    return [sorted(set(labels)).index(y_value) for y_value in labels]
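
scikit-learn's LabelEncoder performs the same string-to-integer mapping (also using a sorted class order); a minimal sketch as an alternative:


In [ ]:
# alternative to label_to_int using scikit-learn's LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_int = le.fit_transform(y)  # Iris-setosa -> 0, Iris-versicolor -> 1, Iris-virginica -> 2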

In [35]:
# use an explicit color map so the contour and the scatter use matching colors
from matplotlib import cm

# choose 2 features to classify
features_indexes = [0,1]

# Plotting decision regions
x_min, x_max = X[:, features_indexes[0]].min() - 1, X[:, features_indexes[0]].max() + 1
y_min, y_max = X[:, features_indexes[1]].min() - 1, X[:, features_indexes[1]].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

knn = KNeighborsClassifier(n_neighbors=13)
knn.fit(X_train[:, features_indexes], y_train)

Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = np.array(label_to_int(Z))
Z = Z.reshape(xx.shape)

# TODO: try to use seaborn instead 
plt.contourf(xx, yy, Z, alpha=0.4, cmap=cm.jet)
plt.scatter(X[:, features_indexes[0]], X[:, features_indexes[1]], c=label_to_int(y), alpha=0.9, cmap=cm.jet)
plt.show()



In [ ]: