In [70]:
# import
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


%matplotlib inline

In [37]:
# Read the pipe-delimited data file; it has no header row, so pandas assigns
# integer column labels.
df = pd.read_csv("data/seeds_dataset.txt", sep='|', header=None)
# Preview the first three rows.
df.head(3)


Out[37]:
0 1 2 3 4 5 6 7
0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 1
1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 1
2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 1

In [65]:
# The first seven columns form the feature matrix; column 7 is the target.
X = df.iloc[:, :7]
y = df[7]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=23,
)

Implementing a KNN classifier with K = 1, evaluated on a single hold-out split


In [56]:
# Baseline: 1-nearest-neighbour fitted on the training split, with no feature
# scaling applied.
clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Accuracy of the fitted model on the held-out test split.
y_pred = clf.predict(X_test)
scores = accuracy_score(y_test, y_pred)
print("Scores : ", scores)


Scores :  0.936507936508

Repeating the same evaluation with 5-fold cross-validation (KFold)


In [69]:
# Evaluate the 1-NN classifier with 5-fold cross-validation rather than a
# single hold-out split.
clf = KNeighborsClassifier(n_neighbors=1)

# Shuffle before splitting; the fixed seed makes the folds reproducible.
kf = KFold(n_splits=5, random_state=34, shuffle=True)
print(kf)

# Rebind X and y as plain NumPy arrays so they can be indexed positionally
# with the integer fold indices that kf.split() yields.
X = np.array(X)
y = np.array(y)

# cross_val_score clones `clf`, fits the clone on each training fold and
# returns its accuracy on the matching test fold -- the same numbers as a
# hand-written fit / predict / accuracy_score loop over kf.split(X).
# Note: because a clone is fitted, `clf` itself is left unfitted.
scores = cross_val_score(clf, X, y, cv=kf, scoring="accuracy").tolist()

print("Scores: ", scores)
print("Mean Scores: ", np.mean(scores))


KFold(n_splits=5, random_state=34, shuffle=True)
Scores:  [0.95238095238095233, 0.8571428571428571, 0.90476190476190477, 0.90476190476190477, 0.8571428571428571]
Mean Scores:  0.895238095238

Standardizing the features and chaining the scaler with KNN in a pipeline


In [72]:
# Standardize the features before 1-NN. Wrapping both steps in a Pipeline
# means the scaler is re-fitted on each training fold only, so no statistics
# from the test fold leak into the model.
clf = Pipeline([
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=1)),
])

# Reuse the `kf` splitter created in the earlier cross-validation cell so the
# folds match the unscaled run and the two mean accuracies are comparable.
# cross_val_score accepts X/y as either NumPy arrays or pandas objects, so
# this cell does not depend on the earlier array conversion.
# Note: a clone is fitted per fold, so `clf` itself is left unfitted.
scores = cross_val_score(clf, X, y, cv=kf, scoring="accuracy").tolist()

print("Scores: ", scores)
print("Mean Scores: ", np.mean(scores))


Scores:  [0.9285714285714286, 0.9285714285714286, 0.9285714285714286, 0.97619047619047616, 0.90476190476190477]
Mean Scores:  0.933333333333

In [ ]: