In [70]:
# imports: numpy/pandas for data handling, matplotlib for plots, scikit-learn for KNN, cross-validation and pipelines
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
%matplotlib inline
In [37]:
df = pd.read_csv("data/seeds_dataset.txt", delimiter='|', header=None)
df[:3]
Out[37]: (first three rows of the seeds dataframe)
In [65]:
X = df[df.columns[0:7]]
y = df[7]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)
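As a quick sanity check (not part of the original run), the shapes of the resulting splits can be printed; with test_size=0.3, roughly 30% of the rows land in the test set:
In [ ]:
# sanity check: expect roughly a 70/30 split of the rows
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)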
Implementing a KNN classifier with K = 1
In [56]:
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = accuracy_score(y_test, y_pred)
print("Accuracy: ", score)
Repeating the same evaluation with K-fold cross-validation
In [69]:
clf = KNeighborsClassifier(n_neighbors=1)
kf = KFold(n_splits=5, random_state=34, shuffle=True)
print(kf)
X = np.array(X)
y = np.array(y)
scores = []
for train, test in kf.split(X):
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    scores.append(accuracy_score(y[test], pred))
print("Scores: ", scores)
print("Mean Scores: ", np.mean(scores))
Standardizing the data and creating a pipeline
In [72]:
clf = Pipeline([('scale', StandardScaler()),
                ('knn', KNeighborsClassifier(n_neighbors=1))])
scores = []
for train, test in kf.split(X):
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    scores.append(accuracy_score(y[test], pred))
print("Scores: ", scores)
print("Mean Scores: ", np.mean(scores))