In [20]:
import random
from scipy.spatial import distance
from sklearn import datasets
#from sklearn import tree
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.metrics import accuracy_score
#from sklearn.neighbors import KNeighborsClassifier
In [35]:
def euc(a, b):
    # straight-line (Euclidean) distance between a test row and one training point
    return distance.euclidean(a, b)
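For intuition, scipy's distance.euclidean is just the square root of the summed squared per-coordinate differences; a minimal pure-Python equivalent (the helper name euc_manual is ours, not a scipy API):
In [ ]:
import math

def euc_manual(a, b):
    # sqrt of the sum of squared per-coordinate differences
    return math.sqrt(sum((ai - bi) ** 2 for ai, bi in zip(a, b)))

euc_manual([5.1, 3.5, 1.4, 0.2], [4.9, 3.0, 1.4, 0.2])  # ≈ 0.539, same as euc()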
In [36]:
class ScrappyKNN:
    def fit(self, X_train, y_train):
        # memorize the training data; KNN does no real work at fit time
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        predictions = []
        for row in X_test:
            #label = random.choice(self.y_train)
            label = self.closest(row)
            predictions.append(label)
        return predictions

    def closest(self, row):
        # linear scan: track the training point with the smallest distance
        best_dist = euc(row, self.X_train[0])
        best_index = 0
        for i in range(1, len(self.X_train)):
            dist = euc(row, self.X_train[i])
            if dist < best_dist:
                best_dist = dist
                best_index = i
        return self.y_train[best_index]
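The class above is effectively 1-NN: closest returns the label of the single nearest training point. A sketch of how it could generalize to k neighbors with a majority vote; the ScrappyKNNK name, the k parameter, and the Counter-based vote are our additions, not part of the original walkthrough:
In [ ]:
from collections import Counter

class ScrappyKNNK(ScrappyKNN):
    def __init__(self, k=3):
        self.k = k

    def closest(self, row):
        # rank all training indices by distance to the query row,
        # then vote among the labels of the k nearest
        nearest = sorted(range(len(self.X_train)),
                         key=lambda i: euc(row, self.X_train[i]))[:self.k]
        votes = Counter(self.y_train[i] for i in nearest)
        return votes.most_common(1)[0][0]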
In [37]:
iris = datasets.load_iris()
# by analogy with f(x) = y: features X map to labels y
X = iris.data # array([[ 5.1, 3.5, 1.4, 0.2], [ 4.9, 3. , 1.4, 0.2]])
y = iris.target # array([0, 0])
# test_size=0.5 holds out half of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
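# sanity check (our addition): iris has 150 samples, so test_size=0.5
# leaves 75 rows for training and 75 for testing
assert X_train.shape == (75, 4) and X_test.shape == (75, 4)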
# my_classifier = tree.DecisionTreeClassifier()
# my_classifier = KNeighborsClassifier()
my_classifier = ScrappyKNN()
my_classifier.fit(X_train, y_train)
predictions = my_classifier.predict(X_test) # array([2, 0, 2])
accuracy_score(y_test, predictions)
Out[37]:
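For comparison, either of the commented-out scikit-learn classifiers can be swapped in on the same split; ScrappyKNN should score close to KNeighborsClassifier with n_neighbors=1:
In [ ]:
from sklearn.neighbors import KNeighborsClassifier

sk_classifier = KNeighborsClassifier(n_neighbors=1)
sk_classifier.fit(X_train, y_train)
accuracy_score(y_test, sk_classifier.predict(X_test))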
In [ ]: