In [1]:
import math
import operator
from random import random 

# Relative location of the comma-separated data file that the loading cell opens.
path = 'data/iris.data'

In [2]:
def euclidean_distance(data1, data2):
    """Return the Euclidean distance between two rows, ignoring the final slot.

    The number of coordinates compared is ``len(data2) - 1``: the last entry
    of ``data2`` is never read, and ``data1`` is indexed at the same positions.
    """
    squared_total = 0.0
    position = 0
    stop = len(data2) - 1
    while position < stop:
        gap = data1[position] - data2[position]
        squared_total += gap ** 2
        position += 1
    return math.sqrt(squared_total)

# Quick check: both rows end in a label, so only the three leading numbers
# take part in the distance.
data1 = [2, 2, 2, 'a']
data2 = [4, 4, 4,'c']
print(euclidean_distance(data1,data2))


3.4641016151377544

In [3]:
def knn(train, test, k):
    """Return the ``k`` rows of ``train`` that lie closest to ``test``.

    Args:
        train: sequence of rows; each one is handed to ``euclidean_distance``
            as its first argument.
        test: the query row, handed to ``euclidean_distance`` as its second
            argument.
        k: number of neighbours wanted.

    Returns:
        A list holding rows of ``train`` (the same objects, not copies),
        nearest first. Rows at equal distance keep their order in ``train``.
        The list has at most ``len(train)`` entries and is empty when
        ``k <= 0``.
    """
    scored = [(row, euclidean_distance(row, test)) for row in train]
    # A single stable sort once every distance is known gives the same order
    # as re-sorting after each append, without the repeated work.
    scored.sort(key=operator.itemgetter(1))
    # Slicing (rather than indexing 0..k-1) tolerates k larger than the
    # training set; max() keeps a negative k from slicing from the end.
    return [row for row, _ in scored[:max(k, 0)]]

# Two labelled training rows and one query row. The query carries a
# placeholder label because euclidean_distance skips the last slot of its
# second argument; without it the third coordinate would be left out.
train= [[2, 2, 2, 'a'], [4, 4, 4, 'b']]
test= [5, 5, 5, '?']
neighbors = knn(train, test, 2)
print(neighbors)


[[4, 4, 4, 'b'], [2, 2, 2, 'a']]

In [4]:
def majorityVote(neighbors):
    """Return the label that occurs most often among ``neighbors``.

    The label of a row is its last element. When several labels share the
    highest count, the winner is the first of them in the tally dict's
    iteration order (insertion order on Python 3.7+).
    """
    tally = {}
    for row in neighbors:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first key that reaches the highest count.
    return max(tally, key=tally.get)
# Four labelled training rows; the query row ends in a placeholder label so
# that all three of its coordinates are used by euclidean_distance, which
# skips the last slot of its second argument.
train= [[2, 2, 2, 'a'], [4, 4, 4, 'b'],[1,1,1,'a'], [3,3,3,'b']]
test= [5, 5, 5, '?']
neighbors = knn(train, test, 2)
print(majorityVote(neighbors))


b

In [5]:
def accuracy(test, predictions):
    """Return the percentage of rows in ``test`` whose label was predicted.

    Args:
        test: rows whose last element is the true label.
        predictions: predicted labels; ``predictions[i]`` is compared with
            ``test[i][-1]``.

    Returns:
        A float between 0 and 100. An empty ``test`` yields ``0.0``.
    """
    total = len(test)
    if total == 0:
        # Nothing to score (for instance when every row went to training);
        # report 0.0 instead of dividing by zero.
        return 0.0
    hits = sum(1 for i in range(total) if test[i][-1] == predictions[i])
    return (hits / float(total)) * 100

In [6]:
def traintest(data, split, train, test):
    """Convert feature fields to floats and deal the rows into two lists.

    Args:
        data: list of rows (lists). Every field except the last is replaced
            in place by its ``float`` value; the last field is left as is.
        split: a row is appended to ``train`` when ``random() < split``,
            otherwise to ``test``.
        train: list that receives the training rows (appended in place).
        test: list that receives the held-out rows (appended in place).

    Returns:
        None. The rows placed in ``train``/``test`` are the same list objects
        that are in ``data``.

    Raises:
        ValueError: if a feature field cannot be parsed as a float.
    """
    for row in data:
        # A row with fewer than two fields (such as the [''] produced by a
        # blank line) holds no sample. Skipping only those rows keeps the
        # final real sample, which a blanket "ignore the last row" would lose.
        if len(row) < 2:
            continue
        feature_count = len(row) - 1
        # Slice assignment converts the features while keeping the row object.
        row[:feature_count] = [float(value) for value in row[:feature_count]]
        if random() < split:
            train.append(row)
        else:
            test.append(row)

In [7]:
# Number of neighbours that take part in each vote.
k = int(input("Please enter value of k:\t"))
# The split is typed as a whole-number percentage and kept as a fraction.
split = int(input("Please enter the percentage split:\t")) / 100


Please enter value of k:	3
Please enter the percentage split:	67

In [8]:
# Read every line of the file into a list of string fields. The ``with``
# block closes the file handle as soon as reading is finished.
with open(path) as handle:
    data = [line.strip().split(',') for line in handle]
# Fresh containers for this run; traintest fills train and test in place.
train,test,predictions = [],[],[]
traintest(data,split,train,test)
print('Train set:\t', len(train))
print('Test set:\t', len(test))


Train set:	 100
Test set:	 49

In [9]:
# Rebuild the list from scratch so that running this cell again does not
# append a second copy of the predictions to the earlier ones.
predictions = []
for i in range(len(test)):
    # Classify each held-out row by a vote among its k nearest training rows.
    neighbors = knn(train,test[i],k)
    responses = majorityVote(neighbors)
    predictions.append(responses)
    print('predicted = %s, actual = %s'%(responses,test[i][-1]))


predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-setosa, actual = Iris-setosa
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-virginica, actual = Iris-versicolor
predicted = Iris-virginica, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-versicolor, actual = Iris-versicolor
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica
predicted = Iris-virginica, actual = Iris-virginica

In [10]:
# Compare the collected predictions with the true labels of the held-out rows
# and print the resulting percentage.
acc = accuracy(test, predictions)
print('Accuracy: ' + repr(acc) + '%')


Accuracy: 95.91836734693877%