IMPLEMENTING K_NEAREST_NEIGHBOUR

In the given data set we have to classify into which cluster a instance is going to fall

Importing required predifined methods


In [155]:
import pandas as pd
import numpy as np
from collections import Counter
from math import sqrt
import random
import warnings

reading the training data into a data frame and assigning headings


In [156]:
df = pd.read_table('train.csv', sep=',', header=None, names=['Type', 'LifeStyle', 'Vacation', 'eCredit', 'Salary', 'Property', 'Label'])
df.head()


Out[156]:
Type LifeStyle Vacation eCredit Salary Property Label
0 student spend>saving 6 40 13.62 3.2804 C1
1 student spend>saving 11 21 15.32 2.0232 C1
2 student spend>saving 7 64 16.55 3.1202 C1
3 student spend>saving 3 47 15.71 3.4022 C1
4 student spend>saving 15 10 16.96 2.2825 C1

reading the testing data into a data frame and assigning headings


In [157]:
dft = pd.read_table('test.csv', sep=',', header=None, names=['Type', 'LifeStyle', 'Vacation', 'eCredit', 'Salary', 'Property', 'Label'])
dft.head()


Out[157]:
Type LifeStyle Vacation eCredit Salary Property Label
0 student spend<saving 12 19 14.7900 3.7697 C1
1 student spend>>saving 29 10 16.1900 2.4839 C1
2 student spend<<saving 28 60 15.4600 1.1885 C1
3 engineer spend>saving 15 41 21.2600 1.4379 C1
4 librarian spend<saving 2 9 19.7207 0.6913 C1

changing the strings in the train data into to numbers so that they can be used while calculating euclidean distance


In [158]:
df['Type'] = df.Type.map({'student':1,'engineer':2,'librarian':3,'professor':4,'doctor':5 }) 
# df.head()
df['LifeStyle'] = df.LifeStyle.map({'spend<<saving':1, 'spend<saving':2, 'spend>saving':3, 'spend>>saving':4}) 

df['Label'] = df.Label.map({'C1':1, 'C2':2 ,'C3':3 ,'C4':4 ,'C5':5})

# df['Vacation']=df['Vacation']/100
df.head()


Out[158]:
Type LifeStyle Vacation eCredit Salary Property Label
0 1 3 6 40 13.62 3.2804 1
1 1 3 11 21 15.32 2.0232 1
2 1 3 7 64 16.55 3.1202 1
3 1 3 3 47 15.71 3.4022 1
4 1 3 15 10 16.96 2.2825 1

changing the strings in the test data into to numbers so that they can be used while calculating euclidean distance


In [159]:
dft['Type'] = dft.Type.map({'student':1,'engineer':2,'librarian':3,'professor':4,'doctor':5 }) 
# df.head()
dft['LifeStyle'] = dft.LifeStyle.map({'spend<<saving':1, 'spend<saving':2, 'spend>saving':3, 'spend>>saving':4}) 

dft['Label'] = dft.Label.map({'C1':1, 'C2':2 ,'C3':3 ,'C4':4 ,'C5':5})

# df['Vacation']=df['Vacation']/100
dft.head()


Out[159]:
Type LifeStyle Vacation eCredit Salary Property Label
0 1 2 12 19 14.7900 3.7697 1
1 1 4 29 10 16.1900 2.4839 1
2 1 1 28 60 15.4600 1.1885 1
3 2 3 15 41 21.2600 1.4379 1
4 3 2 2 9 19.7207 0.6913 1

Normalizing the values in the Vacation,eCredit,Salary,Property so that all the values range in between 0 to 1.we use a formula to do this.we are going to capture min an max for the categories which should be normalized so that we can use them for test data normalization also


In [160]:
vacmaxval=0
vacminval=0
ecrmaxval=0
ecrminval=0
salmaxval=0
salminval=0
prpminval=0
prpmaxval=0

for attribute in list(df.columns.values):
    if attribute == 'Vacation' or attribute == 'eCredit' or attribute == 'Salary' or attribute == 'Property':
        if attribute == 'Vacation':
            vacmaxval=max(df[attribute])
            vacminval=min(df[attribute])
        elif attribute == 'eCredit':
            ecrmaxval=max(df[attribute])
            ecrminval=min(df[attribute])
        elif attribute == 'Salary':
            salmaxval=max(df[attribute])
            salminval=min(df[attribute])
        elif attribute == 'Property':
            prpmaxval=max(df[attribute])
            prpminval=min(df[attribute])
        maxValue = max(df[attribute])
        minValue = min(df[attribute])

        norm = []
        for i in df[attribute]:
            normalisedValue = (i - minValue + 0.0)/(maxValue - minValue + 0.0)
            norm.append(normalisedValue)
        df[attribute] = norm

df.head()

# uncomment the below line to get a csv file of the dataframe

#df.to_csv('sanitizedData.csv', sep=',', encoding='utf-8', header=False)


Out[160]:
Type LifeStyle Vacation eCredit Salary Property Label
0 1 3 0.079365 0.107558 0.212288 0.183167 1
1 1 3 0.158730 0.052326 0.282879 0.112797 1
2 1 3 0.095238 0.177326 0.333953 0.174200 1
3 1 3 0.031746 0.127907 0.299073 0.189984 1
4 1 3 0.222222 0.020349 0.350978 0.127311 1

normalizing the testing data too


In [161]:
minValue=0
maxValue=0

for attribute in list(dft.columns.values):
    if attribute == 'Vacation' or attribute == 'eCredit' or attribute == 'Salary' or attribute == 'Property':
        norm = []
        for i in dft[attribute]:
            if attribute == 'Vacation':
                minValue=vacminval
                maxValue=vacmaxval
            elif attribute == 'eCredit':
                minValue=ecrminval
                maxValue=ecrmaxval
            elif attribute == 'Salary':
                minValue=salmaxval
                maxValue=salminval
            elif attribute == 'Property':
                minValue=prpminval
                maxValue=prpmaxval
            normalisedValue = (i - minValue + 0.0)/(maxValue - minValue + 0.0)
            norm.append(normalisedValue)
        dft[attribute] = norm

dft.head()


Out[161]:
Type LifeStyle Vacation eCredit Salary Property Label
0 1 2 0.174603 0.046512 0.739129 0.210554 1
1 1 4 0.444444 0.020349 0.680995 0.138584 1
2 1 1 0.428571 0.165698 0.711308 0.066076 1
3 2 3 0.222222 0.110465 0.470468 0.080036 1
4 3 2 0.015873 0.017442 0.534386 0.038246 1

converting all the data too float into make calculations accurate


In [162]:
dataframe= df.astype(float).values.tolist()
dataframet= dft.astype(float).values.tolist()

we have to shuffle data so that we can see how good our own function is classifing the instances into clusters


In [163]:
#print dataframe[:10]
random.shuffle(dataframe)
#print dataframe[:10]

developing our own k nearest algorithm with respective to our requirments, we use similarity matrix to take decision between then two parameters, and the rest we can go with euclidean distance.


In [164]:
def k_nearest(data,predict,k=5):
    #if(len(data)>=k):
    #    warnings.warn('bye')
    distances =[]
    for group in data:
        for features in  data[group]:
            euclidean_distance=0
            if(predict[0]!=features[0]):
                euclidean_distance+=1
            if(predict[1]!=features[1]):
                euclidean_distance+=1
            euclidean_distance += ((predict[2]-features[2])**2 + (predict[3]-features[3])**2 +(predict[4]-features[4])**2 + (predict[5]-features[5])**2)
            #euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
            euclidean_distance=sqrt(euclidean_distance)
            distances.append([euclidean_distance,group])
    votes = [i[1] for i in sorted(distances) [:k]]
    #print distances
    print votes
    #print (Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    #print vote_result
    
            
    return vote_result

splitting the data with respective to ratio and segregating the train an test data into respective clustres


In [165]:
test_size=0.2
train_set={1.0:[],2.0:[],3.0:[],4.0:[],5.0:[]}
test_set={1.0:[],2.0:[],3.0:[],4.0:[],5.0:[]}


test_setnew={1.0:[],2.0:[],3.0:[],4.0:[],5.0:[]}
train_setnew={1.0:[],2.0:[],3.0:[],4.0:[],5.0:[]}


train_data= dataframe[:-int(test_size*len(dataframe))]
test_data= dataframe[-int(test_size*len(dataframe)):]


train_datanew= dataframe[:int(1*len(dataframe))]
test_datanew=dataframet[:int(1*len(dataframet))]
#print test_set
#print test_datanew

Removing the cluster names from the instances and adding them to lists.


In [166]:
for i in train_data:
    train_set[i[-1]].append(i[:-1])
    
for i in test_data:
    test_set[i[-1]].append(i[:-1])
    
#print test_set
    
for i in test_datanew:
    test_setnew[i[-1]].append(i[:-1])

for i in train_datanew:
    train_setnew[i[-1]].append(i[:-1])

    
    
#print test_set

#print test_setnew

Sending our slpitted test and train from the given train data in order to cross validadate


In [167]:
correct=0.0
total=0.0
for group in test_set:
    for data in test_set[group]:
        vote= k_nearest(train_set,data,k=5)
        #print '*****'
        #print group
        #print vote
        #print '*****'
        if group==vote:
            correct+=1
        total +=1
print correct
print total
print ('Accuracy:', correct/total)


[4.0, 4.0, 1.0, 1.0, 4.0]
[1.0, 1.0, 3.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 4.0]
[1.0, 4.0, 4.0, 1.0, 4.0]
[1.0, 1.0, 1.0, 1.0, 3.0]
[1.0, 1.0, 1.0, 1.0, 1.0]
[2.0, 2.0, 2.0, 2.0, 2.0]
[5.0, 5.0, 1.0, 5.0, 5.0]
[1.0, 1.0, 1.0, 1.0, 3.0]
[1.0, 4.0, 4.0, 4.0, 5.0]
[2.0, 2.0, 2.0, 2.0, 2.0]
[2.0, 2.0, 1.0, 2.0, 2.0]
[1.0, 2.0, 2.0, 2.0, 2.0]
[2.0, 1.0, 2.0, 2.0, 2.0]
[2.0, 2.0, 2.0, 2.0, 2.0]
[2.0, 2.0, 2.0, 2.0, 2.0]
[3.0, 3.0, 3.0, 3.0, 3.0]
[3.0, 3.0, 3.0, 3.0, 3.0]
[3.0, 3.0, 1.0, 3.0, 3.0]
[3.0, 1.0, 3.0, 3.0, 3.0]
[3.0, 1.0, 1.0, 3.0, 1.0]
[1.0, 3.0, 3.0, 3.0, 3.0]
[3.0, 3.0, 3.0, 1.0, 3.0]
[3.0, 1.0, 3.0, 3.0, 3.0]
[4.0, 5.0, 5.0, 5.0, 5.0]
[1.0, 1.0, 4.0, 4.0, 1.0]
[4.0, 4.0, 1.0, 4.0, 4.0]
[4.0, 2.0, 2.0, 2.0, 2.0]
[1.0, 4.0, 4.0, 4.0, 3.0]
[1.0, 3.0, 1.0, 1.0, 1.0]
[4.0, 1.0, 5.0, 5.0, 1.0]
[1.0, 4.0, 4.0, 4.0, 3.0]
[4.0, 4.0, 4.0, 3.0, 3.0]
[5.0, 5.0, 4.0, 1.0, 4.0]
[5.0, 5.0, 5.0, 5.0, 5.0]
[5.0, 5.0, 5.0, 5.0, 4.0]
[5.0, 5.0, 1.0, 5.0, 5.0]
[5.0, 5.0, 5.0, 5.0, 5.0]
[5.0, 5.0, 1.0, 5.0, 5.0]
[4.0, 5.0, 5.0, 4.0, 5.0]
[5.0, 1.0, 5.0, 5.0, 4.0]
29.0
41.0
('Accuracy:', 0.7073170731707317)

Sending our actual testing data to find the accuracy our model


In [168]:
correct =0.0
total =0.0
for group in test_setnew:
    for data in test_setnew[group]:
        vote= k_nearest(train_set,data,k=5)
        print '*****'
        print group
        print vote
        #print '*****'
        if group==vote:
            correct+=1
        total +=1
print correct
print total
print ('Accuracy:', (correct)/total)


[3.0, 1.0, 3.0, 3.0, 1.0]
*****
1.0
3.0
[4.0, 1.0, 1.0, 1.0, 1.0]
*****
1.0
1.0
[4.0, 4.0, 1.0, 1.0, 1.0]
*****
1.0
1.0
[1.0, 1.0, 1.0, 1.0, 1.0]
*****
1.0
1.0
[2.0, 2.0, 2.0, 2.0, 2.0]
*****
1.0
2.0
[2.0, 4.0, 2.0, 2.0, 2.0]
*****
1.0
2.0
[1.0, 2.0, 2.0, 2.0, 2.0]
*****
1.0
2.0
[2.0, 2.0, 2.0, 2.0, 1.0]
*****
1.0
2.0
[1.0, 1.0, 3.0, 1.0, 1.0]
*****
1.0
1.0
[1.0, 1.0, 1.0, 1.0, 3.0]
*****
1.0
1.0
[3.0, 3.0, 3.0, 3.0, 3.0]
*****
1.0
3.0
[3.0, 1.0, 3.0, 3.0, 3.0]
*****
1.0
3.0
[5.0, 5.0, 5.0, 1.0, 5.0]
*****
1.0
5.0
[1.0, 4.0, 4.0, 4.0, 3.0]
*****
1.0
4.0
[1.0, 1.0, 4.0, 1.0, 1.0]
*****
1.0
1.0
[1.0, 4.0, 1.0, 1.0, 1.0]
*****
1.0
1.0
[1.0, 4.0, 4.0, 4.0, 5.0]
*****
1.0
4.0
[1.0, 4.0, 4.0, 4.0, 5.0]
*****
1.0
4.0
[5.0, 5.0, 1.0, 4.0, 4.0]
*****
1.0
4.0
[4.0, 4.0, 5.0, 5.0, 1.0]
*****
1.0
4.0
[5.0, 5.0, 5.0, 1.0, 5.0]
*****
1.0
5.0
7.0
21.0
('Accuracy:', 0.3333333333333333)

In [ ]:


In [ ]: