IMPLEMENTING K_NEAREST_NEIGHBOUR

In the given data set we have to classify into which cluster a instance is going to fall

Importing required predifined methods



In [155]:

    
import pandas as pd
import numpy as np
from collections import Counter
from math import sqrt
import random
import warnings

reading the training data into a data frame and assigning headings



In [156]:

    
df = pd.read_table('train.csv', sep=',', header=None, names=['Type', 'LifeStyle', 'Vacation', 'eCredit', 'Salary', 'Property', 'Label'])
df.head()









    Out[156]:







  
    
      
      Type
      LifeStyle
      Vacation
      eCredit
      Salary
      Property
      Label
    
  
  
    
      0
      student
      spend>saving
      6
      40
      13.62
      3.2804
      C1
    
    
      1
      student
      spend>saving
      11
      21
      15.32
      2.0232
      C1
    
    
      2
      student
      spend>saving
      7
      64
      16.55
      3.1202
      C1
    
    
      3
      student
      spend>saving
      3
      47
      15.71
      3.4022
      C1
    
    
      4
      student
      spend>saving
      15
      10
      16.96
      2.2825
      C1

reading the testing data into a data frame and assigning headings



In [157]:

    
dft = pd.read_table('test.csv', sep=',', header=None, names=['Type', 'LifeStyle', 'Vacation', 'eCredit', 'Salary', 'Property', 'Label'])
dft.head()









    Out[157]:







  
    
      
      Type
      LifeStyle
      Vacation
      eCredit
      Salary
      Property
      Label
    
  
  
    
      0
      student
      spend<saving
      12
      19
      14.7900
      3.7697
      C1
    
    
      1
      student
      spend>>saving
      29
      10
      16.1900
      2.4839
      C1
    
    
      2
      student
      spend<<saving
      28
      60
      15.4600
      1.1885
      C1
    
    
      3
      engineer
      spend>saving
      15
      41
      21.2600
      1.4379
      C1
    
    
      4
      librarian
      spend<saving
      2
      9
      19.7207
      0.6913
      C1

changing the strings in the train data into to numbers so that they can be used while calculating euclidean distance



In [158]:

    
df['Type'] = df.Type.map({'student':1,'engineer':2,'librarian':3,'professor':4,'doctor':5 }) 
# df.head()
df['LifeStyle'] = df.LifeStyle.map({'spend<<saving':1, 'spend<saving':2, 'spend>saving':3, 'spend>>saving':4}) 

df['Label'] = df.Label.map({'C1':1, 'C2':2 ,'C3':3 ,'C4':4 ,'C5':5})

# df['Vacation']=df['Vacation']/100
df.head()

changing the strings in the test data into to numbers so that they can be used while calculating euclidean distance



In [159]:

    
dft['Type'] = dft.Type.map({'student':1,'engineer':2,'librarian':3,'professor':4,'doctor':5 }) 
# df.head()
dft['LifeStyle'] = dft.LifeStyle.map({'spend<<saving':1, 'spend<saving':2, 'spend>saving':3, 'spend>>saving':4}) 

dft['Label'] = dft.Label.map({'C1':1, 'C2':2 ,'C3':3 ,'C4':4 ,'C5':5})

# df['Vacation']=df['Vacation']/100
dft.head()

Normalizing the values in the Vacation,eCredit,Salary,Property so that all the values range in between 0 to 1.we use a formula to do this.we are going to capture min an max for the categories which should be normalized so that we can use them for test data normalization also



In [160]:

    
vacmaxval=0
vacminval=0
ecrmaxval=0
ecrminval=0
salmaxval=0
salminval=0
prpminval=0
prpmaxval=0

for attribute in list(df.columns.values):
    if attribute == 'Vacation' or attribute == 'eCredit' or attribute == 'Salary' or attribute == 'Property':
        if attribute == 'Vacation':
            vacmaxval=max(df[attribute])
            vacminval=min(df[attribute])
        elif attribute == 'eCredit':
            ecrmaxval=max(df[attribute])
            ecrminval=min(df[attribute])
        elif attribute == 'Salary':
            salmaxval=max(df[attribute])
            salminval=min(df[attribute])
        elif attribute == 'Property':
            prpmaxval=max(df[attribute])
            prpminval=min(df[attribute])
        maxValue = max(df[attribute])
        minValue = min(df[attribute])

        norm = []
        for i in df[attribute]:
            normalisedValue = (i - minValue + 0.0)/(maxValue - minValue + 0.0)
            norm.append(normalisedValue)
        df[attribute] = norm

df.head()

# uncomment the below line to get a csv file of the dataframe

#df.to_csv('sanitizedData.csv', sep=',', encoding='utf-8', header=False)

normalizing the testing data too



In [161]:

    
minValue=0
maxValue=0

for attribute in list(dft.columns.values):
    if attribute == 'Vacation' or attribute == 'eCredit' or attribute == 'Salary' or attribute == 'Property':
        norm = []
        for i in dft[attribute]:
            if attribute == 'Vacation':
                minValue=vacminval
                maxValue=vacmaxval
            elif attribute == 'eCredit':
                minValue=ecrminval
                maxValue=ecrmaxval
            elif attribute == 'Salary':
                minValue=salmaxval
                maxValue=salminval
            elif attribute == 'Property':
                minValue=prpminval
                maxValue=prpmaxval
            normalisedValue = (i - minValue + 0.0)/(maxValue - minValue + 0.0)
            norm.append(normalisedValue)
        dft[attribute] = norm

dft.head()

converting all the data too float into make calculations accurate



In [162]:

    
dataframe= df.astype(float).values.tolist()
dataframet= dft.astype(float).values.tolist()

we have to shuffle data so that we can see how good our own function is classifing the instances into clusters



In [163]:

    
#print dataframe[:10]
random.shuffle(dataframe)
#print dataframe[:10]

developing our own k nearest algorithm with respective to our requirments, we use similarity matrix to take decision between then two parameters, and the rest we can go with euclidean distance.



In [164]:

    
def k_nearest(data,predict,k=5):
    #if(len(data)>=k):
    #    warnings.warn('bye')
    distances =[]
    for group in data:
        for features in  data[group]:
            euclidean_distance=0
            if(predict[0]!=features[0]):
                euclidean_distance+=1
            if(predict[1]!=features[1]):
                euclidean_distance+=1
            euclidean_distance += ((predict[2]-features[2])**2 + (predict[3]-features[3])**2 +(predict[4]-features[4])**2 + (predict[5]-features[5])**2)
            #euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
            euclidean_distance=sqrt(euclidean_distance)
            distances.append([euclidean_distance,group])
    votes = [i[1] for i in sorted(distances) [:k]]
    #print distances
    print votes
    #print (Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    #print vote_result
    
            
    return vote_result

splitting the data with respective to ratio and segregating the train an test data into respective clustres



In [165]:

    
test_size=0.2
train_set={1.0:[],2.0:[],3.0:[],4.0:[],5.0:[]}
test_set={1.0:[],2.0:[],3.0:[],4.0:[],5.0:[]}


test_setnew={1.0:[],2.0:[],3.0:[],4.0:[],5.0:[]}
train_setnew={1.0:[],2.0:[],3.0:[],4.0:[],5.0:[]}


train_data= dataframe[:-int(test_size*len(dataframe))]
test_data= dataframe[-int(test_size*len(dataframe)):]


train_datanew= dataframe[:int(1*len(dataframe))]
test_datanew=dataframet[:int(1*len(dataframet))]
#print test_set
#print test_datanew

Removing the cluster names from the instances and adding them to lists.



In [166]:

    
for i in train_data:
    train_set[i[-1]].append(i[:-1])
    
for i in test_data:
    test_set[i[-1]].append(i[:-1])
    
#print test_set
    
for i in test_datanew:
    test_setnew[i[-1]].append(i[:-1])

for i in train_datanew:
    train_setnew[i[-1]].append(i[:-1])

    
    
#print test_set

#print test_setnew

Sending our slpitted test and train from the given train data in order to cross validadate



In [167]:

    
correct=0.0
total=0.0
for group in test_set:
    for data in test_set[group]:
        vote= k_nearest(train_set,data,k=5)
        #print '*****'
        #print group
        #print vote
        #print '*****'
        if group==vote:
            correct+=1
        total +=1
print correct
print total
print ('Accuracy:', correct/total)









    



[4.0, 4.0, 1.0, 1.0, 4.0]
[1.0, 1.0, 3.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 4.0]
[1.0, 4.0, 4.0, 1.0, 4.0]
[1.0, 1.0, 1.0, 1.0, 3.0]
[1.0, 1.0, 1.0, 1.0, 1.0]
[2.0, 2.0, 2.0, 2.0, 2.0]
[5.0, 5.0, 1.0, 5.0, 5.0]
[1.0, 1.0, 1.0, 1.0, 3.0]
[1.0, 4.0, 4.0, 4.0, 5.0]
[2.0, 2.0, 2.0, 2.0, 2.0]
[2.0, 2.0, 1.0, 2.0, 2.0]
[1.0, 2.0, 2.0, 2.0, 2.0]
[2.0, 1.0, 2.0, 2.0, 2.0]
[2.0, 2.0, 2.0, 2.0, 2.0]
[2.0, 2.0, 2.0, 2.0, 2.0]
[3.0, 3.0, 3.0, 3.0, 3.0]
[3.0, 3.0, 3.0, 3.0, 3.0]
[3.0, 3.0, 1.0, 3.0, 3.0]
[3.0, 1.0, 3.0, 3.0, 3.0]
[3.0, 1.0, 1.0, 3.0, 1.0]
[1.0, 3.0, 3.0, 3.0, 3.0]
[3.0, 3.0, 3.0, 1.0, 3.0]
[3.0, 1.0, 3.0, 3.0, 3.0]
[4.0, 5.0, 5.0, 5.0, 5.0]
[1.0, 1.0, 4.0, 4.0, 1.0]
[4.0, 4.0, 1.0, 4.0, 4.0]
[4.0, 2.0, 2.0, 2.0, 2.0]
[1.0, 4.0, 4.0, 4.0, 3.0]
[1.0, 3.0, 1.0, 1.0, 1.0]
[4.0, 1.0, 5.0, 5.0, 1.0]
[1.0, 4.0, 4.0, 4.0, 3.0]
[4.0, 4.0, 4.0, 3.0, 3.0]
[5.0, 5.0, 4.0, 1.0, 4.0]
[5.0, 5.0, 5.0, 5.0, 5.0]
[5.0, 5.0, 5.0, 5.0, 4.0]
[5.0, 5.0, 1.0, 5.0, 5.0]
[5.0, 5.0, 5.0, 5.0, 5.0]
[5.0, 5.0, 1.0, 5.0, 5.0]
[4.0, 5.0, 5.0, 4.0, 5.0]
[5.0, 1.0, 5.0, 5.0, 4.0]
29.0
41.0
('Accuracy:', 0.7073170731707317)

Sending our actual testing data to find the accuracy our model



In [168]:

    
correct =0.0
total =0.0
for group in test_setnew:
    for data in test_setnew[group]:
        vote= k_nearest(train_set,data,k=5)
        print '*****'
        print group
        print vote
        #print '*****'
        if group==vote:
            correct+=1
        total +=1
print correct
print total
print ('Accuracy:', (correct)/total)









    



[3.0, 1.0, 3.0, 3.0, 1.0]
*****
1.0
3.0
[4.0, 1.0, 1.0, 1.0, 1.0]
*****
1.0
1.0
[4.0, 4.0, 1.0, 1.0, 1.0]
*****
1.0
1.0
[1.0, 1.0, 1.0, 1.0, 1.0]
*****
1.0
1.0
[2.0, 2.0, 2.0, 2.0, 2.0]
*****
1.0
2.0
[2.0, 4.0, 2.0, 2.0, 2.0]
*****
1.0
2.0
[1.0, 2.0, 2.0, 2.0, 2.0]
*****
1.0
2.0
[2.0, 2.0, 2.0, 2.0, 1.0]
*****
1.0
2.0
[1.0, 1.0, 3.0, 1.0, 1.0]
*****
1.0
1.0
[1.0, 1.0, 1.0, 1.0, 3.0]
*****
1.0
1.0
[3.0, 3.0, 3.0, 3.0, 3.0]
*****
1.0
3.0
[3.0, 1.0, 3.0, 3.0, 3.0]
*****
1.0
3.0
[5.0, 5.0, 5.0, 1.0, 5.0]
*****
1.0
5.0
[1.0, 4.0, 4.0, 4.0, 3.0]
*****
1.0
4.0
[1.0, 1.0, 4.0, 1.0, 1.0]
*****
1.0
1.0
[1.0, 4.0, 1.0, 1.0, 1.0]
*****
1.0
1.0
[1.0, 4.0, 4.0, 4.0, 5.0]
*****
1.0
4.0
[1.0, 4.0, 4.0, 4.0, 5.0]
*****
1.0
4.0
[5.0, 5.0, 1.0, 4.0, 4.0]
*****
1.0
4.0
[4.0, 4.0, 5.0, 5.0, 1.0]
*****
1.0
4.0
[5.0, 5.0, 5.0, 1.0, 5.0]
*****
1.0
5.0
7.0
21.0
('Accuracy:', 0.3333333333333333)



In [ ]:



In [ ]:

	Type	LifeStyle	Vacation	eCredit	Salary	Property	Label
0	1	3	0.079365	0.107558	0.212288	0.183167	1
1	1	3	0.158730	0.052326	0.282879	0.112797	1
2	1	3	0.095238	0.177326	0.333953	0.174200	1
3	1	3	0.031746	0.127907	0.299073	0.189984	1
4	1	3	0.222222	0.020349	0.350978	0.127311	1

	Type	LifeStyle	Vacation	eCredit	Salary	Property	Label
0	1	2	0.174603	0.046512	0.739129	0.210554	1
1	1	4	0.444444	0.020349	0.680995	0.138584	1
2	1	1	0.428571	0.165698	0.711308	0.066076	1
3	2	3	0.222222	0.110465	0.470468	0.080036	1
4	3	2	0.015873	0.017442	0.534386	0.038246	1

	Type	LifeStyle	Vacation	eCredit	Salary	Property	Label
0	student	spend>saving	6	40	13.62	3.2804	C1
1	student	spend>saving	11	21	15.32	2.0232	C1
2	student	spend>saving	7	64	16.55	3.1202	C1
3	student	spend>saving	3	47	15.71	3.4022	C1
4	student	spend>saving	15	10	16.96	2.2825	C1

	Type	LifeStyle	Vacation	eCredit	Salary	Property	Label
0	student	spend<saving	12	19	14.7900	3.7697	C1
1	student	spend>>saving	29	10	16.1900	2.4839	C1
2	student	spend<<saving	28	60	15.4600	1.1885	C1
3	engineer	spend>saving	15	41	21.2600	1.4379	C1
4	librarian	spend<saving	2	9	19.7207	0.6913	C1