In [155]:
import pandas as pd
import numpy as np
from collections import Counter
from math import sqrt
import random
import warnings
In [156]:
df = pd.read_csv('train.csv', header=None, names=['Type', 'LifeStyle', 'Vacation', 'eCredit', 'Salary', 'Property', 'Label'])
df.head()
Out[156]:
In [157]:
dft = pd.read_csv('test.csv', header=None, names=['Type', 'LifeStyle', 'Vacation', 'eCredit', 'Salary', 'Property', 'Label'])
dft.head()
Out[157]:
In [158]:
df['Type'] = df.Type.map({'student': 1, 'engineer': 2, 'librarian': 3, 'professor': 4, 'doctor': 5})
# df.head()
df['LifeStyle'] = df.LifeStyle.map({'spend<<saving': 1, 'spend<saving': 2, 'spend>saving': 3, 'spend>>saving': 4})
df['Label'] = df.Label.map({'C1': 1, 'C2': 2, 'C3': 3, 'C4': 4, 'C5': 5})
# df['Vacation'] = df['Vacation']/100
df.head()
Out[158]:
In [159]:
dft['Type'] = dft.Type.map({'student': 1, 'engineer': 2, 'librarian': 3, 'professor': 4, 'doctor': 5})
# dft.head()
dft['LifeStyle'] = dft.LifeStyle.map({'spend<<saving': 1, 'spend<saving': 2, 'spend>saving': 3, 'spend>>saving': 4})
dft['Label'] = dft.Label.map({'C1': 1, 'C2': 2, 'C3': 3, 'C4': 4, 'C5': 5})
# dft['Vacation'] = dft['Vacation']/100
dft.head()
Out[159]:
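Any category string that is not an exact key in the mapping dicts above becomes NaN after .map(). A quick check like the sketch below (not part of the original notebook) catches such typos before normalisation:

# Sanity check (illustrative only): unmapped categories show up as non-zero NaN counts here.
print(df.isnull().sum())
print(dft.isnull().sum())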
In [160]:
vacmaxval = 0
vacminval = 0
ecrmaxval = 0
ecrminval = 0
salmaxval = 0
salminval = 0
prpminval = 0
prpmaxval = 0
for attribute in list(df.columns.values):
    if attribute == 'Vacation' or attribute == 'eCredit' or attribute == 'Salary' or attribute == 'Property':
        if attribute == 'Vacation':
            vacmaxval = max(df[attribute])
            vacminval = min(df[attribute])
        elif attribute == 'eCredit':
            ecrmaxval = max(df[attribute])
            ecrminval = min(df[attribute])
        elif attribute == 'Salary':
            salmaxval = max(df[attribute])
            salminval = min(df[attribute])
        elif attribute == 'Property':
            prpmaxval = max(df[attribute])
            prpminval = min(df[attribute])
        maxValue = max(df[attribute])
        minValue = min(df[attribute])
        norm = []
        for i in df[attribute]:
            normalisedValue = (i - minValue + 0.0) / (maxValue - minValue + 0.0)
            norm.append(normalisedValue)
        df[attribute] = norm
df.head()
# uncomment the line below to write the normalised dataframe to a csv file
#df.to_csv('sanitizedData.csv', sep=',', encoding='utf-8', header=False)
Out[160]:
In [161]:
minValue = 0
maxValue = 0
for attribute in list(dft.columns.values):
    if attribute == 'Vacation' or attribute == 'eCredit' or attribute == 'Salary' or attribute == 'Property':
        # reuse the training-set min/max so the test data is scaled exactly like the training data
        if attribute == 'Vacation':
            minValue = vacminval
            maxValue = vacmaxval
        elif attribute == 'eCredit':
            minValue = ecrminval
            maxValue = ecrmaxval
        elif attribute == 'Salary':
            minValue = salminval
            maxValue = salmaxval
        elif attribute == 'Property':
            minValue = prpminval
            maxValue = prpmaxval
        norm = []
        for i in dft[attribute]:
            normalisedValue = (i - minValue + 0.0) / (maxValue - minValue + 0.0)
            norm.append(normalisedValue)
        dft[attribute] = norm
dft.head()
Out[161]:
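The same train-fit / test-transform scaling can also be expressed with scikit-learn. The sketch below (assuming scikit-learn is installed) is an illustrative alternative to the two normalisation cells above, not an extra step to run on top of them:

# Alternative sketch (assumes scikit-learn): fit the min/max on the training columns
# and reuse the same fitted scaler for the test file, instead of the two loops above.
from sklearn.preprocessing import MinMaxScaler

numeric_cols = ['Vacation', 'eCredit', 'Salary', 'Property']
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
dft[numeric_cols] = scaler.transform(dft[numeric_cols])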
In [162]:
dataframe = df.astype(float).values.tolist()
dataframet = dft.astype(float).values.tolist()
In [163]:
#print(dataframe[:10])
random.shuffle(dataframe)
#print(dataframe[:10])
In [164]:
def k_nearest(data, predict, k=5):
    #if len(data) >= k:
    #    warnings.warn('bye')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = 0
            # categorical attributes (Type, LifeStyle): add 1 for every mismatch
            if predict[0] != features[0]:
                euclidean_distance += 1
            if predict[1] != features[1]:
                euclidean_distance += 1
            # numeric attributes (Vacation, eCredit, Salary, Property): squared differences
            euclidean_distance += ((predict[2]-features[2])**2 + (predict[3]-features[3])**2 + (predict[4]-features[4])**2 + (predict[5]-features[5])**2)
            #euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
            euclidean_distance = sqrt(euclidean_distance)
            distances.append([euclidean_distance, group])
    votes = [i[1] for i in sorted(distances)[:k]]
    #print(distances)
    print(votes)
    #print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    #print(vote_result)
    return vote_result
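A minimal usage sketch for the function above, on a toy data dict in the same layout as the train_set built below (labels as keys, six-feature rows as values); the point and expected label are purely illustrative:

# Toy example (illustrative only): one row per class, then classify a nearby point.
toy_data = {1.0: [[1.0, 2.0, 0.1, 0.2, 0.3, 0.4]],
            2.0: [[3.0, 4.0, 0.9, 0.8, 0.7, 0.6]]}
print(k_nearest(toy_data, [1.0, 2.0, 0.15, 0.25, 0.35, 0.45], k=1))  # expected: 1.0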
In [165]:
test_size = 0.2
train_set = {1.0: [], 2.0: [], 3.0: [], 4.0: [], 5.0: []}
test_set = {1.0: [], 2.0: [], 3.0: [], 4.0: [], 5.0: []}
test_setnew = {1.0: [], 2.0: [], 3.0: [], 4.0: [], 5.0: []}
train_setnew = {1.0: [], 2.0: [], 3.0: [], 4.0: [], 5.0: []}
train_data = dataframe[:-int(test_size*len(dataframe))]
test_data = dataframe[-int(test_size*len(dataframe)):]
train_datanew = dataframe[:int(1*len(dataframe))]   # full training data, for scoring the external test file
test_datanew = dataframet[:int(1*len(dataframet))]  # external test file
#print(test_set)
#print(test_datanew)
In [166]:
for i in train_data:
    train_set[i[-1]].append(i[:-1])
for i in test_data:
    test_set[i[-1]].append(i[:-1])
#print(test_set)
for i in test_datanew:
    test_setnew[i[-1]].append(i[:-1])
for i in train_datanew:
    train_setnew[i[-1]].append(i[:-1])
#print(test_set)
#print(test_setnew)
In [167]:
correct = 0.0
total = 0.0
for group in test_set:
    for data in test_set[group]:
        vote = k_nearest(train_set, data, k=5)
        #print('*****')
        #print(group)
        #print(vote)
        #print('*****')
        if group == vote:
            correct += 1
        total += 1
print(correct)
print(total)
print('Accuracy:', correct/total)
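To see which classes the overall accuracy hides, a per-class breakdown can be computed on the same held-out split; this is an illustrative sketch, not part of the original notebook:

# Per-class hit counts (illustrative): re-run k_nearest on the held-out split
# and report the hit rate separately for each label.
from collections import defaultdict
per_class = defaultdict(lambda: [0, 0])  # label -> [correct, total]
for group in test_set:
    for data in test_set[group]:
        vote = k_nearest(train_set, data, k=5)
        per_class[group][0] += (group == vote)
        per_class[group][1] += 1
for label, (hit, tot) in sorted(per_class.items()):
    print(label, hit / tot)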
In [168]:
correct = 0.0
total = 0.0
for group in test_setnew:
    for data in test_setnew[group]:
        # classify the external test file against the full training data
        vote = k_nearest(train_setnew, data, k=5)
        print('*****')
        print(group)
        print(vote)
        #print('*****')
        if group == vote:
            correct += 1
        total += 1
print(correct)
print(total)
print('Accuracy:', correct/total)
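The value k=5 is used throughout; a quick sweep over a few odd values of k on the held-out 20% split shows how sensitive the accuracy is to that choice. This is an illustrative sketch, not part of the original notebook:

# k sweep (illustrative): re-score the held-out split for several odd k values.
for k in [1, 3, 5, 7, 9]:
    correct = total = 0.0
    for group in test_set:
        for data in test_set[group]:
            if group == k_nearest(train_set, data, k=k):
                correct += 1
            total += 1
    print('k =', k, 'accuracy =', correct / total)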