In [45]:
import arff
import math

#loading dataset
df=arff.load(open('attachments/trainProdSelection/trainProdSelection.arff','rb'))
train=df['data']

In [46]:
#shuffling dataset and splitting into training and testing sets in the ratio 3:1
from random import shuffle
shuffle(train)
sp=int(0.25*(len(train)))

test = train[:sp]
train=train[sp:]

Pre Processing


In [47]:
#separating all the attributes in the dataset
typ=[]
ls=[]
vac=[]
ec=[]
sal=[]
prp=[]
lab=[]
for i in range(len(train)):
    typ.append(train[i][0])
    ls.append(train[i][1])
    vac.append(train[i][2])
    ec.append(train[i][3])
    sal.append(train[i][4])
    prp.append(train[i][5])
    lab.append(train[i][6])

In [48]:
#normalizing numeric values of train data by finding the minumum and maxmimum values in the list and
#applying the formula (actual-minimum)/(maximum-minimum)

maxx=max(vac)
minn=min(vac)
for i in range(len(vac)):
    vac[i]=(vac[i]-minn)/(maxx-minn)
maxx=max(ec)
minn=min(ec)
for i in range(len(ec)):
    ec[i]=(ec[i]-minn)/(maxx-minn)
maxx=max(sal)
minn=min(sal)
for i in range(len(sal)):
    sal[i]=(sal[i]-minn)/(maxx-minn)
maxx=max(prp)
minn=min(prp)
for i in range(len(prp)):
    prp[i]=(prp[i]-minn)/(maxx-minn)

In [49]:
#separating all the attrbutes and normalizing numeric values of test data
typ1=[]
ls1=[]
vac1=[]
ec1=[]
sal1=[]
prp1=[]
lab1=[]
for i in range(len(test)):
    typ1.append(test[i][0])
    ls1.append(test[i][1])
    vac1.append(test[i][2])
    ec1.append(test[i][3])
    sal1.append(test[i][4])
    prp1.append(test[i][5])
    lab1.append(test[i][6])
maxx=max(vac1)
minn=min(vac1)
for i in range(len(vac1)):
    vac1[i]=(vac1[i]-minn)/(maxx-minn)
maxx=max(ec1)
minn=min(ec1)
for i in range(len(ec1)):
    ec1[i]=(ec1[i]-minn)/(maxx-minn)
maxx=max(sal1)
minn=min(sal1)
for i in range(len(sal1)):
    sal1[i]=(sal1[i]-minn)/(maxx-minn)
maxx=max(prp1)
minn=min(prp1)
for i in range(len(prp1)):
    prp1[i]=(prp1[i]-minn)/(maxx-minn)

Applyting KNN on test data


In [50]:
#taking each sample from test data and finding the inverse euclidian distance to all the training samples and taking top three
#after sorting the training data in descending order based on the distance vector

output=[]
for i in range(len(test)):
    list1=[]
    for j in range(0,len(train)):
        v=((vac1[i]-vac[j])**2)+((ec1[i]-ec[j])**2)+((sal1[i]-sal[j])**2)+((prp1[i]-prp[j])**2)
        if(typ1[i]!=typ[j]):
            v=v+1;
        if(ls1[i]!=ls[j]):
            v=v+1;
        eScore=1/math.sqrt(v)
        list2=[]
        list2.append(eScore)
        list2.append(lab[j])
        list1.append(list2)
    list1=sorted(list1,key=lambda x:x[0])
    temp=[0,0,0,0,0]
    for j in range(len(list1)-3,len(list1)):
        temp[int(list1[j][1][1])-1]=temp[int(list1[j][1][1])-1]+list1[i][0]
    output.append(temp)

Calculating accuracy of the model


In [51]:
#performance metric
#taking the maximum value index, which is the classifer and calulating the number of true predictions

count=0
for i in range(len(output)):
    maxx=output[i][0]
    imax=0
    for j in range(1,len(output[i])):
        if(output[i][j]>maxx):
            maxx=output[i][j]
            imax=j
    imax=imax+1
    if(int(lab1[i][1])==imax):
        count=count+1
print count

#calculating the acuuracy of the model by the formula (number of true predictions / total number of predictions)
acc=(count/(len(test)*1.0))*100
print acc


39
84.7826086957

The model's accuracy is 84.7826086957


In [ ]: