In [16]:
#loading dataset
import arff
import math
from numpy import median
from numpy import mean
df=arff.load(open('attachments/trainProdSelection/trainProdSelection.arff','rb'))
train=df['data']
In [17]:
#shuffling dataset and splitting into training and testing sets in the ratio 3:1
from random import shuffle
shuffle(train)
sp=int(0.25*(len(train)))
test = train[:sp]
train=train[sp:]
In [18]:
#converting categorical string values to integers
#and separating all attributes
dicTyp={"student":0, "engineer":1,"librarian":2,"professor":3,"doctor":4}
dicLs={"spend<<saving":0,"spend<saving":1, "spend>saving":2, "spend>>saving":3}
dicLab={"C1":0,"C2":1,"C3":2,"C4":3,"C5":4}
typ=[]
ls=[]
vac=[]
ec=[]
sal=[]
prp=[]
lab=[]
for i in range(len(train)):
typ.append(dicTyp[train[i][0]])
ls.append(dicLs[train[i][1]])
vac.append(int(round(train[i][2])))
ec.append(int(round(train[i][3])))
sal.append(int(round(train[i][4])))
prp.append(int(round(train[i][5])))
lab.append(int(round(dicLab[train[i][6]])))
typ1=[]
ls1=[]
vac1=[]
ec1=[]
sal1=[]
prp1=[]
lab1=[]
for i in range(len(test)):
typ1.append(dicTyp[test[i][0]])
ls1.append(dicLs[test[i][1]])
vac1.append(int(round(test[i][2])))
ec1.append(int(round(test[i][3])))
sal1.append(int(round(test[i][4])))
prp1.append(int(round(test[i][5])))
lab1.append(int(round(dicLab[test[i][6]])))
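In [ ]:
#(optional) a minimal sketch of how the two conversion loops above could be collapsed into a single helper;
#the name convertRows is illustrative and not part of the original notebook
def convertRows(rows):
    t,l,v,e,s,p,y=[],[],[],[],[],[],[]
    for r in rows:
        t.append(dicTyp[r[0]])
        l.append(dicLs[r[1]])
        v.append(int(round(r[2])))
        e.append(int(round(r[3])))
        s.append(int(round(r[4])))
        p.append(int(round(r[5])))
        y.append(dicLab[r[6]])
    return t,l,v,e,s,p,y
#usage would be: typ,ls,vac,ec,sal,prp,lab=convertRows(train), and the same call on test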
In [19]:
#method for calculating the entropy of the target variable
def mlab():
    counts=[0,0,0,0,0]
    for i in lab:
        counts[i]=counts[i]+1
    Hy=0
    for c in counts:
        #skip classes with zero count so log(0) is never taken
        if(c!=0):
            p=c/(len(lab)*1.0)
            Hy=Hy-p*math.log(p,2)
    return Hy
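In [ ]:
#(optional) mlab() computes the Shannon entropy H(Y) = -sum_c p_c*log2(p_c) of the class distribution;
#a minimal equivalent sketch using collections.Counter (the name entropy is illustrative, not from the notebook)
from collections import Counter
def entropy(labels):
    n=len(labels)*1.0
    h=0
    for c in Counter(labels).values():
        p=c/n
        h=h-p*math.log(p,2)
    return h
#entropy(lab) should agree with mlab() on the training labels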
In [20]:
#method for calculating information gain of type attribute
def mtyp():
    enTyp=0
    for i in range(5):
        l1=[0,0,0,0,0]
        for j in range(len(typ)):
            if(i==typ[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enTyp=enTyp+((s/(len(typ)*1.0))*lo)
    igTyp=mlab()-enTyp
    return igTyp
In [21]:
#method for calculating information gain of life style attribute
def mls():
    enLs=0
    for i in range(4):
        l1=[0,0,0,0,0]
        for j in range(len(ls)):
            if(i==ls[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enLs=enLs+((s/(len(ls)*1.0))*lo)
    igLs=mlab()-enLs
    return igLs
In [22]:
#method for calculating information gain of vacation attribute
def mvac():
    m=max(vac)
    enVac=0
    for i in range(m+1):
        l1=[0,0,0,0,0]
        for j in range(len(vac)):
            if(i==vac[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enVac=enVac+((s/(len(vac)*1.0))*lo)
    igVac=mlab()-enVac
    return igVac
In [23]:
#method for calculating information gain of e-credit attribute
def mec():
    m=max(ec)
    enEc=0
    for i in range(m+1):
        l1=[0,0,0,0,0]
        for j in range(len(ec)):
            if(i==ec[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enEc=enEc+((s/(len(ec)*1.0))*lo)
    igEc=mlab()-enEc
    return igEc
In [24]:
#method for calculating information gain of salary attribute
def msal():
    m=max(sal)
    enSal=0
    for i in range(m+1):
        l1=[0,0,0,0,0]
        for j in range(len(sal)):
            if(i==sal[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enSal=enSal+((s/(len(sal)*1.0))*lo)
    igSal=mlab()-enSal
    return igSal
In [25]:
#method for calculating information gain of property attribute
def mprp():
    m=max(prp)
    enPrp=0
    for i in range(m+1):
        l1=[0,0,0,0,0]
        for j in range(len(prp)):
            if(i==prp[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enPrp=enPrp+((s/(len(prp)*1.0))*lo)
    igPrp=mlab()-enPrp
    return igPrp
print mtyp()
print mls()
print mvac()
print mec()
print msal()
print mprp()
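In [ ]:
#(optional) mtyp() through mprp() all follow the same pattern: group the rows by attribute value, take the
#weighted average of the class entropy inside each group, and subtract it from the overall entropy; a minimal
#sketch of a single generic helper (the name infoGain is illustrative, not part of the original notebook)
def infoGain(values,labels):
    def H(lst):
        h=0
        for c in set(lst):
            p=lst.count(c)/(len(lst)*1.0)
            h=h-p*math.log(p,2)
        return h
    total=len(values)*1.0
    cond=0
    for v in set(values):
        grp=[labels[j] for j in range(len(values)) if values[j]==v]
        cond=cond+(len(grp)/total)*H(grp)
    return H(labels)-cond
#e.g. infoGain(typ,lab) should agree with mtyp() on the training data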
In [27]:
#node class for the decision tree which holds the node's subset of the data, the per-attribute information gain,
#the chosen split attribute and the data value used as the split threshold
class Tree(object):
    def __init__(self):
        self.data = None
        self.classifier=None
        self.typ=[]
        self.ls=[]
        self.vac=[]
        self.ec=[]
        self.sal=[]
        self.prp=[]
        self.lab=[]
        self.attr=[0,0,0,0,0,0]
        self.ig=[0,0,0,0,0,0]
        self.splitAttr=None
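In [ ]:
#(optional) a tiny illustration of the node fields: attr[k]==1 means attribute k is still available for splitting
#on this path, ig[k] holds its information gain, splitAttr is the index of the chosen attribute and data is the
#split threshold (rows with value <= data go to the left child); the concrete numbers below are made up
node=Tree()
node.attr=[1,1,1,0,1,1]   #e-credit (index 3) was already used higher up in the tree
node.splitAttr=4          #this node would split on salary
node.data=20              #rows with sal<=20 go to the left child, the rest to the right child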
In [28]:
#creating a dictionary that maps each attribute index to the method that calculates its information gain
dictAttr1={0:mtyp,1:mls,2:mvac,3:mec,4:msal,5:mprp}
#constructing the output array for the binary tree; it holds 2**7 nodes (one tree level per attribute plus the leaf level)
#Root node starts at index 1
#if a parent node's index is i, its children's indices are 2*i and 2*i+1
output=[None]*(2**7)
#constructing the root node at index 1 by hand: it splits on the e-credit attribute (index 3), which had the
#highest information gain above, using the mean of that attribute as the split threshold
output[1]=Tree()
output[1].data=mean(ec)
output[1].typ=typ
output[1].ls=ls
output[1].vac=vac
output[1].ec=ec
output[1].sal=sal
output[1].prp=prp
output[1].lab=lab
output[1].attr=[1,1,1,0,1,1]
output[1].ig=[1,1,1,0,1,1]
output[1].splitAttr=3
#splitting the data on the chosen attribute at the data (threshold) value, then recomputing the information gain
#of the not-yet-used attributes on each child's subset and growing the tree until all attributes are used up
attrNames=['typ','ls','vac','ec','sal','prp']
for i in range(2,(2**7)):
    output[i]=Tree()
    parent=output[i/2]
    #children of a leaf are never reached during prediction; just carry the parent's class down
    if(parent.splitAttr==None):
        output[i].classifier=parent.classifier
        continue
    #even indices take the rows with value <= threshold, odd indices the rows with value > threshold
    splitVals=getattr(parent,attrNames[parent.splitAttr])
    for j in range(len(splitVals)):
        if((i%2==0 and splitVals[j]<=parent.data) or (i%2==1 and splitVals[j]>parent.data)):
            output[i].typ.append(parent.typ[j])
            output[i].ls.append(parent.ls[j])
            output[i].vac.append(parent.vac[j])
            output[i].ec.append(parent.ec[j])
            output[i].sal.append(parent.sal[j])
            output[i].prp.append(parent.prp[j])
            output[i].lab.append(parent.lab[j])
    #leaf node: either no attributes are left or no rows reached this node -> store the majority class
    if(max(parent.attr)==0 or len(output[i].lab)==0):
        src=output[i].lab if len(output[i].lab)>0 else parent.lab
        temp=[0]*5
        for c in src:
            temp[c]=temp[c]+1
        output[i].classifier=temp.index(max(temp))
        continue
    #rebind the globals so the information gain methods above work on this node's subset of the data
    typ=output[i].typ[:]
    ls=output[i].ls[:]
    vac=output[i].vac[:]
    ec=output[i].ec[:]
    sal=output[i].sal[:]
    prp=output[i].prp[:]
    lab=output[i].lab[:]
    output[i].attr=parent.attr[:]
    #attributes already used on this path stay at -1 so they are never picked again
    output[i].ig=[-1,-1,-1,-1,-1,-1]
    for j in range(len(parent.attr)):
        if(parent.attr[j]==1):
            output[i].ig[j]=dictAttr1[j]()
    ind=output[i].ig.index(max(output[i].ig))
    output[i].attr[ind]=0
    output[i].splitAttr=ind
    #the mean of the chosen attribute over this node's rows becomes the split threshold
    output[i].data=mean(getattr(output[i],attrNames[ind]))
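In [ ]:
#(optional) a quick illustration of the array-as-binary-tree indexing used above: the node stored at index i has
#its "<= threshold" child at 2*i, its "> threshold" child at 2*i+1, and its parent at i/2 (integer division)
i=5
print "node",i,"has children",2*i,"and",2*i+1,"and parent",i/2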
In [29]:
# predicting the classifier of the testing data from the decision tree and counting the number of correct classifications
dictTest={0:typ1,1:ls1,2:vac1,3:ec1,4:sal1,5:prp1}
count=0
for i in range(len(test)):
    ind=1
    temp=output[ind]
    #walk down the tree until a leaf (a node with no split attribute) is reached
    while(temp.splitAttr!=None):
        if(dictTest[temp.splitAttr][i]<=temp.data):
            ind=ind*2
        else:
            ind=ind*2+1
        temp=output[ind]
    if(temp.classifier==lab1[i]):
        count=count+1
#calculating the accuracy of the model as (number of correct predictions / number of total predictions)
print "The model's accuracy is:",(float(count)/len(test))*100
In [ ]:
In [ ]:
In [ ]: