In [78]:
from __future__ import division
from __future__ import print_function
from math import log
In [1]:
def createDataset():
    """Build the small hard-coded sample data set used in this notebook.

    Returns:
        A ``(rows, names)`` tuple. ``rows`` is a list of five 3-element
        lists: two 0/1 values followed by a ``'yes'``/``'no'`` string.
        ``names`` is a list of two strings, presumably the names of the
        first two columns. Fresh lists are built on every call.
    """
    column_names = ['no surfacing', 'flippers']
    rows = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return rows, column_names
In [2]:
# Build the sample rows and the label list once; later cells reuse `data`.
data, labels = createDataset()
In [3]:
# Bare expression: the notebook echoes the rows as this cell's output.
data
Out[3]:
In [18]:
# Scratch dict for trying out key insertion and in-place increments.
x = {'a':1,'b':2}
In [25]:
# Plain assignment adds the key 'c' (or overwrites it if already present).
x['c'] = 3
In [33]:
# In-place increment; raises KeyError unless 'c' is already in the dict.
x['c'] += 1
In [34]:
# Bare expression: echo the dict to inspect the result.
x
Out[34]:
In [81]:
def calcEnt(data, pos):
    """Return the base-2 entropy of the values found in column ``pos``.

    Args:
        data: sequence of indexable rows.
        pos: index of the column whose values are tallied.

    Returns:
        The entropy in bits as a float, or the int ``0`` when ``data``
        is empty.
    """
    total = len(data)

    # Tally how many rows carry each distinct value of the column.
    tally = {}
    for row in data:
        value = row[pos]
        tally[value] = tally.get(value, 0) + 1

    # Subtract p * log2(p) for every distinct value.
    result = 0
    for occurrences in tally.values():
        share = occurrences / total
        result -= share * log(share, 2)
    return result
In [82]:
# Entropy of column 2 (the 'yes'/'no' column) over the sample rows.
calcEnt(data,2)
Out[82]:
In [88]:
def infGain(data, target, var):
    """Compute the information gain of splitting ``data`` on column ``var``.

    Rows are grouped by their value in column ``var``. The entropy of
    column ``target`` within each group is weighted by the group's share
    of the rows, and the weighted sum is subtracted from the entropy of
    ``target`` over all rows.

    Each group and its entropy are still printed, in the same order as
    before, so the intermediate steps stay visible in the notebook.

    Args:
        data: sequence of indexable rows.
        target: index of the column whose entropy is measured.
        var: index of the column to split on.

    Returns:
        The information gain (parent entropy minus weighted group
        entropy). For empty ``data`` this is ``0``.
    """
    numrows = len(data)

    # Group the rows by their value in column `var` in a single pass,
    # instead of rescanning all rows once per distinct value.
    subsets = {}
    for row in data:
        subsets.setdefault(row[var], []).append(row)

    # Weighted entropy of `target` left over after the split.
    remainder = 0
    for key in subsets:
        subSet = subsets[key]
        print(subSet)
        ent = calcEnt(subSet, target)
        print(ent)
        remainder += (len(subSet) / numrows) * ent

    return calcEnt(data, target) - remainder
In [91]:
# Split the sample rows on column 1, measuring entropy of column 2 (the target).
infGain(data, 2,1)
In [ ]: