ID3


In [78]:
from __future__ import division
from __future__ import print_function
from math import log

In [1]:
def createDataset():
    dataSet = [[1,1,'yes'],[1,1,'yes'],[1,0,'no'],[0,1,'no'],[0,1,'no']]
    labels = ['no surfacing', 'flippers'] 
    return dataSet, labels

In [2]:
data, labels = createDataset()

In [3]:
data


Out[3]:
[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]

In [18]:
x = {'a':1,'b':2}

In [25]:
x['c'] = 3

In [33]:
x['c'] += 1

In [34]:
x


Out[34]:
{'a': 1, 'b': 2, 'c': 5}

In [81]:
def calcEnt(data, pos):
    numrows = len(data)
    count = {}
    entropy = 0
    
    # freq distrbution 
    for i in data:
        key = i[pos]
        if key not in count.keys():
            count[key] = 1
        else:
            count[key] += 1
    # entropy
    for key in count.keys():
        c = count[key]
        p = c / numrows 
        ent = p * log(p, 2)
        entropy -= ent
            
    return entropy

In [82]:
calcEnt(data,2)


Out[82]:
0.9709505944546686

In [88]:
def infGain(data, target, var):
    numrows = len(data)
    count = {}
    entropy = 0
    
    # freq distrbution 
    for i in data:
        key = i[var]
        if key not in count.keys():
            count[key] = 1
        else:
            count[key] += 1
    
    for key in count.keys():
        subSet = []
        for i in data:
            if i[var] == key:
                subSet.append(i)
        print(subSet)
        ent = calcEnt(subSet, target)
        print (ent)

In [91]:
infGain(data, 2,1)


[[1, 0, 'no']]
0.0
[[1, 1, 'yes'], [1, 1, 'yes'], [0, 1, 'no'], [0, 1, 'no']]
1.0

In [ ]: