ID3

Information Entropy

$$ Entropy(S) = - \sum_{i=1}^{m} P(u_i) \log_2 P(u_i) $$

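where $P(u_i)=\frac{\lvert u_i \rvert}{\lvert S \rvert}$ is the fraction of the samples in $S$ that belong to class $u_i$, and $m$ is the number of classes.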

Gain of information entropy

$$ Gain(S,A) = Entropy(S) - \sum_{v \in Value(A)} \frac{\lvert S_v \rvert}{\lvert S \rvert} Entropy(S_v) $$

Algorithms

Compute the Gain of Information Entropy (GIE) for every feature
Choose the best feature, i.e. the one with the largest GIE
Split the data into subsets according to the values of the best feature
Repeat the above steps on each subset until all instances in a sub-tree belong to the same class



In [1]:

    
from math import log
def calcShannonEnt(dataSet):
    """Return the Shannon entropy, in bits, of the class labels in dataSet.

    The last element of every record is treated as its class label.
    An empty dataSet yields 0.0.
    """
    total = len(dataSet)

    # Tally how many records carry each class label.
    tally = {}
    for record in dataSet:
        label = record[-1]
        tally[label] = tally.get(label, 0) + 1

    # Accumulate -p * log2(p) over the observed labels.
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy

def splitDataSet(dataSet, axis, value):
    """Select the records whose column `axis` equals `value`, dropping that column.

    Returns a new list of new records; the records in dataSet are left
    untouched.
    """
    selected = (record for record in dataSet if record[axis] == value)

    trimmedRecords = []
    for record in selected:
        # Everything before the split column, followed by everything after it.
        trimmed = record[:axis]
        trimmed.extend(record[axis + 1:])
        trimmedRecords.append(trimmed)
    return trimmedRecords

def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature column with the largest information gain.

    Every column except the last (the class label) is a candidate. Returns
    -1 when no candidate yields a gain strictly greater than zero.
    """
    featureCount = len(dataSet[0]) - 1
    parentEntropy = calcShannonEnt(dataSet)

    winner, winnerGain = -1, 0.0
    for column in range(featureCount):
        # Entropy remaining after the split: size-weighted sum over subsets.
        weightedEntropy = 0.0
        for observed in set([record[column] for record in dataSet]):
            subset = splitDataSet(dataSet, column, observed)
            weight = len(subset) / float(len(dataSet))
            weightedEntropy += weight * calcShannonEnt(subset)

        gain = parentEntropy - weightedEntropy
        if gain > winnerGain:
            winner, winnerGain = column, gain
    return winner

def majorityCnt(classList):
    """Return the class label that occurs most often in classList.

    Args:
        classList: non-empty iterable of hashable class labels.

    Returns:
        The label with the highest count. Among labels tied for the highest
        count, the one that comes first in the tally dict's iteration order
        is returned.

    Raises:
        ValueError: if classList is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    if not classCount:
        raise ValueError("majorityCnt() requires at least one class label")
    # max() with the count as key replaces the sort; no `operator` import or
    # Python-2-only dict.iteritems() is needed.
    return max(classCount, key=classCount.get)

def createTree(dataSet,labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        dataSet: non-empty list of records (lists); the last element of each
            record is its class label, the others are feature values.
        labels: feature names, where labels[i] names column i of dataSet.
            The list is not modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        {featureLabel: {featureValue: subtree, ...}}.
    """
    classList = [example[-1] for example in dataSet]
    # Every record has the same class: this node is a pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: no feature to split on, take a vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # chooseBestFeatureToSplit returns -1 when no feature has positive
        # gain. Index -1 would select the class-label column as if it were a
        # feature, so split on the first feature instead; each split removes
        # a column, so the recursion still terminates.
        bestFeat = 0

    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Build the reduced label list without touching the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

def classify(inputTree,featLabels,testVec):
    """Classify testVec by walking a nested-dict decision tree.

    Args:
        inputTree: tree of the form {featureLabel: {featureValue: subtree}},
            where a subtree is either another such dict or a class label.
        featLabels: feature names; featLabels[i] names testVec[i].
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf reached by testVec.

    Raises:
        ValueError: if the tree's feature label is not in featLabels
            (raised by list.index).
        KeyError: if the tree has no branch for the sample's feature value.
    """
    # next(iter(...)) fetches the single root key on both Python 2 and 3;
    # dict.keys()[0] only works on Python 2.
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    if featValue not in secondDict:
        # Fail with a clear message rather than an unbound local variable.
        raise KeyError("feature %r has no branch for value %r"
                       % (firstStr, featValue))
    branch = secondDict[featValue]
    if isinstance(branch, dict):
        return classify(branch, featLabels, testVec)
    return branch



In [2]:

    
def createDataSet():
    """Return the toy data set and its feature names as (dataSet, labels).

    Each record is [no surfacing, flippers, class label]. Fresh lists are
    built on every call.
    """
    rows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    featureNames = ('no surfacing', 'flippers')
    return [list(row) for row in rows], list(featureNames)



In [8]:

    
# Build the decision tree for the toy data set and show it.
myDat, labels = createDataSet()
# Hand createTree a copy so `labels` stays intact for later use, whatever
# createTree does with the list it receives.
myTree = createTree(myDat, labels[:])
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(myTree)









    



{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}



In [9]:

    
# Recreate the feature names, then classify the sample
# (no surfacing = 1, flippers = 0) with the tree built earlier.
myDat, labels = createDataSet()
classify(myTree, labels, [1, 0])









    Out[9]:





'no'

Pandas Version



In [27]:

    
import pandas as pd
import numpy as np
def CalcShannonEnt(dataSet):
    """Placeholder for a pandas-based entropy routine; not implemented yet.

    Ignores dataSet and returns None.
    """
    return None

Reference

[1] Harrington P. Machine Learning in Action[M]. Manning Publications Co., 2012.