In [28]:
import math
import pandas as pd
data = pd.read_csv("./data1.csv")

In [108]:
def computeEntropy(df, cla):
    # Shannon entropy of column `cla`: H = -sum_i p_i * log2(p_i).
    counts = df[cla].value_counts()
    total = float(counts.sum())
    probs = (c / total for c in counts)
    return sum(-p * math.log(p, 2) for p in probs)

def infoGain(df, cla, col):
    # Information gain of splitting on `col`:
    # H(cla) - sum_v (n_v / n) * H(cla | col == v).
    res = computeEntropy(df, cla)
    n = float(df[cla].count())
    for typ in df[col].unique():
        subset = df[df[col] == typ]
        res -= computeEntropy(subset, cla) * (subset[cla].count() / n)
    return res
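
Before running these on the real data, a quick sanity check on a made-up toy table (the column names and values below are hypothetical, not from data1.csv): a 50/50 class split should give exactly 1 bit of entropy, and an attribute that predicts the class perfectly should recover all of it as information gain.

In [ ]:
# Hypothetical toy data: 4 rows, balanced classes, one perfectly
# predictive attribute. Expected: entropy 1.0, information gain 1.0.
toy = pd.DataFrame({
    "Attr":  ["x", "x", "y", "y"],
    "Class": ["yes", "yes", "no", "no"],
})
print(computeEntropy(toy, "Class"))   # -> 1.0
print(infoGain(toy, "Class", "Attr")) # -> 1.0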

Problem 1


In [81]:
print "(a). total entropy = %f" % computeEntropy(data, "Class")
print "(b). movie ID entropy = %f" % computeEntropy(data, "ID")
print "(c). Format entropy = %f" % computeEntropy(data, "Format")
print "(d). Movie Category entropy = %f" % computeEntropy(data, "Category")


(a). total entropy = 1.000000
(b). movie ID entropy = 4.321928
(c). Format entropy = 0.970951
(d). Movie Category entropy = 1.521928
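
As a consistency check against H = -sum p*log2(p) (not part of the original write-up): with 20 movies and every ID unique, H(ID) = log2(20) ≈ 4.321928; a Class entropy of exactly 1.000000 means the classes split 10/10; 0.970951 matches a two-value Format split of 8/12; and 1.521928 matches, for instance, a three-value Category split of 4/8/8.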

(e). Of the three attributes (ID, Format, Category), movie format has the lowest entropy.

(f). We choose the attribute to split on based on information gain:


In [114]:
print "Info gain if splitting on:" 
print "Format:   %f" % infoGain(data, "Class", "Format")
print "Category: %f" % infoGain(data, "Class", "Category")
print "ID:       %f" % infoGain(data, "Class", "ID")


Info gain if splitting on:
Format:   0.124511
Category: 0.295807
ID:       1.000000

The rule is to choose the attribute with the highest information gain. Here that would be "ID", but splitting on "ID" partitions the data into 20 nodes of one record each: the tree would simply memorize the training set and generalize to nothing. "ID" is therefore not a valid splitting attribute, and we choose "Category", which has the next-largest information gain.
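
One standard way to quantify this penalty (C4.5's gain ratio; not part of the original solution) is to divide the information gain by the "split information", i.e. the entropy of the attribute's own value distribution, which is large for many-valued attributes like ID. A minimal sketch using the functions above:

In [ ]:
# Gain ratio (C4.5): information gain normalized by the entropy of the
# split itself. Illustrative extension, not part of the original answer.
def gainRatio(df, cla, col):
    split_info = computeEntropy(df, col)  # entropy of the attribute's values
    return infoGain(df, cla, col) / split_info if split_info > 0 else 0.0

for col in ["Format", "Category", "ID"]:
    print("%-9s %f" % (col + ":", gainRatio(data, "Class", col)))

With the numbers above this gives roughly 0.128 for Format, 0.194 for Category, and 0.231 for ID: the log2(20) ≈ 4.32 denominator shrinks ID's advantage, but a unique key can still score highest, which is why identifier columns are usually dropped outright rather than left for the split criterion to reject.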

Problem 2