Imports & Dataset

``````

In [9]:

import math
import numpy

``````
``````

In [10]:

# Toy dataset in column-major (feature-row) layout; each row is one feature
# across the 7 samples: row 0 = name, row 1 = sex, row 2 = age bracket,
# row 3 = height bracket.
dataset = [
    ['dugg', 'clare', 'will', 'donald', 'deril', 'gregory', 'julia'],
    ['M', 'F', 'M', 'M', 'M', 'M', 'F'],
    ['20-30', '20-30', '20-30', '20-30', '30-40', '20-30', '5-10'],
    ['1.60-1.70', '1.70-1.80', '1.70-1.80', '1.80-1.90', '1.70-1.80', '>1.90', '<1.60'],
]

``````
``````

In [11]:

labels = ['no', 'no', 'yes', 'yes', 'no', 'no', 'yes']

``````

Helper Functions - Decision Trees

``````

In [12]:

def calc_entropy(labels):
    """Shannon entropy (natural log) of a sequence of class labels.

    Computes H = sum over distinct labels of p * ln(1/p), where p is the
    label's relative frequency. Returns 0.0 for an empty sequence or one
    containing a single distinct label.
    """
    size = len(labels)
    entropy = 0.0
    for label in set(labels):
        proportion = float(labels.count(label)) / size
        entropy += proportion * math.log(1 / proportion)
    return entropy

``````
``````

In [13]:

def calc_split_entropy(splitted_labels):
    """Weighted average entropy of a partition of the labels.

    Parameters
    ----------
    splitted_labels : iterable of list
        The label lists produced by splitting on a feature (e.g. the
        ``.values()`` of the dict returned by ``split_by``).

    Returns
    -------
    float
        Sum over sublists of (sublist size / total size) * entropy(sublist).
        Returns 0.0 for an empty partition.
    """
    # Materialize once: the argument may be a one-shot iterable.
    sublists = list(splitted_labels)
    # Only the total count is needed; the original flattened every label
    # into a new list just to take its length.
    total = sum(len(sublist) for sublist in sublists)
    if total == 0:
        # Guard against ZeroDivisionError when all sublists are empty.
        return 0.0
    new_entropy = 0.0
    for sublist in sublists:
        q_split = float(len(sublist)) / total
        new_entropy += q_split * calc_entropy(sublist)
    return new_entropy

``````
``````

In [14]:

def split_by(feature_index, dataset, labels):
    """Group the labels by the value of one feature.

    Parameters
    ----------
    feature_index : int
        Index of the feature row in ``dataset`` to split on.
    dataset : list of list
        Column-major data: ``dataset[feature_index][i]`` is sample i's
        value for that feature.
    labels : list
        Class label of each sample, aligned with the feature row.

    Returns
    -------
    dict
        Maps each distinct feature value to the list of labels of the
        samples carrying that value, in original sample order.
    """
    splitted_labels = {}
    for i, value in enumerate(dataset[feature_index]):
        # dict.has_key() was removed in Python 3 (AttributeError at runtime);
        # setdefault covers the create-if-missing case in one call.
        splitted_labels.setdefault(value, []).append(labels[i])
    return splitted_labels

``````
``````

In [15]:

def calc_variance(values):
    """Fraction of distinct entries in ``values``.

    NOTE(review): despite the name, this is not statistical variance — it
    returns len(set(values)) / len(values), a diversity ratio in (0, 1].
    Raises ZeroDivisionError on an empty sequence, as the original did.
    """
    distinct_count = len(set(values))
    return distinct_count / float(len(values))

``````

Code

``````

In [25]:

calc_split_entropy(split_by(3, dataset, labels).values())

``````
``````

Out[25]:

0.2727917864120626

``````
``````

In [17]:

calc_entropy(labels)

``````
``````

Out[17]:

0.6829081047004716

``````
``````

In [24]:

calc_variance(dataset[3])

``````
``````

Out[24]:

0.7142857142857143

``````
``````

In [ ]:

``````