``````

import math
from collections import Counter

import numpy

# Toy data set stored feature-major: each inner list holds one feature for all
# seven samples, and position i across the lists (and in `labels`) refers to
# the same sample.
dataset = [
    # Feature 0: sample names.
    ['dugg', 'clare', 'will', 'donald', 'deril', 'gregory', 'julia'],
    # Feature 1: 'M' / 'F' flag (presumably sex).
    ['M', 'F', 'M', 'M', 'M', 'M', 'F'],
    # Feature 2: bucketed ranges (presumably age in years).
    ['20-30', '20-30', '20-30', '20-30', '30-40', '20-30', '5-10'],
    # Feature 3: bucketed ranges (presumably height in metres).
    ['1.60-1.70', '1.70-1.80', '1.70-1.80', '1.80-1.90', '1.70-1.80', '>1.90', '<1.60'],
]

# Target class of each sample, aligned by position with the feature lists.
labels = ['no', 'no', 'yes', 'yes', 'no', 'no', 'yes']

# Helper Functions - Decision Trees

def calc_entropy(labels):
    """Return the Shannon entropy of a collection of class labels.

    The entropy is computed with the natural logarithm, so the result is
    expressed in nats.

    Args:
        labels: Any iterable of hashable labels (list, tuple, generator, ...).

    Returns:
        The entropy as a float; ``0.0`` for an empty input or when every
        label is identical.
    """
    # Count every label in a single pass instead of calling ``.count`` once
    # per distinct label, which rescans the whole sequence each time.
    counts = Counter(labels)
    total = sum(counts.values())

    entropy = 0.0
    for count in counts.values():
        # float() keeps true division on Python 2 as well as Python 3.
        q_labels = count / float(total)
        entropy += q_labels * math.log(1 / q_labels)
    return entropy

def calc_split_entropy(splitted_labels):
    """Return the size-weighted average entropy of a set of label subsets.

    Each subset contributes ``calc_entropy(subset)`` weighted by the share of
    all labels that fall into that subset.

    Args:
        splitted_labels: Iterable of label collections, e.g. the values of
            the dict returned by ``split_by``. One-shot iterables such as
            generators are accepted.

    Returns:
        The weighted entropy as a float; ``0.0`` when there are no labels at
        all (no subsets, or only empty subsets).
    """
    # Materialise once: the data is needed twice (total size, then the
    # per-subset terms), which would exhaust a one-shot iterable.
    subsets = [list(subset) for subset in splitted_labels]
    total = sum(len(subset) for subset in subsets)
    if total == 0:
        # Nothing to weigh; avoids dividing by zero for all-empty subsets.
        return 0.0

    new_entropy = 0.0
    for subset in subsets:
        q_split = len(subset) / float(total)
        new_entropy += q_split * calc_entropy(subset)
    return new_entropy

def split_by(feature_index, dataset, labels):
    """Group labels by the value each sample has for one feature.

    Args:
        feature_index: Index into ``dataset`` selecting the feature row.
        dataset: Sequence of feature rows; ``dataset[feature_index][i]`` is
            the feature value of sample ``i``.
        labels: Sequence of labels aligned by position with the feature row.

    Returns:
        A dict mapping each distinct feature value to the list of labels of
        the samples holding that value, in order of appearance.

    Raises:
        IndexError: If ``labels`` is shorter than the selected feature row.
    """
    splitted_labels = {}
    for i, value in enumerate(dataset[feature_index]):
        # setdefault replaces dict.has_key, which no longer exists in
        # Python 3, and works unchanged on Python 2.
        splitted_labels.setdefault(value, []).append(labels[i])
    return splitted_labels

def calc_variance(values):
    """Return the share of distinct entries in ``values``.

    Despite the name this is not a statistical variance: the result is the
    number of unique values divided by the total number of values, so it is
    1.0 when every entry is different.

    Raises:
        ZeroDivisionError: If ``values`` is empty.
    """
    distinct_count = len(set(values))
    total_count = float(len(values))
    return distinct_count / total_count

# Code

# Weighted entropy of the labels after grouping the samples by feature row 3.
calc_split_entropy(split_by(3, dataset, labels).values())

# Value the call above evaluates to (presumably the pasted session output);
# it is in nats because calc_entropy uses math.log.
0.2727917864120626

# Entropy of the full, unsplit label list.
calc_entropy(labels)

# Value the call above evaluates to (presumably the pasted session output).
0.6829081047004716

# Share of distinct values in feature row 3 (5 distinct out of 7 entries).
calc_variance(dataset[3])

# Value the call above evaluates to (presumably the pasted session output).
0.7142857142857143

