In [1]:
from collections import Counter, defaultdict
import math
inputs = [
({'level':'Senior','lang':'Java','tweets':'no','phd':'no'}, False),
({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'}, False),
({'level':'Mid','lang':'Python','tweets':'no','phd':'no'}, True),
({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
({'level':'Senior','lang':'R','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'Python','tweets':'no','phd':'no'}, True),
({'level':'Junior','lang':'R','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'}, False),
({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'}, True),
({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'}, True),
({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
]
def entropy(class_probs):
"""given a list of class probs, compute the entropy
(a measure of the disorder of the system)"""
return sum(-p * math.log(p, 2) for p in class_probs if p)
def class_probs(labels):
total_count = len(labels)
return [count / total_count for count in Counter(labels).values()]
def data_entropy(labeled_data):
"""entropy of data in pairs (input, label)"""
labels = [label for _, label in labeled_data]
probs = class_probs(labels)
return entropy(probs)
def partition_entropy(subsets):
"""find the entropy from this partition of data into subsets.
subsets is a list of lists of labeled data"""
total_count = sum(len(subset) for subset in subsets)
return sum(data_entropy(subset) * len(subset) / total_count for subset in subsets)
def partition_by(inputs, attr):
"""each input is a pair (attr dict, label).
return a dict: attr value -> inputs"""
groups = defaultdict(list)
for inp in inputs:
key = inp[0][attr]
groups[key].append(inp)
return groups
def partition_entropy_by(inputs, attr):
"""computes the entropy corresponding to the given partition"""
partitions = partition_by(inputs, attr)
return partition_entropy(partitions.values())
In [2]:
for key in ['level', 'lang', 'tweets', 'phd']:
print(key, '\t', round(partition_entropy_by(inputs, key), 2))
In [3]:
""" Notice above 'level' has the lowest entropy. This means it has the highest certainty.
So we would split our tree on that attribute first.
"""
senior_inputs = [(inp, label) for inp, label in inputs if inp['level'] == 'Senior']
for key in ['lang', 'tweets', 'phd']:
print(key, '\t', round(partition_entropy_by(senior_inputs, key), 2))
In [12]:
from functools import partial
def classify(tree, inp):
"""classify the input using the given decision tree"""
# if this is a leaf node, return its value
if tree in [True, False]:
return tree
"""otherwise, this tree consists of an attribute to split on
& a dict whose keys are values of that attribute & whose values
are subtrees to consider next"""
attr, subtree_dict = tree
subtree_key = inp.get(attr)
# if no subtree for key, use None
if subtree_key not in subtree_dict:
subtree_key = None
# choose the appropriate subtree & use it to classify the next input
subtree = subtree_dict[subtree_key]
return classify(subtree, inp)
def build_tree_id3(inputs, split_candidates=None):
if split_candidates is None:
split_candidates = inputs[0][0].keys()
num_inputs = len(inputs)
num_trues = len([label for item, label in inputs if label])
num_falses = num_inputs - num_trues
if num_trues == 0:
return False
if num_falses == 0:
return True
best_attr = min(split_candidates, key=partial(partition_entropy_by, inputs))
partitions = partition_by(inputs, best_attr)
new_candidates = [a for a in split_candidates if a != best_attr]
subtrees = {attr_value : build_tree_id3(subset, new_candidates)
for attr_value, subset in partitions.items()}
subtrees[None] = num_trues > num_falses
return (best_attr, subtrees)
In [20]:
tree = build_tree_id3(inputs)
from pprint import pprint
pprint(tree)
In [14]:
classify(tree, {"level" : "Junior",
"lang" : "Java",
"tweets" : "yes",
"phd" : "no"})
Out[14]:
In [15]:
classify(tree, {"level" : "Junior",
"lang" : "Java",
"tweets" : "yes",
"phd" : "yes"})
Out[15]:
In [16]:
print(classify(tree, {"level" : "Intern"}))
print(classify(tree, {"level" : "Senior"}))
In [18]:
# Where would I fit in here?
classify(tree, {"level" : "Junior",
"lang" : "Python",
"tweets" : "no",
"phd" : "no"})
Out[18]:
In [ ]: