In [1]:
from collections import Counter, defaultdict
import math


# Training data: each pair is (candidate-attribute dict, interviewed-well label).
# Attributes: level, lang, tweets, phd; the label is a bool.
inputs = [
    ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
    ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},  False),
    ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),
    ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
    ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),
    ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
    ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},  True),
    ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),
    ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),
    ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),
    ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
    ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),
    ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),
    ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
]


def entropy(class_probs):
    """Entropy (in bits) of a distribution given as a list of class
    probabilities; zero probabilities contribute nothing."""
    total = 0
    for p in class_probs:
        if p:  # skip p == 0, for which log is undefined
            total -= p * math.log(p, 2)
    return total


def class_probs(labels):
    """Return the fraction of `labels` belonging to each distinct class
    (order follows Counter's insertion order; empty input gives [])."""
    total_count = len(labels)
    probs = []
    for count in Counter(labels).values():
        probs.append(count / total_count)
    return probs


def data_entropy(labeled_data):
    """Entropy of the labels in a list of (input, label) pairs."""
    return entropy(class_probs([pair[1] for pair in labeled_data]))


def partition_entropy(subsets):
    """Entropy of a partition: the entropy of each subset of labeled data,
    weighted by the fraction of all points that subset contains."""
    total_count = sum(map(len, subsets))
    weighted = 0
    for subset in subsets:
        weighted += data_entropy(subset) * len(subset) / total_count
    return weighted


def partition_by(inputs, attr):
    """Group (attr_dict, label) pairs by the value of `attr`.

    Returns a dict mapping each attribute value to the list of pairs
    whose dict has that value for `attr`.
    """
    partitions = defaultdict(list)
    for pair in inputs:
        partitions[pair[0][attr]].append(pair)
    return partitions


def partition_entropy_by(inputs, attr):
    """Entropy that results from partitioning `inputs` on `attr`."""
    return partition_entropy(partition_by(inputs, attr).values())

In [2]:
# Lower entropy = a better attribute to split on first.
for attr in ['level', 'lang', 'tweets', 'phd']:
    score = round(partition_entropy_by(inputs, attr), 2)
    print(attr, '\t', score)


level 	 0.69
lang 	 0.86
tweets 	 0.79
phd 	 0.89

In [3]:
""" Notice above 'level' has the lowest entropy. This means it has the highest certainty.
    So we would split our tree on that attribute first.
"""

senior_inputs = [(inp, label) for inp, label in inputs if inp['level'] == 'Senior']

for key in ['lang', 'tweets', 'phd']:
    print(key, '\t', round(partition_entropy_by(senior_inputs, key), 2))


lang 	 0.4
tweets 	 0.0
phd 	 0.95

In [12]:
from functools import partial


def classify(tree, inp):
    """Classify the attribute dict `inp` using the given decision tree.

    A tree is either a bool leaf (the prediction), or a pair
    (attribute, subtree_dict) where subtree_dict maps attribute values —
    plus a None fallback for unseen values — to subtrees.
    """
    # Leaf node: return the prediction. isinstance is safer than the
    # original `tree in [True, False]`, which (via ==) also matched the
    # ints 1 and 0, and it was a dead string literal re-evaluated per call
    # that served as the comment below.
    if isinstance(tree, bool):
        return tree

    # Otherwise this node holds an attribute to split on and a dict whose
    # keys are values of that attribute and whose values are subtrees.
    attr, subtree_dict = tree
    subtree_key = inp.get(attr)
    # If there is no subtree for this value (or the attr is missing),
    # fall back to the None branch built during training.
    if subtree_key not in subtree_dict:
        subtree_key = None

    # Recurse into the chosen subtree.
    return classify(subtree_dict[subtree_key], inp)


def build_tree_id3(inputs, split_candidates=None):
    """Build an ID3 decision tree from (attr_dict, label) pairs.

    Returns either a bool leaf or (best_attr, subtree_dict); subtree_dict
    contains a None key used for unseen attribute values at classify time.
    """
    # On the first pass, every attribute is a split candidate.
    if split_candidates is None:
        split_candidates = inputs[0][0].keys()

    num_inputs = len(inputs)
    num_trues = len([label for item, label in inputs if label])
    num_falses = num_inputs - num_trues

    if num_trues == 0:    # only False labels left -> False leaf
        return False
    if num_falses == 0:   # only True labels left -> True leaf
        return True

    # Bug fix: with mixed labels but no attributes left to split on,
    # min() below would raise ValueError on an empty sequence.
    # Return the majority label instead.
    if not split_candidates:
        return num_trues >= num_falses

    # Split on the attribute that yields the lowest partition entropy.
    best_attr = min(split_candidates, key=partial(partition_entropy_by, inputs))

    partitions = partition_by(inputs, best_attr)
    new_candidates = [a for a in split_candidates if a != best_attr]

    # Recursively build a subtree for each observed value of best_attr.
    subtrees = {attr_value : build_tree_id3(subset, new_candidates)
                for attr_value, subset in partitions.items()}

    # Default branch for attribute values not seen in this subset.
    subtrees[None] = num_trues > num_falses

    return (best_attr, subtrees)

In [20]:
from pprint import pprint

# Build the full tree from the training data and display it.
tree = build_tree_id3(inputs)
pprint(tree)


('level',
 {None: True,
  'Junior': ('phd', {None: True, 'no': True, 'yes': False}),
  'Mid': True,
  'Senior': ('tweets', {None: False, 'no': False, 'yes': True})})

In [14]:
classify(tree, {"level"  : "Junior",
                "lang"   : "Java",
                "tweets" : "yes",
                "phd"    : "no"})


Out[14]:
True

In [15]:
classify(tree, {"level"  : "Junior",
                "lang"   : "Java",
                "tweets" : "yes",
                "phd"    : "yes"})


Out[15]:
False

In [16]:
# 'Intern' is a level never seen in training, so classify falls back to the
# tree's None branch; 'Senior' descends into the tweets subtree with no
# tweets key, hitting that subtree's None branch.
for level in ["Intern", "Senior"]:
    print(classify(tree, {"level": level}))


True
False

In [18]:
# Where would I fit in here?
me = {"level": "Junior", "lang": "Python", "tweets": "no", "phd": "no"}
classify(tree, me)


Out[18]:
True

In [ ]: