Python Decision Trees

(C) 2017 by Damir Cavar

Version: 1.0, March 2017

This is a tutorial related to the discussion of Decision Tree classifiers.

This tutorial was developed as part of my course material for the course Machine Learning for Computational Linguistics in the Computational Linguistics Program of the Department of Linguistics at Indiana University.

Decision Trees

For discussion and explanation of the algorithms and use of implementations in Python libraries, see the following documentation online:

Reimplementation of Roach's Building Decision Trees in Python 3.x

The following example code is taken from Christopher Roach's Building Decision Trees in Python. It has been slightly simplified. You can download the code from his Github repository.


In [31]:
import collections, math, sys

Roach: "This module holds functions that are responsible for creating a new decision tree and for using the tree for data classificiation."


In [36]:
def majority_value(data, target_attr):
    """
    Creates a list of all values in the target attribute for each record
    in the data list object, and returns the value that appears in this list
    the most frequently.
    """
    return most_frequent([record[target_attr] for record in data])


def most_frequent(lst):
    "Returns the item that appears most frequently in the given list."

    highest_freq = 0
    most_freq = None

    for val in unique(lst):
        if lst.count(val) > highest_freq:
            most_freq = val
            highest_freq = lst.count(val)
            
    return most_freq


def unique(lst):
    return list(set(lst))


def get_values(data, attr):
    """
    Creates a list of values in the chosen attribut for each record in data,
    prunes out all of the redundant values, and return the list.  
    """
    return unique([record[attr] for record in data])


def choose_attribute(data, attributes, target_attr, fitness):
    """
    Cycles through all the attributes and returns the attribute with the
    highest information gain (or lowest entropy).
    """
    best_gain = 0.0
    best_attr = None

    for attr in attributes:
        gain = fitness(data, attr, target_attr)
        if (gain >= best_gain and attr != target_attr):
            best_gain = gain
            best_attr = attr
                
    return best_attr


def get_examples(data, attr, value):
    """
    Returns a list of all the records in <data> with the value of <attr>
    matching the given value.
    """
    rtn_lst = []
    
    if not data:
        return rtn_lst
    else:
        record = data.pop()
        if record[attr] == value:
            rtn_lst.append(record)
            rtn_lst.extend(get_examples(data, attr, value))
            return rtn_lst
        else:
            rtn_lst.extend(get_examples(data, attr, value))
            return rtn_lst

def get_classification(record, tree):
    """
    This function recursively traverses the decision tree and returns a
    classification for the given record.
    """
    # If the current node is a string, then we've reached a leaf node and
    # we can return it as our answer
    if type(tree) == type("string"):
        return tree
    if tree == None:
        return tree

    # Traverse the tree further until a leaf node is found.
    myKeys = list(tree.keys())
    attr = myKeys[0]
    t = tree[attr][record[attr]]
    return get_classification(record, t)


def classify(tree, data):
    """
    Returns a list of classifications for each of the records in the data
    list as determined by the given decision tree.
    """
    return [ get_classification(record, tree) for record in data ]


def create_decision_tree(data, attributes, target_attr, fitness_func):
    "Returns a new decision tree based on the examples given."
    vals = [record[target_attr] for record in data]
    default = majority_value(data, target_attr)

    # If the dataset is empty or the attributes list is empty, return the
    # default value. When checking the attributes list for emptiness, we
    # need to subtract 1 to account for the target attribute.
    if not data or len(attributes) == 0:
        return default
    # If all the records in the dataset have the same classification,
    # return that classification.
    elif vals.count(vals[0]) == len(vals):
        return vals[0]
    else:
        # Choose the next best attribute to best classify our data
        best = choose_attribute(data, attributes, target_attr, fitness_func)

        # Create a new decision tree/node with the best attribute and an empty
        # dictionary object--we'll fill that up next.
        # We use the collections.defaultdict function to add a function to the
        # new tree that will be called whenever we query the tree with an
        # attribute that does not exist.  This way we return the default value
        # for the target attribute whenever, we have an attribute combination
        # that wasn't seen during training.
        tree = {best:collections.defaultdict(lambda: default)}

        # Create a new decision tree/sub-node for each of the values in the
        # best attribute field
        for val in get_values(data, best):
            # Create a subtree for the current value under the "best" field
            subtree = create_decision_tree(
                get_examples(data, best, val),
                [attr for attr in attributes if attr != best],
                target_attr,
                fitness_func)

            # Add the new subtree to the empty dictionary object in our new
            # tree/node we just created.
            tree[best][val] = subtree

    return tree

Roach: "This module contains the functions for calculating the information gain of a dataset as defined by the ID3 (Information Theoretic) heuristic."

The following function calculates the entropy for a given data set and the specific target attribute:


In [37]:
def entropy(data, target_attr):
    val_freq = {}
    data_entropy = 0.0

    # Compute the frequency of each of the values in the target attr
    for record in data:
        val_freq[record[target_attr]] = val_freq.get(record[target_attr], 0.0) + 1.0

    # Compute the entropy of the data for the target attribute
    for freq in val_freq.values():
        data_entropy += (-freq/len(data)) * math.log(freq/len(data), 2) 
        
    return data_entropy

The following function computes the information gain (reduction in entropy) that would result by splitting the data on the chosen attribute (attr).


In [38]:
def gain(data, attr, target_attr):
    val_freq = {}
    subset_entropy = 0.0

    # Compute the frequency of each of the values in the target attribute
    for record in data:
        val_freq[record[attr]] = val_freq.get(record[attr], 0.0) + 1.0

    # Compute the sum of the entropy for each subset of records weighted
    # by their probability of occuring in the training set.
    total = sum(val_freq.values())
    for val in val_freq.keys():
        val_prob = val_freq[val] / total
        data_subset = [record for record in data if record[attr] == val]
        subset_entropy += val_prob * entropy(data_subset, target_attr)

    # Subtract the entropy of the chosen attribute from the entropy of the
    # whole data set with respect to the target attribute (and return it)
    return (entropy(data, target_attr) - subset_entropy)

In [39]:
def print_tree(tree, myStr):
    """
    This function recursively crawls through the d-tree and prints it out in a
    more readable format than a straight print of the Python dict object.  
    """
    print(tree)
    if type(tree) == dict:
        myKeys = list(tree.keys())
        print(myStr, myKeys[0])
        myVals = list(tree.values())
        for item in myVals[0].keys():
            print(myStr, '\t', item)
            print_tree(myVals[0][item], myStr + "\t")
    else:
        print(myStr, "\t->\t", tree)


def main():
    # Get the training and test data filenames from the user
    training_filename = "PyDecisionTreesData.txt"
    test_filename = "PyDecisionTreesData.txt"

    training_data = []
    try:
        ifp = open(training_filename, mode='r', encoding='utf8')
        attributes = [attr.strip() for attr in ifp.readline().strip().split(",")]
        for line in ifp.readlines():
            training_data.append(dict(zip(attributes,
                             [datum.strip() for datum in line.strip().split(",")])))
            print(dict(zip(attributes, [datum.strip() for datum in line.strip().split(",")])))
        ifp.close()
    except IOError:
        print("Error reading from file", training_filename)
    # Extract the attribute names and the target attribute from the training
    # data file.
    
    target_attr = attributes[-1]

    # Get the training and test data from the given files
    test_data = training_data[:]
    print(target_attr)
    
    # Create the decision tree
    dtree = create_decision_tree(training_data, attributes, target_attr, gain)

    # Classify the records in the test data
    classification = classify(dtree, test_data)

    # Print the results of the test
    print("------------------------\n")
    print("--   Classification   --\n")
    print("------------------------\n")
    print("\n")
    for item in classification: print(item)
    
    # Print the contents of the decision tree
    print("\n")
    print("------------------------\n")
    print("--   Decision Tree    --\n")
    print("------------------------\n")
    print("\n")
    print_tree(dtree, "")

main()


{'Education': 'masters', 'Marital Status': 'single', 'Age': '36 - 55', 'Income': 'high', 'Purchase?': 'will buy'}
{'Education': 'high school', 'Marital Status': 'single', 'Age': '18 - 35', 'Income': 'low', 'Purchase?': "won't buy"}
{'Education': 'masters', 'Marital Status': 'single', 'Age': '36 - 55', 'Income': 'low', 'Purchase?': 'will buy'}
{'Education': 'bachelors', 'Marital Status': 'single', 'Age': '18 - 35', 'Income': 'high', 'Purchase?': "won't buy"}
{'Education': 'high school', 'Marital Status': 'single', 'Age': '< 18', 'Income': 'low', 'Purchase?': 'will buy'}
{'Education': 'bachelors', 'Marital Status': 'married', 'Age': '18 - 35', 'Income': 'high', 'Purchase?': "won't buy"}
{'Education': 'bachelors', 'Marital Status': 'married', 'Age': '36 - 55', 'Income': 'low', 'Purchase?': "won't buy"}
{'Education': 'bachelors', 'Marital Status': 'single', 'Age': '> 55', 'Income': 'high', 'Purchase?': 'will buy'}
{'Education': 'masters', 'Marital Status': 'married', 'Age': '36 - 55', 'Income': 'low', 'Purchase?': "won't buy"}
{'Education': 'masters', 'Marital Status': 'married', 'Age': '> 55', 'Income': 'low', 'Purchase?': 'will buy'}
{'Education': 'masters', 'Marital Status': 'single', 'Age': '36 - 55', 'Income': 'high', 'Purchase?': 'will buy'}
{'Education': 'masters', 'Marital Status': 'single', 'Age': '> 55', 'Income': 'high', 'Purchase?': 'will buy'}
{'Education': 'high school', 'Marital Status': 'single', 'Age': '< 18', 'Income': 'high', 'Purchase?': "won't buy"}
{'Education': 'masters', 'Marital Status': 'single', 'Age': '36 - 55', 'Income': 'low', 'Purchase?': 'will buy'}
{'Education': 'high school', 'Marital Status': 'single', 'Age': '36 - 55', 'Income': 'low', 'Purchase?': 'will buy'}
{'Education': 'high school', 'Marital Status': 'married', 'Age': '< 18', 'Income': 'low', 'Purchase?': 'will buy'}
{'Education': 'bachelors', 'Marital Status': 'married', 'Age': '18 - 35', 'Income': 'high', 'Purchase?': "won't buy"}
{'Education': 'high school', 'Marital Status': 'married', 'Age': '> 55', 'Income': 'high', 'Purchase?': 'will buy'}
{'Education': 'bachelors', 'Marital Status': 'single', 'Age': '> 55', 'Income': 'low', 'Purchase?': 'will buy'}
{'Education': 'high school', 'Marital Status': 'married', 'Age': '36 - 55', 'Income': 'high', 'Purchase?': "won't buy"}
Purchase?
------------------------

--   Classification   --

------------------------



None
None
None
None
None
None
None
None
None
None
None
None
won't buy
None
None
None
None
None
None
None


------------------------

--   Decision Tree    --

------------------------



{'Age': defaultdict(<function create_decision_tree.<locals>.<lambda> at 0x106b58d08>, {'< 18': {'Income': defaultdict(<function create_decision_tree.<locals>.<lambda> at 0x106b58840>, {'high': "won't buy", 'low': None})}, '18 - 35': None, '> 55': None, '36 - 55': None})}
 Age
 	 < 18
{'Income': defaultdict(<function create_decision_tree.<locals>.<lambda> at 0x106b58840>, {'high': "won't buy", 'low': None})}
	 Income
	 	 high
won't buy
		 	->	 won't buy
	 	 low
None
		 	->	 None
 	 18 - 35
None
	 	->	 None
 	 > 55
None
	 	->	 None
 	 36 - 55
None
	 	->	 None

In [ ]:


In [ ]: