In [26]:
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]
In [27]:
# Column names for our data
header = ["color", "diameter", "label"]
In [28]:
"""Find the unique values for a column in dataset"""
def unique_values(rows,col):
return set([row[col] for row in rows])
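For instance (a quick check, not part of the original notebook), the unique colors in our training data:

unique_values(training_data, 0)  # {'Green', 'Red', 'Yellow'} (set iteration order may vary)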
In [29]:
"""count the no of examples for each label in a dataset"""
def class_counts(rows):
counts = {} # a dictionary of label -> count.
for row in rows:
# in our dataset format, the label is always the last column
label = row[-1]
if label not in counts:
counts[label] = 0
counts[label] += 1
return counts
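A quick sanity check (not in the original notebook), counting labels over the full training set:

class_counts(training_data)  # {'Apple': 2, 'Grape': 2, 'Lemon': 1}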
In [30]:
"""Check if the value is numeric"""
def is_numeric(value):
return isinstance(value, int) or isinstance(value, float)
Each object of the Question class holds a column number (col) and a column value (val). E.g., col = 0 denotes color, so val can be Green, Yellow, or Red. The match method compares the feature value of an example with the feature value stored in the question.
In [31]:
class Question:
    def __init__(self, col, val):
        self.col = col
        self.val = val

    def match(self, example):
        # Compare the feature value in an example to the
        # feature value in this question.
        value = example[self.col]
        if is_numeric(value):
            return value >= self.val
        else:
            return value == self.val

    def __repr__(self):
        # Print the question in a readable format.
        condition = "=="
        if is_numeric(self.val):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.col], condition, str(self.val))
In [32]:
# Create a new question with col = 1 and val = 3
q = Question(1, 3)
# Displaying q calls __repr__
q
Out[32]:
Is diameter >= 3?
In [33]:
"""For each row in the dataset, check if it satisfies the question. If
so, add it to 'true rows', otherwise, add it to 'false rows'.
"""
def partition(rows, question):
true_rows, false_rows = [], []
for row in rows:
if question.match(row):
true_rows.append(row)
else:
false_rows.append(row)
return true_rows, false_rows
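For example (an illustrative check, not in the original notebook), partitioning on "Is color == Red?" sends both grapes to the true side:

true_rows, false_rows = partition(training_data, Question(0, 'Red'))
true_rows   # [['Red', 1, 'Grape'], ['Red', 1, 'Grape']]
false_rows  # the remaining three rows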
In [34]:
"""Calculate the Gini Impurity for a list of rows."""
def gini(rows):
counts = class_counts(rows)
impurity = 1
for lbl in counts:
prob_of_lbl = counts[lbl] / float(len(rows))
impurity -= prob_of_lbl**2
return impurity
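As a sanity check (not in the original notebook): a set with a single class has zero impurity, while our training set mixes three classes. With counts Apple=2, Grape=2, Lemon=1, the impurity is 1 - (0.4**2 + 0.4**2 + 0.2**2) = 0.64.

gini([['Green', 3, 'Apple'], ['Yellow', 3, 'Apple']])  # 0.0 (no mixing)
gini(training_data)  # approximately 0.64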
In [35]:
def info_gain(left, right, current_uncertainty):
    """Information gain: the uncertainty of the starting node,
    minus the weighted average impurity of the two child nodes."""
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)
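A worked example (illustrative, not in the original notebook): splitting on "Is color == Red?" isolates the grapes, so the weighted child impurity is 0.4 * 0 + 0.6 * (4/9), which is about 0.267, giving a gain of roughly 0.64 - 0.267 = 0.37.

current_uncertainty = gini(training_data)  # approximately 0.64
true_rows, false_rows = partition(training_data, Question(0, 'Red'))
info_gain(true_rows, false_rows, current_uncertainty)  # approximately 0.37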
In [36]:
"""Find the best question to ask by iterating over every feature / value
and calculating the information gain."""
def find_best_split(rows):
best_gain = 0 # keep track of the best information gain
best_question = None # keep train of the feature / value that produced it
current_uncertainty = gini(rows)
n_features = len(rows[0]) - 1 # number of columns
for col in range(n_features): # for each feature
values = set([row[col] for row in rows]) # unique values in the column
for val in values: # for each value
question = Question(col, val)
# try splitting the dataset
true_rows, false_rows = partition(rows, question)
# Skip this split if it doesn't divide the
# dataset.
if len(true_rows) == 0 or len(false_rows) == 0:
continue
# Calculate the information gain from this split
gain = info_gain(true_rows, false_rows, current_uncertainty)
# You actually can use '>' instead of '>=' here
# but I wanted the tree to look a certain way for our
# toy dataset.
if gain >= best_gain:
best_gain, best_question = gain, question
return best_gain, best_question
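Trying this on the training data (a quick check, not in the original notebook): "Is color == Red?" and "Is diameter >= 3?" tie at a gain of about 0.37, and because the comparison is '>=' the later question (on diameter) wins.

best_gain, best_question = find_best_split(training_data)
best_question  # Is diameter >= 3?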
In [37]:
"""
A Decision Node asks a question.
This holds a reference to the question, and to the two child nodes.
"""
class Decision_Node:
def __init__(self,question,true_branch,false_branch):
self.question = question
self.true_branch = true_branch
self.false_branch = false_branch
In [38]:
"""
A Leaf node classifies data.
This holds a dictionary of class (e.g., "Apple") -> number of time it
appears in the rows from the training data that reach this leaf.
"""
class Leaf:
def __init__(self, rows):
self.predictions = class_counts(rows)
In [39]:
def build_tree(rows):
    # Try partitioning the dataset on each unique attribute value,
    # calculate the information gain,
    # and keep the question that produces the highest gain.
    gain, question = find_best_split(rows)

    # Base case: no further information gain.
    # Since we can ask no further questions, we'll return a leaf.
    if gain == 0:
        return Leaf(rows)

    # If we reach here, we have found a useful feature / value
    # to partition on.
    true_rows, false_rows = partition(rows, question)

    # Recursively build the true branch.
    true_branch = build_tree(true_rows)

    # Recursively build the false branch.
    false_branch = build_tree(false_rows)

    # Return a Question node.
    # This records the best feature / value to ask at this point,
    # as well as the branches to follow depending on the answer.
    return Decision_Node(question, true_branch, false_branch)
In [40]:
def print_tree(node, spacing=""):
    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.predictions)
        return

    # Print the question at this node
    print(spacing + str(node.question))

    # Call this function recursively on the true branch
    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    # Call this function recursively on the false branch
    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")
In [41]:
my_tree = build_tree(training_data)
In [42]:
print_tree(my_tree)
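A typical run prints the tree below. The root question is always "Is diameter >= 3?", but the color question underneath can come out as == Yellow or == Green on different runs, since both splits yield the same gain and Python's set iteration order for strings is randomized; the leaf predictions are the same either way.

Is diameter >= 3?
--> True:
  Is color == Yellow?
  --> True:
    Predict {'Apple': 1, 'Lemon': 1}
  --> False:
    Predict {'Apple': 1}
--> False:
  Predict {'Grape': 2}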
In [45]:
def classify(row, node):
    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        return node.predictions

    # Decide whether to follow the true branch or the false branch.
    # Compare the feature / value stored in the node
    # to the example we're considering.
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)
In [46]:
"""A nicer way to print the predictions at a leaf."""
def print_leaf(counts):
total = sum(counts.values()) * 1.0
probs = {}
for lbl in counts.keys():
probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
return probs
In [48]:
print_leaf(classify(training_data[0],my_tree))
Out[48]:
{'Apple': '100%'}
In [49]:
testing_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 4, 'Apple'],
    ['Red', 2, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]
In [50]:
for row in testing_data:
    print("Actual: %s. Predicted: %s" %
          (row[-1], print_leaf(classify(row, my_tree))))
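On Python 3.7+ (where dicts preserve insertion order) this should print:

Actual: Apple. Predicted: {'Apple': '100%'}
Actual: Apple. Predicted: {'Apple': '50%', 'Lemon': '50%'}
Actual: Grape. Predicted: {'Grape': '100%'}
Actual: Grape. Predicted: {'Grape': '100%'}
Actual: Lemon. Predicted: {'Apple': '50%', 'Lemon': '50%'}

The ['Yellow', 4, 'Apple'] row comes out as a 50/50 Apple/Lemon split, because a yellow fruit with diameter >= 3 is genuinely ambiguous in the training data.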