In [6]:
%matplotlib inline
In [91]:
import csv
# Data from the files
def get_attributes():
# read attributes
with open('./ld1/attributes.txt') as f:
# `\d\\t\d\\n` format
attributes = [int(a[0]) for a in csv.reader(f, delimiter='\t')]
return attributes
def get_example_data():
# read examples line by line
with open('./ld1/examples.txt') as f:
# `\d\\t\d\\t ... \d\\n` format
data = [tuple(i for i in map(lambda x: int(x), d))
for d in zip(*csv.reader(f, delimiter='\t'))]
return data
attributes = get_attributes()
data = get_example_data()
print(attributes)
data
Out[91]:
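To make the `\d\t\d\n` format comments above concrete, here is a small, self-contained sketch of the same parsing applied to an in-memory string. The `sample` string is a made-up illustration, not the actual contents of the ./ld1 files:

import csv
import io

# Hypothetical two-line, tab-delimited sample (not the real ./ld1 files).
sample = "1\t2\n3\t4\n"
rows = list(csv.reader(io.StringIO(sample), delimiter='\t'))
print(rows)                                            # [['1', '2'], ['3', '4']]
print([int(r[0]) for r in rows])                       # [1, 3]  -- what get_attributes() keeps
print([tuple(int(x) for x in c) for c in zip(*rows)])  # [(1, 3), (2, 4)] -- transposed, as in get_example_data()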
In [92]:
import math
def unique(lst):
"""
Returns a list made up of the unique values found in lst. i.e., it
removes the redundant values in lst.
"""
lst = lst[:]
unique_lst = []
# Cycle through the list and add each value to the unique list only once.
for item in lst:
if unique_lst.count(item) <= 0:
unique_lst.append(item)
# Return the list with all redundant values removed.
return unique_lst
def most_frequent(lst):
"""
Returns the item that appears most frequently in the given list.
"""
lst = lst[:]
highest_freq = 0
most_freq = None
for val in unique(lst):
if lst.count(val) > highest_freq:
most_freq = val
highest_freq = lst.count(val)
return most_freq
def majority_value(data, target_attr):
"""
Creates a list of all values of the target attribute for each record
in the data list, and returns the value that appears in this list
most frequently.
"""
data = data[:]
return most_frequent([record[target_attr] for record in data])
def choose_attribute(data, attributes, target_attr, fitness):
"""
Cycles through all the attributes and returns the attribute with the
highest information gain (or lowest entropy).
"""
data = data[:]
best_gain = 0.0
best_attr = None
for attr in attributes:
gains = fitness(data, attr, target_attr)
if gains >= best_gain and attr != target_attr:
best_gain = gains
best_attr = attr
return best_attr
def get_values(data, attr):
"""
Creates a list of values of the chosen attribute for each record in data,
prunes out all of the redundant values, and returns the list.
"""
data = data[:]
return unique([record[attr] for record in data])
def get_examples(data, attr, value):
"""
Returns a list of all the records in <data> with the value of <attr>
matching the given value.
"""
data = data[:]
rtn_lst = []
if not data:
return rtn_lst
else:
record = data.pop()
if record[attr] == value:
rtn_lst.append(record)
rtn_lst.extend(get_examples(data, attr, value))
return rtn_lst
else:
rtn_lst.extend(get_examples(data, attr, value))
return rtn_lst
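A quick, hedged sanity check of the helpers above on hand-made toy inputs (the values are illustrative only and do not come from the ld1 files):

sample = [1, 2, 2, 3, 2, 1]
print(unique(sample))           # [1, 2, 3]
print(most_frequent(sample))    # 2

toy_records = [(1, 1), (1, 2), (2, 2)]   # record[0] = class, record[1] = an attribute
print(majority_value(toy_records, 0))    # 1
print(get_examples(toy_records, 1, 2))   # [(2, 2), (1, 2)]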
In [93]:
def entropy(data, target_attr):
"""
Calculates the entropy of the given data set for the target attribute.
"""
val_freq = {}
data_entropy = 0.0
# Calculate the frequency of each of the values in the target attr
for record in data:
if record[target_attr] in val_freq:
val_freq[record[target_attr]] += 1.0
else:
val_freq[record[target_attr]] = 1.0
# Calculate the entropy of the data for the target attribute
for freq in val_freq.values():
data_entropy += (-freq / len(data)) * math.log(freq / len(data), 2)
return data_entropy
def gain(data, attr, target_attr):
"""
Calculates the information gain (reduction in entropy) that would
result from splitting the data on the chosen attribute (attr).
"""
val_freq = {}
subset_entropy = 0.0
# Calculate the frequency of each of the values in the target attribute
for record in data:
if record[attr] in val_freq:
val_freq[record[attr]] += 1.0
else:
val_freq[record[attr]] = 1.0
# Calculate the sum of the entropy for each subset of records weighted
# by their probability of occurring in the training set.
for val in val_freq.keys():
val_prob = val_freq[val] / sum(val_freq.values())
data_subset = [record for record in data if record[attr] == val]
subset_entropy += val_prob * entropy(data_subset, target_attr)
# Subtract the entropy of the chosen attribute from the entropy of the
# whole data set with respect to the target attribute (and return it)
return entropy(data, target_attr) - subset_entropy
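As a hedged illustration of these two functions, consider a tiny hand-made record set where attribute 0 plays the role of the target and attribute 1 is a candidate split. The numbers below follow from the standard ID3 calculation on this toy input and are not results from the ld1 data:

toy = [(1, 1), (1, 1), (2, 1), (2, 2)]   # (target, attribute) pairs, made up for illustration
print(entropy(toy, 0))   # 1.0    : two classes, two records each
print(gain(toy, 1, 0))   # ~0.311 : 1.0 minus 3/4 * entropy of the subset with attr == 1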
In [94]:
def create_decision_tree(data, attributes, target_attr, fitness_func):
vals = [record[target_attr] for record in data]
default = majority_value(data, target_attr)
if not data or (len(attributes) - 1) <= 0:
return default
# If all the records in the dataset have the same classification,
# return that classification.
elif vals.count(vals[0]) == len(vals):
return vals[0]
else:
# Choose the next attribute that best classifies our data
best = choose_attribute(data, attributes, target_attr,
fitness_func)
attributes.remove(best)
# Create a new decision tree/node with the best attribute and an empty
# dictionary object--we'll fill that up next.
tree = {best: {}}
# Create a new decision tree/sub-node for each of the values in the
# best attribute field
for val in get_values(data, best):
# Create a subtree for the current value under the "best" field
subtree = create_decision_tree(
get_examples(data, best, val),
[attr for attr in attributes if attr != best],
target_attr,
fitness_func)
# Add the new subtree to the empty dictionary object in our new
# tree/node we just created.
tree[best][val] = subtree
return tree
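The tree returned by create_decision_tree is a nested dict of the form {attribute_index: {attribute_value: subtree_or_class_label}}. A minimal sketch of how such a tree could be walked to classify a new record follows; classify is a hypothetical helper, not part of the original notebook:

def classify(tree, record):
    # Leaf nodes are plain class labels; internal nodes map an attribute
    # index to a dict of {attribute value: subtree}.
    if not isinstance(tree, dict):
        return tree
    attr = next(iter(tree))
    # Attribute values never seen during training fall through as None in this sketch.
    return classify(tree[attr].get(record[attr]), record)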
In [95]:
tree = create_decision_tree(data, attributes, 0, gain)
import json
decode_attr = ['history', 'debt', 'recommendations', 'income']
decode_attrs = [
['bad', 'unknown', 'GOOD'],
['low', 'HIGH'],
['no', 'YES'],
['low', 'medium', 'HIGH']
]
def print_tree(tree, indent=1):
for k, v in tree.items():
if isinstance(v, dict):
print(' {}↳ bar {}'.format(' ' * indent, decode_attr[k-1]))
print_tree(v, indent+2)
else:
print('{}↳ bar {} ->(*) {}'.format(' ' * indent, decode_attr[k-1], decode_attrs[3][v-1]))
print(json.dumps(tree, indent=4, sort_keys=True))
print_tree(tree)
In [45]: