# In [128]:
class DecisionTree:
    """A node of a decision tree.

    A node whose ``label`` is not ``None`` is a leaf that predicts that label.
    A node whose ``label`` is ``None`` is an internal node: it tests
    ``attribute`` and routes each observed value to a subtree in ``children``.
    """

    def __init__(self, label):
        # Prediction stored at a leaf; None marks an internal node.
        self.label = label
        # Attribute tested at this node; filled in by addChild.
        self.attribute = None
        # dict of (attribute's value)-(decision tree object) pair
        self.children = {}

    def hasLabel(self):
        """Return True when this node is a leaf, i.e. it carries a label."""
        # A label of False is still a label, so compare against None
        # instead of relying on truthiness.
        return self.label is not None

    def addChild(self, attribute, value, child):
        """Attach ``child`` as the subtree taken when ``attribute == value``."""
        self.attribute = attribute
        self.children[value] = child

    def __str__(self):
        """Return the label for a leaf, otherwise the children mapping.

        TODO (Optional): The provided implementation is very basic and not
        pretty. Use plotting tools to visualize your decision tree.
        """
        if self.hasLabel():
            # Labels are typically booleans; __str__ must return a str.
            return str(self.label)
        return str(self.children)

    # dict.__str__ renders its values with repr(), so sharing the
    # implementation makes nested subtrees print readably.
    __repr__ = __str__
def _label_entropy(labels):
    """Return the Shannon entropy (in bits) of the values in ``labels``."""
    labels = np.asarray(labels)
    if labels.size == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / labels.size
    # Every count is >= 1, so log2 never sees a zero probability.
    return float(-np.sum(probs * np.log2(probs)))


def _information_gain(column, t):
    """Return the entropy reduction of ``t`` obtained by splitting on ``column``."""
    column = np.asarray(column)
    t = np.asarray(t)
    remainder = 0.0
    for value in np.unique(column):
        mask = column == value
        # Weight each branch's entropy by the fraction of rows it receives.
        remainder += mask.mean() * _label_entropy(t[mask])
    return _label_entropy(t) - remainder


def choose_attribute(X, t, attributes, strategy = 'first'):
    """Pick the attribute to split on.

    Parameters
    ----------
    X : mapping-like indexed by attribute (e.g. a pandas DataFrame)
        ``X[a]`` must give the values of attribute ``a`` for every sample.
    t : array-like
        Target label of every sample, aligned with the rows of ``X``.
    attributes : sequence
        Candidate attributes; must be non-empty.
    strategy : {'first', 'entropy', 'infogain'}
        ``'first'`` returns ``attributes[0]``. ``'infogain'`` returns the
        attribute with the highest information gain (the first such attribute
        on ties). ``'entropy'`` is not implemented yet.

    Raises
    ------
    NotImplementedError
        For ``strategy='entropy'``.
    ValueError
        For an unknown strategy (previously this silently returned ``None``).
    """
    if strategy == 'first':
        return attributes[0]
    if strategy == 'entropy':
        # Will be covered in Hands-on lecture.
        raise NotImplementedError
    if strategy == 'infogain':
        return max(attributes, key=lambda a: _information_gain(X[a], t))
    raise ValueError("unknown strategy: {!r}".format(strategy))
def predict_DT(x, T):
    """Predict the label of one sample by walking the decision tree.

    Parameters
    ----------
    x : mapping-like (e.g. a pandas Series or dict)
        The sample; ``x[attribute]`` must give its value for any attribute
        tested along the path.
    T : decision tree node
        Root of the tree. A node with ``label`` not ``None`` is a leaf; an
        internal node exposes ``attribute`` and a ``children`` dict keyed by
        attribute value.

    Returns
    -------
    The ``label`` of the leaf reached by ``x``.

    Raises
    ------
    KeyError
        If ``x`` has a value for which the current node has no child.
    """
    node = T
    # Test the label directly: False is a valid leaf label, only None marks
    # an internal node.
    while node.label is None:
        node = node.children[x[node.attribute]]
    return node.label
def train_DT_ID3(X, t, attributes, depth = np.inf):
    """Recursively build a decision tree with the ID3 scheme.

    Parameters
    ----------
    X : pandas DataFrame (assumed)
        Samples in rows; ``X[attribute]`` must give an attribute's column and
        ``X[mask]`` must select rows with a boolean mask.
    t : array-like
        Binary target of every row of ``X``.
    attributes : array-like
        Attributes still available for splitting.
    depth : number
        Maximum number of further splits; ``np.inf`` means unlimited.

    Returns
    -------
    DecisionTree
        A leaf when the targets are pure, no attribute is left, or the depth
        budget is exhausted; otherwise an internal node with one child per
        value observed for the chosen attribute.
    """
    t = np.asarray(t)
    attributes = np.asarray(attributes)

    # Pure nodes become leaves.
    if not np.any(t):
        return DecisionTree(False)
    if np.all(t):
        return DecisionTree(True)
    # Nothing left to split on: fall back to the majority label.
    if attributes.size == 0 or depth == 0:
        return DecisionTree(bool(np.mean(t) > 0.5))

    T = DecisionTree(None)
    attribute = choose_attribute(X, t, attributes, strategy='first')
    X_chosen = np.asarray(X[attribute])
    # An attribute is used at most once along any root-to-leaf path.
    remaining = attributes[attributes != attribute]
    for m in np.unique(X_chosen):
        i_m = X_chosen == m
        # np.unique only yields observed values, so every subset is non-empty.
        child = train_DT_ID3(X[i_m], t[i_m], remaining, depth - 1)
        T.addChild(attribute, m, child)
    return T
def dt_preprocess_spambase(data, median = None):
    """Binarise the spambase features by thresholding at a per-feature median.

    Parameters
    ----------
    data : pandas DataFrame
        Raw table whose column ``57`` is the target; all other columns are
        features.
    median : pandas Series, optional
        Per-feature thresholds indexed by column label. When omitted they are
        computed from ``data`` and returned as a third value, so the same
        thresholds can be reused on another split.

    Returns
    -------
    (X_bin, t, median) when ``median`` is None, otherwise (X_bin, t), where
    ``X_bin`` is the boolean DataFrame ``X > median`` and ``t`` is the target
    column as a NumPy array.
    """
    X = data.drop(57, axis=1)
    t = np.asarray(data[57])
    if median is None:
        # axis=0 gives one median per feature column, which is what the
        # column-aligned comparison below expects.
        median = X.median(axis=0)
        return X > median, t, median
    # Threshold with the supplied medians, not with this split's own.
    return X > median, t
# Load both splits; the test split is thresholded with the medians computed
# on the training split.
X_train, t_train, median_train = dt_preprocess_spambase(
    pd.read_csv("spambase.train", header=None)
)
X_test, t_test = dt_preprocess_spambase(
    pd.read_csv("spambase.test", header=None),
    median=median_train,
)

# NOTE(review): T is not defined in this cell -- presumably a tree returned by
# train_DT_ID3 elsewhere in the notebook; confirm it exists before running.

# Fraction of training rows whose prediction differs from the target.
t_pred = X_train.apply(lambda row: predict_DT(row, T), axis="columns")
print("Classificaion error(train): ", (t_pred != t_train).mean())

# Same measurement on the held-out split.
t_pred = X_test.apply(lambda row: predict_DT(row, T), axis="columns")
print("Classificaion error(test): ", (t_pred != t_test).mean())
# In [ ]: