In [128]:
import numpy as np
import pandas as pd

class DecisionTree:
    def __init__(self, label):
        self.label = label
        self.attribute = None
        self.children = {}  # maps attribute value -> child DecisionTree subtree
    def hasLabel(self):
        # Leaf nodes carry a boolean label; internal nodes are created with label None.
        return self.label is not None
    def addChild(self, attribute, value, child):
        self.attribute = attribute
        self.children[value] = child
    def __str__(self):
        """
        TODO (Optional): The provided implementation is very basic and not pretty. 
        Use plotting tools to visualize your decision tree.
        """
        if self.hasLabel:
            return self.label
        else:
            return self.children.__str__()

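# A possible sketch for the optional __str__ TODO above: a plain-text rendering of the
# tree, offered as a hedged illustration rather than the required visualization. The
# helper name _render_tree is not part of the provided scaffold.
def _render_tree(T, indent=""):
    if T.hasLabel():
        return indent + str(T.label) + "\n"
    out = ""
    for value, child in T.children.items():
        out += indent + str(T.attribute) + " == " + str(value) + ":\n"
        out += _render_tree(child, indent + "    ")
    return out
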
def choose_attribute(X, t, attributes, strategy = 'first'):
    if strategy == 'first':
        return attributes[0]
    if strategy == 'entropy':
        """
        Will be covered in Hands-on lecture.
        """
        raise NotImplementedError
    if strategy == 'infogain':
        """
        TODO: return the attribute with the highest information gain
        (a hedged sketch follows this function).
        """
        raise NotImplementedError
    raise ValueError("Unknown strategy: " + str(strategy))
        
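# A possible sketch of the entropy / information-gain machinery for the TODO above,
# assuming boolean NumPy arrays for the target t and for each binarized feature column.
# The helper names _entropy and _information_gain are illustrative, not part of the scaffold.
def _entropy(t):
    # Entropy (in bits) of a boolean target vector; 0 for an empty or pure subset.
    p = np.mean(t) if t.size > 0 else 0.0
    if p == 0.0 or p == 1.0:
        return 0.0
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

def _information_gain(x, t):
    # Reduction in entropy from splitting the targets t on the feature column x.
    gain = _entropy(t)
    for m in np.unique(x):
        i_m = x == m
        gain -= (np.sum(i_m) / t.size) * _entropy(t[i_m])
    return gain

# With these helpers, the 'infogain' branch could look roughly like:
#     gains = {a: _information_gain(np.asarray(X[a]), np.asarray(t)) for a in attributes}
#     return max(gains, key=gains.get)
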
def predict_DT(x, T):
    """
    TODO: Currently ignores the tree and always returns False.
    Remove the placeholder return below once the recursion is implemented
    (a hedged sketch follows this function).
    """
    return False  # placeholder; the skeleton below is unreachable until this line is removed
    if T.hasLabel():
        return T.label
    else:
        raise NotImplementedError
    
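# A possible sketch of the recursive prediction for the TODO above, assuming x is one row
# of the binarized feature DataFrame so that x[T.attribute] is a boolean that indexes
# T.children. The name _predict_DT_sketch is illustrative only.
def _predict_DT_sketch(x, T):
    if T.hasLabel():
        return T.label
    # Descend into the child matching this sample's value of the split attribute;
    # fall back to False if that value was never seen during training.
    value = x[T.attribute]
    if value in T.children:
        return _predict_DT_sketch(x, T.children[value])
    return False
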
def train_DT_ID3(X, t, attributes, depth = np.inf):
    if not np.any(t):
        return DecisionTree(False)
    if np.all(t):
        return DecisionTree(True)
    if attributes.size == 0 or depth == 0:
        return DecisionTree(np.mean(t) > 0.5)
    
    T = DecisionTree(None)
    attribute = choose_attribute(X, t, attributes, strategy='first')
    
    X_chosen = np.asarray(X[attribute])
    
    print(X_chosen.shape, np.unique(X_chosen))
    for m in np.unique(X_chosen):
        i_m = X_chosen == m
        """
        TODO: Make a recursive call on T.addChild
        You can do logical indexing such as X_chosen[i_m] and t[i_m]
        Hint: you need an if statement
        """
        raise NotImplementedError
    
    return T
    
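# A possible sketch of the completed recursion for the TODO above, shown as a separate
# helper so the scaffold is left untouched; the name _train_DT_ID3_sketch is illustrative
# only. It assumes a binarized DataFrame X, a boolean NumPy array t, and a NumPy array of
# column names for attributes. The np.all(i_m) guard is one reading of the hint: it stops
# the recursion when the chosen attribute does not actually split the remaining data.
def _train_DT_ID3_sketch(X, t, attributes, depth=np.inf):
    if not np.any(t):
        return DecisionTree(False)
    if np.all(t):
        return DecisionTree(True)
    if attributes.size == 0 or depth == 0:
        return DecisionTree(np.mean(t) > 0.5)

    T = DecisionTree(None)
    attribute = choose_attribute(X, t, attributes, strategy='first')
    X_chosen = np.asarray(X[attribute])

    for m in np.unique(X_chosen):
        i_m = X_chosen == m
        if np.all(i_m):
            # The split keeps every remaining example together: stop with the majority label.
            T.addChild(attribute, m, DecisionTree(np.mean(t) > 0.5))
        else:
            T.addChild(attribute, m, _train_DT_ID3_sketch(X[i_m], t[i_m], attributes, depth - 1))
    return T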
    
def dt_preprocess_spambase(data, median = None):
    # Binarize each feature by thresholding at the per-feature (column-wise) median;
    # column 57 is the spam/non-spam label. When a median Series is passed in (the
    # training medians), reuse it so the test set is thresholded consistently.
    X = data.drop(57, axis=1)
    if median is None:
        median = X.median(axis=0)
        return X > median, np.asarray(data[57]), median
    else:
        return X > median, np.asarray(data[57])

X_train, t_train, median_train = dt_preprocess_spambase(pd.read_csv("spambase.train", header=None)) 
X_test, t_test = dt_preprocess_spambase(pd.read_csv("spambase.test", header=None), median_train)

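# T below is assumed to be a decision tree trained in an earlier cell, e.g. (hypothetically)
# T = train_DT_ID3(X_train, t_train, np.asarray(X_train.columns)), which will only run once
# the TODOs above are completed. With the placeholder predict_DT (always False), the errors
# printed below are just the fraction of positive (spam) examples in each split.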
t_pred = X_train.apply(lambda x: predict_DT(x, T), axis=1)
print("Classification error (train): ", (t_pred != t_train).mean(axis=0))

t_pred = X_test.apply(lambda x: predict_DT(x, T), axis=1)
print("Classification error (test): ", (t_pred != t_test).mean(axis=0))


Classification error (train):  0.405
Classification error (test):  0.385620915033

In [ ]: