In [12]:
from sklearn.datasets import load_iris

iris = load_iris()
iris.data.shape


Out[12]:
(150, 4)

In [13]:
import numpy as np
np.histogram(iris.target)


Out[13]:
(array([50,  0,  0,  0,  0, 50,  0,  0,  0, 50]),
 array([ 0. ,  0.2,  0.4,  0.6,  0.8,  1. ,  1.2,  1.4,  1.6,  1.8,  2. ]))

In [14]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

In [21]:
from sklearn.externals.six import StringIO
with open('iris.dot', 'w') as f:
    f = tree.export_graphviz(clf, out_file=f)

Implement a CART


In [76]:
import numpy as np

def binSplitData(X, y, feat, val):
    L = np.nonzero(X[:, feat]<=val)
    R = np.nonzero(X[:, feat]>val)
    return X[L], y[L], X[R], y[R]

def chooseBestSplit(X, y, min_s=0.1, min_n=5, leaf_func=np.mean):
    #if all the target variables are the same: quit and return value
    if not np.any(np.subtract(y, y[0])):
        return None, leaf_func(y)
    n, m = np.shape(X)
    S = np.var(y)*n
    bestS = np.inf; bestFeat = None; bestVal = np.mean(y)
    for feat in range(m):
        for val in set(X[:, feat]):
            lX, ly, rX, ry = binSplitData(X, y, feat, val)
            if (len(ly) < min_s) or (len(ry) < min_s):
                continue
            newS = np.var(ly)*len(ly) + np.var(ry)*len(ry)
            if newS < bestS:
                bestFeat = feat
                bestVal = val
                bestS = newS
    #if the decrease (S-bestS) is less than a threshold don't do the split
    if (S - bestS) < min_s:
        return None, leaf_func(y)
    return bestFeat, bestVal

def createTree(X, y, max_depth=10, leaf_func=np.mean):
    if max_depth == 0: return leaf_func(y)
    feat, val = chooseBestSplit(X, y, leaf_func=leaf_func)
    if feat == None: return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lX, ly, rX, ry = binSplitData(X, y, feat, val)
    retTree['left'] = createTree(lX, ly, max_depth=max_depth-1, leaf_func=leaf_func)
    retTree['right'] = createTree(rX, ry, max_depth=max_depth-1, leaf_func=leaf_func)
    return retTree

def isTree(obj):
    return isinstance(obj, dict)

def getMean(tree):
    mean_right = getMean(tree['right']) if isTree(tree['right']) else tree['right']
    mean_left = getMean(tree['left']) if isTree(tree['left']) else tree['left']
    return (mean_right + mean_left) / 2.0

def prune(tree, X, y):
    if (len(y) == 0):
        return getMean(tree)
    if not isTree(tree):
        return tree
    lX, ly, rX, ry = binSplitData(X, y, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lX, ly)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rX, ry)
    # if they are now both leaf nodes, see if we can merge them
    if (not isTree(tree['left']) and not isTree(tree['right'])):
        errNoMerge = sum(np.power(ly-tree['left'], 2)) + sum(np.power(ry-tree['right'], 2))
        mean_tree = (tree['left'] + tree['right']) / 2.0
        errMerge = sum(np.power(y-mean_tree, 2))
        if errMerge < errNoMerge:
            print('Merge happens')
            return mean_tree
    return tree

def evaluate(tree, x):
    if not isTree(tree): return tree
    if x[tree['spInd']] <= tree['spVal']:
        if isTree(tree['left']): return evaluate(tree['left'], x)
        else: return tree['left']
    else:
        if isTree(tree['right']): return evaluate(tree['right'], x)
        else: return tree['right']
        
def score(tree, X, y):
    n = np.shape(X)[0]
    pred = [evaluate(tree, X[i]) for i in range(n)]
    return sum(np.power(pred-y, 2))/n

In [104]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

In [105]:
tree = createTree(X_train, y_train)

In [106]:
print(tree)


{'spInd': 2, 'spVal': 1.8999999999999999, 'right': {'spInd': 3, 'spVal': 1.7, 'right': 2.0, 'left': {'spInd': 2, 'spVal': 4.9000000000000004, 'right': {'spInd': 3, 'spVal': 1.5, 'right': 1.0, 'left': 2.0}, 'left': {'spInd': 3, 'spVal': 1.6000000000000001, 'right': 2.0, 'left': 1.0}}}, 'left': 0.0}

In [107]:
score(tree, X_test, y_test)


Out[107]:
0.052631578947368418

In [108]:
tree2 = prune(tree, X_test, y_test)


Merge happens

In [109]:
print(tree2)


{'spInd': 2, 'spVal': 1.8999999999999999, 'right': {'spInd': 3, 'spVal': 1.7, 'right': 2.0, 'left': {'spInd': 2, 'spVal': 4.9000000000000004, 'right': 1.5, 'left': {'spInd': 3, 'spVal': 1.6000000000000001, 'right': 2.0, 'left': 1.0}}}, 'left': 0.0}

In [110]:
score(tree2, X_test, y_test)


Out[110]:
0.032894736842105261

Implement a GBRT


In [7]:
# GBRT Regressor with LS loss 
def gbrt_regressor(X, y, n_evaluator=10, learning_rate=1.0, max_depth=10):
    import numpy as np
    n = np.shape(X)[0]
    F = np.array([0]*n)
    trees = []
    for m in range(n_evaluator):
        yr = y - F
        tree = createTree(X, yr, max_depth=max_depth, leaf_func=np.mean)
        trees.append(tree)
        for i in range(n):
            F[i] += learning_rate * evaluate(tree, X[i])
    weights = [1 for i in range(n_evaluator)]
    return trees, weights

def predict(trees, weights, x):
    return sum(weights[i]*evaluate(trees[i], x) for i in range(len(trees)))

In [77]:
# GBRT Classifier with multi-class Log loss

def kLogLeaf(y):
    return sum(y) / sum(np.abs(y)*(1-np.abs(y)))

def gbrt_classifier(X, y, n_evaluator=10, learning_rate=1.0, max_depth=10):
    import numpy as np
    n_example = np.shape(X)[0]
    classes = list(set(y))
    k_class = len(classes)
    k_val = (k_class-1.0)/k_class
    F = np.zeros((n_example,k_class))
    P = np.zeros((n_example,k_class))
    treemat = []
    for m in range(n_evaluator):
        Fk = sum(np.exp(F[:, k]) for k in range(k_class))
        trees = []
        for k in range(k_class):
            P[:,k] = np.exp(F[:,k]) / Fk
            yk = (y==classes[k]).astype(float)
            yr = yk - P[:,k] 
            tree = createTree(X, yr, max_depth=max_depth, leaf_func=kLogLeaf)
            trees.append(tree)
            for i in range(n_example):
                F[i, k] += learning_rate * k_val * evaluate(tree, X[i])
            #weights = [1 for i in range(n_evaluator)]
        treemat.append(trees)
    return treemat, classes

def class_predict(treemat, classes, x):
    V = np.exp([sum(evaluate(treemat[i][k], x) for i in range(len(treemat))) for k in range(len(classes))])
    P = V / sum(V)
    return zip(classes, P)

In [78]:
treemat, classes = gbrt_classifier(iris.data, iris.target, n_evaluator=100, max_depth=2, learning_rate=1)

In [79]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
for i in range(0,150, 10): pp.pprint(class_predict(treemat, classes, iris.data[i]))


[   (0, 0.99661799585988931),
    (1, 0.0020712791832223042),
    (2, 0.001310724956888351)]
[   (0, 0.99661799585988931),
    (1, 0.0020712791832223042),
    (2, 0.001310724956888351)]
[   (0, 0.99661799585988931),
    (1, 0.0020712791832223042),
    (2, 0.001310724956888351)]
[   (0, 0.99661799585988931),
    (1, 0.0020712791832223042),
    (2, 0.001310724956888351)]
[   (0, 0.99661799585988931),
    (1, 0.0020712791832223042),
    (2, 0.001310724956888351)]
[   (0, 0.0015967581634710678),
    (1, 0.99676446842929267),
    (2, 0.0016387734072361877)]
[(0, 0.01017596030971833), (1, 0.97938032095431082), (2, 0.010443718735970928)]
[   (0, 0.0056513508649954454),
    (1, 0.065557422443763516),
    (2, 0.92879122669124103)]
[   (0, 0.0015967581634710678),
    (1, 0.99676446842929267),
    (2, 0.0016387734072361877)]
[   (0, 0.0015967581634710678),
    (1, 0.99676446842929267),
    (2, 0.0016387734072361877)]
[   (0, 0.001529345584626123),
    (1, 0.0029922345759242162),
    (2, 0.99547841983944962)]
[   (0, 0.001529345584626123),
    (1, 0.0029922345759242162),
    (2, 0.99547841983944962)]
[   (0, 0.001529345584626123),
    (1, 0.0029922345759242162),
    (2, 0.99547841983944962)]
[   (0, 0.001529345584626123),
    (1, 0.0029922345759242162),
    (2, 0.99547841983944962)]
[   (0, 0.001529345584626123),
    (1, 0.0029922345759242162),
    (2, 0.99547841983944962)]

In [38]:
np.shape(treemat)


Out[38]:
(100, 3)

In [264]:
sum(power(y_test[i] - predict(trees, weights, X_test[i]), 2) for i in range(len(y_test))) / len(y_test)


Out[264]:
132.6014074500012

In [224]:
tree3 = createTree(iris.data, iris.target, max_depth=2)

In [225]:
sum(power(y_test[i] - evaluate(tree3, X_test[i]), 2) for i in range(len(y_test))) / len(y_test)


Out[225]:
0.049696359949715968