In [12]:
from sklearn.datasets import load_iris
iris = load_iris()
iris.data.shape
Out[12]:
In [13]:
import numpy as np
np.histogram(iris.target)
Out[13]:
In [14]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)
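As a quick sanity check (not in the original notebook), the fitted classifier can be queried with the standard scikit-learn estimator methods:

# Hypothetical check: labels for the first few samples and training accuracy.
print(clf.predict(iris.data[:5]))
print(clf.score(iris.data, iris.target))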
In [21]:
with open('iris.dot', 'w') as f:
    tree.export_graphviz(clf, out_file=f)
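The exported iris.dot file can then be rendered to an image. A minimal sketch, assuming the graphviz Python package and the Graphviz binaries are installed (this step is not in the original notebook):

# Render iris.dot to iris.pdf with the graphviz package (assumed available).
import graphviz
with open('iris.dot') as f:
    graphviz.Source(f.read()).render('iris', cleanup=True)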
In [76]:
import numpy as np

def binSplitData(X, y, feat, val):
    L = np.nonzero(X[:, feat] <= val)
    R = np.nonzero(X[:, feat] > val)
    return X[L], y[L], X[R], y[R]
def chooseBestSplit(X, y, min_s=0.1, min_n=5, leaf_func=np.mean):
    # min_s: minimum decrease in total squared error required to split;
    # min_n: minimum number of samples allowed in each child.
    # If all the target values are the same, quit and return a leaf value.
    if not np.any(np.subtract(y, y[0])):
        return None, leaf_func(y)
    n, m = np.shape(X)
    S = np.var(y) * n
    bestS = np.inf; bestFeat = None; bestVal = np.mean(y)
    for feat in range(m):
        for val in set(X[:, feat]):
            lX, ly, rX, ry = binSplitData(X, y, feat, val)
            # Skip splits that leave too few samples on either side.
            if (len(ly) < min_n) or (len(ry) < min_n):
                continue
            newS = np.var(ly) * len(ly) + np.var(ry) * len(ry)
            if newS < bestS:
                bestFeat = feat
                bestVal = val
                bestS = newS
    # If the decrease (S - bestS) is less than the threshold, don't split.
    if (S - bestS) < min_s:
        return None, leaf_func(y)
    return bestFeat, bestVal
def createTree(X, y, max_depth=10, leaf_func=np.mean):
    if max_depth == 0:
        return leaf_func(y)
    feat, val = chooseBestSplit(X, y, leaf_func=leaf_func)
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lX, ly, rX, ry = binSplitData(X, y, feat, val)
    retTree['left'] = createTree(lX, ly, max_depth=max_depth-1, leaf_func=leaf_func)
    retTree['right'] = createTree(rX, ry, max_depth=max_depth-1, leaf_func=leaf_func)
    return retTree
def isTree(obj):
    return isinstance(obj, dict)

def getMean(tree):
    mean_right = getMean(tree['right']) if isTree(tree['right']) else tree['right']
    mean_left = getMean(tree['left']) if isTree(tree['left']) else tree['left']
    return (mean_right + mean_left) / 2.0
def prune(tree, X, y):
    if not isTree(tree):
        return tree
    # No test data reaches this subtree: collapse it to its mean leaf value.
    if len(y) == 0:
        return getMean(tree)
    lX, ly, rX, ry = binSplitData(X, y, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lX, ly)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rX, ry)
    # If both children are now leaves, see if merging them lowers the test error.
    if not isTree(tree['left']) and not isTree(tree['right']):
        errNoMerge = sum(np.power(ly - tree['left'], 2)) + sum(np.power(ry - tree['right'], 2))
        mean_tree = (tree['left'] + tree['right']) / 2.0
        errMerge = sum(np.power(y - mean_tree, 2))
        if errMerge < errNoMerge:
            print('Merge happens')
            return mean_tree
    return tree
def evaluate(tree, x):
    if not isTree(tree): return tree
    if x[tree['spInd']] <= tree['spVal']:
        if isTree(tree['left']): return evaluate(tree['left'], x)
        else: return tree['left']
    else:
        if isTree(tree['right']): return evaluate(tree['right'], x)
        else: return tree['right']

def score(tree, X, y):
    # Mean squared error of the tree's predictions on (X, y).
    n = np.shape(X)[0]
    pred = [evaluate(tree, X[i]) for i in range(n)]
    return sum(np.power(np.array(pred) - y, 2)) / n
In [104]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)
In [105]:
tree = createTree(X_train, y_train)
In [106]:
print(tree)
In [107]:
score(tree, X_test, y_test)
Out[107]:
In [108]:
tree2 = prune(tree, X_test, y_test)
In [109]:
print(tree2)
In [110]:
score(tree2, X_test, y_test)
Out[110]:
In [7]:
# GBRT Regressor with LS (squared-error) loss
def gbrt_regressor(X, y, n_evaluator=10, learning_rate=1.0, max_depth=10):
    import numpy as np
    n = np.shape(X)[0]
    F = np.zeros(n)  # current ensemble prediction; must be float, not int
    trees = []
    for m in range(n_evaluator):
        # For LS loss the negative gradient is simply the residual y - F.
        yr = y - F
        tree = createTree(X, yr, max_depth=max_depth, leaf_func=np.mean)
        trees.append(tree)
        for i in range(n):
            F[i] += learning_rate * evaluate(tree, X[i])
    # Apply the same shrinkage at prediction time as during training.
    weights = [learning_rate for i in range(n_evaluator)]
    return trees, weights

def predict(trees, weights, x):
    return sum(weights[i] * evaluate(trees[i], x) for i in range(len(trees)))
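The regression-error cell further below uses trees and weights from a gbrt_regressor call that is not shown in the notebook; a sketch of such a call, with illustrative (assumed) hyperparameters:

# Assumed training call for the GBRT regressor; hyperparameters are illustrative.
trees, weights = gbrt_regressor(X_train, y_train, n_evaluator=10, max_depth=3)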
In [77]:
# GBRT Classifier with multi-class log loss
def kLogLeaf(y):
    # Leaf value for the multi-class log loss: sum(r) / sum(|r| * (1 - |r|));
    # the (K-1)/K factor is applied outside, when F is updated.
    return sum(y) / sum(np.abs(y) * (1 - np.abs(y)))

def gbrt_classifier(X, y, n_evaluator=10, learning_rate=1.0, max_depth=10):
    import numpy as np
    n_example = np.shape(X)[0]
    classes = list(set(y))
    k_class = len(classes)
    k_val = (k_class - 1.0) / k_class
    F = np.zeros((n_example, k_class))
    P = np.zeros((n_example, k_class))
    treemat = []
    for m in range(n_evaluator):
        Fk = sum(np.exp(F[:, k]) for k in range(k_class))
        trees = []
        for k in range(k_class):
            # Softmax probability of class k, then fit a tree to the residual y_k - p_k.
            P[:, k] = np.exp(F[:, k]) / Fk
            yk = (y == classes[k]).astype(float)
            yr = yk - P[:, k]
            tree = createTree(X, yr, max_depth=max_depth, leaf_func=kLogLeaf)
            trees.append(tree)
            for i in range(n_example):
                F[i, k] += learning_rate * k_val * evaluate(tree, X[i])
        treemat.append(trees)
    return treemat, classes

def class_predict(treemat, classes, x):
    V = np.exp([sum(evaluate(treemat[i][k], x) for i in range(len(treemat))) for k in range(len(classes))])
    P = V / sum(V)
    return list(zip(classes, P))
In [78]:
treemat, classes = gbrt_classifier(iris.data, iris.target, n_evaluator=100, max_depth=2, learning_rate=1)
In [79]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
for i in range(0,150, 10): pp.pprint(class_predict(treemat, classes, iris.data[i]))
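To turn these class probabilities into hard predictions, one can take the class with the largest probability. A minimal sketch (not in the original notebook) that measures training accuracy:

# Hypothetical helper: pick the argmax class, then compute training accuracy.
def class_predict_label(treemat, classes, x):
    return max(class_predict(treemat, classes, x), key=lambda cp: cp[1])[0]

acc = np.mean([class_predict_label(treemat, classes, iris.data[i]) == iris.target[i]
               for i in range(len(iris.target))])
print(acc)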
In [38]:
np.shape(treemat)
Out[38]:
In [264]:
sum(np.power(y_test[i] - predict(trees, weights, X_test[i]), 2) for i in range(len(y_test))) / len(y_test)
Out[264]:
In [224]:
tree3 = createTree(iris.data, iris.target, max_depth=2)
In [225]:
sum(np.power(y_test[i] - evaluate(tree3, X_test[i]), 2) for i in range(len(y_test))) / len(y_test)
Out[225]: