In [1]:
    
""" TODO """
    
    Out[1]:
In [27]:
    
# Imports: numeric stack, sklearn (datasets + decision trees), inline plotting,
# and a local scatter/boxplot helper from ../lib_plot.
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import tree
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../lib_plot")
import scatter_boxplot as sp
# NOTE(review): load_boston() was deprecated in scikit-learn 1.0 and removed
# in 1.2; on a modern sklearn this line raises. Use
# fetch_openml(name="boston", version=1) or another dataset instead.
boston = datasets.load_boston()
    
    
In [3]:
    
# Build the full Boston frame: one column per feature, with the target
# (median home value, in $1000s) appended as MEDV.
boston_df_all = pd.DataFrame(boston.data, columns=boston.feature_names).assign(MEDV=boston.target)
boston_df_all.head()
    
    Out[3]:
In [13]:
    
# Let's make it simple:
# we select the target data:
# - MEDV     median value of owner-occupied homes in $1000's
# and a couple of independent variables (per the dataset's DESCR):
# - AGE      proportion of owner-occupied units built prior to 1940
# - PTRATIO  pupil-teacher ratio by town
# NOTE(review): the original comment described CHAS and RAD, but the code
# below actually selects AGE and PTRATIO.
boston_df = boston_df_all[["MEDV", "AGE", "PTRATIO"]]
    
In [14]:
    
# For the sake of the exercise, make the target categorical:
# True ("expensive") when MEDV > 25 (~ the .75 percentile), else False.
CUT = 25
# BUG FIX: the original referenced an undefined name (boston_df_target) and
# used `x < CUT`, which inverts the stated intent ("True for expensive").
# A vectorized comparison replaces the per-row lambda.
# NOTE(review): boston_df is a slice of boston_df_all, so this assignment may
# emit a SettingWithCopyWarning — consider building boston_df with .copy().
boston_df["MEDV_cat"] = boston_df["MEDV"] > CUT
    
    
In [17]:
    
# Sanity-check the frame: first two rows, including the new MEDV_cat column.
boston_df.iloc[:2]
    
    Out[17]:
In [117]:
    
# Train a shallow decision tree (max_depth=2) so the fitted rules stay small
# enough to read and to plot as a 2-D decision boundary.
X = boston_df.loc[:, ["AGE", "PTRATIO"]]
y = boston_df.loc[:, "MEDV_cat"]
clf = tree.DecisionTreeClassifier(max_depth=2).fit(X, y)
    
In [118]:
    
# BUG FIX: Python 2 print statements are a SyntaxError under Python 3;
# use the print() function.
# Show which feature the tree relies on most.
print("names: ", X.columns.values)
print("importances: ", clf.feature_importances_)
    
    
In [119]:
    
# Visualize the results
    
In [120]:
    
# Split the observations by class so each group can be plotted separately.
# MEDV_cat is boolean, so it can be used directly as a mask (a = expensive).
a_df = boston_df.loc[boston_df["MEDV_cat"]]
a_x = a_df["AGE"].values
a_y = a_df["PTRATIO"].values
b_df = boston_df.loc[~boston_df["MEDV_cat"]]
b_x = b_df["AGE"].values
b_y = b_df["PTRATIO"].values
    
In [121]:
    
feature_names = X.columns.values
# BUG FIX: Python 2 print statements -> print() (SyntaxError in Python 3).
# Dump the raw tree arrays: child node indices, split thresholds, and the
# feature index used at each node.
print(clf.tree_.children_left)
print(clf.tree_.children_right)
print(clf.tree_.threshold)
print(clf.tree_.feature)
# NOTE(review): leaf nodes store feature == -2, so this maps leaves to
# feature_names[-2]; harmless for a quick printout but misleading.
print([feature_names[i] for i in clf.tree_.feature])
    
    
In [122]:
    
# extract of the underlying decision-rules from a trained tree
# http://stackoverflow.com/questions/20224526/how-to-extract-the-decision-rules-from-scikit-learn-decision-tree
def get_code(tree, feature_names):
        left      = tree.tree_.children_left
        right     = tree.tree_.children_right
        threshold = tree.tree_.threshold
        features  = [feature_names[i] for i in tree.tree_.feature]
        value = tree.tree_.value
        def recurse(left, right, threshold, features, node):
                if (threshold[node] != -2):
                        print "if ( " + features[node] + " <= " + str(threshold[node]) + " ) {"
                        if left[node] != -1:
                                recurse (left, right, threshold, features,left[node])
                        print "} else {"
                        if right[node] != -1:
                                recurse (left, right, threshold, features,right[node])
                        print "}"
                else:
                        print "return " + str(value[node])
        recurse(left, right, threshold, features, 0)
    
In [123]:
    
# Print the fitted tree's rules as if/else pseudo-code.
get_code(clf, X.columns.values)
    
    
In [124]:
    
# Plot the decision boundary: color each point of a mesh over the
# [x_min, x_max] x [y_min, y_max] feature plane by the class the tree
# predicts there, then overlay the training points.

# Create color maps (light for the background regions, bold for the points).
from matplotlib.colors import ListedColormap
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

x_min, x_max = X["AGE"].min() - 1, X["AGE"].max() + 1
y_min, y_max = X["PTRATIO"].min() - 1, X["PTRATIO"].max() + 1
# FIX (original TODO): derive the mesh step from each axis range instead of
# the hard-coded h = 1, so the boundary stays smooth whatever the feature
# scales are.
MESH_POINTS = 200  # grid resolution per axis
h_x = (x_max - x_min) / MESH_POINTS
h_y = (y_max - y_min) / MESH_POINTS
xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points, colored by their true class.
plt.scatter(X["AGE"], X["PTRATIO"], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xlabel("AGE")
plt.ylabel("PTRATIO")
# FIX: the original commented-out title referenced undefined n_neighbors
# (leftover from a k-NN example); use a title matching this model.
plt.title("Decision tree boundary (max_depth=2)")
plt.show()
    
    
In [ ]:
    
    
In [ ]: