In [1]:
""" TODO """
Out[1]:
In [27]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import tree
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../lib_plot")
import scatter_boxplot as sp
boston = datasets.load_boston()
In [3]:
boston_df_all = pd.DataFrame(boston.data, columns = boston.feature_names)
boston_df_all['MEDV'] = boston.target
boston_df_all.head()
Out[3]:
In [13]:
# Let's keep it simple.
# We select the target:
# - MEDV: median value of owner-occupied homes in $1000's
# and a couple of continuous independent variables:
# - AGE: proportion of owner-occupied units built prior to 1940
# - PTRATIO: pupil-teacher ratio by town
# (.copy() so the MEDV_cat assignment below doesn't trigger a SettingWithCopyWarning)
boston_df = boston_df_all[["MEDV", "AGE", "PTRATIO"]].copy()
In [14]:
# For the sake of the exercise, make the target categorical:
# True if MEDV > 25 (roughly the .75 percentile), so True means expensive.
CUT = 25
boston_df["MEDV_cat"] = boston_df["MEDV"] > CUT
In [17]:
boston_df.head(2)
Out[17]:
In [117]:
X = boston_df[["AGE", "PTRATIO"]]
y = boston_df["MEDV_cat"]
clf = tree.DecisionTreeClassifier(max_depth=2)
#clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
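In [ ]:
# Optional check (a sketch, not in the original flow): estimate out-of-sample
# accuracy with 5-fold cross-validation instead of scoring on the training data
# the tree just saw. In sklearn >= 0.18 this import moves to sklearn.model_selection.
from sklearn.cross_validation import cross_val_score
print "cv accuracy: ", cross_val_score(clf, X, y, cv=5).mean()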
In [118]:
print "names: ", X.columns.values
print "importances: ", clf.feature_importances_
In [119]:
# Visualize the results
In [120]:
a_df = boston_df[boston_df["MEDV_cat"] == True]
a_x = a_df["AGE"].values
a_y = a_df["PTRATIO"].values
b_df = boston_df[boston_df["MEDV_cat"] == False]
b_x = b_df["AGE"].values
b_y = b_df["PTRATIO"].values
In [121]:
feature_names = X.columns.values
print clf.tree_.children_left
print clf.tree_.children_right
print clf.tree_.threshold
print clf.tree_.feature
print [feature_names[i] for i in clf.tree_.feature]
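In [ ]:
# How to read the arrays above: children_left / children_right hold the node
# ids of each split's children, with -1 marking a leaf; at leaves the
# threshold holds the sentinel value -2 (undefined).
print "node count: ", clf.tree_.node_count
print "leaf nodes: ", np.where(clf.tree_.children_left == -1)[0]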
In [122]:
# Extract the underlying decision rules from a trained tree (adapted from
# http://stackoverflow.com/questions/20224526/how-to-extract-the-decision-rules-from-scikit-learn-decision-tree)
def get_code(tree, feature_names):
    left = tree.tree_.children_left
    right = tree.tree_.children_right
    threshold = tree.tree_.threshold
    features = [feature_names[i] for i in tree.tree_.feature]
    value = tree.tree_.value

    def recurse(left, right, threshold, features, node):
        # internal nodes carry a real threshold; leaves carry the sentinel -2
        if threshold[node] != -2:
            print "if ( " + features[node] + " <= " + str(threshold[node]) + " ) {"
            if left[node] != -1:
                recurse(left, right, threshold, features, left[node])
            print "} else {"
            if right[node] != -1:
                recurse(left, right, threshold, features, right[node])
            print "}"
        else:
            # leaf: value holds the per-class sample counts
            print "return " + str(value[node])

    recurse(left, right, threshold, features, 0)
In [123]:
get_code(clf, X.columns.values)
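In [ ]:
# Alternative to the hand-rolled get_code above: sklearn's built-in exporter
# writes the same tree in Graphviz dot format (rendering requires Graphviz,
# e.g. `dot -Tpng tree.dot -o tree.png` on the command line).
from sklearn.tree import export_graphviz
export_graphviz(clf, out_file="tree.dot", feature_names=X.columns.values)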
In [124]:
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
# Create color maps (two classes, so two colors)
from matplotlib.colors import ListedColormap
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00'])
x_min, x_max = X["AGE"].min() - 1, X["AGE"].max() + 1
y_min, y_max = X["PTRATIO"].min() - 1, X["PTRATIO"].max() + 1
# derive the mesh step dynamically from each axis range, so the grid stays
# fine enough whatever the feature scales (AGE spans ~0-100, PTRATIO only ~12-22)
hx = (x_max - x_min) / 200.0
hy = (y_max - y_min) / 200.0
xx, yy = np.meshgrid(np.arange(x_min, x_max, hx), np.arange(y_min, y_max, hy))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
plt.scatter(X["AGE"], X["PTRATIO"], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#plt.title("3-Class classification (k = %i, weights = '%s')"
# % (n_neighbors, weights))
plt.show()
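In [ ]:
# Using the fitted tree on a new observation, e.g. a hypothetical tract with
# AGE=80 and PTRATIO=15 (made-up values, just to show the predict call):
print "class:       ", clf.predict([[80.0, 15.0]])
print "probability: ", clf.predict_proba([[80.0, 15.0]])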