Make a decision tree from the iris data

Taken from Google's "Visualizing a Decision Tree" (Machine Learning Recipes #2)


In [1]:
import numpy as np
from sklearn import tree
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()

In [3]:
iris.keys()


Out[3]:
dict_keys(['feature_names', 'DESCR', 'data', 'target', 'target_names'])

In [4]:
iris.feature_names


Out[4]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
iris.target_names


Out[5]:
array(['setosa', 'versicolor', 'virginica'], 
      dtype='<U10')

In [6]:
# Withhold 3 examples (one per class) for testing
test_idx = [0, 50, 100]

train_data = np.delete(iris.data, test_idx, axis=0)
train_target = np.delete(iris.target, test_idx)

In [7]:
test_target = iris.target[test_idx]  # array([0, 1, 2])

In [8]:
test_data = iris.data[test_idx]  # array([[ 5.1,  3.5,  1.4,  0.2], [ 7. ,  3.2,  4.7,  1.4], ...])
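
For reference, the more common approach is a randomized split; this recipe hand-picks one row per class instead. A minimal sketch, assuming a scikit-learn version with sklearn.model_selection:

In [ ]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows at random; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=0)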

Decision Tree


In [9]:
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)


Out[9]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [10]:
# Predict labels for the three held-out rows
clf.predict(test_data)


Out[10]:
array([0, 1, 2])

In [11]:
# Do the predictions match the held-out labels?
clf.predict(test_data) == test_target


Out[11]:
array([ True,  True,  True], dtype=bool)
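
The learned splits can also be inspected as plain text. A sketch assuming scikit-learn >= 0.21, where tree.export_text is available:

In [ ]:
# Prints one line per node: the split threshold, or the predicted class at a leaf.
print(tree.export_text(clf, feature_names=iris.feature_names))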

Write tree output


In [12]:
from sklearn.externals.six import StringIO  # removed in newer scikit-learn; use `from io import StringIO` there
import pydotplus  # pydotplus installed for Python 3 compatibility

In [13]:
dot_data = StringIO()

tree.export_graphviz(clf, 
                     out_file=dot_data, 
                     feature_names=iris.feature_names, 
                     class_names=iris.target_names, 
                     filled=True, 
                     rounded=True, 
                     impurity=False)

In [14]:
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

In [15]:
# Graphviz installed on macOS with `brew install graphviz`
graph.write_pdf('iris.pdf')

# open -a preview ~/ipython/tensorflow/iris.pdf


Out[15]:
True
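
An alternative that skips the pydotplus/graphviz dependency: scikit-learn >= 0.21 can draw the fitted tree directly with matplotlib via tree.plot_tree. A sketch:

In [ ]:
import matplotlib.pyplot as plt

# Render the fitted tree and save it to disk.
fig, ax = plt.subplots(figsize=(10, 8))
tree.plot_tree(clf, feature_names=iris.feature_names,
               class_names=iris.target_names, filled=True, ax=ax)
fig.savefig('iris_tree.png')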

Linear regression


In [17]:
# http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
import matplotlib.pyplot as plt
from sklearn import linear_model

In [19]:
# Use fit_intercept=False for a multivariate linear regression model (no intercept term)
# http://stackoverflow.com/a/24394996
clf = linear_model.LinearRegression(fit_intercept=False)
clf.fit(train_data, train_target)


Out[19]:
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [36]:
clf.predict(test_data)


Out[36]:
array([-0.08013057,  1.23215707,  2.24961979])

In [37]:
test_target


Out[37]:
array([0, 1, 2])

In [34]:
# Wikipedia: RSS "is a measure of the discrepancy between the data and an estimation model".
# Note: np.mean gives the *mean* squared error (RSS divided by the number of samples).
print("Mean squared error: %.2f" % np.mean((clf.predict(test_data) - test_target) ** 2))


Mean squared error: 0.04

In [35]:
# .score() returns the coefficient of determination R^2 of the prediction.
# 1 is perfect prediction
print('Variance score: %.2f' % clf.score(test_data, test_target))


Variance score: 0.94
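
Both numbers can be cross-checked against sklearn.metrics (a sketch; the results should match the values above):

In [ ]:
from sklearn.metrics import mean_squared_error, r2_score

pred = clf.predict(test_data)
print(mean_squared_error(test_target, pred))  # matches the np.mean(...) computation
print(r2_score(test_target, pred))            # matches clf.score(...)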

In [22]:
print('Coefficients:', clf.coef_)


Coefficients: [-0.08676014 -0.0213177   0.22568859  0.6049702 ]

In [40]:
# Pair each coefficient with its feature name; zip preserves order, so the pairing is correct (see the check below)
list(zip(clf.coef_, iris.feature_names))


Out[40]:
[(-0.086760138420102398, 'sepal length (cm)'),
 (-0.021317695845585738, 'sepal width (cm)'),
 (0.2256885930920452, 'petal length (cm)'),
 (0.60497020008502655, 'petal width (cm)')]
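
Since the model was fit with fit_intercept=False, a prediction is just the dot product of a feature row with these coefficients, which is why the pairing above lines up. A quick sanity check (not part of the original recipe):

In [ ]:
# Should reproduce clf.predict(test_data) from Out[36] above.
test_data.dot(clf.coef_)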

Logistic regression


In [57]:
clf = linear_model.LogisticRegression()

In [58]:
clf.fit(train_data, train_target)


Out[58]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
clf.predict(test_data)


Out[44]:
array([0, 1, 2])

In [45]:
clf.predict(test_data) == test_target


Out[45]:
array([ True,  True,  True], dtype=bool)

In [47]:
clf.score(test_data, test_target)


Out[47]:
1.0
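
The hard predictions hide the per-class probabilities; predict_proba exposes them. A sketch:

In [ ]:
# One row per test sample, one column per class, in clf.classes_ order.
clf.predict_proba(test_data)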

Random forest classifier


In [64]:
from sklearn.ensemble import RandomForestClassifier

In [65]:
clf = RandomForestClassifier()

In [66]:
clf.fit(train_data, train_target)


Out[66]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [69]:
clf.score(test_data, test_target)


Out[69]:
1.0

In [71]:
clf.predict(test_data) == test_target


Out[71]:
array([ True,  True,  True], dtype=bool)
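
A fitted forest also reports how much each feature contributed to its splits. A sketch using feature_importances_ (impurity-based, summing to 1):

In [ ]:
# Higher values mean the feature was more useful for splitting.
list(zip(iris.feature_names, clf.feature_importances_))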

Random forest regressor


In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
clf = RandomForestRegressor()

In [11]:
clf.fit(train_data, train_target)


Out[11]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [12]:
clf.score(test_data, test_target)


Out[12]:
1.0

In [13]:
clf.predict(test_data) == test_target


Out[13]:
array([ True,  True,  True], dtype=bool)
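
The regressor returns floats, so the exact == comparison above only holds because the averaged tree predictions landed exactly on the integer labels; a tolerance-based check is safer. A sketch:

In [ ]:
# Compare within a tolerance instead of requiring exact float equality.
np.isclose(clf.predict(test_data), test_target)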

In [ ]: