Make a decision tree from the iris data

Taken from Google's "Visualizing a Decision Tree" (Machine Learning Recipes #2)


In [1]:
import numpy as np
from sklearn import tree
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()

In [3]:
iris.keys()


Out[3]:
dict_keys(['feature_names', 'DESCR', 'data', 'target', 'target_names'])

In [4]:
iris.feature_names


Out[4]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
iris.target_names


Out[5]:
array(['setosa', 'versicolor', 'virginica'], 
      dtype='<U10')

In [6]:
# Withhold 3 examples (one per class) for testing
test_idx = [0, 50, 100]

train_data = np.delete(iris.data, test_idx, axis=0)
train_target = np.delete(iris.target, test_idx)

In [7]:
test_target = iris.target[test_idx]  # array([0, 1, 2])

In [8]:
test_data = iris.data[test_idx]  # array([[ 5.1,  3.5,  1.4,  0.2], [ 7. ,  3.2,  4.7,  1.4], ...])
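
For reference, the more common approach is a randomized split; this recipe hand-picks one row per class instead. A minimal sketch, assuming a scikit-learn version with sklearn.model_selection:

In [ ]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows at random; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=0)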

Decision Tree


In [9]:
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)


Out[9]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [10]:
# Predict labels for the three held-out rows
clf.predict(test_data)


Out[10]:
array([0, 1, 2])

In [11]:
# Do the predictions match the held-out labels?
clf.predict(test_data) == test_target


Out[11]:
array([ True,  True,  True], dtype=bool)
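
The learned splits can also be inspected as plain text. A sketch assuming scikit-learn >= 0.21, where tree.export_text is available:

In [ ]:
# Prints one line per node: the split threshold, or the predicted class at a leaf.
print(tree.export_text(clf, feature_names=iris.feature_names))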

Write tree output


In [12]:
from sklearn.externals.six import StringIO  # removed in newer scikit-learn; use `from io import StringIO` there
import pydotplus  # pydotplus installed for Python 3 compatibility

In [13]:
dot_data = StringIO()

tree.export_graphviz(clf, 
                     out_file=dot_data, 
                     feature_names=iris.feature_names, 
                     class_names=iris.target_names, 
                     filled=True, 
                     rounded=True, 
                     impurity=False)

In [14]:
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

In [15]:
# Graphviz installed on macOS with `brew install graphviz`
graph.write_pdf('iris.pdf')

# open -a preview ~/ipython/tensorflow/iris.pdf


Out[15]:
True
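
An alternative that skips the pydotplus/graphviz dependency: scikit-learn >= 0.21 can draw the fitted tree directly with matplotlib via tree.plot_tree. A sketch:

In [ ]:
import matplotlib.pyplot as plt

# Render the fitted tree and save it to disk.
fig, ax = plt.subplots(figsize=(10, 8))
tree.plot_tree(clf, feature_names=iris.feature_names,
               class_names=iris.target_names, filled=True, ax=ax)
fig.savefig('iris_tree.png')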

Linear regression


In [17]:
# http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
import matplotlib.pyplot as plt
from sklearn import linear_model

In [19]:
# Use fit_intercept=False for a multivariate linear regression model (no intercept term)
# http://stackoverflow.com/a/24394996
clf = linear_model.LinearRegression(fit_intercept=False)
clf.fit(train_data, train_target)


Out[19]:
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [36]:
clf.predict(test_data)


Out[36]:
array([-0.08013057,  1.23215707,  2.24961979])

In [37]:
test_target


Out[37]:
array([0, 1, 2])

In [34]:
# Wikipedia: RSS "is a measure of the discrepancy between the data and an estimation model".
# Note: np.mean gives the *mean* squared error (RSS divided by the number of samples).
print("Mean squared error: %.2f" % np.mean((clf.predict(test_data) - test_target) ** 2))


Mean squared error: 0.04

In [35]:
# .score() returns the coefficient of determination R^2 of the prediction.
# 1 is perfect prediction
print('Variance score: %.2f' % clf.score(test_data, test_target))


Variance score: 0.94
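
Both numbers can be cross-checked against sklearn.metrics (a sketch; the results should match the values above):

In [ ]:
from sklearn.metrics import mean_squared_error, r2_score

pred = clf.predict(test_data)
print(mean_squared_error(test_target, pred))  # matches the np.mean(...) computation
print(r2_score(test_target, pred))            # matches clf.score(...)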

In [22]:
print('Coefficients:', clf.coef_)


Coefficients: [-0.08676014 -0.0213177   0.22568859  0.6049702 ]

In [40]:
# Pair each coefficient with its feature name; zip preserves order, so the pairing is correct (see the check below)
list(zip(clf.coef_, iris.feature_names))


Out[40]:
[(-0.086760138420102398, 'sepal length (cm)'),
 (-0.021317695845585738, 'sepal width (cm)'),
 (0.2256885930920452, 'petal length (cm)'),
 (0.60497020008502655, 'petal width (cm)')]
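
Since the model was fit with fit_intercept=False, a prediction is just the dot product of a feature row with these coefficients, which is why the pairing above lines up. A quick sanity check (not part of the original recipe):

In [ ]:
# Should reproduce clf.predict(test_data) from Out[36] above.
test_data.dot(clf.coef_)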

Logistic regression


In [57]:
clf = linear_model.LogisticRegression()

In [58]:
clf.fit(train_data, train_target)


Out[58]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
clf.predict(test_data)


Out[44]:
array([0, 1, 2])

In [45]:
clf.predict(test_data) == test_target


Out[45]:
array([ True,  True,  True], dtype=bool)

In [47]:
clf.score(test_data, test_target)


Out[47]:
1.0
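
The hard predictions hide the per-class probabilities; predict_proba exposes them. A sketch:

In [ ]:
# One row per test sample, one column per class, in clf.classes_ order.
clf.predict_proba(test_data)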

Random forest classifier


In [64]:
from sklearn.ensemble import RandomForestClassifier

In [65]:
clf = RandomForestClassifier()

In [66]:
clf.fit(train_data, train_target)


Out[66]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [69]:
clf.score(test_data, test_target)


Out[69]:
1.0

In [71]:
clf.predict(test_data) == test_target


Out[71]:
array([ True,  True,  True], dtype=bool)
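
A fitted forest also reports how much each feature contributed to its splits. A sketch using feature_importances_ (impurity-based, summing to 1):

In [ ]:
# Higher values mean the feature was more useful for splitting.
list(zip(iris.feature_names, clf.feature_importances_))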

Random forest regressor


In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
clf = RandomForestRegressor()

In [11]:
clf.fit(train_data, train_target)


Out[11]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [12]:
clf.score(test_data, test_target)


Out[12]:
1.0

In [13]:
clf.predict(test_data) == test_target


Out[13]:
array([ True,  True,  True], dtype=bool)
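
The regressor returns floats, so the exact == comparison above only holds because the averaged tree predictions landed exactly on the integer labels; a tolerance-based check is safer. A sketch:

In [ ]:
# Compare within a tolerance instead of requiring exact float equality.
np.isclose(clf.predict(test_data), test_target)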

In [ ]: