Trees and Forests

NOTE: This module code was partly taken from Andreas Muellers Adavanced scikit-learn O'Reilly Course

It is just used to explore the scikit-learn random forest object in a systematic manner

I've added more code to it to understand how to generate tree plots for random forests



In [21]:

    
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

Decision Tree Classification



In [8]:

    
%%bash
pwd
ls









    



/Users/shamindras/PERSONAL/LEARNING/STUDY/UC_BERKELEY/RESEARCH/SPRING_2017/scikit-learn-sandbox/jupyter
Trees and Forests.ipynb
figures



In [12]:

    
from figures import plot_interactive_tree
plot_interactive_tree.plot_tree_interactive()

Random Forests



In [15]:

    
from figures import plot_interactive_forest
plot_interactive_forest.plot_forest_interactive()

Selecting the Optimal Estimator via Cross-Validation



In [36]:

    
from sklearn import grid_search
from sklearn import tree
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

digits = load_digits()
X, y = digits.data, digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
parameters = {'max_features':['sqrt', 'log2'],
              'max_depth':[5, 7, 9]}

clf_grid = grid_search.GridSearchCV(rf, parameters)
clf_grid.fit(X_train, y_train)









    Out[36]:





GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': ['sqrt', 'log2'], 'max_depth': [5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)



In [17]:

    
clf_grid.score(X_train, y_train)









    Out[17]:





1.0



In [18]:

    
clf_grid.score(X_test, y_test)









    Out[18]:





0.97111111111111115



In [19]:

    
clf_grid.best_params_









    Out[19]:





{'max_depth': 9, 'max_features': 'sqrt'}



In [20]:

    
clf_grid.best_estimator_









    Out[20]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

Fit the forest manually



In [80]:

    
rf = RandomForestClassifier(n_estimators=5, n_jobs=-1)
rf.fit(X_train, y_train)









    Out[80]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)



In [81]:

    
rf.score(X_test, y_test)









    Out[81]:





0.91555555555555557



In [82]:

    
print([estimator.tree_.max_depth for estimator in rf.estimators_])









    



[13, 16, 17, 12, 14]



In [86]:

    
for idx, dec_tree in enumerate(rf.estimators_):
    if idx == 0:
        print(dec_tree.tree_.max_depth)
    else:
        pass



In [87]:

    
for idx, dec_tree in enumerate(rf.estimators_):
    if idx == 0:
        tree.export_graphviz(dec_tree)









    



/Users/shamindras/anaconda/envs/sklearnprod0/lib/python3.6/site-packages/sklearn/tree/export.py:386: DeprecationWarning: out_file can be set to None starting from 0.18. This will be the default in 0.20.
  DeprecationWarning)



In [88]:

    
from sklearn import tree
i_tree = 0
for tree_in_forest in rf.estimators_:
    if i_tree ==0:
        with open('tree_' + str(i_tree) + '.png', 'w') as my_file:
            my_file = tree.export_graphviz(tree_in_forest, out_file = my_file)
        i_tree = i_tree + 1
    else:
        pass



In [ ]:

    
import io
from scipy import misc
from sklearn import tree
import pydot

def show_tree(decisionTree, file_path):
    dotfile = io.StringIO()
    tree.export_graphviz(decisionTree, out_file=dotfile)
    (graph,)=pydot.graph_from_dot_data(dotfile.getvalue())
    #pydot.graph_from_dot_data(dotfile.getvalue()).write_png(file_path)
    graph.write_png(file_path)
    i = misc.imread(file_path)
    plt.imshow(i)



In [ ]:

    
from sklearn import tree
i_tree = 0
for tree_in_forest in rf.estimators_:
    if i_tree ==0:
        show_tree(tree_in_forest, 'test.png')