In [1]:
# Reference: https://github.com/andosa/treeinterpreter
# Blog: http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/

from treeinterpreter import treeinterpreter as ti
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [2]:
# Load the Boston housing dataset (13 features per sample).
# NOTE(review): load_boston is deprecated since scikit-learn 1.0 and removed in
# 1.2 (ethical concerns with feature B) — this notebook requires an older sklearn.
# The redundant RandomForestRegressor() created here was removed: it was never
# used and is recreated in the In[16] cell before fitting.
from sklearn.datasets import load_boston
boston = load_boston()

In [13]:
boston.data[:300,].shape


Out[13]:
(300, 13)

In [16]:
# Fit a forest on the first 300 rows; rows 300+ are held out for interpretation below.
# NOTE(review): no random_state is set, so predictions and contributions will
# differ between runs — consider RandomForestRegressor(random_state=...) for
# reproducibility (left unchanged here to match the recorded outputs).
rf = RandomForestRegressor()
fit1 = rf.fit(boston.data[:300], boston.target[:300])

In [17]:
fit1


Out[17]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [37]:
instances = boston.data[[300, 309]]
print "Instance 0 prediction:", rf.predict(instances[0].reshape(1,13))
print "Instance 1 prediction:", rf.predict(instances[1].reshape(1,13))


Instance 0 prediction: [ 29.8]
Instance 1 prediction: [ 22.48]

In [38]:
prediction, bias, contributions = ti.predict(rf, instances)

In [40]:
for i in range(len(instances)):
    print "Instance", i
    print "Bias (trainset mean)", bias[i]
    print "Feature contributions:"
    for c, feature in sorted(zip(contributions[i], 
                                 boston.feature_names), 
                             key=lambda x: -abs(x[0])):
        print feature, round(c, 2)
    print "-"*20


 Instance 0
Bias (trainset mean) 25.5985333333
Feature contributions:
RM 3.94
LSTAT 0.59
INDUS 0.51
CRIM -0.39
PTRATIO 0.35
DIS -0.3
TAX -0.29
B -0.19
NOX 0.18
AGE -0.11
ZN -0.06
CHAS -0.04
RAD 0.02
--------------------
Instance 1
Bias (trainset mean) 25.5985333333
Feature contributions:
RM -5.14
LSTAT 2.27
INDUS -0.35
TAX -0.31
CRIM 0.21
AGE 0.21
B -0.12
NOX -0.09
PTRATIO 0.08
ZN 0.06
DIS 0.06
RAD -0.01
CHAS 0.0
--------------------

In [42]:
# Sanity check: the decomposition is exact — bias plus the per-sample sum of
# contributions reproduces the model's predictions (both lines print the same values).
print prediction
print bias + np.sum(contributions, axis=1)


[ 29.8   22.48]
[ 29.8   22.48]

In [43]:
# For comparison: sklearn's built-in (impurity-based) feature importances.
# Unlike treeinterpreter's per-instance contributions, these are global,
# per-model values (one per feature, summing to 1).
fit1.feature_importances_


Out[43]:
array([ 0.01602412,  0.00129815,  0.00440821,  0.00173291,  0.00392606,
        0.81480729,  0.01405132,  0.00794089,  0.00397147,  0.0120137 ,
        0.01252419,  0.01006302,  0.09723867])

In [44]:
# treeinterpreter uses the estimator's apply() method to retrieve the leaf
# indices, from which each tree's decision path is reconstructed.

rf.apply


Out[44]:
<bound method RandomForestRegressor.apply of RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)>

In [47]:
rf.apply(instances)


Out[47]:
array([[311, 283, 265, 296,  95, 118, 254, 308, 104, 301],
       [117,  79,  74,  82,  48,  33, 121, 104,  73,  63]])

In [ ]: