In [1]:
# Reference: https://github.com/andosa/treeinterpreter
# Blog: http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/

from treeinterpreter import treeinterpreter as ti
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [2]:
# Load the Boston housing dataset (13 features per sample).
# NOTE(review): load_boston is deprecated since scikit-learn 1.0 and removed in
# 1.2 (ethical concerns with feature B) — this notebook requires an older sklearn.
# The redundant RandomForestRegressor() created here was removed: it was never
# used and is recreated in the In[16] cell before fitting.
from sklearn.datasets import load_boston
boston = load_boston()

In [13]:
boston.data[:300,].shape


Out[13]:
(300, 13)

In [16]:
# Fit a forest on the first 300 rows; rows 300+ are held out for interpretation below.
# NOTE(review): no random_state is set, so predictions and contributions will
# differ between runs — consider RandomForestRegressor(random_state=...) for
# reproducibility (left unchanged here to match the recorded outputs).
rf = RandomForestRegressor()
fit1 = rf.fit(boston.data[:300], boston.target[:300])

In [17]:
fit1


Out[17]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [37]:
instances = boston.data[[300, 309]]
print "Instance 0 prediction:", rf.predict(instances[0].reshape(1,13))
print "Instance 1 prediction:", rf.predict(instances[1].reshape(1,13))


Instance 0 prediction: [ 29.8]
Instance 1 prediction: [ 22.48]

In [38]:
prediction, bias, contributions = ti.predict(rf, instances)

In [40]:
for i in range(len(instances)):
    print "Instance", i
    print "Bias (trainset mean)", bias[i]
    print "Feature contributions:"
    for c, feature in sorted(zip(contributions[i], 
                                 boston.feature_names), 
                             key=lambda x: -abs(x[0])):
        print feature, round(c, 2)
    print "-"*20


 Instance 0
Bias (trainset mean) 25.5985333333
Feature contributions:
RM 3.94
LSTAT 0.59
INDUS 0.51
CRIM -0.39
PTRATIO 0.35
DIS -0.3
TAX -0.29
B -0.19
NOX 0.18
AGE -0.11
ZN -0.06
CHAS -0.04
RAD 0.02
--------------------
Instance 1
Bias (trainset mean) 25.5985333333
Feature contributions:
RM -5.14
LSTAT 2.27
INDUS -0.35
TAX -0.31
CRIM 0.21
AGE 0.21
B -0.12
NOX -0.09
PTRATIO 0.08
ZN 0.06
DIS 0.06
RAD -0.01
CHAS 0.0
--------------------

In [42]:
# Sanity check: the decomposition is exact — bias plus the per-sample sum of
# contributions reproduces the model's predictions (both lines print the same values).
print prediction
print bias + np.sum(contributions, axis=1)


[ 29.8   22.48]
[ 29.8   22.48]

In [43]:
# For comparison: sklearn's built-in (impurity-based) feature importances.
# Unlike treeinterpreter's per-instance contributions, these are global,
# per-model values (one per feature, summing to 1).
fit1.feature_importances_


Out[43]:
array([ 0.01602412,  0.00129815,  0.00440821,  0.00173291,  0.00392606,
        0.81480729,  0.01405132,  0.00794089,  0.00397147,  0.0120137 ,
        0.01252419,  0.01006302,  0.09723867])

In [44]:
# treeinterpreter uses the estimator's apply() method to retrieve the leaf
# indices, from which each tree's decision path is reconstructed.

rf.apply


Out[44]:
<bound method RandomForestRegressor.apply of RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)>

In [47]:
rf.apply(instances)


Out[47]:
array([[311, 283, 265, 296,  95, 118, 254, 308, 104, 301],
       [117,  79,  74,  82,  48,  33, 121, 104,  73,  63]])

In [ ]: