In [1]:
from __future__ import print_function

import numpy as np
import sklearn.ensemble
import sklearn.model_selection
from sklearn.datasets import load_boston
In [2]:
# Load the Boston housing dataset shipped with scikit-learn. Later cells read
# boston.data (features), boston.target (regression target) and
# boston.feature_names.
# NOTE(review): load_boston was deprecated and later removed from scikit-learn
# (1.2) -- confirm the scikit-learn version this notebook is pinned to.
boston = load_boston()
In [3]:
# Random-forest regressor with 1000 estimators; it is fitted in a later cell.
# NOTE(review): no random_state is passed, so the fitted model is presumably
# not reproducible across runs -- confirm whether that matters here.
rf = sklearn.ensemble.RandomForestRegressor(n_estimators=1000)
In [4]:
# Split features and target into training (80%) and test (remaining 20%) sets.
# train_test_split is taken from sklearn.model_selection: the older
# sklearn.cross_validation module was removed in scikit-learn 0.20 and was
# never imported by this notebook.
# NOTE(review): no random_state is passed, so the split -- and every number
# printed further down -- differs from run to run.
train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(
    boston.data, boston.target, train_size=0.80
)
In [5]:
# Fit the forest on the training split. The call is the cell's last expression,
# so its return value is what the notebook shows as this cell's output.
rf.fit(train, labels_train)
Out[5]:
In [6]:
# Mean squared error of the fitted forest on the held-out test split.
rf_residuals = rf.predict(test) - labels_test
print('Random Forest MSError', np.mean(rf_residuals ** 2))
In [7]:
# Baseline for comparison: predict the training-set mean target for every test
# row and report the resulting mean squared error.
# The printed label previously read "the mean mean" (duplicated word).
baseline_residuals = labels_train.mean() - labels_test
print('MSError when predicting the mean', np.mean(baseline_residuals ** 2))
In [8]:
# Display the dataset's feature (column) names; they are passed to the LIME
# explainer below.
boston.feature_names
Out[8]:
In [9]:
# Column indices treated as categorical: any feature that takes at most 10
# distinct values across the whole dataset.
distinct_counts = np.array([len(set(column)) for column in boston.data.T])
categorical_features = np.flatnonzero(distinct_counts <= 10)
In [10]:
import lime
import lime.lime_tabular
In [11]:
# Build the LIME tabular explainer from the training split. The regression
# target is exposed as a single class named 'price'.
# NOTE(review): newer lime releases appear to accept mode='regression';
# confirm against the installed lime version before relying on this setup.
explainer = lime.lime_tabular.LimeTabularExplainer(
    train,
    feature_names=boston.feature_names,
    class_names=['price'],
    categorical_features=categorical_features,
    verbose=True,
)
In [12]:
def predict_fn(x):
    """Return ``rf.predict(x)`` reshaped into a single-column 2-D array."""
    predictions = rf.predict(x)
    return predictions.reshape(-1, 1)
In [13]:
# Index of the test row with the lowest predicted value. predict_fn returns a
# single column, so the flattened argmin equals the row index.
np.argmin(predict_fn(test))
Out[13]:
In [14]:
# Row of the test split to explain.
# NOTE(review): the index is hard-coded; it presumably came from the argmin
# cell on the author's run. The train/test split is not seeded, so on a fresh
# run this may point at a different instance -- confirm.
i = 8
# Explain that row: labels=[0] targets the only output column of predict_fn,
# and num_features=5 limits how many features appear in the explanation.
exp = explainer.explain_instance(test[i], predict_fn, labels=[0], num_features=5)
In [15]:
# Show the explanation for label 0 in list form.
exp.as_list(0)
Out[15]:
In [16]:
# Compare the model's prediction for the explained row with its true target.
instance_2d = test[i].reshape(1, -1)  # single row reshaped into a 2-D batch
print('Prediction', predict_fn(instance_2d)[0, 0])
print('True', labels_test[i])
In [17]:
# Render the explanation inline in the notebook.
# NOTE(review): predict_proba=False presumably hides the class-probability
# panel, which would not be meaningful for a regressor -- confirm in lime docs.
exp.show_in_notebook(predict_proba=False)
In [18]:
# Column index of the 'LSTAT' feature.
# NOTE(review): the cells that follow hard-code index 12, presumably this
# value -- confirm they agree.
list(boston.feature_names).index('LSTAT')
Out[18]:
In [19]:
# Scale factor stored by the explainer's scaler for column 12.
# NOTE(review): 12 is presumably the LSTAT index looked up in the previous
# cell, and scale_ presumably holds per-feature standard deviations of the
# training data -- confirm both.
explainer.scaler.scale_[12]
Out[19]:
In [21]:
# Check the explanation empirically: shift one feature of the explained row
# and compare the model's prediction before and after.
lstat_col = 12  # NOTE(review): presumably the LSTAT column index -- confirm
# NOTE(review): the factor 1.22 is not explained in this notebook; presumably
# it was read off the explanation output -- confirm.
shift = 1.22 * explainer.scaler.scale_[lstat_col]

x = test[i].copy()  # work on a copy so the test split itself is not modified
before = predict_fn(x.reshape(1, -1))

x[lstat_col] -= shift
after = predict_fn(x.reshape(1, -1))

print('Before', before)
print('After', after)
print('Difference', after - before)