In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
# BaggingClassifier does not have feature_importance exposed
#from sklearn.ensemble import BaggingClassifier
#from sklearn.tree import DecisionTreeClassifier
import lime
import lime.lime_tabular
from __future__ import print_function
np.random.seed(1)
%matplotlib nbagg
In [5]:
# Pima Indians Diabetes dataset: 768 rows, 8 numeric predictors plus a
# binary 'class' target (last column).
# NOTE(review): UCI has since withdrawn this dataset from its repository;
# if this URL 404s, point it at a mirror -- confirm availability.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# The final entry 'class' names the target column, not a model input.
input_feature_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
input_dataframe = pd.read_csv(url, names=input_feature_names)
In [325]:
# To get more info about the data: %pycat fetches and pretty-prints the
# dataset's accompanying .names description file in a pager.
%pycat https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.names
In [6]:
# First rows of the raw frame; as the cell's last expression this renders
# as a rich HTML table instead of plain print() text.
input_dataframe.head()
This is a binary classification problem: the `class` column takes one of two values.
In [9]:
# Distinct target labels -- reused later as LIME's class_names.
input_class_names = input_dataframe['class'].unique()
print("Unique class names: {}".format(input_class_names))

# Single seed, defined BEFORE first use (the original hard-coded
# random_state=0 and only defined `seed` afterwards).
seed = 0

# Not idealistically splitting into train, test and holdout
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=seed)

# Inputs are the first 8 columns; the target is the 9th ('class').
array = input_dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
print(X[0])
print(sss.get_n_splits(X, Y))
In [10]:
# NOTE(review): sss was built with n_splits=2, so this loop yields two
# splits and only the LAST one's train/test arrays survive; the first split
# is computed and discarded.  Use n_splits=1 if a single split is intended.
# (Loop-body indentation restored -- it was stripped in the export.)
for train_index, test_index in sss.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
print('Train set size: {}'.format(len(X_train)))
print('Test set size: {}'.format(len(X_test)))
In [16]:
# Fit a 500-tree random forest on all 8 features; oob_score=True gives a
# free out-of-bag estimate of generalization performance.
num_trees = 500
init_model = RandomForestClassifier(n_estimators=num_trees,
                                    random_state=seed,
                                    oob_score=True)
rf_model = init_model.fit(X_train, Y_train)
print(rf_model)
In [330]:
# Class-membership probabilities for every test row; as the cell's last
# expression, the resulting array is rendered as the cell output.
rf_model.predict_proba(X=X_test)
Out[330]:
In [17]:
# Held-out performance of the full-feature forest: per-class
# precision / recall / F1 on the test split.
test_predictions = rf_model.predict(X_test)
print(classification_report(y_pred=test_predictions, y_true=Y_test))
In [18]:
# Create an instance of the local (LIME) explainer.
# BUG FIX: feature_names must describe only the 8 model inputs; the original
# passed all 9 column names including the target 'class', which would
# mislabel features in the generated explanations.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train,
    feature_names=input_feature_names[:-1],
    class_names=input_class_names,
    discretize_continuous=True)
In [42]:
#i = np.random.randint(0, X_test.shape[0])
i = 37  # fixed index so the walkthrough is reproducible
print("Index:{}".format(i))
print("Actual Data:{}".format(X_test[i]))
print("Ground truth for index {}: {}".format(i, Y_test[i]))
# BUG FIX: sklearn expects a 2-D array even for a single sample; the original
# passed the 1-D row X_test[i] directly, which modern sklearn rejects.
single_row = X_test[i].reshape(1, -1)
print("Predicted label for index {}: {}".format(i, rf_model.predict(single_row)))
print(rf_model.predict_proba(X=single_row))
print("Shape of the test dataset:{}".format(X_test.shape))
total_no_of_columns = X_test.shape[1]
# (Dropped the stray `%prun` line magic -- with no statement on the same
# line it profiled nothing.)
exp = explainer.explain_instance(X_test[i], rf_model.predict_proba,
                                 num_features=total_no_of_columns)
In [32]:
# Show the explanation as (feature, weight) pairs and as a bar chart.
print(exp.as_list())
# TODO resolved: size the figure explicitly instead of printing its repr.
fig = exp.as_pyplot_figure()
fig.set_size_inches(8, 5)
fig.tight_layout()
plt.show()
In [149]:
# exp.show_in_notebook() renders an interactive HTML view of the explanation;
# it currently errors in this environment (presumably missing notebook
# HTML/JS display support -- TODO confirm), so it stays disabled for now.
# exp.show_in_notebook()
In [43]:
# Global variable importance: rank-order the features by the forest's
# impurity-based importances.
importances = rf_model.feature_importances_
forest = rf_model
# Spread of each feature's importance across the individual trees; used as
# error bars in the plot below.  (Continuation indentation restored.)
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking.
# TODO resolved: indices are mapped to human-readable feature names
# (indices are always < 8, so they never hit the trailing 'class' entry).
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, input_feature_names[indices[f]],
                                   importances[indices[f]]))

# Plot the feature importances of the forest, labeled by feature name.
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), [input_feature_names[j] for j in indices])
plt.xlim([-1, X.shape[1]])
plt.show()
In [232]:
# The information provided by Variable Importance is at the global level.
In [44]:
# The question now is: can performance improve if the model is restricted
# to the top-ranked features?
num_trees = 500
init_model = RandomForestClassifier(n_estimators=num_trees,
                                    random_state=seed,
                                    oob_score=True)
# Columns retained for the reduced model, plus their names for LIME.
feature_indexes = [3, 5, 6, 7]
sliced_feature_names = ['skin', 'mass', 'pedi', 'age']
In [45]:
# Train on the reduced feature set and evaluate on the matching test slice.
reduced_train = X_train[:, feature_indexes]
rf_sub_model = init_model.fit(reduced_train, Y_train)
reduced_test_predictions = rf_sub_model.predict(X_test[:, feature_indexes])
print(classification_report(y_pred=reduced_test_predictions, y_true=Y_test))
In [50]:
i = 37  # same row as before, for a direct comparison with the full model
print("Index:{}".format(i))
print("Actual Data:{}".format(X_test[i, feature_indexes]))
print("Ground truth for index {}: {}".format(i, Y_test[i]))
# BUG FIX: single samples must be 2-D for sklearn's predict/predict_proba;
# the original passed the 1-D slice directly.
single_row = X_test[i, feature_indexes].reshape(1, -1)
print("Predicted label for index {}: {}".format(i, rf_sub_model.predict(single_row)))
print("Likelihood of occurrence of each class: {}".format(rf_sub_model.predict_proba(X=single_row)))
# Keep the 1-D slice around: LIME's explain_instance takes a 1-D row.
input_array = X_test[i, feature_indexes]
print("Sliced array shape: {}".format(input_array.shape))
total_no_of_columns = input_array.shape[0]
print("Number of columns in the sliced array: {}".format(total_no_of_columns))
#%prun
print("Local input :{}".format(input_array))
print("Local input without slicing :{}".format(X_test[i]))
In [52]:
# As seen above, the prediction for this particular input improved.  Inspecting
# the explanation confirms it: each of the retained features contributes with
# high weight toward the predicted class.
In [51]:
# Local explanation for the reduced model, over only the 4 kept features.
# (Continuation indentation restored -- it was stripped in the export.)
explainer2 = lime.lime_tabular.LimeTabularExplainer(
    X_train[:, feature_indexes],
    feature_names=sliced_feature_names,
    class_names=input_class_names,
    discretize_continuous=True)
exp2 = explainer2.explain_instance(input_array, rf_sub_model.predict_proba,
                                   num_features=total_no_of_columns)
print(exp2.as_list())
# Render the chart instead of printing the Figure object's repr.
fig2 = exp2.as_pyplot_figure()
fig2.tight_layout()
plt.show()
In [ ]: