In [1]:
import lime
import sklearn
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
In [2]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to the two vehicle-related newsgroups.
categories = ['rec.autos', 'rec.motorcycles']


def _load_split(split):
    """Fetch one split ('train' or 'test') of 20 Newsgroups limited to `categories`."""
    return fetch_20newsgroups(subset=split, categories=categories)


# Same category filter for both splits; train is fetched before test.
newsgroups_train = _load_split('train')
newsgroups_test = _load_split('test')

# Short display names, one per entry of `categories` and in the same order.
class_names = ['cars', 'bikes']
In [3]:
# Import the text-feature submodule explicitly. A bare `import sklearn` does not
# load `sklearn.feature_extraction.text`, so the attribute lookup below would
# otherwise only succeed if some other import happened to load it first.
import sklearn.feature_extraction.text

# Build lowercased TF-IDF features. The vectorizer is fitted on the training
# documents only; the test documents are transformed with that same fitted
# vectorizer so both matrices share one feature space.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=True)
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)
In [4]:
# Train a 500-tree random forest on the TF-IDF training matrix.
# random_state is pinned so that Restart & Run All rebuilds the same forest,
# and therefore reproduces the same scores and explanations downstream.
# Without it, the fitted model differs from run to run.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, random_state=42)
rf.fit(train_vectors, newsgroups_train.target)
Out[4]:
In [6]:
# Predict labels for the held-out test matrix.
pred = rf.predict(test_vectors)

# Report the binary F1 score first, then plain accuracy, each on its own line.
print(sklearn.metrics.f1_score(y_true=newsgroups_test.target, y_pred=pred, average='binary'))
print(sklearn.metrics.accuracy_score(y_true=newsgroups_test.target, y_pred=pred))
Explaining predictions
In [7]:
from lime import lime_text
from sklearn.pipeline import make_pipeline

# Chain the fitted vectorizer and the fitted forest so that `c` accepts raw
# text: the vectorizer runs first and feeds its output to the classifier.
pipeline_steps = (vectorizer, rf)
c = make_pipeline(*pipeline_steps)
In [8]:
from lime.lime_text import LimeTextExplainer

# class_names supplies the display names used in the explanation output.
# random_state is pinned because LIME builds each explanation from randomly
# perturbed copies of the document; without a seed, the reported feature
# weights change from one run of the notebook to the next.
explainer = LimeTextExplainer(class_names=class_names, random_state=42)
In [22]:
# Pick one held-out document to explain.
idx = 42
document = newsgroups_test.data[idx]
true_label = newsgroups_test.target[idx]

# Explain the pipeline's prediction for this document, requesting 10 features.
exp = explainer.explain_instance(document, c.predict_proba, num_features=10)

print('Document id: %d' % idx)

# predict_proba takes a list of texts; take row 0, column 0 of the result,
# which is reported under the label "car".
car_probability = c.predict_proba([document])[0, 0]
print('Probability(car) =', car_probability)

print('True class: %s' % class_names[true_label])

# The explanation as a list, printed in plain text.
print(exp.as_list())
In [14]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Draw the explanation held in `exp` as a matplotlib figure and keep a handle to it.
fig = exp.as_pyplot_figure()
In [15]:
# Display the explanation in the notebook output with text=False
# (presumably this omits the highlighted document text — confirm against the lime docs).
exp.show_in_notebook(text=False)
In [16]:
# Display the explanation again, this time with text=True
# (presumably this also shows the document text with highlighted words — confirm against the lime docs).
exp.show_in_notebook(text=True)
In [ ]: