In [3]:
# Install the lime package, which is imported further down in this notebook.
# NOTE(review): `sudo pip` installs into whichever Python the system pip
# belongs to, which may not be this kernel's interpreter — confirm.
!sudo pip install lime
In [4]:
def shuffle(df, n=1, axis=0):
    """Return a copy of ``df`` whose columns (or rows) are shuffled independently.

    The previous implementation called ``df.apply(np.random.shuffle, ...)`` and
    relied on ``apply`` handing out mutable views of the frame; whenever pandas
    passes copies instead, the frame came back unshuffled without any error.
    The values are now permuted and written back explicitly by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle; it is never modified.
    n : int
        Number of shuffle passes. ``n=0`` returns an unshuffled copy.
    axis : {0, 'index', 1, 'columns'}
        0 / 'index' permutes the values inside each column;
        1 / 'columns' permutes the values inside each row.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy with the same index and columns as ``df``.

    Raises
    ------
    ValueError
        If ``axis`` is not one of the accepted values.
    """
    if axis in (0, 'index'):
        within_columns = True
    elif axis in (1, 'columns'):
        within_columns = False
    else:
        raise ValueError('No axis named {0!r}'.format(axis))

    shuffled = df.copy()
    n_rows, n_cols = shuffled.shape
    for _ in range(n):
        if within_columns:
            for col_pos in range(n_cols):
                # permutation() returns a shuffled copy, so nothing depends on
                # in-place mutation reaching the frame.
                shuffled.iloc[:, col_pos] = np.random.permutation(
                    shuffled.iloc[:, col_pos].values)
        else:
            for row_pos in range(n_rows):
                shuffled.iloc[row_pos, :] = np.random.permutation(
                    shuffled.iloc[row_pos, :].values)
    return shuffled
def myround(x, prec=2, base=.05):
    """Snap ``x`` to the nearest multiple of ``base``, rounded to ``prec`` decimals.

    ``x`` is converted with ``float()`` first, so numeric strings are accepted.
    """
    # Number of whole `base` steps closest to x.
    step_count = round(float(x) / base)
    snapped = base * step_count
    # The second round() trims the floating-point noise left by the multiply.
    return round(snapped, prec)
def flatten(l):
    """Concatenate the items of every iterable in ``l`` into one list.

    Only one level of nesting is removed; deeper containers are kept as items.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat
In [5]:
import os
import warnings

import lime
import numpy as np
import pandas as pd
import psycopg2
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and pandas
# chained-assignment warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
In [6]:
# Open the database connection. Every credential is read from a REDSHIFT_*
# environment variable so that no secret is stored in the notebook.
_required_env = ('REDSHIFT_PORT', 'REDSHIFT_PASS', 'REDSHIFT_USER', 'REDSHIFT_HOST')
_missing_env = [name for name in _required_env if name not in os.environ]
if _missing_env:
    # Name every missing variable at once instead of failing on the first
    # lookup; the exception type is the KeyError that os.environ[...] raises.
    raise KeyError('missing environment variables: ' + ', '.join(_missing_env))

conn = psycopg2.connect(
    database='tradesy',
    port=os.environ['REDSHIFT_PORT'],
    password=os.environ['REDSHIFT_PASS'],
    user=os.environ['REDSHIFT_USER'],
    host=os.environ['REDSHIFT_HOST']
)
In [7]:
# Load up to 10,000 rows of the saleability_model_v2 table into a DataFrame.
# NOTE(review): LIMIT without ORDER BY does not fix which rows come back, so
# the sample may differ between runs — confirm this is acceptable.
query = """
select * from saleability_model_v2 limit 10000
"""
df = pd.read_sql(query, conn)
In [20]:
# Predictor columns, in the order they are fed to the model.
# NOTE(review): purchase_count sits among the predictors while the target
# used later is purchase_dummy — check for target leakage.
domain = [
    u'shipping_price_ratio',
    u'asking_price',
    u'price_level',
    u'brand_score',
    u'a_over_b',
    u'a',
    u'favorite_count',
    u'b',
    u'purchase_count',
    u'has_blurb',
    u'has_image',
    u'seasonal_component',
    u'description_length',
    u'sold_similarity',
    u'unsold_similarity',
    u'description_similarity_delta',
]
In [21]:
# Columns of `domain` to be treated as categorical rather than continuous.
# These are column *names*; LIME's `categorical_features` parameter is
# documented as a list of column indices, so they must be mapped to their
# positions in `domain` before being handed to the explainer.
categorical_features = [u'has_blurb', u'has_image']
In [22]:
# Target is the purchase_dummy column; predictors are the columns in `domain`.
y = df.purchase_dummy
X = df[domain]
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and everything computed from it, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42)
In [23]:
# 250-tree random forest. The fixed random_state makes the fitted forest (and
# the explanations derived from it) repeatable across kernel restarts.
# NOTE(review): newer scikit-learn releases spell class_weight='auto' as
# 'balanced' — confirm against the installed version before changing it.
rf = RandomForestClassifier(n_estimators=250, class_weight='auto', random_state=42)
rf.fit(X_train, y_train)
Out[23]:
In [12]:
from sklearn.metrics import classification_report

# Predict the held-out rows and print the classification report for them.
pred = rf.predict(X_test)
# The parenthesised single-argument form prints the same text under Python 2
# and is also valid Python 3, unlike the bare `print x` statement.
print(classification_report(y_test, pred))
In [13]:
from lime.lime_tabular import LimeTabularExplainer
In [14]:
# LIME needs float inputs, so cast every column (integers included) to float.
# A whole-frame astype returns new frames instead of writing column by column
# into the slices produced by train_test_split, which is the pattern that
# triggers pandas' chained-assignment warning.
X_train = X_train.astype(float)
X_test = X_test.astype(float)
In [16]:
# LimeTabularExplainer documents `categorical_features` as column indices, so
# map the column names onto their positions in `domain` (X was built as
# df[domain], so positions in `domain` match the column order of the array).
categorical_feature_indices = [domain.index(name) for name in categorical_features]

# Rows with missing values are dropped from the training array given to LIME.
explainer = LimeTabularExplainer(
    X_train.dropna().values,
    feature_names=domain,
    categorical_features=categorical_feature_indices
)
In [18]:
# Explain the model's prediction for the test row at position 3. LIME's
# `data_row` parameter is documented as a 1-D numpy array, so pass the row's
# values rather than the pandas Series itself.
explained_observation = explainer.explain_instance(
    X_test.iloc[3].values, rf.predict_proba)
In [19]:
# Display the explanation in the notebook output with the feature table
# turned on (show_table=True) and show_all left off.
explained_observation.show_in_notebook(show_table=True, show_all=False)
In [ ]: