In [3]:
!sudo pip install lime


Requirement already satisfied (use --upgrade to upgrade): lime in /usr/local/lib/python2.7/dist-packages
Requirement already satisfied (use --upgrade to upgrade): scipy in /usr/local/lib/python2.7/dist-packages (from lime)
Requirement already satisfied (use --upgrade to upgrade): scikit-learn in /usr/local/lib/python2.7/dist-packages (from lime)
Requirement already satisfied (use --upgrade to upgrade): numpy in /usr/local/lib/python2.7/dist-packages (from lime)
You are using pip version 7.1.2, however version 8.1.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [4]:
def shuffle(df, n=1, axis=0):     
    df = df.copy()
    for _ in range(n):
        df.apply(np.random.shuffle, axis=axis)
    return df

def myround(x, prec=2, base=.05):
  return round(base * round(float(x)/base),prec)

def flatten(l):
    return [item for sublist in l for item in sublist]

In [5]:
import lime
import numpy as np
import scipy as sp
import pandas as pd
import psycopg2
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')


/usr/local/lib/python2.7/dist-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [6]:
# cell 2: connection
conn = psycopg2.connect(
   database='tradesy',
   port=os.environ['REDSHIFT_PORT'],
   password=os.environ['REDSHIFT_PASS'],
   user=os.environ['REDSHIFT_USER'],
   host=os.environ['REDSHIFT_HOST']
  )

In [7]:
query = """
select * from saleability_model_v2 limit 10000
"""


df = pd.read_sql(query, conn)

In [20]:
domain = [u'shipping_price_ratio',
       u'asking_price', u'price_level', u'brand_score', u'a_over_b', u'a',
       u'favorite_count', u'b', u'purchase_count', u'has_blurb', u'has_image',
   u'seasonal_component', u'description_length', u'sold_similarity',
       u'unsold_similarity', u'description_similarity_delta']

In [21]:
categorical_features = [u'has_blurb', u'has_image']

In [22]:
y = df.purchase_dummy
X = df[domain]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .2)

In [23]:
rf = RandomForestClassifier(n_estimators=250, class_weight='auto')
rf.fit(X_train, y_train)


Out[23]:
RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
pred = rf.predict(X_test)
from sklearn.metrics import classification_report
print classification_report(y_test, pred)


             precision    recall  f1-score   support

      False       0.85      0.96      0.91      1662
       True       0.51      0.18      0.27       338

avg / total       0.80      0.83      0.80      2000


In [13]:
from lime.lime_tabular import LimeTabularExplainer

In [14]:
#need to convert integers to floats for lime to work
for column in X_train.columns:
    X_train[column] = X_train[column].astype(float)
    X_test[column] = X_test[column].astype(float)

In [16]:
explainer = LimeTabularExplainer(X_train.dropna().values, feature_names=domain, categorical_features=categorical_features)

In [18]:
explained_observation = explainer.explain_instance(X_test.iloc[3], rf.predict_proba)

In [19]:
explained_observation.show_in_notebook(show_table=True, show_all=False)



In [ ]: