In [83]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn import preprocessing, linear_model
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

%matplotlib inline 

# Data can be down loaded here (need sign in): 
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/

In [8]:
train = pd.read_csv("Big_Mart_Train.csv")
train.head()


Out[8]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 NaN Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052

In [9]:
# preprocessing
pd.isnull(train).sum() > 0


Out[9]:
Item_Identifier              False
Item_Weight                   True
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Identifier            False
Outlet_Establishment_Year    False
Outlet_Size                   True
Outlet_Location_Type         False
Outlet_Type                  False
Item_Outlet_Sales            False
dtype: bool

In [10]:
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))

In [11]:
print train.Outlet_Size.unique()


['Medium' nan 'High' 'Small']

In [12]:
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])

In [15]:
print train.Item_Fat_Content.unique()
print train.Item_Type.unique()
print train.Outlet_Identifier.unique()
print train.Outlet_Size.unique()
print train.Outlet_Location_Type.unique()
print train.Outlet_Type.unique()


['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['Medium' 'High' 'Small']
['Tier 1' 'Tier 3' 'Tier 2']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']

In [16]:
train.Item_Fat_Content = train.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
train.Item_Fat_Content = train.Item_Fat_Content.replace(['reg'], ['Regular'])

In [17]:
print train.Item_Fat_Content.unique()


['Low Fat' 'Regular']

In [18]:
print train.Outlet_Establishment_Year.max()


2009

In [19]:
train.Outlet_Establishment_Year = 2017 - train.Outlet_Establishment_Year

In [20]:
train.dtypes


Out[20]:
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [44]:
# LIME needs categorical feature names
categorical_features = [col for col in train.columns if train.dtypes[col] == 'O']
categorical_features


Out[44]:
['Item_Identifier',
 'Item_Fat_Content',
 'Item_Type',
 'Outlet_Identifier',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [21]:
number = LabelEncoder()

for i in train.columns:
    if (train[i].dtype == 'object'):
        train[i] = number.fit_transform(train[i].astype('str'))
        train[i] = train[i].astype('object')
        
processed_train = train[:train.shape[0]]

In [22]:
processed_train.head()


Out[22]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 156 9.30 0 0.016047 4 249.8092 9 18 1 0 1 3735.1380
1 8 5.92 1 0.019278 14 48.2692 3 8 1 2 2 443.4228
2 662 17.50 0 0.016760 10 141.6180 9 18 1 0 1 2097.2700
3 1121 19.20 1 0.000000 6 182.0950 0 19 1 2 0 732.3800
4 1297 8.93 0 0.000000 9 53.8614 1 30 0 2 1 994.7052

In [23]:
# remove id and those with more levels
tpot_train = processed_train.drop('Item_Identifier',axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train = tpot_train.drop('Item_Outlet_Sales',axis=1)

In [24]:
# split data into training and validation data
X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
 train_size=0.77, test_size=0.23)

In [47]:
# Model 1 - Rendom Forest
rf = RandomForestRegressor(n_estimators=1000)
rf.fit(X_train, y_train)
print('Random Forest MSError', mean_squared_error(y_test, rf.predict(X_test)))


('Random Forest MSError', 1342865.1468449924)

In [ ]:
# Model 2 - Linear Regression
lg = linear_model.LinearRegression()
lg.fit(X_train, y_train)
print('Random Forest MSError', mean_squared_error(y_test, lg.predict(X_test)))

In [49]:
import lime
import lime.lime_tabular

In [70]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.as_matrix(), feature_names=np.array(list(X_train.columns)), 
                                                   class_names=['Item_Outlet_Sales'], 
                                                   categorical_features=np.array(categorical_features), 
                                                   random_state=410,
                                                   verbose=True, mode='regression')

In [80]:
i = 77  # explain the 77th row
exp = explainer.explain_instance(X_test.as_matrix()[i], rf.predict)
print exp
exp.show_in_notebook(show_table=True)


Intercept 2608.9595771
Prediction_local [ 2252.67396589]
Right: 2910.3529496
<lime.explanation.Explanation object at 0x10dc296d0>

In [81]:
i = 99  # explain the 99th row
exp = explainer.explain_instance(X_test.as_matrix()[i], rf.predict)
print exp
exp.show_in_notebook(show_table=True)


Intercept 2445.63034372
Prediction_local [ 3308.2976027]
Right: 3659.5270888
<lime.explanation.Explanation object at 0x14209ce10>

In [82]:
exp.as_list()


Out[82]:
[('Item_MRP > 185.74', 1768.878161197355),
 ('Outlet_Type <= 1.00', -706.56889205563789),
 ('18.00 < Outlet_Establishment_Year <= 30.00', -146.26376641531726),
 ('Item_Fat_Content <= 0.00', -94.763631534089399),
 ('Item_Weight > 16.00', 55.874342749776076),
 ('0.05 < Item_Visibility <= 0.09', -52.011215753334959),
 ('Item_Type <= 4.00', 25.616550836395668),
 ('Outlet_Identifier > 7.00', 20.684404043292069),
 ('Outlet_Location_Type <= 0.00', -9.9328804793321641),
 ('1.00 < Outlet_Size <= 2.00', 1.1541863885596926)]

In [93]:
i = 99  # explain the 77th row
exp = explainer.explain_instance(X_test.as_matrix()[i], lg.predict)
print exp
exp.show_in_notebook(show_table=True)


Intercept 2843.84011621
Prediction_local [ 3238.94886547]
Right: 2918.91276014
<lime.explanation.Explanation object at 0x10dbfbe90>

In [94]:
exp.as_list()


Out[94]:
[('Item_MRP > 185.74', 1701.0748014953774),
 ('Outlet_Type <= 1.00', -1468.3018537581213),
 ('1.00 < Outlet_Size <= 2.00', -472.50245118259221),
 ('Outlet_Location_Type <= 0.00', 362.81333842988971),
 ('Outlet_Identifier > 7.00', 256.26106531615079),
 ('Item_Fat_Content <= 0.00', -61.967624779084105),
 ('Item_Weight > 16.00', 43.127355959800099),
 ('0.05 < Item_Visibility <= 0.09', -36.766865938999416),
 ('Item_Type <= 4.00', 36.391549810461406),
 ('18.00 < Outlet_Establishment_Year <= 30.00', 34.979433907015071)]