notebook.community

Edit and run



In [83]:

    
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn import preprocessing, linear_model
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

%matplotlib inline 

# Data can be down loaded here (need sign in): 
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/



In [8]:

    
train = pd.read_csv("Big_Mart_Train.csv")
train.head()









    Out[8]:







  
    
      
      Item_Identifier
      Item_Weight
      Item_Fat_Content
      Item_Visibility
      Item_Type
      Item_MRP
      Outlet_Identifier
      Outlet_Establishment_Year
      Outlet_Size
      Outlet_Location_Type
      Outlet_Type
      Item_Outlet_Sales
    
  
  
    
      0
      FDA15
      9.30
      Low Fat
      0.016047
      Dairy
      249.8092
      OUT049
      1999
      Medium
      Tier 1
      Supermarket Type1
      3735.1380
    
    
      1
      DRC01
      5.92
      Regular
      0.019278
      Soft Drinks
      48.2692
      OUT018
      2009
      Medium
      Tier 3
      Supermarket Type2
      443.4228
    
    
      2
      FDN15
      17.50
      Low Fat
      0.016760
      Meat
      141.6180
      OUT049
      1999
      Medium
      Tier 1
      Supermarket Type1
      2097.2700
    
    
      3
      FDX07
      19.20
      Regular
      0.000000
      Fruits and Vegetables
      182.0950
      OUT010
      1998
      NaN
      Tier 3
      Grocery Store
      732.3800
    
    
      4
      NCD19
      8.93
      Low Fat
      0.000000
      Household
      53.8614
      OUT013
      1987
      High
      Tier 3
      Supermarket Type1
      994.7052



In [9]:

    
# preprocessing
pd.isnull(train).sum() > 0









    Out[9]:





Item_Identifier              False
Item_Weight                   True
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Identifier            False
Outlet_Establishment_Year    False
Outlet_Size                   True
Outlet_Location_Type         False
Outlet_Type                  False
Item_Outlet_Sales            False
dtype: bool



In [10]:

    
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))



In [11]:

    
print train.Outlet_Size.unique()









    



['Medium' nan 'High' 'Small']



In [12]:

    
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])



In [15]:

    
print train.Item_Fat_Content.unique()
print train.Item_Type.unique()
print train.Outlet_Identifier.unique()
print train.Outlet_Size.unique()
print train.Outlet_Location_Type.unique()
print train.Outlet_Type.unique()









    



['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['Medium' 'High' 'Small']
['Tier 1' 'Tier 3' 'Tier 2']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']



In [16]:

    
train.Item_Fat_Content = train.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
train.Item_Fat_Content = train.Item_Fat_Content.replace(['reg'], ['Regular'])



In [17]:

    
print train.Item_Fat_Content.unique()









    



['Low Fat' 'Regular']



In [18]:

    
print train.Outlet_Establishment_Year.max()



In [19]:

    
train.Outlet_Establishment_Year = 2017 - train.Outlet_Establishment_Year



In [20]:

    
train.dtypes









    Out[20]:





Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object



In [44]:

    
# LIME needs categorical feature names
categorical_features = [col for col in train.columns if train.dtypes[col] == 'O']
categorical_features









    Out[44]:





['Item_Identifier',
 'Item_Fat_Content',
 'Item_Type',
 'Outlet_Identifier',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']



In [21]:

    
number = LabelEncoder()

for i in train.columns:
    if (train[i].dtype == 'object'):
        train[i] = number.fit_transform(train[i].astype('str'))
        train[i] = train[i].astype('object')
        
processed_train = train[:train.shape[0]]



In [22]:

    
processed_train.head()









    Out[22]:







  
    
      
      Item_Identifier
      Item_Weight
      Item_Fat_Content
      Item_Visibility
      Item_Type
      Item_MRP
      Outlet_Identifier
      Outlet_Establishment_Year
      Outlet_Size
      Outlet_Location_Type
      Outlet_Type
      Item_Outlet_Sales
    
  
  
    
      0
      156
      9.30
      0
      0.016047
      4
      249.8092
      9
      18
      1
      0
      1
      3735.1380
    
    
      1
      8
      5.92
      1
      0.019278
      14
      48.2692
      3
      8
      1
      2
      2
      443.4228
    
    
      2
      662
      17.50
      0
      0.016760
      10
      141.6180
      9
      18
      1
      0
      1
      2097.2700
    
    
      3
      1121
      19.20
      1
      0.000000
      6
      182.0950
      0
      19
      1
      2
      0
      732.3800
    
    
      4
      1297
      8.93
      0
      0.000000
      9
      53.8614
      1
      30
      0
      2
      1
      994.7052



In [23]:

    
# remove id and those with more levels
tpot_train = processed_train.drop('Item_Identifier',axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train = tpot_train.drop('Item_Outlet_Sales',axis=1)



In [24]:

    
# split data into training and validation data
X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
 train_size=0.77, test_size=0.23)



In [47]:

    
# Model 1 - Rendom Forest
rf = RandomForestRegressor(n_estimators=1000)
rf.fit(X_train, y_train)
print('Random Forest MSError', mean_squared_error(y_test, rf.predict(X_test)))









    



('Random Forest MSError', 1342865.1468449924)



In [ ]:

    
# Model 2 - Linear Regression
lg = linear_model.LinearRegression()
lg.fit(X_train, y_train)
print('Random Forest MSError', mean_squared_error(y_test, lg.predict(X_test)))



In [49]:

    
import lime
import lime.lime_tabular



In [70]:

    
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.as_matrix(), feature_names=np.array(list(X_train.columns)), 
                                                   class_names=['Item_Outlet_Sales'], 
                                                   categorical_features=np.array(categorical_features), 
                                                   random_state=410,
                                                   verbose=True, mode='regression')



In [80]:

    
i = 77  # explain the 77th row
exp = explainer.explain_instance(X_test.as_matrix()[i], rf.predict)
print exp
exp.show_in_notebook(show_table=True)









    



Intercept 2608.9595771
Prediction_local [ 2252.67396589]
Right: 2910.3529496
<lime.explanation.Explanation object at 0x10dc296d0>



In [81]:

    
i = 99  # explain the 99th row
exp = explainer.explain_instance(X_test.as_matrix()[i], rf.predict)
print exp
exp.show_in_notebook(show_table=True)









    



Intercept 2445.63034372
Prediction_local [ 3308.2976027]
Right: 3659.5270888
<lime.explanation.Explanation object at 0x14209ce10>



In [82]:

    
exp.as_list()









    Out[82]:





[('Item_MRP > 185.74', 1768.878161197355),
 ('Outlet_Type <= 1.00', -706.56889205563789),
 ('18.00 < Outlet_Establishment_Year <= 30.00', -146.26376641531726),
 ('Item_Fat_Content <= 0.00', -94.763631534089399),
 ('Item_Weight > 16.00', 55.874342749776076),
 ('0.05 < Item_Visibility <= 0.09', -52.011215753334959),
 ('Item_Type <= 4.00', 25.616550836395668),
 ('Outlet_Identifier > 7.00', 20.684404043292069),
 ('Outlet_Location_Type <= 0.00', -9.9328804793321641),
 ('1.00 < Outlet_Size <= 2.00', 1.1541863885596926)]



In [93]:

    
i = 99  # explain the 77th row
exp = explainer.explain_instance(X_test.as_matrix()[i], lg.predict)
print exp
exp.show_in_notebook(show_table=True)









    



Intercept 2843.84011621
Prediction_local [ 3238.94886547]
Right: 2918.91276014
<lime.explanation.Explanation object at 0x10dbfbe90>



In [94]:

    
exp.as_list()









    Out[94]:





[('Item_MRP > 185.74', 1701.0748014953774),
 ('Outlet_Type <= 1.00', -1468.3018537581213),
 ('1.00 < Outlet_Size <= 2.00', -472.50245118259221),
 ('Outlet_Location_Type <= 0.00', 362.81333842988971),
 ('Outlet_Identifier > 7.00', 256.26106531615079),
 ('Item_Fat_Content <= 0.00', -61.967624779084105),
 ('Item_Weight > 16.00', 43.127355959800099),
 ('0.05 < Item_Visibility <= 0.09', -36.766865938999416),
 ('Item_Type <= 4.00', 36.391549810461406),
 ('18.00 < Outlet_Establishment_Year <= 30.00', 34.979433907015071)]

	Item_Identifier	Item_Weight	Item_Fat_Content	Item_Visibility	Item_Type	Item_MRP	Outlet_Identifier	Outlet_Establishment_Year	Outlet_Size	Outlet_Location_Type	Outlet_Type	Item_Outlet_Sales
0	FDA15	9.30	Low Fat	0.016047	Dairy	249.8092	OUT049	1999	Medium	Tier 1	Supermarket Type1	3735.1380
1	DRC01	5.92	Regular	0.019278	Soft Drinks	48.2692	OUT018	2009	Medium	Tier 3	Supermarket Type2	443.4228
2	FDN15	17.50	Low Fat	0.016760	Meat	141.6180	OUT049	1999	Medium	Tier 1	Supermarket Type1	2097.2700
3	FDX07	19.20	Regular	0.000000	Fruits and Vegetables	182.0950	OUT010	1998	NaN	Tier 3	Grocery Store	732.3800
4	NCD19	8.93	Low Fat	0.000000	Household	53.8614	OUT013	1987	High	Tier 3	Supermarket Type1	994.7052