In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline
# Data can be downloaded here (sign-in required):
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/
In [8]:
# Load the Big Mart sales training data (download link in the header cell)
# and preview the first rows.
train = pd.read_csv("Big_Mart_Train.csv")
train.head()
Out[8]:
In [9]:
# Preprocessing: flag every column that contains at least one missing value.
train.isnull().sum() > 0
Out[9]:
In [10]:
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))
In [11]:
print train.Outlet_Size.unique()
In [12]:
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])
In [15]:
# Inspect the distinct levels of every categorical column.
# Fixed: six copy-pasted Python-2 `print` statements replaced by one loop
# using the function-style print() already used elsewhere in the notebook.
for col in ["Item_Fat_Content", "Item_Type", "Outlet_Identifier",
            "Outlet_Size", "Outlet_Location_Type", "Outlet_Type"]:
    print(train[col].unique())
In [16]:
# Normalise the inconsistent fat-content spellings to two canonical levels.
fat_content_map = {"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"}
train["Item_Fat_Content"] = train["Item_Fat_Content"].replace(fat_content_map)
In [17]:
print train.Item_Fat_Content.unique()
In [18]:
print train.Outlet_Establishment_Year.max()
In [19]:
train.Outlet_Establishment_Year = 2017 - train.Outlet_Establishment_Year
In [20]:
train.dtypes
Out[20]:
In [44]:
# Collect the names of the string-valued columns (pandas object dtype, 'O').
# NOTE(review): LimeTabularExplainer's `categorical_features` argument expects
# column *indices*, not names — convert before passing this list to LIME.
# NOTE(review): out-of-order execution hazard — if this cell runs after the
# label-encoding cell (original In[44] vs In[21]), every encoded column is
# object dtype and lands in this list; run top-to-bottom to get the raw
# string columns only.
categorical_features = [col for col in train.columns if train.dtypes[col] == 'O']
categorical_features
Out[44]:
In [21]:
# Label-encode every string column so scikit-learn models can consume them.
# The integer codes are cast back to object dtype so the columns remain
# distinguishable from genuinely numeric features.
# Fixed: `processed_train = train[:train.shape[0]]` was a no-op slice that
# returned a *view* of `train`; an explicit copy decouples the two frames.
for col in train.columns:
    if train[col].dtype == 'object':
        encoder = LabelEncoder()  # fresh encoder per column for clarity
        train[col] = encoder.fit_transform(train[col].astype('str'))
        train[col] = train[col].astype('object')
processed_train = train.copy()
In [22]:
processed_train.head()
Out[22]:
In [23]:
# Drop the high-cardinality identifier column, then separate the target
# (Item_Outlet_Sales) from the feature matrix.
tpot_train = processed_train.drop('Item_Identifier', axis=1)
target = tpot_train.pop('Item_Outlet_Sales')  # removes column and returns it
In [24]:
# Split into training (77%) and validation (23%) sets.
# Fixed: no random_state meant a different split on every kernel restart,
# making the reported MSE values irreproducible. Seed matches the LIME
# explainer's seed used later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    tpot_train, target, train_size=0.77, test_size=0.23, random_state=410)
In [47]:
# Model 1 - Random Forest
# Fixed: "Rendom" typo; added random_state so the fitted ensemble (and its
# validation MSE) is reproducible across runs.
rf = RandomForestRegressor(n_estimators=1000, random_state=410)
rf.fit(X_train, y_train)
print('Random Forest MSError', mean_squared_error(y_test, rf.predict(X_test)))
In [ ]:
# Model 2 - Linear Regression
lg = linear_model.LinearRegression()
lg.fit(X_train, y_train)
# Bug fix: the printed label previously said 'Random Forest MSError' even
# though this is the linear model's score.
print('Linear Regression MSError', mean_squared_error(y_test, lg.predict(X_test)))
In [49]:
import lime
import lime.lime_tabular
In [70]:
# Build the LIME tabular explainer over the training features.
# Fixed: DataFrame.as_matrix() was removed in pandas 1.0 — use .values.
# Fixed: LimeTabularExplainer's `categorical_features` takes column *indices*,
# not names; convert the collected names, skipping any column dropped earlier
# (e.g. Item_Identifier).
categorical_idx = [X_train.columns.get_loc(c) for c in categorical_features
                   if c in X_train.columns]
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=list(X_train.columns),
    class_names=['Item_Outlet_Sales'],
    categorical_features=categorical_idx,
    random_state=410,
    verbose=True,
    mode='regression')
In [80]:
# Explain the random forest's prediction for one validation row with LIME.
# Fixed: as_matrix() removed in pandas 1.0 (use .values); Py2 print statement.
i = 77  # row position within X_test
exp = explainer.explain_instance(X_test.values[i], rf.predict)
print(exp)
exp.show_in_notebook(show_table=True)
In [81]:
# Explain a second validation row for the random forest.
# Fixed: as_matrix() removed in pandas 1.0 (use .values); Py2 print statement.
i = 99  # row position within X_test
exp = explainer.explain_instance(X_test.values[i], rf.predict)
print(exp)
exp.show_in_notebook(show_table=True)
In [82]:
exp.as_list()
Out[82]:
In [93]:
# Explain the same row with the linear model for comparison with the forest.
# Fixed: stale comment claimed "77th row" while i = 99; as_matrix() removed
# in pandas 1.0 (use .values); Py2 print statement.
i = 99  # row position within X_test
exp = explainer.explain_instance(X_test.values[i], lg.predict)
print(exp)
exp.show_in_notebook(show_table=True)
In [94]:
exp.as_list()
Out[94]: