In [ ]:
# Get data here: https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/

In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor


train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [2]:
train.head()


Out[2]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 NaN Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052

In [3]:
train.dtypes


Out[3]:
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [4]:
train.isnull().sum()


Out[4]:
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))
test.Item_Weight = test.Item_Weight.fillna(np.nanmedian(test.Item_Weight))

In [6]:
print train.Outlet_Size.unique()
print test.Outlet_Size.unique()


['Medium' nan 'High' 'Small']
['Medium' nan 'Small' 'High']

In [7]:
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])
test.Outlet_Size = test.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])

In [8]:
print train.isnull().sum()
print test.isnull().sum()


Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64
Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [9]:
print train.Item_Fat_Content.unique()
print test.Item_Fat_Content.unique()
print train.Item_Type.unique()
print test.Item_Type.unique()
print train.Outlet_Identifier.unique()
print test.Outlet_Identifier.unique()
print train.Outlet_Size.unique()
print test.Outlet_Size.unique()
print train.Outlet_Location_Type.unique()
print test.Outlet_Location_Type.unique()
print train.Outlet_Type.unique()
print test.Outlet_Type.unique()


['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Low Fat' 'reg' 'Regular' 'LF' 'low fat']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['Snack Foods' 'Dairy' 'Others' 'Fruits and Vegetables' 'Baking Goods'
 'Health and Hygiene' 'Breads' 'Hard Drinks' 'Seafood' 'Soft Drinks'
 'Household' 'Frozen Foods' 'Meat' 'Canned' 'Starchy Foods' 'Breakfast']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['OUT049' 'OUT017' 'OUT010' 'OUT027' 'OUT046' 'OUT018' 'OUT045' 'OUT019'
 'OUT013' 'OUT035']
['Medium' 'High' 'Small']
['Medium' 'Small' 'High']
['Tier 1' 'Tier 3' 'Tier 2']
['Tier 1' 'Tier 2' 'Tier 3']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']
['Supermarket Type1' 'Grocery Store' 'Supermarket Type3'
 'Supermarket Type2']

In [10]:
train.Item_Fat_Content = train.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
test.Item_Fat_Content = test.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
train.Item_Fat_Content = train.Item_Fat_Content.replace(['reg'], ['Regular'])
test.Item_Fat_Content = test.Item_Fat_Content.replace(['reg'], ['Regular'])

In [11]:
print train.Item_Fat_Content.unique()
print test.Item_Fat_Content.unique()


['Low Fat' 'Regular']
['Low Fat' 'Regular']

In [12]:
from sklearn.model_selection import train_test_split

train = train.drop('Item_Identifier',axis=1)
test = test.drop('Item_Identifier',axis=1)

y = train.Item_Outlet_Sales
X = train.drop(['Item_Outlet_Sales'], axis=1)

X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=410)

In [16]:
from catboost import CatBoostRegressor
%matplotlib inline

# indicate categorical features for CatBoost
categorical_features_indices = np.where(X.dtypes != np.float)[0]

model=CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE',
                       train_dir='model_1/',   # define output folder
                        name='model_depth_1')
model.fit(X_train, y_train, cat_features=categorical_features_indices,
          use_best_model=True,
          eval_set=(X_validation, y_validation), plot=True)


Out[16]:
<catboost.core._CatBoostBase at 0x112607650>

In [ ]:
# Took me a while to figure out how to generate the above plot, you can find details here:
## https://stackoverflow.com/questions/45707010/ipython-importerror-cannot-import-name-layout

In [18]:
# You can also save pre-trained model and load the model later

model.save_model('catboost_model.dump')

model = CatBoostRegressor()
model.load_model('catboost_model.dump')


Out[18]:
<catboost.core._CatBoostBase at 0x112b50a50>