notebook.community

Edit and run



In [ ]:

    
# Get data here: https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/



In [1]:

    
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor


# Read the training and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))



In [2]:

    
# Preview the first five rows of the training frame.
train.head(n=5)









    Out[2]:







  
    
      
      Item_Identifier
      Item_Weight
      Item_Fat_Content
      Item_Visibility
      Item_Type
      Item_MRP
      Outlet_Identifier
      Outlet_Establishment_Year
      Outlet_Size
      Outlet_Location_Type
      Outlet_Type
      Item_Outlet_Sales
    
  
  
    
      0
      FDA15
      9.30
      Low Fat
      0.016047
      Dairy
      249.8092
      OUT049
      1999
      Medium
      Tier 1
      Supermarket Type1
      3735.1380
    
    
      1
      DRC01
      5.92
      Regular
      0.019278
      Soft Drinks
      48.2692
      OUT018
      2009
      Medium
      Tier 3
      Supermarket Type2
      443.4228
    
    
      2
      FDN15
      17.50
      Low Fat
      0.016760
      Meat
      141.6180
      OUT049
      1999
      Medium
      Tier 1
      Supermarket Type1
      2097.2700
    
    
      3
      FDX07
      19.20
      Regular
      0.000000
      Fruits and Vegetables
      182.0950
      OUT010
      1998
      NaN
      Tier 3
      Grocery Store
      732.3800
    
    
      4
      NCD19
      8.93
      Low Fat
      0.000000
      Household
      53.8614
      OUT013
      1987
      High
      Tier 3
      Supermarket Type1
      994.7052



In [3]:

    
# Show the dtype pandas assigned to each column of the training frame.
train.dtypes









    Out[3]:





Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object



In [4]:

    
# Count the missing values in each column of the training frame.
train.isna().sum()









    Out[4]:





Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64



In [5]:

    
# Impute missing Item_Weight values with the median of the TRAINING data only.
# The statistic is computed once, before any filling, and reused for the test
# frame so both frames receive the same fill value and nothing is estimated
# from the test set (the Outlet_Size imputation likewise fills the test frame
# from the training mode).
item_weight_median = np.nanmedian(train["Item_Weight"])
train["Item_Weight"] = train["Item_Weight"].fillna(item_weight_median)
test["Item_Weight"] = test["Item_Weight"].fillna(item_weight_median)



In [6]:

    
# List the distinct Outlet_Size values (including NaN) in each frame.
# print() is called as a function so the cell runs on Python 3; with a single
# argument it prints the same text on Python 2.
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())









    



['Medium' nan 'High' 'Small']
['Medium' nan 'Small' 'High']



In [7]:

    
# Fill missing Outlet_Size entries in both frames with the most frequent
# value observed in the training frame.
outlet_size_mode = train["Outlet_Size"].mode().iloc[0]
train["Outlet_Size"] = train["Outlet_Size"].fillna(outlet_size_mode)
test["Outlet_Size"] = test["Outlet_Size"].fillna(outlet_size_mode)



In [8]:

    
# Confirm that no missing values remain in either frame after imputation.
# print() is called as a function so the cell runs on Python 3; with a single
# argument it prints the same text on Python 2.
print(train.isnull().sum())
print(test.isnull().sum())









    



Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64
Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64



In [9]:

    
# Print the distinct values of each string-typed column, training frame first
# and test frame second, to spot inconsistent labels between the two.
# print() is called as a function so the cell runs on Python 3; the output
# order matches the original column-by-column listing.
for column in (
    "Item_Fat_Content",
    "Item_Type",
    "Outlet_Identifier",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
):
    print(train[column].unique())
    print(test[column].unique())









    



['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Low Fat' 'reg' 'Regular' 'LF' 'low fat']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['Snack Foods' 'Dairy' 'Others' 'Fruits and Vegetables' 'Baking Goods'
 'Health and Hygiene' 'Breads' 'Hard Drinks' 'Seafood' 'Soft Drinks'
 'Household' 'Frozen Foods' 'Meat' 'Canned' 'Starchy Foods' 'Breakfast']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['OUT049' 'OUT017' 'OUT010' 'OUT027' 'OUT046' 'OUT018' 'OUT045' 'OUT019'
 'OUT013' 'OUT035']
['Medium' 'High' 'Small']
['Medium' 'Small' 'High']
['Tier 1' 'Tier 3' 'Tier 2']
['Tier 1' 'Tier 2' 'Tier 3']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']
['Supermarket Type1' 'Grocery Store' 'Supermarket Type3'
 'Supermarket Type2']



In [10]:

    
# Collapse the alternative spellings of Item_Fat_Content onto the two
# canonical labels, 'Low Fat' and 'Regular', in both frames.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train["Item_Fat_Content"] = train["Item_Fat_Content"].replace(fat_content_aliases)
test["Item_Fat_Content"] = test["Item_Fat_Content"].replace(fat_content_aliases)



In [11]:

    
# Verify that only the two canonical Item_Fat_Content labels remain.
# print() is called as a function so the cell runs on Python 3; with a single
# argument it prints the same text on Python 2.
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())









    



['Low Fat' 'Regular']
['Low Fat' 'Regular']



In [12]:

    
from sklearn.model_selection import train_test_split

# Drop the Item_Identifier column from both frames.
train, test = (frame.drop('Item_Identifier', axis='columns') for frame in (train, test))

# Separate the predictors (X) from the target column (y).
X = train.drop('Item_Outlet_Sales', axis='columns')
y = train['Item_Outlet_Sales']

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=410,
)



In [16]:

    
from catboost import CatBoostRegressor
%matplotlib inline

# Indicate categorical features for CatBoost: every column whose dtype is not
# float64 is passed as categorical (by positional index). Note that this also
# selects the integer column Outlet_Establishment_Year.
# The builtin `float` replaces `np.float`, which was only an alias for it and
# was removed in NumPy 1.24.
categorical_features_indices = np.where(X.dtypes != float)[0]

# Small model: 50 boosting iterations of depth-3 trees optimising RMSE.
model = CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE',
                          train_dir='model_1/',   # define output folder
                          name='model_depth_1')

# Fit on the training split, evaluate on the validation split, and keep the
# best iteration found on it; plot=True renders the interactive training chart.
model.fit(X_train, y_train, cat_features=categorical_features_indices,
          use_best_model=True,
          eval_set=(X_validation, y_validation), plot=True)









    





            
            
        






    Out[16]:





<catboost.core._CatBoostBase at 0x112607650>



In [ ]:

    
# It took me a while to figure out how to generate the plot above; you can find the details here:
## https://stackoverflow.com/questions/45707010/ipython-importerror-cannot-import-name-layout



In [18]:

    
# The trained model can be written to disk and restored later.
model_path = 'catboost_model.dump'

# Persist the fitted model.
model.save_model(model_path)

# Restore it into a fresh regressor instance.
model = CatBoostRegressor()
model.load_model(model_path)









    Out[18]:





<catboost.core._CatBoostBase at 0x112b50a50>

	Item_Identifier	Item_Weight	Item_Fat_Content	Item_Visibility	Item_Type	Item_MRP	Outlet_Identifier	Outlet_Establishment_Year	Outlet_Size	Outlet_Location_Type	Outlet_Type	Item_Outlet_Sales
0	FDA15	9.30	Low Fat	0.016047	Dairy	249.8092	OUT049	1999	Medium	Tier 1	Supermarket Type1	3735.1380
1	DRC01	5.92	Regular	0.019278	Soft Drinks	48.2692	OUT018	2009	Medium	Tier 3	Supermarket Type2	443.4228
2	FDN15	17.50	Low Fat	0.016760	Meat	141.6180	OUT049	1999	Medium	Tier 1	Supermarket Type1	2097.2700
3	FDX07	19.20	Regular	0.000000	Fruits and Vegetables	182.0950	OUT010	1998	NaN	Tier 3	Grocery Store	732.3800
4	NCD19	8.93	Low Fat	0.000000	Household	53.8614	OUT013	1987	High	Tier 3	Supermarket Type1	994.7052