In [ ]:
# Get data here: https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/
In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
# Read the train and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
In [2]:
# Preview the first rows of the training frame.
train.head()
Out[2]:
In [3]:
# Show the dtype of every column in the training frame.
train.dtypes
Out[3]:
In [4]:
# Count the missing values in each column of the training frame.
train.isnull().sum()
Out[4]:
In [5]:
# Impute missing Item_Weight values with the median of the TRAINING split.
# The same training statistic is applied to the test split so that both frames
# are preprocessed identically and no test-set statistic enters the pipeline
# (this matches how Outlet_Size is imputed from the training mode).
item_weight_median = np.nanmedian(train.Item_Weight)
train.Item_Weight = train.Item_Weight.fillna(item_weight_median)
test.Item_Weight = test.Item_Weight.fillna(item_weight_median)
In [6]:
# Inspect the distinct Outlet_Size values in each split.
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())
In [7]:
# Fill missing Outlet_Size entries in both splits with the most frequent
# value observed in the training split.
most_common_size = train.Outlet_Size.mode().iloc[0]
train.Outlet_Size = train.Outlet_Size.fillna(most_common_size)
test.Outlet_Size = test.Outlet_Size.fillna(most_common_size)
In [8]:
# Re-check the per-column missing-value counts of both splits after imputation.
# Single-argument print(...) works on both Python 2 and Python 3.
print(train.isnull().sum())
print(test.isnull().sum())
In [9]:
# Print the distinct values of each listed column, training split first and
# test split second, so the two can be compared column by column.
# Single-argument print(...) works on both Python 2 and Python 3.
for column in ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
               'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']:
    for frame in (train, test):
        print(frame[column].unique())
In [10]:
# Collapse the alternative spellings of Item_Fat_Content onto the two
# canonical labels, applying the same mapping to both splits.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train.Item_Fat_Content = train.Item_Fat_Content.replace(fat_content_aliases)
test.Item_Fat_Content = test.Item_Fat_Content.replace(fat_content_aliases)
In [11]:
# Confirm which Item_Fat_Content labels remain in each split after normalisation.
# Single-argument print(...) works on both Python 2 and Python 3.
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())
In [12]:
from sklearn.model_selection import train_test_split
# Remove the Item_Identifier column from both splits. errors='ignore' keeps
# this cell safe to re-run: `train`/`test` are overwritten in place, so on a
# second execution the column is already gone and a plain drop would raise
# KeyError.
train = train.drop('Item_Identifier', axis=1, errors='ignore')
test = test.drop('Item_Identifier', axis=1, errors='ignore')

# Target is Item_Outlet_Sales; every remaining column is a feature.
y = train.Item_Outlet_Sales
X = train.drop(['Item_Outlet_Sales'], axis=1)

# Hold out 30% of the training rows for validation; the fixed random_state
# makes the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=410)
In [16]:
from catboost import CatBoostRegressor
%matplotlib inline
# indicate categorical features for CatBoost: the positional index of every
# column whose dtype is not float (so any non-float column, e.g. integer or
# object columns, is treated as categorical).
# The builtin `float` is used instead of `np.float`: the latter was only an
# alias for the builtin and was removed in NumPy 1.24, where it raises
# AttributeError.
categorical_features_indices = np.where(X.dtypes != float)[0]

model = CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE',
                          train_dir='model_1/',  # define output folder
                          name='model_depth_1')

# Fit on the training split and evaluate on the validation split;
# use_best_model=True is passed together with the eval_set, and plot=True
# requests CatBoost's interactive training plot in the notebook.
model.fit(X_train, y_train, cat_features=categorical_features_indices,
          use_best_model=True,
          eval_set=(X_validation, y_validation), plot=True)
Out[16]:
In [ ]:
# It took me a while to figure out how to generate the plot above; the details are here:
## https://stackoverflow.com/questions/45707010/ipython-importerror-cannot-import-name-layout
In [18]:
# You can also save pre-trained model and load the model later
model.save_model('catboost_model.dump')
model = CatBoostRegressor()
model.load_model('catboost_model.dump')
Out[18]: