In [ ]:
# TPOT uses genetic algorithm for feature selection and model selection, and it does all these automatically
# want to try it

# Data can be down loaded here (need sign in): 
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/

In [97]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn import preprocessing 
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tpot import TPOTRegressor
%matplotlib inline

In [74]:
train = pd.read_csv("train_GA.csv")
test = pd.read_csv("test_GA.csv")

In [44]:
train.head()


Out[44]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 NaN Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052

In [45]:
pd.isnull(train).sum() > 0


Out[45]:
Item_Identifier              False
Item_Weight                   True
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Identifier            False
Outlet_Establishment_Year    False
Outlet_Size                   True
Outlet_Location_Type         False
Outlet_Type                  False
Item_Outlet_Sales            False
dtype: bool

In [29]:
pd.isnull(test).sum() > 0


Out[29]:
Item_Identifier              False
Item_Weight                   True
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Identifier            False
Outlet_Establishment_Year    False
Outlet_Size                   True
Outlet_Location_Type         False
Outlet_Type                  False
dtype: bool

In [75]:
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))
test.Item_Weight = test.Item_Weight.fillna(np.nanmedian(test.Item_Weight))

In [62]:
print train.Outlet_Size.unique()
print test.Outlet_Size.unique()


['Medium' nan 'High' 'Small']
['Medium' nan 'Small' 'High']

In [76]:
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])
test.Outlet_Size = test.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])

In [77]:
train.dtypes


Out[77]:
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [78]:
print train.Item_Fat_Content.unique()
print test.Item_Fat_Content.unique()
print train.Item_Type.unique()
print test.Item_Type.unique()
print train.Outlet_Identifier.unique()
print test.Outlet_Identifier.unique()
print train.Outlet_Size.unique()
print test.Outlet_Size.unique()
print train.Outlet_Location_Type.unique()
print test.Outlet_Location_Type.unique()
print train.Outlet_Type.unique()
print test.Outlet_Type.unique()


['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Low Fat' 'reg' 'Regular' 'LF' 'low fat']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['Snack Foods' 'Dairy' 'Others' 'Fruits and Vegetables' 'Baking Goods'
 'Health and Hygiene' 'Breads' 'Hard Drinks' 'Seafood' 'Soft Drinks'
 'Household' 'Frozen Foods' 'Meat' 'Canned' 'Starchy Foods' 'Breakfast']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['OUT049' 'OUT017' 'OUT010' 'OUT027' 'OUT046' 'OUT018' 'OUT045' 'OUT019'
 'OUT013' 'OUT035']
['Medium' 'High' 'Small']
['Medium' 'Small' 'High']
['Tier 1' 'Tier 3' 'Tier 2']
['Tier 1' 'Tier 2' 'Tier 3']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']
['Supermarket Type1' 'Grocery Store' 'Supermarket Type3'
 'Supermarket Type2']

In [79]:
train.Item_Fat_Content = train.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
test.Item_Fat_Content = test.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
train.Item_Fat_Content = train.Item_Fat_Content.replace(['reg'], ['Regular'])
test.Item_Fat_Content = test.Item_Fat_Content.replace(['reg'], ['Regular'])

In [80]:
print train.Item_Fat_Content.unique()
print test.Item_Fat_Content.unique()


['Low Fat' 'Regular']
['Low Fat' 'Regular']

In [81]:
print train.Outlet_Establishment_Year.max()
print train.Outlet_Establishment_Year.min()


2009
1985

In [82]:
train.Outlet_Establishment_Year = 2017 - train.Outlet_Establishment_Year
test.Outlet_Establishment_Year = 2017 - test.Outlet_Establishment_Year

In [83]:
train.shape


Out[83]:
(8523, 12)

In [84]:
# label encoding, do this by combining train and test together

test['Item_Outlet_Sales'] = 0
combi = train.append(test)
number = LabelEncoder()

for i in combi.columns:
    if (combi[i].dtype == 'object'):
        combi[i] = number.fit_transform(combi[i].astype('str'))
        combi[i] = combi[i].astype('object')
        
train = combi[:train.shape[0]]
test = combi[train.shape[0]:]

In [86]:
test.head()


Out[86]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 1114 20.750 0 0.007565 13 107.8622 9 18 1 0 1 0.0
1 1078 8.300 1 0.038428 4 87.3198 2 10 1 1 1 0.0
2 1420 14.600 0 0.099575 11 241.7538 0 19 1 2 0 0.0
3 817 7.315 0 0.015388 13 155.0340 2 10 1 1 1 0.0
4 1197 12.500 1 0.118599 4 234.2300 5 32 1 2 3 0.0

In [88]:
test = test.drop('Item_Outlet_Sales',axis=1)

# remove id and those with more levels
tpot_train = train.drop('Item_Identifier',axis=1)
tpot_test = test.drop('Item_Identifier',axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train = tpot_train.drop('Item_Outlet_Sales',axis=1)

In [98]:
# build the model with tpot
# finally building model using tpot library

X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
 train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))   # mean squared error (MSE)
tpot.export('tpot_boston_pipeline.py')


/Library/Python/2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Optimization Progress:  46%|████▌     | 138/300 [02:35<01:59,  1.36pipeline/s]
Generation 1 - Current best internal CV score: 1156077.3302
Generation 2 - Current best internal CV score: 1156077.3302
Optimization Progress:  60%|██████    | 181/300 [03:49<02:55,  1.47s/pipeline]
Generation 3 - Current best internal CV score: 1153466.11981
Optimization Progress:  77%|███████▋  | 230/300 [04:57<01:09,  1.01pipeline/s]
Generation 4 - Current best internal CV score: 1153466.11981
Optimization Progress:  93%|█████████▎| 279/300 [06:10<00:22,  1.06s/pipeline]
Generation 5 - Current best internal CV score: 1151908.128
                                                                              

Best pipeline: ExtraTreesRegressor(input_matrix, ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=1.0, ExtraTreesRegressor__min_samples_leaf=13, ExtraTreesRegressor__min_samples_split=20, ExtraTreesRegressor__n_estimators=100)
1210911.02967