notebook.community

Edit and run



In [ ]:

    
# TPOT uses genetic algorithm for feature selection and model selection, and it does all these automatically
# want to try it

# Data can be down loaded here (need sign in): 
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/



In [97]:

    
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn import preprocessing 
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tpot import TPOTRegressor
%matplotlib inline



In [74]:

    
train = pd.read_csv("train_GA.csv")
test = pd.read_csv("test_GA.csv")



In [44]:

    
train.head()









    Out[44]:






  
    
      
      Item_Identifier
      Item_Weight
      Item_Fat_Content
      Item_Visibility
      Item_Type
      Item_MRP
      Outlet_Identifier
      Outlet_Establishment_Year
      Outlet_Size
      Outlet_Location_Type
      Outlet_Type
      Item_Outlet_Sales
    
  
  
    
      0
      FDA15
      9.30
      Low Fat
      0.016047
      Dairy
      249.8092
      OUT049
      1999
      Medium
      Tier 1
      Supermarket Type1
      3735.1380
    
    
      1
      DRC01
      5.92
      Regular
      0.019278
      Soft Drinks
      48.2692
      OUT018
      2009
      Medium
      Tier 3
      Supermarket Type2
      443.4228
    
    
      2
      FDN15
      17.50
      Low Fat
      0.016760
      Meat
      141.6180
      OUT049
      1999
      Medium
      Tier 1
      Supermarket Type1
      2097.2700
    
    
      3
      FDX07
      19.20
      Regular
      0.000000
      Fruits and Vegetables
      182.0950
      OUT010
      1998
      NaN
      Tier 3
      Grocery Store
      732.3800
    
    
      4
      NCD19
      8.93
      Low Fat
      0.000000
      Household
      53.8614
      OUT013
      1987
      High
      Tier 3
      Supermarket Type1
      994.7052



In [45]:

    
pd.isnull(train).sum() > 0









    Out[45]:





Item_Identifier              False
Item_Weight                   True
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Identifier            False
Outlet_Establishment_Year    False
Outlet_Size                   True
Outlet_Location_Type         False
Outlet_Type                  False
Item_Outlet_Sales            False
dtype: bool



In [29]:

    
pd.isnull(test).sum() > 0









    Out[29]:





Item_Identifier              False
Item_Weight                   True
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Identifier            False
Outlet_Establishment_Year    False
Outlet_Size                   True
Outlet_Location_Type         False
Outlet_Type                  False
dtype: bool



In [75]:

    
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))
test.Item_Weight = test.Item_Weight.fillna(np.nanmedian(test.Item_Weight))



In [62]:

    
print train.Outlet_Size.unique()
print test.Outlet_Size.unique()









    



['Medium' nan 'High' 'Small']
['Medium' nan 'Small' 'High']



In [76]:

    
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])
test.Outlet_Size = test.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])



In [77]:

    
train.dtypes









    Out[77]:





Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object



In [78]:

    
print train.Item_Fat_Content.unique()
print test.Item_Fat_Content.unique()
print train.Item_Type.unique()
print test.Item_Type.unique()
print train.Outlet_Identifier.unique()
print test.Outlet_Identifier.unique()
print train.Outlet_Size.unique()
print test.Outlet_Size.unique()
print train.Outlet_Location_Type.unique()
print test.Outlet_Location_Type.unique()
print train.Outlet_Type.unique()
print test.Outlet_Type.unique()









    



['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Low Fat' 'reg' 'Regular' 'LF' 'low fat']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['Snack Foods' 'Dairy' 'Others' 'Fruits and Vegetables' 'Baking Goods'
 'Health and Hygiene' 'Breads' 'Hard Drinks' 'Seafood' 'Soft Drinks'
 'Household' 'Frozen Foods' 'Meat' 'Canned' 'Starchy Foods' 'Breakfast']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['OUT049' 'OUT017' 'OUT010' 'OUT027' 'OUT046' 'OUT018' 'OUT045' 'OUT019'
 'OUT013' 'OUT035']
['Medium' 'High' 'Small']
['Medium' 'Small' 'High']
['Tier 1' 'Tier 3' 'Tier 2']
['Tier 1' 'Tier 2' 'Tier 3']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']
['Supermarket Type1' 'Grocery Store' 'Supermarket Type3'
 'Supermarket Type2']



In [79]:

    
train.Item_Fat_Content = train.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
test.Item_Fat_Content = test.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
train.Item_Fat_Content = train.Item_Fat_Content.replace(['reg'], ['Regular'])
test.Item_Fat_Content = test.Item_Fat_Content.replace(['reg'], ['Regular'])



In [80]:

    
print train.Item_Fat_Content.unique()
print test.Item_Fat_Content.unique()









    



['Low Fat' 'Regular']
['Low Fat' 'Regular']



In [81]:

    
print train.Outlet_Establishment_Year.max()
print train.Outlet_Establishment_Year.min()



In [82]:

    
train.Outlet_Establishment_Year = 2017 - train.Outlet_Establishment_Year
test.Outlet_Establishment_Year = 2017 - test.Outlet_Establishment_Year



In [83]:

    
train.shape









    Out[83]:





(8523, 12)



In [84]:

    
# label encoding, do this by combining train and test together

test['Item_Outlet_Sales'] = 0
combi = train.append(test)
number = LabelEncoder()

for i in combi.columns:
    if (combi[i].dtype == 'object'):
        combi[i] = number.fit_transform(combi[i].astype('str'))
        combi[i] = combi[i].astype('object')
        
train = combi[:train.shape[0]]
test = combi[train.shape[0]:]



In [86]:

    
test.head()









    Out[86]:






  
    
      
      Item_Identifier
      Item_Weight
      Item_Fat_Content
      Item_Visibility
      Item_Type
      Item_MRP
      Outlet_Identifier
      Outlet_Establishment_Year
      Outlet_Size
      Outlet_Location_Type
      Outlet_Type
      Item_Outlet_Sales
    
  
  
    
      0
      1114
      20.750
      0
      0.007565
      13
      107.8622
      9
      18
      1
      0
      1
      0.0
    
    
      1
      1078
      8.300
      1
      0.038428
      4
      87.3198
      2
      10
      1
      1
      1
      0.0
    
    
      2
      1420
      14.600
      0
      0.099575
      11
      241.7538
      0
      19
      1
      2
      0
      0.0
    
    
      3
      817
      7.315
      0
      0.015388
      13
      155.0340
      2
      10
      1
      1
      1
      0.0
    
    
      4
      1197
      12.500
      1
      0.118599
      4
      234.2300
      5
      32
      1
      2
      3
      0.0



In [88]:

    
test = test.drop('Item_Outlet_Sales',axis=1)

# remove id and those with more levels
tpot_train = train.drop('Item_Identifier',axis=1)
tpot_test = test.drop('Item_Identifier',axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train = tpot_train.drop('Item_Outlet_Sales',axis=1)



In [98]:

    
# build the model with tpot
# finally building model using tpot library

X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
 train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))   # mean squared error (MSE)
tpot.export('tpot_boston_pipeline.py')









    



/Library/Python/2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Optimization Progress:  46%|████▌     | 138/300 [02:35<01:59,  1.36pipeline/s]





    



Generation 1 - Current best internal CV score: 1156077.3302
Generation 2 - Current best internal CV score: 1156077.3302





    



Optimization Progress:  60%|██████    | 181/300 [03:49<02:55,  1.47s/pipeline]





    



Generation 3 - Current best internal CV score: 1153466.11981





    



Optimization Progress:  77%|███████▋  | 230/300 [04:57<01:09,  1.01pipeline/s]





    



Generation 4 - Current best internal CV score: 1153466.11981





    



Optimization Progress:  93%|█████████▎| 279/300 [06:10<00:22,  1.06s/pipeline]





    



Generation 5 - Current best internal CV score: 1151908.128





    



                                                                              





    




Best pipeline: ExtraTreesRegressor(input_matrix, ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=1.0, ExtraTreesRegressor__min_samples_leaf=13, ExtraTreesRegressor__min_samples_split=20, ExtraTreesRegressor__n_estimators=100)
1210911.02967

	Item_Identifier	Item_Weight	Item_Fat_Content	Item_Visibility	Item_Type	Item_MRP	Outlet_Identifier	Outlet_Establishment_Year	Outlet_Size	Outlet_Location_Type	Outlet_Type	Item_Outlet_Sales
0	FDA15	9.30	Low Fat	0.016047	Dairy	249.8092	OUT049	1999	Medium	Tier 1	Supermarket Type1	3735.1380
1	DRC01	5.92	Regular	0.019278	Soft Drinks	48.2692	OUT018	2009	Medium	Tier 3	Supermarket Type2	443.4228
2	FDN15	17.50	Low Fat	0.016760	Meat	141.6180	OUT049	1999	Medium	Tier 1	Supermarket Type1	2097.2700
3	FDX07	19.20	Regular	0.000000	Fruits and Vegetables	182.0950	OUT010	1998	NaN	Tier 3	Grocery Store	732.3800
4	NCD19	8.93	Low Fat	0.000000	Household	53.8614	OUT013	1987	High	Tier 3	Supermarket Type1	994.7052

	Item_Identifier	Item_Weight	Item_Fat_Content	Item_Visibility	Item_Type	Item_MRP	Outlet_Identifier	Outlet_Establishment_Year	Outlet_Size	Outlet_Location_Type	Outlet_Type
0	1114	20.750	0	0.007565	13	107.8622	9	18	1	0	1
1	1078	8.300	1	0.038428	4	87.3198	2	10	1	1	1
2	1420	14.600	0	0.099575	11	241.7538	0	19	1	2	0
3	817	7.315	0	0.015388	13	155.0340	2	10	1	1	1
4	1197	12.500	1	0.118599	4	234.2300	5	32	1	2	3