Try Featuretools - Automated Basic Feature Engineering


In [82]:
import featuretools as ft
import numpy as np
import pandas as pd

In [83]:
train = pd.read_csv("Big_Mart_Train.csv")
train.head()


Out[83]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 NaN Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052

In [84]:
# Data Preprocessing

train.isnull().sum()


Out[84]:
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [85]:
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))  # impute missing weights with the median

In [86]:
print(train.Outlet_Size.unique())


['Medium' nan 'High' 'Small']

In [87]:
# fill NA with mode
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])

In [88]:
train.dtypes


Out[88]:
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [89]:
print(train.Item_Fat_Content.unique())
print(train.Item_Type.unique())
print(train.Outlet_Identifier.unique())
print(train.Outlet_Size.unique())
print(train.Outlet_Location_Type.unique())
print(train.Outlet_Type.unique())


['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['Medium' 'High' 'Small']
['Tier 1' 'Tier 3' 'Tier 2']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']

In [90]:
# normalize inconsistent Item_Fat_Content labels
train.Item_Fat_Content = train.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
train.Item_Fat_Content = train.Item_Fat_Content.replace(['reg'], ['Regular'])

In [91]:
print(train.Item_Fat_Content.unique())


['Low Fat' 'Regular']
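
The same cleanup can also be written as a single replace with a mapping dict; this is just a more compact equivalent of the two replace() calls above:

# one-step equivalent of the two replace() calls used earlier
fat_map = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train.Item_Fat_Content = train.Item_Fat_Content.replace(fat_map)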

In [92]:
train.head()


Out[92]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 Medium Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052

In [93]:
train2 = train.copy(deep=True)  # deep copy (not a reference): later changes to train won't affect train2

In [94]:
# Featuretools lets you split one dataframe into multiple related tables (entities)
train['id'] = train['Item_Identifier'] + train['Outlet_Identifier']  # composite id, used as the index of the base entity
# train.drop(['Item_Identifier'], axis=1, inplace=True)

## step 1 - create an entity set; it can hold multiple tables and the relationships between them
es = ft.EntitySet(id = 'sales')

## step 2 - create base entity
es.entity_from_dataframe(entity_id = 'bigmart', dataframe = train, index = 'id')

## step 3 - create 2 separate tables (outlet and item)
es.normalize_entity(base_entity_id='bigmart', new_entity_id='outlet', index = 'Outlet_Identifier',
                    additional_variables = ['Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])

es.normalize_entity(base_entity_id='bigmart', new_entity_id='item', index = 'Item_Identifier',
                    additional_variables = ['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP'])


Out[94]:
Entityset: sales
  Entities:
    item [Rows: 1559, Columns: 6]
    bigmart [Rows: 8523, Columns: 4]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier
    bigmart.Item_Identifier -> item.Item_Identifier
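
Before running dfs, the entity set can be sanity-checked. A small sketch, assuming the pre-1.0 Featuretools API used above, where each entity exposes its backing dataframe as .df:

# confirm the shapes reported in the EntitySet summary above
print(es['outlet'].df.shape)   # expected (10, 5): one row per outlet
print(es['item'].df.shape)     # expected (1559, 6): one row per item
print(es['bigmart'].df.shape)  # expected (8523, 4): base entity after normalization
# es.plot()  # optional: draw the entity-relationship diagram (requires graphviz)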

In [96]:
# run deep feature synthesis with 'outlet' as the target entity
feature_matrix, feature_names = ft.dfs(entityset=es,
                                       target_entity = 'outlet',
                                       max_depth = 2,
                                       verbose = 1,
                                       n_jobs = 3)


Built 35 features
EntitySet scattered to workers in 0.184 seconds
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 1/1 chunks
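
ft.dfs also accepts agg_primitives and trans_primitives arguments, so the feature space can be narrowed explicitly instead of relying on the defaults. A minimal sketch using standard aggregation primitive names (a variation on the call above, not something run in this notebook):

# restrict dfs to a few aggregation primitives and no transform primitives
feature_matrix_small, feature_names_small = ft.dfs(entityset=es,
                                                   target_entity='outlet',
                                                   agg_primitives=['mean', 'sum', 'count', 'std'],
                                                   trans_primitives=[],
                                                   max_depth=2,
                                                   verbose=1)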

In [97]:
feature_matrix.head()


Out[97]:
Outlet_Type Outlet_Establishment_Year Outlet_Size Outlet_Location_Type MODE(bigmart.Item_Identifier) COUNT(bigmart) SKEW(bigmart.Item_Outlet_Sales) STD(bigmart.Item_Outlet_Sales) SUM(bigmart.Item_Outlet_Sales) MEAN(bigmart.Item_Outlet_Sales) ... MAX(bigmart.item.Item_MRP) NUM_UNIQUE(bigmart.item.Item_Fat_Content) STD(bigmart.item.Item_Weight) SUM(bigmart.item.Item_Visibility) MAX(bigmart.item.Item_Weight) MEAN(bigmart.item.Item_MRP) MAX(bigmart.item.Item_Visibility) SUM(bigmart.item.Item_MRP) SKEW(bigmart.item.Item_Visibility) MODE(bigmart.item.Item_Type)
Outlet_Identifier
OUT010 Grocery Store 1998 Medium Tier 3 DRA12 555 1.723945 271.014855 1.883402e+05 339.351662 ... 266.6884 2 4.638683 56.308832 21.35 140.777594 0.311090 78131.5646 0.740390 Fruits and Vegetables
OUT013 Supermarket Type1 1987 High Tier 3 DRA12 932 1.020214 1533.531664 2.142664e+06 2298.995256 ... 266.6884 2 4.666798 70.940350 21.35 141.419974 0.311090 131803.4156 1.182016 Fruits and Vegetables
OUT017 Supermarket Type1 2007 Medium Tier 2 DRA12 926 1.179987 1550.778025 2.167465e+06 2340.675263 ... 266.6884 2 4.579025 69.934853 21.25 139.301789 0.309390 128993.4564 1.122480 Snack Foods
OUT018 Supermarket Type2 2009 Medium Tier 3 DRA12 928 1.079195 1375.932889 1.851823e+06 1995.498739 ... 265.6884 2 4.689009 72.596562 21.35 141.578634 0.311090 131384.9724 1.062392 Fruits and Vegetables
OUT019 Grocery Store 1985 Small Tier 1 DRA24 528 1.413909 249.979449 1.796941e+05 340.329723 ... 266.6884 2 4.530153 40.388090 21.35 139.748073 0.291865 73786.9824 0.916637 Snack Foods

5 rows × 35 columns


In [98]:
# broadcast the outlet-level features back to one row per training record (ordered like train)
feature_matrix = feature_matrix.reindex(index=train['Outlet_Identifier'])
feature_matrix = feature_matrix.reset_index()
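
After this reindex, feature_matrix has one row per training record, in the same row order as train. A possible downstream step (not part of the original notebook) is to re-attach the target positionally before modeling:

# hypothetical modeling prep: align the target with the generated features by position
model_df = feature_matrix.copy()
model_df['Item_Outlet_Sales'] = train['Item_Outlet_Sales'].values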

In [37]:
feature_matrix.head()


Out[37]:
Outlet_Identifier Outlet_Type Outlet_Establishment_Year Outlet_Size Outlet_Location_Type SKEW(bigmart.Item_Visibility) COUNT(bigmart) STD(bigmart.Item_Visibility) SUM(bigmart.Item_Visibility) MEAN(bigmart.Item_Visibility) ... STD(bigmart.Item_Weight) MAX(bigmart.Item_Weight) NUM_UNIQUE(bigmart.Item_Type) SUM(bigmart.Item_Outlet_Sales) MEAN(bigmart.Item_Weight) MAX(bigmart.Item_MRP) SUM(bigmart.Item_MRP) MAX(bigmart.Item_Outlet_Sales) MIN(bigmart.Item_Outlet_Sales) SUM(bigmart.Item_Weight)
0 OUT049 Supermarket Type1 1999 Medium Tier 1 0.801043 930 0.044602 56.549156 0.060806 ... 4.617003 21.25 16 2.183970e+06 12.917446 266.2884 130476.8598 7646.0472 111.8544 12013.225
1 OUT018 Supermarket Type2 2009 Medium Tier 3 0.751015 928 0.045386 56.621454 0.061014 ... 4.689009 21.35 16 1.851823e+06 12.873346 265.1884 131477.7724 6768.5228 69.2432 11946.465
2 OUT049 Supermarket Type1 1999 Medium Tier 1 0.801043 930 0.044602 56.549156 0.060806 ... 4.617003 21.25 16 2.183970e+06 12.917446 266.2884 130476.8598 7646.0472 111.8544 12013.225
3 OUT010 Grocery Store 1998 Medium Tier 3 0.740390 555 0.072047 56.308832 0.101457 ... 4.638683 21.35 16 1.883402e+05 12.913153 266.6884 78131.5646 1775.6886 33.2900 7166.800
4 OUT013 Supermarket Type1 1987 High Tier 3 0.782251 932 0.044235 55.879859 0.059957 ... 4.666798 21.35 16 2.142664e+06 13.006148 266.6884 131809.0156 10256.6490 73.2380 12121.730

5 rows × 34 columns


In [99]:
feature_matrix.columns


Out[99]:
Index([                        u'Outlet_Identifier',
                                     u'Outlet_Type',
                       u'Outlet_Establishment_Year',
                                     u'Outlet_Size',
                            u'Outlet_Location_Type',
                   u'MODE(bigmart.Item_Identifier)',
                                  u'COUNT(bigmart)',
                 u'SKEW(bigmart.Item_Outlet_Sales)',
                  u'STD(bigmart.Item_Outlet_Sales)',
                  u'SUM(bigmart.Item_Outlet_Sales)',
                 u'MEAN(bigmart.Item_Outlet_Sales)',
                  u'MAX(bigmart.Item_Outlet_Sales)',
                  u'MIN(bigmart.Item_Outlet_Sales)',
             u'NUM_UNIQUE(bigmart.Item_Identifier)',
                  u'MEAN(bigmart.item.Item_Weight)',
                     u'SKEW(bigmart.item.Item_MRP)',
              u'MEAN(bigmart.item.Item_Visibility)',
                   u'SUM(bigmart.item.Item_Weight)',
               u'MIN(bigmart.item.Item_Visibility)',
                      u'STD(bigmart.item.Item_MRP)',
                      u'MIN(bigmart.item.Item_MRP)',
              u'NUM_UNIQUE(bigmart.item.Item_Type)',
                  u'SKEW(bigmart.item.Item_Weight)',
               u'STD(bigmart.item.Item_Visibility)',
                   u'MIN(bigmart.item.Item_Weight)',
             u'MODE(bigmart.item.Item_Fat_Content)',
                      u'MAX(bigmart.item.Item_MRP)',
       u'NUM_UNIQUE(bigmart.item.Item_Fat_Content)',
                   u'STD(bigmart.item.Item_Weight)',
               u'SUM(bigmart.item.Item_Visibility)',
                   u'MAX(bigmart.item.Item_Weight)',
                     u'MEAN(bigmart.item.Item_MRP)',
               u'MAX(bigmart.item.Item_Visibility)',
                      u'SUM(bigmart.item.Item_MRP)',
              u'SKEW(bigmart.item.Item_Visibility)',
                    u'MODE(bigmart.item.Item_Type)'],
      dtype='object')

Summary

  • When "outlet" is chosen as the target_entity, the columns moved into it via additional_variables do not go through feature engineering; they come through as-is, while all remaining columns in the base entity get aggregated.
    • In other words, additional_variables marks which columns belong to the target entity itself and therefore skip the aggregation step.
  • For some reason, using "item" as the target_entity consistently raised errors, so only the outlet-level run is shown here.
  • max_depth = 2 is enough for the columns here; a higher depth only pays off when the depth-2 features can themselves be aggregated further (see the sketch after this list).
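
To illustrate the last point, here is a hypothetical deeper run; with this flat dataset it mainly adds stacked aggregates (e.g. per-outlet statistics of each item's own row count), so max_depth = 2 remains the practical choice here:

# hypothetical: same entity set, one level deeper; depth-3 features stack aggregations,
# e.g. the mean over an outlet's rows of each item's total row count in the table
feature_matrix_d3, feature_names_d3 = ft.dfs(entityset=es,
                                             target_entity='outlet',
                                             max_depth=3,
                                             verbose=1,
                                             n_jobs=3)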