Try Featuretools - Automated Basic Feature Engineering


In [82]:
import featuretools as ft
import numpy as np
import pandas as pd

In [83]:
train = pd.read_csv("Big_Mart_Train.csv")
train.head()


Out[83]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 NaN Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052

In [84]:
# Data Preprocessing

train.isnull().sum()


Out[84]:
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [85]:
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))  # impute missing weights with the median

In [86]:
print(train.Outlet_Size.unique())


['Medium' nan 'High' 'Small']

In [87]:
# fill NA with mode
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])

In [88]:
train.dtypes


Out[88]:
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [89]:
print(train.Item_Fat_Content.unique())
print(train.Item_Type.unique())
print(train.Outlet_Identifier.unique())
print(train.Outlet_Size.unique())
print(train.Outlet_Location_Type.unique())
print(train.Outlet_Type.unique())


['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['Medium' 'High' 'Small']
['Tier 1' 'Tier 3' 'Tier 2']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']

In [90]:
# normalize inconsistent Item_Fat_Content labels
train.Item_Fat_Content = train.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
train.Item_Fat_Content = train.Item_Fat_Content.replace(['reg'], ['Regular'])

In [91]:
print(train.Item_Fat_Content.unique())


['Low Fat' 'Regular']
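
The same cleanup can also be written as a single replace with a mapping dict; this is just a more compact equivalent of the two replace() calls above:

# one-step equivalent of the two replace() calls used earlier
fat_map = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train.Item_Fat_Content = train.Item_Fat_Content.replace(fat_map)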

In [92]:
train.head()


Out[92]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 Medium Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052

In [93]:
train2 = train.copy(deep=True)  # deep copy (not a reference): later changes to train won't affect train2

In [94]:
# Featuretools lets you split one dataframe into multiple related tables (entities)
train['id'] = train['Item_Identifier'] + train['Outlet_Identifier']  # composite id, used as the index of the base entity
# train.drop(['Item_Identifier'], axis=1, inplace=True)

## step 1 - create an entity set; it can hold multiple tables and the relationships between them
es = ft.EntitySet(id = 'sales')

## step 2 - create base entity
es.entity_from_dataframe(entity_id = 'bigmart', dataframe = train, index = 'id')

## step 3 - create 2 separate tables (outlet and item)
es.normalize_entity(base_entity_id='bigmart', new_entity_id='outlet', index = 'Outlet_Identifier',
                    additional_variables = ['Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])

es.normalize_entity(base_entity_id='bigmart', new_entity_id='item', index = 'Item_Identifier',
                    additional_variables = ['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP'])


Out[94]:
Entityset: sales
  Entities:
    item [Rows: 1559, Columns: 6]
    bigmart [Rows: 8523, Columns: 4]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier
    bigmart.Item_Identifier -> item.Item_Identifier
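
Before running dfs, the entity set can be sanity-checked. A small sketch, assuming the pre-1.0 Featuretools API used above, where each entity exposes its backing dataframe as .df:

# confirm the shapes reported in the EntitySet summary above
print(es['outlet'].df.shape)   # expected (10, 5): one row per outlet
print(es['item'].df.shape)     # expected (1559, 6): one row per item
print(es['bigmart'].df.shape)  # expected (8523, 4): base entity after normalization
# es.plot()  # optional: draw the entity-relationship diagram (requires graphviz)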

In [96]:
# run deep feature synthesis with 'outlet' as the target entity
feature_matrix, feature_names = ft.dfs(entityset=es,
                                       target_entity = 'outlet',
                                       max_depth = 2,
                                       verbose = 1,
                                       n_jobs = 3)


Built 35 features
EntitySet scattered to workers in 0.184 seconds
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 1/1 chunks
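
ft.dfs also accepts agg_primitives and trans_primitives arguments, so the feature space can be narrowed explicitly instead of relying on the defaults. A minimal sketch using standard aggregation primitive names (a variation on the call above, not something run in this notebook):

# restrict dfs to a few aggregation primitives and no transform primitives
feature_matrix_small, feature_names_small = ft.dfs(entityset=es,
                                                   target_entity='outlet',
                                                   agg_primitives=['mean', 'sum', 'count', 'std'],
                                                   trans_primitives=[],
                                                   max_depth=2,
                                                   verbose=1)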

In [97]:
feature_matrix.head()


Out[97]:
Outlet_Type Outlet_Establishment_Year Outlet_Size Outlet_Location_Type MODE(bigmart.Item_Identifier) COUNT(bigmart) SKEW(bigmart.Item_Outlet_Sales) STD(bigmart.Item_Outlet_Sales) SUM(bigmart.Item_Outlet_Sales) MEAN(bigmart.Item_Outlet_Sales) ... MAX(bigmart.item.Item_MRP) NUM_UNIQUE(bigmart.item.Item_Fat_Content) STD(bigmart.item.Item_Weight) SUM(bigmart.item.Item_Visibility) MAX(bigmart.item.Item_Weight) MEAN(bigmart.item.Item_MRP) MAX(bigmart.item.Item_Visibility) SUM(bigmart.item.Item_MRP) SKEW(bigmart.item.Item_Visibility) MODE(bigmart.item.Item_Type)
Outlet_Identifier
OUT010 Grocery Store 1998 Medium Tier 3 DRA12 555 1.723945 271.014855 1.883402e+05 339.351662 ... 266.6884 2 4.638683 56.308832 21.35 140.777594 0.311090 78131.5646 0.740390 Fruits and Vegetables
OUT013 Supermarket Type1 1987 High Tier 3 DRA12 932 1.020214 1533.531664 2.142664e+06 2298.995256 ... 266.6884 2 4.666798 70.940350 21.35 141.419974 0.311090 131803.4156 1.182016 Fruits and Vegetables
OUT017 Supermarket Type1 2007 Medium Tier 2 DRA12 926 1.179987 1550.778025 2.167465e+06 2340.675263 ... 266.6884 2 4.579025 69.934853 21.25 139.301789 0.309390 128993.4564 1.122480 Snack Foods
OUT018 Supermarket Type2 2009 Medium Tier 3 DRA12 928 1.079195 1375.932889 1.851823e+06 1995.498739 ... 265.6884 2 4.689009 72.596562 21.35 141.578634 0.311090 131384.9724 1.062392 Fruits and Vegetables
OUT019 Grocery Store 1985 Small Tier 1 DRA24 528 1.413909 249.979449 1.796941e+05 340.329723 ... 266.6884 2 4.530153 40.388090 21.35 139.748073 0.291865 73786.9824 0.916637 Snack Foods

5 rows × 35 columns


In [98]:
# broadcast the outlet-level features back to one row per training record (ordered like train)
feature_matrix = feature_matrix.reindex(index=train['Outlet_Identifier'])
feature_matrix = feature_matrix.reset_index()
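
After this reindex, feature_matrix has one row per training record, in the same row order as train. A possible downstream step (not part of the original notebook) is to re-attach the target positionally before modeling:

# hypothetical modeling prep: align the target with the generated features by position
model_df = feature_matrix.copy()
model_df['Item_Outlet_Sales'] = train['Item_Outlet_Sales'].values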

In [37]:
feature_matrix.head()


Out[37]:
Outlet_Identifier Outlet_Type Outlet_Establishment_Year Outlet_Size Outlet_Location_Type SKEW(bigmart.Item_Visibility) COUNT(bigmart) STD(bigmart.Item_Visibility) SUM(bigmart.Item_Visibility) MEAN(bigmart.Item_Visibility) ... STD(bigmart.Item_Weight) MAX(bigmart.Item_Weight) NUM_UNIQUE(bigmart.Item_Type) SUM(bigmart.Item_Outlet_Sales) MEAN(bigmart.Item_Weight) MAX(bigmart.Item_MRP) SUM(bigmart.Item_MRP) MAX(bigmart.Item_Outlet_Sales) MIN(bigmart.Item_Outlet_Sales) SUM(bigmart.Item_Weight)
0 OUT049 Supermarket Type1 1999 Medium Tier 1 0.801043 930 0.044602 56.549156 0.060806 ... 4.617003 21.25 16 2.183970e+06 12.917446 266.2884 130476.8598 7646.0472 111.8544 12013.225
1 OUT018 Supermarket Type2 2009 Medium Tier 3 0.751015 928 0.045386 56.621454 0.061014 ... 4.689009 21.35 16 1.851823e+06 12.873346 265.1884 131477.7724 6768.5228 69.2432 11946.465
2 OUT049 Supermarket Type1 1999 Medium Tier 1 0.801043 930 0.044602 56.549156 0.060806 ... 4.617003 21.25 16 2.183970e+06 12.917446 266.2884 130476.8598 7646.0472 111.8544 12013.225
3 OUT010 Grocery Store 1998 Medium Tier 3 0.740390 555 0.072047 56.308832 0.101457 ... 4.638683 21.35 16 1.883402e+05 12.913153 266.6884 78131.5646 1775.6886 33.2900 7166.800
4 OUT013 Supermarket Type1 1987 High Tier 3 0.782251 932 0.044235 55.879859 0.059957 ... 4.666798 21.35 16 2.142664e+06 13.006148 266.6884 131809.0156 10256.6490 73.2380 12121.730

5 rows × 34 columns


In [99]:
feature_matrix.columns


Out[99]:
Index([                        u'Outlet_Identifier',
                                     u'Outlet_Type',
                       u'Outlet_Establishment_Year',
                                     u'Outlet_Size',
                            u'Outlet_Location_Type',
                   u'MODE(bigmart.Item_Identifier)',
                                  u'COUNT(bigmart)',
                 u'SKEW(bigmart.Item_Outlet_Sales)',
                  u'STD(bigmart.Item_Outlet_Sales)',
                  u'SUM(bigmart.Item_Outlet_Sales)',
                 u'MEAN(bigmart.Item_Outlet_Sales)',
                  u'MAX(bigmart.Item_Outlet_Sales)',
                  u'MIN(bigmart.Item_Outlet_Sales)',
             u'NUM_UNIQUE(bigmart.Item_Identifier)',
                  u'MEAN(bigmart.item.Item_Weight)',
                     u'SKEW(bigmart.item.Item_MRP)',
              u'MEAN(bigmart.item.Item_Visibility)',
                   u'SUM(bigmart.item.Item_Weight)',
               u'MIN(bigmart.item.Item_Visibility)',
                      u'STD(bigmart.item.Item_MRP)',
                      u'MIN(bigmart.item.Item_MRP)',
              u'NUM_UNIQUE(bigmart.item.Item_Type)',
                  u'SKEW(bigmart.item.Item_Weight)',
               u'STD(bigmart.item.Item_Visibility)',
                   u'MIN(bigmart.item.Item_Weight)',
             u'MODE(bigmart.item.Item_Fat_Content)',
                      u'MAX(bigmart.item.Item_MRP)',
       u'NUM_UNIQUE(bigmart.item.Item_Fat_Content)',
                   u'STD(bigmart.item.Item_Weight)',
               u'SUM(bigmart.item.Item_Visibility)',
                   u'MAX(bigmart.item.Item_Weight)',
                     u'MEAN(bigmart.item.Item_MRP)',
               u'MAX(bigmart.item.Item_Visibility)',
                      u'SUM(bigmart.item.Item_MRP)',
              u'SKEW(bigmart.item.Item_Visibility)',
                    u'MODE(bigmart.item.Item_Type)'],
      dtype='object')

Summary

  • When "outlet" is chosen as the target_entity, the columns moved into it via additional_variables do not go through feature engineering; they come through as-is, while all remaining columns in the base entity get aggregated.
    • In other words, additional_variables marks which columns belong to the target entity itself and therefore skip the aggregation step.
  • For some reason, using "item" as the target_entity consistently raised errors, so only the outlet-level run is shown here.
  • max_depth = 2 is enough for the columns here; a higher depth only pays off when the depth-2 features can themselves be aggregated further (see the sketch after this list).
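
To illustrate the last point, here is a hypothetical deeper run; with this flat dataset it mainly adds stacked aggregates (e.g. per-outlet statistics of each item's own row count), so max_depth = 2 remains the practical choice here:

# hypothetical: same entity set, one level deeper; depth-3 features stack aggregations,
# e.g. the mean over an outlet's rows of each item's total row count in the table
feature_matrix_d3, feature_names_d3 = ft.dfs(entityset=es,
                                             target_entity='outlet',
                                             max_depth=3,
                                             verbose=1,
                                             n_jobs=3)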