In [82]:
import featuretools as ft
import numpy as np
import pandas as pd
In [83]:
train = pd.read_csv("Big_Mart_Train.csv")
train.head()
Out[83]:
In [84]:
# Data Preprocessing
train.isnull().sum()
Out[84]:
In [85]:
# fill missing weights with the median (np.nanmedian ignores the NaNs being filled)
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))
In [86]:
print(train.Outlet_Size.unique())
In [87]:
# fill NA with mode
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])
In [88]:
train.dtypes
Out[88]:
In [89]:
print(train.Item_Fat_Content.unique())
print(train.Item_Type.unique())
print(train.Outlet_Identifier.unique())
print(train.Outlet_Size.unique())
print(train.Outlet_Location_Type.unique())
print(train.Outlet_Type.unique())
In [90]:
# normalize the inconsistent category labels into two values
train.Item_Fat_Content = train.Item_Fat_Content.replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'})
In [91]:
print(train.Item_Fat_Content.unique())
In [92]:
train.head()
Out[92]:
In [93]:
train2 = train.copy(deep=True) # deep copy (not a reference), so later changes to train won't touch this snapshot
In [94]:
# Featuretools lets you split one dataframe into multiple related tables
train['id'] = train['Item_Identifier'] + train['Outlet_Identifier'] # unique row id, used as the index
# train.drop(['Item_Identifier'], axis=1, inplace=True)
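# quick sanity check (an assumption worth verifying: each Item/Outlet pair
# should occur only once, so 'id' can serve as a unique index)
print(train['id'].is_unique)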
## step 1 - create an entity set; it can hold multiple tables and the relationships between them
es = ft.EntitySet(id = 'sales')
## step 2 - create base entity
es.entity_from_dataframe(entity_id = 'bigmart', dataframe = train, index = 'id')
## step 3 - normalize out 2 separate tables
es.normalize_entity(base_entity_id='bigmart', new_entity_id='outlet', index = 'Outlet_Identifier',
additional_variables = ['Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])
es.normalize_entity(base_entity_id='bigmart', new_entity_id='item', index = 'Item_Identifier',
additional_variables = ['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP'])
Out[94]:
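At this point it helps to inspect what the entity set looks like; printing it lists the entities and the relationships that normalize_entity created (a quick check on the same es built above, not a required step):

print(es)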
In [96]:
feature_matrix, feature_names = ft.dfs(entityset=es,
target_entity = 'outlet',
max_depth = 2,
verbose = 1,
n_jobs = 3)
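ft.dfs returns the feature matrix together with a list of feature definitions; peeking at a few of them shows which aggregations were stacked at each depth (purely for inspection, the slice size is arbitrary):

feature_names[:10]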
In [97]:
feature_matrix.head()
Out[97]:
In [98]:
# align the outlet-level features back onto the training rows via Outlet_Identifier
feature_matrix = feature_matrix.reindex(index=train['Outlet_Identifier'])
feature_matrix = feature_matrix.reset_index()
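As a minimal sanity check (assuming the goal is one feature row per training row), the reindexed matrix should now line up with train row for row:

assert len(feature_matrix) == len(train)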
In [37]:
feature_matrix.head()
Out[37]:
In [99]:
feature_matrix.columns
Out[99]:
A max_depth of 2 is enough for the columns here; if the features generated at depth 2 could themselves be aggregated or transformed further, a higher depth would be worth trying.
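As a sketch of that, rerunning dfs with a larger max_depth is a one-argument change (the depth of 3 and the variable names here are illustrative, not tuned; deeper runs take noticeably longer):

feature_matrix_deep, feature_names_deep = ft.dfs(entityset=es,
                                                 target_entity = 'outlet',
                                                 max_depth = 3,
                                                 verbose = 1,
                                                 n_jobs = 3)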