In [54]:

    
from IPython.display import SVG

Featuretools

a python library/ framework for automated feature engineering
based on "Deep Feature Synthesis" paper/ research
by Featurelabs https://www.featurelabs.com/
Website: https://www.featuretools.com/
Documentation: https://docs.featuretools.com/
Source code: https://github.com/Featuretools/featuretools
Examples: https://github.com/Featuretools/

Deep Feature Synthesis

Paper: http://www.jmaxkanter.com/static/papers/DSAA_DSM_2015.pdf
Article: https://www.featurelabs.com/blog/deep-feature-synthesis/
DFS works with the structured transactional and relational datasets
Across datasets features are derived by using primitive mathematical operations
New features are composed from using derived features (hence "Deep")

DFS example



In [56]:

    
Image(url= "../img/max-order-size.svg", width=600, height=600)









    Out[56]:

1. Working with data



In [17]:

    
import numpy as np
import pandas as pd
import featuretools as ft

from featuretools.primitives import *



In [20]:

    
# data is taken from kaggle.com/c/talkingdata-adtracking-fraud-detection
input_file = '../data/train_sample.csv'
pd.read_csv(input_file).head()









    Out[20]:







  
    
      
      ip
      app
      device
      os
      channel
      click_time
      attributed_time
      is_attributed
    
  
  
    
      0
      87540
      12
      1
      13
      497
      2017-11-07 09:30:38
      NaN
      0
    
    
      1
      105560
      25
      1
      17
      259
      2017-11-07 13:40:27
      NaN
      0
    
    
      2
      101424
      12
      1
      19
      212
      2017-11-07 18:05:24
      NaN
      0
    
    
      3
      94584
      13
      1
      13
      477
      2017-11-07 04:58:08
      NaN
      0
    
    
      4
      68413
      12
      1
      1
      178
      2017-11-09 09:00:09
      NaN
      0

Load typed data



In [21]:

    
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}
to_read = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'click_time']
to_parse = ['click_time']
df_train = pd.read_csv(input_file, usecols=to_read, dtype=dtypes, parse_dates=to_parse)
df_train['id'] = range(len(df_train))

Create an EntitySet



In [24]:

    
es = ft.EntitySet(id='clicks')

Create and add an entity



In [26]:

    
es = es.entity_from_dataframe(
    entity_id='clicks',
    dataframe=df_train,
    index='id',
    time_index='click_time',
    variable_types={
        'ip': ft.variable_types.Categorical,
        'app': ft.variable_types.Categorical,
        'device': ft.variable_types.Categorical,
        'os': ft.variable_types.Categorical,
        'channel': ft.variable_types.Categorical,
        'is_attributed': ft.variable_types.Boolean,
    }
)



In [28]:

    
es









    Out[28]:





Entityset: clicks
  Entities:
    clicks (shape = [100000, 8])
  Relationships:
    No relationships



In [27]:

    
es["clicks"].variables









    Out[27]:





[<Variable: click_time (dtype: datetime_time_index, format: None)>,
 <Variable: ip (dtype = categorical, count = 100000)>,
 <Variable: app (dtype = categorical, count = 100000)>,
 <Variable: device (dtype = categorical, count = 100000)>,
 <Variable: os (dtype = categorical, count = 100000)>,
 <Variable: channel (dtype = categorical, count = 100000)>,
 <Variable: is_attributed (dtype = boolean, count = 100000)>,
 <Variable: id (dtype = index, count = 100000)>]

Create another entity with relationship to the "clicks" table



In [35]:

    
'''
es = es.entity_from_dataframe(
    entity_id='users',
    dataframe=df_users,
    index='id')

new_relationship = ft.Relationship(
    es["users"]["id"],
    es["clicks"]["user_id"])

es = es.add_relationship(new_relationship)
'''









    Out[35]:





'\nes = es.entity_from_dataframe(\n    entity_id=\'users\',\n    dataframe=df_users,\n    index=\'id\')\n\nnew_relationship = ft.Relationship(\n    es["users"]["id"],\n    es["clicks"]["user_id"])\n\nes = es.add_relationship(new_relationship)\n'

Create an entity from the existing table



In [29]:

    
es = es.normalize_entity(
    base_entity_id='clicks', 
    new_entity_id='apps', 
    index='app', 
    make_time_index=False)

Creates a new "apps" entity based on data from "clicks" entity
Creates a new relationship between "apps" and "clicks" and adds it to the EntitySet



In [30]:

    
es['clicks'].variables









    Out[30]:





[<Variable: click_time (dtype: datetime_time_index, format: None)>,
 <Variable: ip (dtype = categorical, count = 100000)>,
 <Variable: app (dtype = id, count = 100000)>,
 <Variable: device (dtype = categorical, count = 100000)>,
 <Variable: os (dtype = categorical, count = 100000)>,
 <Variable: channel (dtype = categorical, count = 100000)>,
 <Variable: is_attributed (dtype = boolean, count = 100000)>,
 <Variable: id (dtype = index, count = 100000)>]



In [31]:

    
es['apps'].variables









    Out[31]:





[<Variable: app (dtype = index, count = 161)>]



In [32]:

    
es









    Out[32]:





Entityset: clicks
  Entities:
    clicks (shape = [100000, 8])
    apps (shape = [161, 1])
  Relationships:
    clicks.app -> apps.app

2. Create features



In [44]:

    
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity="apps", max_depth=3)



In [45]:

    
feature_defs









    Out[45]:





[<Feature: COUNT(clicks)>,
 <Feature: PERCENT_TRUE(clicks.is_attributed)>,
 <Feature: NUM_UNIQUE(clicks.ip)>,
 <Feature: NUM_UNIQUE(clicks.device)>,
 <Feature: NUM_UNIQUE(clicks.os)>,
 <Feature: NUM_UNIQUE(clicks.channel)>,
 <Feature: MODE(clicks.ip)>,
 <Feature: MODE(clicks.device)>,
 <Feature: MODE(clicks.os)>,
 <Feature: MODE(clicks.channel)>,
 <Feature: NUM_UNIQUE(clicks.DAY(click_time))>,
 <Feature: NUM_UNIQUE(clicks.YEAR(click_time))>,
 <Feature: NUM_UNIQUE(clicks.MONTH(click_time))>,
 <Feature: NUM_UNIQUE(clicks.WEEKDAY(click_time))>,
 <Feature: MODE(clicks.DAY(click_time))>,
 <Feature: MODE(clicks.YEAR(click_time))>,
 <Feature: MODE(clicks.MONTH(click_time))>,
 <Feature: MODE(clicks.WEEKDAY(click_time))>]



In [43]:

    
feature_matrix.head()









    Out[43]:







  
    
      
      COUNT(clicks)
      PERCENT_TRUE(clicks.is_attributed)
      NUM_UNIQUE(clicks.ip)
      NUM_UNIQUE(clicks.device)
      NUM_UNIQUE(clicks.os)
      NUM_UNIQUE(clicks.channel)
      MODE(clicks.ip)
      MODE(clicks.device)
      MODE(clicks.os)
      MODE(clicks.channel)
      NUM_UNIQUE(clicks.DAY(click_time))
      NUM_UNIQUE(clicks.YEAR(click_time))
      NUM_UNIQUE(clicks.MONTH(click_time))
      NUM_UNIQUE(clicks.WEEKDAY(click_time))
      MODE(clicks.DAY(click_time))
      MODE(clicks.YEAR(click_time))
      MODE(clicks.MONTH(click_time))
      MODE(clicks.WEEKDAY(click_time))
    
    
      app
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      3135
      0.000000
      2723
      4
      56
      27
      5348
      1
      19
      134
      4
      1
      1
      4
      7
      2017
      11
      1
    
    
      2
      11737
      0.000000
      7759
      3
      75
      21
      5314
      1
      19
      477
      4
      1
      1
      4
      8
      2017
      11
      2
    
    
      3
      18279
      0.000219
      12040
      4
      78
      32
      5348
      1
      19
      280
      4
      1
      1
      4
      8
      2017
      11
      2
    
    
      4
      58
      0.000000
      56
      2
      20
      1
      79881
      1
      19
      101
      3
      1
      1
      3
      9
      2017
      11
      3
    
    
      5
      188
      0.074468
      187
      2
      36
      4
      26995
      1
      19
      377
      4
      1
      1
      4
      7
      2017
      11
      1

3. Feature primitives

The units/ building blocks of Featuretools
Computations applied to raw datasets to create new features
Constrains the input and output data types
Two types of primitives: aggregation and transform

Aggregation vs Transform Primitive

Aggregation primitives: These primitives take related instances as an input and output a single value. They are applied across a parent-child relationship in an entity set. E.g: Count, Sum, AvgTimeBetween.

Transform primitives: These primitives take one or more variables from an entity as an input and output a new variable for that entity. They are applied to a single entity. E.g: Hour, TimeSincePrevious, Absolute.

Custom primitives: You can define your own aggregation and transform primitives



In [61]:

    
class Sum(AggregationPrimitive):
    """Counts the number of elements of a numeric or boolean feature"""
    name = "sum"
    input_types = [Numeric]
    return_type = Numeric
    stack_on_self = False
    stack_on_exclude = [Count]

    # todo: handle count nulls
    def get_function(self):
        def sum_func(x):
            return np.nan_to_num(x.values).sum(dtype=np.float)
        return sum_func



In [ ]:

    
class Day(DatetimeUnitBasePrimitive):
    """Transform a Datetime feature into the day"""
    name = "day"

class DatetimeUnitBasePrimitive(TransformPrimitive):
    """Transform Datetime feature into time or calendar units
     (second/day/week/etc)"""
    name = None
    input_types = [Datetime]
    return_type = Ordinal

    def get_function(self):
        return lambda array: pd_time_unit(self.name)(pd.DatetimeIndex(array))

4. Handling time

Designed to take time into consideration
Entities have a column (time index) that specifies the point in time when data in that row became available
Cutoff Time specifies the time to calculate features. Only data prior to this time will be used.
Training window specifies the time to calculate features. Only data after this time will be used.



In [ ]:

	ip	app	device	os	channel	click_time	attributed_time
0	87540	12	1	13	497	2017-11-07 09:30:38	NaN
1	105560	25	1	17	259	2017-11-07 13:40:27	NaN
2	101424	12	1	19	212	2017-11-07 18:05:24	NaN
3	94584	13	1	13	477	2017-11-07 04:58:08	NaN
4	68413	12	1	1	178	2017-11-09 09:00:09	NaN

	COUNT(clicks)	PERCENT_TRUE(clicks.is_attributed)	NUM_UNIQUE(clicks.ip)	NUM_UNIQUE(clicks.device)	NUM_UNIQUE(clicks.os)	NUM_UNIQUE(clicks.channel)	MODE(clicks.ip)	MODE(clicks.device)	MODE(clicks.os)	MODE(clicks.channel)	NUM_UNIQUE(clicks.DAY(click_time))	NUM_UNIQUE(clicks.YEAR(click_time))	NUM_UNIQUE(clicks.MONTH(click_time))	NUM_UNIQUE(clicks.WEEKDAY(click_time))	MODE(clicks.DAY(click_time))	MODE(clicks.YEAR(click_time))	MODE(clicks.MONTH(click_time))	MODE(clicks.WEEKDAY(click_time))
app
1	3135	0.000000	2723	4	56	27	5348	1	19	134	4	1	1	4	7	2017	11	1
2	11737	0.000000	7759	3	75	21	5314	1	19	477	4	1	1	4	8	2017	11	2
3	18279	0.000219	12040	4	78	32	5348	1	19	280	4	1	1	4	8	2017	11	2
4	58	0.000000	56	2	20	1	79881	1	19	101	3	1	1	3	9	2017	11	3
5	188	0.074468	187	2	36	4	26995	1	19	377	4	1	1	4	7	2017	11	1