Featuretools by example


In [5]:
import numpy as np
import pandas as pd
import featuretools as ft

from featuretools.primitives import *

In [6]:
# data is taken from kaggle.com/c/talkingdata-adtracking-fraud-detection
input_file = '../data/train_sample.csv'
pd.read_csv(input_file).head()


Out[6]:
ip app device os channel click_time attributed_time is_attributed
0 87540 12 1 13 497 2017-11-07 09:30:38 NaN 0
1 105560 25 1 17 259 2017-11-07 13:40:27 NaN 0
2 101424 12 1 19 212 2017-11-07 18:05:24 NaN 0
3 94584 13 1 13 477 2017-11-07 04:58:08 NaN 0
4 68413 12 1 1 178 2017-11-09 09:00:09 NaN 0

In [7]:
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}
to_read = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'click_time']
to_parse = ['click_time']

In [33]:
df_train = pd.read_csv(input_file, usecols=to_read, dtype=dtypes, parse_dates=to_parse)
df_train.sort_values(by='click_time')
df_train['id'] = range(len(df_train))
df_train.head()


Out[33]:
ip app device os channel click_time is_attributed id
0 87540 12 1 13 497 2017-11-07 09:30:38 0 0
1 105560 25 1 17 259 2017-11-07 13:40:27 0 1
2 101424 12 1 19 212 2017-11-07 18:05:24 0 2
3 94584 13 1 13 477 2017-11-07 04:58:08 0 3
4 68413 12 1 1 178 2017-11-09 09:00:09 0 4

In [34]:
es = ft.EntitySet(id='clicks')

es = es.entity_from_dataframe(
    entity_id='clicks',
    dataframe=df_train,
    index='id',
    time_index='click_time',
    variable_types={
        'ip': ft.variable_types.Categorical,
        'app': ft.variable_types.Categorical,
        'device': ft.variable_types.Categorical,
        'os': ft.variable_types.Categorical,
        'channel': ft.variable_types.Categorical,
        'is_attributed': ft.variable_types.Boolean,
    }
)

es = es.normalize_entity(base_entity_id='clicks', new_entity_id='apps', index='app', make_time_index=False)

In [35]:
es


Out[35]:
Entityset: clicks
  Entities:
    clicks (shape = [100000, 8])
    apps (shape = [161, 1])
  Relationships:
    clicks.app -> apps.app

In [36]:
# Hint Featuretools: creates features for the particular value
es['clicks']['is_attributed'].interesting_values = [True]

In [45]:
# Manually defined domain specific feature: "Seed Features"
google_apps = ft.Feature(es['apps']['app']).isin([1,2,3,4,5])

In [56]:
X = df_train.copy()
X_features = X[X['id'] < 70000]
X_train = X[X['id'] >= 70000]
cutoff_time = X_features['click_time'].max()
cutoff_time


Out[56]:
Timestamp('2017-11-09 15:59:51')

In [63]:
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='apps',
    seed_features=[google_apps],
    cutoff_time=cutoff_time,
    training_window=ft.Timedelta("3 days"),
    max_depth=5
)


2018-04-02 17:55:43,927 featuretools.entityset - WARNING    Using training_window but last_time_index is not set on entity clicks

In [64]:
feature_defs


Out[64]:
[<Feature: app.isin([1, 2, 3, 4, 5])>,
 <Feature: COUNT(clicks)>,
 <Feature: PERCENT_TRUE(clicks.is_attributed)>,
 <Feature: NUM_UNIQUE(clicks.ip)>,
 <Feature: NUM_UNIQUE(clicks.device)>,
 <Feature: NUM_UNIQUE(clicks.os)>,
 <Feature: NUM_UNIQUE(clicks.channel)>,
 <Feature: MODE(clicks.ip)>,
 <Feature: MODE(clicks.device)>,
 <Feature: MODE(clicks.os)>,
 <Feature: MODE(clicks.channel)>,
 <Feature: COUNT(clicks WHERE is_attributed = True)>,
 <Feature: NUM_UNIQUE(clicks.DAY(click_time))>,
 <Feature: NUM_UNIQUE(clicks.YEAR(click_time))>,
 <Feature: NUM_UNIQUE(clicks.MONTH(click_time))>,
 <Feature: NUM_UNIQUE(clicks.WEEKDAY(click_time))>,
 <Feature: MODE(clicks.DAY(click_time))>,
 <Feature: MODE(clicks.YEAR(click_time))>,
 <Feature: MODE(clicks.MONTH(click_time))>,
 <Feature: MODE(clicks.WEEKDAY(click_time))>]

In [62]:
feature_matrix


Out[62]:
app.isin([1, 2, 3, 4, 5]) COUNT(clicks) PERCENT_TRUE(clicks.is_attributed) NUM_UNIQUE(clicks.ip) NUM_UNIQUE(clicks.device) NUM_UNIQUE(clicks.os) NUM_UNIQUE(clicks.channel) MODE(clicks.ip) MODE(clicks.device) MODE(clicks.os) MODE(clicks.channel) COUNT(clicks WHERE is_attributed = True) NUM_UNIQUE(clicks.DAY(click_time)) NUM_UNIQUE(clicks.YEAR(click_time)) NUM_UNIQUE(clicks.MONTH(click_time)) NUM_UNIQUE(clicks.WEEKDAY(click_time)) MODE(clicks.DAY(click_time)) MODE(clicks.YEAR(click_time)) MODE(clicks.MONTH(click_time)) MODE(clicks.WEEKDAY(click_time))
app
1 True 3135 0.000000 2723 4 56 27 5348 1 19 134 0.0 4 1 1 4 7 2017 11 1
2 True 11737 0.000000 7759 3 75 21 5314 1 19 477 0.0 4 1 1 4 8 2017 11 2
3 True 18279 0.000219 12040 4 78 32 5348 1 19 280 4.0 4 1 1 4 8 2017 11 2
4 True 58 0.000000 56 2 20 1 79881 1 19 101 0.0 3 1 1 3 9 2017 11 3
5 True 188 0.074468 187 2 36 4 26995 1 19 377 14.0 4 1 1 4 7 2017 11 1
6 False 1303 0.000000 1209 2 47 4 5314 1 19 459 0.0 4 1 1 4 7 2017 11 1
7 False 981 0.000000 790 4 56 1 48240 1 13 101 0.0 3 1 1 3 9 2017 11 3
8 False 2004 0.001996 1778 3 51 3 5348 1 19 145 4.0 4 1 1 4 9 2017 11 3
9 False 8992 0.000890 6721 5 73 29 5348 1 19 466 8.0 4 1 1 4 9 2017 11 3
10 False 388 0.046392 373 1 42 5 5348 1 19 377 18.0 4 1 1 4 7 2017 11 1
11 False 1927 0.001038 1742 2 53 12 5348 1 19 319 2.0 4 1 1 4 8 2017 11 2
12 False 13198 0.000076 9190 4 71 26 73487 1 19 178 1.0 4 1 1 4 8 2017 11 2
13 False 2422 0.000000 2179 4 60 6 5314 1 19 477 0.0 4 1 1 4 7 2017 11 1
14 False 5359 0.000000 4442 3 63 28 5314 1 19 379 0.0 4 1 1 4 8 2017 11 2
15 False 8595 0.000233 6473 2 67 24 5348 1 19 245 2.0 4 1 1 4 7 2017 11 1
16 False 3 0.000000 3 1 2 1 18966 1 18 268 0.0 2 1 1 2 7 2017 11 1
17 False 380 0.000000 345 2 40 4 17149 1 19 280 0.0 4 1 1 4 8 2017 11 2
18 False 8315 0.000601 6441 5 69 12 5348 1 19 107 5.0 4 1 1 4 8 2017 11 2
19 False 478 0.146444 448 76 14 8 5348 0 24 213 70.0 4 1 1 4 9 2017 11 3
20 False 911 0.001098 856 2 49 4 5314 1 19 259 1.0 4 1 1 4 7 2017 11 1
21 False 1979 0.000000 1732 2 57 3 5314 1 19 128 0.0 4 1 1 4 7 2017 11 1
22 False 386 0.000000 373 2 37 2 5314 1 19 116 0.0 4 1 1 4 7 2017 11 1
23 False 1454 0.000000 1351 2 46 3 5314 1 19 153 0.0 4 1 1 4 8 2017 11 2
24 False 704 0.000000 645 2 52 2 5348 1 19 105 0.0 4 1 1 4 7 2017 11 1
25 False 804 0.000000 740 2 41 1 5348 1 13 259 0.0 4 1 1 4 7 2017 11 1
26 False 1633 0.000000 1511 2 53 4 5348 1 19 121 0.0 4 1 1 4 8 2017 11 2
27 False 696 0.000000 662 2 43 2 5314 1 19 153 0.0 4 1 1 4 7 2017 11 1
28 False 720 0.000000 691 2 42 2 5348 1 19 135 0.0 4 1 1 4 7 2017 11 1
29 False 360 0.061111 329 3 43 8 5348 1 19 343 22.0 4 1 1 4 7 2017 11 1
30 False 2 0.000000 2 1 1 1 81643 3866 866 347 0.0 2 1 1 2 8 2017 11 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
202 False 6 0.166667 6 1 5 1 31277 1 6 421 1.0 3 1 1 3 7 2017 11 1
204 False 2 0.000000 2 2 2 1 37424 3032 607 347 0.0 2 1 1 2 7 2017 11 1
208 False 13 0.076923 13 1 5 2 7308 1 19 330 1.0 2 1 1 2 7 2017 11 1
215 False 4 0.000000 3 2 2 1 269087 3543 748 347 0.0 2 1 1 2 8 2017 11 2
216 False 1 0.000000 1 1 1 1 217078 1 22 213 0.0 1 1 1 1 8 2017 11 2
232 False 9 0.000000 9 3 3 1 32356 3866 866 347 0.0 3 1 1 3 8 2017 11 2
233 False 1 0.000000 1 1 1 1 27629 3032 607 347 0.0 1 1 1 1 7 2017 11 1
261 False 1 1.000000 1 1 1 1 288079 0 0 243 1.0 1 1 1 1 9 2017 11 3
266 False 2 0.000000 2 2 2 1 26622 3032 607 347 0.0 2 1 1 2 6 2017 11 0
267 False 1 0.000000 1 1 1 1 99154 3543 748 347 0.0 1 1 1 1 8 2017 11 2
268 False 1 0.000000 1 1 1 1 43827 3032 607 347 0.0 1 1 1 1 6 2017 11 0
271 False 1 0.000000 1 1 1 1 67316 3543 748 347 0.0 1 1 1 1 8 2017 11 2
273 False 3 0.000000 2 1 2 1 201182 0 0 347 0.0 2 1 1 2 7 2017 11 1
293 False 1 0.000000 1 1 1 1 59384 0 0 347 0.0 1 1 1 1 9 2017 11 3
302 False 1 0.000000 1 1 1 1 44725 3543 748 347 0.0 1 1 1 1 8 2017 11 2
310 False 3 0.000000 3 1 3 1 5348 0 0 272 0.0 1 1 1 1 9 2017 11 3
315 False 4 0.000000 4 1 4 1 14884 1 13 110 0.0 3 1 1 3 7 2017 11 1
347 False 1 0.000000 1 1 1 1 245890 3543 748 347 0.0 1 1 1 1 8 2017 11 2
363 False 2 0.000000 2 1 1 1 41463 3543 748 347 0.0 2 1 1 2 7 2017 11 1
372 False 1 0.000000 1 1 1 1 125141 1 18 107 0.0 1 1 1 1 7 2017 11 1
394 False 2 0.000000 2 2 2 1 105475 3032 607 347 0.0 2 1 1 2 7 2017 11 1
398 False 1 0.000000 1 1 1 1 49462 1 36 347 0.0 1 1 1 1 7 2017 11 1
407 False 1 0.000000 1 1 1 1 44527 3543 748 347 0.0 1 1 1 1 8 2017 11 2
425 False 2 0.000000 2 2 2 1 26995 3032 607 347 0.0 2 1 1 2 7 2017 11 1
474 False 1 0.000000 1 1 1 1 114220 0 38 272 0.0 1 1 1 1 9 2017 11 3
486 False 1 0.000000 1 1 1 1 245311 0 24 21 0.0 1 1 1 1 7 2017 11 1
536 False 1 0.000000 1 1 1 1 49856 1 19 21 0.0 1 1 1 1 6 2017 11 0
538 False 1 0.000000 1 1 1 1 4136 3032 607 347 0.0 1 1 1 1 7 2017 11 1
548 False 1 0.000000 1 1 1 1 191061 3543 748 347 0.0 1 1 1 1 8 2017 11 2
551 False 1 0.000000 1 1 1 1 67285 347 113 243 0.0 1 1 1 1 7 2017 11 1

161 rows × 20 columns


In [ ]: